markdown-to-confluence 0.4.4__py3-none-any.whl → 0.4.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
md2conf/converter.py CHANGED
@@ -7,6 +7,7 @@ Copyright 2022-2025, Levente Hunyadi
7
7
  """
8
8
 
9
9
  import dataclasses
10
+ import enum
10
11
  import hashlib
11
12
  import logging
12
13
  import os.path
@@ -15,17 +16,18 @@ import uuid
15
16
  from abc import ABC, abstractmethod
16
17
  from dataclasses import dataclass
17
18
  from pathlib import Path
18
- from typing import Any, Literal, Optional, Union
19
- from urllib.parse import ParseResult, quote_plus, urlparse, urlunparse
19
+ from typing import ClassVar, Literal, Optional, Union
20
+ from urllib.parse import ParseResult, quote_plus, urlparse
20
21
 
21
22
  import lxml.etree as ET
22
23
  from strong_typing.core import JsonType
23
24
 
24
25
  from . import drawio, mermaid
25
26
  from .collection import ConfluencePageCollection
26
- from .csf import AC_ATTR, AC_ELEM, HTML, RI_ATTR, RI_ELEM, ParseError, elements_from_strings, elements_to_string
27
+ from .csf import AC_ATTR, AC_ELEM, HTML, RI_ATTR, RI_ELEM, ParseError, elements_from_strings, elements_to_string, normalize_inline
27
28
  from .domain import ConfluenceDocumentOptions, ConfluencePageID
28
29
  from .extra import override, path_relative_to
30
+ from .latex import get_png_dimensions, remove_png_chunks, render_latex
29
31
  from .markdown import markdown_to_html
30
32
  from .metadata import ConfluenceSiteMetadata
31
33
  from .properties import PageError
@@ -88,86 +90,90 @@ def encode_title(text: str) -> str:
88
90
 
89
91
 
90
92
  # supported code block languages, for which syntax highlighting is available
91
- _LANGUAGES = [
92
- "abap",
93
- "actionscript3",
94
- "ada",
95
- "applescript",
96
- "arduino",
97
- "autoit",
98
- "bash",
99
- "c",
100
- "clojure",
101
- "coffeescript",
102
- "coldfusion",
103
- "cpp",
104
- "csharp",
105
- "css",
106
- "cuda",
107
- "d",
108
- "dart",
109
- "delphi",
110
- "diff",
111
- "elixir",
112
- "erlang",
113
- "fortran",
114
- "foxpro",
115
- "go",
116
- "graphql",
117
- "groovy",
118
- "haskell",
119
- "haxe",
120
- "html",
121
- "java",
122
- "javafx",
123
- "javascript",
124
- "json",
125
- "jsx",
126
- "julia",
127
- "kotlin",
128
- "livescript",
129
- "lua",
130
- "mermaid",
131
- "mathematica",
132
- "matlab",
133
- "objectivec",
134
- "objectivej",
135
- "ocaml",
136
- "octave",
137
- "pascal",
138
- "perl",
139
- "php",
140
- "powershell",
141
- "prolog",
142
- "puppet",
143
- "python",
144
- "qml",
145
- "r",
146
- "racket",
147
- "rst",
148
- "ruby",
149
- "rust",
150
- "sass",
151
- "scala",
152
- "scheme",
153
- "shell",
154
- "smalltalk",
155
- "splunk",
156
- "sql",
157
- "standardml",
158
- "swift",
159
- "tcl",
160
- "tex",
161
- "tsx",
162
- "typescript",
163
- "vala",
164
- "vb",
165
- "verilog",
166
- "vhdl",
167
- "xml",
168
- "xquery",
169
- "yaml",
170
- ]
93
+ _LANGUAGES = {
94
+ "abap": "abap",
95
+ "actionscript3": "actionscript3",
96
+ "ada": "ada",
97
+ "applescript": "applescript",
98
+ "arduino": "arduino",
99
+ "autoit": "autoit",
100
+ "bash": "bash",
101
+ "c": "c",
102
+ "c#": "c#",
103
+ "clojure": "clojure",
104
+ "coffeescript": "coffeescript",
105
+ "coldfusion": "coldfusion",
106
+ "cpp": "cpp",
107
+ "csharp": "c#",
108
+ "css": "css",
109
+ "cuda": "cuda",
110
+ "d": "d",
111
+ "dart": "dart",
112
+ "delphi": "delphi",
113
+ "diff": "diff",
114
+ "elixir": "elixir",
115
+ "erl": "erl",
116
+ "erlang": "erl",
117
+ "fortran": "fortran",
118
+ "foxpro": "foxpro",
119
+ "go": "go",
120
+ "graphql": "graphql",
121
+ "groovy": "groovy",
122
+ "haskell": "haskell",
123
+ "haxe": "haxe",
124
+ "html": "html",
125
+ "java": "java",
126
+ "javafx": "javafx",
127
+ "javascript": "js",
128
+ "js": "js",
129
+ "json": "json",
130
+ "jsx": "jsx",
131
+ "julia": "julia",
132
+ "kotlin": "kotlin",
133
+ "livescript": "livescript",
134
+ "lua": "lua",
135
+ "mermaid": "mermaid",
136
+ "mathematica": "mathematica",
137
+ "matlab": "matlab",
138
+ "objectivec": "objectivec",
139
+ "objectivej": "objectivej",
140
+ "ocaml": "ocaml",
141
+ "octave": "octave",
142
+ "pascal": "pascal",
143
+ "perl": "perl",
144
+ "php": "php",
145
+ "powershell": "powershell",
146
+ "prolog": "prolog",
147
+ "puppet": "puppet",
148
+ "py": "py",
149
+ "python": "py",
150
+ "qml": "qml",
151
+ "r": "r",
152
+ "racket": "racket",
153
+ "rst": "rst",
154
+ "ruby": "ruby",
155
+ "rust": "rust",
156
+ "sass": "sass",
157
+ "scala": "scala",
158
+ "scheme": "scheme",
159
+ "shell": "shell",
160
+ "smalltalk": "smalltalk",
161
+ "splunk": "splunk",
162
+ "sql": "sql",
163
+ "standardml": "standardml",
164
+ "swift": "swift",
165
+ "tcl": "tcl",
166
+ "tex": "tex",
167
+ "tsx": "tsx",
168
+ "typescript": "typescript",
169
+ "vala": "vala",
170
+ "vb": "vb",
171
+ "verilog": "verilog",
172
+ "vhdl": "vhdl",
173
+ "xml": "xml",
174
+ "xquery": "xquery",
175
+ "yaml": "yaml",
176
+ }
171
177
 
172
178
 
173
179
  class NodeVisitor(ABC):
@@ -181,6 +187,11 @@ class NodeVisitor(ABC):
181
187
  source = node[index]
182
188
  target = self.transform(source)
183
189
  if target is not None:
190
+ # chain sibling text node that immediately follows original element
191
+ target.tail = source.tail
192
+ source.tail = None
193
+
194
+ # replace original element with transformed element
184
195
  node[index] = target
185
196
  else:
186
197
  self.visit(source)
@@ -206,19 +217,99 @@ def element_text_starts_with_any(node: ET._Element, prefixes: list[str]) -> bool
206
217
  return starts_with_any(node.text, prefixes)
207
218
 
208
219
 
220
+ def is_placeholder_for(node: ET._Element, name: str) -> bool:
221
+ """
222
+ Identifies a Confluence widget placeholder, e.g. `[[_TOC_]]` or `[[_LISTING_]]`.
223
+
224
+ :param node: The element to check.
225
+ :param name: The placeholder name.
226
+ """
227
+
228
+ # `[[_TOC_]]` is represented in HTML as <p>[[<em>TOC</em>]]</p>
229
+ if node.text != "[[" or len(node) != 1:
230
+ return False
231
+
232
+ child = node[0]
233
+ if child.tag != "em" or child.text != name or child.tail != "]]":
234
+ return False
235
+
236
+ return True
237
+
238
+
239
+ @enum.unique
240
+ class FormattingContext(enum.Enum):
241
+ "Identifies the formatting context for the element."
242
+
243
+ BLOCK = "block"
244
+ INLINE = "inline"
245
+
246
+
209
247
  @dataclass
210
248
  class ImageAttributes:
211
249
  """
212
250
  Attributes applied to an `<img>` element.
213
251
 
214
- :param caption: Caption text (`alt` attribute).
252
+ :param context: Identifies the formatting context for the element (block or inline).
215
253
  :param width: Natural image width in pixels.
216
254
  :param height: Natural image height in pixels.
255
+ :param alt: Alternate text.
256
+ :param title: Title text (a.k.a. image tooltip).
257
+ :param caption: Caption text (shown below figure).
217
258
  """
218
259
 
260
+ context: FormattingContext
261
+ width: Optional[int]
262
+ height: Optional[int]
263
+ alt: Optional[str]
264
+ title: Optional[str]
219
265
  caption: Optional[str]
220
- width: Optional[str]
221
- height: Optional[str]
266
+
267
+ def __post_init__(self) -> None:
268
+ if self.caption is None and self.context is FormattingContext.BLOCK:
269
+ self.caption = self.title or self.alt
270
+
271
+ def as_dict(self) -> dict[str, str]:
272
+ attributes: dict[str, str] = {}
273
+ if self.context is FormattingContext.BLOCK:
274
+ attributes[AC_ATTR("align")] = "center"
275
+ attributes[AC_ATTR("layout")] = "center"
276
+ if self.width is not None:
277
+ attributes[AC_ATTR("original-width")] = str(self.width)
278
+ if self.height is not None:
279
+ attributes[AC_ATTR("original-height")] = str(self.height)
280
+ if self.width is not None:
281
+ attributes[AC_ATTR("custom-width")] = "true"
282
+ attributes[AC_ATTR("width")] = str(self.width)
283
+
284
+ elif self.context is FormattingContext.INLINE:
285
+ if self.width is not None:
286
+ attributes[AC_ATTR("width")] = str(self.width)
287
+ if self.height is not None:
288
+ attributes[AC_ATTR("height")] = str(self.height)
289
+ else:
290
+ raise NotImplementedError("match not exhaustive for enumeration")
291
+
292
+ if self.alt is not None:
293
+ attributes.update({AC_ATTR("alt"): self.alt})
294
+ if self.title is not None:
295
+ attributes.update({AC_ATTR("title"): self.title})
296
+ return attributes
297
+
298
+ EMPTY_BLOCK: ClassVar["ImageAttributes"]
299
+ EMPTY_INLINE: ClassVar["ImageAttributes"]
300
+
301
+ @classmethod
302
+ def empty(cls, context: FormattingContext) -> "ImageAttributes":
303
+ if context is FormattingContext.BLOCK:
304
+ return cls.EMPTY_BLOCK
305
+ elif context is FormattingContext.INLINE:
306
+ return cls.EMPTY_INLINE
307
+ else:
308
+ raise NotImplementedError("match not exhaustive for enumeration")
309
+
310
+
311
+ ImageAttributes.EMPTY_BLOCK = ImageAttributes(FormattingContext.BLOCK, None, None, None, None, None)
312
+ ImageAttributes.EMPTY_INLINE = ImageAttributes(FormattingContext.INLINE, None, None, None, None, None)
222
313
 
223
314
 
224
315
  @dataclass
@@ -233,6 +324,7 @@ class ConfluenceConverterOptions:
233
324
  :param prefer_raster: Whether to choose PNG files over SVG files when available.
234
325
  :param render_drawio: Whether to pre-render (or use the pre-rendered version of) draw.io diagrams.
235
326
  :param render_mermaid: Whether to pre-render Mermaid diagrams into PNG/SVG images.
327
+ :param render_latex: Whether to pre-render LaTeX formulas into PNG/SVG images.
236
328
  :param diagram_output_format: Target image format for diagrams.
237
329
  :param webui_links: When true, convert relative URLs to Confluence Web UI links.
238
330
  """
@@ -242,10 +334,23 @@ class ConfluenceConverterOptions:
242
334
  prefer_raster: bool = True
243
335
  render_drawio: bool = False
244
336
  render_mermaid: bool = False
337
+ render_latex: bool = False
245
338
  diagram_output_format: Literal["png", "svg"] = "png"
246
339
  webui_links: bool = False
247
340
 
248
341
 
342
+ @dataclass
343
+ class ImageData:
344
+ path: Path
345
+ description: Optional[str] = None
346
+
347
+
348
+ @dataclass
349
+ class EmbeddedFileData:
350
+ data: bytes
351
+ description: Optional[str] = None
352
+
353
+
249
354
  class ConfluenceStorageFormatConverter(NodeVisitor):
250
355
  "Transforms a plain HTML tree into Confluence Storage Format."
251
356
 
@@ -255,8 +360,8 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
255
360
  root_dir: Path
256
361
  toc: TableOfContentsBuilder
257
362
  links: list[str]
258
- images: list[Path]
259
- embedded_files: dict[str, bytes]
363
+ images: list[ImageData]
364
+ embedded_files: dict[str, EmbeddedFileData]
260
365
  site_metadata: ConfluenceSiteMetadata
261
366
  page_metadata: ConfluencePageCollection
262
367
 
@@ -285,7 +390,19 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
285
390
  self.page_metadata = page_metadata
286
391
 
287
392
  def _transform_heading(self, heading: ET._Element) -> None:
288
- "Adds anchors to headings in the same document (if *heading anchors* is enabled)."
393
+ """
394
+ Adds anchors to headings in the same document (if *heading anchors* is enabled).
395
+
396
+ Original:
397
+ ```
398
+ <h1>Heading text</h1>
399
+ ```
400
+
401
+ Transformed:
402
+ ```
403
+ <h1><structured-macro name="anchor">...</structured-macro>Heading text</h1>
404
+ ```
405
+ """
289
406
 
290
407
  for e in heading:
291
408
  self.visit(e)
@@ -325,7 +442,10 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
325
442
  * Links to documents in the source hierarchy are mapped into full Confluence URLs.
326
443
  """
327
444
 
328
- url = anchor.attrib.get("href")
445
+ # Confluence doesn't support `title` attribute on `<a>` elements
446
+ anchor.attrib.pop("title", None)
447
+
448
+ url = anchor.get("href")
329
449
  if url is None or is_absolute_url(url):
330
450
  return None
331
451
 
@@ -333,7 +453,7 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
333
453
  relative_url: ParseResult = urlparse(url)
334
454
 
335
455
  if not relative_url.scheme and not relative_url.netloc and not relative_url.path and not relative_url.params and not relative_url.query:
336
- LOGGER.debug("Found local URL: %s", url)
456
+ LOGGER.debug("Found same-page URL: %s", url)
337
457
  if self.options.heading_anchors:
338
458
  # <ac:link ac:anchor="anchor"><ac:link-body>...</ac:link-body></ac:link>
339
459
  target = relative_url.fragment.lstrip("#")
@@ -346,33 +466,39 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
346
466
  },
347
467
  link_body,
348
468
  )
349
- link_wrapper.tail = anchor.tail
350
469
  return link_wrapper
351
470
  else:
352
471
  return None
353
472
 
354
- # convert the relative URL to absolute URL based on the base path value, then look up
355
- # the absolute path in the page metadata dictionary to discover the relative path
356
- # within Confluence that should be used
473
+ # discard original value: relative links always require transformation
474
+ anchor.attrib.pop("href")
475
+
476
+ # convert the relative URL to absolute path based on the base path value
357
477
  absolute_path = (self.base_dir / relative_url.path).resolve()
478
+
479
+ # look up the absolute path in the page metadata dictionary to discover the relative path within Confluence that should be used
358
480
  if not is_directory_within(absolute_path, self.root_dir):
359
- anchor.attrib.pop("href")
360
481
  self._warn_or_raise(f"relative URL {url} points to outside root path: {self.root_dir}")
361
482
  return None
362
483
 
484
+ if absolute_path.suffix == ".md":
485
+ return self._transform_page_link(anchor, relative_url, absolute_path)
486
+ else:
487
+ return self._transform_attachment_link(anchor, absolute_path)
488
+
489
+ def _transform_page_link(self, anchor: ET._Element, relative_url: ParseResult, absolute_path: Path) -> Optional[ET._Element]:
490
+ """
491
+ Transforms links to other Markdown documents (Confluence pages).
492
+ """
493
+
363
494
  link_metadata = self.page_metadata.get(absolute_path)
364
495
  if link_metadata is None:
365
- msg = f"unable to find matching page for URL: {url}"
366
- if self.options.ignore_invalid_url:
367
- LOGGER.warning(msg)
368
- anchor.attrib.pop("href")
369
- return None
370
- else:
371
- raise DocumentError(msg)
496
+ self._warn_or_raise(f"unable to find matching page for URL: {relative_url.geturl()}")
497
+ return None
372
498
 
373
499
  relative_path = os.path.relpath(absolute_path, self.base_dir)
374
500
  LOGGER.debug("Found link to page %s with metadata: %s", relative_path, link_metadata)
375
- self.links.append(url)
501
+ self.links.append(relative_url.geturl())
376
502
 
377
503
  if self.options.webui_links:
378
504
  page_url = f"{self.site_metadata.base_path}pages/viewpage.action?pageId={link_metadata.page_id}"
@@ -384,7 +510,7 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
384
510
 
385
511
  page_url = f"{self.site_metadata.base_path}spaces/{space_key}/pages/{link_metadata.page_id}/{encode_title(link_metadata.title)}"
386
512
 
387
- components = ParseResult(
513
+ transformed_url = ParseResult(
388
514
  scheme="https",
389
515
  netloc=self.site_metadata.domain,
390
516
  path=page_url,
@@ -392,47 +518,83 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
392
518
  query="",
393
519
  fragment=relative_url.fragment,
394
520
  )
395
- transformed_url = urlunparse(components)
396
521
 
397
- LOGGER.debug("Transformed relative URL: %s to URL: %s", url, transformed_url)
398
- anchor.attrib["href"] = transformed_url
522
+ LOGGER.debug("Transformed relative URL: %s to URL: %s", relative_url.geturl(), transformed_url.geturl())
523
+ anchor.set("href", transformed_url.geturl())
399
524
  return None
400
525
 
526
+ def _transform_attachment_link(self, anchor: ET._Element, absolute_path: Path) -> Optional[ET._Element]:
527
+ """
528
+ Transforms links to document binaries such as PDF, DOCX or XLSX.
529
+ """
530
+
531
+ if not absolute_path.exists():
532
+ self._warn_or_raise(f"relative URL points to non-existing file: {absolute_path}")
533
+ return None
534
+
535
+ file_name = attachment_name(path_relative_to(absolute_path, self.base_dir))
536
+ self.images.append(ImageData(absolute_path))
537
+
538
+ link_body = AC_ELEM("link-body", {}, *list(anchor))
539
+ link_body.text = anchor.text
540
+ link_wrapper = AC_ELEM(
541
+ "link",
542
+ {},
543
+ RI_ELEM("attachment", {RI_ATTR("filename"): file_name}),
544
+ link_body,
545
+ )
546
+ return link_wrapper
547
+
401
548
  def _transform_status(self, color: str, caption: str) -> ET._Element:
402
549
  macro_id = str(uuid.uuid4())
403
- return AC_ELEM(
404
- "structured-macro",
405
- {
406
- AC_ATTR("name"): "status",
407
- AC_ATTR("schema-version"): "1",
408
- AC_ATTR("macro-id"): macro_id,
409
- },
410
- AC_ELEM(
411
- "parameter",
412
- {AC_ATTR("name"): "colour"},
413
- color.title(),
414
- ),
415
- AC_ELEM(
416
- "parameter",
417
- {AC_ATTR("name"): "title"},
418
- caption,
419
- ),
420
- )
550
+ attributes = {
551
+ AC_ATTR("name"): "status",
552
+ AC_ATTR("schema-version"): "1",
553
+ AC_ATTR("macro-id"): macro_id,
554
+ }
555
+ if color != "gray":
556
+ return AC_ELEM(
557
+ "structured-macro",
558
+ attributes,
559
+ AC_ELEM(
560
+ "parameter",
561
+ {AC_ATTR("name"): "colour"},
562
+ color.title(),
563
+ ),
564
+ AC_ELEM(
565
+ "parameter",
566
+ {AC_ATTR("name"): "title"},
567
+ caption,
568
+ ),
569
+ )
570
+ else:
571
+ return AC_ELEM(
572
+ "structured-macro",
573
+ attributes,
574
+ AC_ELEM(
575
+ "parameter",
576
+ {AC_ATTR("name"): "title"},
577
+ caption,
578
+ ),
579
+ )
421
580
 
422
- def _transform_image(self, image: ET._Element) -> ET._Element:
581
+ def _transform_image(self, context: FormattingContext, image: ET._Element) -> ET._Element:
423
582
  "Inserts an attached or external image."
424
583
 
425
- src = image.attrib.get("src")
584
+ src = image.get("src")
426
585
  if not src:
427
586
  raise DocumentError("image lacks `src` attribute")
428
587
 
429
- caption = image.attrib.get("alt")
430
- if caption is not None and src.startswith("urn:uuid:") and (color := status_images.get(src)) is not None:
431
- return self._transform_status(color, caption)
588
+ alt = image.get("alt")
589
+ if alt is not None and src.startswith("urn:uuid:") and (color := status_images.get(src)) is not None:
590
+ return self._transform_status(color, alt)
432
591
 
433
- width = image.attrib.get("width")
434
- height = image.attrib.get("height")
435
- attrs = ImageAttributes(caption, width, height)
592
+ title = image.get("title")
593
+ width = image.get("width")
594
+ height = image.get("height")
595
+ pixel_width = int(width) if width is not None and width.isdecimal() else None
596
+ pixel_height = int(height) if height is not None and height.isdecimal() else None
597
+ attrs = ImageAttributes(context, pixel_width, pixel_height, alt, title, None)
436
598
 
437
599
  if is_absolute_url(src):
438
600
  return self._transform_external_image(src, attrs)
@@ -441,7 +603,7 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
441
603
 
442
604
  absolute_path = self._verify_image_path(path)
443
605
  if absolute_path is None:
444
- return self._create_missing(path, caption)
606
+ return self._create_missing(path, attrs.caption)
445
607
 
446
608
  if absolute_path.name.endswith(".drawio.png") or absolute_path.name.endswith(".drawio.svg"):
447
609
  return self._transform_drawio_image(absolute_path, attrs)
@@ -455,15 +617,6 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
455
617
  def _transform_external_image(self, url: str, attrs: ImageAttributes) -> ET._Element:
456
618
  "Emits Confluence Storage Format XHTML for an external image."
457
619
 
458
- attributes: dict[str, Any] = {
459
- AC_ATTR("align"): "center",
460
- AC_ATTR("layout"): "center",
461
- }
462
- if attrs.width is not None:
463
- attributes.update({AC_ATTR("width"): attrs.width})
464
- if attrs.height is not None:
465
- attributes.update({AC_ATTR("height"): attrs.height})
466
-
467
620
  elements: list[ET._Element] = []
468
621
  elements.append(
469
622
  RI_ELEM(
@@ -472,10 +625,10 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
472
625
  {RI_ATTR("value"): url},
473
626
  )
474
627
  )
475
- if attrs.caption is not None:
476
- elements.append(AC_ELEM("caption", HTML.p(attrs.caption)))
628
+ if attrs.caption:
629
+ elements.append(AC_ELEM("caption", attrs.caption))
477
630
 
478
- return AC_ELEM("image", attributes, *elements)
631
+ return AC_ELEM("image", attrs.as_dict(), *elements)
479
632
 
480
633
  def _verify_image_path(self, path: Path) -> Optional[Path]:
481
634
  "Checks whether an image path is safe to use."
@@ -496,13 +649,13 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
496
649
  def _transform_attached_image(self, absolute_path: Path, attrs: ImageAttributes) -> ET._Element:
497
650
  "Emits Confluence Storage Format XHTML for an attached raster or vector image."
498
651
 
499
- if self.options.prefer_raster and absolute_path.name.endswith(".svg"):
652
+ if self.options.prefer_raster and absolute_path.suffix == ".svg":
500
653
  # prefer PNG over SVG; Confluence displays SVG in wrong size, and text labels are truncated
501
654
  png_file = absolute_path.with_suffix(".png")
502
655
  if png_file.exists():
503
656
  absolute_path = png_file
504
657
 
505
- self.images.append(absolute_path)
658
+ self.images.append(ImageData(absolute_path, attrs.alt))
506
659
  image_name = attachment_name(path_relative_to(absolute_path, self.base_dir))
507
660
  return self._create_attached_image(image_name, attrs)
508
661
 
@@ -512,15 +665,15 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
512
665
  if not absolute_path.name.endswith(".drawio.xml") and not absolute_path.name.endswith(".drawio"):
513
666
  raise DocumentError("invalid image format; expected: `*.drawio.xml` or `*.drawio`")
514
667
 
668
+ relative_path = path_relative_to(absolute_path, self.base_dir)
515
669
  if self.options.render_drawio:
516
670
  image_data = drawio.render_diagram(absolute_path, self.options.diagram_output_format)
517
- image_hash = hashlib.md5(image_data).hexdigest()
518
- image_filename = attachment_name(f"embedded_{image_hash}.{self.options.diagram_output_format}")
519
- self.embedded_files[image_filename] = image_data
671
+ image_filename = attachment_name(relative_path.with_suffix(f".{self.options.diagram_output_format}"))
672
+ self.embedded_files[image_filename] = EmbeddedFileData(image_data, attrs.alt)
520
673
  return self._create_attached_image(image_filename, attrs)
521
674
  else:
522
- self.images.append(absolute_path)
523
- image_filename = attachment_name(path_relative_to(absolute_path, self.base_dir))
675
+ self.images.append(ImageData(absolute_path, attrs.alt))
676
+ image_filename = attachment_name(relative_path)
524
677
  return self._create_drawio(image_filename, attrs)
525
678
 
526
679
  def _transform_drawio_image(self, absolute_path: Path, attrs: ImageAttributes) -> ET._Element:
@@ -535,22 +688,13 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
535
688
  # extract embedded editable diagram and upload as *.drawio
536
689
  image_data = drawio.extract_diagram(absolute_path)
537
690
  image_filename = attachment_name(path_relative_to(absolute_path.with_suffix(".xml"), self.base_dir))
538
- self.embedded_files[image_filename] = image_data
691
+ self.embedded_files[image_filename] = EmbeddedFileData(image_data, attrs.alt)
539
692
 
540
693
  return self._create_drawio(image_filename, attrs)
541
694
 
542
695
  def _create_attached_image(self, image_name: str, attrs: ImageAttributes) -> ET._Element:
543
696
  "An image embedded into the page, linking to an attachment."
544
697
 
545
- attributes: dict[str, Any] = {
546
- AC_ATTR("align"): "center",
547
- AC_ATTR("layout"): "center",
548
- }
549
- if attrs.width is not None:
550
- attributes.update({AC_ATTR("width"): attrs.width})
551
- if attrs.height is not None:
552
- attributes.update({AC_ATTR("height"): attrs.height})
553
-
554
698
  elements: list[ET._Element] = []
555
699
  elements.append(
556
700
  RI_ELEM(
@@ -559,10 +703,10 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
559
703
  {RI_ATTR("filename"): image_name},
560
704
  )
561
705
  )
562
- if attrs.caption is not None:
563
- elements.append(AC_ELEM("caption", HTML.p(attrs.caption)))
706
+ if attrs.caption:
707
+ elements.append(AC_ELEM("caption", attrs.caption))
564
708
 
565
- return AC_ELEM("image", attributes, *elements)
709
+ return AC_ELEM("image", attrs.as_dict(), *elements)
566
710
 
567
711
  def _create_drawio(self, filename: str, attrs: ImageAttributes) -> ET._Element:
568
712
  "A draw.io diagram embedded into the page, linking to an attachment."
@@ -579,7 +723,7 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
579
723
  AC_ELEM(
580
724
  "parameter",
581
725
  {AC_ATTR("name"): "width"},
582
- attrs.width,
726
+ str(attrs.width),
583
727
  ),
584
728
  )
585
729
  if attrs.height is not None:
@@ -587,7 +731,7 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
587
731
  AC_ELEM(
588
732
  "parameter",
589
733
  {AC_ATTR("name"): "height"},
590
- attrs.height,
734
+ str(attrs.height),
591
735
  ),
592
736
  )
593
737
 
@@ -633,20 +777,25 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
633
777
  def _transform_code_block(self, code: ET._Element) -> ET._Element:
634
778
  "Transforms a code block."
635
779
 
636
- language = code.attrib.get("class")
637
- if language:
638
- m = re.match("^language-(.*)$", language)
639
- if m:
640
- language = m.group(1)
780
+ if language_class := code.get("class"):
781
+ if m := re.match("^language-(.*)$", language_class):
782
+ language_name = m.group(1)
641
783
  else:
642
- language = "none"
643
- if language not in _LANGUAGES:
644
- language = "none"
784
+ language_name = None
785
+ else:
786
+ language_name = None
787
+
788
+ # translate name to standard name for (programming) language
789
+ if language_name is not None:
790
+ language_id = _LANGUAGES.get(language_name)
791
+ else:
792
+ language_id = None
793
+
645
794
  content: str = code.text or ""
646
795
  content = content.rstrip()
647
796
 
648
- if language == "mermaid":
649
- return self._transform_inline_mermaid(content)
797
+ if language_id == "mermaid":
798
+ return self._transform_fenced_mermaid(content)
650
799
 
651
800
  return AC_ELEM(
652
801
  "structured-macro",
@@ -654,15 +803,10 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
654
803
  AC_ATTR("name"): "code",
655
804
  AC_ATTR("schema-version"): "1",
656
805
  },
657
- AC_ELEM(
658
- "parameter",
659
- {AC_ATTR("name"): "theme"},
660
- "Default",
661
- ),
662
806
  AC_ELEM(
663
807
  "parameter",
664
808
  {AC_ATTR("name"): "language"},
665
- language,
809
+ language_id or "none",
666
810
  ),
667
811
  AC_ELEM("plain-text-body", ET.CDATA(content)),
668
812
  )
@@ -673,36 +817,35 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
673
817
  if not absolute_path.name.endswith(".mmd") and not absolute_path.name.endswith(".mermaid"):
674
818
  raise DocumentError("invalid image format; expected: `*.mmd` or `*.mermaid`")
675
819
 
820
+ relative_path = path_relative_to(absolute_path, self.base_dir)
676
821
  if self.options.render_mermaid:
677
822
  with open(absolute_path, "r", encoding="utf-8") as f:
678
823
  content = f.read()
679
- return self._create_mermaid_image(content, attrs)
824
+ image_data = mermaid.render_diagram(content, self.options.diagram_output_format)
825
+ image_filename = attachment_name(relative_path.with_suffix(f".{self.options.diagram_output_format}"))
826
+ self.embedded_files[image_filename] = EmbeddedFileData(image_data, attrs.alt)
827
+ return self._create_attached_image(image_filename, attrs)
680
828
  else:
681
- self.images.append(absolute_path)
682
- mermaid_filename = attachment_name(path_relative_to(absolute_path, self.base_dir))
829
+ self.images.append(ImageData(absolute_path, attrs.alt))
830
+ mermaid_filename = attachment_name(relative_path)
683
831
  return self._create_mermaid_embed(mermaid_filename)
684
832
 
685
- def _transform_inline_mermaid(self, content: str) -> ET._Element:
686
- "Emits Confluence Storage Format XHTML for a Mermaid diagram defined in a code block."
833
+ def _transform_fenced_mermaid(self, content: str) -> ET._Element:
834
+ "Emits Confluence Storage Format XHTML for a Mermaid diagram defined in a fenced code block."
687
835
 
688
836
  if self.options.render_mermaid:
689
- return self._create_mermaid_image(content, ImageAttributes(None, None, None))
837
+ image_data = mermaid.render_diagram(content, self.options.diagram_output_format)
838
+ image_hash = hashlib.md5(image_data).hexdigest()
839
+ image_filename = attachment_name(f"embedded_{image_hash}.{self.options.diagram_output_format}")
840
+ self.embedded_files[image_filename] = EmbeddedFileData(image_data)
841
+ return self._create_attached_image(image_filename, ImageAttributes.EMPTY_BLOCK)
690
842
  else:
691
843
  mermaid_data = content.encode("utf-8")
692
844
  mermaid_hash = hashlib.md5(mermaid_data).hexdigest()
693
845
  mermaid_filename = attachment_name(f"embedded_{mermaid_hash}.mmd")
694
- self.embedded_files[mermaid_filename] = mermaid_data
846
+ self.embedded_files[mermaid_filename] = EmbeddedFileData(mermaid_data)
695
847
  return self._create_mermaid_embed(mermaid_filename)
696
848
 
697
- def _create_mermaid_image(self, content: str, attrs: ImageAttributes) -> ET._Element:
698
- "A rendered Mermaid diagram, linking to an attachment uploaded as an image."
699
-
700
- image_data = mermaid.render_diagram(content, self.options.diagram_output_format)
701
- image_hash = hashlib.md5(image_data).hexdigest()
702
- image_filename = attachment_name(f"embedded_{image_hash}.{self.options.diagram_output_format}")
703
- self.embedded_files[image_filename] = image_data
704
- return self._create_attached_image(image_filename, attrs)
705
-
706
849
  def _create_mermaid_embed(self, filename: str) -> ET._Element:
707
850
  "A Mermaid diagram, linking to an attachment that captures the Mermaid source."
708
851
 
@@ -743,6 +886,7 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
743
886
  {
744
887
  AC_ATTR("name"): "toc",
745
888
  AC_ATTR("schema-version"): "1",
889
+ "data-layout": "default",
746
890
  },
747
891
  AC_ELEM("parameter", {AC_ATTR("name"): "outline"}, "clear"),
748
892
  AC_ELEM("parameter", {AC_ATTR("name"): "style"}, "default"),
@@ -769,8 +913,11 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
769
913
  syntax into one of the Confluence structured macros *info*, *tip*, *note*, or *warning*.
770
914
  """
771
915
 
916
+ if len(elem) < 1:
917
+ raise DocumentError("empty admonition")
918
+
772
919
  # <div class="admonition note">
773
- class_list = elem.attrib.get("class", "").split(" ")
920
+ class_list = elem.get("class", "").split(" ")
774
921
  class_name: Optional[str] = None
775
922
  if "info" in class_list:
776
923
  class_name = "info"
@@ -788,7 +935,7 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
788
935
  self.visit(e)
789
936
 
790
937
  # <p class="admonition-title">Note</p>
791
- if "admonition-title" in elem[0].attrib.get("class", "").split(" "):
938
+ if "admonition-title" in elem[0].get("class", "").split(" "):
792
939
  content = [
793
940
  AC_ELEM(
794
941
  "parameter",
@@ -809,12 +956,15 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
809
956
  *content,
810
957
  )
811
958
 
812
- def _transform_github_alert(self, elem: ET._Element) -> ET._Element:
959
+ def _transform_github_alert(self, blockquote: ET._Element) -> ET._Element:
813
960
  """
814
961
  Creates a GitHub-style panel, normally triggered with a block-quote starting with a capitalized string such as `[!TIP]`.
815
962
  """
816
963
 
817
- content = elem[0]
964
+ if len(blockquote) < 1:
965
+ raise DocumentError("empty GitHub alert")
966
+
967
+ content = blockquote[0]
818
968
  if content.text is None:
819
969
  raise DocumentError("empty content")
820
970
 
@@ -839,9 +989,9 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
839
989
  else:
840
990
  raise DocumentError(f"unsupported GitHub alert: {alert}")
841
991
 
842
- return self._transform_alert(elem, class_name, skip)
992
+ return self._transform_alert(blockquote, class_name, skip)
843
993
 
844
- def _transform_gitlab_alert(self, elem: ET._Element) -> ET._Element:
994
+ def _transform_gitlab_alert(self, blockquote: ET._Element) -> ET._Element:
845
995
  """
846
996
  Creates a classic GitLab-style panel.
847
997
 
@@ -849,7 +999,10 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
849
999
  This syntax does not use Hugo shortcode.
850
1000
  """
851
1001
 
852
- content = elem[0]
1002
+ if len(blockquote) < 1:
1003
+ raise DocumentError("empty GitLab alert")
1004
+
1005
+ content = blockquote[0]
853
1006
  if content.text is None:
854
1007
  raise DocumentError("empty content")
855
1008
 
@@ -872,9 +1025,9 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
872
1025
  else:
873
1026
  raise DocumentError(f"unsupported GitLab alert: {alert}")
874
1027
 
875
- return self._transform_alert(elem, class_name, skip)
1028
+ return self._transform_alert(blockquote, class_name, skip)
876
1029
 
877
- def _transform_alert(self, elem: ET._Element, class_name: Optional[str], skip: int) -> ET._Element:
1030
+ def _transform_alert(self, blockquote: ET._Element, class_name: Optional[str], skip: int) -> ET._Element:
878
1031
  """
879
1032
  Creates an info, tip, note or warning panel from a GitHub or GitLab alert.
880
1033
 
@@ -884,14 +1037,14 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
884
1037
  :see: https://docs.gitlab.com/ee/development/documentation/styleguide/#alert-boxes
885
1038
  """
886
1039
 
887
- content = elem[0]
1040
+ content = blockquote[0]
888
1041
  if content.text is None:
889
1042
  raise DocumentError("empty content")
890
1043
 
891
1044
  if class_name is None:
892
1045
  raise DocumentError("not an alert")
893
1046
 
894
- for e in elem:
1047
+ for e in blockquote:
895
1048
  self.visit(e)
896
1049
 
897
1050
  content.text = content.text[skip:]
@@ -901,10 +1054,10 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
901
1054
  AC_ATTR("name"): class_name,
902
1055
  AC_ATTR("schema-version"): "1",
903
1056
  },
904
- AC_ELEM("rich-text-body", {}, *list(elem)),
1057
+ AC_ELEM("rich-text-body", {}, *list(blockquote)),
905
1058
  )
906
1059
 
907
- def _transform_section(self, elem: ET._Element) -> ET._Element:
1060
+ def _transform_section(self, details: ET._Element) -> ET._Element:
908
1061
  """
909
1062
  Creates a collapsed section.
910
1063
 
@@ -913,16 +1066,31 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
913
1066
  :see: https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/organizing-information-with-collapsed-sections
914
1067
  """
915
1068
 
916
- if elem[0].tag != "summary":
1069
+ summary = details[0]
1070
+ if summary.tag != "summary":
917
1071
  raise DocumentError("expected: `<summary>` as first direct child of `<details>`")
918
- if elem[0].tail is not None:
1072
+ if details.text is not None or summary.tail is not None:
1073
+ # when `<details>` has attribute `markdown=1`, content is parsed as Markdown:
1074
+ # ```
1075
+ # <details>
1076
+ # <summary>...</summary>
1077
+ # <p>Text with <em>emphasis</em>.</p>
1078
+ # </details>
1079
+ # ```
1080
+ #
1081
+ # when `<details>` lacks attribute `markdown=1`, content is passed down as raw HTML, partly as `text` of `<details>` or `tail` of `<summary>`:
1082
+ # ```
1083
+ # <details>
1084
+ # <summary>...</summary>
1085
+ # Text with *emphasis*.
1086
+ # </details>
919
1087
  raise DocumentError('expected: attribute `markdown="1"` on `<details>`')
920
1088
 
921
- summary = element_to_text(elem[0])
922
- elem.remove(elem[0])
1089
+ summary_text = element_to_text(summary)
1090
+ details.remove(summary)
923
1091
 
924
1092
  # transform Markdown to Confluence within collapsed section content
925
- self.visit(elem)
1093
+ self.visit(details)
926
1094
 
927
1095
  return AC_ELEM(
928
1096
  "structured-macro",
@@ -933,9 +1101,9 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
933
1101
  AC_ELEM(
934
1102
  "parameter",
935
1103
  {AC_ATTR("name"): "title"},
936
- summary,
1104
+ summary_text,
937
1105
  ),
938
- AC_ELEM("rich-text-body", {}, *list(elem)),
1106
+ AC_ELEM("rich-text-body", {}, *list(details)),
939
1107
  )
940
1108
 
941
1109
  def _transform_emoji(self, elem: ET._Element) -> ET._Element:
@@ -943,8 +1111,8 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
943
1111
  Inserts an inline emoji character.
944
1112
  """
945
1113
 
946
- shortname = elem.attrib.get("data-shortname", "")
947
- unicode = elem.attrib.get("data-unicode", None)
1114
+ shortname = elem.get("data-shortname", "")
1115
+ unicode = elem.get("data-unicode", None)
948
1116
  alt = elem.text or ""
949
1117
 
950
1118
  # <ac:emoticon ac:name="wink" ac:emoji-shortname=":wink:" ac:emoji-id="1f609" ac:emoji-fallback="&#128521;"/>
@@ -958,6 +1126,44 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
958
1126
  },
959
1127
  )
960
1128
 
1129
+ def _transform_mark(self, mark: ET._Element) -> ET._Element:
1130
+ """
1131
+ Adds inline highlighting to text.
1132
+ """
1133
+
1134
+ attrs = dict(mark.items())
1135
+ old_style = attrs.get("style")
1136
+ new_style = "background-color: rgb(254,222,200);"
1137
+ if old_style is not None:
1138
+ new_style += f" {old_style}"
1139
+ attrs["style"] = new_style
1140
+ span = HTML("span", attrs, *list(mark))
1141
+ span.text = mark.text
1142
+ return span
1143
+
1144
+ def _transform_latex(self, elem: ET._Element, context: FormattingContext) -> ET._Element:
1145
+ """
1146
+ Creates an image rendering of a LaTeX formula with Matplotlib.
1147
+ """
1148
+
1149
+ content = elem.text
1150
+ if not content:
1151
+ raise DocumentError("empty LaTeX formula")
1152
+
1153
+ image_data = render_latex(content, format=self.options.diagram_output_format)
1154
+ if self.options.diagram_output_format == "png":
1155
+ width, height = get_png_dimensions(data=image_data)
1156
+ image_data = remove_png_chunks(["pHYs"], source_data=image_data)
1157
+ attrs = ImageAttributes(context, width, height, content, None, "")
1158
+ else:
1159
+ attrs = ImageAttributes.empty(context)
1160
+
1161
+ image_hash = hashlib.md5(image_data).hexdigest()
1162
+ image_filename = attachment_name(f"formula_{image_hash}.{self.options.diagram_output_format}")
1163
+ self.embedded_files[image_filename] = EmbeddedFileData(image_data, content)
1164
+ image = self._create_attached_image(image_filename, attrs)
1165
+ return image
1166
+
961
1167
  def _transform_inline_math(self, elem: ET._Element) -> ET._Element:
962
1168
  """
963
1169
  Creates an inline LaTeX formula using the Confluence extension "LaTeX Math for Confluence - Math Formula & Equations".
@@ -965,12 +1171,15 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
965
1171
  :see: https://help.narva.net/latex-math-for-confluence/
966
1172
  """
967
1173
 
968
- content = elem.text or ""
1174
+ content = elem.text
969
1175
  if not content:
970
1176
  raise DocumentError("empty inline LaTeX formula")
971
1177
 
972
1178
  LOGGER.debug("Found inline LaTeX formula: %s", content)
973
1179
 
1180
+ if self.options.render_latex:
1181
+ return self._transform_latex(elem, FormattingContext.INLINE)
1182
+
974
1183
  local_id = str(uuid.uuid4())
975
1184
  macro_id = str(uuid.uuid4())
976
1185
  macro = AC_ELEM(
@@ -988,7 +1197,6 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
988
1197
  ),
989
1198
  AC_ELEM("parameter", {AC_ATTR("name"): "align"}, "center"),
990
1199
  )
991
- macro.tail = elem.tail # chain sibling text node that immediately follows original element
992
1200
  return macro
993
1201
 
994
1202
  def _transform_block_math(self, elem: ET._Element) -> ET._Element:
@@ -998,12 +1206,15 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
998
1206
  :see: https://help.narva.net/latex-math-for-confluence/
999
1207
  """
1000
1208
 
1001
- content = elem.text or ""
1209
+ content = elem.text
1002
1210
  if not content:
1003
1211
  raise DocumentError("empty block-level LaTeX formula")
1004
1212
 
1005
1213
  LOGGER.debug("Found block-level LaTeX formula: %s", content)
1006
1214
 
1215
+ if self.options.render_latex:
1216
+ return self._transform_latex(elem, FormattingContext.BLOCK)
1217
+
1007
1218
  local_id = str(uuid.uuid4())
1008
1219
  macro_id = str(uuid.uuid4())
1009
1220
 
@@ -1041,7 +1252,9 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
1041
1252
  raise DocumentError("expected: attribute `id` of format `fnref:NAME` applied on `<sup>` for a footnote reference")
1042
1253
  footnote_ref = ref_id.removeprefix("fnref:")
1043
1254
 
1044
- link = elem[0]
1255
+ link = next((elem.iterchildren(tag="a")), None)
1256
+ if link is None:
1257
+ raise DocumentError("expected: `<a>` as the first HTML element in a footnote reference")
1045
1258
  def_href = link.attrib.pop("href", "")
1046
1259
  if not def_href.startswith("#fn:"):
1047
1260
  raise DocumentError("expected: attribute `href` of format `#fn:NAME` applied on `<a>` for a footnote reference")
@@ -1095,18 +1308,28 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
1095
1308
  ```
1096
1309
  """
1097
1310
 
1098
- for list_item in elem[1]:
1311
+ ordered_list = next((elem.iterchildren(tag="ol")), None)
1312
+ if ordered_list is None:
1313
+ raise DocumentError("expected: `<ol>` as direct child of footnote definition block")
1314
+
1315
+ for list_item in ordered_list:
1316
+ if list_item.tag != "li":
1317
+ raise DocumentError("expected: `<li>` as children of `<ol>` in footnote definition block")
1318
+
1099
1319
  def_id = list_item.attrib.pop("id", "")
1100
1320
  if not def_id.startswith("fn:"):
1101
1321
  raise DocumentError("expected: attribute `id` of format `fn:NAME` applied on `<li>` for a footnote definition")
1102
1322
  footnote_def = def_id.removeprefix("fn:")
1103
1323
 
1104
- paragraph = list_item[0]
1105
- ref_anchor = paragraph[-1]
1106
- if ref_anchor.tag != "a":
1324
+ paragraph = next((list_item.iterchildren(tag="p")), None)
1325
+ if paragraph is None:
1326
+ raise DocumentError("expected: `<p>` as a child of `<li>` in a footnote definition")
1327
+
1328
+ ref_anchor = next((paragraph.iterchildren(tag="a", reversed=True)), None)
1329
+ if ref_anchor is None:
1107
1330
  raise DocumentError("expected: `<a>` as the last HTML element in a footnote definition")
1108
1331
 
1109
- ref_href = ref_anchor.attrib.get("href", "")
1332
+ ref_href = ref_anchor.get("href", "")
1110
1333
  if not ref_href.startswith("#fnref:"):
1111
1334
  raise DocumentError("expected: attribute `href` of format `#fnref:NAME` applied on last element `<a>` for a footnote definition")
1112
1335
  footnote_ref = ref_href.removeprefix("#fnref:")
@@ -1159,9 +1382,6 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
1159
1382
  if not element_text_starts_with_any(item, ["[ ]", "[x]", "[X]"]):
1160
1383
  raise DocumentError("expected: each `<li>` in a task list starting with [ ] or [x]")
1161
1384
 
1162
- # transform Markdown to Confluence within tasklist content
1163
- self.visit(elem)
1164
-
1165
1385
  tasks: list[ET._Element] = []
1166
1386
  for index, item in enumerate(elem, start=1):
1167
1387
  if item.text is None:
@@ -1171,11 +1391,13 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
1171
1391
  raise NotImplementedError("pre-condition check not exhaustive")
1172
1392
 
1173
1393
  status = "incomplete" if match.group(1).isspace() else "complete"
1394
+ item.text = item.text[3:]
1395
+
1396
+ # transform Markdown to Confluence within tasklist content
1397
+ self.visit(item)
1174
1398
 
1175
- body = AC_ELEM("task-body")
1176
- body.text = item.text[3:]
1177
- for child in item:
1178
- body.append(child)
1399
+ body = AC_ELEM("task-body", *list(item))
1400
+ body.text = item.text
1179
1401
  tasks.append(
1180
1402
  AC_ELEM(
1181
1403
  "task",
@@ -1194,47 +1416,32 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
1194
1416
  Transforms an HTML element tree obtained from a Markdown document into a Confluence Storage Format element tree.
1195
1417
  """
1196
1418
 
1197
- # normalize line breaks to regular space in element text
1419
+ # replace line breaks with regular space in element text to minimize phantom changes
1198
1420
  if child.text:
1199
- text: str = child.text
1200
- child.text = text.replace("\n", " ")
1421
+ child.text = child.text.replace("\n", " ")
1201
1422
  if child.tail:
1202
- tail: str = child.tail
1203
- child.tail = tail.replace("\n", " ")
1423
+ child.tail = child.tail.replace("\n", " ")
1204
1424
 
1205
1425
  if not isinstance(child.tag, str):
1206
1426
  return None
1207
1427
 
1208
- # <h1>...</h1>
1209
- # <h2>...</h2> ...
1210
- m = re.match(r"^h([1-6])$", child.tag, flags=re.IGNORECASE)
1211
- if m is not None:
1212
- level = int(m.group(1))
1213
- title = element_to_text(child)
1214
- self.toc.add(level, title)
1215
-
1216
- if self.options.heading_anchors:
1217
- self._transform_heading(child)
1218
- return None
1219
-
1220
1428
  # <p>...</p>
1221
1429
  if child.tag == "p":
1222
1430
  # <p><img src="..." /></p>
1223
- if len(child) == 1 and child[0].tag == "img":
1224
- return self._transform_image(child[0])
1431
+ if len(child) == 1 and not child.text and child[0].tag == "img" and not child[0].tail:
1432
+ return self._transform_image(FormattingContext.BLOCK, child[0])
1225
1433
 
1226
- # <p>[[_TOC_]]</p> (represented as <p>[[<em>TOC</em>]]</p>)
1227
- # <p>[TOC]</p>
1228
- elif element_to_text(child) in ["[[TOC]]", "[TOC]"]:
1434
+ # <p>[[<em>TOC</em>]]</p> (represented in Markdown as `[[_TOC_]]`)
1435
+ elif is_placeholder_for(child, "TOC"):
1229
1436
  return self._transform_toc(child)
1230
1437
 
1231
- # <p>[[_LISTING_]]</p> (represented as <p>[[<em>LISTING</em>]]</p>)
1232
- elif element_to_text(child) in ["[[LISTING]]", "[LISTING]"]:
1438
+ # <p>[[<em>LISTING</em>]]</p> (represented in Markdown as `[[_LISTING_]]`)
1439
+ elif is_placeholder_for(child, "LISTING"):
1233
1440
  return self._transform_listing(child)
1234
1441
 
1235
1442
  # <div>...</div>
1236
1443
  elif child.tag == "div":
1237
- classes = child.attrib.get("class", "").split(" ")
1444
+ classes = child.get("class", "").split(" ")
1238
1445
 
1239
1446
  # <div class="arithmatex">...</div>
1240
1447
  if "arithmatex" in classes:
@@ -1293,46 +1500,85 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
1293
1500
  elif child.tag == "details" and len(child) > 1 and child[0].tag == "summary":
1294
1501
  return self._transform_section(child)
1295
1502
 
1503
+ # <ol>...</ol>
1504
+ elif child.tag == "ol":
1505
+ # Confluence adds the attribute `start` for every ordered list
1506
+ child.set("start", "1")
1507
+ return None
1508
+
1296
1509
  # <ul>
1297
1510
  # <li>[ ] ...</li>
1298
1511
  # <li>[x] ...</li>
1299
1512
  # </ul>
1300
- elif child.tag == "ul" and len(child) > 0 and element_text_starts_with_any(child[0], ["[ ]", "[x]", "[X]"]):
1301
- return self._transform_tasklist(child)
1513
+ elif child.tag == "ul":
1514
+ if len(child) > 0 and element_text_starts_with_any(child[0], ["[ ]", "[x]", "[X]"]):
1515
+ return self._transform_tasklist(child)
1516
+
1517
+ return None
1518
+
1519
+ elif child.tag == "li":
1520
+ normalize_inline(child)
1521
+ return None
1302
1522
 
1303
1523
  # <pre><code class="language-java"> ... </code></pre>
1304
1524
  elif child.tag == "pre" and len(child) == 1 and child[0].tag == "code":
1305
1525
  return self._transform_code_block(child[0])
1306
1526
 
1527
+ # <table>...</table>
1528
+ elif child.tag == "table":
1529
+ child.set("data-layout", "default")
1530
+ return None
1531
+
1307
1532
  # <img src="..." alt="..." />
1308
1533
  elif child.tag == "img":
1309
- return self._transform_image(child)
1534
+ return self._transform_image(FormattingContext.INLINE, child)
1310
1535
 
1311
1536
  # <a href="..."> ... </a>
1312
1537
  elif child.tag == "a":
1313
1538
  return self._transform_link(child)
1314
1539
 
1540
+ # <mark>...</mark>
1541
+ elif child.tag == "mark":
1542
+ return self._transform_mark(child)
1543
+
1315
1544
  # <span>...</span>
1316
1545
  elif child.tag == "span":
1317
- classes = child.attrib.get("class", "").split(" ")
1546
+ classes = child.get("class", "").split(" ")
1318
1547
 
1319
1548
  # <span class="arithmatex">...</span>
1320
1549
  if "arithmatex" in classes:
1321
1550
  return self._transform_inline_math(child)
1322
1551
 
1323
1552
  # <sup id="fnref:NAME"><a class="footnote-ref" href="#fn:NAME">1</a></sup>
1324
- elif child.tag == "sup" and child.attrib.get("id", "").startswith("fnref:"):
1553
+ elif child.tag == "sup" and child.get("id", "").startswith("fnref:"):
1325
1554
  self._transform_footnote_ref(child)
1326
1555
  return None
1327
1556
 
1328
1557
  # <input type="date" value="1984-01-01" />
1329
- elif child.tag == "input" and child.attrib.get("type", "") == "date":
1330
- return HTML("time", {"datetime": child.attrib.get("value", "")})
1558
+ elif child.tag == "input" and child.get("type", "") == "date":
1559
+ return HTML("time", {"datetime": child.get("value", "")})
1560
+
1561
+ # <ins>...</ins>
1562
+ elif child.tag == "ins":
1563
+ # Confluence prefers <u> over <ins> for underline, and replaces <ins> with <u>
1564
+ child.tag = "u"
1331
1565
 
1332
1566
  # <x-emoji data-shortname="wink" data-unicode="1f609">😉</x-emoji>
1333
1567
  elif child.tag == "x-emoji":
1334
1568
  return self._transform_emoji(child)
1335
1569
 
1570
+ # <h1>...</h1>
1571
+ # <h2>...</h2> ...
1572
+ m = re.match(r"^h([1-6])$", child.tag, flags=re.IGNORECASE)
1573
+ if m is not None:
1574
+ level = int(m.group(1))
1575
+ title = element_to_text(child)
1576
+ self.toc.add(level, title)
1577
+
1578
+ if self.options.heading_anchors:
1579
+ self._transform_heading(child)
1580
+ return None
1581
+
1336
1582
  return None
1337
1583
 
1338
1584
 
@@ -1345,11 +1591,15 @@ class ConversionError(RuntimeError):
1345
1591
 
1346
1592
 
1347
1593
  class ConfluenceDocument:
1594
+ "Encapsulates an element tree for a Confluence document created by parsing a Markdown document."
1595
+
1348
1596
  title: Optional[str]
1349
1597
  labels: Optional[list[str]]
1350
1598
  properties: Optional[dict[str, JsonType]]
1599
+
1351
1600
  links: list[str]
1352
- images: list[Path]
1601
+ images: list[ImageData]
1602
+ embedded_files: dict[str, EmbeddedFileData]
1353
1603
 
1354
1604
  options: ConfluenceDocumentOptions
1355
1605
  root: ET._Element