markdown-to-confluence 0.4.3__py3-none-any.whl → 0.4.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
md2conf/converter.py CHANGED
@@ -6,60 +6,59 @@ Copyright 2022-2025, Levente Hunyadi
6
6
  :see: https://github.com/hunyadi/md2conf
7
7
  """
8
8
 
9
- # mypy: disable-error-code="dict-item"
10
-
11
9
  import dataclasses
10
+ import enum
12
11
  import hashlib
13
- import importlib.resources as resources
14
12
  import logging
15
13
  import os.path
16
14
  import re
17
15
  import uuid
16
+ from abc import ABC, abstractmethod
18
17
  from dataclasses import dataclass
19
18
  from pathlib import Path
20
- from typing import Any, Literal, Optional, Union
21
- from urllib.parse import ParseResult, quote_plus, urlparse, urlunparse
19
+ from typing import ClassVar, Literal, Optional, Union
20
+ from urllib.parse import ParseResult, quote_plus, urlparse
22
21
 
23
22
  import lxml.etree as ET
24
- from lxml.builder import ElementMaker
25
23
  from strong_typing.core import JsonType
26
24
 
27
25
  from . import drawio, mermaid
28
26
  from .collection import ConfluencePageCollection
27
+ from .csf import AC_ATTR, AC_ELEM, HTML, RI_ATTR, RI_ELEM, ParseError, elements_from_strings, elements_to_string, normalize_inline
29
28
  from .domain import ConfluenceDocumentOptions, ConfluencePageID
30
- from .extra import path_relative_to
29
+ from .extra import override, path_relative_to
30
+ from .latex import get_png_dimensions, remove_png_chunks, render_latex
31
31
  from .markdown import markdown_to_html
32
32
  from .metadata import ConfluenceSiteMetadata
33
33
  from .properties import PageError
34
34
  from .scanner import ScannedDocument, Scanner
35
-
36
- namespaces = {
37
- "ac": "http://atlassian.com/content",
38
- "ri": "http://atlassian.com/resource/identifier",
39
- }
40
- for key, value in namespaces.items():
41
- ET.register_namespace(key, value)
35
+ from .toc import TableOfContentsBuilder
36
+ from .uri import is_absolute_url, to_uuid_urn
37
+ from .xml import element_to_text
42
38
 
43
39
 
44
- def get_volatile_attributes() -> list[ET.QName]:
40
+ def get_volatile_attributes() -> list[str]:
45
41
  "Returns a list of volatile attributes that frequently change as a Confluence storage format XHTML document is updated."
46
42
 
47
43
  return [
48
- ET.QName(namespaces["ac"], "local-id"),
49
- ET.QName(namespaces["ac"], "macro-id"),
50
- ET.QName(namespaces["ri"], "version-at-save"),
44
+ AC_ATTR("local-id"),
45
+ AC_ATTR("macro-id"),
46
+ RI_ATTR("version-at-save"),
51
47
  ]
52
48
 
53
49
 
54
- HTML = ElementMaker()
55
- AC = ElementMaker(namespace=namespaces["ac"])
56
- RI = ElementMaker(namespace=namespaces["ri"])
50
+ def get_volatile_elements() -> list[str]:
51
+ "Returns a list of volatile elements whose content frequently changes as a Confluence storage format XHTML document is updated."
57
52
 
58
- LOGGER = logging.getLogger(__name__)
53
+ return [AC_ATTR("task-uuid")]
59
54
 
60
55
 
61
- class ParseError(RuntimeError):
62
- pass
56
+ status_images: dict[str, str] = {
57
+ to_uuid_urn(f'<svg height="10" width="10" xmlns="http://www.w3.org/2000/svg"><circle r="5" cx="5" cy="5" fill="{color}" /></svg>'): color
58
+ for color in ["gray", "purple", "blue", "red", "yellow", "green"]
59
+ }
60
+
61
+ LOGGER = logging.getLogger(__name__)
63
62
 
64
63
 
65
64
  def starts_with_any(text: str, prefixes: list[str]) -> bool:
@@ -71,16 +70,6 @@ def starts_with_any(text: str, prefixes: list[str]) -> bool:
71
70
  return False
72
71
 
73
72
 
74
- def is_absolute_url(url: str) -> bool:
75
- urlparts = urlparse(url)
76
- return bool(urlparts.scheme) or bool(urlparts.netloc)
77
-
78
-
79
- def is_relative_url(url: str) -> bool:
80
- urlparts = urlparse(url)
81
- return not bool(urlparts.scheme) and not bool(urlparts.netloc)
82
-
83
-
84
73
  def is_directory_within(absolute_path: Path, base_path: Path) -> bool:
85
74
  "True if the absolute path is nested within the base path."
86
75
 
@@ -100,132 +89,94 @@ def encode_title(text: str) -> str:
100
89
  return quote_plus(text.strip())
101
90
 
102
91
 
103
- def _elements_from_strings(dtd_path: Path, items: list[str]) -> ET._Element:
104
- """
105
- Creates a fragment of several XML nodes from their string representation wrapped in a root element.
106
-
107
- :param dtd_path: Path to a DTD document that defines entities like &cent; or &copy;.
108
- :param items: Strings to parse into XML fragments.
109
- :returns: An XML document as an element tree.
110
- """
111
-
112
- parser = ET.XMLParser(
113
- remove_blank_text=True,
114
- remove_comments=True,
115
- strip_cdata=False,
116
- load_dtd=True,
117
- )
92
+ # supported code block languages, for which syntax highlighting is available
93
+ _LANGUAGES = {
94
+ "abap": "abap",
95
+ "actionscript3": "actionscript3",
96
+ "ada": "ada",
97
+ "applescript": "applescript",
98
+ "arduino": "arduino",
99
+ "autoit": "autoit",
100
+ "bash": "bash",
101
+ "c": "c",
102
+ "c#": "c#",
103
+ "clojure": "clojure",
104
+ "coffeescript": "coffeescript",
105
+ "coldfusion": "coldfusion",
106
+ "cpp": "cpp",
107
+ "csharp": "c#",
108
+ "css": "css",
109
+ "cuda": "cuda",
110
+ "d": "d",
111
+ "dart": "dart",
112
+ "delphi": "delphi",
113
+ "diff": "diff",
114
+ "elixir": "elixir",
115
+ "erl": "erl",
116
+ "erlang": "erl",
117
+ "fortran": "fortran",
118
+ "foxpro": "foxpro",
119
+ "go": "go",
120
+ "graphql": "graphql",
121
+ "groovy": "groovy",
122
+ "haskell": "haskell",
123
+ "haxe": "haxe",
124
+ "html": "html",
125
+ "java": "java",
126
+ "javafx": "javafx",
127
+ "javascript": "js",
128
+ "js": "js",
129
+ "json": "json",
130
+ "jsx": "jsx",
131
+ "julia": "julia",
132
+ "kotlin": "kotlin",
133
+ "livescript": "livescript",
134
+ "lua": "lua",
135
+ "mermaid": "mermaid",
136
+ "mathematica": "mathematica",
137
+ "matlab": "matlab",
138
+ "objectivec": "objectivec",
139
+ "objectivej": "objectivej",
140
+ "ocaml": "ocaml",
141
+ "octave": "octave",
142
+ "pascal": "pascal",
143
+ "perl": "perl",
144
+ "php": "php",
145
+ "powershell": "powershell",
146
+ "prolog": "prolog",
147
+ "puppet": "puppet",
148
+ "py": "py",
149
+ "python": "py",
150
+ "qml": "qml",
151
+ "r": "r",
152
+ "racket": "racket",
153
+ "rst": "rst",
154
+ "ruby": "ruby",
155
+ "rust": "rust",
156
+ "sass": "sass",
157
+ "scala": "scala",
158
+ "scheme": "scheme",
159
+ "shell": "shell",
160
+ "smalltalk": "smalltalk",
161
+ "splunk": "splunk",
162
+ "sql": "sql",
163
+ "standardml": "standardml",
164
+ "swift": "swift",
165
+ "tcl": "tcl",
166
+ "tex": "tex",
167
+ "tsx": "tsx",
168
+ "typescript": "typescript",
169
+ "vala": "vala",
170
+ "vb": "vb",
171
+ "verilog": "verilog",
172
+ "vhdl": "vhdl",
173
+ "xml": "xml",
174
+ "xquery": "xquery",
175
+ "yaml": "yaml",
176
+ }
118
177
 
119
- ns_attr_list = "".join(f' xmlns:{key}="{value}"' for key, value in namespaces.items())
120
178
 
121
- data = [
122
- '<?xml version="1.0"?>',
123
- f'<!DOCTYPE ac:confluence PUBLIC "-//Atlassian//Confluence 4 Page//EN" "{dtd_path.as_posix()}"><root{ns_attr_list}>',
124
- ]
125
- data.extend(items)
126
- data.append("</root>")
127
-
128
- try:
129
- return ET.fromstringlist(data, parser=parser)
130
- except ET.XMLSyntaxError as ex:
131
- raise ParseError() from ex
132
-
133
-
134
- def elements_from_strings(items: list[str]) -> ET._Element:
135
- "Creates a fragment of several XML nodes from their string representation wrapped in a root element."
136
-
137
- resource_path = resources.files(__package__).joinpath("entities.dtd")
138
- with resources.as_file(resource_path) as dtd_path:
139
- return _elements_from_strings(dtd_path, items)
140
-
141
-
142
- def elements_from_string(content: str) -> ET._Element:
143
- return elements_from_strings([content])
144
-
145
-
146
- _languages = [
147
- "abap",
148
- "actionscript3",
149
- "ada",
150
- "applescript",
151
- "arduino",
152
- "autoit",
153
- "bash",
154
- "c",
155
- "clojure",
156
- "coffeescript",
157
- "coldfusion",
158
- "cpp",
159
- "csharp",
160
- "css",
161
- "cuda",
162
- "d",
163
- "dart",
164
- "delphi",
165
- "diff",
166
- "elixir",
167
- "erlang",
168
- "fortran",
169
- "foxpro",
170
- "go",
171
- "graphql",
172
- "groovy",
173
- "haskell",
174
- "haxe",
175
- "html",
176
- "java",
177
- "javafx",
178
- "javascript",
179
- "json",
180
- "jsx",
181
- "julia",
182
- "kotlin",
183
- "livescript",
184
- "lua",
185
- "mermaid",
186
- "mathematica",
187
- "matlab",
188
- "objectivec",
189
- "objectivej",
190
- "ocaml",
191
- "octave",
192
- "pascal",
193
- "perl",
194
- "php",
195
- "powershell",
196
- "prolog",
197
- "puppet",
198
- "python",
199
- "qml",
200
- "r",
201
- "racket",
202
- "rst",
203
- "ruby",
204
- "rust",
205
- "sass",
206
- "scala",
207
- "scheme",
208
- "shell",
209
- "smalltalk",
210
- "splunk",
211
- "sql",
212
- "standardml",
213
- "swift",
214
- "tcl",
215
- "tex",
216
- "tsx",
217
- "typescript",
218
- "vala",
219
- "vb",
220
- "verilog",
221
- "vhdl",
222
- "xml",
223
- "xquery",
224
- "yaml",
225
- ]
226
-
227
-
228
- class NodeVisitor:
179
+ class NodeVisitor(ABC):
229
180
  def visit(self, node: ET._Element) -> None:
230
181
  "Recursively visits all descendants of this node."
231
182
 
@@ -236,12 +187,17 @@ class NodeVisitor:
236
187
  source = node[index]
237
188
  target = self.transform(source)
238
189
  if target is not None:
190
+ # chain sibling text node that immediately follows original element
191
+ target.tail = source.tail
192
+ source.tail = None
193
+
194
+ # replace original element with transformed element
239
195
  node[index] = target
240
196
  else:
241
197
  self.visit(source)
242
198
 
243
- def transform(self, child: ET._Element) -> Optional[ET._Element]:
244
- pass
199
+ @abstractmethod
200
+ def transform(self, child: ET._Element) -> Optional[ET._Element]: ...
245
201
 
246
202
 
247
203
  def title_to_identifier(title: str) -> str:
@@ -253,58 +209,107 @@ def title_to_identifier(title: str) -> str:
253
209
  return s
254
210
 
255
211
 
256
- def element_to_text(node: ET._Element) -> str:
257
- "Returns all text contained in an element as a concatenated string."
212
+ def element_text_starts_with_any(node: ET._Element, prefixes: list[str]) -> bool:
213
+ "True if the text contained in an element starts with any of the specified prefix strings."
258
214
 
259
- return "".join(node.itertext()).strip()
215
+ if node.text is None:
216
+ return False
217
+ return starts_with_any(node.text, prefixes)
260
218
 
261
219
 
262
- @dataclass
263
- class ImageAttributes:
264
- caption: Optional[str]
265
- width: Optional[str]
266
- height: Optional[str]
220
+ def is_placeholder_for(node: ET._Element, name: str) -> bool:
221
+ """
222
+ Identifies a Confluence widget placeholder, e.g. `[[_TOC_]]` or `[[_LISTING_]]`.
267
223
 
224
+ :param node: The element to check.
225
+ :param name: The placeholder name.
226
+ """
268
227
 
269
- @dataclass
270
- class TableOfContentsEntry:
271
- level: int
272
- text: str
228
+ # `[[_TOC_]]` is represented in HTML as <p>[[<em>TOC</em>]]</p>
229
+ if node.text != "[[" or len(node) != 1:
230
+ return False
273
231
 
232
+ child = node[0]
233
+ if child.tag != "em" or child.text != name or child.tail != "]]":
234
+ return False
274
235
 
275
- class TableOfContents:
276
- "Builds a table of contents from Markdown headings."
236
+ return True
277
237
 
278
- headings: list[TableOfContentsEntry]
279
238
 
280
- def __init__(self) -> None:
281
- self.headings = []
239
+ @enum.unique
240
+ class FormattingContext(enum.Enum):
241
+ "Identifies the formatting context for the element."
282
242
 
283
- def add(self, level: int, text: str) -> None:
284
- """
285
- Adds a heading to the table of contents.
243
+ BLOCK = "block"
244
+ INLINE = "inline"
286
245
 
287
- :param level: Markdown heading level (e.g. `1` for first-level heading).
288
- :param text: Markdown heading text.
289
- """
290
246
 
291
- self.headings.append(TableOfContentsEntry(level, text))
247
+ @dataclass
248
+ class ImageAttributes:
249
+ """
250
+ Attributes applied to an `<img>` element.
251
+
252
+ :param context: Identifies the formatting context for the element (block or inline).
253
+ :param width: Natural image width in pixels.
254
+ :param height: Natural image height in pixels.
255
+ :param alt: Alternate text.
256
+ :param title: Title text (a.k.a. image tooltip).
257
+ :param caption: Caption text (shown below figure).
258
+ """
292
259
 
293
- def get_title(self) -> Optional[str]:
294
- """
295
- Returns a proposed document title (if unique).
260
+ context: FormattingContext
261
+ width: Optional[int]
262
+ height: Optional[int]
263
+ alt: Optional[str]
264
+ title: Optional[str]
265
+ caption: Optional[str]
296
266
 
297
- :returns: Title text, or `None` if no unique title can be inferred.
298
- """
267
+ def __post_init__(self) -> None:
268
+ if self.caption is None and self.context is FormattingContext.BLOCK:
269
+ self.caption = self.title or self.alt
270
+
271
+ def as_dict(self) -> dict[str, str]:
272
+ attributes: dict[str, str] = {}
273
+ if self.context is FormattingContext.BLOCK:
274
+ attributes[AC_ATTR("align")] = "center"
275
+ attributes[AC_ATTR("layout")] = "center"
276
+ if self.width is not None:
277
+ attributes[AC_ATTR("original-width")] = str(self.width)
278
+ if self.height is not None:
279
+ attributes[AC_ATTR("original-height")] = str(self.height)
280
+ if self.width is not None:
281
+ attributes[AC_ATTR("custom-width")] = "true"
282
+ attributes[AC_ATTR("width")] = str(self.width)
283
+
284
+ elif self.context is FormattingContext.INLINE:
285
+ if self.width is not None:
286
+ attributes[AC_ATTR("width")] = str(self.width)
287
+ if self.height is not None:
288
+ attributes[AC_ATTR("height")] = str(self.height)
289
+ else:
290
+ raise NotImplementedError("match not exhaustive for enumeration")
299
291
 
300
- for level in range(1, 7):
301
- try:
302
- (title,) = (item.text for item in self.headings if item.level == level)
303
- return title
304
- except ValueError:
305
- pass
292
+ if self.alt is not None:
293
+ attributes.update({AC_ATTR("alt"): self.alt})
294
+ if self.title is not None:
295
+ attributes.update({AC_ATTR("title"): self.title})
296
+ return attributes
306
297
 
307
- return None
298
+ EMPTY_BLOCK: ClassVar["ImageAttributes"]
299
+ EMPTY_INLINE: ClassVar["ImageAttributes"]
300
+
301
+ @classmethod
302
+ def empty(cls, context: FormattingContext) -> "ImageAttributes":
303
+ if context is FormattingContext.BLOCK:
304
+ return cls.EMPTY_BLOCK
305
+ elif context is FormattingContext.INLINE:
306
+ return cls.EMPTY_INLINE
307
+ else:
308
+ raise NotImplementedError("match not exhaustive for enumeration")
309
+
310
+
311
+ ImageAttributes.EMPTY_BLOCK = ImageAttributes(FormattingContext.BLOCK, None, None, None, None, None)
312
+ ImageAttributes.EMPTY_INLINE = ImageAttributes(FormattingContext.INLINE, None, None, None, None, None)
308
313
 
309
314
 
310
315
  @dataclass
@@ -319,6 +324,7 @@ class ConfluenceConverterOptions:
319
324
  :param prefer_raster: Whether to choose PNG files over SVG files when available.
320
325
  :param render_drawio: Whether to pre-render (or use the pre-rendered version of) draw.io diagrams.
321
326
  :param render_mermaid: Whether to pre-render Mermaid diagrams into PNG/SVG images.
327
+ :param render_latex: Whether to pre-render LaTeX formulas into PNG/SVG images.
322
328
  :param diagram_output_format: Target image format for diagrams.
323
329
  :param webui_links: When true, convert relative URLs to Confluence Web UI links.
324
330
  """
@@ -328,10 +334,23 @@ class ConfluenceConverterOptions:
328
334
  prefer_raster: bool = True
329
335
  render_drawio: bool = False
330
336
  render_mermaid: bool = False
337
+ render_latex: bool = False
331
338
  diagram_output_format: Literal["png", "svg"] = "png"
332
339
  webui_links: bool = False
333
340
 
334
341
 
342
+ @dataclass
343
+ class ImageData:
344
+ path: Path
345
+ description: Optional[str] = None
346
+
347
+
348
+ @dataclass
349
+ class EmbeddedFileData:
350
+ data: bytes
351
+ description: Optional[str] = None
352
+
353
+
335
354
  class ConfluenceStorageFormatConverter(NodeVisitor):
336
355
  "Transforms a plain HTML tree into Confluence Storage Format."
337
356
 
@@ -339,10 +358,10 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
339
358
  path: Path
340
359
  base_dir: Path
341
360
  root_dir: Path
342
- toc: TableOfContents
361
+ toc: TableOfContentsBuilder
343
362
  links: list[str]
344
- images: list[Path]
345
- embedded_images: dict[str, bytes]
363
+ images: list[ImageData]
364
+ embedded_files: dict[str, EmbeddedFileData]
346
365
  site_metadata: ConfluenceSiteMetadata
347
366
  page_metadata: ConfluencePageCollection
348
367
 
@@ -363,28 +382,40 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
363
382
  self.path = path
364
383
  self.base_dir = path.parent
365
384
  self.root_dir = root_dir
366
- self.toc = TableOfContents()
385
+ self.toc = TableOfContentsBuilder()
367
386
  self.links = []
368
387
  self.images = []
369
- self.embedded_images = {}
388
+ self.embedded_files = {}
370
389
  self.site_metadata = site_metadata
371
390
  self.page_metadata = page_metadata
372
391
 
373
392
  def _transform_heading(self, heading: ET._Element) -> None:
374
- "Adds anchors to headings in the same document (if *heading anchors* is enabled)."
393
+ """
394
+ Adds anchors to headings in the same document (if *heading anchors* is enabled).
395
+
396
+ Original:
397
+ ```
398
+ <h1>Heading text</h1>
399
+ ```
400
+
401
+ Transformed:
402
+ ```
403
+ <h1><structured-macro name="anchor">...</structured-macro>Heading text</h1>
404
+ ```
405
+ """
375
406
 
376
407
  for e in heading:
377
408
  self.visit(e)
378
409
 
379
- anchor = AC(
410
+ anchor = AC_ELEM(
380
411
  "structured-macro",
381
412
  {
382
- ET.QName(namespaces["ac"], "name"): "anchor",
383
- ET.QName(namespaces["ac"], "schema-version"): "1",
413
+ AC_ATTR("name"): "anchor",
414
+ AC_ATTR("schema-version"): "1",
384
415
  },
385
- AC(
416
+ AC_ELEM(
386
417
  "parameter",
387
- {ET.QName(namespaces["ac"], "name"): ""},
418
+ {AC_ATTR("name"): ""},
388
419
  title_to_identifier(element_to_text(heading)),
389
420
  ),
390
421
  )
@@ -395,7 +426,7 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
395
426
  heading.text = None
396
427
 
397
428
  def _warn_or_raise(self, msg: str) -> None:
398
- "Emit a warning or raise an exception when a path points to a resource that doesn't exist."
429
+ "Emit a warning or raise an exception when a path points to a resource that doesn't exist or is outside of the permitted hierarchy."
399
430
 
400
431
  if self.options.ignore_invalid_url:
401
432
  LOGGER.warning(msg)
@@ -411,7 +442,10 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
411
442
  * Links to documents in the source hierarchy are mapped into full Confluence URLs.
412
443
  """
413
444
 
414
- url = anchor.attrib.get("href")
445
+ # Confluence doesn't support `title` attribute on `<a>` elements
446
+ anchor.attrib.pop("title", None)
447
+
448
+ url = anchor.get("href")
415
449
  if url is None or is_absolute_url(url):
416
450
  return None
417
451
 
@@ -419,46 +453,52 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
419
453
  relative_url: ParseResult = urlparse(url)
420
454
 
421
455
  if not relative_url.scheme and not relative_url.netloc and not relative_url.path and not relative_url.params and not relative_url.query:
422
- LOGGER.debug("Found local URL: %s", url)
456
+ LOGGER.debug("Found same-page URL: %s", url)
423
457
  if self.options.heading_anchors:
424
458
  # <ac:link ac:anchor="anchor"><ac:link-body>...</ac:link-body></ac:link>
425
459
  target = relative_url.fragment.lstrip("#")
426
- link_body = AC("link-body", {}, *list(anchor))
460
+ link_body = AC_ELEM("link-body", {}, *list(anchor))
427
461
  link_body.text = anchor.text
428
- link_wrapper = AC(
462
+ link_wrapper = AC_ELEM(
429
463
  "link",
430
464
  {
431
- ET.QName(namespaces["ac"], "anchor"): target,
465
+ AC_ATTR("anchor"): target,
432
466
  },
433
467
  link_body,
434
468
  )
435
- link_wrapper.tail = anchor.tail
436
469
  return link_wrapper
437
470
  else:
438
471
  return None
439
472
 
440
- # convert the relative URL to absolute URL based on the base path value, then look up
441
- # the absolute path in the page metadata dictionary to discover the relative path
442
- # within Confluence that should be used
473
+ # discard original value: relative links always require transformation
474
+ anchor.attrib.pop("href")
475
+
476
+ # convert the relative URL to absolute path based on the base path value
443
477
  absolute_path = (self.base_dir / relative_url.path).resolve()
478
+
479
+ # look up the absolute path in the page metadata dictionary to discover the relative path within Confluence that should be used
444
480
  if not is_directory_within(absolute_path, self.root_dir):
445
- anchor.attrib.pop("href")
446
481
  self._warn_or_raise(f"relative URL {url} points to outside root path: {self.root_dir}")
447
482
  return None
448
483
 
484
+ if absolute_path.suffix == ".md":
485
+ return self._transform_page_link(anchor, relative_url, absolute_path)
486
+ else:
487
+ return self._transform_attachment_link(anchor, absolute_path)
488
+
489
+ def _transform_page_link(self, anchor: ET._Element, relative_url: ParseResult, absolute_path: Path) -> Optional[ET._Element]:
490
+ """
491
+ Transforms links to other Markdown documents (Confluence pages).
492
+ """
493
+
449
494
  link_metadata = self.page_metadata.get(absolute_path)
450
495
  if link_metadata is None:
451
- msg = f"unable to find matching page for URL: {url}"
452
- if self.options.ignore_invalid_url:
453
- LOGGER.warning(msg)
454
- anchor.attrib.pop("href")
455
- return None
456
- else:
457
- raise DocumentError(msg)
496
+ self._warn_or_raise(f"unable to find matching page for URL: {relative_url.geturl()}")
497
+ return None
458
498
 
459
499
  relative_path = os.path.relpath(absolute_path, self.base_dir)
460
500
  LOGGER.debug("Found link to page %s with metadata: %s", relative_path, link_metadata)
461
- self.links.append(url)
501
+ self.links.append(relative_url.geturl())
462
502
 
463
503
  if self.options.webui_links:
464
504
  page_url = f"{self.site_metadata.base_path}pages/viewpage.action?pageId={link_metadata.page_id}"
@@ -470,7 +510,7 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
470
510
 
471
511
  page_url = f"{self.site_metadata.base_path}spaces/{space_key}/pages/{link_metadata.page_id}/{encode_title(link_metadata.title)}"
472
512
 
473
- components = ParseResult(
513
+ transformed_url = ParseResult(
474
514
  scheme="https",
475
515
  netloc=self.site_metadata.domain,
476
516
  path=page_url,
@@ -478,24 +518,83 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
478
518
  query="",
479
519
  fragment=relative_url.fragment,
480
520
  )
481
- transformed_url = urlunparse(components)
482
521
 
483
- LOGGER.debug("Transformed relative URL: %s to URL: %s", url, transformed_url)
484
- anchor.attrib["href"] = transformed_url
522
+ LOGGER.debug("Transformed relative URL: %s to URL: %s", relative_url.geturl(), transformed_url.geturl())
523
+ anchor.set("href", transformed_url.geturl())
485
524
  return None
486
525
 
487
- def _transform_image(self, image: ET._Element) -> ET._Element:
488
- "Inserts an attached or external image."
526
+ def _transform_attachment_link(self, anchor: ET._Element, absolute_path: Path) -> Optional[ET._Element]:
527
+ """
528
+ Transforms links to document binaries such as PDF, DOCX or XLSX.
529
+ """
530
+
531
+ if not absolute_path.exists():
532
+ self._warn_or_raise(f"relative URL points to non-existing file: {absolute_path}")
533
+ return None
534
+
535
+ file_name = attachment_name(path_relative_to(absolute_path, self.base_dir))
536
+ self.images.append(ImageData(absolute_path))
537
+
538
+ link_body = AC_ELEM("link-body", {}, *list(anchor))
539
+ link_body.text = anchor.text
540
+ link_wrapper = AC_ELEM(
541
+ "link",
542
+ {},
543
+ RI_ELEM("attachment", {RI_ATTR("filename"): file_name}),
544
+ link_body,
545
+ )
546
+ return link_wrapper
547
+
548
+ def _transform_status(self, color: str, caption: str) -> ET._Element:
549
+ macro_id = str(uuid.uuid4())
550
+ attributes = {
551
+ AC_ATTR("name"): "status",
552
+ AC_ATTR("schema-version"): "1",
553
+ AC_ATTR("macro-id"): macro_id,
554
+ }
555
+ if color != "gray":
556
+ return AC_ELEM(
557
+ "structured-macro",
558
+ attributes,
559
+ AC_ELEM(
560
+ "parameter",
561
+ {AC_ATTR("name"): "colour"},
562
+ color.title(),
563
+ ),
564
+ AC_ELEM(
565
+ "parameter",
566
+ {AC_ATTR("name"): "title"},
567
+ caption,
568
+ ),
569
+ )
570
+ else:
571
+ return AC_ELEM(
572
+ "structured-macro",
573
+ attributes,
574
+ AC_ELEM(
575
+ "parameter",
576
+ {AC_ATTR("name"): "title"},
577
+ caption,
578
+ ),
579
+ )
489
580
 
490
- src = image.attrib.get("src")
581
+ def _transform_image(self, context: FormattingContext, image: ET._Element) -> ET._Element:
582
+ "Inserts an attached or external image."
491
583
 
584
+ src = image.get("src")
492
585
  if not src:
493
586
  raise DocumentError("image lacks `src` attribute")
494
587
 
495
- caption = image.attrib.get("alt")
496
- width = image.attrib.get("width")
497
- height = image.attrib.get("height")
498
- attrs = ImageAttributes(caption, width, height)
588
+ alt = image.get("alt")
589
+ if alt is not None and src.startswith("urn:uuid:") and (color := status_images.get(src)) is not None:
590
+ return self._transform_status(color, alt)
591
+
592
+ title = image.get("title")
593
+ width = image.get("width")
594
+ height = image.get("height")
595
+ pixel_width = int(width) if width is not None and width.isdecimal() else None
596
+ pixel_height = int(height) if height is not None and height.isdecimal() else None
597
+ attrs = ImageAttributes(context, pixel_width, pixel_height, alt, title, None)
499
598
 
500
599
  if is_absolute_url(src):
501
600
  return self._transform_external_image(src, attrs)
@@ -504,39 +603,32 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
504
603
 
505
604
  absolute_path = self._verify_image_path(path)
506
605
  if absolute_path is None:
507
- return self._create_missing(path, caption)
606
+ return self._create_missing(path, attrs.caption)
508
607
 
509
608
  if absolute_path.name.endswith(".drawio.png") or absolute_path.name.endswith(".drawio.svg"):
510
609
  return self._transform_drawio_image(absolute_path, attrs)
511
610
  elif absolute_path.name.endswith(".drawio.xml") or absolute_path.name.endswith(".drawio"):
512
611
  return self._transform_drawio(absolute_path, attrs)
612
+ elif absolute_path.name.endswith(".mmd") or absolute_path.name.endswith(".mermaid"):
613
+ return self._transform_external_mermaid(absolute_path, attrs)
513
614
  else:
514
615
  return self._transform_attached_image(absolute_path, attrs)
515
616
 
516
617
  def _transform_external_image(self, url: str, attrs: ImageAttributes) -> ET._Element:
517
618
  "Emits Confluence Storage Format XHTML for an external image."
518
619
 
519
- attributes: dict[str, Any] = {
520
- ET.QName(namespaces["ac"], "align"): "center",
521
- ET.QName(namespaces["ac"], "layout"): "center",
522
- }
523
- if attrs.width is not None:
524
- attributes.update({ET.QName(namespaces["ac"], "width"): attrs.width})
525
- if attrs.height is not None:
526
- attributes.update({ET.QName(namespaces["ac"], "height"): attrs.height})
527
-
528
620
  elements: list[ET._Element] = []
529
621
  elements.append(
530
- RI(
622
+ RI_ELEM(
531
623
  "url",
532
624
  # refers to an external image
533
- {ET.QName(namespaces["ri"], "value"): url},
625
+ {RI_ATTR("value"): url},
534
626
  )
535
627
  )
536
- if attrs.caption is not None:
537
- elements.append(AC("caption", HTML.p(attrs.caption)))
628
+ if attrs.caption:
629
+ elements.append(AC_ELEM("caption", attrs.caption))
538
630
 
539
- return AC("image", attributes, *elements)
631
+ return AC_ELEM("image", attrs.as_dict(), *elements)
540
632
 
541
633
  def _verify_image_path(self, path: Path) -> Optional[Path]:
542
634
  "Checks whether an image path is safe to use."
@@ -557,13 +649,13 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
557
649
  def _transform_attached_image(self, absolute_path: Path, attrs: ImageAttributes) -> ET._Element:
558
650
  "Emits Confluence Storage Format XHTML for an attached raster or vector image."
559
651
 
560
- if self.options.prefer_raster and absolute_path.name.endswith(".svg"):
652
+ if self.options.prefer_raster and absolute_path.suffix == ".svg":
561
653
  # prefer PNG over SVG; Confluence displays SVG in wrong size, and text labels are truncated
562
654
  png_file = absolute_path.with_suffix(".png")
563
655
  if png_file.exists():
564
656
  absolute_path = png_file
565
657
 
566
- self.images.append(absolute_path)
658
+ self.images.append(ImageData(absolute_path, attrs.alt))
567
659
  image_name = attachment_name(path_relative_to(absolute_path, self.base_dir))
568
660
  return self._create_attached_image(image_name, attrs)
569
661
 
@@ -573,15 +665,15 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
573
665
  if not absolute_path.name.endswith(".drawio.xml") and not absolute_path.name.endswith(".drawio"):
574
666
  raise DocumentError("invalid image format; expected: `*.drawio.xml` or `*.drawio`")
575
667
 
668
+ relative_path = path_relative_to(absolute_path, self.base_dir)
576
669
  if self.options.render_drawio:
577
670
  image_data = drawio.render_diagram(absolute_path, self.options.diagram_output_format)
578
- image_hash = hashlib.md5(image_data).hexdigest()
579
- image_filename = attachment_name(f"embedded_{image_hash}.{self.options.diagram_output_format}")
580
- self.embedded_images[image_filename] = image_data
671
+ image_filename = attachment_name(relative_path.with_suffix(f".{self.options.diagram_output_format}"))
672
+ self.embedded_files[image_filename] = EmbeddedFileData(image_data, attrs.alt)
581
673
  return self._create_attached_image(image_filename, attrs)
582
674
  else:
583
- self.images.append(absolute_path)
584
- image_filename = attachment_name(path_relative_to(absolute_path, self.base_dir))
675
+ self.images.append(ImageData(absolute_path, attrs.alt))
676
+ image_filename = attachment_name(relative_path)
585
677
  return self._create_drawio(image_filename, attrs)
586
678
 
587
679
  def _transform_drawio_image(self, absolute_path: Path, attrs: ImageAttributes) -> ET._Element:
@@ -596,72 +688,63 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
596
688
  # extract embedded editable diagram and upload as *.drawio
597
689
  image_data = drawio.extract_diagram(absolute_path)
598
690
  image_filename = attachment_name(path_relative_to(absolute_path.with_suffix(".xml"), self.base_dir))
599
- self.embedded_images[image_filename] = image_data
691
+ self.embedded_files[image_filename] = EmbeddedFileData(image_data, attrs.alt)
600
692
 
601
693
  return self._create_drawio(image_filename, attrs)
602
694
 
603
695
  def _create_attached_image(self, image_name: str, attrs: ImageAttributes) -> ET._Element:
604
696
  "An image embedded into the page, linking to an attachment."
605
697
 
606
- attributes: dict[str, Any] = {
607
- ET.QName(namespaces["ac"], "align"): "center",
608
- ET.QName(namespaces["ac"], "layout"): "center",
609
- }
610
- if attrs.width is not None:
611
- attributes.update({ET.QName(namespaces["ac"], "width"): attrs.width})
612
- if attrs.height is not None:
613
- attributes.update({ET.QName(namespaces["ac"], "height"): attrs.height})
614
-
615
698
  elements: list[ET._Element] = []
616
699
  elements.append(
617
- RI(
700
+ RI_ELEM(
618
701
  "attachment",
619
702
  # refers to an attachment uploaded alongside the page
620
- {ET.QName(namespaces["ri"], "filename"): image_name},
703
+ {RI_ATTR("filename"): image_name},
621
704
  )
622
705
  )
623
- if attrs.caption is not None:
624
- elements.append(AC("caption", HTML.p(attrs.caption)))
706
+ if attrs.caption:
707
+ elements.append(AC_ELEM("caption", attrs.caption))
625
708
 
626
- return AC("image", attributes, *elements)
709
+ return AC_ELEM("image", attrs.as_dict(), *elements)
627
710
 
628
711
  def _create_drawio(self, filename: str, attrs: ImageAttributes) -> ET._Element:
629
712
  "A draw.io diagram embedded into the page, linking to an attachment."
630
713
 
631
714
  parameters: list[ET._Element] = [
632
- AC(
715
+ AC_ELEM(
633
716
  "parameter",
634
- {ET.QName(namespaces["ac"], "name"): "diagramName"},
717
+ {AC_ATTR("name"): "diagramName"},
635
718
  filename,
636
719
  ),
637
720
  ]
638
721
  if attrs.width is not None:
639
722
  parameters.append(
640
- AC(
723
+ AC_ELEM(
641
724
  "parameter",
642
- {ET.QName(namespaces["ac"], "name"): "width"},
643
- attrs.width,
725
+ {AC_ATTR("name"): "width"},
726
+ str(attrs.width),
644
727
  ),
645
728
  )
646
729
  if attrs.height is not None:
647
730
  parameters.append(
648
- AC(
731
+ AC_ELEM(
649
732
  "parameter",
650
- {ET.QName(namespaces["ac"], "name"): "height"},
651
- attrs.height,
733
+ {AC_ATTR("name"): "height"},
734
+ str(attrs.height),
652
735
  ),
653
736
  )
654
737
 
655
738
  local_id = str(uuid.uuid4())
656
739
  macro_id = str(uuid.uuid4())
657
- return AC(
740
+ return AC_ELEM(
658
741
  "structured-macro",
659
742
  {
660
- ET.QName(namespaces["ac"], "name"): "drawio",
661
- ET.QName(namespaces["ac"], "schema-version"): "1",
743
+ AC_ATTR("name"): "drawio",
744
+ AC_ATTR("schema-version"): "1",
662
745
  "data-layout": "default",
663
- ET.QName(namespaces["ac"], "local-id"): local_id,
664
- ET.QName(namespaces["ac"], "macro-id"): macro_id,
746
+ AC_ATTR("local-id"): local_id,
747
+ AC_ATTR("macro-id"): macro_id,
665
748
  },
666
749
  *parameters,
667
750
  )
@@ -672,21 +755,21 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
672
755
  message = HTML.p("Missing image: ", HTML.code(path.as_posix()))
673
756
  if caption is not None:
674
757
  content = [
675
- AC(
758
+ AC_ELEM(
676
759
  "parameter",
677
- {ET.QName(namespaces["ac"], "name"): "title"},
760
+ {AC_ATTR("name"): "title"},
678
761
  caption,
679
762
  ),
680
- AC("rich-text-body", {}, message),
763
+ AC_ELEM("rich-text-body", {}, message),
681
764
  ]
682
765
  else:
683
- content = [AC("rich-text-body", {}, message)]
766
+ content = [AC_ELEM("rich-text-body", {}, message)]
684
767
 
685
- return AC(
768
+ return AC_ELEM(
686
769
  "structured-macro",
687
770
  {
688
- ET.QName(namespaces["ac"], "name"): "warning",
689
- ET.QName(namespaces["ac"], "schema-version"): "1",
771
+ AC_ATTR("name"): "warning",
772
+ AC_ATTR("schema-version"): "1",
690
773
  },
691
774
  *content,
692
775
  )
@@ -694,107 +777,132 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
694
777
  def _transform_code_block(self, code: ET._Element) -> ET._Element:
695
778
  "Transforms a code block."
696
779
 
697
- language = code.attrib.get("class")
698
- if language:
699
- m = re.match("^language-(.*)$", language)
700
- if m:
701
- language = m.group(1)
780
+ if language_class := code.get("class"):
781
+ if m := re.match("^language-(.*)$", language_class):
782
+ language_name = m.group(1)
702
783
  else:
703
- language = "none"
704
- if language not in _languages:
705
- language = "none"
784
+ language_name = None
785
+ else:
786
+ language_name = None
787
+
788
+ # translate name to standard name for (programming) language
789
+ if language_name is not None:
790
+ language_id = _LANGUAGES.get(language_name)
791
+ else:
792
+ language_id = None
793
+
706
794
  content: str = code.text or ""
707
795
  content = content.rstrip()
708
796
 
709
- if language == "mermaid":
710
- return self._transform_mermaid(content)
797
+ if language_id == "mermaid":
798
+ return self._transform_fenced_mermaid(content)
711
799
 
712
- return AC(
800
+ return AC_ELEM(
713
801
  "structured-macro",
714
802
  {
715
- ET.QName(namespaces["ac"], "name"): "code",
716
- ET.QName(namespaces["ac"], "schema-version"): "1",
803
+ AC_ATTR("name"): "code",
804
+ AC_ATTR("schema-version"): "1",
717
805
  },
718
- AC(
806
+ AC_ELEM(
719
807
  "parameter",
720
- {ET.QName(namespaces["ac"], "name"): "theme"},
721
- "Default",
808
+ {AC_ATTR("name"): "language"},
809
+ language_id or "none",
722
810
  ),
723
- AC(
724
- "parameter",
725
- {ET.QName(namespaces["ac"], "name"): "language"},
726
- language,
727
- ),
728
- AC("plain-text-body", ET.CDATA(content)),
811
+ AC_ELEM("plain-text-body", ET.CDATA(content)),
729
812
  )
730
813
 
731
- def _transform_mermaid(self, content: str) -> ET._Element:
732
- "Transforms a Mermaid diagram code block."
814
+ def _transform_external_mermaid(self, absolute_path: Path, attrs: ImageAttributes) -> ET._Element:
815
+ "Emits Confluence Storage Format XHTML for a Mermaid diagram read from an external file."
816
+
817
+ if not absolute_path.name.endswith(".mmd") and not absolute_path.name.endswith(".mermaid"):
818
+ raise DocumentError("invalid image format; expected: `*.mmd` or `*.mermaid`")
819
+
820
+ relative_path = path_relative_to(absolute_path, self.base_dir)
821
+ if self.options.render_mermaid:
822
+ with open(absolute_path, "r", encoding="utf-8") as f:
823
+ content = f.read()
824
+ image_data = mermaid.render_diagram(content, self.options.diagram_output_format)
825
+ image_filename = attachment_name(relative_path.with_suffix(f".{self.options.diagram_output_format}"))
826
+ self.embedded_files[image_filename] = EmbeddedFileData(image_data, attrs.alt)
827
+ return self._create_attached_image(image_filename, attrs)
828
+ else:
829
+ self.images.append(ImageData(absolute_path, attrs.alt))
830
+ mermaid_filename = attachment_name(relative_path)
831
+ return self._create_mermaid_embed(mermaid_filename)
832
+
833
+ def _transform_fenced_mermaid(self, content: str) -> ET._Element:
834
+ "Emits Confluence Storage Format XHTML for a Mermaid diagram defined in a fenced code block."
733
835
 
734
836
  if self.options.render_mermaid:
735
837
  image_data = mermaid.render_diagram(content, self.options.diagram_output_format)
736
838
  image_hash = hashlib.md5(image_data).hexdigest()
737
839
  image_filename = attachment_name(f"embedded_{image_hash}.{self.options.diagram_output_format}")
738
- self.embedded_images[image_filename] = image_data
739
- return self._create_attached_image(image_filename, ImageAttributes(None, None, None))
840
+ self.embedded_files[image_filename] = EmbeddedFileData(image_data)
841
+ return self._create_attached_image(image_filename, ImageAttributes.EMPTY_BLOCK)
740
842
  else:
741
- local_id = str(uuid.uuid4())
742
- macro_id = str(uuid.uuid4())
743
- return AC(
744
- "structured-macro",
745
- {
746
- ET.QName(namespaces["ac"], "name"): "macro-diagram",
747
- ET.QName(namespaces["ac"], "schema-version"): "1",
748
- "data-layout": "default",
749
- ET.QName(namespaces["ac"], "local-id"): local_id,
750
- ET.QName(namespaces["ac"], "macro-id"): macro_id,
751
- },
752
- AC(
753
- "parameter",
754
- {ET.QName(namespaces["ac"], "name"): "sourceType"},
755
- "MacroBody",
756
- ),
757
- AC(
758
- "parameter",
759
- {ET.QName(namespaces["ac"], "name"): "attachmentPageId"},
760
- ),
761
- AC(
762
- "parameter",
763
- {ET.QName(namespaces["ac"], "name"): "syntax"},
764
- "Mermaid",
765
- ),
766
- AC(
767
- "parameter",
768
- {ET.QName(namespaces["ac"], "name"): "attachmentId"},
769
- ),
770
- AC("parameter", {ET.QName(namespaces["ac"], "name"): "url"}),
771
- AC("plain-text-body", ET.CDATA(content)),
772
- )
843
+ mermaid_data = content.encode("utf-8")
844
+ mermaid_hash = hashlib.md5(mermaid_data).hexdigest()
845
+ mermaid_filename = attachment_name(f"embedded_{mermaid_hash}.mmd")
846
+ self.embedded_files[mermaid_filename] = EmbeddedFileData(mermaid_data)
847
+ return self._create_mermaid_embed(mermaid_filename)
848
+
849
+ def _create_mermaid_embed(self, filename: str) -> ET._Element:
850
+ "A Mermaid diagram, linking to an attachment that captures the Mermaid source."
851
+
852
+ local_id = str(uuid.uuid4())
853
+ macro_id = str(uuid.uuid4())
854
+ return AC_ELEM(
855
+ "structured-macro",
856
+ {
857
+ AC_ATTR("name"): "mermaid-cloud",
858
+ AC_ATTR("schema-version"): "1",
859
+ "data-layout": "default",
860
+ AC_ATTR("local-id"): local_id,
861
+ AC_ATTR("macro-id"): macro_id,
862
+ },
863
+ AC_ELEM(
864
+ "parameter",
865
+ {AC_ATTR("name"): "filename"},
866
+ filename,
867
+ ),
868
+ AC_ELEM(
869
+ "parameter",
870
+ {AC_ATTR("name"): "toolbar"},
871
+ "bottom",
872
+ ),
873
+ AC_ELEM(
874
+ "parameter",
875
+ {AC_ATTR("name"): "zoom"},
876
+ "fit",
877
+ ),
878
+ AC_ELEM("parameter", {AC_ATTR("name"): "revision"}, "1"),
879
+ )
773
880
 
774
881
  def _transform_toc(self, code: ET._Element) -> ET._Element:
775
882
  "Creates a table of contents, constructed from headings in the document."
776
883
 
777
- return AC(
884
+ return AC_ELEM(
778
885
  "structured-macro",
779
886
  {
780
- ET.QName(namespaces["ac"], "name"): "toc",
781
- ET.QName(namespaces["ac"], "schema-version"): "1",
887
+ AC_ATTR("name"): "toc",
888
+ AC_ATTR("schema-version"): "1",
889
+ "data-layout": "default",
782
890
  },
783
- AC("parameter", {ET.QName(namespaces["ac"], "name"): "outline"}, "clear"),
784
- AC("parameter", {ET.QName(namespaces["ac"], "name"): "style"}, "default"),
891
+ AC_ELEM("parameter", {AC_ATTR("name"): "outline"}, "clear"),
892
+ AC_ELEM("parameter", {AC_ATTR("name"): "style"}, "default"),
785
893
  )
786
894
 
787
895
  def _transform_listing(self, code: ET._Element) -> ET._Element:
788
896
  "Creates a list of child pages."
789
897
 
790
- return AC(
898
+ return AC_ELEM(
791
899
  "structured-macro",
792
900
  {
793
- ET.QName(namespaces["ac"], "name"): "children",
794
- ET.QName(namespaces["ac"], "schema-version"): "2",
901
+ AC_ATTR("name"): "children",
902
+ AC_ATTR("schema-version"): "2",
795
903
  "data-layout": "default",
796
904
  },
797
- AC("parameter", {ET.QName(namespaces["ac"], "name"): "allChildren"}, "true"),
905
+ AC_ELEM("parameter", {AC_ATTR("name"): "allChildren"}, "true"),
798
906
  )
799
907
 
800
908
  def _transform_admonition(self, elem: ET._Element) -> ET._Element:
@@ -805,8 +913,11 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
805
913
  syntax into one of the Confluence structured macros *info*, *tip*, *note*, or *warning*.
806
914
  """
807
915
 
916
+ if len(elem) < 1:
917
+ raise DocumentError("empty admonition")
918
+
808
919
  # <div class="admonition note">
809
- class_list = elem.attrib.get("class", "").split(" ")
920
+ class_list = elem.get("class", "").split(" ")
810
921
  class_name: Optional[str] = None
811
922
  if "info" in class_list:
812
923
  class_name = "info"
@@ -824,33 +935,36 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
824
935
  self.visit(e)
825
936
 
826
937
  # <p class="admonition-title">Note</p>
827
- if "admonition-title" in elem[0].attrib.get("class", "").split(" "):
938
+ if "admonition-title" in elem[0].get("class", "").split(" "):
828
939
  content = [
829
- AC(
940
+ AC_ELEM(
830
941
  "parameter",
831
- {ET.QName(namespaces["ac"], "name"): "title"},
942
+ {AC_ATTR("name"): "title"},
832
943
  elem[0].text or "",
833
944
  ),
834
- AC("rich-text-body", {}, *list(elem[1:])),
945
+ AC_ELEM("rich-text-body", {}, *list(elem[1:])),
835
946
  ]
836
947
  else:
837
- content = [AC("rich-text-body", {}, *list(elem))]
948
+ content = [AC_ELEM("rich-text-body", {}, *list(elem))]
838
949
 
839
- return AC(
950
+ return AC_ELEM(
840
951
  "structured-macro",
841
952
  {
842
- ET.QName(namespaces["ac"], "name"): class_name,
843
- ET.QName(namespaces["ac"], "schema-version"): "1",
953
+ AC_ATTR("name"): class_name,
954
+ AC_ATTR("schema-version"): "1",
844
955
  },
845
956
  *content,
846
957
  )
847
958
 
848
- def _transform_github_alert(self, elem: ET._Element) -> ET._Element:
959
+ def _transform_github_alert(self, blockquote: ET._Element) -> ET._Element:
849
960
  """
850
961
  Creates a GitHub-style panel, normally triggered with a block-quote starting with a capitalized string such as `[!TIP]`.
851
962
  """
852
963
 
853
- content = elem[0]
964
+ if len(blockquote) < 1:
965
+ raise DocumentError("empty GitHub alert")
966
+
967
+ content = blockquote[0]
854
968
  if content.text is None:
855
969
  raise DocumentError("empty content")
856
970
 
@@ -875,9 +989,9 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
875
989
  else:
876
990
  raise DocumentError(f"unsupported GitHub alert: {alert}")
877
991
 
878
- return self._transform_alert(elem, class_name, skip)
992
+ return self._transform_alert(blockquote, class_name, skip)
879
993
 
880
- def _transform_gitlab_alert(self, elem: ET._Element) -> ET._Element:
994
+ def _transform_gitlab_alert(self, blockquote: ET._Element) -> ET._Element:
881
995
  """
882
996
  Creates a classic GitLab-style panel.
883
997
 
@@ -885,7 +999,10 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
885
999
  This syntax does not use Hugo shortcode.
886
1000
  """
887
1001
 
888
- content = elem[0]
1002
+ if len(blockquote) < 1:
1003
+ raise DocumentError("empty GitLab alert")
1004
+
1005
+ content = blockquote[0]
889
1006
  if content.text is None:
890
1007
  raise DocumentError("empty content")
891
1008
 
@@ -908,69 +1025,85 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
908
1025
  else:
909
1026
  raise DocumentError(f"unsupported GitLab alert: {alert}")
910
1027
 
911
- return self._transform_alert(elem, class_name, skip)
1028
+ return self._transform_alert(blockquote, class_name, skip)
912
1029
 
913
- def _transform_alert(self, elem: ET._Element, class_name: Optional[str], skip: int) -> ET._Element:
1030
+ def _transform_alert(self, blockquote: ET._Element, class_name: Optional[str], skip: int) -> ET._Element:
914
1031
  """
915
1032
  Creates an info, tip, note or warning panel from a GitHub or GitLab alert.
916
1033
 
917
- Transforms
918
- [GitHub alert](https://docs.github.com/get-started/writing-on-github/getting-started-with-writing-and-formatting-on-github/basic-writing-and-formatting-syntax#alerts)
919
- or [GitLab alert](https://docs.gitlab.com/ee/development/documentation/styleguide/#alert-boxes)
920
- syntax into one of the Confluence structured macros *info*, *tip*, *note*, or *warning*.
1034
+ Transforms GitHub alert or GitLab alert syntax into one of the Confluence structured macros *info*, *tip*, *note*, or *warning*.
1035
+
1036
+ :see: https://docs.github.com/get-started/writing-on-github/getting-started-with-writing-and-formatting-on-github/basic-writing-and-formatting-syntax#alerts
1037
+ :see: https://docs.gitlab.com/ee/development/documentation/styleguide/#alert-boxes
921
1038
  """
922
1039
 
923
- content = elem[0]
1040
+ content = blockquote[0]
924
1041
  if content.text is None:
925
1042
  raise DocumentError("empty content")
926
1043
 
927
1044
  if class_name is None:
928
1045
  raise DocumentError("not an alert")
929
1046
 
930
- for e in elem:
1047
+ for e in blockquote:
931
1048
  self.visit(e)
932
1049
 
933
1050
  content.text = content.text[skip:]
934
- return AC(
1051
+ return AC_ELEM(
935
1052
  "structured-macro",
936
1053
  {
937
- ET.QName(namespaces["ac"], "name"): class_name,
938
- ET.QName(namespaces["ac"], "schema-version"): "1",
1054
+ AC_ATTR("name"): class_name,
1055
+ AC_ATTR("schema-version"): "1",
939
1056
  },
940
- AC("rich-text-body", {}, *list(elem)),
1057
+ AC_ELEM("rich-text-body", {}, *list(blockquote)),
941
1058
  )
942
1059
 
943
- def _transform_section(self, elem: ET._Element) -> ET._Element:
1060
+ def _transform_section(self, details: ET._Element) -> ET._Element:
944
1061
  """
945
1062
  Creates a collapsed section.
946
1063
 
947
- Transforms
948
- [GitHub collapsed section](https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/organizing-information-with-collapsed-sections)
949
- syntax into the Confluence structured macro *expand*.
1064
+ Transforms GitHub collapsed section syntax into the Confluence structured macro *expand*.
1065
+
1066
+ :see: https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/organizing-information-with-collapsed-sections
950
1067
  """
951
1068
 
952
- if elem[0].tag != "summary":
1069
+ summary = details[0]
1070
+ if summary.tag != "summary":
953
1071
  raise DocumentError("expected: `<summary>` as first direct child of `<details>`")
954
- if elem[0].tail is not None:
1072
+ if details.text is not None or summary.tail is not None:
1073
+ # when `<details>` has attribute `markdown=1`, content is parsed as Markdown:
1074
+ # ```
1075
+ # <details>
1076
+ # <summary>...</summary>
1077
+ # <p>Text with <em>emphasis</em>.</p>
1078
+ # </details>
1079
+ # ```
1080
+ #
1081
+ # when `<details>` lacks attribute `markdown=1`, content is passed down as raw HTML, partly as `text` of `<details>` or `tail` of `<summary>`:
1082
+ # ```
1083
+ # <details>
1084
+ # <summary>...</summary>
1085
+ # Text with *emphasis*.
1086
+ # </details>
955
1087
  raise DocumentError('expected: attribute `markdown="1"` on `<details>`')
956
1088
 
957
- summary = "".join(elem[0].itertext()).strip()
958
- elem.remove(elem[0])
1089
+ summary_text = element_to_text(summary)
1090
+ details.remove(summary)
959
1091
 
960
- self.visit(elem)
1092
+ # transform Markdown to Confluence within collapsed section content
1093
+ self.visit(details)
961
1094
 
962
- return AC(
1095
+ return AC_ELEM(
963
1096
  "structured-macro",
964
1097
  {
965
- ET.QName(namespaces["ac"], "name"): "expand",
966
- ET.QName(namespaces["ac"], "schema-version"): "1",
1098
+ AC_ATTR("name"): "expand",
1099
+ AC_ATTR("schema-version"): "1",
967
1100
  },
968
- AC(
1101
+ AC_ELEM(
969
1102
  "parameter",
970
- {ET.QName(namespaces["ac"], "name"): "title"},
971
- summary,
1103
+ {AC_ATTR("name"): "title"},
1104
+ summary_text,
972
1105
  ),
973
- AC("rich-text-body", {}, *list(elem)),
1106
+ AC_ELEM("rich-text-body", {}, *list(details)),
974
1107
  )
975
1108
 
976
1109
  def _transform_emoji(self, elem: ET._Element) -> ET._Element:
@@ -978,23 +1111,59 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
978
1111
  Inserts an inline emoji character.
979
1112
  """
980
1113
 
981
- shortname = elem.attrib.get("data-emoji-shortname", "")
982
- unicode = elem.attrib.get("data-emoji-unicode", None)
1114
+ shortname = elem.get("data-shortname", "")
1115
+ unicode = elem.get("data-unicode", None)
983
1116
  alt = elem.text or ""
984
1117
 
985
1118
  # <ac:emoticon ac:name="wink" ac:emoji-shortname=":wink:" ac:emoji-id="1f609" ac:emoji-fallback="&#128521;"/>
986
- # <ac:emoticon ac:name="blue-star" ac:emoji-shortname=":heavy_plus_sign:" ac:emoji-id="2795" ac:emoji-fallback="&#10133;"/>
987
- # <ac:emoticon ac:name="blue-star" ac:emoji-shortname=":heavy_minus_sign:" ac:emoji-id="2796" ac:emoji-fallback="&#10134;"/>
988
- return AC(
1119
+ return AC_ELEM(
989
1120
  "emoticon",
990
1121
  {
991
- ET.QName(namespaces["ac"], "name"): shortname,
992
- ET.QName(namespaces["ac"], "emoji-shortname"): f":{shortname}:",
993
- ET.QName(namespaces["ac"], "emoji-id"): unicode,
994
- ET.QName(namespaces["ac"], "emoji-fallback"): alt,
1122
+ AC_ATTR("name"): shortname,
1123
+ AC_ATTR("emoji-shortname"): f":{shortname}:",
1124
+ AC_ATTR("emoji-id"): unicode,
1125
+ AC_ATTR("emoji-fallback"): alt,
995
1126
  },
996
1127
  )
997
1128
 
1129
+ def _transform_mark(self, mark: ET._Element) -> ET._Element:
1130
+ """
1131
+ Adds inline highlighting to text.
1132
+ """
1133
+
1134
+ attrs = dict(mark.items())
1135
+ old_style = attrs.get("style")
1136
+ new_style = "background-color: rgb(254,222,200);"
1137
+ if old_style is not None:
1138
+ new_style += f" {old_style}"
1139
+ attrs["style"] = new_style
1140
+ span = HTML("span", attrs, *list(mark))
1141
+ span.text = mark.text
1142
+ return span
1143
+
1144
+ def _transform_latex(self, elem: ET._Element, context: FormattingContext) -> ET._Element:
1145
+ """
1146
+ Creates an image rendering of a LaTeX formula with Matplotlib.
1147
+ """
1148
+
1149
+ content = elem.text
1150
+ if not content:
1151
+ raise DocumentError("empty LaTeX formula")
1152
+
1153
+ image_data = render_latex(content, format=self.options.diagram_output_format)
1154
+ if self.options.diagram_output_format == "png":
1155
+ width, height = get_png_dimensions(data=image_data)
1156
+ image_data = remove_png_chunks(["pHYs"], source_data=image_data)
1157
+ attrs = ImageAttributes(context, width, height, content, None, "")
1158
+ else:
1159
+ attrs = ImageAttributes.empty(context)
1160
+
1161
+ image_hash = hashlib.md5(image_data).hexdigest()
1162
+ image_filename = attachment_name(f"formula_{image_hash}.{self.options.diagram_output_format}")
1163
+ self.embedded_files[image_filename] = EmbeddedFileData(image_data, content)
1164
+ image = self._create_attached_image(image_filename, attrs)
1165
+ return image
1166
+
998
1167
  def _transform_inline_math(self, elem: ET._Element) -> ET._Element:
999
1168
  """
1000
1169
  Creates an inline LaTeX formula using the Confluence extension "LaTeX Math for Confluence - Math Formula & Equations".
@@ -1002,30 +1171,32 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
1002
1171
  :see: https://help.narva.net/latex-math-for-confluence/
1003
1172
  """
1004
1173
 
1005
- content = elem.text or ""
1174
+ content = elem.text
1006
1175
  if not content:
1007
1176
  raise DocumentError("empty inline LaTeX formula")
1008
1177
 
1009
1178
  LOGGER.debug("Found inline LaTeX formula: %s", content)
1010
1179
 
1180
+ if self.options.render_latex:
1181
+ return self._transform_latex(elem, FormattingContext.INLINE)
1182
+
1011
1183
  local_id = str(uuid.uuid4())
1012
1184
  macro_id = str(uuid.uuid4())
1013
- macro = AC(
1185
+ macro = AC_ELEM(
1014
1186
  "structured-macro",
1015
1187
  {
1016
- ET.QName(namespaces["ac"], "name"): "eazy-math-inline",
1017
- ET.QName(namespaces["ac"], "schema-version"): "1",
1018
- ET.QName(namespaces["ac"], "local-id"): local_id,
1019
- ET.QName(namespaces["ac"], "macro-id"): macro_id,
1188
+ AC_ATTR("name"): "eazy-math-inline",
1189
+ AC_ATTR("schema-version"): "1",
1190
+ AC_ATTR("local-id"): local_id,
1191
+ AC_ATTR("macro-id"): macro_id,
1020
1192
  },
1021
- AC(
1193
+ AC_ELEM(
1022
1194
  "parameter",
1023
- {ET.QName(namespaces["ac"], "name"): "body"},
1195
+ {AC_ATTR("name"): "body"},
1024
1196
  content,
1025
1197
  ),
1026
- AC("parameter", {ET.QName(namespaces["ac"], "name"): "align"}, "center"),
1198
+ AC_ELEM("parameter", {AC_ATTR("name"): "align"}, "center"),
1027
1199
  )
1028
- macro.tail = elem.tail # chain sibling text node that immediately follows original element
1029
1200
  return macro
1030
1201
 
1031
1202
  def _transform_block_math(self, elem: ET._Element) -> ET._Element:
@@ -1035,30 +1206,33 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
1035
1206
  :see: https://help.narva.net/latex-math-for-confluence/
1036
1207
  """
1037
1208
 
1038
- content = elem.text or ""
1209
+ content = elem.text
1039
1210
  if not content:
1040
1211
  raise DocumentError("empty block-level LaTeX formula")
1041
1212
 
1042
1213
  LOGGER.debug("Found block-level LaTeX formula: %s", content)
1043
1214
 
1215
+ if self.options.render_latex:
1216
+ return self._transform_latex(elem, FormattingContext.BLOCK)
1217
+
1044
1218
  local_id = str(uuid.uuid4())
1045
1219
  macro_id = str(uuid.uuid4())
1046
1220
 
1047
- return AC(
1221
+ return AC_ELEM(
1048
1222
  "structured-macro",
1049
1223
  {
1050
- ET.QName(namespaces["ac"], "name"): "easy-math-block",
1051
- ET.QName(namespaces["ac"], "schema-version"): "1",
1224
+ AC_ATTR("name"): "easy-math-block",
1225
+ AC_ATTR("schema-version"): "1",
1052
1226
  "data-layout": "default",
1053
- ET.QName(namespaces["ac"], "local-id"): local_id,
1054
- ET.QName(namespaces["ac"], "macro-id"): macro_id,
1227
+ AC_ATTR("local-id"): local_id,
1228
+ AC_ATTR("macro-id"): macro_id,
1055
1229
  },
1056
- AC(
1230
+ AC_ELEM(
1057
1231
  "parameter",
1058
- {ET.QName(namespaces["ac"], "name"): "body"},
1232
+ {AC_ATTR("name"): "body"},
1059
1233
  content,
1060
1234
  ),
1061
- AC("parameter", {ET.QName(namespaces["ac"], "name"): "align"}, "center"),
1235
+ AC_ELEM("parameter", {AC_ATTR("name"): "align"}, "center"),
1062
1236
  )
1063
1237
 
1064
1238
  def _transform_footnote_ref(self, elem: ET._Element) -> None:
@@ -1078,7 +1252,9 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
1078
1252
  raise DocumentError("expected: attribute `id` of format `fnref:NAME` applied on `<sup>` for a footnote reference")
1079
1253
  footnote_ref = ref_id.removeprefix("fnref:")
1080
1254
 
1081
- link = elem[0]
1255
+ link = next((elem.iterchildren(tag="a")), None)
1256
+ if link is None:
1257
+ raise DocumentError("expected: `<a>` as the first HTML element in a footnote reference")
1082
1258
  def_href = link.attrib.pop("href", "")
1083
1259
  if not def_href.startswith("#fn:"):
1084
1260
  raise DocumentError("expected: attribute `href` of format `#fn:NAME` applied on `<a>` for a footnote reference")
@@ -1090,26 +1266,26 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
1090
1266
  elem.remove(link)
1091
1267
 
1092
1268
  # build new anchor for footnote reference
1093
- ref_anchor = AC(
1269
+ ref_anchor = AC_ELEM(
1094
1270
  "structured-macro",
1095
1271
  {
1096
- ET.QName(namespaces["ac"], "name"): "anchor",
1097
- ET.QName(namespaces["ac"], "schema-version"): "1",
1272
+ AC_ATTR("name"): "anchor",
1273
+ AC_ATTR("schema-version"): "1",
1098
1274
  },
1099
- AC(
1275
+ AC_ELEM(
1100
1276
  "parameter",
1101
- {ET.QName(namespaces["ac"], "name"): ""},
1277
+ {AC_ATTR("name"): ""},
1102
1278
  f"footnote-ref-{footnote_ref}",
1103
1279
  ),
1104
1280
  )
1105
1281
 
1106
1282
  # build new link to footnote definition at the end of page
1107
- def_link = AC(
1283
+ def_link = AC_ELEM(
1108
1284
  "link",
1109
1285
  {
1110
- ET.QName(namespaces["ac"], "anchor"): f"footnote-def-{footnote_def}",
1286
+ AC_ATTR("anchor"): f"footnote-def-{footnote_def}",
1111
1287
  },
1112
- AC("link-body", ET.CDATA(text)),
1288
+ AC_ELEM("link-body", ET.CDATA(text)),
1113
1289
  )
1114
1290
 
1115
1291
  # append children synthesized for Confluence
@@ -1132,18 +1308,28 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
1132
1308
  ```
1133
1309
  """
1134
1310
 
1135
- for list_item in elem[1]:
1311
+ ordered_list = next((elem.iterchildren(tag="ol")), None)
1312
+ if ordered_list is None:
1313
+ raise DocumentError("expected: `<ol>` as direct child of footnote definition block")
1314
+
1315
+ for list_item in ordered_list:
1316
+ if list_item.tag != "li":
1317
+ raise DocumentError("expected: `<li>` as children of `<ol>` in footnote definition block")
1318
+
1136
1319
  def_id = list_item.attrib.pop("id", "")
1137
1320
  if not def_id.startswith("fn:"):
1138
1321
  raise DocumentError("expected: attribute `id` of format `fn:NAME` applied on `<li>` for a footnote definition")
1139
1322
  footnote_def = def_id.removeprefix("fn:")
1140
1323
 
1141
- paragraph = list_item[0]
1142
- ref_anchor = paragraph[-1]
1143
- if ref_anchor.tag != "a":
1324
+ paragraph = next((list_item.iterchildren(tag="p")), None)
1325
+ if paragraph is None:
1326
+ raise DocumentError("expected: `<p>` as a child of `<li>` in a footnote definition")
1327
+
1328
+ ref_anchor = next((paragraph.iterchildren(tag="a", reversed=True)), None)
1329
+ if ref_anchor is None:
1144
1330
  raise DocumentError("expected: `<a>` as the last HTML element in a footnote definition")
1145
1331
 
1146
- ref_href = ref_anchor.attrib.get("href", "")
1332
+ ref_href = ref_anchor.get("href", "")
1147
1333
  if not ref_href.startswith("#fnref:"):
1148
1334
  raise DocumentError("expected: attribute `href` of format `#fnref:NAME` applied on last element `<a>` for a footnote definition")
1149
1335
  footnote_ref = ref_href.removeprefix("#fnref:")
@@ -1152,26 +1338,26 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
1152
1338
  paragraph.remove(ref_anchor)
1153
1339
 
1154
1340
  # build new anchor for footnote definition
1155
- def_anchor = AC(
1341
+ def_anchor = AC_ELEM(
1156
1342
  "structured-macro",
1157
1343
  {
1158
- ET.QName(namespaces["ac"], "name"): "anchor",
1159
- ET.QName(namespaces["ac"], "schema-version"): "1",
1344
+ AC_ATTR("name"): "anchor",
1345
+ AC_ATTR("schema-version"): "1",
1160
1346
  },
1161
- AC(
1347
+ AC_ELEM(
1162
1348
  "parameter",
1163
- {ET.QName(namespaces["ac"], "name"): ""},
1349
+ {AC_ATTR("name"): ""},
1164
1350
  f"footnote-def-{footnote_def}",
1165
1351
  ),
1166
1352
  )
1167
1353
 
1168
1354
  # build new link to footnote reference in page body
1169
- ref_link = AC(
1355
+ ref_link = AC_ELEM(
1170
1356
  "link",
1171
1357
  {
1172
- ET.QName(namespaces["ac"], "anchor"): f"footnote-ref-{footnote_ref}",
1358
+ AC_ATTR("anchor"): f"footnote-ref-{footnote_ref}",
1173
1359
  },
1174
- AC("link-body", ET.CDATA("↩")),
1360
+ AC_ELEM("link-body", ET.CDATA("↩")),
1175
1361
  )
1176
1362
 
1177
1363
  # append children synthesized for Confluence
@@ -1180,79 +1366,132 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
1180
1366
  paragraph.text = None
1181
1367
  paragraph.append(ref_link)
1182
1368
 
1369
+ def _transform_tasklist(self, elem: ET._Element) -> ET._Element:
1370
+ """
1371
+ Transforms a list of tasks into an action widget.
1372
+
1373
+ :see: https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/about-tasklists
1374
+ """
1375
+
1376
+ if elem.tag != "ul":
1377
+ raise DocumentError("expected: `<ul>` as the HTML element for a tasklist")
1378
+
1379
+ for item in elem:
1380
+ if item.tag != "li":
1381
+ raise DocumentError("expected: `<li>` as the HTML element for a task")
1382
+ if not element_text_starts_with_any(item, ["[ ]", "[x]", "[X]"]):
1383
+ raise DocumentError("expected: each `<li>` in a task list starting with [ ] or [x]")
1384
+
1385
+ tasks: list[ET._Element] = []
1386
+ for index, item in enumerate(elem, start=1):
1387
+ if item.text is None:
1388
+ raise NotImplementedError("pre-condition check not exhaustive")
1389
+ match = re.match(r"^\[([x X])\]", item.text)
1390
+ if match is None:
1391
+ raise NotImplementedError("pre-condition check not exhaustive")
1392
+
1393
+ status = "incomplete" if match.group(1).isspace() else "complete"
1394
+ item.text = item.text[3:]
1395
+
1396
+ # transform Markdown to Confluence within tasklist content
1397
+ self.visit(item)
1398
+
1399
+ body = AC_ELEM("task-body", *list(item))
1400
+ body.text = item.text
1401
+ tasks.append(
1402
+ AC_ELEM(
1403
+ "task",
1404
+ {},
1405
+ AC_ELEM("task-id", str(index)),
1406
+ AC_ELEM("task-uuid", str(uuid.uuid4())),
1407
+ AC_ELEM("task-status", status),
1408
+ body,
1409
+ ),
1410
+ )
1411
+ return AC_ELEM("task-list", {}, *tasks)
1412
+
1413
+ @override
1183
1414
  def transform(self, child: ET._Element) -> Optional[ET._Element]:
1184
1415
  """
1185
1416
  Transforms an HTML element tree obtained from a Markdown document into a Confluence Storage Format element tree.
1186
1417
  """
1187
1418
 
1188
- # normalize line breaks to regular space in element text
1419
+ # replace line breaks with regular space in element text to minimize phantom changes
1189
1420
  if child.text:
1190
- text: str = child.text
1191
- child.text = text.replace("\n", " ")
1421
+ child.text = child.text.replace("\n", " ")
1192
1422
  if child.tail:
1193
- tail: str = child.tail
1194
- child.tail = tail.replace("\n", " ")
1423
+ child.tail = child.tail.replace("\n", " ")
1195
1424
 
1196
1425
  if not isinstance(child.tag, str):
1197
1426
  return None
1198
1427
 
1199
- # <h1>...</h1>
1200
- # <h2>...</h2> ...
1201
- m = re.match(r"^h([1-6])$", child.tag, flags=re.IGNORECASE)
1202
- if m is not None:
1203
- level = int(m.group(1))
1204
- title = element_to_text(child)
1205
- self.toc.add(level, title)
1206
-
1207
- if self.options.heading_anchors:
1208
- self._transform_heading(child)
1428
+ # <p>...</p>
1429
+ if child.tag == "p":
1430
+ # <p><img src="..." /></p>
1431
+ if len(child) == 1 and not child.text and child[0].tag == "img" and not child[0].tail:
1432
+ return self._transform_image(FormattingContext.BLOCK, child[0])
1433
+
1434
+ # <p>[[<em>TOC</em>]]</p> (represented in Markdown as `[[_TOC_]]`)
1435
+ elif is_placeholder_for(child, "TOC"):
1436
+ return self._transform_toc(child)
1437
+
1438
+ # <p>[[<em>LISTING</em>]]</p> (represented in Markdown as `[[_LISTING_]]`)
1439
+ elif is_placeholder_for(child, "LISTING"):
1440
+ return self._transform_listing(child)
1441
+
1442
+ # <div>...</div>
1443
+ elif child.tag == "div":
1444
+ classes = child.get("class", "").split(" ")
1445
+
1446
+ # <div class="arithmatex">...</div>
1447
+ if "arithmatex" in classes:
1448
+ return self._transform_block_math(child)
1449
+
1450
+ # <div><ac:structured-macro ...>...</ac:structured-macro></div>
1451
+ elif "csf" in classes:
1452
+ if len(child) != 1:
1453
+ raise DocumentError("expected: single child in Confluence Storage Format block")
1454
+
1455
+ return child[0]
1456
+
1457
+ # <div class="footnote">
1458
+ # <hr/>
1459
+ # <ol>
1460
+ # <li id="fn:NAME"><p>TEXT <a class="footnote-backref" href="#fnref:NAME">↩</a></p></li>
1461
+ # </ol>
1462
+ # </div>
1463
+ elif "footnote" in classes:
1464
+ self._transform_footnote_def(child)
1209
1465
  return None
1210
1466
 
1211
- # <p><img src="..." /></p>
1212
- if child.tag == "p" and len(child) == 1 and child[0].tag == "img":
1213
- return self._transform_image(child[0])
1214
-
1215
- # <p>[[_TOC_]]</p>
1216
- # <p>[TOC]</p>
1217
- elif child.tag == "p" and "".join(child.itertext()) in ["[[TOC]]", "[TOC]"]:
1218
- return self._transform_toc(child)
1219
-
1220
- # <p>[[_LISTING_]]</p>
1221
- elif child.tag == "p" and "".join(child.itertext()) in ["[[LISTING]]", "[LISTING]"]:
1222
- return self._transform_listing(child)
1223
-
1224
- # <div class="admonition note">
1225
- # <p class="admonition-title">Note</p>
1226
- # <p>...</p>
1227
- # </div>
1228
- #
1229
- # --- OR ---
1230
- #
1231
- # <div class="admonition note">
1232
- # <p>...</p>
1233
- # </div>
1234
- elif child.tag == "div" and "admonition" in child.attrib.get("class", ""):
1235
- return self._transform_admonition(child)
1236
-
1237
- # Alerts in GitHub
1238
- # <blockquote>
1239
- # <p>[!TIP] ...</p>
1240
- # </blockquote>
1241
- elif child.tag == "blockquote" and len(child) > 0 and child[0].tag == "p" and child[0].text is not None and child[0].text.startswith("[!"):
1242
- return self._transform_github_alert(child)
1243
-
1244
- # Alerts in GitLab
1245
- # <blockquote>
1246
- # <p>DISCLAIMER: ...</p>
1247
- # </blockquote>
1248
- elif (
1249
- child.tag == "blockquote"
1250
- and len(child) > 0
1251
- and child[0].tag == "p"
1252
- and child[0].text is not None
1253
- and starts_with_any(child[0].text, ["FLAG:", "NOTE:", "WARNING:", "DISCLAIMER:"])
1254
- ):
1255
- return self._transform_gitlab_alert(child)
1467
+ # <div class="admonition note">
1468
+ # <p class="admonition-title">Note</p>
1469
+ # <p>...</p>
1470
+ # </div>
1471
+ #
1472
+ # --- OR ---
1473
+ #
1474
+ # <div class="admonition note">
1475
+ # <p>...</p>
1476
+ # </div>
1477
+ elif "admonition" in classes:
1478
+ return self._transform_admonition(child)
1479
+
1480
+ # <blockquote>...</blockquote>
1481
+ elif child.tag == "blockquote":
1482
+ # Alerts in GitHub
1483
+ # <blockquote>
1484
+ # <p>[!TIP] ...</p>
1485
+ # </blockquote>
1486
+ if len(child) > 0 and child[0].tag == "p" and child[0].text is not None and child[0].text.startswith("[!"):
1487
+ return self._transform_github_alert(child)
1488
+
1489
+ # Alerts in GitLab
1490
+ # <blockquote>
1491
+ # <p>DISCLAIMER: ...</p>
1492
+ # </blockquote>
1493
+ elif len(child) > 0 and child[0].tag == "p" and element_text_starts_with_any(child[0], ["FLAG:", "NOTE:", "WARNING:", "DISCLAIMER:"]):
1494
+ return self._transform_gitlab_alert(child)
1256
1495
 
1257
1496
  # <details markdown="1">
1258
1497
  # <summary>...</summary>
@@ -1261,44 +1500,84 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
1261
1500
  elif child.tag == "details" and len(child) > 1 and child[0].tag == "summary":
1262
1501
  return self._transform_section(child)
1263
1502
 
1503
+ # <ol>...</ol>
1504
+ elif child.tag == "ol":
1505
+ # Confluence adds the attribute `start` for every ordered list
1506
+ child.set("start", "1")
1507
+ return None
1508
+
1509
+ # <ul>
1510
+ # <li>[ ] ...</li>
1511
+ # <li>[x] ...</li>
1512
+ # </ul>
1513
+ elif child.tag == "ul":
1514
+ if len(child) > 0 and element_text_starts_with_any(child[0], ["[ ]", "[x]", "[X]"]):
1515
+ return self._transform_tasklist(child)
1516
+
1517
+ return None
1518
+
1519
+ elif child.tag == "li":
1520
+ normalize_inline(child)
1521
+ return None
1522
+
1523
+ # <pre><code class="language-java"> ... </code></pre>
1524
+ elif child.tag == "pre" and len(child) == 1 and child[0].tag == "code":
1525
+ return self._transform_code_block(child[0])
1526
+
1527
+ # <table>...</table>
1528
+ elif child.tag == "table":
1529
+ child.set("data-layout", "default")
1530
+ return None
1531
+
1264
1532
  # <img src="..." alt="..." />
1265
1533
  elif child.tag == "img":
1266
- return self._transform_image(child)
1534
+ return self._transform_image(FormattingContext.INLINE, child)
1267
1535
 
1268
1536
  # <a href="..."> ... </a>
1269
1537
  elif child.tag == "a":
1270
1538
  return self._transform_link(child)
1271
1539
 
1272
- # <pre><code class="language-java"> ... </code></pre>
1273
- elif child.tag == "pre" and len(child) == 1 and child[0].tag == "code":
1274
- return self._transform_code_block(child[0])
1275
-
1276
- # <span data-emoji-shortname="..." data-emoji-unicode="...">...</span>
1277
- elif child.tag == "span" and child.attrib.has_key("data-emoji-shortname"):
1278
- return self._transform_emoji(child)
1540
+ # <mark>...</mark>
1541
+ elif child.tag == "mark":
1542
+ return self._transform_mark(child)
1279
1543
 
1280
- # <div class="arithmatex">...</div>
1281
- elif child.tag == "div" and "arithmatex" in child.attrib.get("class", "").split(" "):
1282
- return self._transform_block_math(child)
1544
+ # <span>...</span>
1545
+ elif child.tag == "span":
1546
+ classes = child.get("class", "").split(" ")
1283
1547
 
1284
- # <span class="arithmatex">...</span>
1285
- elif child.tag == "span" and "arithmatex" in child.attrib.get("class", "").split(" "):
1286
- return self._transform_inline_math(child)
1548
+ # <span class="arithmatex">...</span>
1549
+ if "arithmatex" in classes:
1550
+ return self._transform_inline_math(child)
1287
1551
 
1288
1552
  # <sup id="fnref:NAME"><a class="footnote-ref" href="#fn:NAME">1</a></sup>
1289
- elif child.tag == "sup" and child.attrib.get("id", "").startswith("fnref:"):
1553
+ elif child.tag == "sup" and child.get("id", "").startswith("fnref:"):
1290
1554
  self._transform_footnote_ref(child)
1291
1555
  return None
1292
1556
 
1293
- # <div class="footnote">
1294
- # <hr/>
1295
- # <ol>
1296
- # <li id="fn:NAME"><p>TEXT <a class="footnote-backref" href="#fnref:NAME">↩</a></p></li>
1297
- # </ol>
1298
- # </div>
1299
- elif child.tag == "div" and "footnote" in child.attrib.get("class", "").split(" "):
1300
- self._transform_footnote_def(child)
1301
- return None
1557
+ # <input type="date" value="1984-01-01" />
1558
+ elif child.tag == "input" and child.get("type", "") == "date":
1559
+ return HTML("time", {"datetime": child.get("value", "")})
1560
+
1561
+ # <ins>...</ins>
1562
+ elif child.tag == "ins":
1563
+ # Confluence prefers <u> over <ins> for underline, and replaces <ins> with <u>
1564
+ child.tag = "u"
1565
+
1566
+ # <x-emoji data-shortname="wink" data-unicode="1f609">😉</x-emoji>
1567
+ elif child.tag == "x-emoji":
1568
+ return self._transform_emoji(child)
1569
+
1570
+ # <h1>...</h1>
1571
+ # <h2>...</h2> ...
1572
+ m = re.match(r"^h([1-6])$", child.tag, flags=re.IGNORECASE)
1573
+ if m is not None:
1574
+ level = int(m.group(1))
1575
+ title = element_to_text(child)
1576
+ self.toc.add(level, title)
1577
+
1578
+ if self.options.heading_anchors:
1579
+ self._transform_heading(child)
1580
+ return None
1302
1581
 
1303
1582
  return None
1304
1583
 
@@ -1312,11 +1591,15 @@ class ConversionError(RuntimeError):
1312
1591
 
1313
1592
 
1314
1593
  class ConfluenceDocument:
1594
+ "Encapsulates an element tree for a Confluence document created by parsing a Markdown document."
1595
+
1315
1596
  title: Optional[str]
1316
1597
  labels: Optional[list[str]]
1317
1598
  properties: Optional[dict[str, JsonType]]
1599
+
1318
1600
  links: list[str]
1319
- images: list[Path]
1601
+ images: list[ImageData]
1602
+ embedded_files: dict[str, EmbeddedFileData]
1320
1603
 
1321
1604
  options: ConfluenceDocumentOptions
1322
1605
  root: ET._Element
@@ -1355,10 +1638,18 @@ class ConfluenceDocument:
1355
1638
  site_metadata: ConfluenceSiteMetadata,
1356
1639
  page_metadata: ConfluencePageCollection,
1357
1640
  ) -> None:
1641
+ "Converts a single Markdown document to Confluence Storage Format."
1642
+
1358
1643
  self.options = options
1359
1644
 
1645
+ # register auxiliary URL substitutions
1646
+ lines: list[str] = []
1647
+ for data_uri, color in status_images.items():
1648
+ lines.append(f"[STATUS-{color.upper()}]: {data_uri}")
1649
+ lines.append(document.text)
1650
+
1360
1651
  # convert to HTML
1361
- html = markdown_to_html(document.text)
1652
+ html = markdown_to_html("\n".join(lines))
1362
1653
 
1363
1654
  # parse Markdown document
1364
1655
  if self.options.generated_by is not None:
@@ -1390,10 +1681,13 @@ class ConfluenceDocument:
1390
1681
  site_metadata,
1391
1682
  page_metadata,
1392
1683
  )
1393
- converter.visit(self.root)
1684
+ try:
1685
+ converter.visit(self.root)
1686
+ except DocumentError as ex:
1687
+ raise ConversionError(path) from ex
1394
1688
  self.links = converter.links
1395
1689
  self.images = converter.images
1396
- self.embedded_images = converter.embedded_images
1690
+ self.embedded_files = converter.embedded_files
1397
1691
 
1398
1692
  self.title = document.title or converter.toc.get_title()
1399
1693
  self.labels = document.tags
@@ -1433,41 +1727,3 @@ def attachment_name(ref: Union[Path, str]) -> str:
1433
1727
 
1434
1728
  parts = [replace_part(p) for p in path.parts]
1435
1729
  return Path(*parts).as_posix().replace("/", "_")
1436
-
1437
-
1438
- def elements_to_string(root: ET._Element) -> str:
1439
- xml = ET.tostring(root, encoding="utf8", method="xml").decode("utf8")
1440
- m = re.match(r"^<root\s+[^>]*>(.*)</root>\s*$", xml, re.DOTALL)
1441
- if m:
1442
- return m.group(1)
1443
- else:
1444
- raise ValueError("expected: Confluence content")
1445
-
1446
-
1447
- def _content_to_string(dtd_path: Path, content: str) -> str:
1448
- parser = ET.XMLParser(
1449
- remove_blank_text=True,
1450
- remove_comments=True,
1451
- strip_cdata=False,
1452
- load_dtd=True,
1453
- )
1454
-
1455
- ns_attr_list = "".join(f' xmlns:{key}="{value}"' for key, value in namespaces.items())
1456
-
1457
- data = [
1458
- '<?xml version="1.0"?>',
1459
- f'<!DOCTYPE ac:confluence PUBLIC "-//Atlassian//Confluence 4 Page//EN" "{dtd_path.as_posix()}"><root{ns_attr_list}>',
1460
- ]
1461
- data.append(content)
1462
- data.append("</root>")
1463
-
1464
- tree = ET.fromstringlist(data, parser=parser)
1465
- return ET.tostring(tree, pretty_print=True).decode("utf-8")
1466
-
1467
-
1468
- def content_to_string(content: str) -> str:
1469
- "Converts a Confluence Storage Format document returned by the API into a readable XML document."
1470
-
1471
- resource_path = resources.files(__package__).joinpath("entities.dtd")
1472
- with resources.as_file(resource_path) as dtd_path:
1473
- return _content_to_string(dtd_path, content)