markdown-to-confluence 0.1.13__py3-none-any.whl → 0.2.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
md2conf/converter.py CHANGED
@@ -1,626 +1,965 @@
1
- # mypy: disable-error-code="dict-item"
2
-
3
- import importlib.resources as resources
4
- import logging
5
- import os.path
6
- import pathlib
7
- import re
8
- import sys
9
- from dataclasses import dataclass
10
- from typing import Dict, List, Optional, Tuple
11
- from urllib.parse import ParseResult, urlparse, urlunparse
12
-
13
- import lxml.etree as ET
14
- import markdown
15
- from lxml.builder import ElementMaker
16
-
17
- namespaces = {
18
- "ac": "http://atlassian.com/content",
19
- "ri": "http://atlassian.com/resource/identifier",
20
- }
21
- for key, value in namespaces.items():
22
- ET.register_namespace(key, value)
23
-
24
-
25
- HTML = ElementMaker()
26
- AC = ElementMaker(namespace=namespaces["ac"])
27
- RI = ElementMaker(namespace=namespaces["ri"])
28
-
29
- LOGGER = logging.getLogger(__name__)
30
-
31
-
32
- class ParseError(RuntimeError):
33
- pass
34
-
35
-
36
- def is_absolute_url(url: str) -> bool:
37
- urlparts = urlparse(url)
38
- return bool(urlparts.scheme) or bool(urlparts.netloc)
39
-
40
-
41
- def is_relative_url(url: str) -> bool:
42
- urlparts = urlparse(url)
43
- return not bool(urlparts.scheme) and not bool(urlparts.netloc)
44
-
45
-
46
- def markdown_to_html(content: str) -> str:
47
- return markdown.markdown(
48
- content,
49
- extensions=[
50
- "admonition",
51
- "markdown.extensions.tables",
52
- "markdown.extensions.fenced_code",
53
- "pymdownx.magiclink",
54
- "pymdownx.tilde",
55
- "sane_lists",
56
- ],
57
- )
58
-
59
-
60
- def _elements_from_strings(dtd_path: pathlib.Path, items: List[str]) -> ET._Element:
61
- """
62
- Creates a fragment of several XML nodes from their string representation wrapped in a root element.
63
-
64
- :param dtd_path: Path to a DTD document that defines entities like ¢ or ©.
65
- :param items: Strings to parse into XML fragments.
66
- :returns: An XML document as an element tree.
67
- """
68
-
69
- parser = ET.XMLParser(
70
- remove_blank_text=True,
71
- strip_cdata=False,
72
- load_dtd=True,
73
- )
74
-
75
- ns_attr_list = "".join(
76
- f' xmlns:{key}="{value}"' for key, value in namespaces.items()
77
- )
78
-
79
- data = [
80
- '<?xml version="1.0"?>',
81
- f'<!DOCTYPE ac:confluence PUBLIC "-//Atlassian//Confluence 4 Page//EN" "{dtd_path}">'
82
- f"<root{ns_attr_list}>",
83
- ]
84
- data.extend(items)
85
- data.append("</root>")
86
-
87
- try:
88
- return ET.fromstringlist(data, parser=parser)
89
- except ET.XMLSyntaxError as e:
90
- raise ParseError(e)
91
-
92
-
93
- def elements_from_strings(items: List[str]) -> ET._Element:
94
- "Creates a fragment of several XML nodes from their string representation wrapped in a root element."
95
-
96
- if sys.version_info >= (3, 9):
97
- resource_path = resources.files(__package__).joinpath("entities.dtd")
98
- with resources.as_file(resource_path) as dtd_path:
99
- return _elements_from_strings(dtd_path, items)
100
- else:
101
- with resources.path(__package__, "entities.dtd") as dtd_path:
102
- return _elements_from_strings(dtd_path, items)
103
-
104
-
105
- _languages = [
106
- "abap",
107
- "actionscript3",
108
- "ada",
109
- "applescript",
110
- "arduino",
111
- "autoit",
112
- "bash",
113
- "c",
114
- "clojure",
115
- "coffeescript",
116
- "coldfusion",
117
- "cpp",
118
- "csharp",
119
- "css",
120
- "cuda",
121
- "d",
122
- "dart",
123
- "delphi",
124
- "diff",
125
- "elixir",
126
- "erlang",
127
- "fortran",
128
- "foxpro",
129
- "go",
130
- "graphql",
131
- "groovy",
132
- "haskell",
133
- "haxe",
134
- "html",
135
- "java",
136
- "javafx",
137
- "javascript",
138
- "json",
139
- "jsx",
140
- "julia",
141
- "kotlin",
142
- "livescript",
143
- "lua",
144
- "mathematica",
145
- "matlab",
146
- "objectivec",
147
- "objectivej",
148
- "ocaml",
149
- "octave",
150
- "pascal",
151
- "perl",
152
- "php",
153
- "powershell",
154
- "prolog",
155
- "puppet",
156
- "python",
157
- "qml",
158
- "r",
159
- "racket",
160
- "rst",
161
- "ruby",
162
- "rust",
163
- "sass",
164
- "scala",
165
- "scheme",
166
- "shell",
167
- "smalltalk",
168
- "splunk",
169
- "sql",
170
- "standardml",
171
- "swift",
172
- "tcl",
173
- "tex",
174
- "tsx",
175
- "typescript",
176
- "vala",
177
- "vb",
178
- "verilog",
179
- "vhdl",
180
- "xml",
181
- "xquery",
182
- "yaml",
183
- ]
184
-
185
-
186
- @dataclass
187
- class ConfluencePageMetadata:
188
- domain: str
189
- base_path: str
190
- page_id: str
191
- space_key: str
192
- title: str
193
-
194
-
195
- class NodeVisitor:
196
- def visit(self, node: ET._Element) -> None:
197
- if len(node) < 1:
198
- return
199
-
200
- for index in range(len(node)):
201
- source = node[index]
202
- target = self.transform(source)
203
- if target is not None:
204
- node[index] = target
205
- else:
206
- self.visit(source)
207
-
208
- def transform(self, child: ET._Element) -> Optional[ET._Element]:
209
- pass
210
-
211
-
212
- @dataclass
213
- class ConfluenceConverterOptions:
214
- """
215
- Options for converting an HTML tree into Confluence storage format.
216
-
217
- :param ignore_invalid_url: When true, ignore invalid URLs in input, emit a warning and replace the anchor with
218
- plain text; when false, raise an exception.
219
- """
220
-
221
- ignore_invalid_url: bool = False
222
-
223
-
224
- class ConfluenceStorageFormatConverter(NodeVisitor):
225
- "Transforms a plain HTML tree into the Confluence storage format."
226
-
227
- options: ConfluenceConverterOptions
228
- path: pathlib.Path
229
- base_path: pathlib.Path
230
- links: List[str]
231
- images: List[str]
232
- page_metadata: Dict[pathlib.Path, ConfluencePageMetadata]
233
-
234
- def __init__(
235
- self,
236
- options: ConfluenceConverterOptions,
237
- path: pathlib.Path,
238
- page_metadata: Dict[pathlib.Path, ConfluencePageMetadata],
239
- ) -> None:
240
- super().__init__()
241
- self.options = options
242
- self.path = path
243
- self.base_path = path.parent
244
- self.links = []
245
- self.images = []
246
- self.page_metadata = page_metadata
247
-
248
- def _transform_link(self, anchor: ET._Element) -> None:
249
- url = anchor.attrib["href"]
250
- if is_absolute_url(url):
251
- return
252
-
253
- LOGGER.debug(f"found link {url} relative to {self.path}")
254
- relative_url: ParseResult = urlparse(url)
255
-
256
- if (
257
- not relative_url.scheme
258
- and not relative_url.netloc
259
- and not relative_url.path
260
- and not relative_url.params
261
- and not relative_url.query
262
- ):
263
- LOGGER.debug(f"found local URL: {url}")
264
- anchor.attrib["href"] = url
265
- return
266
-
267
- # convert the relative URL to absolute URL based on the base path value, then look up
268
- # the absolute path in the page metadata dictionary to discover the relative path
269
- # within Confluence that should be used
270
- absolute_path = (self.base_path / relative_url.path).absolute()
271
- if not str(absolute_path).startswith(str(self.base_path)):
272
- msg = f"relative URL {url} points to outside base path: {self.base_path}"
273
- if self.options.ignore_invalid_url:
274
- LOGGER.warning(msg)
275
- anchor.attrib.pop("href")
276
- return
277
- else:
278
- raise DocumentError(msg)
279
-
280
- relative_path = os.path.relpath(absolute_path, self.base_path)
281
-
282
- link_metadata = self.page_metadata.get(absolute_path)
283
- if link_metadata is None:
284
- msg = f"unable to find matching page for URL: {url}"
285
- if self.options.ignore_invalid_url:
286
- LOGGER.warning(msg)
287
- anchor.attrib.pop("href")
288
- return
289
- else:
290
- raise DocumentError(msg)
291
-
292
- LOGGER.debug(
293
- f"found link to page {relative_path} with metadata: {link_metadata}"
294
- )
295
- self.links.append(url)
296
-
297
- components = ParseResult(
298
- scheme="https",
299
- netloc=link_metadata.domain,
300
- path=f"{link_metadata.base_path}spaces/{link_metadata.space_key}/pages/{link_metadata.page_id}/{link_metadata.title}",
301
- params="",
302
- query="",
303
- fragment=relative_url.fragment,
304
- )
305
- transformed_url = urlunparse(components)
306
-
307
- LOGGER.debug(f"transformed relative URL: {url} to URL: {transformed_url}")
308
- anchor.attrib["href"] = transformed_url
309
-
310
- def _transform_image(self, image: ET._Element) -> ET._Element:
311
- path: str = image.attrib["src"]
312
-
313
- # prefer PNG over SVG; Confluence displays SVG in wrong size, and text labels are truncated
314
- if path and is_relative_url(path):
315
- relative_path = pathlib.Path(path)
316
- if (
317
- relative_path.suffix == ".svg"
318
- and (self.base_path / relative_path.with_suffix(".png")).exists()
319
- ):
320
- path = str(relative_path.with_suffix(".png"))
321
-
322
- self.images.append(path)
323
- caption = image.attrib["alt"]
324
- return AC(
325
- "image",
326
- {
327
- ET.QName(namespaces["ac"], "align"): "center",
328
- ET.QName(namespaces["ac"], "layout"): "center",
329
- },
330
- RI(
331
- "attachment",
332
- {ET.QName(namespaces["ri"], "filename"): attachment_name(path)},
333
- ),
334
- AC("caption", HTML.p(caption)),
335
- )
336
-
337
- def _transform_block(self, code: ET._Element) -> ET._Element:
338
- language = code.attrib.get("class")
339
- if language:
340
- m = re.match("^language-(.*)$", language)
341
- if m:
342
- language = m.group(1)
343
- else:
344
- language = "none"
345
- if language not in _languages:
346
- language = "none"
347
- content: str = code.text or ""
348
- content = content.rstrip()
349
- return AC(
350
- "structured-macro",
351
- {
352
- ET.QName(namespaces["ac"], "name"): "code",
353
- ET.QName(namespaces["ac"], "schema-version"): "1",
354
- },
355
- AC("parameter", {ET.QName(namespaces["ac"], "name"): "theme"}, "Midnight"),
356
- AC("parameter", {ET.QName(namespaces["ac"], "name"): "language"}, language),
357
- AC(
358
- "parameter", {ET.QName(namespaces["ac"], "name"): "linenumbers"}, "true"
359
- ),
360
- AC("plain-text-body", ET.CDATA(content)),
361
- )
362
-
363
- def _transform_toc(self, code: ET._Element) -> ET._Element:
364
- return AC(
365
- "structured-macro",
366
- {
367
- ET.QName(namespaces["ac"], "name"): "toc",
368
- ET.QName(namespaces["ac"], "schema-version"): "1",
369
- },
370
- AC("parameter", {ET.QName(namespaces["ac"], "name"): "outline"}, "clear"),
371
- AC("parameter", {ET.QName(namespaces["ac"], "name"): "style"}, "default"),
372
- )
373
-
374
- def _transform_admonition(self, elem: ET._Element) -> ET._Element:
375
- """
376
- Creates an info, tip, note or warning panel.
377
-
378
- Transforms [Python-Markdown admonition](https://python-markdown.github.io/extensions/admonition/) syntax
379
- into Confluence structured macro syntax.
380
- """
381
-
382
- # <div class="admonition note">
383
- class_list = elem.attrib.get("class", "").split(" ")
384
- class_name: Optional[str] = None
385
- if "info" in class_list:
386
- class_name = "info"
387
- elif "tip" in class_list:
388
- class_name = "tip"
389
- elif "note" in class_list:
390
- class_name = "note"
391
- elif "warning" in class_list:
392
- class_name = "warning"
393
-
394
- if class_name is None:
395
- raise DocumentError(f"unsupported admonition label: {class_list}")
396
-
397
- for e in elem:
398
- self.visit(e)
399
-
400
- # <p class="admonition-title">Note</p>
401
- if "admonition-title" in elem[0].attrib.get("class", "").split(" "):
402
- content = [
403
- AC(
404
- "parameter",
405
- {ET.QName(namespaces["ac"], "name"): "title"},
406
- elem[0].text or "",
407
- ),
408
- AC("rich-text-body", {}, *list(elem[1:])),
409
- ]
410
- else:
411
- content = [AC("rich-text-body", {}, *list(elem))]
412
-
413
- return AC(
414
- "structured-macro",
415
- {
416
- ET.QName(namespaces["ac"], "name"): class_name,
417
- ET.QName(namespaces["ac"], "schema-version"): "1",
418
- },
419
- *content,
420
- )
421
-
422
- def transform(self, child: ET._Element) -> Optional[ET._Element]:
423
- # normalize line breaks to regular space in element text
424
- if child.text:
425
- text: str = child.text
426
- child.text = text.replace("\n", " ")
427
- if child.tail:
428
- tail: str = child.tail
429
- child.tail = tail.replace("\n", " ")
430
-
431
- # <p><img src="..." /></p>
432
- if child.tag == "p" and len(child) == 1 and child[0].tag == "img":
433
- return self._transform_image(child[0])
434
-
435
- # <p>[[_TOC_]]</p>
436
- # <p>[TOC]</p>
437
- elif child.tag == "p" and "".join(child.itertext()) in ["[[TOC]]", "[TOC]"]:
438
- return self._transform_toc(child)
439
-
440
- # <div class="admonition note">
441
- # <p class="admonition-title">Note</p>
442
- # <p>...</p>
443
- # </div>
444
- #
445
- # --- OR ---
446
- #
447
- # <div class="admonition note">
448
- # <p>...</p>
449
- # </div>
450
- elif child.tag == "div" and "admonition" in child.attrib.get("class", ""):
451
- return self._transform_admonition(child)
452
-
453
- # <img src="..." alt="..." />
454
- elif child.tag == "img":
455
- return self._transform_image(child)
456
-
457
- # <a href="..."> ... </a>
458
- elif child.tag == "a":
459
- self._transform_link(child)
460
- return None
461
-
462
- # <pre><code class="language-java"> ... </code></pre>
463
- elif child.tag == "pre" and len(child) == 1 and child[0].tag == "code":
464
- return self._transform_block(child[0])
465
-
466
- return None
467
-
468
-
469
class ConfluenceStorageFormatCleaner(NodeVisitor):
    "Strips attributes that change between saves from a Confluence storage format XHTML document."

    def transform(self, child: ET._Element) -> Optional[ET._Element]:
        "Drops the volatile attributes of a node in place; never replaces the node."

        volatile_attributes = (
            ("ac", "macro-id"),
            ("ri", "version-at-save"),
        )
        for prefix, local_name in volatile_attributes:
            child.attrib.pop(ET.QName(namespaces[prefix], local_name), None)
        return None
476
-
477
-
478
class DocumentError(RuntimeError):
    "Raised when a document cannot be converted, e.g. due to an invalid link or an unsupported admonition."
480
-
481
-
482
def extract_value(pattern: str, string: str) -> Tuple[Optional[str], str]:
    """
    Removes the first match of a pattern from a string and returns the text captured by group 1.

    The pattern is matched with the `re.ASCII` flag.

    :param pattern: Regular expression with at least one capture group.
    :param string: Text to search.
    :returns: A tuple of the captured value (`None` if there is no match) and the text with the match removed.
    """

    # flags are passed by keyword; passing `count` and `flags` to `re.sub` positionally is deprecated
    match = re.search(pattern, string, flags=re.ASCII)
    if match is None:
        return None, string

    remainder = string[: match.start()] + string[match.end() :]
    return match.group(1), remainder
492
-
493
-
494
@dataclass
class ConfluenceQualifiedID:
    """
    Identifies a Confluence page, optionally together with the space it belongs to.

    :param page_id: Page identifier; `extract_qualified_id` reads it as a string of digits from a
        `confluence-page-id` comment.
    :param space_key: Space key; `extract_qualified_id` reads it from a `confluence-space-key` comment,
        or leaves it as `None` when the comment is absent.
    """

    page_id: str
    space_key: Optional[str] = None
498
-
499
-
500
def extract_qualified_id(string: str) -> Tuple[Optional[ConfluenceQualifiedID], str]:
    """
    Reads the Confluence page ID and (optional) space key from comments embedded in a document.

    :param string: Document text to scan.
    :returns: The qualified ID (`None` if there is no page ID comment) and the text with the
        recognized comments removed.
    """

    page_id_pattern = r"<!--\s+confluence-page-id:\s*(\d+)\s+-->"
    space_key_pattern = r"<!--\s+confluence-space-key:\s*(\S+)\s+-->"

    page_id, remainder = extract_value(page_id_pattern, string)
    if page_id is None:
        # the space key is only looked up when a page ID is present
        return None, remainder

    space_key, remainder = extract_value(space_key_pattern, remainder)
    qualified_id = ConfluenceQualifiedID(page_id, space_key)
    return qualified_id, remainder
512
-
513
-
514
@dataclass
class ConfluenceDocumentOptions:
    """
    Options that control the generated page content.

    :param ignore_invalid_url: When true, ignore invalid URLs in input, emit a warning and replace the anchor with
        plain text; when false, raise an exception.
    :param generated_by: Text to display in an *info* panel at the top of the page; `None` omits the panel.
        A `generated-by` comment in the document takes precedence over this text.
    :param root_page_id: Not read by the code visible in this module section; presumably identifies the
        Confluence page that acts as the root for published pages (to be confirmed against callers).
    """

    ignore_invalid_url: bool = False
    generated_by: Optional[str] = "This page has been generated with a tool."
    root_page_id: Optional[str] = None
527
-
528
-
529
class ConfluenceDocument:
    "A Markdown document converted into Confluence storage format."

    id: ConfluenceQualifiedID
    links: List[str]
    images: List[str]

    options: ConfluenceDocumentOptions
    root: ET._Element

    def __init__(
        self,
        path: pathlib.Path,
        options: ConfluenceDocumentOptions,
        page_metadata: Dict[pathlib.Path, ConfluencePageMetadata],
    ) -> None:
        """
        Reads a Markdown file and converts it into a Confluence storage format element tree.

        :param path: Path to the Markdown file.
        :param options: Options that control the generated page content.
        :param page_metadata: Maps absolute paths of Markdown files to data about their Confluence pages.
        :raises ValueError: If the document has no Confluence page ID comment.
        """

        self.options = options
        path = path.absolute()
        text = path.read_text(encoding="utf-8")

        # the page ID comment is mandatory
        qualified_id, text = extract_qualified_id(text)
        if qualified_id is None:
            raise ValueError("missing Confluence page ID")
        self.id = qualified_id

        # a 'generated-by' comment overrides the prompt text configured in the options
        generated_by_tag, text = extract_value(
            r"<!--\s+generated-by:\s*(.*)\s+-->", text
        )

        # front-matter is stripped from the text; its content is not used
        _, text = extract_value(r"(?ms)\A---$(.+?)^---$", text)

        html = markdown_to_html(text)

        content = [html]
        if self.options.generated_by is not None:
            if generated_by_tag is None:
                generated_by = self.options.generated_by
            else:
                generated_by = generated_by_tag

            # the prompt is shown in an info panel that precedes the document body
            content = [
                '<ac:structured-macro ac:name="info" ac:schema-version="1">',
                f"<ac:rich-text-body><p>{generated_by}</p></ac:rich-text-body>",
                "</ac:structured-macro>",
                html,
            ]
        self.root = elements_from_strings(content)

        converter_options = ConfluenceConverterOptions(
            ignore_invalid_url=self.options.ignore_invalid_url
        )
        converter = ConfluenceStorageFormatConverter(
            converter_options, path, page_metadata
        )
        converter.visit(self.root)
        self.links = converter.links
        self.images = converter.images

    def xhtml(self) -> str:
        "Returns the converted document as a Confluence storage format string."

        return _content_to_string(self.root)
595
-
596
-
597
def attachment_name(name: str) -> str:
    """
    Produces a name that is safe to use for attachment uploads.

    Every character outside the allowed set is replaced with an underscore. Allowed characters:
    * Alphanumeric characters: 0-9, a-z, A-Z
    * Special characters: hyphen (-), underscore (_), period (.)
    """

    allowed = frozenset(
        "0123456789"
        "abcdefghijklmnopqrstuvwxyz"
        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
        "-_."
    )
    return "".join(ch if ch in allowed else "_" for ch in name)
607
-
608
-
609
def sanitize_confluence(html: str) -> str:
    "Returns a copy of a Confluence storage format XHTML document with volatile attributes removed."

    if html:
        root = elements_from_strings([html])
        cleaner = ConfluenceStorageFormatCleaner()
        cleaner.visit(root)
        return _content_to_string(root)

    # empty input has nothing to parse
    return ""
618
-
619
-
620
def _content_to_string(root: ET._Element) -> str:
    """
    Serializes the children of a wrapper root element into a string.

    :param root: The `<root>` element that wraps the content.
    :returns: The serialized content without the enclosing `<root>` tags.
    :raises ValueError: If the serialized document does not have the expected `<root>` wrapper.
    """

    serialized = ET.tostring(root, encoding="utf8", method="xml")
    xml = serialized.decode("utf8")

    # strip the wrapper element, including its namespace declarations
    m = re.match(r"^<root\s+[^>]*>(.*)</root>\s*$", xml, re.DOTALL)
    if m is None:
        raise ValueError("expected: Confluence content")
    return m.group(1)
1
+ # mypy: disable-error-code="dict-item"
2
+
3
+ import hashlib
4
+ import importlib.resources as resources
5
+ import logging
6
+ import os.path
7
+ import re
8
+ import sys
9
+ import uuid
10
+ from dataclasses import dataclass
11
+ from pathlib import Path
12
+ from typing import Dict, List, Literal, Optional, Tuple
13
+ from urllib.parse import ParseResult, urlparse, urlunparse
14
+
15
+ import lxml.etree as ET
16
+ import markdown
17
+ from lxml.builder import ElementMaker
18
+
19
+ from . import mermaid
20
+
21
+ namespaces = {
22
+ "ac": "http://atlassian.com/content",
23
+ "ri": "http://atlassian.com/resource/identifier",
24
+ }
25
+ for key, value in namespaces.items():
26
+ ET.register_namespace(key, value)
27
+
28
+ HTML = ElementMaker()
29
+ AC = ElementMaker(namespace=namespaces["ac"])
30
+ RI = ElementMaker(namespace=namespaces["ri"])
31
+
32
+ LOGGER = logging.getLogger(__name__)
33
+
34
+
35
+ class ParseError(RuntimeError):
36
+ pass
37
+
38
+
39
def starts_with_any(text: str, prefixes: List[str]) -> bool:
    """
    True if text starts with any of the listed prefixes.

    :param text: The string to examine.
    :param prefixes: Candidate prefixes; an empty list never matches.
    :returns: True if at least one prefix matches the start of the text.
    """

    # `str.startswith` accepts a tuple of candidates and tests them in a single call
    return text.startswith(tuple(prefixes))
46
+
47
+
48
def is_absolute_url(url: str) -> bool:
    "True if the URL has a scheme or a network location."

    parsed = urlparse(url)
    if parsed.scheme:
        return True
    return bool(parsed.netloc)
51
+
52
+
53
def is_relative_url(url: str) -> bool:
    "True if the URL has neither a scheme nor a network location."

    parsed = urlparse(url)
    return not (parsed.scheme or parsed.netloc)
56
+
57
+
58
def markdown_to_html(content: str) -> str:
    "Converts Markdown text into HTML with a fixed set of Python-Markdown extensions enabled."

    enabled_extensions = [
        "admonition",
        "markdown.extensions.tables",
        "markdown.extensions.fenced_code",
        "pymdownx.magiclink",
        "pymdownx.tilde",
        "sane_lists",
        "md_in_html",
    ]
    return markdown.markdown(content, extensions=enabled_extensions)
71
+
72
+
73
def _elements_from_strings(dtd_path: Path, items: List[str]) -> ET._Element:
    """
    Creates a fragment of several XML nodes from their string representation wrapped in a root element.

    :param dtd_path: Path to a DTD document that defines entities like &cent; or &copy;.
    :param items: Strings to parse into XML fragments.
    :returns: An XML document as an element tree.
    :raises ParseError: If the assembled document cannot be parsed as XML.
    """

    parser = ET.XMLParser(
        remove_blank_text=True,
        strip_cdata=False,
        load_dtd=True,
    )

    # declare the Confluence namespace prefixes on the wrapper element
    ns_attr_list = "".join(
        f' xmlns:{key}="{value}"' for key, value in namespaces.items()
    )

    data = [
        '<?xml version="1.0"?>',
        # the DOCTYPE declaration and the opening root tag are adjacent literals that form one item
        f'<!DOCTYPE ac:confluence PUBLIC "-//Atlassian//Confluence 4 Page//EN" "{dtd_path}">'
        f"<root{ns_attr_list}>",
    ]
    data.extend(items)
    data.append("</root>")

    try:
        return ET.fromstringlist(data, parser=parser)
    except ET.XMLSyntaxError as e:
        # chain explicitly so that the original syntax error is reported as the direct cause
        raise ParseError(e) from e
104
+
105
+
106
def elements_from_strings(items: List[str]) -> ET._Element:
    "Parses several strings into XML nodes wrapped in a root element, using the entity DTD bundled with the package."

    # the resource API used to locate the bundled DTD depends on the Python version
    if sys.version_info >= (3, 9):
        dtd_resource = resources.as_file(
            resources.files(__package__).joinpath("entities.dtd")
        )
    else:
        dtd_resource = resources.path(__package__, "entities.dtd")

    with dtd_resource as dtd_path:
        return _elements_from_strings(dtd_path, items)
116
+
117
+
118
def elements_from_string(content: str) -> ET._Element:
    "Creates a fragment of XML nodes from a single string, wrapped in a root element."

    items = [content]
    return elements_from_strings(items)
120
+
121
+
122
# Language identifiers recognized in the class attribute (`language-...`) of a code block.
# `_transform_block` replaces any identifier that is not listed here with "none".
# The entry "mermaid" is special: `_transform_block` hands such blocks over to
# `_transform_mermaid` instead of emitting a code macro.
_languages = [
    "abap",
    "actionscript3",
    "ada",
    "applescript",
    "arduino",
    "autoit",
    "bash",
    "c",
    "clojure",
    "coffeescript",
    "coldfusion",
    "cpp",
    "csharp",
    "css",
    "cuda",
    "d",
    "dart",
    "delphi",
    "diff",
    "elixir",
    "erlang",
    "fortran",
    "foxpro",
    "go",
    "graphql",
    "groovy",
    "haskell",
    "haxe",
    "html",
    "java",
    "javafx",
    "javascript",
    "json",
    "jsx",
    "julia",
    "kotlin",
    "livescript",
    "lua",
    "mermaid",
    "mathematica",
    "matlab",
    "objectivec",
    "objectivej",
    "ocaml",
    "octave",
    "pascal",
    "perl",
    "php",
    "powershell",
    "prolog",
    "puppet",
    "python",
    "qml",
    "r",
    "racket",
    "rst",
    "ruby",
    "rust",
    "sass",
    "scala",
    "scheme",
    "shell",
    "smalltalk",
    "splunk",
    "sql",
    "standardml",
    "swift",
    "tcl",
    "tex",
    "tsx",
    "typescript",
    "vala",
    "vb",
    "verilog",
    "vhdl",
    "xml",
    "xquery",
    "yaml",
]
202
+
203
+
204
@dataclass
class ConfluencePageMetadata:
    """
    Data about a Confluence page that `_transform_link` uses to build an absolute URL to the page.

    :param domain: Used as the network location of the generated URL.
    :param base_path: Leading part of the URL path; the remainder of the path is appended to it directly,
        without inserting a separator.
    :param page_id: Page identifier embedded in the generated URL.
    :param space_key: Space key embedded in the generated URL when Web UI links are not requested.
    :param title: Page title, appended as the last path segment when Web UI links are not requested.
    """

    domain: str
    base_path: str
    page_id: str
    space_key: str
    title: str
211
+
212
+
213
class NodeVisitor:
    "Walks an element tree, giving sub-classes the chance to replace nodes."

    def visit(self, node: ET._Element) -> None:
        """
        Recursively visits all descendants of this node.

        Each child is passed to `transform`. A child for which a replacement is returned is swapped for the
        replacement (whose sub-tree is not visited); otherwise, the walk continues with the child's own children.
        """

        # index-based loop, because children may be replaced in place during the walk
        for position in range(len(node)):
            original = node[position]
            replacement = self.transform(original)
            if replacement is None:
                self.visit(original)
            else:
                node[position] = replacement

    def transform(self, child: ET._Element) -> Optional[ET._Element]:
        "Returns the element to substitute for the child, or `None` to keep the child. Keeps all nodes by default."

        return None
230
+
231
+
232
def title_to_identifier(title: str) -> str:
    """
    Converts a section heading title to a GitHub-style Markdown same-page anchor.

    The title is stripped of surrounding whitespace and converted to lowercase, characters other than ASCII
    letters, digits, hyphens, underscores and spaces are dropped, and each space is replaced with a hyphen.

    :param title: Text of the section heading.
    :returns: Identifier to use as the anchor name.
    """

    s = title.strip().lower()
    # hyphens and underscores are kept such that "Step-by-step" maps to "step-by-step", not "stepbystep"
    s = re.sub(r"[^ A-Za-z0-9_\-]", "", s)
    return s.replace(" ", "-")
239
+
240
+
241
@dataclass
class ConfluenceConverterOptions:
    """
    Options for converting an HTML tree into Confluence storage format.

    :param ignore_invalid_url: When true, ignore invalid URLs in input, emit a warning and replace the anchor with
        plain text; when false, raise an exception.
    :param heading_anchors: When true, emit a structured macro *anchor* for each section heading using GitHub
        conversion rules for the identifier.
    :param render_mermaid: Whether to pre-render Mermaid diagrams into PNG/SVG images.
    :param diagram_output_format: Target image format for diagrams.
    :param webui_links: When true, convert relative URLs to Confluence Web UI links.
    """

    ignore_invalid_url: bool = False
    heading_anchors: bool = False
    render_mermaid: bool = False
    diagram_output_format: Literal["png", "svg"] = "png"
    webui_links: bool = False
260
+
261
+
262
+ class ConfluenceStorageFormatConverter(NodeVisitor):
263
+ "Transforms a plain HTML tree into the Confluence storage format."
264
+
265
+ options: ConfluenceConverterOptions
266
+ path: Path
267
+ base_path: Path
268
+ links: List[str]
269
+ images: List[str]
270
+ embedded_images: Dict[str, bytes]
271
+ page_metadata: Dict[Path, ConfluencePageMetadata]
272
+
273
def __init__(
    self,
    options: ConfluenceConverterOptions,
    path: Path,
    page_metadata: Dict[Path, ConfluencePageMetadata],
) -> None:
    """
    Creates a converter for a single document.

    :param options: Options that control the conversion.
    :param path: Path to the document being converted; its parent directory becomes the base path.
    :param page_metadata: Maps paths of documents to data about their Confluence pages.
    """

    super().__init__()

    # inputs
    self.options, self.path, self.page_metadata = options, path, page_metadata
    self.base_path = path.parent

    # results collected while the tree is visited
    self.links = []
    self.images = []
    self.embedded_images = {}
287
+
288
def _transform_heading(self, heading: ET._Element) -> None:
    """
    Inserts an *anchor* structured macro at the start of a section heading.

    The anchor name is derived from the heading text with `title_to_identifier`.

    :param heading: The heading element to update in place.
    """

    # capture the title before descendants are processed
    title = "".join(heading.itertext()).strip()

    for child in heading:
        self.visit(child)

    ac_ns = namespaces["ac"]
    macro_attributes = {
        ET.QName(ac_ns, "name"): "anchor",
        ET.QName(ac_ns, "schema-version"): "1",
    }
    # the anchor name is passed in a parameter whose own name is the empty string
    anchor_name = AC(
        "parameter",
        {ET.QName(ac_ns, "name"): ""},
        title_to_identifier(title),
    )
    anchor = AC("structured-macro", macro_attributes, anchor_name)

    # the macro becomes the first child; text that used to lead the heading now follows the macro
    heading.insert(0, anchor)
    anchor.tail = heading.text
    heading.text = None
311
+
312
def _transform_link(self, anchor: ET._Element) -> None:
    """
    Rewrites a relative link to another document into an absolute URL of the matching Confluence page.

    Anchors without a `href` attribute, absolute URLs and same-page fragment links are left unchanged.
    Each relative URL that is resolved successfully is recorded in `self.links`.

    :param anchor: The `<a>` element to update in place.
    :raises DocumentError: If the link target is outside the base path or has no matching page metadata,
        unless invalid URLs are ignored, in which case a warning is logged and `href` is removed.
    """

    url = anchor.attrib.get("href")
    if url is None:
        # an anchor element without a target has nothing to rewrite
        return
    if is_absolute_url(url):
        return

    LOGGER.debug(f"found link {url} relative to {self.path}")
    relative_url: ParseResult = urlparse(url)

    # scheme and network location are known to be empty at this point; a URL with nothing but
    # a fragment refers to a location on the same page
    if not relative_url.path and not relative_url.params and not relative_url.query:
        LOGGER.debug(f"found local URL: {url}")
        return

    def reject(msg: str) -> None:
        "Raises an error for an invalid link, or downgrades it to a warning and strips the link target."

        if not self.options.ignore_invalid_url:
            raise DocumentError(msg)
        LOGGER.warning(msg)
        anchor.attrib.pop("href")

    # convert the relative URL to absolute URL based on the base path value, then look up
    # the absolute path in the page metadata dictionary to discover the relative path
    # within Confluence that should be used
    # NOTE(review): `absolute()` does not collapse `..` segments and the check below compares
    # string prefixes, so paths such as `../other.md` are not detected here -- confirm intent
    absolute_path = (self.base_path / relative_url.path).absolute()
    if not str(absolute_path).startswith(str(self.base_path)):
        reject(f"relative URL {url} points to outside base path: {self.base_path}")
        return

    relative_path = os.path.relpath(absolute_path, self.base_path)

    link_metadata = self.page_metadata.get(absolute_path)
    if link_metadata is None:
        reject(f"unable to find matching page for URL: {url}")
        return

    LOGGER.debug(
        f"found link to page {relative_path} with metadata: {link_metadata}"
    )
    self.links.append(url)

    if self.options.webui_links:
        # the page ID belongs to the query component of the URL, not to the path
        page_path = f"{link_metadata.base_path}pages/viewpage.action"
        page_query = f"pageId={link_metadata.page_id}"
    else:
        page_path = f"{link_metadata.base_path}spaces/{link_metadata.space_key}/pages/{link_metadata.page_id}/{link_metadata.title}"
        page_query = ""

    components = ParseResult(
        scheme="https",
        netloc=link_metadata.domain,
        path=page_path,
        params="",
        query=page_query,
        fragment=relative_url.fragment,
    )
    transformed_url = urlunparse(components)

    LOGGER.debug(f"transformed relative URL: {url} to URL: {transformed_url}")
    anchor.attrib["href"] = transformed_url
378
+
379
def _transform_image(self, image: ET._Element) -> ET._Element:
    """
    Creates a Confluence image element that references an attachment.

    The image path is recorded in `self.images`. A missing `alt` attribute yields an empty caption.

    :param image: The `<img>` element to convert.
    :returns: An `ac:image` element to substitute for the original element.
    :raises DocumentError: If the image has no `src` attribute.
    """

    src = image.attrib.get("src")
    if src is None:
        # report a conversion error instead of failing with a bare `KeyError`
        raise DocumentError("image lacks `src` attribute")
    path: str = src

    # prefer PNG over SVG; Confluence displays SVG in wrong size, and text labels are truncated
    if path and is_relative_url(path):
        relative_path = Path(path)
        if (
            relative_path.suffix == ".svg"
            and (self.base_path / relative_path.with_suffix(".png")).exists()
        ):
            path = str(relative_path.with_suffix(".png"))

    self.images.append(path)

    # `alt` may be absent, e.g. for images written as raw HTML
    caption = image.attrib.get("alt", "")
    return AC(
        "image",
        {
            ET.QName(namespaces["ac"], "align"): "center",
            ET.QName(namespaces["ac"], "layout"): "center",
        },
        RI(
            "attachment",
            {ET.QName(namespaces["ri"], "filename"): attachment_name(path)},
        ),
        AC("caption", HTML.p(caption)),
    )
405
+
406
def _transform_block(self, code: ET._Element) -> ET._Element:
    """
    Creates a *code* structured macro from a code block.

    The language is taken from a class attribute of the form `language-...`; languages that are not
    recognized map to "none". Mermaid blocks are delegated to `_transform_mermaid`.

    :param code: The `<code>` element to convert.
    :returns: The element to substitute for the code block.
    """

    css_class = code.attrib.get("class") or ""
    m = re.match("^language-(.*)$", css_class)
    language = m.group(1) if m else "none"
    if language not in _languages:
        language = "none"

    content: str = (code.text or "").rstrip()

    if language == "mermaid":
        return self._transform_mermaid(content)

    ac_ns = namespaces["ac"]

    def parameter(name: str, value: str) -> ET._Element:
        "Creates a named macro parameter."

        return AC("parameter", {ET.QName(ac_ns, "name"): name}, value)

    macro_attributes = {
        ET.QName(ac_ns, "name"): "code",
        ET.QName(ac_ns, "schema-version"): "1",
    }
    return AC(
        "structured-macro",
        macro_attributes,
        parameter("theme", "Midnight"),
        parameter("language", language),
        parameter("linenumbers", "true"),
        AC("plain-text-body", ET.CDATA(content)),
    )
445
+
446
def _transform_mermaid(self, content: str) -> ET._Element:
    """
    Transforms a Mermaid diagram code block.

    When rendering is enabled, the diagram is rendered into an image that is stored in `self.embedded_images`
    under a file name derived from the MD5 hash of the image data, and an image element that references the
    file as an attachment is returned. Otherwise, a *macro-diagram* structured macro that carries the diagram
    source is returned.

    :param content: Source text of the diagram.
    :returns: The element to substitute for the code block.
    """

    ac_ns = namespaces["ac"]

    if not self.options.render_mermaid:

        def parameter(name: str, *value: str) -> ET._Element:
            "Creates a named macro parameter, which is empty unless a value is given."

            return AC("parameter", {ET.QName(ac_ns, "name"): name}, *value)

        # fresh random identifiers are generated for each macro
        local_id = str(uuid.uuid4())
        macro_id = str(uuid.uuid4())
        macro_attributes = {
            ET.QName(ac_ns, "name"): "macro-diagram",
            ET.QName(ac_ns, "schema-version"): "1",
            ET.QName(ac_ns, "data-layout"): "default",
            ET.QName(ac_ns, "local-id"): local_id,
            ET.QName(ac_ns, "macro-id"): macro_id,
        }
        return AC(
            "structured-macro",
            macro_attributes,
            parameter("sourceType", "MacroBody"),
            parameter("attachmentPageId"),
            parameter("syntax", "Mermaid"),
            parameter("attachmentId"),
            parameter("url"),
            AC("plain-text-body", ET.CDATA(content)),
        )

    output_format = self.options.diagram_output_format
    image_data = mermaid.render(content, output_format)

    # the file name depends on the image data only
    image_hash = hashlib.md5(image_data).hexdigest()
    image_filename = attachment_name(f"embedded_{image_hash}.{output_format}")
    self.embedded_images[image_filename] = image_data

    image_attributes = {
        ET.QName(ac_ns, "align"): "center",
        ET.QName(ac_ns, "layout"): "center",
    }
    attachment = RI(
        "attachment",
        {ET.QName(namespaces["ri"], "filename"): image_filename},
    )
    return AC("image", image_attributes, attachment)
500
+
501
def _transform_toc(self, code: ET._Element) -> ET._Element:
    """
    Creates a table of contents.

    Emits the Confluence structured macro *toc* with fixed `outline` and `style` parameters.

    :param code: Element that holds the table of contents placeholder; its content is not inspected.
    :returns: The *toc* structured macro element.
    """

    ac_ns = namespaces["ac"]
    settings = [("outline", "clear"), ("style", "default")]
    parameters = [
        AC("parameter", {ET.QName(ac_ns, "name"): key}, value)
        for key, value in settings
    ]
    macro_attributes = {
        ET.QName(ac_ns, "name"): "toc",
        ET.QName(ac_ns, "schema-version"): "1",
    }
    return AC("structured-macro", macro_attributes, *parameters)
511
+
512
def _transform_admonition(self, elem: ET._Element) -> ET._Element:
    """
    Creates an info, tip, note or warning panel from a Markdown admonition.

    Transforms [Python-Markdown admonition](https://python-markdown.github.io/extensions/admonition/)
    syntax into one of the Confluence structured macros *info*, *tip*, *note*, or *warning*.

    :param elem: The `<div class="admonition ...">` element to transform.
    :returns: Structured macro element that takes the place of the `<div>`.
    :raises DocumentError: Raised when the class list has none of the supported labels.
    """

    # <div class="admonition note">
    class_list = elem.attrib.get("class", "").split(" ")

    # labels are probed in a fixed order; the first one present wins
    class_name: Optional[str] = next(
        (label for label in ("info", "tip", "note", "warning") if label in class_list),
        None,
    )
    if class_name is None:
        raise DocumentError(f"unsupported admonition label: {class_list}")

    for e in elem:
        self.visit(e)

    children = list(elem)

    # <p class="admonition-title">Note</p>
    # an admonition without any child element has no title; guard against indexing an empty element
    has_title = bool(children) and "admonition-title" in children[0].attrib.get(
        "class", ""
    ).split(" ")

    if has_title:
        content = [
            AC(
                "parameter",
                {ET.QName(namespaces["ac"], "name"): "title"},
                children[0].text or "",
            ),
            AC("rich-text-body", {}, *children[1:]),
        ]
    else:
        content = [AC("rich-text-body", {}, *children)]

    return AC(
        "structured-macro",
        {
            ET.QName(namespaces["ac"], "name"): class_name,
            ET.QName(namespaces["ac"], "schema-version"): "1",
        },
        *content,
    )
559
+
560
def _transform_github_alert(self, elem: ET._Element) -> ET._Element:
    """
    Creates a panel from a GitHub alert such as `[!TIP]`.

    Recognizes the marker at the start of the first child's text, maps it to a Confluence panel name, and
    delegates to `_transform_alert` together with the number of leading characters that the marker occupies.

    :param elem: The `<blockquote>` element whose first child carries the alert marker.
    :returns: Structured macro element that takes the place of the `<blockquote>`.
    :raises DocumentError: Raised when the first child has no text, or the alert type is not supported.
    """

    lead_text = elem[0].text
    if lead_text is None:
        raise DocumentError("empty content")

    # GitHub alert type --> Confluence panel macro name
    panels = {
        "NOTE": "note",
        "TIP": "tip",
        "IMPORTANT": "tip",
        "WARNING": "warning",
        "CAUTION": "warning",
    }

    class_name: Optional[str] = None
    skip = 0

    marker = re.match(r"^\[!([A-Z]+)\]\s*", lead_text)
    if marker is not None:
        alert = marker.group(1)
        if alert not in panels:
            raise DocumentError(f"unsupported GitHub alert: {alert}")
        class_name = panels[alert]
        skip = len(marker.group(0))

    return self._transform_alert(elem, class_name, skip)
587
+
588
def _transform_gitlab_alert(self, elem: ET._Element) -> ET._Element:
    """
    Creates a panel from a GitLab alert such as `DISCLAIMER:`.

    Recognizes the keyword at the start of the first child's text, maps it to a Confluence panel name, and
    delegates to `_transform_alert` together with the number of leading characters that the keyword occupies.

    :param elem: The `<blockquote>` element whose first child carries the alert keyword.
    :returns: Structured macro element that takes the place of the `<blockquote>`.
    :raises DocumentError: Raised when the first child has no text, or the alert type is not supported.
    """

    lead_text = elem[0].text
    if lead_text is None:
        raise DocumentError("empty content")

    # GitLab alert keyword --> Confluence panel macro name
    panels = {
        "FLAG": "note",
        "NOTE": "note",
        "WARNING": "warning",
        "DISCLAIMER": "info",
    }

    class_name: Optional[str] = None
    skip = 0

    marker = re.match(r"^(FLAG|NOTE|WARNING|DISCLAIMER):\s*", lead_text)
    if marker is not None:
        alert = marker.group(1)
        if alert not in panels:
            raise DocumentError(f"unsupported GitLab alert: {alert}")
        class_name = panels[alert]
        skip = len(marker.group(0))

    return self._transform_alert(elem, class_name, skip)
613
+
614
def _transform_alert(
    self, elem: ET._Element, class_name: Optional[str], skip: int
) -> ET._Element:
    """
    Creates an info, tip, note or warning panel from a GitHub or GitLab alert.

    Transforms
    [GitHub alert](https://docs.github.com/get-started/writing-on-github/getting-started-with-writing-and-formatting-on-github/basic-writing-and-formatting-syntax#alerts)
    or [GitLab alert](https://docs.gitlab.com/ee/development/documentation/styleguide/#alert-boxes)
    syntax into one of the Confluence structured macros *info*, *tip*, *note*, or *warning*.

    :param elem: The `<blockquote>` element to transform.
    :param class_name: Name of the Confluence macro to emit, or `None` if no alert marker was recognized.
    :param skip: Number of leading characters to strip from the text of the first child.
    :returns: Structured macro element that wraps the children of the `<blockquote>`.
    :raises DocumentError: Raised when the first child has no text, or no alert marker was recognized.
    """

    lead = elem[0]
    if lead.text is None:
        raise DocumentError("empty content")
    if class_name is None:
        raise DocumentError("not an alert")

    for e in elem:
        self.visit(e)

    # drop the alert marker only after descendants have been processed
    lead.text = lead.text[skip:]

    macro_attributes = {
        ET.QName(namespaces["ac"], "name"): class_name,
        ET.QName(namespaces["ac"], "schema-version"): "1",
    }
    body = AC("rich-text-body", {}, *list(elem))
    return AC("structured-macro", macro_attributes, body)
645
+
646
def _transform_section(self, elem: ET._Element) -> ET._Element:
    """
    Creates a collapsed section.

    Transforms GitHub collapsed section syntax (`<details>` with a leading `<summary>`) into the Confluence
    structured macro *expand*. The text of `<summary>` becomes the macro title; the remaining children
    become the macro body.

    :param elem: The `<details>` element to transform.
    :returns: Structured macro element that takes the place of `<details>`.
    :raises DocumentError: Raised when the first child is not `<summary>`, or `<summary>` is followed by text.
    """

    heading = elem[0]
    if heading.tag != "summary":
        raise DocumentError(
            "expected: `<summary>` as first direct child of `<details>`"
        )
    if heading.tail is not None:
        raise DocumentError('expected: attribute `markdown="1"` on `<details>`')

    summary = "".join(heading.itertext()).strip()
    elem.remove(heading)

    # process what is left of the section once the summary has been detached
    self.visit(elem)

    ac_ns = namespaces["ac"]
    title = AC("parameter", {ET.QName(ac_ns, "name"): "title"}, summary)
    body = AC("rich-text-body", {}, *list(elem))
    macro_attributes = {
        ET.QName(ac_ns, "name"): "expand",
        ET.QName(ac_ns, "schema-version"): "1",
    }
    return AC("structured-macro", macro_attributes, title, body)
680
+
681
def transform(self, child: ET._Element) -> Optional[ET._Element]:
    """
    Maps an HTML element produced from Markdown to its Confluence storage format equivalent.

    Rules are tried in a fixed order; the first rule that matches decides the outcome.

    :param child: The element to inspect. Line breaks in its text and tail are replaced with spaces in place.
    :returns: A replacement element, or `None` when the element is kept (possibly after in-place modification).
    """

    # normalize line breaks to regular space in element text
    if child.text:
        text: str = child.text
        child.text = text.replace("\n", " ")
    if child.tail:
        tail: str = child.tail
        child.tail = tail.replace("\n", " ")

    if self.options.heading_anchors:
        # <h1>...</h1>
        # <h2>...</h2> ...
        if re.match(r"^h[1-6]$", child.tag, flags=re.IGNORECASE) is not None:
            self._transform_heading(child)
            return None

    # <p><img src="..." /></p>
    #
    # The paragraph is replaced by the image only if the image is all it holds. A paragraph that mixes text with
    # an image is kept as-is such that its text is not lost; the image is then expected to be picked up by the
    # `<img>` rule below when it is inspected as an element in its own right.
    if (
        child.tag == "p"
        and len(child) == 1
        and child[0].tag == "img"
        and not (child.text or "").strip()
        and not (child[0].tail or "").strip()
    ):
        return self._transform_image(child[0])

    # <p>[[_TOC_]]</p>
    # <p>[TOC]</p>
    #
    # the comparison is made against the concatenated text of the paragraph, with nested markup disregarded
    if child.tag == "p" and "".join(child.itertext()) in ["[[TOC]]", "[TOC]"]:
        return self._transform_toc(child)

    # <div class="admonition note">
    # <p class="admonition-title">Note</p>
    # <p>...</p>
    # </div>
    #
    # --- OR ---
    #
    # <div class="admonition note">
    # <p>...</p>
    # </div>
    if child.tag == "div" and "admonition" in child.attrib.get("class", ""):
        return self._transform_admonition(child)

    # Alerts in GitHub
    # <blockquote>
    # <p>[!TIP] ...</p>
    # </blockquote>
    if (
        child.tag == "blockquote"
        and len(child) > 0
        and child[0].tag == "p"
        and child[0].text is not None
        and child[0].text.startswith("[!")
    ):
        return self._transform_github_alert(child)

    # Alerts in GitLab
    # <blockquote>
    # <p>DISCLAIMER: ...</p>
    # </blockquote>
    if (
        child.tag == "blockquote"
        and len(child) > 0
        and child[0].tag == "p"
        and child[0].text is not None
        and starts_with_any(
            child[0].text, ["FLAG:", "NOTE:", "WARNING:", "DISCLAIMER:"]
        )
    ):
        return self._transform_gitlab_alert(child)

    # <details markdown="1">
    # <summary>...</summary>
    # ...
    # </details>
    if child.tag == "details" and len(child) > 1 and child[0].tag == "summary":
        return self._transform_section(child)

    # <img src="..." alt="..." />
    if child.tag == "img":
        return self._transform_image(child)

    # <a href="..."> ... </a>
    if child.tag == "a":
        # links are updated in place
        self._transform_link(child)
        return None

    # <pre><code class="language-java"> ... </code></pre>
    if child.tag == "pre" and len(child) == 1 and child[0].tag == "code":
        return self._transform_block(child[0])

    return None
768
+
769
+
770
class ConfluenceStorageFormatCleaner(NodeVisitor):
    "Removes volatile attributes from a Confluence storage format XHTML document."

    def transform(self, child: ET._Element) -> Optional[ET._Element]:
        """
        Strips the attributes `ac:macro-id` and `ri:version-at-save` from an element, if present.

        :param child: The element to clean in place.
        :returns: Always `None`; the element is never replaced.
        """

        volatile_attributes = (
            ET.QName(namespaces["ac"], "macro-id"),
            ET.QName(namespaces["ri"], "version-at-save"),
        )
        for attribute in volatile_attributes:
            child.attrib.pop(attribute, None)
        return None
777
+
778
+
779
class DocumentError(RuntimeError):
    "Raised when a document has content that cannot be converted."
781
+
782
+
783
def extract_value(pattern: str, string: str) -> Tuple[Optional[str], str]:
    """
    Extracts the first occurrence of a pattern from a string, and removes the matching text.

    Matching uses ASCII-only character classes.

    :param pattern: Regular expression with at least one capture group.
    :param string: Text to search.
    :returns: Value of the first capture group (or `None` if there is no match), and the text with the
        entire match removed.
    """

    # keyword `flags` avoids passing `count` and `flags` positionally, which is deprecated as of Python 3.13
    match = re.search(pattern, string, flags=re.ASCII)
    if match is None:
        return None, string

    remainder = string[: match.start()] + string[match.end() :]
    return match.group(1), remainder
793
+
794
+
795
@dataclass
class ConfluenceQualifiedID:
    """
    Identifies a Confluence page.

    :param page_id: Confluence page ID.
    :param space_key: Confluence space key, if known.
    """

    page_id: str
    space_key: Optional[str] = None
803
+
804
+
805
def extract_qualified_id(string: str) -> Tuple[Optional[ConfluenceQualifiedID], str]:
    """
    Extracts the Confluence page ID and space key from a Markdown document.

    The space key is looked up only if a page ID has been found.

    :param string: Markdown document text.
    :returns: The qualified ID (or `None` if the text has no page ID), and the text with the matched
        comments removed.
    """

    page_id_pattern = r"<!--\s+confluence-page-id:\s*(\d+)\s+-->"
    space_key_pattern = r"<!--\s+confluence-space-key:\s*(\S+)\s+-->"

    page_id, remainder = extract_value(page_id_pattern, string)
    if page_id is not None:
        # extract Confluence space key
        space_key, remainder = extract_value(space_key_pattern, remainder)
        return ConfluenceQualifiedID(page_id, space_key), remainder

    return None, remainder
819
+
820
+
821
def read_qualified_id(absolute_path: Path) -> Optional[ConfluenceQualifiedID]:
    """
    Reads the Confluence page ID and space key from a Markdown document.

    :param absolute_path: Path to the Markdown file, read as UTF-8.
    :returns: The qualified ID, or `None` if the document has no page ID.
    """

    with open(absolute_path, "r", encoding="utf-8") as stream:
        text = stream.read()

    # the text that remains after extraction is of no interest here
    return extract_qualified_id(text)[0]
829
+
830
+
831
@dataclass
class ConfluenceDocumentOptions:
    """
    Options that control the generated page content.

    :param ignore_invalid_url: When true, ignore invalid URLs in input, emit a warning and replace the anchor with
        plain text; when false, raise an exception.
    :param heading_anchors: When true, emit a structured macro *anchor* for each section heading using GitHub
        conversion rules for the identifier.
    :param generated_by: Text to use as the generated-by prompt, or `None` to emit no prompt. A `generated-by`
        comment in the document takes precedence over this text.
    :param root_page_id: Confluence page ID; presumably the page under which documents are published
        (not used by the document converter itself).
    :param render_mermaid: Whether to pre-render Mermaid diagrams into PNG/SVG images.
    :param diagram_output_format: Target image format for diagrams.
    :param webui_links: When true, convert relative URLs to Confluence Web UI links.
    """

    ignore_invalid_url: bool = False
    heading_anchors: bool = False
    generated_by: Optional[str] = "This page has been generated with a tool."
    root_page_id: Optional[str] = None
    render_mermaid: bool = False
    diagram_output_format: Literal["png", "svg"] = "png"
    webui_links: bool = False
854
+
855
+
856
class ConfluenceDocument:
    """
    A Markdown document converted into Confluence storage format.

    On construction, the file is read, page metadata comments and front-matter are stripped, the text is
    converted to HTML, and the resulting element tree is transformed in place.
    """

    # page that the document is synchronized with
    id: ConfluenceQualifiedID
    # links and images collected by the converter
    links: List[str]
    images: List[str]
    # generated images collected by the converter (attachment name --> image data)
    embedded_images: Dict[str, bytes]

    options: ConfluenceDocumentOptions
    root: ET._Element

    def __init__(
        self,
        path: Path,
        options: ConfluenceDocumentOptions,
        page_metadata: Dict[Path, ConfluencePageMetadata],
    ) -> None:
        """
        Reads and converts a Markdown document.

        :param path: Path to the Markdown file, read as UTF-8.
        :param options: Options that control the generated page content.
        :param page_metadata: Page metadata by absolute path; consulted when the document has no page ID comment.
        :raises ValueError: Raised when no Confluence page ID can be found for the document.
        """

        self.options = options
        path = path.absolute()

        with open(path, "r", encoding="utf-8") as f:
            text = f.read()

        # extract Confluence page ID
        qualified_id, text = extract_qualified_id(text)
        if qualified_id is None:
            # look up Confluence page ID in metadata
            metadata = page_metadata.get(path)
            if metadata is not None:
                qualified_id = ConfluenceQualifiedID(
                    metadata.page_id, metadata.space_key
                )
        if qualified_id is None:
            raise ValueError("missing Confluence page ID")
        self.id = qualified_id

        # extract 'generated-by' tag text
        # (lazy match such that the comment ends at the first `-->`, not at the last one on the line)
        generated_by_tag, text = extract_value(
            r"<!--\s+generated-by:\s*(.*?)\s+-->", text
        )

        # strip front-matter, whose content is not used
        _, text = extract_value(r"(?ms)\A---$(.+?)^---$", text)

        # convert to HTML
        html = markdown_to_html(text)

        # parse Markdown document
        if self.options.generated_by is not None:
            generated_by = self.options.generated_by
            if generated_by_tag is not None:
                generated_by = generated_by_tag

            # NOTE(review): the prompt text is inserted as markup, without escaping
            content = [
                '<ac:structured-macro ac:name="info" ac:schema-version="1">',
                f"<ac:rich-text-body><p>{generated_by}</p></ac:rich-text-body>",
                "</ac:structured-macro>",
                html,
            ]
        else:
            content = [html]
        self.root = elements_from_strings(content)

        converter = ConfluenceStorageFormatConverter(
            ConfluenceConverterOptions(
                ignore_invalid_url=self.options.ignore_invalid_url,
                heading_anchors=self.options.heading_anchors,
                render_mermaid=self.options.render_mermaid,
                diagram_output_format=self.options.diagram_output_format,
                webui_links=self.options.webui_links,
            ),
            path,
            page_metadata,
        )
        converter.visit(self.root)
        self.links = converter.links
        self.images = converter.images
        self.embedded_images = converter.embedded_images

    def xhtml(self) -> str:
        "Returns the converted document as a Confluence storage format string."

        return elements_to_string(self.root)
934
+
935
+
936
def attachment_name(name: str) -> str:
    """
    Safe name for use with attachment uploads.

    Each character outside of the allowed set is replaced with an underscore.

    Allowed characters:
    * Alphanumeric characters: 0-9, a-z, A-Z
    * Special characters: hyphen (-), underscore (_), period (.)

    :param name: Proposed attachment name.
    :returns: Name of the same length, made up of allowed characters only.
    """

    allowed = frozenset(
        "-_."
        "0123456789"
        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
        "abcdefghijklmnopqrstuvwxyz"
    )
    return "".join(ch if ch in allowed else "_" for ch in name)
946
+
947
+
948
def sanitize_confluence(html: str) -> str:
    """
    Generates a sanitized version of a Confluence storage format XHTML document with no volatile attributes.

    :param html: Confluence storage format content.
    :returns: Content with volatile attributes removed; an empty string for empty input.
    """

    if html:
        tree = elements_from_strings([html])
        cleaner = ConfluenceStorageFormatCleaner()
        cleaner.visit(tree)
        return elements_to_string(tree)

    return ""
957
+
958
+
959
def elements_to_string(root: ET._Element) -> str:
    """
    Serializes the content of a document tree, leaving out the enclosing `<root>` element.

    :param root: The `<root>` element that wraps the Confluence content.
    :returns: Markup between the opening and closing tag of `<root>`.
    :raises ValueError: Raised when the serialized tree is not enclosed in a `<root>` element with attributes.
    """

    serialized = ET.tostring(root, encoding="utf8", method="xml").decode("utf8")
    wrapper = re.match(r"^<root\s+[^>]*>(.*)</root>\s*$", serialized, re.DOTALL)
    if wrapper is None:
        raise ValueError("expected: Confluence content")
    return wrapper.group(1)