markdown-to-confluence 0.1.13__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
md2conf/converter.py CHANGED
@@ -1,626 +1,868 @@
1
- # mypy: disable-error-code="dict-item"
2
-
3
- import importlib.resources as resources
4
- import logging
5
- import os.path
6
- import pathlib
7
- import re
8
- import sys
9
- from dataclasses import dataclass
10
- from typing import Dict, List, Optional, Tuple
11
- from urllib.parse import ParseResult, urlparse, urlunparse
12
-
13
- import lxml.etree as ET
14
- import markdown
15
- from lxml.builder import ElementMaker
16
-
17
- namespaces = {
18
- "ac": "http://atlassian.com/content",
19
- "ri": "http://atlassian.com/resource/identifier",
20
- }
21
- for key, value in namespaces.items():
22
- ET.register_namespace(key, value)
23
-
24
-
25
- HTML = ElementMaker()
26
- AC = ElementMaker(namespace=namespaces["ac"])
27
- RI = ElementMaker(namespace=namespaces["ri"])
28
-
29
- LOGGER = logging.getLogger(__name__)
30
-
31
-
32
- class ParseError(RuntimeError):
33
- pass
34
-
35
-
36
- def is_absolute_url(url: str) -> bool:
37
- urlparts = urlparse(url)
38
- return bool(urlparts.scheme) or bool(urlparts.netloc)
39
-
40
-
41
- def is_relative_url(url: str) -> bool:
42
- urlparts = urlparse(url)
43
- return not bool(urlparts.scheme) and not bool(urlparts.netloc)
44
-
45
-
46
- def markdown_to_html(content: str) -> str:
47
- return markdown.markdown(
48
- content,
49
- extensions=[
50
- "admonition",
51
- "markdown.extensions.tables",
52
- "markdown.extensions.fenced_code",
53
- "pymdownx.magiclink",
54
- "pymdownx.tilde",
55
- "sane_lists",
56
- ],
57
- )
58
-
59
-
60
- def _elements_from_strings(dtd_path: pathlib.Path, items: List[str]) -> ET._Element:
61
- """
62
- Creates a fragment of several XML nodes from their string representation wrapped in a root element.
63
-
64
- :param dtd_path: Path to a DTD document that defines entities like &cent; or &copy;.
65
- :param items: Strings to parse into XML fragments.
66
- :returns: An XML document as an element tree.
67
- """
68
-
69
- parser = ET.XMLParser(
70
- remove_blank_text=True,
71
- strip_cdata=False,
72
- load_dtd=True,
73
- )
74
-
75
- ns_attr_list = "".join(
76
- f' xmlns:{key}="{value}"' for key, value in namespaces.items()
77
- )
78
-
79
- data = [
80
- '<?xml version="1.0"?>',
81
- f'<!DOCTYPE ac:confluence PUBLIC "-//Atlassian//Confluence 4 Page//EN" "{dtd_path}">'
82
- f"<root{ns_attr_list}>",
83
- ]
84
- data.extend(items)
85
- data.append("</root>")
86
-
87
- try:
88
- return ET.fromstringlist(data, parser=parser)
89
- except ET.XMLSyntaxError as e:
90
- raise ParseError(e)
91
-
92
-
93
- def elements_from_strings(items: List[str]) -> ET._Element:
94
- "Creates a fragment of several XML nodes from their string representation wrapped in a root element."
95
-
96
- if sys.version_info >= (3, 9):
97
- resource_path = resources.files(__package__).joinpath("entities.dtd")
98
- with resources.as_file(resource_path) as dtd_path:
99
- return _elements_from_strings(dtd_path, items)
100
- else:
101
- with resources.path(__package__, "entities.dtd") as dtd_path:
102
- return _elements_from_strings(dtd_path, items)
103
-
104
-
105
- _languages = [
106
- "abap",
107
- "actionscript3",
108
- "ada",
109
- "applescript",
110
- "arduino",
111
- "autoit",
112
- "bash",
113
- "c",
114
- "clojure",
115
- "coffeescript",
116
- "coldfusion",
117
- "cpp",
118
- "csharp",
119
- "css",
120
- "cuda",
121
- "d",
122
- "dart",
123
- "delphi",
124
- "diff",
125
- "elixir",
126
- "erlang",
127
- "fortran",
128
- "foxpro",
129
- "go",
130
- "graphql",
131
- "groovy",
132
- "haskell",
133
- "haxe",
134
- "html",
135
- "java",
136
- "javafx",
137
- "javascript",
138
- "json",
139
- "jsx",
140
- "julia",
141
- "kotlin",
142
- "livescript",
143
- "lua",
144
- "mathematica",
145
- "matlab",
146
- "objectivec",
147
- "objectivej",
148
- "ocaml",
149
- "octave",
150
- "pascal",
151
- "perl",
152
- "php",
153
- "powershell",
154
- "prolog",
155
- "puppet",
156
- "python",
157
- "qml",
158
- "r",
159
- "racket",
160
- "rst",
161
- "ruby",
162
- "rust",
163
- "sass",
164
- "scala",
165
- "scheme",
166
- "shell",
167
- "smalltalk",
168
- "splunk",
169
- "sql",
170
- "standardml",
171
- "swift",
172
- "tcl",
173
- "tex",
174
- "tsx",
175
- "typescript",
176
- "vala",
177
- "vb",
178
- "verilog",
179
- "vhdl",
180
- "xml",
181
- "xquery",
182
- "yaml",
183
- ]
184
-
185
-
186
- @dataclass
187
- class ConfluencePageMetadata:
188
- domain: str
189
- base_path: str
190
- page_id: str
191
- space_key: str
192
- title: str
193
-
194
-
195
- class NodeVisitor:
196
- def visit(self, node: ET._Element) -> None:
197
- if len(node) < 1:
198
- return
199
-
200
- for index in range(len(node)):
201
- source = node[index]
202
- target = self.transform(source)
203
- if target is not None:
204
- node[index] = target
205
- else:
206
- self.visit(source)
207
-
208
- def transform(self, child: ET._Element) -> Optional[ET._Element]:
209
- pass
210
-
211
-
212
- @dataclass
213
- class ConfluenceConverterOptions:
214
- """
215
- Options for converting an HTML tree into Confluence storage format.
216
-
217
- :param ignore_invalid_url: When true, ignore invalid URLs in input, emit a warning and replace the anchor with
218
- plain text; when false, raise an exception.
219
- """
220
-
221
- ignore_invalid_url: bool = False
222
-
223
-
224
- class ConfluenceStorageFormatConverter(NodeVisitor):
225
- "Transforms a plain HTML tree into the Confluence storage format."
226
-
227
- options: ConfluenceConverterOptions
228
- path: pathlib.Path
229
- base_path: pathlib.Path
230
- links: List[str]
231
- images: List[str]
232
- page_metadata: Dict[pathlib.Path, ConfluencePageMetadata]
233
-
234
- def __init__(
235
- self,
236
- options: ConfluenceConverterOptions,
237
- path: pathlib.Path,
238
- page_metadata: Dict[pathlib.Path, ConfluencePageMetadata],
239
- ) -> None:
240
- super().__init__()
241
- self.options = options
242
- self.path = path
243
- self.base_path = path.parent
244
- self.links = []
245
- self.images = []
246
- self.page_metadata = page_metadata
247
-
248
- def _transform_link(self, anchor: ET._Element) -> None:
249
- url = anchor.attrib["href"]
250
- if is_absolute_url(url):
251
- return
252
-
253
- LOGGER.debug(f"found link {url} relative to {self.path}")
254
- relative_url: ParseResult = urlparse(url)
255
-
256
- if (
257
- not relative_url.scheme
258
- and not relative_url.netloc
259
- and not relative_url.path
260
- and not relative_url.params
261
- and not relative_url.query
262
- ):
263
- LOGGER.debug(f"found local URL: {url}")
264
- anchor.attrib["href"] = url
265
- return
266
-
267
- # convert the relative URL to absolute URL based on the base path value, then look up
268
- # the absolute path in the page metadata dictionary to discover the relative path
269
- # within Confluence that should be used
270
- absolute_path = (self.base_path / relative_url.path).absolute()
271
- if not str(absolute_path).startswith(str(self.base_path)):
272
- msg = f"relative URL {url} points to outside base path: {self.base_path}"
273
- if self.options.ignore_invalid_url:
274
- LOGGER.warning(msg)
275
- anchor.attrib.pop("href")
276
- return
277
- else:
278
- raise DocumentError(msg)
279
-
280
- relative_path = os.path.relpath(absolute_path, self.base_path)
281
-
282
- link_metadata = self.page_metadata.get(absolute_path)
283
- if link_metadata is None:
284
- msg = f"unable to find matching page for URL: {url}"
285
- if self.options.ignore_invalid_url:
286
- LOGGER.warning(msg)
287
- anchor.attrib.pop("href")
288
- return
289
- else:
290
- raise DocumentError(msg)
291
-
292
- LOGGER.debug(
293
- f"found link to page {relative_path} with metadata: {link_metadata}"
294
- )
295
- self.links.append(url)
296
-
297
- components = ParseResult(
298
- scheme="https",
299
- netloc=link_metadata.domain,
300
- path=f"{link_metadata.base_path}spaces/{link_metadata.space_key}/pages/{link_metadata.page_id}/{link_metadata.title}",
301
- params="",
302
- query="",
303
- fragment=relative_url.fragment,
304
- )
305
- transformed_url = urlunparse(components)
306
-
307
- LOGGER.debug(f"transformed relative URL: {url} to URL: {transformed_url}")
308
- anchor.attrib["href"] = transformed_url
309
-
310
- def _transform_image(self, image: ET._Element) -> ET._Element:
311
- path: str = image.attrib["src"]
312
-
313
- # prefer PNG over SVG; Confluence displays SVG in wrong size, and text labels are truncated
314
- if path and is_relative_url(path):
315
- relative_path = pathlib.Path(path)
316
- if (
317
- relative_path.suffix == ".svg"
318
- and (self.base_path / relative_path.with_suffix(".png")).exists()
319
- ):
320
- path = str(relative_path.with_suffix(".png"))
321
-
322
- self.images.append(path)
323
- caption = image.attrib["alt"]
324
- return AC(
325
- "image",
326
- {
327
- ET.QName(namespaces["ac"], "align"): "center",
328
- ET.QName(namespaces["ac"], "layout"): "center",
329
- },
330
- RI(
331
- "attachment",
332
- {ET.QName(namespaces["ri"], "filename"): attachment_name(path)},
333
- ),
334
- AC("caption", HTML.p(caption)),
335
- )
336
-
337
- def _transform_block(self, code: ET._Element) -> ET._Element:
338
- language = code.attrib.get("class")
339
- if language:
340
- m = re.match("^language-(.*)$", language)
341
- if m:
342
- language = m.group(1)
343
- else:
344
- language = "none"
345
- if language not in _languages:
346
- language = "none"
347
- content: str = code.text or ""
348
- content = content.rstrip()
349
- return AC(
350
- "structured-macro",
351
- {
352
- ET.QName(namespaces["ac"], "name"): "code",
353
- ET.QName(namespaces["ac"], "schema-version"): "1",
354
- },
355
- AC("parameter", {ET.QName(namespaces["ac"], "name"): "theme"}, "Midnight"),
356
- AC("parameter", {ET.QName(namespaces["ac"], "name"): "language"}, language),
357
- AC(
358
- "parameter", {ET.QName(namespaces["ac"], "name"): "linenumbers"}, "true"
359
- ),
360
- AC("plain-text-body", ET.CDATA(content)),
361
- )
362
-
363
- def _transform_toc(self, code: ET._Element) -> ET._Element:
364
- return AC(
365
- "structured-macro",
366
- {
367
- ET.QName(namespaces["ac"], "name"): "toc",
368
- ET.QName(namespaces["ac"], "schema-version"): "1",
369
- },
370
- AC("parameter", {ET.QName(namespaces["ac"], "name"): "outline"}, "clear"),
371
- AC("parameter", {ET.QName(namespaces["ac"], "name"): "style"}, "default"),
372
- )
373
-
374
- def _transform_admonition(self, elem: ET._Element) -> ET._Element:
375
- """
376
- Creates an info, tip, note or warning panel.
377
-
378
- Transforms [Python-Markdown admonition](https://python-markdown.github.io/extensions/admonition/) syntax
379
- into Confluence structured macro syntax.
380
- """
381
-
382
- # <div class="admonition note">
383
- class_list = elem.attrib.get("class", "").split(" ")
384
- class_name: Optional[str] = None
385
- if "info" in class_list:
386
- class_name = "info"
387
- elif "tip" in class_list:
388
- class_name = "tip"
389
- elif "note" in class_list:
390
- class_name = "note"
391
- elif "warning" in class_list:
392
- class_name = "warning"
393
-
394
- if class_name is None:
395
- raise DocumentError(f"unsupported admonition label: {class_list}")
396
-
397
- for e in elem:
398
- self.visit(e)
399
-
400
- # <p class="admonition-title">Note</p>
401
- if "admonition-title" in elem[0].attrib.get("class", "").split(" "):
402
- content = [
403
- AC(
404
- "parameter",
405
- {ET.QName(namespaces["ac"], "name"): "title"},
406
- elem[0].text or "",
407
- ),
408
- AC("rich-text-body", {}, *list(elem[1:])),
409
- ]
410
- else:
411
- content = [AC("rich-text-body", {}, *list(elem))]
412
-
413
- return AC(
414
- "structured-macro",
415
- {
416
- ET.QName(namespaces["ac"], "name"): class_name,
417
- ET.QName(namespaces["ac"], "schema-version"): "1",
418
- },
419
- *content,
420
- )
421
-
422
- def transform(self, child: ET._Element) -> Optional[ET._Element]:
423
- # normalize line breaks to regular space in element text
424
- if child.text:
425
- text: str = child.text
426
- child.text = text.replace("\n", " ")
427
- if child.tail:
428
- tail: str = child.tail
429
- child.tail = tail.replace("\n", " ")
430
-
431
- # <p><img src="..." /></p>
432
- if child.tag == "p" and len(child) == 1 and child[0].tag == "img":
433
- return self._transform_image(child[0])
434
-
435
- # <p>[[_TOC_]]</p>
436
- # <p>[TOC]</p>
437
- elif child.tag == "p" and "".join(child.itertext()) in ["[[TOC]]", "[TOC]"]:
438
- return self._transform_toc(child)
439
-
440
- # <div class="admonition note">
441
- # <p class="admonition-title">Note</p>
442
- # <p>...</p>
443
- # </div>
444
- #
445
- # --- OR ---
446
- #
447
- # <div class="admonition note">
448
- # <p>...</p>
449
- # </div>
450
- elif child.tag == "div" and "admonition" in child.attrib.get("class", ""):
451
- return self._transform_admonition(child)
452
-
453
- # <img src="..." alt="..." />
454
- elif child.tag == "img":
455
- return self._transform_image(child)
456
-
457
- # <a href="..."> ... </a>
458
- elif child.tag == "a":
459
- self._transform_link(child)
460
- return None
461
-
462
- # <pre><code class="language-java"> ... </code></pre>
463
- elif child.tag == "pre" and len(child) == 1 and child[0].tag == "code":
464
- return self._transform_block(child[0])
465
-
466
- return None
467
-
468
-
469
- class ConfluenceStorageFormatCleaner(NodeVisitor):
470
- "Removes volatile attributes from a Confluence storage format XHTML document."
471
-
472
- def transform(self, child: ET._Element) -> Optional[ET._Element]:
473
- child.attrib.pop(ET.QName(namespaces["ac"], "macro-id"), None)
474
- child.attrib.pop(ET.QName(namespaces["ri"], "version-at-save"), None)
475
- return None
476
-
477
-
478
- class DocumentError(RuntimeError):
479
- pass
480
-
481
-
482
- def extract_value(pattern: str, string: str) -> Tuple[Optional[str], str]:
483
- values: List[str] = []
484
-
485
- def _repl_func(matchobj: re.Match) -> str:
486
- values.append(matchobj.group(1))
487
- return ""
488
-
489
- string = re.sub(pattern, _repl_func, string, 1, re.ASCII)
490
- value = values[0] if values else None
491
- return value, string
492
-
493
-
494
- @dataclass
495
- class ConfluenceQualifiedID:
496
- page_id: str
497
- space_key: Optional[str] = None
498
-
499
-
500
- def extract_qualified_id(string: str) -> Tuple[Optional[ConfluenceQualifiedID], str]:
501
- page_id, string = extract_value(r"<!--\s+confluence-page-id:\s*(\d+)\s+-->", string)
502
-
503
- if page_id is None:
504
- return None, string
505
-
506
- # extract Confluence space key
507
- space_key, string = extract_value(
508
- r"<!--\s+confluence-space-key:\s*(\S+)\s+-->", string
509
- )
510
-
511
- return ConfluenceQualifiedID(page_id, space_key), string
512
-
513
-
514
- @dataclass
515
- class ConfluenceDocumentOptions:
516
- """
517
- Options that control the generated page content.
518
-
519
- :param ignore_invalid_url: When true, ignore invalid URLs in input, emit a warning and replace the anchor with
520
- plain text; when false, raise an exception.
521
- :param show_generated: Whether to display a prompt "This page has been generated with a tool."
522
- """
523
-
524
- ignore_invalid_url: bool = False
525
- generated_by: Optional[str] = "This page has been generated with a tool."
526
- root_page_id: Optional[str] = None
527
-
528
-
529
- class ConfluenceDocument:
530
- id: ConfluenceQualifiedID
531
- links: List[str]
532
- images: List[str]
533
-
534
- options: ConfluenceDocumentOptions
535
- root: ET._Element
536
-
537
- def __init__(
538
- self,
539
- path: pathlib.Path,
540
- options: ConfluenceDocumentOptions,
541
- page_metadata: Dict[pathlib.Path, ConfluencePageMetadata],
542
- ) -> None:
543
- self.options = options
544
- path = path.absolute()
545
-
546
- with open(path, "r", encoding="utf-8") as f:
547
- text = f.read()
548
-
549
- # extract Confluence page ID
550
- qualified_id, text = extract_qualified_id(text)
551
- if qualified_id is None:
552
- raise ValueError("missing Confluence page ID")
553
- self.id = qualified_id
554
-
555
- # extract 'generated-by' tag text
556
- generated_by_tag, text = extract_value(
557
- r"<!--\s+generated-by:\s*(.*)\s+-->", text
558
- )
559
-
560
- # extract frontmatter
561
- frontmatter, text = extract_value(r"(?ms)\A---$(.+?)^---$", text)
562
-
563
- # convert to HTML
564
- html = markdown_to_html(text)
565
-
566
- # parse Markdown document
567
- if self.options.generated_by is not None:
568
- generated_by = self.options.generated_by
569
- if generated_by_tag is not None:
570
- generated_by = generated_by_tag
571
-
572
- content = [
573
- '<ac:structured-macro ac:name="info" ac:schema-version="1">',
574
- f"<ac:rich-text-body><p>{generated_by}</p></ac:rich-text-body>",
575
- "</ac:structured-macro>",
576
- html,
577
- ]
578
- else:
579
- content = [html]
580
- self.root = elements_from_strings(content)
581
-
582
- converter = ConfluenceStorageFormatConverter(
583
- ConfluenceConverterOptions(
584
- ignore_invalid_url=self.options.ignore_invalid_url
585
- ),
586
- path,
587
- page_metadata,
588
- )
589
- converter.visit(self.root)
590
- self.links = converter.links
591
- self.images = converter.images
592
-
593
- def xhtml(self) -> str:
594
- return _content_to_string(self.root)
595
-
596
-
597
- def attachment_name(name: str) -> str:
598
- """
599
- Safe name for use with attachment uploads.
600
-
601
- Allowed characters:
602
- * Alphanumeric characters: 0-9, a-z, A-Z
603
- * Special characters: hyphen (-), underscore (_), period (.)
604
- """
605
-
606
- return re.sub(r"[^\-0-9A-Za-z_.]", "_", name)
607
-
608
-
609
- def sanitize_confluence(html: str) -> str:
610
- "Generates a sanitized version of a Confluence storage format XHTML document with no volatile attributes."
611
-
612
- if not html:
613
- return ""
614
-
615
- root = elements_from_strings([html])
616
- ConfluenceStorageFormatCleaner().visit(root)
617
- return _content_to_string(root)
618
-
619
-
620
- def _content_to_string(root: ET._Element) -> str:
621
- xml = ET.tostring(root, encoding="utf8", method="xml").decode("utf8")
622
- m = re.match(r"^<root\s+[^>]*>(.*)</root>\s*$", xml, re.DOTALL)
623
- if m:
624
- return m.group(1)
625
- else:
626
- raise ValueError("expected: Confluence content")
1
+ # mypy: disable-error-code="dict-item"
2
+
3
+ import hashlib
4
+ import importlib.resources as resources
5
+ import logging
6
+ import os.path
7
+ import pathlib
8
+ import re
9
+ import sys
10
+ import uuid
11
+ from dataclasses import dataclass
12
+ from typing import Dict, List, Literal, Optional, Tuple
13
+ from urllib.parse import ParseResult, urlparse, urlunparse
14
+
15
+ import lxml.etree as ET
16
+ import markdown
17
+ from lxml.builder import ElementMaker
18
+
19
+ from . import mermaid
20
+
21
+ namespaces = {
22
+ "ac": "http://atlassian.com/content",
23
+ "ri": "http://atlassian.com/resource/identifier",
24
+ }
25
+ for key, value in namespaces.items():
26
+ ET.register_namespace(key, value)
27
+
28
+ HTML = ElementMaker()
29
+ AC = ElementMaker(namespace=namespaces["ac"])
30
+ RI = ElementMaker(namespace=namespaces["ri"])
31
+
32
+ LOGGER = logging.getLogger(__name__)
33
+
34
+
35
+ class ParseError(RuntimeError):
36
+ pass
37
+
38
+
39
+ def is_absolute_url(url: str) -> bool:
40
+ urlparts = urlparse(url)
41
+ return bool(urlparts.scheme) or bool(urlparts.netloc)
42
+
43
+
44
+ def is_relative_url(url: str) -> bool:
45
+ urlparts = urlparse(url)
46
+ return not bool(urlparts.scheme) and not bool(urlparts.netloc)
47
+
48
+
49
+ def markdown_to_html(content: str) -> str:
50
+ return markdown.markdown(
51
+ content,
52
+ extensions=[
53
+ "admonition",
54
+ "markdown.extensions.tables",
55
+ "markdown.extensions.fenced_code",
56
+ "pymdownx.magiclink",
57
+ "pymdownx.tilde",
58
+ "sane_lists",
59
+ "md_in_html",
60
+ ],
61
+ )
62
+
63
+
64
+ def _elements_from_strings(dtd_path: pathlib.Path, items: List[str]) -> ET._Element:
65
+ """
66
+ Creates a fragment of several XML nodes from their string representation wrapped in a root element.
67
+
68
+ :param dtd_path: Path to a DTD document that defines entities like &cent; or &copy;.
69
+ :param items: Strings to parse into XML fragments.
70
+ :returns: An XML document as an element tree.
71
+ """
72
+
73
+ parser = ET.XMLParser(
74
+ remove_blank_text=True,
75
+ strip_cdata=False,
76
+ load_dtd=True,
77
+ )
78
+
79
+ ns_attr_list = "".join(
80
+ f' xmlns:{key}="{value}"' for key, value in namespaces.items()
81
+ )
82
+
83
+ data = [
84
+ '<?xml version="1.0"?>',
85
+ f'<!DOCTYPE ac:confluence PUBLIC "-//Atlassian//Confluence 4 Page//EN" "{dtd_path}">'
86
+ f"<root{ns_attr_list}>",
87
+ ]
88
+ data.extend(items)
89
+ data.append("</root>")
90
+
91
+ try:
92
+ return ET.fromstringlist(data, parser=parser)
93
+ except ET.XMLSyntaxError as e:
94
+ raise ParseError(e)
95
+
96
+
97
+ def elements_from_strings(items: List[str]) -> ET._Element:
98
+ "Creates a fragment of several XML nodes from their string representation wrapped in a root element."
99
+
100
+ if sys.version_info >= (3, 9):
101
+ resource_path = resources.files(__package__).joinpath("entities.dtd")
102
+ with resources.as_file(resource_path) as dtd_path:
103
+ return _elements_from_strings(dtd_path, items)
104
+ else:
105
+ with resources.path(__package__, "entities.dtd") as dtd_path:
106
+ return _elements_from_strings(dtd_path, items)
107
+
108
+
109
+ def elements_from_string(content: str) -> ET._Element:
110
+ return elements_from_strings([content])
111
+
112
+
113
+ _languages = [
114
+ "abap",
115
+ "actionscript3",
116
+ "ada",
117
+ "applescript",
118
+ "arduino",
119
+ "autoit",
120
+ "bash",
121
+ "c",
122
+ "clojure",
123
+ "coffeescript",
124
+ "coldfusion",
125
+ "cpp",
126
+ "csharp",
127
+ "css",
128
+ "cuda",
129
+ "d",
130
+ "dart",
131
+ "delphi",
132
+ "diff",
133
+ "elixir",
134
+ "erlang",
135
+ "fortran",
136
+ "foxpro",
137
+ "go",
138
+ "graphql",
139
+ "groovy",
140
+ "haskell",
141
+ "haxe",
142
+ "html",
143
+ "java",
144
+ "javafx",
145
+ "javascript",
146
+ "json",
147
+ "jsx",
148
+ "julia",
149
+ "kotlin",
150
+ "livescript",
151
+ "lua",
152
+ "mermaid",
153
+ "mathematica",
154
+ "matlab",
155
+ "objectivec",
156
+ "objectivej",
157
+ "ocaml",
158
+ "octave",
159
+ "pascal",
160
+ "perl",
161
+ "php",
162
+ "powershell",
163
+ "prolog",
164
+ "puppet",
165
+ "python",
166
+ "qml",
167
+ "r",
168
+ "racket",
169
+ "rst",
170
+ "ruby",
171
+ "rust",
172
+ "sass",
173
+ "scala",
174
+ "scheme",
175
+ "shell",
176
+ "smalltalk",
177
+ "splunk",
178
+ "sql",
179
+ "standardml",
180
+ "swift",
181
+ "tcl",
182
+ "tex",
183
+ "tsx",
184
+ "typescript",
185
+ "vala",
186
+ "vb",
187
+ "verilog",
188
+ "vhdl",
189
+ "xml",
190
+ "xquery",
191
+ "yaml",
192
+ ]
193
+
194
+
195
+ @dataclass
196
+ class ConfluencePageMetadata:
197
+ domain: str
198
+ base_path: str
199
+ page_id: str
200
+ space_key: str
201
+ title: str
202
+
203
+
204
+ class NodeVisitor:
205
+ def visit(self, node: ET._Element) -> None:
206
+ "Recursively visits all descendants of this node."
207
+
208
+ if len(node) < 1:
209
+ return
210
+
211
+ for index in range(len(node)):
212
+ source = node[index]
213
+ target = self.transform(source)
214
+ if target is not None:
215
+ node[index] = target
216
+ else:
217
+ self.visit(source)
218
+
219
+ def transform(self, child: ET._Element) -> Optional[ET._Element]:
220
+ pass
221
+
222
+
223
+ def title_to_identifier(title: str) -> str:
224
+ "Converts a section heading title to a GitHub-style Markdown same-page anchor."
225
+
226
+ s = title.strip().lower()
227
+ s = re.sub("[^ A-Za-z0-9]", "", s)
228
+ s = s.replace(" ", "-")
229
+ return s
230
+
231
+
232
+ @dataclass
233
+ class ConfluenceConverterOptions:
234
+ """
235
+ Options for converting an HTML tree into Confluence storage format.
236
+
237
+ :param ignore_invalid_url: When true, ignore invalid URLs in input, emit a warning and replace the anchor with
238
+ plain text; when false, raise an exception.
239
+ :param heading_anchors: When true, emit a structured macro *anchor* for each section heading using GitHub
240
+ conversion rules for the identifier.
241
+ :param render_mermaid: Whether to pre-render Mermaid diagrams into PNG/SVG images.
242
+ :param diagram_output_format: Target image format for diagrams.
243
+ """
244
+
245
+ ignore_invalid_url: bool = False
246
+ heading_anchors: bool = False
247
+ render_mermaid: bool = False
248
+ diagram_output_format: Literal["png", "svg"] = "png"
249
+
250
+
251
+ class ConfluenceStorageFormatConverter(NodeVisitor):
252
+ "Transforms a plain HTML tree into the Confluence storage format."
253
+
254
+ options: ConfluenceConverterOptions
255
+ path: pathlib.Path
256
+ base_path: pathlib.Path
257
+ links: List[str]
258
+ images: List[str]
259
+ embedded_images: Dict[str, bytes]
260
+ page_metadata: Dict[pathlib.Path, ConfluencePageMetadata]
261
+
262
+ def __init__(
263
+ self,
264
+ options: ConfluenceConverterOptions,
265
+ path: pathlib.Path,
266
+ page_metadata: Dict[pathlib.Path, ConfluencePageMetadata],
267
+ ) -> None:
268
+ super().__init__()
269
+ self.options = options
270
+ self.path = path
271
+ self.base_path = path.parent
272
+ self.links = []
273
+ self.images = []
274
+ self.embedded_images = {}
275
+ self.page_metadata = page_metadata
276
+
277
+ def _transform_heading(self, heading: ET._Element) -> None:
278
+ title = "".join(heading.itertext()).strip()
279
+
280
+ for e in heading:
281
+ self.visit(e)
282
+
283
+ anchor = AC(
284
+ "structured-macro",
285
+ {
286
+ ET.QName(namespaces["ac"], "name"): "anchor",
287
+ ET.QName(namespaces["ac"], "schema-version"): "1",
288
+ },
289
+ AC(
290
+ "parameter",
291
+ {ET.QName(namespaces["ac"], "name"): ""},
292
+ title_to_identifier(title),
293
+ ),
294
+ )
295
+
296
+ # insert anchor as first child, pushing any text nodes
297
+ heading.insert(0, anchor)
298
+ anchor.tail = heading.text
299
+ heading.text = None
300
+
301
+ def _transform_link(self, anchor: ET._Element) -> None:
302
+ url = anchor.attrib["href"]
303
+ if is_absolute_url(url):
304
+ return
305
+
306
+ LOGGER.debug(f"found link {url} relative to {self.path}")
307
+ relative_url: ParseResult = urlparse(url)
308
+
309
+ if (
310
+ not relative_url.scheme
311
+ and not relative_url.netloc
312
+ and not relative_url.path
313
+ and not relative_url.params
314
+ and not relative_url.query
315
+ ):
316
+ LOGGER.debug(f"found local URL: {url}")
317
+ anchor.attrib["href"] = url
318
+ return
319
+
320
+ # convert the relative URL to absolute URL based on the base path value, then look up
321
+ # the absolute path in the page metadata dictionary to discover the relative path
322
+ # within Confluence that should be used
323
+ absolute_path = (self.base_path / relative_url.path).absolute()
324
+ if not str(absolute_path).startswith(str(self.base_path)):
325
+ msg = f"relative URL {url} points to outside base path: {self.base_path}"
326
+ if self.options.ignore_invalid_url:
327
+ LOGGER.warning(msg)
328
+ anchor.attrib.pop("href")
329
+ return
330
+ else:
331
+ raise DocumentError(msg)
332
+
333
+ relative_path = os.path.relpath(absolute_path, self.base_path)
334
+
335
+ link_metadata = self.page_metadata.get(absolute_path)
336
+ if link_metadata is None:
337
+ msg = f"unable to find matching page for URL: {url}"
338
+ if self.options.ignore_invalid_url:
339
+ LOGGER.warning(msg)
340
+ anchor.attrib.pop("href")
341
+ return
342
+ else:
343
+ raise DocumentError(msg)
344
+
345
+ LOGGER.debug(
346
+ f"found link to page {relative_path} with metadata: {link_metadata}"
347
+ )
348
+ self.links.append(url)
349
+
350
+ components = ParseResult(
351
+ scheme="https",
352
+ netloc=link_metadata.domain,
353
+ path=f"{link_metadata.base_path}spaces/{link_metadata.space_key}/pages/{link_metadata.page_id}/{link_metadata.title}",
354
+ params="",
355
+ query="",
356
+ fragment=relative_url.fragment,
357
+ )
358
+ transformed_url = urlunparse(components)
359
+
360
+ LOGGER.debug(f"transformed relative URL: {url} to URL: {transformed_url}")
361
+ anchor.attrib["href"] = transformed_url
362
+
363
+ def _transform_image(self, image: ET._Element) -> ET._Element:
364
+ path: str = image.attrib["src"]
365
+
366
+ # prefer PNG over SVG; Confluence displays SVG in wrong size, and text labels are truncated
367
+ if path and is_relative_url(path):
368
+ relative_path = pathlib.Path(path)
369
+ if (
370
+ relative_path.suffix == ".svg"
371
+ and (self.base_path / relative_path.with_suffix(".png")).exists()
372
+ ):
373
+ path = str(relative_path.with_suffix(".png"))
374
+
375
+ self.images.append(path)
376
+ caption = image.attrib["alt"]
377
+ return AC(
378
+ "image",
379
+ {
380
+ ET.QName(namespaces["ac"], "align"): "center",
381
+ ET.QName(namespaces["ac"], "layout"): "center",
382
+ },
383
+ RI(
384
+ "attachment",
385
+ {ET.QName(namespaces["ri"], "filename"): attachment_name(path)},
386
+ ),
387
+ AC("caption", HTML.p(caption)),
388
+ )
389
+
390
+ def _transform_block(self, code: ET._Element) -> ET._Element:
391
+ language = code.attrib.get("class")
392
+ if language:
393
+ m = re.match("^language-(.*)$", language)
394
+ if m:
395
+ language = m.group(1)
396
+ else:
397
+ language = "none"
398
+ if language not in _languages:
399
+ language = "none"
400
+ content: str = code.text or ""
401
+ content = content.rstrip()
402
+
403
+ if language == "mermaid":
404
+ return self._transform_mermaid(content)
405
+
406
+ return AC(
407
+ "structured-macro",
408
+ {
409
+ ET.QName(namespaces["ac"], "name"): "code",
410
+ ET.QName(namespaces["ac"], "schema-version"): "1",
411
+ },
412
+ AC(
413
+ "parameter",
414
+ {ET.QName(namespaces["ac"], "name"): "theme"},
415
+ "Midnight",
416
+ ),
417
+ AC(
418
+ "parameter",
419
+ {ET.QName(namespaces["ac"], "name"): "language"},
420
+ language,
421
+ ),
422
+ AC(
423
+ "parameter",
424
+ {ET.QName(namespaces["ac"], "name"): "linenumbers"},
425
+ "true",
426
+ ),
427
+ AC("plain-text-body", ET.CDATA(content)),
428
+ )
429
+
430
+ def _transform_mermaid(self, content: str) -> ET._Element:
431
+ "Transforms a Mermaid diagram code block."
432
+
433
+ if self.options.render_mermaid:
434
+ image_data = mermaid.render(content, self.options.diagram_output_format)
435
+ image_hash = hashlib.md5(image_data).hexdigest()
436
+ image_filename = attachment_name(
437
+ f"embedded_{image_hash}.{self.options.diagram_output_format}"
438
+ )
439
+ self.embedded_images[image_filename] = image_data
440
+ return AC(
441
+ "image",
442
+ {
443
+ ET.QName(namespaces["ac"], "align"): "center",
444
+ ET.QName(namespaces["ac"], "layout"): "center",
445
+ },
446
+ RI(
447
+ "attachment",
448
+ {ET.QName(namespaces["ri"], "filename"): image_filename},
449
+ ),
450
+ )
451
+ else:
452
+ local_id = str(uuid.uuid4())
453
+ macro_id = str(uuid.uuid4())
454
+ return AC(
455
+ "structured-macro",
456
+ {
457
+ ET.QName(namespaces["ac"], "name"): "macro-diagram",
458
+ ET.QName(namespaces["ac"], "schema-version"): "1",
459
+ ET.QName(namespaces["ac"], "data-layout"): "default",
460
+ ET.QName(namespaces["ac"], "local-id"): local_id,
461
+ ET.QName(namespaces["ac"], "macro-id"): macro_id,
462
+ },
463
+ AC(
464
+ "parameter",
465
+ {ET.QName(namespaces["ac"], "name"): "sourceType"},
466
+ "MacroBody",
467
+ ),
468
+ AC(
469
+ "parameter",
470
+ {ET.QName(namespaces["ac"], "name"): "attachmentPageId"},
471
+ ),
472
+ AC(
473
+ "parameter",
474
+ {ET.QName(namespaces["ac"], "name"): "syntax"},
475
+ "Mermaid",
476
+ ),
477
+ AC(
478
+ "parameter",
479
+ {ET.QName(namespaces["ac"], "name"): "attachmentId"},
480
+ ),
481
+ AC("parameter", {ET.QName(namespaces["ac"], "name"): "url"}),
482
+ AC("plain-text-body", ET.CDATA(content)),
483
+ )
484
+
485
def _transform_toc(self, code: ET._Element) -> ET._Element:
    "Creates a table of contents macro; the element passed as `code` is not inspected."

    ac_ns = namespaces["ac"]
    macro_attributes = {
        ET.QName(ac_ns, "name"): "toc",
        ET.QName(ac_ns, "schema-version"): "1",
    }
    outline = AC("parameter", {ET.QName(ac_ns, "name"): "outline"}, "clear")
    style = AC("parameter", {ET.QName(ac_ns, "name"): "style"}, "default")
    return AC("structured-macro", macro_attributes, outline, style)
495
+
496
def _transform_admonition(self, elem: ET._Element) -> ET._Element:
    """
    Creates an info, tip, note or warning panel from a Markdown admonition.

    Transforms [Python-Markdown admonition](https://python-markdown.github.io/extensions/admonition/)
    syntax into one of the Confluence structured macros *info*, *tip*, *note*, or *warning*.

    :param elem: The `<div class="admonition ...">` element; its children become the macro body.
    :returns: An `ac:structured-macro` element named after the admonition type.
    :raises DocumentError: The class list names none of the supported admonition types.
    """

    # <div class="admonition note">
    class_list = elem.attrib.get("class", "").split(" ")

    # the first supported label wins, in this fixed order
    class_name: Optional[str] = next(
        (label for label in ("info", "tip", "note", "warning") if label in class_list),
        None,
    )
    if class_name is None:
        raise DocumentError(f"unsupported admonition label: {class_list}")

    for e in elem:
        self.visit(e)

    # <p class="admonition-title">Note</p>
    # an admonition without any children has no title to inspect; emit an empty body
    # instead of failing with `IndexError` on `elem[0]`
    has_title = len(elem) > 0 and "admonition-title" in elem[0].attrib.get(
        "class", ""
    ).split(" ")

    if has_title:
        content = [
            AC(
                "parameter",
                {ET.QName(namespaces["ac"], "name"): "title"},
                elem[0].text or "",
            ),
            AC("rich-text-body", {}, *list(elem[1:])),
        ]
    else:
        content = [AC("rich-text-body", {}, *list(elem))]

    return AC(
        "structured-macro",
        {
            ET.QName(namespaces["ac"], "name"): class_name,
            ET.QName(namespaces["ac"], "schema-version"): "1",
        },
        *content,
    )
543
+
544
def _transform_alert(self, elem: ET._Element) -> ET._Element:
    """
    Creates a note, tip or warning panel from a GitHub alert.

    Transforms
    [GitHub alert](https://docs.github.com/en/get-started/writing-on-github/getting-started-with-writing-and-formatting-on-github/basic-writing-and-formatting-syntax#alerts)
    syntax into one of the Confluence structured macros *note*, *tip*, or *warning*.

    :param elem: The `<blockquote>` element whose first child starts with an alert marker such as `[!TIP]`.
    :returns: An `ac:structured-macro` element wrapping the children of the block quote.
    :raises DocumentError: The first child has no text, does not start with a marker, or the marker is unknown.
    """

    marker = re.compile(r"^\[!([A-Z]+)\]\s*")

    first = elem[0]
    if first.text is None:
        raise DocumentError("empty content")

    found = marker.match(first.text)
    if found is None:
        raise DocumentError("not an alert")
    alert = found.group(1)

    # GitHub has five alert kinds; they are folded onto three Confluence macros
    macro_names = {
        "NOTE": "note",
        "TIP": "tip",
        "IMPORTANT": "tip",
        "WARNING": "warning",
        "CAUTION": "warning",
    }
    if alert not in macro_names:
        raise DocumentError(f"unsupported alert: {alert}")
    class_name = macro_names[alert]

    for e in elem:
        self.visit(e)

    # strip the marker only after the children have been visited
    first.text = marker.sub("", first.text, count=1)

    macro_attributes = {
        ET.QName(namespaces["ac"], "name"): class_name,
        ET.QName(namespaces["ac"], "schema-version"): "1",
    }
    return AC(
        "structured-macro",
        macro_attributes,
        AC("rich-text-body", {}, *list(elem)),
    )
589
+
590
def _transform_section(self, elem: ET._Element) -> ET._Element:
    """
    Creates a collapsed section.

    Transforms
    [GitHub collapsed section](https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/organizing-information-with-collapsed-sections)
    syntax into the Confluence structured macro *expand*.

    :param elem: The `<details>` element; its `<summary>` child is removed and used as the title.
    :returns: An `ac:structured-macro` element named `expand`.
    :raises DocumentError: The first child is not `<summary>`, or `<summary>` is followed by text.
    """

    header = elem[0]
    if header.tag != "summary":
        raise DocumentError(
            "expected: `<summary>` as first direct child of `<details>`"
        )
    # text directly after `<summary>` is reported as a missing `markdown="1"` attribute
    if header.tail is not None:
        raise DocumentError('expected: attribute `markdown="1"` on `<details>`')

    # the title is the text content of `<summary>`, with any nested markup flattened
    title = "".join(header.itertext()).strip()
    elem.remove(header)

    self.visit(elem)

    macro_attributes = {
        ET.QName(namespaces["ac"], "name"): "expand",
        ET.QName(namespaces["ac"], "schema-version"): "1",
    }
    title_parameter = AC(
        "parameter",
        {ET.QName(namespaces["ac"], "name"): "title"},
        title,
    )
    body = AC("rich-text-body", {}, *list(elem))
    return AC("structured-macro", macro_attributes, title_parameter, body)
624
+
625
def transform(self, child: ET._Element) -> Optional[ET._Element]:
    """
    Dispatches an element to the matching Confluence transformation.

    Line breaks in the text and tail of the element are first replaced by spaces. The element
    is then matched against the supported constructs in a fixed order; the first match wins.

    :param child: The element to inspect; headings and links are updated in place.
    :returns: A replacement element, or `None` when the element is kept (possibly modified in place).
    """

    # normalize line breaks to regular space in element text
    if child.text:
        text: str = child.text
        child.text = text.replace("\n", " ")
    if child.tail:
        tail: str = child.tail
        child.tail = tail.replace("\n", " ")

    # comments and processing instructions in lxml have a non-string tag, which none of the
    # comparisons below can match, and which `re.match` would reject with `TypeError`
    if not isinstance(child.tag, str):
        return None

    if self.options.heading_anchors:
        # <h1>...</h1>
        # <h2>...</h2> ...
        if re.match(r"^h[1-6]$", child.tag, flags=re.IGNORECASE) is not None:
            self._transform_heading(child)
            return None

    # <p><img src="..." /></p>
    if child.tag == "p" and len(child) == 1 and child[0].tag == "img":
        return self._transform_image(child[0])

    # <p>[[_TOC_]]</p>
    # <p>[TOC]</p>
    # NOTE(review): only the text content is compared, presumably because `_TOC_` arrives
    # rendered as emphasis around `TOC` - confirm
    elif child.tag == "p" and "".join(child.itertext()) in ["[[TOC]]", "[TOC]"]:
        return self._transform_toc(child)

    # <div class="admonition note">
    # <p class="admonition-title">Note</p>
    # <p>...</p>
    # </div>
    #
    # --- OR ---
    #
    # <div class="admonition note">
    # <p>...</p>
    # </div>
    #
    # match `admonition` as a whole class token, not as a substring of another class name
    elif child.tag == "div" and "admonition" in child.attrib.get("class", "").split():
        return self._transform_admonition(child)

    # Alerts in GitHub
    # <blockquote>
    # <p>[!TIP] ...</p>
    # </blockquote>
    elif (
        child.tag == "blockquote"
        and len(child) > 0
        and child[0].tag == "p"
        and child[0].text is not None
        and child[0].text.startswith("[!")
    ):
        return self._transform_alert(child)

    # <details markdown="1">
    # <summary>...</summary>
    # ...
    # </details>
    elif child.tag == "details" and len(child) > 1 and child[0].tag == "summary":
        return self._transform_section(child)

    # <img src="..." alt="..." />
    elif child.tag == "img":
        return self._transform_image(child)

    # <a href="..."> ... </a>
    elif child.tag == "a":
        self._transform_link(child)
        return None

    # <pre><code class="language-java"> ... </code></pre>
    elif child.tag == "pre" and len(child) == 1 and child[0].tag == "code":
        return self._transform_block(child[0])

    return None
697
+
698
+
699
class ConfluenceStorageFormatCleaner(NodeVisitor):
    "Removes volatile attributes from a Confluence storage format XHTML document."

    def transform(self, child: ET._Element) -> Optional[ET._Element]:
        "Drops `ac:macro-id` and `ri:version-at-save` from the element, if present; never replaces it."

        volatile_attributes = (
            ET.QName(namespaces["ac"], "macro-id"),
            ET.QName(namespaces["ri"], "version-at-save"),
        )
        for attribute in volatile_attributes:
            child.attrib.pop(attribute, None)
        return None
706
+
707
+
708
class DocumentError(RuntimeError):
    "Raised when a construct in the document does not have the structure a transformation expects."
710
+
711
+
712
def extract_value(pattern: str, string: str) -> Tuple[Optional[str], str]:
    """
    Extracts the first occurrence of a pattern from a string and removes it.

    The pattern is matched with `re.ASCII` semantics. Only the first match is considered.

    :param pattern: Regular expression with at least one capture group.
    :param string: Text to search.
    :returns: The text captured by the first group (or `None` if there is no match), and the
        input string with the entire matched span removed.
    """

    # flags are passed by keyword: positional `count`/`flags` are deprecated in `re.sub`
    # as of Python 3.13, and a single search avoids the callback altogether
    match = re.search(pattern, string, flags=re.ASCII)
    if match is None:
        return None, string

    remainder = string[: match.start()] + string[match.end() :]
    return match.group(1), remainder
722
+
723
+
724
@dataclass
class ConfluenceQualifiedID:
    """
    Identifies a Confluence page, optionally qualified with a space.

    :param page_id: Page identifier, kept as a string.
    :param space_key: Key of the space the page belongs to, if given.
    """

    page_id: str
    space_key: Optional[str] = None
728
+
729
+
730
def extract_qualified_id(string: str) -> Tuple[Optional[ConfluenceQualifiedID], str]:
    """
    Extracts the Confluence page ID and space key from comment tags in the text.

    The space key is looked for only when a page ID has been found.

    :param string: Document text, possibly with `<!-- confluence-page-id: ... -->` and
        `<!-- confluence-space-key: ... -->` comments.
    :returns: The qualified ID (or `None` when there is no page ID), and the text with the
        extracted comments removed.
    """

    page_id, remainder = extract_value(
        r"<!--\s+confluence-page-id:\s*(\d+)\s+-->", string
    )

    if page_id is not None:
        # extract Confluence space key
        space_key, remainder = extract_value(
            r"<!--\s+confluence-space-key:\s*(\S+)\s+-->", remainder
        )
        return ConfluenceQualifiedID(page_id, space_key), remainder

    return None, remainder
742
+
743
+
744
@dataclass
class ConfluenceDocumentOptions:
    """
    Options that control the generated page content.

    :param ignore_invalid_url: When true, ignore invalid URLs in input, emit a warning and replace the anchor with
        plain text; when false, raise an exception.
    :param heading_anchors: When true, emit a structured macro *anchor* for each section heading using GitHub
        conversion rules for the identifier.
    :param generated_by: Text to use as the generated-by prompt shown in an info panel at the top of the page;
        `None` omits the panel. A `generated-by` comment tag in the document overrides the text.
    :param root_page_id: Not read by the code in this part of the module; presumably the Confluence page that
        acts as the root for synchronized pages (to be confirmed against the caller).
    :param render_mermaid: Whether to pre-render Mermaid diagrams into PNG/SVG images.
    :param diagram_output_format: Target image format for diagrams.
    """

    ignore_invalid_url: bool = False
    heading_anchors: bool = False
    generated_by: Optional[str] = "This page has been generated with a tool."
    root_page_id: Optional[str] = None
    render_mermaid: bool = False
    diagram_output_format: Literal["png", "svg"] = "png"
765
+
766
+
767
class ConfluenceDocument:
    """
    A Markdown file converted into Confluence storage format.

    Reading the file, extracting the tags embedded in it, and the conversion itself all take
    place in the constructor; `xhtml` serializes the result.
    """

    id: ConfluenceQualifiedID
    links: List[str]
    images: List[str]
    # rendered diagrams keyed by attachment file name, as collected by the converter
    # NOTE(review): value type assumed to be `bytes` (the data is fed to `hashlib.md5`) - confirm
    embedded_images: Dict[str, bytes]

    options: ConfluenceDocumentOptions
    root: ET._Element

    def __init__(
        self,
        path: pathlib.Path,
        options: ConfluenceDocumentOptions,
        page_metadata: Dict[pathlib.Path, ConfluencePageMetadata],
    ) -> None:
        """
        Reads and converts a Markdown file.

        :param path: Path to the Markdown file; made absolute before use.
        :param options: Options that control the generated page content.
        :param page_metadata: Passed on to the converter unchanged.
        :raises ValueError: The file has no `confluence-page-id` comment tag.
        """

        self.options = options
        path = path.absolute()

        with open(path, "r", encoding="utf-8") as f:
            text = f.read()

        # extract Confluence page ID
        qualified_id, text = extract_qualified_id(text)
        if qualified_id is None:
            raise ValueError("missing Confluence page ID")
        self.id = qualified_id

        # extract 'generated-by' tag text
        # the capture is non-greedy so that it stops at the first `-->` and leaves out
        # trailing whitespace, rather than running on to a later comment on the same line
        generated_by_tag, text = extract_value(
            r"<!--\s+generated-by:\s*(.*?)\s+-->", text
        )

        # strip frontmatter; its content is not used
        _frontmatter, text = extract_value(r"(?ms)\A---$(.+?)^---$", text)

        # convert to HTML
        html = markdown_to_html(text)

        # parse Markdown document
        if self.options.generated_by is not None:
            # a tag in the document takes precedence over the configured text
            generated_by = self.options.generated_by
            if generated_by_tag is not None:
                generated_by = generated_by_tag

            # NOTE(review): `generated_by` is inserted into the XML without escaping, so
            # characters such as `&` or `<` in the text end up as markup - confirm intent
            content = [
                '<ac:structured-macro ac:name="info" ac:schema-version="1">',
                f"<ac:rich-text-body><p>{generated_by}</p></ac:rich-text-body>",
                "</ac:structured-macro>",
                html,
            ]
        else:
            content = [html]
        self.root = elements_from_strings(content)

        converter = ConfluenceStorageFormatConverter(
            ConfluenceConverterOptions(
                ignore_invalid_url=self.options.ignore_invalid_url,
                heading_anchors=self.options.heading_anchors,
                render_mermaid=self.options.render_mermaid,
                diagram_output_format=self.options.diagram_output_format,
            ),
            path,
            page_metadata,
        )
        converter.visit(self.root)
        self.links = converter.links
        self.images = converter.images
        self.embedded_images = converter.embedded_images

    def xhtml(self) -> str:
        "Serializes the converted document with `elements_to_string`."

        return elements_to_string(self.root)
837
+
838
+
839
def attachment_name(name: str) -> str:
    """
    Produces a name that is safe to use with attachment uploads.

    Every character outside the allowed set is replaced by an underscore, one for one.

    Allowed characters:
    * Alphanumeric characters: 0-9, a-z, A-Z
    * Special characters: hyphen (-), underscore (_), period (.)

    :param name: Proposed attachment name.
    :returns: The name with disallowed characters substituted.
    """

    disallowed = re.compile(r"[^\-0-9A-Za-z_.]")
    return disallowed.sub("_", name)
849
+
850
+
851
def sanitize_confluence(html: str) -> str:
    """
    Generates a sanitized version of a Confluence storage format XHTML document with no volatile attributes.

    :param html: Document in Confluence storage format; an empty value yields an empty string.
    :returns: The document serialized again after `ConfluenceStorageFormatCleaner` has visited it.
    """

    if html:
        tree = elements_from_strings([html])
        cleaner = ConfluenceStorageFormatCleaner()
        cleaner.visit(tree)
        return elements_to_string(tree)

    return ""
860
+
861
+
862
def elements_to_string(root: ET._Element) -> str:
    """
    Serializes the content of the root element, leaving out the root element itself.

    :param root: Element that is expected to serialize as `<root ...> ... </root>`.
    :returns: The XML text between the opening and closing tags of the root element.
    :raises ValueError: The serialized text does not have the expected `<root ...>` wrapper.
    """

    serialized = ET.tostring(root, encoding="utf8", method="xml")
    xml = serialized.decode("utf8")

    # the opening tag must carry at least one attribute or namespace declaration to match
    wrapper = re.match(r"^<root\s+[^>]*>(.*)</root>\s*$", xml, re.DOTALL)
    if not wrapper:
        raise ValueError("expected: Confluence content")
    return wrapper.group(1)