markdown-to-confluence 0.1.11__py3-none-any.whl → 0.1.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
md2conf/converter.py CHANGED
@@ -1,602 +1,626 @@
1
- import importlib.resources as resources
2
- import logging
3
- import os.path
4
- import pathlib
5
- import re
6
- import sys
7
- from dataclasses import dataclass
8
- from typing import Dict, List, Optional, Tuple
9
- from urllib.parse import ParseResult, urlparse, urlunparse
10
-
11
- import lxml.etree as ET
12
- import markdown
13
- from lxml.builder import ElementMaker
14
-
15
- namespaces = {
16
- "ac": "http://atlassian.com/content",
17
- "ri": "http://atlassian.com/resource/identifier",
18
- }
19
- for key, value in namespaces.items():
20
- ET.register_namespace(key, value)
21
-
22
-
23
- HTML = ElementMaker()
24
- AC = ElementMaker(namespace=namespaces["ac"])
25
- RI = ElementMaker(namespace=namespaces["ri"])
26
-
27
- LOGGER = logging.getLogger(__name__)
28
-
29
-
30
- class ParseError(RuntimeError):
31
- pass
32
-
33
-
34
- def is_absolute_url(url: str) -> bool:
35
- urlparts = urlparse(url)
36
- return bool(urlparts.scheme) or bool(urlparts.netloc)
37
-
38
-
39
- def is_relative_url(url: str) -> bool:
40
- urlparts = urlparse(url)
41
- return not bool(urlparts.scheme) and not bool(urlparts.netloc)
42
-
43
-
44
- def markdown_to_html(content: str) -> str:
45
- return markdown.markdown(
46
- content,
47
- extensions=[
48
- "admonition",
49
- "markdown.extensions.tables",
50
- "markdown.extensions.fenced_code",
51
- "pymdownx.magiclink",
52
- "pymdownx.tilde",
53
- "sane_lists",
54
- ],
55
- )
56
-
57
-
58
- def _elements_from_strings(dtd_path: pathlib.Path, items: List[str]) -> ET._Element:
59
- """
60
- Creates a fragment of several XML nodes from their string representation wrapped in a root element.
61
-
62
- :param dtd_path: Path to a DTD document that defines entities like ¢ or ©.
63
- :param items: Strings to parse into XML fragments.
64
- :returns: An XML document as an element tree.
65
- """
66
-
67
- parser = ET.XMLParser(
68
- remove_blank_text=True,
69
- strip_cdata=False,
70
- load_dtd=True,
71
- )
72
-
73
- ns_attr_list = "".join(
74
- f' xmlns:{key}="{value}"' for key, value in namespaces.items()
75
- )
76
-
77
- data = [
78
- '<?xml version="1.0"?>',
79
- f'<!DOCTYPE ac:confluence PUBLIC "-//Atlassian//Confluence 4 Page//EN" "{dtd_path}">'
80
- f"<root{ns_attr_list}>",
81
- ]
82
- data.extend(items)
83
- data.append("</root>")
84
-
85
- try:
86
- return ET.fromstringlist(data, parser=parser)
87
- except ET.XMLSyntaxError as e:
88
- raise ParseError(e)
89
-
90
-
91
- def elements_from_strings(items: List[str]) -> ET._Element:
92
- "Creates a fragment of several XML nodes from their string representation wrapped in a root element."
93
-
94
- if sys.version_info >= (3, 9):
95
- resource_path = resources.files(__package__).joinpath("entities.dtd")
96
- with resources.as_file(resource_path) as dtd_path:
97
- return _elements_from_strings(dtd_path, items)
98
- else:
99
- with resources.path(__package__, "entities.dtd") as dtd_path:
100
- return _elements_from_strings(dtd_path, items)
101
-
102
-
103
- _languages = [
104
- "abap",
105
- "actionscript3",
106
- "ada",
107
- "applescript",
108
- "arduino",
109
- "autoit",
110
- "bash",
111
- "c",
112
- "clojure",
113
- "coffeescript",
114
- "coldfusion",
115
- "cpp",
116
- "csharp",
117
- "css",
118
- "cuda",
119
- "d",
120
- "dart",
121
- "delphi",
122
- "diff",
123
- "elixir",
124
- "erlang",
125
- "fortran",
126
- "foxpro",
127
- "go",
128
- "graphql",
129
- "groovy",
130
- "haskell",
131
- "haxe",
132
- "html",
133
- "java",
134
- "javafx",
135
- "javascript",
136
- "json",
137
- "jsx",
138
- "julia",
139
- "kotlin",
140
- "livescript",
141
- "lua",
142
- "mathematica",
143
- "matlab",
144
- "objectivec",
145
- "objectivej",
146
- "ocaml",
147
- "octave",
148
- "pascal",
149
- "perl",
150
- "php",
151
- "powershell",
152
- "prolog",
153
- "puppet",
154
- "python",
155
- "qml",
156
- "r",
157
- "racket",
158
- "rst",
159
- "ruby",
160
- "rust",
161
- "sass",
162
- "scala",
163
- "scheme",
164
- "shell",
165
- "smalltalk",
166
- "splunk",
167
- "sql",
168
- "standardml",
169
- "swift",
170
- "tcl",
171
- "tex",
172
- "tsx",
173
- "typescript",
174
- "vala",
175
- "vb",
176
- "verilog",
177
- "vhdl",
178
- "xml",
179
- "xquery",
180
- "yaml",
181
- ]
182
-
183
-
184
- @dataclass
185
- class ConfluencePageMetadata:
186
- domain: str
187
- base_path: str
188
- page_id: str
189
- space_key: str
190
- title: str
191
-
192
-
193
- class NodeVisitor:
194
- def visit(self, node: ET._Element) -> None:
195
- if len(node) < 1:
196
- return
197
-
198
- for index in range(len(node)):
199
- source = node[index]
200
- target = self.transform(source)
201
- if target is not None:
202
- node[index] = target
203
- else:
204
- self.visit(source)
205
-
206
- def transform(self, child: ET._Element) -> Optional[ET._Element]:
207
- pass
208
-
209
-
210
- def _change_ext(path: str, target_ext: str) -> str:
211
- root, source_ext = os.path.splitext(path)
212
- return f"{root}{target_ext}"
213
-
214
-
215
- @dataclass
216
- class ConfluenceConverterOptions:
217
- """
218
- Options for converting an HTML tree into Confluence storage format.
219
-
220
- :param ignore_invalid_url: When true, ignore invalid URLs in input, emit a warning and replace the anchor with
221
- plain text; when false, raise an exception.
222
- """
223
-
224
- ignore_invalid_url: bool = False
225
-
226
-
227
- class ConfluenceStorageFormatConverter(NodeVisitor):
228
- "Transforms a plain HTML tree into the Confluence storage format."
229
-
230
- options: ConfluenceConverterOptions
231
- path: str
232
- base_path: str
233
- links: List[str]
234
- images: List[str]
235
- page_metadata: Dict[str, ConfluencePageMetadata]
236
-
237
- def __init__(
238
- self,
239
- options: ConfluenceConverterOptions,
240
- path: str,
241
- page_metadata: Dict[str, ConfluencePageMetadata],
242
- ) -> None:
243
- super().__init__()
244
- self.options = options
245
- self.path = path
246
- self.base_path = os.path.abspath(os.path.dirname(path)) + os.sep
247
- self.links = []
248
- self.images = []
249
- self.page_metadata = page_metadata
250
-
251
- def _transform_link(self, anchor: ET._Element) -> None:
252
- url = anchor.attrib["href"]
253
- if is_absolute_url(url):
254
- return
255
-
256
- LOGGER.debug(f"found link {url} relative to {self.path}")
257
- relative_url: ParseResult = urlparse(url)
258
-
259
- if (
260
- not relative_url.scheme
261
- and not relative_url.netloc
262
- and not relative_url.path
263
- and not relative_url.params
264
- and not relative_url.query
265
- ):
266
- LOGGER.debug(f"found local URL: {url}")
267
- anchor.attrib["href"] = url
268
- return
269
-
270
- # convert the relative URL to absolute URL based on the base path value, then look up
271
- # the absolute path in the page metadata dictionary to discover the relative path
272
- # within Confluence that should be used
273
- absolute_path = os.path.abspath(os.path.join(self.base_path, relative_url.path))
274
- if not absolute_path.startswith(self.base_path):
275
- msg = f"relative URL points to outside base path: {url}"
276
- if self.options.ignore_invalid_url:
277
- LOGGER.warning(msg)
278
- anchor.attrib.pop("href")
279
- return
280
- else:
281
- raise DocumentError(msg)
282
-
283
- relative_path = os.path.relpath(absolute_path, self.base_path)
284
-
285
- link_metadata = self.page_metadata.get(absolute_path)
286
- if link_metadata is None:
287
- msg = f"unable to find matching page for URL: {url}"
288
- if self.options.ignore_invalid_url:
289
- LOGGER.warning(msg)
290
- anchor.attrib.pop("href")
291
- return
292
- else:
293
- raise DocumentError(msg)
294
-
295
- LOGGER.debug(
296
- f"found link to page {relative_path} with metadata: {link_metadata}"
297
- )
298
- self.links.append(url)
299
-
300
- components = ParseResult(
301
- scheme="https",
302
- netloc=link_metadata.domain,
303
- path=f"{link_metadata.base_path}spaces/{link_metadata.space_key}/pages/{link_metadata.page_id}/{link_metadata.title}",
304
- params="",
305
- query="",
306
- fragment=relative_url.fragment,
307
- )
308
- transformed_url = urlunparse(components)
309
-
310
- LOGGER.debug(f"transformed relative URL: {url} to URL: {transformed_url}")
311
- anchor.attrib["href"] = transformed_url
312
-
313
- def _transform_image(self, image: ET._Element) -> ET._Element:
314
- path: str = image.attrib["src"]
315
-
316
- # prefer PNG over SVG; Confluence displays SVG in wrong size, and text labels are truncated
317
- if path and is_relative_url(path) and path.endswith(".svg"):
318
- replacement_path = _change_ext(path, ".png")
319
- if os.path.exists(os.path.join(self.base_path, replacement_path)):
320
- path = replacement_path
321
-
322
- self.images.append(path)
323
- caption = image.attrib["alt"]
324
- return AC(
325
- "image",
326
- {
327
- ET.QName(namespaces["ac"], "align"): "center",
328
- ET.QName(namespaces["ac"], "layout"): "center",
329
- },
330
- RI("attachment", {ET.QName(namespaces["ri"], "filename"): path}),
331
- AC("caption", HTML.p(caption)),
332
- )
333
-
334
- def _transform_block(self, code: ET._Element) -> ET._Element:
335
- language = code.attrib.get("class")
336
- if language:
337
- m = re.match("^language-(.*)$", language)
338
- if m:
339
- language = m.group(1)
340
- else:
341
- language = "none"
342
- if language not in _languages:
343
- language = "none"
344
- content: str = code.text or ""
345
- content = content.rstrip()
346
- return AC(
347
- "structured-macro",
348
- {
349
- ET.QName(namespaces["ac"], "name"): "code",
350
- ET.QName(namespaces["ac"], "schema-version"): "1",
351
- },
352
- AC("parameter", {ET.QName(namespaces["ac"], "name"): "theme"}, "Midnight"),
353
- AC("parameter", {ET.QName(namespaces["ac"], "name"): "language"}, language),
354
- AC(
355
- "parameter", {ET.QName(namespaces["ac"], "name"): "linenumbers"}, "true"
356
- ),
357
- AC("plain-text-body", ET.CDATA(content)),
358
- )
359
-
360
- def _transform_toc(self, code: ET._Element) -> ET._Element:
361
- return AC(
362
- "structured-macro",
363
- {
364
- ET.QName(namespaces["ac"], "name"): "toc",
365
- ET.QName(namespaces["ac"], "schema-version"): "1",
366
- },
367
- AC("parameter", {ET.QName(namespaces["ac"], "name"): "outline"}, "clear"),
368
- AC("parameter", {ET.QName(namespaces["ac"], "name"): "style"}, "default"),
369
- )
370
-
371
- def _transform_admonition(self, elem: ET._Element) -> ET._Element:
372
- """
373
- Creates an info, tip, note or warning panel.
374
-
375
- Transforms [Python-Markdown admonition](https://python-markdown.github.io/extensions/admonition/) syntax
376
- into Confluence structured macro syntax.
377
- """
378
-
379
- # <div class="admonition note">
380
- class_list = elem.attrib.get("class", "").split(" ")
381
- class_name: Optional[str] = None
382
- if "info" in class_list:
383
- class_name = "info"
384
- elif "tip" in class_list:
385
- class_name = "tip"
386
- elif "note" in class_list:
387
- class_name = "note"
388
- elif "warning" in class_list:
389
- class_name = "warning"
390
-
391
- if class_name is None:
392
- raise DocumentError(f"unsupported admonition label: {class_list}")
393
-
394
- # <p class="admonition-title">Note</p>
395
- if "admonition-title" in elem[0].attrib.get("class", "").split(" "):
396
- content = [
397
- AC(
398
- "parameter",
399
- {ET.QName(namespaces["ac"], "name"): "title"},
400
- elem[0].text,
401
- ),
402
- AC("rich-text-body", {}, *list(elem[1:])),
403
- ]
404
- else:
405
- content = [AC("rich-text-body", {}, *list(elem))]
406
-
407
- return AC(
408
- "structured-macro",
409
- {
410
- ET.QName(namespaces["ac"], "name"): class_name,
411
- ET.QName(namespaces["ac"], "schema-version"): "1",
412
- },
413
- *content,
414
- )
415
-
416
- def transform(self, child: ET._Element) -> Optional[ET._Element]:
417
- # normalize line breaks to regular space in element text
418
- if child.text:
419
- text: str = child.text
420
- child.text = text.replace("\n", " ")
421
- if child.tail:
422
- tail: str = child.tail
423
- child.tail = tail.replace("\n", " ")
424
-
425
- # <p><img src="..." /></p>
426
- if child.tag == "p" and len(child) == 1 and child[0].tag == "img":
427
- return self._transform_image(child[0])
428
-
429
- # <p>[[_TOC_]]</p>
430
- # <p>[TOC]</p>
431
- elif child.tag == "p" and "".join(child.itertext()) in ["[[TOC]]", "[TOC]"]:
432
- return self._transform_toc(child)
433
-
434
- # <div class="admonition note">
435
- # <p class="admonition-title">Note</p>
436
- # <p>...</p>
437
- # </div>
438
- #
439
- # --- OR ---
440
- #
441
- # <div class="admonition note">
442
- # <p>...</p>
443
- # </div>
444
- elif child.tag == "div" and "admonition" in child.attrib.get("class", ""):
445
- return self._transform_admonition(child)
446
-
447
- # <img src="..." alt="..." />
448
- elif child.tag == "img":
449
- return self._transform_image(child)
450
-
451
- # <a href="..."> ... </a>
452
- elif child.tag == "a":
453
- self._transform_link(child)
454
- return None
455
-
456
- # <pre><code class="language-java"> ... </code></pre>
457
- elif child.tag == "pre" and len(child) == 1 and child[0].tag == "code":
458
- return self._transform_block(child[0])
459
-
460
- return None
461
-
462
-
463
- class ConfluenceStorageFormatCleaner(NodeVisitor):
464
- "Removes volatile attributes from a Confluence storage format XHTML document."
465
-
466
- def transform(self, child: ET._Element) -> Optional[ET._Element]:
467
- child.attrib.pop(ET.QName(namespaces["ac"], "macro-id"), None)
468
- child.attrib.pop(ET.QName(namespaces["ri"], "version-at-save"), None)
469
- return None
470
-
471
-
472
- class DocumentError(RuntimeError):
473
- pass
474
-
475
-
476
- def extract_value(pattern: str, string: str) -> Tuple[Optional[str], str]:
477
- values: List[str] = []
478
-
479
- def _repl_func(matchobj: re.Match) -> str:
480
- values.append(matchobj.group(1))
481
- return ""
482
-
483
- string = re.sub(pattern, _repl_func, string, 1, re.ASCII)
484
- value = values[0] if values else None
485
- return value, string
486
-
487
-
488
- @dataclass
489
- class ConfluenceQualifiedID:
490
- page_id: str
491
- space_key: Optional[str] = None
492
-
493
-
494
- def extract_qualified_id(string: str) -> Tuple[Optional[ConfluenceQualifiedID], str]:
495
- page_id, string = extract_value(r"<!--\s+confluence-page-id:\s*(\d+)\s+-->", string)
496
-
497
- if page_id is None:
498
- return None, string
499
-
500
- # extract Confluence space key
501
- space_key, string = extract_value(
502
- r"<!--\s+confluence-space-key:\s*(\S+)\s+-->", string
503
- )
504
-
505
- return ConfluenceQualifiedID(page_id, space_key), string
506
-
507
-
508
- @dataclass
509
- class ConfluenceDocumentOptions:
510
- """
511
- Options that control the generated page content.
512
-
513
- :param ignore_invalid_url: When true, ignore invalid URLs in input, emit a warning and replace the anchor with
514
- plain text; when false, raise an exception.
515
- :param show_generated: Whether to display a prompt "This page has been generated with a tool."
516
- """
517
-
518
- ignore_invalid_url: bool = False
519
- generated_by: Optional[str] = "This page has been generated with a tool."
520
- root_page_id: Optional[str] = None
521
-
522
-
523
- class ConfluenceDocument:
524
- id: ConfluenceQualifiedID
525
- links: List[str]
526
- images: List[str]
527
-
528
- options: ConfluenceDocumentOptions
529
- root: ET._Element
530
-
531
- def __init__(
532
- self,
533
- path: str,
534
- options: ConfluenceDocumentOptions,
535
- page_metadata: Dict[str, ConfluencePageMetadata],
536
- ) -> None:
537
- self.options = options
538
- path = os.path.abspath(path)
539
-
540
- with open(path, "r") as f:
541
- html = markdown_to_html(f.read())
542
-
543
- # extract Confluence page ID
544
- qualified_id, html = extract_qualified_id(html)
545
- if qualified_id is None:
546
- raise ValueError("missing Confluence page ID")
547
- self.id = qualified_id
548
-
549
- # extract 'generated-by' tag text
550
- generated_by_tag, html = extract_value(
551
- r"<!--\s+generated-by:\s*(.*)\s+-->", html
552
- )
553
-
554
- # parse Markdown document
555
- if self.options.generated_by is not None:
556
- generated_by = self.options.generated_by
557
- if generated_by_tag is not None:
558
- generated_by = generated_by_tag
559
-
560
- content = [
561
- '<ac:structured-macro ac:name="info" ac:schema-version="1">',
562
- f"<ac:rich-text-body><p>{generated_by}</p></ac:rich-text-body>",
563
- "</ac:structured-macro>",
564
- html,
565
- ]
566
- else:
567
- content = [html]
568
- self.root = elements_from_strings(content)
569
-
570
- converter = ConfluenceStorageFormatConverter(
571
- ConfluenceConverterOptions(
572
- ignore_invalid_url=self.options.ignore_invalid_url
573
- ),
574
- path,
575
- page_metadata,
576
- )
577
- converter.visit(self.root)
578
- self.links = converter.links
579
- self.images = converter.images
580
-
581
- def xhtml(self) -> str:
582
- return _content_to_string(self.root)
583
-
584
-
585
- def sanitize_confluence(html: str) -> str:
586
- "Generates a sanitized version of a Confluence storage format XHTML document with no volatile attributes."
587
-
588
- if not html:
589
- return ""
590
-
591
- root = elements_from_strings([html])
592
- ConfluenceStorageFormatCleaner().visit(root)
593
- return _content_to_string(root)
594
-
595
-
596
- def _content_to_string(root: ET._Element) -> str:
597
- xml = ET.tostring(root, encoding="utf8", method="xml").decode("utf8")
598
- m = re.match(r"^<root\s+[^>]*>(.*)</root>\s*$", xml, re.DOTALL)
599
- if m:
600
- return m.group(1)
601
- else:
602
- raise ValueError("expected: Confluence content")
1
+ # mypy: disable-error-code="dict-item"
2
+
3
+ import importlib.resources as resources
4
+ import logging
5
+ import os.path
6
+ import pathlib
7
+ import re
8
+ import sys
9
+ from dataclasses import dataclass
10
+ from typing import Dict, List, Optional, Tuple
11
+ from urllib.parse import ParseResult, urlparse, urlunparse
12
+
13
+ import lxml.etree as ET
14
+ import markdown
15
+ from lxml.builder import ElementMaker
16
+
17
+ namespaces = {
18
+ "ac": "http://atlassian.com/content",
19
+ "ri": "http://atlassian.com/resource/identifier",
20
+ }
21
+ for key, value in namespaces.items():
22
+ ET.register_namespace(key, value)
23
+
24
+
25
+ HTML = ElementMaker()
26
+ AC = ElementMaker(namespace=namespaces["ac"])
27
+ RI = ElementMaker(namespace=namespaces["ri"])
28
+
29
+ LOGGER = logging.getLogger(__name__)
30
+
31
+
32
+ class ParseError(RuntimeError):
33
+ pass
34
+
35
+
36
+ def is_absolute_url(url: str) -> bool:
37
+ urlparts = urlparse(url)
38
+ return bool(urlparts.scheme) or bool(urlparts.netloc)
39
+
40
+
41
+ def is_relative_url(url: str) -> bool:
42
+ urlparts = urlparse(url)
43
+ return not bool(urlparts.scheme) and not bool(urlparts.netloc)
44
+
45
+
46
+ def markdown_to_html(content: str) -> str:
47
+ return markdown.markdown(
48
+ content,
49
+ extensions=[
50
+ "admonition",
51
+ "markdown.extensions.tables",
52
+ "markdown.extensions.fenced_code",
53
+ "pymdownx.magiclink",
54
+ "pymdownx.tilde",
55
+ "sane_lists",
56
+ ],
57
+ )
58
+
59
+
60
+ def _elements_from_strings(dtd_path: pathlib.Path, items: List[str]) -> ET._Element:
61
+ """
62
+ Creates a fragment of several XML nodes from their string representation wrapped in a root element.
63
+
64
+ :param dtd_path: Path to a DTD document that defines entities like &cent; or &copy;.
65
+ :param items: Strings to parse into XML fragments.
66
+ :returns: An XML document as an element tree.
67
+ """
68
+
69
+ parser = ET.XMLParser(
70
+ remove_blank_text=True,
71
+ strip_cdata=False,
72
+ load_dtd=True,
73
+ )
74
+
75
+ ns_attr_list = "".join(
76
+ f' xmlns:{key}="{value}"' for key, value in namespaces.items()
77
+ )
78
+
79
+ data = [
80
+ '<?xml version="1.0"?>',
81
+ f'<!DOCTYPE ac:confluence PUBLIC "-//Atlassian//Confluence 4 Page//EN" "{dtd_path}">'
82
+ f"<root{ns_attr_list}>",
83
+ ]
84
+ data.extend(items)
85
+ data.append("</root>")
86
+
87
+ try:
88
+ return ET.fromstringlist(data, parser=parser)
89
+ except ET.XMLSyntaxError as e:
90
+ raise ParseError(e)
91
+
92
+
93
+ def elements_from_strings(items: List[str]) -> ET._Element:
94
+ "Creates a fragment of several XML nodes from their string representation wrapped in a root element."
95
+
96
+ if sys.version_info >= (3, 9):
97
+ resource_path = resources.files(__package__).joinpath("entities.dtd")
98
+ with resources.as_file(resource_path) as dtd_path:
99
+ return _elements_from_strings(dtd_path, items)
100
+ else:
101
+ with resources.path(__package__, "entities.dtd") as dtd_path:
102
+ return _elements_from_strings(dtd_path, items)
103
+
104
+
105
+ _languages = [
106
+ "abap",
107
+ "actionscript3",
108
+ "ada",
109
+ "applescript",
110
+ "arduino",
111
+ "autoit",
112
+ "bash",
113
+ "c",
114
+ "clojure",
115
+ "coffeescript",
116
+ "coldfusion",
117
+ "cpp",
118
+ "csharp",
119
+ "css",
120
+ "cuda",
121
+ "d",
122
+ "dart",
123
+ "delphi",
124
+ "diff",
125
+ "elixir",
126
+ "erlang",
127
+ "fortran",
128
+ "foxpro",
129
+ "go",
130
+ "graphql",
131
+ "groovy",
132
+ "haskell",
133
+ "haxe",
134
+ "html",
135
+ "java",
136
+ "javafx",
137
+ "javascript",
138
+ "json",
139
+ "jsx",
140
+ "julia",
141
+ "kotlin",
142
+ "livescript",
143
+ "lua",
144
+ "mathematica",
145
+ "matlab",
146
+ "objectivec",
147
+ "objectivej",
148
+ "ocaml",
149
+ "octave",
150
+ "pascal",
151
+ "perl",
152
+ "php",
153
+ "powershell",
154
+ "prolog",
155
+ "puppet",
156
+ "python",
157
+ "qml",
158
+ "r",
159
+ "racket",
160
+ "rst",
161
+ "ruby",
162
+ "rust",
163
+ "sass",
164
+ "scala",
165
+ "scheme",
166
+ "shell",
167
+ "smalltalk",
168
+ "splunk",
169
+ "sql",
170
+ "standardml",
171
+ "swift",
172
+ "tcl",
173
+ "tex",
174
+ "tsx",
175
+ "typescript",
176
+ "vala",
177
+ "vb",
178
+ "verilog",
179
+ "vhdl",
180
+ "xml",
181
+ "xquery",
182
+ "yaml",
183
+ ]
184
+
185
+
186
+ @dataclass
187
+ class ConfluencePageMetadata:
188
+ domain: str
189
+ base_path: str
190
+ page_id: str
191
+ space_key: str
192
+ title: str
193
+
194
+
195
+ class NodeVisitor:
196
+ def visit(self, node: ET._Element) -> None:
197
+ if len(node) < 1:
198
+ return
199
+
200
+ for index in range(len(node)):
201
+ source = node[index]
202
+ target = self.transform(source)
203
+ if target is not None:
204
+ node[index] = target
205
+ else:
206
+ self.visit(source)
207
+
208
+ def transform(self, child: ET._Element) -> Optional[ET._Element]:
209
+ pass
210
+
211
+
212
+ @dataclass
213
+ class ConfluenceConverterOptions:
214
+ """
215
+ Options for converting an HTML tree into Confluence storage format.
216
+
217
+ :param ignore_invalid_url: When true, ignore invalid URLs in input, emit a warning and replace the anchor with
218
+ plain text; when false, raise an exception.
219
+ """
220
+
221
+ ignore_invalid_url: bool = False
222
+
223
+
224
+ class ConfluenceStorageFormatConverter(NodeVisitor):
225
+ "Transforms a plain HTML tree into the Confluence storage format."
226
+
227
+ options: ConfluenceConverterOptions
228
+ path: pathlib.Path
229
+ base_path: pathlib.Path
230
+ links: List[str]
231
+ images: List[str]
232
+ page_metadata: Dict[pathlib.Path, ConfluencePageMetadata]
233
+
234
+ def __init__(
235
+ self,
236
+ options: ConfluenceConverterOptions,
237
+ path: pathlib.Path,
238
+ page_metadata: Dict[pathlib.Path, ConfluencePageMetadata],
239
+ ) -> None:
240
+ super().__init__()
241
+ self.options = options
242
+ self.path = path
243
+ self.base_path = path.parent
244
+ self.links = []
245
+ self.images = []
246
+ self.page_metadata = page_metadata
247
+
248
+ def _transform_link(self, anchor: ET._Element) -> None:
249
+ url = anchor.attrib["href"]
250
+ if is_absolute_url(url):
251
+ return
252
+
253
+ LOGGER.debug(f"found link {url} relative to {self.path}")
254
+ relative_url: ParseResult = urlparse(url)
255
+
256
+ if (
257
+ not relative_url.scheme
258
+ and not relative_url.netloc
259
+ and not relative_url.path
260
+ and not relative_url.params
261
+ and not relative_url.query
262
+ ):
263
+ LOGGER.debug(f"found local URL: {url}")
264
+ anchor.attrib["href"] = url
265
+ return
266
+
267
+ # convert the relative URL to absolute URL based on the base path value, then look up
268
+ # the absolute path in the page metadata dictionary to discover the relative path
269
+ # within Confluence that should be used
270
+ absolute_path = (self.base_path / relative_url.path).absolute()
271
+ if not str(absolute_path).startswith(str(self.base_path)):
272
+ msg = f"relative URL {url} points to outside base path: {self.base_path}"
273
+ if self.options.ignore_invalid_url:
274
+ LOGGER.warning(msg)
275
+ anchor.attrib.pop("href")
276
+ return
277
+ else:
278
+ raise DocumentError(msg)
279
+
280
+ relative_path = os.path.relpath(absolute_path, self.base_path)
281
+
282
+ link_metadata = self.page_metadata.get(absolute_path)
283
+ if link_metadata is None:
284
+ msg = f"unable to find matching page for URL: {url}"
285
+ if self.options.ignore_invalid_url:
286
+ LOGGER.warning(msg)
287
+ anchor.attrib.pop("href")
288
+ return
289
+ else:
290
+ raise DocumentError(msg)
291
+
292
+ LOGGER.debug(
293
+ f"found link to page {relative_path} with metadata: {link_metadata}"
294
+ )
295
+ self.links.append(url)
296
+
297
+ components = ParseResult(
298
+ scheme="https",
299
+ netloc=link_metadata.domain,
300
+ path=f"{link_metadata.base_path}spaces/{link_metadata.space_key}/pages/{link_metadata.page_id}/{link_metadata.title}",
301
+ params="",
302
+ query="",
303
+ fragment=relative_url.fragment,
304
+ )
305
+ transformed_url = urlunparse(components)
306
+
307
+ LOGGER.debug(f"transformed relative URL: {url} to URL: {transformed_url}")
308
+ anchor.attrib["href"] = transformed_url
309
+
310
+ def _transform_image(self, image: ET._Element) -> ET._Element:
311
+ path: str = image.attrib["src"]
312
+
313
+ # prefer PNG over SVG; Confluence displays SVG in wrong size, and text labels are truncated
314
+ if path and is_relative_url(path):
315
+ relative_path = pathlib.Path(path)
316
+ if (
317
+ relative_path.suffix == ".svg"
318
+ and (self.base_path / relative_path.with_suffix(".png")).exists()
319
+ ):
320
+ path = str(relative_path.with_suffix(".png"))
321
+
322
+ self.images.append(path)
323
+ caption = image.attrib["alt"]
324
+ return AC(
325
+ "image",
326
+ {
327
+ ET.QName(namespaces["ac"], "align"): "center",
328
+ ET.QName(namespaces["ac"], "layout"): "center",
329
+ },
330
+ RI(
331
+ "attachment",
332
+ {ET.QName(namespaces["ri"], "filename"): attachment_name(path)},
333
+ ),
334
+ AC("caption", HTML.p(caption)),
335
+ )
336
+
337
+ def _transform_block(self, code: ET._Element) -> ET._Element:
338
+ language = code.attrib.get("class")
339
+ if language:
340
+ m = re.match("^language-(.*)$", language)
341
+ if m:
342
+ language = m.group(1)
343
+ else:
344
+ language = "none"
345
+ if language not in _languages:
346
+ language = "none"
347
+ content: str = code.text or ""
348
+ content = content.rstrip()
349
+ return AC(
350
+ "structured-macro",
351
+ {
352
+ ET.QName(namespaces["ac"], "name"): "code",
353
+ ET.QName(namespaces["ac"], "schema-version"): "1",
354
+ },
355
+ AC("parameter", {ET.QName(namespaces["ac"], "name"): "theme"}, "Midnight"),
356
+ AC("parameter", {ET.QName(namespaces["ac"], "name"): "language"}, language),
357
+ AC(
358
+ "parameter", {ET.QName(namespaces["ac"], "name"): "linenumbers"}, "true"
359
+ ),
360
+ AC("plain-text-body", ET.CDATA(content)),
361
+ )
362
+
363
+ def _transform_toc(self, code: ET._Element) -> ET._Element:
364
+ return AC(
365
+ "structured-macro",
366
+ {
367
+ ET.QName(namespaces["ac"], "name"): "toc",
368
+ ET.QName(namespaces["ac"], "schema-version"): "1",
369
+ },
370
+ AC("parameter", {ET.QName(namespaces["ac"], "name"): "outline"}, "clear"),
371
+ AC("parameter", {ET.QName(namespaces["ac"], "name"): "style"}, "default"),
372
+ )
373
+
374
+ def _transform_admonition(self, elem: ET._Element) -> ET._Element:
375
+ """
376
+ Creates an info, tip, note or warning panel.
377
+
378
+ Transforms [Python-Markdown admonition](https://python-markdown.github.io/extensions/admonition/) syntax
379
+ into Confluence structured macro syntax.
380
+ """
381
+
382
+ # <div class="admonition note">
383
+ class_list = elem.attrib.get("class", "").split(" ")
384
+ class_name: Optional[str] = None
385
+ if "info" in class_list:
386
+ class_name = "info"
387
+ elif "tip" in class_list:
388
+ class_name = "tip"
389
+ elif "note" in class_list:
390
+ class_name = "note"
391
+ elif "warning" in class_list:
392
+ class_name = "warning"
393
+
394
+ if class_name is None:
395
+ raise DocumentError(f"unsupported admonition label: {class_list}")
396
+
397
+ for e in elem:
398
+ self.visit(e)
399
+
400
+ # <p class="admonition-title">Note</p>
401
+ if "admonition-title" in elem[0].attrib.get("class", "").split(" "):
402
+ content = [
403
+ AC(
404
+ "parameter",
405
+ {ET.QName(namespaces["ac"], "name"): "title"},
406
+ elem[0].text or "",
407
+ ),
408
+ AC("rich-text-body", {}, *list(elem[1:])),
409
+ ]
410
+ else:
411
+ content = [AC("rich-text-body", {}, *list(elem))]
412
+
413
+ return AC(
414
+ "structured-macro",
415
+ {
416
+ ET.QName(namespaces["ac"], "name"): class_name,
417
+ ET.QName(namespaces["ac"], "schema-version"): "1",
418
+ },
419
+ *content,
420
+ )
421
+
422
+ def transform(self, child: ET._Element) -> Optional[ET._Element]:
423
+ # normalize line breaks to regular space in element text
424
+ if child.text:
425
+ text: str = child.text
426
+ child.text = text.replace("\n", " ")
427
+ if child.tail:
428
+ tail: str = child.tail
429
+ child.tail = tail.replace("\n", " ")
430
+
431
+ # <p><img src="..." /></p>
432
+ if child.tag == "p" and len(child) == 1 and child[0].tag == "img":
433
+ return self._transform_image(child[0])
434
+
435
+ # <p>[[_TOC_]]</p>
436
+ # <p>[TOC]</p>
437
+ elif child.tag == "p" and "".join(child.itertext()) in ["[[TOC]]", "[TOC]"]:
438
+ return self._transform_toc(child)
439
+
440
+ # <div class="admonition note">
441
+ # <p class="admonition-title">Note</p>
442
+ # <p>...</p>
443
+ # </div>
444
+ #
445
+ # --- OR ---
446
+ #
447
+ # <div class="admonition note">
448
+ # <p>...</p>
449
+ # </div>
450
+ elif child.tag == "div" and "admonition" in child.attrib.get("class", ""):
451
+ return self._transform_admonition(child)
452
+
453
+ # <img src="..." alt="..." />
454
+ elif child.tag == "img":
455
+ return self._transform_image(child)
456
+
457
+ # <a href="..."> ... </a>
458
+ elif child.tag == "a":
459
+ self._transform_link(child)
460
+ return None
461
+
462
+ # <pre><code class="language-java"> ... </code></pre>
463
+ elif child.tag == "pre" and len(child) == 1 and child[0].tag == "code":
464
+ return self._transform_block(child[0])
465
+
466
+ return None
467
+
468
+
469
+ class ConfluenceStorageFormatCleaner(NodeVisitor):
470
+ "Removes volatile attributes from a Confluence storage format XHTML document."
471
+
472
+ def transform(self, child: ET._Element) -> Optional[ET._Element]:
473
+ child.attrib.pop(ET.QName(namespaces["ac"], "macro-id"), None)
474
+ child.attrib.pop(ET.QName(namespaces["ri"], "version-at-save"), None)
475
+ return None
476
+
477
+
478
+ class DocumentError(RuntimeError):
479
+ pass
480
+
481
+
482
+ def extract_value(pattern: str, string: str) -> Tuple[Optional[str], str]:
483
+ values: List[str] = []
484
+
485
+ def _repl_func(matchobj: re.Match) -> str:
486
+ values.append(matchobj.group(1))
487
+ return ""
488
+
489
+ string = re.sub(pattern, _repl_func, string, 1, re.ASCII)
490
+ value = values[0] if values else None
491
+ return value, string
492
+
493
+
494
@dataclass
class ConfluenceQualifiedID:
    "Identifies a Confluence page by its page ID and, optionally, the key of the space it belongs to."

    # page ID, kept as the string of digits it was parsed from
    page_id: str

    # space key; `None` when no space key is given
    space_key: Optional[str] = None
498
+
499
+
500
def extract_qualified_id(string: str) -> Tuple[Optional[ConfluenceQualifiedID], str]:
    """
    Reads the `confluence-page-id` and `confluence-space-key` comment directives from a text, and strips them.

    The space key is looked up only when a page ID has been found.

    :param string: Text to scan for directives.
    :returns: The qualified ID (`None` if there is no page ID directive), and the text with the located
        directives removed.
    """

    # extract Confluence page ID
    page_id, remainder = extract_value(
        r"<!--\s+confluence-page-id:\s*(\d+)\s+-->", string
    )

    if page_id is not None:
        # extract Confluence space key
        space_key, remainder = extract_value(
            r"<!--\s+confluence-space-key:\s*(\S+)\s+-->", remainder
        )
        return ConfluenceQualifiedID(page_id, space_key), remainder

    return None, remainder
512
+
513
+
514
@dataclass
class ConfluenceDocumentOptions:
    """
    Options that control the generated page content.

    :param ignore_invalid_url: When true, ignore invalid URLs in input, emit a warning and replace the anchor with
        plain text; when false, raise an exception.
    :param generated_by: Text to display in an info panel at the top of the page, or `None` to omit the panel.
        A `generated-by` comment directive in the Markdown document replaces this text for that document.
    :param root_page_id: Optional ID of a Confluence root page. Not read by `ConfluenceDocument` itself;
        presumably consumed by the code that publishes documents (to be confirmed against callers).
    """

    ignore_invalid_url: bool = False
    generated_by: Optional[str] = "This page has been generated with a tool."
    root_page_id: Optional[str] = None
527
+
528
+
529
class ConfluenceDocument:
    """
    A Markdown file converted into a Confluence storage format document.

    On construction, the file is read, its comment directives and front-matter are stripped, the remaining
    Markdown is converted to HTML, and the resulting element tree is rewritten in place by
    `ConfluenceStorageFormatConverter`.
    """

    # page (and optional space) identifier read from the directives in the file
    id: ConfluenceQualifiedID
    # links gathered by the converter while visiting the tree
    links: List[str]
    # images gathered by the converter while visiting the tree
    images: List[str]

    options: ConfluenceDocumentOptions
    # root of the converted element tree
    root: ET._Element

    def __init__(
        self,
        path: pathlib.Path,
        options: ConfluenceDocumentOptions,
        page_metadata: Dict[pathlib.Path, ConfluencePageMetadata],
    ) -> None:
        """
        Loads and converts a Markdown file.

        :param path: Path to the Markdown file; made absolute before use.
        :param options: Options that control the generated page content.
        :param page_metadata: Metadata keyed by path, forwarded to the converter.
        :raises ValueError: The file has no `confluence-page-id` directive.
        """

        self.options = options
        path = path.absolute()

        with open(path, "r", encoding="utf-8") as f:
            text = f.read()

        # extract Confluence page ID
        qualified_id, text = extract_qualified_id(text)
        if qualified_id is None:
            raise ValueError("missing Confluence page ID")
        self.id = qualified_id

        # extract 'generated-by' tag text
        # (lazy capture: stop at the first closing `-->` and leave trailing whitespace out of the captured text,
        # because the text is spliced into XML below)
        generated_by_tag, text = extract_value(
            r"<!--\s+generated-by:\s*(.*?)\s+-->", text
        )

        # strip front-matter; its content is not used
        _, text = extract_value(r"(?ms)\A---$(.+?)^---$", text)

        # convert to HTML
        html = markdown_to_html(text)

        # parse Markdown document
        if self.options.generated_by is not None:
            # text from the document's own directive takes precedence over the option
            generated_by = self.options.generated_by
            if generated_by_tag is not None:
                generated_by = generated_by_tag

            content = [
                '<ac:structured-macro ac:name="info" ac:schema-version="1">',
                f"<ac:rich-text-body><p>{generated_by}</p></ac:rich-text-body>",
                "</ac:structured-macro>",
                html,
            ]
        else:
            content = [html]
        self.root = elements_from_strings(content)

        converter = ConfluenceStorageFormatConverter(
            ConfluenceConverterOptions(
                ignore_invalid_url=self.options.ignore_invalid_url
            ),
            path,
            page_metadata,
        )
        converter.visit(self.root)
        self.links = converter.links
        self.images = converter.images

    def xhtml(self) -> str:
        "Returns the converted document serialized as a string, without the enclosing root element."

        return _content_to_string(self.root)
595
+
596
+
597
def attachment_name(name: str) -> str:
    """
    Produces a name that is safe to use with attachment uploads.

    Every character outside the allowed set is replaced with an underscore. Allowed characters:
    * Alphanumeric characters: 0-9, a-z, A-Z
    * Special characters: hyphen (-), underscore (_), period (.)

    :param name: Name to make safe.
    :returns: Name of the same length, in which disallowed characters have been substituted.
    """

    disallowed = re.compile(r"[^\-0-9A-Za-z_.]")
    safe_name = disallowed.sub("_", name)
    return safe_name
607
+
608
+
609
def sanitize_confluence(html: str) -> str:
    """
    Generates a sanitized version of a Confluence storage format XHTML document with no volatile attributes.

    :param html: Confluence storage format content.
    :returns: Content with volatile attributes removed, or an empty string if the input is empty.
    """

    if html:
        tree = elements_from_strings([html])
        cleaner = ConfluenceStorageFormatCleaner()
        cleaner.visit(tree)
        return _content_to_string(tree)

    # nothing to parse
    return ""
618
+
619
+
620
def _content_to_string(root: ET._Element) -> str:
    """
    Serializes the children of a root element into a string, leaving out the root element's own tags.

    :param root: Enclosing `<root>` element.
    :returns: Everything between the opening and the closing tag of the root element.
    :raises ValueError: The serialized text is not wrapped in a `<root ...>` element.
    """

    serialized = ET.tostring(root, encoding="utf8", method="xml").decode("utf8")

    # the opening tag must carry at least one attribute or namespace declaration for the pattern to match
    wrapper = re.match(r"^<root\s+[^>]*>(.*)</root>\s*$", serialized, re.DOTALL)
    if not wrapper:
        raise ValueError("expected: Confluence content")
    return wrapper.group(1)