markdown-to-confluence 0.1.12__py3-none-any.whl → 0.1.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
md2conf/converter.py CHANGED
@@ -1,624 +1,626 @@
1
- import importlib.resources as resources
2
- import logging
3
- import os.path
4
- import pathlib
5
- import re
6
- import sys
7
- from dataclasses import dataclass
8
- from typing import Dict, List, Optional, Tuple
9
- from urllib.parse import ParseResult, urlparse, urlunparse
10
-
11
- import lxml.etree as ET
12
- import markdown
13
- from lxml.builder import ElementMaker
14
-
15
- namespaces = {
16
- "ac": "http://atlassian.com/content",
17
- "ri": "http://atlassian.com/resource/identifier",
18
- }
19
- for key, value in namespaces.items():
20
- ET.register_namespace(key, value)
21
-
22
-
23
- HTML = ElementMaker()
24
- AC = ElementMaker(namespace=namespaces["ac"])
25
- RI = ElementMaker(namespace=namespaces["ri"])
26
-
27
- LOGGER = logging.getLogger(__name__)
28
-
29
-
30
- class ParseError(RuntimeError):
31
- pass
32
-
33
-
34
- def is_absolute_url(url: str) -> bool:
35
- urlparts = urlparse(url)
36
- return bool(urlparts.scheme) or bool(urlparts.netloc)
37
-
38
-
39
- def is_relative_url(url: str) -> bool:
40
- urlparts = urlparse(url)
41
- return not bool(urlparts.scheme) and not bool(urlparts.netloc)
42
-
43
-
44
- def markdown_to_html(content: str) -> str:
45
- return markdown.markdown(
46
- content,
47
- extensions=[
48
- "admonition",
49
- "markdown.extensions.tables",
50
- "markdown.extensions.fenced_code",
51
- "pymdownx.magiclink",
52
- "pymdownx.tilde",
53
- "sane_lists",
54
- ],
55
- )
56
-
57
-
58
- def _elements_from_strings(dtd_path: pathlib.Path, items: List[str]) -> ET._Element:
59
- """
60
- Creates a fragment of several XML nodes from their string representation wrapped in a root element.
61
-
62
- :param dtd_path: Path to a DTD document that defines entities like &cent; or &copy;.
63
- :param items: Strings to parse into XML fragments.
64
- :returns: An XML document as an element tree.
65
- """
66
-
67
- parser = ET.XMLParser(
68
- remove_blank_text=True,
69
- strip_cdata=False,
70
- load_dtd=True,
71
- )
72
-
73
- ns_attr_list = "".join(
74
- f' xmlns:{key}="{value}"' for key, value in namespaces.items()
75
- )
76
-
77
- data = [
78
- '<?xml version="1.0"?>',
79
- f'<!DOCTYPE ac:confluence PUBLIC "-//Atlassian//Confluence 4 Page//EN" "{dtd_path}">'
80
- f"<root{ns_attr_list}>",
81
- ]
82
- data.extend(items)
83
- data.append("</root>")
84
-
85
- try:
86
- return ET.fromstringlist(data, parser=parser)
87
- except ET.XMLSyntaxError as e:
88
- raise ParseError(e)
89
-
90
-
91
- def elements_from_strings(items: List[str]) -> ET._Element:
92
- "Creates a fragment of several XML nodes from their string representation wrapped in a root element."
93
-
94
- if sys.version_info >= (3, 9):
95
- resource_path = resources.files(__package__).joinpath("entities.dtd")
96
- with resources.as_file(resource_path) as dtd_path:
97
- return _elements_from_strings(dtd_path, items)
98
- else:
99
- with resources.path(__package__, "entities.dtd") as dtd_path:
100
- return _elements_from_strings(dtd_path, items)
101
-
102
-
103
- _languages = [
104
- "abap",
105
- "actionscript3",
106
- "ada",
107
- "applescript",
108
- "arduino",
109
- "autoit",
110
- "bash",
111
- "c",
112
- "clojure",
113
- "coffeescript",
114
- "coldfusion",
115
- "cpp",
116
- "csharp",
117
- "css",
118
- "cuda",
119
- "d",
120
- "dart",
121
- "delphi",
122
- "diff",
123
- "elixir",
124
- "erlang",
125
- "fortran",
126
- "foxpro",
127
- "go",
128
- "graphql",
129
- "groovy",
130
- "haskell",
131
- "haxe",
132
- "html",
133
- "java",
134
- "javafx",
135
- "javascript",
136
- "json",
137
- "jsx",
138
- "julia",
139
- "kotlin",
140
- "livescript",
141
- "lua",
142
- "mathematica",
143
- "matlab",
144
- "objectivec",
145
- "objectivej",
146
- "ocaml",
147
- "octave",
148
- "pascal",
149
- "perl",
150
- "php",
151
- "powershell",
152
- "prolog",
153
- "puppet",
154
- "python",
155
- "qml",
156
- "r",
157
- "racket",
158
- "rst",
159
- "ruby",
160
- "rust",
161
- "sass",
162
- "scala",
163
- "scheme",
164
- "shell",
165
- "smalltalk",
166
- "splunk",
167
- "sql",
168
- "standardml",
169
- "swift",
170
- "tcl",
171
- "tex",
172
- "tsx",
173
- "typescript",
174
- "vala",
175
- "vb",
176
- "verilog",
177
- "vhdl",
178
- "xml",
179
- "xquery",
180
- "yaml",
181
- ]
182
-
183
-
184
- @dataclass
185
- class ConfluencePageMetadata:
186
- domain: str
187
- base_path: str
188
- page_id: str
189
- space_key: str
190
- title: str
191
-
192
-
193
- class NodeVisitor:
194
- def visit(self, node: ET._Element) -> None:
195
- if len(node) < 1:
196
- return
197
-
198
- for index in range(len(node)):
199
- source = node[index]
200
- target = self.transform(source)
201
- if target is not None:
202
- node[index] = target
203
- else:
204
- self.visit(source)
205
-
206
- def transform(self, child: ET._Element) -> Optional[ET._Element]:
207
- pass
208
-
209
-
210
- @dataclass
211
- class ConfluenceConverterOptions:
212
- """
213
- Options for converting an HTML tree into Confluence storage format.
214
-
215
- :param ignore_invalid_url: When true, ignore invalid URLs in input, emit a warning and replace the anchor with
216
- plain text; when false, raise an exception.
217
- """
218
-
219
- ignore_invalid_url: bool = False
220
-
221
-
222
- class ConfluenceStorageFormatConverter(NodeVisitor):
223
- "Transforms a plain HTML tree into the Confluence storage format."
224
-
225
- options: ConfluenceConverterOptions
226
- path: pathlib.Path
227
- base_path: pathlib.Path
228
- links: List[str]
229
- images: List[str]
230
- page_metadata: Dict[pathlib.Path, ConfluencePageMetadata]
231
-
232
- def __init__(
233
- self,
234
- options: ConfluenceConverterOptions,
235
- path: pathlib.Path,
236
- page_metadata: Dict[pathlib.Path, ConfluencePageMetadata],
237
- ) -> None:
238
- super().__init__()
239
- self.options = options
240
- self.path = path
241
- self.base_path = path.parent
242
- self.links = []
243
- self.images = []
244
- self.page_metadata = page_metadata
245
-
246
- def _transform_link(self, anchor: ET._Element) -> None:
247
- url = anchor.attrib["href"]
248
- if is_absolute_url(url):
249
- return
250
-
251
- LOGGER.debug(f"found link {url} relative to {self.path}")
252
- relative_url: ParseResult = urlparse(url)
253
-
254
- if (
255
- not relative_url.scheme
256
- and not relative_url.netloc
257
- and not relative_url.path
258
- and not relative_url.params
259
- and not relative_url.query
260
- ):
261
- LOGGER.debug(f"found local URL: {url}")
262
- anchor.attrib["href"] = url
263
- return
264
-
265
- # convert the relative URL to absolute URL based on the base path value, then look up
266
- # the absolute path in the page metadata dictionary to discover the relative path
267
- # within Confluence that should be used
268
- absolute_path = (self.base_path / relative_url.path).absolute()
269
- if not str(absolute_path).startswith(str(self.base_path)):
270
- msg = f"relative URL {url} points to outside base path: {self.base_path}"
271
- if self.options.ignore_invalid_url:
272
- LOGGER.warning(msg)
273
- anchor.attrib.pop("href")
274
- return
275
- else:
276
- raise DocumentError(msg)
277
-
278
- relative_path = os.path.relpath(absolute_path, self.base_path)
279
-
280
- link_metadata = self.page_metadata.get(absolute_path)
281
- if link_metadata is None:
282
- msg = f"unable to find matching page for URL: {url}"
283
- if self.options.ignore_invalid_url:
284
- LOGGER.warning(msg)
285
- anchor.attrib.pop("href")
286
- return
287
- else:
288
- raise DocumentError(msg)
289
-
290
- LOGGER.debug(
291
- f"found link to page {relative_path} with metadata: {link_metadata}"
292
- )
293
- self.links.append(url)
294
-
295
- components = ParseResult(
296
- scheme="https",
297
- netloc=link_metadata.domain,
298
- path=f"{link_metadata.base_path}spaces/{link_metadata.space_key}/pages/{link_metadata.page_id}/{link_metadata.title}",
299
- params="",
300
- query="",
301
- fragment=relative_url.fragment,
302
- )
303
- transformed_url = urlunparse(components)
304
-
305
- LOGGER.debug(f"transformed relative URL: {url} to URL: {transformed_url}")
306
- anchor.attrib["href"] = transformed_url
307
-
308
- def _transform_image(self, image: ET._Element) -> ET._Element:
309
- path: str = image.attrib["src"]
310
-
311
- # prefer PNG over SVG; Confluence displays SVG in wrong size, and text labels are truncated
312
- if path and is_relative_url(path):
313
- relative_path = pathlib.Path(path)
314
- if (
315
- relative_path.suffix == ".svg"
316
- and (self.base_path / relative_path.with_suffix(".png")).exists()
317
- ):
318
- path = str(relative_path.with_suffix(".png"))
319
-
320
- self.images.append(path)
321
- caption = image.attrib["alt"]
322
- return AC(
323
- "image",
324
- {
325
- ET.QName(namespaces["ac"], "align"): "center",
326
- ET.QName(namespaces["ac"], "layout"): "center",
327
- },
328
- RI(
329
- "attachment",
330
- {ET.QName(namespaces["ri"], "filename"): attachment_name(path)},
331
- ),
332
- AC("caption", HTML.p(caption)),
333
- )
334
-
335
- def _transform_block(self, code: ET._Element) -> ET._Element:
336
- language = code.attrib.get("class")
337
- if language:
338
- m = re.match("^language-(.*)$", language)
339
- if m:
340
- language = m.group(1)
341
- else:
342
- language = "none"
343
- if language not in _languages:
344
- language = "none"
345
- content: str = code.text or ""
346
- content = content.rstrip()
347
- return AC(
348
- "structured-macro",
349
- {
350
- ET.QName(namespaces["ac"], "name"): "code",
351
- ET.QName(namespaces["ac"], "schema-version"): "1",
352
- },
353
- AC("parameter", {ET.QName(namespaces["ac"], "name"): "theme"}, "Midnight"),
354
- AC("parameter", {ET.QName(namespaces["ac"], "name"): "language"}, language),
355
- AC(
356
- "parameter", {ET.QName(namespaces["ac"], "name"): "linenumbers"}, "true"
357
- ),
358
- AC("plain-text-body", ET.CDATA(content)),
359
- )
360
-
361
- def _transform_toc(self, code: ET._Element) -> ET._Element:
362
- return AC(
363
- "structured-macro",
364
- {
365
- ET.QName(namespaces["ac"], "name"): "toc",
366
- ET.QName(namespaces["ac"], "schema-version"): "1",
367
- },
368
- AC("parameter", {ET.QName(namespaces["ac"], "name"): "outline"}, "clear"),
369
- AC("parameter", {ET.QName(namespaces["ac"], "name"): "style"}, "default"),
370
- )
371
-
372
- def _transform_admonition(self, elem: ET._Element) -> ET._Element:
373
- """
374
- Creates an info, tip, note or warning panel.
375
-
376
- Transforms [Python-Markdown admonition](https://python-markdown.github.io/extensions/admonition/) syntax
377
- into Confluence structured macro syntax.
378
- """
379
-
380
- # <div class="admonition note">
381
- class_list = elem.attrib.get("class", "").split(" ")
382
- class_name: Optional[str] = None
383
- if "info" in class_list:
384
- class_name = "info"
385
- elif "tip" in class_list:
386
- class_name = "tip"
387
- elif "note" in class_list:
388
- class_name = "note"
389
- elif "warning" in class_list:
390
- class_name = "warning"
391
-
392
- if class_name is None:
393
- raise DocumentError(f"unsupported admonition label: {class_list}")
394
-
395
- for e in elem:
396
- self.visit(e)
397
-
398
- # <p class="admonition-title">Note</p>
399
- if "admonition-title" in elem[0].attrib.get("class", "").split(" "):
400
- content = [
401
- AC(
402
- "parameter",
403
- {ET.QName(namespaces["ac"], "name"): "title"},
404
- elem[0].text,
405
- ),
406
- AC("rich-text-body", {}, *list(elem[1:])),
407
- ]
408
- else:
409
- content = [AC("rich-text-body", {}, *list(elem))]
410
-
411
- return AC(
412
- "structured-macro",
413
- {
414
- ET.QName(namespaces["ac"], "name"): class_name,
415
- ET.QName(namespaces["ac"], "schema-version"): "1",
416
- },
417
- *content,
418
- )
419
-
420
- def transform(self, child: ET._Element) -> Optional[ET._Element]:
421
- # normalize line breaks to regular space in element text
422
- if child.text:
423
- text: str = child.text
424
- child.text = text.replace("\n", " ")
425
- if child.tail:
426
- tail: str = child.tail
427
- child.tail = tail.replace("\n", " ")
428
-
429
- # <p><img src="..." /></p>
430
- if child.tag == "p" and len(child) == 1 and child[0].tag == "img":
431
- return self._transform_image(child[0])
432
-
433
- # <p>[[_TOC_]]</p>
434
- # <p>[TOC]</p>
435
- elif child.tag == "p" and "".join(child.itertext()) in ["[[TOC]]", "[TOC]"]:
436
- return self._transform_toc(child)
437
-
438
- # <div class="admonition note">
439
- # <p class="admonition-title">Note</p>
440
- # <p>...</p>
441
- # </div>
442
- #
443
- # --- OR ---
444
- #
445
- # <div class="admonition note">
446
- # <p>...</p>
447
- # </div>
448
- elif child.tag == "div" and "admonition" in child.attrib.get("class", ""):
449
- return self._transform_admonition(child)
450
-
451
- # <img src="..." alt="..." />
452
- elif child.tag == "img":
453
- return self._transform_image(child)
454
-
455
- # <a href="..."> ... </a>
456
- elif child.tag == "a":
457
- self._transform_link(child)
458
- return None
459
-
460
- # <pre><code class="language-java"> ... </code></pre>
461
- elif child.tag == "pre" and len(child) == 1 and child[0].tag == "code":
462
- return self._transform_block(child[0])
463
-
464
- return None
465
-
466
-
467
- class ConfluenceStorageFormatCleaner(NodeVisitor):
468
- "Removes volatile attributes from a Confluence storage format XHTML document."
469
-
470
- def transform(self, child: ET._Element) -> Optional[ET._Element]:
471
- child.attrib.pop(ET.QName(namespaces["ac"], "macro-id"), None)
472
- child.attrib.pop(ET.QName(namespaces["ri"], "version-at-save"), None)
473
- return None
474
-
475
-
476
- class DocumentError(RuntimeError):
477
- pass
478
-
479
-
480
- def extract_value(pattern: str, string: str) -> Tuple[Optional[str], str]:
481
- values: List[str] = []
482
-
483
- def _repl_func(matchobj: re.Match) -> str:
484
- values.append(matchobj.group(1))
485
- return ""
486
-
487
- string = re.sub(pattern, _repl_func, string, 1, re.ASCII)
488
- value = values[0] if values else None
489
- return value, string
490
-
491
-
492
- @dataclass
493
- class ConfluenceQualifiedID:
494
- page_id: str
495
- space_key: Optional[str] = None
496
-
497
-
498
- def extract_qualified_id(string: str) -> Tuple[Optional[ConfluenceQualifiedID], str]:
499
- page_id, string = extract_value(r"<!--\s+confluence-page-id:\s*(\d+)\s+-->", string)
500
-
501
- if page_id is None:
502
- return None, string
503
-
504
- # extract Confluence space key
505
- space_key, string = extract_value(
506
- r"<!--\s+confluence-space-key:\s*(\S+)\s+-->", string
507
- )
508
-
509
- return ConfluenceQualifiedID(page_id, space_key), string
510
-
511
-
512
- @dataclass
513
- class ConfluenceDocumentOptions:
514
- """
515
- Options that control the generated page content.
516
-
517
- :param ignore_invalid_url: When true, ignore invalid URLs in input, emit a warning and replace the anchor with
518
- plain text; when false, raise an exception.
519
- :param show_generated: Whether to display a prompt "This page has been generated with a tool."
520
- """
521
-
522
- ignore_invalid_url: bool = False
523
- generated_by: Optional[str] = "This page has been generated with a tool."
524
- root_page_id: Optional[str] = None
525
-
526
-
527
- class ConfluenceDocument:
528
- id: ConfluenceQualifiedID
529
- links: List[str]
530
- images: List[str]
531
-
532
- options: ConfluenceDocumentOptions
533
- root: ET._Element
534
-
535
- def __init__(
536
- self,
537
- path: pathlib.Path,
538
- options: ConfluenceDocumentOptions,
539
- page_metadata: Dict[pathlib.Path, ConfluencePageMetadata],
540
- ) -> None:
541
- self.options = options
542
- path = path.absolute()
543
-
544
- with open(path, "r") as f:
545
- text = f.read()
546
-
547
- # extract Confluence page ID
548
- qualified_id, text = extract_qualified_id(text)
549
- if qualified_id is None:
550
- raise ValueError("missing Confluence page ID")
551
- self.id = qualified_id
552
-
553
- # extract 'generated-by' tag text
554
- generated_by_tag, text = extract_value(
555
- r"<!--\s+generated-by:\s*(.*)\s+-->", text
556
- )
557
-
558
- # extract frontmatter
559
- frontmatter, text = extract_value(r"(?ms)\A---$(.+?)^---$", text)
560
-
561
- # convert to HTML
562
- html = markdown_to_html(text)
563
-
564
- # parse Markdown document
565
- if self.options.generated_by is not None:
566
- generated_by = self.options.generated_by
567
- if generated_by_tag is not None:
568
- generated_by = generated_by_tag
569
-
570
- content = [
571
- '<ac:structured-macro ac:name="info" ac:schema-version="1">',
572
- f"<ac:rich-text-body><p>{generated_by}</p></ac:rich-text-body>",
573
- "</ac:structured-macro>",
574
- html,
575
- ]
576
- else:
577
- content = [html]
578
- self.root = elements_from_strings(content)
579
-
580
- converter = ConfluenceStorageFormatConverter(
581
- ConfluenceConverterOptions(
582
- ignore_invalid_url=self.options.ignore_invalid_url
583
- ),
584
- path,
585
- page_metadata,
586
- )
587
- converter.visit(self.root)
588
- self.links = converter.links
589
- self.images = converter.images
590
-
591
- def xhtml(self) -> str:
592
- return _content_to_string(self.root)
593
-
594
-
595
- def attachment_name(name: str) -> str:
596
- """
597
- Safe name for use with attachment uploads.
598
-
599
- Allowed characters:
600
- * Alphanumeric characters: 0-9, a-z, A-Z
601
- * Special characters: hyphen (-), underscore (_), period (.)
602
- """
603
-
604
- return re.sub(r"[^\-0-9A-Za-z_.]", "_", name)
605
-
606
-
607
- def sanitize_confluence(html: str) -> str:
608
- "Generates a sanitized version of a Confluence storage format XHTML document with no volatile attributes."
609
-
610
- if not html:
611
- return ""
612
-
613
- root = elements_from_strings([html])
614
- ConfluenceStorageFormatCleaner().visit(root)
615
- return _content_to_string(root)
616
-
617
-
618
- def _content_to_string(root: ET._Element) -> str:
619
- xml = ET.tostring(root, encoding="utf8", method="xml").decode("utf8")
620
- m = re.match(r"^<root\s+[^>]*>(.*)</root>\s*$", xml, re.DOTALL)
621
- if m:
622
- return m.group(1)
623
- else:
624
- raise ValueError("expected: Confluence content")
1
+ # mypy: disable-error-code="dict-item"
2
+
3
+ import importlib.resources as resources
4
+ import logging
5
+ import os.path
6
+ import pathlib
7
+ import re
8
+ import sys
9
+ from dataclasses import dataclass
10
+ from typing import Dict, List, Optional, Tuple
11
+ from urllib.parse import ParseResult, urlparse, urlunparse
12
+
13
+ import lxml.etree as ET
14
+ import markdown
15
+ from lxml.builder import ElementMaker
16
+
17
+ namespaces = {
18
+ "ac": "http://atlassian.com/content",
19
+ "ri": "http://atlassian.com/resource/identifier",
20
+ }
21
+ for key, value in namespaces.items():
22
+ ET.register_namespace(key, value)
23
+
24
+
25
+ HTML = ElementMaker()
26
+ AC = ElementMaker(namespace=namespaces["ac"])
27
+ RI = ElementMaker(namespace=namespaces["ri"])
28
+
29
+ LOGGER = logging.getLogger(__name__)
30
+
31
+
32
+ class ParseError(RuntimeError):
33
+ pass
34
+
35
+
36
+ def is_absolute_url(url: str) -> bool:
37
+ urlparts = urlparse(url)
38
+ return bool(urlparts.scheme) or bool(urlparts.netloc)
39
+
40
+
41
+ def is_relative_url(url: str) -> bool:
42
+ urlparts = urlparse(url)
43
+ return not bool(urlparts.scheme) and not bool(urlparts.netloc)
44
+
45
+
46
+ def markdown_to_html(content: str) -> str:
47
+ return markdown.markdown(
48
+ content,
49
+ extensions=[
50
+ "admonition",
51
+ "markdown.extensions.tables",
52
+ "markdown.extensions.fenced_code",
53
+ "pymdownx.magiclink",
54
+ "pymdownx.tilde",
55
+ "sane_lists",
56
+ ],
57
+ )
58
+
59
+
60
+ def _elements_from_strings(dtd_path: pathlib.Path, items: List[str]) -> ET._Element:
61
+ """
62
+ Creates a fragment of several XML nodes from their string representation wrapped in a root element.
63
+
64
+ :param dtd_path: Path to a DTD document that defines entities like &cent; or &copy;.
65
+ :param items: Strings to parse into XML fragments.
66
+ :returns: An XML document as an element tree.
67
+ """
68
+
69
+ parser = ET.XMLParser(
70
+ remove_blank_text=True,
71
+ strip_cdata=False,
72
+ load_dtd=True,
73
+ )
74
+
75
+ ns_attr_list = "".join(
76
+ f' xmlns:{key}="{value}"' for key, value in namespaces.items()
77
+ )
78
+
79
+ data = [
80
+ '<?xml version="1.0"?>',
81
+ f'<!DOCTYPE ac:confluence PUBLIC "-//Atlassian//Confluence 4 Page//EN" "{dtd_path}">'
82
+ f"<root{ns_attr_list}>",
83
+ ]
84
+ data.extend(items)
85
+ data.append("</root>")
86
+
87
+ try:
88
+ return ET.fromstringlist(data, parser=parser)
89
+ except ET.XMLSyntaxError as e:
90
+ raise ParseError(e)
91
+
92
+
93
+ def elements_from_strings(items: List[str]) -> ET._Element:
94
+ "Creates a fragment of several XML nodes from their string representation wrapped in a root element."
95
+
96
+ if sys.version_info >= (3, 9):
97
+ resource_path = resources.files(__package__).joinpath("entities.dtd")
98
+ with resources.as_file(resource_path) as dtd_path:
99
+ return _elements_from_strings(dtd_path, items)
100
+ else:
101
+ with resources.path(__package__, "entities.dtd") as dtd_path:
102
+ return _elements_from_strings(dtd_path, items)
103
+
104
+
105
+ _languages = [
106
+ "abap",
107
+ "actionscript3",
108
+ "ada",
109
+ "applescript",
110
+ "arduino",
111
+ "autoit",
112
+ "bash",
113
+ "c",
114
+ "clojure",
115
+ "coffeescript",
116
+ "coldfusion",
117
+ "cpp",
118
+ "csharp",
119
+ "css",
120
+ "cuda",
121
+ "d",
122
+ "dart",
123
+ "delphi",
124
+ "diff",
125
+ "elixir",
126
+ "erlang",
127
+ "fortran",
128
+ "foxpro",
129
+ "go",
130
+ "graphql",
131
+ "groovy",
132
+ "haskell",
133
+ "haxe",
134
+ "html",
135
+ "java",
136
+ "javafx",
137
+ "javascript",
138
+ "json",
139
+ "jsx",
140
+ "julia",
141
+ "kotlin",
142
+ "livescript",
143
+ "lua",
144
+ "mathematica",
145
+ "matlab",
146
+ "objectivec",
147
+ "objectivej",
148
+ "ocaml",
149
+ "octave",
150
+ "pascal",
151
+ "perl",
152
+ "php",
153
+ "powershell",
154
+ "prolog",
155
+ "puppet",
156
+ "python",
157
+ "qml",
158
+ "r",
159
+ "racket",
160
+ "rst",
161
+ "ruby",
162
+ "rust",
163
+ "sass",
164
+ "scala",
165
+ "scheme",
166
+ "shell",
167
+ "smalltalk",
168
+ "splunk",
169
+ "sql",
170
+ "standardml",
171
+ "swift",
172
+ "tcl",
173
+ "tex",
174
+ "tsx",
175
+ "typescript",
176
+ "vala",
177
+ "vb",
178
+ "verilog",
179
+ "vhdl",
180
+ "xml",
181
+ "xquery",
182
+ "yaml",
183
+ ]
184
+
185
+
186
+ @dataclass
187
+ class ConfluencePageMetadata:
188
+ domain: str
189
+ base_path: str
190
+ page_id: str
191
+ space_key: str
192
+ title: str
193
+
194
+
195
+ class NodeVisitor:
196
+ def visit(self, node: ET._Element) -> None:
197
+ if len(node) < 1:
198
+ return
199
+
200
+ for index in range(len(node)):
201
+ source = node[index]
202
+ target = self.transform(source)
203
+ if target is not None:
204
+ node[index] = target
205
+ else:
206
+ self.visit(source)
207
+
208
+ def transform(self, child: ET._Element) -> Optional[ET._Element]:
209
+ pass
210
+
211
+
212
+ @dataclass
213
+ class ConfluenceConverterOptions:
214
+ """
215
+ Options for converting an HTML tree into Confluence storage format.
216
+
217
+ :param ignore_invalid_url: When true, ignore invalid URLs in input, emit a warning and replace the anchor with
218
+ plain text; when false, raise an exception.
219
+ """
220
+
221
+ ignore_invalid_url: bool = False
222
+
223
+
224
+ class ConfluenceStorageFormatConverter(NodeVisitor):
225
+ "Transforms a plain HTML tree into the Confluence storage format."
226
+
227
+ options: ConfluenceConverterOptions
228
+ path: pathlib.Path
229
+ base_path: pathlib.Path
230
+ links: List[str]
231
+ images: List[str]
232
+ page_metadata: Dict[pathlib.Path, ConfluencePageMetadata]
233
+
234
+ def __init__(
235
+ self,
236
+ options: ConfluenceConverterOptions,
237
+ path: pathlib.Path,
238
+ page_metadata: Dict[pathlib.Path, ConfluencePageMetadata],
239
+ ) -> None:
240
+ super().__init__()
241
+ self.options = options
242
+ self.path = path
243
+ self.base_path = path.parent
244
+ self.links = []
245
+ self.images = []
246
+ self.page_metadata = page_metadata
247
+
248
+ def _transform_link(self, anchor: ET._Element) -> None:
249
+ url = anchor.attrib["href"]
250
+ if is_absolute_url(url):
251
+ return
252
+
253
+ LOGGER.debug(f"found link {url} relative to {self.path}")
254
+ relative_url: ParseResult = urlparse(url)
255
+
256
+ if (
257
+ not relative_url.scheme
258
+ and not relative_url.netloc
259
+ and not relative_url.path
260
+ and not relative_url.params
261
+ and not relative_url.query
262
+ ):
263
+ LOGGER.debug(f"found local URL: {url}")
264
+ anchor.attrib["href"] = url
265
+ return
266
+
267
+ # convert the relative URL to absolute URL based on the base path value, then look up
268
+ # the absolute path in the page metadata dictionary to discover the relative path
269
+ # within Confluence that should be used
270
+ absolute_path = (self.base_path / relative_url.path).absolute()
271
+ if not str(absolute_path).startswith(str(self.base_path)):
272
+ msg = f"relative URL {url} points to outside base path: {self.base_path}"
273
+ if self.options.ignore_invalid_url:
274
+ LOGGER.warning(msg)
275
+ anchor.attrib.pop("href")
276
+ return
277
+ else:
278
+ raise DocumentError(msg)
279
+
280
+ relative_path = os.path.relpath(absolute_path, self.base_path)
281
+
282
+ link_metadata = self.page_metadata.get(absolute_path)
283
+ if link_metadata is None:
284
+ msg = f"unable to find matching page for URL: {url}"
285
+ if self.options.ignore_invalid_url:
286
+ LOGGER.warning(msg)
287
+ anchor.attrib.pop("href")
288
+ return
289
+ else:
290
+ raise DocumentError(msg)
291
+
292
+ LOGGER.debug(
293
+ f"found link to page {relative_path} with metadata: {link_metadata}"
294
+ )
295
+ self.links.append(url)
296
+
297
+ components = ParseResult(
298
+ scheme="https",
299
+ netloc=link_metadata.domain,
300
+ path=f"{link_metadata.base_path}spaces/{link_metadata.space_key}/pages/{link_metadata.page_id}/{link_metadata.title}",
301
+ params="",
302
+ query="",
303
+ fragment=relative_url.fragment,
304
+ )
305
+ transformed_url = urlunparse(components)
306
+
307
+ LOGGER.debug(f"transformed relative URL: {url} to URL: {transformed_url}")
308
+ anchor.attrib["href"] = transformed_url
309
+
310
+ def _transform_image(self, image: ET._Element) -> ET._Element:
311
+ path: str = image.attrib["src"]
312
+
313
+ # prefer PNG over SVG; Confluence displays SVG in wrong size, and text labels are truncated
314
+ if path and is_relative_url(path):
315
+ relative_path = pathlib.Path(path)
316
+ if (
317
+ relative_path.suffix == ".svg"
318
+ and (self.base_path / relative_path.with_suffix(".png")).exists()
319
+ ):
320
+ path = str(relative_path.with_suffix(".png"))
321
+
322
+ self.images.append(path)
323
+ caption = image.attrib["alt"]
324
+ return AC(
325
+ "image",
326
+ {
327
+ ET.QName(namespaces["ac"], "align"): "center",
328
+ ET.QName(namespaces["ac"], "layout"): "center",
329
+ },
330
+ RI(
331
+ "attachment",
332
+ {ET.QName(namespaces["ri"], "filename"): attachment_name(path)},
333
+ ),
334
+ AC("caption", HTML.p(caption)),
335
+ )
336
+
337
+ def _transform_block(self, code: ET._Element) -> ET._Element:
338
+ language = code.attrib.get("class")
339
+ if language:
340
+ m = re.match("^language-(.*)$", language)
341
+ if m:
342
+ language = m.group(1)
343
+ else:
344
+ language = "none"
345
+ if language not in _languages:
346
+ language = "none"
347
+ content: str = code.text or ""
348
+ content = content.rstrip()
349
+ return AC(
350
+ "structured-macro",
351
+ {
352
+ ET.QName(namespaces["ac"], "name"): "code",
353
+ ET.QName(namespaces["ac"], "schema-version"): "1",
354
+ },
355
+ AC("parameter", {ET.QName(namespaces["ac"], "name"): "theme"}, "Midnight"),
356
+ AC("parameter", {ET.QName(namespaces["ac"], "name"): "language"}, language),
357
+ AC(
358
+ "parameter", {ET.QName(namespaces["ac"], "name"): "linenumbers"}, "true"
359
+ ),
360
+ AC("plain-text-body", ET.CDATA(content)),
361
+ )
362
+
363
+ def _transform_toc(self, code: ET._Element) -> ET._Element:
364
+ return AC(
365
+ "structured-macro",
366
+ {
367
+ ET.QName(namespaces["ac"], "name"): "toc",
368
+ ET.QName(namespaces["ac"], "schema-version"): "1",
369
+ },
370
+ AC("parameter", {ET.QName(namespaces["ac"], "name"): "outline"}, "clear"),
371
+ AC("parameter", {ET.QName(namespaces["ac"], "name"): "style"}, "default"),
372
+ )
373
+
374
+ def _transform_admonition(self, elem: ET._Element) -> ET._Element:
375
+ """
376
+ Creates an info, tip, note or warning panel.
377
+
378
+ Transforms [Python-Markdown admonition](https://python-markdown.github.io/extensions/admonition/) syntax
379
+ into Confluence structured macro syntax.
380
+ """
381
+
382
+ # <div class="admonition note">
383
+ class_list = elem.attrib.get("class", "").split(" ")
384
+ class_name: Optional[str] = None
385
+ if "info" in class_list:
386
+ class_name = "info"
387
+ elif "tip" in class_list:
388
+ class_name = "tip"
389
+ elif "note" in class_list:
390
+ class_name = "note"
391
+ elif "warning" in class_list:
392
+ class_name = "warning"
393
+
394
+ if class_name is None:
395
+ raise DocumentError(f"unsupported admonition label: {class_list}")
396
+
397
+ for e in elem:
398
+ self.visit(e)
399
+
400
+ # <p class="admonition-title">Note</p>
401
+ if "admonition-title" in elem[0].attrib.get("class", "").split(" "):
402
+ content = [
403
+ AC(
404
+ "parameter",
405
+ {ET.QName(namespaces["ac"], "name"): "title"},
406
+ elem[0].text or "",
407
+ ),
408
+ AC("rich-text-body", {}, *list(elem[1:])),
409
+ ]
410
+ else:
411
+ content = [AC("rich-text-body", {}, *list(elem))]
412
+
413
+ return AC(
414
+ "structured-macro",
415
+ {
416
+ ET.QName(namespaces["ac"], "name"): class_name,
417
+ ET.QName(namespaces["ac"], "schema-version"): "1",
418
+ },
419
+ *content,
420
+ )
421
+
422
+ def transform(self, child: ET._Element) -> Optional[ET._Element]:
423
+ # normalize line breaks to regular space in element text
424
+ if child.text:
425
+ text: str = child.text
426
+ child.text = text.replace("\n", " ")
427
+ if child.tail:
428
+ tail: str = child.tail
429
+ child.tail = tail.replace("\n", " ")
430
+
431
+ # <p><img src="..." /></p>
432
+ if child.tag == "p" and len(child) == 1 and child[0].tag == "img":
433
+ return self._transform_image(child[0])
434
+
435
+ # <p>[[_TOC_]]</p>
436
+ # <p>[TOC]</p>
437
+ elif child.tag == "p" and "".join(child.itertext()) in ["[[TOC]]", "[TOC]"]:
438
+ return self._transform_toc(child)
439
+
440
+ # <div class="admonition note">
441
+ # <p class="admonition-title">Note</p>
442
+ # <p>...</p>
443
+ # </div>
444
+ #
445
+ # --- OR ---
446
+ #
447
+ # <div class="admonition note">
448
+ # <p>...</p>
449
+ # </div>
450
+ elif child.tag == "div" and "admonition" in child.attrib.get("class", ""):
451
+ return self._transform_admonition(child)
452
+
453
+ # <img src="..." alt="..." />
454
+ elif child.tag == "img":
455
+ return self._transform_image(child)
456
+
457
+ # <a href="..."> ... </a>
458
+ elif child.tag == "a":
459
+ self._transform_link(child)
460
+ return None
461
+
462
+ # <pre><code class="language-java"> ... </code></pre>
463
+ elif child.tag == "pre" and len(child) == 1 and child[0].tag == "code":
464
+ return self._transform_block(child[0])
465
+
466
+ return None
467
+
468
+
469
class ConfluenceStorageFormatCleaner(NodeVisitor):
    "Strips volatile attributes from the elements of a Confluence storage format XHTML document."

    def transform(self, child: ET._Element) -> Optional[ET._Element]:
        """
        Drops the `ac:macro-id` and `ri:version-at-save` attributes from an element, if present.

        :param child: Element to clean; modified in place.
        :returns: Always `None`.
        """

        # (namespace prefix, local name) of each attribute to discard
        volatile_attributes = (
            ("ac", "macro-id"),
            ("ri", "version-at-save"),
        )
        for prefix, local_name in volatile_attributes:
            # a default is supplied so that a missing attribute is not an error
            child.attrib.pop(ET.QName(namespaces[prefix], local_name), None)
        return None
476
+
477
+
478
class DocumentError(RuntimeError):
    "Exception type for errors related to a document."
480
+
481
+
482
def extract_value(pattern: str, string: str) -> Tuple[Optional[str], str]:
    """
    Cuts the first match of a regular expression out of a string and returns what its first group captured.

    :param pattern: Regular expression with at least one capture group; compiled with `re.ASCII`.
    :param string: Text to search.
    :returns: A pair of the text captured by group 1 of the first match (`None` if the pattern does not match)
        and the input with the entire matched span removed. The input is returned unchanged if nothing matches.
    """

    # a single search-and-splice avoids `re.sub` with positional `count` and `flags` arguments, a calling
    # convention that is deprecated as of Python 3.13
    match = re.search(pattern, string, flags=re.ASCII)
    if match is None:
        return None, string

    remainder = string[: match.start()] + string[match.end() :]
    return match.group(1), remainder
492
+
493
+
494
@dataclass
class ConfluenceQualifiedID:
    """
    Identifies a Confluence page, optionally together with the key of a space.
    """

    # page identifier, kept as a string
    page_id: str

    # space key; `None` when no space key is given
    space_key: Optional[str] = None
498
+
499
+
500
def extract_qualified_id(string: str) -> Tuple[Optional[ConfluenceQualifiedID], str]:
    """
    Reads the Confluence page ID, and the space key if there is one, from HTML comments embedded in a text.

    :param string: Text to scan.
    :returns: A pair of the qualified ID (`None` if no page ID comment is found) and the text with the
        comments that were recognized removed.
    """

    page_id_pattern = r"<!--\s+confluence-page-id:\s*(\d+)\s+-->"
    space_key_pattern = r"<!--\s+confluence-space-key:\s*(\S+)\s+-->"

    page_id, remainder = extract_value(page_id_pattern, string)
    if page_id is not None:
        # the space key is looked for only once a page ID has been found
        space_key, remainder = extract_value(space_key_pattern, remainder)
        return ConfluenceQualifiedID(page_id, space_key), remainder

    return None, remainder
512
+
513
+
514
@dataclass
class ConfluenceDocumentOptions:
    """
    Options that control the generated page content.

    :param ignore_invalid_url: When true, ignore invalid URLs in input, emit a warning and replace the anchor with
        plain text; when false, raise an exception.
    :param generated_by: Text of the notice placed in an info panel ahead of the page content; `None` omits the
        panel altogether.
    :param root_page_id: Not read by the document conversion itself; presumably the ID of the Confluence page that
        acts as the root of the synchronized pages (TODO: confirm against callers).
    """

    ignore_invalid_url: bool = False
    generated_by: Optional[str] = "This page has been generated with a tool."
    root_page_id: Optional[str] = None
527
+
528
+
529
class ConfluenceDocument:
    """
    A Markdown file converted into a Confluence storage format element tree.
    """

    # page (and optional space) identifier read from the Markdown source
    id: ConfluenceQualifiedID
    # links collected by the converter while visiting the tree
    links: List[str]
    # images collected by the converter while visiting the tree
    images: List[str]

    options: ConfluenceDocumentOptions
    # element tree holding the converted content
    root: ET._Element

    def __init__(
        self,
        path: pathlib.Path,
        options: ConfluenceDocumentOptions,
        page_metadata: Dict[pathlib.Path, ConfluencePageMetadata],
    ) -> None:
        """
        Reads a Markdown file and converts it into Confluence storage format.

        :param path: Location of the Markdown file; made absolute before use.
        :param options: Options that control the generated page content.
        :param page_metadata: Page metadata keyed by file path, passed on to the storage format converter.
        :raises ValueError: The file has no Confluence page ID comment.
        """

        self.options = options
        path = path.absolute()

        with open(path, "r", encoding="utf-8") as f:
            text = f.read()

        # extract Confluence page ID
        qualified_id, text = extract_qualified_id(text)
        if qualified_id is None:
            raise ValueError("missing Confluence page ID")
        self.id = qualified_id

        # extract 'generated-by' tag text; the lazy quantifier stops at the first comment terminator, such that
        # neither trailing blanks nor a subsequent comment on the same line end up in the captured text
        generated_by_tag, text = extract_value(
            r"<!--\s+generated-by:\s*(.*?)\s+-->", text
        )

        # strip frontmatter; its content is not used
        _, text = extract_value(r"(?ms)\A---$(.+?)^---$", text)

        # convert to HTML
        html = markdown_to_html(text)

        # assemble page content, with an info panel in front unless the option disables it
        if self.options.generated_by is not None:
            # text supplied in the document itself takes precedence over the configured text
            if generated_by_tag is not None:
                generated_by = generated_by_tag
            else:
                generated_by = self.options.generated_by

            content = [
                '<ac:structured-macro ac:name="info" ac:schema-version="1">',
                f"<ac:rich-text-body><p>{generated_by}</p></ac:rich-text-body>",
                "</ac:structured-macro>",
                html,
            ]
        else:
            content = [html]
        self.root = elements_from_strings(content)

        # rewrite HTML elements into their Confluence storage format counterparts
        converter = ConfluenceStorageFormatConverter(
            ConfluenceConverterOptions(
                ignore_invalid_url=self.options.ignore_invalid_url
            ),
            path,
            page_metadata,
        )
        converter.visit(self.root)
        self.links = converter.links
        self.images = converter.images

    def xhtml(self) -> str:
        "Returns the converted content as a string."

        return _content_to_string(self.root)
595
+
596
+
597
def attachment_name(name: str) -> str:
    """
    Derives a name that is safe to use with attachment uploads.

    Characters that are kept:
    * Alphanumeric characters: 0-9, a-z, A-Z
    * Special characters: hyphen (-), underscore (_), period (.)

    Any other character is replaced by an underscore.
    """

    def _is_allowed(ch: str) -> bool:
        # ASCII letters and digits, plus the three permitted punctuation characters
        return (ch.isascii() and ch.isalnum()) or ch in ("-", "_", ".")

    return "".join(ch if _is_allowed(ch) else "_" for ch in name)
607
+
608
+
609
def sanitize_confluence(html: str) -> str:
    "Produces a copy of a Confluence storage format XHTML document with volatile attributes removed."

    if html:
        tree = elements_from_strings([html])
        ConfluenceStorageFormatCleaner().visit(tree)
        return _content_to_string(tree)

    # nothing to parse for empty input
    return ""
618
+
619
+
620
def _content_to_string(root: ET._Element) -> str:
    """
    Serializes an element tree and returns what lies between the opening and closing `root` tags.

    :param root: Wrapper element whose inner content is to be returned.
    :returns: The serialized inner content, without the enclosing `root` tags.
    :raises ValueError: The serialized document is not enclosed in a `root` element that carries attributes.
    """

    serialized = ET.tostring(root, encoding="utf8", method="xml").decode("utf8")

    # DOTALL lets the captured content span multiple lines
    match = re.match(r"^<root\s+[^>]*>(.*)</root>\s*$", serialized, re.DOTALL)
    if match is None:
        raise ValueError("expected: Confluence content")
    return match.group(1)