markdown-to-confluence 0.5.4__py3-none-any.whl → 0.5.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
md2conf/converter.py CHANGED
@@ -7,13 +7,13 @@ Copyright 2022-2026, Levente Hunyadi
7
7
  """
8
8
 
9
9
  import copy
10
- import hashlib
11
10
  import logging
12
11
  import os.path
13
12
  import re
14
13
  import uuid
15
14
  from abc import ABC, abstractmethod
16
15
  from dataclasses import dataclass
16
+ from enum import Enum
17
17
  from pathlib import Path
18
18
  from typing import ClassVar
19
19
  from urllib.parse import ParseResult, quote_plus, urlparse
@@ -32,17 +32,17 @@ from .extension import ExtensionOptions, MarketplaceExtension
32
32
  from .formatting import FormattingContext, ImageAlignment, ImageAttributes
33
33
  from .image import ImageGenerator, ImageGeneratorOptions
34
34
  from .latex import render_latex
35
- from .markdown import markdown_to_html
35
+ from .markdown import markdown_to_html, markdown_with_line_numbers
36
36
  from .mermaid.extension import MermaidExtension
37
37
  from .metadata import ConfluenceSiteMetadata
38
38
  from .options import ConfluencePageID, ConverterOptions, DocumentOptions
39
39
  from .plantuml.extension import PlantUMLExtension
40
- from .png import extract_png_dimensions, remove_png_chunks
40
+ from .png import remove_png_chunks
41
41
  from .scanner import ScannedDocument, Scanner
42
42
  from .serializer import JsonType
43
43
  from .toc import TableOfContentsBuilder
44
44
  from .uri import is_absolute_url, to_uuid_urn
45
- from .xml import element_to_text
45
+ from .xml import element_to_text, remove_element
46
46
 
47
47
  ElementType = ET._Element # pyright: ignore [reportPrivateUsage]
48
48
 
@@ -114,14 +114,18 @@ def fix_absolute_path(path: Path, root_path: Path) -> Path:
114
114
  return root_path / path.relative_to(path.root)
115
115
 
116
116
 
117
# Runs of characters that may not be kept verbatim in a Confluence page URL (whitespace is part of this set).
_UNSAFE_CHAR_REGEXP = re.compile(r"[^A-Za-z0-9._~()'!*:@,;+?-]+")

# Two or more consecutive whitespace characters. `encode_title` does not need this pattern (see the comment
# in its body); the definition is retained in case other code in the module refers to it.
_MULTIPLE_SPACE_REGEXP = re.compile(r"\s\s+")


def encode_title(text: str) -> str:
    """
    Converts a title string such that it is safe to embed into a Confluence URL.

    :param text: Title as plain text.
    :returns: URL-encoded title; leading and trailing unsafe characters are dropped, and every other run of unsafe characters becomes a single `+`.
    """

    # Replace each maximal run of unsafe characters with one space. Whitespace is unsafe as well, so a run of
    # mixed whitespace and punctuation collapses into a single space, and two spaces can never end up next to
    # each other: a separate pass that squeezes consecutive spaces would never find anything to replace.
    text = _UNSAFE_CHAR_REGEXP.sub(" ", text)

    # URL-encode
    return quote_plus(text.strip())
@@ -215,6 +219,13 @@ _LANGUAGES = {
215
219
  # spellchecker: enable
216
220
 
217
221
 
222
class ElementAction(Enum):
    """
    Captures standard actions a node visitor may take with the element.

    `NodeVisitor.transform` returns one of these values when it does not return a replacement element.
    """

    # keep the element in the tree and visit its descendants
    RECURSE = "recurse"

    # remove the element from the tree
    REMOVE = "remove"
227
+
228
+
218
229
  class NodeVisitor(ABC):
219
230
  def visit(self, node: ElementType) -> None:
220
231
  "Recursively visits all descendants of this node."
@@ -222,29 +233,44 @@ class NodeVisitor(ABC):
222
233
  if len(node) < 1:
223
234
  return
224
235
 
225
- for index in range(len(node)):
236
+ index = 0
237
+ count = len(node)
238
+ while index < count:
226
239
  source = node[index]
227
240
  target = self.transform(source)
228
- if target is not None:
241
+ if isinstance(target, ElementAction):
242
+ match target:
243
+ case ElementAction.RECURSE:
244
+ # recurse into the element
245
+ self.visit(source)
246
+ index += 1
247
+ case ElementAction.REMOVE:
248
+ # remove the element from the tree
249
+ remove_element(source)
250
+ count -= 1
251
+ else:
229
252
  # chain sibling text node that immediately follows original element
230
253
  target.tail = source.tail
231
254
  source.tail = None
232
255
 
233
256
  # replace original element with transformed element
234
257
  node[index] = target
235
- else:
236
- self.visit(source)
258
+ index += 1
237
259
 
238
260
  @abstractmethod
239
- def transform(self, child: ElementType) -> ElementType | None: ...
261
+ def transform(self, child: ElementType) -> ElementType | ElementAction: ...
262
+
263
+
264
# Any single character other than whitespace, an ASCII letter or digit, underscore or hyphen.
_DISALLOWED_CHAR_REGEXP = re.compile(r"[^\sA-Za-z0-9_\-]")

# One or more consecutive whitespace characters.
_SPACE_COLLAPSE_REGEXP = re.compile(r"\s+")


def title_to_identifier(title: str) -> str:
    """
    Derives a GitHub-style Markdown same-page anchor from a section heading title.

    :param title: Heading text.
    :returns: Lowercase identifier in which disallowed characters are dropped and each run of whitespace is a single hyphen.
    """

    normalized = title.strip().lower()

    # drop every character that is not permitted in an anchor
    permitted_only = _DISALLOWED_CHAR_REGEXP.sub("", normalized)

    # join the remaining words with hyphens
    return _SPACE_COLLAPSE_REGEXP.sub("-", permitted_only)
249
275
 
250
276
 
@@ -256,6 +282,12 @@ def element_text_starts_with_any(node: ElementType, prefixes: list[str]) -> bool
256
282
  return starts_with_any(node.text, prefixes)
257
283
 
258
284
 
285
def child_count(node: ElementType) -> int:
    """
    Counts the direct children of an element, leaving out special `line-number` elements.

    :param node: Element whose children are counted.
    :returns: Number of direct children that are not `line-number` elements.
    """

    special_children = list(node.iterchildren("line-number"))
    return len(node) - len(special_children)
289
+
290
+
259
291
  def is_placeholder_for(node: ElementType, name: str) -> bool:
260
292
  """
261
293
  Identifies a Confluence widget placeholder, e.g. `[[_TOC_]]` or `[[_LISTING_]]`.
@@ -265,7 +297,7 @@ def is_placeholder_for(node: ElementType, name: str) -> bool:
265
297
  """
266
298
 
267
299
  # `[[_TOC_]]` is represented in HTML as <p>[[<em>TOC</em>]]</p>
268
- if node.text != "[[" or len(node) != 1:
300
+ if node.text != "[[" or child_count(node) != 1:
269
301
  return False
270
302
 
271
303
  child = node[0]
@@ -275,6 +307,65 @@ def is_placeholder_for(node: ElementType, name: str) -> bool:
275
307
  return True
276
308
 
277
309
 
310
class PreprocessingError(RuntimeError):
    """
    Raised when a preprocessing step has failed.

    For example, `transform_skip_comments_in_html` raises this error when the number of skip start markers differs from the number of skip end markers.
    """
312
+
313
+
314
class DocumentError(RuntimeError):
    """
    Raised when a converted Markdown document has an unexpected element or attribute.

    :param element: Element the error is reported for.
    :param message: Description of the problem; becomes the exception message.
    """

    # element passed to the constructor, kept on the exception instance
    element: ElementType

    def __init__(self, element: ElementType, message: str) -> None:
        super().__init__(message)
        self.element = element
322
+
323
+
324
class ConversionError(RuntimeError):
    """
    Raised when a Markdown document cannot be converted to Confluence Storage Format.
    """
326
+
327
+
328
# Matches either skip marker; group 1 is "start" for an opening marker and "end" for a closing marker.
_SKIP_MARKER_REGEXP = re.compile(r"<!--\s*confluence-skip-(start|end)\s*-->")


def transform_skip_comments_in_html(html: str) -> str:
    """
    Transforms HTML comments marking skip sections into custom elements.

    From:
    ```
    <!-- confluence-skip-start --> ... <!-- confluence-skip-end -->
    ```
    Into:
    ```
    <confluence-skip> ... </confluence-skip>
    ```

    This must run BEFORE the HTML (generated from Markdown) is parsed, as the XML parser strips comments (remove_comments=True).

    :param html: HTML string with skip comment markers.
    :returns: HTML string with comments replaced by custom elements.
    :raises PreprocessingError: The number of start and end markers differs, or the markers do not strictly
        alternate (an end marker comes before its start marker, or skip sections are nested).
    """

    # marker kinds in document order
    kinds = [match.group(1) for match in _SKIP_MARKER_REGEXP.finditer(html)]
    start_count = kinds.count("start")
    end_count = len(kinds) - start_count

    if start_count != end_count:
        raise PreprocessingError(f"unmatched confluence-skip markers: found {start_count} start marker(s) and {end_count} end marker(s)")

    if start_count < 1:
        return html

    # Equal counts alone do not guarantee proper pairing: `end ... start` or `start start end end` would
    # otherwise leave markers behind as plain comments, and the content would silently not be skipped as intended.
    if any(kind != ("start" if index % 2 == 0 else "end") for index, kind in enumerate(kinds)):
        raise PreprocessingError("misplaced confluence-skip markers: each start marker must be closed by an end marker before the next start marker")

    # markers strictly alternate, hence each start marker opens and each end marker closes one custom element
    return _SKIP_MARKER_REGEXP.sub(lambda marker: "<confluence-skip>" if marker.group(1) == "start" else "</confluence-skip>", html)
363
+
364
+
365
+ _FOOTNOTE_REF_REGEXP = re.compile(r"^fnref(\d*):(.+)$")
366
+ _TASKLIST_REGEXP = re.compile(r"^\[([x X])\]")
367
+
368
+
278
369
  @dataclass
279
370
  class ConfluencePanel:
280
371
  emoji: str
@@ -402,14 +493,14 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
402
493
  def _anchor_warn_or_raise(self, anchor: ElementType, msg: str) -> None:
403
494
  "Emit a warning or raise an exception when a path points to a resource that doesn't exist or is outside of the permitted hierarchy."
404
495
 
405
- if self.options.ignore_invalid_url:
496
+ if self.options.force_valid_url:
497
+ raise DocumentError(anchor, msg)
498
+ else:
406
499
  LOGGER.warning(msg)
407
500
  if anchor.text:
408
501
  anchor.text = "❌ " + anchor.text
409
502
  elif len(anchor) > 0:
410
503
  anchor.text = "❌ "
411
- else:
412
- raise DocumentError(msg)
413
504
 
414
505
  def _transform_link(self, anchor: ElementType) -> ElementType | None:
415
506
  """
@@ -486,7 +577,7 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
486
577
  space_key = link_metadata.space_key or self.site_metadata.space_key
487
578
 
488
579
  if space_key is None:
489
- raise DocumentError("Confluence space key required for building full web URLs")
580
+ raise DocumentError(anchor, "Confluence space key required for building full web URLs")
490
581
 
491
582
  page_url = f"{self.site_metadata.base_path}spaces/{space_key}/pages/{link_metadata.page_id}/{encode_title(link_metadata.title)}"
492
583
 
@@ -563,7 +654,7 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
563
654
 
564
655
  src = image.get("src")
565
656
  if not src:
566
- raise DocumentError("image lacks `src` attribute")
657
+ raise DocumentError(image, "image lacks `src` attribute")
567
658
 
568
659
  alt = image.get("alt")
569
660
  if alt is not None and src.startswith("urn:uuid:") and (color := status_images.get(src)) is not None:
@@ -589,7 +680,7 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
589
680
  else:
590
681
  path = Path(src)
591
682
 
592
- absolute_path = self._verify_image_path(path)
683
+ absolute_path = self._verify_image_path(image, path)
593
684
  if absolute_path is None:
594
685
  return self._create_missing(path, attrs)
595
686
 
@@ -615,15 +706,15 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
615
706
 
616
707
  return AC_ELEM("image", attrs.as_dict(max_width=self.options.layout.image.max_width), *elements)
617
708
 
618
- def _warn_or_raise(self, msg: str) -> None:
709
+ def _warn_or_raise(self, image: ElementType, msg: str) -> None:
619
710
  "Emit a warning or raise an exception when a path points to a resource that doesn't exist or is outside of the permitted hierarchy."
620
711
 
621
- if self.options.ignore_invalid_url:
622
- LOGGER.warning(msg)
712
+ if self.options.force_valid_url:
713
+ raise DocumentError(image, msg)
623
714
  else:
624
- raise DocumentError(msg)
715
+ LOGGER.warning(msg)
625
716
 
626
- def _verify_image_path(self, path: Path) -> Path | None:
717
+ def _verify_image_path(self, image: ElementType, path: Path) -> Path | None:
627
718
  "Checks whether an image path is safe to use."
628
719
 
629
720
  if path.is_absolute():
@@ -633,11 +724,11 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
633
724
  absolute_path = (self.base_dir / path).resolve()
634
725
 
635
726
  if not absolute_path.exists():
636
- self._warn_or_raise(f"path to image does not exist: {path}")
727
+ self._warn_or_raise(image, f"path to image does not exist: {path}")
637
728
  return None
638
729
 
639
730
  if not is_directory_within(absolute_path, self.root_dir):
640
- self._warn_or_raise(f"path to image {path} points to outside root path {self.root_dir}")
731
+ self._warn_or_raise(image, f"path to image {path} points to outside root path {self.root_dir}")
641
732
  return None
642
733
 
643
734
  return absolute_path
@@ -744,15 +835,15 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
744
835
  """
745
836
 
746
837
  if len(elem) < 1:
747
- raise DocumentError("empty admonition")
838
+ raise DocumentError(elem, "empty admonition")
748
839
 
749
840
  # <div class="admonition note">
750
841
  class_list = elem.get("class", "").split(" ")
751
842
  class_list.remove("admonition")
752
843
  if len(class_list) > 1:
753
- raise DocumentError(f"too many admonition types: {class_list}")
844
+ raise DocumentError(elem, f"too many admonition types: {class_list}")
754
845
  elif len(class_list) < 1:
755
- raise DocumentError("missing specific admonition type")
846
+ raise DocumentError(elem, "missing specific admonition type")
756
847
  admonition = class_list[0]
757
848
 
758
849
  for e in elem:
@@ -761,11 +852,11 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
761
852
  # <p class="admonition-title">Note</p>
762
853
  if "admonition-title" in elem[0].get("class", "").split(" "):
763
854
  content = [HTML.p(HTML.strong(elem[0].text or "")), *list(elem[1:])]
764
- else:
765
- content = list(elem)
855
+ elem.clear(keep_tail=True)
856
+ elem.extend(content)
766
857
 
767
858
  if self.options.use_panel:
768
- return self._transform_panel(content, admonition)
859
+ return self._transform_panel(elem, admonition)
769
860
  else:
770
861
  admonition_to_csf = {
771
862
  "attention": "note",
@@ -781,7 +872,7 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
781
872
  }
782
873
  class_name = admonition_to_csf.get(admonition)
783
874
  if class_name is None:
784
- raise DocumentError(f"unsupported admonition type: {admonition}")
875
+ raise DocumentError(elem, f"unsupported admonition type: {admonition}")
785
876
 
786
877
  return AC_ELEM(
787
878
  "structured-macro",
@@ -789,7 +880,7 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
789
880
  AC_ATTR("name"): class_name,
790
881
  AC_ATTR("schema-version"): "1",
791
882
  },
792
- AC_ELEM("rich-text-body", {}, *content),
883
+ AC_ELEM("rich-text-body", {}, *list(elem)),
793
884
  )
794
885
 
795
886
  def _transform_github_alert(self, blockquote: ElementType) -> ElementType:
@@ -797,32 +888,32 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
797
888
  Creates a GitHub-style panel, normally triggered with a block-quote starting with a capitalized string such as `[!TIP]`.
798
889
  """
799
890
 
891
+ for e in blockquote:
892
+ self.visit(e)
893
+
800
894
  if len(blockquote) < 1:
801
- raise DocumentError("empty GitHub alert")
895
+ raise DocumentError(blockquote, "empty GitHub alert")
802
896
 
803
897
  content = blockquote[0]
804
898
  if content.text is None:
805
- raise DocumentError("empty content for GitHub alert")
899
+ raise DocumentError(blockquote, "empty content for GitHub alert")
806
900
 
807
901
  pattern = re.compile(r"^\[!([A-Z]+)\]\s*")
808
902
  match = pattern.match(content.text)
809
903
  if not match:
810
- raise DocumentError("not a GitHub alert")
904
+ raise DocumentError(blockquote, "not a GitHub alert")
905
+ alert = match.group(1)
811
906
 
812
907
  # remove alert indicator prefix
813
908
  content.text = content.text[len(match.group(0)) :]
814
909
 
815
- for e in blockquote:
816
- self.visit(e)
817
-
818
- alert = match.group(1)
819
910
  if self.options.use_panel:
820
- return self._transform_panel(list(blockquote), alert.lower())
911
+ return self._transform_panel(blockquote, alert.lower())
821
912
  else:
822
913
  alert_to_csf = {"NOTE": "info", "TIP": "tip", "IMPORTANT": "note", "WARNING": "note", "CAUTION": "warning"}
823
914
  class_name = alert_to_csf.get(alert)
824
915
  if class_name is None:
825
- raise DocumentError(f"unsupported GitHub alert: {alert}")
916
+ raise DocumentError(blockquote, f"unsupported GitHub alert: {alert}")
826
917
 
827
918
  return self._transform_alert(blockquote, class_name)
828
919
 
@@ -834,32 +925,32 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
834
925
  This syntax does not use Hugo shortcode.
835
926
  """
836
927
 
928
+ for e in blockquote:
929
+ self.visit(e)
930
+
837
931
  if len(blockquote) < 1:
838
- raise DocumentError("empty GitLab alert")
932
+ raise DocumentError(blockquote, "empty GitLab alert")
839
933
 
840
934
  content = blockquote[0]
841
935
  if content.text is None:
842
- raise DocumentError("empty content for GitLab alert")
936
+ raise DocumentError(blockquote, "empty content for GitLab alert")
843
937
 
844
938
  pattern = re.compile(r"^(FLAG|NOTE|WARNING|DISCLAIMER):\s*")
845
939
  match = pattern.match(content.text)
846
940
  if not match:
847
- raise DocumentError("not a GitLab alert")
941
+ raise DocumentError(blockquote, "not a GitLab alert")
942
+ alert = match.group(1)
848
943
 
849
944
  # remove alert indicator prefix
850
945
  content.text = content.text[len(match.group(0)) :]
851
946
 
852
- for e in blockquote:
853
- self.visit(e)
854
-
855
- alert = match.group(1)
856
947
  if self.options.use_panel:
857
- return self._transform_panel(list(blockquote), alert.lower())
948
+ return self._transform_panel(blockquote, alert.lower())
858
949
  else:
859
950
  alert_to_csf = {"FLAG": "note", "NOTE": "info", "WARNING": "note", "DISCLAIMER": "info"}
860
951
  class_name = alert_to_csf.get(alert)
861
952
  if class_name is None:
862
- raise DocumentError(f"unsupported GitLab alert: {alert}")
953
+ raise DocumentError(blockquote, f"unsupported GitLab alert: {alert}")
863
954
 
864
955
  return self._transform_alert(blockquote, class_name)
865
956
 
@@ -903,12 +994,12 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
903
994
  AC_ELEM("rich-text-body", {}, *list(blockquote)),
904
995
  )
905
996
 
906
- def _transform_panel(self, content: list[ElementType], class_name: str) -> ElementType:
997
+ def _transform_panel(self, elem: ElementType, class_name: str) -> ElementType:
907
998
  "Transforms a blockquote into a themed panel."
908
999
 
909
1000
  panel = ConfluencePanel.from_class.get(class_name)
910
1001
  if panel is None:
911
- raise DocumentError(f"unsupported panel class: {class_name}")
1002
+ raise DocumentError(elem, f"unsupported panel class: {class_name}")
912
1003
 
913
1004
  macro_id = str(uuid.uuid4())
914
1005
  return AC_ELEM(
@@ -922,7 +1013,7 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
922
1013
  AC_ELEM("parameter", {AC_ATTR("name"): "panelIconId"}, panel.emoji_unicode),
923
1014
  AC_ELEM("parameter", {AC_ATTR("name"): "panelIconText"}, panel.emoji),
924
1015
  AC_ELEM("parameter", {AC_ATTR("name"): "bgColor"}, panel.background_color),
925
- AC_ELEM("rich-text-body", {}, *content),
1016
+ AC_ELEM("rich-text-body", {}, *list(elem)),
926
1017
  )
927
1018
 
928
1019
  def _transform_collapsed(self, details: ElementType) -> ElementType:
@@ -936,7 +1027,7 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
936
1027
 
937
1028
  summary = details[0]
938
1029
  if summary.tag != "summary":
939
- raise DocumentError("expected: `<summary>` as first direct child of `<details>`")
1030
+ raise DocumentError(details, "expected: `<summary>` as first direct child of `<details>`")
940
1031
  if details.text is not None or summary.tail is not None:
941
1032
  # when `<details>` has attribute `markdown=1`, content is parsed as Markdown:
942
1033
  # ```
@@ -952,7 +1043,7 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
952
1043
  # <summary>...</summary>
953
1044
  # Text with *emphasis*.
954
1045
  # </details>
955
- raise DocumentError('expected: attribute `markdown="1"` on `<details>`')
1046
+ raise DocumentError(details, 'expected: attribute `markdown="1"` on `<details>`')
956
1047
 
957
1048
  summary_text = element_to_text(summary)
958
1049
  details.remove(summary)
@@ -1020,29 +1111,22 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
1020
1111
 
1021
1112
  content = elem.text
1022
1113
  if not content:
1023
- raise DocumentError("empty LaTeX formula")
1114
+ raise DocumentError(elem, "empty LaTeX formula")
1024
1115
 
1025
1116
  image_data = render_latex(content, format=self.options.diagram_output_format)
1026
1117
  if self.options.diagram_output_format == "png":
1027
- width, height = extract_png_dimensions(data=image_data)
1028
1118
  image_data = remove_png_chunks(["pHYs"], source_data=image_data)
1029
- attrs = ImageAttributes(
1030
- context,
1031
- width=width,
1032
- height=height,
1033
- alt=content,
1034
- title=None,
1035
- caption="",
1036
- alignment=ImageAlignment(self.options.layout.get_image_alignment()),
1037
- )
1038
- else:
1039
- attrs = ImageAttributes.empty(context)
1040
1119
 
1041
- image_hash = hashlib.md5(image_data).hexdigest()
1042
- image_filename = attachment_name(f"formula_{image_hash}.{self.options.diagram_output_format}")
1043
- self.attachments.add_embed(image_filename, EmbeddedFileData(image_data, content))
1044
- image = self.image_generator.create_attached_image(image_filename, attrs)
1045
- return image
1120
+ attrs = ImageAttributes(
1121
+ context,
1122
+ width=None,
1123
+ height=None,
1124
+ alt=content,
1125
+ title=None,
1126
+ caption="",
1127
+ alignment=ImageAlignment(self.options.layout.get_image_alignment()),
1128
+ )
1129
+ return self.image_generator.transform_attached_data(image_data, attrs, image_type="formula")
1046
1130
 
1047
1131
  def _transform_inline_math(self, elem: ElementType) -> ElementType:
1048
1132
  """
@@ -1053,7 +1137,7 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
1053
1137
 
1054
1138
  content = elem.text
1055
1139
  if not content:
1056
- raise DocumentError("empty inline LaTeX formula")
1140
+ raise DocumentError(elem, "empty inline LaTeX formula")
1057
1141
 
1058
1142
  LOGGER.debug("Found inline LaTeX formula: %s", content)
1059
1143
 
@@ -1088,7 +1172,7 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
1088
1172
 
1089
1173
  content = elem.text
1090
1174
  if not content:
1091
- raise DocumentError("empty block-level LaTeX formula")
1175
+ raise DocumentError(elem, "empty block-level LaTeX formula")
1092
1176
 
1093
1177
  LOGGER.debug("Found block-level LaTeX formula: %s", content)
1094
1178
 
@@ -1133,13 +1217,13 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
1133
1217
  """
1134
1218
 
1135
1219
  if elem.tag != "sup":
1136
- raise DocumentError("expected: `<sup>` as the HTML element for a footnote reference")
1220
+ raise DocumentError(elem, "expected: `<sup>` as the HTML element for a footnote reference")
1137
1221
 
1138
1222
  ref_id = elem.attrib.pop("id", "")
1139
1223
  # Match fnref:NAME, fnref2:NAME, fnref3:NAME, etc.
1140
- match = re.match(r"^fnref(\d*):(.+)$", ref_id)
1224
+ match = _FOOTNOTE_REF_REGEXP.match(ref_id)
1141
1225
  if match is None:
1142
- raise DocumentError("expected: attribute `id` of format `fnref:NAME` or `fnrefN:NAME` applied on `<sup>` for a footnote reference")
1226
+ raise DocumentError(elem, "expected: attribute `id` of format `fnref:NAME` or `fnrefN:NAME` applied on `<sup>` for a footnote reference")
1143
1227
  numeric_suffix = match.group(1)
1144
1228
  footnote_name = match.group(2)
1145
1229
  # Build anchor name: first reference uses NAME, subsequent references use NAME-N
@@ -1147,10 +1231,10 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
1147
1231
 
1148
1232
  link = next((elem.iterchildren(tag="a")), None)
1149
1233
  if link is None:
1150
- raise DocumentError("expected: `<a>` as the first HTML element in a footnote reference")
1234
+ raise DocumentError(elem, "expected: `<a>` as the first HTML element in a footnote reference")
1151
1235
  def_href = link.attrib.pop("href", "")
1152
1236
  if not def_href.startswith("#fn:"):
1153
- raise DocumentError("expected: attribute `href` of format `#fn:NAME` applied on `<a>` for a footnote reference")
1237
+ raise DocumentError(elem, "expected: attribute `href` of format `#fn:NAME` applied on `<a>` for a footnote reference")
1154
1238
  footnote_def = def_href.removeprefix("#fn:")
1155
1239
 
1156
1240
  text = link.text or ""
@@ -1217,21 +1301,21 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
1217
1301
 
1218
1302
  ordered_list = next((elem.iterchildren(tag="ol")), None)
1219
1303
  if ordered_list is None:
1220
- raise DocumentError("expected: `<ol>` as direct child of footnote definition block")
1304
+ raise DocumentError(elem, "expected: `<ol>` as direct child of footnote definition block")
1221
1305
 
1222
1306
  for list_item in ordered_list:
1223
1307
  if list_item.tag != "li":
1224
- raise DocumentError("expected: `<li>` as children of `<ol>` in footnote definition block")
1308
+ raise DocumentError(elem, "expected: `<li>` as children of `<ol>` in footnote definition block")
1225
1309
 
1226
1310
  def_id = list_item.attrib.pop("id", "")
1227
1311
  if not def_id.startswith("fn:"):
1228
- raise DocumentError("expected: attribute `id` of format `fn:NAME` applied on `<li>` for a footnote definition")
1312
+ raise DocumentError(elem, "expected: attribute `id` of format `fn:NAME` applied on `<li>` for a footnote definition")
1229
1313
  footnote_def = def_id.removeprefix("fn:")
1230
1314
 
1231
1315
  # find the last paragraph, which is where the backref links are placed
1232
1316
  paragraphs = list(list_item.iterchildren(tag="p"))
1233
1317
  if not paragraphs:
1234
- raise DocumentError("expected: `<p>` as a child of `<li>` in a footnote definition")
1318
+ raise DocumentError(elem, "expected: `<p>` as a child of `<li>` in a footnote definition")
1235
1319
  last_paragraph = paragraphs[-1]
1236
1320
 
1237
1321
  # collect all backref anchors (there may be multiple when a footnote is referenced multiple times)
@@ -1240,13 +1324,12 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
1240
1324
  backref_info: list[tuple[ElementType, int | None, str]] = []
1241
1325
  for anchor in list(last_paragraph.iterchildren(tag="a")):
1242
1326
  href = anchor.get("href", "")
1243
- match = re.match(r"^#fnref(\d*):(.+)$", href)
1244
- if match is not None:
1327
+ if href.startswith("#") and (match := _FOOTNOTE_REF_REGEXP.match(href[1:])) is not None:
1245
1328
  backref_info.append((anchor, int(match.group(1), base=10) if match.group(1) else None, match.group(2)))
1246
1329
 
1247
1330
  if not backref_info:
1248
1331
  raise DocumentError(
1249
- "expected: at least one `<a>` element with `href` attribute of format `#fnref:NAME` or `#fnrefN:NAME` in a footnote definition"
1332
+ elem, "expected: at least one `<a>` element with `href` attribute of format `#fnref:NAME` or `#fnrefN:NAME` in a footnote definition"
1250
1333
  )
1251
1334
 
1252
1335
  # remove all back-links generated by Python-Markdown
@@ -1313,19 +1396,19 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
1313
1396
  """
1314
1397
 
1315
1398
  if elem.tag != "ul":
1316
- raise DocumentError("expected: `<ul>` as the HTML element for a tasklist")
1399
+ raise DocumentError(elem, "expected: `<ul>` as the HTML element for a tasklist")
1317
1400
 
1318
1401
  for item in elem:
1319
1402
  if item.tag != "li":
1320
- raise DocumentError("expected: `<li>` as the HTML element for a task")
1321
- if not element_text_starts_with_any(item, ["[ ]", "[x]", "[X]"]):
1322
- raise DocumentError("expected: each `<li>` in a task list starting with [ ] or [x]")
1403
+ raise DocumentError(elem, "expected: `<li>` as the HTML element for a task")
1404
+ if not _TASKLIST_REGEXP.match(item.text or ""):
1405
+ raise DocumentError(elem, "expected: each `<li>` in a task list starting with [ ] or [x]")
1323
1406
 
1324
1407
  tasks: list[ElementType] = []
1325
1408
  for index, item in enumerate(elem, start=1):
1326
1409
  if item.text is None:
1327
1410
  raise NotImplementedError("pre-condition check for tasklist not exhaustive")
1328
- match = re.match(r"^\[([x X])\]", item.text)
1411
+ match = _TASKLIST_REGEXP.match(item.text)
1329
1412
  if match is None:
1330
1413
  raise NotImplementedError("pre-condition check for tasklist not exhaustive")
1331
1414
 
@@ -1350,7 +1433,7 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
1350
1433
  return AC_ELEM("task-list", {}, *tasks)
1351
1434
 
1352
1435
  @override
1353
- def transform(self, child: ElementType) -> ElementType | None:
1436
+ def transform(self, child: ElementType) -> ElementType | ElementAction:
1354
1437
  """
1355
1438
  Transforms an HTML element tree obtained from a Markdown document into a Confluence Storage Format element tree.
1356
1439
  """
@@ -1362,13 +1445,17 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
1362
1445
  child.tail = child.tail.replace("\n", " ")
1363
1446
 
1364
1447
  if not isinstance(child.tag, str):
1365
- return None
1448
+ return ElementAction.RECURSE
1366
1449
 
1367
1450
  match child.tag:
1451
+ # <line-number value="#" />
1452
+ case "line-number":
1453
+ return ElementAction.REMOVE
1454
+
1368
1455
  # <p>...</p>
1369
1456
  case "p":
1370
1457
  # <p><img src="..." /></p>
1371
- if len(child) == 1 and not child.text and child[0].tag == "img" and not child[0].tail:
1458
+ if child_count(child) == 1 and not child.text and child[0].tag == "img" and not child[0].tail:
1372
1459
  return self._transform_image(FormattingContext.BLOCK, child[0])
1373
1460
 
1374
1461
  # <p>[[<em>TOC</em>]]</p> (represented in Markdown as `[[_TOC_]]`)
@@ -1390,7 +1477,7 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
1390
1477
  # <div><ac:structured-macro ...>...</ac:structured-macro></div>
1391
1478
  elif "csf" in classes:
1392
1479
  if len(child) != 1:
1393
- raise DocumentError("expected: single child in Confluence Storage Format block")
1480
+ raise DocumentError(child, "expected: single child in Confluence Storage Format block")
1394
1481
 
1395
1482
  return child[0]
1396
1483
 
@@ -1402,7 +1489,7 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
1402
1489
  # </div>
1403
1490
  elif "footnote" in classes:
1404
1491
  self._transform_footnote_def(child)
1405
- return None
1492
+ return ElementAction.RECURSE
1406
1493
 
1407
1494
  # <div class="admonition note">
1408
1495
  # <p class="admonition-title">Note</p>
@@ -1444,7 +1531,7 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
1444
1531
  case "ol":
1445
1532
  # Confluence adds the attribute `start` for every ordered list
1446
1533
  child.set("start", "1")
1447
- return None
1534
+ return ElementAction.RECURSE
1448
1535
 
1449
1536
  # <ul>
1450
1537
  # <li>[ ] ...</li>
@@ -1454,11 +1541,11 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
1454
1541
  if len(child) > 0 and all(element_text_starts_with_any(item, ["[ ]", "[x]", "[X]"]) for item in child):
1455
1542
  return self._transform_tasklist(child)
1456
1543
 
1457
- return None
1544
+ return ElementAction.RECURSE
1458
1545
 
1459
1546
  case "li":
1460
1547
  normalize_inline(child)
1461
- return None
1548
+ return ElementAction.RECURSE
1462
1549
 
1463
1550
  # <pre><code class="language-java"> ... </code></pre>
1464
1551
  case "pre" if len(child) == 1 and child[0].tag == "code":
@@ -1479,7 +1566,7 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
1479
1566
  if self.options.layout.table.width:
1480
1567
  child.set("data-table-width", str(self.options.layout.table.width))
1481
1568
 
1482
- return None
1569
+ return ElementAction.RECURSE
1483
1570
 
1484
1571
  # <img src="..." alt="..." />
1485
1572
  case "img":
@@ -1487,7 +1574,11 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
1487
1574
 
1488
1575
  # <a href="..."> ... </a>
1489
1576
  case "a":
1490
- return self._transform_link(child)
1577
+ link = self._transform_link(child)
1578
+ if link is not None:
1579
+ return link
1580
+ else:
1581
+ return ElementAction.RECURSE
1491
1582
 
1492
1583
  # <mark>...</mark>
1493
1584
  case "mark":
@@ -1503,9 +1594,9 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
1503
1594
 
1504
1595
  # <sup id="fnref:NAME"><a class="footnote-ref" href="#fn:NAME">1</a></sup>
1505
1596
  # Multiple references: <sup id="fnref2:NAME">...</sup>, <sup id="fnref3:NAME">...</sup>
1506
- case "sup" if re.match(r"^fnref\d*:", child.get("id", "")):
1597
+ case "sup" if _FOOTNOTE_REF_REGEXP.match(child.get("id", "")):
1507
1598
  self._transform_footnote_ref(child)
1508
- return None
1599
+ return ElementAction.RECURSE
1509
1600
 
1510
1601
  # <input type="date" value="1984-01-01" />
1511
1602
  case "input" if child.get("type", "") == "date":
@@ -1516,6 +1607,11 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
1516
1607
  # Confluence prefers <u> over <ins> for underline, and replaces <ins> with <u>
1517
1608
  child.tag = "u"
1518
1609
 
1610
+ # <confluence-skip>...</confluence-skip>
1611
+ case "confluence-skip":
1612
+ # Content marked for exclusion from Confluence
1613
+ return ElementAction.REMOVE
1614
+
1519
1615
  # <x-emoji data-shortname="wink" data-unicode="1f609">😉</x-emoji>
1520
1616
  case "x-emoji":
1521
1617
  return self._transform_emoji(child)
@@ -1529,19 +1625,11 @@ class ConfluenceStorageFormatConverter(NodeVisitor):
1529
1625
 
1530
1626
  if self.options.heading_anchors:
1531
1627
  self._transform_heading(child)
1532
- return None
1628
+ return ElementAction.RECURSE
1533
1629
  case _:
1534
1630
  pass
1535
1631
 
1536
- return None
1537
-
1538
-
1539
- class DocumentError(RuntimeError):
1540
- "Raised when a converted Markdown document has an unexpected element or attribute."
1541
-
1542
-
1543
- class ConversionError(RuntimeError):
1544
- "Raised when a Markdown document cannot be converted to Confluence Storage Format."
1632
+ return ElementAction.RECURSE
1545
1633
 
1546
1634
 
1547
1635
  class ConfluenceDocument:
@@ -1602,11 +1690,21 @@ class ConfluenceDocument:
1602
1690
  lines: list[str] = []
1603
1691
  for data_uri, color in status_images.items():
1604
1692
  lines.append(f"[STATUS-{color.upper()}]: {data_uri}")
1605
- lines.append(document.text)
1693
+
1694
+ if options.line_numbers:
1695
+ lines.extend(markdown_with_line_numbers(document.text.splitlines(), document.start_line_number))
1696
+ else:
1697
+ lines.append(document.text)
1606
1698
 
1607
1699
  # parse Markdown document and convert to HTML
1608
1700
  html = markdown_to_html("\n".join(lines))
1609
1701
 
1702
+ try:
1703
+ # Transform skip markers in HTML string before parsing
1704
+ html = transform_skip_comments_in_html(html)
1705
+ except PreprocessingError as ex:
1706
+ raise ConversionError(f"failed to convert Markdown file: {path}") from ex
1707
+
1610
1708
  # modify HTML as necessary
1611
1709
  if self.options.generated_by is not None:
1612
1710
  generated_by = props.generated_by or self.options.generated_by
@@ -1641,6 +1739,21 @@ class ConfluenceDocument:
1641
1739
  # execute HTML-to-Confluence converter
1642
1740
  try:
1643
1741
  converter.visit(self.root)
1742
+ except DocumentError as ex:
1743
+ if options.line_numbers:
1744
+ # find closest paragraph ancestor
1745
+ elem = ex.element
1746
+ while elem.tag != "p" and (parent := elem.getparent()):
1747
+ elem = parent
1748
+
1749
+ # locate line number marker element
1750
+ line_number = 0
1751
+ for placeholder in elem.iterchildren("line-number"):
1752
+ line_number = int(placeholder.attrib["value"])
1753
+
1754
+ raise ConversionError(f"failed to convert Markdown file: {path} @ line {line_number}") from ex
1755
+ else:
1756
+ raise ConversionError(f"failed to convert Markdown file: {path}") from ex
1644
1757
  except RuntimeError as ex:
1645
1758
  raise ConversionError(f"failed to convert Markdown file: {path}") from ex
1646
1759
 
@@ -1671,40 +1784,19 @@ class ConfluenceDocument:
1671
1784
  Handles the case where a generated-by info panel may be present as the first child.
1672
1785
  """
1673
1786
 
1674
- # Find the first heading element (h1-h6) in the root
1787
+ # find the first heading element (h1-h6) in the root
1675
1788
  heading_pattern = re.compile(r"^h[1-6]$", re.IGNORECASE)
1676
1789
 
1677
- for idx, child in enumerate(self.root):
1790
+ for child in self.root:
1678
1791
  if not isinstance(child.tag, str):
1679
1792
  continue
1680
1793
 
1681
1794
  if heading_pattern.match(child.tag) is None:
1682
1795
  continue
1683
1796
 
1684
- # Preserve any text that comes after the heading (tail text)
1685
- tail = child.tail
1686
-
1687
- # Remove the heading
1688
- self.root.remove(child)
1689
-
1690
- # If there was tail text, attach it to the previous sibling's tail
1691
- # or to the parent's text if this was the first child
1692
- if tail:
1693
- if idx > 0:
1694
- # Append to previous sibling's tail
1695
- prev_sibling = self.root[idx - 1]
1696
- if prev_sibling.tail:
1697
- prev_sibling.tail += tail
1698
- else:
1699
- prev_sibling.tail = tail
1700
- else:
1701
- # No previous sibling, append to parent's text
1702
- if self.root.text:
1703
- self.root.text += tail
1704
- else:
1705
- self.root.text = tail
1797
+ remove_element(child)
1706
1798
 
1707
- # Only remove the FIRST heading, then stop
1799
+ # only remove the FIRST heading, then stop
1708
1800
  break
1709
1801
 
1710
1802
  def xhtml(self) -> str: