cfxmark 0.1.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. cfxmark-0.1.3/.gitignore +60 -0
  2. cfxmark-0.1.3/CHANGELOG.md +211 -0
  3. cfxmark-0.1.3/LICENSE +21 -0
  4. cfxmark-0.1.3/PKG-INFO +353 -0
  5. cfxmark-0.1.3/README.md +315 -0
  6. cfxmark-0.1.3/docs/OPAQUE.md +214 -0
  7. cfxmark-0.1.3/docs/REQUIREMENTS.md +103 -0
  8. cfxmark-0.1.3/docs/SPEC.md +175 -0
  9. cfxmark-0.1.3/pyproject.toml +145 -0
  10. cfxmark-0.1.3/src/cfxmark/__init__.py +34 -0
  11. cfxmark-0.1.3/src/cfxmark/_version.py +3 -0
  12. cfxmark-0.1.3/src/cfxmark/api.py +190 -0
  13. cfxmark-0.1.3/src/cfxmark/assets.py +255 -0
  14. cfxmark-0.1.3/src/cfxmark/ast.py +354 -0
  15. cfxmark-0.1.3/src/cfxmark/exceptions.py +32 -0
  16. cfxmark-0.1.3/src/cfxmark/macros/__init__.py +17 -0
  17. cfxmark-0.1.3/src/cfxmark/macros/builtins/__init__.py +15 -0
  18. cfxmark-0.1.3/src/cfxmark/macros/builtins/admonition.py +76 -0
  19. cfxmark-0.1.3/src/cfxmark/macros/builtins/expand.py +63 -0
  20. cfxmark-0.1.3/src/cfxmark/macros/builtins/jira.py +59 -0
  21. cfxmark-0.1.3/src/cfxmark/macros/builtins/toc.py +50 -0
  22. cfxmark-0.1.3/src/cfxmark/macros/registry.py +159 -0
  23. cfxmark-0.1.3/src/cfxmark/normalize.py +1168 -0
  24. cfxmark-0.1.3/src/cfxmark/opaque.py +437 -0
  25. cfxmark-0.1.3/src/cfxmark/parsers/__init__.py +1 -0
  26. cfxmark-0.1.3/src/cfxmark/parsers/cfx.py +1001 -0
  27. cfxmark-0.1.3/src/cfxmark/parsers/md.py +811 -0
  28. cfxmark-0.1.3/src/cfxmark/py.typed +0 -0
  29. cfxmark-0.1.3/src/cfxmark/renderers/__init__.py +1 -0
  30. cfxmark-0.1.3/src/cfxmark/renderers/cfx.py +513 -0
  31. cfxmark-0.1.3/src/cfxmark/renderers/md.py +728 -0
  32. cfxmark-0.1.3/src/cfxmark/xml_ns.py +157 -0
  33. cfxmark-0.1.3/tests/conftest.py +14 -0
  34. cfxmark-0.1.3/tests/corpus/.gitkeep +0 -0
  35. cfxmark-0.1.3/tests/corpus/synthetic.cfx.example +2 -0
  36. cfxmark-0.1.3/tests/property/__init__.py +0 -0
  37. cfxmark-0.1.3/tests/property/test_round_trip.py +109 -0
  38. cfxmark-0.1.3/tests/test_corpus.py +60 -0
  39. cfxmark-0.1.3/tests/unit/__init__.py +0 -0
  40. cfxmark-0.1.3/tests/unit/test_assets.py +185 -0
  41. cfxmark-0.1.3/tests/unit/test_cfx_to_md.py +106 -0
  42. cfxmark-0.1.3/tests/unit/test_md_to_cfx.py +192 -0
  43. cfxmark-0.1.3/tests/unit/test_opaque.py +49 -0
  44. cfxmark-0.1.3/tests/unit/test_security.py +230 -0
@@ -0,0 +1,60 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+
7
+ # Distribution / packaging
8
+ .Python
9
+ build/
10
+ dist/
11
+ wheels/
12
+ *.egg-info/
13
+ *.egg
14
+
15
+ # uv
16
+ .venv/
17
+ .python-version
18
+
19
+ # Test / coverage
20
+ .pytest_cache/
21
+ .coverage
22
+ .coverage.*
23
+ htmlcov/
24
+ .tox/
25
+ .nox/
26
+ .hypothesis/
27
+
28
+ # Type checking
29
+ .mypy_cache/
30
+ .dmypy.json
31
+ .pyre/
32
+ .pytype/
33
+ .ruff_cache/
34
+
35
+ # IDE
36
+ .idea/
37
+ .vscode/
38
+ *.swp
39
+ *.swo
40
+ *~
41
+ .DS_Store
42
+
43
+ # Local dev artifacts
44
+ .omc/
45
+ .omx/
46
+ /tmp/
47
+ *.log
48
+
49
+ # Secrets (never commit)
50
+ .secrets/
51
+ *.pem
52
+ *.key
53
+
54
+ # Real-world corpus — may contain internal company content (Jira keys,
55
+ # attachments, drawio diagrams). Tests skip if absent. Place private
56
+ # samples here on your dev machine; never commit them.
57
+ /tests/corpus/*.cfx
58
+ /tests/corpus/*.json
59
+ /tests/corpus/private/
60
+ /tests/to_md/
@@ -0,0 +1,211 @@
1
+ # Changelog
2
+
3
+ All notable changes to **cfxmark** will be documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
+
8
+ ## [0.1.3] — 2026-04-07
9
+
10
+ ### Changed
11
+
12
+ - `render_cfx` now returns a third element — a ``warnings`` list —
13
+ alongside ``(xhtml, attachments)``. Any ``::: jira`` / ``::: toc``
14
+ (or other parameter-only directive) whose body is silently dropped
15
+ by its handler now surfaces a human-readable warning via
16
+ ``ConversionResult.warnings`` so callers can correct their
17
+ Markdown instead of discovering the drop on Confluence.
18
+
19
+ ## [0.1.2] — 2026-04-07
20
+
21
+ ### Fixed
22
+
23
+ - Attachment enumeration now strips CDATA sections before scanning
24
+ for `<ri:attachment>`, so a Confluence ``code`` macro documenting
25
+ storage XML no longer leaks phantom filenames into
26
+ `result.attachments`. `resolve_assets(mode="sidecar")` applies the
27
+ same CDATA strip on its opaque-block fallback.
28
+ - `to_cfx` now emits only the **basename** of a local-image path in
29
+ `<ri:attachment ri:filename="...">` (Confluence stores attachments
30
+ in a flat per-page namespace). `result.attachments` still reports
31
+ the caller's original path — including any directory prefix — so
32
+ the caller knows where to read the bytes from on disk.
33
+
34
+ ## [0.1.1] — 2026-04-07
35
+
36
+ ### Fixed
37
+
38
+ - `ConversionResult.attachments` now enumerates **every**
39
+ `ri:attachment` reference in the output XHTML, including those
40
+ trapped inside Grade III opaque blocks (e.g. `<ac:image>` inside
41
+ `<pre><code>`). Previously only Grade I/II native `<ac:image>`
42
+ references were reported, so callers silently missed attachments
43
+ they needed to upload. `to_md` also populates `attachments` now
44
+ (previously always empty).
45
+ - `resolve_assets(mode="sidecar")` downloads opaque-block attachments
46
+ into `asset_dir` as a fallback, keeping the sidecar directory a
47
+ complete asset set regardless of how the image was preserved.
48
+ - `to_cfx` no longer crashes with `IndexError` when user-typed
49
+ Markdown contains a literal `` `CFXMARK_OPAQUE-N-CFXMARK` `` /
50
+ `` `CFXMARK_DIRECTIVE-N-CFXMARK` `` token whose index has no
51
+ matching capture — the region falls back to plain inline code.
52
+
53
+ ### Docs
54
+
55
+ - README custom-macro example uses a valid `AdmonitionHandler` flavour
56
+ (`info`/`note`/`warning`/`tip`); the previous `"danger"` example
57
+ raised `ValueError`.
58
+ - README docs/SPEC/OPAQUE/LICENSE links rewritten as absolute GitHub
59
+ URLs so they resolve correctly when rendered on PyPI.
60
+ - `docs/SPEC.md` no longer claims `<ac:layout>` wrappers become opaque
61
+ blocks; cfxmark flattens them transparently.
62
+
63
+ ### Packaging
64
+
65
+ - `pyproject.toml` switched to PEP 639 license metadata
66
+ (`license = "MIT"` + `license-files = ["LICENSE"]`); the redundant
67
+ `License :: OSI Approved :: MIT License` classifier was removed.
68
+ - Added `Documentation` and `Changelog` entries to `[project.urls]`
69
+ for the PyPI sidebar.
70
+
71
+ ## [0.1.0] — 2026-04-07
72
+
73
+ ### Added
74
+
75
+ #### Conversion API
76
+
77
+ - `cfxmark.to_cfx(markdown)` — Markdown → Confluence Storage Format XHTML.
78
+ - `cfxmark.to_md(xhtml)` — Confluence Storage Format XHTML → Markdown.
79
+ - Both return a `ConversionResult` carrying `xhtml` / `markdown`,
80
+ `attachments` (local file references for the caller to upload),
81
+ `warnings`, and the intermediate AST.
82
+
83
+ #### Native (grade I) constructs — lossless round-trip
84
+
85
+ - ATX headings `h1`–`h6`.
86
+ - Paragraphs, hard breaks, soft breaks, HTML entities.
87
+ - Inline emphasis: `**bold**`, `*italic*`, `` `code` ``, `~~strike~~`,
88
+ links, images.
89
+ - Lists: bullet, ordered, deeply nested, mixed paragraph + nested-list
90
+ list items.
91
+ - Block quotes, horizontal rules.
92
+ - Code fences with language tags (mapped to Confluence's `code` macro).
93
+ - GFM tables with **`colspan` / `rowspan`** support via the
94
+ MultiMarkdown `<` / `^` continuation cell convention. Multi-paragraph
95
+ cells flatten to inline content joined by `<br>` tags.
96
+
97
+ #### Directive (grade II) macros
98
+
99
+ Pluggable `MacroRegistry`. Default registry covers:
100
+
101
+ - `info`, `note`, `warning`, `tip` admonition panels.
102
+ - `jira` issue references (single + JQL query forms).
103
+ - `expand` collapsible sections.
104
+ - `toc` table of contents.
105
+
106
+ Each is rendered as a pandoc-style fenced div in Markdown:
107
+
108
+ ```
109
+ ::: info
110
+ body
111
+ :::
112
+ ```
113
+
114
+ #### Opaque (grade III) preservation
115
+
116
+ The signature feature: any Confluence construct cfxmark does not know
117
+ how to represent in Markdown is preserved **byte-for-byte**, including
118
+ the `ac:macro-id` UUID that gives Confluence its macro identity.
119
+
120
+ - **Block opaque**: HTML-comment sentinel + `cfx-storage` fenced code
121
+ block. SHA-256 fingerprint in the sentinel ID prevents accidental
122
+ collision with user-typed content.
123
+ - **Inline opaque**: short `[label](cfx:op-XXXXXXXX)` Markdown link
124
+ with the XML payload stored in a `cfxmark:payloads` sidecar at the
125
+ bottom of the document. The label is auto-derived from the
126
+ underlying element type (`@user-…`, `jira:PROJ-1`, `cfx:status`, …).
127
+ - **Header notice**: a single-line `<!-- cfxmark:notice ... -->` HTML
128
+ comment is injected at the top of any document containing opaque or
129
+ directive constructs, telling humans and AI agents not to delete the
130
+ markers.
131
+
132
+ #### Image asset workflow
133
+
134
+ - `to_md` automatically tags every local-attachment image with a
135
+ `<!-- cfxmark:asset src="..." -->` metadata marker carrying the
136
+ original Confluence filename.
137
+ - New `cfxmark.resolve_assets(md, fetcher, mode="sidecar"|"inline")`
138
+ function reads the markers, calls a caller-provided `fetcher` to
139
+ download the bytes, and either saves them to a sidecar directory
140
+ (with relative path links) or embeds them as `data:` URIs.
141
+ - Markers are preserved across resolution so the round trip back to
142
+ CFX always recovers the original Confluence filename — even after
143
+ the visible link target has been rewritten to a sidecar path.
144
+ - Image dimensions encoded in the URL fragment as
145
+ `#cfxmark:w=300,h=200` for round-trip preservation.
146
+
147
+ #### Canonicalization (`canonicalize_cfx`)
148
+
149
+ A deep XML normalization pass that lets two semantically equivalent
150
+ Confluence storage fragments compare equal:
151
+
152
+ - Strips volatile attributes (`ac:macro-id`, `ac:local-id`,
153
+ `ri:version-at-save`, `ac:schema-version`, `ac:thumbnail`,
154
+ `ac:border`, `ac:align`).
155
+ - Strips Confluence-editor data attributes
156
+ (`data-uuid`, `data-highlight-colour`, …).
157
+ - Removes purely cosmetic CSS (default text colour, `text-align`,
158
+ `width` / `height` on table family elements, `font-weight`,
159
+ `padding`, `margin`, `list-style-type`, `vertical-align`, …).
160
+ - Removes Confluence-editor class names
161
+ (`wrapped`, `fixed-width`, `auto-cursor-target`, `code-line`,
162
+ `has-list-bullet`, `internal-link`, `confluenceTd`, …).
163
+ - Unwraps decorative `<span>` and structural `<div>` wrappers
164
+ (including the `content-wrapper` div Confluence emits inside table
165
+ cells).
166
+ - Promotes header rows to `<thead>`, splits `<h*>` containing `<br/>`,
167
+ flattens singleton paragraphs inside `<li>`, drops empty paragraphs
168
+ and trailing breaks, normalizes `<pre><code>` to the structured
169
+ `code` macro form, removes cosmetic code parameters
170
+ (`linenumbers`, `theme`, `firstline`, …).
171
+
172
+ #### Security hardening
173
+
174
+ - Rejects any input containing `<!DOCTYPE>` or `<!ENTITY>` to block
175
+ XXE and billion-laughs attacks.
176
+ - lxml parser configured with `no_network=True`, `load_dtd=False`,
177
+ `huge_tree=False`.
178
+ - Opaque sentinels carry a SHA-256 fingerprint of their body — a literal
179
+ sentinel sequence typed by a user in their Markdown is **not**
180
+ silently turned into an opaque block; the verification fails and
181
+ the region falls back to plain text.
182
+
183
+ #### Tooling
184
+
185
+ - `py.typed` marker for PEP 561 consumers.
186
+ - `pyproject.toml` configured for `uv` and `hatchling`.
187
+ - mypy clean (non-strict for v0.1; strict planned for v0.2).
188
+ - ruff clean.
189
+ - 65 tests, including:
190
+ - 39 unit tests (per-construct + edge cases)
191
+ - 7 image asset tests
192
+ - 8 security regression tests
193
+ - 1 corpus golden-file test (skipped if no private corpus available)
194
+ - 1 Hypothesis property-based round-trip test (100 random documents)
195
+ - Verified against 9 production Confluence pages totalling ~290 KB
196
+ of XHTML — all round-trip with byte-identical canonical form.
197
+
198
+ ### Known limitations
199
+
200
+ - **HTML comments in Markdown** are dropped with a warning, with one
201
+ exception: cfxmark's own opaque / asset / header markers are
202
+ preserved. Confluence does not preserve HTML comments either, so
203
+ this matches Confluence's own behaviour.
204
+ - **`drawio`, `plantuml`** and other rich diagram macros are
205
+ passed through as opaque blocks (preserved losslessly but not
206
+ rendered in Markdown).
207
+ - **`MacroHandler` protocol leaks lxml**. Custom macro handlers
208
+ currently receive and return `lxml.etree._Element` objects. A thin
209
+ adapter is planned for v0.2.
210
+ - **`<th scope="...">`, `<td title="...">`** attributes are stripped
211
+ during canonicalization since Markdown cannot preserve them.
cfxmark-0.1.3/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Eunsan Jo
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
cfxmark-0.1.3/PKG-INFO ADDED
@@ -0,0 +1,353 @@
1
+ Metadata-Version: 2.4
2
+ Name: cfxmark
3
+ Version: 0.1.3
4
+ Summary: Bidirectional Markdown <-> Confluence Storage XHTML converter with lossless opaque preservation.
5
+ Project-URL: Homepage, https://github.com/eunsanMountain/cfxmark
6
+ Project-URL: Repository, https://github.com/eunsanMountain/cfxmark
7
+ Project-URL: Issues, https://github.com/eunsanMountain/cfxmark/issues
8
+ Project-URL: Documentation, https://github.com/eunsanMountain/cfxmark/blob/main/docs/SPEC.md
9
+ Project-URL: Changelog, https://github.com/eunsanMountain/cfxmark/blob/main/CHANGELOG.md
10
+ Author: Eunsan Jo
11
+ License-Expression: MIT
12
+ License-File: LICENSE
13
+ Keywords: bidirectional,confluence,converter,markdown,round-trip,storage-format,xhtml
14
+ Classifier: Development Status :: 3 - Alpha
15
+ Classifier: Intended Audience :: Developers
16
+ Classifier: Operating System :: OS Independent
17
+ Classifier: Programming Language :: Python :: 3
18
+ Classifier: Programming Language :: Python :: 3 :: Only
19
+ Classifier: Programming Language :: Python :: 3.10
20
+ Classifier: Programming Language :: Python :: 3.11
21
+ Classifier: Programming Language :: Python :: 3.12
22
+ Classifier: Programming Language :: Python :: 3.13
23
+ Classifier: Topic :: Text Processing :: Markup
24
+ Classifier: Topic :: Text Processing :: Markup :: Markdown
25
+ Classifier: Topic :: Text Processing :: Markup :: XML
26
+ Classifier: Typing :: Typed
27
+ Requires-Python: >=3.10
28
+ Requires-Dist: lxml>=5.0
29
+ Requires-Dist: mistletoe>=1.3
30
+ Provides-Extra: dev
31
+ Requires-Dist: hypothesis>=6.100; extra == 'dev'
32
+ Requires-Dist: lxml-stubs>=0.5; extra == 'dev'
33
+ Requires-Dist: mypy>=1.10; extra == 'dev'
34
+ Requires-Dist: pytest-cov>=5.0; extra == 'dev'
35
+ Requires-Dist: pytest>=8.0; extra == 'dev'
36
+ Requires-Dist: ruff>=0.5; extra == 'dev'
37
+ Description-Content-Type: text/markdown
38
+
39
+ # cfxmark
40
+
41
+ **Bidirectional Markdown ↔ Confluence Storage XHTML converter** —
42
+ with lossless opaque preservation for everything cfxmark doesn't
43
+ explicitly know how to convert.
44
+
45
+ ```python
46
+ import cfxmark
47
+
48
+ # Markdown → Confluence storage XHTML
49
+ result = cfxmark.to_cfx(markdown_text)
50
+ result.xhtml # str — ready for Confluence REST PUT
51
+ result.attachments # tuple — local file refs the caller should upload
52
+ result.warnings # tuple — human-readable conversion warnings
53
+
54
+ # Confluence storage XHTML → Markdown
55
+ result = cfxmark.to_md(xhtml_text)
56
+ result.markdown # str — canonical markdown
57
+ result.warnings # tuple
58
+ ```
59
+
60
+ `ConversionResult` is the same dataclass for both directions —
61
+ `xhtml` is populated for `to_cfx`, `markdown` for `to_md`.
62
+
63
+ ## Why another converter?
64
+
65
+ Two existing projects inspired this one — [`md2cf`][md2cf] and
66
+ [`md2conf`][md2conf] — but both are **one-directional** (md → cf) and
67
+ neither preserves unknown macros across a round trip. `cfxmark` fills
68
+ both gaps:
69
+
70
+ 1. **Bidirectional.** `to_md(to_cfx(m))` is byte-identical to
71
+ `canonicalize(m)` for every construct in the supported subset.
72
+ 2. **Opaque preservation.** Confluence content cfxmark doesn't
73
+ understand (custom plugins, drawio diagrams, exotic table cells)
74
+ round-trips byte-for-byte, **including the `ac:macro-id` UUID**.
75
+ Confluence treats the round-tripped macro as the same instance, so
76
+ comments, attachments, and permissions stay attached.
77
+ 3. **Pure text-in / text-out.** No Confluence API, no network, no
78
+ attachment upload. The caller owns REST I/O. (See "Image assets"
79
+ below for the helper function that lets the caller plug in
80
+ network-bound logic without bloating cfxmark.)
81
+
82
+ [md2cf]: https://github.com/iamjackg/md2cf
83
+ [md2conf]: https://github.com/hunyadi/md2conf
84
+
85
+ ## Install
86
+
87
+ ```bash
88
+ # With uv (recommended):
89
+ uv add cfxmark
90
+
91
+ # With pip:
92
+ pip install cfxmark
93
+ ```
94
+
95
+ cfxmark depends on `lxml` and `mistletoe`. Python 3.10+.
96
+
97
+ ## The contract
98
+
99
+ cfxmark grades every Confluence construct into one of three buckets:
100
+
101
+ | Grade | Description | Behaviour |
102
+ |---|---|---|
103
+ | **I — Native** | Standard CommonMark / GFM (headings, lists, tables, code fences, links, images, blockquote, hr, inline emphasis) | Lossless round-trip after canonicalization. |
104
+ | **II — Directive** | Confluence macros with a known Markdown directive mapping (`info`, `note`, `warning`, `tip`, `jira`, `expand`, `toc`) | Lossless after canonicalization. Pluggable via `MacroRegistry`. |
105
+ | **III — Opaque** | Everything else | Captured byte-for-byte through cfxmark's opaque-block / inline-opaque mechanism. **Never dropped, never rewritten.** |
106
+
107
+ See [`docs/SPEC.md`](https://github.com/eunsanMountain/cfxmark/blob/main/docs/SPEC.md)
108
+ for the full mapping table and
109
+ [`docs/OPAQUE.md`](https://github.com/eunsanMountain/cfxmark/blob/main/docs/OPAQUE.md)
110
+ for the opaque-block format.
111
+
112
+ ## Usage
113
+
114
+ ### Round-trip a Confluence page through Markdown
115
+
116
+ ```python
117
+ import cfxmark
118
+
119
+ # Whatever fetched the page (REST API call, exported XML file, …)
120
+ xhtml = my_confluence_client.get_storage_format(page_id)
121
+
122
+ # Convert to Markdown
123
+ md_result = cfxmark.to_md(xhtml)
124
+ markdown = md_result.markdown
125
+
126
+ # … user edits the Markdown …
127
+
128
+ # Convert back to Confluence storage XHTML
129
+ cfx_result = cfxmark.to_cfx(markdown)
130
+ my_confluence_client.update_page(page_id, cfx_result.xhtml)
131
+
132
+ # Optionally upload any newly referenced local images
133
+ for filename in cfx_result.attachments:
134
+ my_confluence_client.upload_attachment(page_id, filename)
135
+ ```
136
+
137
+ ### Image assets
138
+
139
+ When you convert a Confluence page that references uploaded
140
+ attachments, the resulting Markdown looks like this:
141
+
142
+ ```markdown
143
+ ![](image-3.png#cfxmark:w=700)<!-- cfxmark:asset src="image-3.png" -->
144
+ ```
145
+
146
+ The image link still points at the original Confluence filename
147
+ (broken in any local Markdown viewer until you fetch the bytes), and
148
+ the `<!-- cfxmark:asset -->` HTML comment carries enough metadata for
149
+ a follow-up step to fetch and embed.
150
+
151
+ `cfxmark.resolve_assets` is that follow-up step. You provide a
152
+ fetcher callback that returns bytes for one filename at a time, and
153
+ choose between two output strategies:
154
+
155
+ ```python
156
+ import cfxmark
157
+ from pathlib import Path
158
+
159
+ def fetcher(filename: str) -> bytes:
160
+ # Whatever you use to download from Confluence:
161
+ return my_confluence_client.download_attachment(page_id, filename)
162
+
163
+ # Strategy A — sidecar directory (recommended for git-tracked docs).
164
+ # Saves bytes to `asset_dir` and rewrites links to relative paths.
165
+ md = cfxmark.resolve_assets(
166
+ md_result.markdown,
167
+ fetcher,
168
+ mode="sidecar",
169
+ asset_dir="docs/page-42/assets",
170
+ md_path="docs/page-42.md",
171
+ )
172
+ Path("docs/page-42.md").write_text(md)
173
+ # docs/page-42/assets/image-3.png exists
174
+ # md link: ![](page-42/assets/image-3.png#cfxmark:w=700)<!-- cfxmark:asset src="image-3.png" -->
175
+
176
+ # Strategy B — inline data URIs (single self-contained file).
177
+ md = cfxmark.resolve_assets(md_result.markdown, fetcher, mode="inline")
178
+ # md link: ![](data:image/png;base64,iVBORw0K...)<!-- cfxmark:asset src="image-3.png" -->
179
+ ```
180
+
181
+ The asset markers are **preserved** through both strategies, so
182
+ `resolve_assets` is idempotent and a subsequent `to_cfx` call always
183
+ recovers the original Confluence filename — even if the visible link
184
+ target has been rewritten to a sidecar path or a data URI.
185
+
186
+ ### Mermaid diagrams
187
+
188
+ cfxmark maps Markdown's `` ```mermaid `` fenced code block to
189
+ Confluence's `code` macro with `language=mermaid`. If your Confluence
190
+ instance has a Mermaid plugin installed (e.g. *Mermaid Diagrams for
191
+ Confluence*) it will render the diagram automatically; otherwise the
192
+ content is shown as a syntax-highlighted code block.
193
+
194
+ ````markdown
195
+ ```mermaid
196
+ graph LR
197
+ A --> B --> C
198
+ ```
199
+ ````
200
+
201
+ ### Inline opaque references
202
+
203
+ Inline elements that have no native Markdown form — Confluence user
204
+ mentions, inline Jira issue macros, custom widget invocations, … —
205
+ become a short Markdown link with a `cfx:op-...` URL:
206
+
207
+ ```markdown
208
+ Contact the purchaser ([@user-2c9402cc](cfx:op-4fab0f8d))
209
+ ```
210
+
211
+ The `[label]` is auto-derived from the underlying element type
212
+ (`@user-…`, `jira:PROJ-1`, `cfx:status`, …) and the `op-XXXXXXXX` ID
213
+ is a SHA-256 prefix of the original XML payload. The full XML lives
214
+ in a `cfxmark:payloads` sidecar at the bottom of the same Markdown
215
+ file:
216
+
217
+ ```markdown
218
+ <!-- cfxmark:payloads -->
219
+ <!-- op-4fab0f8d
220
+ <ac:link><ri:user ri:userkey="2c9402cc83d4bcc40183d976ef730001"/></ac:link>
221
+ -->
222
+ <!-- /cfxmark:payloads -->
223
+ ```
224
+
225
+ The SHA-256 fingerprint means that the same link syntax **typed** by
226
+ a user in their own Markdown is not silently re-interpreted as an
227
+ opaque payload — the verification fails and the region falls back to
228
+ ordinary text.
229
+
230
+ ### Block opaque blocks
231
+
232
+ Block-level Confluence content cfxmark doesn't know how to convert
233
+ (e.g. drawio diagrams, plantuml, complex tables) is wrapped in a
234
+ fenced code block with sentinel comments:
235
+
236
+ ````markdown
237
+ <!-- cfxmark:opaque id="op-1188e2b4" -->
238
+ ```cfx-storage
239
+ <ac:structured-macro ac:name="drawio" ac:macro-id="...">
240
+ <ac:parameter ac:name="diagramName">flow</ac:parameter>
241
+ ...
242
+ </ac:structured-macro>
243
+ ```
244
+ <!-- /cfxmark:opaque -->
245
+ ````
246
+
247
+ Editors render this as a clearly visible code block — a "do not
248
+ touch" signal for human readers. The Markdown parser detects the
249
+ sentinels first and round-trips the contents byte-for-byte, including
250
+ the original `ac:macro-id` UUID that Confluence uses to identify
251
+ macro instances.
252
+
253
+ ### Header notice
254
+
255
+ When a converted Markdown document contains any opaque or directive
256
+ markers, cfxmark prepends a single-line HTML comment explaining the
257
+ conventions to humans and AI agents (wrapped here for readability):
258
+
259
+ ```markdown
260
+ <!-- cfxmark:notice Converted from Confluence storage format. Inline
261
+ [label](cfx:op-XXXXXXXX) references preserve Confluence content that
262
+ has no native Markdown form; the raw XML for each lives in the
263
+ cfxmark:payloads sidecar at the bottom of this file. Do not edit
264
+ those references or the sidecar — tampering invalidates a SHA-256
265
+ fingerprint and the round trip falls back to plain text. -->
266
+ ```
267
+
268
+ The comment is invisible in any Markdown viewer.
269
+
270
+ ### Custom macros
271
+
272
+ Promote a Confluence macro from "opaque" to "directive" by registering
273
+ a custom handler:
274
+
275
+ ```python
276
+ import cfxmark
277
+ from cfxmark.macros import MacroRegistry
278
+ from cfxmark.macros.builtins import AdmonitionHandler
279
+
280
+ # Start from the default registry and add your own.
281
+ my_registry = cfxmark.default_registry.copy()
282
+ # Built-in AdmonitionHandler accepts one of: "info", "note", "warning", "tip".
283
+ # To promote a previously-opaque macro, write a small MacroHandler subclass —
284
+ # see cfxmark/macros/builtins/admonition.py for a complete example.
285
+ my_registry.register(AdmonitionHandler("warning"))
286
+
287
+ result = cfxmark.to_md(xhtml, macros=my_registry)
288
+ ```
289
+
290
+ Implementing a `MacroHandler` from scratch requires a small amount
291
+ of lxml knowledge — see `cfxmark/macros/builtins/admonition.py` for
292
+ a complete example. A higher-level handler API that hides lxml is
293
+ planned for v0.2.
294
+
295
+ ### Canonicalization helpers
296
+
297
+ Two Confluence storage fragments are "the same" only after a deep
298
+ normalization pass that strips volatile attributes, editor noise,
299
+ and rendering hints. Use `canonicalize_cfx` to compare two snapshots:
300
+
301
+ ```python
302
+ import cfxmark
303
+
304
+ c1 = cfxmark.canonicalize_cfx(original_xhtml)
305
+ c2 = cfxmark.canonicalize_cfx(round_tripped_xhtml)
306
+ assert c1 == c2 # passes for any document in the supported subset
307
+ ```
308
+
309
+ `canonicalize_cfx` is the same function the test suite uses to
310
+ verify byte-identical round trips against real Confluence pages.
311
+
312
+ ## Security
313
+
314
+ cfxmark hardens its XML parser against XXE and billion-laughs attacks:
315
+
316
+ - Inputs containing `<!DOCTYPE>` or `<!ENTITY>` declarations are
317
+ rejected before lxml ever sees them.
318
+ - The lxml parser is configured with `no_network=True`,
319
+ `load_dtd=False`, and `huge_tree=False`.
320
+ - Opaque-block sentinels are SHA-256 verified — accidental sentinel
321
+ syntax in user-typed Markdown does **not** become a real opaque
322
+ block.
323
+
324
+ If you find a security issue, please open a GitHub issue.
325
+
326
+ ## Development
327
+
328
+ ```bash
329
+ git clone https://github.com/eunsanMountain/cfxmark
330
+ cd cfxmark
331
+ uv sync --all-extras
332
+
333
+ # Run all tests
334
+ uv run pytest
335
+
336
+ # Type-check
337
+ uv run mypy src/
338
+
339
+ # Lint
340
+ uv run ruff check .
341
+
342
+ # Build
343
+ uv build
344
+ ```
345
+
346
+ The corpus tests look for `.cfx` files in `tests/corpus/` (gitignored
347
+ to keep your own private samples out of version control). Drop your
348
+ own Confluence storage XHTML files there and they will be exercised by
349
+ `pytest tests/test_corpus.py`.
350
+
351
+ ## License
352
+
353
+ MIT. See [`LICENSE`](https://github.com/eunsanMountain/cfxmark/blob/main/LICENSE).