cfxmark 0.1.3__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- cfxmark-0.1.3/.gitignore +60 -0
- cfxmark-0.1.3/CHANGELOG.md +211 -0
- cfxmark-0.1.3/LICENSE +21 -0
- cfxmark-0.1.3/PKG-INFO +353 -0
- cfxmark-0.1.3/README.md +315 -0
- cfxmark-0.1.3/docs/OPAQUE.md +214 -0
- cfxmark-0.1.3/docs/REQUIREMENTS.md +103 -0
- cfxmark-0.1.3/docs/SPEC.md +175 -0
- cfxmark-0.1.3/pyproject.toml +145 -0
- cfxmark-0.1.3/src/cfxmark/__init__.py +34 -0
- cfxmark-0.1.3/src/cfxmark/_version.py +3 -0
- cfxmark-0.1.3/src/cfxmark/api.py +190 -0
- cfxmark-0.1.3/src/cfxmark/assets.py +255 -0
- cfxmark-0.1.3/src/cfxmark/ast.py +354 -0
- cfxmark-0.1.3/src/cfxmark/exceptions.py +32 -0
- cfxmark-0.1.3/src/cfxmark/macros/__init__.py +17 -0
- cfxmark-0.1.3/src/cfxmark/macros/builtins/__init__.py +15 -0
- cfxmark-0.1.3/src/cfxmark/macros/builtins/admonition.py +76 -0
- cfxmark-0.1.3/src/cfxmark/macros/builtins/expand.py +63 -0
- cfxmark-0.1.3/src/cfxmark/macros/builtins/jira.py +59 -0
- cfxmark-0.1.3/src/cfxmark/macros/builtins/toc.py +50 -0
- cfxmark-0.1.3/src/cfxmark/macros/registry.py +159 -0
- cfxmark-0.1.3/src/cfxmark/normalize.py +1168 -0
- cfxmark-0.1.3/src/cfxmark/opaque.py +437 -0
- cfxmark-0.1.3/src/cfxmark/parsers/__init__.py +1 -0
- cfxmark-0.1.3/src/cfxmark/parsers/cfx.py +1001 -0
- cfxmark-0.1.3/src/cfxmark/parsers/md.py +811 -0
- cfxmark-0.1.3/src/cfxmark/py.typed +0 -0
- cfxmark-0.1.3/src/cfxmark/renderers/__init__.py +1 -0
- cfxmark-0.1.3/src/cfxmark/renderers/cfx.py +513 -0
- cfxmark-0.1.3/src/cfxmark/renderers/md.py +728 -0
- cfxmark-0.1.3/src/cfxmark/xml_ns.py +157 -0
- cfxmark-0.1.3/tests/conftest.py +14 -0
- cfxmark-0.1.3/tests/corpus/.gitkeep +0 -0
- cfxmark-0.1.3/tests/corpus/synthetic.cfx.example +2 -0
- cfxmark-0.1.3/tests/property/__init__.py +0 -0
- cfxmark-0.1.3/tests/property/test_round_trip.py +109 -0
- cfxmark-0.1.3/tests/test_corpus.py +60 -0
- cfxmark-0.1.3/tests/unit/__init__.py +0 -0
- cfxmark-0.1.3/tests/unit/test_assets.py +185 -0
- cfxmark-0.1.3/tests/unit/test_cfx_to_md.py +106 -0
- cfxmark-0.1.3/tests/unit/test_md_to_cfx.py +192 -0
- cfxmark-0.1.3/tests/unit/test_opaque.py +49 -0
- cfxmark-0.1.3/tests/unit/test_security.py +230 -0
cfxmark-0.1.3/.gitignore
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
# Byte-compiled / optimized / DLL files
|
|
2
|
+
__pycache__/
|
|
3
|
+
*.py[cod]
|
|
4
|
+
*$py.class
|
|
5
|
+
*.so
|
|
6
|
+
|
|
7
|
+
# Distribution / packaging
|
|
8
|
+
.Python
|
|
9
|
+
build/
|
|
10
|
+
dist/
|
|
11
|
+
wheels/
|
|
12
|
+
*.egg-info/
|
|
13
|
+
*.egg
|
|
14
|
+
|
|
15
|
+
# uv
|
|
16
|
+
.venv/
|
|
17
|
+
.python-version
|
|
18
|
+
|
|
19
|
+
# Test / coverage
|
|
20
|
+
.pytest_cache/
|
|
21
|
+
.coverage
|
|
22
|
+
.coverage.*
|
|
23
|
+
htmlcov/
|
|
24
|
+
.tox/
|
|
25
|
+
.nox/
|
|
26
|
+
.hypothesis/
|
|
27
|
+
|
|
28
|
+
# Type checking
|
|
29
|
+
.mypy_cache/
|
|
30
|
+
.dmypy.json
|
|
31
|
+
.pyre/
|
|
32
|
+
.pytype/
|
|
33
|
+
.ruff_cache/
|
|
34
|
+
|
|
35
|
+
# IDE
|
|
36
|
+
.idea/
|
|
37
|
+
.vscode/
|
|
38
|
+
*.swp
|
|
39
|
+
*.swo
|
|
40
|
+
*~
|
|
41
|
+
.DS_Store
|
|
42
|
+
|
|
43
|
+
# Local dev artifacts
|
|
44
|
+
.omc/
|
|
45
|
+
.omx/
|
|
46
|
+
/tmp/
|
|
47
|
+
*.log
|
|
48
|
+
|
|
49
|
+
# Secrets (never commit)
|
|
50
|
+
.secrets/
|
|
51
|
+
*.pem
|
|
52
|
+
*.key
|
|
53
|
+
|
|
54
|
+
# Real-world corpus — may contain internal company content (Jira keys,
|
|
55
|
+
# attachments, drawio diagrams). Tests skip if absent. Place private
|
|
56
|
+
# samples here on your dev machine; never commit them.
|
|
57
|
+
/tests/corpus/*.cfx
|
|
58
|
+
/tests/corpus/*.json
|
|
59
|
+
/tests/corpus/private/
|
|
60
|
+
/tests/to_md/
|
cfxmark-0.1.3/CHANGELOG.md
ADDED
|
@@ -0,0 +1,211 @@
|
|
|
1
|
+
# Changelog
|
|
2
|
+
|
|
3
|
+
All notable changes to **cfxmark** will be documented in this file.
|
|
4
|
+
|
|
5
|
+
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
|
6
|
+
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
|
+
|
|
8
|
+
## [0.1.3] — 2026-04-07
|
|
9
|
+
|
|
10
|
+
### Changed
|
|
11
|
+
|
|
12
|
+
- `render_cfx` now returns a third element — a ``warnings`` list —
|
|
13
|
+
alongside ``(xhtml, attachments)``. Any ``::: jira`` / ``::: toc``
|
|
14
|
+
(or other parameter-only directive) whose body is silently dropped
|
|
15
|
+
by its handler now surfaces a human-readable warning via
|
|
16
|
+
``ConversionResult.warnings`` so callers can correct their
|
|
17
|
+
Markdown instead of discovering the drop on Confluence.
|
|
18
|
+
|
|
19
|
+
## [0.1.2] — 2026-04-07
|
|
20
|
+
|
|
21
|
+
### Fixed
|
|
22
|
+
|
|
23
|
+
- Attachment enumeration now strips CDATA sections before scanning
|
|
24
|
+
for `<ri:attachment>`, so a Confluence ``code`` macro documenting
|
|
25
|
+
storage XML no longer leaks phantom filenames into
|
|
26
|
+
`result.attachments`. `resolve_assets(mode="sidecar")` applies the
|
|
27
|
+
same CDATA strip on its opaque-block fallback.
|
|
28
|
+
- `to_cfx` now emits only the **basename** of a local-image path in
|
|
29
|
+
`<ri:attachment ri:filename="...">` (Confluence stores attachments
|
|
30
|
+
in a flat per-page namespace). `result.attachments` still reports
|
|
31
|
+
the caller's original path — including any directory prefix — so
|
|
32
|
+
the caller knows where to read the bytes from on disk.
|
|
33
|
+
|
|
34
|
+
## [0.1.1] — 2026-04-07
|
|
35
|
+
|
|
36
|
+
### Fixed
|
|
37
|
+
|
|
38
|
+
- `ConversionResult.attachments` now enumerates **every**
|
|
39
|
+
`ri:attachment` reference in the output XHTML, including those
|
|
40
|
+
trapped inside Grade III opaque blocks (e.g. `<ac:image>` inside
|
|
41
|
+
`<pre><code>`). Previously only Grade I/II native `<ac:image>`
|
|
42
|
+
references were reported, so callers silently missed attachments
|
|
43
|
+
they needed to upload. `to_md` also populates `attachments` now
|
|
44
|
+
(previously always empty).
|
|
45
|
+
- `resolve_assets(mode="sidecar")` downloads opaque-block attachments
|
|
46
|
+
into `asset_dir` as a fallback, keeping the sidecar directory a
|
|
47
|
+
complete asset set regardless of how the image was preserved.
|
|
48
|
+
- `to_cfx` no longer crashes with `IndexError` when user-typed
|
|
49
|
+
Markdown contains a literal `` `CFXMARK_OPAQUE-N-CFXMARK` `` /
|
|
50
|
+
`` `CFXMARK_DIRECTIVE-N-CFXMARK` `` token whose index has no
|
|
51
|
+
matching capture — the region falls back to plain inline code.
|
|
52
|
+
|
|
53
|
+
### Docs
|
|
54
|
+
|
|
55
|
+
- README custom-macro example uses a valid `AdmonitionHandler` flavour
|
|
56
|
+
(`info`/`note`/`warning`/`tip`); the previous `"danger"` example
|
|
57
|
+
raised `ValueError`.
|
|
58
|
+
- README docs/SPEC/OPAQUE/LICENSE links rewritten as absolute GitHub
|
|
59
|
+
URLs so they resolve correctly when rendered on PyPI.
|
|
60
|
+
- `docs/SPEC.md` no longer claims `<ac:layout>` wrappers become opaque
|
|
61
|
+
blocks; cfxmark flattens them transparently.
|
|
62
|
+
|
|
63
|
+
### Packaging
|
|
64
|
+
|
|
65
|
+
- `pyproject.toml` switched to PEP 639 license metadata
|
|
66
|
+
(`license = "MIT"` + `license-files = ["LICENSE"]`); the redundant
|
|
67
|
+
`License :: OSI Approved :: MIT License` classifier was removed.
|
|
68
|
+
- Added `Documentation` and `Changelog` entries to `[project.urls]`
|
|
69
|
+
for the PyPI sidebar.
|
|
70
|
+
|
|
71
|
+
## [0.1.0] — 2026-04-07
|
|
72
|
+
|
|
73
|
+
### Added
|
|
74
|
+
|
|
75
|
+
#### Conversion API
|
|
76
|
+
|
|
77
|
+
- `cfxmark.to_cfx(markdown)` — Markdown → Confluence Storage Format XHTML.
|
|
78
|
+
- `cfxmark.to_md(xhtml)` — Confluence Storage Format XHTML → Markdown.
|
|
79
|
+
- Both return a `ConversionResult` carrying `xhtml` / `markdown`,
|
|
80
|
+
`attachments` (local file references for the caller to upload),
|
|
81
|
+
`warnings`, and the intermediate AST.
|
|
82
|
+
|
|
83
|
+
#### Native (grade I) constructs — lossless round-trip
|
|
84
|
+
|
|
85
|
+
- ATX headings `h1`–`h6`.
|
|
86
|
+
- Paragraphs, hard breaks, soft breaks, HTML entities.
|
|
87
|
+
- Inline emphasis: `**bold**`, `*italic*`, `` `code` ``, `~~strike~~`,
|
|
88
|
+
links, images.
|
|
89
|
+
- Lists: bullet, ordered, deeply nested, mixed paragraph + nested-list
|
|
90
|
+
list items.
|
|
91
|
+
- Block quotes, horizontal rules.
|
|
92
|
+
- Code fences with language tags (mapped to Confluence's `code` macro).
|
|
93
|
+
- GFM tables with **`colspan` / `rowspan`** support via the
|
|
94
|
+
MultiMarkdown `<` / `^` continuation cell convention. Multi-paragraph
|
|
95
|
+
cells flatten to inline content joined by `<br>` tags.
|
|
96
|
+
|
|
97
|
+
#### Directive (grade II) macros
|
|
98
|
+
|
|
99
|
+
Pluggable `MacroRegistry`. Default registry covers:
|
|
100
|
+
|
|
101
|
+
- `info`, `note`, `warning`, `tip` admonition panels.
|
|
102
|
+
- `jira` issue references (single + JQL query forms).
|
|
103
|
+
- `expand` collapsible sections.
|
|
104
|
+
- `toc` table of contents.
|
|
105
|
+
|
|
106
|
+
Each is rendered as a pandoc-style fenced div in Markdown:
|
|
107
|
+
|
|
108
|
+
```
|
|
109
|
+
::: info
|
|
110
|
+
body
|
|
111
|
+
:::
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
#### Opaque (grade III) preservation
|
|
115
|
+
|
|
116
|
+
The signature feature: any Confluence construct cfxmark does not know
|
|
117
|
+
how to represent in Markdown is preserved **byte-for-byte**, including
|
|
118
|
+
the `ac:macro-id` UUID that gives Confluence its macro identity.
|
|
119
|
+
|
|
120
|
+
- **Block opaque**: HTML-comment sentinel + `cfx-storage` fenced code
|
|
121
|
+
block. SHA-256 fingerprint in the sentinel ID prevents accidental
|
|
122
|
+
collision with user-typed content.
|
|
123
|
+
- **Inline opaque**: short `[label](cfx:op-XXXXXXXX)` Markdown link
|
|
124
|
+
with the XML payload stored in a `cfxmark:payloads` sidecar at the
|
|
125
|
+
bottom of the document. The label is auto-derived from the
|
|
126
|
+
underlying element type (`@user-…`, `jira:PROJ-1`, `cfx:status`, …).
|
|
127
|
+
- **Header notice**: a single-line `<!-- cfxmark:notice ... -->` HTML
|
|
128
|
+
comment is injected at the top of any document containing opaque or
|
|
129
|
+
directive constructs, telling humans and AI agents not to delete the
|
|
130
|
+
markers.
|
|
131
|
+
|
|
132
|
+
#### Image asset workflow
|
|
133
|
+
|
|
134
|
+
- `to_md` automatically tags every local-attachment image with a
|
|
135
|
+
`<!-- cfxmark:asset src="..." -->` metadata marker carrying the
|
|
136
|
+
original Confluence filename.
|
|
137
|
+
- New `cfxmark.resolve_assets(md, fetcher, mode="sidecar"|"inline")`
|
|
138
|
+
function reads the markers, calls a caller-provided `fetcher` to
|
|
139
|
+
download the bytes, and either saves them to a sidecar directory
|
|
140
|
+
(with relative path links) or embeds them as `data:` URIs.
|
|
141
|
+
- Markers are preserved across resolution so the round trip back to
|
|
142
|
+
CFX always recovers the original Confluence filename — even after
|
|
143
|
+
the visible link target has been rewritten to a sidecar path.
|
|
144
|
+
- Image dimensions encoded in the URL fragment as
|
|
145
|
+
`#cfxmark:w=300,h=200` for round-trip preservation.
|
|
146
|
+
|
|
147
|
+
#### Canonicalization (`canonicalize_cfx`)
|
|
148
|
+
|
|
149
|
+
A deep XML normalization pass that lets two semantically equivalent
|
|
150
|
+
Confluence storage fragments compare equal:
|
|
151
|
+
|
|
152
|
+
- Strips volatile attributes (`ac:macro-id`, `ac:local-id`,
|
|
153
|
+
`ri:version-at-save`, `ac:schema-version`, `ac:thumbnail`,
|
|
154
|
+
`ac:border`, `ac:align`).
|
|
155
|
+
- Strips Confluence-editor data attributes
|
|
156
|
+
(`data-uuid`, `data-highlight-colour`, …).
|
|
157
|
+
- Removes purely cosmetic CSS (default text colour, `text-align`,
|
|
158
|
+
`width` / `height` on table family elements, `font-weight`,
|
|
159
|
+
`padding`, `margin`, `list-style-type`, `vertical-align`, …).
|
|
160
|
+
- Removes Confluence-editor class names
|
|
161
|
+
(`wrapped`, `fixed-width`, `auto-cursor-target`, `code-line`,
|
|
162
|
+
`has-list-bullet`, `internal-link`, `confluenceTd`, …).
|
|
163
|
+
- Unwraps decorative `<span>` and structural `<div>` wrappers
|
|
164
|
+
(including the `content-wrapper` div Confluence emits inside table
|
|
165
|
+
cells).
|
|
166
|
+
- Promotes header rows to `<thead>`, splits `<h*>` containing `<br/>`,
|
|
167
|
+
flattens singleton paragraphs inside `<li>`, drops empty paragraphs
|
|
168
|
+
and trailing breaks, normalizes `<pre><code>` to the structured
|
|
169
|
+
`code` macro form, removes cosmetic code parameters
|
|
170
|
+
(`linenumbers`, `theme`, `firstline`, …).
|
|
171
|
+
|
|
172
|
+
#### Security hardening
|
|
173
|
+
|
|
174
|
+
- Rejects any input containing `<!DOCTYPE>` or `<!ENTITY>` to block
|
|
175
|
+
XXE and billion-laughs attacks.
|
|
176
|
+
- lxml parser configured with `no_network=True`, `load_dtd=False`,
|
|
177
|
+
`huge_tree=False`.
|
|
178
|
+
- Opaque sentinels carry a SHA-256 fingerprint of their body — a user
|
|
179
|
+
who types the literal sentinel sequence in their Markdown is **not**
|
|
180
|
+
silently turned into an opaque block; the verification fails and
|
|
181
|
+
the region falls back to plain text.
|
|
182
|
+
|
|
183
|
+
#### Tooling
|
|
184
|
+
|
|
185
|
+
- `py.typed` marker for PEP 561 consumers.
|
|
186
|
+
- `pyproject.toml` configured for `uv` and `hatchling`.
|
|
187
|
+
- mypy clean (non-strict for v0.1; strict planned for v0.2).
|
|
188
|
+
- ruff clean.
|
|
189
|
+
- 65 tests:
|
|
190
|
+
- 39 unit tests (per-construct + edge cases)
|
|
191
|
+
- 7 image asset tests
|
|
192
|
+
- 8 security regression tests
|
|
193
|
+
- 1 corpus golden-file test (skipped if no private corpus available)
|
|
194
|
+
- 1 Hypothesis property-based round-trip test (100 random documents)
|
|
195
|
+
- Verified against 9 production Confluence pages totalling ~290 KB
|
|
196
|
+
of XHTML — all round-trip with byte-identical canonical form.
|
|
197
|
+
|
|
198
|
+
### Known limitations
|
|
199
|
+
|
|
200
|
+
- **HTML comments in Markdown** are dropped with a warning, with one
|
|
201
|
+
exception: cfxmark's own opaque / asset / header markers are
|
|
202
|
+
preserved. Confluence does not preserve HTML comments either, so
|
|
203
|
+
this matches Confluence's own behaviour.
|
|
204
|
+
- **`drawio`, `plantuml`** and other rich diagram macros are
|
|
205
|
+
passed through as opaque blocks (preserved losslessly but not
|
|
206
|
+
rendered in Markdown).
|
|
207
|
+
- **`MacroHandler` protocol leaks lxml**. Custom macro handlers
|
|
208
|
+
currently receive and return `lxml.etree._Element` objects. A thin
|
|
209
|
+
adapter is planned for v0.2.
|
|
210
|
+
- **`<th scope="...">`, `<td title="...">`** attributes are stripped
|
|
211
|
+
during canonicalization since Markdown cannot preserve them.
|
cfxmark-0.1.3/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Eunsan Jo
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
cfxmark-0.1.3/PKG-INFO
ADDED
|
@@ -0,0 +1,353 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: cfxmark
|
|
3
|
+
Version: 0.1.3
|
|
4
|
+
Summary: Bidirectional Markdown <-> Confluence Storage XHTML converter with lossless opaque preservation.
|
|
5
|
+
Project-URL: Homepage, https://github.com/eunsanMountain/cfxmark
|
|
6
|
+
Project-URL: Repository, https://github.com/eunsanMountain/cfxmark
|
|
7
|
+
Project-URL: Issues, https://github.com/eunsanMountain/cfxmark/issues
|
|
8
|
+
Project-URL: Documentation, https://github.com/eunsanMountain/cfxmark/blob/main/docs/SPEC.md
|
|
9
|
+
Project-URL: Changelog, https://github.com/eunsanMountain/cfxmark/blob/main/CHANGELOG.md
|
|
10
|
+
Author: Eunsan Jo
|
|
11
|
+
License-Expression: MIT
|
|
12
|
+
License-File: LICENSE
|
|
13
|
+
Keywords: bidirectional,confluence,converter,markdown,round-trip,storage-format,xhtml
|
|
14
|
+
Classifier: Development Status :: 3 - Alpha
|
|
15
|
+
Classifier: Intended Audience :: Developers
|
|
16
|
+
Classifier: Operating System :: OS Independent
|
|
17
|
+
Classifier: Programming Language :: Python :: 3
|
|
18
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
23
|
+
Classifier: Topic :: Text Processing :: Markup
|
|
24
|
+
Classifier: Topic :: Text Processing :: Markup :: Markdown
|
|
25
|
+
Classifier: Topic :: Text Processing :: Markup :: XML
|
|
26
|
+
Classifier: Typing :: Typed
|
|
27
|
+
Requires-Python: >=3.10
|
|
28
|
+
Requires-Dist: lxml>=5.0
|
|
29
|
+
Requires-Dist: mistletoe>=1.3
|
|
30
|
+
Provides-Extra: dev
|
|
31
|
+
Requires-Dist: hypothesis>=6.100; extra == 'dev'
|
|
32
|
+
Requires-Dist: lxml-stubs>=0.5; extra == 'dev'
|
|
33
|
+
Requires-Dist: mypy>=1.10; extra == 'dev'
|
|
34
|
+
Requires-Dist: pytest-cov>=5.0; extra == 'dev'
|
|
35
|
+
Requires-Dist: pytest>=8.0; extra == 'dev'
|
|
36
|
+
Requires-Dist: ruff>=0.5; extra == 'dev'
|
|
37
|
+
Description-Content-Type: text/markdown
|
|
38
|
+
|
|
39
|
+
# cfxmark
|
|
40
|
+
|
|
41
|
+
**Bidirectional Markdown ↔ Confluence Storage XHTML converter** —
|
|
42
|
+
with lossless opaque preservation for everything cfxmark doesn't
|
|
43
|
+
explicitly know how to convert.
|
|
44
|
+
|
|
45
|
+
```python
|
|
46
|
+
import cfxmark
|
|
47
|
+
|
|
48
|
+
# Markdown → Confluence storage XHTML
|
|
49
|
+
result = cfxmark.to_cfx(markdown_text)
|
|
50
|
+
result.xhtml # str — ready for Confluence REST PUT
|
|
51
|
+
result.attachments # tuple — local file refs the caller should upload
|
|
52
|
+
result.warnings # tuple — human-readable conversion warnings
|
|
53
|
+
|
|
54
|
+
# Confluence storage XHTML → Markdown
|
|
55
|
+
result = cfxmark.to_md(xhtml_text)
|
|
56
|
+
result.markdown # str — canonical markdown
|
|
57
|
+
result.warnings # tuple
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
`ConversionResult` is the same dataclass for both directions —
|
|
61
|
+
`xhtml` is populated for `to_cfx`, `markdown` for `to_md`.
|
|
62
|
+
|
|
63
|
+
## Why another converter?
|
|
64
|
+
|
|
65
|
+
Two existing projects inspired this one — [`md2cf`][md2cf] and
|
|
66
|
+
[`md2conf`][md2conf] — but both are **one-directional** (md → cf) and
|
|
67
|
+
neither preserves unknown macros across a round trip. `cfxmark` fills
|
|
68
|
+
both gaps:
|
|
69
|
+
|
|
70
|
+
1. **Bidirectional.** `to_md(to_cfx(m))` is byte-identical to
|
|
71
|
+
`canonicalize(m)` for every construct in the supported subset.
|
|
72
|
+
2. **Opaque preservation.** Confluence content cfxmark doesn't
|
|
73
|
+
understand (custom plugins, drawio diagrams, exotic table cells)
|
|
74
|
+
round-trips byte-for-byte, **including the `ac:macro-id` UUID**.
|
|
75
|
+
Confluence treats the round-tripped macro as the same instance, so
|
|
76
|
+
comments, attachments, and permissions stay attached.
|
|
77
|
+
3. **Pure text-in / text-out.** No Confluence API, no network, no
|
|
78
|
+
attachment upload. The caller owns REST I/O. (See "Image assets"
|
|
79
|
+
below for the helper function that lets the caller plug in
|
|
80
|
+
network-bound logic without bloating cfxmark.)
|
|
81
|
+
|
|
82
|
+
[md2cf]: https://github.com/iamjackg/md2cf
|
|
83
|
+
[md2conf]: https://github.com/hunyadi/md2conf
|
|
84
|
+
|
|
85
|
+
## Install
|
|
86
|
+
|
|
87
|
+
```bash
|
|
88
|
+
# With uv (recommended):
|
|
89
|
+
uv add cfxmark
|
|
90
|
+
|
|
91
|
+
# With pip:
|
|
92
|
+
pip install cfxmark
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
cfxmark depends on `lxml` and `mistletoe`. Python 3.10+.
|
|
96
|
+
|
|
97
|
+
## The contract
|
|
98
|
+
|
|
99
|
+
cfxmark grades every Confluence construct into one of three buckets:
|
|
100
|
+
|
|
101
|
+
| Grade | Description | Behaviour |
|
|
102
|
+
|---|---|---|
|
|
103
|
+
| **I — Native** | Standard CommonMark / GFM (headings, lists, tables, code fences, links, images, blockquote, hr, inline emphasis) | Lossless round-trip after canonicalization. |
|
|
104
|
+
| **II — Directive** | Confluence macros with a known Markdown directive mapping (`info`, `note`, `warning`, `tip`, `jira`, `expand`, `toc`) | Lossless after canonicalization. Pluggable via `MacroRegistry`. |
|
|
105
|
+
| **III — Opaque** | Everything else | Captured byte-for-byte through cfxmark's opaque-block / inline-opaque mechanism. **Never dropped, never rewritten.** |
|
|
106
|
+
|
|
107
|
+
See [`docs/SPEC.md`](https://github.com/eunsanMountain/cfxmark/blob/main/docs/SPEC.md)
|
|
108
|
+
for the full mapping table and
|
|
109
|
+
[`docs/OPAQUE.md`](https://github.com/eunsanMountain/cfxmark/blob/main/docs/OPAQUE.md)
|
|
110
|
+
for the opaque-block format.
|
|
111
|
+
|
|
112
|
+
## Usage
|
|
113
|
+
|
|
114
|
+
### Round-trip a Confluence page through Markdown
|
|
115
|
+
|
|
116
|
+
```python
|
|
117
|
+
import cfxmark
|
|
118
|
+
|
|
119
|
+
# Whatever fetched the page (REST API call, exported XML file, …)
|
|
120
|
+
xhtml = my_confluence_client.get_storage_format(page_id)
|
|
121
|
+
|
|
122
|
+
# Convert to Markdown
|
|
123
|
+
md_result = cfxmark.to_md(xhtml)
|
|
124
|
+
markdown = md_result.markdown
|
|
125
|
+
|
|
126
|
+
# … user edits the Markdown …
|
|
127
|
+
|
|
128
|
+
# Convert back to Confluence storage XHTML
|
|
129
|
+
cfx_result = cfxmark.to_cfx(markdown)
|
|
130
|
+
my_confluence_client.update_page(page_id, cfx_result.xhtml)
|
|
131
|
+
|
|
132
|
+
# Optionally upload any newly referenced local images
|
|
133
|
+
for filename in cfx_result.attachments:
|
|
134
|
+
my_confluence_client.upload_attachment(page_id, filename)
|
|
135
|
+
```
|
|
136
|
+
|
|
137
|
+
### Image assets
|
|
138
|
+
|
|
139
|
+
When you convert a Confluence page that references uploaded
|
|
140
|
+
attachments, the resulting Markdown looks like this:
|
|
141
|
+
|
|
142
|
+
```markdown
|
|
143
|
+
![image-3.png](image-3.png)<!-- cfxmark:asset src="image-3.png" -->
|
|
144
|
+
```
|
|
145
|
+
|
|
146
|
+
The image link still points at the original Confluence filename
|
|
147
|
+
(broken in any local Markdown viewer until you fetch the bytes), and
|
|
148
|
+
the `<!-- cfxmark:asset -->` HTML comment carries enough metadata for
|
|
149
|
+
a follow-up step to fetch and embed.
|
|
150
|
+
|
|
151
|
+
`cfxmark.resolve_assets` is that follow-up step. You provide a
|
|
152
|
+
fetcher callback that returns bytes for one filename at a time, and
|
|
153
|
+
choose between two output strategies:
|
|
154
|
+
|
|
155
|
+
```python
|
|
156
|
+
import cfxmark
|
|
157
|
+
from pathlib import Path
|
|
158
|
+
|
|
159
|
+
def fetcher(filename: str) -> bytes:
|
|
160
|
+
# Whatever you use to download from Confluence:
|
|
161
|
+
return my_confluence_client.download_attachment(page_id, filename)
|
|
162
|
+
|
|
163
|
+
# Strategy A — sidecar directory (recommended for git-tracked docs).
|
|
164
|
+
# Saves bytes to ./assets/ and rewrites links to relative paths.
|
|
165
|
+
md = cfxmark.resolve_assets(
|
|
166
|
+
md_result.markdown,
|
|
167
|
+
fetcher,
|
|
168
|
+
mode="sidecar",
|
|
169
|
+
asset_dir="docs/page-42/assets",
|
|
170
|
+
md_path="docs/page-42.md",
|
|
171
|
+
)
|
|
172
|
+
Path("docs/page-42.md").write_text(md)
|
|
173
|
+
# docs/page-42/assets/image-3.png exists
|
|
174
|
+
# md link: ![image-3.png](page-42/assets/image-3.png)<!-- cfxmark:asset src="image-3.png" -->
|
|
175
|
+
|
|
176
|
+
# Strategy B — inline data URIs (single self-contained file).
|
|
177
|
+
md = cfxmark.resolve_assets(md_result.markdown, fetcher, mode="inline")
|
|
178
|
+
# md link: ![image-3.png](data:image/png;base64,...)<!-- cfxmark:asset src="image-3.png" -->
|
|
179
|
+
```
|
|
180
|
+
|
|
181
|
+
The asset markers are **preserved** through both strategies, so
|
|
182
|
+
`resolve_assets` is idempotent and a subsequent `to_cfx` call always
|
|
183
|
+
recovers the original Confluence filename — even if the visible link
|
|
184
|
+
target has been rewritten to a sidecar path or a data URI.
|
|
185
|
+
|
|
186
|
+
### Mermaid diagrams
|
|
187
|
+
|
|
188
|
+
cfxmark maps Markdown's `` ```mermaid `` fenced code block to
|
|
189
|
+
Confluence's `code` macro with `language=mermaid`. If your Confluence
|
|
190
|
+
instance has a Mermaid plugin installed (e.g. *Mermaid Diagrams for
|
|
191
|
+
Confluence*) it will render the diagram automatically; otherwise the
|
|
192
|
+
content is shown as a syntax-highlighted code block.
|
|
193
|
+
|
|
194
|
+
````markdown
|
|
195
|
+
```mermaid
|
|
196
|
+
graph LR
|
|
197
|
+
A --> B --> C
|
|
198
|
+
```
|
|
199
|
+
````
|
|
200
|
+
|
|
201
|
+
### Inline opaque references
|
|
202
|
+
|
|
203
|
+
Inline elements that have no native Markdown form — Confluence user
|
|
204
|
+
mentions, inline Jira issue macros, custom widget invocations, … —
|
|
205
|
+
become a short Markdown link with a `cfx:op-...` URL:
|
|
206
|
+
|
|
207
|
+
```markdown
|
|
208
|
+
Contact the purchaser ([@user-2c9402cc](cfx:op-4fab0f8d))
|
|
209
|
+
```
|
|
210
|
+
|
|
211
|
+
The `[label]` is auto-derived from the underlying element type
|
|
212
|
+
(`@user-…`, `jira:PROJ-1`, `cfx:status`, …) and the `op-XXXXXXXX` ID
|
|
213
|
+
is a prefix of the SHA-256 digest of the original XML payload. The full XML lives
|
|
214
|
+
in a `cfxmark:payloads` sidecar at the bottom of the same Markdown
|
|
215
|
+
file:
|
|
216
|
+
|
|
217
|
+
```markdown
|
|
218
|
+
<!-- cfxmark:payloads -->
|
|
219
|
+
<!-- op-4fab0f8d
|
|
220
|
+
<ac:link><ri:user ri:userkey="2c9402cc83d4bcc40183d976ef730001"/></ac:link>
|
|
221
|
+
-->
|
|
222
|
+
<!-- /cfxmark:payloads -->
|
|
223
|
+
```
|
|
224
|
+
|
|
225
|
+
The SHA-256 fingerprint means a user who **types** that exact link
|
|
226
|
+
syntax in their own Markdown is not silently re-interpreted as an
|
|
227
|
+
opaque payload — the verification fails and the region falls back to
|
|
228
|
+
ordinary text.
|
|
229
|
+
|
|
230
|
+
### Block-level opaque blocks
|
|
231
|
+
|
|
232
|
+
Block-level Confluence content cfxmark doesn't know how to convert
|
|
233
|
+
(e.g. drawio diagrams, plantuml, complex tables) is wrapped in a
|
|
234
|
+
fenced code block with sentinel comments:
|
|
235
|
+
|
|
236
|
+
````markdown
|
|
237
|
+
<!-- cfxmark:opaque id="op-1188e2b4" -->
|
|
238
|
+
```cfx-storage
|
|
239
|
+
<ac:structured-macro ac:name="drawio" ac:macro-id="...">
|
|
240
|
+
<ac:parameter ac:name="diagramName">flow</ac:parameter>
|
|
241
|
+
...
|
|
242
|
+
</ac:structured-macro>
|
|
243
|
+
```
|
|
244
|
+
<!-- /cfxmark:opaque -->
|
|
245
|
+
````
|
|
246
|
+
|
|
247
|
+
Editors render this as a clearly visible code block — a "do not
|
|
248
|
+
touch" signal for human readers. The Markdown parser detects the
|
|
249
|
+
sentinels first and round-trips the contents byte-for-byte, including
|
|
250
|
+
the original `ac:macro-id` UUID that Confluence uses to identify
|
|
251
|
+
macro instances.
|
|
252
|
+
|
|
253
|
+
### Header notice
|
|
254
|
+
|
|
255
|
+
When a converted Markdown document contains any opaque or directive
|
|
256
|
+
markers, cfxmark prepends a single-line HTML comment explaining the
|
|
257
|
+
conventions to humans and AI agents (wrapped here for readability):
|
|
258
|
+
|
|
259
|
+
```markdown
|
|
260
|
+
<!-- cfxmark:notice Converted from Confluence storage format. Inline
|
|
261
|
+
[label](cfx:op-XXXXXXXX) references preserve Confluence content that
|
|
262
|
+
has no native Markdown form; the raw XML for each lives in the
|
|
263
|
+
cfxmark:payloads sidecar at the bottom of this file. Do not edit
|
|
264
|
+
those references or the sidecar — tampering invalidates a SHA-256
|
|
265
|
+
fingerprint and the round trip falls back to plain text. -->
|
|
266
|
+
```
|
|
267
|
+
|
|
268
|
+
The comment is invisible in any Markdown viewer.
|
|
269
|
+
|
|
270
|
+
### Custom macros
|
|
271
|
+
|
|
272
|
+
Promote a Confluence macro from "opaque" to "directive" by registering
|
|
273
|
+
a custom handler:
|
|
274
|
+
|
|
275
|
+
```python
|
|
276
|
+
import cfxmark
|
|
277
|
+
from cfxmark.macros import MacroRegistry
|
|
278
|
+
from cfxmark.macros.builtins import AdmonitionHandler
|
|
279
|
+
|
|
280
|
+
# Start from the default registry and add your own.
|
|
281
|
+
my_registry = cfxmark.default_registry.copy()
|
|
282
|
+
# Built-in AdmonitionHandler accepts one of: "info", "note", "warning", "tip".
|
|
283
|
+
# To promote a previously-opaque macro, write a small MacroHandler subclass —
|
|
284
|
+
# see cfxmark/macros/builtins/admonition.py for a complete example.
|
|
285
|
+
my_registry.register(AdmonitionHandler("warning"))
|
|
286
|
+
|
|
287
|
+
result = cfxmark.to_md(xhtml, macros=my_registry)
|
|
288
|
+
```
|
|
289
|
+
|
|
290
|
+
Implementing a `MacroHandler` from scratch requires a small amount
|
|
291
|
+
of lxml knowledge — see `cfxmark/macros/builtins/admonition.py` for
|
|
292
|
+
a complete example. A higher-level handler API that hides lxml is
|
|
293
|
+
planned for v0.2.
|
|
294
|
+
|
|
295
|
+
### Canonicalization helpers
|
|
296
|
+
|
|
297
|
+
Two Confluence storage fragments are "the same" only after a deep
|
|
298
|
+
normalization pass that strips volatile attributes, editor noise,
|
|
299
|
+
and rendering hints. Use `canonicalize_cfx` to compare two snapshots:
|
|
300
|
+
|
|
301
|
+
```python
|
|
302
|
+
import cfxmark
|
|
303
|
+
|
|
304
|
+
c1 = cfxmark.canonicalize_cfx(original_xhtml)
|
|
305
|
+
c2 = cfxmark.canonicalize_cfx(round_tripped_xhtml)
|
|
306
|
+
assert c1 == c2 # passes for any document in the supported subset
|
|
307
|
+
```
|
|
308
|
+
|
|
309
|
+
`canonicalize_cfx` is the same function the test suite uses to
|
|
310
|
+
verify byte-identical round trips against real Confluence pages.
|
|
311
|
+
|
|
312
|
+
## Security
|
|
313
|
+
|
|
314
|
+
cfxmark hardens its XML parser against XXE and billion-laughs attacks:
|
|
315
|
+
|
|
316
|
+
- Inputs containing `<!DOCTYPE>` or `<!ENTITY>` declarations are
|
|
317
|
+
rejected before lxml ever sees them.
|
|
318
|
+
- The lxml parser is configured with `no_network=True`,
|
|
319
|
+
`load_dtd=False`, and `huge_tree=False`.
|
|
320
|
+
- Opaque-block sentinels are SHA-256 verified — accidental sentinel
|
|
321
|
+
syntax in user-typed Markdown does **not** become a real opaque
|
|
322
|
+
block.
|
|
323
|
+
|
|
324
|
+
If you find a security issue, please open a GitHub issue.
|
|
325
|
+
|
|
326
|
+
## Development
|
|
327
|
+
|
|
328
|
+
```bash
|
|
329
|
+
git clone https://github.com/eunsanMountain/cfxmark
|
|
330
|
+
cd cfxmark
|
|
331
|
+
uv sync --all-extras
|
|
332
|
+
|
|
333
|
+
# Run all tests
|
|
334
|
+
uv run pytest
|
|
335
|
+
|
|
336
|
+
# Type-check
|
|
337
|
+
uv run mypy src/
|
|
338
|
+
|
|
339
|
+
# Lint
|
|
340
|
+
uv run ruff check .
|
|
341
|
+
|
|
342
|
+
# Build
|
|
343
|
+
uv build
|
|
344
|
+
```
|
|
345
|
+
|
|
346
|
+
The corpus tests look for `.cfx` files in `tests/corpus/` (gitignored
|
|
347
|
+
to keep your own private samples out of version control). Drop your
|
|
348
|
+
own Confluence storage XHTML files there and they will be exercised by
|
|
349
|
+
`pytest tests/test_corpus.py`.
|
|
350
|
+
|
|
351
|
+
## License
|
|
352
|
+
|
|
353
|
+
MIT. See [`LICENSE`](https://github.com/eunsanMountain/cfxmark/blob/main/LICENSE).
|