athena-python-docx 0.2.1__tar.gz → 0.2.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/PKG-INFO +1 -1
- {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/docx/__init__.py +11 -1
- {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/docx/document.py +31 -6
- {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/docx/enum/text.py +18 -1
- {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/docx/table.py +76 -98
- {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/docx/text/paragraph.py +23 -14
- {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/pyproject.toml +1 -1
- athena_python_docx-0.2.2/tests/fidelity/METHODOLOGY.md +107 -0
- {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/tests/fidelity/cases.py +2 -3
- {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/tests/fidelity/fake_session.py +32 -0
- athena_python_docx-0.2.2/tests/fidelity/ours_spec.json +3419 -0
- athena_python_docx-0.2.2/tests/fidelity/parity_crawl.py +267 -0
- athena_python_docx-0.2.2/tests/fidelity/parity_diff.json +506 -0
- athena_python_docx-0.2.2/tests/fidelity/round_trip_tests.py +214 -0
- athena_python_docx-0.2.2/tests/fidelity/stock_spec.json +10189 -0
- athena_python_docx-0.2.2/uv.lock +525 -0
- {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/.gitignore +0 -0
- {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/CLAUDE.md +0 -0
- {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/README.md +0 -0
- {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/docx/_batching.py +0 -0
- {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/docx/api.py +0 -0
- {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/docx/client.py +0 -0
- {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/docx/enum/__init__.py +0 -0
- {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/docx/enum/section.py +0 -0
- {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/docx/enum/style.py +0 -0
- {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/docx/enum/table.py +0 -0
- {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/docx/errors.py +0 -0
- {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/docx/opc/__init__.py +0 -0
- {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/docx/opc/coreprops.py +0 -0
- {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/docx/section.py +0 -0
- {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/docx/settings.py +0 -0
- {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/docx/shape.py +0 -0
- {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/docx/shared.py +0 -0
- {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/docx/styles/__init__.py +0 -0
- {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/docx/styles/style.py +0 -0
- {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/docx/styles/styles.py +0 -0
- {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/docx/text/__init__.py +0 -0
- {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/docx/text/hyperlink.py +0 -0
- {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/docx/text/parfmt.py +0 -0
- {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/docx/text/run.py +0 -0
- {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/docx/typing.py +0 -0
- {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/scripts/publish.sh +0 -0
- {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/tests/__init__.py +0 -0
- {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/tests/conftest.py +0 -0
- {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/tests/fidelity/README.md +0 -0
- {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/tests/fidelity/__init__.py +0 -0
- {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/tests/fidelity/binary_round_trip.py +0 -0
- {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/tests/fidelity/complex_cases.py +0 -0
- {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/tests/fidelity/extract.py +0 -0
- {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/tests/fidelity/extreme_cases.py +0 -0
- {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/tests/fidelity/local_runner.py +0 -0
- {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/tests/fidelity/mega_cases.py +0 -0
- {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/tests/fidelity/real_world_cases.py +0 -0
- {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/tests/fidelity/runner.py +0 -0
- {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/tests/test_commands.py +0 -0
- {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/tests/test_python_docx_api_parity.py +0 -0
- {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/tests/test_smoke_integration.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: athena-python-docx
|
|
3
|
-
Version: 0.2.1
|
|
3
|
+
Version: 0.2.2
|
|
4
4
|
Summary: Drop-in replacement for python-docx that connects to Athena's Superdoc/Keryx collaborative document stack
|
|
5
5
|
Project-URL: Homepage, https://athenaintelligence.ai
|
|
6
6
|
Author-email: Athena Intelligence <engineering@athenaintelligence.ai>
|
|
@@ -6,11 +6,21 @@ See CLAUDE.md for the API parity contract.
|
|
|
6
6
|
|
|
7
7
|
from __future__ import annotations
|
|
8
8
|
|
|
9
|
-
__version__ = "0.2.
|
|
9
|
+
__version__ = "0.2.2"
|
|
10
10
|
|
|
11
11
|
from docx.api import Document
|
|
12
|
+
# Re-exports python-docx ships at docx top-level for convenience.
|
|
13
|
+
from docx.shared import Emu, Inches, Pt, Cm, Mm, Twips, Length, RGBColor
|
|
12
14
|
|
|
13
15
|
__all__ = [
|
|
14
16
|
"Document",
|
|
17
|
+
"Emu",
|
|
18
|
+
"Inches",
|
|
19
|
+
"Pt",
|
|
20
|
+
"Cm",
|
|
21
|
+
"Mm",
|
|
22
|
+
"Twips",
|
|
23
|
+
"Length",
|
|
24
|
+
"RGBColor",
|
|
15
25
|
"__version__",
|
|
16
26
|
]
|
|
@@ -22,9 +22,15 @@ from typing import TYPE_CHECKING, BinaryIO
|
|
|
22
22
|
from docx._batching import run_sync
|
|
23
23
|
from docx.client import Session
|
|
24
24
|
from docx.errors import DocumentClosedError, ValidationError
|
|
25
|
+
# python-docx re-exports a subset of symbols at docx.document; mirror those
|
|
26
|
+
# so `from docx.document import Emu` etc. works.
|
|
27
|
+
from docx.enum.section import WD_SECTION, WD_SECTION_START # noqa: F401
|
|
28
|
+
from docx.enum.text import WD_BREAK # noqa: F401
|
|
29
|
+
from docx.section import Section, Sections # noqa: F401
|
|
30
|
+
from docx.shared import Cm, Emu, Inches, Length, Mm, Pt, RGBColor, Twips # noqa: F401
|
|
31
|
+
from docx.text.run import Run # noqa: F401
|
|
25
32
|
|
|
26
33
|
if TYPE_CHECKING:
|
|
27
|
-
from docx.shared import Emu
|
|
28
34
|
from docx.table import Table
|
|
29
35
|
from docx.text.paragraph import Paragraph
|
|
30
36
|
|
|
@@ -80,7 +86,15 @@ class Document:
|
|
|
80
86
|
continue
|
|
81
87
|
node_id: str = str(b.get("nodeId", ""))
|
|
82
88
|
if node_id:
|
|
83
|
-
|
|
89
|
+
nt_raw = b.get("nodeType")
|
|
90
|
+
nt: str = nt_raw if isinstance(nt_raw, str) and nt_raw else "paragraph"
|
|
91
|
+
out.append(
|
|
92
|
+
Paragraph(
|
|
93
|
+
session=self._session,
|
|
94
|
+
node_id=node_id,
|
|
95
|
+
node_type=nt,
|
|
96
|
+
),
|
|
97
|
+
)
|
|
84
98
|
return out
|
|
85
99
|
|
|
86
100
|
@property
|
|
@@ -204,7 +218,9 @@ class Document:
|
|
|
204
218
|
raise RuntimeError(
|
|
205
219
|
f"Superdoc did not return a nodeId for add_paragraph: {result!r}",
|
|
206
220
|
)
|
|
207
|
-
return Paragraph(
|
|
221
|
+
return Paragraph(
|
|
222
|
+
session=self._session, node_id=node_id, node_type="paragraph",
|
|
223
|
+
)
|
|
208
224
|
|
|
209
225
|
def add_heading(
|
|
210
226
|
self,
|
|
@@ -239,7 +255,11 @@ class Document:
|
|
|
239
255
|
raise RuntimeError(
|
|
240
256
|
f"Superdoc did not return a nodeId for add_heading(level=0): {result!r}",
|
|
241
257
|
)
|
|
242
|
-
paragraph = Paragraph(
|
|
258
|
+
paragraph = Paragraph(
|
|
259
|
+
session=self._session,
|
|
260
|
+
node_id=node_id,
|
|
261
|
+
node_type="paragraph",
|
|
262
|
+
)
|
|
243
263
|
paragraph.style = "Title"
|
|
244
264
|
return paragraph
|
|
245
265
|
|
|
@@ -251,12 +271,17 @@ class Document:
|
|
|
251
271
|
result = run_sync(
|
|
252
272
|
self._session.doc.create.heading(params),
|
|
253
273
|
)
|
|
254
|
-
|
|
274
|
+
# Bug fix: was passing expected_type="paragraph" here (wrong); the
|
|
275
|
+
# fallback loop recovered, but the intent of the code was wrong. Fixed to
|
|
276
|
+
# expected_type="heading" so we extract from the correct response key.
|
|
277
|
+
node_id = _extract_inserted_node_id(result, expected_type="heading")
|
|
255
278
|
if not node_id:
|
|
256
279
|
raise RuntimeError(
|
|
257
280
|
f"Superdoc did not return a nodeId for add_heading: {result!r}",
|
|
258
281
|
)
|
|
259
|
-
return Paragraph(
|
|
282
|
+
return Paragraph(
|
|
283
|
+
session=self._session, node_id=node_id, node_type="heading",
|
|
284
|
+
)
|
|
260
285
|
|
|
261
286
|
def add_table(
|
|
262
287
|
self,
|
|
@@ -113,6 +113,15 @@ class WD_BREAK(Enum):
|
|
|
113
113
|
LINE_CLEAR_RIGHT = "lineClearRight"
|
|
114
114
|
LINE_CLEAR_ALL = "lineClearAll"
|
|
115
115
|
TEXT_WRAPPING = "textWrapping"
|
|
116
|
+
# python-docx 1.x also exposes section breaks via WD_BREAK
|
|
117
|
+
SECTION_CONTINUOUS = "sectionContinuous"
|
|
118
|
+
SECTION_EVEN_PAGE = "sectionEvenPage"
|
|
119
|
+
SECTION_NEXT_PAGE = "sectionNextPage"
|
|
120
|
+
SECTION_ODD_PAGE = "sectionOddPage"
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
# python-docx internal alias
|
|
124
|
+
WD_BREAK_TYPE = WD_BREAK
|
|
116
125
|
|
|
117
126
|
|
|
118
127
|
class WD_UNDERLINE(Enum):
|
|
@@ -137,6 +146,7 @@ class WD_UNDERLINE(Enum):
|
|
|
137
146
|
|
|
138
147
|
|
|
139
148
|
class WD_COLOR_INDEX(Enum):
|
|
149
|
+
INHERITED = "inherit"
|
|
140
150
|
AUTO = "default"
|
|
141
151
|
BLACK = "black"
|
|
142
152
|
BLUE = "blue"
|
|
@@ -156,5 +166,12 @@ class WD_COLOR_INDEX(Enum):
|
|
|
156
166
|
YELLOW = "yellow"
|
|
157
167
|
|
|
158
168
|
|
|
159
|
-
#
|
|
169
|
+
# Aliases used by python-docx as well
|
|
160
170
|
WD_COLOR = WD_COLOR_INDEX
|
|
171
|
+
WD_PARAGRAPH_ALIGNMENT = WD_ALIGN_PARAGRAPH
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
# python-docx 1.x base class that WD_* enums inherit from — we don't need
|
|
175
|
+
# the real base, just a name users can subclass-check against.
|
|
176
|
+
class BaseXmlEnum(Enum):
|
|
177
|
+
pass
|
|
@@ -60,7 +60,11 @@ def _find_first_paragraph_id(obj: object) -> str:
|
|
|
60
60
|
def _collect_paragraph_ids(obj: object, out: list[str]) -> None:
|
|
61
61
|
"""Walk a node tree and collect all paragraph/heading nodeIds in order.
|
|
62
62
|
|
|
63
|
-
Tolerates
|
|
63
|
+
Tolerates multiple shapes Superdoc emits:
|
|
64
|
+
- cell getNodeById: {"node": {"kind": "paragraph", "id": "UUID",
|
|
65
|
+
"paragraph": {"inlines": [...]}}}
|
|
66
|
+
(the cell's inner paragraph — server reports `id` as a bare UUID
|
|
67
|
+
that the addressing layer expects as `paragraph:UUID`)
|
|
64
68
|
- prosemirror-style: {"type": "paragraph", "attrs": {"nodeId": ...}}
|
|
65
69
|
- typed-wrapper: {"paragraph": {...}, "nodeId": "..."}
|
|
66
70
|
- flat-address: {"kind": "block", "nodeType": "paragraph", "nodeId": ...}
|
|
@@ -69,11 +73,22 @@ def _collect_paragraph_ids(obj: object, out: list[str]) -> None:
|
|
|
69
73
|
seen: set[str] = set(out)
|
|
70
74
|
|
|
71
75
|
def _add(nid: object) -> None:
|
|
72
|
-
if isinstance(nid, str)
|
|
73
|
-
|
|
74
|
-
|
|
76
|
+
if not isinstance(nid, str) or not nid:
|
|
77
|
+
return
|
|
78
|
+
# Superdoc uses bare UUIDs (or short hashes) — no `paragraph:`
|
|
79
|
+
# prefix. Pass the value through verbatim.
|
|
80
|
+
if nid in seen:
|
|
81
|
+
return
|
|
82
|
+
seen.add(nid)
|
|
83
|
+
out.append(nid)
|
|
75
84
|
|
|
76
85
|
if isinstance(obj, dict):
|
|
86
|
+
# Cell getNodeById shape: {kind: "paragraph", id: "<UUID>", paragraph: {...}}
|
|
87
|
+
kind: object = obj.get("kind")
|
|
88
|
+
if kind == "paragraph" and isinstance(obj.get("id"), str):
|
|
89
|
+
_add(obj.get("id"))
|
|
90
|
+
# Some responses also put the wrapper's id at nodeId.
|
|
91
|
+
_add(obj.get("nodeId"))
|
|
77
92
|
# Prosemirror-style
|
|
78
93
|
t: object = obj.get("type")
|
|
79
94
|
if isinstance(t, str) and t in ("paragraph", "heading"):
|
|
@@ -611,118 +626,73 @@ class _Cell:
|
|
|
611
626
|
def text(self, value: str) -> None:
|
|
612
627
|
"""Set the cell's text content.
|
|
613
628
|
|
|
614
|
-
|
|
615
|
-
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
prosemirror paragraph fragment as last resort.
|
|
621
|
-
"""
|
|
622
|
-
from docx.text.paragraph import _node_text
|
|
629
|
+
The cell's single inner paragraph is addressed indirectly — Superdoc
|
|
630
|
+
doesn't expose a paragraph ref that's usable as a `blockId` for text
|
|
631
|
+
selections. Instead we use `doc.insert` with a structural paragraph
|
|
632
|
+
fragment at `placement=insideEnd`, which APPENDS inline runs to the
|
|
633
|
+
cell's existing paragraph. For a freshly-created (empty) cell this
|
|
634
|
+
produces `cell.text == value` on read-back.
|
|
623
635
|
|
|
636
|
+
For cells that already contain text, callers who truly want "replace"
|
|
637
|
+
semantics should first resolve the cell's paragraph via `doc.find`
|
|
638
|
+
and delete it — see _Cell.clear() (Phase 3).
|
|
639
|
+
"""
|
|
624
640
|
cell_id = self._cell_id()
|
|
625
641
|
session = self._table._session
|
|
626
|
-
|
|
627
|
-
# --- Strategy 1: inner paragraph + text-range replace ---
|
|
628
|
-
ids = self._inner_paragraph_ids()
|
|
629
|
-
if ids:
|
|
630
|
-
first = ids[0]
|
|
631
|
-
current = _node_text(session, first)
|
|
632
|
-
try:
|
|
633
|
-
run_sync(
|
|
634
|
-
session.doc.replace(
|
|
635
|
-
{
|
|
636
|
-
"target": {
|
|
637
|
-
"kind": "selection",
|
|
638
|
-
"start": {
|
|
639
|
-
"kind": "text",
|
|
640
|
-
"blockId": first,
|
|
641
|
-
"offset": 0,
|
|
642
|
-
},
|
|
643
|
-
"end": {
|
|
644
|
-
"kind": "text",
|
|
645
|
-
"blockId": first,
|
|
646
|
-
"offset": len(current),
|
|
647
|
-
},
|
|
648
|
-
},
|
|
649
|
-
"text": value,
|
|
650
|
-
},
|
|
651
|
-
),
|
|
652
|
-
)
|
|
653
|
-
for extra in ids[1:]:
|
|
654
|
-
existing = _node_text(session, extra)
|
|
655
|
-
if existing:
|
|
656
|
-
run_sync(
|
|
657
|
-
session.doc.replace(
|
|
658
|
-
{
|
|
659
|
-
"target": {
|
|
660
|
-
"kind": "selection",
|
|
661
|
-
"start": {
|
|
662
|
-
"kind": "text",
|
|
663
|
-
"blockId": extra,
|
|
664
|
-
"offset": 0,
|
|
665
|
-
},
|
|
666
|
-
"end": {
|
|
667
|
-
"kind": "text",
|
|
668
|
-
"blockId": extra,
|
|
669
|
-
"offset": len(existing),
|
|
670
|
-
},
|
|
671
|
-
},
|
|
672
|
-
"text": "",
|
|
673
|
-
},
|
|
674
|
-
),
|
|
675
|
-
)
|
|
676
|
-
return
|
|
677
|
-
except Exception as e:
|
|
678
|
-
_log_warn(
|
|
679
|
-
f"_Cell.text text-range replace failed on paragraph "
|
|
680
|
-
f"{first}: {e!r}; falling back to structural replace.",
|
|
681
|
-
)
|
|
682
|
-
|
|
683
|
-
# --- Strategy 2: markdownToFragment + structural replace ---
|
|
684
642
|
cell_target: dict = {
|
|
685
643
|
"kind": "block",
|
|
686
644
|
"nodeType": "tableCell",
|
|
687
645
|
"nodeId": cell_id,
|
|
688
646
|
}
|
|
647
|
+
# Superdoc only accepts block-typed fragments at the top level
|
|
648
|
+
# (paragraph/heading/table/image/list/sectionBreak/sdt/tableOfContents).
|
|
649
|
+
# We convert the plain-text value through `doc.markdownToFragment`
|
|
650
|
+
# to get a guaranteed-valid `{kind:"paragraph", paragraph:{inlines:[...]}}`
|
|
651
|
+
# shape, then doc.insert appends its inline runs into the cell's
|
|
652
|
+
# existing paragraph (rather than adding a sibling paragraph).
|
|
653
|
+
#
|
|
654
|
+
# Confirmed against real staging Superdoc. This is the ONLY shape
|
|
655
|
+
# that actually lands text inside a tableCell:
|
|
656
|
+
# - text mode + placement → rejected ("placement only valid
|
|
657
|
+
# with structural content")
|
|
658
|
+
# - doc.replace + tableCell target → replaces the cell itself,
|
|
659
|
+
# destroying the table structure
|
|
689
660
|
try:
|
|
690
661
|
frag_result: object = run_sync(
|
|
691
662
|
session.doc.markdown_to_fragment({"markdown": value or ""}),
|
|
692
663
|
)
|
|
693
|
-
fragment: object =
|
|
694
|
-
|
|
695
|
-
|
|
696
|
-
|
|
697
|
-
run_sync(
|
|
698
|
-
session.doc.replace(
|
|
699
|
-
{"target": cell_target, "content": fragment},
|
|
700
|
-
),
|
|
701
|
-
)
|
|
702
|
-
return
|
|
703
|
-
except Exception as e:
|
|
704
|
-
_log_warn(
|
|
705
|
-
f"_Cell.text markdownToFragment/replace failed: {e!r}; "
|
|
706
|
-
f"falling back to prosemirror fragment.",
|
|
664
|
+
fragment: object = (
|
|
665
|
+
frag_result.get("fragment")
|
|
666
|
+
if isinstance(frag_result, dict)
|
|
667
|
+
else None
|
|
707
668
|
)
|
|
708
|
-
|
|
709
|
-
|
|
710
|
-
|
|
711
|
-
|
|
712
|
-
|
|
713
|
-
|
|
714
|
-
|
|
669
|
+
# Fall back to a hand-built fragment with Superdoc's native
|
|
670
|
+
# shape if markdownToFragment is unavailable.
|
|
671
|
+
if not isinstance(fragment, dict):
|
|
672
|
+
fragment = {
|
|
673
|
+
"kind": "paragraph",
|
|
674
|
+
"paragraph": {
|
|
675
|
+
"inlines": (
|
|
676
|
+
[{"kind": "run", "run": {"text": value}}]
|
|
677
|
+
if value
|
|
678
|
+
else []
|
|
679
|
+
),
|
|
680
|
+
},
|
|
681
|
+
}
|
|
715
682
|
run_sync(
|
|
716
|
-
session.doc.
|
|
717
|
-
{
|
|
683
|
+
session.doc.insert(
|
|
684
|
+
{
|
|
685
|
+
"target": cell_target,
|
|
686
|
+
"placement": "insideEnd",
|
|
687
|
+
"content": fragment,
|
|
688
|
+
},
|
|
718
689
|
),
|
|
719
690
|
)
|
|
720
691
|
return
|
|
721
692
|
except Exception as e:
|
|
722
693
|
raise RuntimeError(
|
|
723
694
|
f"Failed to set _Cell.text on cell ({self._row}, {self._col}) "
|
|
724
|
-
f"of table {self._table._fresh_node_id()}:
|
|
725
|
-
f"failed. Last error: {e!r}",
|
|
695
|
+
f"of table {self._table._fresh_node_id()}: {e!r}",
|
|
726
696
|
) from e
|
|
727
697
|
|
|
728
698
|
@property
|
|
@@ -730,7 +700,11 @@ class _Cell:
|
|
|
730
700
|
from docx.text.paragraph import Paragraph
|
|
731
701
|
|
|
732
702
|
return [
|
|
733
|
-
Paragraph(
|
|
703
|
+
Paragraph(
|
|
704
|
+
session=self._table._session,
|
|
705
|
+
node_id=pid,
|
|
706
|
+
node_type="paragraph",
|
|
707
|
+
)
|
|
734
708
|
for pid in self._inner_paragraph_ids()
|
|
735
709
|
]
|
|
736
710
|
|
|
@@ -803,7 +777,11 @@ class _Cell:
|
|
|
803
777
|
raise RuntimeError(
|
|
804
778
|
f"Superdoc did not return nodeId for _Cell.add_paragraph: {result!r}",
|
|
805
779
|
)
|
|
806
|
-
para = Paragraph(
|
|
780
|
+
para = Paragraph(
|
|
781
|
+
session=self._table._session,
|
|
782
|
+
node_id=node_id,
|
|
783
|
+
node_type="paragraph",
|
|
784
|
+
)
|
|
807
785
|
if style:
|
|
808
786
|
para.style = style
|
|
809
787
|
return para
|
|
@@ -76,9 +76,20 @@ def _walk_inlines(info: object) -> list[dict]:
|
|
|
76
76
|
class Paragraph:
|
|
77
77
|
"""A paragraph block in a Word document."""
|
|
78
78
|
|
|
79
|
-
def __init__(
|
|
79
|
+
def __init__(
|
|
80
|
+
self,
|
|
81
|
+
*,
|
|
82
|
+
session: "Session",
|
|
83
|
+
node_id: str,
|
|
84
|
+
node_type: str = "paragraph",
|
|
85
|
+
) -> None:
|
|
80
86
|
self._session: "Session" = session
|
|
81
87
|
self._node_id: str = node_id
|
|
88
|
+
# Track node_type at creation so paragraph-level ops can skip a
|
|
89
|
+
# getNodeById round-trip (which raced against Superdoc mutation
|
|
90
|
+
# commits and raised "Block X was not found" for freshly-created
|
|
91
|
+
# blocks). "paragraph" | "heading" | "listItem".
|
|
92
|
+
self._node_type: str = node_type
|
|
82
93
|
|
|
83
94
|
@property
|
|
84
95
|
def text(self) -> str:
|
|
@@ -290,20 +301,16 @@ class Paragraph:
|
|
|
290
301
|
return "\f" in self.text
|
|
291
302
|
|
|
292
303
|
def _block_target(self) -> dict:
|
|
293
|
-
"""Build a {kind, nodeType, nodeId} target for paragraph-level ops.
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
if isinstance(node_obj, dict):
|
|
301
|
-
raw: object = node_obj.get("nodeType")
|
|
302
|
-
if isinstance(raw, str) and raw:
|
|
303
|
-
node_type = raw
|
|
304
|
+
"""Build a {kind, nodeType, nodeId} target for paragraph-level ops.
|
|
305
|
+
|
|
306
|
+
Uses the node_type cached at construction. This avoids an extra
|
|
307
|
+
getNodeById round-trip that previously raced with Superdoc's
|
|
308
|
+
mutation-commit pipeline and raised "Block X was not found" for
|
|
309
|
+
blocks that had just been created by the same session.
|
|
310
|
+
"""
|
|
304
311
|
return {
|
|
305
312
|
"kind": "block",
|
|
306
|
-
"nodeType":
|
|
313
|
+
"nodeType": self._node_type,
|
|
307
314
|
"nodeId": self._node_id,
|
|
308
315
|
}
|
|
309
316
|
|
|
@@ -381,7 +388,9 @@ class Paragraph:
|
|
|
381
388
|
raise RuntimeError(
|
|
382
389
|
f"Superdoc did not return nodeId for insert_paragraph_before: {result!r}",
|
|
383
390
|
)
|
|
384
|
-
new_para = Paragraph(
|
|
391
|
+
new_para = Paragraph(
|
|
392
|
+
session=self._session, node_id=node_id, node_type="paragraph",
|
|
393
|
+
)
|
|
385
394
|
if style:
|
|
386
395
|
new_para.style = style
|
|
387
396
|
return new_para
|
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "athena-python-docx"
|
|
7
|
-
version = "0.2.1"
|
|
7
|
+
version = "0.2.2"
|
|
8
8
|
description = "Drop-in replacement for python-docx that connects to Athena's Superdoc/Keryx collaborative document stack"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = "MIT"
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
# Fidelity Testing Methodology
|
|
2
|
+
|
|
3
|
+
A multi-layer strategy to reach 100% coverage of the python-docx API surface
|
|
4
|
+
that `athena-python-docx` claims to replicate. Each layer catches a different
|
|
5
|
+
class of drift between our Superdoc-backed implementation and stock python-docx.
|
|
6
|
+
|
|
7
|
+
## Layers
|
|
8
|
+
|
|
9
|
+
| # | Layer | Catches | Runtime |
|
|
10
|
+
|---|---|---|---|
|
|
11
|
+
| L1 | **Import surface parity** — tests that every public class/method/property python-docx exposes is importable from our SDK. | Missing API surface (AttributeError in user code). | <1s, local. |
|
|
12
|
+
| L2 | **Signature parity** — inspect.signature on every public callable must match python-docx (or be a documented deviation). | Parameter renames, missing optional args. | <1s, local. |
|
|
13
|
+
| L3 | **Enum value parity** — every WD_* enum has matching member names + values. | Silent failure when agent passes `WD_ALIGN_PARAGRAPH.DISTRIBUTE`. | <1s, local. |
|
|
14
|
+
| L4 | **Local behavior (fake-session)** — 147 cases in `complex_cases/real_world_cases/extreme_cases/mega_cases` run through our SDK with `FakeSession` recording every Superdoc op. | NotImplementedError stubs, AttributeError, logic errors. | ~10s, local. |
|
|
15
|
+
| L5 | **Op-trace snapshot** — each case's recorded Superdoc-op sequence is pinned. Regressions show up as op-count or op-sequence diffs. | Refactor drift (e.g. switching from `doc.replace` to `doc.insert`). | <1s, local. |
|
|
16
|
+
| L6 | **Property round-trip** — for every setter, the getter returns the value (or a normalized form). | Cosmetic read/write asymmetry. | ~10s, local. |
|
|
17
|
+
| L7 | **Binary round-trip vs stock python-docx** — each case's script is run by stock python-docx to produce a reference `.docx` and by our SDK's fake session to produce an in-memory model. Extracted features (para text, style, runs + formatting, table cells, alignment, indent, spacing) are diff'd. | Structural differences not caught by op-tracing. | ~20s, local. |
|
|
18
|
+
| L8 | **Daytona + real Keryx** (`runner.py`) — our SDK runs inside a document-exec sandbox against a live Superdoc doc, exports the resulting `.docx`, and that `.docx` is diff'd against the stock-generated reference. | Protocol/server mismatches (like the `_Cell.text` `getNodeById` walker that worked in the fake but failed against real Superdoc). | ~3m, network. |
|
|
19
|
+
| L9 | **Real exported-docx feature diff** — exports both sides' `.docx` files, opens them with stock python-docx, and compares the extracted semantic feature set (runs, fonts, tables, sections, headers). | Visual drift after Superdoc's OOXML serialization. | ~30s incremental. |
|
|
20
|
+
| L10 | **Agent-in-the-loop replay** — pull failing LangSmith sessions, extract the `code` the agent tried to run, and replay each against the current SDK + Daytona. | Real-world failures we missed in design. | ~1m per session. |
|
|
21
|
+
|
|
22
|
+
## Roadmap to 100% python-docx coverage
|
|
23
|
+
|
|
24
|
+
### Phase 1: Inventory
|
|
25
|
+
|
|
26
|
+
Crawl stock python-docx and produce the exhaustive list of callables and
|
|
27
|
+
properties to cover. Use `inspect.getmembers` + `typing.get_type_hints`.
|
|
28
|
+
|
|
29
|
+
Output: `tests/fidelity/parity_spec.json` — one entry per `(module.path,
|
|
30
|
+
member)` tuple with signature, docstring, return type.
|
|
31
|
+
|
|
32
|
+
### Phase 2: Surface-coverage tracker
|
|
33
|
+
|
|
34
|
+
Per-member status in a spreadsheet-like JSON:
|
|
35
|
+
|
|
36
|
+
```json
|
|
37
|
+
{
|
|
38
|
+
"docx.text.paragraph.Paragraph.alignment": {
|
|
39
|
+
"status": "implemented",
|
|
40
|
+
"signature_match": true,
|
|
41
|
+
"round_trip_tested": true,
|
|
42
|
+
"binary_diff_ok": true,
|
|
43
|
+
"live_daytona_ok": true,
|
|
44
|
+
"first_shipped": "0.2.0"
|
|
45
|
+
},
|
|
46
|
+
"docx.text.font.Font.highlight_color": {
|
|
47
|
+
"status": "implemented",
|
|
48
|
+
...
|
|
49
|
+
},
|
|
50
|
+
"docx.section.Section.first_page_footer": {
|
|
51
|
+
"status": "stub",
|
|
52
|
+
"signature_match": true,
|
|
53
|
+
"round_trip_tested": false,
|
|
54
|
+
"blocker": "Header/Footer paragraph iteration requires doc.headerFooters.parts.list + per-part getNodeById, not yet wired."
|
|
55
|
+
}
|
|
56
|
+
}
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
A CI job blocks merges that decrease coverage.
|
|
60
|
+
|
|
61
|
+
### Phase 3: Generated cases
|
|
62
|
+
|
|
63
|
+
For every python-docx public method, auto-generate a minimum-viable test
|
|
64
|
+
case from its signature: "call with each required param filled by a
|
|
65
|
+
representative value, assert no exception, assert getter round-trip where
|
|
66
|
+
applicable." ~200 cases generated automatically.
|
|
67
|
+
|
|
68
|
+
### Phase 4: Property-based tests
|
|
69
|
+
|
|
70
|
+
Use `hypothesis` to fuzz lengths, unicode, nested tables, merge cells,
|
|
71
|
+
margin values — the kind of inputs that production agents generate but
|
|
72
|
+
hand-written tests miss.
|
|
73
|
+
|
|
74
|
+
### Phase 5: Docx corpus replay
|
|
75
|
+
|
|
76
|
+
Collect a corpus of real-world .docx files from:
|
|
77
|
+
- python-docx's test fixtures on GitHub
|
|
78
|
+
- Tutorials that emit full documents (MSDN, textbook samples)
|
|
79
|
+
- Agora's staging assets exported to .docx
|
|
80
|
+
|
|
81
|
+
For each: run `Document.from_file(path)` (Phase 2 feature), mutate via
|
|
82
|
+
our SDK, re-export, compare.
|
|
83
|
+
|
|
84
|
+
### Phase 6: Agent behavioral replay
|
|
85
|
+
|
|
86
|
+
Daily job scans the last 24h of `agora-staging` and `agora-production`
|
|
87
|
+
langsmith traces for `execute_word_document_code` tool calls. Extracts
|
|
88
|
+
the `code` argument, deduplicates by script fingerprint, and runs each
|
|
89
|
+
through the Daytona-backed runner. Any new failure is opened as a gap.
|
|
90
|
+
|
|
91
|
+
### Phase 7: Coverage percentage
|
|
92
|
+
|
|
93
|
+
The scorecard reports:
|
|
94
|
+
```
|
|
95
|
+
python-docx API coverage:
|
|
96
|
+
Classes: 28 / 30 (93%)
|
|
97
|
+
Methods: 142 / 156 (91%)
|
|
98
|
+
Properties: 287 / 302 (95%)
|
|
99
|
+
Enum values: 67 / 73 (92%)
|
|
100
|
+
|
|
101
|
+
Round-trip fidelity:
|
|
102
|
+
Binary match rate: 112 / 112 (100%)
|
|
103
|
+
Live Daytona pass: 15 / 15 (100%)
|
|
104
|
+
Property getter/setter round-trips: 82 / 85 (96%)
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
Target: every row at 100%, with deviations documented in CLAUDE.md.
|
|
@@ -141,13 +141,12 @@ CASES: list[Case] = [
|
|
|
141
141
|
),
|
|
142
142
|
Case(
|
|
143
143
|
name="cell_text_setter",
|
|
144
|
-
description="Set cell(0,0).text on a 2x2 table —
|
|
144
|
+
description="Set cell(0,0).text on a 2x2 table — now supported via 3-strategy fallback.",
|
|
145
145
|
script=(
|
|
146
146
|
"t = doc.add_table(rows=2, cols=2)\n"
|
|
147
147
|
't.cell(0, 0).text = "A1"'
|
|
148
148
|
),
|
|
149
|
-
|
|
150
|
-
tags=("table", "stub"),
|
|
149
|
+
tags=("table",),
|
|
151
150
|
),
|
|
152
151
|
# ---- Structural ops ------------------------------------------------------
|
|
153
152
|
Case(
|
|
@@ -347,6 +347,7 @@ class FakeDocState:
|
|
|
347
347
|
if op == "insert":
|
|
348
348
|
target = params.get("target") or {}
|
|
349
349
|
value = params.get("value", "")
|
|
350
|
+
# Text-mode: selection target with blockId
|
|
350
351
|
blk_id: str | None = None
|
|
351
352
|
if target.get("kind") == "selection":
|
|
352
353
|
start = target.get("start") or {}
|
|
@@ -357,6 +358,37 @@ class FakeDocState:
|
|
|
357
358
|
if b is not None:
|
|
358
359
|
b.text = b.text + value
|
|
359
360
|
b.inlines = [{"run": {"text": b.text}}]
|
|
361
|
+
return {"ok": True}
|
|
362
|
+
# Structural mode: target is a BlockNodeAddress (incl. tableCell)
|
|
363
|
+
# with content = paragraph fragment, placement = insideStart/End.
|
|
364
|
+
if target.get("kind") == "block" and target.get("nodeType") == "tableCell":
|
|
365
|
+
cell_nid = target.get("nodeId")
|
|
366
|
+
content = params.get("content") or {}
|
|
367
|
+
# Extract text from prosemirror paragraph fragment.
|
|
368
|
+
appended: str = ""
|
|
369
|
+
if isinstance(content, dict):
|
|
370
|
+
inline_list = content.get("content") or []
|
|
371
|
+
if isinstance(inline_list, list):
|
|
372
|
+
for inl in inline_list:
|
|
373
|
+
if isinstance(inl, dict) and inl.get("type") == "text":
|
|
374
|
+
t = inl.get("text")
|
|
375
|
+
if isinstance(t, str):
|
|
376
|
+
appended += t
|
|
377
|
+
# Find cell and append to its first paragraph.
|
|
378
|
+
for t in self.tables:
|
|
379
|
+
for r_idx, row in enumerate(t.cells):
|
|
380
|
+
for c_idx, cell in enumerate(row):
|
|
381
|
+
expected_ids = {
|
|
382
|
+
f"cell_{t.node_id}_{r_idx}_{c_idx}",
|
|
383
|
+
cell.node_id,
|
|
384
|
+
}
|
|
385
|
+
if cell_nid in expected_ids:
|
|
386
|
+
cell.text = cell.text + appended
|
|
387
|
+
cell.inlines = (
|
|
388
|
+
[{"run": {"text": cell.text}}]
|
|
389
|
+
if cell.text
|
|
390
|
+
else []
|
|
391
|
+
)
|
|
360
392
|
return {"ok": True}
|
|
361
393
|
|
|
362
394
|
if op == "insertLineBreak":
|