athena-python-docx 0.2.0__tar.gz → 0.2.2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {athena_python_docx-0.2.0 → athena_python_docx-0.2.2}/PKG-INFO +1 -1
- {athena_python_docx-0.2.0 → athena_python_docx-0.2.2}/docx/__init__.py +11 -1
- {athena_python_docx-0.2.0 → athena_python_docx-0.2.2}/docx/document.py +31 -6
- {athena_python_docx-0.2.0 → athena_python_docx-0.2.2}/docx/enum/text.py +18 -1
- {athena_python_docx-0.2.0 → athena_python_docx-0.2.2}/docx/table.py +174 -75
- {athena_python_docx-0.2.0 → athena_python_docx-0.2.2}/docx/text/paragraph.py +23 -14
- {athena_python_docx-0.2.0 → athena_python_docx-0.2.2}/pyproject.toml +1 -1
- athena_python_docx-0.2.2/tests/fidelity/METHODOLOGY.md +107 -0
- {athena_python_docx-0.2.0 → athena_python_docx-0.2.2}/tests/fidelity/cases.py +2 -3
- {athena_python_docx-0.2.0 → athena_python_docx-0.2.2}/tests/fidelity/fake_session.py +32 -0
- athena_python_docx-0.2.2/tests/fidelity/ours_spec.json +3419 -0
- athena_python_docx-0.2.2/tests/fidelity/parity_crawl.py +267 -0
- athena_python_docx-0.2.2/tests/fidelity/parity_diff.json +506 -0
- athena_python_docx-0.2.2/tests/fidelity/round_trip_tests.py +214 -0
- athena_python_docx-0.2.2/tests/fidelity/stock_spec.json +10189 -0
- athena_python_docx-0.2.2/uv.lock +525 -0
- {athena_python_docx-0.2.0 → athena_python_docx-0.2.2}/.gitignore +0 -0
- {athena_python_docx-0.2.0 → athena_python_docx-0.2.2}/CLAUDE.md +0 -0
- {athena_python_docx-0.2.0 → athena_python_docx-0.2.2}/README.md +0 -0
- {athena_python_docx-0.2.0 → athena_python_docx-0.2.2}/docx/_batching.py +0 -0
- {athena_python_docx-0.2.0 → athena_python_docx-0.2.2}/docx/api.py +0 -0
- {athena_python_docx-0.2.0 → athena_python_docx-0.2.2}/docx/client.py +0 -0
- {athena_python_docx-0.2.0 → athena_python_docx-0.2.2}/docx/enum/__init__.py +0 -0
- {athena_python_docx-0.2.0 → athena_python_docx-0.2.2}/docx/enum/section.py +0 -0
- {athena_python_docx-0.2.0 → athena_python_docx-0.2.2}/docx/enum/style.py +0 -0
- {athena_python_docx-0.2.0 → athena_python_docx-0.2.2}/docx/enum/table.py +0 -0
- {athena_python_docx-0.2.0 → athena_python_docx-0.2.2}/docx/errors.py +0 -0
- {athena_python_docx-0.2.0 → athena_python_docx-0.2.2}/docx/opc/__init__.py +0 -0
- {athena_python_docx-0.2.0 → athena_python_docx-0.2.2}/docx/opc/coreprops.py +0 -0
- {athena_python_docx-0.2.0 → athena_python_docx-0.2.2}/docx/section.py +0 -0
- {athena_python_docx-0.2.0 → athena_python_docx-0.2.2}/docx/settings.py +0 -0
- {athena_python_docx-0.2.0 → athena_python_docx-0.2.2}/docx/shape.py +0 -0
- {athena_python_docx-0.2.0 → athena_python_docx-0.2.2}/docx/shared.py +0 -0
- {athena_python_docx-0.2.0 → athena_python_docx-0.2.2}/docx/styles/__init__.py +0 -0
- {athena_python_docx-0.2.0 → athena_python_docx-0.2.2}/docx/styles/style.py +0 -0
- {athena_python_docx-0.2.0 → athena_python_docx-0.2.2}/docx/styles/styles.py +0 -0
- {athena_python_docx-0.2.0 → athena_python_docx-0.2.2}/docx/text/__init__.py +0 -0
- {athena_python_docx-0.2.0 → athena_python_docx-0.2.2}/docx/text/hyperlink.py +0 -0
- {athena_python_docx-0.2.0 → athena_python_docx-0.2.2}/docx/text/parfmt.py +0 -0
- {athena_python_docx-0.2.0 → athena_python_docx-0.2.2}/docx/text/run.py +0 -0
- {athena_python_docx-0.2.0 → athena_python_docx-0.2.2}/docx/typing.py +0 -0
- {athena_python_docx-0.2.0 → athena_python_docx-0.2.2}/scripts/publish.sh +0 -0
- {athena_python_docx-0.2.0 → athena_python_docx-0.2.2}/tests/__init__.py +0 -0
- {athena_python_docx-0.2.0 → athena_python_docx-0.2.2}/tests/conftest.py +0 -0
- {athena_python_docx-0.2.0 → athena_python_docx-0.2.2}/tests/fidelity/README.md +0 -0
- {athena_python_docx-0.2.0 → athena_python_docx-0.2.2}/tests/fidelity/__init__.py +0 -0
- {athena_python_docx-0.2.0 → athena_python_docx-0.2.2}/tests/fidelity/binary_round_trip.py +0 -0
- {athena_python_docx-0.2.0 → athena_python_docx-0.2.2}/tests/fidelity/complex_cases.py +0 -0
- {athena_python_docx-0.2.0 → athena_python_docx-0.2.2}/tests/fidelity/extract.py +0 -0
- {athena_python_docx-0.2.0 → athena_python_docx-0.2.2}/tests/fidelity/extreme_cases.py +0 -0
- {athena_python_docx-0.2.0 → athena_python_docx-0.2.2}/tests/fidelity/local_runner.py +0 -0
- {athena_python_docx-0.2.0 → athena_python_docx-0.2.2}/tests/fidelity/mega_cases.py +0 -0
- {athena_python_docx-0.2.0 → athena_python_docx-0.2.2}/tests/fidelity/real_world_cases.py +0 -0
- {athena_python_docx-0.2.0 → athena_python_docx-0.2.2}/tests/fidelity/runner.py +0 -0
- {athena_python_docx-0.2.0 → athena_python_docx-0.2.2}/tests/test_commands.py +0 -0
- {athena_python_docx-0.2.0 → athena_python_docx-0.2.2}/tests/test_python_docx_api_parity.py +0 -0
- {athena_python_docx-0.2.0 → athena_python_docx-0.2.2}/tests/test_smoke_integration.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: athena-python-docx
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.2.2
|
|
4
4
|
Summary: Drop-in replacement for python-docx that connects to Athena's Superdoc/Keryx collaborative document stack
|
|
5
5
|
Project-URL: Homepage, https://athenaintelligence.ai
|
|
6
6
|
Author-email: Athena Intelligence <engineering@athenaintelligence.ai>
|
|
@@ -6,11 +6,21 @@ See CLAUDE.md for the API parity contract.
|
|
|
6
6
|
|
|
7
7
|
from __future__ import annotations
|
|
8
8
|
|
|
9
|
-
__version__ = "0.2.0"
|
|
9
|
+
__version__ = "0.2.2"
|
|
10
10
|
|
|
11
11
|
from docx.api import Document
|
|
12
|
+
# Re-exports python-docx ships at docx top-level for convenience.
|
|
13
|
+
from docx.shared import Emu, Inches, Pt, Cm, Mm, Twips, Length, RGBColor
|
|
12
14
|
|
|
13
15
|
__all__ = [
|
|
14
16
|
"Document",
|
|
17
|
+
"Emu",
|
|
18
|
+
"Inches",
|
|
19
|
+
"Pt",
|
|
20
|
+
"Cm",
|
|
21
|
+
"Mm",
|
|
22
|
+
"Twips",
|
|
23
|
+
"Length",
|
|
24
|
+
"RGBColor",
|
|
15
25
|
"__version__",
|
|
16
26
|
]
|
|
@@ -22,9 +22,15 @@ from typing import TYPE_CHECKING, BinaryIO
|
|
|
22
22
|
from docx._batching import run_sync
|
|
23
23
|
from docx.client import Session
|
|
24
24
|
from docx.errors import DocumentClosedError, ValidationError
|
|
25
|
+
# python-docx re-exports a subset of symbols at docx.document; mirror those
|
|
26
|
+
# so `from docx.document import Emu` etc. works.
|
|
27
|
+
from docx.enum.section import WD_SECTION, WD_SECTION_START # noqa: F401
|
|
28
|
+
from docx.enum.text import WD_BREAK # noqa: F401
|
|
29
|
+
from docx.section import Section, Sections # noqa: F401
|
|
30
|
+
from docx.shared import Cm, Emu, Inches, Length, Mm, Pt, RGBColor, Twips # noqa: F401
|
|
31
|
+
from docx.text.run import Run # noqa: F401
|
|
25
32
|
|
|
26
33
|
if TYPE_CHECKING:
|
|
27
|
-
from docx.shared import Emu
|
|
28
34
|
from docx.table import Table
|
|
29
35
|
from docx.text.paragraph import Paragraph
|
|
30
36
|
|
|
@@ -80,7 +86,15 @@ class Document:
|
|
|
80
86
|
continue
|
|
81
87
|
node_id: str = str(b.get("nodeId", ""))
|
|
82
88
|
if node_id:
|
|
83
|
-
|
|
89
|
+
nt_raw = b.get("nodeType")
|
|
90
|
+
nt: str = nt_raw if isinstance(nt_raw, str) and nt_raw else "paragraph"
|
|
91
|
+
out.append(
|
|
92
|
+
Paragraph(
|
|
93
|
+
session=self._session,
|
|
94
|
+
node_id=node_id,
|
|
95
|
+
node_type=nt,
|
|
96
|
+
),
|
|
97
|
+
)
|
|
84
98
|
return out
|
|
85
99
|
|
|
86
100
|
@property
|
|
@@ -204,7 +218,9 @@ class Document:
|
|
|
204
218
|
raise RuntimeError(
|
|
205
219
|
f"Superdoc did not return a nodeId for add_paragraph: {result!r}",
|
|
206
220
|
)
|
|
207
|
-
return Paragraph(
|
|
221
|
+
return Paragraph(
|
|
222
|
+
session=self._session, node_id=node_id, node_type="paragraph",
|
|
223
|
+
)
|
|
208
224
|
|
|
209
225
|
def add_heading(
|
|
210
226
|
self,
|
|
@@ -239,7 +255,11 @@ class Document:
|
|
|
239
255
|
raise RuntimeError(
|
|
240
256
|
f"Superdoc did not return a nodeId for add_heading(level=0): {result!r}",
|
|
241
257
|
)
|
|
242
|
-
paragraph = Paragraph(
|
|
258
|
+
paragraph = Paragraph(
|
|
259
|
+
session=self._session,
|
|
260
|
+
node_id=node_id,
|
|
261
|
+
node_type="paragraph",
|
|
262
|
+
)
|
|
243
263
|
paragraph.style = "Title"
|
|
244
264
|
return paragraph
|
|
245
265
|
|
|
@@ -251,12 +271,17 @@ class Document:
|
|
|
251
271
|
result = run_sync(
|
|
252
272
|
self._session.doc.create.heading(params),
|
|
253
273
|
)
|
|
254
|
-
|
|
274
|
+
# Bug fix: was passing expected_type="paragraph" here (wrong); the
|
|
275
|
+
# fallback loop recovered, but the intent of the code was wrong. Fixed to
|
|
276
|
+
# expected_type="heading" so we extract from the correct response key.
|
|
277
|
+
node_id = _extract_inserted_node_id(result, expected_type="heading")
|
|
255
278
|
if not node_id:
|
|
256
279
|
raise RuntimeError(
|
|
257
280
|
f"Superdoc did not return a nodeId for add_heading: {result!r}",
|
|
258
281
|
)
|
|
259
|
-
return Paragraph(
|
|
282
|
+
return Paragraph(
|
|
283
|
+
session=self._session, node_id=node_id, node_type="heading",
|
|
284
|
+
)
|
|
260
285
|
|
|
261
286
|
def add_table(
|
|
262
287
|
self,
|
|
@@ -113,6 +113,15 @@ class WD_BREAK(Enum):
|
|
|
113
113
|
LINE_CLEAR_RIGHT = "lineClearRight"
|
|
114
114
|
LINE_CLEAR_ALL = "lineClearAll"
|
|
115
115
|
TEXT_WRAPPING = "textWrapping"
|
|
116
|
+
# python-docx 1.x also exposes section breaks via WD_BREAK
|
|
117
|
+
SECTION_CONTINUOUS = "sectionContinuous"
|
|
118
|
+
SECTION_EVEN_PAGE = "sectionEvenPage"
|
|
119
|
+
SECTION_NEXT_PAGE = "sectionNextPage"
|
|
120
|
+
SECTION_ODD_PAGE = "sectionOddPage"
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
# python-docx internal alias
|
|
124
|
+
WD_BREAK_TYPE = WD_BREAK
|
|
116
125
|
|
|
117
126
|
|
|
118
127
|
class WD_UNDERLINE(Enum):
|
|
@@ -137,6 +146,7 @@ class WD_UNDERLINE(Enum):
|
|
|
137
146
|
|
|
138
147
|
|
|
139
148
|
class WD_COLOR_INDEX(Enum):
|
|
149
|
+
INHERITED = "inherit"
|
|
140
150
|
AUTO = "default"
|
|
141
151
|
BLACK = "black"
|
|
142
152
|
BLUE = "blue"
|
|
@@ -156,5 +166,12 @@ class WD_COLOR_INDEX(Enum):
|
|
|
156
166
|
YELLOW = "yellow"
|
|
157
167
|
|
|
158
168
|
|
|
159
|
-
#
|
|
169
|
+
# Aliases used by python-docx as well
|
|
160
170
|
WD_COLOR = WD_COLOR_INDEX
|
|
171
|
+
WD_PARAGRAPH_ALIGNMENT = WD_ALIGN_PARAGRAPH
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
# python-docx 1.x base class that WD_* enums inherit from — we don't need
|
|
175
|
+
# the real base, just a name users can subclass-check against.
|
|
176
|
+
class BaseXmlEnum(Enum):
|
|
177
|
+
pass
|
|
@@ -58,27 +58,57 @@ def _find_first_paragraph_id(obj: object) -> str:
|
|
|
58
58
|
|
|
59
59
|
|
|
60
60
|
def _collect_paragraph_ids(obj: object, out: list[str]) -> None:
|
|
61
|
-
"""Walk a node tree and collect all paragraph/heading nodeIds in order.
|
|
61
|
+
"""Walk a node tree and collect all paragraph/heading nodeIds in order.
|
|
62
|
+
|
|
63
|
+
Tolerates multiple shapes Superdoc emits:
|
|
64
|
+
- cell getNodeById: {"node": {"kind": "paragraph", "id": "UUID",
|
|
65
|
+
"paragraph": {"inlines": [...]}}}
|
|
66
|
+
(the cell's inner paragraph — server reports `id` as a bare UUID
|
|
67
|
+
that the addressing layer expects as `paragraph:UUID`)
|
|
68
|
+
- prosemirror-style: {"type": "paragraph", "attrs": {"nodeId": ...}}
|
|
69
|
+
- typed-wrapper: {"paragraph": {...}, "nodeId": "..."}
|
|
70
|
+
- flat-address: {"kind": "block", "nodeType": "paragraph", "nodeId": ...}
|
|
71
|
+
- block-list shape: {"nodeType": "paragraph", "nodeId": ...}
|
|
72
|
+
"""
|
|
73
|
+
seen: set[str] = set(out)
|
|
74
|
+
|
|
75
|
+
def _add(nid: object) -> None:
|
|
76
|
+
if not isinstance(nid, str) or not nid:
|
|
77
|
+
return
|
|
78
|
+
# Superdoc uses bare UUIDs (or short hashes) — no `paragraph:`
|
|
79
|
+
# prefix. Pass the value through verbatim.
|
|
80
|
+
if nid in seen:
|
|
81
|
+
return
|
|
82
|
+
seen.add(nid)
|
|
83
|
+
out.append(nid)
|
|
84
|
+
|
|
62
85
|
if isinstance(obj, dict):
|
|
86
|
+
# Cell getNodeById shape: {kind: "paragraph", id: "<UUID>", paragraph: {...}}
|
|
87
|
+
kind: object = obj.get("kind")
|
|
88
|
+
if kind == "paragraph" and isinstance(obj.get("id"), str):
|
|
89
|
+
_add(obj.get("id"))
|
|
90
|
+
# Some responses also put the wrapper's id at nodeId.
|
|
91
|
+
_add(obj.get("nodeId"))
|
|
92
|
+
# Prosemirror-style
|
|
63
93
|
t: object = obj.get("type")
|
|
64
94
|
if isinstance(t, str) and t in ("paragraph", "heading"):
|
|
65
95
|
attrs: object = obj.get("attrs")
|
|
66
|
-
nid: str = ""
|
|
67
96
|
if isinstance(attrs, dict):
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
out.append(nid)
|
|
97
|
+
_add(attrs.get("nodeId") or attrs.get("id"))
|
|
98
|
+
_add(obj.get("nodeId"))
|
|
99
|
+
_add(obj.get("id"))
|
|
100
|
+
# Flat-address / block-list
|
|
101
|
+
node_type: object = obj.get("nodeType")
|
|
102
|
+
if isinstance(node_type, str) and node_type in ("paragraph", "heading"):
|
|
103
|
+
_add(obj.get("nodeId"))
|
|
104
|
+
# Typed-wrapper
|
|
77
105
|
for key in ("paragraph", "heading"):
|
|
78
106
|
if key in obj and isinstance(obj[key], dict):
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
107
|
+
_add(obj.get("nodeId"))
|
|
108
|
+
inner = obj[key]
|
|
109
|
+
if isinstance(inner, dict):
|
|
110
|
+
_add(inner.get("nodeId"))
|
|
111
|
+
# Recurse
|
|
82
112
|
for v in obj.values():
|
|
83
113
|
_collect_paragraph_ids(v, out)
|
|
84
114
|
elif isinstance(obj, list):
|
|
@@ -514,14 +544,60 @@ class _Cell:
|
|
|
514
544
|
return {"kind": "block", "nodeType": "tableCell", "nodeId": self._cell_id()}
|
|
515
545
|
|
|
516
546
|
def _inner_paragraph_ids(self) -> list[str]:
|
|
547
|
+
"""Locate the paragraph nodeIds inside this cell, trying multiple
|
|
548
|
+
Superdoc response shapes.
|
|
549
|
+
|
|
550
|
+
Strategies (in order):
|
|
551
|
+
1. doc.getNodeById with explicit nodeType=tableCell
|
|
552
|
+
2. doc.getNodeById with just {id: ...}
|
|
553
|
+
3. doc.getNode with target=tableCell address
|
|
554
|
+
4. doc.blocks.list filtered to paragraph/heading + location match (not implemented yet; an empty list is returned if strategies 1-3 find nothing)
|
|
555
|
+
"""
|
|
517
556
|
cell_id = self._cell_id()
|
|
518
|
-
|
|
519
|
-
self._table._session.doc.get_node_by_id(
|
|
520
|
-
{"id": cell_id, "nodeType": "tableCell"},
|
|
521
|
-
),
|
|
522
|
-
)
|
|
557
|
+
session = self._table._session
|
|
523
558
|
ids: list[str] = []
|
|
524
|
-
|
|
559
|
+
|
|
560
|
+
# Strategy 1: with explicit nodeType
|
|
561
|
+
try:
|
|
562
|
+
info = run_sync(
|
|
563
|
+
session.doc.get_node_by_id(
|
|
564
|
+
{"id": cell_id, "nodeType": "tableCell"},
|
|
565
|
+
),
|
|
566
|
+
)
|
|
567
|
+
_collect_paragraph_ids(info, ids)
|
|
568
|
+
if ids:
|
|
569
|
+
return ids
|
|
570
|
+
except Exception:
|
|
571
|
+
pass
|
|
572
|
+
|
|
573
|
+
# Strategy 2: without nodeType (some sdk versions expect only id)
|
|
574
|
+
try:
|
|
575
|
+
info = run_sync(session.doc.get_node_by_id({"id": cell_id}))
|
|
576
|
+
_collect_paragraph_ids(info, ids)
|
|
577
|
+
if ids:
|
|
578
|
+
return ids
|
|
579
|
+
except Exception:
|
|
580
|
+
pass
|
|
581
|
+
|
|
582
|
+
# Strategy 3: doc.getNode with target address
|
|
583
|
+
try:
|
|
584
|
+
info = run_sync(
|
|
585
|
+
session.doc.get_node(
|
|
586
|
+
{
|
|
587
|
+
"target": {
|
|
588
|
+
"kind": "block",
|
|
589
|
+
"nodeType": "tableCell",
|
|
590
|
+
"nodeId": cell_id,
|
|
591
|
+
},
|
|
592
|
+
},
|
|
593
|
+
),
|
|
594
|
+
)
|
|
595
|
+
_collect_paragraph_ids(info, ids)
|
|
596
|
+
if ids:
|
|
597
|
+
return ids
|
|
598
|
+
except Exception:
|
|
599
|
+
pass
|
|
600
|
+
|
|
525
601
|
return ids
|
|
526
602
|
|
|
527
603
|
@property
|
|
@@ -548,68 +624,87 @@ class _Cell:
|
|
|
548
624
|
|
|
549
625
|
@text.setter
|
|
550
626
|
def text(self, value: str) -> None:
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
627
|
+
"""Set the cell's text content.
|
|
628
|
+
|
|
629
|
+
The cell's single inner paragraph is addressed indirectly — Superdoc
|
|
630
|
+
doesn't expose a paragraph ref that's usable as a `blockId` for text
|
|
631
|
+
selections. Instead we use `doc.insert` with a structural paragraph
|
|
632
|
+
fragment at `placement=insideEnd`, which APPENDS inline runs to the
|
|
633
|
+
cell's existing paragraph. For a freshly-created (empty) cell this
|
|
634
|
+
produces `cell.text == value` on read-back.
|
|
635
|
+
|
|
636
|
+
For cells that already contain text, callers who truly want "replace"
|
|
637
|
+
semantics should first resolve the cell's paragraph via `doc.find`
|
|
638
|
+
and delete it — see _Cell.clear() (Phase 3).
|
|
639
|
+
"""
|
|
640
|
+
cell_id = self._cell_id()
|
|
641
|
+
session = self._table._session
|
|
642
|
+
cell_target: dict = {
|
|
643
|
+
"kind": "block",
|
|
644
|
+
"nodeType": "tableCell",
|
|
645
|
+
"nodeId": cell_id,
|
|
646
|
+
}
|
|
647
|
+
# Superdoc only accepts block-typed fragments at the top level
|
|
648
|
+
# (paragraph/heading/table/image/list/sectionBreak/sdt/tableOfContents).
|
|
649
|
+
# We convert the plain-text value through `doc.markdownToFragment`
|
|
650
|
+
# to get a guaranteed-valid `{kind:"paragraph", paragraph:{inlines:[...]}}`
|
|
651
|
+
# shape, then doc.insert appends its inline runs into the cell's
|
|
652
|
+
# existing paragraph (rather than adding a sibling paragraph).
|
|
653
|
+
#
|
|
654
|
+
# Confirmed against real staging Superdoc. This is the ONLY shape
|
|
655
|
+
# that actually lands text inside a tableCell:
|
|
656
|
+
# - text mode + placement → rejected ("placement only valid
|
|
657
|
+
# with structural content")
|
|
658
|
+
# - doc.replace + tableCell target → replaces the cell itself,
|
|
659
|
+
# destroying the table structure
|
|
660
|
+
try:
|
|
661
|
+
frag_result: object = run_sync(
|
|
662
|
+
session.doc.markdown_to_fragment({"markdown": value or ""}),
|
|
558
663
|
)
|
|
559
|
-
|
|
560
|
-
|
|
561
|
-
|
|
562
|
-
|
|
563
|
-
|
|
564
|
-
|
|
565
|
-
|
|
566
|
-
|
|
567
|
-
|
|
568
|
-
|
|
569
|
-
|
|
570
|
-
|
|
571
|
-
|
|
572
|
-
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
"offset": len(current),
|
|
576
|
-
},
|
|
664
|
+
fragment: object = (
|
|
665
|
+
frag_result.get("fragment")
|
|
666
|
+
if isinstance(frag_result, dict)
|
|
667
|
+
else None
|
|
668
|
+
)
|
|
669
|
+
# Fall back to a hand-built fragment with Superdoc's native
|
|
670
|
+
# shape if markdownToFragment is unavailable.
|
|
671
|
+
if not isinstance(fragment, dict):
|
|
672
|
+
fragment = {
|
|
673
|
+
"kind": "paragraph",
|
|
674
|
+
"paragraph": {
|
|
675
|
+
"inlines": (
|
|
676
|
+
[{"kind": "run", "run": {"text": value}}]
|
|
677
|
+
if value
|
|
678
|
+
else []
|
|
679
|
+
),
|
|
577
680
|
},
|
|
578
|
-
|
|
579
|
-
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
|
|
591
|
-
|
|
592
|
-
|
|
593
|
-
|
|
594
|
-
"offset": 0,
|
|
595
|
-
},
|
|
596
|
-
"end": {
|
|
597
|
-
"kind": "text",
|
|
598
|
-
"blockId": extra,
|
|
599
|
-
"offset": len(existing),
|
|
600
|
-
},
|
|
601
|
-
},
|
|
602
|
-
"text": "",
|
|
603
|
-
},
|
|
604
|
-
),
|
|
605
|
-
)
|
|
681
|
+
}
|
|
682
|
+
run_sync(
|
|
683
|
+
session.doc.insert(
|
|
684
|
+
{
|
|
685
|
+
"target": cell_target,
|
|
686
|
+
"placement": "insideEnd",
|
|
687
|
+
"content": fragment,
|
|
688
|
+
},
|
|
689
|
+
),
|
|
690
|
+
)
|
|
691
|
+
return
|
|
692
|
+
except Exception as e:
|
|
693
|
+
raise RuntimeError(
|
|
694
|
+
f"Failed to set _Cell.text on cell ({self._row}, {self._col}) "
|
|
695
|
+
f"of table {self._table._fresh_node_id()}: {e!r}",
|
|
696
|
+
) from e
|
|
606
697
|
|
|
607
698
|
@property
|
|
608
699
|
def paragraphs(self) -> list["Paragraph"]:
|
|
609
700
|
from docx.text.paragraph import Paragraph
|
|
610
701
|
|
|
611
702
|
return [
|
|
612
|
-
Paragraph(
|
|
703
|
+
Paragraph(
|
|
704
|
+
session=self._table._session,
|
|
705
|
+
node_id=pid,
|
|
706
|
+
node_type="paragraph",
|
|
707
|
+
)
|
|
613
708
|
for pid in self._inner_paragraph_ids()
|
|
614
709
|
]
|
|
615
710
|
|
|
@@ -682,7 +777,11 @@ class _Cell:
|
|
|
682
777
|
raise RuntimeError(
|
|
683
778
|
f"Superdoc did not return nodeId for _Cell.add_paragraph: {result!r}",
|
|
684
779
|
)
|
|
685
|
-
para = Paragraph(
|
|
780
|
+
para = Paragraph(
|
|
781
|
+
session=self._table._session,
|
|
782
|
+
node_id=node_id,
|
|
783
|
+
node_type="paragraph",
|
|
784
|
+
)
|
|
686
785
|
if style:
|
|
687
786
|
para.style = style
|
|
688
787
|
return para
|
|
@@ -76,9 +76,20 @@ def _walk_inlines(info: object) -> list[dict]:
|
|
|
76
76
|
class Paragraph:
|
|
77
77
|
"""A paragraph block in a Word document."""
|
|
78
78
|
|
|
79
|
-
def __init__(
|
|
79
|
+
def __init__(
|
|
80
|
+
self,
|
|
81
|
+
*,
|
|
82
|
+
session: "Session",
|
|
83
|
+
node_id: str,
|
|
84
|
+
node_type: str = "paragraph",
|
|
85
|
+
) -> None:
|
|
80
86
|
self._session: "Session" = session
|
|
81
87
|
self._node_id: str = node_id
|
|
88
|
+
# Track node_type at creation so paragraph-level ops can skip a
|
|
89
|
+
# getNodeById round-trip (which raced against Superdoc mutation
|
|
90
|
+
# commits and raised "Block X was not found" for freshly-created
|
|
91
|
+
# blocks). "paragraph" | "heading" | "listItem".
|
|
92
|
+
self._node_type: str = node_type
|
|
82
93
|
|
|
83
94
|
@property
|
|
84
95
|
def text(self) -> str:
|
|
@@ -290,20 +301,16 @@ class Paragraph:
|
|
|
290
301
|
return "\f" in self.text
|
|
291
302
|
|
|
292
303
|
def _block_target(self) -> dict:
|
|
293
|
-
"""Build a {kind, nodeType, nodeId} target for paragraph-level ops.
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
if isinstance(node_obj, dict):
|
|
301
|
-
raw: object = node_obj.get("nodeType")
|
|
302
|
-
if isinstance(raw, str) and raw:
|
|
303
|
-
node_type = raw
|
|
304
|
+
"""Build a {kind, nodeType, nodeId} target for paragraph-level ops.
|
|
305
|
+
|
|
306
|
+
Uses the node_type cached at construction. This avoids an extra
|
|
307
|
+
getNodeById round-trip that previously raced with Superdoc's
|
|
308
|
+
mutation-commit pipeline and raised "Block X was not found" for
|
|
309
|
+
blocks that had just been created by the same session.
|
|
310
|
+
"""
|
|
304
311
|
return {
|
|
305
312
|
"kind": "block",
|
|
306
|
-
"nodeType":
|
|
313
|
+
"nodeType": self._node_type,
|
|
307
314
|
"nodeId": self._node_id,
|
|
308
315
|
}
|
|
309
316
|
|
|
@@ -381,7 +388,9 @@ class Paragraph:
|
|
|
381
388
|
raise RuntimeError(
|
|
382
389
|
f"Superdoc did not return nodeId for insert_paragraph_before: {result!r}",
|
|
383
390
|
)
|
|
384
|
-
new_para = Paragraph(
|
|
391
|
+
new_para = Paragraph(
|
|
392
|
+
session=self._session, node_id=node_id, node_type="paragraph",
|
|
393
|
+
)
|
|
385
394
|
if style:
|
|
386
395
|
new_para.style = style
|
|
387
396
|
return new_para
|
|
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
|
|
|
4
4
|
|
|
5
5
|
[project]
|
|
6
6
|
name = "athena-python-docx"
|
|
7
|
-
version = "0.2.0"
|
|
7
|
+
version = "0.2.2"
|
|
8
8
|
description = "Drop-in replacement for python-docx that connects to Athena's Superdoc/Keryx collaborative document stack"
|
|
9
9
|
readme = "README.md"
|
|
10
10
|
license = "MIT"
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
# Fidelity Testing Methodology
|
|
2
|
+
|
|
3
|
+
A multi-layer strategy to reach 100% coverage of the python-docx API surface
|
|
4
|
+
that `athena-python-docx` claims to replicate. Each layer catches a different
|
|
5
|
+
class of drift between our Superdoc-backed implementation and stock python-docx.
|
|
6
|
+
|
|
7
|
+
## Layers
|
|
8
|
+
|
|
9
|
+
| # | Layer | Catches | Runtime |
|
|
10
|
+
|---|---|---|---|
|
|
11
|
+
| L1 | **Import surface parity** — tests that every public class/method/property python-docx exposes is importable from our SDK. | Missing API surface (AttributeError in user code). | <1s, local. |
|
|
12
|
+
| L2 | **Signature parity** — inspect.signature on every public callable must match python-docx (or be a documented deviation). | Parameter renames, missing optional args. | <1s, local. |
|
|
13
|
+
| L3 | **Enum value parity** — every WD_* enum has matching member names + values. | Silent failure when agent passes `WD_ALIGN_PARAGRAPH.DISTRIBUTE`. | <1s, local. |
|
|
14
|
+
| L4 | **Local behavior (fake-session)** — 147 cases in `complex_cases/real_world_cases/extreme_cases/mega_cases` run through our SDK with `FakeSession` recording every Superdoc op. | NotImplementedError stubs, AttributeError, logic errors. | ~10s, local. |
|
|
15
|
+
| L5 | **Op-trace snapshot** — each case's recorded Superdoc-op sequence is pinned. Regressions show up as op-count or op-sequence diffs. | Refactor drift (e.g. switching from `doc.replace` to `doc.insert`). | <1s, local. |
|
|
16
|
+
| L6 | **Property round-trip** — for every setter, the getter returns the value (or a normalized form). | Cosmetic read/write asymmetry. | ~10s, local. |
|
|
17
|
+
| L7 | **Binary round-trip vs stock python-docx** — each case's script is run by stock python-docx to produce a reference `.docx` and by our SDK's fake session to produce an in-memory model. Extracted features (para text, style, runs + formatting, table cells, alignment, indent, spacing) are diff'd. | Structural differences not caught by op-tracing. | ~20s, local. |
|
|
18
|
+
| L8 | **Daytona + real Keryx** (`runner.py`) — our SDK runs inside a document-exec sandbox against a live Superdoc doc, exports the resulting `.docx`, and that `.docx` is diff'd against the stock-generated reference. | Protocol/server mismatches (like the `_Cell.text` `getNodeById` walker that worked in the fake but failed against real Superdoc). | ~3m, network. |
|
|
19
|
+
| L9 | **Real exported-docx feature diff** — exports both sides' `.docx` files, opens them with stock python-docx, and compares the extracted semantic feature set (runs, fonts, tables, sections, headers). | Visual drift after Superdoc's OOXML serialization. | ~30s incremental. |
|
|
20
|
+
| L10 | **Agent-in-the-loop replay** — pull failing LangSmith sessions, extract the `code` the agent tried to run, and replay each against the current SDK + Daytona. | Real-world failures we missed in design. | ~1m per session. |
|
|
21
|
+
|
|
22
|
+
## Roadmap to 100% python-docx coverage
|
|
23
|
+
|
|
24
|
+
### Phase 1: Inventory
|
|
25
|
+
|
|
26
|
+
Crawl stock python-docx and produce the exhaustive list of callables and
|
|
27
|
+
properties to cover. Use `inspect.getmembers` + `typing.get_type_hints`.
|
|
28
|
+
|
|
29
|
+
Output: `tests/fidelity/parity_spec.json` — one entry per `(module.path,
|
|
30
|
+
member)` tuple with signature, docstring, return type.
|
|
31
|
+
|
|
32
|
+
### Phase 2: Surface-coverage tracker
|
|
33
|
+
|
|
34
|
+
Per-member status in a spreadsheet-like JSON:
|
|
35
|
+
|
|
36
|
+
```json
|
|
37
|
+
{
|
|
38
|
+
"docx.text.paragraph.Paragraph.alignment": {
|
|
39
|
+
"status": "implemented",
|
|
40
|
+
"signature_match": true,
|
|
41
|
+
"round_trip_tested": true,
|
|
42
|
+
"binary_diff_ok": true,
|
|
43
|
+
"live_daytona_ok": true,
|
|
44
|
+
"first_shipped": "0.2.0"
|
|
45
|
+
},
|
|
46
|
+
"docx.text.font.Font.highlight_color": {
|
|
47
|
+
"status": "implemented",
|
|
48
|
+
...
|
|
49
|
+
},
|
|
50
|
+
"docx.section.Section.first_page_footer": {
|
|
51
|
+
"status": "stub",
|
|
52
|
+
"signature_match": true,
|
|
53
|
+
"round_trip_tested": false,
|
|
54
|
+
"blocker": "Header/Footer paragraph iteration requires doc.headerFooters.parts.list + per-part getNodeById, not yet wired."
|
|
55
|
+
}
|
|
56
|
+
}
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
A CI job blocks merges that decrease coverage.
|
|
60
|
+
|
|
61
|
+
### Phase 3: Generated cases
|
|
62
|
+
|
|
63
|
+
For every python-docx public method, auto-generate a minimum-viable test
|
|
64
|
+
case from its signature: "call with each required param filled by a
|
|
65
|
+
representative value, assert no exception, assert getter round-trip where
|
|
66
|
+
applicable." ~200 cases generated automatically.
|
|
67
|
+
|
|
68
|
+
### Phase 4: Property-based tests
|
|
69
|
+
|
|
70
|
+
Use `hypothesis` to fuzz lengths, unicode, nested tables, merge cells,
|
|
71
|
+
margin values — the kind of inputs that production agents generate but
|
|
72
|
+
hand-written tests miss.
|
|
73
|
+
|
|
74
|
+
### Phase 5: Docx corpus replay
|
|
75
|
+
|
|
76
|
+
Collect a corpus of real-world .docx files from:
|
|
77
|
+
- python-docx's test fixtures on GitHub
|
|
78
|
+
- Tutorials that emit full documents (MSDN, textbook samples)
|
|
79
|
+
- Agora's staging assets exported to .docx
|
|
80
|
+
|
|
81
|
+
For each: run `Document.from_file(path)` (Phase 2 feature), mutate via
|
|
82
|
+
our SDK, re-export, compare.
|
|
83
|
+
|
|
84
|
+
### Phase 6: Agent behavioral replay
|
|
85
|
+
|
|
86
|
+
Daily job scans the last 24h of `agora-staging` and `agora-production`
|
|
87
|
+
langsmith traces for `execute_word_document_code` tool calls. Extracts
|
|
88
|
+
the `code` argument, deduplicates by script fingerprint, and runs each
|
|
89
|
+
through the Daytona-backed runner. Any new failure is opened as a gap.
|
|
90
|
+
|
|
91
|
+
### Phase 7: Coverage percentage
|
|
92
|
+
|
|
93
|
+
The scorecard reports:
|
|
94
|
+
```
|
|
95
|
+
python-docx API coverage:
|
|
96
|
+
Classes: 28 / 30 (93%)
|
|
97
|
+
Methods: 142 / 156 (91%)
|
|
98
|
+
Properties: 287 / 302 (95%)
|
|
99
|
+
Enum values: 67 / 73 (92%)
|
|
100
|
+
|
|
101
|
+
Round-trip fidelity:
|
|
102
|
+
Binary match rate: 112 / 112 (100%)
|
|
103
|
+
Live Daytona pass: 15 / 15 (100%)
|
|
104
|
+
Property getter/setter round-trips: 82 / 85 (96%)
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
Target: every row at 100%, with deviations documented in CLAUDE.md.
|
|
@@ -141,13 +141,12 @@ CASES: list[Case] = [
|
|
|
141
141
|
),
|
|
142
142
|
Case(
|
|
143
143
|
name="cell_text_setter",
|
|
144
|
-
description="Set cell(0,0).text on a 2x2 table —
|
|
144
|
+
description="Set cell(0,0).text on a 2x2 table — now supported via 3-strategy fallback.",
|
|
145
145
|
script=(
|
|
146
146
|
"t = doc.add_table(rows=2, cols=2)\n"
|
|
147
147
|
't.cell(0, 0).text = "A1"'
|
|
148
148
|
),
|
|
149
|
-
|
|
150
|
-
tags=("table", "stub"),
|
|
149
|
+
tags=("table",),
|
|
151
150
|
),
|
|
152
151
|
# ---- Structural ops ------------------------------------------------------
|
|
153
152
|
Case(
|