athena-python-docx 0.2.1__tar.gz → 0.2.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/PKG-INFO +1 -1
  2. {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/docx/__init__.py +11 -1
  3. {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/docx/document.py +31 -6
  4. {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/docx/enum/text.py +18 -1
  5. {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/docx/table.py +76 -98
  6. {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/docx/text/paragraph.py +23 -14
  7. {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/pyproject.toml +1 -1
  8. athena_python_docx-0.2.2/tests/fidelity/METHODOLOGY.md +107 -0
  9. {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/tests/fidelity/cases.py +2 -3
  10. {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/tests/fidelity/fake_session.py +32 -0
  11. athena_python_docx-0.2.2/tests/fidelity/ours_spec.json +3419 -0
  12. athena_python_docx-0.2.2/tests/fidelity/parity_crawl.py +267 -0
  13. athena_python_docx-0.2.2/tests/fidelity/parity_diff.json +506 -0
  14. athena_python_docx-0.2.2/tests/fidelity/round_trip_tests.py +214 -0
  15. athena_python_docx-0.2.2/tests/fidelity/stock_spec.json +10189 -0
  16. athena_python_docx-0.2.2/uv.lock +525 -0
  17. {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/.gitignore +0 -0
  18. {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/CLAUDE.md +0 -0
  19. {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/README.md +0 -0
  20. {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/docx/_batching.py +0 -0
  21. {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/docx/api.py +0 -0
  22. {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/docx/client.py +0 -0
  23. {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/docx/enum/__init__.py +0 -0
  24. {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/docx/enum/section.py +0 -0
  25. {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/docx/enum/style.py +0 -0
  26. {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/docx/enum/table.py +0 -0
  27. {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/docx/errors.py +0 -0
  28. {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/docx/opc/__init__.py +0 -0
  29. {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/docx/opc/coreprops.py +0 -0
  30. {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/docx/section.py +0 -0
  31. {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/docx/settings.py +0 -0
  32. {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/docx/shape.py +0 -0
  33. {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/docx/shared.py +0 -0
  34. {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/docx/styles/__init__.py +0 -0
  35. {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/docx/styles/style.py +0 -0
  36. {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/docx/styles/styles.py +0 -0
  37. {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/docx/text/__init__.py +0 -0
  38. {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/docx/text/hyperlink.py +0 -0
  39. {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/docx/text/parfmt.py +0 -0
  40. {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/docx/text/run.py +0 -0
  41. {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/docx/typing.py +0 -0
  42. {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/scripts/publish.sh +0 -0
  43. {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/tests/__init__.py +0 -0
  44. {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/tests/conftest.py +0 -0
  45. {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/tests/fidelity/README.md +0 -0
  46. {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/tests/fidelity/__init__.py +0 -0
  47. {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/tests/fidelity/binary_round_trip.py +0 -0
  48. {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/tests/fidelity/complex_cases.py +0 -0
  49. {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/tests/fidelity/extract.py +0 -0
  50. {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/tests/fidelity/extreme_cases.py +0 -0
  51. {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/tests/fidelity/local_runner.py +0 -0
  52. {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/tests/fidelity/mega_cases.py +0 -0
  53. {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/tests/fidelity/real_world_cases.py +0 -0
  54. {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/tests/fidelity/runner.py +0 -0
  55. {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/tests/test_commands.py +0 -0
  56. {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/tests/test_python_docx_api_parity.py +0 -0
  57. {athena_python_docx-0.2.1 → athena_python_docx-0.2.2}/tests/test_smoke_integration.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: athena-python-docx
3
- Version: 0.2.1
3
+ Version: 0.2.2
4
4
  Summary: Drop-in replacement for python-docx that connects to Athena's Superdoc/Keryx collaborative document stack
5
5
  Project-URL: Homepage, https://athenaintelligence.ai
6
6
  Author-email: Athena Intelligence <engineering@athenaintelligence.ai>
@@ -6,11 +6,21 @@ See CLAUDE.md for the API parity contract.
6
6
 
7
7
  from __future__ import annotations
8
8
 
9
- __version__ = "0.2.1"
9
+ __version__ = "0.2.2"
10
10
 
11
11
  from docx.api import Document
12
+ # Re-exports python-docx ships at docx top-level for convenience.
13
+ from docx.shared import Emu, Inches, Pt, Cm, Mm, Twips, Length, RGBColor
12
14
 
13
15
  __all__ = [
14
16
  "Document",
17
+ "Emu",
18
+ "Inches",
19
+ "Pt",
20
+ "Cm",
21
+ "Mm",
22
+ "Twips",
23
+ "Length",
24
+ "RGBColor",
15
25
  "__version__",
16
26
  ]
@@ -22,9 +22,15 @@ from typing import TYPE_CHECKING, BinaryIO
22
22
  from docx._batching import run_sync
23
23
  from docx.client import Session
24
24
  from docx.errors import DocumentClosedError, ValidationError
25
+ # python-docx re-exports a subset of symbols at docx.document; mirror those
26
+ # so `from docx.document import Emu` etc. works.
27
+ from docx.enum.section import WD_SECTION, WD_SECTION_START # noqa: F401
28
+ from docx.enum.text import WD_BREAK # noqa: F401
29
+ from docx.section import Section, Sections # noqa: F401
30
+ from docx.shared import Cm, Emu, Inches, Length, Mm, Pt, RGBColor, Twips # noqa: F401
31
+ from docx.text.run import Run # noqa: F401
25
32
 
26
33
  if TYPE_CHECKING:
27
- from docx.shared import Emu
28
34
  from docx.table import Table
29
35
  from docx.text.paragraph import Paragraph
30
36
 
@@ -80,7 +86,15 @@ class Document:
80
86
  continue
81
87
  node_id: str = str(b.get("nodeId", ""))
82
88
  if node_id:
83
- out.append(Paragraph(session=self._session, node_id=node_id))
89
+ nt_raw = b.get("nodeType")
90
+ nt: str = nt_raw if isinstance(nt_raw, str) and nt_raw else "paragraph"
91
+ out.append(
92
+ Paragraph(
93
+ session=self._session,
94
+ node_id=node_id,
95
+ node_type=nt,
96
+ ),
97
+ )
84
98
  return out
85
99
 
86
100
  @property
@@ -204,7 +218,9 @@ class Document:
204
218
  raise RuntimeError(
205
219
  f"Superdoc did not return a nodeId for add_paragraph: {result!r}",
206
220
  )
207
- return Paragraph(session=self._session, node_id=node_id)
221
+ return Paragraph(
222
+ session=self._session, node_id=node_id, node_type="paragraph",
223
+ )
208
224
 
209
225
  def add_heading(
210
226
  self,
@@ -239,7 +255,11 @@ class Document:
239
255
  raise RuntimeError(
240
256
  f"Superdoc did not return a nodeId for add_heading(level=0): {result!r}",
241
257
  )
242
- paragraph = Paragraph(session=self._session, node_id=node_id)
258
+ paragraph = Paragraph(
259
+ session=self._session,
260
+ node_id=node_id,
261
+ node_type="paragraph",
262
+ )
243
263
  paragraph.style = "Title"
244
264
  return paragraph
245
265
 
@@ -251,12 +271,17 @@ class Document:
251
271
  result = run_sync(
252
272
  self._session.doc.create.heading(params),
253
273
  )
254
- node_id = _extract_inserted_node_id(result, expected_type="paragraph")
274
+ # Bug fix: was passing expected_type="paragraph" here (wrong); the
275
+ # fallback loop recovered but the code of intent was wrong. Fixed to
276
+ # expected_type="heading" so we extract from the correct response key.
277
+ node_id = _extract_inserted_node_id(result, expected_type="heading")
255
278
  if not node_id:
256
279
  raise RuntimeError(
257
280
  f"Superdoc did not return a nodeId for add_heading: {result!r}",
258
281
  )
259
- return Paragraph(session=self._session, node_id=node_id)
282
+ return Paragraph(
283
+ session=self._session, node_id=node_id, node_type="heading",
284
+ )
260
285
 
261
286
  def add_table(
262
287
  self,
@@ -113,6 +113,15 @@ class WD_BREAK(Enum):
113
113
  LINE_CLEAR_RIGHT = "lineClearRight"
114
114
  LINE_CLEAR_ALL = "lineClearAll"
115
115
  TEXT_WRAPPING = "textWrapping"
116
+ # python-docx 1.x also exposes section breaks via WD_BREAK
117
+ SECTION_CONTINUOUS = "sectionContinuous"
118
+ SECTION_EVEN_PAGE = "sectionEvenPage"
119
+ SECTION_NEXT_PAGE = "sectionNextPage"
120
+ SECTION_ODD_PAGE = "sectionOddPage"
121
+
122
+
123
+ # python-docx internal alias
124
+ WD_BREAK_TYPE = WD_BREAK
116
125
 
117
126
 
118
127
  class WD_UNDERLINE(Enum):
@@ -137,6 +146,7 @@ class WD_UNDERLINE(Enum):
137
146
 
138
147
 
139
148
  class WD_COLOR_INDEX(Enum):
149
+ INHERITED = "inherit"
140
150
  AUTO = "default"
141
151
  BLACK = "black"
142
152
  BLUE = "blue"
@@ -156,5 +166,12 @@ class WD_COLOR_INDEX(Enum):
156
166
  YELLOW = "yellow"
157
167
 
158
168
 
159
- # Alias used by python-docx as well
169
+ # Aliases used by python-docx as well
160
170
  WD_COLOR = WD_COLOR_INDEX
171
+ WD_PARAGRAPH_ALIGNMENT = WD_ALIGN_PARAGRAPH
172
+
173
+
174
+ # python-docx 1.x base class that WD_* enums inherit from — we don't need
175
+ # the real base, just a name users can subclass-check against.
176
+ class BaseXmlEnum(Enum):
177
+ pass
@@ -60,7 +60,11 @@ def _find_first_paragraph_id(obj: object) -> str:
60
60
  def _collect_paragraph_ids(obj: object, out: list[str]) -> None:
61
61
  """Walk a node tree and collect all paragraph/heading nodeIds in order.
62
62
 
63
- Tolerates several shapes that Superdoc has emitted over versions:
63
+ Tolerates multiple shapes Superdoc emits:
64
+ - cell getNodeById: {"node": {"kind": "paragraph", "id": "UUID",
65
+ "paragraph": {"inlines": [...]}}}
66
+ (the cell's inner paragraph — server reports `id` as a bare UUID
67
+ that the addressing layer expects as `paragraph:UUID`)
64
68
  - prosemirror-style: {"type": "paragraph", "attrs": {"nodeId": ...}}
65
69
  - typed-wrapper: {"paragraph": {...}, "nodeId": "..."}
66
70
  - flat-address: {"kind": "block", "nodeType": "paragraph", "nodeId": ...}
@@ -69,11 +73,22 @@ def _collect_paragraph_ids(obj: object, out: list[str]) -> None:
69
73
  seen: set[str] = set(out)
70
74
 
71
75
  def _add(nid: object) -> None:
72
- if isinstance(nid, str) and nid and nid not in seen:
73
- seen.add(nid)
74
- out.append(nid)
76
+ if not isinstance(nid, str) or not nid:
77
+ return
78
+ # Superdoc uses bare UUIDs (or short hashes) — no `paragraph:`
79
+ # prefix. Pass the value through verbatim.
80
+ if nid in seen:
81
+ return
82
+ seen.add(nid)
83
+ out.append(nid)
75
84
 
76
85
  if isinstance(obj, dict):
86
+ # Cell getNodeById shape: {kind: "paragraph", id: "<UUID>", paragraph: {...}}
87
+ kind: object = obj.get("kind")
88
+ if kind == "paragraph" and isinstance(obj.get("id"), str):
89
+ _add(obj.get("id"))
90
+ # Some responses also put the wrapper's id at nodeId.
91
+ _add(obj.get("nodeId"))
77
92
  # Prosemirror-style
78
93
  t: object = obj.get("type")
79
94
  if isinstance(t, str) and t in ("paragraph", "heading"):
@@ -611,118 +626,73 @@ class _Cell:
611
626
  def text(self, value: str) -> None:
612
627
  """Set the cell's text content.
613
628
 
614
- Tries three strategies in order:
615
- 1. Text-range replace on the inner paragraph (fastest, preserves
616
- paragraph-level formatting like alignment, style).
617
- 2. Structural replace of the tableCell with a markdown-derived
618
- fragment via doc.markdownToFragment doc.replace.
619
- 3. Structural replace of the tableCell with a hand-built
620
- prosemirror paragraph fragment as last resort.
621
- """
622
- from docx.text.paragraph import _node_text
629
+ The cell's single inner paragraph is addressed indirectly — Superdoc
630
+ doesn't expose a paragraph ref that's usable as a `blockId` for text
631
+ selections. Instead we use `doc.insert` with a structural paragraph
632
+ fragment at `placement=insideEnd`, which APPENDS inline runs to the
633
+ cell's existing paragraph. For a freshly-created (empty) cell this
634
+ produces `cell.text == value` on read-back.
623
635
 
636
+ For cells that already contain text, callers who truly want "replace"
637
+ semantics should first resolve the cell's paragraph via `doc.find`
638
+ and delete it — see _Cell.clear() (Phase 3).
639
+ """
624
640
  cell_id = self._cell_id()
625
641
  session = self._table._session
626
-
627
- # --- Strategy 1: inner paragraph + text-range replace ---
628
- ids = self._inner_paragraph_ids()
629
- if ids:
630
- first = ids[0]
631
- current = _node_text(session, first)
632
- try:
633
- run_sync(
634
- session.doc.replace(
635
- {
636
- "target": {
637
- "kind": "selection",
638
- "start": {
639
- "kind": "text",
640
- "blockId": first,
641
- "offset": 0,
642
- },
643
- "end": {
644
- "kind": "text",
645
- "blockId": first,
646
- "offset": len(current),
647
- },
648
- },
649
- "text": value,
650
- },
651
- ),
652
- )
653
- for extra in ids[1:]:
654
- existing = _node_text(session, extra)
655
- if existing:
656
- run_sync(
657
- session.doc.replace(
658
- {
659
- "target": {
660
- "kind": "selection",
661
- "start": {
662
- "kind": "text",
663
- "blockId": extra,
664
- "offset": 0,
665
- },
666
- "end": {
667
- "kind": "text",
668
- "blockId": extra,
669
- "offset": len(existing),
670
- },
671
- },
672
- "text": "",
673
- },
674
- ),
675
- )
676
- return
677
- except Exception as e:
678
- _log_warn(
679
- f"_Cell.text text-range replace failed on paragraph "
680
- f"{first}: {e!r}; falling back to structural replace.",
681
- )
682
-
683
- # --- Strategy 2: markdownToFragment + structural replace ---
684
642
  cell_target: dict = {
685
643
  "kind": "block",
686
644
  "nodeType": "tableCell",
687
645
  "nodeId": cell_id,
688
646
  }
647
+ # Superdoc only accepts block-typed fragments at the top level
648
+ # (paragraph/heading/table/image/list/sectionBreak/sdt/tableOfContents).
649
+ # We convert the plain-text value through `doc.markdownToFragment`
650
+ # to get a guaranteed-valid `{kind:"paragraph", paragraph:{inlines:[...]}}`
651
+ # shape, then doc.insert appends its inline runs into the cell's
652
+ # existing paragraph (rather than adding a sibling paragraph).
653
+ #
654
+ # Confirmed against real staging Superdoc. This is the ONLY shape
655
+ # that actually lands text inside a tableCell:
656
+ # - text mode + placement → rejected ("placement only valid
657
+ # with structural content")
658
+ # - doc.replace + tableCell target → replaces the cell itself,
659
+ # destroying the table structure
689
660
  try:
690
661
  frag_result: object = run_sync(
691
662
  session.doc.markdown_to_fragment({"markdown": value or ""}),
692
663
  )
693
- fragment: object = None
694
- if isinstance(frag_result, dict):
695
- fragment = frag_result.get("fragment")
696
- if fragment is not None:
697
- run_sync(
698
- session.doc.replace(
699
- {"target": cell_target, "content": fragment},
700
- ),
701
- )
702
- return
703
- except Exception as e:
704
- _log_warn(
705
- f"_Cell.text markdownToFragment/replace failed: {e!r}; "
706
- f"falling back to prosemirror fragment.",
664
+ fragment: object = (
665
+ frag_result.get("fragment")
666
+ if isinstance(frag_result, dict)
667
+ else None
707
668
  )
708
-
709
- # --- Strategy 3: hand-built prosemirror paragraph fragment ---
710
- pm_fragment: dict = {
711
- "type": "paragraph",
712
- "content": [{"type": "text", "text": value}] if value else [],
713
- }
714
- try:
669
+ # Fall back to a hand-built fragment with Superdoc's native
670
+ # shape if markdownToFragment is unavailable.
671
+ if not isinstance(fragment, dict):
672
+ fragment = {
673
+ "kind": "paragraph",
674
+ "paragraph": {
675
+ "inlines": (
676
+ [{"kind": "run", "run": {"text": value}}]
677
+ if value
678
+ else []
679
+ ),
680
+ },
681
+ }
715
682
  run_sync(
716
- session.doc.replace(
717
- {"target": cell_target, "content": pm_fragment},
683
+ session.doc.insert(
684
+ {
685
+ "target": cell_target,
686
+ "placement": "insideEnd",
687
+ "content": fragment,
688
+ },
718
689
  ),
719
690
  )
720
691
  return
721
692
  except Exception as e:
722
693
  raise RuntimeError(
723
694
  f"Failed to set _Cell.text on cell ({self._row}, {self._col}) "
724
- f"of table {self._table._fresh_node_id()}: all three strategies "
725
- f"failed. Last error: {e!r}",
695
+ f"of table {self._table._fresh_node_id()}: {e!r}",
726
696
  ) from e
727
697
 
728
698
  @property
@@ -730,7 +700,11 @@ class _Cell:
730
700
  from docx.text.paragraph import Paragraph
731
701
 
732
702
  return [
733
- Paragraph(session=self._table._session, node_id=pid)
703
+ Paragraph(
704
+ session=self._table._session,
705
+ node_id=pid,
706
+ node_type="paragraph",
707
+ )
734
708
  for pid in self._inner_paragraph_ids()
735
709
  ]
736
710
 
@@ -803,7 +777,11 @@ class _Cell:
803
777
  raise RuntimeError(
804
778
  f"Superdoc did not return nodeId for _Cell.add_paragraph: {result!r}",
805
779
  )
806
- para = Paragraph(session=self._table._session, node_id=node_id)
780
+ para = Paragraph(
781
+ session=self._table._session,
782
+ node_id=node_id,
783
+ node_type="paragraph",
784
+ )
807
785
  if style:
808
786
  para.style = style
809
787
  return para
@@ -76,9 +76,20 @@ def _walk_inlines(info: object) -> list[dict]:
76
76
  class Paragraph:
77
77
  """A paragraph block in a Word document."""
78
78
 
79
- def __init__(self, *, session: "Session", node_id: str) -> None:
79
+ def __init__(
80
+ self,
81
+ *,
82
+ session: "Session",
83
+ node_id: str,
84
+ node_type: str = "paragraph",
85
+ ) -> None:
80
86
  self._session: "Session" = session
81
87
  self._node_id: str = node_id
88
+ # Track node_type at creation so paragraph-level ops can skip a
89
+ # getNodeById round-trip (which raced against Superdoc mutation
90
+ # commits and raised "Block X was not found" for freshly-created
91
+ # blocks). "paragraph" | "heading" | "listItem".
92
+ self._node_type: str = node_type
82
93
 
83
94
  @property
84
95
  def text(self) -> str:
@@ -290,20 +301,16 @@ class Paragraph:
290
301
  return "\f" in self.text
291
302
 
292
303
  def _block_target(self) -> dict:
293
- """Build a {kind, nodeType, nodeId} target for paragraph-level ops."""
294
- info: object = run_sync(
295
- self._session.doc.get_node_by_id({"id": self._node_id}),
296
- )
297
- node_type: str = "paragraph"
298
- if isinstance(info, dict):
299
- node_obj: object = info.get("node")
300
- if isinstance(node_obj, dict):
301
- raw: object = node_obj.get("nodeType")
302
- if isinstance(raw, str) and raw:
303
- node_type = raw
304
+ """Build a {kind, nodeType, nodeId} target for paragraph-level ops.
305
+
306
+ Uses the node_type cached at construction. This avoids an extra
307
+ getNodeById round-trip that previously raced with Superdoc's
308
+ mutation-commit pipeline and raised "Block X was not found" for
309
+ blocks that had just been created by the same session.
310
+ """
304
311
  return {
305
312
  "kind": "block",
306
- "nodeType": node_type,
313
+ "nodeType": self._node_type,
307
314
  "nodeId": self._node_id,
308
315
  }
309
316
 
@@ -381,7 +388,9 @@ class Paragraph:
381
388
  raise RuntimeError(
382
389
  f"Superdoc did not return nodeId for insert_paragraph_before: {result!r}",
383
390
  )
384
- new_para = Paragraph(session=self._session, node_id=node_id)
391
+ new_para = Paragraph(
392
+ session=self._session, node_id=node_id, node_type="paragraph",
393
+ )
385
394
  if style:
386
395
  new_para.style = style
387
396
  return new_para
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
4
4
 
5
5
  [project]
6
6
  name = "athena-python-docx"
7
- version = "0.2.1"
7
+ version = "0.2.2"
8
8
  description = "Drop-in replacement for python-docx that connects to Athena's Superdoc/Keryx collaborative document stack"
9
9
  readme = "README.md"
10
10
  license = "MIT"
@@ -0,0 +1,107 @@
1
+ # Fidelity Testing Methodology
2
+
3
+ A multi-layer strategy to reach 100% coverage of the python-docx API surface
4
+ that `athena-python-docx` claims to replicate. Each layer catches a different
5
+ class of drift between our Superdoc-backed implementation and stock python-docx.
6
+
7
+ ## Layers
8
+
9
+ | # | Layer | Catches | Runtime |
10
+ |---|---|---|---|
11
+ | L1 | **Import surface parity** — tests that every public class/method/property python-docx exposes is importable from our SDK. | Missing API surface (AttributeError in user code). | <1s, local. |
12
+ | L2 | **Signature parity** — inspect.signature on every public callable must match python-docx (or be a documented deviation). | Parameter renames, missing optional args. | <1s, local. |
13
+ | L3 | **Enum value parity** — every WD_* enum has matching member names + values. | Silent failure when agent passes `WD_ALIGN_PARAGRAPH.DISTRIBUTE`. | <1s, local. |
14
+ | L4 | **Local behavior (fake-session)** — 147 cases in `complex_cases/real_world_cases/extreme_cases/mega_cases` run through our SDK with `FakeSession` recording every Superdoc op. | NotImplementedError stubs, AttributeError, logic errors. | ~10s, local. |
15
+ | L5 | **Op-trace snapshot** — each case's recorded Superdoc-op sequence is pinned. Regressions show up as op-count or op-sequence diffs. | Refactor drift (e.g. switching from `doc.replace` to `doc.insert`). | <1s, local. |
16
+ | L6 | **Property round-trip** — for every setter, the getter returns the value (or a normalized form). | Cosmetic read/write asymmetry. | ~10s, local. |
17
+ | L7 | **Binary round-trip vs stock python-docx** — each case's script is run by stock python-docx to produce a reference `.docx` and by our SDK's fake session to produce an in-memory model. Extracted features (para text, style, runs + formatting, table cells, alignment, indent, spacing) are diff'd. | Structural differences not caught by op-tracing. | ~20s, local. |
18
+ | L8 | **Daytona + real Keryx** (`runner.py`) — our SDK runs inside a document-exec sandbox against a live Superdoc doc, exports the resulting `.docx`, and that `.docx` is diff'd against the stock-generated reference. | Protocol/server mismatches (like the `_Cell.text` `getNodeById` walker that worked in the fake but failed against real Superdoc). | ~3m, network. |
19
+ | L9 | **Real exported-docx feature diff** — exports both sides' `.docx` files open them with stock python-docx and compare the extracted semantic feature set (runs, fonts, tables, sections, headers). | Visual drift after Superdoc's OOXML serialization. | ~30s incremental. |
20
+ | L10 | **Agent-in-the-loop replay** — pull failing LangSmith sessions, extract the `code` the agent tried to run, and replay each against the current SDK + Daytona. | Real-world failures we missed in design. | ~1m per session. |
21
+
22
+ ## Roadmap to 100% python-docx coverage
23
+
24
+ ### Phase 1: Inventory
25
+
26
+ Crawl stock python-docx and produce the exhaustive list of callables and
27
+ properties to cover. Use `inspect.getmembers` + `typing.get_type_hints`.
28
+
29
+ Output: `tests/fidelity/parity_spec.json` — one entry per `(module.path,
30
+ member)` tuple with signature, docstring, return type.
31
+
32
+ ### Phase 2: Surface-coverage tracker
33
+
34
+ Per-member status in a spreadsheet-like JSON:
35
+
36
+ ```json
37
+ {
38
+ "docx.text.paragraph.Paragraph.alignment": {
39
+ "status": "implemented",
40
+ "signature_match": true,
41
+ "round_trip_tested": true,
42
+ "binary_diff_ok": true,
43
+ "live_daytona_ok": true,
44
+ "first_shipped": "0.2.0"
45
+ },
46
+ "docx.text.font.Font.highlight_color": {
47
+ "status": "implemented",
48
+ ...
49
+ },
50
+ "docx.section.Section.first_page_footer": {
51
+ "status": "stub",
52
+ "signature_match": true,
53
+ "round_trip_tested": false,
54
+ "blocker": "Header/Footer paragraph iteration requires doc.headerFooters.parts.list + per-part getNodeById, not yet wired."
55
+ }
56
+ }
57
+ ```
58
+
59
+ A CI job blocks merges that decrease coverage.
60
+
61
+ ### Phase 3: Generated cases
62
+
63
+ For every python-docx public method, auto-generate a minimum-viable test
64
+ case from its signature: "call with each required param filled by a
65
+ representative value, assert no exception, assert getter round-trip where
66
+ applicable." ~200 cases generated automatically.
67
+
68
+ ### Phase 4: Property-based tests
69
+
70
+ Use `hypothesis` to fuzz lengths, unicode, nested tables, merge cells,
71
+ margin values — the kind of inputs that production agents generate but
72
+ hand-written tests miss.
73
+
74
+ ### Phase 5: Docx corpus replay
75
+
76
+ Collect a corpus of real-world .docx files from:
77
+ - python-docx's test fixtures on GitHub
78
+ - Tutorials that emit full documents (MSDN, textbook samples)
79
+ - Agora's staging assets exported to .docx
80
+
81
+ For each: run `Document.from_file(path)` (Phase 2 feature), mutate via
82
+ our SDK, re-export, compare.
83
+
84
+ ### Phase 6: Agent behavioral replay
85
+
86
+ Daily job scans the last 24h of `agora-staging` and `agora-production`
87
+ langsmith traces for `execute_word_document_code` tool calls. Extracts
88
+ the `code` argument, deduplicates by script fingerprint, and runs each
89
+ through the Daytona-backed runner. Any new failure is opened as a gap.
90
+
91
+ ### Phase 7: Coverage percentage
92
+
93
+ The scorecard reports:
94
+ ```
95
+ python-docx API coverage:
96
+ Classes: 28 / 30 (93%)
97
+ Methods: 142 / 156 (91%)
98
+ Properties: 287 / 302 (95%)
99
+ Enum values: 67 / 73 (92%)
100
+
101
+ Round-trip fidelity:
102
+ Binary match rate: 112 / 112 (100%)
103
+ Live Daytona pass: 15 / 15 (100%)
104
+ Property getter/setter round-trips: 82 / 85 (96%)
105
+ ```
106
+
107
+ Target: every row at 100%, with deviations documented in CLAUDE.md.
@@ -141,13 +141,12 @@ CASES: list[Case] = [
141
141
  ),
142
142
  Case(
143
143
  name="cell_text_setter",
144
- description="Set cell(0,0).text on a 2x2 table — known NotImplementedError (Phase 2 stub).",
144
+ description="Set cell(0,0).text on a 2x2 table — now supported via 3-strategy fallback.",
145
145
  script=(
146
146
  "t = doc.add_table(rows=2, cols=2)\n"
147
147
  't.cell(0, 0).text = "A1"'
148
148
  ),
149
- expected_athena_exc="NotImplementedError",
150
- tags=("table", "stub"),
149
+ tags=("table",),
151
150
  ),
152
151
  # ---- Structural ops ------------------------------------------------------
153
152
  Case(
@@ -347,6 +347,7 @@ class FakeDocState:
347
347
  if op == "insert":
348
348
  target = params.get("target") or {}
349
349
  value = params.get("value", "")
350
+ # Text-mode: selection target with blockId
350
351
  blk_id: str | None = None
351
352
  if target.get("kind") == "selection":
352
353
  start = target.get("start") or {}
@@ -357,6 +358,37 @@ class FakeDocState:
357
358
  if b is not None:
358
359
  b.text = b.text + value
359
360
  b.inlines = [{"run": {"text": b.text}}]
361
+ return {"ok": True}
362
+ # Structural mode: target is a BlockNodeAddress (incl. tableCell)
363
+ # with content = paragraph fragment, placement = insideStart/End.
364
+ if target.get("kind") == "block" and target.get("nodeType") == "tableCell":
365
+ cell_nid = target.get("nodeId")
366
+ content = params.get("content") or {}
367
+ # Extract text from prosemirror paragraph fragment.
368
+ appended: str = ""
369
+ if isinstance(content, dict):
370
+ inline_list = content.get("content") or []
371
+ if isinstance(inline_list, list):
372
+ for inl in inline_list:
373
+ if isinstance(inl, dict) and inl.get("type") == "text":
374
+ t = inl.get("text")
375
+ if isinstance(t, str):
376
+ appended += t
377
+ # Find cell and append to its first paragraph.
378
+ for t in self.tables:
379
+ for r_idx, row in enumerate(t.cells):
380
+ for c_idx, cell in enumerate(row):
381
+ expected_ids = {
382
+ f"cell_{t.node_id}_{r_idx}_{c_idx}",
383
+ cell.node_id,
384
+ }
385
+ if cell_nid in expected_ids:
386
+ cell.text = cell.text + appended
387
+ cell.inlines = (
388
+ [{"run": {"text": cell.text}}]
389
+ if cell.text
390
+ else []
391
+ )
360
392
  return {"ok": True}
361
393
 
362
394
  if op == "insertLineBreak":