@heylemon/lemonade 0.0.4 → 0.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (105)
  1. package/dist/build-info.json +3 -3
  2. package/dist/canvas-host/a2ui/.bundle.hash +1 -1
  3. package/package.json +1 -1
  4. package/skills/docx/SKILL.md +25 -30
  5. package/skills/docx/scripts/accept_changes.py +0 -17
  6. package/skills/docx/scripts/comment.py +10 -39
  7. package/skills/docx/scripts/office/helpers/merge_runs.py +1 -33
  8. package/skills/docx/scripts/office/helpers/simplify_redlines.py +0 -43
  9. package/skills/docx/scripts/office/pack.py +0 -30
  10. package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +1499 -1499
  11. package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +1085 -1085
  12. package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd +3081 -3081
  13. package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +287 -287
  14. package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd +1676 -1676
  15. package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +174 -174
  16. package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd +582 -582
  17. package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd +4439 -4439
  18. package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd +570 -570
  19. package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd +116 -116
  20. package/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd +42 -42
  21. package/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd +50 -50
  22. package/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd +49 -49
  23. package/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd +33 -33
  24. package/skills/docx/scripts/office/soffice.py +0 -55
  25. package/skills/docx/scripts/office/unpack.py +5 -27
  26. package/skills/docx/scripts/office/validate.py +19 -14
  27. package/skills/docx/scripts/office/validators/base.py +48 -224
  28. package/skills/docx/scripts/office/validators/docx.py +44 -117
  29. package/skills/docx/scripts/office/validators/pptx.py +2 -42
  30. package/skills/docx/scripts/office/validators/redlining.py +3 -40
  31. package/skills/pdf/SKILL.md +22 -15
  32. package/skills/pdf/{FORMS.md → forms.md} +0 -14
  33. package/skills/pdf/scripts/check_bounding_boxes.py +0 -5
  34. package/skills/pdf/scripts/check_fillable_fields.py +0 -1
  35. package/skills/pdf/scripts/convert_pdf_to_images.py +0 -2
  36. package/skills/pdf/scripts/create_validation_image.py +0 -4
  37. package/skills/pdf/scripts/extract_form_field_info.py +1 -31
  38. package/skills/pdf/scripts/extract_form_structure.py +0 -9
  39. package/skills/pdf/scripts/fill_fillable_fields.py +0 -23
  40. package/skills/pdf/scripts/fill_pdf_form_with_annotations.py +3 -38
  41. package/skills/pptx/SKILL.md +2 -29
  42. package/skills/pptx/editing.md +2 -2
  43. package/skills/pptx/pptxgenjs.md +53 -8
  44. package/skills/pptx/scripts/add_slide.py +0 -30
  45. package/skills/pptx/scripts/clean.py +0 -23
  46. package/skills/pptx/scripts/office/helpers/merge_runs.py +1 -33
  47. package/skills/pptx/scripts/office/helpers/simplify_redlines.py +0 -43
  48. package/skills/pptx/scripts/office/pack.py +0 -30
  49. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +1499 -1499
  50. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +1085 -1085
  51. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd +3081 -3081
  52. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +287 -287
  53. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd +1676 -1676
  54. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +174 -174
  55. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd +582 -582
  56. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd +4439 -4439
  57. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd +570 -570
  58. package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd +116 -116
  59. package/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd +42 -42
  60. package/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd +50 -50
  61. package/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd +49 -49
  62. package/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd +33 -33
  63. package/skills/pptx/scripts/office/soffice.py +0 -55
  64. package/skills/pptx/scripts/office/unpack.py +5 -27
  65. package/skills/pptx/scripts/office/validate.py +19 -14
  66. package/skills/pptx/scripts/office/validators/base.py +48 -224
  67. package/skills/pptx/scripts/office/validators/docx.py +44 -117
  68. package/skills/pptx/scripts/office/validators/pptx.py +2 -42
  69. package/skills/pptx/scripts/office/validators/redlining.py +3 -40
  70. package/skills/pptx/scripts/thumbnail.py +0 -31
  71. package/skills/xlsx/SKILL.md +3 -26
  72. package/skills/xlsx/scripts/office/helpers/merge_runs.py +1 -33
  73. package/skills/xlsx/scripts/office/helpers/simplify_redlines.py +0 -43
  74. package/skills/xlsx/scripts/office/pack.py +0 -30
  75. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +1499 -1499
  76. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +1085 -1085
  77. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd +3081 -3081
  78. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +287 -287
  79. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd +1676 -1676
  80. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +174 -174
  81. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd +582 -582
  82. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd +4439 -4439
  83. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd +570 -570
  84. package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd +116 -116
  85. package/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd +42 -42
  86. package/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd +50 -50
  87. package/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd +49 -49
  88. package/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd +33 -33
  89. package/skills/xlsx/scripts/office/soffice.py +0 -55
  90. package/skills/xlsx/scripts/office/unpack.py +5 -27
  91. package/skills/xlsx/scripts/office/validate.py +19 -14
  92. package/skills/xlsx/scripts/office/validators/base.py +48 -224
  93. package/skills/xlsx/scripts/office/validators/docx.py +44 -117
  94. package/skills/xlsx/scripts/office/validators/pptx.py +2 -42
  95. package/skills/xlsx/scripts/office/validators/redlining.py +3 -40
  96. package/skills/xlsx/scripts/recalc.py +2 -26
  97. package/skills/docx/scripts/__init__.py +0 -1
  98. package/skills/docx/scripts/office/helpers/__init__.py +0 -0
  99. package/skills/docx/scripts/office/validators/__init__.py +0 -15
  100. package/skills/pptx/scripts/__init__.py +0 -0
  101. package/skills/pptx/scripts/office/helpers/__init__.py +0 -0
  102. package/skills/pptx/scripts/office/validators/__init__.py +0 -15
  103. package/skills/xlsx/scripts/office/helpers/__init__.py +0 -0
  104. package/skills/xlsx/scripts/office/validators/__init__.py +0 -15
  105. package/skills/pdf/{REFERENCE.md → reference.md} +0 -0
@@ -1,5 +1,5 @@
1
1
  {
2
- "version": "0.0.4",
3
- "commit": "a26b38261e6a55844b0eed17963361b8374e9742",
4
- "builtAt": "2026-02-20T05:21:43.349Z"
2
+ "version": "0.0.5",
3
+ "commit": "e4c8970d613551e3b0086ba6e128b5624c5f2155",
4
+ "builtAt": "2026-02-20T05:39:04.795Z"
5
5
  }
@@ -1 +1 @@
1
- d27c866a9f91a1bf3b7a29bef001583ee57f61540140511ed6840c4b5dfcd5eb
1
+ 0084b5bec47a901a88b379363f5f31530feeb541ab85d15e5f944c012ebbef96
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@heylemon/lemonade",
3
- "version": "0.0.4",
3
+ "version": "0.0.5",
4
4
  "description": "AI gateway CLI for Lemon - local AI assistant with integrations",
5
5
  "publishConfig": {
6
6
  "access": "restricted"
@@ -1,30 +1,11 @@
1
1
  ---
2
2
  name: docx
3
- description: "Comprehensive document creation, editing, and analysis with support for tracked changes, comments, formatting preservation, and text extraction. When Claude needs to work with professional documents (.docx files) for: (1) Creating new documents, (2) Modifying or editing content, (3) Working with tracked changes, (4) Adding comments, or any other document tasks"
3
+ description: "Use this skill whenever the user wants to create, read, edit, or manipulate Word documents (.docx files). Triggers include: any mention of \"Word doc\", \"word document\", \".docx\", or requests to produce professional documents with formatting like tables of contents, headings, page numbers, or letterheads. Also use when extracting or reorganizing content from .docx files, inserting or replacing images in documents, performing find-and-replace in Word files, working with tracked changes or comments, or converting content into a polished Word document. If the user asks for a \"report\", \"memo\", \"letter\", \"template\", or similar deliverable as a Word or .docx file, use this skill. Do NOT use for PDFs, spreadsheets, Google Docs, or general coding tasks unrelated to document generation."
4
4
  license: Proprietary. LICENSE.txt has complete terms
5
5
  ---
6
6
 
7
7
  # DOCX creation, editing, and analysis
8
8
 
9
- ## IMPORTANT: Save to Desktop
10
-
11
- **Always save created `.docx` files to `~/Desktop/`** (e.g. `~/Desktop/document.docx`). Never save to the agent workspace or hidden directories — the user needs easy access to the file.
12
-
13
- ## CRITICAL: Integration Priority
14
-
15
- ### 1. `lemon-docs` CLI (For Google Docs)
16
- If user wants a Google Doc (shareable, collaborative), use `lemon-docs`:
17
- - `lemon-docs create "Title"` - Create a new Google Doc
18
- - `lemon-docs read <id>` - Read a document
19
-
20
- ### 2. Local DOCX (For Files)
21
- If user wants a local `.docx` file, or `lemon-docs` is not connected, use the local creation methods below.
22
-
23
- ### 3. Browser (LAST RESORT)
24
- Only if `lemon-docs` CLI fails AND user explicitly requests Google Docs in browser.
25
-
26
- ---
27
-
28
9
  ## Overview
29
10
 
30
11
  A .docx file is a ZIP archive containing XML files.
@@ -67,14 +48,14 @@ pdftoppm -jpeg -r 150 document.pdf page
67
48
  To produce a clean document with all tracked changes accepted (requires LibreOffice):
68
49
 
69
50
  ```bash
70
- python scripts/accept_changes.py input.docx ~/Desktop/output.docx
51
+ python scripts/accept_changes.py input.docx output.docx
71
52
  ```
72
53
 
73
54
  ---
74
55
 
75
56
  ## Creating New Documents
76
57
 
77
- Generate .docx files with JavaScript. Install: `npm install -g docx`
58
+ Generate .docx files with JavaScript, then validate. Install: `npm install -g docx`
78
59
 
79
60
  ### Setup
80
61
  ```javascript
@@ -87,6 +68,12 @@ const doc = new Document({ sections: [{ children: [/* content */] }] });
87
68
  Packer.toBuffer(doc).then(buffer => fs.writeFileSync("doc.docx", buffer));
88
69
  ```
89
70
 
71
+ ### Validation
72
+ After creating the file, validate it. If validation fails, unpack, fix the XML, and repack.
73
+ ```bash
74
+ python scripts/office/validate.py doc.docx
75
+ ```
76
+
90
77
  ### Page Size
91
78
 
92
79
  ```javascript
@@ -113,6 +100,16 @@ sections: [{
113
100
  | US Letter | 12,240 | 15,840 | 9,360 |
114
101
  | A4 (default) | 11,906 | 16,838 | 9,026 |
115
102
 
103
+ **Landscape orientation:** docx-js swaps width/height internally, so pass portrait dimensions and let it handle the swap:
104
+ ```javascript
105
+ size: {
106
+ width: 12240, // Pass SHORT edge as width
107
+ height: 15840, // Pass LONG edge as height
108
+ orientation: PageOrientation.LANDSCAPE // docx-js swaps them in the XML
109
+ },
110
+ // Content width = 15840 - left margin - right margin (uses the long edge)
111
+ ```
112
+
116
113
  ### Styles (Override Built-in Headings)
117
114
 
118
115
  Use Arial as the default font (universally supported). Keep titles black for readability.
@@ -184,8 +181,8 @@ const border = { style: BorderStyle.SINGLE, size: 1, color: "CCCCCC" };
184
181
  const borders = { top: border, bottom: border, left: border, right: border };
185
182
 
186
183
  new Table({
187
- width: { size: 100, type: WidthType.PERCENTAGE }, // Always set table width
188
- columnWidths: [4680, 4680], // Set at table level (DXA: 1440 = 1 inch)
184
+ width: { size: 9360, type: WidthType.DXA }, // Always use DXA (percentages break in Google Docs)
185
+ columnWidths: [4680, 4680], // Must sum to table width (DXA: 1440 = 1 inch)
189
186
  rows: [
190
187
  new TableRow({
191
188
  children: [
@@ -204,13 +201,9 @@ new Table({
204
201
 
205
202
  **Table width calculation:**
206
203
 
207
- Use `WidthType.PERCENTAGE` for simplicity, or `WidthType.DXA` for precise control:
204
+ Always use `WidthType.DXA` — `WidthType.PERCENTAGE` breaks in Google Docs.
208
205
 
209
206
  ```javascript
210
- // Option 1: Percentage (recommended - automatically fits content area)
211
- width: { size: 100, type: WidthType.PERCENTAGE }
212
-
213
- // Option 2: DXA (precise control)
214
207
  // Table width = sum of columnWidths = content width
215
208
  // US Letter with 1" margins: 12240 - 2880 = 9360 DXA
216
209
  width: { size: 9360, type: WidthType.DXA },
@@ -218,6 +211,7 @@ columnWidths: [7000, 2360] // Must sum to table width
218
211
  ```
219
212
 
220
213
  **Width rules:**
214
+ - **Always use `WidthType.DXA`** — never `WidthType.PERCENTAGE` (incompatible with Google Docs)
221
215
  - Table width must equal the sum of `columnWidths`
222
216
  - Cell `width` must match corresponding `columnWidth`
223
217
  - Cell `margins` are internal padding - they reduce content area, not add to cell width
@@ -276,11 +270,12 @@ sections: [{
276
270
  ### Critical Rules for docx-js
277
271
 
278
272
  - **Set page size explicitly** - docx-js defaults to A4; use US Letter (12240 x 15840 DXA) for US documents
273
+ - **Landscape: pass portrait dimensions** - docx-js swaps width/height internally; pass short edge as `width`, long edge as `height`, and set `orientation: PageOrientation.LANDSCAPE`
279
274
  - **Never use `\n`** - use separate Paragraph elements
280
275
  - **Never use unicode bullets** - use `LevelFormat.BULLET` with numbering config
281
276
  - **PageBreak must be in Paragraph** - standalone creates invalid XML
282
277
  - **ImageRun requires `type`** - always specify png/jpg/etc
283
- - **Always set table `width`** - use `{ size: 100, type: WidthType.PERCENTAGE }` for full width
278
+ - **Always set table `width` with DXA** - never use `WidthType.PERCENTAGE` (breaks in Google Docs)
284
279
  - **Tables need dual widths** - `columnWidths` array AND cell `width`, both must match
285
280
  - **Table width = sum of columnWidths** - for DXA, ensure they add up exactly
286
281
  - **Always add cell margins** - use `margins: { top: 80, bottom: 80, left: 120, right: 120 }` for readable padding
@@ -1,4 +1,3 @@
1
- #!/usr/bin/env python3
2
1
  """Accept all tracked changes in a DOCX file using LibreOffice.
3
2
 
4
3
  Requires LibreOffice (soffice) to be installed.
@@ -14,7 +13,6 @@ from office.soffice import get_soffice_env
14
13
 
15
14
  logger = logging.getLogger(__name__)
16
15
 
17
- # LibreOffice profile directory for macro storage
18
16
  LIBREOFFICE_PROFILE = "/tmp/libreoffice_docx_profile"
19
17
  MACRO_DIR = f"{LIBREOFFICE_PROFILE}/user/basic/Standard"
20
18
 
@@ -39,15 +37,6 @@ def accept_changes(
39
37
  input_file: str,
40
38
  output_file: str,
41
39
  ) -> tuple[None, str]:
42
- """Accept all tracked changes in a DOCX file and save to output file.
43
-
44
- Args:
45
- input_file: Path to input DOCX file with tracked changes
46
- output_file: Path to output DOCX file (will be created/overwritten)
47
-
48
- Returns:
49
- (None, message) - message indicates success or failure
50
- """
51
40
  input_path = Path(input_file)
52
41
  output_path = Path(output_file)
53
42
 
@@ -57,18 +46,15 @@ def accept_changes(
57
46
  if not input_path.suffix.lower() == ".docx":
58
47
  return None, f"Error: Input file is not a DOCX file: {input_file}"
59
48
 
60
- # Copy input file to output file location
61
49
  try:
62
50
  output_path.parent.mkdir(parents=True, exist_ok=True)
63
51
  shutil.copy2(input_path, output_path)
64
52
  except Exception as e:
65
53
  return None, f"Error: Failed to copy input file to output location: {e}"
66
54
 
67
- # Setup LibreOffice macro
68
55
  if not _setup_libreoffice_macro():
69
56
  return None, "Error: Failed to setup LibreOffice macro"
70
57
 
71
- # Run LibreOffice with macro to accept changes
72
58
  cmd = [
73
59
  "soffice",
74
60
  "--headless",
@@ -88,7 +74,6 @@ def accept_changes(
88
74
  env=get_soffice_env(),
89
75
  )
90
76
  except subprocess.TimeoutExpired:
91
- # Timeout is expected - LibreOffice may hang after completing
92
77
  return (
93
78
  None,
94
79
  f"Successfully accepted all tracked changes: {input_file} -> {output_file}",
@@ -104,14 +89,12 @@ def accept_changes(
104
89
 
105
90
 
106
91
  def _setup_libreoffice_macro() -> bool:
107
- """Setup LibreOffice macro for accepting tracked changes."""
108
92
  macro_dir = Path(MACRO_DIR)
109
93
  macro_file = macro_dir / "Module1.xba"
110
94
 
111
95
  if macro_file.exists() and "AcceptAllTrackedChanges" in macro_file.read_text():
112
96
  return True
113
97
 
114
- # Initialize LibreOffice if needed (use custom profile)
115
98
  if not macro_dir.exists():
116
99
  subprocess.run(
117
100
  [
@@ -1,4 +1,3 @@
1
- #!/usr/bin/env python3
2
1
  """Add comments to DOCX documents.
3
2
 
4
3
  Usage:
@@ -32,7 +31,6 @@ NS = {
32
31
  "w16cex": "http://schemas.microsoft.com/office/word/2018/wordml/cex",
33
32
  }
34
33
 
35
- # XML template for comment content in comments.xml
36
34
  COMMENT_XML = """\
37
35
  <w:comment w:id="{id}" w:author="{author}" w:date="{date}" w:initials="{initials}">
38
36
  <w:p w14:paraId="{para_id}" w14:textId="77777777">
@@ -51,7 +49,6 @@ COMMENT_XML = """\
51
49
  </w:p>
52
50
  </w:comment>"""
53
51
 
54
- # Output templates for marker placement instructions
55
52
  COMMENT_MARKER_TEMPLATE = """
56
53
  Add to document.xml (markers must be direct children of w:p, never inside w:r):
57
54
  <w:commentRangeStart w:id="{cid}"/>
@@ -69,42 +66,36 @@ Nest markers inside parent {pid}'s markers (markers must be direct children of w
69
66
 
70
67
 
71
68
  def _generate_hex_id() -> str:
72
- """Random 8-char hex ID (satisfies paraId < 0x80000000, durableId < 0x7FFFFFFF)."""
73
69
  return f"{random.randint(0, 0x7FFFFFFE):08X}"
74
70
 
75
71
 
76
- # Smart quotes to re-encode after DOM serialization (DOM decodes entities to Unicode)
77
72
  SMART_QUOTE_ENTITIES = {
78
- "\u201c": "&#x201C;", # Left double quote
79
- "\u201d": "&#x201D;", # Right double quote
80
- "\u2018": "&#x2018;", # Left single quote
81
- "\u2019": "&#x2019;", # Right single quote
73
+ "\u201c": "&#x201C;",
74
+ "\u201d": "&#x201D;",
75
+ "\u2018": "&#x2018;",
76
+ "\u2019": "&#x2019;",
82
77
  }
83
78
 
84
79
 
85
80
  def _encode_smart_quotes(text: str) -> str:
86
- """Re-encode smart quotes as XML entities after DOM serialization."""
87
81
  for char, entity in SMART_QUOTE_ENTITIES.items():
88
82
  text = text.replace(char, entity)
89
83
  return text
90
84
 
91
85
 
92
86
  def _append_xml(xml_path: Path, root_tag: str, content: str) -> None:
93
- """Append content as child of root element."""
94
87
  dom = defusedxml.minidom.parseString(xml_path.read_text(encoding="utf-8"))
95
88
  root = dom.getElementsByTagName(root_tag)[0]
96
89
  ns_attrs = " ".join(f'xmlns:{k}="{v}"' for k, v in NS.items())
97
90
  wrapper_dom = defusedxml.minidom.parseString(f"<root {ns_attrs}>{content}</root>")
98
- for child in wrapper_dom.documentElement.childNodes: # type: ignore
91
+ for child in wrapper_dom.documentElement.childNodes:
99
92
  if child.nodeType == child.ELEMENT_NODE:
100
93
  root.appendChild(dom.importNode(child, True))
101
- # Re-encode smart quotes that DOM decoded to Unicode
102
94
  output = _encode_smart_quotes(dom.toxml(encoding="UTF-8").decode("utf-8"))
103
95
  xml_path.write_text(output, encoding="utf-8")
104
96
 
105
97
 
106
98
  def _find_para_id(comments_path: Path, comment_id: int) -> str | None:
107
- """Find para_id for a comment ID."""
108
99
  dom = defusedxml.minidom.parseString(comments_path.read_text(encoding="utf-8"))
109
100
  for c in dom.getElementsByTagName("w:comment"):
110
101
  if c.getAttribute("w:id") == str(comment_id):
@@ -115,7 +106,6 @@ def _find_para_id(comments_path: Path, comment_id: int) -> str | None:
115
106
 
116
107
 
117
108
  def _get_next_rid(rels_path: Path) -> int:
118
- """Get the next available rId number from document.xml.rels."""
119
109
  dom = defusedxml.minidom.parseString(rels_path.read_text(encoding="utf-8"))
120
110
  max_rid = 0
121
111
  for rel in dom.getElementsByTagName("Relationship"):
@@ -129,7 +119,6 @@ def _get_next_rid(rels_path: Path) -> int:
129
119
 
130
120
 
131
121
  def _has_relationship(rels_path: Path, target: str) -> bool:
132
- """Check if a relationship with given target exists."""
133
122
  dom = defusedxml.minidom.parseString(rels_path.read_text(encoding="utf-8"))
134
123
  for rel in dom.getElementsByTagName("Relationship"):
135
124
  if rel.getAttribute("Target") == target:
@@ -138,7 +127,6 @@ def _has_relationship(rels_path: Path, target: str) -> bool:
138
127
 
139
128
 
140
129
  def _has_content_type(ct_path: Path, part_name: str) -> bool:
141
- """Check if a content type override with given part name exists."""
142
130
  dom = defusedxml.minidom.parseString(ct_path.read_text(encoding="utf-8"))
143
131
  for override in dom.getElementsByTagName("Override"):
144
132
  if override.getAttribute("PartName") == part_name:
@@ -147,19 +135,17 @@ def _has_content_type(ct_path: Path, part_name: str) -> bool:
147
135
 
148
136
 
149
137
  def _ensure_comment_relationships(unpacked_dir: Path) -> None:
150
- """Ensure word/_rels/document.xml.rels has comment relationships."""
151
138
  rels_path = unpacked_dir / "word" / "_rels" / "document.xml.rels"
152
139
  if not rels_path.exists():
153
140
  return
154
141
 
155
142
  if _has_relationship(rels_path, "comments.xml"):
156
- return # Already has comment relationships
143
+ return
157
144
 
158
145
  dom = defusedxml.minidom.parseString(rels_path.read_text(encoding="utf-8"))
159
146
  root = dom.documentElement
160
147
  next_rid = _get_next_rid(rels_path)
161
148
 
162
- # Add relationship elements
163
149
  rels = [
164
150
  (
165
151
  "http://schemas.openxmlformats.org/officeDocument/2006/relationships/comments",
@@ -184,25 +170,23 @@ def _ensure_comment_relationships(unpacked_dir: Path) -> None:
184
170
  rel.setAttribute("Id", f"rId{next_rid}")
185
171
  rel.setAttribute("Type", rel_type)
186
172
  rel.setAttribute("Target", target)
187
- root.appendChild(rel) # type: ignore
173
+ root.appendChild(rel)
188
174
  next_rid += 1
189
175
 
190
176
  rels_path.write_bytes(dom.toxml(encoding="UTF-8"))
191
177
 
192
178
 
193
179
  def _ensure_comment_content_types(unpacked_dir: Path) -> None:
194
- """Ensure [Content_Types].xml has comment content types."""
195
180
  ct_path = unpacked_dir / "[Content_Types].xml"
196
181
  if not ct_path.exists():
197
182
  return
198
183
 
199
184
  if _has_content_type(ct_path, "/word/comments.xml"):
200
- return # Already has comment content types
185
+ return
201
186
 
202
187
  dom = defusedxml.minidom.parseString(ct_path.read_text(encoding="utf-8"))
203
188
  root = dom.documentElement
204
189
 
205
- # Add Override elements
206
190
  overrides = [
207
191
  (
208
192
  "/word/comments.xml",
@@ -226,7 +210,7 @@ def _ensure_comment_content_types(unpacked_dir: Path) -> None:
226
210
  override = dom.createElement("Override")
227
211
  override.setAttribute("PartName", part_name)
228
212
  override.setAttribute("ContentType", content_type)
229
- root.appendChild(override) # type: ignore
213
+ root.appendChild(override)
230
214
 
231
215
  ct_path.write_bytes(dom.toxml(encoding="UTF-8"))
232
216
 
@@ -239,14 +223,6 @@ def add_comment(
239
223
  initials: str = "C",
240
224
  parent_id: int | None = None,
241
225
  ) -> tuple[str, str]:
242
- """Add comment to unpacked DOCX.
243
-
244
- Args:
245
- text: Comment text, pre-escaped for XML (e.g., &amp; &#x2019;).
246
-
247
- Returns:
248
- (para_id, message) tuple.
249
- """
250
226
  word = Path(unpacked_dir) / "word"
251
227
  if not word.exists():
252
228
  return "", f"Error: {word} not found"
@@ -254,12 +230,10 @@ def add_comment(
254
230
  para_id, durable_id = _generate_hex_id(), _generate_hex_id()
255
231
  ts = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
256
232
 
257
- # comments.xml
258
233
  comments = word / "comments.xml"
259
234
  first_comment = not comments.exists()
260
235
  if first_comment:
261
236
  shutil.copy(TEMPLATE_DIR / "comments.xml", comments)
262
- # Add relationships and content types for comment files
263
237
  _ensure_comment_relationships(Path(unpacked_dir))
264
238
  _ensure_comment_content_types(Path(unpacked_dir))
265
239
  _append_xml(
@@ -271,11 +245,10 @@ def add_comment(
271
245
  date=ts,
272
246
  initials=initials,
273
247
  para_id=para_id,
274
- text=text, # Model provides pre-escaped XML content
248
+ text=text,
275
249
  ),
276
250
  )
277
251
 
278
- # commentsExtended.xml
279
252
  ext = word / "commentsExtended.xml"
280
253
  if not ext.exists():
281
254
  shutil.copy(TEMPLATE_DIR / "commentsExtended.xml", ext)
@@ -295,7 +268,6 @@ def add_comment(
295
268
  f'<w15:commentEx w15:paraId="{para_id}" w15:done="0"/>',
296
269
  )
297
270
 
298
- # commentsIds.xml
299
271
  ids = word / "commentsIds.xml"
300
272
  if not ids.exists():
301
273
  shutil.copy(TEMPLATE_DIR / "commentsIds.xml", ids)
@@ -305,7 +277,6 @@ def add_comment(
305
277
  f'<w16cid:commentId w16cid:paraId="{para_id}" w16cid:durableId="{durable_id}"/>',
306
278
  )
307
279
 
308
- # commentsExtensible.xml
309
280
  extensible = word / "commentsExtensible.xml"
310
281
  if not extensible.exists():
311
282
  shutil.copy(TEMPLATE_DIR / "commentsExtensible.xml", extensible)
@@ -14,14 +14,6 @@ import defusedxml.minidom
14
14
 
15
15
 
16
16
  def merge_runs(input_dir: str) -> tuple[int, str]:
17
- """Merge adjacent runs in document.xml.
18
-
19
- Args:
20
- input_dir: Path to unpacked DOCX directory
21
-
22
- Returns:
23
- (merge_count, message)
24
- """
25
17
  doc_xml = Path(input_dir) / "word" / "document.xml"
26
18
 
27
19
  if not doc_xml.exists():
@@ -31,14 +23,11 @@ def merge_runs(input_dir: str) -> tuple[int, str]:
31
23
  dom = defusedxml.minidom.parseString(doc_xml.read_text(encoding="utf-8"))
32
24
  root = dom.documentElement
33
25
 
34
- # Clean up elements that block merging
35
26
  _remove_elements(root, "proofErr")
36
27
  _strip_run_rsid_attrs(root)
37
28
 
38
- # Find all containers that have runs
39
29
  containers = {run.parentNode for run in _find_elements(root, "r")}
40
30
 
41
- # Merge runs in each container
42
31
  merge_count = 0
43
32
  for container in containers:
44
33
  merge_count += _merge_runs_in(container)
@@ -50,11 +39,9 @@ def merge_runs(input_dir: str) -> tuple[int, str]:
50
39
  return 0, f"Error: {e}"
51
40
 
52
41
 
53
- # --- Element helpers ---
54
42
 
55
43
 
56
44
  def _find_elements(root, tag: str) -> list:
57
- """Find all elements matching tag name (with or without namespace)."""
58
45
  results = []
59
46
 
60
47
  def traverse(node):
@@ -70,7 +57,6 @@ def _find_elements(root, tag: str) -> list:
70
57
 
71
58
 
72
59
  def _get_child(parent, tag: str):
73
- """Get first child element matching tag name."""
74
60
  for child in parent.childNodes:
75
61
  if child.nodeType == child.ELEMENT_NODE:
76
62
  name = child.localName or child.tagName
@@ -80,7 +66,6 @@ def _get_child(parent, tag: str):
80
66
 
81
67
 
82
68
  def _get_children(parent, tag: str) -> list:
83
- """Get all direct child elements matching tag name."""
84
69
  results = []
85
70
  for child in parent.childNodes:
86
71
  if child.nodeType == child.ELEMENT_NODE:
@@ -91,7 +76,6 @@ def _get_children(parent, tag: str) -> list:
91
76
 
92
77
 
93
78
  def _is_adjacent(elem1, elem2) -> bool:
94
- """Check if two elements are adjacent (only whitespace between them)."""
95
79
  node = elem1.nextSibling
96
80
  while node:
97
81
  if node == elem2:
@@ -104,34 +88,28 @@ def _is_adjacent(elem1, elem2) -> bool:
104
88
  return False
105
89
 
106
90
 
107
- # --- Cleanup functions ---
108
91
 
109
92
 
110
93
  def _remove_elements(root, tag: str):
111
- """Remove all elements matching tag name."""
112
94
  for elem in _find_elements(root, tag):
113
95
  if elem.parentNode:
114
96
  elem.parentNode.removeChild(elem)
115
97
 
116
98
 
117
99
  def _strip_run_rsid_attrs(root):
118
- """Remove rsid attributes from all run elements."""
119
100
  for run in _find_elements(root, "r"):
120
101
  for attr in list(run.attributes.values()):
121
102
  if "rsid" in attr.name.lower():
122
103
  run.removeAttribute(attr.name)
123
104
 
124
105
 
125
- # --- Merge functions ---
126
106
 
127
107
 
128
108
  def _merge_runs_in(container) -> int:
129
- """Merge adjacent runs with identical formatting in a container element."""
130
109
  merge_count = 0
131
110
  run = _first_child_run(container)
132
111
 
133
112
  while run:
134
- # Absorb adjacent runs with same formatting
135
113
  while True:
136
114
  next_elem = _next_element_sibling(run)
137
115
  if next_elem and _is_run(next_elem) and _can_merge(run, next_elem):
@@ -148,7 +126,6 @@ def _merge_runs_in(container) -> int:
148
126
 
149
127
 
150
128
  def _first_child_run(container):
151
- """Get the first run child of a container."""
152
129
  for child in container.childNodes:
153
130
  if child.nodeType == child.ELEMENT_NODE and _is_run(child):
154
131
  return child
@@ -156,7 +133,6 @@ def _first_child_run(container):
156
133
 
157
134
 
158
135
  def _next_element_sibling(node):
159
- """Get the next element sibling, skipping text/whitespace nodes."""
160
136
  sibling = node.nextSibling
161
137
  while sibling:
162
138
  if sibling.nodeType == sibling.ELEMENT_NODE:
@@ -166,25 +142,21 @@ def _next_element_sibling(node):
166
142
 
167
143
 
168
144
  def _next_sibling_run(node):
169
- """Get the next sibling that is a run element."""
170
145
  sibling = node.nextSibling
171
146
  while sibling:
172
147
  if sibling.nodeType == sibling.ELEMENT_NODE:
173
148
  if _is_run(sibling):
174
149
  return sibling
175
- # Skip non-run elements (bookmarks, etc.) but keep looking
176
150
  sibling = sibling.nextSibling
177
151
  return None
178
152
 
179
153
 
180
154
  def _is_run(node) -> bool:
181
- """Check if node is a run element."""
182
155
  name = node.localName or node.tagName
183
156
  return name == "r" or name.endswith(":r")
184
157
 
185
158
 
186
159
  def _can_merge(run1, run2) -> bool:
187
- """Check if two runs have identical formatting."""
188
160
  rpr1 = _get_child(run1, "rPr")
189
161
  rpr2 = _get_child(run2, "rPr")
190
162
 
@@ -192,11 +164,10 @@ def _can_merge(run1, run2) -> bool:
192
164
  return False
193
165
  if rpr1 is None:
194
166
  return True
195
- return rpr1.toxml() == rpr2.toxml() # type: ignore
167
+ return rpr1.toxml() == rpr2.toxml()
196
168
 
197
169
 
198
170
  def _merge_run_content(target, source):
199
- """Move content from source run to target run (excluding rPr)."""
200
171
  for child in list(source.childNodes):
201
172
  if child.nodeType == child.ELEMENT_NODE:
202
173
  name = child.localName or child.tagName
@@ -205,10 +176,8 @@ def _merge_run_content(target, source):
205
176
 
206
177
 
207
178
  def _consolidate_text(run):
208
- """Merge adjacent <w:t> elements within a run."""
209
179
  t_elements = _get_children(run, "t")
210
180
 
211
- # Work backwards to safely remove elements
212
181
  for i in range(len(t_elements) - 1, 0, -1):
213
182
  curr, prev = t_elements[i], t_elements[i - 1]
214
183
 
@@ -222,7 +191,6 @@ def _consolidate_text(run):
222
191
  else:
223
192
  prev.appendChild(run.ownerDocument.createTextNode(merged))
224
193
 
225
- # Preserve whitespace if needed
226
194
  if merged.startswith(" ") or merged.endswith(" "):
227
195
  prev.setAttribute("xml:space", "preserve")
228
196
  elif prev.hasAttribute("xml:space"):