@heylemon/lemonade 0.0.4 → 0.0.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/build-info.json +3 -3
- package/dist/canvas-host/a2ui/.bundle.hash +1 -1
- package/dist/gateway/skills-http.js +74 -19
- package/package.json +1 -1
- package/skills/docx/SKILL.md +25 -30
- package/skills/docx/scripts/accept_changes.py +0 -17
- package/skills/docx/scripts/comment.py +10 -39
- package/skills/docx/scripts/office/helpers/merge_runs.py +1 -33
- package/skills/docx/scripts/office/helpers/simplify_redlines.py +0 -43
- package/skills/docx/scripts/office/pack.py +0 -30
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +1499 -1499
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +1085 -1085
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd +3081 -3081
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +287 -287
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd +1676 -1676
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +174 -174
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd +582 -582
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd +4439 -4439
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd +570 -570
- package/skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd +116 -116
- package/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd +42 -42
- package/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd +50 -50
- package/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd +49 -49
- package/skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd +33 -33
- package/skills/docx/scripts/office/soffice.py +0 -55
- package/skills/docx/scripts/office/unpack.py +5 -27
- package/skills/docx/scripts/office/validate.py +19 -14
- package/skills/docx/scripts/office/validators/base.py +48 -224
- package/skills/docx/scripts/office/validators/docx.py +44 -117
- package/skills/docx/scripts/office/validators/pptx.py +2 -42
- package/skills/docx/scripts/office/validators/redlining.py +3 -40
- package/skills/pdf/SKILL.md +22 -15
- package/skills/pdf/{FORMS.md → forms.md} +0 -14
- package/skills/pdf/scripts/check_bounding_boxes.py +0 -5
- package/skills/pdf/scripts/check_fillable_fields.py +0 -1
- package/skills/pdf/scripts/convert_pdf_to_images.py +0 -2
- package/skills/pdf/scripts/create_validation_image.py +0 -4
- package/skills/pdf/scripts/extract_form_field_info.py +1 -31
- package/skills/pdf/scripts/extract_form_structure.py +0 -9
- package/skills/pdf/scripts/fill_fillable_fields.py +0 -23
- package/skills/pdf/scripts/fill_pdf_form_with_annotations.py +3 -38
- package/skills/pptx/SKILL.md +2 -29
- package/skills/pptx/editing.md +2 -2
- package/skills/pptx/pptxgenjs.md +53 -8
- package/skills/pptx/scripts/add_slide.py +0 -30
- package/skills/pptx/scripts/clean.py +0 -23
- package/skills/pptx/scripts/office/helpers/merge_runs.py +1 -33
- package/skills/pptx/scripts/office/helpers/simplify_redlines.py +0 -43
- package/skills/pptx/scripts/office/pack.py +0 -30
- package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +1499 -1499
- package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +1085 -1085
- package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd +3081 -3081
- package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +287 -287
- package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd +1676 -1676
- package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +174 -174
- package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd +582 -582
- package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd +4439 -4439
- package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd +570 -570
- package/skills/pptx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd +116 -116
- package/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd +42 -42
- package/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd +50 -50
- package/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd +49 -49
- package/skills/pptx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd +33 -33
- package/skills/pptx/scripts/office/soffice.py +0 -55
- package/skills/pptx/scripts/office/unpack.py +5 -27
- package/skills/pptx/scripts/office/validate.py +19 -14
- package/skills/pptx/scripts/office/validators/base.py +48 -224
- package/skills/pptx/scripts/office/validators/docx.py +44 -117
- package/skills/pptx/scripts/office/validators/pptx.py +2 -42
- package/skills/pptx/scripts/office/validators/redlining.py +3 -40
- package/skills/pptx/scripts/thumbnail.py +0 -31
- package/skills/xlsx/SKILL.md +3 -26
- package/skills/xlsx/scripts/office/helpers/merge_runs.py +1 -33
- package/skills/xlsx/scripts/office/helpers/simplify_redlines.py +0 -43
- package/skills/xlsx/scripts/office/pack.py +0 -30
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +1499 -1499
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +1085 -1085
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd +3081 -3081
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +287 -287
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd +1676 -1676
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +174 -174
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd +582 -582
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd +4439 -4439
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd +570 -570
- package/skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd +116 -116
- package/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd +42 -42
- package/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd +50 -50
- package/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd +49 -49
- package/skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd +33 -33
- package/skills/xlsx/scripts/office/soffice.py +0 -55
- package/skills/xlsx/scripts/office/unpack.py +5 -27
- package/skills/xlsx/scripts/office/validate.py +19 -14
- package/skills/xlsx/scripts/office/validators/base.py +48 -224
- package/skills/xlsx/scripts/office/validators/docx.py +44 -117
- package/skills/xlsx/scripts/office/validators/pptx.py +2 -42
- package/skills/xlsx/scripts/office/validators/redlining.py +3 -40
- package/skills/xlsx/scripts/recalc.py +2 -26
- package/skills/docx/scripts/__init__.py +0 -1
- package/skills/docx/scripts/office/helpers/__init__.py +0 -0
- package/skills/docx/scripts/office/validators/__init__.py +0 -15
- package/skills/pptx/scripts/__init__.py +0 -0
- package/skills/pptx/scripts/office/helpers/__init__.py +0 -0
- package/skills/pptx/scripts/office/validators/__init__.py +0 -15
- package/skills/xlsx/scripts/office/helpers/__init__.py +0 -0
- package/skills/xlsx/scripts/office/validators/__init__.py +0 -15
- /package/skills/pdf/{REFERENCE.md → reference.md} +0 -0
|
@@ -14,14 +14,6 @@ import defusedxml.minidom
|
|
|
14
14
|
|
|
15
15
|
|
|
16
16
|
def merge_runs(input_dir: str) -> tuple[int, str]:
|
|
17
|
-
"""Merge adjacent runs in document.xml.
|
|
18
|
-
|
|
19
|
-
Args:
|
|
20
|
-
input_dir: Path to unpacked DOCX directory
|
|
21
|
-
|
|
22
|
-
Returns:
|
|
23
|
-
(merge_count, message)
|
|
24
|
-
"""
|
|
25
17
|
doc_xml = Path(input_dir) / "word" / "document.xml"
|
|
26
18
|
|
|
27
19
|
if not doc_xml.exists():
|
|
@@ -31,14 +23,11 @@ def merge_runs(input_dir: str) -> tuple[int, str]:
|
|
|
31
23
|
dom = defusedxml.minidom.parseString(doc_xml.read_text(encoding="utf-8"))
|
|
32
24
|
root = dom.documentElement
|
|
33
25
|
|
|
34
|
-
# Clean up elements that block merging
|
|
35
26
|
_remove_elements(root, "proofErr")
|
|
36
27
|
_strip_run_rsid_attrs(root)
|
|
37
28
|
|
|
38
|
-
# Find all containers that have runs
|
|
39
29
|
containers = {run.parentNode for run in _find_elements(root, "r")}
|
|
40
30
|
|
|
41
|
-
# Merge runs in each container
|
|
42
31
|
merge_count = 0
|
|
43
32
|
for container in containers:
|
|
44
33
|
merge_count += _merge_runs_in(container)
|
|
@@ -50,11 +39,9 @@ def merge_runs(input_dir: str) -> tuple[int, str]:
|
|
|
50
39
|
return 0, f"Error: {e}"
|
|
51
40
|
|
|
52
41
|
|
|
53
|
-
# --- Element helpers ---
|
|
54
42
|
|
|
55
43
|
|
|
56
44
|
def _find_elements(root, tag: str) -> list:
|
|
57
|
-
"""Find all elements matching tag name (with or without namespace)."""
|
|
58
45
|
results = []
|
|
59
46
|
|
|
60
47
|
def traverse(node):
|
|
@@ -70,7 +57,6 @@ def _find_elements(root, tag: str) -> list:
|
|
|
70
57
|
|
|
71
58
|
|
|
72
59
|
def _get_child(parent, tag: str):
|
|
73
|
-
"""Get first child element matching tag name."""
|
|
74
60
|
for child in parent.childNodes:
|
|
75
61
|
if child.nodeType == child.ELEMENT_NODE:
|
|
76
62
|
name = child.localName or child.tagName
|
|
@@ -80,7 +66,6 @@ def _get_child(parent, tag: str):
|
|
|
80
66
|
|
|
81
67
|
|
|
82
68
|
def _get_children(parent, tag: str) -> list:
|
|
83
|
-
"""Get all direct child elements matching tag name."""
|
|
84
69
|
results = []
|
|
85
70
|
for child in parent.childNodes:
|
|
86
71
|
if child.nodeType == child.ELEMENT_NODE:
|
|
@@ -91,7 +76,6 @@ def _get_children(parent, tag: str) -> list:
|
|
|
91
76
|
|
|
92
77
|
|
|
93
78
|
def _is_adjacent(elem1, elem2) -> bool:
|
|
94
|
-
"""Check if two elements are adjacent (only whitespace between them)."""
|
|
95
79
|
node = elem1.nextSibling
|
|
96
80
|
while node:
|
|
97
81
|
if node == elem2:
|
|
@@ -104,34 +88,28 @@ def _is_adjacent(elem1, elem2) -> bool:
|
|
|
104
88
|
return False
|
|
105
89
|
|
|
106
90
|
|
|
107
|
-
# --- Cleanup functions ---
|
|
108
91
|
|
|
109
92
|
|
|
110
93
|
def _remove_elements(root, tag: str):
|
|
111
|
-
"""Remove all elements matching tag name."""
|
|
112
94
|
for elem in _find_elements(root, tag):
|
|
113
95
|
if elem.parentNode:
|
|
114
96
|
elem.parentNode.removeChild(elem)
|
|
115
97
|
|
|
116
98
|
|
|
117
99
|
def _strip_run_rsid_attrs(root):
|
|
118
|
-
"""Remove rsid attributes from all run elements."""
|
|
119
100
|
for run in _find_elements(root, "r"):
|
|
120
101
|
for attr in list(run.attributes.values()):
|
|
121
102
|
if "rsid" in attr.name.lower():
|
|
122
103
|
run.removeAttribute(attr.name)
|
|
123
104
|
|
|
124
105
|
|
|
125
|
-
# --- Merge functions ---
|
|
126
106
|
|
|
127
107
|
|
|
128
108
|
def _merge_runs_in(container) -> int:
|
|
129
|
-
"""Merge adjacent runs with identical formatting in a container element."""
|
|
130
109
|
merge_count = 0
|
|
131
110
|
run = _first_child_run(container)
|
|
132
111
|
|
|
133
112
|
while run:
|
|
134
|
-
# Absorb adjacent runs with same formatting
|
|
135
113
|
while True:
|
|
136
114
|
next_elem = _next_element_sibling(run)
|
|
137
115
|
if next_elem and _is_run(next_elem) and _can_merge(run, next_elem):
|
|
@@ -148,7 +126,6 @@ def _merge_runs_in(container) -> int:
|
|
|
148
126
|
|
|
149
127
|
|
|
150
128
|
def _first_child_run(container):
|
|
151
|
-
"""Get the first run child of a container."""
|
|
152
129
|
for child in container.childNodes:
|
|
153
130
|
if child.nodeType == child.ELEMENT_NODE and _is_run(child):
|
|
154
131
|
return child
|
|
@@ -156,7 +133,6 @@ def _first_child_run(container):
|
|
|
156
133
|
|
|
157
134
|
|
|
158
135
|
def _next_element_sibling(node):
|
|
159
|
-
"""Get the next element sibling, skipping text/whitespace nodes."""
|
|
160
136
|
sibling = node.nextSibling
|
|
161
137
|
while sibling:
|
|
162
138
|
if sibling.nodeType == sibling.ELEMENT_NODE:
|
|
@@ -166,25 +142,21 @@ def _next_element_sibling(node):
|
|
|
166
142
|
|
|
167
143
|
|
|
168
144
|
def _next_sibling_run(node):
|
|
169
|
-
"""Get the next sibling that is a run element."""
|
|
170
145
|
sibling = node.nextSibling
|
|
171
146
|
while sibling:
|
|
172
147
|
if sibling.nodeType == sibling.ELEMENT_NODE:
|
|
173
148
|
if _is_run(sibling):
|
|
174
149
|
return sibling
|
|
175
|
-
# Skip non-run elements (bookmarks, etc.) but keep looking
|
|
176
150
|
sibling = sibling.nextSibling
|
|
177
151
|
return None
|
|
178
152
|
|
|
179
153
|
|
|
180
154
|
def _is_run(node) -> bool:
|
|
181
|
-
"""Check if node is a run element."""
|
|
182
155
|
name = node.localName or node.tagName
|
|
183
156
|
return name == "r" or name.endswith(":r")
|
|
184
157
|
|
|
185
158
|
|
|
186
159
|
def _can_merge(run1, run2) -> bool:
|
|
187
|
-
"""Check if two runs have identical formatting."""
|
|
188
160
|
rpr1 = _get_child(run1, "rPr")
|
|
189
161
|
rpr2 = _get_child(run2, "rPr")
|
|
190
162
|
|
|
@@ -192,11 +164,10 @@ def _can_merge(run1, run2) -> bool:
|
|
|
192
164
|
return False
|
|
193
165
|
if rpr1 is None:
|
|
194
166
|
return True
|
|
195
|
-
return rpr1.toxml() == rpr2.toxml()
|
|
167
|
+
return rpr1.toxml() == rpr2.toxml()
|
|
196
168
|
|
|
197
169
|
|
|
198
170
|
def _merge_run_content(target, source):
|
|
199
|
-
"""Move content from source run to target run (excluding rPr)."""
|
|
200
171
|
for child in list(source.childNodes):
|
|
201
172
|
if child.nodeType == child.ELEMENT_NODE:
|
|
202
173
|
name = child.localName or child.tagName
|
|
@@ -205,10 +176,8 @@ def _merge_run_content(target, source):
|
|
|
205
176
|
|
|
206
177
|
|
|
207
178
|
def _consolidate_text(run):
|
|
208
|
-
"""Merge adjacent <w:t> elements within a run."""
|
|
209
179
|
t_elements = _get_children(run, "t")
|
|
210
180
|
|
|
211
|
-
# Work backwards to safely remove elements
|
|
212
181
|
for i in range(len(t_elements) - 1, 0, -1):
|
|
213
182
|
curr, prev = t_elements[i], t_elements[i - 1]
|
|
214
183
|
|
|
@@ -222,7 +191,6 @@ def _consolidate_text(run):
|
|
|
222
191
|
else:
|
|
223
192
|
prev.appendChild(run.ownerDocument.createTextNode(merged))
|
|
224
193
|
|
|
225
|
-
# Preserve whitespace if needed
|
|
226
194
|
if merged.startswith(" ") or merged.endswith(" "):
|
|
227
195
|
prev.setAttribute("xml:space", "preserve")
|
|
228
196
|
elif prev.hasAttribute("xml:space"):
|
|
@@ -20,14 +20,6 @@ WORD_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
|
|
|
20
20
|
|
|
21
21
|
|
|
22
22
|
def simplify_redlines(input_dir: str) -> tuple[int, str]:
|
|
23
|
-
"""Merge adjacent tracked changes from the same author in document.xml.
|
|
24
|
-
|
|
25
|
-
Args:
|
|
26
|
-
input_dir: Path to unpacked DOCX directory
|
|
27
|
-
|
|
28
|
-
Returns:
|
|
29
|
-
(merge_count, message)
|
|
30
|
-
"""
|
|
31
23
|
doc_xml = Path(input_dir) / "word" / "document.xml"
|
|
32
24
|
|
|
33
25
|
if not doc_xml.exists():
|
|
@@ -39,7 +31,6 @@ def simplify_redlines(input_dir: str) -> tuple[int, str]:
|
|
|
39
31
|
|
|
40
32
|
merge_count = 0
|
|
41
33
|
|
|
42
|
-
# Find all paragraphs and table cells (containers for content)
|
|
43
34
|
containers = _find_elements(root, "p") + _find_elements(root, "tc")
|
|
44
35
|
|
|
45
36
|
for container in containers:
|
|
@@ -54,10 +45,8 @@ def simplify_redlines(input_dir: str) -> tuple[int, str]:
|
|
|
54
45
|
|
|
55
46
|
|
|
56
47
|
def _merge_tracked_changes_in(container, tag: str) -> int:
|
|
57
|
-
"""Merge adjacent w:ins or w:del elements from the same author."""
|
|
58
48
|
merge_count = 0
|
|
59
49
|
|
|
60
|
-
# Get direct children that are tracked changes of this type
|
|
61
50
|
tracked = [
|
|
62
51
|
child
|
|
63
52
|
for child in container.childNodes
|
|
@@ -67,7 +56,6 @@ def _merge_tracked_changes_in(container, tag: str) -> int:
|
|
|
67
56
|
if len(tracked) < 2:
|
|
68
57
|
return 0
|
|
69
58
|
|
|
70
|
-
# Process from front: merge next into current when possible
|
|
71
59
|
i = 0
|
|
72
60
|
while i < len(tracked) - 1:
|
|
73
61
|
curr = tracked[i]
|
|
@@ -78,7 +66,6 @@ def _merge_tracked_changes_in(container, tag: str) -> int:
|
|
|
78
66
|
container.removeChild(next_elem)
|
|
79
67
|
tracked.pop(i + 1)
|
|
80
68
|
merge_count += 1
|
|
81
|
-
# Don't increment i - try to merge more into curr
|
|
82
69
|
else:
|
|
83
70
|
i += 1
|
|
84
71
|
|
|
@@ -86,13 +73,11 @@ def _merge_tracked_changes_in(container, tag: str) -> int:
|
|
|
86
73
|
|
|
87
74
|
|
|
88
75
|
def _is_element(node, tag: str) -> bool:
|
|
89
|
-
"""Check if node matches the given tag name."""
|
|
90
76
|
name = node.localName or node.tagName
|
|
91
77
|
return name == tag or name.endswith(f":{tag}")
|
|
92
78
|
|
|
93
79
|
|
|
94
80
|
def _get_author(elem) -> str:
|
|
95
|
-
"""Get the author attribute from a tracked change element."""
|
|
96
81
|
author = elem.getAttribute("w:author")
|
|
97
82
|
if not author:
|
|
98
83
|
for attr in elem.attributes.values():
|
|
@@ -102,12 +87,9 @@ def _get_author(elem) -> str:
|
|
|
102
87
|
|
|
103
88
|
|
|
104
89
|
def _can_merge_tracked(elem1, elem2) -> bool:
|
|
105
|
-
"""Check if two tracked change elements can be merged."""
|
|
106
|
-
# Must be same author
|
|
107
90
|
if _get_author(elem1) != _get_author(elem2):
|
|
108
91
|
return False
|
|
109
92
|
|
|
110
|
-
# Must be truly adjacent (only whitespace between them)
|
|
111
93
|
node = elem1.nextSibling
|
|
112
94
|
while node and node != elem2:
|
|
113
95
|
if node.nodeType == node.ELEMENT_NODE:
|
|
@@ -120,7 +102,6 @@ def _can_merge_tracked(elem1, elem2) -> bool:
|
|
|
120
102
|
|
|
121
103
|
|
|
122
104
|
def _merge_tracked_content(target, source):
|
|
123
|
-
"""Move all children from source tracked change to target."""
|
|
124
105
|
while source.firstChild:
|
|
125
106
|
child = source.firstChild
|
|
126
107
|
source.removeChild(child)
|
|
@@ -128,7 +109,6 @@ def _merge_tracked_content(target, source):
|
|
|
128
109
|
|
|
129
110
|
|
|
130
111
|
def _find_elements(root, tag: str) -> list:
|
|
131
|
-
"""Find all elements matching tag name (with or without namespace)."""
|
|
132
112
|
results = []
|
|
133
113
|
|
|
134
114
|
def traverse(node):
|
|
@@ -144,11 +124,6 @@ def _find_elements(root, tag: str) -> list:
|
|
|
144
124
|
|
|
145
125
|
|
|
146
126
|
def get_tracked_change_authors(doc_xml_path: Path) -> dict[str, int]:
|
|
147
|
-
"""Get authors and their tracked change counts from a document.xml file.
|
|
148
|
-
|
|
149
|
-
Returns:
|
|
150
|
-
Dict mapping author name to count of tracked changes (w:ins + w:del)
|
|
151
|
-
"""
|
|
152
127
|
if not doc_xml_path.exists():
|
|
153
128
|
return {}
|
|
154
129
|
|
|
@@ -172,7 +147,6 @@ def get_tracked_change_authors(doc_xml_path: Path) -> dict[str, int]:
|
|
|
172
147
|
|
|
173
148
|
|
|
174
149
|
def _get_authors_from_docx(docx_path: Path) -> dict[str, int]:
|
|
175
|
-
"""Get authors and counts from a packed DOCX file."""
|
|
176
150
|
try:
|
|
177
151
|
with zipfile.ZipFile(docx_path, "r") as zf:
|
|
178
152
|
if "word/document.xml" not in zf.namelist():
|
|
@@ -196,22 +170,6 @@ def _get_authors_from_docx(docx_path: Path) -> dict[str, int]:
|
|
|
196
170
|
|
|
197
171
|
|
|
198
172
|
def infer_author(modified_dir: Path, original_docx: Path, default: str = "Claude") -> str:
|
|
199
|
-
"""Infer the author to validate by finding who added tracked changes.
|
|
200
|
-
|
|
201
|
-
Compares tracked change counts between modified and original documents.
|
|
202
|
-
Returns the author who added new tracked changes.
|
|
203
|
-
|
|
204
|
-
Args:
|
|
205
|
-
modified_dir: Path to unpacked DOCX directory
|
|
206
|
-
original_docx: Path to original DOCX file
|
|
207
|
-
default: Default author if no new changes found
|
|
208
|
-
|
|
209
|
-
Returns:
|
|
210
|
-
Author name to use for validation
|
|
211
|
-
|
|
212
|
-
Raises:
|
|
213
|
-
ValueError: If multiple authors added new changes (ambiguous)
|
|
214
|
-
"""
|
|
215
173
|
modified_xml = modified_dir / "word" / "document.xml"
|
|
216
174
|
modified_authors = get_tracked_change_authors(modified_xml)
|
|
217
175
|
|
|
@@ -220,7 +178,6 @@ def infer_author(modified_dir: Path, original_docx: Path, default: str = "Claude
|
|
|
220
178
|
|
|
221
179
|
original_authors = _get_authors_from_docx(original_docx)
|
|
222
180
|
|
|
223
|
-
# Calculate new changes per author (modified count - original count)
|
|
224
181
|
new_changes: dict[str, int] = {}
|
|
225
182
|
for author, count in modified_authors.items():
|
|
226
183
|
original_count = original_authors.get(author, 0)
|
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
#!/usr/bin/env python3
|
|
2
1
|
"""Pack a directory into a DOCX, PPTX, or XLSX file.
|
|
3
2
|
|
|
4
3
|
Validates with auto-repair, condenses XML formatting, and creates the Office file.
|
|
@@ -29,18 +28,6 @@ def pack(
|
|
|
29
28
|
validate: bool = True,
|
|
30
29
|
infer_author_func=None,
|
|
31
30
|
) -> tuple[None, str]:
|
|
32
|
-
"""Pack a directory into an Office file (DOCX, PPTX, or XLSX).
|
|
33
|
-
|
|
34
|
-
Args:
|
|
35
|
-
input_directory: Path to unpacked Office document directory
|
|
36
|
-
output_file: Path to output Office file
|
|
37
|
-
original_file: Path to original file for validation comparison
|
|
38
|
-
validate: If True, run validation with auto-repair before packing
|
|
39
|
-
infer_author_func: Optional function to infer author for redlining validation
|
|
40
|
-
|
|
41
|
-
Returns:
|
|
42
|
-
(None, message) - message indicates success or failure
|
|
43
|
-
"""
|
|
44
31
|
input_dir = Path(input_directory)
|
|
45
32
|
output_path = Path(output_file)
|
|
46
33
|
suffix = output_path.suffix.lower()
|
|
@@ -51,7 +38,6 @@ def pack(
|
|
|
51
38
|
if suffix not in {".docx", ".pptx", ".xlsx"}:
|
|
52
39
|
return None, f"Error: {output_file} must be a .docx, .pptx, or .xlsx file"
|
|
53
40
|
|
|
54
|
-
# Validate with auto-repair if requested and original file provided
|
|
55
41
|
if validate and original_file:
|
|
56
42
|
original_path = Path(original_file)
|
|
57
43
|
if original_path.exists():
|
|
@@ -63,17 +49,14 @@ def pack(
|
|
|
63
49
|
if not success:
|
|
64
50
|
return None, f"Error: Validation failed for {input_dir}"
|
|
65
51
|
|
|
66
|
-
# Work in temporary directory to avoid modifying original
|
|
67
52
|
with tempfile.TemporaryDirectory() as temp_dir:
|
|
68
53
|
temp_content_dir = Path(temp_dir) / "content"
|
|
69
54
|
shutil.copytree(input_dir, temp_content_dir)
|
|
70
55
|
|
|
71
|
-
# Process XML files to remove pretty-printing whitespace
|
|
72
56
|
for pattern in ["*.xml", "*.rels"]:
|
|
73
57
|
for xml_file in temp_content_dir.rglob(pattern):
|
|
74
58
|
_condense_xml(xml_file)
|
|
75
59
|
|
|
76
|
-
# Create final Office file as zip archive
|
|
77
60
|
output_path.parent.mkdir(parents=True, exist_ok=True)
|
|
78
61
|
with zipfile.ZipFile(output_path, "w", zipfile.ZIP_DEFLATED) as zf:
|
|
79
62
|
for f in temp_content_dir.rglob("*"):
|
|
@@ -89,16 +72,10 @@ def _run_validation(
|
|
|
89
72
|
suffix: str,
|
|
90
73
|
infer_author_func=None,
|
|
91
74
|
) -> tuple[bool, str | None]:
|
|
92
|
-
"""Run validation with auto-repair.
|
|
93
|
-
|
|
94
|
-
Returns:
|
|
95
|
-
(success, output) - success is True if all validations pass
|
|
96
|
-
"""
|
|
97
75
|
output_lines = []
|
|
98
76
|
validators = []
|
|
99
77
|
|
|
100
78
|
if suffix == ".docx":
|
|
101
|
-
# Infer author for redlining validation
|
|
102
79
|
author = "Claude"
|
|
103
80
|
if infer_author_func:
|
|
104
81
|
try:
|
|
@@ -112,17 +89,14 @@ def _run_validation(
|
|
|
112
89
|
]
|
|
113
90
|
elif suffix == ".pptx":
|
|
114
91
|
validators = [PPTXSchemaValidator(unpacked_dir, original_file)]
|
|
115
|
-
# xlsx has no schema validator yet
|
|
116
92
|
|
|
117
93
|
if not validators:
|
|
118
94
|
return True, None
|
|
119
95
|
|
|
120
|
-
# Run auto-repair
|
|
121
96
|
total_repairs = sum(v.repair() for v in validators)
|
|
122
97
|
if total_repairs:
|
|
123
98
|
output_lines.append(f"Auto-repaired {total_repairs} issue(s)")
|
|
124
99
|
|
|
125
|
-
# Run validation
|
|
126
100
|
success = all(v.validate() for v in validators)
|
|
127
101
|
|
|
128
102
|
if success:
|
|
@@ -132,18 +106,14 @@ def _run_validation(
|
|
|
132
106
|
|
|
133
107
|
|
|
134
108
|
def _condense_xml(xml_file: Path) -> None:
|
|
135
|
-
"""Strip unnecessary whitespace and remove comments from XML."""
|
|
136
109
|
try:
|
|
137
110
|
with open(xml_file, encoding="utf-8") as f:
|
|
138
111
|
dom = defusedxml.minidom.parse(f)
|
|
139
112
|
|
|
140
|
-
# Process each element to remove whitespace and comments
|
|
141
113
|
for element in dom.getElementsByTagName("*"):
|
|
142
|
-
# Skip text elements (w:t, a:t, etc.) - preserve their content
|
|
143
114
|
if element.tagName.endswith(":t"):
|
|
144
115
|
continue
|
|
145
116
|
|
|
146
|
-
# Remove whitespace-only text nodes and comment nodes
|
|
147
117
|
for child in list(element.childNodes):
|
|
148
118
|
if (
|
|
149
119
|
child.nodeType == child.TEXT_NODE
|