@panda-agent/panda-cli 0.1.29 → 0.1.31
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/pandacli.mjs +6 -1
- package/bundled-preset-skills/.gitkeep +0 -0
- package/bundled-preset-skills/README.md +17 -0
- package/bundled-preset-skills/docx/.skill-metadata.yaml +173 -0
- package/bundled-preset-skills/docx/LICENSE.txt +30 -0
- package/bundled-preset-skills/docx/SKILL.md +589 -0
- package/bundled-preset-skills/docx/scripts/__init__.py +1 -0
- package/bundled-preset-skills/docx/scripts/accept_changes.py +206 -0
- package/bundled-preset-skills/docx/scripts/comment.py +442 -0
- package/bundled-preset-skills/docx/scripts/office/helpers/__init__.py +1 -0
- package/bundled-preset-skills/docx/scripts/office/helpers/merge_runs.py +190 -0
- package/bundled-preset-skills/docx/scripts/office/helpers/simplify_redlines.py +185 -0
- package/bundled-preset-skills/docx/scripts/office/pack.py +167 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +1499 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd +146 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +1085 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd +11 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd +3081 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-picture.xsd +23 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd +185 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +287 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd +1676 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd +28 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd +144 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +174 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd +25 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd +18 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd +59 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd +56 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd +195 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd +582 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd +25 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd +4439 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd +570 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd +509 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd +12 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd +108 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd +96 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/wml.xsd +3646 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd +116 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd +42 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd +50 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd +49 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd +33 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/mce/mc.xsd +75 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/microsoft/wml-2010.xsd +560 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/microsoft/wml-2012.xsd +67 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/microsoft/wml-2018.xsd +14 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/microsoft/wml-cex-2018.xsd +20 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/microsoft/wml-cid-2016.xsd +13 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/microsoft/wml-sdtdatahash-2020.xsd +4 -0
- package/bundled-preset-skills/docx/scripts/office/schemas/microsoft/wml-symex-2015.xsd +8 -0
- package/bundled-preset-skills/docx/scripts/office/soffice.py +194 -0
- package/bundled-preset-skills/docx/scripts/office/unpack.py +145 -0
- package/bundled-preset-skills/docx/scripts/office/validate.py +114 -0
- package/bundled-preset-skills/docx/scripts/office/validators/__init__.py +16 -0
- package/bundled-preset-skills/docx/scripts/office/validators/base.py +733 -0
- package/bundled-preset-skills/docx/scripts/office/validators/docx.py +354 -0
- package/bundled-preset-skills/docx/scripts/office/validators/pptx.py +230 -0
- package/bundled-preset-skills/docx/scripts/office/validators/redlining.py +212 -0
- package/bundled-preset-skills/docx/scripts/templates/comments.xml +3 -0
- package/bundled-preset-skills/docx/scripts/templates/commentsExtended.xml +3 -0
- package/bundled-preset-skills/docx/scripts/templates/commentsExtensible.xml +3 -0
- package/bundled-preset-skills/docx/scripts/templates/commentsIds.xml +3 -0
- package/bundled-preset-skills/docx/scripts/templates/people.xml +3 -0
- package/bundled-preset-skills/frontend-design/LICENSE.txt +177 -0
- package/bundled-preset-skills/frontend-design/SKILL.md +42 -0
- package/bundled-preset-skills/pdf/.skill-metadata.yaml +273 -0
- package/bundled-preset-skills/pdf/LICENSE.txt +30 -0
- package/bundled-preset-skills/pdf/SKILL.md +324 -0
- package/bundled-preset-skills/pdf/advanced-reference.md +609 -0
- package/bundled-preset-skills/pdf/form-filling-guide.md +318 -0
- package/bundled-preset-skills/pdf/forms.md +294 -0
- package/bundled-preset-skills/pdf/reference.md +612 -0
- package/bundled-preset-skills/pdf/scripts/check_bounding_boxes.py +198 -0
- package/bundled-preset-skills/pdf/scripts/check_fillable_fields.py +64 -0
- package/bundled-preset-skills/pdf/scripts/convert_pdf_to_images.py +102 -0
- package/bundled-preset-skills/pdf/scripts/create_validation_image.py +125 -0
- package/bundled-preset-skills/pdf/scripts/extract_form_field_info.py +220 -0
- package/bundled-preset-skills/pdf/scripts/extract_form_structure.py +202 -0
- package/bundled-preset-skills/pdf/scripts/fill_fillable_fields.py +205 -0
- package/bundled-preset-skills/pdf/scripts/fill_pdf_form_with_annotations.py +193 -0
- package/bundled-preset-skills/pptx-generator/SKILL.md +204 -0
- package/bundled-preset-skills/pptx-generator/assets/styles/business.json +8 -0
- package/bundled-preset-skills/pptx-generator/assets/styles/minimal.json +8 -0
- package/bundled-preset-skills/pptx-generator/assets/styles/modern.json +8 -0
- package/bundled-preset-skills/pptx-generator/assets/templates/ppt_data_template.json +40 -0
- package/bundled-preset-skills/pptx-generator/references/collaboration_guide.md +381 -0
- package/bundled-preset-skills/pptx-generator/references/json_format_spec.md +215 -0
- package/bundled-preset-skills/pptx-generator/references/layout_guide.md +290 -0
- package/bundled-preset-skills/pptx-generator/scripts/json_validator.py +194 -0
- package/bundled-preset-skills/pptx-generator/scripts/pptx_builder.py +340 -0
- package/bundled-preset-skills/pptx-generator/scripts/pptx_validator.py +162 -0
- package/bundled-preset-skills/skill-creator/LICENSE.txt +202 -0
- package/bundled-preset-skills/skill-creator/SKILL.md +479 -0
- package/bundled-preset-skills/skill-creator/agents/analyzer.md +274 -0
- package/bundled-preset-skills/skill-creator/agents/comparator.md +202 -0
- package/bundled-preset-skills/skill-creator/agents/grader.md +223 -0
- package/bundled-preset-skills/skill-creator/assets/eval_review.html +146 -0
- package/bundled-preset-skills/skill-creator/eval-viewer/generate_review.py +471 -0
- package/bundled-preset-skills/skill-creator/eval-viewer/viewer.html +1325 -0
- package/bundled-preset-skills/skill-creator/references/schemas.md +430 -0
- package/bundled-preset-skills/skill-creator/scripts/__init__.py +0 -0
- package/bundled-preset-skills/skill-creator/scripts/aggregate_benchmark.py +401 -0
- package/bundled-preset-skills/skill-creator/scripts/generate_report.py +326 -0
- package/bundled-preset-skills/skill-creator/scripts/improve_description.py +248 -0
- package/bundled-preset-skills/skill-creator/scripts/package_skill.py +136 -0
- package/bundled-preset-skills/skill-creator/scripts/quick_validate.py +103 -0
- package/bundled-preset-skills/skill-creator/scripts/run_eval.py +310 -0
- package/bundled-preset-skills/skill-creator/scripts/run_loop.py +332 -0
- package/bundled-preset-skills/skill-creator/scripts/utils.py +47 -0
- package/bundled-preset-skills/xlsx/.skill-metadata.yaml +185 -0
- package/bundled-preset-skills/xlsx/LICENSE.txt +30 -0
- package/bundled-preset-skills/xlsx/SKILL.md +233 -0
- package/bundled-preset-skills/xlsx/scripts/office/helpers/__init__.py +1 -0
- package/bundled-preset-skills/xlsx/scripts/office/helpers/merge_runs.py +226 -0
- package/bundled-preset-skills/xlsx/scripts/office/helpers/simplify_redlines.py +198 -0
- package/bundled-preset-skills/xlsx/scripts/office/pack.py +162 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +1499 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd +146 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +1085 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd +11 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd +3081 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-picture.xsd +23 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd +185 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +287 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd +1676 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd +28 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd +144 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +174 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd +25 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd +18 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd +59 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd +56 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd +195 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd +582 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd +25 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd +4439 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd +570 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd +509 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd +12 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd +108 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd +96 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/wml.xsd +3646 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd +116 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd +42 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd +50 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd +49 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd +33 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/mce/mc.xsd +75 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/microsoft/wml-2010.xsd +560 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/microsoft/wml-2012.xsd +67 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/microsoft/wml-2018.xsd +14 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/microsoft/wml-cex-2018.xsd +20 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/microsoft/wml-cid-2016.xsd +13 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/microsoft/wml-sdtdatahash-2020.xsd +4 -0
- package/bundled-preset-skills/xlsx/scripts/office/schemas/microsoft/wml-symex-2015.xsd +8 -0
- package/bundled-preset-skills/xlsx/scripts/office/soffice.py +185 -0
- package/bundled-preset-skills/xlsx/scripts/office/unpack.py +146 -0
- package/bundled-preset-skills/xlsx/scripts/office/validate.py +108 -0
- package/bundled-preset-skills/xlsx/scripts/office/validators/__init__.py +13 -0
- package/bundled-preset-skills/xlsx/scripts/office/validators/base.py +800 -0
- package/bundled-preset-skills/xlsx/scripts/office/validators/docx.py +383 -0
- package/bundled-preset-skills/xlsx/scripts/office/validators/pptx.py +250 -0
- package/bundled-preset-skills/xlsx/scripts/office/validators/redlining.py +229 -0
- package/bundled-preset-skills/xlsx/scripts/recalc.py +296 -0
- package/dist/panda-cli-ink.bundle.mjs +276 -342
- package/package.json +6 -4
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
"""Coalesce adjacent ``<w:r>`` elements that share identical formatting.
|
|
2
|
+
|
|
3
|
+
Operates on paragraphs *and* tracked-change containers (``<w:ins>``,
|
|
4
|
+
``<w:del>``). Additionally strips RSID attributes from runs and removes
|
|
5
|
+
``proofErr`` spell/grammar markers that would otherwise prevent merging.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import pathlib
|
|
9
|
+
|
|
10
|
+
import defusedxml.minidom
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
# ── DOM traversal utilities ──────────────────────────────────────────────────
|
|
14
|
+
|
|
15
|
+
def _collect_by_tag(root, tag: str) -> list:
    """Gather every element at or below *root* whose name matches *tag*.

    A name matches when the element's local name (falling back to its tag
    name) equals *tag* or ends with ``":" + tag``.  Results are returned in
    document (pre-order) sequence.  Children are only explored beneath
    element nodes.
    """
    suffix = ":%s" % tag
    matches: list = []
    pending = [root]
    while pending:
        node = pending.pop()
        if node.nodeType != node.ELEMENT_NODE:
            continue
        name = node.localName or node.tagName
        if name == tag or name.endswith(suffix):
            matches.append(node)
        # Push children right-to-left so the leftmost child is visited next.
        pending.extend(reversed(node.childNodes))
    return matches
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def _child_by_tag(parent, tag: str):
    """Look up the first direct child element of *parent* named *tag*.

    Only immediate children are inspected.  A child matches when its local
    name (or tag name, if no local name is set) equals *tag* or ends with
    ``":" + tag``.  Returns ``None`` when nothing matches.
    """
    suffix = ":%s" % tag

    def _is_match(node) -> bool:
        if node.nodeType != node.ELEMENT_NODE:
            return False
        name = node.localName or node.tagName
        return name == tag or name.endswith(suffix)

    return next((node for node in parent.childNodes if _is_match(node)), None)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _children_by_tag(parent, tag: str) -> list:
    """List the direct child elements of *parent* named *tag*, in order.

    Matching follows the same rule as the other lookup helpers: the local
    name (or tag name) equals *tag* or ends with ``":" + tag``.
    """
    suffix = ":%s" % tag
    selected: list = []
    for node in parent.childNodes:
        if node.nodeType != node.ELEMENT_NODE:
            continue
        name = node.localName or node.tagName
        if name == tag or name.endswith(suffix):
            selected.append(node)
    return selected
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def _directly_adjacent(a, b) -> bool:
    """Report whether *b* follows *a* with nothing significant in between.

    Walks the siblings after *a*.  Reaching *b* yields ``True``; meeting any
    other element, or a text node containing non-whitespace, yields
    ``False``.  Running out of siblings without meeting *b* also yields
    ``False``.
    """
    node = a.nextSibling
    while node is not None and node is not b:
        if node.nodeType == node.ELEMENT_NODE:
            return False
        if node.nodeType == node.TEXT_NODE and node.data.strip():
            return False
        node = node.nextSibling
    # The loop stops either on *b* (adjacent) or at the end of the siblings.
    return node is not None
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def _tag_matches_run(nd) -> bool:
    """Tell whether *nd* is a run element (name ``r`` or ending in ``:r``)."""
    name = nd.localName
    if not name:
        name = nd.tagName
    if name == "r":
        return True
    return name.endswith(":r")
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
# ── Cleanup passes ───────────────────────────────────────────────────────────
|
|
69
|
+
|
|
70
|
+
def _purge_elements(root, tag: str) -> None:
    """Detach every element named *tag* found at or below *root*.

    Elements without a parent node are left alone.
    """
    for victim in _collect_by_tag(root, tag):
        owner = victim.parentNode
        if not owner:
            continue
        owner.removeChild(victim)
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def _erase_rsid_attributes(root) -> None:
    """Drop every attribute whose name contains ``rsid`` from run elements.

    The name test is case-insensitive.  Attribute names are collected first
    so the attribute map is not mutated while it is being iterated.
    """
    for run in _collect_by_tag(root, "r"):
        rsid_names = [
            attr.name
            for attr in run.attributes.values()
            if "rsid" in attr.name.lower()
        ]
        for name in rsid_names:
            run.removeAttribute(name)
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
# ── Core merging logic ───────────────────────────────────────────────────────
|
|
84
|
+
|
|
85
|
+
def _formatting_equal(r1, r2) -> bool:
    """Compare the ``rPr`` (run properties) children of two runs.

    Two runs are equal when neither has an ``rPr`` child, or when both have
    one and the serialized XML of the two is identical.
    """
    props_1 = _child_by_tag(r1, "rPr")
    props_2 = _child_by_tag(r2, "rPr")
    if props_1 is None or props_2 is None:
        # Equal only if *both* are missing.
        return props_1 is None and props_2 is None
    return props_1.toxml() == props_2.toxml()
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def _absorb_content(dest, src) -> None:
    """Re-parent the element children of *src* onto the end of *dest*.

    ``rPr`` children and non-element nodes (text, comments) stay in *src*.
    """
    def _is_movable(node) -> bool:
        if node.nodeType != node.ELEMENT_NODE:
            return False
        name = node.localName or node.tagName
        return not (name == "rPr" or name.endswith(":rPr"))

    # Snapshot first: appendChild() removes the node from src.childNodes.
    movable = [node for node in src.childNodes if _is_movable(node)]
    for node in movable:
        dest.appendChild(node)
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def _next_elem(nd):
    """Return the nearest following sibling of *nd* that is an element.

    Non-element siblings (text, comments) are skipped.  Returns ``None``
    when no element sibling follows.
    """
    candidate = nd.nextSibling
    while candidate is not None and candidate.nodeType != candidate.ELEMENT_NODE:
        candidate = candidate.nextSibling
    return candidate
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
def _next_run_sibling(nd):
    """Return the next following sibling of *nd* that is a run element.

    Unlike ``_next_elem`` this skips over non-run elements too.  Returns
    ``None`` when no run follows.
    """
    candidate = nd.nextSibling
    while candidate is not None:
        is_element = candidate.nodeType == candidate.ELEMENT_NODE
        if is_element and _tag_matches_run(candidate):
            break
        candidate = candidate.nextSibling
    return candidate
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def _first_run_child(container):
    """Return the first direct child of *container* that is a run, else ``None``."""
    runs = (
        node
        for node in container.childNodes
        if node.nodeType == node.ELEMENT_NODE and _tag_matches_run(node)
    )
    return next(runs, None)
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def _squash_text_nodes(run) -> None:
    """Combine consecutive ``<w:t>`` (or ``<w:delText>``) nodes inside *run*.

    Text elements are merged only with elements of the same tag, and only
    when ``_directly_adjacent`` reports that nothing but insignificant
    whitespace separates them.  The text of the later element is appended to
    the earlier one and the later element is removed from *run*.

    After each merge the ``xml:space`` attribute of the surviving element is
    set to ``preserve`` when the combined text begins or ends with
    whitespace, and removed otherwise.
    """
    # Handle inserted/normal text and deleted text separately so that a
    # <w:t> is never folded into a <w:delText> or vice versa.
    for tag in ("t", "delText"):
        nodes = _children_by_tag(run, tag)
        # Walk backwards: each merge folds the later node into an earlier
        # one that has not been visited yet, so the list never goes stale.
        for idx in range(len(nodes) - 1, 0, -1):
            cur, prev = nodes[idx], nodes[idx - 1]
            if not _directly_adjacent(prev, cur):
                continue

            txt_prev = prev.firstChild.data if prev.firstChild else ""
            txt_cur = cur.firstChild.data if cur.firstChild else ""
            combined = txt_prev + txt_cur

            if prev.firstChild:
                prev.firstChild.data = combined
            else:
                prev.appendChild(run.ownerDocument.createTextNode(combined))

            # Any leading/trailing whitespace (space, tab, newline) needs
            # xml:space="preserve" to survive a round trip.
            if combined != combined.strip():
                prev.setAttribute("xml:space", "preserve")
            elif prev.hasAttribute("xml:space"):
                prev.removeAttribute("xml:space")

            run.removeChild(cur)
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def _merge_within(container) -> int:
    """Fold runs with matching formatting together inside *container*.

    Starting from the first run child, each run absorbs the element that
    immediately follows it for as long as that element is a run with equal
    formatting.  The absorbed run is removed from *container*.  The run's
    text nodes are then squashed and processing moves to the next run
    sibling.

    Returns the number of runs that were absorbed.
    """
    absorbed = 0
    current = _first_run_child(container)
    while current is not None:
        follower = _next_elem(current)
        while (
            follower is not None
            and _tag_matches_run(follower)
            and _formatting_equal(current, follower)
        ):
            _absorb_content(current, follower)
            container.removeChild(follower)
            absorbed += 1
            follower = _next_elem(current)
        _squash_text_nodes(current)
        current = _next_run_sibling(current)
    return absorbed
|
|
167
|
+
|
|
168
|
+
|
|
169
|
+
# ── Public entry point ───────────────────────────────────────────────────────
|
|
170
|
+
|
|
171
|
+
def merge_runs(input_dir: str) -> tuple[int, str]:
    """Merge adjacent same-format runs in ``<input_dir>/word/document.xml``.

    The file is parsed, ``proofErr`` elements and run RSID attributes are
    removed, runs are merged within every element that directly contains a
    run, and the result is written back in place as UTF-8.

    Returns ``(merged_count, message)``.  On any failure, including a
    missing file, the count is ``0`` and the message starts with ``Error:``.
    """
    xml_file = pathlib.Path(input_dir).joinpath("word", "document.xml")
    if not xml_file.exists():
        return 0, "Error: %s not found" % xml_file

    try:
        source = xml_file.read_text(encoding="utf-8")
        document = defusedxml.minidom.parseString(source)
        body = document.documentElement

        # Clean-up first: these artefacts would otherwise block merging.
        _purge_elements(body, "proofErr")
        _erase_rsid_attributes(body)

        containers = set()
        for run in _collect_by_tag(body, "r"):
            containers.add(run.parentNode)

        merged = 0
        for container in containers:
            merged += _merge_within(container)

        xml_file.write_bytes(document.toxml(encoding="UTF-8"))
        return merged, "Merged %d runs" % merged
    except Exception as exc:
        # Best-effort tool: report the failure to the caller as a message.
        return 0, "Error: %s" % exc
|
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
"""Collapse consecutive tracked-change wrappers from the same reviewer.
|
|
2
|
+
|
|
3
|
+
Adjacent ``<w:ins>`` blocks by the same author are folded into one element;
|
|
4
|
+
likewise for ``<w:del>``. This dramatically reduces clutter in documents
|
|
5
|
+
with heavy revision history.
|
|
6
|
+
|
|
7
|
+
Constraints:
|
|
8
|
+
* Only same-type merges: ``ins`` with ``ins``, ``del`` with ``del``.
|
|
9
|
+
* Author must match (timestamps are ignored).
|
|
10
|
+
* Elements must be truly adjacent — only insignificant whitespace allowed
|
|
11
|
+
between them.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
import pathlib
|
|
15
|
+
import xml.etree.ElementTree as ET
|
|
16
|
+
import zipfile
|
|
17
|
+
|
|
18
|
+
import defusedxml.minidom
|
|
19
|
+
|
|
20
|
+
_WML_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
# ── DOM helpers (minidom) ────────────────────────────────────────────────────
|
|
24
|
+
|
|
25
|
+
def _scan_elements(root, tag: str) -> list:
    """Return all elements at or below *root* named *tag*, in document order.

    A name matches when the local name (or tag name) equals *tag* or ends
    with ``":" + tag``.  Only element nodes are descended into.
    """
    suffix = ":%s" % tag

    def _iter_matches(node):
        if node.nodeType != node.ELEMENT_NODE:
            return
        name = node.localName or node.tagName
        if name == tag or name.endswith(suffix):
            yield node
        for child in node.childNodes:
            yield from _iter_matches(child)

    return list(_iter_matches(root))
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _tag_match(nd, tag: str) -> bool:
    """Tell whether element *nd* is named *tag* (bare or with a prefix)."""
    name = nd.localName
    if not name:
        name = nd.tagName
    if name == tag:
        return True
    return name.endswith(":%s" % tag)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _extract_author(elem) -> str:
    """Read the author attribute of a tracked-change element.

    ``w:author`` is tried first.  If that is missing or empty, the first
    attribute whose local name is ``author`` or whose qualified name ends
    with ``:author`` is used.  Returns an empty string when none is found.
    """
    direct = elem.getAttribute("w:author")
    if direct:
        return direct
    fallback = (
        attr.value
        for attr in elem.attributes.values()
        if attr.localName == "author" or attr.name.endswith(":author")
    )
    return next(fallback, "")
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _only_whitespace_between(first, second) -> bool:
    """Report whether *second* follows *first* separated only by whitespace.

    Walks the siblings after *first*.  Returns ``True`` once *second* is
    reached without meeting another element or a text node that contains
    non-whitespace characters.  Returns ``False`` if such a node is met, or
    if the sibling chain ends before *second* is reached (that is, *second*
    is not a later sibling of *first*).
    """
    cur = first.nextSibling
    while cur is not None and cur is not second:
        if cur.nodeType == cur.ELEMENT_NODE:
            return False
        if cur.nodeType == cur.TEXT_NODE and cur.data.strip():
            return False
        cur = cur.nextSibling
    # Falling off the end of the siblings means *second* was never found;
    # that must not count as "adjacent".
    return cur is not None
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _transplant_children(dest, src) -> None:
    """Move every child node of *src* to the end of *dest*, keeping order.

    All node kinds are moved (elements, text, comments); *src* is left empty.
    """
    # Iterate over a snapshot because removeChild() mutates src.childNodes.
    for node in list(src.childNodes):
        src.removeChild(node)
        dest.appendChild(node)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def _fold_tracked_in(container, tag: str) -> int:
    """Fold adjacent same-author *tag* wrappers that are direct children.

    Each wrapper is compared with the one kept before it.  When both carry
    the same author and only whitespace separates them, the later wrapper's
    children are moved into the earlier one and the later wrapper is
    removed from *container*.

    Returns the number of wrappers that were folded away.
    """
    wrappers = []
    for node in container.childNodes:
        if node.nodeType == node.ELEMENT_NODE and _tag_match(node, tag):
            wrappers.append(node)
    if len(wrappers) < 2:
        return 0

    folded = 0
    keeper = wrappers[0]
    for nxt in wrappers[1:]:
        same_author = _extract_author(keeper) == _extract_author(nxt)
        if same_author and _only_whitespace_between(keeper, nxt):
            _transplant_children(keeper, nxt)
            container.removeChild(nxt)
            folded += 1
        else:
            # No merge possible: the next comparison starts from this one.
            keeper = nxt
    return folded
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
# ── Public API (minidom-based) ───────────────────────────────────────────────
|
|
94
|
+
|
|
95
|
+
def simplify_redlines(input_dir: str) -> tuple[int, str]:
    """Fold adjacent same-author tracked changes in ``word/document.xml``.

    Every ``p`` and ``tc`` element of the document is processed, first for
    ``ins`` wrappers and then for ``del`` wrappers.  The file is rewritten
    in place as UTF-8.

    Returns ``(folded_count, message)``.  On any failure, including a
    missing file, the count is ``0`` and the message starts with ``Error:``.
    """
    xml_file = pathlib.Path(input_dir).joinpath("word", "document.xml")
    if not xml_file.exists():
        return 0, "Error: %s not found" % xml_file

    try:
        source = xml_file.read_text(encoding="utf-8")
        document = defusedxml.minidom.parseString(source)
        body = document.documentElement

        # Collect all containers up front, before any folding mutates the tree.
        containers = [*_scan_elements(body, "p"), *_scan_elements(body, "tc")]

        total = 0
        for container in containers:
            for wrapper_tag in ("ins", "del"):
                total += _fold_tracked_in(container, wrapper_tag)

        xml_file.write_bytes(document.toxml(encoding="UTF-8"))
        return total, "Simplified %d tracked changes" % total
    except Exception as exc:
        # Best-effort tool: report the failure to the caller as a message.
        return 0, "Error: %s" % exc
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
# ── ElementTree-based author analysis ────────────────────────────────────────
|
|
119
|
+
|
|
120
|
+
def get_tracked_change_authors(doc_xml_path: pathlib.Path) -> dict[str, int]:
    """Count tracked changes per author in an unpacked ``document.xml``.

    Every ``w:ins`` and ``w:del`` descendant of the root that carries a
    non-empty ``w:author`` attribute adds one to that author's count.

    Returns ``{author: change_count}``; an empty dict when the file does not
    exist or cannot be parsed.
    """
    if not doc_xml_path.exists():
        return {}
    try:
        root = ET.parse(doc_xml_path).getroot()
    except ET.ParseError:
        return {}

    namespaces = {"w": _WML_NS}
    author_attr = "{%s}author" % _WML_NS
    counts: dict[str, int] = {}
    for kind in ("ins", "del"):
        for change in root.findall(".//w:%s" % kind, namespaces):
            author = change.get(author_attr)
            if not author:
                continue
            counts[author] = counts.get(author, 0) + 1
    return counts
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def _authors_inside_docx(docx_path: pathlib.Path) -> dict[str, int]:
    """Count tracked changes per author inside a zipped ``.docx`` file.

    Reads ``word/document.xml`` straight from the archive and tallies the
    ``w:author`` attribute of every ``w:ins`` / ``w:del`` element.

    Returns ``{author: change_count}``; an empty dict when the archive is
    not a valid ZIP, lacks ``word/document.xml``, or the XML fails to parse.
    """
    member = "word/document.xml"
    try:
        with zipfile.ZipFile(docx_path, "r") as archive:
            if member not in archive.namelist():
                return {}
            with archive.open(member) as stream:
                root = ET.parse(stream).getroot()
    except (zipfile.BadZipFile, ET.ParseError):
        return {}

    namespaces = {"w": _WML_NS}
    author_attr = "{%s}author" % _WML_NS
    counts: dict[str, int] = {}
    for kind in ("ins", "del"):
        for change in root.findall(".//w:%s" % kind, namespaces):
            author = change.get(author_attr)
            if not author:
                continue
            counts[author] = counts.get(author, 0) + 1
    return counts
|
|
160
|
+
|
|
161
|
+
|
|
162
|
+
def infer_author(modified_dir: pathlib.Path, original_docx: pathlib.Path, default: str = "Claude") -> str:
    """Work out which single author added tracked changes to a document.

    Per-author counts in ``<modified_dir>/word/document.xml`` are compared
    with those in *original_docx*; authors whose count grew are treated as
    having added changes.

    Returns *default* when the modified document has no tracked changes or
    no author's count grew, and the author's name when exactly one grew.

    Raises:
        ValueError: when more than one author's count grew.
    """
    after = get_tracked_change_authors(modified_dir / "word" / "document.xml")
    if not after:
        return default

    before = _authors_inside_docx(original_docx)

    gained: dict[str, int] = {}
    for author, count in after.items():
        increase = count - before.get(author, 0)
        if increase > 0:
            gained[author] = increase

    if len(gained) > 1:
        raise ValueError(
            "Multiple authors added new changes: %s. "
            "Cannot infer which author to validate." % gained
        )
    for author in gained:
        return author
    return default
|
|
@@ -0,0 +1,167 @@
|
|
|
1
|
+
"""Reassemble an unpacked Office directory into a DOCX / PPTX / XLSX archive.
|
|
2
|
+
|
|
3
|
+
The tool validates with automatic repair, strips cosmetic whitespace from XML,
|
|
4
|
+
and produces the final ZIP-based file.
|
|
5
|
+
|
|
6
|
+
Invocation::
|
|
7
|
+
|
|
8
|
+
python pack.py <src_dir> <dest_file> [--original <file>] [--validate true|false]
|
|
9
|
+
|
|
10
|
+
Samples::
|
|
11
|
+
|
|
12
|
+
python pack.py unpacked/ output.docx --original input.docx
|
|
13
|
+
python pack.py unpacked/ output.pptx --validate false
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
import argparse
|
|
17
|
+
import shutil
|
|
18
|
+
import sys
|
|
19
|
+
import tempfile
|
|
20
|
+
import zipfile
|
|
21
|
+
import pathlib
|
|
22
|
+
|
|
23
|
+
import defusedxml.minidom
|
|
24
|
+
|
|
25
|
+
from validators import DOCXSchemaValidator, PPTXSchemaValidator, RedliningValidator
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
_ALLOWED_EXTENSIONS = {".docx", ".pptx", ".xlsx"}
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _strip_xml_formatting(fp: pathlib.Path) -> None:
    """Collapse pretty-printed XML back to a compact form, in place.

    Whitespace-only text nodes and comments are removed from every element
    except those whose whitespace is content rather than formatting:

    * text elements named ``t``, with or without a namespace prefix
      (``<w:t>``, ``<a:t>``, and the unprefixed ``<t>`` of SpreadsheetML);
    * any element that declares ``xml:space="preserve"`` (for example a
      ``<w:delText>`` holding a single deleted space).

    The file is rewritten as UTF-8.

    Raises:
        Exception: any read, parse, or write error is logged to stderr and
            re-raised unchanged.
    """
    try:
        with open(fp, encoding="utf-8") as fh:
            parsed = defusedxml.minidom.parse(fh)

        for el in parsed.getElementsByTagName("*"):
            tag = el.tagName
            if tag == "t" or tag.endswith(":t"):
                continue
            # Explicitly preserved whitespace must never be stripped.
            if el.getAttribute("xml:space") == "preserve":
                continue
            children_to_drop = [
                ch for ch in list(el.childNodes)
                if (ch.nodeType == ch.TEXT_NODE and ch.nodeValue and ch.nodeValue.strip() == "")
                or ch.nodeType == ch.COMMENT_NODE
            ]
            for ch in children_to_drop:
                el.removeChild(ch)

        fp.write_bytes(parsed.toxml(encoding="UTF-8"))
    except Exception as err:
        print("ERROR: Failed to parse {}: {}".format(fp.name, err), file=sys.stderr)
        raise
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _execute_validators(
    src_dir: pathlib.Path,
    orig: pathlib.Path,
    ext: str,
    author_fn=None,
) -> tuple[bool, str | None]:
    """Repair and validate *src_dir* with the validators chosen by *ext*.

    ``.docx`` runs the DOCX schema and redlining validators; the redlining
    author comes from ``author_fn(src_dir, orig)`` when supplied, falling
    back to ``"Claude"`` (with a warning on stderr) if it raises
    ``ValueError``.  ``.pptx`` runs the PPTX schema validator.  Any other
    extension has no validators and is reported as passing.

    Returns ``(ok, log_text)`` where *log_text* is ``None`` when there is
    nothing to report.
    """
    if ext == ".docx":
        writer = "Claude"
        if author_fn:
            try:
                writer = author_fn(src_dir, orig)
            except ValueError as ve:
                print("Warning: {} Using default author 'Claude'.".format(ve), file=sys.stderr)
        checkers = [
            DOCXSchemaValidator(src_dir, orig),
            RedliningValidator(src_dir, orig, author=writer),
        ]
    elif ext == ".pptx":
        checkers = [PPTXSchemaValidator(src_dir, orig)]
    else:
        return True, None

    messages: list[str] = []

    # Repair with every validator before validating with any of them.
    repaired = 0
    for checker in checkers:
        repaired += checker.repair()
    if repaired:
        messages.append("Auto-repaired {} issue(s)".format(repaired))

    # Stop at the first failing validator.
    passed = True
    for checker in checkers:
        if not checker.validate():
            passed = False
            break
    if passed:
        messages.append("All validations PASSED!")

    if not messages:
        return passed, None
    return passed, "\n".join(messages)
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def pack(
    input_directory: str,
    output_file: str,
    original_file: str | None = None,
    validate: bool = True,
    infer_author_func=None,
) -> tuple[None, str]:
    """Zip an unpacked Office directory into *output_file*.

    Validation runs only when *validate* is true, *original_file* is given
    and that file exists.  The directory is copied to a temporary staging
    area, every ``*.xml`` / ``*.rels`` file in the copy is compacted, and
    the copy is written to a deflated ZIP archive.  Parent directories of
    *output_file* are created as needed.

    Returns ``(None, message)``.  Failure messages start with ``Error:``.
    """
    source_dir = pathlib.Path(input_directory)
    target = pathlib.Path(output_file)
    suffix = target.suffix.lower()

    if not source_dir.is_dir():
        return None, "Error: {} is not a directory".format(source_dir)
    if suffix not in _ALLOWED_EXTENSIONS:
        return None, "Error: {} must be a .docx, .pptx, or .xlsx file".format(output_file)

    reference = pathlib.Path(original_file) if (validate and original_file) else None
    if reference is not None and reference.exists():
        ok, report = _execute_validators(source_dir, reference, suffix, infer_author_func)
        if report:
            print(report)
        if not ok:
            return None, "Error: Validation failed for {}".format(source_dir)

    with tempfile.TemporaryDirectory() as tmp_root:
        # Work on a copy so the caller's unpacked tree is left untouched.
        work = pathlib.Path(tmp_root) / "content"
        shutil.copytree(source_dir, work)

        for pattern in ("*.xml", "*.rels"):
            for xml_path in work.rglob(pattern):
                _strip_xml_formatting(xml_path)

        target.parent.mkdir(parents=True, exist_ok=True)
        with zipfile.ZipFile(target, "w", zipfile.ZIP_DEFLATED) as archive:
            for entry in work.rglob("*"):
                if not entry.is_file():
                    continue
                archive.write(entry, entry.relative_to(work))

    return None, "Successfully packed {} to {}".format(source_dir, output_file)
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
# ── CLI ──────────────────────────────────────────────────────────────────────
|
|
138
|
+
|
|
139
|
+
if __name__ == "__main__":
    # Command-line entry point: parse arguments, pack, and report the result.
    ap = argparse.ArgumentParser(
        description="Pack a directory into a DOCX, PPTX, or XLSX file"
    )
    ap.add_argument("input_directory", help="Unpacked Office document directory")
    ap.add_argument("output_file", help="Output Office file (.docx/.pptx/.xlsx)")
    ap.add_argument(
        "--original",
        help="Original file for validation comparison",
    )
    ap.add_argument(
        "--validate",
        # Only the literal "true" (any case) enables validation.
        type=lambda v: v.lower() == "true",
        default=True,
        metavar="true|false",
        help="Run validation with auto-repair (default: true)",
    )
    cli = ap.parse_args()

    _, message = pack(
        cli.input_directory,
        cli.output_file,
        original_file=cli.original,
        validate=cli.validate,
    )
    print(message)

    # pack() prefixes every failure message with "Error:".  Test the prefix
    # rather than a substring, so a successful run whose path happens to
    # contain "Error" does not exit with a failure status.
    if message.startswith("Error"):
        sys.exit(1)
|