@kortix/sandbox 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config/customize.sh +143 -0
- package/config/kortix-env-setup.sh +25 -0
- package/kortix-master/package.json +22 -0
- package/kortix-master/src/config.ts +22 -0
- package/kortix-master/src/index.ts +44 -0
- package/kortix-master/src/routes/env.ts +65 -0
- package/kortix-master/src/routes/proxy.ts +108 -0
- package/kortix-master/src/routes/update.ts +185 -0
- package/kortix-master/src/services/proxy.ts +43 -0
- package/kortix-master/src/services/secret-store.ts +156 -0
- package/kortix-master/tsconfig.json +14 -0
- package/opencode/agents/kortix-browser.md +142 -0
- package/opencode/agents/kortix-build.md +62 -0
- package/opencode/agents/kortix-explore.md +66 -0
- package/opencode/agents/kortix-image-gen.md +33 -0
- package/opencode/agents/kortix-main.md +450 -0
- package/opencode/agents/kortix-plan.md +100 -0
- package/opencode/agents/kortix-research.md +84 -0
- package/opencode/agents/kortix-sheets.md +61 -0
- package/opencode/agents/kortix-slides.md +64 -0
- package/opencode/agents/kortix-web-dev.md +572 -0
- package/opencode/commands/email.md +36 -0
- package/opencode/commands/init.md +43 -0
- package/opencode/commands/journal.md +44 -0
- package/opencode/commands/memory-init.md +81 -0
- package/opencode/commands/memory-search.md +50 -0
- package/opencode/commands/memory-status.md +56 -0
- package/opencode/commands/research.md +36 -0
- package/opencode/commands/search.md +38 -0
- package/opencode/commands/slides.md +32 -0
- package/opencode/commands/spreadsheet.md +30 -0
- package/opencode/memory.json +37 -0
- package/opencode/ocx.jsonc +10 -0
- package/opencode/opencode.jsonc +103 -0
- package/opencode/package.json +25 -0
- package/opencode/patches/apply.sh +19 -0
- package/opencode/patches/opencode-pty-spawn.txt +49 -0
- package/opencode/plugin/background-agents.ts.disabled +483 -0
- package/opencode/plugin/kdco-primitives/get-project-id.ts +172 -0
- package/opencode/plugin/kdco-primitives/index.ts +26 -0
- package/opencode/plugin/kdco-primitives/log-warn.ts +51 -0
- package/opencode/plugin/kdco-primitives/mutex.ts +122 -0
- package/opencode/plugin/kdco-primitives/shell.ts +138 -0
- package/opencode/plugin/kdco-primitives/temp.ts +36 -0
- package/opencode/plugin/kdco-primitives/terminal-detect.ts +34 -0
- package/opencode/plugin/kdco-primitives/types.ts +13 -0
- package/opencode/plugin/kdco-primitives/with-timeout.ts +84 -0
- package/opencode/plugin/memory.ts +306 -0
- package/opencode/plugin/worktree/state.ts +412 -0
- package/opencode/plugin/worktree/terminal.ts +1002 -0
- package/opencode/plugin/worktree.ts +861 -0
- package/opencode/skills/KORTIX-browser/SKILL.md +478 -0
- package/opencode/skills/KORTIX-cron-triggers/SKILL.md +173 -0
- package/opencode/skills/KORTIX-deep-research/SKILL.md +278 -0
- package/opencode/skills/KORTIX-docx/SKILL.md +398 -0
- package/opencode/skills/KORTIX-docx/scripts/__init__.py +1 -0
- package/opencode/skills/KORTIX-docx/scripts/accept_changes.py +104 -0
- package/opencode/skills/KORTIX-docx/scripts/comment.py +244 -0
- package/opencode/skills/KORTIX-docx/scripts/office/helpers/__init__.py +0 -0
- package/opencode/skills/KORTIX-docx/scripts/office/helpers/merge_runs.py +199 -0
- package/opencode/skills/KORTIX-docx/scripts/office/helpers/simplify_redlines.py +197 -0
- package/opencode/skills/KORTIX-docx/scripts/office/pack.py +159 -0
- package/opencode/skills/KORTIX-docx/scripts/office/soffice.py +183 -0
- package/opencode/skills/KORTIX-docx/scripts/office/unpack.py +132 -0
- package/opencode/skills/KORTIX-docx/scripts/office/validate.py +111 -0
- package/opencode/skills/KORTIX-docx/scripts/office/validators/__init__.py +15 -0
- package/opencode/skills/KORTIX-docx/scripts/office/validators/base.py +847 -0
- package/opencode/skills/KORTIX-docx/scripts/office/validators/docx.py +446 -0
- package/opencode/skills/KORTIX-docx/scripts/office/validators/pptx.py +275 -0
- package/opencode/skills/KORTIX-docx/scripts/office/validators/redlining.py +247 -0
- package/opencode/skills/KORTIX-docx/scripts/render_docx.py +179 -0
- package/opencode/skills/KORTIX-docx/scripts/templates/comments.xml +3 -0
- package/opencode/skills/KORTIX-docx/scripts/templates/commentsExtended.xml +3 -0
- package/opencode/skills/KORTIX-docx/scripts/templates/commentsExtensible.xml +3 -0
- package/opencode/skills/KORTIX-docx/scripts/templates/commentsIds.xml +3 -0
- package/opencode/skills/KORTIX-docx/scripts/templates/people.xml +3 -0
- package/opencode/skills/KORTIX-domain-research/SKILL.md +96 -0
- package/opencode/skills/KORTIX-domain-research/scripts/domain-lookup.py +810 -0
- package/opencode/skills/KORTIX-elevenlabs/SKILL.md +230 -0
- package/opencode/skills/KORTIX-elevenlabs/scripts/tts.py +389 -0
- package/opencode/skills/KORTIX-email/SKILL.md +145 -0
- package/opencode/skills/KORTIX-legal-writer/SKILL.md +409 -0
- package/opencode/skills/KORTIX-legal-writer/references/bluebook.md +152 -0
- package/opencode/skills/KORTIX-legal-writer/references/document-types.md +416 -0
- package/opencode/skills/KORTIX-legal-writer/scripts/courtlistener.py +291 -0
- package/opencode/skills/KORTIX-legal-writer/scripts/ecfr_lookup.py +299 -0
- package/opencode/skills/KORTIX-legal-writer/scripts/verify-legal.py +507 -0
- package/opencode/skills/KORTIX-logo-creator/SKILL.md +293 -0
- package/opencode/skills/KORTIX-logo-creator/references/prompt-patterns.md +134 -0
- package/opencode/skills/KORTIX-logo-creator/scripts/compose_logo.py +406 -0
- package/opencode/skills/KORTIX-logo-creator/scripts/create_logo_sheet.py +258 -0
- package/opencode/skills/KORTIX-logo-creator/scripts/remove_bg.py +96 -0
- package/opencode/skills/KORTIX-memory/SKILL.md +261 -0
- package/opencode/skills/KORTIX-memory/scripts/export-sessions.py +409 -0
- package/opencode/skills/KORTIX-paper-creator/SKILL.md +549 -0
- package/opencode/skills/KORTIX-paper-creator/assets/template.tex +101 -0
- package/opencode/skills/KORTIX-paper-creator/scripts/compile.sh +177 -0
- package/opencode/skills/KORTIX-paper-creator/scripts/openalex_to_bibtex.py +220 -0
- package/opencode/skills/KORTIX-paper-creator/scripts/verify.sh +354 -0
- package/opencode/skills/KORTIX-paper-search/SKILL.md +418 -0
- package/opencode/skills/KORTIX-pdf/SKILL.md +232 -0
- package/opencode/skills/KORTIX-pdf/forms.md +36 -0
- package/opencode/skills/KORTIX-pdf/reference.md +105 -0
- package/opencode/skills/KORTIX-pdf/scripts/check_bounding_boxes.py +65 -0
- package/opencode/skills/KORTIX-pdf/scripts/check_fillable_fields.py +11 -0
- package/opencode/skills/KORTIX-pdf/scripts/convert_pdf_to_images.py +33 -0
- package/opencode/skills/KORTIX-pdf/scripts/create_validation_image.py +37 -0
- package/opencode/skills/KORTIX-pdf/scripts/extract_form_field_info.py +122 -0
- package/opencode/skills/KORTIX-pdf/scripts/extract_form_structure.py +115 -0
- package/opencode/skills/KORTIX-pdf/scripts/fill_fillable_fields.py +98 -0
- package/opencode/skills/KORTIX-pdf/scripts/fill_pdf_form_with_annotations.py +107 -0
- package/opencode/skills/KORTIX-plan/SKILL.md +228 -0
- package/opencode/skills/KORTIX-presentation-viewer/SKILL.md +87 -0
- package/opencode/skills/KORTIX-presentation-viewer/serve.ts +136 -0
- package/opencode/skills/KORTIX-presentation-viewer/viewer.html +559 -0
- package/opencode/skills/KORTIX-presentations/SKILL.md +344 -0
- package/opencode/skills/KORTIX-remotion/SKILL.md +56 -0
- package/opencode/skills/KORTIX-remotion/rules/3d.md +86 -0
- package/opencode/skills/KORTIX-remotion/rules/animations.md +29 -0
- package/opencode/skills/KORTIX-remotion/rules/assets.md +78 -0
- package/opencode/skills/KORTIX-remotion/rules/audio-visualization.md +198 -0
- package/opencode/skills/KORTIX-remotion/rules/audio.md +169 -0
- package/opencode/skills/KORTIX-remotion/rules/calculate-metadata.md +104 -0
- package/opencode/skills/KORTIX-remotion/rules/can-decode.md +75 -0
- package/opencode/skills/KORTIX-remotion/rules/charts.md +120 -0
- package/opencode/skills/KORTIX-remotion/rules/compositions.md +141 -0
- package/opencode/skills/KORTIX-remotion/rules/display-captions.md +184 -0
- package/opencode/skills/KORTIX-remotion/rules/extract-frames.md +229 -0
- package/opencode/skills/KORTIX-remotion/rules/ffmpeg.md +38 -0
- package/opencode/skills/KORTIX-remotion/rules/fonts.md +152 -0
- package/opencode/skills/KORTIX-remotion/rules/get-audio-duration.md +58 -0
- package/opencode/skills/KORTIX-remotion/rules/get-video-dimensions.md +68 -0
- package/opencode/skills/KORTIX-remotion/rules/get-video-duration.md +58 -0
- package/opencode/skills/KORTIX-remotion/rules/gifs.md +141 -0
- package/opencode/skills/KORTIX-remotion/rules/images.md +130 -0
- package/opencode/skills/KORTIX-remotion/rules/import-srt-captions.md +69 -0
- package/opencode/skills/KORTIX-remotion/rules/light-leaks.md +73 -0
- package/opencode/skills/KORTIX-remotion/rules/lottie.md +68 -0
- package/opencode/skills/KORTIX-remotion/rules/maps.md +401 -0
- package/opencode/skills/KORTIX-remotion/rules/measuring-dom-nodes.md +35 -0
- package/opencode/skills/KORTIX-remotion/rules/measuring-text.md +143 -0
- package/opencode/skills/KORTIX-remotion/rules/parameters.md +98 -0
- package/opencode/skills/KORTIX-remotion/rules/sequencing.md +118 -0
- package/opencode/skills/KORTIX-remotion/rules/subtitles.md +36 -0
- package/opencode/skills/KORTIX-remotion/rules/tailwind.md +11 -0
- package/opencode/skills/KORTIX-remotion/rules/text-animations.md +20 -0
- package/opencode/skills/KORTIX-remotion/rules/timing.md +179 -0
- package/opencode/skills/KORTIX-remotion/rules/transcribe-captions.md +70 -0
- package/opencode/skills/KORTIX-remotion/rules/transitions.md +197 -0
- package/opencode/skills/KORTIX-remotion/rules/transparent-videos.md +106 -0
- package/opencode/skills/KORTIX-remotion/rules/trimming.md +53 -0
- package/opencode/skills/KORTIX-remotion/rules/videos.md +171 -0
- package/opencode/skills/KORTIX-secrets/SKILL.md +280 -0
- package/opencode/skills/KORTIX-semantic-search/SKILL.md +213 -0
- package/opencode/skills/KORTIX-session-search/SKILL.md +807 -0
- package/opencode/skills/KORTIX-session-search/Untitled +1 -0
- package/opencode/skills/KORTIX-skill-creator/SKILL.md +163 -0
- package/opencode/skills/KORTIX-web-research/SKILL.md +69 -0
- package/opencode/skills/KORTIX-xlsx/LICENSE.txt +30 -0
- package/opencode/skills/KORTIX-xlsx/SKILL.md +549 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/helpers/__init__.py +0 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/helpers/merge_runs.py +199 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/helpers/simplify_redlines.py +197 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/pack.py +159 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +1499 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd +146 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +1085 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd +11 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd +3081 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-picture.xsd +23 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd +185 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +287 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd +1676 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd +28 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd +144 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +174 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd +25 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd +18 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd +59 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd +56 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd +195 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd +582 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd +25 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd +4439 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd +570 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd +509 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd +12 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd +108 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd +96 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/wml.xsd +3646 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd +116 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd +42 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd +50 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd +49 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd +33 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/mce/mc.xsd +75 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/microsoft/wml-2010.xsd +560 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/microsoft/wml-2012.xsd +67 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/microsoft/wml-2018.xsd +14 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/microsoft/wml-cex-2018.xsd +20 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/microsoft/wml-cid-2016.xsd +13 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/microsoft/wml-sdtdatahash-2020.xsd +4 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/microsoft/wml-symex-2015.xsd +8 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/soffice.py +183 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/unpack.py +132 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/validate.py +111 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/validators/__init__.py +15 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/validators/base.py +847 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/validators/docx.py +446 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/validators/pptx.py +275 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/validators/redlining.py +247 -0
- package/opencode/skills/KORTIX-xlsx/scripts/recalc.py +184 -0
- package/opencode/tools/image-gen.ts +342 -0
- package/opencode/tools/image-search.ts +190 -0
- package/opencode/tools/memory-get.ts +168 -0
- package/opencode/tools/memory-search.ts +247 -0
- package/opencode/tools/presentation-gen.ts +723 -0
- package/opencode/tools/scrape-webpage.ts +115 -0
- package/opencode/tools/scripts/.python-version +1 -0
- package/opencode/tools/scripts/convert_pdf.py +184 -0
- package/opencode/tools/scripts/convert_pptx.py +562 -0
- package/opencode/tools/scripts/pyproject.toml +11 -0
- package/opencode/tools/scripts/uv.lock +287 -0
- package/opencode/tools/scripts/validate_slide.py +74 -0
- package/opencode/tools/show-user.ts +217 -0
- package/opencode/tools/tests/e2e-presentation-fix.ts +277 -0
- package/opencode/tools/tests/image-gen.test.ts +215 -0
- package/opencode/tools/tests/image-search.test.ts +125 -0
- package/opencode/tools/tests/memory-system-benchmark.ts +1076 -0
- package/opencode/tools/tests/presentation-gen.test.ts +389 -0
- package/opencode/tools/tests/scrape-webpage.test.ts +74 -0
- package/opencode/tools/tests/show-user.test.ts +241 -0
- package/opencode/tools/tests/video-gen.test.ts +110 -0
- package/opencode/tools/tests/web-search.test.ts +106 -0
- package/opencode/tools/video-gen.ts +200 -0
- package/opencode/tools/web-search.ts +153 -0
- package/opencode/tsconfig.json +29 -0
- package/package.json +36 -0
- package/patch-agent-browser.js +100 -0
- package/postinstall.sh +88 -0
- package/services/KORTIX-presentation-viewer/run +37 -0
- package/services/agent-browser-viewer/run +48 -0
- package/services/kortix-master/run +16 -0
- package/services/lss-sync/run +22 -0
- package/services/opencode-serve/run +25 -0
- package/services/opencode-web/run +21 -0
|
@@ -0,0 +1,197 @@
|
|
|
1
|
+
"""Simplify tracked changes by merging adjacent w:ins or w:del elements.
|
|
2
|
+
|
|
3
|
+
Merges adjacent <w:ins> elements from the same author into a single element.
|
|
4
|
+
Same for <w:del> elements. This makes heavily-redlined documents easier to
|
|
5
|
+
work with by reducing the number of tracked change wrappers.
|
|
6
|
+
|
|
7
|
+
Rules:
|
|
8
|
+
- Only merges w:ins with w:ins, w:del with w:del (same element type)
|
|
9
|
+
- Only merges if same author (ignores timestamp differences)
|
|
10
|
+
- Only merges if truly adjacent (only whitespace between them)
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
import xml.etree.ElementTree as ET
|
|
14
|
+
import zipfile
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
|
|
17
|
+
import defusedxml.minidom
|
|
18
|
+
|
|
19
|
+
WORD_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def simplify_redlines(input_dir: str) -> tuple[int, str]:
    """Merge adjacent same-author tracked changes in ``word/document.xml``.

    Every element returned by ``_find_elements`` for the tags ``p`` and
    ``tc`` is processed; inside each one, ``ins`` wrappers are merged first
    and ``del`` wrappers second. The document is then rewritten in place as
    UTF-8.

    Args:
        input_dir: Directory that contains ``word/document.xml``.

    Returns:
        ``(merge_count, message)``. On any failure the count is 0 and the
        message starts with ``"Error: "``; no exception escapes.
    """
    document_path = Path(input_dir) / "word" / "document.xml"
    if not document_path.exists():
        return 0, f"Error: {document_path} not found"

    try:
        xml_text = document_path.read_text(encoding="utf-8")
        dom = defusedxml.minidom.parseString(xml_text)
        top = dom.documentElement

        paragraphs = _find_elements(top, "p")
        cells = _find_elements(top, "tc")

        merged = 0
        for holder in paragraphs + cells:
            for kind in ("ins", "del"):
                merged += _merge_tracked_changes_in(holder, kind)

        document_path.write_bytes(dom.toxml(encoding="UTF-8"))
    except Exception as e:
        # Deliberate catch-all: failures are reported through the message.
        return 0, f"Error: {e}"

    return merged, f"Simplified {merged} tracked changes"
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def _merge_tracked_changes_in(container, tag: str) -> int:
    """Collapse mergeable ``tag`` elements that are direct children of *container*.

    Only direct children are considered. Each candidate is compared with the
    element currently being accumulated into; when ``_can_merge_tracked``
    accepts the pair, the candidate's children are moved over and the
    emptied candidate is removed from *container*.

    Returns:
        The number of elements that were merged away.
    """
    candidates = [
        node
        for node in container.childNodes
        if node.nodeType == node.ELEMENT_NODE and _is_element(node, tag)
    ]
    if len(candidates) < 2:
        return 0

    merged = 0
    anchor = candidates[0]
    for follower in candidates[1:]:
        if _can_merge_tracked(anchor, follower):
            _merge_tracked_content(anchor, follower)
            container.removeChild(follower)
            merged += 1
        else:
            # Not mergeable: the follower becomes the new accumulation target.
            anchor = follower
    return merged
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def _is_element(node, tag: str) -> bool:
    """Return True when *node*'s name is *tag*, with or without a prefix.

    The local name is used when the node has one; otherwise the full tag
    name is compared, accepting any ``prefix:tag`` spelling.
    """
    label = node.localName if node.localName else node.tagName
    if label == tag:
        return True
    return label.endswith(":" + tag)
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def _get_author(elem) -> str:
    """Return the author recorded on a tracked-change element.

    ``w:author`` is looked up by its qualified name first. If that yields
    nothing, the first attribute whose local name is ``author`` (or whose
    name ends in ``:author``) is used. With no such attribute the empty
    lookup result is returned.
    """
    direct = elem.getAttribute("w:author")
    if direct:
        return direct

    for candidate in elem.attributes.values():
        named_author = candidate.localName == "author"
        if named_author or candidate.name.endswith(":author"):
            return candidate.value
    return direct
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def _can_merge_tracked(elem1, elem2) -> bool:
    """Decide whether *elem2* may be folded into *elem1*.

    Both elements must report the same author via ``_get_author``, and the
    siblings that follow *elem1* up to *elem2* must contain no element and
    no text other than whitespace. If the sibling chain ends before
    *elem2* is reached, the walk simply stops and the result is True.
    """
    same_author = _get_author(elem1) == _get_author(elem2)
    if not same_author:
        return False

    cursor = elem1.nextSibling
    while cursor and cursor != elem2:
        blocks_merge = cursor.nodeType == cursor.ELEMENT_NODE or (
            cursor.nodeType == cursor.TEXT_NODE and bool(cursor.data.strip())
        )
        if blocks_merge:
            return False
        cursor = cursor.nextSibling
    return True
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def _merge_tracked_content(target, source):
    """Move every child of *source* to the end of *target*, keeping order.

    *source* is left without children; removing it from its parent is the
    caller's job.
    """
    while (moved := source.firstChild):
        source.removeChild(moved)
        target.appendChild(moved)
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def _find_elements(root, tag: str) -> list:
    """Collect *root* and all its descendants whose name matches *tag*.

    A name matches when its local name (or tag name, if there is no local
    name) equals *tag* or ends with ``:tag``. Results are in document
    order. Nodes that are not elements are neither matched nor descended
    into.
    """
    suffix = f":{tag}"
    matches = []
    pending = [root]
    while pending:
        current = pending.pop()
        if current.nodeType != current.ELEMENT_NODE:
            continue
        label = current.localName or current.tagName
        if label == tag or label.endswith(suffix):
            matches.append(current)
        # Push children reversed so they are popped in document order.
        pending.extend(reversed(current.childNodes))
    return matches
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def get_tracked_change_authors(doc_xml_path: Path) -> dict[str, int]:
    """Count tracked changes per author in a ``document.xml`` file.

    Args:
        doc_xml_path: Path to the XML file to inspect.

    Returns:
        A mapping from author name to the number of ``w:ins`` and ``w:del``
        elements carrying that author. Elements without an author are
        ignored. A missing or unparseable file yields an empty mapping.
    """
    if not doc_xml_path.exists():
        return {}

    try:
        document_root = ET.parse(doc_xml_path).getroot()
    except ET.ParseError:
        return {}

    ns_map = {"w": WORD_NS}
    author_key = "{" + WORD_NS + "}author"

    tally: dict[str, int] = {}
    for kind in ("ins", "del"):
        for change in document_root.findall(f".//w:{kind}", ns_map):
            name = change.get(author_key)
            if not name:
                continue
            tally[name] = tally.get(name, 0) + 1
    return tally
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def _get_authors_from_docx(docx_path: Path) -> dict[str, int]:
    """Count tracked changes per author inside a packed ``.docx`` archive.

    Reads ``word/document.xml`` straight from the zip file and tallies the
    ``w:ins`` / ``w:del`` elements by their author attribute.

    Returns:
        Author name -> change count. Empty when the archive is not a valid
        zip, lacks ``word/document.xml``, or that member is not parseable
        XML.
    """
    member = "word/document.xml"
    try:
        with zipfile.ZipFile(docx_path, "r") as archive:
            if member not in archive.namelist():
                return {}
            with archive.open(member) as stream:
                document_root = ET.parse(stream).getroot()
    except (zipfile.BadZipFile, ET.ParseError):
        return {}

    ns_map = {"w": WORD_NS}
    author_key = "{" + WORD_NS + "}author"

    tally: dict[str, int] = {}
    for kind in ("ins", "del"):
        for change in document_root.findall(f".//w:{kind}", ns_map):
            name = change.get(author_key)
            if not name:
                continue
            tally[name] = tally.get(name, 0) + 1
    return tally
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def infer_author(modified_dir: Path, original_docx: Path, default: str = "Claude") -> str:
    """Work out which author added tracked changes relative to the original.

    Per-author change counts in ``modified_dir/word/document.xml`` are
    compared with those in *original_docx*; an author counts as having
    added changes when their count grew.

    Args:
        modified_dir: Unpacked document directory.
        original_docx: The packed original document.
        default: Returned when no author has a higher count than before.

    Returns:
        The single author whose count grew, or *default*.

    Raises:
        ValueError: If more than one author's count grew.
    """
    after = get_tracked_change_authors(modified_dir / "word" / "document.xml")
    if not after:
        return default

    before = _get_authors_from_docx(original_docx)
    gained = {
        name: total - before.get(name, 0)
        for name, total in after.items()
        if total - before.get(name, 0) > 0
    }

    if not gained:
        return default
    if len(gained) > 1:
        raise ValueError(
            f"Multiple authors added new changes: {gained}. "
            "Cannot infer which author to validate."
        )

    (sole_author,) = gained
    return sole_author
|
|
@@ -0,0 +1,159 @@
|
|
|
1
|
+
"""Pack a directory into a DOCX, PPTX, or XLSX file.
|
|
2
|
+
|
|
3
|
+
Validates with auto-repair, condenses XML formatting, and creates the Office file.
|
|
4
|
+
|
|
5
|
+
Usage:
|
|
6
|
+
python pack.py <input_directory> <output_file> [--original <file>] [--validate true|false]
|
|
7
|
+
|
|
8
|
+
Examples:
|
|
9
|
+
python pack.py unpacked/ output.docx --original input.docx
|
|
10
|
+
python pack.py unpacked/ output.pptx --validate false
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
import argparse
|
|
14
|
+
import sys
|
|
15
|
+
import shutil
|
|
16
|
+
import tempfile
|
|
17
|
+
import zipfile
|
|
18
|
+
from pathlib import Path
|
|
19
|
+
|
|
20
|
+
import defusedxml.minidom
|
|
21
|
+
|
|
22
|
+
from validators import DOCXSchemaValidator, PPTXSchemaValidator, RedliningValidator
|
|
23
|
+
|
|
24
|
+
def pack(
|
|
25
|
+
input_directory: str,
|
|
26
|
+
output_file: str,
|
|
27
|
+
original_file: str | None = None,
|
|
28
|
+
validate: bool = True,
|
|
29
|
+
infer_author_func=None,
|
|
30
|
+
) -> tuple[None, str]:
|
|
31
|
+
input_dir = Path(input_directory)
|
|
32
|
+
output_path = Path(output_file)
|
|
33
|
+
suffix = output_path.suffix.lower()
|
|
34
|
+
|
|
35
|
+
if not input_dir.is_dir():
|
|
36
|
+
return None, f"Error: {input_dir} is not a directory"
|
|
37
|
+
|
|
38
|
+
if suffix not in {".docx", ".pptx", ".xlsx"}:
|
|
39
|
+
return None, f"Error: {output_file} must be a .docx, .pptx, or .xlsx file"
|
|
40
|
+
|
|
41
|
+
if validate and original_file:
|
|
42
|
+
original_path = Path(original_file)
|
|
43
|
+
if original_path.exists():
|
|
44
|
+
success, output = _run_validation(
|
|
45
|
+
input_dir, original_path, suffix, infer_author_func
|
|
46
|
+
)
|
|
47
|
+
if output:
|
|
48
|
+
print(output)
|
|
49
|
+
if not success:
|
|
50
|
+
return None, f"Error: Validation failed for {input_dir}"
|
|
51
|
+
|
|
52
|
+
with tempfile.TemporaryDirectory() as temp_dir:
|
|
53
|
+
temp_content_dir = Path(temp_dir) / "content"
|
|
54
|
+
shutil.copytree(input_dir, temp_content_dir)
|
|
55
|
+
|
|
56
|
+
for pattern in ["*.xml", "*.rels"]:
|
|
57
|
+
for xml_file in temp_content_dir.rglob(pattern):
|
|
58
|
+
_condense_xml(xml_file)
|
|
59
|
+
|
|
60
|
+
output_path.parent.mkdir(parents=True, exist_ok=True)
|
|
61
|
+
with zipfile.ZipFile(output_path, "w", zipfile.ZIP_DEFLATED) as zf:
|
|
62
|
+
for f in temp_content_dir.rglob("*"):
|
|
63
|
+
if f.is_file():
|
|
64
|
+
zf.write(f, f.relative_to(temp_content_dir))
|
|
65
|
+
|
|
66
|
+
return None, f"Successfully packed {input_dir} to {output_file}"
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def _run_validation(
|
|
70
|
+
unpacked_dir: Path,
|
|
71
|
+
original_file: Path,
|
|
72
|
+
suffix: str,
|
|
73
|
+
infer_author_func=None,
|
|
74
|
+
) -> tuple[bool, str | None]:
|
|
75
|
+
output_lines = []
|
|
76
|
+
validators = []
|
|
77
|
+
|
|
78
|
+
if suffix == ".docx":
|
|
79
|
+
author = "Claude"
|
|
80
|
+
if infer_author_func:
|
|
81
|
+
try:
|
|
82
|
+
author = infer_author_func(unpacked_dir, original_file)
|
|
83
|
+
except ValueError as e:
|
|
84
|
+
print(f"Warning: {e} Using default author 'Claude'.", file=sys.stderr)
|
|
85
|
+
|
|
86
|
+
validators = [
|
|
87
|
+
DOCXSchemaValidator(unpacked_dir, original_file),
|
|
88
|
+
RedliningValidator(unpacked_dir, original_file, author=author),
|
|
89
|
+
]
|
|
90
|
+
elif suffix == ".pptx":
|
|
91
|
+
validators = [PPTXSchemaValidator(unpacked_dir, original_file)]
|
|
92
|
+
|
|
93
|
+
if not validators:
|
|
94
|
+
return True, None
|
|
95
|
+
|
|
96
|
+
total_repairs = sum(v.repair() for v in validators)
|
|
97
|
+
if total_repairs:
|
|
98
|
+
output_lines.append(f"Auto-repaired {total_repairs} issue(s)")
|
|
99
|
+
|
|
100
|
+
success = all(v.validate() for v in validators)
|
|
101
|
+
|
|
102
|
+
if success:
|
|
103
|
+
output_lines.append("All validations PASSED!")
|
|
104
|
+
|
|
105
|
+
return success, "\n".join(output_lines) if output_lines else None
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def _condense_xml(xml_file: Path) -> None:
    """Strip whitespace-only text nodes and comments from an XML file in place.

    Direct children of elements whose tag name ends in ``:t`` are left
    untouched. The file is rewritten as UTF-8.

    Raises:
        Exception: Any failure is reported on stderr and then re-raised.
    """
    try:
        with open(xml_file, encoding="utf-8") as handle:
            document = defusedxml.minidom.parse(handle)

        for node in document.getElementsByTagName("*"):
            if node.tagName.endswith(":t"):
                continue

            disposable = [
                kid
                for kid in node.childNodes
                if kid.nodeType == kid.COMMENT_NODE
                or (
                    kid.nodeType == kid.TEXT_NODE
                    and kid.nodeValue
                    and kid.nodeValue.strip() == ""
                )
            ]
            for kid in disposable:
                node.removeChild(kid)

        xml_file.write_bytes(document.toxml(encoding="UTF-8"))
    except Exception as e:
        print(f"ERROR: Failed to parse {xml_file.name}: {e}", file=sys.stderr)
        raise
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
if __name__ == "__main__":
    # Command-line entry point: parse arguments, pack, print the resulting
    # message, and exit with status 1 when packing failed.
    parser = argparse.ArgumentParser(
        description="Pack a directory into a DOCX, PPTX, or XLSX file"
    )
    parser.add_argument("input_directory", help="Unpacked Office document directory")
    parser.add_argument("output_file", help="Output Office file (.docx/.pptx/.xlsx)")
    parser.add_argument(
        "--original",
        help="Original file for validation comparison",
    )
    parser.add_argument(
        "--validate",
        # Only the literal "true" (any letter case) enables validation;
        # every other value is read as false.
        type=lambda x: x.lower() == "true",
        default=True,
        metavar="true|false",
        help="Run validation with auto-repair (default: true)",
    )
    args = parser.parse_args()

    _, message = pack(
        args.input_directory,
        args.output_file,
        original_file=args.original,
        validate=args.validate,
    )
    print(message)

    # pack() reports every failure as a message beginning with "Error:".
    # Testing the prefix, not a substring, keeps a successful run from
    # exiting non-zero merely because a path contains the word "Error".
    if message.startswith("Error:"):
        sys.exit(1)
|
|
@@ -0,0 +1,183 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Helper for running LibreOffice (soffice) in environments where AF_UNIX
|
|
3
|
+
sockets may be blocked (e.g., sandboxed VMs). Detects the restriction
|
|
4
|
+
at runtime and applies an LD_PRELOAD shim if needed.
|
|
5
|
+
|
|
6
|
+
Usage:
|
|
7
|
+
from office.soffice import run_soffice, get_soffice_env
|
|
8
|
+
|
|
9
|
+
# Option 1 – run soffice directly
|
|
10
|
+
result = run_soffice(["--headless", "--convert-to", "pdf", "input.docx"])
|
|
11
|
+
|
|
12
|
+
# Option 2 – get env dict for your own subprocess calls
|
|
13
|
+
env = get_soffice_env()
|
|
14
|
+
subprocess.run(["soffice", ...], env=env)
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
import os
|
|
18
|
+
import socket
|
|
19
|
+
import subprocess
|
|
20
|
+
import tempfile
|
|
21
|
+
from pathlib import Path
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def get_soffice_env() -> dict:
    """Build the environment mapping to use when launching soffice.

    Starts from the current process environment, sets
    ``SAL_USE_VCLPLUGIN`` to ``"svp"``, and, when ``_needs_shim`` reports
    that the shim is required, sets ``LD_PRELOAD`` to the shim library
    path (replacing any existing value).
    """
    overrides = {"SAL_USE_VCLPLUGIN": "svp"}
    if _needs_shim():
        overrides["LD_PRELOAD"] = str(_ensure_shim())
    return {**os.environ, **overrides}
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def run_soffice(args: list[str], **kwargs) -> subprocess.CompletedProcess:
    """Run ``soffice`` with *args* under the environment from ``get_soffice_env``.

    Extra keyword arguments are passed straight to ``subprocess.run``.
    """
    command = ["soffice"] + args
    launch_env = get_soffice_env()
    return subprocess.run(command, env=launch_env, **kwargs)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
# Location of the compiled shim library inside the system temp directory.
_SHIM_SO = Path(tempfile.gettempdir()).joinpath("lo_socket_shim.so")
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _needs_shim() -> bool:
    """Report whether creating an AF_UNIX stream socket fails here.

    Returns:
        ``True`` when opening (or closing) a probe AF_UNIX socket raises
        ``OSError``. ``False`` when the probe succeeds, and also when this
        platform's ``socket`` module does not define ``AF_UNIX`` at all,
        instead of failing with ``AttributeError``.
    """
    af_unix = getattr(socket, "AF_UNIX", None)
    if af_unix is None:
        # No AF_UNIX constant on this platform: nothing to probe.
        return False

    try:
        # The context manager closes the probe socket; an OSError raised by
        # either the creation or the close is treated as "blocked".
        with socket.socket(af_unix, socket.SOCK_STREAM):
            return False
    except OSError:
        return True
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def _ensure_shim() -> Path:
    """Return the path of the compiled socket shim, building it if missing.

    If ``_SHIM_SO`` already exists it is returned as-is. Otherwise
    ``_SHIM_SOURCE`` is written to a private temporary directory, compiled
    with ``gcc -shared -fPIC ... -ldl``, and the finished library is moved
    onto ``_SHIM_SO`` with an atomic rename. Building in a scratch directory
    means:

    * the ``.c`` file and any partial output are removed even when gcc fails;
    * another process can never observe a half-written ``_SHIM_SO``.

    Returns:
        ``_SHIM_SO``.

    Raises:
        subprocess.CalledProcessError: gcc exited with a non-zero status.
        OSError: gcc could not be started or a file operation failed.
    """
    # NOTE(review): _SHIM_SO is a fixed file name in the shared temp
    # directory and an existing file is trusted without verification;
    # confirm that is acceptable where this runs.
    if _SHIM_SO.exists():
        return _SHIM_SO

    # Create the scratch directory next to the target so os.replace() stays
    # on one filesystem and is therefore atomic.
    with tempfile.TemporaryDirectory(
        prefix="lo_socket_shim_", dir=str(_SHIM_SO.parent)
    ) as build_dir:
        src = Path(build_dir) / "lo_socket_shim.c"
        built = Path(build_dir) / _SHIM_SO.name
        src.write_text(_SHIM_SOURCE)
        subprocess.run(
            ["gcc", "-shared", "-fPIC", "-o", str(built), str(src), "-ldl"],
            check=True,
            capture_output=True,
        )
        os.replace(built, _SHIM_SO)

    return _SHIM_SO
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
_SHIM_SOURCE = r"""
|
|
70
|
+
#define _GNU_SOURCE
|
|
71
|
+
#include <dlfcn.h>
|
|
72
|
+
#include <errno.h>
|
|
73
|
+
#include <signal.h>
|
|
74
|
+
#include <stdio.h>
|
|
75
|
+
#include <stdlib.h>
|
|
76
|
+
#include <sys/socket.h>
|
|
77
|
+
#include <unistd.h>
|
|
78
|
+
|
|
79
|
+
static int (*real_socket)(int, int, int);
|
|
80
|
+
static int (*real_socketpair)(int, int, int, int[2]);
|
|
81
|
+
static int (*real_listen)(int, int);
|
|
82
|
+
static int (*real_accept)(int, struct sockaddr *, socklen_t *);
|
|
83
|
+
static int (*real_close)(int);
|
|
84
|
+
static int (*real_read)(int, void *, size_t);
|
|
85
|
+
|
|
86
|
+
/* Per-FD bookkeeping (FDs >= 1024 are passed through unshimmed). */
|
|
87
|
+
static int is_shimmed[1024];
|
|
88
|
+
static int peer_of[1024];
|
|
89
|
+
static int wake_r[1024]; /* accept() blocks reading this */
|
|
90
|
+
static int wake_w[1024]; /* close() writes to this */
|
|
91
|
+
static int listener_fd = -1; /* FD that received listen() */
|
|
92
|
+
|
|
93
|
+
__attribute__((constructor))
|
|
94
|
+
static void init(void) {
|
|
95
|
+
real_socket = dlsym(RTLD_NEXT, "socket");
|
|
96
|
+
real_socketpair = dlsym(RTLD_NEXT, "socketpair");
|
|
97
|
+
real_listen = dlsym(RTLD_NEXT, "listen");
|
|
98
|
+
real_accept = dlsym(RTLD_NEXT, "accept");
|
|
99
|
+
real_close = dlsym(RTLD_NEXT, "close");
|
|
100
|
+
real_read = dlsym(RTLD_NEXT, "read");
|
|
101
|
+
for (int i = 0; i < 1024; i++) {
|
|
102
|
+
peer_of[i] = -1;
|
|
103
|
+
wake_r[i] = -1;
|
|
104
|
+
wake_w[i] = -1;
|
|
105
|
+
}
|
|
106
|
+
}
|
|
107
|
+
|
|
108
|
+
/* ---- socket ---------------------------------------------------------- */
|
|
109
|
+
int socket(int domain, int type, int protocol) {
|
|
110
|
+
if (domain == AF_UNIX) {
|
|
111
|
+
int fd = real_socket(domain, type, protocol);
|
|
112
|
+
if (fd >= 0) return fd;
|
|
113
|
+
/* socket(AF_UNIX) blocked – fall back to socketpair(). */
|
|
114
|
+
int sv[2];
|
|
115
|
+
if (real_socketpair(domain, type, protocol, sv) == 0) {
|
|
116
|
+
if (sv[0] >= 0 && sv[0] < 1024) {
|
|
117
|
+
is_shimmed[sv[0]] = 1;
|
|
118
|
+
peer_of[sv[0]] = sv[1];
|
|
119
|
+
int wp[2];
|
|
120
|
+
if (pipe(wp) == 0) {
|
|
121
|
+
wake_r[sv[0]] = wp[0];
|
|
122
|
+
wake_w[sv[0]] = wp[1];
|
|
123
|
+
}
|
|
124
|
+
}
|
|
125
|
+
return sv[0];
|
|
126
|
+
}
|
|
127
|
+
errno = EPERM;
|
|
128
|
+
return -1;
|
|
129
|
+
}
|
|
130
|
+
return real_socket(domain, type, protocol);
|
|
131
|
+
}
|
|
132
|
+
|
|
133
|
+
/* ---- listen ---------------------------------------------------------- */
|
|
134
|
+
int listen(int sockfd, int backlog) {
|
|
135
|
+
if (sockfd >= 0 && sockfd < 1024 && is_shimmed[sockfd]) {
|
|
136
|
+
listener_fd = sockfd;
|
|
137
|
+
return 0;
|
|
138
|
+
}
|
|
139
|
+
return real_listen(sockfd, backlog);
|
|
140
|
+
}
|
|
141
|
+
|
|
142
|
+
/* ---- accept ---------------------------------------------------------- */
|
|
143
|
+
int accept(int sockfd, struct sockaddr *addr, socklen_t *addrlen) {
|
|
144
|
+
if (sockfd >= 0 && sockfd < 1024 && is_shimmed[sockfd]) {
|
|
145
|
+
/* Block until close() writes to the wake pipe. */
|
|
146
|
+
if (wake_r[sockfd] >= 0) {
|
|
147
|
+
char buf;
|
|
148
|
+
real_read(wake_r[sockfd], &buf, 1);
|
|
149
|
+
}
|
|
150
|
+
errno = ECONNABORTED;
|
|
151
|
+
return -1;
|
|
152
|
+
}
|
|
153
|
+
return real_accept(sockfd, addr, addrlen);
|
|
154
|
+
}
|
|
155
|
+
|
|
156
|
+
/* ---- close ----------------------------------------------------------- */
|
|
157
|
+
int close(int fd) {
|
|
158
|
+
if (fd >= 0 && fd < 1024 && is_shimmed[fd]) {
|
|
159
|
+
int was_listener = (fd == listener_fd);
|
|
160
|
+
is_shimmed[fd] = 0;
|
|
161
|
+
|
|
162
|
+
if (wake_w[fd] >= 0) { /* unblock accept() */
|
|
163
|
+
char c = 0;
|
|
164
|
+
write(wake_w[fd], &c, 1);
|
|
165
|
+
real_close(wake_w[fd]);
|
|
166
|
+
wake_w[fd] = -1;
|
|
167
|
+
}
|
|
168
|
+
if (wake_r[fd] >= 0) { real_close(wake_r[fd]); wake_r[fd] = -1; }
|
|
169
|
+
if (peer_of[fd] >= 0) { real_close(peer_of[fd]); peer_of[fd] = -1; }
|
|
170
|
+
|
|
171
|
+
if (was_listener)
|
|
172
|
+
_exit(0); /* conversion done – exit */
|
|
173
|
+
}
|
|
174
|
+
return real_close(fd);
|
|
175
|
+
}
|
|
176
|
+
"""
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
if __name__ == "__main__":
    # Script entry point: pass every command-line argument straight to
    # soffice and exit with the status code it returned.
    import sys

    sys.exit(run_soffice(sys.argv[1:]).returncode)
|
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
"""Unpack Office files (DOCX, PPTX, XLSX) for editing.
|
|
2
|
+
|
|
3
|
+
Extracts the ZIP archive, pretty-prints XML files, and optionally:
|
|
4
|
+
- Merges adjacent runs with identical formatting (DOCX only)
|
|
5
|
+
- Simplifies adjacent tracked changes from same author (DOCX only)
|
|
6
|
+
|
|
7
|
+
Usage:
|
|
8
|
+
python unpack.py <office_file> <output_dir> [options]
|
|
9
|
+
|
|
10
|
+
Examples:
|
|
11
|
+
python unpack.py document.docx unpacked/
|
|
12
|
+
python unpack.py presentation.pptx unpacked/
|
|
13
|
+
python unpack.py document.docx unpacked/ --merge-runs false
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
import argparse
|
|
17
|
+
import sys
|
|
18
|
+
import zipfile
|
|
19
|
+
from pathlib import Path
|
|
20
|
+
|
|
21
|
+
import defusedxml.minidom
|
|
22
|
+
|
|
23
|
+
from helpers.merge_runs import merge_runs as do_merge_runs
|
|
24
|
+
from helpers.simplify_redlines import simplify_redlines as do_simplify_redlines
|
|
25
|
+
|
|
26
|
+
# Maps each typographic quote character to the XML numeric character
# reference that _escape_smart_quotes() writes in its place. The values must
# be the escaped form; mapping a character to itself makes the pass a no-op.
SMART_QUOTE_REPLACEMENTS = {
    "\u201c": "&#x201C;",  # left double quotation mark
    "\u201d": "&#x201D;",  # right double quotation mark
    "\u2018": "&#x2018;",  # left single quotation mark
    "\u2019": "&#x2019;",  # right single quotation mark
}
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def unpack(
    input_file: str,
    output_directory: str,
    merge_runs: bool = True,
    simplify_redlines: bool = True,
) -> tuple[None, str]:
    """Extract an Office archive into a directory and tidy its XML parts.

    Steps, in order: create the output directory, extract the ZIP, pretty-print
    every ``*.xml`` and ``*.rels`` file, then (``.docx`` only) simplify tracked
    changes and merge runs when the respective flags are set, and finally run
    the smart-quote escaping pass over the same XML files.

    Args:
        input_file: Path to a ``.docx``, ``.pptx`` or ``.xlsx`` file.
        output_directory: Directory to extract into; created if missing.
        merge_runs: Run the run-merging helper (``.docx`` only).
        simplify_redlines: Run the tracked-change helper (``.docx`` only).

    Returns:
        ``(None, message)``. Failures are not raised: they are reported as a
        message beginning with ``"Error"``.
    """
    source = Path(input_file)
    destination = Path(output_directory)
    extension = source.suffix.lower()

    # Guard clauses: existence is checked before the extension.
    if not source.exists():
        return None, f"Error: {input_file} does not exist"
    if extension not in {".docx", ".pptx", ".xlsx"}:
        return None, f"Error: {input_file} must be a .docx, .pptx, or .xlsx file"

    try:
        destination.mkdir(parents=True, exist_ok=True)

        with zipfile.ZipFile(source, "r") as archive:
            archive.extractall(destination)

        # Collected once, before the DOCX helpers run; *.xml first, then *.rels.
        xml_parts = [
            part
            for pattern in ("*.xml", "*.rels")
            for part in destination.rglob(pattern)
        ]
        for part in xml_parts:
            _pretty_print_xml(part)

        message = f"Unpacked {input_file} ({len(xml_parts)} XML files)"

        is_docx = extension == ".docx"
        if is_docx and simplify_redlines:
            simplify_count, _ = do_simplify_redlines(str(destination))
            message += f", simplified {simplify_count} tracked changes"
        if is_docx and merge_runs:
            merge_count, _ = do_merge_runs(str(destination))
            message += f", merged {merge_count} runs"

        for part in xml_parts:
            _escape_smart_quotes(part)

        return None, message
    except zipfile.BadZipFile:
        return None, f"Error: {input_file} is not a valid Office file"
    except Exception as e:
        return None, f"Error unpacking: {e}"
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def _pretty_print_xml(xml_file: Path) -> None:
    """Rewrite ``xml_file`` in place as indented, UTF-8 encoded XML.

    The file is read as UTF-8 text, parsed with ``defusedxml.minidom`` and
    written back via ``toprettyxml``. This is best-effort: any exception
    raised while reading, parsing or writing is swallowed.
    """
    try:
        document = defusedxml.minidom.parseString(
            xml_file.read_text(encoding="utf-8")
        )
        pretty_bytes = document.toprettyxml(indent=" ", encoding="utf-8")
        xml_file.write_bytes(pretty_bytes)
    except Exception:
        # Deliberately ignored so one unparsable part does not abort unpacking.
        pass
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def _escape_smart_quotes(xml_file: Path) -> None:
    """Apply every SMART_QUOTE_REPLACEMENTS substitution to ``xml_file`` in place.

    The file is read and written as UTF-8 text. This is best-effort: any
    exception raised along the way is swallowed.
    """
    try:
        text = xml_file.read_text(encoding="utf-8")
        for quote_char, replacement in SMART_QUOTE_REPLACEMENTS.items():
            text = text.replace(quote_char, replacement)
        xml_file.write_text(text, encoding="utf-8")
    except Exception:
        # Deliberately ignored so one unreadable part does not abort unpacking.
        pass
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
if __name__ == "__main__":
    # Command-line entry point: parse arguments, run unpack(), print its
    # message and exit with status 1 when unpack() reported a failure.
    parser = argparse.ArgumentParser(
        description="Unpack an Office file (DOCX, PPTX, XLSX) for editing"
    )
    parser.add_argument("input_file", help="Office file to unpack")
    parser.add_argument("output_directory", help="Output directory")
    # Both flags take an explicit value; anything other than "true"
    # (case-insensitive) is treated as false.
    parser.add_argument(
        "--merge-runs",
        type=lambda x: x.lower() == "true",
        default=True,
        metavar="true|false",
        help="Merge adjacent runs with identical formatting (DOCX only, default: true)",
    )
    parser.add_argument(
        "--simplify-redlines",
        type=lambda x: x.lower() == "true",
        default=True,
        metavar="true|false",
        help="Merge adjacent tracked changes from same author (DOCX only, default: true)",
    )
    args = parser.parse_args()

    _, message = unpack(
        args.input_file,
        args.output_directory,
        merge_runs=args.merge_runs,
        simplify_redlines=args.simplify_redlines,
    )
    print(message)

    # Every failure message produced by unpack() begins with "Error". A
    # substring test would also fire on a successful run whose file name
    # happens to contain "Error", so anchor the check at the start.
    if message.startswith("Error"):
        sys.exit(1)
|