@kortix/sandbox 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config/customize.sh +143 -0
- package/config/kortix-env-setup.sh +25 -0
- package/kortix-master/package.json +22 -0
- package/kortix-master/src/config.ts +22 -0
- package/kortix-master/src/index.ts +44 -0
- package/kortix-master/src/routes/env.ts +65 -0
- package/kortix-master/src/routes/proxy.ts +108 -0
- package/kortix-master/src/routes/update.ts +185 -0
- package/kortix-master/src/services/proxy.ts +43 -0
- package/kortix-master/src/services/secret-store.ts +156 -0
- package/kortix-master/tsconfig.json +14 -0
- package/opencode/agents/kortix-browser.md +142 -0
- package/opencode/agents/kortix-build.md +62 -0
- package/opencode/agents/kortix-explore.md +66 -0
- package/opencode/agents/kortix-image-gen.md +33 -0
- package/opencode/agents/kortix-main.md +450 -0
- package/opencode/agents/kortix-plan.md +100 -0
- package/opencode/agents/kortix-research.md +84 -0
- package/opencode/agents/kortix-sheets.md +61 -0
- package/opencode/agents/kortix-slides.md +64 -0
- package/opencode/agents/kortix-web-dev.md +572 -0
- package/opencode/commands/email.md +36 -0
- package/opencode/commands/init.md +43 -0
- package/opencode/commands/journal.md +44 -0
- package/opencode/commands/memory-init.md +81 -0
- package/opencode/commands/memory-search.md +50 -0
- package/opencode/commands/memory-status.md +56 -0
- package/opencode/commands/research.md +36 -0
- package/opencode/commands/search.md +38 -0
- package/opencode/commands/slides.md +32 -0
- package/opencode/commands/spreadsheet.md +30 -0
- package/opencode/memory.json +37 -0
- package/opencode/ocx.jsonc +10 -0
- package/opencode/opencode.jsonc +103 -0
- package/opencode/package.json +25 -0
- package/opencode/patches/apply.sh +19 -0
- package/opencode/patches/opencode-pty-spawn.txt +49 -0
- package/opencode/plugin/background-agents.ts.disabled +483 -0
- package/opencode/plugin/kdco-primitives/get-project-id.ts +172 -0
- package/opencode/plugin/kdco-primitives/index.ts +26 -0
- package/opencode/plugin/kdco-primitives/log-warn.ts +51 -0
- package/opencode/plugin/kdco-primitives/mutex.ts +122 -0
- package/opencode/plugin/kdco-primitives/shell.ts +138 -0
- package/opencode/plugin/kdco-primitives/temp.ts +36 -0
- package/opencode/plugin/kdco-primitives/terminal-detect.ts +34 -0
- package/opencode/plugin/kdco-primitives/types.ts +13 -0
- package/opencode/plugin/kdco-primitives/with-timeout.ts +84 -0
- package/opencode/plugin/memory.ts +306 -0
- package/opencode/plugin/worktree/state.ts +412 -0
- package/opencode/plugin/worktree/terminal.ts +1002 -0
- package/opencode/plugin/worktree.ts +861 -0
- package/opencode/skills/KORTIX-browser/SKILL.md +478 -0
- package/opencode/skills/KORTIX-cron-triggers/SKILL.md +173 -0
- package/opencode/skills/KORTIX-deep-research/SKILL.md +278 -0
- package/opencode/skills/KORTIX-docx/SKILL.md +398 -0
- package/opencode/skills/KORTIX-docx/scripts/__init__.py +1 -0
- package/opencode/skills/KORTIX-docx/scripts/accept_changes.py +104 -0
- package/opencode/skills/KORTIX-docx/scripts/comment.py +244 -0
- package/opencode/skills/KORTIX-docx/scripts/office/helpers/__init__.py +0 -0
- package/opencode/skills/KORTIX-docx/scripts/office/helpers/merge_runs.py +199 -0
- package/opencode/skills/KORTIX-docx/scripts/office/helpers/simplify_redlines.py +197 -0
- package/opencode/skills/KORTIX-docx/scripts/office/pack.py +159 -0
- package/opencode/skills/KORTIX-docx/scripts/office/soffice.py +183 -0
- package/opencode/skills/KORTIX-docx/scripts/office/unpack.py +132 -0
- package/opencode/skills/KORTIX-docx/scripts/office/validate.py +111 -0
- package/opencode/skills/KORTIX-docx/scripts/office/validators/__init__.py +15 -0
- package/opencode/skills/KORTIX-docx/scripts/office/validators/base.py +847 -0
- package/opencode/skills/KORTIX-docx/scripts/office/validators/docx.py +446 -0
- package/opencode/skills/KORTIX-docx/scripts/office/validators/pptx.py +275 -0
- package/opencode/skills/KORTIX-docx/scripts/office/validators/redlining.py +247 -0
- package/opencode/skills/KORTIX-docx/scripts/render_docx.py +179 -0
- package/opencode/skills/KORTIX-docx/scripts/templates/comments.xml +3 -0
- package/opencode/skills/KORTIX-docx/scripts/templates/commentsExtended.xml +3 -0
- package/opencode/skills/KORTIX-docx/scripts/templates/commentsExtensible.xml +3 -0
- package/opencode/skills/KORTIX-docx/scripts/templates/commentsIds.xml +3 -0
- package/opencode/skills/KORTIX-docx/scripts/templates/people.xml +3 -0
- package/opencode/skills/KORTIX-domain-research/SKILL.md +96 -0
- package/opencode/skills/KORTIX-domain-research/scripts/domain-lookup.py +810 -0
- package/opencode/skills/KORTIX-elevenlabs/SKILL.md +230 -0
- package/opencode/skills/KORTIX-elevenlabs/scripts/tts.py +389 -0
- package/opencode/skills/KORTIX-email/SKILL.md +145 -0
- package/opencode/skills/KORTIX-legal-writer/SKILL.md +409 -0
- package/opencode/skills/KORTIX-legal-writer/references/bluebook.md +152 -0
- package/opencode/skills/KORTIX-legal-writer/references/document-types.md +416 -0
- package/opencode/skills/KORTIX-legal-writer/scripts/courtlistener.py +291 -0
- package/opencode/skills/KORTIX-legal-writer/scripts/ecfr_lookup.py +299 -0
- package/opencode/skills/KORTIX-legal-writer/scripts/verify-legal.py +507 -0
- package/opencode/skills/KORTIX-logo-creator/SKILL.md +293 -0
- package/opencode/skills/KORTIX-logo-creator/references/prompt-patterns.md +134 -0
- package/opencode/skills/KORTIX-logo-creator/scripts/compose_logo.py +406 -0
- package/opencode/skills/KORTIX-logo-creator/scripts/create_logo_sheet.py +258 -0
- package/opencode/skills/KORTIX-logo-creator/scripts/remove_bg.py +96 -0
- package/opencode/skills/KORTIX-memory/SKILL.md +261 -0
- package/opencode/skills/KORTIX-memory/scripts/export-sessions.py +409 -0
- package/opencode/skills/KORTIX-paper-creator/SKILL.md +549 -0
- package/opencode/skills/KORTIX-paper-creator/assets/template.tex +101 -0
- package/opencode/skills/KORTIX-paper-creator/scripts/compile.sh +177 -0
- package/opencode/skills/KORTIX-paper-creator/scripts/openalex_to_bibtex.py +220 -0
- package/opencode/skills/KORTIX-paper-creator/scripts/verify.sh +354 -0
- package/opencode/skills/KORTIX-paper-search/SKILL.md +418 -0
- package/opencode/skills/KORTIX-pdf/SKILL.md +232 -0
- package/opencode/skills/KORTIX-pdf/forms.md +36 -0
- package/opencode/skills/KORTIX-pdf/reference.md +105 -0
- package/opencode/skills/KORTIX-pdf/scripts/check_bounding_boxes.py +65 -0
- package/opencode/skills/KORTIX-pdf/scripts/check_fillable_fields.py +11 -0
- package/opencode/skills/KORTIX-pdf/scripts/convert_pdf_to_images.py +33 -0
- package/opencode/skills/KORTIX-pdf/scripts/create_validation_image.py +37 -0
- package/opencode/skills/KORTIX-pdf/scripts/extract_form_field_info.py +122 -0
- package/opencode/skills/KORTIX-pdf/scripts/extract_form_structure.py +115 -0
- package/opencode/skills/KORTIX-pdf/scripts/fill_fillable_fields.py +98 -0
- package/opencode/skills/KORTIX-pdf/scripts/fill_pdf_form_with_annotations.py +107 -0
- package/opencode/skills/KORTIX-plan/SKILL.md +228 -0
- package/opencode/skills/KORTIX-presentation-viewer/SKILL.md +87 -0
- package/opencode/skills/KORTIX-presentation-viewer/serve.ts +136 -0
- package/opencode/skills/KORTIX-presentation-viewer/viewer.html +559 -0
- package/opencode/skills/KORTIX-presentations/SKILL.md +344 -0
- package/opencode/skills/KORTIX-remotion/SKILL.md +56 -0
- package/opencode/skills/KORTIX-remotion/rules/3d.md +86 -0
- package/opencode/skills/KORTIX-remotion/rules/animations.md +29 -0
- package/opencode/skills/KORTIX-remotion/rules/assets.md +78 -0
- package/opencode/skills/KORTIX-remotion/rules/audio-visualization.md +198 -0
- package/opencode/skills/KORTIX-remotion/rules/audio.md +169 -0
- package/opencode/skills/KORTIX-remotion/rules/calculate-metadata.md +104 -0
- package/opencode/skills/KORTIX-remotion/rules/can-decode.md +75 -0
- package/opencode/skills/KORTIX-remotion/rules/charts.md +120 -0
- package/opencode/skills/KORTIX-remotion/rules/compositions.md +141 -0
- package/opencode/skills/KORTIX-remotion/rules/display-captions.md +184 -0
- package/opencode/skills/KORTIX-remotion/rules/extract-frames.md +229 -0
- package/opencode/skills/KORTIX-remotion/rules/ffmpeg.md +38 -0
- package/opencode/skills/KORTIX-remotion/rules/fonts.md +152 -0
- package/opencode/skills/KORTIX-remotion/rules/get-audio-duration.md +58 -0
- package/opencode/skills/KORTIX-remotion/rules/get-video-dimensions.md +68 -0
- package/opencode/skills/KORTIX-remotion/rules/get-video-duration.md +58 -0
- package/opencode/skills/KORTIX-remotion/rules/gifs.md +141 -0
- package/opencode/skills/KORTIX-remotion/rules/images.md +130 -0
- package/opencode/skills/KORTIX-remotion/rules/import-srt-captions.md +69 -0
- package/opencode/skills/KORTIX-remotion/rules/light-leaks.md +73 -0
- package/opencode/skills/KORTIX-remotion/rules/lottie.md +68 -0
- package/opencode/skills/KORTIX-remotion/rules/maps.md +401 -0
- package/opencode/skills/KORTIX-remotion/rules/measuring-dom-nodes.md +35 -0
- package/opencode/skills/KORTIX-remotion/rules/measuring-text.md +143 -0
- package/opencode/skills/KORTIX-remotion/rules/parameters.md +98 -0
- package/opencode/skills/KORTIX-remotion/rules/sequencing.md +118 -0
- package/opencode/skills/KORTIX-remotion/rules/subtitles.md +36 -0
- package/opencode/skills/KORTIX-remotion/rules/tailwind.md +11 -0
- package/opencode/skills/KORTIX-remotion/rules/text-animations.md +20 -0
- package/opencode/skills/KORTIX-remotion/rules/timing.md +179 -0
- package/opencode/skills/KORTIX-remotion/rules/transcribe-captions.md +70 -0
- package/opencode/skills/KORTIX-remotion/rules/transitions.md +197 -0
- package/opencode/skills/KORTIX-remotion/rules/transparent-videos.md +106 -0
- package/opencode/skills/KORTIX-remotion/rules/trimming.md +53 -0
- package/opencode/skills/KORTIX-remotion/rules/videos.md +171 -0
- package/opencode/skills/KORTIX-secrets/SKILL.md +280 -0
- package/opencode/skills/KORTIX-semantic-search/SKILL.md +213 -0
- package/opencode/skills/KORTIX-session-search/SKILL.md +807 -0
- package/opencode/skills/KORTIX-session-search/Untitled +1 -0
- package/opencode/skills/KORTIX-skill-creator/SKILL.md +163 -0
- package/opencode/skills/KORTIX-web-research/SKILL.md +69 -0
- package/opencode/skills/KORTIX-xlsx/LICENSE.txt +30 -0
- package/opencode/skills/KORTIX-xlsx/SKILL.md +549 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/helpers/__init__.py +0 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/helpers/merge_runs.py +199 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/helpers/simplify_redlines.py +197 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/pack.py +159 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +1499 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd +146 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +1085 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd +11 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd +3081 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-picture.xsd +23 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd +185 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +287 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd +1676 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd +28 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd +144 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +174 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd +25 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd +18 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd +59 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd +56 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd +195 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd +582 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd +25 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd +4439 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd +570 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd +509 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd +12 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd +108 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd +96 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/wml.xsd +3646 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd +116 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd +42 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd +50 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd +49 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd +33 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/mce/mc.xsd +75 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/microsoft/wml-2010.xsd +560 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/microsoft/wml-2012.xsd +67 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/microsoft/wml-2018.xsd +14 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/microsoft/wml-cex-2018.xsd +20 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/microsoft/wml-cid-2016.xsd +13 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/microsoft/wml-sdtdatahash-2020.xsd +4 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/microsoft/wml-symex-2015.xsd +8 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/soffice.py +183 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/unpack.py +132 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/validate.py +111 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/validators/__init__.py +15 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/validators/base.py +847 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/validators/docx.py +446 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/validators/pptx.py +275 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/validators/redlining.py +247 -0
- package/opencode/skills/KORTIX-xlsx/scripts/recalc.py +184 -0
- package/opencode/tools/image-gen.ts +342 -0
- package/opencode/tools/image-search.ts +190 -0
- package/opencode/tools/memory-get.ts +168 -0
- package/opencode/tools/memory-search.ts +247 -0
- package/opencode/tools/presentation-gen.ts +723 -0
- package/opencode/tools/scrape-webpage.ts +115 -0
- package/opencode/tools/scripts/.python-version +1 -0
- package/opencode/tools/scripts/convert_pdf.py +184 -0
- package/opencode/tools/scripts/convert_pptx.py +562 -0
- package/opencode/tools/scripts/pyproject.toml +11 -0
- package/opencode/tools/scripts/uv.lock +287 -0
- package/opencode/tools/scripts/validate_slide.py +74 -0
- package/opencode/tools/show-user.ts +217 -0
- package/opencode/tools/tests/e2e-presentation-fix.ts +277 -0
- package/opencode/tools/tests/image-gen.test.ts +215 -0
- package/opencode/tools/tests/image-search.test.ts +125 -0
- package/opencode/tools/tests/memory-system-benchmark.ts +1076 -0
- package/opencode/tools/tests/presentation-gen.test.ts +389 -0
- package/opencode/tools/tests/scrape-webpage.test.ts +74 -0
- package/opencode/tools/tests/show-user.test.ts +241 -0
- package/opencode/tools/tests/video-gen.test.ts +110 -0
- package/opencode/tools/tests/web-search.test.ts +106 -0
- package/opencode/tools/video-gen.ts +200 -0
- package/opencode/tools/web-search.ts +153 -0
- package/opencode/tsconfig.json +29 -0
- package/package.json +36 -0
- package/patch-agent-browser.js +100 -0
- package/postinstall.sh +88 -0
- package/services/KORTIX-presentation-viewer/run +37 -0
- package/services/agent-browser-viewer/run +48 -0
- package/services/kortix-master/run +16 -0
- package/services/lss-sync/run +22 -0
- package/services/opencode-serve/run +25 -0
- package/services/opencode-web/run +21 -0
|
@@ -0,0 +1,275 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Validator for PowerPoint presentation XML files against XSD schemas.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
|
|
7
|
+
from .base import BaseSchemaValidator
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class PPTXSchemaValidator(BaseSchemaValidator):
    """Validate the XML parts of an unpacked PowerPoint (.pptx) package.

    Extends ``BaseSchemaValidator`` (imported from ``.base``, not shown in
    this file). The methods below read ``self.unpacked_dir``,
    ``self.xml_files``, ``self.verbose`` and the
    ``PACKAGE_RELATIONSHIPS_NAMESPACE`` / ``OFFICE_RELATIONSHIPS_NAMESPACE``
    constants; those are presumably provided by the base class -- confirm
    against ``base.py``.

    Every ``validate_*`` method prints its findings to stdout and returns
    ``True`` on success or ``False`` when at least one problem was found.
    """

    PRESENTATIONML_NAMESPACE = (
        "http://schemas.openxmlformats.org/presentationml/2006/main"
    )

    # Lower-cased element name -> lower-cased relationship type name.
    # NOTE(review): not referenced inside this class; presumably consumed by
    # BaseSchemaValidator's relationship checks -- confirm in base.py.
    ELEMENT_RELATIONSHIP_TYPES = {
        "sldid": "slide",
        "sldmasterid": "slidemaster",
        "notesmasterid": "notesmaster",
        "sldlayoutid": "slidelayout",
        "themeid": "theme",
        "tablestyleid": "tablestyles",
    }

    def validate(self):
        """Run every PPTX check and report overall validity.

        Returns:
            ``False`` immediately if ``validate_xml()`` fails (no further
            check is run). Otherwise every remaining check is executed, even
            after one has failed, and the result is ``True`` only when all
            of them passed.
        """
        if not self.validate_xml():
            return False

        checks = (
            self.validate_namespaces,
            self.validate_unique_ids,
            self.validate_uuid_ids,
            self.validate_file_references,
            self.validate_slide_layout_ids,
            self.validate_content_types,
            self.validate_against_xsd,
            self.validate_notes_slide_references,
            self.validate_all_relationship_ids,
            self.validate_no_duplicate_slide_layouts,
        )
        # A list (not a generator) so that no check is short-circuited away:
        # each one prints its own report and all reports are wanted.
        results = [check() for check in checks]
        return all(results)

    def validate_uuid_ids(self):
        """Check that UUID-shaped ID attributes contain only hex digits.

        Scans every attribute whose local name ends in ``id``
        (case-insensitive) in each file of ``self.xml_files``. A value that
        has the shape of a UUID (see ``_looks_like_uuid``) but does not match
        the hex-only UUID pattern is reported. A file that cannot be
        processed is reported as an error for that file.

        Returns:
            ``True`` if no problem was found, ``False`` otherwise.
        """
        import lxml.etree

        errors = []
        uuid_pattern = re.compile(
            r"^[\{\(]?[0-9A-Fa-f]{8}-?[0-9A-Fa-f]{4}-?[0-9A-Fa-f]{4}-?[0-9A-Fa-f]{4}-?[0-9A-Fa-f]{12}[\}\)]?$"
        )

        for xml_file in self.xml_files:
            try:
                root = lxml.etree.parse(str(xml_file)).getroot()

                for elem in root.iter():
                    for attr, value in elem.attrib.items():
                        # Drop the "{namespace}" prefix of qualified names.
                        attr_name = attr.split("}")[-1].lower()
                        if not attr_name.endswith("id"):
                            continue
                        if self._looks_like_uuid(value) and not uuid_pattern.match(
                            value
                        ):
                            errors.append(
                                f" {xml_file.relative_to(self.unpacked_dir)}: "
                                f"Line {elem.sourceline}: ID '{value}' appears to be a UUID but contains invalid hex characters"
                            )

            except Exception as e:
                # Any failure (including XML syntax errors) is reported
                # against the file instead of aborting the whole scan.
                errors.append(
                    f" {xml_file.relative_to(self.unpacked_dir)}: Error: {e}"
                )

        if errors:
            print(f"FAILED - Found {len(errors)} UUID ID validation errors:")
            for error in errors:
                print(error)
            return False

        if self.verbose:
            print("PASSED - All UUID-like IDs contain valid hex values")
        return True

    def _looks_like_uuid(self, value):
        """Return True if ``value`` has the shape of a UUID.

        After stripping surrounding braces/parentheses and removing hyphens,
        the value must be exactly 32 alphanumeric characters. Non-hex letters
        are accepted on purpose: this is the pre-filter that lets
        ``validate_uuid_ids`` flag UUID-shaped values with invalid digits.
        """
        clean_value = value.strip("{}()").replace("-", "")
        return len(clean_value) == 32 and clean_value.isalnum()

    def validate_slide_layout_ids(self):
        """Check that each ``sldLayoutId`` in a slide master resolves.

        For every ``ppt/slideMasters/*.xml`` the matching
        ``_rels/<name>.rels`` file is read, the ``Id`` of each relationship
        whose ``Type`` contains ``slideLayout`` is collected, and each
        ``sldLayoutId`` element's ``r:id`` must be one of them. A missing
        ``.rels`` file is reported as an error for that master.

        Returns:
            ``True`` if there are no slide masters or no problem was found,
            ``False`` otherwise.
        """
        import lxml.etree

        errors = []

        slide_masters = list(self.unpacked_dir.glob("ppt/slideMasters/*.xml"))

        if not slide_masters:
            if self.verbose:
                print("PASSED - No slide masters found")
            return True

        for slide_master in slide_masters:
            try:
                root = lxml.etree.parse(str(slide_master)).getroot()

                rels_file = slide_master.parent / "_rels" / f"{slide_master.name}.rels"

                if not rels_file.exists():
                    errors.append(
                        f" {slide_master.relative_to(self.unpacked_dir)}: "
                        f"Missing relationships file: {rels_file.relative_to(self.unpacked_dir)}"
                    )
                    continue

                rels_root = lxml.etree.parse(str(rels_file)).getroot()

                valid_layout_rids = {
                    rel.get("Id")
                    for rel in rels_root.findall(
                        f".//{{{self.PACKAGE_RELATIONSHIPS_NAMESPACE}}}Relationship"
                    )
                    if "slideLayout" in rel.get("Type", "")
                }

                for sld_layout_id in root.findall(
                    f".//{{{self.PRESENTATIONML_NAMESPACE}}}sldLayoutId"
                ):
                    r_id = sld_layout_id.get(
                        f"{{{self.OFFICE_RELATIONSHIPS_NAMESPACE}}}id"
                    )
                    layout_id = sld_layout_id.get("id")

                    # Elements without an r:id are not reported here.
                    if r_id and r_id not in valid_layout_rids:
                        errors.append(
                            f" {slide_master.relative_to(self.unpacked_dir)}: "
                            f"Line {sld_layout_id.sourceline}: sldLayoutId with id='{layout_id}' "
                            f"references r:id='{r_id}' which is not found in slide layout relationships"
                        )

            except Exception as e:
                errors.append(
                    f" {slide_master.relative_to(self.unpacked_dir)}: Error: {e}"
                )

        if errors:
            print(f"FAILED - Found {len(errors)} slide layout ID validation errors:")
            for error in errors:
                print(error)
            print(
                "Remove invalid references or add missing slide layouts to the relationships file."
            )
            return False

        if self.verbose:
            print("PASSED - All slide layout IDs reference valid slide layouts")
        return True

    def validate_no_duplicate_slide_layouts(self):
        """Check that no slide references more than one slide layout.

        Reads each ``ppt/slides/_rels/*.xml.rels`` file and counts the
        relationships whose ``Type`` contains ``slideLayout``. Only a count
        above one is reported; a slide with no layout relationship is not
        flagged by this check.

        Returns:
            ``True`` if no problem was found, ``False`` otherwise.
        """
        import lxml.etree

        errors = []
        slide_rels_files = list(self.unpacked_dir.glob("ppt/slides/_rels/*.xml.rels"))

        for rels_file in slide_rels_files:
            try:
                root = lxml.etree.parse(str(rels_file)).getroot()

                layout_rels = [
                    rel
                    for rel in root.findall(
                        f".//{{{self.PACKAGE_RELATIONSHIPS_NAMESPACE}}}Relationship"
                    )
                    if "slideLayout" in rel.get("Type", "")
                ]

                if len(layout_rels) > 1:
                    errors.append(
                        f" {rels_file.relative_to(self.unpacked_dir)}: has {len(layout_rels)} slideLayout references"
                    )

            except Exception as e:
                errors.append(
                    f" {rels_file.relative_to(self.unpacked_dir)}: Error: {e}"
                )

        if errors:
            print("FAILED - Found slides with duplicate slideLayout references:")
            for error in errors:
                print(error)
            return False

        if self.verbose:
            print("PASSED - All slides have exactly one slideLayout reference")
        return True

    def validate_notes_slide_references(self):
        """Check that no notes slide is shared by several slides.

        Collects, from each ``ppt/slides/_rels/*.xml.rels`` file, the
        ``Target`` of every relationship whose ``Type`` contains
        ``notesSlide`` (with ``../`` segments removed) and reports each
        target referenced by more than one slide, followed by one detail
        line per referencing ``.rels`` file.

        The count printed in the FAILED headline is tracked explicitly: one
        per unreadable ``.rels`` file and one per shared notes slide. Detail
        lines are not counted. (Deriving the count from the leading
        whitespace of the messages is unreliable because every message
        starts with a space.)

        Returns:
            ``True`` if there are no slide relationship files or no problem
            was found, ``False`` otherwise.
        """
        import lxml.etree

        errors = []
        issue_count = 0
        notes_slide_references = {}

        slide_rels_files = list(self.unpacked_dir.glob("ppt/slides/_rels/*.xml.rels"))

        if not slide_rels_files:
            if self.verbose:
                print("PASSED - No slide relationship files found")
            return True

        for rels_file in slide_rels_files:
            try:
                root = lxml.etree.parse(str(rels_file)).getroot()

                for rel in root.findall(
                    f".//{{{self.PACKAGE_RELATIONSHIPS_NAMESPACE}}}Relationship"
                ):
                    if "notesSlide" not in rel.get("Type", ""):
                        continue

                    target = rel.get("Target", "")
                    if not target:
                        continue

                    normalized_target = target.replace("../", "")
                    # "slide1.xml.rels" -> stem "slide1.xml" -> "slide1"
                    slide_name = rels_file.stem.replace(".xml", "")

                    notes_slide_references.setdefault(normalized_target, []).append(
                        (slide_name, rels_file)
                    )

            except Exception as e:
                issue_count += 1
                errors.append(
                    f" {rels_file.relative_to(self.unpacked_dir)}: Error: {e}"
                )

        for target, references in notes_slide_references.items():
            if len(references) > 1:
                issue_count += 1
                slide_names = [ref[0] for ref in references]
                errors.append(
                    f" Notes slide '{target}' is referenced by multiple slides: {', '.join(slide_names)}"
                )
                for _slide_name, rels_file in references:
                    errors.append(f" - {rels_file.relative_to(self.unpacked_dir)}")

        if errors:
            print(
                f"FAILED - Found {issue_count} notes slide reference validation errors:"
            )
            for error in errors:
                print(error)
            print("Each slide may optionally have its own slide file.")
            return False

        if self.verbose:
            print("PASSED - All notes slide references are unique")
        return True
|
|
272
|
+
|
|
273
|
+
|
|
274
|
+
# Import-only module: executing the file as a script is rejected outright.
if __name__ == "__main__":
    direct_run_error = RuntimeError("This module should not be run directly.")
    raise direct_run_error
|
|
@@ -0,0 +1,247 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Validator for tracked changes in Word documents.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import subprocess
|
|
6
|
+
import tempfile
|
|
7
|
+
import zipfile
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class RedliningValidator:
|
|
12
|
+
|
|
13
|
+
def __init__(self, unpacked_dir, original_docx, verbose=False, author="Claude"):
|
|
14
|
+
self.unpacked_dir = Path(unpacked_dir)
|
|
15
|
+
self.original_docx = Path(original_docx)
|
|
16
|
+
self.verbose = verbose
|
|
17
|
+
self.author = author
|
|
18
|
+
self.namespaces = {
|
|
19
|
+
"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
|
|
20
|
+
}
|
|
21
|
+
|
|
22
|
+
def repair(self) -> int:
    """Perform no repairs and return the number of repairs made, always 0."""
    repairs_made = 0
    return repairs_made
|
|
24
|
+
|
|
25
|
+
def validate(self):
    """Check that the author's edits to ``word/document.xml`` are all tracked.

    Steps:
      1. Fail if ``word/document.xml`` is missing from ``self.unpacked_dir``.
      2. Fast path: pass if the modified document has no ``w:ins`` / ``w:del``
         element whose ``w:author`` equals ``self.author``.
      3. Otherwise unzip ``self.original_docx`` into a temporary directory,
         remove the author's tracked changes from both documents, and compare
         the extracted text of the two.

    All results are reported on stdout.

    Returns:
        ``True`` when the check passes, ``False`` on any failure (missing
        files, unreadable archive, XML parse error, or text mismatch).
    """
    modified_file = self.unpacked_dir / "word" / "document.xml"
    if not modified_file.exists():
        print(f"FAILED - Modified document.xml not found at {modified_file}")
        return False

    import xml.etree.ElementTree as ET

    try:
        quick_root = ET.parse(modified_file).getroot()
        author_attr = f"{{{self.namespaces['w']}}}author"
        tracked_nodes = [
            *quick_root.findall(".//w:del", self.namespaces),
            *quick_root.findall(".//w:ins", self.namespaces),
        ]

        if not any(node.get(author_attr) == self.author for node in tracked_nodes):
            if self.verbose:
                print(f"PASSED - No tracked changes by {self.author} found.")
            return True

    except Exception:
        # Best effort only: if the quick scan fails for any reason, fall
        # through to the full comparison below, which reports parse errors.
        pass

    with tempfile.TemporaryDirectory() as scratch_dir:
        scratch_path = Path(scratch_dir)

        try:
            with zipfile.ZipFile(self.original_docx, "r") as archive:
                archive.extractall(scratch_path)
        except Exception as e:
            print(f"FAILED - Error unpacking original docx: {e}")
            return False

        original_file = scratch_path / "word" / "document.xml"
        if not original_file.exists():
            print(
                f"FAILED - Original document.xml not found in {self.original_docx}"
            )
            return False

        try:
            modified_root = ET.parse(modified_file).getroot()
            original_root = ET.parse(original_file).getroot()
        except ET.ParseError as e:
            print(f"FAILED - Error parsing XML files: {e}")
            return False

        # Strip the author's tracked changes from both trees so that only
        # untracked differences remain visible in the text comparison.
        for tree_root in (original_root, modified_root):
            self._remove_author_tracked_changes(tree_root)

        modified_text = self._extract_text_content(modified_root)
        original_text = self._extract_text_content(original_root)

        if modified_text == original_text:
            if self.verbose:
                print(f"PASSED - All changes by {self.author} are properly tracked")
            return True

        print(self._generate_detailed_diff(original_text, modified_text))
        return False
|
|
103
|
+
|
|
104
|
+
def _generate_detailed_diff(self, original_text, modified_text):
    """Build the multi-line failure report for a text mismatch.

    The report starts with a fixed explanation of likely causes and the
    correct redlining patterns. It ends with the word diff returned by
    ``self._get_git_word_diff`` when that result is truthy, or with a note
    that no diff could be generated otherwise.

    Args:
        original_text: Text extracted from the original document.
        modified_text: Text extracted from the modified document.

    Returns:
        The report as a single newline-joined string.
    """
    report = [
        f"FAILED - Document text doesn't match after removing {self.author}'s tracked changes",
        "",
        "Likely causes:",
        " 1. Modified text inside another author's <w:ins> or <w:del> tags",
        " 2. Made edits without proper tracked changes",
        " 3. Didn't nest <w:del> inside <w:ins> when deleting another's insertion",
        "",
        "For pre-redlined documents, use correct patterns:",
        " - To reject another's INSERTION: Nest <w:del> inside their <w:ins>",
        " - To restore another's DELETION: Add new <w:ins> AFTER their <w:del>",
        "",
    ]

    word_diff = self._get_git_word_diff(original_text, modified_text)
    if not word_diff:
        report.append("Unable to generate word diff (git not available)")
    else:
        report += ["Differences:", "============", word_diff]

    return "\n".join(report)
|
|
126
|
+
|
|
127
|
+
def _get_git_word_diff(self, original_text, modified_text):
    """Return a compact ``git diff --word-diff`` of the two texts.

    Both texts are written to temporary UTF-8 files and compared with
    ``git diff --no-index``. A character-level diff
    (``--word-diff-regex=.``) is tried first; if it yields no hunk content,
    a default word-level diff is tried. Only the non-blank lines that
    follow a ``@@`` hunk header are kept.

    Args:
        original_text: Text extracted from the original document.
        modified_text: Text extracted from the modified document.

    Returns:
        The hunk content joined with newlines, or ``None`` when no diff
        could be produced (git missing, any other failure, or no hunk
        content in git's output).
    """

    def hunk_lines(diff_output):
        # Keep only non-blank lines that appear after a "@@" hunk header;
        # everything before the first header is git's file preamble.
        kept = []
        inside_hunk = False
        for line in diff_output.split("\n"):
            if line.startswith("@@"):
                inside_hunk = True
            elif inside_hunk and line.strip():
                kept.append(line)
        return kept

    try:
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_path = Path(temp_dir)

            original_file = temp_path / "original.txt"
            modified_file = temp_path / "modified.txt"

            original_file.write_text(original_text, encoding="utf-8")
            modified_file.write_text(modified_text, encoding="utf-8")

            # First pass: character granularity. Second pass: git's default
            # word granularity.
            for granularity_args in (["--word-diff-regex=."], []):
                result = subprocess.run(
                    [
                        "git",
                        "diff",
                        "--word-diff=plain",
                        *granularity_args,
                        "-U0",
                        "--no-index",
                        str(original_file),
                        str(modified_file),
                    ],
                    capture_output=True,
                    # The inputs were written as UTF-8, so decode git's
                    # output as UTF-8 rather than with the locale default.
                    # errors="replace" keeps a stray byte sequence from
                    # discarding the whole diff.
                    encoding="utf-8",
                    errors="replace",
                )

                content_lines = hunk_lines(result.stdout)
                if content_lines:
                    return "\n".join(content_lines)

    except Exception:
        # Deliberate best effort: the diff only decorates an error message,
        # so any failure here (git not installed, unwritable temp dir,
        # unencodable text) degrades to "no diff" instead of raising.
        pass

    return None
|
|
197
|
+
|
|
198
|
+
def _remove_author_tracked_changes(self, root):
    """Undo, in place, the tracked changes attributed to ``self.author``.

    Two passes are made over the tree rooted at ``root``:

    1. Every ``<w:ins>`` child whose ``w:author`` equals ``self.author`` is
       removed from its parent, together with its content.
    2. Every ``<w:del>`` child by the same author is unwrapped: its
       ``<w:delText>`` descendants are renamed to ``<w:t>`` and its children
       are inserted into the parent at the wrapper's position, after which
       the wrapper is removed.

    Elements attributed to any other author are left untouched.

    Args:
        root: Root element of the parsed document; it is modified in place.
    """
    w_ns = self.namespaces["w"]
    ins_tag = f"{{{w_ns}}}ins"
    del_tag = f"{{{w_ns}}}del"
    author_attr = f"{{{w_ns}}}author"
    deltext_tag = f"{{{w_ns}}}delText"
    t_tag = f"{{{w_ns}}}t"

    def is_authored(elem, tag):
        return elem.tag == tag and elem.get(author_attr) == self.author

    # Pass 1: drop the author's insertions.
    for parent in root.iter():
        # Collect before removing so the child list is not mutated while it
        # is being iterated.
        for elem in [child for child in parent if is_authored(child, ins_tag)]:
            parent.remove(elem)

    # Pass 2: restore the author's deletions as ordinary text.
    for parent in root.iter():
        # enumerate() supplies each child's position directly, so no copy
        # and linear search of the child list is needed for every match.
        to_process = [
            (position, child)
            for position, child in enumerate(parent)
            if is_authored(child, del_tag)
        ]

        # Work right-to-left so positions recorded for earlier wrappers stay
        # valid while later ones are expanded.
        for del_index, del_elem in reversed(to_process):
            for elem in del_elem.iter():
                if elem.tag == deltext_tag:
                    elem.tag = t_tag

            # Inserting the children in reverse at one fixed index leaves
            # them in their original order.
            for child in reversed(list(del_elem)):
                parent.insert(del_index, child)
            parent.remove(del_elem)
|
|
228
|
+
|
|
229
|
+
def _extract_text_content(self, root):
    """Flatten the document's ``<w:t>`` text into newline-separated paragraphs.

    For every ``<w:p>`` descendant of ``root``, the non-empty texts of its
    ``<w:t>`` descendants are concatenated. Paragraphs whose concatenation
    is empty are skipped.

    Args:
        root: Root element of the parsed document.

    Returns:
        The paragraph texts joined with ``"\\n"``.
    """
    w_ns = self.namespaces["w"]
    paragraph_path = f".//{{{w_ns}}}p"
    text_path = f".//{{{w_ns}}}t"

    joined_paragraphs = (
        "".join(node.text for node in paragraph.findall(text_path) if node.text)
        for paragraph in root.findall(paragraph_path)
    )
    return "\n".join(text for text in joined_paragraphs if text)
|
|
244
|
+
|
|
245
|
+
|
|
246
|
+
# Executing this file as a script is rejected outright with a RuntimeError.
if __name__ == "__main__":
    raise RuntimeError("This module should not be run directly.")
|
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
"""Render DOCX-like file to PNG images via LibreOffice + Poppler.
|
|
2
|
+
|
|
3
|
+
Usage:
|
|
4
|
+
python render_docx.py /path/to/file.docx --output_dir /tmp/docx_pages
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import argparse
|
|
8
|
+
import os
|
|
9
|
+
import re
|
|
10
|
+
import subprocess
|
|
11
|
+
import tempfile
|
|
12
|
+
import xml.etree.ElementTree as ET
|
|
13
|
+
from os import makedirs, replace
|
|
14
|
+
from os.path import abspath, basename, exists, expanduser, join, splitext
|
|
15
|
+
from shutil import which
|
|
16
|
+
import sys
|
|
17
|
+
from typing import Sequence, cast
|
|
18
|
+
from zipfile import ZipFile
|
|
19
|
+
|
|
20
|
+
from pdf2image import convert_from_path, pdfinfo_from_path
|
|
21
|
+
|
|
22
|
+
TWIPS_PER_INCH: int = 1440
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def ensure_system_tools() -> None:
    """Verify that the ``soffice`` and ``pdftoppm`` executables are on PATH.

    Raises:
        RuntimeError: If either executable cannot be located; the message
            lists every missing tool.
    """
    required = ("soffice", "pdftoppm")
    absent: list[str] = [name for name in required if which(name) is None]
    if not absent:
        return
    raise RuntimeError(f"Missing required system tool(s): {', '.join(absent)}. Install LibreOffice and Poppler.")
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def calc_dpi_via_ooxml_docx(input_path: str, max_w_px: int, max_h_px: int) -> int:
    """Derive a render DPI from the page size stored inside an OOXML document.

    Reads ``word/document.xml`` from the zip container, takes the first
    ``w:sectPr`` found and its ``w:pgSz`` child, converts the page width and
    height from twips to inches, and picks the largest DPI at which the page
    fits inside ``max_w_px`` x ``max_h_px``.

    Args:
        input_path: Path to the OOXML (zip) file.
        max_w_px: Maximum rendered width in pixels.
        max_h_px: Maximum rendered height in pixels.

    Returns:
        The DPI, rounded to the nearest integer.

    Raises:
        RuntimeError: If section properties, the page size element, or its
            attributes are missing, or the dimensions are not positive.
    """
    w_uri = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
    prefixes = {"w": w_uri}

    with ZipFile(input_path, "r") as archive:
        document_xml = archive.read("word/document.xml")
    document = ET.fromstring(document_xml)

    section = document.find(".//w:sectPr", prefixes)
    if section is None:
        raise RuntimeError("Section properties not found")
    page_size = section.find("w:pgSz", prefixes)
    if page_size is None:
        raise RuntimeError("Page size not found")

    def dimension(attr: str):
        # Prefer the namespaced attribute; fall back to the bare name.
        return page_size.get(f"{{{w_uri}}}{attr}") or page_size.get(attr)

    raw_width = dimension("w")
    raw_height = dimension("h")
    if not (raw_width and raw_height):
        raise RuntimeError("Page size attributes missing")

    inches_wide = int(raw_width) / TWIPS_PER_INCH
    inches_high = int(raw_height) / TWIPS_PER_INCH
    if inches_wide <= 0 or inches_high <= 0:
        raise RuntimeError("Invalid page size values")

    # The tighter of the two limits decides the DPI.
    return round(min(max_w_px / inches_wide, max_h_px / inches_high))
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def calc_dpi_via_pdf(input_path: str, max_w_px: int, max_h_px: int) -> int:
    """Derive a render DPI by converting the input to PDF and reading its page size.

    The document is converted with ``convert_to_pdf`` inside throwaway
    temporary directories, the resulting PDF is inspected with
    ``pdfinfo_from_path``, and the reported page size (in points, 72 per
    inch) is used to pick the largest DPI at which the page fits inside
    ``max_w_px`` x ``max_h_px``.

    Args:
        input_path: Path to the document to measure.
        max_w_px: Maximum rendered width in pixels.
        max_h_px: Maximum rendered height in pixels.

    Returns:
        The DPI, rounded to the nearest integer.

    Raises:
        RuntimeError: If the conversion fails, or the page size is missing,
            unparseable, or not positive.
    """
    with tempfile.TemporaryDirectory(prefix="soffice_profile_") as user_profile:
        with tempfile.TemporaryDirectory(prefix="soffice_convert_") as convert_tmp_dir:
            stem = splitext(basename(input_path))[0]
            pdf_path = convert_to_pdf(input_path, user_profile, convert_tmp_dir, stem)
            if not (pdf_path and exists(pdf_path)):
                raise RuntimeError("Failed to convert input to PDF for DPI computation.")

            info = pdfinfo_from_path(pdf_path)
            size_val = info.get("Page size")
            if not size_val:
                # Fall back to any "...size..." entry that carries a value
                # expressed in points.
                for k, v in info.items():
                    if isinstance(v, str) and "size" in k.lower() and "pts" in v:
                        size_val = v
                        break
            if not isinstance(size_val, str):
                raise RuntimeError("Failed to read PDF page size.")

    # Page sizes are frequently fractional (e.g. "595.276 x 841.89 pts" for
    # A4). An integer-only pattern rejects those, or silently matches just
    # the digits after the decimal point, so accept an optional fraction.
    m = re.search(r"(\d+(?:\.\d+)?)\s*x\s*(\d+(?:\.\d+)?)\s*pts", size_val)
    if not m:
        raise RuntimeError("Unrecognized PDF page size format.")
    width_in = float(m.group(1)) / 72.0
    height_in = float(m.group(2)) / 72.0
    if width_in <= 0 or height_in <= 0:
        raise RuntimeError("Invalid PDF page size values.")
    return round(min(max_w_px / width_in, max_h_px / height_in))
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def convert_to_pdf(doc_path: str, user_profile: str, convert_tmp_dir: str, stem: str) -> str:
    """Convert a document to PDF with headless LibreOffice.

    A direct conversion to PDF is attempted first. If the expected output
    file does not appear, the document is converted to ODT and that ODT is
    then converted to PDF.

    Args:
        doc_path: Path of the document to convert.
        user_profile: Directory used as the LibreOffice user installation.
        convert_tmp_dir: Directory that receives the converted files.
        stem: Base name (without extension) expected for the output files.

    Returns:
        Path to ``<convert_tmp_dir>/<stem>.pdf`` when it exists after
        conversion, otherwise an empty string.
    """

    def run_soffice(target_format: str, source: str) -> None:
        # Exit status and console output are ignored on purpose; success is
        # judged solely by whether the expected output file exists.
        subprocess.run(
            [
                "soffice", f"-env:UserInstallation=file://{user_profile}",
                "--invisible", "--headless", "--norestore",
                "--convert-to", target_format, "--outdir", convert_tmp_dir, source,
            ],
            check=False,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            env=os.environ.copy(),
        )

    expected_pdf = join(convert_tmp_dir, f"{stem}.pdf")

    run_soffice("pdf", doc_path)
    if exists(expected_pdf):
        return expected_pdf

    # Fallback: DOCX -> ODT -> PDF
    run_soffice("odt", doc_path)
    intermediate_odt = join(convert_tmp_dir, f"{stem}.odt")
    if exists(intermediate_odt):
        run_soffice("pdf", intermediate_odt)
        if exists(expected_pdf):
            return expected_pdf
    return ""
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def rasterize(doc_path: str, out_dir: str, dpi: int) -> Sequence[str]:
    """Render every page of a document to ``page-N.png`` inside ``out_dir``.

    The document is converted to PDF with ``convert_to_pdf`` in temporary
    directories, rasterized with ``convert_from_path`` at the given DPI, and
    each produced image is renamed to ``page-<number>.png``, where the number
    is taken from the text after the last ``-`` in the generated file name.

    Args:
        doc_path: Path of the document to render.
        out_dir: Directory that receives the images; created if missing.
        dpi: Resolution passed to the rasterizer.

    Returns:
        The image paths ordered by page number.

    Raises:
        RuntimeError: If no PDF could be produced.
    """
    makedirs(out_dir, exist_ok=True)
    doc_path = abspath(doc_path)
    stem = splitext(basename(doc_path))[0]

    with tempfile.TemporaryDirectory(prefix="soffice_profile_") as user_profile, \
            tempfile.TemporaryDirectory(prefix="soffice_convert_") as convert_tmp_dir:
        pdf_path = convert_to_pdf(doc_path, user_profile, convert_tmp_dir, stem)
        if not (pdf_path and exists(pdf_path)):
            raise RuntimeError("Failed to produce PDF for rasterization.")
        rendered = cast(
            list[str],
            convert_from_path(
                pdf_path,
                dpi=dpi,
                fmt="png",
                thread_count=8,
                output_folder=out_dir,
                paths_only=True,
                output_file="page",
            ),
        )

    numbered: list[tuple[int, str]] = []
    for rendered_path in rendered:
        generated_name = splitext(basename(rendered_path))[0]
        page_num = int(generated_name.rsplit("-", 1)[-1])
        final_path = join(out_dir, f"page-{page_num}.png")
        replace(rendered_path, final_path)
        numbered.append((page_num, final_path))

    return [path for _, path in sorted(numbered, key=lambda item: item[0])]
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
def main() -> None:
    """Command-line entry point: render a document to PNG page images.

    Parses the command line, checks that the required system tools and the
    input file exist, and chooses a DPI: the ``--dpi`` override if given,
    otherwise the value computed from the OOXML page size, falling back to
    the PDF-based computation. It then rasterizes the document into the
    output directory, which defaults to the input path without its extension.

    Raises:
        SystemExit: With status 1 when a ``RuntimeError`` is raised during
            processing; the error text is printed to stderr first.
    """
    parser = argparse.ArgumentParser(description="Render DOCX to PNG images.")
    parser.add_argument("input_path", help="Path to DOCX file.")
    parser.add_argument("--output_dir", default=None, help="Output directory for images.")
    parser.add_argument("--width", type=int, default=1600, help="Max width in pixels (default 1600).")
    parser.add_argument("--height", type=int, default=2000, help="Max height in pixels (default 2000).")
    parser.add_argument("--dpi", type=int, default=None, help="Override computed DPI.")
    args = parser.parse_args()

    try:
        ensure_system_tools()
        input_path = abspath(expanduser(args.input_path))
        # Fail fast with a precise message. Without this check a missing
        # file goes through every conversion attempt and ends in a vague
        # "Failed to convert" error.
        if not exists(input_path):
            raise RuntimeError(f"Input file not found: {input_path}")
        out_dir = abspath(expanduser(args.output_dir)) if args.output_dir else splitext(input_path)[0]

        if args.dpi is not None:
            dpi = args.dpi
        else:
            dpi = None
            if input_path.lower().endswith((".docx", ".docm", ".dotx", ".dotm")):
                try:
                    dpi = calc_dpi_via_ooxml_docx(input_path, args.width, args.height)
                except Exception:
                    # Best effort: any failure reading the OOXML page size
                    # falls through to the PDF-based computation below.
                    dpi = None
            if dpi is None:
                dpi = calc_dpi_via_pdf(input_path, args.width, args.height)

        rasterize(input_path, out_dir, dpi)
        print("Pages rendered to " + out_dir)
    except RuntimeError as exc:
        print(f"Error: {exc}", file=sys.stderr)
        raise SystemExit(1)


if __name__ == "__main__":
    main()
|