@kortix/sandbox 0.4.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/config/customize.sh +143 -0
- package/config/kortix-env-setup.sh +25 -0
- package/kortix-master/package.json +22 -0
- package/kortix-master/src/config.ts +22 -0
- package/kortix-master/src/index.ts +44 -0
- package/kortix-master/src/routes/env.ts +65 -0
- package/kortix-master/src/routes/proxy.ts +108 -0
- package/kortix-master/src/routes/update.ts +185 -0
- package/kortix-master/src/services/proxy.ts +43 -0
- package/kortix-master/src/services/secret-store.ts +156 -0
- package/kortix-master/tsconfig.json +14 -0
- package/opencode/agents/kortix-browser.md +142 -0
- package/opencode/agents/kortix-build.md +62 -0
- package/opencode/agents/kortix-explore.md +66 -0
- package/opencode/agents/kortix-image-gen.md +33 -0
- package/opencode/agents/kortix-main.md +450 -0
- package/opencode/agents/kortix-plan.md +100 -0
- package/opencode/agents/kortix-research.md +84 -0
- package/opencode/agents/kortix-sheets.md +61 -0
- package/opencode/agents/kortix-slides.md +64 -0
- package/opencode/agents/kortix-web-dev.md +572 -0
- package/opencode/commands/email.md +36 -0
- package/opencode/commands/init.md +43 -0
- package/opencode/commands/journal.md +44 -0
- package/opencode/commands/memory-init.md +81 -0
- package/opencode/commands/memory-search.md +50 -0
- package/opencode/commands/memory-status.md +56 -0
- package/opencode/commands/research.md +36 -0
- package/opencode/commands/search.md +38 -0
- package/opencode/commands/slides.md +32 -0
- package/opencode/commands/spreadsheet.md +30 -0
- package/opencode/memory.json +37 -0
- package/opencode/ocx.jsonc +10 -0
- package/opencode/opencode.jsonc +103 -0
- package/opencode/package.json +25 -0
- package/opencode/patches/apply.sh +19 -0
- package/opencode/patches/opencode-pty-spawn.txt +49 -0
- package/opencode/plugin/background-agents.ts.disabled +483 -0
- package/opencode/plugin/kdco-primitives/get-project-id.ts +172 -0
- package/opencode/plugin/kdco-primitives/index.ts +26 -0
- package/opencode/plugin/kdco-primitives/log-warn.ts +51 -0
- package/opencode/plugin/kdco-primitives/mutex.ts +122 -0
- package/opencode/plugin/kdco-primitives/shell.ts +138 -0
- package/opencode/plugin/kdco-primitives/temp.ts +36 -0
- package/opencode/plugin/kdco-primitives/terminal-detect.ts +34 -0
- package/opencode/plugin/kdco-primitives/types.ts +13 -0
- package/opencode/plugin/kdco-primitives/with-timeout.ts +84 -0
- package/opencode/plugin/memory.ts +306 -0
- package/opencode/plugin/worktree/state.ts +412 -0
- package/opencode/plugin/worktree/terminal.ts +1002 -0
- package/opencode/plugin/worktree.ts +861 -0
- package/opencode/skills/KORTIX-browser/SKILL.md +478 -0
- package/opencode/skills/KORTIX-cron-triggers/SKILL.md +173 -0
- package/opencode/skills/KORTIX-deep-research/SKILL.md +278 -0
- package/opencode/skills/KORTIX-docx/SKILL.md +398 -0
- package/opencode/skills/KORTIX-docx/scripts/__init__.py +1 -0
- package/opencode/skills/KORTIX-docx/scripts/accept_changes.py +104 -0
- package/opencode/skills/KORTIX-docx/scripts/comment.py +244 -0
- package/opencode/skills/KORTIX-docx/scripts/office/helpers/__init__.py +0 -0
- package/opencode/skills/KORTIX-docx/scripts/office/helpers/merge_runs.py +199 -0
- package/opencode/skills/KORTIX-docx/scripts/office/helpers/simplify_redlines.py +197 -0
- package/opencode/skills/KORTIX-docx/scripts/office/pack.py +159 -0
- package/opencode/skills/KORTIX-docx/scripts/office/soffice.py +183 -0
- package/opencode/skills/KORTIX-docx/scripts/office/unpack.py +132 -0
- package/opencode/skills/KORTIX-docx/scripts/office/validate.py +111 -0
- package/opencode/skills/KORTIX-docx/scripts/office/validators/__init__.py +15 -0
- package/opencode/skills/KORTIX-docx/scripts/office/validators/base.py +847 -0
- package/opencode/skills/KORTIX-docx/scripts/office/validators/docx.py +446 -0
- package/opencode/skills/KORTIX-docx/scripts/office/validators/pptx.py +275 -0
- package/opencode/skills/KORTIX-docx/scripts/office/validators/redlining.py +247 -0
- package/opencode/skills/KORTIX-docx/scripts/render_docx.py +179 -0
- package/opencode/skills/KORTIX-docx/scripts/templates/comments.xml +3 -0
- package/opencode/skills/KORTIX-docx/scripts/templates/commentsExtended.xml +3 -0
- package/opencode/skills/KORTIX-docx/scripts/templates/commentsExtensible.xml +3 -0
- package/opencode/skills/KORTIX-docx/scripts/templates/commentsIds.xml +3 -0
- package/opencode/skills/KORTIX-docx/scripts/templates/people.xml +3 -0
- package/opencode/skills/KORTIX-domain-research/SKILL.md +96 -0
- package/opencode/skills/KORTIX-domain-research/scripts/domain-lookup.py +810 -0
- package/opencode/skills/KORTIX-elevenlabs/SKILL.md +230 -0
- package/opencode/skills/KORTIX-elevenlabs/scripts/tts.py +389 -0
- package/opencode/skills/KORTIX-email/SKILL.md +145 -0
- package/opencode/skills/KORTIX-legal-writer/SKILL.md +409 -0
- package/opencode/skills/KORTIX-legal-writer/references/bluebook.md +152 -0
- package/opencode/skills/KORTIX-legal-writer/references/document-types.md +416 -0
- package/opencode/skills/KORTIX-legal-writer/scripts/courtlistener.py +291 -0
- package/opencode/skills/KORTIX-legal-writer/scripts/ecfr_lookup.py +299 -0
- package/opencode/skills/KORTIX-legal-writer/scripts/verify-legal.py +507 -0
- package/opencode/skills/KORTIX-logo-creator/SKILL.md +293 -0
- package/opencode/skills/KORTIX-logo-creator/references/prompt-patterns.md +134 -0
- package/opencode/skills/KORTIX-logo-creator/scripts/compose_logo.py +406 -0
- package/opencode/skills/KORTIX-logo-creator/scripts/create_logo_sheet.py +258 -0
- package/opencode/skills/KORTIX-logo-creator/scripts/remove_bg.py +96 -0
- package/opencode/skills/KORTIX-memory/SKILL.md +261 -0
- package/opencode/skills/KORTIX-memory/scripts/export-sessions.py +409 -0
- package/opencode/skills/KORTIX-paper-creator/SKILL.md +549 -0
- package/opencode/skills/KORTIX-paper-creator/assets/template.tex +101 -0
- package/opencode/skills/KORTIX-paper-creator/scripts/compile.sh +177 -0
- package/opencode/skills/KORTIX-paper-creator/scripts/openalex_to_bibtex.py +220 -0
- package/opencode/skills/KORTIX-paper-creator/scripts/verify.sh +354 -0
- package/opencode/skills/KORTIX-paper-search/SKILL.md +418 -0
- package/opencode/skills/KORTIX-pdf/SKILL.md +232 -0
- package/opencode/skills/KORTIX-pdf/forms.md +36 -0
- package/opencode/skills/KORTIX-pdf/reference.md +105 -0
- package/opencode/skills/KORTIX-pdf/scripts/check_bounding_boxes.py +65 -0
- package/opencode/skills/KORTIX-pdf/scripts/check_fillable_fields.py +11 -0
- package/opencode/skills/KORTIX-pdf/scripts/convert_pdf_to_images.py +33 -0
- package/opencode/skills/KORTIX-pdf/scripts/create_validation_image.py +37 -0
- package/opencode/skills/KORTIX-pdf/scripts/extract_form_field_info.py +122 -0
- package/opencode/skills/KORTIX-pdf/scripts/extract_form_structure.py +115 -0
- package/opencode/skills/KORTIX-pdf/scripts/fill_fillable_fields.py +98 -0
- package/opencode/skills/KORTIX-pdf/scripts/fill_pdf_form_with_annotations.py +107 -0
- package/opencode/skills/KORTIX-plan/SKILL.md +228 -0
- package/opencode/skills/KORTIX-presentation-viewer/SKILL.md +87 -0
- package/opencode/skills/KORTIX-presentation-viewer/serve.ts +136 -0
- package/opencode/skills/KORTIX-presentation-viewer/viewer.html +559 -0
- package/opencode/skills/KORTIX-presentations/SKILL.md +344 -0
- package/opencode/skills/KORTIX-remotion/SKILL.md +56 -0
- package/opencode/skills/KORTIX-remotion/rules/3d.md +86 -0
- package/opencode/skills/KORTIX-remotion/rules/animations.md +29 -0
- package/opencode/skills/KORTIX-remotion/rules/assets.md +78 -0
- package/opencode/skills/KORTIX-remotion/rules/audio-visualization.md +198 -0
- package/opencode/skills/KORTIX-remotion/rules/audio.md +169 -0
- package/opencode/skills/KORTIX-remotion/rules/calculate-metadata.md +104 -0
- package/opencode/skills/KORTIX-remotion/rules/can-decode.md +75 -0
- package/opencode/skills/KORTIX-remotion/rules/charts.md +120 -0
- package/opencode/skills/KORTIX-remotion/rules/compositions.md +141 -0
- package/opencode/skills/KORTIX-remotion/rules/display-captions.md +184 -0
- package/opencode/skills/KORTIX-remotion/rules/extract-frames.md +229 -0
- package/opencode/skills/KORTIX-remotion/rules/ffmpeg.md +38 -0
- package/opencode/skills/KORTIX-remotion/rules/fonts.md +152 -0
- package/opencode/skills/KORTIX-remotion/rules/get-audio-duration.md +58 -0
- package/opencode/skills/KORTIX-remotion/rules/get-video-dimensions.md +68 -0
- package/opencode/skills/KORTIX-remotion/rules/get-video-duration.md +58 -0
- package/opencode/skills/KORTIX-remotion/rules/gifs.md +141 -0
- package/opencode/skills/KORTIX-remotion/rules/images.md +130 -0
- package/opencode/skills/KORTIX-remotion/rules/import-srt-captions.md +69 -0
- package/opencode/skills/KORTIX-remotion/rules/light-leaks.md +73 -0
- package/opencode/skills/KORTIX-remotion/rules/lottie.md +68 -0
- package/opencode/skills/KORTIX-remotion/rules/maps.md +401 -0
- package/opencode/skills/KORTIX-remotion/rules/measuring-dom-nodes.md +35 -0
- package/opencode/skills/KORTIX-remotion/rules/measuring-text.md +143 -0
- package/opencode/skills/KORTIX-remotion/rules/parameters.md +98 -0
- package/opencode/skills/KORTIX-remotion/rules/sequencing.md +118 -0
- package/opencode/skills/KORTIX-remotion/rules/subtitles.md +36 -0
- package/opencode/skills/KORTIX-remotion/rules/tailwind.md +11 -0
- package/opencode/skills/KORTIX-remotion/rules/text-animations.md +20 -0
- package/opencode/skills/KORTIX-remotion/rules/timing.md +179 -0
- package/opencode/skills/KORTIX-remotion/rules/transcribe-captions.md +70 -0
- package/opencode/skills/KORTIX-remotion/rules/transitions.md +197 -0
- package/opencode/skills/KORTIX-remotion/rules/transparent-videos.md +106 -0
- package/opencode/skills/KORTIX-remotion/rules/trimming.md +53 -0
- package/opencode/skills/KORTIX-remotion/rules/videos.md +171 -0
- package/opencode/skills/KORTIX-secrets/SKILL.md +280 -0
- package/opencode/skills/KORTIX-semantic-search/SKILL.md +213 -0
- package/opencode/skills/KORTIX-session-search/SKILL.md +807 -0
- package/opencode/skills/KORTIX-session-search/Untitled +1 -0
- package/opencode/skills/KORTIX-skill-creator/SKILL.md +163 -0
- package/opencode/skills/KORTIX-web-research/SKILL.md +69 -0
- package/opencode/skills/KORTIX-xlsx/LICENSE.txt +30 -0
- package/opencode/skills/KORTIX-xlsx/SKILL.md +549 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/helpers/__init__.py +0 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/helpers/merge_runs.py +199 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/helpers/simplify_redlines.py +197 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/pack.py +159 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +1499 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd +146 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +1085 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd +11 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-main.xsd +3081 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-picture.xsd +23 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd +185 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +287 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/pml.xsd +1676 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd +28 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd +144 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +174 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd +25 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd +18 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd +59 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd +56 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd +195 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-math.xsd +582 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd +25 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/sml.xsd +4439 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-main.xsd +570 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd +509 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd +12 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd +108 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd +96 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/wml.xsd +3646 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ISO-IEC29500-4_2016/xml.xsd +116 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ecma/fouth-edition/opc-contentTypes.xsd +42 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ecma/fouth-edition/opc-coreProperties.xsd +50 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ecma/fouth-edition/opc-digSig.xsd +49 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/ecma/fouth-edition/opc-relationships.xsd +33 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/mce/mc.xsd +75 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/microsoft/wml-2010.xsd +560 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/microsoft/wml-2012.xsd +67 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/microsoft/wml-2018.xsd +14 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/microsoft/wml-cex-2018.xsd +20 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/microsoft/wml-cid-2016.xsd +13 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/microsoft/wml-sdtdatahash-2020.xsd +4 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/schemas/microsoft/wml-symex-2015.xsd +8 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/soffice.py +183 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/unpack.py +132 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/validate.py +111 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/validators/__init__.py +15 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/validators/base.py +847 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/validators/docx.py +446 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/validators/pptx.py +275 -0
- package/opencode/skills/KORTIX-xlsx/scripts/office/validators/redlining.py +247 -0
- package/opencode/skills/KORTIX-xlsx/scripts/recalc.py +184 -0
- package/opencode/tools/image-gen.ts +342 -0
- package/opencode/tools/image-search.ts +190 -0
- package/opencode/tools/memory-get.ts +168 -0
- package/opencode/tools/memory-search.ts +247 -0
- package/opencode/tools/presentation-gen.ts +723 -0
- package/opencode/tools/scrape-webpage.ts +115 -0
- package/opencode/tools/scripts/.python-version +1 -0
- package/opencode/tools/scripts/convert_pdf.py +184 -0
- package/opencode/tools/scripts/convert_pptx.py +562 -0
- package/opencode/tools/scripts/pyproject.toml +11 -0
- package/opencode/tools/scripts/uv.lock +287 -0
- package/opencode/tools/scripts/validate_slide.py +74 -0
- package/opencode/tools/show-user.ts +217 -0
- package/opencode/tools/tests/e2e-presentation-fix.ts +277 -0
- package/opencode/tools/tests/image-gen.test.ts +215 -0
- package/opencode/tools/tests/image-search.test.ts +125 -0
- package/opencode/tools/tests/memory-system-benchmark.ts +1076 -0
- package/opencode/tools/tests/presentation-gen.test.ts +389 -0
- package/opencode/tools/tests/scrape-webpage.test.ts +74 -0
- package/opencode/tools/tests/show-user.test.ts +241 -0
- package/opencode/tools/tests/video-gen.test.ts +110 -0
- package/opencode/tools/tests/web-search.test.ts +106 -0
- package/opencode/tools/video-gen.ts +200 -0
- package/opencode/tools/web-search.ts +153 -0
- package/opencode/tsconfig.json +29 -0
- package/package.json +36 -0
- package/patch-agent-browser.js +100 -0
- package/postinstall.sh +88 -0
- package/services/KORTIX-presentation-viewer/run +37 -0
- package/services/agent-browser-viewer/run +48 -0
- package/services/kortix-master/run +16 -0
- package/services/lss-sync/run +22 -0
- package/services/opencode-serve/run +25 -0
- package/services/opencode-web/run +21 -0
|
@@ -0,0 +1,847 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Base validator with common validation logic for document files.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
import defusedxml.minidom
|
|
9
|
+
import lxml.etree
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class BaseSchemaValidator:
|
|
13
|
+
|
|
14
|
+
IGNORED_VALIDATION_ERRORS = [
|
|
15
|
+
"hyphenationZone",
|
|
16
|
+
"purl.org/dc/terms",
|
|
17
|
+
]
|
|
18
|
+
|
|
19
|
+
UNIQUE_ID_REQUIREMENTS = {
|
|
20
|
+
"comment": ("id", "file"),
|
|
21
|
+
"commentrangestart": ("id", "file"),
|
|
22
|
+
"commentrangeend": ("id", "file"),
|
|
23
|
+
"bookmarkstart": ("id", "file"),
|
|
24
|
+
"bookmarkend": ("id", "file"),
|
|
25
|
+
"sldid": ("id", "file"),
|
|
26
|
+
"sldmasterid": ("id", "global"),
|
|
27
|
+
"sldlayoutid": ("id", "global"),
|
|
28
|
+
"cm": ("authorid", "file"),
|
|
29
|
+
"sheet": ("sheetid", "file"),
|
|
30
|
+
"definedname": ("id", "file"),
|
|
31
|
+
"cxnsp": ("id", "file"),
|
|
32
|
+
"sp": ("id", "file"),
|
|
33
|
+
"pic": ("id", "file"),
|
|
34
|
+
"grpsp": ("id", "file"),
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
EXCLUDED_ID_CONTAINERS = {
|
|
38
|
+
"sectionlst",
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
ELEMENT_RELATIONSHIP_TYPES = {}
|
|
42
|
+
|
|
43
|
+
SCHEMA_MAPPINGS = {
|
|
44
|
+
"word": "ISO-IEC29500-4_2016/wml.xsd",
|
|
45
|
+
"ppt": "ISO-IEC29500-4_2016/pml.xsd",
|
|
46
|
+
"xl": "ISO-IEC29500-4_2016/sml.xsd",
|
|
47
|
+
"[Content_Types].xml": "ecma/fouth-edition/opc-contentTypes.xsd",
|
|
48
|
+
"app.xml": "ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd",
|
|
49
|
+
"core.xml": "ecma/fouth-edition/opc-coreProperties.xsd",
|
|
50
|
+
"custom.xml": "ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd",
|
|
51
|
+
".rels": "ecma/fouth-edition/opc-relationships.xsd",
|
|
52
|
+
"people.xml": "microsoft/wml-2012.xsd",
|
|
53
|
+
"commentsIds.xml": "microsoft/wml-cid-2016.xsd",
|
|
54
|
+
"commentsExtensible.xml": "microsoft/wml-cex-2018.xsd",
|
|
55
|
+
"commentsExtended.xml": "microsoft/wml-2012.xsd",
|
|
56
|
+
"chart": "ISO-IEC29500-4_2016/dml-chart.xsd",
|
|
57
|
+
"theme": "ISO-IEC29500-4_2016/dml-main.xsd",
|
|
58
|
+
"drawing": "ISO-IEC29500-4_2016/dml-main.xsd",
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
MC_NAMESPACE = "http://schemas.openxmlformats.org/markup-compatibility/2006"
|
|
62
|
+
XML_NAMESPACE = "http://www.w3.org/XML/1998/namespace"
|
|
63
|
+
|
|
64
|
+
PACKAGE_RELATIONSHIPS_NAMESPACE = (
|
|
65
|
+
"http://schemas.openxmlformats.org/package/2006/relationships"
|
|
66
|
+
)
|
|
67
|
+
OFFICE_RELATIONSHIPS_NAMESPACE = (
|
|
68
|
+
"http://schemas.openxmlformats.org/officeDocument/2006/relationships"
|
|
69
|
+
)
|
|
70
|
+
CONTENT_TYPES_NAMESPACE = (
|
|
71
|
+
"http://schemas.openxmlformats.org/package/2006/content-types"
|
|
72
|
+
)
|
|
73
|
+
|
|
74
|
+
MAIN_CONTENT_FOLDERS = {"word", "ppt", "xl"}
|
|
75
|
+
|
|
76
|
+
OOXML_NAMESPACES = {
|
|
77
|
+
"http://schemas.openxmlformats.org/officeDocument/2006/math",
|
|
78
|
+
"http://schemas.openxmlformats.org/officeDocument/2006/relationships",
|
|
79
|
+
"http://schemas.openxmlformats.org/schemaLibrary/2006/main",
|
|
80
|
+
"http://schemas.openxmlformats.org/drawingml/2006/main",
|
|
81
|
+
"http://schemas.openxmlformats.org/drawingml/2006/chart",
|
|
82
|
+
"http://schemas.openxmlformats.org/drawingml/2006/chartDrawing",
|
|
83
|
+
"http://schemas.openxmlformats.org/drawingml/2006/diagram",
|
|
84
|
+
"http://schemas.openxmlformats.org/drawingml/2006/picture",
|
|
85
|
+
"http://schemas.openxmlformats.org/drawingml/2006/spreadsheetDrawing",
|
|
86
|
+
"http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing",
|
|
87
|
+
"http://schemas.openxmlformats.org/wordprocessingml/2006/main",
|
|
88
|
+
"http://schemas.openxmlformats.org/presentationml/2006/main",
|
|
89
|
+
"http://schemas.openxmlformats.org/spreadsheetml/2006/main",
|
|
90
|
+
"http://schemas.openxmlformats.org/officeDocument/2006/sharedTypes",
|
|
91
|
+
"http://www.w3.org/XML/1998/namespace",
|
|
92
|
+
}
|
|
93
|
+
|
|
94
|
+
def __init__(self, unpacked_dir, original_file=None, verbose=False):
|
|
95
|
+
self.unpacked_dir = Path(unpacked_dir).resolve()
|
|
96
|
+
self.original_file = Path(original_file) if original_file else None
|
|
97
|
+
self.verbose = verbose
|
|
98
|
+
|
|
99
|
+
self.schemas_dir = Path(__file__).parent.parent / "schemas"
|
|
100
|
+
|
|
101
|
+
patterns = ["*.xml", "*.rels"]
|
|
102
|
+
self.xml_files = [
|
|
103
|
+
f for pattern in patterns for f in self.unpacked_dir.rglob(pattern)
|
|
104
|
+
]
|
|
105
|
+
|
|
106
|
+
if not self.xml_files:
|
|
107
|
+
print(f"Warning: No XML files found in {self.unpacked_dir}")
|
|
108
|
+
|
|
109
|
+
def validate(self):
    """Placeholder entry point; always raises ``NotImplementedError``.

    Raises:
        NotImplementedError: Unconditionally; subclasses are expected to
            provide the real implementation.
    """
    message = "Subclasses must implement the validate method"
    raise NotImplementedError(message)
|
|
111
|
+
|
|
112
|
+
def repair(self) -> int:
    """Run the available repairs and return how many were made.

    Delegates to ``repair_whitespace_preservation`` and returns its result
    unchanged.
    """
    repaired_count = self.repair_whitespace_preservation()
    return repaired_count
|
|
114
|
+
|
|
115
|
+
def repair_whitespace_preservation(self) -> int:
    """Add ``xml:space="preserve"`` to text elements with edge whitespace.

    Each file in ``self.xml_files`` is read as UTF-8 and parsed with
    ``defusedxml.minidom``. An element is repaired when its tag name ends
    with ``:t``, its first child's value starts or ends with a space or
    tab, and its ``xml:space`` attribute is not already ``"preserve"``.
    A file that received at least one repair is rewritten in place.

    The operation is best effort: a file that cannot be read, parsed or
    written is skipped rather than aborting the run. Skips are reported
    when ``self.verbose`` is set, and repairs are only counted (and
    announced) after the file has actually been written.

    Returns:
        The number of elements whose repair was written to disk.
    """
    repairs = 0

    for xml_file in self.xml_files:
        try:
            content = xml_file.read_text(encoding="utf-8")
            dom = defusedxml.minidom.parseString(content)
            messages = []

            for elem in dom.getElementsByTagName("*"):
                if not elem.tagName.endswith(":t") or not elem.firstChild:
                    continue

                text = elem.firstChild.nodeValue
                if not text:
                    continue
                edge_whitespace = text.startswith((" ", "\t")) or text.endswith(
                    (" ", "\t")
                )
                if not edge_whitespace:
                    continue
                if elem.getAttribute("xml:space") == "preserve":
                    continue

                elem.setAttribute("xml:space", "preserve")
                text_preview = (
                    repr(text[:30]) + "..." if len(text) > 30 else repr(text)
                )
                messages.append(
                    f" Repaired: {xml_file.name}: Added xml:space='preserve' to {elem.tagName}: {text_preview}"
                )

            if messages:
                xml_file.write_bytes(dom.toxml(encoding="UTF-8"))
                # Report and count only once the change is persisted, so a
                # failed write never inflates the returned total.
                for message in messages:
                    print(message)
                repairs += len(messages)

        except Exception as exc:
            # Deliberately broad: repair must not crash on a single bad
            # part, but the skip should be visible when diagnostics are on.
            if self.verbose:
                print(
                    f" Skipped: {xml_file.name}: whitespace repair failed: {exc}"
                )

    return repairs
|
|
142
|
+
|
|
143
|
+
def validate_xml(self):
    """Check that every indexed file parses as XML.

    Each path in ``self.xml_files`` is parsed with ``lxml.etree``. Syntax
    errors are reported with their line number and message; any other
    exception is reported as an unexpected error.

    Returns:
        ``True`` when no file produced an error, ``False`` otherwise. The
        failures are printed; the success line is printed only when
        ``self.verbose`` is set.
    """
    problems = []

    for candidate in self.xml_files:
        try:
            lxml.etree.parse(str(candidate))
        except lxml.etree.XMLSyntaxError as syntax_err:
            detail = f"Line {syntax_err.lineno}: {syntax_err.msg}"
        except Exception as other_err:
            detail = f"Unexpected error: {str(other_err)}"
        else:
            # Parsed cleanly; nothing to record for this file.
            continue
        problems.append(f" {candidate.relative_to(self.unpacked_dir)}: {detail}")

    if not problems:
        if self.verbose:
            print("PASSED - All XML files are well-formed")
        return True

    print(f"FAILED - Found {len(problems)} XML violations:")
    for line in problems:
        print(line)
    return False
|
|
169
|
+
|
|
170
|
+
def validate_namespaces(self):
    """Check that prefixes listed in ``Ignorable`` attributes are declared.

    For each file, the root element's attributes whose name ends with
    ``Ignorable`` are split on whitespace, and every listed prefix must
    appear among the prefixes declared on the root (the default namespace
    is not counted). Files that fail to parse are skipped here.

    Returns:
        ``True`` when no undeclared prefix was found, ``False`` otherwise.
        Problems are printed; the success line is printed only when
        ``self.verbose`` is set.
    """
    problems = []

    for candidate in self.xml_files:
        try:
            root = lxml.etree.parse(str(candidate)).getroot()
            declared_prefixes = set(root.nsmap.keys()) - {None}

            ignorable_values = []
            for attr_key, attr_value in root.attrib.items():
                if attr_key.endswith("Ignorable"):
                    ignorable_values.append(attr_value)

            for value in ignorable_values:
                missing = set(value.split()) - declared_prefixes
                for ns in missing:
                    problems.append(
                        f" {candidate.relative_to(self.unpacked_dir)}: "
                        f"Namespace '{ns}' in Ignorable but not declared"
                    )
        except lxml.etree.XMLSyntaxError:
            continue

    if problems:
        print(f"FAILED - {len(problems)} namespace issues:")
        for line in problems:
            print(line)
        return False

    if self.verbose:
        print("PASSED - All namespace prefixes properly declared")
    return True
|
|
198
|
+
|
|
199
|
+
def validate_unique_ids(self):
    """Check that ID attributes are unique per ``UNIQUE_ID_REQUIREMENTS``.

    Each file is parsed with ``lxml.etree``. ``mc:AlternateContent``
    subtrees are dropped from the in-memory tree before checking (the file
    on disk is not touched). For every remaining element whose lower-cased
    local name is a key of ``UNIQUE_ID_REQUIREMENTS`` and that is not
    inside a container named in ``EXCLUDED_ID_CONTAINERS``, the configured
    attribute (matched by lower-cased local name) must be unique:

    * scope ``"global"`` - unique across all files in this run;
    * scope ``"file"`` - unique per (tag, attribute) within one file.

    Comments and processing instructions are skipped: their ``tag`` is not
    a string, and treating them as elements made every part containing a
    comment fail with a spurious error.

    Returns:
        ``True`` when no violation (or per-file processing error) was
        recorded, ``False`` otherwise. Violations are printed; the success
        line is printed only when ``self.verbose`` is set.
    """

    def local_name(qualified):
        # lxml spells namespaced names as "{uri}local"; keep the local
        # part, lower-cased to match the lookup tables.
        return qualified.split("}")[-1].lower()

    errors = []
    global_ids = {}

    for xml_file in self.xml_files:
        try:
            root = lxml.etree.parse(str(xml_file)).getroot()
            file_ids = {}

            mc_elements = root.xpath(
                ".//mc:AlternateContent", namespaces={"mc": self.MC_NAMESPACE}
            )
            for elem in mc_elements:
                elem.getparent().remove(elem)

            for elem in root.iter():
                # root.iter() also yields comments / processing
                # instructions, whose .tag is a factory function, not str.
                if not isinstance(elem.tag, str):
                    continue

                tag = local_name(elem.tag)
                requirement = self.UNIQUE_ID_REQUIREMENTS.get(tag)
                if requirement is None:
                    continue

                in_excluded_container = any(
                    local_name(ancestor.tag) in self.EXCLUDED_ID_CONTAINERS
                    for ancestor in elem.iterancestors()
                )
                if in_excluded_container:
                    continue

                attr_name, scope = requirement

                # First attribute whose local name matches wins.
                id_value = next(
                    (
                        value
                        for attr, value in elem.attrib.items()
                        if local_name(attr) == attr_name
                    ),
                    None,
                )
                if id_value is None:
                    continue

                if scope == "global":
                    if id_value in global_ids:
                        prev_file, prev_line, prev_tag = global_ids[id_value]
                        errors.append(
                            f" {xml_file.relative_to(self.unpacked_dir)}: "
                            f"Line {elem.sourceline}: Global ID '{id_value}' in <{tag}> "
                            f"already used in {prev_file} at line {prev_line} in <{prev_tag}>"
                        )
                    else:
                        global_ids[id_value] = (
                            xml_file.relative_to(self.unpacked_dir),
                            elem.sourceline,
                            tag,
                        )
                elif scope == "file":
                    seen = file_ids.setdefault((tag, attr_name), {})
                    if id_value in seen:
                        prev_line = seen[id_value]
                        errors.append(
                            f" {xml_file.relative_to(self.unpacked_dir)}: "
                            f"Line {elem.sourceline}: Duplicate {attr_name}='{id_value}' in <{tag}> "
                            f"(first occurrence at line {prev_line})"
                        )
                    else:
                        seen[id_value] = elem.sourceline

        except Exception as e:
            # Covers XML syntax errors as well: a part that cannot be
            # processed is reported instead of aborting the whole check.
            errors.append(
                f" {xml_file.relative_to(self.unpacked_dir)}: Error: {e}"
            )

    if errors:
        print(f"FAILED - Found {len(errors)} ID uniqueness violations:")
        for error in errors:
            print(error)
        return False
    else:
        if self.verbose:
            print("PASSED - All required IDs are unique")
        return True
|
|
288
|
+
|
|
289
|
+
def validate_file_references(self):
    """Cross-check ``.rels`` targets against the files in the package.

    Two kinds of problems are collected:

    * broken references - a relationship ``Target`` that does not resolve
      to an existing file;
    * unreferenced files - a file under ``self.unpacked_dir`` (other than
      ``[Content_Types].xml`` and ``*.rels``) that no relationship
      targets.

    Target resolution: a leading ``/`` is taken relative to the package
    root; targets in a file named exactly ``.rels`` are relative to the
    package root; all other targets are relative to the parent of the
    ``.rels`` file's directory.

    Relationships are skipped when the target starts with ``http`` or
    ``mailto:``, or when ``TargetMode="External"`` is set. External
    targets live outside the package, so they are never checked against
    its files (previously e.g. ``file:///...`` links were reported as
    broken).

    Returns:
        ``True`` when nothing was collected (or there are no ``.rels``
        files at all), ``False`` otherwise. Problems are printed; success
        lines are printed only when ``self.verbose`` is set.
    """
    errors = []

    rels_files = list(self.unpacked_dir.rglob("*.rels"))

    if not rels_files:
        if self.verbose:
            print("PASSED - No .rels files found")
        return True

    all_files = {
        file_path.resolve()
        for file_path in self.unpacked_dir.rglob("*")
        if file_path.is_file()
        and file_path.name != "[Content_Types].xml"
        and not file_path.name.endswith(".rels")
    }

    all_referenced_files = set()

    if self.verbose:
        print(
            f"Found {len(rels_files)} .rels files and {len(all_files)} target files"
        )

    for rels_file in rels_files:
        try:
            rels_root = lxml.etree.parse(str(rels_file)).getroot()
            rels_dir = rels_file.parent
            broken_refs = []

            for rel in rels_root.findall(
                ".//ns:Relationship",
                namespaces={"ns": self.PACKAGE_RELATIONSHIPS_NAMESPACE},
            ):
                target = rel.get("Target")
                if not target or target.startswith(("http", "mailto:")):
                    continue
                # External targets point outside the package; there is no
                # package file to look for.
                if rel.get("TargetMode") == "External":
                    continue

                if target.startswith("/"):
                    target_path = self.unpacked_dir / target.lstrip("/")
                elif rels_file.name == ".rels":
                    target_path = self.unpacked_dir / target
                else:
                    target_path = rels_dir.parent / target

                try:
                    target_path = target_path.resolve()
                    if target_path.exists() and target_path.is_file():
                        all_referenced_files.add(target_path)
                    else:
                        broken_refs.append((target, rel.sourceline))
                except (OSError, ValueError):
                    broken_refs.append((target, rel.sourceline))

            if broken_refs:
                rel_path = rels_file.relative_to(self.unpacked_dir)
                for broken_ref, line_num in broken_refs:
                    errors.append(
                        f" {rel_path}: Line {line_num}: Broken reference to {broken_ref}"
                    )

        except Exception as e:
            rel_path = rels_file.relative_to(self.unpacked_dir)
            errors.append(f" Error parsing {rel_path}: {e}")

    unreferenced_files = all_files - all_referenced_files

    for unref_file in sorted(unreferenced_files):
        unref_rel_path = unref_file.relative_to(self.unpacked_dir)
        errors.append(f" Unreferenced file: {unref_rel_path}")

    if errors:
        print(f"FAILED - Found {len(errors)} relationship validation errors:")
        for error in errors:
            print(error)
        print(
            "CRITICAL: These errors will cause the document to appear corrupt. "
            + "Broken references MUST be fixed, "
            + "and unreferenced files MUST be referenced or removed."
        )
        return False
    else:
        if self.verbose:
            print(
                "PASSED - All references are valid and all files are properly referenced"
            )
        return True
|
|
384
|
+
|
|
385
|
+
def validate_all_relationship_ids(self):
    """Cross-check relationship IDs used in XML parts against their .rels files.

    For every file in ``self.xml_files`` that is not itself a ``.rels`` file
    and that has a sibling ``_rels/<name>.rels`` file, this:

    * reports relationship ``Id`` values that occur more than once in the
      ``.rels`` file;
    * reports ``r:id`` / ``r:embed`` / ``r:link`` attributes whose value is
      not an ``Id`` declared in the ``.rels`` file;
    * for ``r:id`` only, and only when ``self.ELEMENT_RELATIONSHIP_TYPES`` is
      truthy, reports references whose relationship type does not contain
      the type expected for the referencing element.

    Any exception raised while handling one file is recorded as an error
    for that file and processing moves on to the next file.

    Returns:
        True when nothing was reported, otherwise False. Findings are
        printed; the success message is printed only when ``self.verbose``.
    """
    import lxml.etree

    problems = []

    for part in self.xml_files:
        if part.suffix == ".rels":
            continue

        part_rels = part.parent / "_rels" / f"{part.name}.rels"
        if not part_rels.exists():
            # A part without relationships has nothing to check.
            continue

        try:
            rels_tree_root = lxml.etree.parse(str(part_rels)).getroot()
            id_types = {}

            relationship_tag = (
                f".//{{{self.PACKAGE_RELATIONSHIPS_NAMESPACE}}}Relationship"
            )
            for relationship in rels_tree_root.findall(relationship_tag):
                rel_id = relationship.get("Id")
                full_type = relationship.get("Type", "")
                if not rel_id:
                    continue
                if rel_id in id_types:
                    shown_rels = part_rels.relative_to(self.unpacked_dir)
                    problems.append(
                        f" {shown_rels}: Line {relationship.sourceline}: "
                        f"Duplicate relationship ID '{rel_id}' (IDs must be unique)"
                    )
                # Keep only the last path segment of the type URI; a later
                # duplicate overwrites the earlier entry.
                id_types[rel_id] = full_type.split("/")[-1]

            part_root = lxml.etree.parse(str(part)).getroot()
            office_ns = self.OFFICE_RELATIONSHIPS_NAMESPACE

            for node in part_root.iter():
                for attr in ("id", "embed", "link"):
                    reference = node.get(f"{{{office_ns}}}{attr}")
                    if not reference:
                        continue

                    shown_part = part.relative_to(self.unpacked_dir)
                    local_name = node.tag.split("}")[-1]

                    if reference not in id_types:
                        sample_ids = ", ".join(sorted(id_types.keys())[:5])
                        more = "..." if len(id_types) > 5 else ""
                        problems.append(
                            f" {shown_part}: Line {node.sourceline}: "
                            f"<{local_name}> r:{attr} references non-existent relationship '{reference}' "
                            f"(valid IDs: {sample_ids}{more})"
                        )
                        continue

                    # Type checking applies to r:id only, and only when the
                    # class configures element-to-type expectations.
                    if attr != "id" or not self.ELEMENT_RELATIONSHIP_TYPES:
                        continue
                    wanted = self._get_expected_relationship_type(local_name)
                    if not wanted:
                        continue
                    found = id_types[reference]
                    if wanted in found.lower():
                        continue
                    problems.append(
                        f" {shown_part}: Line {node.sourceline}: "
                        f"<{local_name}> references '{reference}' which points to '{found}' "
                        f"but should point to a '{wanted}' relationship"
                    )

        except Exception as exc:
            shown_part = part.relative_to(self.unpacked_dir)
            problems.append(f" Error processing {shown_part}: {exc}")

    if not problems:
        if self.verbose:
            print("PASSED - All relationship ID references are valid")
        return True

    print(f"FAILED - Found {len(problems)} relationship ID reference errors:")
    for line in problems:
        print(line)
    print("\nThese ID mismatches will cause the document to appear corrupt!")
    return False
|
|
468
|
+
|
|
469
|
+
def _get_expected_relationship_type(self, element_name):
    """Return the relationship type expected for an element name, or None.

    The lowercased name is first looked up in
    ``self.ELEMENT_RELATIONSHIP_TYPES``. Otherwise the type is derived from
    the name: a trailing ``id`` or ``reference`` is stripped and the
    remaining lowercase stem is returned, with the stem ``sld`` (from
    ``sldId``) mapped to ``"slide"``. A name that consists only of the
    suffix, or that has neither suffix, yields None.
    """
    key = element_name.lower()

    explicit = self.ELEMENT_RELATIONSHIP_TYPES
    if key in explicit:
        return explicit[key]

    for suffix in ("id", "reference"):
        if not key.endswith(suffix) or len(key) <= len(suffix):
            continue
        stem = key[: -len(suffix)]
        if suffix == "id" and stem == "sld":
            return "slide"
        return stem

    return None
|
|
491
|
+
|
|
492
|
+
def validate_content_types(self):
    """Check that package content is declared in ``[Content_Types].xml``.

    Two checks run against the manifest in ``self.unpacked_dir``:

    * every file in ``self.xml_files`` whose root element is one of a fixed
      set of main part roots (slide, layout, master, presentation, document,
      workbook, worksheet, theme) must have a matching ``Override``
      ``PartName``; paths containing ``.rels``, ``[Content_Types]``,
      ``docProps/`` or ``_rels/`` are skipped, and files that fail to parse
      are skipped silently;
    * every non-XML file with a known image extension must have a matching
      ``Default`` ``Extension`` entry.

    Returns:
        False if the manifest is missing or any problem was found (problems
        are printed), True otherwise. The success message is printed only
        when ``self.verbose``.
    """
    problems = []

    manifest = self.unpacked_dir / "[Content_Types].xml"
    if not manifest.exists():
        print("FAILED - [Content_Types].xml file not found")
        return False

    try:
        manifest_root = lxml.etree.parse(str(manifest)).getroot()

        override_names = (
            node.get("PartName")
            for node in manifest_root.findall(
                f".//{{{self.CONTENT_TYPES_NAMESPACE}}}Override"
            )
        )
        declared_parts = {
            name.lstrip("/") for name in override_names if name is not None
        }

        default_exts = (
            node.get("Extension")
            for node in manifest_root.findall(
                f".//{{{self.CONTENT_TYPES_NAMESPACE}}}Default"
            )
        )
        declared_extensions = {
            ext.lower() for ext in default_exts if ext is not None
        }

        # Root element names that need an explicit Override entry.
        declarable_roots = {
            "sld",
            "sldLayout",
            "sldMaster",
            "presentation",
            "document",
            "workbook",
            "worksheet",
            "theme",
        }

        # Image extensions and the content type suggested in the message.
        media_extensions = {
            "png": "image/png",
            "jpg": "image/jpeg",
            "jpeg": "image/jpeg",
            "gif": "image/gif",
            "bmp": "image/bmp",
            "tiff": "image/tiff",
            "wmf": "image/x-wmf",
            "emf": "image/x-emf",
        }

        on_disk = [
            entry for entry in self.unpacked_dir.rglob("*") if entry.is_file()
        ]

        skip_markers = (".rels", "[Content_Types]", "docProps/", "_rels/")
        for part in self.xml_files:
            part_name = str(part.relative_to(self.unpacked_dir)).replace("\\", "/")
            if any(marker in part_name for marker in skip_markers):
                continue

            try:
                qualified = lxml.etree.parse(str(part)).getroot().tag
                local_root = qualified.split("}")[-1]
                if local_root in declarable_roots and part_name not in declared_parts:
                    problems.append(
                        f" {part_name}: File with <{local_root}> root not declared in [Content_Types].xml"
                    )
            except Exception:
                # Unparseable parts are ignored by this check.
                continue

        for candidate in on_disk:
            if candidate.suffix.lower() in {".xml", ".rels"}:
                continue
            if candidate.name == "[Content_Types].xml":
                continue
            if {"_rels", "docProps"} & set(candidate.parts):
                continue

            ext = candidate.suffix.lstrip(".").lower()
            if ext in media_extensions and ext not in declared_extensions:
                shown = candidate.relative_to(self.unpacked_dir)
                mime = media_extensions[ext]
                problems.append(
                    f" {shown}: File with extension '{ext}' not declared in [Content_Types].xml - should add: "
                    f'<Default Extension="{ext}" ContentType="{mime}"/>'
                )

    except Exception as exc:
        problems.append(f" Error parsing [Content_Types].xml: {exc}")

    if not problems:
        if self.verbose:
            print(
                "PASSED - All content files are properly declared in [Content_Types].xml"
            )
        return True

    print(f"FAILED - Found {len(problems)} content type declaration errors:")
    for line in problems:
        print(line)
    return False
|
|
597
|
+
|
|
598
|
+
def validate_file_against_xsd(self, xml_file, verbose=False):
    """Validate one XML file against its schema, reporting only new errors.

    Errors also present in the original file (as returned by
    ``self._get_original_file_errors``) and errors containing any
    substring listed in ``self.IGNORED_VALIDATION_ERRORS`` are discarded.

    Args:
        xml_file: Path (or path string) of the file inside the unpacked
            directory.
        verbose: When true, print a PASSED/FAILED line and up to three
            errors, each truncated to 250 characters.

    Returns:
        ``(None, set())`` when the single-file validator reports no result
        (first element None), ``(True, set())`` when the file is valid or
        has no new errors, and ``(False, new_errors)`` otherwise.
    """
    target = Path(xml_file).resolve()
    package_root = self.unpacked_dir.resolve()

    outcome, found = self._validate_single_file_xsd(target, package_root)
    if outcome is None:
        return None, set()
    if outcome:
        return True, set()

    baseline = self._get_original_file_errors(target)

    assert found is not None
    introduced = set()
    for message in found - baseline:
        if any(pattern in message for pattern in self.IGNORED_VALIDATION_ERRORS):
            continue
        introduced.add(message)

    if not introduced:
        if verbose:
            print(
                f"PASSED - No new errors (original had {len(found)} errors)"
            )
        return True, set()

    if verbose:
        shown = target.relative_to(package_root)
        print(f"FAILED - {shown}: {len(introduced)} new error(s)")
        for message in list(introduced)[:3]:
            if len(message) > 250:
                clipped = message[:250] + "..."
            else:
                clipped = message
            print(f" - {clipped}")
    return False, introduced
|
|
635
|
+
|
|
636
|
+
def validate_against_xsd(self):
    """Validate every file in ``self.xml_files`` against its XSD schema.

    Each file is checked with ``self.validate_file_against_xsd``. Files for
    which that call returns ``None`` as the first element are counted as
    skipped; files reported valid are counted as valid; for every other
    file a header line plus up to three error messages (each truncated to
    250 characters) are collected for the failure report.

    When ``self.verbose`` is set, a summary of the counts is printed. The
    "With NEW errors" figure is the number of files that failed; it is
    tracked with an explicit counter instead of being inferred from the
    formatting of the report lines.

    Returns:
        True when no file introduced new errors, otherwise False (after
        printing the collected report).
    """
    new_errors = []
    files_with_new_errors = 0
    original_error_count = 0
    valid_count = 0
    skipped_count = 0

    for xml_file in self.xml_files:
        relative_path = str(xml_file.relative_to(self.unpacked_dir))
        is_valid, new_file_errors = self.validate_file_against_xsd(
            xml_file, verbose=False
        )

        if is_valid is None:
            skipped_count += 1
            continue

        if is_valid:
            # A valid result that still carries errors means they were
            # attributed to the original file and ignored.
            if new_file_errors:
                original_error_count += 1
            valid_count += 1
            continue

        files_with_new_errors += 1
        new_errors.append(f" {relative_path}: {len(new_file_errors)} new error(s)")
        # Sort so the three reported samples are the same on every run
        # (set iteration order varies with string hash randomization).
        for error in sorted(new_file_errors)[:3]:
            new_errors.append(
                f" - {error[:250]}..." if len(error) > 250 else f" - {error}"
            )

    if self.verbose:
        print(f"Validated {len(self.xml_files)} files:")
        print(f" - Valid: {valid_count}")
        print(f" - Skipped (no schema): {skipped_count}")
        if original_error_count:
            print(f" - With original errors (ignored): {original_error_count}")
        print(f" - With NEW errors: {files_with_new_errors}")

    if new_errors:
        print("\nFAILED - Found NEW validation errors:")
        for error in new_errors:
            print(error)
        return False

    if self.verbose:
        print("\nPASSED - No new XSD validation errors introduced")
    return True
|
|
684
|
+
|
|
685
|
+
def _get_schema_path(self, xml_file):
    """Return the schema file to validate ``xml_file`` against, or None.

    Lookup order, all resolved through ``self.SCHEMA_MAPPINGS`` and joined
    onto ``self.schemas_dir``:

    1. the file name itself;
    2. the ``.rels`` suffix;
    3. a ``chart*`` file under a ``charts/`` directory;
    4. a ``theme*`` file under a ``theme/`` directory;
    5. the parent directory name, when it is listed in
       ``self.MAIN_CONTENT_FOLDERS``.

    Args:
        xml_file: A ``pathlib`` path object for the file being validated.

    Returns:
        The schema path, or None when no rule matches.
    """
    mappings = self.SCHEMA_MAPPINGS
    file_name = xml_file.name

    if file_name in mappings:
        return self.schemas_dir / mappings[file_name]

    if xml_file.suffix == ".rels":
        return self.schemas_dir / mappings[".rels"]

    # Compare against the forward-slash form so the directory checks also
    # match on Windows, where str(path) uses backslashes.
    posix_path = xml_file.as_posix()

    if "charts/" in posix_path and file_name.startswith("chart"):
        return self.schemas_dir / mappings["chart"]

    if "theme/" in posix_path and file_name.startswith("theme"):
        return self.schemas_dir / mappings["theme"]

    folder = xml_file.parent.name
    if folder in self.MAIN_CONTENT_FOLDERS:
        return self.schemas_dir / mappings[folder]

    return None
|
|
702
|
+
|
|
703
|
+
def _clean_ignorable_namespaces(self, xml_doc):
    """Return a copy of ``xml_doc`` without content from unknown namespaces.

    The document is serialized and re-parsed so the input is left
    untouched. On the copy, every namespaced attribute whose namespace is
    not in ``self.OOXML_NAMESPACES`` is deleted, then
    ``self._remove_ignorable_elements`` is applied to the copied root.

    Returns:
        A new ``lxml.etree.ElementTree`` wrapping the cleaned copy.
    """
    serialized = lxml.etree.tostring(xml_doc, encoding="unicode")
    working_root = lxml.etree.fromstring(serialized)

    for node in working_root.iter():
        # Collect first, delete afterwards: the attribute mapping must not
        # change while it is being iterated.
        foreign = [
            name
            for name in node.attrib
            if "{" in name
            and name.split("}")[0][1:] not in self.OOXML_NAMESPACES
        ]
        for name in foreign:
            del node.attrib[name]

    self._remove_ignorable_elements(working_root)

    return lxml.etree.ElementTree(working_root)
|
|
722
|
+
|
|
723
|
+
def _remove_ignorable_elements(self, root):
    """Recursively drop child elements whose namespace is not recognised.

    A child is removed (together with its whole subtree) when its tag is
    namespace-qualified and the namespace is not in
    ``self.OOXML_NAMESPACES``. Children that are kept are processed
    recursively. Nodes without a ``tag`` attribute, or whose ``tag`` is
    callable, are left alone. ``root`` is modified in place.
    """
    doomed = []

    for child in list(root):
        if not hasattr(child, "tag"):
            continue
        tag = child.tag
        if callable(tag):
            continue

        qualified = str(tag)
        is_foreign = (
            qualified.startswith("{")
            and qualified.split("}")[0][1:] not in self.OOXML_NAMESPACES
        )
        if is_foreign:
            doomed.append(child)
        else:
            self._remove_ignorable_elements(child)

    # Removal is deferred until the scan of this level is complete.
    for child in doomed:
        root.remove(child)
|
|
741
|
+
|
|
742
|
+
def _preprocess_for_mc_ignorable(self, xml_doc):
    """Delete the ``Ignorable`` attribute in ``self.MC_NAMESPACE`` from the root.

    The document is modified in place and the same object is returned.
    Nothing happens when the root element does not carry the attribute.
    """
    attributes = xml_doc.getroot().attrib
    ignorable_key = f"{{{self.MC_NAMESPACE}}}Ignorable"

    if ignorable_key in attributes:
        del attributes[ignorable_key]

    return xml_doc
|
|
749
|
+
|
|
750
|
+
def _validate_single_file_xsd(self, xml_file, base_path):
    """Validate one XML file against the schema chosen for it.

    Before validation the document has template tags stripped from its
    text, the root's mc:Ignorable attribute removed and, for files whose
    first path component (relative to ``base_path``) is in
    ``self.MAIN_CONTENT_FOLDERS``, content from unknown namespaces removed.

    Args:
        xml_file: Path of the XML file to validate.
        base_path: Directory that ``xml_file`` is made relative to.

    Returns:
        ``(None, None)`` when no schema applies to the file,
        ``(True, set())`` when the file validates, and
        ``(False, messages)`` otherwise, where ``messages`` is the set of
        schema error messages or, if any step raised, a one-element set
        holding the exception text.
    """
    schema_path = self._get_schema_path(xml_file)
    if not schema_path:
        return None, None

    try:
        with open(schema_path, "rb") as xsd_file:
            parser = lxml.etree.XMLParser()
            # base_url lets the schema resolve its relative includes/imports.
            xsd_doc = lxml.etree.parse(
                xsd_file, parser=parser, base_url=str(schema_path)
            )
            schema = lxml.etree.XMLSchema(xsd_doc)

        # Read bytes, not text: the XML parser must see the raw stream so it
        # can honour the document's own encoding declaration instead of the
        # platform's locale encoding.
        with open(xml_file, "rb") as f:
            xml_doc = lxml.etree.parse(f)

        xml_doc, _ = self._remove_template_tags_from_text_nodes(xml_doc)
        xml_doc = self._preprocess_for_mc_ignorable(xml_doc)

        relative_path = xml_file.relative_to(base_path)
        if (
            relative_path.parts
            and relative_path.parts[0] in self.MAIN_CONTENT_FOLDERS
        ):
            xml_doc = self._clean_ignorable_namespaces(xml_doc)

        if schema.validate(xml_doc):
            return True, set()
        return False, {error.message for error in schema.error_log}

    except Exception as e:
        # Any failure (unreadable file, malformed XML, bad schema) is
        # reported as a validation error rather than propagated.
        return False, {str(e)}
|
|
786
|
+
|
|
787
|
+
def _get_original_file_errors(self, xml_file):
    """Return the XSD errors the same file has in the original package.

    ``self.original_file`` is extracted into a temporary directory and the
    file at the same relative location as ``xml_file`` (relative to
    ``self.unpacked_dir``) is validated with
    ``self._validate_single_file_xsd``.

    Returns:
        The set of error messages, or an empty set when there is no
        original file, the part does not exist in it, or validation
        produced no errors.
    """
    if self.original_file is None:
        return set()

    import tempfile
    import zipfile

    member = Path(xml_file).resolve().relative_to(self.unpacked_dir.resolve())

    with tempfile.TemporaryDirectory() as scratch:
        scratch_root = Path(scratch)

        with zipfile.ZipFile(self.original_file, "r") as archive:
            archive.extractall(scratch_root)

        extracted = scratch_root / member
        if not extracted.exists():
            return set()

        _, baseline = self._validate_single_file_xsd(extracted, scratch_root)
        return baseline or set()
|
|
813
|
+
|
|
814
|
+
def _remove_template_tags_from_text_nodes(self, xml_doc):
    """Strip ``{{...}}`` template tags from element text and tails on a copy.

    The document is serialized and re-parsed, so the input is not
    modified. Every element of the copy has matching tags removed from its
    ``text`` and ``tail``, except elements whose tag is ``t`` (with or
    without a namespace), which are skipped entirely, as are nodes without
    a usable ``tag``.

    Returns:
        A tuple ``(tree, warnings)``: a new ``lxml.etree.ElementTree`` for
        the processed copy and a list with one message per tag found.
    """
    notices = []
    placeholder = re.compile(r"\{\{[^}]*\}\}")

    duplicate_root = lxml.etree.fromstring(
        lxml.etree.tostring(xml_doc, encoding="unicode")
    )

    def strip_placeholders(value, label):
        # Empty or missing text is returned unchanged (None stays None).
        if not value:
            return value
        hits = placeholder.findall(value)
        if not hits:
            return value
        notices.extend(f"Found template tag in {label}: {hit}" for hit in hits)
        return placeholder.sub("", value)

    for node in duplicate_root.iter():
        if not hasattr(node, "tag") or callable(node.tag):
            continue
        qualified = str(node.tag)
        if qualified == "t" or qualified.endswith("}t"):
            continue

        node.text = strip_placeholders(node.text, "text content")
        node.tail = strip_placeholders(node.tail, "tail content")

    return lxml.etree.ElementTree(duplicate_root), notices
|
|
844
|
+
|
|
845
|
+
|
|
846
|
+
# Guard against script execution: running this file directly raises an error.
if __name__ == "__main__":
    raise RuntimeError("This module should not be run directly.")
|