devlyn-cli 0.5.2 → 0.5.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/devlyn.js +1 -0
- package/config/commands/devlyn.team-resolve.md +31 -2
- package/optional-skills/dokkit/ANALYSIS.md +198 -0
- package/optional-skills/dokkit/COMMANDS.md +365 -0
- package/optional-skills/dokkit/DOCX-XML.md +76 -0
- package/optional-skills/dokkit/EXPORT.md +102 -0
- package/optional-skills/dokkit/FILLING.md +377 -0
- package/optional-skills/dokkit/HWPX-XML.md +73 -0
- package/optional-skills/dokkit/IMAGE-SOURCING.md +127 -0
- package/optional-skills/dokkit/INGESTION.md +65 -0
- package/optional-skills/dokkit/SKILL.md +153 -0
- package/optional-skills/dokkit/STATE.md +60 -0
- package/optional-skills/dokkit/references/docx-field-patterns.md +151 -0
- package/optional-skills/dokkit/references/docx-structure.md +58 -0
- package/optional-skills/dokkit/references/field-detection-patterns.md +130 -0
- package/optional-skills/dokkit/references/hwpx-field-patterns.md +461 -0
- package/optional-skills/dokkit/references/hwpx-structure.md +159 -0
- package/optional-skills/dokkit/references/image-opportunity-heuristics.md +121 -0
- package/optional-skills/dokkit/references/image-xml-patterns.md +338 -0
- package/optional-skills/dokkit/references/section-image-interleaving.md +346 -0
- package/optional-skills/dokkit/references/section-range-detection.md +118 -0
- package/optional-skills/dokkit/references/state-schema.md +143 -0
- package/optional-skills/dokkit/references/supported-formats.md +67 -0
- package/optional-skills/dokkit/scripts/compile_hwpx.py +134 -0
- package/optional-skills/dokkit/scripts/detect_fields.py +301 -0
- package/optional-skills/dokkit/scripts/detect_fields_hwpx.py +286 -0
- package/optional-skills/dokkit/scripts/export_pdf.py +99 -0
- package/optional-skills/dokkit/scripts/parse_hwpx.py +185 -0
- package/optional-skills/dokkit/scripts/parse_image_with_gemini.py +159 -0
- package/optional-skills/dokkit/scripts/parse_xlsx.py +98 -0
- package/optional-skills/dokkit/scripts/source_images.py +365 -0
- package/optional-skills/dokkit/scripts/validate_docx.py +142 -0
- package/optional-skills/dokkit/scripts/validate_hwpx.py +281 -0
- package/optional-skills/dokkit/scripts/validate_state.py +132 -0
- package/package.json +1 -1
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Validate a modified DOCX working directory for XML well-formedness.
|
|
3
|
+
|
|
4
|
+
Usage:
|
|
5
|
+
python validate_docx.py <work_dir>
|
|
6
|
+
|
|
7
|
+
Checks:
|
|
8
|
+
- All XML files are well-formed
|
|
9
|
+
- Required files exist ([Content_Types].xml, word/document.xml)
|
|
10
|
+
- No broken element nesting
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
import sys
|
|
14
|
+
import xml.etree.ElementTree as ET
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
REQUIRED_FILES = [
    "[Content_Types].xml",
    "word/document.xml",
]

IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".bmp", ".tiff", ".gif", ".webp"}


def validate_xml_file(path: Path) -> list[str]:
    """Check that *path* parses as well-formed XML.

    Returns a single-element list with a descriptive message on parse
    failure, or an empty list when the file parses cleanly.
    """
    errors: list[str] = []
    try:
        ET.parse(path)
    except ET.ParseError as e:
        errors.append(f"{path}: XML parse error: {e}")
    return errors


def validate_image_relationships(work_dir: Path) -> list[str]:
    """Verify that image media files, relationships, and Content_Types are consistent.

    Cross-checks three invariants:
      1. every image under word/media/ has a relationship entry,
      2. every image relationship target resolves to an existing file,
      3. every image extension in use is registered in [Content_Types].xml.

    Fix vs. the original: relationships are collected from *all* ``.rels``
    files under word/_rels (document.xml.rels, header*.xml.rels,
    footer*.xml.rels, ...), not only document.xml.rels — images referenced
    solely from headers/footers were previously flagged as orphans.
    """
    errors: list[str] = []

    # 1. Collect actual image files in word/media/
    media_dir = work_dir / "word" / "media"
    media_images = set()
    if media_dir.is_dir():
        for f in media_dir.iterdir():
            if f.suffix.lower() in IMAGE_EXTENSIONS:
                media_images.add(f.name)

    if not media_images:
        return errors  # No images, nothing to validate

    # 2. Parse every relationship file under word/_rels for image references.
    rels_dir = work_dir / "word" / "_rels"
    rel_ns = "http://schemas.openxmlformats.org/package/2006/relationships"
    image_type = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/image"
    rel_targets = set()
    if rels_dir.is_dir():
        for rels_path in sorted(rels_dir.glob("*.rels")):
            try:
                root = ET.parse(rels_path).getroot()
            except ET.ParseError:
                continue  # XML error already caught by validate_xml_file
            for rel in root.iter(f"{{{rel_ns}}}Relationship"):
                if rel.get("Type") != image_type:
                    continue
                target = rel.get("Target", "")
                # Targets are part-relative, e.g. "media/image1.png";
                # tolerate an explicit leading "./".
                if target.startswith("./"):
                    target = target[2:]
                if target.startswith("media/"):
                    rel_targets.add(target[len("media/"):])

    # 3. Check media files have corresponding relationships
    for img in sorted(media_images):
        if img not in rel_targets:
            errors.append(
                f"Image file word/media/{img} has no relationship entry in any .rels file"
            )

    # 4. Check relationships point to existing files
    for target in sorted(rel_targets):
        if target not in media_images:
            errors.append(
                f"Relationship references media/{target} but file does not exist in word/media/"
            )

    # 5. Check Content_Types registers every image extension actually used.
    ct_path = work_dir / "[Content_Types].xml"
    if ct_path.exists():
        try:
            ct_root = ET.parse(ct_path).getroot()
        except ET.ParseError:
            return errors  # parse error is reported by validate_xml_file
        ct_ns = "http://schemas.openxmlformats.org/package/2006/content-types"
        registered_exts = {
            default.get("Extension", "").lower()
            for default in ct_root.iter(f"{{{ct_ns}}}Default")
        }
        for img in sorted(media_images):
            ext = img.rsplit(".", 1)[-1].lower() if "." in img else ""
            if not ext or ext in registered_exts:
                continue
            # JPEG can be registered as either "jpeg" or "jpg"
            if ext in ("jpg", "jpeg") and ("jpg" in registered_exts or "jpeg" in registered_exts):
                continue
            errors.append(
                f"Image extension '.{ext}' (from {img}) not registered in [Content_Types].xml"
            )

    return errors
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def main():
    """CLI entry point: validate the DOCX working directory given as argv[1].

    Exits 0 when validation passes, 1 on validation errors or bad usage,
    2 when the argument is not a directory. All output goes to stderr.
    """
    if len(sys.argv) != 2:
        print("Usage: python validate_docx.py <work_dir>", file=sys.stderr)
        sys.exit(1)

    work_dir = Path(sys.argv[1])
    if not work_dir.is_dir():
        print(f"Error: Not a directory: {work_dir}", file=sys.stderr)
        sys.exit(2)

    errors: list[str] = []

    # Required package parts must be present.
    errors.extend(
        f"Missing required file: {req}"
        for req in REQUIRED_FILES
        if not (work_dir / req).exists()
    )

    # Every XML part — and every .rels part, which is XML too — must parse.
    for pattern in ("*.xml", "*.rels"):
        for part in work_dir.rglob(pattern):
            errors.extend(validate_xml_file(part))

    # Media files / relationships / content types must agree.
    errors.extend(validate_image_relationships(work_dir))

    if errors:
        print(f"Validation FAILED — {len(errors)} error(s):", file=sys.stderr)
        for err in errors:
            print(f"  - {err}", file=sys.stderr)
        sys.exit(1)
    print("DOCX validation passed.", file=sys.stderr)
    sys.exit(0)


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,281 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Validate a modified HWPX working directory for XML well-formedness.
|
|
3
|
+
|
|
4
|
+
Usage:
|
|
5
|
+
python validate_hwpx.py <work_dir>
|
|
6
|
+
|
|
7
|
+
Checks:
|
|
8
|
+
- All XML files are well-formed
|
|
9
|
+
- mimetype file exists
|
|
10
|
+
- Section files exist in Contents/
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
import sys
|
|
14
|
+
import xml.etree.ElementTree as ET
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
IMAGE_EXTENSIONS = {".png", ".jpg", ".jpeg", ".bmp", ".tiff", ".gif", ".webp"}
|
|
19
|
+
|
|
20
|
+
HP = "http://www.hancom.co.kr/hwpml/2011/paragraph"
|
|
21
|
+
HC = "http://www.hancom.co.kr/hwpml/2011/core"
|
|
22
|
+
HH = "http://www.hancom.co.kr/hwpml/2011/head"
|
|
23
|
+
OPF = "http://www.idpf.org/2007/opf"
|
|
24
|
+
|
|
25
|
+
# Required child element local names inside <hp:pic>
|
|
26
|
+
REQUIRED_PIC_CHILDREN = {
|
|
27
|
+
"offset", "orgSz", "curSz", "flip", "rotationInfo", "renderingInfo",
|
|
28
|
+
"inMargin", "sz", "pos", "outMargin", "lineShape", "imgRect", "imgClip",
|
|
29
|
+
"imgDim",
|
|
30
|
+
}
|
|
31
|
+
|
|
32
|
+
# Elements that should NOT appear inside <hp:pic>
|
|
33
|
+
SPURIOUS_PIC_ELEMENTS = {"picSz", "picOutline", "picRect", "caption", "shapeComment"}
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def validate_xml_file(path: Path) -> list[str]:
|
|
37
|
+
"""Check if an XML file is well-formed."""
|
|
38
|
+
errors = []
|
|
39
|
+
try:
|
|
40
|
+
ET.parse(path)
|
|
41
|
+
except ET.ParseError as e:
|
|
42
|
+
errors.append(f"{path}: XML parse error: {e}")
|
|
43
|
+
return errors
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _check_single_pic(pic: ET.Element, prefix: str, manifest_ids: set[str]) -> tuple[list[str], list[str]]:
|
|
47
|
+
"""Validate a single <hp:pic> element against the 8 structural rules.
|
|
48
|
+
|
|
49
|
+
Returns (errors, warnings) lists.
|
|
50
|
+
"""
|
|
51
|
+
errors = []
|
|
52
|
+
warnings = []
|
|
53
|
+
|
|
54
|
+
# Rule 1: <img> must use hc: namespace
|
|
55
|
+
hc_imgs = list(pic.iter(f"{{{HC}}}img"))
|
|
56
|
+
hp_imgs = list(pic.iter(f"{{{HP}}}img"))
|
|
57
|
+
if hp_imgs:
|
|
58
|
+
errors.append(f"{prefix}: <img> uses hp: namespace (must be hc:)")
|
|
59
|
+
if not hc_imgs and not hp_imgs:
|
|
60
|
+
errors.append(f"{prefix}: missing <img> element entirely")
|
|
61
|
+
|
|
62
|
+
# Rule 2: <imgRect> must have <hc:pt0..3> children
|
|
63
|
+
for imgRect in pic.iter(f"{{{HP}}}imgRect"):
|
|
64
|
+
pt_count = sum(1 for i in range(4) for _ in imgRect.iter(f"{{{HC}}}pt{i}"))
|
|
65
|
+
if pt_count < 4:
|
|
66
|
+
if imgRect.get("x1") is not None or imgRect.get("x2") is not None:
|
|
67
|
+
errors.append(
|
|
68
|
+
f"{prefix}: <imgRect> uses inline attributes "
|
|
69
|
+
f"(must have <hc:pt0..3> children)"
|
|
70
|
+
)
|
|
71
|
+
else:
|
|
72
|
+
errors.append(
|
|
73
|
+
f"{prefix}: <imgRect> has {pt_count}/4 "
|
|
74
|
+
f"required <hc:pt> children"
|
|
75
|
+
)
|
|
76
|
+
|
|
77
|
+
# Rule 3: Required child elements
|
|
78
|
+
child_local_names = set()
|
|
79
|
+
for child in pic:
|
|
80
|
+
tag = child.tag
|
|
81
|
+
local = tag.split("}")[1] if "}" in tag else tag
|
|
82
|
+
child_local_names.add(local)
|
|
83
|
+
|
|
84
|
+
missing = REQUIRED_PIC_CHILDREN - child_local_names
|
|
85
|
+
if missing:
|
|
86
|
+
errors.append(f"{prefix}: missing required children: {', '.join(sorted(missing))}")
|
|
87
|
+
|
|
88
|
+
# Rule 4: No spurious elements
|
|
89
|
+
spurious_found = SPURIOUS_PIC_ELEMENTS & child_local_names
|
|
90
|
+
if spurious_found:
|
|
91
|
+
warnings.append(f"{prefix}: spurious elements found: {', '.join(sorted(spurious_found))}")
|
|
92
|
+
|
|
93
|
+
# Rule 5: imgClip right/bottom should not be all zeros
|
|
94
|
+
for imgClip in pic.iter(f"{{{HP}}}imgClip"):
|
|
95
|
+
if imgClip.get("right", "0") == "0" and imgClip.get("bottom", "0") == "0":
|
|
96
|
+
warnings.append(f"{prefix}: <imgClip> right/bottom are both 0 (should be pixel dimensions)")
|
|
97
|
+
|
|
98
|
+
# Rule 6: lineShape attributes
|
|
99
|
+
for ls in pic.iter(f"{{{HP}}}lineShape"):
|
|
100
|
+
if ls.get("color", "") == "#000000":
|
|
101
|
+
warnings.append(f"{prefix}: <lineShape> color=\"#000000\" (should be \"none\")")
|
|
102
|
+
if ls.get("width", "") == "0":
|
|
103
|
+
warnings.append(f"{prefix}: <lineShape> width=\"0\" (should be \"283\")")
|
|
104
|
+
|
|
105
|
+
# Rule 7: Check manifest registration
|
|
106
|
+
for img in hc_imgs:
|
|
107
|
+
ref_id = img.get("binaryItemIDRef", "")
|
|
108
|
+
if ref_id and manifest_ids and ref_id not in manifest_ids:
|
|
109
|
+
errors.append(f"{prefix}: binaryItemIDRef=\"{ref_id}\" not found in content.hpf manifest")
|
|
110
|
+
|
|
111
|
+
# Rule 8: hp:pos attributes
|
|
112
|
+
for pos in pic.iter(f"{{{HP}}}pos"):
|
|
113
|
+
if pos.get("flowWithText", "") == "1":
|
|
114
|
+
warnings.append(f"{prefix}: <pos> flowWithText=\"1\" (should be \"0\")")
|
|
115
|
+
if pos.get("horzRelTo", "") == "PARA":
|
|
116
|
+
warnings.append(f"{prefix}: <pos> horzRelTo=\"PARA\" (should be \"COLUMN\")")
|
|
117
|
+
|
|
118
|
+
return errors, warnings
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def validate_pic_elements(work_dir: Path) -> list[str]:
    """Validate the structure of every <hp:pic> element in the section XML.

    Warnings are non-fatal; they are appended to the returned list with a
    "WARNING: " prefix so callers report them alongside hard errors.
    """
    errors: list[str] = []
    warnings: list[str] = []

    contents_dir = work_dir / "Contents"
    if not contents_dir.is_dir():
        return errors

    # Gather the binary-item IDs registered in the content.hpf manifest.
    manifest_ids: set[str] = set()
    content_hpf = contents_dir / "content.hpf"
    if content_hpf.exists():
        try:
            manifest_root = ET.parse(content_hpf).getroot()
        except ET.ParseError as e:
            errors.append(f"content.hpf: XML parse error (skipping manifest check): {e}")
        else:
            for item in manifest_root.iter(f"{{{OPF}}}item"):
                if "BinData/" in item.get("href", ""):
                    manifest_ids.add(item.get("id", ""))

    # Run the structural checks on every <hp:pic> in every section file.
    for section_file in sorted(contents_dir.glob("section*.xml")):
        try:
            section_root = ET.parse(section_file).getroot()
        except ET.ParseError as e:
            errors.append(f"{section_file.name}: XML parse error (skipping pic validation): {e}")
            continue
        for pic in section_root.iter(f"{{{HP}}}pic"):
            label = f"{section_file.name} pic id={pic.get('id', '?')}"
            pic_errors, pic_warnings = _check_single_pic(pic, label, manifest_ids)
            errors += pic_errors
            warnings += pic_warnings

    # header.xml should not carry <hh:binDataItem> entries for images.
    for header_path in work_dir.rglob("header.xml"):
        try:
            header_root = ET.parse(header_path).getroot()
        except ET.ParseError as e:
            errors.append(f"{header_path.name}: XML parse error (skipping binDataItem check): {e}")
            continue
        for _ in header_root.iter(f"{{{HH}}}binDataItem"):
            warnings.append(
                f"{header_path.name}: found <hh:binDataItem> "
                f"(images should be registered in content.hpf only)"
            )

    # Warnings are non-fatal but reported
    errors.extend(f"WARNING: {w}" for w in warnings)
    return errors
|
|
173
|
+
|
|
174
|
+
|
|
175
|
+
def validate_image_references(work_dir: Path) -> list[str]:
    """Verify that BinData image files and the XML references to them agree."""
    errors: list[str] = []

    # Actual image files shipped under BinData/ (stored as "BinData/<name>").
    bindata_dir = work_dir / "BinData"
    bindata_images = {
        f"BinData/{entry.name}"
        for entry in (bindata_dir.iterdir() if bindata_dir.is_dir() else [])
        if entry.suffix.lower() in IMAGE_EXTENSIONS
    }

    if not bindata_images:
        return errors  # No images, nothing to validate

    def collect_refs(xml_path: Path, into: set) -> None:
        # Any attribute value starting with "BinData/" counts as a reference.
        try:
            root = ET.parse(xml_path).getroot()
        except ET.ParseError:
            return  # already reported by validate_xml_file
        for elem in root.iter():
            for value in elem.attrib.values():
                if isinstance(value, str) and value.startswith("BinData/"):
                    into.add(value)

    # Scan section XML files, then every remaining XML part (headers etc.).
    referenced: set = set()
    contents_dir = work_dir / "Contents"
    if contents_dir.is_dir():
        for section_file in contents_dir.glob("section*.xml"):
            collect_refs(section_file, referenced)
    for xml_file in work_dir.rglob("*.xml"):
        if not xml_file.name.startswith("section"):
            collect_refs(xml_file, referenced)

    # Every shipped image must be referenced somewhere...
    for img in bindata_images:
        if img not in referenced:
            errors.append(f"Image file {img} has no reference in section or header XML")

    # ...and every reference must resolve to a real file.
    for ref in referenced:
        if not (work_dir / ref).exists():
            errors.append(f"XML references {ref} but file does not exist")

    return errors
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
def main():
    """CLI entry point: validate the HWPX working directory given as argv[1].

    Exits 0 when validation passes, 1 on validation errors or bad usage,
    2 when the argument is not a directory. All output goes to stderr.
    """
    if len(sys.argv) != 2:
        print("Usage: python validate_hwpx.py <work_dir>", file=sys.stderr)
        sys.exit(1)

    work_dir = Path(sys.argv[1])
    if not work_dir.is_dir():
        print(f"Error: Not a directory: {work_dir}", file=sys.stderr)
        sys.exit(2)

    errors: list[str] = []

    # The OCF container must declare its media type.
    if not (work_dir / "mimetype").exists():
        errors.append("Missing required file: mimetype")

    # At least one section file must live under Contents/.
    contents_dir = work_dir / "Contents"
    if not contents_dir.is_dir():
        errors.append("Missing Contents/ directory")
    elif not list(contents_dir.glob("section*.xml")):
        errors.append("No section*.xml files found in Contents/")

    # Every XML part must be well-formed.
    for xml_file in work_dir.rglob("*.xml"):
        errors.extend(validate_xml_file(xml_file))

    # Cross-check BinData images and <hp:pic> structure.
    errors.extend(validate_image_references(work_dir))
    errors.extend(validate_pic_elements(work_dir))

    if errors:
        print(f"Validation FAILED — {len(errors)} error(s):", file=sys.stderr)
        for err in errors:
            print(f"  - {err}", file=sys.stderr)
        sys.exit(1)
    print("HWPX validation passed.", file=sys.stderr)
    sys.exit(0)


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Validate .dokkit/state.json against the Dokkit state schema.
|
|
3
|
+
|
|
4
|
+
Usage:
|
|
5
|
+
python validate_state.py <path-to-state.json>
|
|
6
|
+
|
|
7
|
+
Exit codes:
|
|
8
|
+
0 — valid
|
|
9
|
+
1 — validation errors found
|
|
10
|
+
2 — file not found or invalid JSON
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
import json
|
|
14
|
+
import sys
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
VALID_SOURCE_STATUSES = {"processing", "ready", "error"}
VALID_DOC_STATUSES = {"filling", "review", "modified", "finalized"}
VALID_FILE_TYPES = {
    "pdf", "docx", "xlsx", "csv", "pptx", "hwp", "hwpx",
    "png", "jpg", "jpeg", "txt", "md", "json", "html"
}
VALID_TEMPLATE_TYPES = {"docx", "hwpx"}
VALID_EXPORT_FORMATS = {"docx", "hwpx", "pdf"}


def validate(state: dict) -> list[str]:
    """Validate state dict, return list of error messages.

    An empty list means the state conforms to the schema. Fix vs. the
    original: malformed input no longer crashes the validator — entries of
    ``sources``/``exports`` that are not dicts, or a non-dict ``template``/
    ``analysis``/``filled_document``, previously raised AttributeError;
    they are now reported as validation errors instead.
    """
    errors: list[str] = []

    # Root fields
    if not isinstance(state.get("version"), str):
        errors.append("Missing or invalid 'version' (expected string)")
    if not isinstance(state.get("created"), str):
        errors.append("Missing or invalid 'created' (expected ISO timestamp string)")
    if not isinstance(state.get("sources"), list):
        errors.append("Missing or invalid 'sources' (expected array)")
    if "template" not in state:
        errors.append("Missing 'template' field (expected object or null)")
    if "analysis" not in state:
        errors.append("Missing 'analysis' field (expected object or null)")
    if "filled_document" not in state:
        errors.append("Missing 'filled_document' field (expected object or null)")
    if not isinstance(state.get("exports"), list):
        errors.append("Missing or invalid 'exports' (expected array)")

    # Validate sources (only iterate when it actually is a list).
    sources = state.get("sources")
    for i, src in enumerate(sources if isinstance(sources, list) else []):
        prefix = f"sources[{i}]"
        if not isinstance(src, dict):
            errors.append(f"{prefix}: expected object, got {type(src).__name__}")
            continue
        for field in ("id", "file_path", "file_type", "display_name",
                      "content_path", "metadata_path", "summary", "status"):
            if not isinstance(src.get(field), str):
                errors.append(f"{prefix}: missing or invalid '{field}'")
        if src.get("status") and src["status"] not in VALID_SOURCE_STATUSES:
            errors.append(f"{prefix}: invalid status '{src['status']}' "
                          f"(expected one of {VALID_SOURCE_STATUSES})")
        if src.get("file_type") and src["file_type"] not in VALID_FILE_TYPES:
            errors.append(f"{prefix}: unknown file_type '{src['file_type']}'")

    # Validate template
    tmpl = state.get("template")
    if tmpl is not None:
        if not isinstance(tmpl, dict):
            errors.append(f"template: expected object or null, got {type(tmpl).__name__}")
        else:
            for field in ("file_path", "file_type", "display_name", "work_dir"):
                if not isinstance(tmpl.get(field), str):
                    errors.append(f"template: missing or invalid '{field}'")
            if tmpl.get("file_type") and tmpl["file_type"] not in VALID_TEMPLATE_TYPES:
                errors.append(f"template: invalid file_type '{tmpl['file_type']}' "
                              f"(expected one of {VALID_TEMPLATE_TYPES})")

    # Validate analysis
    analysis = state.get("analysis")
    if analysis is not None:
        if not isinstance(analysis, dict):
            errors.append(f"analysis: expected object or null, got {type(analysis).__name__}")
        else:
            if not isinstance(analysis.get("path"), str):
                errors.append("analysis: missing or invalid 'path'")
            for field in ("total_fields", "mapped", "unmapped"):
                if not isinstance(analysis.get(field), int):
                    errors.append(f"analysis: missing or invalid '{field}' (expected integer)")

    # Validate filled_document
    doc = state.get("filled_document")
    if doc is not None:
        if not isinstance(doc, dict):
            errors.append(f"filled_document: expected object or null, got {type(doc).__name__}")
        else:
            if not isinstance(doc.get("status"), str):
                errors.append("filled_document: missing or invalid 'status'")
            elif doc["status"] not in VALID_DOC_STATUSES:
                errors.append(f"filled_document: invalid status '{doc['status']}' "
                              f"(expected one of {VALID_DOC_STATUSES})")

    # Validate exports (only iterate when it actually is a list).
    exports = state.get("exports")
    for i, exp in enumerate(exports if isinstance(exports, list) else []):
        prefix = f"exports[{i}]"
        if not isinstance(exp, dict):
            errors.append(f"{prefix}: expected object, got {type(exp).__name__}")
            continue
        for field in ("format", "output_path"):
            if not isinstance(exp.get(field), str):
                errors.append(f"{prefix}: missing or invalid '{field}'")
        if exp.get("format") and exp["format"] not in VALID_EXPORT_FORMATS:
            errors.append(f"{prefix}: invalid format '{exp['format']}' "
                          f"(expected one of {VALID_EXPORT_FORMATS})")

    return errors
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def main():
    """CLI entry point: load argv[1] as JSON and validate it.

    Exit codes: 0 valid, 1 validation errors, 2 usage error / missing
    file / invalid JSON. All output goes to stderr.
    """
    if len(sys.argv) != 2:
        print("Usage: python validate_state.py <path-to-state.json>", file=sys.stderr)
        sys.exit(2)

    path = Path(sys.argv[1])
    if not path.exists():
        print(f"Error: File not found: {path}", file=sys.stderr)
        sys.exit(2)

    try:
        state = json.loads(path.read_text(encoding="utf-8"))
    except json.JSONDecodeError as e:
        print(f"Error: Invalid JSON: {e}", file=sys.stderr)
        sys.exit(2)

    errors = validate(state)
    if not errors:
        print("Validation passed.", file=sys.stderr)
        sys.exit(0)

    print(f"Validation FAILED — {len(errors)} error(s):", file=sys.stderr)
    for err in errors:
        print(f"  - {err}", file=sys.stderr)
    sys.exit(1)


if __name__ == "__main__":
    main()
|