devlyn-cli 0.5.2 → 0.5.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/devlyn.js +1 -0
- package/optional-skills/dokkit/ANALYSIS.md +198 -0
- package/optional-skills/dokkit/COMMANDS.md +365 -0
- package/optional-skills/dokkit/DOCX-XML.md +76 -0
- package/optional-skills/dokkit/EXPORT.md +102 -0
- package/optional-skills/dokkit/FILLING.md +377 -0
- package/optional-skills/dokkit/HWPX-XML.md +73 -0
- package/optional-skills/dokkit/IMAGE-SOURCING.md +127 -0
- package/optional-skills/dokkit/INGESTION.md +65 -0
- package/optional-skills/dokkit/SKILL.md +153 -0
- package/optional-skills/dokkit/STATE.md +60 -0
- package/optional-skills/dokkit/references/docx-field-patterns.md +151 -0
- package/optional-skills/dokkit/references/docx-structure.md +58 -0
- package/optional-skills/dokkit/references/field-detection-patterns.md +130 -0
- package/optional-skills/dokkit/references/hwpx-field-patterns.md +461 -0
- package/optional-skills/dokkit/references/hwpx-structure.md +159 -0
- package/optional-skills/dokkit/references/image-opportunity-heuristics.md +121 -0
- package/optional-skills/dokkit/references/image-xml-patterns.md +338 -0
- package/optional-skills/dokkit/references/section-image-interleaving.md +346 -0
- package/optional-skills/dokkit/references/section-range-detection.md +118 -0
- package/optional-skills/dokkit/references/state-schema.md +143 -0
- package/optional-skills/dokkit/references/supported-formats.md +67 -0
- package/optional-skills/dokkit/scripts/compile_hwpx.py +134 -0
- package/optional-skills/dokkit/scripts/detect_fields.py +301 -0
- package/optional-skills/dokkit/scripts/detect_fields_hwpx.py +286 -0
- package/optional-skills/dokkit/scripts/export_pdf.py +99 -0
- package/optional-skills/dokkit/scripts/parse_hwpx.py +185 -0
- package/optional-skills/dokkit/scripts/parse_image_with_gemini.py +159 -0
- package/optional-skills/dokkit/scripts/parse_xlsx.py +98 -0
- package/optional-skills/dokkit/scripts/source_images.py +365 -0
- package/optional-skills/dokkit/scripts/validate_docx.py +142 -0
- package/optional-skills/dokkit/scripts/validate_hwpx.py +281 -0
- package/optional-skills/dokkit/scripts/validate_state.py +132 -0
- package/package.json +1 -1
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Compile an HWPX document from its unpacked working directory.
|
|
3
|
+
|
|
4
|
+
Usage:
|
|
5
|
+
python compile_hwpx.py <work_dir> <output.hwpx> [--reference <original.hwpx>]
|
|
6
|
+
|
|
7
|
+
Critical: mimetype must be the first file in the ZIP and stored uncompressed.
|
|
8
|
+
When --reference is given, preserves the original ZIP's file ordering and
|
|
9
|
+
per-file compression types (STORED vs DEFLATED). New files not present in the
|
|
10
|
+
reference are appended at the end with DEFLATED compression.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
import os
|
|
14
|
+
import sys
|
|
15
|
+
import zipfile
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def compile_hwpx(work_dir: str, output_path: str, reference_zip: str | None = None) -> str:
    """Repackage an HWPX from its unpacked working directory.

    Args:
        work_dir: Directory holding the unpacked HWPX contents.
        output_path: Destination ``.hwpx`` path (parents are created).
        reference_zip: Optional original ``.hwpx`` whose entry order and
            compression types should be reproduced.

    Returns:
        The output path as a string. A summary line is printed to stderr,
        plus a warning if ``mimetype`` is not the archive's first entry.
    """
    work = Path(work_dir)
    out = Path(output_path)
    out.parent.mkdir(parents=True, exist_ok=True)

    # Archive names (POSIX-style) for every file under work_dir, skipping
    # the mimetype entry (written separately) and editor backup files.
    all_work_files: set[str] = set()
    for folder, _subdirs, filenames in os.walk(work):
        for name in filenames:
            if name == "mimetype" or name.endswith(".bak"):
                continue
            rel = os.path.relpath(os.path.join(folder, name), work)
            all_work_files.add(rel.replace(os.sep, "/"))

    if reference_zip:
        _compile_with_reference(work, out, reference_zip, all_work_files)
    else:
        _compile_default(work, out, all_work_files)

    # Sanity check: HWPX requires mimetype to be the very first ZIP entry.
    with zipfile.ZipFile(out, 'r') as zf:
        entries = zf.namelist()
    if entries and entries[0] != "mimetype":
        print("Warning: mimetype is not the first entry in the archive", file=sys.stderr)

    print(f"Compiled: {out} ({out.stat().st_size:,} bytes)", file=sys.stderr)
    return str(out)
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
def _compile_with_reference(work: Path, out: Path, reference_zip: str, all_work_files: set[str]) -> None:
|
|
52
|
+
"""Compile preserving the reference ZIP's file order and compression types."""
|
|
53
|
+
ref_zip = zipfile.ZipFile(reference_zip)
|
|
54
|
+
ref_entries = [(info.filename, info.compress_type) for info in ref_zip.infolist()]
|
|
55
|
+
|
|
56
|
+
added: set[str] = set()
|
|
57
|
+
with zipfile.ZipFile(out, 'w') as zf:
|
|
58
|
+
# 1. mimetype first, stored
|
|
59
|
+
mimetype_path = work / "mimetype"
|
|
60
|
+
if mimetype_path.exists():
|
|
61
|
+
zf.write(str(mimetype_path), "mimetype", compress_type=zipfile.ZIP_STORED)
|
|
62
|
+
added.add("mimetype")
|
|
63
|
+
|
|
64
|
+
# 2. Files from reference in original order with original compression
|
|
65
|
+
for filename, compress_type in ref_entries:
|
|
66
|
+
if filename == "mimetype":
|
|
67
|
+
continue
|
|
68
|
+
file_path = work / filename
|
|
69
|
+
if file_path.exists():
|
|
70
|
+
zf.write(str(file_path), filename, compress_type=compress_type)
|
|
71
|
+
else:
|
|
72
|
+
# Fall back to original content
|
|
73
|
+
data = ref_zip.read(filename)
|
|
74
|
+
info = zipfile.ZipInfo(filename)
|
|
75
|
+
info.compress_type = compress_type
|
|
76
|
+
zf.writestr(info, data)
|
|
77
|
+
added.add(filename)
|
|
78
|
+
|
|
79
|
+
# 3. New files not in reference (BinData images etc.)
|
|
80
|
+
for arcname in sorted(all_work_files - added):
|
|
81
|
+
file_path = work / arcname
|
|
82
|
+
zf.write(str(file_path), arcname, compress_type=zipfile.ZIP_DEFLATED)
|
|
83
|
+
|
|
84
|
+
ref_zip.close()
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def _compile_default(work: Path, out: Path, all_work_files: set[str]) -> None:
|
|
88
|
+
"""Compile with default ordering (mimetype first, rest alphabetical, all deflated)."""
|
|
89
|
+
with zipfile.ZipFile(out, 'w') as zf:
|
|
90
|
+
# mimetype MUST be first and uncompressed
|
|
91
|
+
mimetype_path = work / "mimetype"
|
|
92
|
+
if mimetype_path.exists():
|
|
93
|
+
zf.write(str(mimetype_path), "mimetype", compress_type=zipfile.ZIP_STORED)
|
|
94
|
+
else:
|
|
95
|
+
print("Warning: mimetype file not found", file=sys.stderr)
|
|
96
|
+
|
|
97
|
+
# All other files with compression
|
|
98
|
+
for root, dirs, files in os.walk(work):
|
|
99
|
+
dirs.sort()
|
|
100
|
+
for file in sorted(files):
|
|
101
|
+
if file == "mimetype" or file.endswith(".bak"):
|
|
102
|
+
continue
|
|
103
|
+
file_path = os.path.join(root, file)
|
|
104
|
+
arcname = os.path.relpath(file_path, work)
|
|
105
|
+
zf.write(file_path, arcname, compress_type=zipfile.ZIP_DEFLATED)
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def main():
    """CLI entry point: parse argv, validate inputs, then compile."""
    argv = sys.argv
    if len(argv) < 3:
        print("Usage: python compile_hwpx.py <work_dir> <output.hwpx> [--reference <original.hwpx>]",
              file=sys.stderr)
        sys.exit(1)

    work_dir, output_path = argv[1], argv[2]

    # Optional --reference flag: value is the next positional token.
    reference_zip = None
    if "--reference" in argv:
        flag_pos = argv.index("--reference")
        if flag_pos + 1 >= len(argv):
            print("Error: --reference requires a path argument", file=sys.stderr)
            sys.exit(1)
        reference_zip = argv[flag_pos + 1]

    if not Path(work_dir).is_dir():
        print(f"Error: Not a directory: {work_dir}", file=sys.stderr)
        sys.exit(1)

    compile_hwpx(work_dir, output_path, reference_zip)


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,301 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Detect fillable fields in a DOCX document.xml file.
|
|
3
|
+
|
|
4
|
+
Usage:
|
|
5
|
+
python detect_fields.py <path-to-document.xml>
|
|
6
|
+
|
|
7
|
+
Output:
|
|
8
|
+
JSON array of detected fields to stdout.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import json
|
|
12
|
+
import re
|
|
13
|
+
import sys
|
|
14
|
+
import xml.etree.ElementTree as ET
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
# XML namespace map for ElementTree lookups in document.xml.
# Only "w" (WordprocessingML main) is referenced by the detectors in this
# script; the drawing/picture/relationship namespaces are declared for
# completeness alongside it.
NS = {
    "w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
    "wp": "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing",
    "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
    "pic": "http://schemas.openxmlformats.org/drawingml/2006/picture",
    "r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
}
|
|
25
|
+
|
|
26
|
+
# Keywords whose presence in a label marks a field as an image field.
# Korean list: photo, ID photo, passport photo, logo, signature, seal
# impression, stamp, official seal.
IMAGE_KEYWORDS_KO = ["사진", "증명사진", "여권사진", "로고", "서명", "날인", "도장", "직인"]
IMAGE_KEYWORDS_EN = ["photo", "picture", "logo", "signature", "stamp", "seal", "image", "portrait"]
IMAGE_KEYWORDS = IMAGE_KEYWORDS_KO + IMAGE_KEYWORDS_EN

# Keyword -> image_type classification (photo/logo/signature), matched by
# substring in insertion order by _classify_image_type; labels matching no
# key fall back to "figure".
IMAGE_TYPE_MAP = {
    "사진": "photo", "증명사진": "photo", "여권사진": "photo",
    "photo": "photo", "picture": "photo", "portrait": "photo", "image": "photo",
    "로고": "logo", "logo": "logo",
    "서명": "signature", "날인": "signature", "stamp": "signature", "seal": "signature",
    "도장": "signature", "직인": "signature",
}
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
def get_text(elem) -> str:
    """Extract all text from an element and its children (all <w:t> nodes)."""
    w_t = "{%s}t" % NS["w"]
    return "".join(node.text for node in elem.iter(w_t) if node.text)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def _classify_image_type(text: str) -> str:
    """Classify image type from text. Returns photo/logo/signature/figure."""
    needle = text.lower().strip()
    # First keyword (in IMAGE_TYPE_MAP insertion order) found in the text wins.
    hits = (img_type for keyword, img_type in IMAGE_TYPE_MAP.items() if keyword in needle)
    return next(hits, "figure")
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def _is_image_keyword(text: str) -> bool:
    """Check if text contains an image-related keyword."""
    haystack = text.lower().strip()
    for keyword in IMAGE_KEYWORDS:
        if keyword in haystack:
            return True
    return False
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def detect_placeholder_text(root) -> list[dict]:
    """Find {{placeholder}}, <<placeholder>> and [placeholder] patterns.

    Placeholders whose label contains an image keyword are skipped here —
    they are reported by detect_image_fields instead.
    """
    token_re = re.compile(r"\{\{([^}]+)\}\}|<<([^>]+)>>|\[([^\]]+)\]")
    found: list[dict] = []

    for idx, para in enumerate(root.iter("{%s}p" % NS["w"])):
        para_text = get_text(para)
        for m in token_re.finditer(para_text):
            # Exactly one of the three alternatives captured the label.
            raw_label = next(g for g in m.groups() if g)
            if _is_image_keyword(raw_label):
                continue
            found.append({
                "label": raw_label.strip(),
                "field_type": "placeholder_text",
                "pattern": m.group(0),
                "xml_path": f"p[{idx}]",
            })
    return found
|
|
84
|
+
|
|
85
|
+
|
|
86
|
+
def detect_empty_table_cells(root) -> list[dict]:
    """Find empty table cells immediately right of short label cells."""
    found: list[dict] = []
    w = NS["w"]

    for ti, table in enumerate(root.iter("{%s}tbl" % w)):
        for ri, row in enumerate(table.iter("{%s}tr" % w)):
            row_cells = list(row.iter("{%s}tc" % w))
            # Walk adjacent (label, candidate) cell pairs left to right.
            for ci, (cell, neighbour) in enumerate(zip(row_cells, row_cells[1:])):
                label = get_text(cell).strip()
                if not label or len(label) >= 50:
                    continue  # no label, or too long to be a field label
                if get_text(neighbour).strip():
                    continue  # neighbour already has content
                if _is_image_keyword(label):
                    continue  # image slots are reported by detect_image_fields
                found.append({
                    "label": label,
                    "field_type": "empty_cell",
                    "pattern": "(empty cell)",
                    "xml_path": f"tbl[{ti}]/tr[{ri}]/tc[{ci + 1}]",
                })
    return found
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def detect_underline_fields(root) -> list[dict]:
    """Find underline-only runs (blank line placeholders).

    A run counts as a blank placeholder when it carries an underline run
    property and its first <w:t> text is empty or made only of spaces and
    underscores. The label is the rest of the run's paragraph text when
    available, else "underline_field".
    """
    fields: list[dict] = []
    w = NS["w"]

    # Perf fix: build the run -> paragraph map once instead of rescanning
    # every paragraph for every candidate run (was O(paragraphs * runs)).
    # Matches the original semantics exactly: only direct children of <w:p>
    # are mapped, so runs nested deeper (e.g. in hyperlinks) get no label.
    parent_para: dict = {}
    for p in root.iter("{%s}p" % w):
        for child in p:
            parent_para[child] = p

    for i, r in enumerate(root.iter("{%s}r" % w)):
        rPr = r.find("{%s}rPr" % w)
        if rPr is None:
            continue
        if rPr.find("{%s}u" % w) is None:
            continue  # not underlined

        t = r.find("{%s}t" % w)
        if t is None or not t.text:
            continue
        text = t.text.strip()
        if text and not all(c in " _" for c in text):
            continue  # underlined real content, not a blank placeholder

        # Derive a label from the surrounding paragraph text, if any.
        label = "underline_field"
        parent_p = parent_para.get(r)
        if parent_p is not None:
            clean = get_text(parent_p).replace(t.text, "").strip()
            if clean:
                label = clean

        fields.append({
            "label": label,
            "field_type": "underline",
            "pattern": "(underline)",
            "xml_path": f"r[{i}]",
        })
    return fields
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def detect_content_controls(root) -> list[dict]:
    """Find structured document tags (content controls)."""
    w = NS["w"]
    found: list[dict] = []

    for idx, sdt in enumerate(root.iter("{%s}sdt" % w)):
        props = sdt.find("{%s}sdtPr" % w)
        if props is None:
            continue

        # Label precedence: friendly alias first, then the machine tag.
        label = "unknown"
        for prop_name in ("alias", "tag"):
            node = props.find("{%s}%s" % (w, prop_name))
            if node is not None:
                label = node.get("{%s}val" % w, "unknown")
                break

        found.append({
            "label": label,
            "field_type": "form_control",
            "pattern": "(content control)",
            "xml_path": f"sdt[{idx}]",
        })
    return found
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def detect_image_fields(root) -> list[dict]:
    """Detect image placeholders in a DOCX document.

    Detects, in three passes:
    - Image placeholder text: {{photo}}, {{사진}}, <<signature>>, etc.
    - Existing <w:drawing> elements in table cells (pre-positioned image slots)
    - Empty cells adjacent to image-keyword labels

    Passes 2 and 3 are mutually exclusive for a given cell: pass 2 only
    reports cells that contain a drawing, pass 3 only cells that don't.
    """
    fields = []
    placeholder_pattern = re.compile(
        r"\{\{([^}]+)\}\}|<<([^>]+)>>|\[([^\]]+)\]"
    )

    # 1. Detect image placeholder text ({{photo}}, <<signature>>, etc.)
    #    Mirrors detect_placeholder_text, but keeps only image-keyword labels.
    for i, p in enumerate(root.iter("{%s}p" % NS["w"])):
        text = get_text(p)
        for match in placeholder_pattern.finditer(text):
            # Exactly one of the three alternatives captured the label.
            label = match.group(1) or match.group(2) or match.group(3)
            if _is_image_keyword(label):
                fields.append({
                    "label": label.strip(),
                    "field_type": "image",
                    "image_type": _classify_image_type(label),
                    "pattern": match.group(0),
                    "xml_path": f"p[{i}]",
                })

    # 2. Detect existing <w:drawing> placeholders in table cells
    for ti, tbl in enumerate(root.iter("{%s}tbl" % NS["w"])):
        for ri, tr in enumerate(tbl.iter("{%s}tr" % NS["w"])):
            cells = list(tr.iter("{%s}tc" % NS["w"]))
            for ci, cell in enumerate(cells):
                drawings = list(cell.iter("{%s}drawing" % NS["w"]))
                if drawings:
                    # Cell has a drawing — check if adjacent cell has image-keyword label.
                    # Label fallback order: left neighbour, then right neighbour,
                    # then the generic "image_placeholder".
                    label_text = ""
                    if ci > 0:
                        label_text = get_text(cells[ci - 1]).strip()
                    if not _is_image_keyword(label_text) and ci + 1 < len(cells):
                        label_text = get_text(cells[ci + 1]).strip()
                    if not _is_image_keyword(label_text):
                        label_text = "image_placeholder"

                    fields.append({
                        "label": label_text,
                        "field_type": "image",
                        "image_type": _classify_image_type(label_text),
                        "pattern": "(existing drawing)",
                        "xml_path": f"tbl[{ti}]/tr[{ri}]/tc[{ci}]",
                    })

    # 3. Detect empty cells adjacent (to the right of) image-keyword labels
    for ti, tbl in enumerate(root.iter("{%s}tbl" % NS["w"])):
        for ri, tr in enumerate(tbl.iter("{%s}tr" % NS["w"])):
            cells = list(tr.iter("{%s}tc" % NS["w"]))
            for ci in range(len(cells) - 1):
                label_text = get_text(cells[ci]).strip()
                next_text = get_text(cells[ci + 1]).strip()

                if _is_image_keyword(label_text) and not next_text:
                    # Check the empty cell doesn't already have a drawing
                    # (those were reported by pass 2 above).
                    has_drawing = bool(list(cells[ci + 1].iter("{%s}drawing" % NS["w"])))
                    if not has_drawing:
                        fields.append({
                            "label": label_text,
                            "field_type": "image",
                            "image_type": _classify_image_type(label_text),
                            "pattern": "(empty cell, image label)",
                            "xml_path": f"tbl[{ti}]/tr[{ri}]/tc[{ci + 1}]",
                        })

    return fields
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
def detect_instruction_text(root) -> list[dict]:
    """Find instruction text patterns like (enter name here)."""
    instruction_re = re.compile(
        r"\(.*?(?:enter|type|input|write|fill|입력|기재|작성).*?\)",
        re.IGNORECASE,
    )
    found: list[dict] = []

    for idx, para in enumerate(root.iter("{%s}p" % NS["w"])):
        for m in instruction_re.finditer(get_text(para)):
            found.append({
                "label": m.group(0).strip("()"),
                "field_type": "instruction_text",
                "pattern": m.group(0),
                "xml_path": f"p[{idx}]",
            })
    return found
|
|
270
|
+
|
|
271
|
+
|
|
272
|
+
def main():
    """CLI entry point: run all detectors on document.xml, emit JSON."""
    if len(sys.argv) != 2:
        print("Usage: python detect_fields.py <document.xml>", file=sys.stderr)
        sys.exit(1)

    doc_path = Path(sys.argv[1])
    if not doc_path.exists():
        print(json.dumps({"error": f"File not found: {doc_path}"}))
        sys.exit(1)

    root = ET.parse(doc_path).getroot()

    # Detector order determines field ID numbering.
    detectors = (
        detect_placeholder_text,
        detect_empty_table_cells,
        detect_underline_fields,
        detect_content_controls,
        detect_instruction_text,
        detect_image_fields,
    )
    all_fields = [field for detect in detectors for field in detect(root)]

    # Assign sequential IDs (field_001, field_002, ...).
    for n, field in enumerate(all_fields, start=1):
        field["id"] = f"field_{n:03d}"

    print(json.dumps(all_fields, ensure_ascii=False, indent=2))


if __name__ == "__main__":
    main()
|