devlyn-cli 0.5.1 → 0.5.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/devlyn.js +1 -0
- package/optional-skills/better-auth-setup/SKILL.md +222 -11
- package/optional-skills/better-auth-setup/references/proxy-gotchas.md +148 -0
- package/optional-skills/better-auth-setup/references/proxy-setup.md +284 -0
- package/optional-skills/dokkit/ANALYSIS.md +198 -0
- package/optional-skills/dokkit/COMMANDS.md +365 -0
- package/optional-skills/dokkit/DOCX-XML.md +76 -0
- package/optional-skills/dokkit/EXPORT.md +102 -0
- package/optional-skills/dokkit/FILLING.md +377 -0
- package/optional-skills/dokkit/HWPX-XML.md +73 -0
- package/optional-skills/dokkit/IMAGE-SOURCING.md +127 -0
- package/optional-skills/dokkit/INGESTION.md +65 -0
- package/optional-skills/dokkit/SKILL.md +153 -0
- package/optional-skills/dokkit/STATE.md +60 -0
- package/optional-skills/dokkit/references/docx-field-patterns.md +151 -0
- package/optional-skills/dokkit/references/docx-structure.md +58 -0
- package/optional-skills/dokkit/references/field-detection-patterns.md +130 -0
- package/optional-skills/dokkit/references/hwpx-field-patterns.md +461 -0
- package/optional-skills/dokkit/references/hwpx-structure.md +159 -0
- package/optional-skills/dokkit/references/image-opportunity-heuristics.md +121 -0
- package/optional-skills/dokkit/references/image-xml-patterns.md +338 -0
- package/optional-skills/dokkit/references/section-image-interleaving.md +346 -0
- package/optional-skills/dokkit/references/section-range-detection.md +118 -0
- package/optional-skills/dokkit/references/state-schema.md +143 -0
- package/optional-skills/dokkit/references/supported-formats.md +67 -0
- package/optional-skills/dokkit/scripts/compile_hwpx.py +134 -0
- package/optional-skills/dokkit/scripts/detect_fields.py +301 -0
- package/optional-skills/dokkit/scripts/detect_fields_hwpx.py +286 -0
- package/optional-skills/dokkit/scripts/export_pdf.py +99 -0
- package/optional-skills/dokkit/scripts/parse_hwpx.py +185 -0
- package/optional-skills/dokkit/scripts/parse_image_with_gemini.py +159 -0
- package/optional-skills/dokkit/scripts/parse_xlsx.py +98 -0
- package/optional-skills/dokkit/scripts/source_images.py +365 -0
- package/optional-skills/dokkit/scripts/validate_docx.py +142 -0
- package/optional-skills/dokkit/scripts/validate_hwpx.py +281 -0
- package/optional-skills/dokkit/scripts/validate_state.py +132 -0
- package/package.json +1 -1
|
@@ -0,0 +1,286 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Detect fillable fields in an HWPX section XML file.
|
|
3
|
+
|
|
4
|
+
Usage:
|
|
5
|
+
python detect_fields_hwpx.py <path-to-section.xml>
|
|
6
|
+
|
|
7
|
+
Output:
|
|
8
|
+
JSON array of detected fields to stdout.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import json
|
|
12
|
+
import re
|
|
13
|
+
import sys
|
|
14
|
+
import xml.etree.ElementTree as ET
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
# XML namespaces used in HWPX (OWPML) section documents.
# Only "hp" is referenced in this module; the others are kept for completeness.
NS = {
    "hp": "http://www.hancom.co.kr/hwpml/2011/paragraph",
    "hs": "http://www.hancom.co.kr/hwpml/2011/section",
    "hc": "http://www.hancom.co.kr/hwpml/2011/common",
}

# Keywords that indicate image fields (Korean and English)
# KO terms cover: photo, ID photo, passport photo, logo, signature,
# and seal/stamp marks.
IMAGE_KEYWORDS_KO = ["사진", "증명사진", "여권사진", "로고", "서명", "날인", "도장", "직인"]
IMAGE_KEYWORDS_EN = ["photo", "picture", "logo", "signature", "stamp", "seal", "image", "portrait"]
IMAGE_KEYWORDS = IMAGE_KEYWORDS_KO + IMAGE_KEYWORDS_EN

# Map keywords to image_type classifier
# Keys are substring-matched against lowercased text in insertion order by
# _classify_image_type; text matching no key falls back to "figure".
IMAGE_TYPE_MAP = {
    "사진": "photo", "증명사진": "photo", "여권사진": "photo",
    "photo": "photo", "picture": "photo", "portrait": "photo", "image": "photo",
    "로고": "logo", "logo": "logo",
    "서명": "signature", "날인": "signature", "stamp": "signature", "seal": "signature",
    "도장": "signature", "직인": "signature",
}
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def get_text(elem) -> str:
    """Concatenate the text of every <hp:t> node under *elem* (inclusive)."""
    text_tag = "{%s}t" % NS["hp"]
    return "".join(node.text for node in elem.iter(text_tag) if node.text)
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _classify_image_type(text: str) -> str:
    """Classify image type from text. Returns photo/logo/signature/figure."""
    needle = text.lower().strip()
    # First keyword (in IMAGE_TYPE_MAP insertion order) found as a substring wins.
    return next(
        (kind for keyword, kind in IMAGE_TYPE_MAP.items() if keyword in needle),
        "figure",
    )
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def _is_image_keyword(text: str) -> bool:
    """Return True when *text* mentions any image-related keyword (case-insensitive)."""
    haystack = text.lower().strip()
    for keyword in IMAGE_KEYWORDS:
        if keyword in haystack:
            return True
    return False
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
def detect_empty_table_cells(root) -> list[dict]:
    """Find empty table cells adjacent to label cells in HWPX tables (excluding image keywords)."""
    tbl_tag = "{%s}tbl" % NS["hp"]
    tr_tag = "{%s}tr" % NS["hp"]
    tc_tag = "{%s}tc" % NS["hp"]

    found: list[dict] = []
    for ti, tbl in enumerate(root.iter(tbl_tag)):
        for ri, tr in enumerate(tbl.iter(tr_tag)):
            cells = list(tr.iter(tc_tag))
            # Look at each (label, value) pair of horizontally adjacent cells.
            for ci, (left, right) in enumerate(zip(cells, cells[1:])):
                label = get_text(left).strip()
                if not label or get_text(right).strip() or len(label) >= 50:
                    continue
                # Image-keyword labels are handled by detect_image_fields
                if _is_image_keyword(label):
                    continue
                found.append({
                    "label": label,
                    "field_type": "empty_cell",
                    "pattern": "(empty cell)",
                    "xml_path": f"tbl[{ti}]/tr[{ri}]/tc[{ci + 1}]",
                })
    return found
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def detect_instruction_text(root) -> list[dict]:
    """Find Korean instruction text patterns."""
    # Parenthesized guidance like "(이름을 입력)" / "(enter name)"
    instruction_re = re.compile(
        r"\(.*?(?:입력|기재|작성|enter|type|fill).*?\)",
        re.IGNORECASE
    )
    para_tag = "{%s}p" % NS["hp"]

    results: list[dict] = []
    for idx, para in enumerate(root.iter(para_tag)):
        paragraph_text = get_text(para)
        for hit in instruction_re.finditer(paragraph_text):
            results.append({
                "label": hit.group(0).strip("()"),
                "field_type": "instruction_text",
                "pattern": hit.group(0),
                "xml_path": f"p[{idx}]",
            })
    return results
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def detect_placeholder_text(root) -> list[dict]:
    """Find {{placeholder}} patterns in HWPX (excluding image keywords)."""
    placeholder_re = re.compile(r"\{\{([^}]+)\}\}|<<([^>]+)>>")
    para_tag = "{%s}p" % NS["hp"]

    results: list[dict] = []
    for idx, para in enumerate(root.iter(para_tag)):
        paragraph_text = get_text(para)
        for hit in placeholder_re.finditer(paragraph_text):
            name = hit.group(1) or hit.group(2)
            # Image-keyword placeholders are handled by detect_image_fields
            if _is_image_keyword(name):
                continue
            results.append({
                "label": name.strip(),
                "field_type": "placeholder_text",
                "pattern": hit.group(0),
                "xml_path": f"p[{idx}]",
            })
    return results
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def detect_image_fields(root) -> list[dict]:
    """Detect image placeholders in an HWPX section XML.

    Runs three passes, in this order (the order matters because field IDs
    are assigned sequentially by the caller):
      1. Image placeholder text: {{photo}}, {{사진}}, <<signature>>, etc.
      2. Existing <hp:pic> elements in table cells (pre-positioned image slots)
      3. Empty cells adjacent to image-keyword labels
    """
    para_tag = "{%s}p" % NS["hp"]
    tbl_tag = "{%s}tbl" % NS["hp"]
    tr_tag = "{%s}tr" % NS["hp"]
    tc_tag = "{%s}tc" % NS["hp"]
    pic_tag = "{%s}pic" % NS["hp"]
    placeholder_re = re.compile(r"\{\{([^}]+)\}\}|<<([^>]+)>>")

    found: list[dict] = []

    # Pass 1: placeholder text whose name is an image keyword
    for pi, para in enumerate(root.iter(para_tag)):
        for hit in placeholder_re.finditer(get_text(para)):
            name = hit.group(1) or hit.group(2)
            if not _is_image_keyword(name):
                continue
            found.append({
                "label": name.strip(),
                "field_type": "image",
                "image_type": _classify_image_type(name),
                "pattern": hit.group(0),
                "xml_path": f"p[{pi}]",
            })

    # Pass 2: cells that already contain a <hp:pic>. The label comes from a
    # neighbouring cell (left preferred, then right); when neither neighbour
    # carries an image keyword, a generic marker is used.
    for ti, tbl in enumerate(root.iter(tbl_tag)):
        for ri, tr in enumerate(tbl.iter(tr_tag)):
            cells = list(tr.iter(tc_tag))
            for ci, cell in enumerate(cells):
                if not any(True for _ in cell.iter(pic_tag)):
                    continue
                label = get_text(cells[ci - 1]).strip() if ci > 0 else ""
                if not _is_image_keyword(label) and ci + 1 < len(cells):
                    label = get_text(cells[ci + 1]).strip()
                if not _is_image_keyword(label):
                    label = "image_placeholder"
                found.append({
                    "label": label,
                    "field_type": "image",
                    "image_type": _classify_image_type(label),
                    "pattern": "(existing pic)",
                    "xml_path": f"tbl[{ti}]/tr[{ri}]/tc[{ci}]",
                })

    # Pass 3: empty cell to the right of an image-keyword label
    for ti, tbl in enumerate(root.iter(tbl_tag)):
        for ri, tr in enumerate(tbl.iter(tr_tag)):
            cells = list(tr.iter(tc_tag))
            for ci in range(len(cells) - 1):
                label = get_text(cells[ci]).strip()
                if not _is_image_keyword(label) or get_text(cells[ci + 1]).strip():
                    continue
                # Cells that already hold a <hp:pic> were reported by pass 2
                if any(True for _ in cells[ci + 1].iter(pic_tag)):
                    continue
                found.append({
                    "label": label,
                    "field_type": "image",
                    "image_type": _classify_image_type(label),
                    "pattern": "(empty cell, image label)",
                    "xml_path": f"tbl[{ti}]/tr[{ri}]/tc[{ci + 1}]",
                })

    return found
|
|
196
|
+
|
|
197
|
+
|
|
198
|
+
def _build_nested_tip_set(root) -> set:
    """Build set of table element IDs that are nested inside subList elements."""
    tbl_tag = "{%s}tbl" % NS["hp"]
    sub_tag = "{%s}subList" % NS["hp"]
    # Tables reached through outer-table -> subList -> table are "nested".
    return {
        id(inner)
        for outer in root.iter(tbl_tag)
        for sub_list in outer.iter(sub_tag)
        for inner in sub_list.iter(tbl_tag)
    }
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
def detect_tip_boxes(root) -> list[dict]:
    """Detect writing tip boxes (작성 팁) — 1×1 tables with ※ guidance text."""
    guidance_re = re.compile(r"^※|작성\s?팁|작성\s?요령")
    nested_ids = _build_nested_tip_set(root)
    tbl_tag = "{%s}tbl" % NS["hp"]

    boxes: list[dict] = []
    for ti, tbl in enumerate(root.iter(tbl_tag)):
        # Only single-cell (1 row x 1 col) tables can be tip boxes
        if (tbl.get("rowCnt", ""), tbl.get("colCnt", "")) != ("1", "1"):
            continue
        body = get_text(tbl).strip()
        if not body or guidance_re.search(body) is None:
            continue
        preview = body[:60] + ("..." if len(body) > 60 else "")
        boxes.append({
            "label": preview,
            "field_type": "tip_box",
            "action": "delete",
            "container": "nested" if id(tbl) in nested_ids else "standalone",
            "pattern": "(tip box: 1×1 table with ※ text)",
            "xml_path": f"tbl[{ti}]",
        })
    return boxes
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
def detect_date_fields(root) -> list[dict]:
    """Find date component cells (cells before 년/월/일 markers)."""
    tbl_tag = "{%s}tbl" % NS["hp"]
    tr_tag = "{%s}tr" % NS["hp"]
    tc_tag = "{%s}tc" % NS["hp"]
    marker_to_part = {"년": "year", "월": "month", "일": "day"}

    found: list[dict] = []
    for ti, tbl in enumerate(root.iter(tbl_tag)):
        for ri, tr in enumerate(tbl.iter(tr_tag)):
            cells = list(tr.iter(tc_tag))
            for ci, cell in enumerate(cells):
                part = marker_to_part.get(get_text(cell).strip())
                if part is None or ci == 0:
                    continue
                # Only report the preceding cell when it is still blank
                if get_text(cells[ci - 1]).strip():
                    continue
                found.append({
                    "label": part,
                    "field_type": "empty_cell",
                    "pattern": f"(date: {part})",
                    "xml_path": f"tbl[{ti}]/tr[{ri}]/tc[{ci - 1}]",
                })
    return found
|
|
255
|
+
|
|
256
|
+
|
|
257
|
+
def main():
    """CLI entry point: detect fillable fields in the section XML named in argv."""
    if len(sys.argv) != 2:
        print("Usage: python detect_fields_hwpx.py <section.xml>", file=sys.stderr)
        sys.exit(1)

    path = Path(sys.argv[1])
    if not path.exists():
        print(json.dumps({"error": f"File not found: {path}"}))
        sys.exit(1)

    root = ET.parse(path).getroot()

    # Detector order is significant: it determines the sequential field IDs
    detectors = (
        detect_empty_table_cells,
        detect_instruction_text,
        detect_placeholder_text,
        detect_date_fields,
        detect_image_fields,
        detect_tip_boxes,
    )
    all_fields = [field for detect in detectors for field in detect(root)]

    for i, field in enumerate(all_fields, start=1):
        field["id"] = f"field_{i:03d}"

    print(json.dumps(all_fields, ensure_ascii=False, indent=2))


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Export a document to PDF using LibreOffice.
|
|
3
|
+
|
|
4
|
+
Usage:
|
|
5
|
+
python export_pdf.py <input_file> <output.pdf>
|
|
6
|
+
|
|
7
|
+
Requires:
|
|
8
|
+
LibreOffice installed and 'soffice' in PATH.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import shutil
|
|
12
|
+
import subprocess
|
|
13
|
+
import sys
|
|
14
|
+
import tempfile
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def find_soffice() -> str:
    """Locate the LibreOffice 'soffice' executable.

    Checks PATH first, then a handful of well-known install locations on
    Windows, Linux, and macOS. Returns "" when nothing is found.
    """
    on_path = shutil.which("soffice")
    if on_path:
        return on_path

    fallback_locations = (
        r"C:\Program Files\LibreOffice\program\soffice.exe",
        r"C:\Program Files (x86)\LibreOffice\program\soffice.exe",
        "/usr/bin/soffice",
        "/usr/local/bin/soffice",
        "/Applications/LibreOffice.app/Contents/MacOS/soffice",
    )
    return next((p for p in fallback_locations if Path(p).exists()), "")
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def export_pdf(input_file: str, output_file: str) -> str:
    """Convert a document to PDF using headless LibreOffice.

    Args:
        input_file: Path to the source document (any format LibreOffice reads).
        output_file: Destination path for the generated PDF.

    Returns:
        The absolute output path as a string.

    Exits with status 1 (message on stderr) when LibreOffice is missing,
    the input does not exist, or the conversion fails/times out.
    """
    soffice = find_soffice()
    if not soffice:
        print("Error: LibreOffice not found. Install it or add 'soffice' to PATH.",
              file=sys.stderr)
        sys.exit(1)

    input_path = Path(input_file).resolve()
    output_path = Path(output_file).resolve()

    if not input_path.exists():
        print(f"Error: Input file not found: {input_path}", file=sys.stderr)
        sys.exit(1)

    # Use a temp directory for LibreOffice output so stale PDFs from previous
    # runs can never be picked up by mistake.
    with tempfile.TemporaryDirectory() as tmpdir:
        cmd = [
            soffice,
            "--headless",
            "--convert-to", "pdf",
            "--outdir", tmpdir,
            str(input_path),
        ]

        try:
            result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
        except subprocess.TimeoutExpired:
            print("Error: LibreOffice conversion timed out", file=sys.stderr)
            sys.exit(1)

        if result.returncode != 0:
            print(f"Error: LibreOffice conversion failed: {result.stderr}", file=sys.stderr)
            sys.exit(1)

        # LibreOffice names the converted file after the input stem; prefer
        # that exact file instead of grabbing an arbitrary *.pdf, and fall
        # back to a deterministic (sorted) glob if the naming ever differs.
        produced = Path(tmpdir) / (input_path.stem + ".pdf")
        if not produced.exists():
            pdf_files = sorted(Path(tmpdir).glob("*.pdf"))
            if not pdf_files:
                print("Error: No PDF file generated", file=sys.stderr)
                sys.exit(1)
            produced = pdf_files[0]

        # Move to output location
        output_path.parent.mkdir(parents=True, exist_ok=True)
        shutil.move(str(produced), str(output_path))

    size = output_path.stat().st_size
    print(f"Exported: {output_path} ({size:,} bytes)", file=sys.stderr)
    return str(output_path)
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def main():
    """CLI entry point: convert argv[1] to the PDF path given in argv[2]."""
    if len(sys.argv) != 3:
        print("Usage: python export_pdf.py <input_file> <output.pdf>", file=sys.stderr)
        sys.exit(1)

    source, destination = sys.argv[1], sys.argv[2]
    export_pdf(source, destination)


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""Parse HWPX files into Dokkit's dual-file format (Markdown + JSON sidecar).
|
|
3
|
+
|
|
4
|
+
HWPX is a ZIP archive containing XML files following the OWPML standard.
|
|
5
|
+
Structure: Contents/section0.xml, Contents/section1.xml, etc.
|
|
6
|
+
|
|
7
|
+
Usage:
|
|
8
|
+
python parse_hwpx.py <input.hwpx>
|
|
9
|
+
|
|
10
|
+
Output:
|
|
11
|
+
JSON to stdout with 'content_md' and 'metadata' fields.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
import json
|
|
15
|
+
import sys
|
|
16
|
+
import zipfile
|
|
17
|
+
import xml.etree.ElementTree as ET
|
|
18
|
+
from datetime import datetime
|
|
19
|
+
from pathlib import Path
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
# HWPX XML namespaces
# NOTE(review): this module resolves tags by splitting on "}" rather than via
# these prefixes, so the table is informational only. "hc" is declared here as
# ".../2011/core" while detect_fields_hwpx.py uses ".../2011/common" — confirm
# which URI the target documents actually declare.
NS = {
    "hp": "http://www.hancom.co.kr/hwpml/2011/paragraph",
    "hs": "http://www.hancom.co.kr/hwpml/2011/section",
    "hc": "http://www.hancom.co.kr/hwpml/2011/core",
    "hh": "http://www.hancom.co.kr/hwpml/2011/head",
}
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def extract_text_from_element(elem) -> str:
    """Recursively extract all text from an XML element and its children.

    Text-run nodes (<t>/<text>, namespace ignored) contribute their text
    directly; every other child element is descended into. Tail text after
    each child is preserved.
    """
    parts = [elem.text] if elem.text else []

    for child in elem:
        local = child.tag.rsplit("}", 1)[-1]
        if local in ("t", "text"):
            if child.text:
                parts.append(child.text)
        else:
            # run, lineseg, and any other container: recurse
            parts.append(extract_text_from_element(child))
        if child.tail:
            parts.append(child.tail)

    return "".join(parts)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def parse_table(table_elem) -> list[list[str]]:
    """Parse a table element into a 2D list of stripped cell strings.

    Only direct <tr> children count as rows and only their direct <tc>
    children as cells; rows that yield no cells are dropped.
    """
    parsed_rows: list[list[str]] = []
    for maybe_row in table_elem:
        if maybe_row.tag.rsplit("}", 1)[-1] != "tr":
            continue
        row_cells = [
            extract_text_from_element(candidate).strip()
            for candidate in maybe_row
            if candidate.tag.rsplit("}", 1)[-1] == "tc"
        ]
        if row_cells:
            parsed_rows.append(row_cells)
    return parsed_rows
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def parse_section(xml_content: str, section_name: str) -> tuple[str, dict]:
    """Parse a single section XML and return markdown content + key-value pairs.

    Args:
        xml_content: Raw XML text of one HWPX section.
        section_name: Heading used for this section in the markdown output.

    Returns:
        (markdown, key_value_pairs) where the pairs come from table rows
        whose first two cells are non-empty and whose label is < 50 chars.
    """
    root = ET.fromstring(xml_content)
    content_parts = [f"## {section_name}\n"]
    key_value_pairs: dict = {}

    def local_tag(e) -> str:
        # Namespace-agnostic tag name
        return e.tag.split("}")[-1] if "}" in e.tag else e.tag

    # BUG FIX: root.iter() visits *every* descendant, so a <p> inside a table
    # cell (or a table nested inside another table) used to be emitted twice —
    # once as part of the rendered table and once standalone. Pre-compute the
    # elements that live inside a table and skip them in the main walk.
    inside_table: set[int] = set()
    for elem in root.iter():
        if local_tag(elem) == "tbl":
            for sub in elem.iter():
                if sub is not elem:
                    inside_table.add(id(sub))

    for elem in root.iter():
        if id(elem) in inside_table:
            continue
        tag = local_tag(elem)

        if tag == "p":
            text = extract_text_from_element(elem).strip()
            if text:
                content_parts.append(text)

        elif tag == "tbl":
            rows = parse_table(elem)
            if not rows:
                continue
            # Pad rows to a uniform width, then render as a markdown table
            max_cols = max(len(r) for r in rows)
            for r in rows:
                r.extend([""] * (max_cols - len(r)))

            content_parts.append("")
            content_parts.append("| " + " | ".join(rows[0]) + " |")
            content_parts.append("| " + " | ".join(["---"] * max_cols) + " |")
            for row in rows[1:]:
                content_parts.append("| " + " | ".join(row) + " |")
            content_parts.append("")

            # Extract key-value pairs from label/value style rows
            for row in rows:
                if len(row) >= 2 and row[0] and row[1]:
                    label = row[0].strip()
                    if len(label) < 50:
                        key_value_pairs[label] = row[1].strip()

    return "\n".join(content_parts), key_value_pairs
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def parse_hwpx(file_path: str) -> dict:
    """Parse an HWPX file and return content + metadata.

    HWPX is a ZIP archive whose body XML lives in Contents/section*.xml.
    Returns {"error": ...} for non-ZIP input; otherwise a dict with
    'content_md' (markdown) and 'metadata'.
    """
    path = Path(file_path)

    if not zipfile.is_zipfile(path):
        return {"error": f"Not a valid HWPX (ZIP) file: {path}"}

    section_names: list[str] = []
    markdown_chunks: list[str] = []
    merged_kvp: dict = {}

    with zipfile.ZipFile(path, "r") as archive:
        # Prefer the canonical Contents/sectionN.xml entries
        xml_entries = [
            name for name in archive.namelist()
            if name.startswith("Contents/section") and name.endswith(".xml")
        ]
        if not xml_entries:
            # Fall back to any section-like XML entry
            xml_entries = [
                name for name in archive.namelist()
                if "section" in name.lower() and name.endswith(".xml")
            ]

        for index, entry in enumerate(sorted(xml_entries), start=1):
            title = f"Section {index}"
            section_names.append(title)
            chunk, pairs = parse_section(archive.read(entry).decode("utf-8"), title)
            markdown_chunks.append(chunk)
            merged_kvp.update(pairs)

    return {
        "content_md": f"# {path.stem}\n\n" + "\n\n".join(markdown_chunks),
        "metadata": {
            "file_name": path.name,
            "file_type": "hwpx",
            "parse_date": datetime.now().isoformat(),
            "key_value_pairs": merged_kvp,
            "sections": section_names,
            "section_count": len(section_names),
        },
    }
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
def main():
    """CLI entry point: parse the HWPX named in argv and print JSON to stdout."""
    if len(sys.argv) != 2:
        print("Usage: python parse_hwpx.py <input.hwpx>", file=sys.stderr)
        sys.exit(1)

    file_path = sys.argv[1]
    if not Path(file_path).exists():
        print(json.dumps({"error": f"File not found: {file_path}"}))
        sys.exit(1)

    print(json.dumps(parse_hwpx(file_path), ensure_ascii=False, indent=2))


if __name__ == "__main__":
    main()
|