devlyn-cli 0.5.2 → 0.5.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/devlyn.js +1 -0
- package/config/commands/devlyn.team-resolve.md +31 -2
- package/optional-skills/dokkit/ANALYSIS.md +198 -0
- package/optional-skills/dokkit/COMMANDS.md +365 -0
- package/optional-skills/dokkit/DOCX-XML.md +76 -0
- package/optional-skills/dokkit/EXPORT.md +102 -0
- package/optional-skills/dokkit/FILLING.md +377 -0
- package/optional-skills/dokkit/HWPX-XML.md +73 -0
- package/optional-skills/dokkit/IMAGE-SOURCING.md +127 -0
- package/optional-skills/dokkit/INGESTION.md +65 -0
- package/optional-skills/dokkit/SKILL.md +153 -0
- package/optional-skills/dokkit/STATE.md +60 -0
- package/optional-skills/dokkit/references/docx-field-patterns.md +151 -0
- package/optional-skills/dokkit/references/docx-structure.md +58 -0
- package/optional-skills/dokkit/references/field-detection-patterns.md +130 -0
- package/optional-skills/dokkit/references/hwpx-field-patterns.md +461 -0
- package/optional-skills/dokkit/references/hwpx-structure.md +159 -0
- package/optional-skills/dokkit/references/image-opportunity-heuristics.md +121 -0
- package/optional-skills/dokkit/references/image-xml-patterns.md +338 -0
- package/optional-skills/dokkit/references/section-image-interleaving.md +346 -0
- package/optional-skills/dokkit/references/section-range-detection.md +118 -0
- package/optional-skills/dokkit/references/state-schema.md +143 -0
- package/optional-skills/dokkit/references/supported-formats.md +67 -0
- package/optional-skills/dokkit/scripts/compile_hwpx.py +134 -0
- package/optional-skills/dokkit/scripts/detect_fields.py +301 -0
- package/optional-skills/dokkit/scripts/detect_fields_hwpx.py +286 -0
- package/optional-skills/dokkit/scripts/export_pdf.py +99 -0
- package/optional-skills/dokkit/scripts/parse_hwpx.py +185 -0
- package/optional-skills/dokkit/scripts/parse_image_with_gemini.py +159 -0
- package/optional-skills/dokkit/scripts/parse_xlsx.py +98 -0
- package/optional-skills/dokkit/scripts/source_images.py +365 -0
- package/optional-skills/dokkit/scripts/validate_docx.py +142 -0
- package/optional-skills/dokkit/scripts/validate_hwpx.py +281 -0
- package/optional-skills/dokkit/scripts/validate_state.py +132 -0
- package/package.json +1 -1
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
# DOCX XML Knowledge
|
|
2
|
+
|
|
3
|
+
Open XML structure for surgical DOCX document editing.
|
|
4
|
+
|
|
5
|
+
## DOCX Structure
|
|
6
|
+
|
|
7
|
+
A DOCX file is a ZIP archive:
|
|
8
|
+
```
|
|
9
|
+
[Content_Types].xml — MIME type mappings
|
|
10
|
+
_rels/.rels — root relationships
|
|
11
|
+
word/
|
|
12
|
+
document.xml — main document body (PRIMARY TARGET)
|
|
13
|
+
styles.xml — style definitions
|
|
14
|
+
numbering.xml — list numbering definitions
|
|
15
|
+
settings.xml — document settings
|
|
16
|
+
fontTable.xml — font declarations
|
|
17
|
+
theme/theme1.xml — theme colors/fonts
|
|
18
|
+
media/ — embedded images
|
|
19
|
+
_rels/document.xml.rels — document relationships
|
|
20
|
+
docProps/
|
|
21
|
+
app.xml — application metadata
|
|
22
|
+
core.xml — document metadata
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
## Key XML Elements
|
|
26
|
+
|
|
27
|
+
### Namespace
|
|
28
|
+
```xml
|
|
29
|
+
xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main"
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
### Document Body
|
|
33
|
+
```xml
|
|
34
|
+
<w:body>
|
|
35
|
+
<w:p> <!-- paragraph -->
|
|
36
|
+
<w:pPr> <!-- paragraph properties -->
|
|
37
|
+
<w:r> <!-- run (text with formatting) -->
|
|
38
|
+
<w:rPr> <!-- run properties (font, size, bold, etc.) -->
|
|
39
|
+
<w:t> <!-- text content -->
|
|
40
|
+
</w:r>
|
|
41
|
+
</w:p>
|
|
42
|
+
</w:body>
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
### Tables
|
|
46
|
+
```xml
|
|
47
|
+
<w:tbl>
|
|
48
|
+
<w:tblPr> <!-- table properties -->
|
|
49
|
+
<w:tblGrid> <!-- column widths -->
|
|
50
|
+
<w:tr> <!-- table row -->
|
|
51
|
+
<w:trPr> <!-- row properties -->
|
|
52
|
+
<w:tc> <!-- table cell -->
|
|
53
|
+
<w:tcPr> <!-- cell properties (width, merge, borders) -->
|
|
54
|
+
<w:p> <!-- cell content (paragraph) -->
|
|
55
|
+
</w:tc>
|
|
56
|
+
</w:tr>
|
|
57
|
+
</w:tbl>
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
### Content Controls (Structured Document Tags)
|
|
61
|
+
```xml
|
|
62
|
+
<w:sdt>
|
|
63
|
+
<w:sdtPr>
|
|
64
|
+
<w:alias w:val="FieldName"/>
|
|
65
|
+
<w:tag w:val="field_tag"/>
|
|
66
|
+
</w:sdtPr>
|
|
67
|
+
<w:sdtContent>
|
|
68
|
+
<w:p><w:r><w:t>Placeholder</w:t></w:r></w:p>
|
|
69
|
+
</w:sdtContent>
|
|
70
|
+
</w:sdt>
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
## References
|
|
74
|
+
|
|
75
|
+
See `references/docx-structure.md` for unpacking, repackaging, and critical rules.
|
|
76
|
+
See `references/docx-field-patterns.md` for field detection patterns (placeholders, empty cells, underline, content controls, instruction text, tip boxes).
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
# Export Knowledge
|
|
2
|
+
|
|
3
|
+
Document compilation and format conversion for the dokkit-exporter agent.
|
|
4
|
+
|
|
5
|
+
## Compilation (Repackaging)
|
|
6
|
+
|
|
7
|
+
### DOCX Compilation
|
|
8
|
+
```python
|
|
9
|
+
import os, zipfile
|
|
10
|
+
|
|
11
|
+
def compile_docx(work_dir: str, output_path: str):
    """Repackage a DOCX from its unpacked working directory.

    Every file under ``work_dir`` is written to a new deflate-compressed ZIP
    at ``output_path``, using its path relative to ``work_dir`` as the
    archive name.

    Args:
        work_dir: Directory holding the unpacked DOCX parts.
        output_path: Destination of the rebuilt ``.docx`` archive.

    Returns:
        ``output_path``, unchanged.
    """
    # The archive file already exists on disk while we walk the tree; if it
    # lives inside work_dir it must not be packed into itself.
    output_abs = os.path.abspath(output_path)
    with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as zf:
        for root, dirs, files in os.walk(work_dir):
            # Sorting in place makes os.walk descend in a stable order, so
            # the archive layout does not depend on the filesystem.
            dirs.sort()
            for file in sorted(files):
                file_path = os.path.join(root, file)
                if os.path.abspath(file_path) == output_abs:
                    continue
                arcname = os.path.relpath(file_path, work_dir)
                zf.write(file_path, arcname)
    return output_path
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
### HWPX Compilation
|
|
23
|
+
```python
|
|
24
|
+
import os, zipfile
|
|
25
|
+
|
|
26
|
+
def compile_hwpx(work_dir: str, output_path: str):
    """Repackage HWPX. CRITICAL: mimetype must be first and uncompressed.

    The root ``mimetype`` file (when present) is written first with
    ``ZIP_STORED``; every other file is then added deflate-compressed under
    its path relative to ``work_dir``. ``.bak`` files are skipped.

    Args:
        work_dir: Directory holding the unpacked HWPX parts.
        output_path: Destination of the rebuilt ``.hwpx`` archive.

    Returns:
        ``output_path``, unchanged.
    """
    # The archive file already exists on disk while we walk the tree; if it
    # lives inside work_dir it must not be packed into itself.
    output_abs = os.path.abspath(output_path)
    with zipfile.ZipFile(output_path, 'w') as zf:
        mimetype_path = os.path.join(work_dir, "mimetype")
        if os.path.exists(mimetype_path):
            zf.write(mimetype_path, "mimetype", compress_type=zipfile.ZIP_STORED)
        for root, dirs, files in os.walk(work_dir):
            # Stable traversal order -> reproducible archive layout.
            dirs.sort()
            for file in sorted(files):
                if file.endswith(".bak"):
                    continue
                file_path = os.path.join(root, file)
                arcname = os.path.relpath(file_path, work_dir)
                # Only the root-level mimetype was written above; a file that
                # merely shares the name in a subdirectory must be kept.
                if arcname == "mimetype":
                    continue
                if os.path.abspath(file_path) == output_abs:
                    continue
                zf.write(file_path, arcname, compress_type=zipfile.ZIP_DEFLATED)
    return output_path
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
### Scripts
|
|
43
|
+
```bash
|
|
44
|
+
python .claude/skills/dokkit/scripts/compile_hwpx.py <work_dir> <output.hwpx>
|
|
45
|
+
python .claude/skills/dokkit/scripts/export_pdf.py <input> <output.pdf>
|
|
46
|
+
```
|
|
47
|
+
|
|
48
|
+
## PDF Conversion
|
|
49
|
+
|
|
50
|
+
### Using LibreOffice
|
|
51
|
+
```bash
|
|
52
|
+
soffice --headless --convert-to pdf --outdir <output_dir> <input_file>
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
### Using Python Script
|
|
56
|
+
```bash
|
|
57
|
+
python .claude/skills/dokkit/scripts/export_pdf.py <input> <output.pdf>
|
|
58
|
+
```
|
|
59
|
+
|
|
60
|
+
## Cross-Format Conversion
|
|
61
|
+
|
|
62
|
+
Use LibreOffice as intermediary:
|
|
63
|
+
```bash
|
|
64
|
+
soffice --headless --convert-to hwpx --outdir <dir> <input.docx>
|
|
65
|
+
soffice --headless --convert-to docx --outdir <dir> <input.hwpx>
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
Cross-format conversion may lose formatting fidelity. Always warn the user.
|
|
69
|
+
|
|
70
|
+
## Validation
|
|
71
|
+
|
|
72
|
+
After compilation, verify:
|
|
73
|
+
1. Output file is a valid ZIP archive
|
|
74
|
+
2. File size is reasonable (> 0 bytes)
|
|
75
|
+
3. For DOCX: `[Content_Types].xml` exists at root
|
|
76
|
+
4. For HWPX: `mimetype` is first entry and correct value
|
|
77
|
+
|
|
78
|
+
```python
|
|
79
|
+
import zipfile
|
|
80
|
+
|
|
81
|
+
def validate_archive(path: str, doc_type: str) -> list[str]:
    """Check a compiled DOCX/HWPX archive and return a list of problems.

    Args:
        path: Path of the archive to inspect.
        doc_type: ``"docx"`` or ``"hwpx"``; any other value gets only the
            "is it a ZIP" check.

    Returns:
        Human-readable error strings; an empty list means no problem found.
    """
    errors = []
    try:
        with zipfile.ZipFile(path, 'r') as zf:
            names = zf.namelist()
            if doc_type == "docx":
                if "[Content_Types].xml" not in names:
                    errors.append("Missing [Content_Types].xml")
            elif doc_type == "hwpx":
                if not names or names[0] != "mimetype":
                    errors.append("mimetype is not the first entry")
                else:
                    # Position alone is not enough: the entry must also be
                    # stored uncompressed and carry the HWPX media type.
                    info = zf.infolist()[0]
                    if info.compress_type != zipfile.ZIP_STORED:
                        errors.append("mimetype is not stored uncompressed")
                    value = zf.read(info).decode("utf-8", errors="replace").strip()
                    if value != "application/hwp+zip":
                        errors.append("Unexpected mimetype value: %r" % value)
    except zipfile.BadZipFile:
        errors.append("Output is not a valid ZIP archive")
    return errors
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
## Rules
|
|
98
|
+
|
|
99
|
+
- Never modify filled XML during export — only repackage
|
|
100
|
+
- ZIP structure must match the original (`[Content_Types].xml` at root for DOCX, `mimetype` first and uncompressed for HWPX)
|
|
101
|
+
- Skip .bak files during HWPX compilation
|
|
102
|
+
- Report clear errors if conversion tools unavailable
|
|
@@ -0,0 +1,377 @@
|
|
|
1
|
+
# Filling Knowledge
|
|
2
|
+
|
|
3
|
+
Field detection, matching strategies, and surgical XML editing rules for the dokkit-filler (and shared with dokkit-analyzer).
|
|
4
|
+
|
|
5
|
+
## Table of Contents
|
|
6
|
+
|
|
7
|
+
- [Field Detection Scripts](#field-detection-scripts)
|
|
8
|
+
- [Matching Strategy](#matching-strategy)
|
|
9
|
+
- [XML Surgery Rules](#xml-surgery-rules)
|
|
10
|
+
- [Image Insertion Surgery](#image-insertion-surgery)
|
|
11
|
+
|
|
12
|
+
---
|
|
13
|
+
|
|
14
|
+
## Field Detection Scripts
|
|
15
|
+
|
|
16
|
+
### DOCX Field Detection
|
|
17
|
+
```bash
|
|
18
|
+
python .claude/skills/dokkit/scripts/detect_fields.py <document.xml>
|
|
19
|
+
```
|
|
20
|
+
Outputs JSON array of detected fields with labels, types, and XML paths.
|
|
21
|
+
|
|
22
|
+
### HWPX Field Detection
|
|
23
|
+
```bash
|
|
24
|
+
python .claude/skills/dokkit/scripts/detect_fields_hwpx.py <section.xml>
|
|
25
|
+
```
|
|
26
|
+
Same output format, adapted for HWPX XML structure.
|
|
27
|
+
|
|
28
|
+
### DOCX Validation
|
|
29
|
+
```bash
|
|
30
|
+
python .claude/skills/dokkit/scripts/validate_docx.py <work_dir>
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
### HWPX Validation
|
|
34
|
+
```bash
|
|
35
|
+
python .claude/skills/dokkit/scripts/validate_hwpx.py <work_dir>
|
|
36
|
+
```
|
|
37
|
+
|
|
38
|
+
## Matching Strategy
|
|
39
|
+
|
|
40
|
+
### Step 1: Exact Match
|
|
41
|
+
`field.label == source.key` — confidence: high
|
|
42
|
+
|
|
43
|
+
### Step 2: Normalized Match
|
|
44
|
+
Lowercase, strip whitespace, remove punctuation — confidence: high
|
|
45
|
+
|
|
46
|
+
### Step 3: Semantic Match
|
|
47
|
+
"Full Name" matches "Name" — confidence: high
|
|
48
|
+
"Phone Number" matches "Contact" — confidence: medium
|
|
49
|
+
|
|
50
|
+
### Step 4: Cross-Language Match
|
|
51
|
+
"성명" matches "Name" — confidence: medium
|
|
52
|
+
"주소" matches "Address" — confidence: medium
|
|
53
|
+
|
|
54
|
+
### Step 5: Context Inference
|
|
55
|
+
If field is in "Education" section and source has education data — confidence: low
|
|
56
|
+
Generic fields like "비고" (Remarks) — skip or flag
|
|
57
|
+
|
|
58
|
+
## XML Surgery Rules
|
|
59
|
+
|
|
60
|
+
### Rule 1: Preserve Run Properties
|
|
61
|
+
```xml
|
|
62
|
+
<!-- BEFORE -->
|
|
63
|
+
<w:r><w:rPr><w:b/><w:sz w:val="24"/></w:rPr><w:t>{{name}}</w:t></w:r>
|
|
64
|
+
<!-- AFTER — rPr is IDENTICAL -->
|
|
65
|
+
<w:r><w:rPr><w:b/><w:sz w:val="24"/></w:rPr><w:t>John Doe</w:t></w:r>
|
|
66
|
+
```
|
|
67
|
+
|
|
68
|
+
### Rule 2: Handle xml:space
|
|
69
|
+
When inserting text with leading/trailing spaces:
|
|
70
|
+
```xml
|
|
71
|
+
<w:t xml:space="preserve"> John Doe </w:t>
|
|
72
|
+
```
|
|
73
|
+
|
|
74
|
+
### Rule 3: Copy Formatting for Empty Cells
|
|
75
|
+
Copy run properties from the label cell. Always sanitize:
|
|
76
|
+
```python
|
|
77
|
+
# Build a new run in the empty cell paragraph, reusing the label run's
# formatting minus the guide-text styling (red colour, italic).
# Expects `label_run`, `empty_p`, `value` and `ns` (prefix -> URI map with a
# "w" entry) to be defined by the surrounding code.
label_rPr = label_run.find("w:rPr", ns)
# Elements are created with fully qualified "{uri}local" tags. A literal
# "w:r" tag carries no namespace, so later namespace-aware lookups such as
# find("w:r", ns) would never match the run created here.
new_run = ET.SubElement(empty_p, "{%s}r" % ns["w"])
if label_rPr is not None:
    new_rPr = copy.deepcopy(label_rPr)
    # Remove red color from guide text
    color_elem = new_rPr.find("w:color", ns)
    if color_elem is not None:
        val = (color_elem.get("{%s}val" % ns["w"]) or "").upper()
        if val in ("FF0000", "FF0000FF", "RED"):
            new_rPr.remove(color_elem)
    # Remove italic from guide text
    italic_elem = new_rPr.find("w:i", ns)
    if italic_elem is not None:
        new_rPr.remove(italic_elem)
    new_run.append(new_rPr)
new_t = ET.SubElement(new_run, "{%s}t" % ns["w"])
# Rule 2: text with leading/trailing spaces needs xml:space="preserve".
if value and value != value.strip():
    new_t.set("{http://www.w3.org/XML/1998/namespace}space", "preserve")
new_t.text = value
|
|
94
|
+
```
|
|
95
|
+
|
|
96
|
+
HWPX equivalent: Verify `charPrIDRef` in header.xml does NOT have `textColor="#FF0000"`. If it does, use a black charPr instead (see Rule 6).
|
|
97
|
+
|
|
98
|
+
### Rule 4: Never Break Table Structure
|
|
99
|
+
- Do not add or remove `<w:tc>` elements
|
|
100
|
+
- Do not change `<w:gridSpan>` or `<w:vMerge>`
|
|
101
|
+
- Only modify content within existing cells
|
|
102
|
+
|
|
103
|
+
### Rule 5: Tip Box Removal
|
|
104
|
+
|
|
105
|
+
Before filling fields, remove all `field_type: "tip_box"` entries.
|
|
106
|
+
|
|
107
|
+
**HWPX standalone** — delete entire `<hp:tbl>`:
|
|
108
|
+
```python
|
|
109
|
+
ns = {"hp": "http://www.hancom.co.kr/hwpml/2011/paragraph"}
tip_pattern = re.compile(r"^※|작성\s?팁|작성\s?요령")


def remove_tip_boxes_hwpx(root):
    """Delete 1x1 ``<hp:tbl>`` tip boxes found anywhere under ``root``.

    A table is treated as a tip box when ``rowCnt`` and ``colCnt`` are both
    "1" and its concatenated, stripped text matches ``tip_pattern``.

    Returns:
        The number of tables that matched.
    """
    tbl_tag = "{%s}tbl" % ns["hp"]
    text_tag = "{%s}t" % ns["hp"]

    def is_tip_box(table):
        if table.get("rowCnt") != "1" or table.get("colCnt") != "1":
            return False
        content = "".join(node.text or "" for node in table.iter(text_tag))
        return tip_pattern.search(content.strip()) is not None

    # Collect first, delete afterwards: removing while iterating would skip
    # siblings.
    doomed = [table for table in root.iter(tbl_tag) if is_tip_box(table)]

    # ElementTree keeps no parent pointers, so build child -> parent once.
    # Direct children of root map to root itself.
    owner_of = {child: owner for owner in root.iter() for child in owner}
    for table in doomed:
        owner = owner_of.get(table)
        if owner is not None:
            owner.remove(table)
    return len(doomed)
|
|
129
|
+
```
|
|
130
|
+
|
|
131
|
+
**HWPX nested** — delete only the `<hp:p>` containing the tip (preserve subList):
|
|
132
|
+
```python
|
|
133
|
+
def remove_nested_tips_hwpx(cell_elem):
    """Delete the ``<hp:p>`` paragraphs that wrap a tip box inside a cell.

    For every ``<hp:subList>`` under ``cell_elem``, each direct ``<hp:p>``
    child is removed when it contains a 1x1 ``<hp:tbl>`` whose text matches
    ``tip_pattern``. The subList itself is left in place.

    Returns:
        The number of paragraphs removed.
    """
    hp_uri = ns["hp"]
    tbl_tag = "{%s}tbl" % hp_uri
    text_tag = "{%s}t" % hp_uri

    def holds_tip(paragraph):
        for table in paragraph.iter(tbl_tag):
            if table.get("rowCnt") != "1" or table.get("colCnt") != "1":
                continue
            content = "".join(node.text or "" for node in table.iter(text_tag))
            if tip_pattern.search(content.strip()):
                return True
        return False

    removed = 0
    # Snapshot both levels before mutating so removals don't disturb iteration.
    for container in list(cell_elem.iter("{%s}subList" % hp_uri)):
        for paragraph in list(container.findall("{%s}p" % hp_uri)):
            if holds_tip(paragraph):
                container.remove(paragraph)
                removed += 1
    return removed
|
|
145
|
+
```
|
|
146
|
+
|
|
147
|
+
**DOCX** — delete 1x1 dashed-border tip tables:
|
|
148
|
+
```python
|
|
149
|
+
def remove_tip_boxes_docx(root):
    """Delete single-row, single-cell ``<w:tbl>`` tip boxes under ``root``.

    A table qualifies when exactly one ``<w:tr>`` and one ``<w:tc>`` are
    found beneath it (descendants included) and its concatenated, stripped
    text matches ``tip_pattern``.

    Returns:
        The number of tables that matched.
    """
    w_uri = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
    tbl_tag = "{%s}tbl" % w_uri
    tr_tag = "{%s}tr" % w_uri
    tc_tag = "{%s}tc" % w_uri
    t_tag = "{%s}t" % w_uri

    def is_tip_box(table):
        rows = list(table.iter(tr_tag))
        if len(rows) != 1:
            return False
        if len(list(rows[0].iter(tc_tag))) != 1:
            return False
        content = "".join(node.text or "" for node in table.iter(t_tag))
        return tip_pattern.search(content.strip()) is not None

    doomed = [table for table in root.iter(tbl_tag) if is_tip_box(table)]

    # ElementTree keeps no parent pointers, so build child -> parent once.
    owner_of = {child: owner for owner in root.iter() for child in owner}
    for table in doomed:
        owner = owner_of.get(table)
        if owner is not None:
            owner.remove(table)
    return len(doomed)
|
|
168
|
+
```
|
|
169
|
+
|
|
170
|
+
**Post-removal cleanup**: Clear remaining `※`-prefixed runs in fill-target cells:
|
|
171
|
+
```python
|
|
172
|
+
def clear_residual_tips(cell_elem, ns_prefix):
    """Blank out text nodes in a cell whose text starts with "※".

    Args:
        cell_elem: Element whose descendants are scanned.
        ns_prefix: Namespace URI used to build the ``{uri}t`` tag to look for.
    """
    text_tag = "{%s}t" % ns_prefix
    for node in cell_elem.iter(text_tag):
        content = node.text
        if not content:
            continue
        # Leading whitespace is ignored when looking for the marker.
        if content.strip().startswith("※"):
            node.text = ""
|
|
176
|
+
```
|
|
177
|
+
|
|
178
|
+
### Rule 6: Color Sanitization
|
|
179
|
+
|
|
180
|
+
Filled text must always be black. Never inherit red/colored styles from guide text.
|
|
181
|
+
|
|
182
|
+
**HWPX — find black charPrIDRef**:
|
|
183
|
+
```python
|
|
184
|
+
def find_black_charpr(header_path):
    """Find IDs of black, non-italic ``charPr`` entries in a header file.

    Parses ``header_path`` and scans every ``<hh:charPr>``. Entries are
    skipped when ``textColor`` is not black, ``italic`` is anything other
    than "false", or ``spacing`` is negative. The first remaining entry with
    ``bold="false"`` and the first with ``bold="true"`` are reported.

    Returns:
        ``{"normal": id_or_None, "bold": id_or_None}``.
    """
    head_uri = "http://www.hancom.co.kr/hwpml/2011/head"
    black_values = ("#000000", "#000000FF", "BLACK")
    slot_for_bold = {"false": "normal", "true": "bold"}
    found = {"normal": None, "bold": None}

    document = ET.parse(header_path)
    for char_pr in document.getroot().iter("{%s}charPr" % head_uri):
        # A missing textColor counts as black.
        if char_pr.get("textColor", "#000000").upper() not in black_values:
            continue
        is_italic = char_pr.get("italic", "false") != "false"
        spacing = int(char_pr.get("spacing", "0"))
        if is_italic or spacing < 0:
            continue
        slot = slot_for_bold.get(char_pr.get("bold", "false"))
        # Keep only the first hit per slot.
        if slot is not None and found[slot] is None:
            found[slot] = char_pr.get("id")
    return found
|
|
203
|
+
```
|
|
204
|
+
|
|
205
|
+
Before inserting any `<hp:run>`, check if the `charPrIDRef` has `textColor="#FF0000"`. If so, use `normal` ID from `find_black_charpr()`.
|
|
206
|
+
|
|
207
|
+
**DOCX — sanitize copied rPr**:
|
|
208
|
+
```python
|
|
209
|
+
def sanitize_rpr(rpr_elem, ns):
    """Strip guide-text styling from a copied ``<w:rPr>`` in place.

    Removes the ``<w:color>`` child when its ``w:val`` is a red value, and
    removes the ``<w:i>`` child if there is one. Does nothing when
    ``rpr_elem`` is ``None``.

    Args:
        rpr_elem: The run-properties element to clean, or ``None``.
        ns: Prefix -> URI map; only the ``"w"`` entry is used.
    """
    if rpr_elem is None:
        return

    w_uri = ns["w"]
    guide_reds = ("FF0000", "FF0000FF", "RED")

    color_node = rpr_elem.find("{%s}color" % w_uri)
    if color_node is not None:
        shade = color_node.get("{%s}val" % w_uri, "").upper()
        if shade in guide_reds:
            rpr_elem.remove(color_node)

    italic_node = rpr_elem.find("{%s}i" % w_uri)
    if italic_node is not None:
        rpr_elem.remove(italic_node)
|
|
220
|
+
```
|
|
221
|
+
|
|
222
|
+
Avoid charPrIDRef with negative `spacing` — causes character overlap.
|
|
223
|
+
|
|
224
|
+
### Rule 7: Table Template Row Selection (HWPX)
|
|
225
|
+
|
|
226
|
+
For `table_content` fields, select the right template row for cloning:
|
|
227
|
+
|
|
228
|
+
**Normal row** — all rowSpan=1, full column count:
|
|
229
|
+
```python
|
|
230
|
+
def find_normal_template_row(tbl, tr_start, tr_end):
    """Return a deep copy of the first "normal" row in a row range.

    Scans the direct ``<hp:tr>`` children of ``tbl`` with index
    ``tr_start`` .. ``tr_end`` (inclusive, clamped to the row count) and
    picks the first row that:

    * does not contain "합계" or "소계" (spaces ignored),
    * has some text besides dots, ellipses and spaces,
    * has as many direct ``<hp:tc>`` cells as the table's ``colCnt``
      (read as 3 when the attribute is missing), and
    * has no cell whose ``<hp:cellSpan>`` declares a ``rowSpan`` other than 1.

    Returns:
        A ``copy.deepcopy`` of the chosen row, or ``None`` if none qualifies.
    """
    HP = "http://www.hancom.co.kr/hwpml/2011/paragraph"
    rows = tbl.findall(f"{{{HP}}}tr")
    col_cnt = int(tbl.get("colCnt", "3"))
    for i in range(tr_start, min(tr_end + 1, len(rows))):
        row = rows[i]
        text = "".join(t.text or "" for t in row.iter(f"{{{HP}}}t")).strip()
        normalized = text.replace(" ", "").replace("\u3000", "")
        if "합계" in normalized or "소계" in normalized:
            continue
        stripped = text.replace(".", "").replace("…", "").replace(" ", "")
        if not stripped:
            continue
        cells = row.findall(f"{{{HP}}}tc")
        if len(cells) != col_cnt:
            continue
        # Compare against None explicitly: an Element without children is
        # falsy, so `find(...) or {}` would discard a real <hp:cellSpan/> and
        # make every rowSpan read as "1".
        spans = (tc.find(f"{{{HP}}}cellSpan") for tc in cells)
        all_span1 = all(
            cs is None or int(cs.get("rowSpan", "1")) == 1
            for cs in spans
        )
        if all_span1:
            return copy.deepcopy(row)
    return None
|
|
254
|
+
```
|
|
255
|
+
|
|
256
|
+
**Fallback with rowSpan stripping**:
|
|
257
|
+
```python
|
|
258
|
+
def strip_rowspan_from_template(tpl_row):
    """Flatten vertically merged cells of a template row, in place.

    For each direct ``<hp:tc>`` whose ``<hp:cellSpan>`` has ``rowSpan`` > 1,
    ``rowSpan`` is reset to "1" and, if the cell has an ``<hp:cellSz>``, its
    ``height`` is integer-divided by the old span (2129 is assumed when the
    attribute is missing).
    """
    hp_uri = "http://www.hancom.co.kr/hwpml/2011/paragraph"
    span_tag = "{%s}cellSpan" % hp_uri
    size_tag = "{%s}cellSz" % hp_uri

    for cell in tpl_row.findall("{%s}tc" % hp_uri):
        span = cell.find(span_tag)
        if span is None:
            continue
        row_span = int(span.get("rowSpan", "1"))
        if row_span <= 1:
            continue
        span.set("rowSpan", "1")
        size = cell.find(size_tag)
        if size is None:
            continue
        merged_height = int(size.get("height", "2129"))
        size.set("height", str(merged_height // row_span))
|
|
270
|
+
```
|
|
271
|
+
|
|
272
|
+
**Summary row detection** — separate 합계/소계 from data rows:
|
|
273
|
+
```python
|
|
274
|
+
def separate_summary_rows(data_rows):
    """Split data rows into regular rows and a summary row.

    A row is a summary row when its first item, stripped and with ASCII and
    ideographic spaces removed, equals "합계" or "소계". If several rows
    qualify, the last one wins.

    Returns:
        ``(regular_rows, summary_row_or_None)``; regular rows keep their order.
    """
    summary_labels = ("합계", "소계")
    drop_spaces = str.maketrans("", "", " \u3000")

    regular = []
    summary = None
    for row in data_rows:
        key = row[0].strip().translate(drop_spaces)
        if key in summary_labels:
            summary = row
            continue
        regular.append(row)
    return regular, summary
|
|
283
|
+
```
|
|
284
|
+
|
|
285
|
+
Deep-copy original summary row template to preserve colSpan structure.
|
|
286
|
+
|
|
287
|
+
### Rule 8: SubList Recreation (HWPX)
|
|
288
|
+
|
|
289
|
+
When writing to a cell with no `<hp:subList>`, recreate one:
|
|
290
|
+
```python
|
|
291
|
+
def ensure_sublist(tc):
    """Return the cell's ``<hp:subList>``, creating one if it has none.

    If ``tc`` already has a direct ``<hp:subList>`` child, that element is
    returned. Otherwise a new one with default attributes is inserted as the
    first child of ``tc`` and returned.

    Args:
        tc: The ``<hp:tc>`` table-cell element.

    Returns:
        The existing or newly created ``<hp:subList>`` element.
    """
    HP = "http://www.hancom.co.kr/hwpml/2011/paragraph"
    for child in tc:
        if child.tag == f"{{{HP}}}subList":
            return child
    # Build the element detached and insert it exactly once. ET.SubElement
    # would append it to tc, and a following insert(0, ...) would leave the
    # same element in the cell twice.
    sl = ET.Element(f"{{{HP}}}subList", {
        "id": "", "textDirection": "HORIZONTAL", "lineWrap": "BREAK",
        "vertAlign": "CENTER", "linkListIDRef": "0", "linkListNextIDRef": "0",
        "textWidth": "0", "textHeight": "0", "hasTextRef": "0", "hasNumRef": "0"
    })
    tc.insert(0, sl)
    return sl
|
|
303
|
+
```
|
|
304
|
+
|
|
305
|
+
Use `ensure_sublist(tc)` when writing to a cell. Append new `<hp:p>` to the returned container.
|
|
306
|
+
|
|
307
|
+
### Rule 9: cellAddr Re-indexing (HWPX)
|
|
308
|
+
|
|
309
|
+
After inserting or deleting rows, ALL `<hp:cellAddr rowAddr="N"/>` must equal the 0-based row index:
|
|
310
|
+
```python
|
|
311
|
+
def fix_celladdr(tbl):
    """Renumber ``rowAddr`` and ``rowCnt`` after rows were added or removed.

    Each direct ``<hp:tr>`` of ``tbl`` gets its 0-based position written to
    the ``rowAddr`` of the ``<hp:cellAddr>`` in each of its direct
    ``<hp:tc>`` cells (cells without a cellAddr are left alone). The table's
    ``rowCnt`` is then set to the number of rows.
    """
    hp_uri = "http://www.hancom.co.kr/hwpml/2011/paragraph"
    tc_tag = "{%s}tc" % hp_uri
    addr_tag = "{%s}cellAddr" % hp_uri

    table_rows = tbl.findall("{%s}tr" % hp_uri)
    for position, table_row in enumerate(table_rows):
        row_label = str(position)
        for cell in table_row.findall(tc_tag):
            address = cell.find(addr_tag)
            if address is None:
                continue
            address.set("rowAddr", row_label)
    tbl.set("rowCnt", str(len(table_rows)))
|
|
319
|
+
```
|
|
320
|
+
|
|
321
|
+
Always call after row insertion/deletion. Duplicate rowAddr causes Polaris to silently hide rows.
|
|
322
|
+
|
|
323
|
+
## Image Insertion Surgery
|
|
324
|
+
|
|
325
|
+
### DOCX Image Insertion
|
|
326
|
+
1. Copy image to `word/media/imageN.ext` (next available number)
|
|
327
|
+
2. Add relationship in `word/_rels/document.xml.rels`:
|
|
328
|
+
```xml
|
|
329
|
+
<Relationship Id="rIdN" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/image" Target="media/imageN.png"/>
|
|
330
|
+
```
|
|
331
|
+
3. Add Content_Types entry if extension not registered:
|
|
332
|
+
```xml
|
|
333
|
+
<Default Extension="png" ContentType="image/png"/>
|
|
334
|
+
```
|
|
335
|
+
4. Insert drawing element in target paragraph:
|
|
336
|
+
```xml
|
|
337
|
+
<w:r><w:drawing>
|
|
338
|
+
<wp:inline distT="0" distB="0" distL="0" distR="0">
|
|
339
|
+
<wp:extent cx="{width_emu}" cy="{height_emu}"/>
|
|
340
|
+
<wp:docPr id="{uid}" name="Picture {uid}"/>
|
|
341
|
+
<a:graphic xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main">
|
|
342
|
+
<a:graphicData uri="http://schemas.openxmlformats.org/drawingml/2006/picture">
|
|
343
|
+
<pic:pic xmlns:pic="http://schemas.openxmlformats.org/drawingml/2006/picture">
|
|
344
|
+
<pic:blipFill><a:blip r:embed="rIdN"/></pic:blipFill>
|
|
345
|
+
<pic:spPr><a:xfrm><a:ext cx="{width_emu}" cy="{height_emu}"/></a:xfrm></pic:spPr>
|
|
346
|
+
</pic:pic>
|
|
347
|
+
</a:graphicData>
|
|
348
|
+
</a:graphic>
|
|
349
|
+
</wp:inline>
|
|
350
|
+
</w:drawing></w:r>
|
|
351
|
+
```
|
|
352
|
+
|
|
353
|
+
### HWPX Image Insertion
|
|
354
|
+
1. Copy image to `BinData/imageN.ext` (next available N)
|
|
355
|
+
2. Register in `content.hpf` manifest only:
|
|
356
|
+
```xml
|
|
357
|
+
<opf:item id="imageN" href="BinData/imageN.ext" media-type="image/png" isEmbeded="1"/>
|
|
358
|
+
```
|
|
359
|
+
Do NOT add `<hh:binDataItems>` to `header.xml`.
|
|
360
|
+
3. Insert complete `<hp:pic>` in target cell — see `references/image-xml-patterns.md` for the full element structure.
|
|
361
|
+
|
|
362
|
+
**Critical HWPX image rules** (all 8 must be followed):
|
|
363
|
+
- `<img>` uses `hc:` namespace (NOT `hp:img`)
|
|
364
|
+
- `<imgRect>` has 4 `<hc:pt0..3>` children (NOT inline attributes)
|
|
365
|
+
- All children required: offset, orgSz, curSz, flip, rotationInfo, renderingInfo, inMargin
|
|
366
|
+
- No spurious elements (picSz, picOutline, caption, shapeComment, picRect)
|
|
367
|
+
- `imgClip` right/bottom = actual pixel dimensions from PIL (NOT zeros)
|
|
368
|
+
- Do NOT add `<hp:lineShape>`
|
|
369
|
+
- `hp:pos`: `flowWithText="0"` `horzRelTo="COLUMN"`
|
|
370
|
+
- Sequential IDs: find max existing `id` in section XML + 1
|
|
371
|
+
|
|
372
|
+
## References
|
|
373
|
+
|
|
374
|
+
See `references/field-detection-patterns.md` for advanced detection heuristics.
|
|
375
|
+
See `references/section-range-detection.md` for dynamic section content range detection (HWPX).
|
|
376
|
+
See `references/section-image-interleaving.md` for image interleaving algorithm in section content.
|
|
377
|
+
See `references/image-xml-patterns.md` for complete image element structures and `build_hwpx_pic_element()`.
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
# HWPX XML Knowledge
|
|
2
|
+
|
|
3
|
+
OWPML structure for surgical HWPX document editing.
|
|
4
|
+
|
|
5
|
+
## HWPX Structure
|
|
6
|
+
|
|
7
|
+
HWPX (Hancom Office Open XML) is a ZIP archive:
|
|
8
|
+
```
|
|
9
|
+
mimetype — "application/hwp+zip" (MUST be first entry, uncompressed)
|
|
10
|
+
META-INF/
|
|
11
|
+
manifest.xml — file manifest
|
|
12
|
+
Contents/
|
|
13
|
+
content.hpf — content manifest (OPF package)
|
|
14
|
+
header.xml — document header (styles, fonts, charPr definitions)
|
|
15
|
+
section0.xml — first section (PRIMARY TARGET)
|
|
16
|
+
section1.xml — additional sections
|
|
17
|
+
BinData/ — embedded images and binary data
|
|
18
|
+
Preview/
|
|
19
|
+
PrvImage.png — thumbnail preview
|
|
20
|
+
settings.xml — document settings
|
|
21
|
+
```
|
|
22
|
+
|
|
23
|
+
## Key XML Elements
|
|
24
|
+
|
|
25
|
+
### Namespaces
|
|
26
|
+
```xml
|
|
27
|
+
xmlns:hp="http://www.hancom.co.kr/hwpml/2011/paragraph"
|
|
28
|
+
xmlns:hs="http://www.hancom.co.kr/hwpml/2011/section"
|
|
29
|
+
xmlns:hc="http://www.hancom.co.kr/hwpml/2011/core"
|
|
30
|
+
xmlns:hh="http://www.hancom.co.kr/hwpml/2011/head"
|
|
31
|
+
```
|
|
32
|
+
|
|
33
|
+
### Section Structure
|
|
34
|
+
```xml
|
|
35
|
+
<hs:sec>
|
|
36
|
+
<hp:p> <!-- paragraph -->
|
|
37
|
+
<hp:run> <!-- text run -->
|
|
38
|
+
<hp:rPr> <!-- run properties (charPrIDRef) -->
|
|
39
|
+
<hp:t> <!-- text content -->
|
|
40
|
+
</hp:run>
|
|
41
|
+
</hp:p>
|
|
42
|
+
</hs:sec>
|
|
43
|
+
```
|
|
44
|
+
|
|
45
|
+
### Tables
|
|
46
|
+
```xml
|
|
47
|
+
<hp:tbl rowCnt="N" colCnt="M">
|
|
48
|
+
<hp:tr> <!-- table row -->
|
|
49
|
+
<hp:tc> <!-- table cell -->
|
|
50
|
+
<hp:cellAddr colAddr="0" rowAddr="0"/>
|
|
51
|
+
<hp:cellSpan colSpan="1" rowSpan="1"/>
|
|
52
|
+
<hp:cellSz width="W" height="H"/>
|
|
53
|
+
<hp:subList> <!-- ~65% of cells wrap content here -->
|
|
54
|
+
<hp:p> <!-- cell content -->
|
|
55
|
+
</hp:subList>
|
|
56
|
+
</hp:tc>
|
|
57
|
+
</hp:tr>
|
|
58
|
+
</hp:tbl>
|
|
59
|
+
```
|
|
60
|
+
|
|
61
|
+
## Critical Notes
|
|
62
|
+
|
|
63
|
+
1. `mimetype` file MUST be the first ZIP entry and stored uncompressed
|
|
64
|
+
2. Korean text is UTF-8 encoded
|
|
65
|
+
3. Table cells often use complex merging (`hp:cellSpan`) for form layouts
|
|
66
|
+
4. Section files are independent — each is a complete XML document
|
|
67
|
+
5. Character properties reference IDs defined in `header.xml` (`charPrIDRef`)
|
|
68
|
+
6. After any `tree.write()`, must restore ALL 14 original namespace declarations on root elements
|
|
69
|
+
|
|
70
|
+
## References
|
|
71
|
+
|
|
72
|
+
See `references/hwpx-structure.md` for unpacking, namespace preservation fix, repackaging, and critical rules.
|
|
73
|
+
See `references/hwpx-field-patterns.md` for field detection patterns (10 patterns including subList wrapping, cellAddr addressing, charPrIDRef resolution).
|