devlyn-cli 0.5.2 → 0.5.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. package/bin/devlyn.js +1 -0
  2. package/optional-skills/dokkit/ANALYSIS.md +198 -0
  3. package/optional-skills/dokkit/COMMANDS.md +365 -0
  4. package/optional-skills/dokkit/DOCX-XML.md +76 -0
  5. package/optional-skills/dokkit/EXPORT.md +102 -0
  6. package/optional-skills/dokkit/FILLING.md +377 -0
  7. package/optional-skills/dokkit/HWPX-XML.md +73 -0
  8. package/optional-skills/dokkit/IMAGE-SOURCING.md +127 -0
  9. package/optional-skills/dokkit/INGESTION.md +65 -0
  10. package/optional-skills/dokkit/SKILL.md +153 -0
  11. package/optional-skills/dokkit/STATE.md +60 -0
  12. package/optional-skills/dokkit/references/docx-field-patterns.md +151 -0
  13. package/optional-skills/dokkit/references/docx-structure.md +58 -0
  14. package/optional-skills/dokkit/references/field-detection-patterns.md +130 -0
  15. package/optional-skills/dokkit/references/hwpx-field-patterns.md +461 -0
  16. package/optional-skills/dokkit/references/hwpx-structure.md +159 -0
  17. package/optional-skills/dokkit/references/image-opportunity-heuristics.md +121 -0
  18. package/optional-skills/dokkit/references/image-xml-patterns.md +338 -0
  19. package/optional-skills/dokkit/references/section-image-interleaving.md +346 -0
  20. package/optional-skills/dokkit/references/section-range-detection.md +118 -0
  21. package/optional-skills/dokkit/references/state-schema.md +143 -0
  22. package/optional-skills/dokkit/references/supported-formats.md +67 -0
  23. package/optional-skills/dokkit/scripts/compile_hwpx.py +134 -0
  24. package/optional-skills/dokkit/scripts/detect_fields.py +301 -0
  25. package/optional-skills/dokkit/scripts/detect_fields_hwpx.py +286 -0
  26. package/optional-skills/dokkit/scripts/export_pdf.py +99 -0
  27. package/optional-skills/dokkit/scripts/parse_hwpx.py +185 -0
  28. package/optional-skills/dokkit/scripts/parse_image_with_gemini.py +159 -0
  29. package/optional-skills/dokkit/scripts/parse_xlsx.py +98 -0
  30. package/optional-skills/dokkit/scripts/source_images.py +365 -0
  31. package/optional-skills/dokkit/scripts/validate_docx.py +142 -0
  32. package/optional-skills/dokkit/scripts/validate_hwpx.py +281 -0
  33. package/optional-skills/dokkit/scripts/validate_state.py +132 -0
  34. package/package.json +1 -1
@@ -0,0 +1,102 @@
1
+ # Export Knowledge
2
+
3
+ Document compilation and format conversion for the dokkit-exporter agent.
4
+
5
+ ## Compilation (Repackaging)
6
+
7
+ ### DOCX Compilation
8
+ ```python
9
+ import os, zipfile
10
+
11
def compile_docx(work_dir: str, output_path: str):
    """Zip every file found under ``work_dir`` into a DOCX at ``output_path``.

    Each entry is deflate-compressed and named by its path relative to
    ``work_dir``. Returns ``output_path``.
    """
    archive = zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED)
    with archive:
        for folder, _subdirs, filenames in os.walk(work_dir):
            for filename in filenames:
                source = os.path.join(folder, filename)
                archive.write(source, os.path.relpath(source, work_dir))
    return output_path
20
+ ```
21
+
22
+ ### HWPX Compilation
23
+ ```python
24
+ import os, zipfile
25
+
26
def compile_hwpx(work_dir: str, output_path: str):
    """Repackage an unpacked HWPX directory into a ZIP at ``output_path``.

    CRITICAL: the root ``mimetype`` file (when present) is written first and
    stored uncompressed. Every other file is deflate-compressed and named by
    its path relative to ``work_dir``; ``*.bak`` files are skipped.

    Only the *root* ``mimetype`` is special-cased, so a file that merely
    shares that name in a subdirectory is still packaged. Directories and
    files are visited in sorted order so the entry order is reproducible.

    Returns ``output_path``.
    """
    mimetype_path = os.path.join(work_dir, "mimetype")
    with zipfile.ZipFile(output_path, 'w') as zf:
        if os.path.exists(mimetype_path):
            zf.write(mimetype_path, "mimetype", compress_type=zipfile.ZIP_STORED)
        for root, dirs, files in os.walk(work_dir):
            # In-place sort steers os.walk's (top-down) descent order.
            dirs.sort()
            for file in sorted(files):
                if file.endswith(".bak"):
                    continue
                file_path = os.path.join(root, file)
                arcname = os.path.relpath(file_path, work_dir)
                if arcname == "mimetype":
                    # Root mimetype was already written as the first entry.
                    continue
                zf.write(file_path, arcname, compress_type=zipfile.ZIP_DEFLATED)
    return output_path
40
+ ```
41
+
42
+ ### Scripts
43
+ ```bash
44
+ python .claude/skills/dokkit/scripts/compile_hwpx.py <work_dir> <output.hwpx>
45
+ python .claude/skills/dokkit/scripts/export_pdf.py <input> <output.pdf>
46
+ ```
47
+
48
+ ## PDF Conversion
49
+
50
+ ### Using LibreOffice
51
+ ```bash
52
+ soffice --headless --convert-to pdf --outdir <output_dir> <input_file>
53
+ ```
54
+
55
+ ### Using Python Script
56
+ ```bash
57
+ python .claude/skills/dokkit/scripts/export_pdf.py <input> <output.pdf>
58
+ ```
59
+
60
+ ## Cross-Format Conversion
61
+
62
+ Use LibreOffice as intermediary:
63
+ ```bash
64
+ soffice --headless --convert-to hwpx --outdir <dir> <input.docx>
65
+ soffice --headless --convert-to docx --outdir <dir> <input.hwpx>
66
+ ```
67
+
68
+ Cross-format conversion may lose formatting fidelity. Always warn the user.
69
+
70
+ ## Validation
71
+
72
+ After compilation, verify:
73
+ 1. Output file is a valid ZIP archive
74
+ 2. File size is reasonable (> 0 bytes)
75
+ 3. For DOCX: `[Content_Types].xml` exists at root
76
+ 4. For HWPX: `mimetype` is first entry and correct value
77
+
78
+ ```python
79
+ import zipfile
80
+
81
def validate_archive(path: str, doc_type: str) -> list[str]:
    """Check a compiled document archive and return a list of problems.

    Args:
        path: Path of the compiled archive.
        doc_type: ``"docx"`` or ``"hwpx"``; other values only get the
            "is it a ZIP" check.

    Returns:
        Human-readable error strings; an empty list means no problem found.

    Checks:
        * DOCX: ``[Content_Types].xml`` exists at the archive root.
        * HWPX: ``mimetype`` is the first entry, is stored uncompressed, and
          contains ``application/hwp+zip``.
    """
    errors = []
    try:
        with zipfile.ZipFile(path, 'r') as zf:
            entries = zf.infolist()
            if doc_type == "docx":
                if all(e.filename != "[Content_Types].xml" for e in entries):
                    errors.append("Missing [Content_Types].xml")
            elif doc_type == "hwpx":
                if not entries or entries[0].filename != "mimetype":
                    errors.append("mimetype is not the first entry")
                else:
                    first = entries[0]
                    if first.compress_type != zipfile.ZIP_STORED:
                        errors.append("mimetype entry is not stored uncompressed")
                    # Surrounding whitespace is tolerated to avoid false alarms.
                    value = zf.read(first).decode("utf-8", errors="replace").strip()
                    if value != "application/hwp+zip":
                        errors.append(f"Unexpected mimetype value: {value!r}")
    except zipfile.BadZipFile:
        errors.append("Output is not a valid ZIP archive")
    return errors
95
+ ```
96
+
97
+ ## Rules
98
+
99
+ - Never modify filled XML during export — only repackage
100
+ - ZIP structure must match original (`[Content_Types].xml` at root for DOCX, `mimetype` first for HWPX)
101
+ - Skip .bak files during HWPX compilation
102
+ - Report clear errors if conversion tools unavailable
@@ -0,0 +1,377 @@
1
+ # Filling Knowledge
2
+
3
+ Field detection, matching strategies, and surgical XML editing rules for the dokkit-filler (and shared with dokkit-analyzer).
4
+
5
+ ## Table of Contents
6
+
7
+ - [Field Detection Scripts](#field-detection-scripts)
8
+ - [Matching Strategy](#matching-strategy)
9
+ - [XML Surgery Rules](#xml-surgery-rules)
10
+ - [Image Insertion Surgery](#image-insertion-surgery)
11
+
12
+ ---
13
+
14
+ ## Field Detection Scripts
15
+
16
+ ### DOCX Field Detection
17
+ ```bash
18
+ python .claude/skills/dokkit/scripts/detect_fields.py <document.xml>
19
+ ```
20
+ Outputs JSON array of detected fields with labels, types, and XML paths.
21
+
22
+ ### HWPX Field Detection
23
+ ```bash
24
+ python .claude/skills/dokkit/scripts/detect_fields_hwpx.py <section.xml>
25
+ ```
26
+ Same output format, adapted for HWPX XML structure.
27
+
28
+ ### DOCX Validation
29
+ ```bash
30
+ python .claude/skills/dokkit/scripts/validate_docx.py <work_dir>
31
+ ```
32
+
33
+ ### HWPX Validation
34
+ ```bash
35
+ python .claude/skills/dokkit/scripts/validate_hwpx.py <work_dir>
36
+ ```
37
+
38
+ ## Matching Strategy
39
+
40
+ ### Step 1: Exact Match
41
+ `field.label == source.key` — confidence: high
42
+
43
+ ### Step 2: Normalized Match
44
+ Lowercase, strip whitespace, remove punctuation — confidence: high
45
+
46
+ ### Step 3: Semantic Match
47
+ "Full Name" matches "Name" — confidence: high
48
+ "Phone Number" matches "Contact" — confidence: medium
49
+
50
+ ### Step 4: Cross-Language Match
51
+ "성명" matches "Name" — confidence: medium
52
+ "주소" matches "Address" — confidence: medium
53
+
54
+ ### Step 5: Context Inference
55
+ If field is in "Education" section and source has education data — confidence: low
56
+ Generic fields like "비고" (Remarks) — skip or flag
57
+
58
+ ## XML Surgery Rules
59
+
60
+ ### Rule 1: Preserve Run Properties
61
+ ```xml
62
+ <!-- BEFORE -->
63
+ <w:r><w:rPr><w:b/><w:sz w:val="24"/></w:rPr><w:t>{{name}}</w:t></w:r>
64
+ <!-- AFTER — rPr is IDENTICAL -->
65
+ <w:r><w:rPr><w:b/><w:sz w:val="24"/></w:rPr><w:t>John Doe</w:t></w:r>
66
+ ```
67
+
68
+ ### Rule 2: Handle xml:space
69
+ When inserting text with leading/trailing spaces:
70
+ ```xml
71
+ <w:t xml:space="preserve"> John Doe </w:t>
72
+ ```
73
+
74
+ ### Rule 3: Copy Formatting for Empty Cells
75
+ Copy run properties from the label cell. Always sanitize:
76
+ ```python
77
# Build the value run inside the empty paragraph, inheriting the label run's
# formatting minus guide-text styling (red colour, italic).
label_rPr = label_run.find("w:rPr", ns)
# Use fully-qualified (Clark notation) tags: a literal "w:r" tag would not be
# found by later namespace-aware lookups such as find("w:r", ns).
new_run = ET.SubElement(empty_p, "{%s}r" % ns["w"])
if label_rPr is not None:
    new_rPr = copy.deepcopy(label_rPr)
    # Remove red color from guide text
    color_elem = new_rPr.find("w:color", ns)
    if color_elem is not None:
        val = (color_elem.get("{%s}val" % ns["w"]) or "").upper()
        if val in ("FF0000", "FF0000FF", "RED"):
            new_rPr.remove(color_elem)
    # Remove italic from guide text
    italic_elem = new_rPr.find("w:i", ns)
    if italic_elem is not None:
        new_rPr.remove(italic_elem)
    new_run.append(new_rPr)
new_t = ET.SubElement(new_run, "{%s}t" % ns["w"])
# Rule 2: leading/trailing spaces survive only with xml:space="preserve".
if value and value != value.strip():
    new_t.set("{http://www.w3.org/XML/1998/namespace}space", "preserve")
new_t.text = value
94
+ ```
95
+
96
+ HWPX equivalent: Verify that the `charPr` referenced by the run's `charPrIDRef` (defined in header.xml) does NOT have `textColor="#FF0000"`. If it does, use a black charPr instead (see Rule 6).
97
+
98
+ ### Rule 4: Never Break Table Structure
99
+ - Do not add or remove `<w:tc>` elements
100
+ - Do not change `<w:gridSpan>` or `<w:vMerge>`
101
+ - Only modify content within existing cells
102
+
103
+ ### Rule 5: Tip Box Removal
104
+
105
+ Before filling fields, remove all `field_type: "tip_box"` entries.
106
+
107
+ **HWPX standalone** — delete entire `<hp:tbl>`:
108
+ ```python
109
ns = {"hp": "http://www.hancom.co.kr/hwpml/2011/paragraph"}
tip_pattern = re.compile(r"^※|작성\s?팁|작성\s?요령")


def remove_tip_boxes_hwpx(root):
    """Delete every 1x1 ``<hp:tbl>`` whose text looks like a writing tip.

    A table qualifies when ``rowCnt`` and ``colCnt`` are both "1" and its
    concatenated, stripped text matches ``tip_pattern``. Returns the number
    of tables that qualified.
    """
    tbl_tag = "{%s}tbl" % ns["hp"]
    text_tag = "{%s}t" % ns["hp"]

    def is_tip_box(tbl):
        if tbl.get("rowCnt") != "1" or tbl.get("colCnt") != "1":
            return False
        content = "".join(node.text or "" for node in tbl.iter(text_tag))
        return tip_pattern.search(content.strip()) is not None

    doomed = [tbl for tbl in root.iter(tbl_tag) if is_tip_box(tbl)]
    # child -> parent lookup; root's direct children map to root itself.
    owner_of = {child: parent for parent in root.iter() for child in parent}
    for tbl in doomed:
        owner = owner_of.get(tbl)
        if owner is not None:
            owner.remove(tbl)
    return len(doomed)
129
+ ```
130
+
131
+ **HWPX nested** — delete only the `<hp:p>` containing the tip (preserve subList):
132
+ ```python
133
def remove_nested_tips_hwpx(cell_elem):
    """Remove paragraphs that directly hold a 1x1 tip table inside a cell.

    For every ``<hp:subList>`` under ``cell_elem`` (at any depth), each direct
    ``<hp:p>`` child is dropped from that subList when one of the paragraph's
    own tables (``p/tbl`` or ``p/run/tbl``) is 1x1 and its text matches
    ``tip_pattern``. The subList itself is kept.

    Only the paragraph's own tables are inspected. A tip box nested deeper,
    inside a data table in the paragraph, is handled when its own enclosing
    subList is visited, so the outer paragraph and its data table survive.

    Returns the number of paragraphs removed.
    """
    hp = ns["hp"]
    p_tag = "{%s}p" % hp
    text_tag = "{%s}t" % hp
    own_table_paths = ("{%s}tbl" % hp, "{%s}run/{%s}tbl" % (hp, hp))

    removed = 0
    # Snapshot the subLists and paragraphs: both are mutated while looping.
    for sub_list in list(cell_elem.iter("{%s}subList" % hp)):
        for p_elem in list(sub_list.findall(p_tag)):
            own_tables = [
                tbl for path in own_table_paths for tbl in p_elem.findall(path)
            ]
            for tbl in own_tables:
                if tbl.get("rowCnt") == "1" and tbl.get("colCnt") == "1":
                    text = "".join(t.text or "" for t in tbl.iter(text_tag))
                    if tip_pattern.search(text.strip()):
                        sub_list.remove(p_elem)
                        removed += 1
                        break
    return removed
145
+ ```
146
+
147
+ **DOCX** — delete 1x1 dashed-border tip tables:
148
+ ```python
149
def remove_tip_boxes_docx(root):
    """Delete every single-row, single-cell ``<w:tbl>`` whose text is a tip.

    Rows and cells are counted over all descendants of the table. A table
    qualifies when it has exactly one row, that row has exactly one cell, and
    the table's stripped text matches ``tip_pattern``. Returns the number of
    tables that qualified.
    """
    ns_w = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}
    w = ns_w["w"]
    tbl_tag, tr_tag = "{%s}tbl" % w, "{%s}tr" % w
    tc_tag, text_tag = "{%s}tc" % w, "{%s}t" % w

    def is_tip_table(tbl):
        rows = list(tbl.iter(tr_tag))
        if len(rows) != 1:
            return False
        if len(list(rows[0].iter(tc_tag))) != 1:
            return False
        content = "".join(node.text or "" for node in tbl.iter(text_tag))
        return tip_pattern.search(content.strip()) is not None

    targets = [tbl for tbl in root.iter(tbl_tag) if is_tip_table(tbl)]
    holder_of = {child: parent for parent in root.iter() for child in parent}
    for tbl in targets:
        holder = holder_of.get(tbl)
        if holder is not None:
            holder.remove(tbl)
    return len(targets)
168
+ ```
169
+
170
+ **Post-removal cleanup**: Clear remaining `※`-prefixed runs in fill-target cells:
171
+ ```python
172
def clear_residual_tips(cell_elem, ns_prefix):
    """Blank every text node in the cell whose stripped text starts with "※".

    ``ns_prefix`` is the namespace URI used to build the ``{uri}t`` tag that
    is searched for under ``cell_elem``.
    """
    text_tag = "{%s}t" % ns_prefix
    for node in cell_elem.iter(text_tag):
        content = node.text
        if not content:
            continue
        if content.strip().startswith("※"):
            node.text = ""
176
+ ```
177
+
178
+ ### Rule 6: Color Sanitization
179
+
180
+ Filled text must always be black. Never inherit red/colored styles from guide text.
181
+
182
+ **HWPX — find black charPrIDRef**:
183
+ ```python
184
def find_black_charpr(header_path):
    """Find charPr ids usable for black, upright, non-condensed fill text.

    Parses ``header_path`` and scans ``<hh:charPr>`` elements in document
    order. A candidate must have a black ``textColor`` (missing counts as
    black), must not be italic, and must not have negative spacing.

    ``italic``, ``bold`` and ``spacing`` are read from the charPr attribute
    when present; otherwise from the child elements ``<hh:italic/>``,
    ``<hh:bold/>`` and ``<hh:spacing .../>`` (presence of the child means the
    flag is set; the smallest spacing attribute value is used).
    NOTE(review): the child-element form follows the OWPML schema — confirm
    against a real header.xml.

    Returns:
        ``{"normal": id_or_None, "bold": id_or_None}`` with the first
        non-bold and first bold candidate ids.

    Raises:
        ValueError: if a spacing value is not an integer.
    """
    hns = {"hh": "http://www.hancom.co.kr/hwpml/2011/head"}
    hh = hns["hh"]

    def flag(cp, name):
        # The attribute form wins; otherwise a child element marks the flag.
        value = cp.get(name)
        if value is not None:
            return value
        return "true" if cp.find("{%s}%s" % (hh, name)) is not None else "false"

    def min_spacing(cp):
        raw = cp.get("spacing")
        if raw is not None:
            return int(raw)
        child = cp.find("{%s}spacing" % hh)
        if child is None:
            return 0
        # One value per script (hangul, latin, ...); any negative disqualifies.
        return min((int(v) for v in child.attrib.values()), default=0)

    tree = ET.parse(header_path)
    normal_id = None
    bold_id = None
    for cp in tree.getroot().iter("{%s}charPr" % hh):
        color = cp.get("textColor", "#000000").upper()
        if color not in ("#000000", "#000000FF", "BLACK"):
            continue
        if flag(cp, "italic") != "false" or min_spacing(cp) < 0:
            continue
        bold = flag(cp, "bold")
        if bold == "false" and normal_id is None:
            normal_id = cp.get("id")
        elif bold == "true" and bold_id is None:
            bold_id = cp.get("id")
    return {"normal": normal_id, "bold": bold_id}
203
+ ```
204
+
205
+ Before inserting any `<hp:run>`, check whether the `charPr` that its `charPrIDRef` points to has `textColor="#FF0000"`. If so, use the `normal` ID from `find_black_charpr()`.
206
+
207
+ **DOCX — sanitize copied rPr**:
208
+ ```python
209
def sanitize_rpr(rpr_elem, ns):
    """Strip guide-text styling from a copied ``<w:rPr>`` in place.

    Removes the ``<w:color>`` child when its ``w:val`` is red ("FF0000",
    "FF0000FF" or "RED", case-insensitive) and removes the ``<w:i>`` child.
    Does nothing when ``rpr_elem`` is None. ``ns["w"]`` supplies the
    WordprocessingML namespace URI.
    """
    if rpr_elem is None:
        return
    w = ns["w"]
    red_values = ("FF0000", "FF0000FF", "RED")

    color_node = rpr_elem.find("{%s}color" % w)
    if color_node is not None:
        shade = color_node.get("{%s}val" % w, "")
        if shade.upper() in red_values:
            rpr_elem.remove(color_node)

    italic_node = rpr_elem.find("{%s}i" % w)
    if italic_node is not None:
        rpr_elem.remove(italic_node)
220
+ ```
221
+
222
+ Avoid charPrIDRef with negative `spacing` — causes character overlap.
223
+
224
+ ### Rule 7: Table Template Row Selection (HWPX)
225
+
226
+ For `table_content` fields, select the right template row for cloning:
227
+
228
+ **Normal row** — all rowSpan=1, full column count:
229
+ ```python
230
def find_normal_template_row(tbl, tr_start, tr_end):
    """Return a deep copy of the first "normal" row in ``tbl``.

    Rows ``tr_start``..``tr_end`` (inclusive, clamped to the table length) are
    scanned in order. A row is skipped when:

    * its text contains a summary label (합계 / 소계, spaces ignored),
    * its text is empty once dots, ellipses and spaces are removed,
    * its cell count differs from the table's ``colCnt`` (default 3), or
    * any cell has a ``<hp:cellSpan>`` with ``rowSpan`` other than 1.

    Returns ``None`` when no row qualifies.
    """
    HP = "http://www.hancom.co.kr/hwpml/2011/paragraph"
    span_tag = f"{{{HP}}}cellSpan"

    def row_span(tc):
        # Compare against None explicitly: a childless Element is falsy, so
        # `find(...) or {}` would throw away the real cellSpan element.
        span = tc.find(span_tag)
        return 1 if span is None else int(span.get("rowSpan", "1"))

    rows = tbl.findall(f"{{{HP}}}tr")
    col_cnt = int(tbl.get("colCnt", "3"))
    for i in range(tr_start, min(tr_end + 1, len(rows))):
        row = rows[i]
        text = "".join(t.text or "" for t in row.iter(f"{{{HP}}}t")).strip()
        normalized = text.replace(" ", "").replace("\u3000", "")
        if "합계" in normalized or "소계" in normalized:
            continue
        stripped = text.replace(".", "").replace("…", "").replace(" ", "")
        if not stripped:
            continue
        cells = row.findall(f"{{{HP}}}tc")
        if len(cells) != col_cnt:
            continue
        if all(row_span(tc) == 1 for tc in cells):
            return copy.deepcopy(row)
    return None
254
+ ```
255
+
256
+ **Fallback with rowSpan stripping**:
257
+ ```python
258
def strip_rowspan_from_template(tpl_row):
    """Flatten vertically merged cells of a template row in place.

    For each direct ``<hp:tc>`` whose ``<hp:cellSpan>`` has ``rowSpan`` > 1,
    set ``rowSpan`` to "1" and, if the cell has an ``<hp:cellSz>``, divide its
    ``height`` (default 2129) by the old span using integer division.
    """
    HP = "http://www.hancom.co.kr/hwpml/2011/paragraph"
    for cell in tpl_row.findall(f"{{{HP}}}tc"):
        span = cell.find(f"{{{HP}}}cellSpan")
        if span is None:
            continue
        row_span = int(span.get("rowSpan", "1"))
        if row_span <= 1:
            continue
        span.set("rowSpan", "1")
        size = cell.find(f"{{{HP}}}cellSz")
        if size is None:
            continue
        merged_height = int(size.get("height", "2129"))
        size.set("height", str(merged_height // row_span))
270
+ ```
271
+
272
+ **Summary row detection** — separate 합계/소계 from data rows:
273
+ ```python
274
def separate_summary_rows(data_rows):
    """Split rows into regular data rows and a summary (합계/소계) row.

    A row is a summary row when its first item, stripped and with ASCII and
    ideographic spaces removed, equals "합계" or "소계". If several rows
    qualify, the last one is returned as the summary.

    Returns:
        ``(regular_rows, summary_row_or_None)``.
    """
    drop_spaces = {ord(" "): None, ord("\u3000"): None}
    regular = []
    summary = None
    for row in data_rows:
        label = row[0].strip().translate(drop_spaces)
        if label == "합계" or label == "소계":
            summary = row
            continue
        regular.append(row)
    return regular, summary
283
+ ```
284
+
285
+ Deep-copy original summary row template to preserve colSpan structure.
286
+
287
+ ### Rule 8: SubList Recreation (HWPX)
288
+
289
+ When writing to a cell with no `<hp:subList>`, recreate one:
290
+ ```python
291
def ensure_sublist(tc):
    """Return the cell's ``<hp:subList>``, creating one if it has none.

    An existing direct ``<hp:subList>`` child is returned unchanged. Otherwise
    a new subList with default attributes is inserted as the first child of
    ``tc`` and returned.

    The element is built detached and inserted once. ``ET.SubElement`` would
    append it to ``tc`` by itself, so a following ``insert`` would leave the
    same subList in the cell twice.
    """
    HP = "http://www.hancom.co.kr/hwpml/2011/paragraph"
    for child in tc:
        if child.tag == f"{{{HP}}}subList":
            return child
    sl = ET.Element(f"{{{HP}}}subList", {
        "id": "", "textDirection": "HORIZONTAL", "lineWrap": "BREAK",
        "vertAlign": "CENTER", "linkListIDRef": "0", "linkListNextIDRef": "0",
        "textWidth": "0", "textHeight": "0", "hasTextRef": "0", "hasNumRef": "0"
    })
    tc.insert(0, sl)
    return sl
303
+ ```
304
+
305
+ Use `ensure_sublist(tc)` when writing to a cell. Append new `<hp:p>` to the returned container.
306
+
307
+ ### Rule 9: cellAddr Re-indexing (HWPX)
308
+
309
+ After inserting or deleting rows, ALL `<hp:cellAddr rowAddr="N"/>` must equal the 0-based row index:
310
+ ```python
311
def fix_celladdr(tbl):
    """Renumber ``rowAddr`` on every cell and refresh the table's ``rowCnt``.

    Each direct ``<hp:tr>`` of ``tbl`` gets its 0-based position written into
    the ``<hp:cellAddr rowAddr>`` of each direct ``<hp:tc>`` that has one;
    ``rowCnt`` is then set to the number of rows.
    """
    HP = "http://www.hancom.co.kr/hwpml/2011/paragraph"
    rows = tbl.findall(f"{{{HP}}}tr")
    for position, row in enumerate(rows):
        row_label = str(position)
        for cell in row.findall(f"{{{HP}}}tc"):
            address = cell.find(f"{{{HP}}}cellAddr")
            if address is None:
                continue
            address.set("rowAddr", row_label)
    tbl.set("rowCnt", str(len(rows)))
319
+ ```
320
+
321
+ Always call after row insertion/deletion. Duplicate rowAddr causes Polaris to silently hide rows.
322
+
323
+ ## Image Insertion Surgery
324
+
325
+ ### DOCX Image Insertion
326
+ 1. Copy image to `word/media/imageN.ext` (next available number)
327
+ 2. Add relationship in `word/_rels/document.xml.rels`:
328
+ ```xml
329
+ <Relationship Id="rIdN" Type="http://schemas.openxmlformats.org/officeDocument/2006/relationships/image" Target="media/imageN.png"/>
330
+ ```
331
+ 3. Add Content_Types entry if extension not registered:
332
+ ```xml
333
+ <Default Extension="png" ContentType="image/png"/>
334
+ ```
335
+ 4. Insert drawing element in target paragraph:
336
+ ```xml
337
+ <w:r><w:drawing>
338
+ <wp:inline distT="0" distB="0" distL="0" distR="0">
339
+ <wp:extent cx="{width_emu}" cy="{height_emu}"/>
340
+ <wp:docPr id="{uid}" name="Picture {uid}"/>
341
+ <a:graphic xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main">
342
+ <a:graphicData uri="http://schemas.openxmlformats.org/drawingml/2006/picture">
343
+ <pic:pic xmlns:pic="http://schemas.openxmlformats.org/drawingml/2006/picture">
344
+ <pic:blipFill><a:blip r:embed="rIdN"/></pic:blipFill>
345
+ <pic:spPr><a:xfrm><a:ext cx="{width_emu}" cy="{height_emu}"/></a:xfrm></pic:spPr>
346
+ </pic:pic>
347
+ </a:graphicData>
348
+ </a:graphic>
349
+ </wp:inline>
350
+ </w:drawing></w:r>
351
+ ```
352
+
353
+ ### HWPX Image Insertion
354
+ 1. Copy image to `BinData/imageN.ext` (next available N)
355
+ 2. Register in `content.hpf` manifest only:
356
+ ```xml
357
+ <opf:item id="imageN" href="BinData/imageN.ext" media-type="image/png" isEmbeded="1"/>
358
+ ```
359
+ Do NOT add `<hh:binDataItems>` to `header.xml`.
360
+ 3. Insert complete `<hp:pic>` in target cell — see `references/image-xml-patterns.md` for the full element structure.
361
+
362
+ **Critical HWPX image rules** (all 8 must be followed):
363
+ - `<img>` uses `hc:` namespace (NOT `hp:img`)
364
+ - `<imgRect>` has 4 `<hc:pt0..3>` children (NOT inline attributes)
365
+ - All children required: offset, orgSz, curSz, flip, rotationInfo, renderingInfo, inMargin
366
+ - No spurious elements (picSz, picOutline, caption, shapeComment, picRect)
367
+ - `imgClip` right/bottom = actual pixel dimensions from PIL (NOT zeros)
368
+ - Do NOT add `<hp:lineShape>`
369
+ - `hp:pos`: `flowWithText="0"` `horzRelTo="COLUMN"`
370
+ - Sequential IDs: find max existing `id` in section XML + 1
371
+
372
+ ## References
373
+
374
+ See `references/field-detection-patterns.md` for advanced detection heuristics.
375
+ See `references/section-range-detection.md` for dynamic section content range detection (HWPX).
376
+ See `references/section-image-interleaving.md` for image interleaving algorithm in section content.
377
+ See `references/image-xml-patterns.md` for complete image element structures and `build_hwpx_pic_element()`.
@@ -0,0 +1,73 @@
1
+ # HWPX XML Knowledge
2
+
3
+ OWPML structure for surgical HWPX document editing.
4
+
5
+ ## HWPX Structure
6
+
7
+ HWPX (Hancom Office Open XML) is a ZIP archive:
8
+ ```
9
+ mimetype — "application/hwp+zip" (MUST be first entry, uncompressed)
10
+ META-INF/
11
+ manifest.xml — file manifest
12
+ Contents/
13
+ content.hpf — content manifest (OPF package)
14
+ header.xml — document header (styles, fonts, charPr definitions)
15
+ section0.xml — first section (PRIMARY TARGET)
16
+ section1.xml — additional sections
17
+ BinData/ — embedded images and binary data
18
+ Preview/
19
+ PrvImage.png — thumbnail preview
20
+ settings.xml — document settings
21
+ ```
22
+
23
+ ## Key XML Elements
24
+
25
+ ### Namespaces
26
+ ```xml
27
+ xmlns:hp="http://www.hancom.co.kr/hwpml/2011/paragraph"
28
+ xmlns:hs="http://www.hancom.co.kr/hwpml/2011/section"
29
+ xmlns:hc="http://www.hancom.co.kr/hwpml/2011/core"
30
+ xmlns:hh="http://www.hancom.co.kr/hwpml/2011/head"
31
+ ```
32
+
33
+ ### Section Structure
34
+ ```xml
35
+ <hs:sec>
36
+ <hp:p> <!-- paragraph -->
37
+ <hp:run> <!-- text run -->
38
+ <hp:rPr> <!-- run properties (charPrIDRef) -->
39
+ <hp:t> <!-- text content -->
40
+ </hp:run>
41
+ </hp:p>
42
+ </hs:sec>
43
+ ```
44
+
45
+ ### Tables
46
+ ```xml
47
+ <hp:tbl rowCnt="N" colCnt="M">
48
+ <hp:tr> <!-- table row -->
49
+ <hp:tc> <!-- table cell -->
50
+ <hp:cellAddr colAddr="0" rowAddr="0"/>
51
+ <hp:cellSpan colSpan="1" rowSpan="1"/>
52
+ <hp:cellSz width="W" height="H"/>
53
+ <hp:subList> <!-- ~65% of cells wrap content here -->
54
+ <hp:p> <!-- cell content -->
55
+ </hp:subList>
56
+ </hp:tc>
57
+ </hp:tr>
58
+ </hp:tbl>
59
+ ```
60
+
61
+ ## Critical Notes
62
+
63
+ 1. `mimetype` file MUST be the first ZIP entry and stored uncompressed
64
+ 2. Korean text is UTF-8 encoded
65
+ 3. Table cells often use complex merging (`hp:cellSpan`) for form layouts
66
+ 4. Section files are independent — each is a complete XML document
67
+ 5. Character properties reference IDs defined in `header.xml` (`charPrIDRef`)
68
+ 6. After any `tree.write()`, you must restore ALL 14 original namespace declarations on the root element
69
+
70
+ ## References
71
+
72
+ See `references/hwpx-structure.md` for unpacking, namespace preservation fix, repackaging, and critical rules.
73
+ See `references/hwpx-field-patterns.md` for field detection patterns (10 patterns including subList wrapping, cellAddr addressing, charPrIDRef resolution).
@@ -0,0 +1,127 @@
1
+ # Image Sourcing
2
+
3
+ Strategies for sourcing images to fill template fields requiring photos, logos, signatures, or illustrations.
4
+
5
+ ## Image Types
6
+
7
+ | image_type | Description | Auto-generate? |
8
+ |-----------|-------------|----------------|
9
+ | `photo` | ID/profile pictures | No — user-provided only |
10
+ | `logo` | Company logos | No — user-provided only |
11
+ | `signature` | Signature fields | NEVER — must be user-provided |
12
+ | `figure` | Illustrations, diagrams | Yes — auto-generated during fill |
13
+
14
+ ## Sourcing Priority
15
+
16
+ ### 1. Check Ingested Sources
17
+ Search `.dokkit/sources/` for image files (PNG, JPG, JPEG, BMP, TIFF):
18
+ - Match by field's `image_type` and source metadata
19
+ - Set `image_source: "ingested"` and `image_file` to the path
20
+
21
+ ### 2. User-Provided File
22
+ Via `/dokkit modify "use <file>"`:
23
+ - Search `.dokkit/sources/`, then project root
24
+ - Copy to `.dokkit/images/`
25
+
26
+ ### 3. AI Generation
27
+ ```bash
28
+ python scripts/source_images.py generate \
29
+ --prompt "인포그래픽: AI 감정 케어 플랫폼 4단계 로드맵" \
30
+ --preset infographic \
31
+ --output-dir .dokkit/images/ \
32
+ --project-dir . \
33
+ --lang ko
34
+ ```
35
+ Parse `__RESULT__` JSON from stdout: `{"image_id": "...", "file_path": "...", "source_type": "generated"}`
36
+
37
+ #### Language Options (`--lang`)
38
+
39
+ | Value | Behavior | Example |
40
+ |---|---|---|
41
+ | `ko` | **Default.** All text in Korean only. English strictly forbidden. | 제목, 라벨, 설명 모두 한국어 |
42
+ | `en` | All text in English only. | Titles, labels, descriptions in English |
43
+ | `ko+en` | Mixed. Titles in Korean, technical terms may use English. | 제목은 한국어, Node.js 등 기술 용어는 영어 허용 |
44
+ | `ja` | All text in Japanese only. | 日本語のみ |
45
+ | `<code>` | Any ISO 639-1 code. | `zh`, `es`, `fr`, `de`, `pt` |
46
+ | `<a>+<b>` | Mixed: primary + secondary language. | `ko+ja`, `en+ko` |
47
+
48
+ #### Presets
49
+
50
+ | Preset | Style | Default Aspect Ratio |
51
+ |---|---|---|
52
+ | `technical_illustration` | Clean diagrams, labeled components | 16:9 |
53
+ | `infographic` | Icon-based, corporate color palette | 16:9 |
54
+ | `photorealistic` | High-quality, natural lighting | 4:3 |
55
+ | `concept` | Abstract/modern, business proposal style | 1:1 |
56
+ | `chart` | Clean data visualization | 16:9 |
57
+
58
+ Use `--aspect-ratio 16:9` to override. Use `--no-enhance` to skip preset style injection (language instruction still applies).
59
+
60
+ **Model**: `gemini-3-pro-image-preview` (nano-banana). Best for accurate text rendering in non-Latin scripts.
61
+
62
+ ### 4. Web Search
63
+ ```bash
64
+ python scripts/source_images.py search \
65
+ --query "company logo example" \
66
+ --output-dir .dokkit/images/
67
+ ```
68
+ Parse `__RESULT__` JSON: `{"image_id": "...", "file_path": "...", "source_type": "searched"}`
69
+ (Note: search is not yet implemented — directs user to provide images manually.)
70
+
71
+ ## Prompt Templates by Image Type
72
+
73
+ | image_type | Suggested prompt |
74
+ |-----------|-----------------|
75
+ | photo | "Professional ID photo, white background, formal attire" |
76
+ | logo | "Clean company logo, transparent background, modern design" |
77
+ | signature | **NEVER generate** — signatures must be user-provided |
78
+ | figure | Derive from field label and section context |
79
+
80
+ ## Section Content Image Generation
81
+
82
+ For `image_opportunities` in `section_content` fields — auto-generated during `/dokkit fill` (decorative/explanatory, not identity-sensitive).
83
+
84
+ ### Prompt Templates by Content Type
85
+
86
+ | content_type | preset | Prompt guidance |
87
+ |---|---|---|
88
+ | diagram | `technical_illustration` | "Technical architecture/system diagram showing [concept]. Clean lines, labeled components." |
89
+ | flowchart | `technical_illustration` | "Process flowchart showing [steps]. Left-to-right flow, clear arrows, numbered steps." |
90
+ | data | `infographic` | "Data visualization showing [metric/trend]. Clean chart style, professional colors." |
91
+ | concept | `technical_illustration` | "Conceptual illustration of [idea]. Abstract/modern style, suitable for business proposal." |
92
+ | infographic | `infographic` | "Infographic comparing [items]. Icon-based, clean layout, corporate color palette." |
93
+
94
+ ### Dimension Defaults
95
+
96
+ HWPML units: 1/7200 inch (~283.46 units/mm). ~77% of A4 text width = 36,000 units.
97
+
98
+ | content_type | HWPML w x h | Approx mm | EMU cx x cy |
99
+ |---|---|---|---|
100
+ | diagram | 36,000 x 24,000 | 127x85 | 4,572,000 x 3,048,000 |
101
+ | flowchart | 36,000 x 24,000 | 127x85 | 4,572,000 x 3,048,000 |
102
+ | data | 36,000 x 20,000 | 127x71 | 4,572,000 x 2,540,000 |
103
+ | concept | 28,000 x 28,000 | 99x99 | 3,556,000 x 3,556,000 |
104
+ | infographic | 36,000 x 24,000 | 127x85 | 4,572,000 x 3,048,000 |
105
+
106
+ ## Default Cell-Level Dimensions
107
+
108
+ | image_type | Width (mm) | Height (mm) | Width (EMU) | Height (EMU) |
109
+ |-----------|-----------|------------|------------|-------------|
110
+ | photo | 35 | 45 | 1,260,000 | 1,620,000 |
111
+ | logo | 50 | 50 | 1,800,000 | 1,800,000 |
112
+ | signature | 40 | 15 | 1,440,000 | 540,000 |
113
+ | figure | 100 | 75 | 3,600,000 | 2,700,000 |
114
+
115
+ Conversion: 1 mm = 36,000 EMU. For HWPX, use HWPML unit system.
116
+
117
+ ## Rules
118
+
119
+ - Signatures MUST be user-provided — never generate or search
120
+ - Never auto-generate/download images without user approval EXCEPT section content images (auto-generated during fill)
121
+ - Ingested images can be inserted automatically
122
+ - Prefer user-provided over generated
123
+ - Image format must be PNG or JPG (compatible with both DOCX and HWPX)
124
+
125
+ ## References
126
+
127
+ See `references/image-xml-patterns.md` for complete DOCX/HWPX image element structures, registration patterns, and the `build_hwpx_pic_element()` function.