devlyn-cli 0.5.2 → 0.5.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/devlyn.js +1 -0
- package/optional-skills/dokkit/ANALYSIS.md +198 -0
- package/optional-skills/dokkit/COMMANDS.md +365 -0
- package/optional-skills/dokkit/DOCX-XML.md +76 -0
- package/optional-skills/dokkit/EXPORT.md +102 -0
- package/optional-skills/dokkit/FILLING.md +377 -0
- package/optional-skills/dokkit/HWPX-XML.md +73 -0
- package/optional-skills/dokkit/IMAGE-SOURCING.md +127 -0
- package/optional-skills/dokkit/INGESTION.md +65 -0
- package/optional-skills/dokkit/SKILL.md +153 -0
- package/optional-skills/dokkit/STATE.md +60 -0
- package/optional-skills/dokkit/references/docx-field-patterns.md +151 -0
- package/optional-skills/dokkit/references/docx-structure.md +58 -0
- package/optional-skills/dokkit/references/field-detection-patterns.md +130 -0
- package/optional-skills/dokkit/references/hwpx-field-patterns.md +461 -0
- package/optional-skills/dokkit/references/hwpx-structure.md +159 -0
- package/optional-skills/dokkit/references/image-opportunity-heuristics.md +121 -0
- package/optional-skills/dokkit/references/image-xml-patterns.md +338 -0
- package/optional-skills/dokkit/references/section-image-interleaving.md +346 -0
- package/optional-skills/dokkit/references/section-range-detection.md +118 -0
- package/optional-skills/dokkit/references/state-schema.md +143 -0
- package/optional-skills/dokkit/references/supported-formats.md +67 -0
- package/optional-skills/dokkit/scripts/compile_hwpx.py +134 -0
- package/optional-skills/dokkit/scripts/detect_fields.py +301 -0
- package/optional-skills/dokkit/scripts/detect_fields_hwpx.py +286 -0
- package/optional-skills/dokkit/scripts/export_pdf.py +99 -0
- package/optional-skills/dokkit/scripts/parse_hwpx.py +185 -0
- package/optional-skills/dokkit/scripts/parse_image_with_gemini.py +159 -0
- package/optional-skills/dokkit/scripts/parse_xlsx.py +98 -0
- package/optional-skills/dokkit/scripts/source_images.py +365 -0
- package/optional-skills/dokkit/scripts/validate_docx.py +142 -0
- package/optional-skills/dokkit/scripts/validate_hwpx.py +281 -0
- package/optional-skills/dokkit/scripts/validate_state.py +132 -0
- package/package.json +1 -1
|
@@ -0,0 +1,346 @@
|
|
|
1
|
+
# Section Content Image Interleaving
|
|
2
|
+
|
|
3
|
+
Algorithm and code patterns for inserting AI-generated images within `section_content` fields, interleaved between text paragraphs.
|
|
4
|
+
|
|
5
|
+
## Overview
|
|
6
|
+
|
|
7
|
+
When a `section_content` field has `image_opportunities` in analysis.json, the filler must:
|
|
8
|
+
1. Resolve each opportunity's anchor text to a specific paragraph in the XML
|
|
9
|
+
2. Build an image paragraph (`<hp:p>` or `<w:p>`) containing the image element
|
|
10
|
+
3. Insert the image paragraph **after** the anchor paragraph
|
|
11
|
+
4. Register the image in the document manifest
|
|
12
|
+
|
|
13
|
+
## Algorithm: `fill_section_content_with_images()`
|
|
14
|
+
|
|
15
|
+
```python
|
|
16
|
+
def fill_section_content_with_images(
|
|
17
|
+
section_root, # The section XML root element
|
|
18
|
+
content_elements, # List of XML elements in the section_content range
|
|
19
|
+
mapped_value, # The text content (markdown string)
|
|
20
|
+
image_opportunities,# List of image opportunity dicts from analysis.json
|
|
21
|
+
format_context, # Dict with charPrIDs, header_path, etc.
|
|
22
|
+
template_type, # "hwpx" or "docx"
|
|
23
|
+
work_dir, # Path to template_work directory
|
|
24
|
+
):
|
|
25
|
+
"""Fill section content with text AND interleaved images.
|
|
26
|
+
|
|
27
|
+
Strategy:
|
|
28
|
+
1. First, fill the section content with formatted text (existing logic)
|
|
29
|
+
2. Then, for each sourced image opportunity, find the anchor paragraph
|
|
30
|
+
and insert an image paragraph after it
|
|
31
|
+
"""
|
|
32
|
+
# Step 1: Fill text content using existing fill_section_content() logic
|
|
33
|
+
# (This creates all the text paragraphs with markdown formatting)
|
|
34
|
+
fill_section_content(section_root, content_elements, mapped_value, format_context, template_type)
|
|
35
|
+
|
|
36
|
+
# Step 2: Insert images at anchor points
|
|
37
|
+
sourced = [op for op in image_opportunities if op.get("status") == "sourced" and op.get("image_file")]
|
|
38
|
+
|
|
39
|
+
for opportunity in sourced:
|
|
40
|
+
anchor_text = opportunity["insertion_point"]["anchor_text"]
|
|
41
|
+
image_file = opportunity["image_file"]
|
|
42
|
+
dims = opportunity.get("dimensions", {})
|
|
43
|
+
|
|
44
|
+
# Find the anchor paragraph
|
|
45
|
+
anchor_p = find_anchor_paragraph(section_root, content_elements, anchor_text, template_type)
|
|
46
|
+
if anchor_p is None:
|
|
47
|
+
print(f"WARNING: Anchor text not found for {opportunity['opportunity_id']}, skipping image")
|
|
48
|
+
opportunity["status"] = "skipped"
|
|
49
|
+
continue
|
|
50
|
+
|
|
51
|
+
# Build image paragraph
|
|
52
|
+
if template_type == "hwpx":
|
|
53
|
+
img_p = build_hwpx_image_paragraph(
|
|
54
|
+
image_file, dims, format_context, work_dir
|
|
55
|
+
)
|
|
56
|
+
else:
|
|
57
|
+
img_p = build_docx_image_paragraph(
|
|
58
|
+
image_file, dims, format_context, work_dir
|
|
59
|
+
)
|
|
60
|
+
|
|
61
|
+
# Insert after anchor paragraph
|
|
62
|
+
insert_after_element(section_root, anchor_p, img_p)
|
|
63
|
+
opportunity["status"] = "inserted"
|
|
64
|
+
```
|
|
65
|
+
|
|
66
|
+
## Anchor Text Resolution
|
|
67
|
+
|
|
68
|
+
The anchor text is a distinctive Korean phrase taken from the paragraph **after which** the image should be inserted.
|
|
69
|
+
|
|
70
|
+
```python
|
|
71
|
+
def find_anchor_paragraph(section_root, content_elements, anchor_text, template_type):
|
|
72
|
+
"""Find the paragraph element containing the anchor text.
|
|
73
|
+
|
|
74
|
+
Search strategy:
|
|
75
|
+
1. Exact substring match in paragraph text
|
|
76
|
+
2. Normalized match (strip whitespace differences)
|
|
77
|
+
3. Partial match (first 20 chars of anchor)
|
|
78
|
+
|
|
79
|
+
Returns the <hp:p> or <w:p> element, or None if not found.
|
|
80
|
+
"""
|
|
81
|
+
ns_t = "{http://www.hancom.co.kr/hwpml/2011/paragraph}t" if template_type == "hwpx" else "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}t"
|
|
82
|
+
ns_p = "{http://www.hancom.co.kr/hwpml/2011/paragraph}p" if template_type == "hwpx" else "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}p"
|
|
83
|
+
|
|
84
|
+
# Collect all paragraphs within the content range
|
|
85
|
+
paragraphs = []
|
|
86
|
+
for elem in content_elements:
|
|
87
|
+
if elem.tag == ns_p:
|
|
88
|
+
paragraphs.append(elem)
|
|
89
|
+
for child_p in elem.iter(ns_p):
|
|
90
|
+
if child_p not in paragraphs:
|
|
91
|
+
paragraphs.append(child_p)
|
|
92
|
+
|
|
93
|
+
# Strategy 1: Exact substring match
|
|
94
|
+
for p in paragraphs:
|
|
95
|
+
text = "".join(t.text or "" for t in p.iter(ns_t))
|
|
96
|
+
if anchor_text in text:
|
|
97
|
+
return p
|
|
98
|
+
|
|
99
|
+
# Strategy 2: Normalized match (collapse whitespace)
|
|
100
|
+
import re
|
|
101
|
+
normalized_anchor = re.sub(r'\s+', ' ', anchor_text.strip())
|
|
102
|
+
for p in paragraphs:
|
|
103
|
+
text = "".join(t.text or "" for t in p.iter(ns_t))
|
|
104
|
+
normalized_text = re.sub(r'\s+', ' ', text.strip())
|
|
105
|
+
if normalized_anchor in normalized_text:
|
|
106
|
+
return p
|
|
107
|
+
|
|
108
|
+
# Strategy 3: Partial match (first 20 chars)
|
|
109
|
+
partial = anchor_text[:20]
|
|
110
|
+
for p in paragraphs:
|
|
111
|
+
text = "".join(t.text or "" for t in p.iter(ns_t))
|
|
112
|
+
if partial in text:
|
|
113
|
+
return p
|
|
114
|
+
|
|
115
|
+
return None # Not found — caller should skip with warning
|
|
116
|
+
```
|
|
117
|
+
|
|
118
|
+
## Image Paragraph Construction
|
|
119
|
+
|
|
120
|
+
### HWPX: `<hp:p>` with `<hp:pic>`
|
|
121
|
+
|
|
122
|
+
```python
|
|
123
|
+
def find_center_parapr(header_path):
|
|
124
|
+
"""Find first center-aligned paraPr from header.xml for image paragraphs."""
|
|
125
|
+
HH = "http://www.hancom.co.kr/hwpml/2011/head"
|
|
126
|
+
tree = ET.parse(header_path)
|
|
127
|
+
for pp in tree.getroot().iter(f"{{{HH}}}paraPr"):
|
|
128
|
+
align = pp.find(f"{{{HH}}}align")
|
|
129
|
+
if align is not None and align.get("horizontal") == "CENTER":
|
|
130
|
+
return pp.get("id")
|
|
131
|
+
return "0" # fallback to default
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def build_hwpx_image_paragraph(image_file, dims, format_context, work_dir):
|
|
135
|
+
"""Build an <hp:p> element containing an <hp:pic> for inline image display.
|
|
136
|
+
|
|
137
|
+
The paragraph contains:
|
|
138
|
+
- A <hp:run> with an empty <hp:t> (required for valid paragraph structure)
|
|
139
|
+
- A <hp:pic> element (the actual image)
|
|
140
|
+
- Center-aligned paraPrIDRef from header.xml
|
|
141
|
+
"""
|
|
142
|
+
HP = "http://www.hancom.co.kr/hwpml/2011/paragraph"
|
|
143
|
+
|
|
144
|
+
    # Use dimensions from opportunity, with defaults (~77% of A4 text width)
|
|
145
|
+
width_hwpml = dims.get("width_hwpml", 36000) # ~127mm default
|
|
146
|
+
height_hwpml = dims.get("height_hwpml", 24000) # ~85mm default
|
|
147
|
+
|
|
148
|
+
# Override small legacy defaults (15000 = ~53mm, too small for page)
|
|
149
|
+
if width_hwpml <= 16000:
|
|
150
|
+
width_hwpml = 36000
|
|
151
|
+
if height_hwpml <= 12000:
|
|
152
|
+
height_hwpml = int(width_hwpml * 0.667)
|
|
153
|
+
|
|
154
|
+
# Copy image to BinData/ and register in manifest
|
|
155
|
+
manifest_id, bin_path = register_image_in_manifest(image_file, work_dir)
|
|
156
|
+
|
|
157
|
+
# Get next sequential ID and zOrder from section XML
|
|
158
|
+
seq_id = format_context["next_seq_id"]
|
|
159
|
+
z_order = format_context["next_z_order"]
|
|
160
|
+
format_context["next_seq_id"] += 1
|
|
161
|
+
format_context["next_z_order"] += 1
|
|
162
|
+
|
|
163
|
+
# Find center-aligned paraPrIDRef from header.xml
|
|
164
|
+
center_parapr_id = format_context.get("center_parapr_id")
|
|
165
|
+
if center_parapr_id is None:
|
|
166
|
+
center_parapr_id = find_center_parapr(
|
|
167
|
+
os.path.join(work_dir, "Contents", "header.xml")
|
|
168
|
+
)
|
|
169
|
+
format_context["center_parapr_id"] = center_parapr_id
|
|
170
|
+
|
|
171
|
+
# Build the paragraph with center alignment
|
|
172
|
+
p = ET.Element(f"{{{HP}}}p")
|
|
173
|
+
p.set("paraPrIDRef", str(center_parapr_id))
|
|
174
|
+
p.set("styleIDRef", "0")
|
|
175
|
+
p.set("pageBreak", "0")
|
|
176
|
+
p.set("columnBreak", "0")
|
|
177
|
+
p.set("merged", "0")
|
|
178
|
+
|
|
179
|
+
# Hancom structure: <hp:run><hp:pic>...</hp:pic><hp:t/></hp:run>
|
|
180
|
+
# pic goes INSIDE run, t AFTER pic (verified against real Hancom Office output)
|
|
181
|
+
run = ET.SubElement(p, f"{{{HP}}}run")
|
|
182
|
+
run.set("charPrIDRef", str(format_context.get("normal_charpr_id", "0")))
|
|
183
|
+
|
|
184
|
+
# Build <hp:pic> element and append INSIDE the run
|
|
185
|
+
pic = build_hwpx_pic_element(
|
|
186
|
+
manifest_id=manifest_id,
|
|
187
|
+
image_path=bin_path,
|
|
188
|
+
width_hwpml=width_hwpml,
|
|
189
|
+
height_hwpml=height_hwpml,
|
|
190
|
+
seq_id=seq_id,
|
|
191
|
+
z_order=z_order,
|
|
192
|
+
)
|
|
193
|
+
run.append(pic)
|
|
194
|
+
|
|
195
|
+
# Empty <hp:t/> goes AFTER <hp:pic> inside the run
|
|
196
|
+
ET.SubElement(run, f"{{{HP}}}t")
|
|
197
|
+
|
|
198
|
+
return p
|
|
199
|
+
```
|
|
200
|
+
|
|
201
|
+
### DOCX: `<w:p>` with `<w:drawing>`
|
|
202
|
+
|
|
203
|
+
```python
|
|
204
|
+
def build_docx_image_paragraph(image_file, dims, format_context, work_dir):
|
|
205
|
+
"""Build a <w:p> element containing a <w:drawing> for inline image display."""
|
|
206
|
+
W = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
|
|
207
|
+
|
|
208
|
+
width_emu = dims.get("width_emu", 5400000) # 150mm default
|
|
209
|
+
height_emu = dims.get("height_emu", 3600000) # 100mm default
|
|
210
|
+
|
|
211
|
+
# Copy image to word/media/ and register relationship
|
|
212
|
+
rel_id, media_path = register_image_in_docx(image_file, work_dir)
|
|
213
|
+
pic_id = format_context["next_pic_id"]
|
|
214
|
+
format_context["next_pic_id"] += 1
|
|
215
|
+
filename = os.path.basename(media_path)
|
|
216
|
+
|
|
217
|
+
# Build the paragraph
|
|
218
|
+
p = ET.Element(f"{{{W}}}p")
|
|
219
|
+
|
|
220
|
+
# Center alignment for the image paragraph
|
|
221
|
+
pPr = ET.SubElement(p, f"{{{W}}}pPr")
|
|
222
|
+
jc = ET.SubElement(pPr, f"{{{W}}}jc")
|
|
223
|
+
jc.set(f"{{{W}}}val", "center")
|
|
224
|
+
|
|
225
|
+
# Build the run with drawing
|
|
226
|
+
r = ET.SubElement(p, f"{{{W}}}r")
|
|
227
|
+
# Use build_drawing_element() from dokkit-image-sourcing skill
|
|
228
|
+
drawing = build_drawing_element(rel_id, width_emu, height_emu, pic_id, filename)
|
|
229
|
+
r.append(drawing)
|
|
230
|
+
|
|
231
|
+
return p
|
|
232
|
+
```
|
|
233
|
+
|
|
234
|
+
## Element Insertion
|
|
235
|
+
|
|
236
|
+
```python
|
|
237
|
+
def insert_after_element(root, anchor_elem, new_elem):
|
|
238
|
+
"""Insert new_elem immediately after anchor_elem in the parent's children.
|
|
239
|
+
|
|
240
|
+
Handles both direct children and nested elements by building a parent map.
|
|
241
|
+
"""
|
|
242
|
+
parent_map = {c: p for p in root.iter() for c in p}
|
|
243
|
+
parent = parent_map.get(anchor_elem)
|
|
244
|
+
if parent is None:
|
|
245
|
+
return False
|
|
246
|
+
|
|
247
|
+
children = list(parent)
|
|
248
|
+
idx = children.index(anchor_elem)
|
|
249
|
+
parent.insert(idx + 1, new_elem)
|
|
250
|
+
return True
|
|
251
|
+
```
|
|
252
|
+
|
|
253
|
+
## Image Registration (same rules as cell-level images)
|
|
254
|
+
|
|
255
|
+
### HWPX Registration
|
|
256
|
+
```python
|
|
257
|
+
def register_image_in_manifest(image_file, work_dir):
|
|
258
|
+
"""Copy image to BinData/ and register in content.hpf manifest.
|
|
259
|
+
|
|
260
|
+
Returns (manifest_id, bin_path).
|
|
261
|
+
Same rules as cell-level image registration:
|
|
262
|
+
- Register in content.hpf ONLY
|
|
263
|
+
- Do NOT add to header.xml binDataItems
|
|
264
|
+
"""
|
|
265
|
+
import shutil
|
|
266
|
+
import os
|
|
267
|
+
|
|
268
|
+
# Find next available image number
|
|
269
|
+
bindata_dir = os.path.join(work_dir, "BinData")
|
|
270
|
+
os.makedirs(bindata_dir, exist_ok=True)
|
|
271
|
+
existing = [f for f in os.listdir(bindata_dir) if f.startswith("image")]
|
|
272
|
+
next_n = len(existing) + 1
|
|
273
|
+
ext = os.path.splitext(image_file)[1]
|
|
274
|
+
filename = f"image{next_n}{ext}"
|
|
275
|
+
bin_path = os.path.join(bindata_dir, filename)
|
|
276
|
+
shutil.copy2(image_file, bin_path)
|
|
277
|
+
|
|
278
|
+
# Register in content.hpf
|
|
279
|
+
manifest_id = f"image{next_n}"
|
|
280
|
+
content_hpf = os.path.join(work_dir, "Contents", "content.hpf")
|
|
281
|
+
# Add <opf:item id="imageN" href="BinData/imageN.ext" media-type="image/png" isEmbeded="1"/>
|
|
282
|
+
# (Filler agent performs the actual XML modification)
|
|
283
|
+
|
|
284
|
+
return manifest_id, bin_path
|
|
285
|
+
```
|
|
286
|
+
|
|
287
|
+
### DOCX Registration
|
|
288
|
+
```python
|
|
289
|
+
def register_image_in_docx(image_file, work_dir):
|
|
290
|
+
"""Copy image to word/media/ and register in relationships and Content_Types.
|
|
291
|
+
|
|
292
|
+
Returns (rel_id, media_path).
|
|
293
|
+
Same rules as cell-level image registration:
|
|
294
|
+
- Add relationship in word/_rels/document.xml.rels
|
|
295
|
+
- Add Content_Types entry if extension not registered
|
|
296
|
+
"""
|
|
297
|
+
import shutil
|
|
298
|
+
import os
|
|
299
|
+
|
|
300
|
+
media_dir = os.path.join(work_dir, "word", "media")
|
|
301
|
+
os.makedirs(media_dir, exist_ok=True)
|
|
302
|
+
existing = [f for f in os.listdir(media_dir) if f.startswith("image")]
|
|
303
|
+
next_n = len(existing) + 1
|
|
304
|
+
ext = os.path.splitext(image_file)[1]
|
|
305
|
+
filename = f"image{next_n}{ext}"
|
|
306
|
+
media_path = os.path.join(media_dir, filename)
|
|
307
|
+
shutil.copy2(image_file, media_path)
|
|
308
|
+
|
|
309
|
+
# Generate next relationship ID
|
|
310
|
+
rel_id = f"rId{next_n + 10}" # offset to avoid conflicts
|
|
311
|
+
# (Filler agent adds the actual relationship XML and Content_Types entry)
|
|
312
|
+
|
|
313
|
+
return rel_id, media_path
|
|
314
|
+
```
|
|
315
|
+
|
|
316
|
+
## Edge Cases
|
|
317
|
+
|
|
318
|
+
### Anchor text not found
|
|
319
|
+
- **Action**: Skip the image opportunity, set `status: "skipped"`, log a warning
|
|
320
|
+
- **Cause**: Text may have been modified during markdown rendering, or anchor was not distinctive enough
|
|
321
|
+
- **Mitigation**: Analyzer should choose anchor text that is unique within the field
|
|
322
|
+
|
|
323
|
+
### Image generation failure
|
|
324
|
+
- **Action**: The image_file will be null and status remains "pending" (fill-doc sets to "skipped" on failure)
|
|
325
|
+
- **Filler behavior**: Skip opportunities where `status != "sourced"`, proceed with text-only fill
|
|
326
|
+
- **No fallback**: Do not insert placeholder images or broken references
|
|
327
|
+
|
|
328
|
+
### Multiple images in same paragraph area
|
|
329
|
+
- **Rule**: Min 150 chars between image opportunities (enforced by analyzer)
|
|
330
|
+
- **If violated**: Insert images in order; later images shift down naturally
|
|
331
|
+
|
|
332
|
+
### Content too short after formatting
|
|
333
|
+
- **Rule**: If the filled section has fewer paragraphs than expected (e.g., due to markdown rendering differences), skip any opportunities whose anchor text cannot be found
|
|
334
|
+
- **Never force**: Do not insert images at approximate positions if anchor resolution fails
|
|
335
|
+
|
|
336
|
+
## Dimension Defaults for Section Content Images
|
|
337
|
+
|
|
338
|
+
HWPML units are 1/7200 inch (NOT hundredths of mm). ~77% of A4 text width = 36,000 units.
|
|
339
|
+
|
|
340
|
+
| content_type | HWPML width | HWPML height | Approx mm | EMU cx | EMU cy |
|
|
341
|
+
|---|---|---|---|---|---|
|
|
342
|
+
| diagram | 36,000 | 24,000 | 127x85 | 4,572,000 | 3,048,000 |
|
|
343
|
+
| flowchart | 36,000 | 24,000 | 127x85 | 4,572,000 | 3,048,000 |
|
|
344
|
+
| data | 36,000 | 20,000 | 127x71 | 4,572,000 | 2,540,000 |
|
|
345
|
+
| concept | 28,000 | 28,000 | 99x99 | 3,556,000 | 3,556,000 |
|
|
346
|
+
| infographic | 36,000 | 24,000 | 127x85 | 4,572,000 | 3,048,000 |
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
# Section Content Range Detection (HWPX)
|
|
2
|
+
|
|
3
|
+
## Problem
|
|
4
|
+
|
|
5
|
+
When `analysis.json` records `element_path: "section/children[N:M]"` for `section_content` fields, those indices refer to the **pre-tip-removal** state of the document. Tip box removal (Phase 2) deletes standalone tip paragraphs from the section root, shifting all subsequent child indices.
|
|
6
|
+
|
|
7
|
+
Using stale indices causes:
|
|
8
|
+
1. **Section titles get destroyed** — the range overlaps title elements
|
|
9
|
+
2. **Images don't show** — corrupted document structure
|
|
10
|
+
3. **Out-of-bounds errors** — indices exceed actual child count
|
|
11
|
+
|
|
12
|
+
## Solution: Dynamic Range Detection
|
|
13
|
+
|
|
14
|
+
After tip removal, **recompute** section content ranges by scanning the section root's children for structural markers (section title elements). This produces correct post-removal indices.
|
|
15
|
+
|
|
16
|
+
### Algorithm
|
|
17
|
+
|
|
18
|
+
```python
|
|
19
|
+
def find_section_content_ranges(root, hp_ns):
|
|
20
|
+
"""Find content ranges for each section_content field by locating title markers.
|
|
21
|
+
|
|
22
|
+
Must run AFTER tip box removal so indices are stable.
|
|
23
|
+
|
|
24
|
+
Returns dict mapping field IDs to (start, end) inclusive child index ranges.
|
|
25
|
+
"""
|
|
26
|
+
hp_tag = lambda name: f'{{{hp_ns}}}{name}'
|
|
27
|
+
children = list(root)
|
|
28
|
+
markers = {} # label -> child index
|
|
29
|
+
|
|
30
|
+
for i, child in enumerate(children):
|
|
31
|
+
text = ''.join(t.text or '' for t in child.iter(hp_tag('t'))).strip()
|
|
32
|
+
|
|
33
|
+
# --- Section title markers ---
|
|
34
|
+
# These are hp:p elements containing 1x2 hp:tbl with numbered headings.
|
|
35
|
+
# Match by section number + characteristic keywords.
|
|
36
|
+
# Use flexible matching: number prefix + key Korean terms.
|
|
37
|
+
|
|
38
|
+
if '1.' in text and '문제' in text and ('Problem' in text or '필요성' in text):
|
|
39
|
+
markers['sec1_title'] = i
|
|
40
|
+
elif '2.' in text and '실현' in text and ('Solution' in text or '개발' in text):
|
|
41
|
+
markers['sec2_title'] = i
|
|
42
|
+
elif '3.' in text and '성장' in text and ('Scale' in text or '사업화' in text):
|
|
43
|
+
markers['sec3_title'] = i
|
|
44
|
+
elif '4.' in text and '팀' in text and ('Team' in text or '대표자' in text):
|
|
45
|
+
markers['sec4_title'] = i
|
|
46
|
+
|
|
47
|
+
# --- End markers (tables/sections that follow the content) ---
|
|
48
|
+
elif '사업추진' in text and '일정' in text and '협약기간' in text:
|
|
49
|
+
markers['schedule1'] = i
|
|
50
|
+
elif '사업추진' in text and '일정' in text and '전체' in text:
|
|
51
|
+
markers['schedule2'] = i
|
|
52
|
+
elif '팀 구성' in text and '구분' in text and '직위' in text:
|
|
53
|
+
markers['team_table'] = i
|
|
54
|
+
|
|
55
|
+
# Build ranges: content starts after title, ends before next structural element
|
|
56
|
+
ranges = {}
|
|
57
|
+
if 'sec1_title' in markers and 'sec2_title' in markers:
|
|
58
|
+
ranges['field_028'] = (markers['sec1_title'] + 1, markers['sec2_title'] - 1)
|
|
59
|
+
if 'sec2_title' in markers and 'schedule1' in markers:
|
|
60
|
+
ranges['field_029'] = (markers['sec2_title'] + 1, markers['schedule1'] - 1)
|
|
61
|
+
if 'sec3_title' in markers and 'schedule2' in markers:
|
|
62
|
+
ranges['field_046'] = (markers['sec3_title'] + 1, markers['schedule2'] - 1)
|
|
63
|
+
if 'sec4_title' in markers and 'team_table' in markers:
|
|
64
|
+
ranges['field_051'] = (markers['sec4_title'] + 1, markers['team_table'] - 1)
|
|
65
|
+
|
|
66
|
+
return ranges
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
### Integration into fill_template.py
|
|
70
|
+
|
|
71
|
+
The filler agent MUST include this logic when generating `fill_template.py` for HWPX templates that have `section_content` fields:
|
|
72
|
+
|
|
73
|
+
```python
|
|
74
|
+
# Phase 2b: After tip removal, before filling
|
|
75
|
+
# Override stale analysis.json ranges with dynamically-detected correct ranges
|
|
76
|
+
dynamic_ranges = find_section_content_ranges(root, hp_ns)
|
|
77
|
+
for fid, dyn_range in dynamic_ranges.items():
|
|
78
|
+
if fid in field_refs:
|
|
79
|
+
field_refs[fid] = dyn_range
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
### Adapting for Different Templates
|
|
83
|
+
|
|
84
|
+
The marker detection patterns above are specific to the 예비창업패키지 사업계획서 template. For other templates:
|
|
85
|
+
|
|
86
|
+
1. **Identify structural markers** — section title elements that bound each `section_content` field
|
|
87
|
+
2. **Match by text content** — use keywords from the section titles that appear in the template
|
|
88
|
+
3. **Map field IDs** — connect each `section_content` field's ID from analysis.json to the correct marker pair
|
|
89
|
+
|
|
90
|
+
The general pattern is always:
|
|
91
|
+
- `content_start = title_marker_index + 1`
|
|
92
|
+
- `content_end = next_structural_element_index - 1`
|
|
93
|
+
|
|
94
|
+
### Why Not Fix analysis.json Instead?
|
|
95
|
+
|
|
96
|
+
The analyzer runs BEFORE tip removal, so it can only record pre-removal indices. The correct approach is:
|
|
97
|
+
1. Analyzer records approximate ranges (useful for documentation)
|
|
98
|
+
2. Filler dynamically recomputes exact ranges after cleanup
|
|
99
|
+
|
|
100
|
+
### imgDim: Use Actual Image Dimensions
|
|
101
|
+
|
|
102
|
+
When inserting images via `hp:pic`, the `hp:imgDim` element should use the **actual pixel dimensions** of the image file, not the display size. Use PIL/Pillow:
|
|
103
|
+
|
|
104
|
+
```python
|
|
105
|
+
from PIL import Image
|
|
106
|
+
|
|
107
|
+
# Display size (for hp:sz, hp:picRect, hp:imgRect)
|
|
108
|
+
img_w = SECTION_IMG_WIDTH # e.g. 29400
|
|
109
|
+
img_h = SECTION_IMG_HEIGHT # e.g. 16538
|
|
110
|
+
|
|
111
|
+
# Actual pixel dimensions (for hp:imgDim only)
|
|
112
|
+
dim_w, dim_h = img_w, img_h
|
|
113
|
+
try:
|
|
114
|
+
with Image.open(image_path) as pil_img:
|
|
115
|
+
dim_w, dim_h = pil_img.size
|
|
116
|
+
except Exception:
|
|
117
|
+
pass # Fall back to display dimensions
|
|
118
|
+
```
|
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
# Dokkit State Schema
|
|
2
|
+
|
|
3
|
+
## state.json
|
|
4
|
+
|
|
5
|
+
```json
|
|
6
|
+
{
|
|
7
|
+
"version": "1.0",
|
|
8
|
+
"created": "2026-02-07T12:00:00Z",
|
|
9
|
+
"updated": "2026-02-07T12:30:00Z",
|
|
10
|
+
|
|
11
|
+
"sources": [
|
|
12
|
+
{
|
|
13
|
+
"id": "src_001",
|
|
14
|
+
"file_path": "docs/sample_source/resume.pdf",
|
|
15
|
+
"file_type": "pdf",
|
|
16
|
+
"display_name": "resume.pdf",
|
|
17
|
+
"content_path": ".dokkit/sources/resume.md",
|
|
18
|
+
"metadata_path": ".dokkit/sources/resume.json",
|
|
19
|
+
"summary": "Personal resume with education, work history, and skills",
|
|
20
|
+
"status": "ready",
|
|
21
|
+
"ingested_at": "2026-02-07T12:05:00Z"
|
|
22
|
+
}
|
|
23
|
+
],
|
|
24
|
+
|
|
25
|
+
"template": {
|
|
26
|
+
"file_path": "docs/sample_template/template.docx",
|
|
27
|
+
"file_type": "docx",
|
|
28
|
+
"display_name": "template.docx",
|
|
29
|
+
"work_dir": ".dokkit/template_work/",
|
|
30
|
+
"set_at": "2026-02-07T12:15:00Z"
|
|
31
|
+
},
|
|
32
|
+
|
|
33
|
+
"analysis": {
|
|
34
|
+
"path": ".dokkit/analysis.json",
|
|
35
|
+
"total_fields": 22,
|
|
36
|
+
"mapped": 18,
|
|
37
|
+
"unmapped": 4,
|
|
38
|
+
"analyzed_at": "2026-02-07T12:16:00Z",
|
|
39
|
+
"image_fields": 2,
|
|
40
|
+
"image_fields_sourced": 1,
|
|
41
|
+
"image_fields_pending": 1
|
|
42
|
+
},
|
|
43
|
+
|
|
44
|
+
"filled_document": {
|
|
45
|
+
"status": "review",
|
|
46
|
+
"filled_at": "2026-02-07T12:20:00Z",
|
|
47
|
+
"modifications": [
|
|
48
|
+
{
|
|
49
|
+
"instruction": "Change phone to 010-1234-5678",
|
|
50
|
+
"fields_affected": ["field_005"],
|
|
51
|
+
"modified_at": "2026-02-07T12:25:00Z"
|
|
52
|
+
}
|
|
53
|
+
]
|
|
54
|
+
},
|
|
55
|
+
|
|
56
|
+
"exports": [
|
|
57
|
+
{
|
|
58
|
+
"format": "docx",
|
|
59
|
+
"output_path": ".dokkit/output/filled_template.docx",
|
|
60
|
+
"exported_at": "2026-02-07T12:30:00Z",
|
|
61
|
+
"file_size": 45678
|
|
62
|
+
}
|
|
63
|
+
]
|
|
64
|
+
}
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
## Field Definitions
|
|
68
|
+
|
|
69
|
+
### Root
|
|
70
|
+
| Field | Type | Required | Description |
|
|
71
|
+
|-------|------|----------|-------------|
|
|
72
|
+
| version | string | yes | Schema version ("1.0") |
|
|
73
|
+
| created | string | yes | ISO 8601 timestamp of workspace creation |
|
|
74
|
+
| updated | string | no | ISO 8601 timestamp of last update |
|
|
75
|
+
| sources | array | yes | List of ingested source documents |
|
|
76
|
+
| template | object\|null | yes | Current template being filled |
|
|
77
|
+
| analysis | object\|null | yes | Template analysis metadata |
|
|
78
|
+
| filled_document | object\|null | yes | Filled document status |
|
|
79
|
+
| exports | array | yes | List of exports performed |
|
|
80
|
+
|
|
81
|
+
### Source Entry
|
|
82
|
+
| Field | Type | Required | Description |
|
|
83
|
+
|-------|------|----------|-------------|
|
|
84
|
+
| id | string | yes | Unique identifier (src_NNN) |
|
|
85
|
+
| file_path | string | yes | Original file location |
|
|
86
|
+
| file_type | string | yes | Detected format |
|
|
87
|
+
| display_name | string | yes | Human-readable name |
|
|
88
|
+
| content_path | string | yes | Path to .md content file |
|
|
89
|
+
| metadata_path | string | yes | Path to .json sidecar |
|
|
90
|
+
| summary | string | yes | Brief content summary |
|
|
91
|
+
| status | string | yes | "processing" \| "ready" \| "error" |
|
|
92
|
+
| ingested_at | string | yes | ISO 8601 timestamp |
|
|
93
|
+
| error_message | string | no | Error details (when status=error) |
|
|
94
|
+
|
|
95
|
+
### Template
|
|
96
|
+
| Field | Type | Required | Description |
|
|
97
|
+
|-------|------|----------|-------------|
|
|
98
|
+
| file_path | string | yes | Original template location |
|
|
99
|
+
| file_type | string | yes | "docx" \| "hwpx" |
|
|
100
|
+
| display_name | string | yes | Human-readable name |
|
|
101
|
+
| work_dir | string | yes | Path to unpacked working copy |
|
|
102
|
+
| set_at | string | yes | ISO 8601 timestamp |
|
|
103
|
+
|
|
104
|
+
### Analysis
|
|
105
|
+
| Field | Type | Required | Description |
|
|
106
|
+
|-------|------|----------|-------------|
|
|
107
|
+
| path | string | yes | Path to analysis.json |
|
|
108
|
+
| total_fields | integer | yes | Total fields detected |
|
|
109
|
+
| mapped | integer | yes | Fields with source mappings |
|
|
110
|
+
| unmapped | integer | yes | Fields without mappings |
|
|
111
|
+
| image_fields | integer | no | Total image fields detected |
|
|
112
|
+
| image_fields_sourced | integer | no | Image fields with source images |
|
|
113
|
+
| image_fields_pending | integer | no | Image fields awaiting images |
|
|
114
|
+
| analyzed_at | string | yes | ISO 8601 timestamp |
|
|
115
|
+
|
|
116
|
+
### Filled Document
|
|
117
|
+
| Field | Type | Required | Description |
|
|
118
|
+
|-------|------|----------|-------------|
|
|
119
|
+
| status | string | yes | "filling" \| "review" \| "modified" \| "finalized" |
|
|
120
|
+
| filled_at | string | yes | ISO 8601 timestamp |
|
|
121
|
+
| modifications | array | no | List of modification records |
|
|
122
|
+
|
|
123
|
+
### Export Entry
|
|
124
|
+
| Field | Type | Required | Description |
|
|
125
|
+
|-------|------|----------|-------------|
|
|
126
|
+
| format | string | yes | "docx" \| "hwpx" \| "pdf" |
|
|
127
|
+
| output_path | string | yes | Path to exported file |
|
|
128
|
+
| exported_at | string | yes | ISO 8601 timestamp |
|
|
129
|
+
| file_size | integer | no | File size in bytes |
|
|
130
|
+
| warnings | array | no | Conversion warnings |
|
|
131
|
+
|
|
132
|
+
## Valid Status Values
|
|
133
|
+
|
|
134
|
+
### Source Status
|
|
135
|
+
- `processing` — currently being parsed
|
|
136
|
+
- `ready` — successfully ingested
|
|
137
|
+
- `error` — parsing failed
|
|
138
|
+
|
|
139
|
+
### Filled Document Status
|
|
140
|
+
- `filling` — fields being mapped and filled
|
|
141
|
+
- `review` — filling complete, awaiting review
|
|
142
|
+
- `modified` — user requested changes
|
|
143
|
+
- `finalized` — user approved, ready for export
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
# Supported Source Formats
|
|
2
|
+
|
|
3
|
+
## Docling-Supported Formats
|
|
4
|
+
|
|
5
|
+
### PDF
|
|
6
|
+
- Text PDFs: direct text extraction
|
|
7
|
+
- Scanned PDFs: OCR via Docling's built-in OCR
|
|
8
|
+
- Mixed PDFs: handles both text and image regions
|
|
9
|
+
- Tables: extracted as markdown tables
|
|
10
|
+
|
|
11
|
+
### DOCX (Microsoft Word)
|
|
12
|
+
- Paragraphs, headings, lists
|
|
13
|
+
- Tables with merged cells
|
|
14
|
+
- Embedded images (extracted as descriptions)
|
|
15
|
+
- Headers/footers
|
|
16
|
+
|
|
17
|
+
### PPTX (PowerPoint)
|
|
18
|
+
- Slide content as sections
|
|
19
|
+
- Speaker notes included
|
|
20
|
+
- Tables and charts (text content)
|
|
21
|
+
|
|
22
|
+
### HTML
|
|
23
|
+
- Semantic structure preserved
|
|
24
|
+
- Tables converted to markdown
|
|
25
|
+
- Links and formatting extracted
|
|
26
|
+
|
|
27
|
+
### CSV
|
|
28
|
+
- Converted to markdown table
|
|
29
|
+
- Headers auto-detected
|
|
30
|
+
|
|
31
|
+
### MD (Markdown)
|
|
32
|
+
- Passed through with minimal processing
|
|
33
|
+
- Metadata extracted from frontmatter if present
|
|
34
|
+
|
|
35
|
+
## Custom-Parsed Formats
|
|
36
|
+
|
|
37
|
+
### XLSX (Excel)
|
|
38
|
+
- Multiple sheets → separate sections
|
|
39
|
+
- Tables preserved with formatting
|
|
40
|
+
- Formulas shown as computed values
|
|
41
|
+
- Named ranges and cell references
|
|
42
|
+
|
|
43
|
+
### HWPX (Hancom Office)
|
|
44
|
+
- Korean document format (XML-based, ZIP archive)
|
|
45
|
+
- Structure: Contents/section*.xml
|
|
46
|
+
- Tables with complex merging patterns
|
|
47
|
+
- Korean text preserved with UTF-8 encoding
|
|
48
|
+
|
|
49
|
+
### JSON
|
|
50
|
+
- Formatted as structured markdown
|
|
51
|
+
- Nested objects → indented sections
|
|
52
|
+
- Arrays → lists or tables
|
|
53
|
+
|
|
54
|
+
### TXT
|
|
55
|
+
- Wrapped as markdown
|
|
56
|
+
- Auto-detect structure (lists, paragraphs)
|
|
57
|
+
|
|
58
|
+
## Image Formats (PNG, JPG, JPEG)
|
|
59
|
+
- OCR via Google Gemini Vision API
|
|
60
|
+
- Text extraction with layout preservation
|
|
61
|
+
- Table detection in scanned documents
|
|
62
|
+
- Handwriting recognition (best effort)
|
|
63
|
+
|
|
64
|
+
## Unsupported Formats
|
|
65
|
+
- HWP (legacy Hancom binary format — convert to HWPX first)
|
|
66
|
+
- Password-protected files
|
|
67
|
+
- DRM-protected documents
|