devlyn-cli 0.5.1 → 0.5.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37) hide show
  1. package/bin/devlyn.js +1 -0
  2. package/optional-skills/better-auth-setup/SKILL.md +222 -11
  3. package/optional-skills/better-auth-setup/references/proxy-gotchas.md +148 -0
  4. package/optional-skills/better-auth-setup/references/proxy-setup.md +284 -0
  5. package/optional-skills/dokkit/ANALYSIS.md +198 -0
  6. package/optional-skills/dokkit/COMMANDS.md +365 -0
  7. package/optional-skills/dokkit/DOCX-XML.md +76 -0
  8. package/optional-skills/dokkit/EXPORT.md +102 -0
  9. package/optional-skills/dokkit/FILLING.md +377 -0
  10. package/optional-skills/dokkit/HWPX-XML.md +73 -0
  11. package/optional-skills/dokkit/IMAGE-SOURCING.md +127 -0
  12. package/optional-skills/dokkit/INGESTION.md +65 -0
  13. package/optional-skills/dokkit/SKILL.md +153 -0
  14. package/optional-skills/dokkit/STATE.md +60 -0
  15. package/optional-skills/dokkit/references/docx-field-patterns.md +151 -0
  16. package/optional-skills/dokkit/references/docx-structure.md +58 -0
  17. package/optional-skills/dokkit/references/field-detection-patterns.md +130 -0
  18. package/optional-skills/dokkit/references/hwpx-field-patterns.md +461 -0
  19. package/optional-skills/dokkit/references/hwpx-structure.md +159 -0
  20. package/optional-skills/dokkit/references/image-opportunity-heuristics.md +121 -0
  21. package/optional-skills/dokkit/references/image-xml-patterns.md +338 -0
  22. package/optional-skills/dokkit/references/section-image-interleaving.md +346 -0
  23. package/optional-skills/dokkit/references/section-range-detection.md +118 -0
  24. package/optional-skills/dokkit/references/state-schema.md +143 -0
  25. package/optional-skills/dokkit/references/supported-formats.md +67 -0
  26. package/optional-skills/dokkit/scripts/compile_hwpx.py +134 -0
  27. package/optional-skills/dokkit/scripts/detect_fields.py +301 -0
  28. package/optional-skills/dokkit/scripts/detect_fields_hwpx.py +286 -0
  29. package/optional-skills/dokkit/scripts/export_pdf.py +99 -0
  30. package/optional-skills/dokkit/scripts/parse_hwpx.py +185 -0
  31. package/optional-skills/dokkit/scripts/parse_image_with_gemini.py +159 -0
  32. package/optional-skills/dokkit/scripts/parse_xlsx.py +98 -0
  33. package/optional-skills/dokkit/scripts/source_images.py +365 -0
  34. package/optional-skills/dokkit/scripts/validate_docx.py +142 -0
  35. package/optional-skills/dokkit/scripts/validate_hwpx.py +281 -0
  36. package/optional-skills/dokkit/scripts/validate_state.py +132 -0
  37. package/package.json +1 -1
@@ -0,0 +1,346 @@
1
+ # Section Content Image Interleaving
2
+
3
+ Algorithm and code patterns for inserting AI-generated images within `section_content` fields, interleaved between text paragraphs.
4
+
5
+ ## Overview
6
+
7
+ When a `section_content` field has `image_opportunities` in analysis.json, the filler must:
8
+ 1. Resolve each opportunity's anchor text to a specific paragraph in the XML
9
+ 2. Build an image paragraph (`<hp:p>` or `<w:p>`) containing the image element
10
+ 3. Insert the image paragraph **after** the anchor paragraph
11
+ 4. Register the image in the document manifest
12
+
13
+ ## Algorithm: `fill_section_content_with_images()`
14
+
15
+ ```python
16
def fill_section_content_with_images(
    section_root,         # The section XML root element
    content_elements,     # List of XML elements in the section_content range
    mapped_value,         # The text content (markdown string)
    image_opportunities,  # List of image opportunity dicts from analysis.json
    format_context,       # Dict with charPrIDs, header_path, etc.
    template_type,        # "hwpx" or "docx"
    work_dir,             # Path to template_work directory
):
    """Fill section content with text AND interleaved images.

    Strategy:
    1. Fill the section content with formatted text (existing
       fill_section_content() logic).
    2. For each sourced image opportunity, find the anchor paragraph and
       insert an image paragraph after it.

    Only opportunities with ``status == "sourced"`` and a non-empty
    ``image_file`` are processed; all others are left untouched. Each
    processed opportunity dict is updated in place: ``status`` becomes
    ``"inserted"`` when the image paragraph was actually placed in the tree,
    and ``"skipped"`` when the anchor could not be resolved or the insertion
    itself failed.

    Returns None.
    """
    # Step 1: create all text paragraphs (with markdown formatting) first, so
    # the anchor lookup below runs against the final text.
    fill_section_content(section_root, content_elements, mapped_value, format_context, template_type)

    # Step 2: insert images at anchor points. A missing/null list means
    # "no images" rather than an error.
    for opportunity in image_opportunities or []:
        if opportunity.get("status") != "sourced" or not opportunity.get("image_file"):
            continue

        opportunity_id = opportunity.get("opportunity_id", "<unknown>")
        anchor_text = (opportunity.get("insertion_point") or {}).get("anchor_text")
        image_file = opportunity["image_file"]
        dims = opportunity.get("dimensions") or {}  # tolerate an explicit null

        # Find the anchor paragraph. An absent or empty anchor is treated as
        # "not found": images are never placed at approximate positions.
        anchor_p = None
        if anchor_text:
            anchor_p = find_anchor_paragraph(section_root, content_elements, anchor_text, template_type)
        if anchor_p is None:
            print(f"WARNING: Anchor text not found for {opportunity_id}, skipping image")
            opportunity["status"] = "skipped"
            continue

        # Build image paragraph
        if template_type == "hwpx":
            img_p = build_hwpx_image_paragraph(
                image_file, dims, format_context, work_dir
            )
        else:
            img_p = build_docx_image_paragraph(
                image_file, dims, format_context, work_dir
            )

        # Insert after anchor paragraph. insert_after_element() reports
        # failure (anchor has no parent under section_root) by returning
        # False; only mark the opportunity "inserted" when it succeeded.
        if insert_after_element(section_root, anchor_p, img_p):
            opportunity["status"] = "inserted"
        else:
            print(f"WARNING: Could not insert image for {opportunity_id}, skipping image")
            opportunity["status"] = "skipped"
64
+ ```
65
+
66
+ ## Anchor Text Resolution
67
+
68
+ The anchor text is a distinctive Korean phrase taken from the paragraph **after** which the image should be inserted.
69
+
70
+ ```python
71
def find_anchor_paragraph(section_root, content_elements, anchor_text, template_type):
    """Find the paragraph element containing the anchor text.

    Search strategy (first hit wins, paragraphs visited in document order):
    1. Exact substring match in paragraph text
    2. Normalized match (runs of whitespace collapsed to one space)
    3. Partial match (first 20 chars of anchor)

    Args:
        section_root: Section XML root (not consulted by the search itself).
        content_elements: Elements of the section_content range; each element
            and all paragraphs nested inside it are searched.
        anchor_text: Phrase to look for. Empty or whitespace-only anchors
            never match, because an empty string is a substring of every
            paragraph and would select an arbitrary one.
        template_type: "hwpx" selects the Hancom paragraph namespace; any
            other value selects the WordprocessingML namespace.

    Returns the <hp:p> or <w:p> element, or None if not found.
    """
    import re

    if not anchor_text or not anchor_text.strip():
        return None

    if template_type == "hwpx":
        ns = "http://www.hancom.co.kr/hwpml/2011/paragraph"
    else:
        ns = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
    ns_t = f"{{{ns}}}t"
    ns_p = f"{{{ns}}}p"

    # Collect all paragraphs within the content range. iter() yields the
    # element itself first when it is a paragraph, then nested paragraphs.
    # Identity-based de-duplication keeps this linear.
    paragraphs = []
    seen = set()
    for elem in content_elements:
        for p in elem.iter(ns_p):
            if id(p) not in seen:
                seen.add(id(p))
                paragraphs.append(p)

    # Join each paragraph's text runs once; all three strategies reuse it.
    # The text of a paragraph includes the text of paragraphs nested in it.
    candidates = [
        (p, "".join(t.text or "" for t in p.iter(ns_t))) for p in paragraphs
    ]

    # Strategy 1: Exact substring match
    for p, text in candidates:
        if anchor_text in text:
            return p

    # Strategy 2: Normalized match (collapse whitespace)
    normalized_anchor = re.sub(r'\s+', ' ', anchor_text.strip())
    for p, text in candidates:
        if normalized_anchor in re.sub(r'\s+', ' ', text.strip()):
            return p

    # Strategy 3: Partial match (first 20 chars)
    partial = anchor_text[:20]
    if partial.strip():
        for p, text in candidates:
            if partial in text:
                return p

    return None  # Not found — caller should skip with warning
116
+ ```
117
+
118
+ ## Image Paragraph Construction
119
+
120
+ ### HWPX: `<hp:p>` with `<hp:pic>`
121
+
122
+ ```python
123
def find_center_parapr(header_path):
    """Find the first center-aligned paraPr in header.xml for image paragraphs.

    Args:
        header_path: Path to the template's header.xml.

    Returns:
        The ``id`` attribute (a string) of the first ``paraPr`` whose direct
        ``align`` child has ``horizontal="CENTER"``. A centered ``paraPr``
        without an ``id`` is skipped, since it cannot be referenced from
        ``paraPrIDRef``. Falls back to "0" when nothing usable is found.

    Parse errors and a missing file propagate to the caller.
    """
    HH = "http://www.hancom.co.kr/hwpml/2011/head"
    tree = ET.parse(header_path)
    for pp in tree.getroot().iter(f"{{{HH}}}paraPr"):
        align = pp.find(f"{{{HH}}}align")
        if align is None or align.get("horizontal") != "CENTER":
            continue
        parapr_id = pp.get("id")
        if parapr_id is not None:
            return parapr_id
    return "0"  # fallback to default
132
+
133
+
134
def build_hwpx_image_paragraph(image_file, dims, format_context, work_dir):
    """Return an <hp:p> element that displays one image inline.

    Resulting structure (the picture sits INSIDE the run and the empty text
    node follows it):

        <hp:p paraPrIDRef=... styleIDRef="0" pageBreak="0" columnBreak="0" merged="0">
          <hp:run charPrIDRef=...><hp:pic>...</hp:pic><hp:t/></hp:run>
        </hp:p>

    Side effects:
    - the image is registered through register_image_in_manifest()
    - format_context["next_seq_id"] and ["next_z_order"] are each advanced by 1
    - the center-aligned paraPr ID is looked up in Contents/header.xml on
      first use and cached in format_context["center_parapr_id"]
    """
    hp_ns = "http://www.hancom.co.kr/hwpml/2011/paragraph"

    # Requested display size, defaulting to 36000 x 24000 HWPML units.
    display_w = dims.get("width_hwpml", 36000)
    display_h = dims.get("height_hwpml", 24000)

    # Small legacy sizes are replaced: width falls back to 36000 and height
    # is derived from the (possibly replaced) width.
    if display_w <= 16000:
        display_w = 36000
    if display_h <= 12000:
        display_h = int(display_w * 0.667)

    # Copy the image into the package and obtain its manifest ID.
    manifest_id, bin_path = register_image_in_manifest(image_file, work_dir)

    # Reserve this picture's sequential ID and zOrder, then advance both.
    seq_id = format_context["next_seq_id"]
    z_order = format_context["next_z_order"]
    format_context["next_seq_id"] += 1
    format_context["next_z_order"] += 1

    # Center-aligned paragraph properties: resolve once, then reuse the cache.
    centered_id = format_context.get("center_parapr_id")
    if centered_id is None:
        header_xml = os.path.join(work_dir, "Contents", "header.xml")
        centered_id = find_center_parapr(header_xml)
        format_context["center_parapr_id"] = centered_id

    paragraph = ET.Element(
        f"{{{hp_ns}}}p",
        {
            "paraPrIDRef": str(centered_id),
            "styleIDRef": "0",
            "pageBreak": "0",
            "columnBreak": "0",
            "merged": "0",
        },
    )

    run = ET.SubElement(
        paragraph,
        f"{{{hp_ns}}}run",
        {"charPrIDRef": str(format_context.get("normal_charpr_id", "0"))},
    )

    # <hp:pic> first, inside the run ...
    run.append(
        build_hwpx_pic_element(
            manifest_id=manifest_id,
            image_path=bin_path,
            width_hwpml=display_w,
            height_hwpml=display_h,
            seq_id=seq_id,
            z_order=z_order,
        )
    )
    # ... then the empty <hp:t/> after it.
    ET.SubElement(run, f"{{{hp_ns}}}t")

    return paragraph
199
+ ```
200
+
201
+ ### DOCX: `<w:p>` with `<w:drawing>`
202
+
203
+ ```python
204
def build_docx_image_paragraph(image_file, dims, format_context, work_dir):
    """Return a centered <w:p> whose single run holds a <w:drawing> for the image.

    Side effects: the image is registered through register_image_in_docx()
    and format_context["next_pic_id"] is advanced by 1.
    """
    w_ns = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"

    def qualified(local_name):
        # Clark-notation tag/attribute name in the WordprocessingML namespace.
        return f"{{{w_ns}}}{local_name}"

    # Display size in EMU; defaults are 5400000 x 3600000.
    extent_cx = dims.get("width_emu", 5400000)
    extent_cy = dims.get("height_emu", 3600000)

    # Copy the image into the package and obtain its relationship ID.
    rel_id, media_path = register_image_in_docx(image_file, work_dir)
    pic_id = format_context["next_pic_id"]
    format_context["next_pic_id"] = pic_id + 1
    filename = os.path.basename(media_path)

    paragraph = ET.Element(qualified("p"))

    # <w:pPr><w:jc w:val="center"/></w:pPr> centers the image paragraph.
    properties = ET.SubElement(paragraph, qualified("pPr"))
    ET.SubElement(properties, qualified("jc"), {qualified("val"): "center"})

    # The run carries the drawing produced by build_drawing_element().
    run = ET.SubElement(paragraph, qualified("r"))
    run.append(build_drawing_element(rel_id, extent_cx, extent_cy, pic_id, filename))

    return paragraph
232
+ ```
233
+
234
+ ## Element Insertion
235
+
236
+ ```python
237
def insert_after_element(root, anchor_elem, new_elem):
    """Place new_elem as the sibling directly following anchor_elem.

    ElementTree elements carry no parent pointer, so a child -> parent map is
    built over the whole tree under ``root``; this makes nested anchors work
    as well as direct children of ``root``.

    Returns True after a successful insertion, or False when anchor_elem has
    no parent under ``root`` (for example when it is ``root`` itself or is not
    part of the tree).
    """
    owner_of = {}
    for node in root.iter():
        for kid in node:
            owner_of[kid] = node

    owner = owner_of.get(anchor_elem)
    if owner is None:
        return False

    position = list(owner).index(anchor_elem)
    owner.insert(position + 1, new_elem)
    return True
251
+ ```
252
+
253
+ ## Image Registration (same rules as cell-level images)
254
+
255
+ ### HWPX Registration
256
+ ```python
257
def register_image_in_manifest(image_file, work_dir):
    """Copy image to BinData/ and register in content.hpf manifest.

    Args:
        image_file: Path of the image to embed.
        work_dir: Root of the unpacked HWPX working copy.

    Returns (manifest_id, bin_path), where manifest_id is "imageN" and
    bin_path is the copied file "<work_dir>/BinData/imageN<ext>".

    N is one more than the highest number already used by an "imageN*" file
    in BinData/, so an existing image is never overwritten even when the
    numbering has gaps (e.g. image1 and image3 exist -> image4).

    Same rules as cell-level image registration:
    - Register in content.hpf ONLY
    - Do NOT add to header.xml binDataItems
    """
    import os
    import re
    import shutil

    bindata_dir = os.path.join(work_dir, "BinData")
    os.makedirs(bindata_dir, exist_ok=True)

    # Find next available image number from the numbers actually in use;
    # counting files would reuse a number whenever there is a gap.
    used_numbers = []
    for name in os.listdir(bindata_dir):
        match = re.match(r"image(\d+)", name)
        if match:
            used_numbers.append(int(match.group(1)))
    next_n = max(used_numbers, default=0) + 1

    ext = os.path.splitext(image_file)[1]
    filename = f"image{next_n}{ext}"
    bin_path = os.path.join(bindata_dir, filename)
    shutil.copy2(image_file, bin_path)

    # Register in <work_dir>/Contents/content.hpf:
    # Add <opf:item id="imageN" href="BinData/imageN.ext" media-type="image/png" isEmbeded="1"/>
    # (Filler agent performs the actual XML modification)
    # NOTE(review): media-type presumably has to match the real extension
    # (e.g. image/jpeg for .jpg) — confirm before registering non-PNG files.
    manifest_id = f"image{next_n}"

    return manifest_id, bin_path
285
+ ```
286
+
287
+ ### DOCX Registration
288
+ ```python
289
def register_image_in_docx(image_file, work_dir):
    """Copy image to word/media/ and register in relationships and Content_Types.

    Args:
        image_file: Path of the image to embed.
        work_dir: Root of the unpacked DOCX working copy.

    Returns (rel_id, media_path), where media_path is the copied file
    "<work_dir>/word/media/imageN<ext>" and rel_id is "rId<number>".

    N is one more than the highest number already used by an "imageN*" file
    in word/media/, so existing media is never overwritten. The relationship
    number is N plus the larger of 10 and the highest rId found in
    word/_rels/document.xml.rels, which keeps it above every ID already in
    that file and distinct across successive calls even before the filler
    has written the previous relationship.

    Same rules as cell-level image registration:
    - Add relationship in word/_rels/document.xml.rels
    - Add Content_Types entry if extension not registered
    """
    import os
    import re
    import shutil

    media_dir = os.path.join(work_dir, "word", "media")
    os.makedirs(media_dir, exist_ok=True)

    # Next free image number, based on the numbers actually in use.
    used_numbers = []
    for name in os.listdir(media_dir):
        match = re.match(r"image(\d+)", name)
        if match:
            used_numbers.append(int(match.group(1)))
    next_n = max(used_numbers, default=0) + 1

    ext = os.path.splitext(image_file)[1]
    filename = f"image{next_n}{ext}"
    media_path = os.path.join(media_dir, filename)
    shutil.copy2(image_file, media_path)

    # Generate next relationship ID. Start from the historical offset of 10
    # and raise it to the highest rId already declared, if the file is readable.
    rels_path = os.path.join(work_dir, "word", "_rels", "document.xml.rels")
    try:
        with open(rels_path, encoding="utf-8") as fh:
            rels_xml = fh.read()
    except (OSError, UnicodeError):
        rels_xml = ""  # no readable .rels file: keep the offset-only scheme
    highest_rid = 10
    for match in re.finditer(r'\bId\s*=\s*["\']rId(\d+)["\']', rels_xml):
        highest_rid = max(highest_rid, int(match.group(1)))
    rel_id = f"rId{highest_rid + next_n}"
    # (Filler agent adds the actual relationship XML and Content_Types entry)

    return rel_id, media_path
314
+ ```
315
+
316
+ ## Edge Cases
317
+
318
+ ### Anchor text not found
319
+ - **Action**: Skip the image opportunity, set `status: "skipped"`, log a warning
320
+ - **Cause**: Text may have been modified during markdown rendering, or anchor was not distinctive enough
321
+ - **Mitigation**: Analyzer should choose anchor text that is unique within the field
322
+
323
+ ### Image generation failure
324
+ - **Action**: The image_file will be null and status remains "pending" (fill-doc sets to "skipped" on failure)
325
+ - **Filler behavior**: Skip opportunities where `status != "sourced"`, proceed with text-only fill
326
+ - **No fallback**: Do not insert placeholder images or broken references
327
+
328
+ ### Multiple images in same paragraph area
329
+ - **Rule**: Min 150 chars between image opportunities (enforced by analyzer)
330
+ - **If violated**: Insert images in order; later images shift down naturally
331
+
332
+ ### Content too short after formatting
333
+ - **Rule**: If the filled section has fewer paragraphs than expected (e.g., due to markdown rendering differences), skip any opportunities whose anchor text cannot be found
334
+ - **Never force**: Do not insert images at approximate positions if anchor resolution fails
335
+
336
+ ## Dimension Defaults for Section Content Images
337
+
338
+ HWPML units are 1/7200 inch (NOT hundredths of mm). ~77% of A4 text width = 36,000 units.
339
+
340
+ | content_type | HWPML width | HWPML height | Approx mm | EMU cx | EMU cy |
341
+ |---|---|---|---|---|---|
342
+ | diagram | 36,000 | 24,000 | 127x85 | 4,572,000 | 3,048,000 |
343
+ | flowchart | 36,000 | 24,000 | 127x85 | 4,572,000 | 3,048,000 |
344
+ | data | 36,000 | 20,000 | 127x71 | 4,572,000 | 2,540,000 |
345
+ | concept | 28,000 | 28,000 | 99x99 | 3,556,000 | 3,556,000 |
346
+ | infographic | 36,000 | 24,000 | 127x85 | 4,572,000 | 3,048,000 |
@@ -0,0 +1,118 @@
1
+ # Section Content Range Detection (HWPX)
2
+
3
+ ## Problem
4
+
5
+ When `analysis.json` records `element_path: "section/children[N:M]"` for `section_content` fields, those indices refer to the **pre-tip-removal** state of the document. Tip box removal (Phase 2) deletes standalone tip paragraphs from the section root, shifting all subsequent child indices.
6
+
7
+ Using stale indices causes:
8
+ 1. **Section titles get destroyed** — the range overlaps title elements
9
+ 2. **Images don't show** — corrupted document structure
10
+ 3. **Out-of-bounds errors** — indices exceed actual child count
11
+
12
+ ## Solution: Dynamic Range Detection
13
+
14
+ After tip removal, **recompute** section content ranges by scanning the section root's children for structural markers (section title elements). This produces correct post-removal indices.
15
+
16
+ ### Algorithm
17
+
18
+ ```python
19
def find_section_content_ranges(root, hp_ns):
    """Locate each section_content range by finding its bounding markers.

    Must run AFTER tip box removal so the child indices are stable.

    Every direct child of ``root`` is reduced to the concatenated text of its
    ``{hp_ns}t`` descendants and tested against the marker rules below, in
    order; the first rule that matches labels that child. If several children
    match the same label, the last one wins.

    Returns a dict mapping field IDs to (start, end) inclusive child index
    ranges: start is the index after the title marker, end is the index
    before the closing marker. A field is present only when both of its
    markers were found.
    """
    text_tag = f'{{{hp_ns}}}t'

    # (label, substrings that must ALL appear, substrings of which ANY must
    # appear — an empty tuple means no such requirement).
    marker_rules = (
        # Section title markers: number prefix + key Korean terms.
        ('sec1_title', ('1.', '문제'), ('Problem', '필요성')),
        ('sec2_title', ('2.', '실현'), ('Solution', '개발')),
        ('sec3_title', ('3.', '성장'), ('Scale', '사업화')),
        ('sec4_title', ('4.', '팀'), ('Team', '대표자')),
        # End markers: tables/sections that follow the content.
        ('schedule1', ('사업추진', '일정', '협약기간'), ()),
        ('schedule2', ('사업추진', '일정', '전체'), ()),
        ('team_table', ('팀 구성', '구분', '직위'), ()),
    )

    markers = {}  # label -> child index
    for index, child in enumerate(list(root)):
        text = ''.join(node.text or '' for node in child.iter(text_tag)).strip()
        for label, required, alternatives in marker_rules:
            if not all(part in text for part in required):
                continue
            if alternatives and not any(part in text for part in alternatives):
                continue
            markers[label] = index
            break  # first matching rule wins

    # (field ID, marker that precedes the content, marker that follows it)
    field_bounds = (
        ('field_028', 'sec1_title', 'sec2_title'),
        ('field_029', 'sec2_title', 'schedule1'),
        ('field_046', 'sec3_title', 'schedule2'),
        ('field_051', 'sec4_title', 'team_table'),
    )

    ranges = {}
    for field_id, start_marker, end_marker in field_bounds:
        if start_marker in markers and end_marker in markers:
            ranges[field_id] = (markers[start_marker] + 1, markers[end_marker] - 1)

    return ranges
67
+ ```
68
+
69
+ ### Integration into fill_template.py
70
+
71
+ The filler agent MUST include this logic when generating `fill_template.py` for HWPX templates that have `section_content` fields:
72
+
73
+ ```python
74
# Phase 2b: After tip removal, before filling
# Override stale analysis.json ranges with dynamically-detected correct ranges.
# find_section_content_ranges() takes the paragraph namespace as its second
# argument; omitting it raises TypeError.
HP_NS = "http://www.hancom.co.kr/hwpml/2011/paragraph"
dynamic_ranges = find_section_content_ranges(root, HP_NS)
for fid, dyn_range in dynamic_ranges.items():
    # Only fields already known to the filler are overridden.
    if fid in field_refs:
        field_refs[fid] = dyn_range
80
+ ```
81
+
82
+ ### Adapting for Different Templates
83
+
84
+ The marker detection patterns above are specific to the 예비창업패키지 사업계획서 template. For other templates:
85
+
86
+ 1. **Identify structural markers** — section title elements that bound each `section_content` field
87
+ 2. **Match by text content** — use keywords from the section titles that appear in the template
88
+ 3. **Map field IDs** — connect each `section_content` field's ID from analysis.json to the correct marker pair
89
+
90
+ The general pattern is always:
91
+ - `content_start = title_marker_index + 1`
92
+ - `content_end = next_structural_element_index - 1`
93
+
94
+ ### Why Not Fix analysis.json Instead?
95
+
96
+ The analyzer runs BEFORE tip removal, so it can only record pre-removal indices. The correct approach is:
97
+ 1. Analyzer records approximate ranges (useful for documentation)
98
+ 2. Filler dynamically recomputes exact ranges after cleanup
99
+
100
+ ### imgDim: Use Actual Image Dimensions
101
+
102
+ When inserting images via `hp:pic`, the `hp:imgDim` element should use the **actual pixel dimensions** of the image file, not the display size. Use PIL/Pillow:
103
+
104
+ ```python
105
from PIL import Image

# Display size — used for hp:sz, hp:picRect and hp:imgRect.
img_w = SECTION_IMG_WIDTH   # e.g. 29400
img_h = SECTION_IMG_HEIGHT  # e.g. 16538

# hp:imgDim wants the file's real pixel size. Start from the display size so
# there is always a value, then overwrite it if the image can be opened.
dim_w = img_w
dim_h = img_h
try:
    with Image.open(image_path) as pil_img:
        pixel_w, pixel_h = pil_img.size
        dim_w, dim_h = pixel_w, pixel_h
except Exception:
    # Best effort: an unreadable image keeps the display dimensions.
    pass
118
+ ```
@@ -0,0 +1,143 @@
1
+ # Dokkit State Schema
2
+
3
+ ## state.json
4
+
5
+ ```json
6
+ {
7
+ "version": "1.0",
8
+ "created": "2026-02-07T12:00:00Z",
9
+ "updated": "2026-02-07T12:30:00Z",
10
+
11
+ "sources": [
12
+ {
13
+ "id": "src_001",
14
+ "file_path": "docs/sample_source/resume.pdf",
15
+ "file_type": "pdf",
16
+ "display_name": "resume.pdf",
17
+ "content_path": ".dokkit/sources/resume.md",
18
+ "metadata_path": ".dokkit/sources/resume.json",
19
+ "summary": "Personal resume with education, work history, and skills",
20
+ "status": "ready",
21
+ "ingested_at": "2026-02-07T12:05:00Z"
22
+ }
23
+ ],
24
+
25
+ "template": {
26
+ "file_path": "docs/sample_template/template.docx",
27
+ "file_type": "docx",
28
+ "display_name": "template.docx",
29
+ "work_dir": ".dokkit/template_work/",
30
+ "set_at": "2026-02-07T12:15:00Z"
31
+ },
32
+
33
+ "analysis": {
34
+ "path": ".dokkit/analysis.json",
35
+ "total_fields": 22,
36
+ "mapped": 18,
37
+ "unmapped": 4,
38
+ "analyzed_at": "2026-02-07T12:16:00Z",
39
+ "image_fields": 2,
40
+ "image_fields_sourced": 1,
41
+ "image_fields_pending": 1
42
+ },
43
+
44
+ "filled_document": {
45
+ "status": "review",
46
+ "filled_at": "2026-02-07T12:20:00Z",
47
+ "modifications": [
48
+ {
49
+ "instruction": "Change phone to 010-1234-5678",
50
+ "fields_affected": ["field_005"],
51
+ "modified_at": "2026-02-07T12:25:00Z"
52
+ }
53
+ ]
54
+ },
55
+
56
+ "exports": [
57
+ {
58
+ "format": "docx",
59
+ "output_path": ".dokkit/output/filled_template.docx",
60
+ "exported_at": "2026-02-07T12:30:00Z",
61
+ "file_size": 45678
62
+ }
63
+ ]
64
+ }
65
+ ```
66
+
67
+ ## Field Definitions
68
+
69
+ ### Root
70
+ | Field | Type | Required | Description |
71
+ |-------|------|----------|-------------|
72
+ | version | string | yes | Schema version ("1.0") |
73
+ | created | string | yes | ISO 8601 timestamp of workspace creation |
74
+ | updated | string | no | ISO 8601 timestamp of last update |
75
+ | sources | array | yes | List of ingested source documents |
76
+ | template | object\|null | yes | Current template being filled |
77
+ | analysis | object\|null | yes | Template analysis metadata |
78
+ | filled_document | object\|null | yes | Filled document status |
79
+ | exports | array | yes | List of exports performed |
80
+
81
+ ### Source Entry
82
+ | Field | Type | Required | Description |
83
+ |-------|------|----------|-------------|
84
+ | id | string | yes | Unique identifier (src_NNN) |
85
+ | file_path | string | yes | Original file location |
86
+ | file_type | string | yes | Detected format |
87
+ | display_name | string | yes | Human-readable name |
88
+ | content_path | string | yes | Path to .md content file |
89
+ | metadata_path | string | yes | Path to .json sidecar |
90
+ | summary | string | yes | Brief content summary |
91
+ | status | string | yes | "processing" \| "ready" \| "error" |
92
+ | ingested_at | string | yes | ISO 8601 timestamp |
93
+ | error_message | string | no | Error details (when status=error) |
94
+
95
+ ### Template
96
+ | Field | Type | Required | Description |
97
+ |-------|------|----------|-------------|
98
+ | file_path | string | yes | Original template location |
99
+ | file_type | string | yes | "docx" \| "hwpx" |
100
+ | display_name | string | yes | Human-readable name |
101
+ | work_dir | string | yes | Path to unpacked working copy |
102
+ | set_at | string | yes | ISO 8601 timestamp |
103
+
104
+ ### Analysis
105
+ | Field | Type | Required | Description |
106
+ |-------|------|----------|-------------|
107
+ | path | string | yes | Path to analysis.json |
108
+ | total_fields | integer | yes | Total fields detected |
109
+ | mapped | integer | yes | Fields with source mappings |
110
+ | unmapped | integer | yes | Fields without mappings |
111
+ | image_fields | integer | no | Total image fields detected |
112
+ | image_fields_sourced | integer | no | Image fields with source images |
113
+ | image_fields_pending | integer | no | Image fields awaiting images |
114
+ | analyzed_at | string | yes | ISO 8601 timestamp |
115
+
116
+ ### Filled Document
117
+ | Field | Type | Required | Description |
118
+ |-------|------|----------|-------------|
119
+ | status | string | yes | "filling" \| "review" \| "modified" \| "finalized" |
120
+ | filled_at | string | yes | ISO 8601 timestamp |
121
+ | modifications | array | no | List of modification records |
122
+
123
+ ### Export Entry
124
+ | Field | Type | Required | Description |
125
+ |-------|------|----------|-------------|
126
+ | format | string | yes | "docx" \| "hwpx" \| "pdf" |
127
+ | output_path | string | yes | Path to exported file |
128
+ | exported_at | string | yes | ISO 8601 timestamp |
129
+ | file_size | integer | no | File size in bytes |
130
+ | warnings | array | no | Conversion warnings |
131
+
132
+ ## Valid Status Values
133
+
134
+ ### Source Status
135
+ - `processing` — currently being parsed
136
+ - `ready` — successfully ingested
137
+ - `error` — parsing failed
138
+
139
+ ### Filled Document Status
140
+ - `filling` — fields being mapped and filled
141
+ - `review` — filling complete, awaiting review
142
+ - `modified` — user requested changes
143
+ - `finalized` — user approved, ready for export
@@ -0,0 +1,67 @@
1
+ # Supported Source Formats
2
+
3
+ ## Docling-Supported Formats
4
+
5
+ ### PDF
6
+ - Text PDFs: direct text extraction
7
+ - Scanned PDFs: OCR via Docling's built-in OCR
8
+ - Mixed PDFs: handles both text and image regions
9
+ - Tables: extracted as markdown tables
10
+
11
+ ### DOCX (Microsoft Word)
12
+ - Paragraphs, headings, lists
13
+ - Tables with merged cells
14
+ - Embedded images (extracted as descriptions)
15
+ - Headers/footers
16
+
17
+ ### PPTX (PowerPoint)
18
+ - Slide content as sections
19
+ - Speaker notes included
20
+ - Tables and charts (text content)
21
+
22
+ ### HTML
23
+ - Semantic structure preserved
24
+ - Tables converted to markdown
25
+ - Links and formatting extracted
26
+
27
+ ### CSV
28
+ - Converted to markdown table
29
+ - Headers auto-detected
30
+
31
+ ### MD (Markdown)
32
+ - Passed through with minimal processing
33
+ - Metadata extracted from frontmatter if present
34
+
35
+ ## Custom-Parsed Formats
36
+
37
+ ### XLSX (Excel)
38
+ - Multiple sheets → separate sections
39
+ - Tables preserved with formatting
40
+ - Formulas shown as computed values
41
+ - Named ranges and cell references
42
+
43
+ ### HWPX (Hancom Office)
44
+ - Korean document format (XML-based, ZIP archive)
45
+ - Structure: Contents/section*.xml
46
+ - Tables with complex merging patterns
47
+ - Korean text preserved with UTF-8 encoding
48
+
49
+ ### JSON
50
+ - Formatted as structured markdown
51
+ - Nested objects → indented sections
52
+ - Arrays → lists or tables
53
+
54
+ ### TXT
55
+ - Wrapped as markdown
56
+ - Auto-detect structure (lists, paragraphs)
57
+
58
+ ## Image Formats (PNG, JPG, JPEG)
59
+ - OCR via Google Gemini Vision API
60
+ - Text extraction with layout preservation
61
+ - Table detection in scanned documents
62
+ - Handwriting recognition (best effort)
63
+
64
+ ## Unsupported Formats
65
+ - HWP (legacy Hancom binary format — convert to HWPX first)
66
+ - Password-protected files
67
+ - DRM-protected documents