@simplysm/sd-claude 14.0.75 → 14.0.77

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66) hide show
  1. package/claude/output-styles/sd-tone.md +128 -0
  2. package/claude/references/sd-simplysm14/apis/angular/README.md +28 -89
  3. package/claude/references/sd-simplysm14/apis/angular/app-structure.md +75 -32
  4. package/claude/references/sd-simplysm14/apis/angular/buttons.md +65 -29
  5. package/claude/references/sd-simplysm14/apis/angular/crud.md +86 -21
  6. package/claude/references/sd-simplysm14/apis/angular/forms.md +168 -42
  7. package/claude/references/sd-simplysm14/apis/angular/infrastructure.md +200 -49
  8. package/claude/references/sd-simplysm14/apis/angular/kanban.md +64 -20
  9. package/claude/references/sd-simplysm14/apis/angular/layout.md +75 -30
  10. package/claude/references/sd-simplysm14/apis/angular/modal.md +92 -40
  11. package/claude/references/sd-simplysm14/apis/angular/routing.md +86 -25
  12. package/claude/references/sd-simplysm14/apis/angular/selection-managers.md +72 -41
  13. package/claude/references/sd-simplysm14/apis/angular/shared-data.md +113 -21
  14. package/claude/references/sd-simplysm14/apis/angular/sheet.md +108 -33
  15. package/claude/references/sd-simplysm14/apis/angular/toast.md +81 -30
  16. package/claude/references/sd-simplysm14/apis/angular/visual.md +140 -32
  17. package/claude/references/sd-simplysm14/apis/capacitor-plugin-auto-update/README.md +46 -43
  18. package/claude/references/sd-simplysm14/apis/capacitor-plugin-intent/README.md +59 -48
  19. package/claude/references/sd-simplysm14/apis/capacitor-plugin-usb-storage/README.md +17 -7
  20. package/claude/references/sd-simplysm14/apis/core-common/README.md +43 -116
  21. package/claude/references/sd-simplysm14/apis/core-common/extensions.md +74 -109
  22. package/claude/references/sd-simplysm14/apis/core-common/features.md +40 -35
  23. package/claude/references/sd-simplysm14/apis/core-common/types.md +80 -106
  24. package/claude/references/sd-simplysm14/apis/core-common/utils.md +142 -111
  25. package/claude/references/sd-simplysm14/apis/core-node/README.md +7 -16
  26. package/claude/references/sd-simplysm14/apis/core-node/consola.md +33 -38
  27. package/claude/references/sd-simplysm14/apis/core-node/cpx.md +25 -33
  28. package/claude/references/sd-simplysm14/apis/core-node/fs-watcher.md +27 -38
  29. package/claude/references/sd-simplysm14/apis/core-node/fsx.md +32 -60
  30. package/claude/references/sd-simplysm14/apis/core-node/pathx.md +14 -45
  31. package/claude/references/sd-simplysm14/apis/core-node/worker.md +35 -81
  32. package/claude/references/sd-simplysm14/apis/excel/README.md +178 -80
  33. package/claude/references/sd-simplysm14/apis/lint/README.md +5 -0
  34. package/claude/references/sd-simplysm14/apis/orm-node/README.md +1 -1
  35. package/claude/references/sd-simplysm14/apis/sd-claude/README.md +28 -5
  36. package/claude/references/sd-simplysm14/apis/sd-cli/README.md +1 -1
  37. package/claude/references/sd-simplysm14/apis/service-client/README.md +57 -50
  38. package/claude/references/sd-simplysm14/apis/service-server/README.md +8 -15
  39. package/claude/references/sd-simplysm14/apis/service-server/auth.md +24 -16
  40. package/claude/references/sd-simplysm14/apis/service-server/builtin-services.md +55 -31
  41. package/claude/references/sd-simplysm14/apis/service-server/define-service.md +28 -44
  42. package/claude/references/sd-simplysm14/apis/service-server/internals.md +59 -18
  43. package/claude/references/sd-simplysm14/apis/service-server/server.md +37 -46
  44. package/claude/references/sd-simplysm14/manuals/client-component.md +3 -1
  45. package/claude/references/sd-simplysm14/manuals/logging.md +9 -8
  46. package/claude/rules/sd-base-rules.md +380 -217
  47. package/claude/settings.json +1 -0
  48. package/claude/skills/sd-commit/SKILL.md +31 -8
  49. package/claude/skills/sd-docs/SKILL.md +15 -10
  50. package/claude/skills/sd-docs/references/subagent-prompt.md +26 -8
  51. package/claude/skills/sd-impl/SKILL.md +1 -1
  52. package/claude/skills/sd-skill/references/skill-authoring.md +1 -1
  53. package/claude/skills/sd-spec/SKILL.md +22 -13
  54. package/claude/skills/sd-spec/references/spec-authoring.md +1 -1
  55. package/claude/skills/sd-unpack/SKILL.md +150 -26
  56. package/claude/skills/sd-unpack/scripts/handlers/__pycache__/_common.cpython-314.pyc +0 -0
  57. package/claude/skills/sd-unpack/scripts/handlers/__pycache__/eml_handler.cpython-314.pyc +0 -0
  58. package/claude/skills/sd-unpack/scripts/handlers/__pycache__/office_com.cpython-314.pyc +0 -0
  59. package/claude/skills/sd-unpack/scripts/handlers/__pycache__/pdf_handler.cpython-314.pyc +0 -0
  60. package/claude/skills/sd-unpack/scripts/handlers/_common.py +17 -2
  61. package/claude/skills/sd-unpack/scripts/handlers/eml_handler.py +100 -24
  62. package/claude/skills/sd-unpack/scripts/handlers/msg_handler.py +140 -27
  63. package/claude/skills/sd-unpack/scripts/handlers/office_com.py +698 -107
  64. package/claude/skills/sd-unpack/scripts/handlers/office_worker.py +34 -26
  65. package/claude/skills/sd-unpack/scripts/handlers/pdf_handler.py +130 -8
  66. package/package.json +1 -1
@@ -1,22 +1,26 @@
1
1
  """Office (docx/pptx/xlsx) + 레거시 (doc/ppt/xls/xlsb) COM 핸들러.
2
2
 
3
- 시각 산출물은 PNG, 텍스트 산출물은 MD 로 분리:
3
+ 시각 산출물은 PNG, 텍스트/구조 산출물은 형식별로:
4
4
  - docx → pages/<NNN>.png + pages/<NNN>.md (페이지별)
5
5
  - pptx → slides/<NN>_<title>.png + .md + .notes.md (슬라이드별)
6
- - xlsx → sheets/<NN>_<name>.png + .md + .formulas.json (시트별)
6
+ - xlsx → sheets/<NN>_<name>.png + .jsonl (시트별) + workbook.meta.json
7
+
8
+ xlsx jsonl 한 줄 = 한 행. 좌표는 행번호(`r`)·열문자 키로 명시. 값·수식·시트 메타 통합.
7
9
 
8
10
  Office COM 호출은 office_worker.py subprocess 로 격리 (cleanup race 회피).
9
- 이 모듈 (office_com.py) 은 호출자 + Office 외 작업 (.md, ZIP strip, 매크로 추출, README 생성).
11
+ 이 모듈 (office_com.py) 은 호출자 + Office 외 작업 (jsonl 직렬화, ZIP strip, 매크로 추출, README 생성).
10
12
  원칙: 처리 실패는 묻지 않고 그대로 throw. try/finally 는 락/임시 폴더 cleanup 에만 사용.
11
13
  """
12
14
  from __future__ import annotations
13
15
 
14
16
  import json
15
17
  import os
18
+ import re
16
19
  import sys
17
20
  import zipfile
21
+ from datetime import date, datetime, time
18
22
  from pathlib import Path
19
- from typing import Optional
23
+ from typing import Any, Optional
20
24
 
21
25
  from . import _common
22
26
  from .dispatch import maybe_recurse_attachment
@@ -39,20 +43,25 @@ def run(input_path: Path, out_dir: Path) -> None:
39
43
 
40
44
 
41
45
  def run_legacy(input_path: Path, out_dir: Path) -> None:
46
+ """레거시 (.doc/.ppt/.xls/.xlsb) → 신형 변환 후 처리.
47
+
48
+ `_converted.<ext>` 는 임시 폴더에서만 처리하고 산출 폴더(out_dir)에는 잔존시키지 않음.
49
+ """
42
50
  ext = input_path.suffix.lower()
43
51
  target_ext_map = {".doc": ".docx", ".ppt": ".pptx", ".xls": ".xlsx", ".xlsb": ".xlsx"}
44
52
  target_ext = target_ext_map[ext]
45
53
 
46
- converted_in_out = out_dir / f"_converted{target_ext}"
47
- _convert_legacy(input_path, converted_in_out)
48
-
49
54
  tool_extra = f"(레거시 {ext} → {target_ext} 변환 후 처리)"
50
- if target_ext == ".docx":
51
- _run_docx(converted_in_out, out_dir, source_name_override=input_path.name, tool_extra=tool_extra)
52
- elif target_ext == ".pptx":
53
- _run_pptx(converted_in_out, out_dir, source_name_override=input_path.name, tool_extra=tool_extra)
54
- elif target_ext == ".xlsx":
55
- _run_xlsx(converted_in_out, out_dir, source_name_override=input_path.name, tool_extra=tool_extra)
55
+ with _common.temp_workdir() as tmp:
56
+ converted_path = tmp / f"_converted{target_ext}"
57
+ _convert_legacy(input_path, converted_path)
58
+
59
+ if target_ext == ".docx":
60
+ _run_docx(converted_path, out_dir, source_name_override=input_path.name, tool_extra=tool_extra)
61
+ elif target_ext == ".pptx":
62
+ _run_pptx(converted_path, out_dir, source_name_override=input_path.name, tool_extra=tool_extra)
63
+ elif target_ext == ".xlsx":
64
+ _run_xlsx(converted_path, out_dir, source_name_override=input_path.name, tool_extra=tool_extra)
56
65
 
57
66
 
58
67
  # ====================================================================
@@ -66,15 +75,30 @@ def _run_docx(
66
75
  source_name_override: Optional[str] = None,
67
76
  tool_extra: str = "",
68
77
  ) -> None:
78
+ """python-docx 로 구조 추출 → content.jsonl 단일 시퀀스. 페이지 단위 폐기.
79
+
80
+ PNG 는 fitz PDF 경유로 시각 검증용 유지. pages.meta.json 으로 페이지↔노드 best-effort 매핑.
81
+ """
82
+ _common.ensure_pip("docx", "python-docx")
83
+
69
84
  pages_dir = out_dir / "pages"
70
85
  images_dir = out_dir / "images"
71
86
 
72
- # COM Word 임시 PDF → PyMuPDF 로 페이지별 PNG + MD.
87
+ # 1. python-docx 구조 추출
88
+ nodes, counts = _docx_extract_nodes(input_path)
89
+
90
+ # content.jsonl
91
+ lines: list[str] = [json.dumps({"_meta": counts}, ensure_ascii=False)]
92
+ for n in nodes:
93
+ lines.append(json.dumps(n, ensure_ascii=False, default=_json_default))
94
+ _common.write_text(out_dir / "content.jsonl", "\n".join(lines))
95
+
96
+ # 2. fitz PDF 경유 PNG + pages.meta.json (페이지↔노드 매핑 best-effort)
73
97
  with _common.com_lock(), _common.temp_workdir() as tmp:
74
98
  tmp_pdf = tmp / "out.pdf"
75
99
  _word_export_pdf(input_path, tmp_pdf)
76
100
  _common.mkdir(pages_dir)
77
- page_summaries = _render_pdf_pages(tmp_pdf, pages_dir)
101
+ page_count = _docx_pages_from_pdf(tmp_pdf, pages_dir, out_dir, nodes)
78
102
 
79
103
  attachment_links = _extract_zip_media(
80
104
  input_path,
@@ -88,8 +112,15 @@ def _run_docx(
88
112
  macro_modules = _extract_macros(_source_path(out_dir, source_name), out_dir)
89
113
 
90
114
  sections: dict[str, list[str]] = {}
91
- if page_summaries:
92
- sections[f"페이지 (총 {len(page_summaries)}개)"] = page_summaries
115
+ summary = (
116
+ f"노드 {counts['nodes']}개 "
117
+ f"(heading {counts['headings']}·para {counts['paragraphs_plain']}·"
118
+ f"bullet {counts['bullets']}·table_cell {counts['table_cells']}·image {counts['images']})"
119
+ )
120
+ content_items = [f"`content.jsonl` — {summary}"]
121
+ if page_count:
122
+ content_items.append(f"`pages.meta.json` — PNG ↔ 노드 매핑 ({page_count}페이지)")
123
+ sections["콘텐츠"] = content_items
93
124
  if macro_modules:
94
125
  sections[f"VBA 매크로 (총 {len(macro_modules)}개)"] = [f"`macros/{m}`" for m in macro_modules]
95
126
 
@@ -97,13 +128,261 @@ def _run_docx(
97
128
  out_dir,
98
129
  source_name=source_name,
99
130
  source_size=source_size,
100
- tool=("COM Word + PyMuPDF + ZIP " + tool_extra).strip(),
101
- loss_notes="서식(폰트/색/볼드)·정확한 페이지 레이아웃은 PNG 안에서만 보존. 매크로(VBA)는 macros/ 로 별도 추출.",
131
+ tool=("python-docx + COM Word + PyMuPDF + ZIP " + tool_extra).strip(),
132
+ loss_notes=(
133
+ "서식(폰트/색/볼드)·정확한 페이지 레이아웃은 PNG 안에서만 보존. "
134
+ "구조는 content.jsonl 단일 시퀀스(heading/para/bullet/table_cell/image), "
135
+ "PNG↔노드 매핑은 pages.meta.json. 매크로(VBA)는 macros/ 로 별도 추출."
136
+ ),
102
137
  sections=sections or None,
103
138
  attachments=attachment_links,
104
139
  )
105
140
 
106
141
 
142
def _docx_extract_nodes(input_path: Path) -> tuple[list[dict], dict[str, int]]:
    """Walk the document body in order and flatten it into jsonl-ready nodes.

    Paragraphs become ``heading`` / ``bullet`` / ``para`` nodes (empty
    paragraphs are kept with ``text=""``), inline pictures become ``image``
    nodes placed right after their paragraph, and every table is flattened
    into ``table_cell`` nodes (1-based ``row`` / ``col``).

    Args:
        input_path: Path of the .docx file to read.

    Returns:
        ``(nodes, counts)`` where ``nodes`` is the ordered node list and
        ``counts`` holds per-type totals plus the overall ``nodes`` total.
    """
    from docx import Document
    from docx.oxml.ns import qn
    from docx.table import Table
    from docx.text.paragraph import Paragraph

    IMG_RELTYPE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/image"
    doc = Document(_common.long_str(input_path))

    # Image relationships: rId -> 'images/<basename>'.
    img_rels: dict[str, str] = {}
    for rid, rel in doc.part.rels.items():
        if rel.reltype == IMG_RELTYPE:
            try:
                basename = Path(rel.target_ref).name
                img_rels[rid] = f"images/{basename}"
            except Exception:
                # Best-effort: a relationship whose target cannot be resolved is skipped.
                continue

    nodes: list[dict] = []
    counts = {
        "nodes": 0,
        "headings": 0,
        "paragraphs_plain": 0,
        "bullets": 0,
        "tables": 0,
        "table_cells": 0,
        "images": 0,
    }
    table_idx = 0
    node_idx = 0
    paragraph_tag = qn("w:p")
    table_tag = qn("w:tbl")

    for elem in doc.element.body.iterchildren():
        tag = elem.tag
        if tag == paragraph_tag:
            para = Paragraph(elem, doc)
            text = para.text or ""
            style_name = para.style.name if para.style else ""
            heading_level = _docx_heading_level(style_name)
            bullet_level = _docx_bullet_level(para)
            image_rids = _docx_inline_image_rids(para)
            hyperlinks = _docx_paragraph_hyperlinks(para, doc)

            node: dict
            if heading_level is not None:
                node = {"node": node_idx, "type": "heading", "level": heading_level, "text": text}
                counts["headings"] += 1
            elif bullet_level is not None:
                node = {"node": node_idx, "type": "bullet", "level": bullet_level, "text": text}
                counts["bullets"] += 1
            else:
                # Empty paragraphs are source information too: keep them as nodes (text="").
                node = {"node": node_idx, "type": "para", "text": text}
                counts["paragraphs_plain"] += 1

            if hyperlinks:
                node["hyperlinks"] = hyperlinks

            nodes.append(node)
            node_idx += 1

            for rid in image_rids:
                ref = img_rels.get(rid)
                if ref:
                    nodes.append({"node": node_idx, "type": "image", "ref": ref})
                    counts["images"] += 1
                    node_idx += 1

        elif tag == table_tag:
            table_obj = Table(elem, doc)
            table_idx += 1
            counts["tables"] += 1
            # Map id -> element and KEEP the element referenced. A bare set of
            # id() values is unsafe: once a cell's element proxy is garbage
            # collected its id can be reused by a different cell in a later
            # row, which would then be skipped as a false duplicate.
            seen_tc: dict[int, object] = {}
            for r, row in enumerate(table_obj.rows, start=1):
                for c, cell in enumerate(row.cells, start=1):
                    tc = cell._tc
                    if id(tc) in seen_tc:
                        # Same cell exposed again by a merge (gridSpan); the
                        # origin node already carries the colspan.
                        continue
                    seen_tc[id(tc)] = tc
                    vm = _docx_cell_vmerge(cell)
                    if vm == "continue":
                        # vMerge continuation: belongs to the origin cell's rowspan area.
                        continue
                    cell_text = (cell.text or "").strip()
                    colspan = _docx_cell_colspan(cell)
                    cell_node = {
                        "node": node_idx,
                        "type": "table_cell",
                        "table_idx": table_idx,
                        "row": r,
                        "col": c,
                        "text": cell_text,
                    }
                    if colspan > 1:
                        cell_node["colspan"] = colspan
                    nodes.append(cell_node)
                    counts["table_cells"] += 1
                    node_idx += 1

    counts["nodes"] = node_idx
    return nodes, counts
245
+
246
+
247
def _docx_heading_level(style_name: str) -> Optional[int]:
    """Map a python-docx style name to a heading level.

    ``"Title"`` maps to 0 and ``"Heading N ..."`` maps to ``N``. Any other
    style name (including an empty one, or a non-numeric ``N``) yields None.
    """
    if style_name == "Title":
        return 0
    if not style_name or not style_name.startswith("Heading "):
        return None
    # The token right after the first space is the level candidate.
    _, _, remainder = style_name.partition(" ")
    level_token = remainder.split(" ", 1)[0]
    try:
        return int(level_token)
    except ValueError:
        return None
259
+
260
+
261
def _docx_bullet_level(para) -> Optional[int]:
    """Return the numbering indent level (``w:ilvl``) of a paragraph.

    Returns None when the paragraph has no ``w:pPr/w:numPr`` element, and 0
    when the level element is missing or its value is not an integer.
    """
    from docx.oxml.ns import qn

    para_props = para._element.find(qn("w:pPr"))
    num_props = para_props.find(qn("w:numPr")) if para_props is not None else None
    if num_props is None:
        return None

    level_elem = num_props.find(qn("w:ilvl"))
    if level_elem is None:
        return 0
    try:
        raw_level = level_elem.get(qn("w:val"))
        return int(raw_level or 0)
    except (ValueError, TypeError):
        return 0
278
+
279
+
280
+ _DRAWING_EMBED_NS = "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed"
281
+ _DRAWING_BLIP_TAG = "{http://schemas.openxmlformats.org/drawingml/2006/main}blip"
282
+
283
+
284
def _docx_inline_image_rids(para) -> list[str]:
    """Collect the relationship ids of pictures embedded in a paragraph.

    Every ``a:blip`` found under a ``w:drawing`` element contributes its
    ``r:embed`` attribute, in document order; blips without one are skipped.
    """
    from docx.oxml.ns import qn

    embed_ids = (
        blip.get(_DRAWING_EMBED_NS)
        for drawing in para._element.iter(qn("w:drawing"))
        for blip in drawing.iter(_DRAWING_BLIP_TAG)
    )
    return [rid for rid in embed_ids if rid]
295
+
296
+
297
+ _DOCX_R_ID_NS = "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}id"
298
+ _DOCX_HYPERLINK_RELTYPE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink"
299
+
300
+
301
def _docx_paragraph_hyperlinks(para, doc) -> list[dict]:
    """List the hyperlinks of a paragraph as ``{"text": ..., "url": ...}`` dicts.

    The URL is resolved through the document part's relationships and stays
    empty when the ``r:id`` is missing, unknown, or not a hyperlink
    relationship. Entries with neither text nor URL are dropped.
    """
    from docx.oxml.ns import qn

    part_rels = doc.part.rels
    text_tag = qn("w:t")
    links: list[dict] = []

    for link_elem in para._element.iter(qn("w:hyperlink")):
        rel_id = link_elem.get(_DOCX_R_ID_NS)
        target = ""
        if rel_id and rel_id in part_rels:
            relationship = part_rels[rel_id]
            if relationship.reltype == _DOCX_HYPERLINK_RELTYPE:
                target = relationship.target_ref

        # Visible label = concatenation of every w:t run text inside the hyperlink.
        label = "".join((t.text or "") for t in link_elem.iter(text_tag))
        if not (label or target):
            continue
        links.append({"text": label, "url": target})

    return links
319
+
320
+
321
def _docx_cell_colspan(cell) -> int:
    """Return the column span of a table cell (``w:gridSpan`` value), defaulting to 1."""
    from docx.oxml.ns import qn

    cell_props = cell._tc.find(qn("w:tcPr"))
    span_elem = cell_props.find(qn("w:gridSpan")) if cell_props is not None else None
    if span_elem is None:
        return 1

    raw_span = span_elem.get(qn("w:val"))
    if not raw_span:
        return 1
    try:
        return int(raw_span)
    except (ValueError, TypeError):
        return 1
336
+
337
+
338
def _docx_cell_vmerge(cell) -> Optional[str]:
    """Return the vertical-merge state of a table cell.

    None when the cell has no ``w:vMerge`` element; otherwise the element's
    ``w:val`` (e.g. ``'restart'``), or ``'continue'`` when ``w:val`` is absent.
    """
    from docx.oxml.ns import qn

    cell_props = cell._tc.find(qn("w:tcPr"))
    merge_elem = cell_props.find(qn("w:vMerge")) if cell_props is not None else None
    if merge_elem is None:
        return None
    # A vMerge element without a value is treated as a continuation cell.
    return merge_elem.get(qn("w:val")) or "continue"
350
+
351
+
352
def _docx_pages_from_pdf(
    pdf_path: Path,
    pages_dir: Path,
    out_dir: Path,
    nodes: list[dict],
) -> int:
    """Render each PDF page to a PNG and store the per-page raw text.

    Writes ``<pages_dir>/<NNN>.png`` (300 dpi) for every page and, when the
    PDF has at least one page, ``<out_dir>/pages.meta.json`` mapping ``NNN``
    to ``{"text": <page text>}``.

    ``nodes`` is accepted but not used: no page-to-node mapping is computed
    here, only the raw page text is preserved for later comparison against
    the content nodes.

    Returns:
        The number of pages processed.
    """
    _common.ensure_pip("fitz", "PyMuPDF")
    import fitz

    meta_by_page: dict[str, dict] = {}
    pdf = fitz.open(_common.long_str(pdf_path))
    try:
        page_no = 0
        for page in pdf:
            page_no += 1
            key = f"{page_no:03d}"
            pixmap = page.get_pixmap(dpi=300)
            pixmap.save(_common.long_str(pages_dir / f"{key}.png"))
            meta_by_page[key] = {"text": page.get_text("text") or ""}
    finally:
        pdf.close()

    if not meta_by_page:
        return 0

    _common.write_text(
        out_dir / "pages.meta.json",
        json.dumps(meta_by_page, ensure_ascii=False, indent=2),
    )
    return len(meta_by_page)
384
+
385
+
107
386
  # ====================================================================
108
387
  # PPTX
109
388
  # ====================================================================
@@ -115,39 +394,56 @@ def _run_pptx(
115
394
  source_name_override: Optional[str] = None,
116
395
  tool_extra: str = "",
117
396
  ) -> None:
397
+ """python-pptx 로 구조 추출 → 슬라이드별 jsonl. 시각 순서 정렬 + pos EMU 좌표.
398
+
399
+ 노드 type: title·heading·para·bullet·table_cell·image·chart·shape.
400
+ PNG 은 COM PowerPoint 의 Slide.Export 로 슬라이드별 직접 출력.
401
+ """
118
402
  _common.ensure_pip("pptx", "python-pptx")
119
403
  from pptx import Presentation
120
404
 
121
405
  slides_dir = out_dir / "slides"
122
406
  charts_dir = out_dir / "charts"
407
+ images_dir = out_dir / "images"
123
408
 
124
409
  prs = Presentation(_common.long_str(input_path))
410
+ slide_w = int(prs.slide_width or 0)
411
+ slide_h = int(prs.slide_height or 0)
412
+
125
413
  slide_titles: list[tuple[str, str]] = [] # (idx, safe_title)
126
414
  slide_summaries: list[str] = []
127
415
  slide_has_notes: dict[str, bool] = {}
128
416
  slide_charts: dict[str, list[str]] = {} # idx -> chart filenames
417
+ slide_cores: dict[str, str] = {} # idx -> 핵심 텍스트 (title 또는 첫 텍스트)
129
418
 
130
419
  _common.mkdir(slides_dir)
131
420
  for i, slide in enumerate(prs.slides, start=1):
132
421
  idx = f"{i:02d}"
133
- title = ""
134
- if slide.shapes.title and slide.shapes.title.text:
135
- title = slide.shapes.title.text.strip()
136
- if not title:
137
- title = f"슬라이드{i}"
138
- safe_title = _common.slugify_filename(title, max_len=40)
422
+ title = _pptx_slide_title(slide)
423
+ safe_title = _common.slugify_filename(title or f"슬라이드{i}", max_len=40)
139
424
  slide_titles.append((idx, safe_title))
140
425
 
141
- # 슬라이드 텍스트 (python-pptx)
142
- text_lines: list[str] = []
143
- for shape in slide.shapes:
144
- if not shape.has_text_frame:
145
- continue
146
- for para in shape.text_frame.paragraphs:
147
- line = "".join(run_.text for run_ in para.runs)
148
- if line.strip():
149
- text_lines.append(line)
150
- _common.write_text(slides_dir / f"{idx}_{safe_title}.md", "\n".join(text_lines))
426
+ nodes, chart_refs = _pptx_extract_slide_nodes(
427
+ slide, i, charts_dir, images_dir,
428
+ )
429
+ # 원본 XML 순서 (shape_idx 순) 그대로 보존. 시각 순서는 pos 가 보존되어 있어
430
+ # Claude 가 필요시 직접 정렬 가능.
431
+
432
+ meta = {
433
+ "_meta": {
434
+ "slide": i,
435
+ "title": title,
436
+ "size": [slide_w, slide_h],
437
+ "shapes": len(nodes),
438
+ }
439
+ }
440
+ lines = [json.dumps(meta, ensure_ascii=False, default=_json_default)]
441
+ for n in nodes:
442
+ lines.append(json.dumps(n, ensure_ascii=False, default=_json_default))
443
+ _common.write_text(slides_dir / f"{idx}_{safe_title}.jsonl", "\n".join(lines))
444
+
445
+ if chart_refs:
446
+ slide_charts[idx] = chart_refs
151
447
 
152
448
  if slide.has_notes_slide:
153
449
  notes_text = slide.notes_slide.notes_text_frame.text or ""
@@ -158,25 +454,18 @@ def _run_pptx(
158
454
  )
159
455
  slide_has_notes[idx] = True
160
456
 
161
- for shape_idx, shape in enumerate(slide.shapes, start=1):
162
- if shape.has_chart:
163
- data = _extract_pptx_chart_data(shape.chart)
164
- _common.mkdir(charts_dir)
165
- chart_filename = f"slide{i:02d}_chart{shape_idx:02d}.data.json"
166
- _common.write_text(
167
- charts_dir / chart_filename,
168
- json.dumps(data, ensure_ascii=False, indent=2),
169
- )
170
- slide_charts.setdefault(idx, []).append(chart_filename)
457
+ core = title or _pptx_first_text(nodes)
458
+ if core:
459
+ slide_cores[idx] = core[:60]
171
460
 
172
- # 슬라이드별 산출물 풀목록
173
- parts = [f"`slides/{idx}_{safe_title}.png`", "`.md`"]
461
+ parts = [f"`slides/{idx}_{safe_title}.png`", "`.jsonl`"]
174
462
  if slide_has_notes.get(idx):
175
463
  parts.append("`.notes.md`")
176
- chart_refs = slide_charts.get(idx, [])
177
464
  if chart_refs:
178
465
  chart_str = ", ".join(f"`charts/{c}`" for c in chart_refs)
179
466
  parts.append(f"(차트: {chart_str})")
467
+ if slide_cores.get(idx):
468
+ parts.append(f"— {slide_cores[idx]}")
180
469
  slide_summaries.append(" ".join(parts))
181
470
 
182
471
  # COM PowerPoint 의 Slide.Export 로 슬라이드별 PNG 직접 출력. 임시 폴더에서 만든 후 long-path-safe copy.
@@ -187,7 +476,8 @@ def _run_pptx(
187
476
  if tmp_png.exists():
188
477
  _common.copy(tmp_png, slides_dir / f"{idx}_{safe_title}.png")
189
478
 
190
- # pptx 의 시각은 슬라이드 PNG 에 모두 포함 → images/ 만들지 않음 (embeddings 만 추출).
479
+ # pptx 의 시각은 슬라이드 PNG 에 모두 포함 → ZIP media 전체 복제 skip
480
+ # (개별 picture shape 은 _pptx_extract_slide_nodes 에서 image ref 와 함께 저장됨).
191
481
  attachment_links = _extract_zip_media(
192
482
  input_path,
193
483
  out_dir,
@@ -209,12 +499,213 @@ def _run_pptx(
209
499
  source_name=source_name,
210
500
  source_size=source_size,
211
501
  tool=("python-pptx + COM PowerPoint + ZIP " + tool_extra).strip(),
212
- loss_notes="애니메이션·슬라이드 전환·정확한 폰트는 미보존. 시각은 슬라이드별 PNG 로, 차트 데이터는 charts/*.data.json 으로 보존.",
502
+ loss_notes=(
503
+ "애니메이션·슬라이드 전환·정확한 폰트는 미보존. "
504
+ "시각은 슬라이드별 PNG, 구조는 슬라이드별 .jsonl(시각 순서·pos EMU 좌표), "
505
+ "차트 데이터는 charts/*.data.json, picture shape 의 image 는 images/."
506
+ ),
213
507
  sections=sections or None,
214
508
  attachments=attachment_links,
215
509
  )
216
510
 
217
511
 
512
def _pptx_slide_title(slide) -> str:
    """Return the stripped text of the slide's title shape, or "" if there is none."""
    try:
        placeholder = slide.shapes.title
        if placeholder is None or not placeholder.text:
            return ""
        return placeholder.text.strip()
    except (AttributeError, ValueError):
        # No usable title on this slide.
        return ""
521
+
522
+
523
def _pptx_first_text(nodes: list[dict]) -> str:
    """Return the first non-blank ``text`` value (stripped) among the nodes, or ""."""
    stripped_texts = ((entry.get("text") or "").strip() for entry in nodes)
    return next((candidate for candidate in stripped_texts if candidate), "")
530
+
531
+
532
def _pptx_extract_slide_nodes(
    slide,
    slide_num: int,
    charts_dir: Path,
    images_dir: Path,
) -> tuple[list[dict], list[str]]:
    """Convert the shapes of one slide into nodes, in shape order.

    Every node carries ``slide``, ``pos`` and the 0-based ``shape_idx``.
    Tables are flattened into ``table_cell`` nodes, charts into ``chart``
    nodes (data written to ``charts_dir``), pictures into ``image`` nodes
    (blob written to ``images_dir``), text frames into one ``title`` /
    ``bullet`` / ``para`` node per paragraph, and anything else into a
    ``shape`` node with its ``subtype``.

    Args:
        slide: The slide object whose ``shapes`` are walked.
        slide_num: 1-based slide number, used in node fields and file names.
        charts_dir: Directory receiving ``slideNN_chartNN.data.json`` files.
        images_dir: Directory receiving extracted picture files.

    Returns:
        ``(nodes, chart_refs)`` where ``chart_refs`` lists the chart data
        file names that were actually written.
    """
    nodes: list[dict] = []
    chart_refs: list[str] = []

    try:
        title_shape = slide.shapes.title
    except (AttributeError, ValueError):
        title_shape = None

    for shape_idx, shape in enumerate(slide.shapes):
        common = {
            "slide": slide_num,
            "pos": _pptx_shape_pos(shape),
            "shape_idx": shape_idx,
        }

        # Table: one node per cell, 1-based row/col.
        if getattr(shape, "has_table", False):
            try:
                table = shape.table
            except Exception:
                table = None
            if table is not None:
                table_idx = shape_idx + 1
                for r_idx, row in enumerate(table.rows, start=1):
                    for c_idx, cell in enumerate(row.cells, start=1):
                        nodes.append({
                            **common,
                            "type": "table_cell",
                            "table_idx": table_idx,
                            "row": r_idx,
                            "col": c_idx,
                            "text": (cell.text or "").strip(),
                        })
                continue

        # Chart: data goes to a side file; the node references it.
        if getattr(shape, "has_chart", False):
            try:
                data = _extract_pptx_chart_data(shape.chart)
            except Exception:
                data = None
            chart_node = {**common, "type": "chart"}
            if data is not None:
                chart_filename = f"slide{slide_num:02d}_chart{shape_idx + 1:02d}.data.json"
                _common.mkdir(charts_dir)
                _common.write_text(
                    charts_dir / chart_filename,
                    json.dumps(data, ensure_ascii=False, indent=2),
                )
                chart_refs.append(chart_filename)
                # Only reference the data file when it was really written, so
                # a failed extraction never leaves a dangling ref (same rule
                # as the picture branch below).
                chart_node["ref"] = f"charts/{chart_filename}"
            nodes.append(chart_node)
            continue

        # Picture.
        if _pptx_is_picture(shape):
            ref = _pptx_save_picture(shape, slide_num, shape_idx, images_dir)
            node = {**common, "type": "image"}
            if ref:
                node["ref"] = ref
            nodes.append(node)
            continue

        # Shape with a text frame (placeholder, text box, autoshape with text).
        if getattr(shape, "has_text_frame", False):
            is_title = (title_shape is not None and shape == title_shape)
            for p_idx, para in enumerate(shape.text_frame.paragraphs):
                text = "".join(run.text for run in para.runs)
                hyperlinks = _pptx_run_hyperlinks(para)
                bullet_lvl = getattr(para, "level", 0) or 0

                base_node: dict
                if is_title and p_idx == 0:
                    base_node = {**common, "type": "title", "para_idx": p_idx, "text": text}
                elif bullet_lvl > 0:
                    base_node = {**common, "type": "bullet", "para_idx": p_idx,
                                 "level": bullet_lvl, "text": text}
                else:
                    base_node = {**common, "type": "para", "para_idx": p_idx, "text": text}
                if hyperlinks:
                    base_node["hyperlinks"] = hyperlinks
                nodes.append(base_node)
            continue

        # Everything else (group, SmartArt, connector, plain autoshape, ...).
        subtype = ""
        try:
            subtype = str(shape.shape_type)
        except Exception:
            pass
        nodes.append({
            **common,
            "type": "shape",
            "subtype": subtype,
        })

    return nodes, chart_refs
646
+
647
+
648
def _pptx_shape_pos(shape) -> list[int]:
    """Return ``[left, top, width, height]`` of a shape as ints.

    A missing (None) component counts as 0; if any attribute is absent or
    cannot be converted, the whole result is ``[0, 0, 0, 0]``.
    """
    try:
        return [
            int(getattr(shape, attr) or 0)
            for attr in ("left", "top", "width", "height")
        ]
    except (AttributeError, TypeError, ValueError):
        return [0, 0, 0, 0]
659
+
660
+
661
def _pptx_is_picture(shape) -> bool:
    """Tell whether a shape is a picture.

    True when ``shape.shape_type`` equals ``MSO_SHAPE_TYPE.PICTURE``, or,
    failing that, when reading ``shape.image`` succeeds (covers picture
    placeholders whose shape type is not PICTURE). Any error in either check
    counts as "not matched".
    """
    try:
        from pptx.enum.shapes import MSO_SHAPE_TYPE
        typed_as_picture = bool(shape.shape_type == MSO_SHAPE_TYPE.PICTURE)
    except Exception:
        typed_as_picture = False
    if typed_as_picture:
        return True

    # Fallback: a shape exposing an image is treated as a picture.
    try:
        shape.image
    except Exception:
        return False
    return True
675
+
676
+
677
def _pptx_run_hyperlinks(para) -> list[dict]:
    """List ``{"text": ..., "url": ...}`` for every run of a paragraph that has a link address.

    Runs whose hyperlink cannot be read, or whose address is empty, are skipped.
    """
    links: list[dict] = []
    for text_run in para.runs:
        try:
            address = getattr(text_run.hyperlink, "address", None)
        except Exception:
            address = None
        if not address:
            continue
        links.append({"text": text_run.text or "", "url": address})
    return links
689
+
690
+
691
def _pptx_save_picture(
    shape, slide_num: int, shape_idx: int, images_dir: Path,
) -> Optional[str]:
    """Write a picture shape's image blob under ``images_dir``.

    The file is named ``slideNN_shapeNN.<ext>`` (shape number is
    ``shape_idx + 1``; extension falls back to ``bin``).

    Returns:
        The relative reference ``images/<filename>``, or None when the image
        cannot be read or its blob is empty.
    """
    try:
        image = shape.image
        extension = (image.ext or "bin").lstrip(".")
        payload = image.blob
    except Exception:
        return None
    if not payload:
        return None

    target_name = f"slide{slide_num:02d}_shape{shape_idx + 1:02d}.{extension}"
    _common.mkdir(images_dir)
    _common.write_bytes(images_dir / target_name, payload)
    return f"images/{target_name}"
707
+
708
+
218
709
  # ====================================================================
219
710
  # XLSX
220
711
  # ====================================================================
@@ -254,9 +745,10 @@ def _run_xlsx(
254
745
  sheet_names.append((idx, safe_name, name))
255
746
 
256
747
  # COM Excel 호출: 데이터 영역 → ChartObject + Range.CopyPicture → 시트별 PNG.
257
- # 시트별 (last_row, last_col) 도 같이 반환되어 .md/.formulas.json 이 같은 데이터 영역으로 통일됨.
748
+ # 시트별 (last_row, last_col) 도 같이 반환되어 .jsonl 이 같은 데이터 영역으로 통일됨.
749
+ # PNG export 실패한 시트는 sheet_png_skipped 에 사유 (silent skip 금지).
258
750
  with _common.com_lock():
259
- sheet_ranges = _excel_export_sheet_pngs(input_path, sheets_dir, sheet_names)
751
+ sheet_ranges, sheet_png_skipped = _excel_export_sheet_pngs(input_path, sheets_dir, sheet_names)
260
752
 
261
753
  for idx, safe_name, raw_name in sheet_names:
262
754
  ws_v = wb_values[raw_name]
@@ -266,24 +758,9 @@ def _run_xlsx(
266
758
  last_row, last_col = sheet_ranges.get(raw_name, (ws_v.max_row, ws_v.max_column))
267
759
  sheet_dims[idx] = (last_row, last_col)
268
760
 
269
- md_lines = _sheet_to_md(ws_v, last_row, last_col)
270
- _common.write_text(sheets_dir / f"{idx}_{safe_name}.md", "\n".join(md_lines))
271
-
272
- formulas: dict[str, str] = {}
273
- if last_row >= 1 and last_col >= 1:
274
- for row in ws_f.iter_rows(min_row=1, max_row=last_row, min_col=1, max_col=last_col):
275
- for cell in row:
276
- if cell.data_type != "f":
277
- continue
278
- v = cell.value
279
- # 일반·shared formula 는 str, array formula 는 ArrayFormula(.text 보유)
280
- formulas[cell.coordinate] = v if isinstance(v, str) else getattr(v, "text", str(v))
281
- if formulas:
282
- _common.write_text(
283
- sheets_dir / f"{idx}_{safe_name}.formulas.json",
284
- json.dumps(formulas, ensure_ascii=False, indent=2),
285
- )
286
- sheet_formula_count[idx] = len(formulas)
761
+ jsonl_lines, formula_n = _sheet_to_jsonl(ws_v, ws_f, last_row, last_col)
762
+ _common.write_text(sheets_dir / f"{idx}_{safe_name}.jsonl", "\n".join(jsonl_lines))
763
+ sheet_formula_count[idx] = formula_n
287
764
 
288
765
  for chart_idx, chart in enumerate(getattr(ws_f, "_charts", []), start=1):
289
766
  data = _extract_openpyxl_chart_data(chart)
@@ -294,6 +771,21 @@ def _run_xlsx(
294
771
  json.dumps(data, ensure_ascii=False, indent=2),
295
772
  )
296
773
  sheet_charts.setdefault(idx, []).append(chart_filename)
774
+
775
+ # 워크북 단위 메타 (defined names 등) — 시트 jsonl 외부 분리.
776
+ wb_meta = _workbook_meta(wb_formulas)
777
+ if wb_meta:
778
+ _common.write_text(
779
+ out_dir / "workbook.meta.json",
780
+ json.dumps(wb_meta, ensure_ascii=False, indent=2),
781
+ )
782
+
783
+ # VBA 시트 객체명 ↔ raw 시트명 매핑 (시트 codeName 기반)
784
+ sheet_code_map: dict[str, str] = {}
785
+ for ws in wb_formulas.worksheets:
786
+ code = getattr(ws.sheet_properties, "codeName", None)
787
+ if code:
788
+ sheet_code_map[code] = ws.title
297
789
  finally:
298
790
  wb_values.close()
299
791
  wb_formulas.close()
@@ -312,9 +804,13 @@ def _run_xlsx(
312
804
  for idx, safe_name, raw_name in sheet_names:
313
805
  last_row, last_col = sheet_dims.get(idx, (0, 0))
314
806
  formula_n = sheet_formula_count.get(idx, 0)
315
- parts = [f"`sheets/{idx}_{safe_name}.png`", "`.md`"]
316
- if formula_n:
317
- parts.append("`.formulas.json`")
807
+ png_path = sheets_dir / f"{idx}_{safe_name}.png"
808
+ if png_path.exists():
809
+ parts = [f"`sheets/{idx}_{safe_name}.png`", "`.jsonl`"]
810
+ else:
811
+ # PNG 미생성 — worker 가 사유 전달 (16-bit cap / COM 실패 등)
812
+ reason = sheet_png_skipped.get(raw_name, "사유 미상")
813
+ parts = [f"`sheets/{idx}_{safe_name}.jsonl`", f"(PNG 미생성 — {reason})"]
318
814
  chart_refs = sheet_charts.get(idx, [])
319
815
  if chart_refs:
320
816
  parts.append("(차트: " + ", ".join(f"`charts/{c}`" for c in chart_refs) + ")")
@@ -328,7 +824,9 @@ def _run_xlsx(
328
824
  sheet_summaries.append(" ".join(parts) + " " + meta)
329
825
 
330
826
  source_name, source_size = _source_meta(input_path, out_dir, source_name_override)
331
- macro_modules = _extract_macros(_source_path(out_dir, source_name), out_dir)
827
+ macro_modules = _extract_macros(
828
+ _source_path(out_dir, source_name), out_dir, sheet_code_map=sheet_code_map,
829
+ )
332
830
 
333
831
  sections: dict[str, list[str]] = {}
334
832
  if sheet_summaries:
@@ -341,7 +839,11 @@ def _run_xlsx(
341
839
  source_name=source_name,
342
840
  source_size=source_size,
343
841
  tool=("openpyxl + COM Excel + ZIP " + tool_extra).strip(),
344
- loss_notes="셀 서식·조건부 서식·데이터 검증 규칙은 미보존. 시각은 시트별 PNG 로, 표 구조는 .md 로, 셀 수식은 .formulas.json 으로 보존.",
842
+ loss_notes=(
843
+ "셀 서식·조건부 서식·데이터 검증 규칙은 미보존. "
844
+ "시각은 시트별 PNG, 데이터·수식·시트 메타는 시트별 .jsonl 한 줄=한 행(좌표 명시), "
845
+ "워크북 단위 메타(defined names 등)는 workbook.meta.json."
846
+ ),
345
847
  sections=sections or None,
346
848
  attachments=attachment_links,
347
849
  )
@@ -396,11 +898,13 @@ def _excel_export_sheet_pngs(
396
898
  input_path: Path,
397
899
  sheets_dir: Path,
398
900
  sheet_names: list[tuple[str, str, str]],
399
- ) -> dict[str, tuple[int, int]]:
400
- """시트별 PNG 생성 + (last_row, last_col) 매핑 반환.
901
+ ) -> tuple[dict[str, tuple[int, int]], dict[str, str]]:
902
+ """시트별 PNG 생성 + (last_row, last_col) 매핑 + skipped 사유 반환.
401
903
 
402
904
  호출자에서 sheetProtection strip 사본 만들고 worker 에 그 사본 path 만 넘김.
403
905
  Excel COM 자체 작업은 worker subprocess.
906
+
907
+ 반환: (sheet_ranges, skipped) — skipped 는 PNG export 실패한 시트의 사유 dict (raw_name → reason).
404
908
  """
405
909
  with _common.temp_workdir() as tmp:
406
910
  unprotected = tmp / "_unprotected.xlsx"
@@ -409,8 +913,13 @@ def _excel_export_sheet_pngs(
409
913
  "excel_sheets", str(unprotected), str(sheets_dir), json.dumps(sheet_names),
410
914
  timeout=600, capture_stdout=True,
411
915
  )
412
- raw = json.loads(result) if result.strip() else {}
413
- return {k: tuple(v) for k, v in raw.items()}
916
+ if not result.strip():
917
+ return {}, {}
918
+ parsed = json.loads(result)
919
+ ranges_raw = parsed.get("sheet_ranges", {})
920
+ sheet_ranges = {k: tuple(v) for k, v in ranges_raw.items()}
921
+ skipped = parsed.get("skipped", {})
922
+ return sheet_ranges, skipped
414
923
 
415
924
 
416
925
  def _xlsx_strip_protection(src: Path, dst: Path) -> None:
@@ -493,11 +1002,18 @@ def _source_path(out_dir: Path, source_name: str) -> Path:
493
1002
  return out_dir / f"_source.{ext}"
494
1003
 
495
1004
 
496
- def _extract_macros(input_path: Path, out_dir: Path) -> list[str]:
1005
+ def _extract_macros(
1006
+ input_path: Path,
1007
+ out_dir: Path,
1008
+ sheet_code_map: Optional[dict[str, str]] = None,
1009
+ ) -> list[str]:
497
1010
  """OLE/OOXML 파일에서 VBA 매크로 추출. macros/<모듈명>.vba 로 저장.
498
1011
 
499
1012
  추출된 모듈 파일명 list 반환 (예: ["Module1.vba", "ThisWorkbook.vba"]).
500
1013
  매크로 없으면 빈 list.
1014
+
1015
+ sheet_code_map: VBA 시트 객체 codeName → raw 시트명 (예: {"Sheet1": "BOA"}).
1016
+ 매크로 파일 첫 줄에 코멘트로 매핑 정보 prepend (시트 모듈만).
501
1017
  """
502
1018
  _common.ensure_pip("oletools")
503
1019
  from oletools.olevba import VBA_Parser
@@ -512,8 +1028,11 @@ def _extract_macros(input_path: Path, out_dir: Path) -> list[str]:
512
1028
  for (_filename, stream_path, vba_filename, vba_code) in parser.extract_macros():
513
1029
  module_name = vba_filename or stream_path or "module"
514
1030
  stem = Path(module_name).stem or "module"
1031
+ prefix = ""
1032
+ if sheet_code_map and stem in sheet_code_map:
1033
+ prefix = f'\' (object: {stem}, sheet: "{sheet_code_map[stem]}")\n\n'
515
1034
  dst = _common.unique_path(macros_dir, f"{stem}.vba")
516
- _common.write_text(dst, vba_code or "")
1035
+ _common.write_text(dst, prefix + (vba_code or ""))
517
1036
  module_files.append(dst.name)
518
1037
  return module_files
519
1038
  finally:
@@ -617,40 +1136,112 @@ def _extract_zip_media(
617
1136
  dst = _common.unique_path(attachments_dir, base)
618
1137
  with zf.open(info) as f:
619
1138
  _common.write_bytes(dst, f.read())
1139
+ size = dst.stat().st_size
620
1140
  recursed = maybe_recurse_attachment(dst, attachments_dir)
621
1141
  if recursed is not None:
622
1142
  os.unlink(_common.long_str(dst))
623
- attachment_links.append(f"attachments/{recursed.name}/")
1143
+ attachment_links.append(f"attachments/{recursed.name}/ ({_common.format_size(size)})")
624
1144
  else:
625
- attachment_links.append(f"attachments/{dst.name}")
1145
+ attachment_links.append(f"attachments/{dst.name} ({_common.format_size(size)})")
626
1146
  return attachment_links
627
1147
 
628
1148
 
629
- def _sheet_to_md(ws, last_row: int, last_col: int) -> list[str]:
630
- """openpyxl Worksheet (1,1)~(last_row,last_col) 범위를 마크다운 라인으로."""
1149
+ def _json_default(obj: Any) -> str:
1150
+ """JSON 직렬화 fallback. openpyxl datetime ISO 8601. 외는 throw."""
1151
+ if isinstance(obj, (datetime, date, time)):
1152
+ return obj.isoformat()
1153
+ raise TypeError(f"not JSON serializable: {type(obj).__name__}")
1154
+
1155
+
1156
+ def _sheet_to_jsonl(ws_v, ws_f, last_row: int, last_col: int) -> tuple[list[str], int]:
1157
+ """openpyxl Worksheet 의 (1,1)~(last_row,last_col) 범위를 행 단위 JSONL 라인으로.
1158
+
1159
+ 한 줄 = 한 행. 빈 셀 키 생략. 좌표는 `r`(1-based 행번호) + 열문자 키(`A`·`B`·...·`AA`·...).
1160
+ 같은 행 수식은 `_f` 맵 (열문자 → 수식문자열). 빈 행도 `{"r":N}` 한 줄 유지 → Read offset = 행번호.
1161
+ 첫 줄은 `{"_meta":{...}}` (시트 dims·merges·frozen·hyperlinks·comments).
1162
+ 값 타입은 JSON 네이티브(int·float·bool) + datetime ISO 8601.
1163
+
1164
+ 반환: (lines, formula_count)
1165
+ """
1166
+ from openpyxl.utils import get_column_letter
1167
+
631
1168
  if last_row < 1 or last_col < 1:
632
- return ["(빈 시트)"]
633
-
634
- rows: list[list[str]] = []
635
- for row in ws.iter_rows(
636
- min_row=1, max_row=last_row, min_col=1, max_col=last_col, values_only=True
637
- ):
638
- rows.append(["" if v is None else str(v) for v in row])
639
- if not rows or not any(any(c for c in r) for r in rows):
640
- return ["(빈 시트)"]
641
-
642
- header = rows[0]
643
- md_lines: list[str] = []
644
- md_lines.append("| " + " | ".join(_md_escape(c) for c in header) + " |")
645
- md_lines.append("| " + " | ".join("---" for _ in header) + " |")
646
- for row in rows[1:]:
647
- padded = list(row) + [""] * (len(header) - len(row))
648
- md_lines.append("| " + " | ".join(_md_escape(c) for c in padded[: len(header)]) + " |")
649
- return md_lines
650
-
651
-
652
- def _md_escape(s: str) -> str:
653
- return s.replace("|", "\\|").replace("\n", " ")
1169
+ meta = {"_meta": {"dims": [0, 0]}}
1170
+ return [json.dumps(meta, ensure_ascii=False)], 0
1171
+
1172
+ # 메타 수집: 머지·frozen·hyperlinks·comments
1173
+ meta: dict[str, Any] = {"dims": [last_row, last_col]}
1174
+ merges = [str(r) for r in ws_v.merged_cells.ranges]
1175
+ if merges:
1176
+ meta["merges"] = merges
1177
+ frozen = ws_v.freeze_panes
1178
+ if frozen:
1179
+ meta["frozen"] = frozen
1180
+
1181
+ hyperlinks: dict[str, str] = {}
1182
+ comments: dict[str, str] = {}
1183
+ number_formats: dict[str, str] = {} # General(기본) 외 셀의 표시 형식
1184
+ for row in ws_v.iter_rows(min_row=1, max_row=last_row, min_col=1, max_col=last_col):
1185
+ for cell in row:
1186
+ hl = getattr(cell, "hyperlink", None)
1187
+ if hl is not None and getattr(hl, "target", None):
1188
+ hyperlinks[cell.coordinate] = hl.target
1189
+ cm = getattr(cell, "comment", None)
1190
+ if cm is not None and getattr(cm, "text", None):
1191
+ comments[cell.coordinate] = cm.text
1192
+ nf = getattr(cell, "number_format", None)
1193
+ if nf and nf != "General":
1194
+ number_formats[cell.coordinate] = nf
1195
+ if hyperlinks:
1196
+ meta["hyperlinks"] = hyperlinks
1197
+ if comments:
1198
+ meta["comments"] = comments
1199
+ if number_formats:
1200
+ meta["number_formats"] = number_formats
1201
+
1202
+ lines: list[str] = [json.dumps({"_meta": meta}, ensure_ascii=False, default=_json_default)]
1203
+ formula_count = 0
1204
+
1205
+ rows_v = ws_v.iter_rows(min_row=1, max_row=last_row, min_col=1, max_col=last_col, values_only=True)
1206
+ rows_f = ws_f.iter_rows(min_row=1, max_row=last_row, min_col=1, max_col=last_col)
1207
+ for r_idx, (row_v, row_f) in enumerate(zip(rows_v, rows_f), start=1):
1208
+ row_data: dict[str, Any] = {"r": r_idx}
1209
+ fmap: dict[str, str] = {}
1210
+ for c_idx, (v, fcell) in enumerate(zip(row_v, row_f), start=1):
1211
+ col_letter = get_column_letter(c_idx)
1212
+ if v is not None:
1213
+ row_data[col_letter] = v
1214
+ if fcell.data_type == "f":
1215
+ fv = fcell.value
1216
+ # 일반·shared formula 는 str, array formula 는 ArrayFormula(.text 보유)
1217
+ fmap[col_letter] = fv if isinstance(fv, str) else getattr(fv, "text", str(fv))
1218
+ formula_count += 1
1219
+ if fmap:
1220
+ row_data["_f"] = fmap
1221
+ lines.append(json.dumps(row_data, ensure_ascii=False, default=_json_default))
1222
+
1223
+ return lines, formula_count
1224
+
1225
+
1226
+ def _workbook_meta(wb) -> dict[str, Any]:
1227
+ """워크북 단위 메타 (defined names 등). 비어있으면 빈 dict 반환."""
1228
+ meta: dict[str, Any] = {}
1229
+ defined_names: dict[str, list[str]] = {}
1230
+ # openpyxl 3.x: wb.defined_names 는 DefinedNameDict (dict-like)
1231
+ try:
1232
+ for name, dn in wb.defined_names.items():
1233
+ try:
1234
+ dests = [f"'{sheet}'!{addr}" for sheet, addr in dn.destinations]
1235
+ except Exception:
1236
+ # destinations 파싱 불가 시 raw value 보존 (예: 워크북-수식 형태)
1237
+ dests = [str(getattr(dn, "value", ""))]
1238
+ defined_names[name] = dests
1239
+ except Exception:
1240
+ # defined_names 자체 접근 실패 → 워크북에 없는 것으로 처리
1241
+ pass
1242
+ if defined_names:
1243
+ meta["defined_names"] = defined_names
1244
+ return meta
654
1245
 
655
1246
 
656
1247
  def _extract_pptx_chart_data(chart) -> dict: