@simplysm/sd-claude 14.0.76 → 14.0.78
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/claude/output-styles/sd-tone.md +128 -0
- package/claude/references/sd-simplysm14/apis/angular/README.md +28 -89
- package/claude/references/sd-simplysm14/apis/angular/app-structure.md +75 -32
- package/claude/references/sd-simplysm14/apis/angular/buttons.md +65 -29
- package/claude/references/sd-simplysm14/apis/angular/crud.md +86 -21
- package/claude/references/sd-simplysm14/apis/angular/forms.md +168 -42
- package/claude/references/sd-simplysm14/apis/angular/infrastructure.md +200 -49
- package/claude/references/sd-simplysm14/apis/angular/kanban.md +64 -20
- package/claude/references/sd-simplysm14/apis/angular/layout.md +75 -30
- package/claude/references/sd-simplysm14/apis/angular/modal.md +92 -40
- package/claude/references/sd-simplysm14/apis/angular/routing.md +86 -25
- package/claude/references/sd-simplysm14/apis/angular/selection-managers.md +72 -41
- package/claude/references/sd-simplysm14/apis/angular/shared-data.md +113 -21
- package/claude/references/sd-simplysm14/apis/angular/sheet.md +108 -33
- package/claude/references/sd-simplysm14/apis/angular/toast.md +81 -30
- package/claude/references/sd-simplysm14/apis/angular/visual.md +140 -32
- package/claude/references/sd-simplysm14/apis/capacitor-plugin-auto-update/README.md +46 -43
- package/claude/references/sd-simplysm14/apis/capacitor-plugin-intent/README.md +59 -48
- package/claude/references/sd-simplysm14/apis/capacitor-plugin-usb-storage/README.md +17 -7
- package/claude/references/sd-simplysm14/apis/core-common/README.md +43 -116
- package/claude/references/sd-simplysm14/apis/core-common/extensions.md +74 -109
- package/claude/references/sd-simplysm14/apis/core-common/features.md +40 -35
- package/claude/references/sd-simplysm14/apis/core-common/types.md +80 -106
- package/claude/references/sd-simplysm14/apis/core-common/utils.md +142 -111
- package/claude/references/sd-simplysm14/apis/core-node/README.md +7 -16
- package/claude/references/sd-simplysm14/apis/core-node/consola.md +33 -38
- package/claude/references/sd-simplysm14/apis/core-node/cpx.md +25 -33
- package/claude/references/sd-simplysm14/apis/core-node/fs-watcher.md +27 -38
- package/claude/references/sd-simplysm14/apis/core-node/fsx.md +32 -60
- package/claude/references/sd-simplysm14/apis/core-node/pathx.md +14 -45
- package/claude/references/sd-simplysm14/apis/core-node/worker.md +35 -81
- package/claude/references/sd-simplysm14/apis/excel/README.md +178 -80
- package/claude/references/sd-simplysm14/apis/lint/README.md +5 -0
- package/claude/references/sd-simplysm14/apis/orm-node/README.md +1 -1
- package/claude/references/sd-simplysm14/apis/sd-claude/README.md +28 -5
- package/claude/references/sd-simplysm14/apis/sd-cli/README.md +1 -1
- package/claude/references/sd-simplysm14/apis/service-client/README.md +57 -50
- package/claude/references/sd-simplysm14/apis/service-server/README.md +8 -15
- package/claude/references/sd-simplysm14/apis/service-server/auth.md +24 -16
- package/claude/references/sd-simplysm14/apis/service-server/builtin-services.md +55 -31
- package/claude/references/sd-simplysm14/apis/service-server/define-service.md +28 -44
- package/claude/references/sd-simplysm14/apis/service-server/internals.md +59 -18
- package/claude/references/sd-simplysm14/apis/service-server/server.md +37 -46
- package/claude/references/sd-simplysm14/manuals/client-component.md +3 -1
- package/claude/references/sd-simplysm14/manuals/logging.md +9 -8
- package/claude/rules/sd-base-rules.md +377 -219
- package/claude/settings.json +1 -0
- package/claude/skills/sd-commit/SKILL.md +31 -8
- package/claude/skills/sd-docs/SKILL.md +15 -10
- package/claude/skills/sd-docs/references/subagent-prompt.md +26 -8
- package/claude/skills/sd-impl/SKILL.md +1 -1
- package/claude/skills/sd-skill/references/skill-authoring.md +1 -1
- package/claude/skills/sd-spec/SKILL.md +22 -13
- package/claude/skills/sd-spec/references/spec-authoring.md +1 -1
- package/claude/skills/sd-unpack/SKILL.md +150 -26
- package/claude/skills/sd-unpack/scripts/handlers/__pycache__/_common.cpython-314.pyc +0 -0
- package/claude/skills/sd-unpack/scripts/handlers/__pycache__/eml_handler.cpython-314.pyc +0 -0
- package/claude/skills/sd-unpack/scripts/handlers/__pycache__/office_com.cpython-314.pyc +0 -0
- package/claude/skills/sd-unpack/scripts/handlers/__pycache__/pdf_handler.cpython-314.pyc +0 -0
- package/claude/skills/sd-unpack/scripts/handlers/_common.py +17 -2
- package/claude/skills/sd-unpack/scripts/handlers/eml_handler.py +100 -24
- package/claude/skills/sd-unpack/scripts/handlers/msg_handler.py +140 -27
- package/claude/skills/sd-unpack/scripts/handlers/office_com.py +698 -107
- package/claude/skills/sd-unpack/scripts/handlers/office_worker.py +34 -26
- package/claude/skills/sd-unpack/scripts/handlers/pdf_handler.py +130 -8
- package/package.json +1 -1
|
@@ -1,22 +1,26 @@
|
|
|
1
1
|
"""Office (docx/pptx/xlsx) + 레거시 (doc/ppt/xls/xlsb) COM 핸들러.
|
|
2
2
|
|
|
3
|
-
시각 산출물은 PNG,
|
|
3
|
+
시각 산출물은 PNG, 텍스트/구조 산출물은 형식별로:
|
|
4
4
|
- docx → pages/<NNN>.png + pages/<NNN>.md (페이지별)
|
|
5
5
|
- pptx → slides/<NN>_<title>.png + .md + .notes.md (슬라이드별)
|
|
6
|
-
- xlsx → sheets/<NN>_<name>.png + .
|
|
6
|
+
- xlsx → sheets/<NN>_<name>.png + .jsonl (시트별) + workbook.meta.json
|
|
7
|
+
|
|
8
|
+
xlsx jsonl 한 줄 = 한 행. 좌표는 행번호(`r`)·열문자 키로 명시. 값·수식·시트 메타 통합.
|
|
7
9
|
|
|
8
10
|
Office COM 호출은 office_worker.py subprocess 로 격리 (cleanup race 회피).
|
|
9
|
-
이 모듈 (office_com.py) 은 호출자 + Office 외 작업 (
|
|
11
|
+
이 모듈 (office_com.py) 은 호출자 + Office 외 작업 (jsonl 직렬화, ZIP strip, 매크로 추출, README 생성).
|
|
10
12
|
원칙: 처리 실패는 묻지 않고 그대로 throw. try/finally 는 락/임시 폴더 cleanup 에만 사용.
|
|
11
13
|
"""
|
|
12
14
|
from __future__ import annotations
|
|
13
15
|
|
|
14
16
|
import json
|
|
15
17
|
import os
|
|
18
|
+
import re
|
|
16
19
|
import sys
|
|
17
20
|
import zipfile
|
|
21
|
+
from datetime import date, datetime, time
|
|
18
22
|
from pathlib import Path
|
|
19
|
-
from typing import Optional
|
|
23
|
+
from typing import Any, Optional
|
|
20
24
|
|
|
21
25
|
from . import _common
|
|
22
26
|
from .dispatch import maybe_recurse_attachment
|
|
@@ -39,20 +43,25 @@ def run(input_path: Path, out_dir: Path) -> None:
|
|
|
39
43
|
|
|
40
44
|
|
|
41
45
|
def run_legacy(input_path: Path, out_dir: Path) -> None:
    """Convert a legacy Office file to its modern format, then unpack it.

    The intermediate ``_converted.<ext>`` file lives only in a temporary
    working directory, so it is never left behind in ``out_dir``.

    Args:
        input_path: The legacy file (.doc / .ppt / .xls / .xlsb).
        out_dir: Output folder handed to the format-specific runner.

    Raises:
        KeyError: If the suffix of ``input_path`` is not a known legacy one.
    """
    legacy_ext = input_path.suffix.lower()
    modern_ext = {".doc": ".docx", ".ppt": ".pptx", ".xls": ".xlsx", ".xlsb": ".xlsx"}[legacy_ext]
    tool_extra = f"(레거시 {legacy_ext} → {modern_ext} 변환 후 처리)"

    # One runner per modern format; every value of the mapping above is covered.
    runners = {".docx": _run_docx, ".pptx": _run_pptx, ".xlsx": _run_xlsx}

    with _common.temp_workdir() as workdir:
        converted = workdir / f"_converted{modern_ext}"
        _convert_legacy(input_path, converted)
        runners[modern_ext](
            converted,
            out_dir,
            source_name_override=input_path.name,
            tool_extra=tool_extra,
        )
|
|
56
65
|
|
|
57
66
|
|
|
58
67
|
# ====================================================================
|
|
@@ -66,15 +75,30 @@ def _run_docx(
|
|
|
66
75
|
source_name_override: Optional[str] = None,
|
|
67
76
|
tool_extra: str = "",
|
|
68
77
|
) -> None:
|
|
78
|
+
"""python-docx 로 구조 추출 → content.jsonl 단일 시퀀스. 페이지 단위 폐기.
|
|
79
|
+
|
|
80
|
+
PNG 는 fitz PDF 경유로 시각 검증용 유지. pages.meta.json 으로 페이지↔노드 best-effort 매핑.
|
|
81
|
+
"""
|
|
82
|
+
_common.ensure_pip("docx", "python-docx")
|
|
83
|
+
|
|
69
84
|
pages_dir = out_dir / "pages"
|
|
70
85
|
images_dir = out_dir / "images"
|
|
71
86
|
|
|
72
|
-
#
|
|
87
|
+
# 1. python-docx 구조 추출
|
|
88
|
+
nodes, counts = _docx_extract_nodes(input_path)
|
|
89
|
+
|
|
90
|
+
# content.jsonl
|
|
91
|
+
lines: list[str] = [json.dumps({"_meta": counts}, ensure_ascii=False)]
|
|
92
|
+
for n in nodes:
|
|
93
|
+
lines.append(json.dumps(n, ensure_ascii=False, default=_json_default))
|
|
94
|
+
_common.write_text(out_dir / "content.jsonl", "\n".join(lines))
|
|
95
|
+
|
|
96
|
+
# 2. fitz PDF 경유 PNG + pages.meta.json (페이지↔노드 매핑 best-effort)
|
|
73
97
|
with _common.com_lock(), _common.temp_workdir() as tmp:
|
|
74
98
|
tmp_pdf = tmp / "out.pdf"
|
|
75
99
|
_word_export_pdf(input_path, tmp_pdf)
|
|
76
100
|
_common.mkdir(pages_dir)
|
|
77
|
-
|
|
101
|
+
page_count = _docx_pages_from_pdf(tmp_pdf, pages_dir, out_dir, nodes)
|
|
78
102
|
|
|
79
103
|
attachment_links = _extract_zip_media(
|
|
80
104
|
input_path,
|
|
@@ -88,8 +112,15 @@ def _run_docx(
|
|
|
88
112
|
macro_modules = _extract_macros(_source_path(out_dir, source_name), out_dir)
|
|
89
113
|
|
|
90
114
|
sections: dict[str, list[str]] = {}
|
|
91
|
-
|
|
92
|
-
|
|
115
|
+
summary = (
|
|
116
|
+
f"노드 {counts['nodes']}개 "
|
|
117
|
+
f"(heading {counts['headings']}·para {counts['paragraphs_plain']}·"
|
|
118
|
+
f"bullet {counts['bullets']}·table_cell {counts['table_cells']}·image {counts['images']})"
|
|
119
|
+
)
|
|
120
|
+
content_items = [f"`content.jsonl` — {summary}"]
|
|
121
|
+
if page_count:
|
|
122
|
+
content_items.append(f"`pages.meta.json` — PNG ↔ 노드 매핑 ({page_count}페이지)")
|
|
123
|
+
sections["콘텐츠"] = content_items
|
|
93
124
|
if macro_modules:
|
|
94
125
|
sections[f"VBA 매크로 (총 {len(macro_modules)}개)"] = [f"`macros/{m}`" for m in macro_modules]
|
|
95
126
|
|
|
@@ -97,13 +128,261 @@ def _run_docx(
|
|
|
97
128
|
out_dir,
|
|
98
129
|
source_name=source_name,
|
|
99
130
|
source_size=source_size,
|
|
100
|
-
tool=("COM Word + PyMuPDF + ZIP " + tool_extra).strip(),
|
|
101
|
-
loss_notes=
|
|
131
|
+
tool=("python-docx + COM Word + PyMuPDF + ZIP " + tool_extra).strip(),
|
|
132
|
+
loss_notes=(
|
|
133
|
+
"서식(폰트/색/볼드)·정확한 페이지 레이아웃은 PNG 안에서만 보존. "
|
|
134
|
+
"구조는 content.jsonl 단일 시퀀스(heading/para/bullet/table_cell/image), "
|
|
135
|
+
"PNG↔노드 매핑은 pages.meta.json. 매크로(VBA)는 macros/ 로 별도 추출."
|
|
136
|
+
),
|
|
102
137
|
sections=sections or None,
|
|
103
138
|
attachments=attachment_links,
|
|
104
139
|
)
|
|
105
140
|
|
|
106
141
|
|
|
142
|
+
def _docx_extract_nodes(input_path: Path) -> tuple[list[dict], dict[str, int]]:
    """Flatten the document body into an ordered list of jsonl nodes.

    Only the direct children of the document body are visited, in document
    order. Each ``w:p`` becomes a ``heading``, ``bullet`` or ``para`` node
    (empty paragraphs are kept with ``text=""``), followed by one ``image``
    node per inline image whose relationship id resolves. Each ``w:tbl``
    yields one ``table_cell`` node per distinct cell. Any other body child
    is ignored.

    Args:
        input_path: Path of the .docx file to read.

    Returns:
        ``(nodes, counts)`` where ``nodes`` are dicts numbered consecutively
        through their ``"node"`` key and ``counts`` holds per-type counters;
        ``counts["nodes"]`` equals the total number of nodes.
    """
    from docx import Document
    from docx.oxml.ns import qn
    from docx.table import Table
    from docx.text.paragraph import Paragraph

    IMG_RELTYPE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/image"
    doc = Document(_common.long_str(input_path))

    # Image relationships: rid -> 'images/<basename>'
    img_rels: dict[str, str] = {}
    for rid, rel in doc.part.rels.items():
        if rel.reltype == IMG_RELTYPE:
            try:
                basename = Path(rel.target_ref).name
                img_rels[rid] = f"images/{basename}"
            except Exception:
                # Best effort: a relationship whose target cannot be read is skipped.
                continue

    nodes: list[dict] = []
    counts = {
        "nodes": 0,
        "headings": 0,
        "paragraphs_plain": 0,
        "bullets": 0,
        "tables": 0,
        "table_cells": 0,
        "images": 0,
    }
    table_idx = 0
    node_idx = 0
    p_tag = qn("w:p")
    tbl_tag = qn("w:tbl")

    for elem in doc.element.body.iterchildren():
        tag = elem.tag
        if tag == p_tag:
            para = Paragraph(elem, doc)
            text = para.text or ""
            style_name = para.style.name if para.style else ""
            heading_level = _docx_heading_level(style_name)
            bullet_level = _docx_bullet_level(para)
            image_rids = _docx_inline_image_rids(para)
            hyperlinks = _docx_paragraph_hyperlinks(para, doc)

            node: dict
            if heading_level is not None:
                node = {"node": node_idx, "type": "heading", "level": heading_level, "text": text}
                counts["headings"] += 1
            elif bullet_level is not None:
                node = {"node": node_idx, "type": "bullet", "level": bullet_level, "text": text}
                counts["bullets"] += 1
            else:
                # An empty paragraph is still source information: keep it (text="").
                node = {"node": node_idx, "type": "para", "text": text}
                counts["paragraphs_plain"] += 1

            if hyperlinks:
                node["hyperlinks"] = hyperlinks

            nodes.append(node)
            node_idx += 1

            # Inline images follow the paragraph that contains them.
            for rid in image_rids:
                ref = img_rels.get(rid)
                if ref:
                    nodes.append({"node": node_idx, "type": "image", "ref": ref})
                    counts["images"] += 1
                    node_idx += 1

        elif tag == tbl_tag:
            table_obj = Table(elem, doc)
            table_idx += 1
            counts["tables"] += 1
            # Map id(tc) -> tc. Holding the element itself keeps its lxml proxy
            # alive for the whole table, so an id() can never be recycled for a
            # different cell once the cells of an earlier row are released.
            seen_tc: dict[int, object] = {}
            for r, row in enumerate(table_obj.rows, start=1):
                for c, cell in enumerate(row.cells, start=1):
                    tc = cell._tc
                    if id(tc) in seen_tc:
                        # Same <w:tc> exposed again (e.g. gridSpan repeats the
                        # cell within a row); the origin node carries colspan.
                        continue
                    seen_tc[id(tc)] = tc
                    vm = _docx_cell_vmerge(cell)
                    if vm == "continue":
                        # vMerge continuation: covered by the origin cell above.
                        continue
                    cell_text = (cell.text or "").strip()
                    colspan = _docx_cell_colspan(cell)
                    cell_node = {
                        "node": node_idx,
                        "type": "table_cell",
                        "table_idx": table_idx,
                        "row": r,
                        "col": c,
                        "text": cell_text,
                    }
                    if colspan > 1:
                        cell_node["colspan"] = colspan
                    nodes.append(cell_node)
                    counts["table_cells"] += 1
                    node_idx += 1

    counts["nodes"] = node_idx
    return nodes, counts
|
|
245
|
+
|
|
246
|
+
|
|
247
|
+
def _docx_heading_level(style_name: str) -> Optional[int]:
|
|
248
|
+
"""python-docx 스타일명 → heading level. heading 아니면 None."""
|
|
249
|
+
if not style_name:
|
|
250
|
+
return None
|
|
251
|
+
if style_name.startswith("Heading "):
|
|
252
|
+
try:
|
|
253
|
+
return int(style_name.split(" ")[1])
|
|
254
|
+
except (ValueError, IndexError):
|
|
255
|
+
return None
|
|
256
|
+
if style_name == "Title":
|
|
257
|
+
return 0
|
|
258
|
+
return None
|
|
259
|
+
|
|
260
|
+
|
|
261
|
+
def _docx_bullet_level(para) -> Optional[int]:
    """Return the numbering indent level (``w:ilvl``) of a paragraph.

    Only a ``w:numPr`` that is a direct child of the paragraph's own
    ``w:pPr`` is inspected. Returns ``None`` when there is none, and 0 when
    the level is absent, empty or not an integer.
    """
    from docx.oxml.ns import qn

    para_props = para._element.find(qn("w:pPr"))
    num_props = None if para_props is None else para_props.find(qn("w:numPr"))
    if num_props is None:
        return None

    level_el = num_props.find(qn("w:ilvl"))
    if level_el is None:
        return 0
    try:
        return int(level_el.get(qn("w:val")) or 0)
    except (ValueError, TypeError):
        return 0
|
|
278
|
+
|
|
279
|
+
|
|
280
|
+
# Clark-notation names used to find embedded pictures inside <w:drawing>.
_DRAWING_EMBED_NS = "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed"
_DRAWING_BLIP_TAG = "{http://schemas.openxmlformats.org/drawingml/2006/main}blip"


def _docx_inline_image_rids(para) -> list[str]:
    """Collect the relationship ids of images embedded in a paragraph.

    Looks at every ``blip`` element under every ``w:drawing`` of the
    paragraph and returns its non-empty ``r:embed`` value, in document order.
    """
    from docx.oxml.ns import qn

    embed_values = (
        blip.get(_DRAWING_EMBED_NS)
        for drawing in para._element.iter(qn("w:drawing"))
        for blip in drawing.iter(_DRAWING_BLIP_TAG)
    )
    return [rid for rid in embed_values if rid]
|
|
295
|
+
|
|
296
|
+
|
|
297
|
+
# Clark-notation r:id attribute and the relationship type of external links.
_DOCX_R_ID_NS = "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}id"
_DOCX_HYPERLINK_RELTYPE = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/hyperlink"


def _docx_paragraph_hyperlinks(para, doc) -> list[dict]:
    """List the hyperlinks of a paragraph as ``{"text": ..., "url": ...}``.

    The URL is taken from the document part's relationship referenced by the
    ``r:id`` of each ``w:hyperlink``, and is ``""`` when the id is missing,
    unknown, or not of the hyperlink relationship type. Entries with neither
    text nor URL are dropped.
    """
    from docx.oxml.ns import qn

    part_rels = doc.part.rels
    text_tag = qn("w:t")
    links: list[dict] = []
    for link_el in para._element.iter(qn("w:hyperlink")):
        rel_id = link_el.get(_DOCX_R_ID_NS)
        rel = part_rels[rel_id] if rel_id and rel_id in part_rels else None
        if rel is not None and rel.reltype == _DOCX_HYPERLINK_RELTYPE:
            target = rel.target_ref
        else:
            target = ""
        # The visible label is the concatenation of every w:t under the link.
        label = "".join((t.text or "") for t in link_el.iter(text_tag))
        if label or target:
            links.append({"text": label, "url": target})
    return links
|
|
319
|
+
|
|
320
|
+
|
|
321
|
+
def _docx_cell_colspan(cell) -> int:
    """Return the column span (``w:gridSpan`` value) of a table cell.

    Falls back to 1 when the cell has no ``w:tcPr``, no ``w:gridSpan``, or a
    value that is empty or not an integer.
    """
    from docx.oxml.ns import qn

    cell_props = cell._tc.find(qn("w:tcPr"))
    span_el = None if cell_props is None else cell_props.find(qn("w:gridSpan"))
    if span_el is None:
        return 1

    raw = span_el.get(qn("w:val"))
    if not raw:
        return 1
    try:
        return int(raw)
    except (ValueError, TypeError):
        return 1
|
|
336
|
+
|
|
337
|
+
|
|
338
|
+
def _docx_cell_vmerge(cell) -> Optional[str]:
    """Return the vertical-merge state of a table cell.

    ``None`` when the cell has no ``w:vMerge``; otherwise the element's
    ``w:val``, with a missing or empty value reported as ``"continue"``.
    """
    from docx.oxml.ns import qn

    cell_props = cell._tc.find(qn("w:tcPr"))
    merge_el = None if cell_props is None else cell_props.find(qn("w:vMerge"))
    if merge_el is None:
        return None
    # A bare <w:vMerge/> (no value) is treated as a continuation.
    return merge_el.get(qn("w:val")) or "continue"
|
|
350
|
+
|
|
351
|
+
|
|
352
|
+
def _docx_pages_from_pdf(
    pdf_path: Path,
    pages_dir: Path,
    out_dir: Path,
    nodes: list[dict],
) -> int:
    """Render each PDF page to PNG and store the per-page raw text.

    Writes ``pages_dir/<NNN>.png`` at 300 dpi for every page and, when the
    PDF has at least one page, ``out_dir/pages.meta.json`` mapping the
    zero-padded page number to ``{"text": ...}``.

    ``nodes`` is accepted but not used: no page-to-node mapping is computed
    here, only the raw page text is kept for later comparison.

    Returns:
        The number of pages processed.
    """
    _common.ensure_pip("fitz", "PyMuPDF")
    import fitz

    page_texts: dict[str, dict] = {}
    pdf = fitz.open(_common.long_str(pdf_path))
    try:
        for page_no, page in enumerate(pdf, start=1):
            key = f"{page_no:03d}"
            image = page.get_pixmap(dpi=300)
            image.save(_common.long_str(pages_dir / f"{key}.png"))
            page_texts[key] = {"text": page.get_text("text") or ""}
    finally:
        pdf.close()

    if not page_texts:
        return 0
    _common.write_text(
        out_dir / "pages.meta.json",
        json.dumps(page_texts, ensure_ascii=False, indent=2),
    )
    return len(page_texts)
|
|
384
|
+
|
|
385
|
+
|
|
107
386
|
# ====================================================================
|
|
108
387
|
# PPTX
|
|
109
388
|
# ====================================================================
|
|
@@ -115,39 +394,56 @@ def _run_pptx(
|
|
|
115
394
|
source_name_override: Optional[str] = None,
|
|
116
395
|
tool_extra: str = "",
|
|
117
396
|
) -> None:
|
|
397
|
+
"""python-pptx 로 구조 추출 → 슬라이드별 jsonl. 시각 순서 정렬 + pos EMU 좌표.
|
|
398
|
+
|
|
399
|
+
노드 type: title·heading·para·bullet·table_cell·image·chart·shape.
|
|
400
|
+
PNG 은 COM PowerPoint 의 Slide.Export 로 슬라이드별 직접 출력.
|
|
401
|
+
"""
|
|
118
402
|
_common.ensure_pip("pptx", "python-pptx")
|
|
119
403
|
from pptx import Presentation
|
|
120
404
|
|
|
121
405
|
slides_dir = out_dir / "slides"
|
|
122
406
|
charts_dir = out_dir / "charts"
|
|
407
|
+
images_dir = out_dir / "images"
|
|
123
408
|
|
|
124
409
|
prs = Presentation(_common.long_str(input_path))
|
|
410
|
+
slide_w = int(prs.slide_width or 0)
|
|
411
|
+
slide_h = int(prs.slide_height or 0)
|
|
412
|
+
|
|
125
413
|
slide_titles: list[tuple[str, str]] = [] # (idx, safe_title)
|
|
126
414
|
slide_summaries: list[str] = []
|
|
127
415
|
slide_has_notes: dict[str, bool] = {}
|
|
128
416
|
slide_charts: dict[str, list[str]] = {} # idx -> chart filenames
|
|
417
|
+
slide_cores: dict[str, str] = {} # idx -> 핵심 텍스트 (title 또는 첫 텍스트)
|
|
129
418
|
|
|
130
419
|
_common.mkdir(slides_dir)
|
|
131
420
|
for i, slide in enumerate(prs.slides, start=1):
|
|
132
421
|
idx = f"{i:02d}"
|
|
133
|
-
title =
|
|
134
|
-
|
|
135
|
-
title = slide.shapes.title.text.strip()
|
|
136
|
-
if not title:
|
|
137
|
-
title = f"슬라이드{i}"
|
|
138
|
-
safe_title = _common.slugify_filename(title, max_len=40)
|
|
422
|
+
title = _pptx_slide_title(slide)
|
|
423
|
+
safe_title = _common.slugify_filename(title or f"슬라이드{i}", max_len=40)
|
|
139
424
|
slide_titles.append((idx, safe_title))
|
|
140
425
|
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
426
|
+
nodes, chart_refs = _pptx_extract_slide_nodes(
|
|
427
|
+
slide, i, charts_dir, images_dir,
|
|
428
|
+
)
|
|
429
|
+
# 원본 XML 순서 (shape_idx 순) 그대로 보존. 시각 순서는 pos 가 보존되어 있어
|
|
430
|
+
# Claude 가 필요시 직접 정렬 가능.
|
|
431
|
+
|
|
432
|
+
meta = {
|
|
433
|
+
"_meta": {
|
|
434
|
+
"slide": i,
|
|
435
|
+
"title": title,
|
|
436
|
+
"size": [slide_w, slide_h],
|
|
437
|
+
"shapes": len(nodes),
|
|
438
|
+
}
|
|
439
|
+
}
|
|
440
|
+
lines = [json.dumps(meta, ensure_ascii=False, default=_json_default)]
|
|
441
|
+
for n in nodes:
|
|
442
|
+
lines.append(json.dumps(n, ensure_ascii=False, default=_json_default))
|
|
443
|
+
_common.write_text(slides_dir / f"{idx}_{safe_title}.jsonl", "\n".join(lines))
|
|
444
|
+
|
|
445
|
+
if chart_refs:
|
|
446
|
+
slide_charts[idx] = chart_refs
|
|
151
447
|
|
|
152
448
|
if slide.has_notes_slide:
|
|
153
449
|
notes_text = slide.notes_slide.notes_text_frame.text or ""
|
|
@@ -158,25 +454,18 @@ def _run_pptx(
|
|
|
158
454
|
)
|
|
159
455
|
slide_has_notes[idx] = True
|
|
160
456
|
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
_common.mkdir(charts_dir)
|
|
165
|
-
chart_filename = f"slide{i:02d}_chart{shape_idx:02d}.data.json"
|
|
166
|
-
_common.write_text(
|
|
167
|
-
charts_dir / chart_filename,
|
|
168
|
-
json.dumps(data, ensure_ascii=False, indent=2),
|
|
169
|
-
)
|
|
170
|
-
slide_charts.setdefault(idx, []).append(chart_filename)
|
|
457
|
+
core = title or _pptx_first_text(nodes)
|
|
458
|
+
if core:
|
|
459
|
+
slide_cores[idx] = core[:60]
|
|
171
460
|
|
|
172
|
-
|
|
173
|
-
parts = [f"`slides/{idx}_{safe_title}.png`", "`.md`"]
|
|
461
|
+
parts = [f"`slides/{idx}_{safe_title}.png`", "`.jsonl`"]
|
|
174
462
|
if slide_has_notes.get(idx):
|
|
175
463
|
parts.append("`.notes.md`")
|
|
176
|
-
chart_refs = slide_charts.get(idx, [])
|
|
177
464
|
if chart_refs:
|
|
178
465
|
chart_str = ", ".join(f"`charts/{c}`" for c in chart_refs)
|
|
179
466
|
parts.append(f"(차트: {chart_str})")
|
|
467
|
+
if slide_cores.get(idx):
|
|
468
|
+
parts.append(f"— {slide_cores[idx]}")
|
|
180
469
|
slide_summaries.append(" ".join(parts))
|
|
181
470
|
|
|
182
471
|
# COM PowerPoint 의 Slide.Export 로 슬라이드별 PNG 직접 출력. 임시 폴더에서 만든 후 long-path-safe copy.
|
|
@@ -187,7 +476,8 @@ def _run_pptx(
|
|
|
187
476
|
if tmp_png.exists():
|
|
188
477
|
_common.copy(tmp_png, slides_dir / f"{idx}_{safe_title}.png")
|
|
189
478
|
|
|
190
|
-
# pptx 의 시각은 슬라이드 PNG 에 모두 포함 →
|
|
479
|
+
# pptx 의 시각은 슬라이드 PNG 에 모두 포함 → ZIP media 전체 복제 skip
|
|
480
|
+
# (개별 picture shape 은 _pptx_extract_slide_nodes 에서 image ref 와 함께 저장됨).
|
|
191
481
|
attachment_links = _extract_zip_media(
|
|
192
482
|
input_path,
|
|
193
483
|
out_dir,
|
|
@@ -209,12 +499,213 @@ def _run_pptx(
|
|
|
209
499
|
source_name=source_name,
|
|
210
500
|
source_size=source_size,
|
|
211
501
|
tool=("python-pptx + COM PowerPoint + ZIP " + tool_extra).strip(),
|
|
212
|
-
loss_notes=
|
|
502
|
+
loss_notes=(
|
|
503
|
+
"애니메이션·슬라이드 전환·정확한 폰트는 미보존. "
|
|
504
|
+
"시각은 슬라이드별 PNG, 구조는 슬라이드별 .jsonl(시각 순서·pos EMU 좌표), "
|
|
505
|
+
"차트 데이터는 charts/*.data.json, picture shape 의 image 는 images/."
|
|
506
|
+
),
|
|
213
507
|
sections=sections or None,
|
|
214
508
|
attachments=attachment_links,
|
|
215
509
|
)
|
|
216
510
|
|
|
217
511
|
|
|
512
|
+
def _pptx_slide_title(slide) -> str:
|
|
513
|
+
"""슬라이드 title placeholder 텍스트. 없으면 빈 문자열."""
|
|
514
|
+
try:
|
|
515
|
+
title_shape = slide.shapes.title
|
|
516
|
+
if title_shape is not None and title_shape.text:
|
|
517
|
+
return title_shape.text.strip()
|
|
518
|
+
except (AttributeError, ValueError):
|
|
519
|
+
pass
|
|
520
|
+
return ""
|
|
521
|
+
|
|
522
|
+
|
|
523
|
+
def _pptx_first_text(nodes: list[dict]) -> str:
|
|
524
|
+
"""노드 리스트 중 첫 비어있지 않은 text. 없으면 빈 문자열."""
|
|
525
|
+
for n in nodes:
|
|
526
|
+
t = (n.get("text") or "").strip()
|
|
527
|
+
if t:
|
|
528
|
+
return t
|
|
529
|
+
return ""
|
|
530
|
+
|
|
531
|
+
|
|
532
|
+
def _pptx_extract_slide_nodes(
    slide,
    slide_num: int,
    charts_dir: Path,
    images_dir: Path,
) -> tuple[list[dict], list[str]]:
    """Turn the shapes of one slide into jsonl nodes.

    Shapes are visited in ``slide.shapes`` order and classified by the first
    matching rule:

    * table    -> one ``table_cell`` node per cell;
    * chart    -> one ``chart`` node; the chart data is written to
      ``charts_dir`` and referenced through ``ref`` only when it could be
      extracted;
    * picture  -> one ``image`` node, with ``ref`` when the image was saved;
    * text     -> one node per paragraph: ``title`` (first paragraph of the
      title shape), ``bullet`` (indent level > 0) or ``para``;
    * anything else -> one ``shape`` node carrying ``str(shape.shape_type)``.

    Every node also carries ``slide``, ``pos`` and the 0-based ``shape_idx``.

    Args:
        slide: The slide object to read.
        slide_num: Slide number stored in each node and used in file names.
        charts_dir: Folder for ``slideNN_chartMM.data.json`` files; created
            on first use.
        images_dir: Folder passed on to the picture saver.

    Returns:
        ``(nodes, chart_refs)`` where ``chart_refs`` lists the chart data
        file names that were actually written.
    """
    nodes: list[dict] = []
    chart_refs: list[str] = []

    try:
        title_shape = slide.shapes.title
    except (AttributeError, ValueError):
        title_shape = None

    for shape_idx, shape in enumerate(slide.shapes):
        common = {
            "slide": slide_num,
            "pos": _pptx_shape_pos(shape),
            "shape_idx": shape_idx,
        }

        # Table
        if getattr(shape, "has_table", False):
            try:
                table = shape.table
            except Exception:
                table = None
            if table is not None:
                table_idx = shape_idx + 1
                for r_idx, row in enumerate(table.rows, start=1):
                    for c_idx, cell in enumerate(row.cells, start=1):
                        nodes.append({
                            **common,
                            "type": "table_cell",
                            "table_idx": table_idx,
                            "row": r_idx,
                            "col": c_idx,
                            "text": (cell.text or "").strip(),
                        })
                continue
            # Unreadable table: fall through so the shape is still recorded.

        # Chart
        if getattr(shape, "has_chart", False):
            try:
                data = _extract_pptx_chart_data(shape.chart)
            except Exception:
                # Best effort: the chart is still listed, just without data.
                data = None
            chart_node = {**common, "type": "chart"}
            if data is not None:
                chart_filename = f"slide{slide_num:02d}_chart{shape_idx + 1:02d}.data.json"
                _common.mkdir(charts_dir)
                _common.write_text(
                    charts_dir / chart_filename,
                    json.dumps(data, ensure_ascii=False, indent=2),
                )
                chart_refs.append(chart_filename)
                # Reference the data file only when it really exists.
                chart_node["ref"] = f"charts/{chart_filename}"
            nodes.append(chart_node)
            continue

        # Picture
        if _pptx_is_picture(shape):
            ref = _pptx_save_picture(shape, slide_num, shape_idx, images_dir)
            node = {**common, "type": "image"}
            if ref:
                node["ref"] = ref
            nodes.append(node)
            continue

        # Shapes with a text frame (placeholder, text box, autoshape with text)
        if getattr(shape, "has_text_frame", False):
            is_title = title_shape is not None and shape == title_shape
            for p_idx, para in enumerate(shape.text_frame.paragraphs):
                text = "".join(run.text for run in para.runs)
                hyperlinks = _pptx_run_hyperlinks(para)
                bullet_lvl = getattr(para, "level", 0) or 0

                base_node: dict
                if is_title and p_idx == 0:
                    base_node = {**common, "type": "title", "para_idx": p_idx, "text": text}
                elif bullet_lvl > 0:
                    base_node = {**common, "type": "bullet", "para_idx": p_idx,
                                 "level": bullet_lvl, "text": text}
                else:
                    base_node = {**common, "type": "para", "para_idx": p_idx, "text": text}
                if hyperlinks:
                    base_node["hyperlinks"] = hyperlinks
                nodes.append(base_node)
            continue

        # Everything else (group, SmartArt, connector, plain autoshape, ...)
        subtype = ""
        try:
            subtype = str(shape.shape_type)
        except Exception:
            pass
        nodes.append({
            **common,
            "type": "shape",
            "subtype": subtype,
        })

    return nodes, chart_refs
|
|
646
|
+
|
|
647
|
+
|
|
648
|
+
def _pptx_shape_pos(shape) -> list[int]:
|
|
649
|
+
"""shape 의 [left, top, width, height] EMU. 누락 시 0."""
|
|
650
|
+
try:
|
|
651
|
+
return [
|
|
652
|
+
int(shape.left or 0),
|
|
653
|
+
int(shape.top or 0),
|
|
654
|
+
int(shape.width or 0),
|
|
655
|
+
int(shape.height or 0),
|
|
656
|
+
]
|
|
657
|
+
except (AttributeError, TypeError, ValueError):
|
|
658
|
+
return [0, 0, 0, 0]
|
|
659
|
+
|
|
660
|
+
|
|
661
|
+
def _pptx_is_picture(shape) -> bool:
|
|
662
|
+
"""python-pptx shape 이 picture 인지. shape_type 또는 image 속성으로 판별."""
|
|
663
|
+
try:
|
|
664
|
+
from pptx.enum.shapes import MSO_SHAPE_TYPE
|
|
665
|
+
if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
|
|
666
|
+
return True
|
|
667
|
+
except Exception:
|
|
668
|
+
pass
|
|
669
|
+
# placeholder picture 인 경우 shape_type 이 PLACEHOLDER 라 image 속성으로 보완
|
|
670
|
+
try:
|
|
671
|
+
_ = shape.image
|
|
672
|
+
return True
|
|
673
|
+
except Exception:
|
|
674
|
+
return False
|
|
675
|
+
|
|
676
|
+
|
|
677
|
+
def _pptx_run_hyperlinks(para) -> list[dict]:
|
|
678
|
+
"""pptx paragraph 안 run 별 hyperlink list. 텍스트·URL."""
|
|
679
|
+
result: list[dict] = []
|
|
680
|
+
for run in para.runs:
|
|
681
|
+
try:
|
|
682
|
+
hl = run.hyperlink
|
|
683
|
+
url = getattr(hl, "address", None)
|
|
684
|
+
except Exception:
|
|
685
|
+
url = None
|
|
686
|
+
if url:
|
|
687
|
+
result.append({"text": run.text or "", "url": url})
|
|
688
|
+
return result
|
|
689
|
+
|
|
690
|
+
|
|
691
|
+
def _pptx_save_picture(
|
|
692
|
+
shape, slide_num: int, shape_idx: int, images_dir: Path,
|
|
693
|
+
) -> Optional[str]:
|
|
694
|
+
"""shape.image.blob 을 images/ 에 저장하고 ref(상대경로) 반환. 실패 시 None."""
|
|
695
|
+
try:
|
|
696
|
+
img = shape.image
|
|
697
|
+
ext = (img.ext or "bin").lstrip(".")
|
|
698
|
+
blob = img.blob
|
|
699
|
+
except Exception:
|
|
700
|
+
return None
|
|
701
|
+
if not blob:
|
|
702
|
+
return None
|
|
703
|
+
_common.mkdir(images_dir)
|
|
704
|
+
filename = f"slide{slide_num:02d}_shape{shape_idx + 1:02d}.{ext}"
|
|
705
|
+
_common.write_bytes(images_dir / filename, blob)
|
|
706
|
+
return f"images/{filename}"
|
|
707
|
+
|
|
708
|
+
|
|
218
709
|
# ====================================================================
|
|
219
710
|
# XLSX
|
|
220
711
|
# ====================================================================
|
|
@@ -254,9 +745,10 @@ def _run_xlsx(
|
|
|
254
745
|
sheet_names.append((idx, safe_name, name))
|
|
255
746
|
|
|
256
747
|
# COM Excel 호출: 데이터 영역 → ChartObject + Range.CopyPicture → 시트별 PNG.
|
|
257
|
-
# 시트별 (last_row, last_col) 도 같이 반환되어 .
|
|
748
|
+
# 시트별 (last_row, last_col) 도 같이 반환되어 .jsonl 이 같은 데이터 영역으로 통일됨.
|
|
749
|
+
# PNG export 실패한 시트는 sheet_png_skipped 에 사유 (silent skip 금지).
|
|
258
750
|
with _common.com_lock():
|
|
259
|
-
sheet_ranges = _excel_export_sheet_pngs(input_path, sheets_dir, sheet_names)
|
|
751
|
+
sheet_ranges, sheet_png_skipped = _excel_export_sheet_pngs(input_path, sheets_dir, sheet_names)
|
|
260
752
|
|
|
261
753
|
for idx, safe_name, raw_name in sheet_names:
|
|
262
754
|
ws_v = wb_values[raw_name]
|
|
@@ -266,24 +758,9 @@ def _run_xlsx(
|
|
|
266
758
|
last_row, last_col = sheet_ranges.get(raw_name, (ws_v.max_row, ws_v.max_column))
|
|
267
759
|
sheet_dims[idx] = (last_row, last_col)
|
|
268
760
|
|
|
269
|
-
|
|
270
|
-
_common.write_text(sheets_dir / f"{idx}_{safe_name}.
|
|
271
|
-
|
|
272
|
-
formulas: dict[str, str] = {}
|
|
273
|
-
if last_row >= 1 and last_col >= 1:
|
|
274
|
-
for row in ws_f.iter_rows(min_row=1, max_row=last_row, min_col=1, max_col=last_col):
|
|
275
|
-
for cell in row:
|
|
276
|
-
if cell.data_type != "f":
|
|
277
|
-
continue
|
|
278
|
-
v = cell.value
|
|
279
|
-
# 일반·shared formula 는 str, array formula 는 ArrayFormula(.text 보유)
|
|
280
|
-
formulas[cell.coordinate] = v if isinstance(v, str) else getattr(v, "text", str(v))
|
|
281
|
-
if formulas:
|
|
282
|
-
_common.write_text(
|
|
283
|
-
sheets_dir / f"{idx}_{safe_name}.formulas.json",
|
|
284
|
-
json.dumps(formulas, ensure_ascii=False, indent=2),
|
|
285
|
-
)
|
|
286
|
-
sheet_formula_count[idx] = len(formulas)
|
|
761
|
+
jsonl_lines, formula_n = _sheet_to_jsonl(ws_v, ws_f, last_row, last_col)
|
|
762
|
+
_common.write_text(sheets_dir / f"{idx}_{safe_name}.jsonl", "\n".join(jsonl_lines))
|
|
763
|
+
sheet_formula_count[idx] = formula_n
|
|
287
764
|
|
|
288
765
|
for chart_idx, chart in enumerate(getattr(ws_f, "_charts", []), start=1):
|
|
289
766
|
data = _extract_openpyxl_chart_data(chart)
|
|
@@ -294,6 +771,21 @@ def _run_xlsx(
|
|
|
294
771
|
json.dumps(data, ensure_ascii=False, indent=2),
|
|
295
772
|
)
|
|
296
773
|
sheet_charts.setdefault(idx, []).append(chart_filename)
|
|
774
|
+
|
|
775
|
+
# 워크북 단위 메타 (defined names 등) — 시트 jsonl 외부 분리.
|
|
776
|
+
wb_meta = _workbook_meta(wb_formulas)
|
|
777
|
+
if wb_meta:
|
|
778
|
+
_common.write_text(
|
|
779
|
+
out_dir / "workbook.meta.json",
|
|
780
|
+
json.dumps(wb_meta, ensure_ascii=False, indent=2),
|
|
781
|
+
)
|
|
782
|
+
|
|
783
|
+
# VBA 시트 객체명 ↔ raw 시트명 매핑 (시트 codeName 기반)
|
|
784
|
+
sheet_code_map: dict[str, str] = {}
|
|
785
|
+
for ws in wb_formulas.worksheets:
|
|
786
|
+
code = getattr(ws.sheet_properties, "codeName", None)
|
|
787
|
+
if code:
|
|
788
|
+
sheet_code_map[code] = ws.title
|
|
297
789
|
finally:
|
|
298
790
|
wb_values.close()
|
|
299
791
|
wb_formulas.close()
|
|
@@ -312,9 +804,13 @@ def _run_xlsx(
|
|
|
312
804
|
for idx, safe_name, raw_name in sheet_names:
|
|
313
805
|
last_row, last_col = sheet_dims.get(idx, (0, 0))
|
|
314
806
|
formula_n = sheet_formula_count.get(idx, 0)
|
|
315
|
-
|
|
316
|
-
if
|
|
317
|
-
parts.
|
|
807
|
+
png_path = sheets_dir / f"{idx}_{safe_name}.png"
|
|
808
|
+
if png_path.exists():
|
|
809
|
+
parts = [f"`sheets/{idx}_{safe_name}.png`", "`.jsonl`"]
|
|
810
|
+
else:
|
|
811
|
+
# PNG 미생성 — worker 가 사유 전달 (16-bit cap / COM 실패 등)
|
|
812
|
+
reason = sheet_png_skipped.get(raw_name, "사유 미상")
|
|
813
|
+
parts = [f"`sheets/{idx}_{safe_name}.jsonl`", f"(PNG 미생성 — {reason})"]
|
|
318
814
|
chart_refs = sheet_charts.get(idx, [])
|
|
319
815
|
if chart_refs:
|
|
320
816
|
parts.append("(차트: " + ", ".join(f"`charts/{c}`" for c in chart_refs) + ")")
|
|
@@ -328,7 +824,9 @@ def _run_xlsx(
|
|
|
328
824
|
sheet_summaries.append(" ".join(parts) + " " + meta)
|
|
329
825
|
|
|
330
826
|
source_name, source_size = _source_meta(input_path, out_dir, source_name_override)
|
|
331
|
-
macro_modules = _extract_macros(
|
|
827
|
+
macro_modules = _extract_macros(
|
|
828
|
+
_source_path(out_dir, source_name), out_dir, sheet_code_map=sheet_code_map,
|
|
829
|
+
)
|
|
332
830
|
|
|
333
831
|
sections: dict[str, list[str]] = {}
|
|
334
832
|
if sheet_summaries:
|
|
@@ -341,7 +839,11 @@ def _run_xlsx(
|
|
|
341
839
|
source_name=source_name,
|
|
342
840
|
source_size=source_size,
|
|
343
841
|
tool=("openpyxl + COM Excel + ZIP " + tool_extra).strip(),
|
|
344
|
-
loss_notes=
|
|
842
|
+
loss_notes=(
|
|
843
|
+
"셀 서식·조건부 서식·데이터 검증 규칙은 미보존. "
|
|
844
|
+
"시각은 시트별 PNG, 데이터·수식·시트 메타는 시트별 .jsonl 한 줄=한 행(좌표 명시), "
|
|
845
|
+
"워크북 단위 메타(defined names 등)는 workbook.meta.json."
|
|
846
|
+
),
|
|
345
847
|
sections=sections or None,
|
|
346
848
|
attachments=attachment_links,
|
|
347
849
|
)
|
|
@@ -396,11 +898,13 @@ def _excel_export_sheet_pngs(
|
|
|
396
898
|
input_path: Path,
|
|
397
899
|
sheets_dir: Path,
|
|
398
900
|
sheet_names: list[tuple[str, str, str]],
|
|
399
|
-
) -> dict[str, tuple[int, int]]:
|
|
400
|
-
"""시트별 PNG 생성 + (last_row, last_col) 매핑 반환.
|
|
901
|
+
) -> tuple[dict[str, tuple[int, int]], dict[str, str]]:
|
|
902
|
+
"""시트별 PNG 생성 + (last_row, last_col) 매핑 + skipped 사유 반환.
|
|
401
903
|
|
|
402
904
|
호출자에서 sheetProtection strip 사본 만들고 worker 에 그 사본 path 만 넘김.
|
|
403
905
|
Excel COM 자체 작업은 worker subprocess.
|
|
906
|
+
|
|
907
|
+
반환: (sheet_ranges, skipped) — skipped 는 PNG export 실패한 시트의 사유 dict (raw_name → reason).
|
|
404
908
|
"""
|
|
405
909
|
with _common.temp_workdir() as tmp:
|
|
406
910
|
unprotected = tmp / "_unprotected.xlsx"
|
|
@@ -409,8 +913,13 @@ def _excel_export_sheet_pngs(
|
|
|
409
913
|
"excel_sheets", str(unprotected), str(sheets_dir), json.dumps(sheet_names),
|
|
410
914
|
timeout=600, capture_stdout=True,
|
|
411
915
|
)
|
|
412
|
-
|
|
413
|
-
|
|
916
|
+
if not result.strip():
|
|
917
|
+
return {}, {}
|
|
918
|
+
parsed = json.loads(result)
|
|
919
|
+
ranges_raw = parsed.get("sheet_ranges", {})
|
|
920
|
+
sheet_ranges = {k: tuple(v) for k, v in ranges_raw.items()}
|
|
921
|
+
skipped = parsed.get("skipped", {})
|
|
922
|
+
return sheet_ranges, skipped
|
|
414
923
|
|
|
415
924
|
|
|
416
925
|
def _xlsx_strip_protection(src: Path, dst: Path) -> None:
|
|
@@ -493,11 +1002,18 @@ def _source_path(out_dir: Path, source_name: str) -> Path:
|
|
|
493
1002
|
return out_dir / f"_source.{ext}"
|
|
494
1003
|
|
|
495
1004
|
|
|
496
|
-
def _extract_macros(
|
|
1005
|
+
def _extract_macros(
|
|
1006
|
+
input_path: Path,
|
|
1007
|
+
out_dir: Path,
|
|
1008
|
+
sheet_code_map: Optional[dict[str, str]] = None,
|
|
1009
|
+
) -> list[str]:
|
|
497
1010
|
"""OLE/OOXML 파일에서 VBA 매크로 추출. macros/<모듈명>.vba 로 저장.
|
|
498
1011
|
|
|
499
1012
|
추출된 모듈 파일명 list 반환 (예: ["Module1.vba", "ThisWorkbook.vba"]).
|
|
500
1013
|
매크로 없으면 빈 list.
|
|
1014
|
+
|
|
1015
|
+
sheet_code_map: VBA 시트 객체 codeName → raw 시트명 (예: {"Sheet1": "BOA"}).
|
|
1016
|
+
매크로 파일 첫 줄에 코멘트로 매핑 정보 prepend (시트 모듈만).
|
|
501
1017
|
"""
|
|
502
1018
|
_common.ensure_pip("oletools")
|
|
503
1019
|
from oletools.olevba import VBA_Parser
|
|
@@ -512,8 +1028,11 @@ def _extract_macros(input_path: Path, out_dir: Path) -> list[str]:
|
|
|
512
1028
|
for (_filename, stream_path, vba_filename, vba_code) in parser.extract_macros():
|
|
513
1029
|
module_name = vba_filename or stream_path or "module"
|
|
514
1030
|
stem = Path(module_name).stem or "module"
|
|
1031
|
+
prefix = ""
|
|
1032
|
+
if sheet_code_map and stem in sheet_code_map:
|
|
1033
|
+
prefix = f'\' (object: {stem}, sheet: "{sheet_code_map[stem]}")\n\n'
|
|
515
1034
|
dst = _common.unique_path(macros_dir, f"{stem}.vba")
|
|
516
|
-
_common.write_text(dst, vba_code or "")
|
|
1035
|
+
_common.write_text(dst, prefix + (vba_code or ""))
|
|
517
1036
|
module_files.append(dst.name)
|
|
518
1037
|
return module_files
|
|
519
1038
|
finally:
|
|
@@ -617,40 +1136,112 @@ def _extract_zip_media(
|
|
|
617
1136
|
dst = _common.unique_path(attachments_dir, base)
|
|
618
1137
|
with zf.open(info) as f:
|
|
619
1138
|
_common.write_bytes(dst, f.read())
|
|
1139
|
+
size = dst.stat().st_size
|
|
620
1140
|
recursed = maybe_recurse_attachment(dst, attachments_dir)
|
|
621
1141
|
if recursed is not None:
|
|
622
1142
|
os.unlink(_common.long_str(dst))
|
|
623
|
-
attachment_links.append(f"attachments/{recursed.name}/")
|
|
1143
|
+
attachment_links.append(f"attachments/{recursed.name}/ ({_common.format_size(size)})")
|
|
624
1144
|
else:
|
|
625
|
-
attachment_links.append(f"attachments/{dst.name}")
|
|
1145
|
+
attachment_links.append(f"attachments/{dst.name} ({_common.format_size(size)})")
|
|
626
1146
|
return attachment_links
|
|
627
1147
|
|
|
628
1148
|
|
|
629
|
-
def
|
|
630
|
-
"""openpyxl
|
|
1149
|
+
def _json_default(obj: Any) -> str:
|
|
1150
|
+
"""JSON 직렬화 fallback. openpyxl datetime → ISO 8601. 그 외는 throw."""
|
|
1151
|
+
if isinstance(obj, (datetime, date, time)):
|
|
1152
|
+
return obj.isoformat()
|
|
1153
|
+
raise TypeError(f"not JSON serializable: {type(obj).__name__}")
|
|
1154
|
+
|
|
1155
|
+
|
|
1156
|
+
def _sheet_to_jsonl(ws_v, ws_f, last_row: int, last_col: int) -> tuple[list[str], int]:
    """Render the (1,1)..(last_row,last_col) area of a worksheet as JSONL lines.

    ``ws_v`` supplies cell values and the sheet-level metadata; ``ws_f`` is
    walked in lockstep only to find formula cells (``data_type == "f"``).

    Output layout:
      * first line: ``{"_meta": {...}}`` holding ``dims`` plus, when
        non-empty, ``merges``, ``frozen``, ``hyperlinks``, ``comments`` and
        ``number_formats`` (formats other than ``"General"``).
      * then one line per row: ``{"r": <1-based row>, "<column letter>": value}``
        with ``None`` cells omitted and an optional ``"_f"`` map of
        column letter -> formula text. A row without values still yields
        ``{"r": N}``.

    Values JSON cannot encode natively are passed through ``_json_default``.

    Returns ``(lines, formula_count)``.
    """
    from openpyxl.utils import get_column_letter

    if last_row < 1 or last_col < 1:
        # Nothing to export: only a meta line with zero dimensions.
        return [json.dumps({"_meta": {"dims": [0, 0]}}, ensure_ascii=False)], 0

    area = {"min_row": 1, "max_row": last_row, "min_col": 1, "max_col": last_col}

    # Sheet-level metadata: merged ranges and freeze pane.
    sheet_meta: dict[str, Any] = {"dims": [last_row, last_col]}
    merged = [str(rng) for rng in ws_v.merged_cells.ranges]
    if merged:
        sheet_meta["merges"] = merged
    freeze = ws_v.freeze_panes
    if freeze:
        sheet_meta["frozen"] = freeze

    # Cell-level metadata, keyed by cell coordinate.
    links: dict[str, str] = {}
    notes: dict[str, str] = {}
    formats: dict[str, str] = {}  # display formats other than the default "General"
    for cells in ws_v.iter_rows(**area):
        for cell in cells:
            link = getattr(cell, "hyperlink", None)
            if link is not None and getattr(link, "target", None):
                links[cell.coordinate] = link.target
            note = getattr(cell, "comment", None)
            if note is not None and getattr(note, "text", None):
                notes[cell.coordinate] = note.text
            fmt = getattr(cell, "number_format", None)
            if fmt and fmt != "General":
                formats[cell.coordinate] = fmt
    for key, collected in (
        ("hyperlinks", links),
        ("comments", notes),
        ("number_formats", formats),
    ):
        if collected:
            sheet_meta[key] = collected

    out: list[str] = [
        json.dumps({"_meta": sheet_meta}, ensure_ascii=False, default=_json_default)
    ]
    total_formulas = 0

    value_rows = ws_v.iter_rows(values_only=True, **area)
    formula_rows = ws_f.iter_rows(**area)
    for row_no, (values, fcells) in enumerate(zip(value_rows, formula_rows), start=1):
        record: dict[str, Any] = {"r": row_no}
        formulas: dict[str, str] = {}
        for col_no, (value, fcell) in enumerate(zip(values, fcells), start=1):
            letter = get_column_letter(col_no)
            if value is not None:
                record[letter] = value
            if fcell.data_type != "f":
                continue
            raw = fcell.value
            # Plain/shared formulas are str; array formulas are objects exposing .text
            formulas[letter] = raw if isinstance(raw, str) else getattr(raw, "text", str(raw))
        if formulas:
            record["_f"] = formulas
            total_formulas += len(formulas)
        out.append(json.dumps(record, ensure_ascii=False, default=_json_default))

    return out, total_formulas
|
|
1224
|
+
|
|
1225
|
+
|
|
1226
|
+
def _workbook_meta(wb) -> dict[str, Any]:
|
|
1227
|
+
"""워크북 단위 메타 (defined names 등). 비어있으면 빈 dict 반환."""
|
|
1228
|
+
meta: dict[str, Any] = {}
|
|
1229
|
+
defined_names: dict[str, list[str]] = {}
|
|
1230
|
+
# openpyxl 3.x: wb.defined_names 는 DefinedNameDict (dict-like)
|
|
1231
|
+
try:
|
|
1232
|
+
for name, dn in wb.defined_names.items():
|
|
1233
|
+
try:
|
|
1234
|
+
dests = [f"'{sheet}'!{addr}" for sheet, addr in dn.destinations]
|
|
1235
|
+
except Exception:
|
|
1236
|
+
# destinations 파싱 불가 시 raw value 보존 (예: 워크북-수식 형태)
|
|
1237
|
+
dests = [str(getattr(dn, "value", ""))]
|
|
1238
|
+
defined_names[name] = dests
|
|
1239
|
+
except Exception:
|
|
1240
|
+
# defined_names 자체 접근 실패 → 워크북에 없는 것으로 처리
|
|
1241
|
+
pass
|
|
1242
|
+
if defined_names:
|
|
1243
|
+
meta["defined_names"] = defined_names
|
|
1244
|
+
return meta
|
|
654
1245
|
|
|
655
1246
|
|
|
656
1247
|
def _extract_pptx_chart_data(chart) -> dict:
|