@simplysm/sd-claude 14.0.47 → 14.0.49
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/{claude/references/sd-simplysm14/sd-claude/usage.md → README.md} +2 -2
- package/claude/rules/sd-claude-rules.md +25 -10
- package/claude/rules/sd-options.md +11 -6
- package/claude/sd-subagent-start.sh +6 -0
- package/claude/settings.json +1 -12
- package/claude/skills/sd-check/SKILL.md +43 -12
- package/claude/skills/sd-claude-docs/SKILL.md +30 -58
- package/claude/skills/sd-claude-docs/references/package-claudemd.md +12 -0
- package/claude/skills/sd-claude-docs/references/package-doc-gen.md +26 -13
- package/claude/skills/sd-commit/SKILL.md +1 -1
- package/claude/skills/sd-debug/SKILL.md +5 -3
- package/claude/skills/sd-deliverable/SKILL.md +1 -1
- package/claude/skills/sd-dev/SKILL.md +14 -9
- package/claude/skills/sd-doc-extract/SKILL.md +8 -10
- package/claude/skills/sd-doc-extract/_common.py +8 -1
- package/claude/skills/sd-doc-extract/_extract_docx.py +74 -34
- package/claude/skills/sd-doc-extract/_extract_pdf.py +12 -1
- package/claude/skills/sd-doc-extract/_extract_pptx.py +103 -23
- package/claude/skills/sd-doc-extract/_extract_xlsb.py +93 -4
- package/claude/skills/sd-doc-extract/_extract_xlsx.py +98 -36
- package/claude/skills/sd-doc-extract/extract.py +22 -3
- package/claude/skills/sd-inner-clarify/SKILL.md +78 -0
- package/claude/skills/sd-inner-debug/SKILL.md +1 -1
- package/claude/skills/sd-inner-review/SKILL.md +13 -0
- package/claude/skills/sd-issue/SKILL.md +1 -1
- package/claude/skills/sd-outlook/SKILL.md +1 -1
- package/claude/skills/sd-plan/SKILL.md +50 -17
- package/claude/skills/sd-prompt/SKILL.md +180 -178
- package/claude/skills/sd-prompt/references/eval-runner.md +5 -30
- package/claude/skills/sd-prompt/references/sd-eval-env-template.md +23 -0
- package/claude/skills/sd-refactor/SKILL.md +2 -2
- package/claude/skills/sd-tdd/SKILL.md +45 -16
- package/claude/skills/sd-use/SKILL.md +84 -80
- package/claude/skills/sd-wbs/SKILL.md +84 -27
- package/{claude/references/sd-simplysm14/sd-claude/docs → docs}/assets.md +2 -3
- package/{claude/references/sd-simplysm14/sd-claude/docs → docs}/hooks.md +7 -6
- package/{claude/references/sd-simplysm14/sd-claude/docs → docs}/scripts.md +1 -9
- package/package.json +3 -2
- package/scripts/sync.mjs +4 -2
- package/claude/references/sd-simplysm14/angular/docs/bootstrap.md +0 -48
- package/claude/references/sd-simplysm14/angular/docs/directives.md +0 -236
- package/claude/references/sd-simplysm14/angular/docs/features.md +0 -379
- package/claude/references/sd-simplysm14/angular/docs/pipes.md +0 -32
- package/claude/references/sd-simplysm14/angular/docs/plugins.md +0 -37
- package/claude/references/sd-simplysm14/angular/docs/provider-types.md +0 -283
- package/claude/references/sd-simplysm14/angular/docs/providers.md +0 -379
- package/claude/references/sd-simplysm14/angular/docs/styling.md +0 -222
- package/claude/references/sd-simplysm14/angular/docs/type-utilities.md +0 -250
- package/claude/references/sd-simplysm14/angular/docs/ui-data.md +0 -275
- package/claude/references/sd-simplysm14/angular/docs/ui-form.md +0 -490
- package/claude/references/sd-simplysm14/angular/docs/ui-layout.md +0 -140
- package/claude/references/sd-simplysm14/angular/docs/ui-navigation.md +0 -273
- package/claude/references/sd-simplysm14/angular/docs/ui-overlay.md +0 -157
- package/claude/references/sd-simplysm14/angular/docs/ui-visual.md +0 -127
- package/claude/references/sd-simplysm14/angular/docs/utils.md +0 -295
- package/claude/references/sd-simplysm14/angular/usage.md +0 -489
- package/claude/references/sd-simplysm14/capacitor-plugin-auto-update/usage.md +0 -182
- package/claude/references/sd-simplysm14/capacitor-plugin-file-system/docs/file-operations.md +0 -154
- package/claude/references/sd-simplysm14/capacitor-plugin-file-system/docs/permissions.md +0 -84
- package/claude/references/sd-simplysm14/capacitor-plugin-file-system/docs/storage-paths.md +0 -107
- package/claude/references/sd-simplysm14/capacitor-plugin-file-system/docs/types.md +0 -83
- package/claude/references/sd-simplysm14/capacitor-plugin-file-system/usage.md +0 -133
- package/claude/references/sd-simplysm14/capacitor-plugin-intent/usage.md +0 -203
- package/claude/references/sd-simplysm14/capacitor-plugin-usb-storage/usage.md +0 -258
- package/claude/references/sd-simplysm14/core-browser/usage.md +0 -306
- package/claude/references/sd-simplysm14/core-common/docs/errors.md +0 -82
- package/claude/references/sd-simplysm14/core-common/docs/extensions.md +0 -167
- package/claude/references/sd-simplysm14/core-common/docs/features.md +0 -136
- package/claude/references/sd-simplysm14/core-common/docs/types.md +0 -245
- package/claude/references/sd-simplysm14/core-common/docs/utils.md +0 -591
- package/claude/references/sd-simplysm14/core-common/usage.md +0 -255
- package/claude/references/sd-simplysm14/core-node/docs/child-process.md +0 -182
- package/claude/references/sd-simplysm14/core-node/docs/features.md +0 -214
- package/claude/references/sd-simplysm14/core-node/docs/file-system.md +0 -509
- package/claude/references/sd-simplysm14/core-node/docs/file-watching.md +0 -139
- package/claude/references/sd-simplysm14/core-node/docs/logging.md +0 -180
- package/claude/references/sd-simplysm14/core-node/docs/path.md +0 -176
- package/claude/references/sd-simplysm14/core-node/docs/utilities-cpx.md +0 -194
- package/claude/references/sd-simplysm14/core-node/docs/utilities-fsx.md +0 -469
- package/claude/references/sd-simplysm14/core-node/docs/utilities-pathx.md +0 -151
- package/claude/references/sd-simplysm14/core-node/docs/worker-threads.md +0 -334
- package/claude/references/sd-simplysm14/core-node/docs/worker.md +0 -205
- package/claude/references/sd-simplysm14/core-node/usage.md +0 -259
- package/claude/references/sd-simplysm14/excel/docs/core-classes.md +0 -453
- package/claude/references/sd-simplysm14/excel/docs/types.md +0 -459
- package/claude/references/sd-simplysm14/excel/docs/utilities.md +0 -194
- package/claude/references/sd-simplysm14/excel/docs/wrapper.md +0 -73
- package/claude/references/sd-simplysm14/excel/usage.md +0 -134
- package/claude/references/sd-simplysm14/lint/usage.md +0 -130
- package/claude/references/sd-simplysm14/orm-common/docs/core.md +0 -188
- package/claude/references/sd-simplysm14/orm-common/docs/expression.md +0 -190
- package/claude/references/sd-simplysm14/orm-common/docs/models.md +0 -17
- package/claude/references/sd-simplysm14/orm-common/docs/query-builder.md +0 -97
- package/claude/references/sd-simplysm14/orm-common/docs/queryable-executable.md +0 -250
- package/claude/references/sd-simplysm14/orm-common/docs/schema-builders.md +0 -364
- package/claude/references/sd-simplysm14/orm-common/docs/types.md +0 -522
- package/claude/references/sd-simplysm14/orm-common/usage.md +0 -229
- package/claude/references/sd-simplysm14/orm-node/docs/connections.md +0 -137
- package/claude/references/sd-simplysm14/orm-node/docs/core.md +0 -131
- package/claude/references/sd-simplysm14/orm-node/docs/types.md +0 -173
- package/claude/references/sd-simplysm14/orm-node/usage.md +0 -143
- package/claude/references/sd-simplysm14/sd-cli/usage.md +0 -782
- package/claude/references/sd-simplysm14/service-client/docs/features.md +0 -217
- package/claude/references/sd-simplysm14/service-client/docs/main.md +0 -148
- package/claude/references/sd-simplysm14/service-client/docs/protocol.md +0 -53
- package/claude/references/sd-simplysm14/service-client/docs/transport.md +0 -131
- package/claude/references/sd-simplysm14/service-client/docs/types.md +0 -129
- package/claude/references/sd-simplysm14/service-client/usage.md +0 -202
- package/claude/references/sd-simplysm14/service-common/docs/app-structure.md +0 -175
- package/claude/references/sd-simplysm14/service-common/docs/events.md +0 -64
- package/claude/references/sd-simplysm14/service-common/docs/protocol.md +0 -331
- package/claude/references/sd-simplysm14/service-common/docs/service-types.md +0 -90
- package/claude/references/sd-simplysm14/service-common/docs/types.md +0 -19
- package/claude/references/sd-simplysm14/service-common/usage.md +0 -154
- package/claude/references/sd-simplysm14/service-server/docs/auth.md +0 -64
- package/claude/references/sd-simplysm14/service-server/docs/core.md +0 -174
- package/claude/references/sd-simplysm14/service-server/docs/legacy.md +0 -25
- package/claude/references/sd-simplysm14/service-server/docs/main.md +0 -88
- package/claude/references/sd-simplysm14/service-server/docs/protocol.md +0 -33
- package/claude/references/sd-simplysm14/service-server/docs/services.md +0 -94
- package/claude/references/sd-simplysm14/service-server/docs/transport-http.md +0 -93
- package/claude/references/sd-simplysm14/service-server/docs/transport-socket.md +0 -119
- package/claude/references/sd-simplysm14/service-server/docs/types.md +0 -36
- package/claude/references/sd-simplysm14/service-server/docs/utils.md +0 -22
- package/claude/references/sd-simplysm14/service-server/usage.md +0 -171
- package/claude/references/sd-simplysm14/storage/usage.md +0 -301
- package/claude/references/sd-simplysm14.md +0 -35
- package/claude/rules/sd-clarify.md +0 -23
- package/claude/sd-session-start.sh +0 -10
- /package/{claude/references/sd-simplysm14/sd-claude/docs → docs}/cli.md +0 -0
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
name: sd-doc-extract
|
|
3
3
|
description: 문서 파일(docx, xlsx, xlsb, pptx, pdf, eml, msg)에서 텍스트, 이미지, 임베디드 파일을 추출하는 스킬. "문서 추출", "문서 분해", "docx 분석", "PDF 내용 뽑아줘", "eml 파일 추출" 등을 요청할 때 사용한다.
|
|
4
|
-
model: claude-haiku-4-5
|
|
4
|
+
model: haiku
|
|
5
5
|
---
|
|
6
6
|
|
|
7
7
|
# sd-doc-extract: 문서 분해/추출
|
|
@@ -15,7 +15,7 @@ model: claude-haiku-4-5
|
|
|
15
15
|
| `.docx` | Word |
|
|
16
16
|
| `.xlsx` | Excel |
|
|
17
17
|
| `.xlsb` | Excel (Binary) |
|
|
18
|
-
| `.pptx` | PowerPoint |
|
|
18
|
+
| `.pptx` | PowerPoint (Windows + PowerPoint 설치 필요) |
|
|
19
19
|
| `.pdf` | PDF |
|
|
20
20
|
| `.eml` | Email |
|
|
21
21
|
| `.msg` | Email (Outlook) |
|
|
@@ -57,30 +57,28 @@ python .claude/skills/sd-doc-extract/extract.py "<file_path>"
|
|
|
57
57
|
|
|
58
58
|
| 포맷 | 이미지 배치 | Embedded 배치 |
|
|
59
59
|
|------|-----------|--------------|
|
|
60
|
-
| PPTX |
|
|
60
|
+
| PPTX | 슬라이드당 PNG 렌더링(PowerPoint COM) + `[SLIDE:N]` 삽입, 개별 이미지 분해 없음 | OLE 객체를 만난 슬라이드 내에 `[EMB:N]` 삽입 |
|
|
61
61
|
| DOCX | run 순회 중 drawing/blip을 만나면 그 문단에 `[IMG:N]` 삽입 | OLE 객체를 만난 위치에 `[EMB:N]` 삽입 |
|
|
62
62
|
| PDF | 페이지별 이미지를 해당 페이지 텍스트 내에 `[IMG:N]` 삽입 | 첨부파일은 문서 끝에 `[EMB:N]` 배치 (PDF 첨부는 페이지 귀속이 아님) |
|
|
63
|
-
| XLSX |
|
|
64
|
-
| XLSB | (이미지 없음) | (embedded 없음) |
|
|
63
|
+
| XLSX | 시트 데이터는 마크다운 테이블(열 헤더=Excel 열 문자 A/B/C…, 첫 열=원본 행 번호)로 렌더링. 이미지 앵커 행에서 테이블을 분리하고 `[IMG:N]` 삽입 후 새 테이블 재개 | 시트의 embeddings 디렉토리에서 추출한 객체를 문서 끝에 `[EMB:N]` 배치 |
|
|
64
|
+
| XLSB | 시트 데이터는 XLSX와 동일한 마크다운 테이블 포맷 (이미지 없음). VBA 매크로가 있으면 모듈별 소스코드를 fenced code block으로 추출하고, 의심 패턴(AutoExec/Suspicious/IOC) 분석 테이블을 첨부 | (embedded 없음) |
|
|
65
65
|
| EMAIL | HTML 본문의 `cid:` 참조 위치에 `[IMG:N]` 삽입, data URI 이미지도 등장 위치에 삽입 | 첨부파일은 본문 뒤에 `[EMB:N]` 배치 |
|
|
66
66
|
|
|
67
67
|
### 치환 결과 예시
|
|
68
68
|
|
|
69
69
|
```markdown
|
|
70
70
|
[Slide 1]
|
|
71
|
+

|
|
71
72
|
[TXT] (left=0.4", top=0.4") 1. 일정 및 정보 변경
|
|
72
73
|
[TXT] (left=0.6", top=0.8") - Case1~5번 공통 적용 사항
|
|
73
|
-
|
|
74
|
-

|
|
75
|
-
|
|
76
74
|
[TXT] (left=0.5", top=1.4") 1) 구성
|
|
77
75
|
[TXT] (left=0.8", top=2.8") 프로세스: BOA 선택 ...
|
|
78
76
|
|
|
79
|
-

|
|
80
|
-
|
|
81
77
|
> embedded: [embedded_001_worksheet.xlsb](scheduling-1/embedded_001_worksheet.md)
|
|
82
78
|
```
|
|
83
79
|
|
|
80
|
+
PPTX는 슬라이드별 PNG 렌더링(PowerPoint COM)으로 오버레이 도형·주석 박스의 공간 관계를 보존한다. 개별 이미지 추출은 하지 않는다(스크린샷에 포함되므로 중복). 텍스트 shape는 `[TXT]`로 병행 수록하여 원문 인용 정확도를 확보한다.
|
|
81
|
+
|
|
84
82
|
## 주의사항
|
|
85
83
|
|
|
86
84
|
- 바이너리 문서를 Read 도구로 직접 열면 의미 있는 내용을 얻을 수 없다. 반드시 `extract.py`를 통해 추출한다.
|
|
@@ -65,7 +65,14 @@ def ext_from_content_type(content_type: str) -> str:
|
|
|
65
65
|
def normalize_cell(text) -> str:
    """Render a cell value as text that is safe inside a markdown table cell.

    ``None`` becomes an empty string. Any other value is stringified and
    stripped; backslashes and pipes are backslash-escaped and every line
    break (CRLF, LF or CR) is replaced with ``<br>``.
    """
    if text is None:
        return ""

    # Order matters: backslashes are escaped before pipes so the backslash
    # added for a pipe is not doubled, and CRLF is handled before lone LF/CR
    # so a Windows line break produces a single <br>.
    substitutions = (
        ("\\", "\\\\"),
        ("|", "\\|"),
        ("\r\n", "<br>"),
        ("\n", "<br>"),
        ("\r", "<br>"),
    )
    cell = str(text).strip()
    for old, new in substitutions:
        cell = cell.replace(old, new)
    return cell
|
|
69
76
|
|
|
70
77
|
|
|
71
78
|
def parse_heading_level(style_name: str) -> int | None:
|
|
@@ -9,6 +9,8 @@ def extract(file_path):
|
|
|
9
9
|
ensure_packages(PACKAGES)
|
|
10
10
|
from docx import Document
|
|
11
11
|
from docx.oxml.ns import qn
|
|
12
|
+
from docx.table import Table as DocxTable
|
|
13
|
+
from docx.text.paragraph import Paragraph
|
|
12
14
|
|
|
13
15
|
doc = Document(file_path)
|
|
14
16
|
text_parts = []
|
|
@@ -17,47 +19,85 @@ def extract(file_path):
|
|
|
17
19
|
img_idx = 0
|
|
18
20
|
emb_idx = 0
|
|
19
21
|
|
|
20
|
-
|
|
21
|
-
|
|
22
|
+
def _extract_drawing(drawing):
|
|
23
|
+
nonlocal img_idx
|
|
24
|
+
blip = drawing.find(f".//{qn('a:blip')}")
|
|
25
|
+
if blip is None:
|
|
26
|
+
return None
|
|
27
|
+
embed_id = blip.get(qn("r:embed"))
|
|
28
|
+
if not embed_id:
|
|
29
|
+
return None
|
|
30
|
+
rel = doc.part.rels.get(embed_id)
|
|
31
|
+
if not rel or not hasattr(rel, 'target_part'):
|
|
32
|
+
return None
|
|
33
|
+
ext = ext_from_content_type(rel.target_part.content_type)
|
|
34
|
+
img_idx += 1
|
|
35
|
+
doc_pr = drawing.find(f".//{qn('wp:docPr')}")
|
|
36
|
+
alt = ""
|
|
37
|
+
if doc_pr is not None:
|
|
38
|
+
alt = doc_pr.get("descr", "") or doc_pr.get("title", "")
|
|
39
|
+
images.append({
|
|
40
|
+
"data": rel.target_part.blob,
|
|
41
|
+
"ext": ext,
|
|
42
|
+
"context": alt or "paragraph image",
|
|
43
|
+
})
|
|
44
|
+
return img_idx
|
|
45
|
+
|
|
46
|
+
def _process_paragraph(element):
    """Append one paragraph, with inline image markers, to ``text_parts``.

    Paragraphs whose style name contains "Heading" get a markdown heading
    prefix; the depth comes from ``parse_heading_level`` and falls back to 2
    when that returns a falsy value. Run text and ``[IMG:N]`` markers are
    concatenated in run order, and the paragraph is skipped when the result
    is empty after stripping.
    """
    paragraph = Paragraph(element, doc)
    style_name = paragraph.style.name if paragraph.style else ""

    if "Heading" in style_name:
        heading_level = parse_heading_level(style_name)
        marker = "#" * (heading_level or 2) + " "
    else:
        marker = ""

    # NOTE(review): only the runs exposed by `paragraph.runs` are visited;
    # confirm whether text nested inside hyperlinks is covered.
    fragments = []
    for run in paragraph.runs:
        if run.text:
            fragments.append(run.text)
        run_xml = run._element
        # Inline drawings first, then anchored ones.
        for tag in ("wp:inline", "wp:anchor"):
            for drawing in run_xml.findall(f".//{qn(tag)}"):
                image_no = _extract_drawing(drawing)
                if image_no is not None:
                    fragments.append(f"[IMG:{image_no}]")

    rendered = "".join(fragments).strip()
    if rendered:
        text_parts.append(f"{marker}{rendered}")
|
|
55
68
|
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
69
|
+
def _process_table(element):
    """Append a body table to ``text_parts`` as a markdown table.

    The first table row is followed by a ``---`` separator row so it renders
    as the header. A blank entry is emitted before and after the table.
    Tables without rows produce no output.
    """
    table_rows = list(DocxTable(element, doc).rows)
    if not table_rows:
        return

    text_parts.append("")
    for position, table_row in enumerate(table_rows):
        values = [normalize_cell(cell.text) for cell in table_row.cells]
        text_parts.append("| " + " | ".join(values) + " |")
        if position == 0:
            # Separator width follows the first row's cell count.
            text_parts.append("|" + "|".join("---" for _ in values) + "|")
    text_parts.append("")
|
|
81
|
+
|
|
82
|
+
# Iterate body elements in document order (paragraphs and tables interleaved)
|
|
83
|
+
for child in doc.element.body:
|
|
84
|
+
tag = child.tag.split('}')[-1] if '}' in child.tag else child.tag
|
|
85
|
+
if tag == 'p':
|
|
86
|
+
_process_paragraph(child)
|
|
87
|
+
elif tag == 'tbl':
|
|
88
|
+
_process_table(child)
|
|
89
|
+
|
|
90
|
+
# Headers and footers
|
|
91
|
+
for sec_idx, section in enumerate(doc.sections):
|
|
92
|
+
h_parts = [p.text.strip() for p in section.header.paragraphs if p.text.strip()]
|
|
93
|
+
f_parts = [p.text.strip() for p in section.footer.paragraphs if p.text.strip()]
|
|
94
|
+
if h_parts or f_parts:
|
|
95
|
+
text_parts.append("")
|
|
96
|
+
text_parts.append(f"[Header/Footer — Section {sec_idx + 1}]")
|
|
97
|
+
if h_parts:
|
|
98
|
+
text_parts.append(f"Header: {' | '.join(h_parts)}")
|
|
99
|
+
if f_parts:
|
|
100
|
+
text_parts.append(f"Footer: {' | '.join(f_parts)}")
|
|
61
101
|
|
|
62
102
|
# OLE embedded objects
|
|
63
103
|
seen = set()
|
|
@@ -37,11 +37,22 @@ def extract(file_path):
|
|
|
37
37
|
if w <= 4 or h <= 4:
|
|
38
38
|
continue
|
|
39
39
|
|
|
40
|
+
# Get image position on page
|
|
41
|
+
try:
|
|
42
|
+
rects = page.get_image_rects(xref)
|
|
43
|
+
if rects:
|
|
44
|
+
r = rects[0]
|
|
45
|
+
bbox_str = f" bbox:({r.x0:.0f},{r.y0:.0f},{r.x1:.0f},{r.y1:.0f})"
|
|
46
|
+
else:
|
|
47
|
+
bbox_str = ""
|
|
48
|
+
except Exception:
|
|
49
|
+
bbox_str = ""
|
|
50
|
+
|
|
40
51
|
img_idx += 1
|
|
41
52
|
images.append({
|
|
42
53
|
"data": data,
|
|
43
54
|
"ext": ext,
|
|
44
|
-
"context": f"Page {page_num}",
|
|
55
|
+
"context": f"Page {page_num}{bbox_str}",
|
|
45
56
|
})
|
|
46
57
|
page_img_indices[page_num].append(img_idx)
|
|
47
58
|
|
|
@@ -1,8 +1,17 @@
|
|
|
1
|
-
"""PPTX handler: extract text
|
|
1
|
+
"""PPTX handler: render slides to PNG via PowerPoint COM, extract text and OLE embedded.
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
Individual image/shape extraction is intentionally omitted — slide screenshots
|
|
4
|
+
contain all visuals including overlay shapes (boxes, arrows, annotations) that
|
|
5
|
+
lose their spatial relationship when decomposed. Requires Windows + Microsoft
|
|
6
|
+
PowerPoint installed.
|
|
7
|
+
"""
|
|
4
8
|
|
|
5
|
-
|
|
9
|
+
import tempfile
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
|
|
12
|
+
from _common import ensure_packages
|
|
13
|
+
|
|
14
|
+
PACKAGES = {"pywin32": "win32com.client", "python-pptx": "pptx"}
|
|
6
15
|
|
|
7
16
|
|
|
8
17
|
def _emu_to_inches(emu):
|
|
@@ -15,37 +24,107 @@ def _pos(shape):
|
|
|
15
24
|
return f"(left={_emu_to_inches(shape.left)}\", top={_emu_to_inches(shape.top)}\")"
|
|
16
25
|
|
|
17
26
|
|
|
27
|
+
def _extract_shapes(shapes, text_parts):
|
|
28
|
+
for shape in shapes:
|
|
29
|
+
if shape.shape_type == 6: # MSO_SHAPE_TYPE.GROUP
|
|
30
|
+
_extract_shapes(shape.shapes, text_parts)
|
|
31
|
+
elif shape.has_table:
|
|
32
|
+
tbl = shape.table
|
|
33
|
+
text_parts.append(f"[TABLE] {_pos(shape)}")
|
|
34
|
+
for r_idx, row in enumerate(tbl.rows):
|
|
35
|
+
cells = [
|
|
36
|
+
cell.text.strip().replace("\\", "\\\\").replace("|", "\\|")
|
|
37
|
+
.replace("\r\n", "<br>").replace("\n", "<br>").replace("\r", "<br>")
|
|
38
|
+
for cell in row.cells
|
|
39
|
+
]
|
|
40
|
+
text_parts.append("| " + " | ".join(cells) + " |")
|
|
41
|
+
if r_idx == 0:
|
|
42
|
+
text_parts.append("|" + "|".join(["---"] * len(cells)) + "|")
|
|
43
|
+
elif hasattr(shape, "text") and shape.text.strip():
|
|
44
|
+
text = shape.text.strip().replace("\n", "\n ")
|
|
45
|
+
text_parts.append(f"[TXT] {_pos(shape)} {text}")
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _render_slides_via_com(file_path: str, tmp_dir: Path, slide_count: int,
                           width: int, height: int) -> list[bytes]:
    """Export every slide to PNG through PowerPoint COM and return the bytes.

    Args:
        file_path: Path of the presentation; resolved to an absolute path
            before it is handed to PowerPoint.
        tmp_dir: Directory used for the intermediate PNG files.
        slide_count: Number of slides to export (slides 1..slide_count).
        width: Width argument passed to each slide's ``Export`` call.
        height: Height argument passed to each slide's ``Export`` call.

    Returns:
        One ``bytes`` object per slide, in slide order.

    Any exception raised by the COM calls propagates to the caller after the
    presentation is closed, the application is quit and COM is uninitialized.
    """
    # Imported lazily so the module can be loaded where pywin32 is missing.
    import win32com.client
    import pythoncom

    pythoncom.CoInitialize()
    try:
        app = win32com.client.DispatchEx("PowerPoint.Application")
        try:
            # Best effort: a failure to set DisplayAlerts is ignored.
            try:
                app.DisplayAlerts = 0
            except Exception:
                pass
            abs_path = str(Path(file_path).resolve())
            prs = app.Presentations.Open(abs_path, ReadOnly=True, Untitled=False,
                                         WithWindow=False)
            try:
                results = []
                # Slides are addressed with 1-based indices.
                for i in range(1, slide_count + 1):
                    tmp_path = tmp_dir / f"__tmp_slide_{i}.png"
                    prs.Slides(i).Export(str(tmp_path), "PNG", width, height)
                    # Keep the bytes in memory and drop the file right away.
                    results.append(tmp_path.read_bytes())
                    tmp_path.unlink()
                return results
            finally:
                prs.Close()
        finally:
            app.Quit()
    finally:
        # Pairs with the CoInitialize() call above.
        pythoncom.CoUninitialize()
|
|
78
|
+
|
|
79
|
+
|
|
18
80
|
def extract(file_path):
|
|
19
81
|
ensure_packages(PACKAGES)
|
|
20
82
|
from pptx import Presentation
|
|
21
|
-
from pptx.enum.shapes import MSO_SHAPE_TYPE
|
|
22
83
|
|
|
23
84
|
prs = Presentation(file_path)
|
|
85
|
+
slide_count = len(prs.slides)
|
|
86
|
+
|
|
87
|
+
target_width = 1920
|
|
88
|
+
if prs.slide_width and prs.slide_height:
|
|
89
|
+
target_height = int(target_width * prs.slide_height / prs.slide_width)
|
|
90
|
+
else:
|
|
91
|
+
target_height = 1080
|
|
92
|
+
|
|
93
|
+
try:
|
|
94
|
+
with tempfile.TemporaryDirectory() as tmpdir:
|
|
95
|
+
slide_pngs = _render_slides_via_com(
|
|
96
|
+
file_path, Path(tmpdir), slide_count, target_width, target_height
|
|
97
|
+
)
|
|
98
|
+
except Exception as e:
|
|
99
|
+
raise RuntimeError(
|
|
100
|
+
f"PowerPoint COM rendering failed: {e}. "
|
|
101
|
+
"This extractor requires Windows with Microsoft PowerPoint installed."
|
|
102
|
+
) from e
|
|
103
|
+
|
|
24
104
|
text_parts = []
|
|
25
|
-
|
|
105
|
+
slide_images = []
|
|
26
106
|
embedded = []
|
|
27
|
-
img_idx = 0
|
|
28
107
|
emb_idx = 0
|
|
29
108
|
|
|
30
109
|
for slide_num, slide in enumerate(prs.slides, 1):
|
|
31
110
|
text_parts.append(f"[Slide {slide_num}]")
|
|
32
111
|
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
112
|
+
slide_images.append({
|
|
113
|
+
"filename": f"slide_{slide_num:03d}.png",
|
|
114
|
+
"data": slide_pngs[slide_num - 1],
|
|
115
|
+
})
|
|
116
|
+
text_parts.append(f"[SLIDE:{slide_num}]")
|
|
117
|
+
|
|
118
|
+
_extract_shapes(slide.shapes, text_parts)
|
|
119
|
+
|
|
120
|
+
# Speaker notes
|
|
121
|
+
if slide.has_notes_slide:
|
|
122
|
+
notes_frame = slide.notes_slide.notes_text_frame
|
|
123
|
+
notes_text = notes_frame.text.strip() if notes_frame else ""
|
|
124
|
+
if notes_text:
|
|
125
|
+
notes_text = notes_text.replace("\n", "\n ")
|
|
126
|
+
text_parts.append(f"[Notes] {notes_text}")
|
|
127
|
+
|
|
49
128
|
seen = set()
|
|
50
129
|
for rel in slide.part.rels.values():
|
|
51
130
|
reltype = rel.reltype or ""
|
|
@@ -69,7 +148,8 @@ def extract(file_path):
|
|
|
69
148
|
|
|
70
149
|
return {
|
|
71
150
|
"text": "\n".join(text_parts),
|
|
72
|
-
"images":
|
|
151
|
+
"images": [],
|
|
73
152
|
"embedded": embedded,
|
|
74
153
|
"metadata": {},
|
|
154
|
+
"slide_images": slide_images,
|
|
75
155
|
}
|
|
@@ -1,8 +1,36 @@
|
|
|
1
|
-
"""XLSB handler: extract cell data from binary Excel format.
|
|
1
|
+
"""XLSB handler: extract cell data and VBA macros from binary Excel format.
|
|
2
|
+
|
|
3
|
+
Output format matches the XLSX handler: per sheet, a markdown table with
|
|
4
|
+
Excel column letters as headers and the original row number in the first
|
|
5
|
+
column. VBA macros are extracted via oletools and appended as fenced code
|
|
6
|
+
blocks.
|
|
7
|
+
"""
|
|
2
8
|
|
|
3
9
|
from _common import ensure_packages
|
|
4
10
|
|
|
5
|
-
PACKAGES = {"pyxlsb": "pyxlsb"}
|
|
11
|
+
PACKAGES = {"pyxlsb": "pyxlsb", "oletools": "oletools"}
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _escape_md(v):
|
|
15
|
+
if v is None:
|
|
16
|
+
return ""
|
|
17
|
+
s = str(v).strip()
|
|
18
|
+
return (
|
|
19
|
+
s.replace("\\", "\\\\")
|
|
20
|
+
.replace("|", "\\|")
|
|
21
|
+
.replace("\r\n", "<br>")
|
|
22
|
+
.replace("\n", "<br>")
|
|
23
|
+
.replace("\r", "<br>")
|
|
24
|
+
)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _col_letter(n):
|
|
28
|
+
# 1-based column index → Excel letter (A, B, ..., Z, AA, AB, ...)
|
|
29
|
+
s = ""
|
|
30
|
+
while n > 0:
|
|
31
|
+
n, r = divmod(n - 1, 26)
|
|
32
|
+
s = chr(65 + r) + s
|
|
33
|
+
return s
|
|
6
34
|
|
|
7
35
|
|
|
8
36
|
def extract(file_path):
|
|
@@ -14,10 +42,71 @@ def extract(file_path):
|
|
|
14
42
|
with open_workbook(file_path) as wb:
|
|
15
43
|
for sheet_name in wb.sheets:
|
|
16
44
|
text_parts.append(f"[Sheet: {sheet_name}]")
|
|
45
|
+
text_parts.append("")
|
|
46
|
+
|
|
17
47
|
with wb.get_sheet(sheet_name) as sheet:
|
|
48
|
+
rows_data = []
|
|
49
|
+
max_col = 0
|
|
18
50
|
for row in sheet.rows():
|
|
19
|
-
|
|
20
|
-
|
|
51
|
+
if not row:
|
|
52
|
+
continue
|
|
53
|
+
row_num = row[0].r + 1 # pyxlsb is 0-based
|
|
54
|
+
cells = [_escape_md(cell.v) for cell in row]
|
|
55
|
+
if len(cells) > max_col:
|
|
56
|
+
max_col = len(cells)
|
|
57
|
+
rows_data.append((row_num, cells))
|
|
58
|
+
|
|
59
|
+
if not rows_data:
|
|
60
|
+
text_parts.append("(empty sheet)")
|
|
61
|
+
text_parts.append("")
|
|
62
|
+
continue
|
|
63
|
+
|
|
64
|
+
headers = ["Row"] + [_col_letter(c) for c in range(1, max_col + 1)]
|
|
65
|
+
text_parts.append("| " + " | ".join(headers) + " |")
|
|
66
|
+
text_parts.append("|" + "|".join(["---"] * len(headers)) + "|")
|
|
67
|
+
for row_num, cells in rows_data:
|
|
68
|
+
padded = list(cells) + [""] * (max_col - len(cells))
|
|
69
|
+
text_parts.append(
|
|
70
|
+
f"| {row_num} | " + " | ".join(padded[:max_col]) + " |"
|
|
71
|
+
)
|
|
72
|
+
text_parts.append("")
|
|
73
|
+
|
|
74
|
+
# --- VBA macro extraction ---
|
|
75
|
+
vba_parts = []
|
|
76
|
+
try:
|
|
77
|
+
from oletools.olevba import VBA_Parser
|
|
78
|
+
|
|
79
|
+
vba_parser = VBA_Parser(file_path)
|
|
80
|
+
if vba_parser.detect_vba_macros():
|
|
81
|
+
vba_parts.append("[VBA Macros]")
|
|
82
|
+
vba_parts.append("")
|
|
83
|
+
for filename, stream_path, vba_filename, vba_code in vba_parser.extract_macros():
|
|
84
|
+
vba_parts.append(f"### {vba_filename}")
|
|
85
|
+
vba_parts.append(f"<!-- stream: {stream_path} -->")
|
|
86
|
+
vba_parts.append("")
|
|
87
|
+
vba_parts.append("```vb")
|
|
88
|
+
vba_parts.append(vba_code)
|
|
89
|
+
vba_parts.append("```")
|
|
90
|
+
vba_parts.append("")
|
|
91
|
+
|
|
92
|
+
analysis = vba_parser.analyze_macros()
|
|
93
|
+
suspicious = [e for e in analysis if e[0] in ("AutoExec", "Suspicious", "IOC")]
|
|
94
|
+
if suspicious:
|
|
95
|
+
vba_parts.append("### Analysis")
|
|
96
|
+
vba_parts.append("")
|
|
97
|
+
vba_parts.append("| Type | Keyword | Description |")
|
|
98
|
+
vba_parts.append("|------|---------|-------------|")
|
|
99
|
+
for entry_type, keyword, description in suspicious:
|
|
100
|
+
vba_parts.append(f"| {entry_type} | `{keyword}` | {description} |")
|
|
101
|
+
vba_parts.append("")
|
|
102
|
+
|
|
103
|
+
vba_parser.close()
|
|
104
|
+
except Exception:
|
|
105
|
+
pass
|
|
106
|
+
|
|
107
|
+
if vba_parts:
|
|
108
|
+
text_parts.append("")
|
|
109
|
+
text_parts.extend(vba_parts)
|
|
21
110
|
|
|
22
111
|
return {
|
|
23
112
|
"text": "\n".join(text_parts),
|
|
@@ -1,4 +1,14 @@
|
|
|
1
|
-
"""XLSX handler: extract cell data, images, and embedded objects.
|
|
1
|
+
"""XLSX handler: extract cell data, images, and embedded objects.
|
|
2
|
+
|
|
3
|
+
Output format: per sheet, cell data is rendered as a markdown table whose
|
|
4
|
+
column headers are Excel column letters (A, B, C, ...) and whose first
|
|
5
|
+
column is the original Excel row number. When an image is anchored to a
|
|
6
|
+
row, the current table chunk is flushed, the [IMG:N] placeholder is
|
|
7
|
+
emitted, and a new table (re-rendering the header) resumes from the next
|
|
8
|
+
row. This preserves the spatial relationship between cell data and
|
|
9
|
+
images while keeping each chunk a valid markdown table that LLMs parse
|
|
10
|
+
natively.
|
|
11
|
+
"""
|
|
2
12
|
|
|
3
13
|
import zipfile
|
|
4
14
|
from _common import ensure_packages
|
|
@@ -6,9 +16,35 @@ from _common import ensure_packages
|
|
|
6
16
|
PACKAGES = {"openpyxl": "openpyxl"}
|
|
7
17
|
|
|
8
18
|
|
|
19
|
+
def _escape_md(v):
|
|
20
|
+
if v is None:
|
|
21
|
+
return ""
|
|
22
|
+
s = str(v).strip()
|
|
23
|
+
return (
|
|
24
|
+
s.replace("\\", "\\\\")
|
|
25
|
+
.replace("|", "\\|")
|
|
26
|
+
.replace("\r\n", "<br>")
|
|
27
|
+
.replace("\n", "<br>")
|
|
28
|
+
.replace("\r", "<br>")
|
|
29
|
+
)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _render_chunk(chunk_rows, max_col, get_col_letter):
|
|
33
|
+
if not chunk_rows:
|
|
34
|
+
return []
|
|
35
|
+
headers = ["Row"] + [get_col_letter(c) for c in range(1, max_col + 1)]
|
|
36
|
+
out = ["| " + " | ".join(headers) + " |",
|
|
37
|
+
"|" + "|".join(["---"] * len(headers)) + "|"]
|
|
38
|
+
for row_num, cells in chunk_rows:
|
|
39
|
+
padded = list(cells) + [""] * (max_col - len(cells))
|
|
40
|
+
out.append(f"| {row_num} | " + " | ".join(padded[:max_col]) + " |")
|
|
41
|
+
return out
|
|
42
|
+
|
|
43
|
+
|
|
9
44
|
def extract(file_path):
|
|
10
45
|
ensure_packages(PACKAGES)
|
|
11
46
|
from openpyxl import load_workbook
|
|
47
|
+
from openpyxl.utils import get_column_letter
|
|
12
48
|
from openpyxl.worksheet.worksheet import Worksheet
|
|
13
49
|
|
|
14
50
|
wb = load_workbook(file_path, data_only=True)
|
|
@@ -21,57 +57,83 @@ def extract(file_path):
|
|
|
21
57
|
for sheet_name in wb.sheetnames:
|
|
22
58
|
ws = wb[sheet_name]
|
|
23
59
|
text_parts.append(f"[Sheet: {sheet_name}]")
|
|
60
|
+
text_parts.append("")
|
|
24
61
|
|
|
25
62
|
if not isinstance(ws, Worksheet):
|
|
26
63
|
text_parts.append(f"({type(ws).__name__} — 데이터 없음)")
|
|
64
|
+
text_parts.append("")
|
|
27
65
|
continue
|
|
28
66
|
|
|
29
67
|
if ws.max_row is None or ws.max_row == 0:
|
|
30
68
|
text_parts.append("(empty sheet)")
|
|
69
|
+
text_parts.append("")
|
|
31
70
|
continue
|
|
32
71
|
|
|
33
|
-
#
|
|
34
|
-
|
|
35
|
-
|
|
72
|
+
# Merged cells annotation
|
|
73
|
+
merged = list(ws.merged_cells.ranges)
|
|
74
|
+
if merged:
|
|
75
|
+
text_parts.append(f"[Merged: {', '.join(str(r) for r in merged)}]")
|
|
76
|
+
text_parts.append("")
|
|
77
|
+
|
|
78
|
+
ws_images = getattr(ws, "_images", [])
|
|
79
|
+
row_img_markers = {}
|
|
36
80
|
for img in ws_images:
|
|
37
|
-
data_fn = getattr(img,
|
|
81
|
+
data_fn = getattr(img, "_data", None)
|
|
38
82
|
blob = data_fn() if callable(data_fn) else b""
|
|
39
|
-
if blob:
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
if
|
|
51
|
-
anchor_row
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
83
|
+
if not blob:
|
|
84
|
+
continue
|
|
85
|
+
img_idx += 1
|
|
86
|
+
anchor = getattr(img, "anchor", None)
|
|
87
|
+
anchor_row = None
|
|
88
|
+
anchor_col = None
|
|
89
|
+
if anchor:
|
|
90
|
+
_from = getattr(anchor, "_from", None)
|
|
91
|
+
if _from:
|
|
92
|
+
anchor_row = getattr(_from, "row", None)
|
|
93
|
+
anchor_col = getattr(_from, "col", None)
|
|
94
|
+
if anchor_row is not None:
|
|
95
|
+
anchor_row += 1
|
|
96
|
+
if anchor_col is not None:
|
|
97
|
+
anchor_col += 1
|
|
98
|
+
if anchor_row is None:
|
|
99
|
+
anchor_row = ws.max_row or 1
|
|
100
|
+
cell_ref = ""
|
|
101
|
+
if anchor_col is not None:
|
|
102
|
+
cell_ref = f" anchor:{get_column_letter(anchor_col)}{anchor_row}"
|
|
103
|
+
else:
|
|
104
|
+
cell_ref = f" anchor:row {anchor_row}"
|
|
105
|
+
images.append({
|
|
106
|
+
"data": blob,
|
|
107
|
+
"ext": "png",
|
|
108
|
+
"context": f"sheet '{sheet_name}'{cell_ref}",
|
|
109
|
+
})
|
|
110
|
+
row_img_markers.setdefault(anchor_row, []).append(img_idx)
|
|
111
|
+
|
|
112
|
+
max_col = ws.max_column or 1
|
|
113
|
+
chunk = []
|
|
114
|
+
|
|
59
115
|
for row in ws.iter_rows(values_only=False):
|
|
60
|
-
cells = []
|
|
61
|
-
for cell in row:
|
|
62
|
-
val = cell.value
|
|
63
|
-
cells.append(str(val).strip() if val is not None else "")
|
|
64
116
|
row_num = row[0].row
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
117
|
+
cells = [_escape_md(c.value) for c in row]
|
|
118
|
+
chunk.append((row_num, cells))
|
|
119
|
+
|
|
120
|
+
if row_num in row_img_markers:
|
|
121
|
+
text_parts.extend(_render_chunk(chunk, max_col, get_column_letter))
|
|
122
|
+
text_parts.append("")
|
|
123
|
+
for idx in row_img_markers[row_num]:
|
|
124
|
+
text_parts.append(f"[IMG:{idx}]")
|
|
125
|
+
text_parts.append("")
|
|
126
|
+
chunk = []
|
|
127
|
+
|
|
128
|
+
if chunk:
|
|
129
|
+
text_parts.extend(_render_chunk(chunk, max_col, get_column_letter))
|
|
130
|
+
text_parts.append("")
|
|
68
131
|
|
|
69
|
-
# Embedded objects from XLSX ZIP
|
|
70
132
|
try:
|
|
71
|
-
with zipfile.ZipFile(file_path,
|
|
133
|
+
with zipfile.ZipFile(file_path, "r") as zf:
|
|
72
134
|
for name in zf.namelist():
|
|
73
|
-
if
|
|
74
|
-
filename = name.split(
|
|
135
|
+
if "embeddings/" in name.lower():
|
|
136
|
+
filename = name.split("/")[-1]
|
|
75
137
|
data = zf.read(name)
|
|
76
138
|
emb_idx += 1
|
|
77
139
|
embedded.append({"filename": filename, "data": data})
|