@simplysm/sd-claude 14.0.46 → 14.0.48
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/{claude/references/sd-simplysm14/sd-claude/usage.md → README.md} +2 -2
- package/claude/rules/sd-claude-rules.md +27 -9
- package/claude/rules/sd-options.md +11 -6
- package/claude/sd-subagent-start.sh +6 -0
- package/claude/settings.json +1 -12
- package/claude/skills/sd-check/SKILL.md +18 -9
- package/claude/skills/sd-claude-docs/SKILL.md +29 -58
- package/claude/skills/sd-claude-docs/references/package-claudemd.md +12 -0
- package/claude/skills/sd-claude-docs/references/package-doc-gen.md +22 -12
- package/claude/skills/sd-debug/SKILL.md +5 -3
- package/claude/skills/sd-deliverable/SKILL.md +0 -1
- package/claude/skills/sd-dev/SKILL.md +14 -9
- package/claude/skills/sd-doc-extract/SKILL.md +7 -9
- package/claude/skills/sd-doc-extract/_common.py +8 -1
- package/claude/skills/sd-doc-extract/_extract_docx.py +74 -34
- package/claude/skills/sd-doc-extract/_extract_pdf.py +12 -1
- package/claude/skills/sd-doc-extract/_extract_pptx.py +103 -23
- package/claude/skills/sd-doc-extract/_extract_xlsb.py +93 -4
- package/claude/skills/sd-doc-extract/_extract_xlsx.py +98 -36
- package/claude/skills/sd-doc-extract/extract.py +22 -3
- package/claude/skills/sd-inner-clarify/SKILL.md +78 -0
- package/claude/skills/sd-inner-debug/SKILL.md +1 -1
- package/claude/skills/sd-inner-review/SKILL.md +13 -0
- package/claude/skills/sd-plan/SKILL.md +50 -17
- package/claude/skills/sd-prompt/SKILL.md +180 -178
- package/claude/skills/sd-prompt/references/eval-runner.md +5 -31
- package/claude/skills/sd-prompt/references/sd-eval-env-template.md +23 -0
- package/claude/skills/sd-refactor/SKILL.md +2 -2
- package/claude/skills/sd-tdd/SKILL.md +46 -10
- package/claude/skills/sd-use/SKILL.md +84 -80
- package/claude/skills/sd-wbs/SKILL.md +85 -27
- package/{claude/references/sd-simplysm14/sd-claude/docs → docs}/assets.md +2 -3
- package/{claude/references/sd-simplysm14/sd-claude/docs → docs}/hooks.md +7 -6
- package/{claude/references/sd-simplysm14/sd-claude/docs → docs}/scripts.md +1 -9
- package/package.json +3 -2
- package/scripts/sync.mjs +4 -2
- package/claude/references/sd-simplysm14/angular/docs/bootstrap.md +0 -48
- package/claude/references/sd-simplysm14/angular/docs/directives.md +0 -236
- package/claude/references/sd-simplysm14/angular/docs/features.md +0 -379
- package/claude/references/sd-simplysm14/angular/docs/pipes.md +0 -32
- package/claude/references/sd-simplysm14/angular/docs/plugins.md +0 -37
- package/claude/references/sd-simplysm14/angular/docs/provider-types.md +0 -283
- package/claude/references/sd-simplysm14/angular/docs/providers.md +0 -370
- package/claude/references/sd-simplysm14/angular/docs/styling.md +0 -222
- package/claude/references/sd-simplysm14/angular/docs/type-utilities.md +0 -250
- package/claude/references/sd-simplysm14/angular/docs/ui-data.md +0 -275
- package/claude/references/sd-simplysm14/angular/docs/ui-form.md +0 -490
- package/claude/references/sd-simplysm14/angular/docs/ui-layout.md +0 -140
- package/claude/references/sd-simplysm14/angular/docs/ui-navigation.md +0 -241
- package/claude/references/sd-simplysm14/angular/docs/ui-overlay.md +0 -157
- package/claude/references/sd-simplysm14/angular/docs/ui-visual.md +0 -127
- package/claude/references/sd-simplysm14/angular/docs/utils.md +0 -295
- package/claude/references/sd-simplysm14/angular/usage.md +0 -489
- package/claude/references/sd-simplysm14/capacitor-plugin-auto-update/usage.md +0 -182
- package/claude/references/sd-simplysm14/capacitor-plugin-file-system/docs/file-operations.md +0 -154
- package/claude/references/sd-simplysm14/capacitor-plugin-file-system/docs/permissions.md +0 -84
- package/claude/references/sd-simplysm14/capacitor-plugin-file-system/docs/storage-paths.md +0 -107
- package/claude/references/sd-simplysm14/capacitor-plugin-file-system/docs/types.md +0 -83
- package/claude/references/sd-simplysm14/capacitor-plugin-file-system/usage.md +0 -133
- package/claude/references/sd-simplysm14/capacitor-plugin-intent/usage.md +0 -203
- package/claude/references/sd-simplysm14/capacitor-plugin-usb-storage/usage.md +0 -258
- package/claude/references/sd-simplysm14/core-browser/usage.md +0 -306
- package/claude/references/sd-simplysm14/core-common/docs/errors.md +0 -82
- package/claude/references/sd-simplysm14/core-common/docs/extensions.md +0 -167
- package/claude/references/sd-simplysm14/core-common/docs/features.md +0 -136
- package/claude/references/sd-simplysm14/core-common/docs/types.md +0 -245
- package/claude/references/sd-simplysm14/core-common/docs/utils.md +0 -591
- package/claude/references/sd-simplysm14/core-common/usage.md +0 -255
- package/claude/references/sd-simplysm14/core-node/docs/child-process.md +0 -182
- package/claude/references/sd-simplysm14/core-node/docs/features.md +0 -214
- package/claude/references/sd-simplysm14/core-node/docs/file-system.md +0 -509
- package/claude/references/sd-simplysm14/core-node/docs/file-watching.md +0 -139
- package/claude/references/sd-simplysm14/core-node/docs/logging.md +0 -180
- package/claude/references/sd-simplysm14/core-node/docs/path.md +0 -176
- package/claude/references/sd-simplysm14/core-node/docs/utilities-cpx.md +0 -194
- package/claude/references/sd-simplysm14/core-node/docs/utilities-fsx.md +0 -469
- package/claude/references/sd-simplysm14/core-node/docs/utilities-pathx.md +0 -151
- package/claude/references/sd-simplysm14/core-node/docs/worker-threads.md +0 -334
- package/claude/references/sd-simplysm14/core-node/docs/worker.md +0 -205
- package/claude/references/sd-simplysm14/core-node/usage.md +0 -259
- package/claude/references/sd-simplysm14/excel/docs/core-classes.md +0 -443
- package/claude/references/sd-simplysm14/excel/docs/types.md +0 -455
- package/claude/references/sd-simplysm14/excel/docs/utilities.md +0 -194
- package/claude/references/sd-simplysm14/excel/docs/wrapper.md +0 -73
- package/claude/references/sd-simplysm14/excel/usage.md +0 -134
- package/claude/references/sd-simplysm14/lint/usage.md +0 -130
- package/claude/references/sd-simplysm14/orm-common/docs/core.md +0 -188
- package/claude/references/sd-simplysm14/orm-common/docs/expression.md +0 -190
- package/claude/references/sd-simplysm14/orm-common/docs/models.md +0 -17
- package/claude/references/sd-simplysm14/orm-common/docs/query-builder.md +0 -97
- package/claude/references/sd-simplysm14/orm-common/docs/queryable-executable.md +0 -250
- package/claude/references/sd-simplysm14/orm-common/docs/schema-builders.md +0 -364
- package/claude/references/sd-simplysm14/orm-common/docs/types.md +0 -522
- package/claude/references/sd-simplysm14/orm-common/usage.md +0 -229
- package/claude/references/sd-simplysm14/orm-node/docs/connections.md +0 -137
- package/claude/references/sd-simplysm14/orm-node/docs/core.md +0 -131
- package/claude/references/sd-simplysm14/orm-node/docs/types.md +0 -173
- package/claude/references/sd-simplysm14/orm-node/usage.md +0 -143
- package/claude/references/sd-simplysm14/sd-cli/usage.md +0 -782
- package/claude/references/sd-simplysm14/service-client/docs/features.md +0 -217
- package/claude/references/sd-simplysm14/service-client/docs/main.md +0 -148
- package/claude/references/sd-simplysm14/service-client/docs/protocol.md +0 -53
- package/claude/references/sd-simplysm14/service-client/docs/transport.md +0 -131
- package/claude/references/sd-simplysm14/service-client/docs/types.md +0 -129
- package/claude/references/sd-simplysm14/service-client/usage.md +0 -202
- package/claude/references/sd-simplysm14/service-common/docs/app-structure.md +0 -175
- package/claude/references/sd-simplysm14/service-common/docs/events.md +0 -64
- package/claude/references/sd-simplysm14/service-common/docs/protocol.md +0 -331
- package/claude/references/sd-simplysm14/service-common/docs/service-types.md +0 -90
- package/claude/references/sd-simplysm14/service-common/docs/types.md +0 -19
- package/claude/references/sd-simplysm14/service-common/usage.md +0 -154
- package/claude/references/sd-simplysm14/service-server/docs/auth.md +0 -64
- package/claude/references/sd-simplysm14/service-server/docs/core.md +0 -174
- package/claude/references/sd-simplysm14/service-server/docs/legacy.md +0 -25
- package/claude/references/sd-simplysm14/service-server/docs/main.md +0 -88
- package/claude/references/sd-simplysm14/service-server/docs/protocol.md +0 -33
- package/claude/references/sd-simplysm14/service-server/docs/services.md +0 -94
- package/claude/references/sd-simplysm14/service-server/docs/transport-http.md +0 -93
- package/claude/references/sd-simplysm14/service-server/docs/transport-socket.md +0 -119
- package/claude/references/sd-simplysm14/service-server/docs/types.md +0 -36
- package/claude/references/sd-simplysm14/service-server/docs/utils.md +0 -22
- package/claude/references/sd-simplysm14/service-server/usage.md +0 -171
- package/claude/references/sd-simplysm14/storage/usage.md +0 -301
- package/claude/references/sd-simplysm14.md +0 -35
- package/claude/rules/sd-clarify.md +0 -23
- package/claude/sd-session-start.sh +0 -10
- /package/{claude/references/sd-simplysm14/sd-claude/docs → docs}/cli.md +0 -0
|
@@ -65,7 +65,14 @@ def ext_from_content_type(content_type: str) -> str:
|
|
|
65
65
|
def normalize_cell(text) -> str:
    """Return *text* as a string that is safe inside a markdown table cell.

    ``None`` becomes the empty string. Any other value is converted with
    ``str()``, stripped of surrounding whitespace, and then has backslashes
    and pipes backslash-escaped and every line break (CRLF, LF or CR)
    replaced by ``<br>``.
    """
    if text is None:
        return ""

    # Order matters: backslashes are escaped before the pipe escape adds new
    # ones, and CRLF is handled before the lone LF / CR cases.
    substitutions = (
        ("\\", "\\\\"),
        ("|", "\\|"),
        ("\r\n", "<br>"),
        ("\n", "<br>"),
        ("\r", "<br>"),
    )
    cell = str(text).strip()
    for needle, replacement in substitutions:
        cell = cell.replace(needle, replacement)
    return cell
|
|
69
76
|
|
|
70
77
|
|
|
71
78
|
def parse_heading_level(style_name: str) -> int | None:
|
|
@@ -9,6 +9,8 @@ def extract(file_path):
|
|
|
9
9
|
ensure_packages(PACKAGES)
|
|
10
10
|
from docx import Document
|
|
11
11
|
from docx.oxml.ns import qn
|
|
12
|
+
from docx.table import Table as DocxTable
|
|
13
|
+
from docx.text.paragraph import Paragraph
|
|
12
14
|
|
|
13
15
|
doc = Document(file_path)
|
|
14
16
|
text_parts = []
|
|
@@ -17,47 +19,85 @@ def extract(file_path):
|
|
|
17
19
|
img_idx = 0
|
|
18
20
|
emb_idx = 0
|
|
19
21
|
|
|
20
|
-
|
|
21
|
-
|
|
22
|
+
def _extract_drawing(drawing):
|
|
23
|
+
nonlocal img_idx
|
|
24
|
+
blip = drawing.find(f".//{qn('a:blip')}")
|
|
25
|
+
if blip is None:
|
|
26
|
+
return None
|
|
27
|
+
embed_id = blip.get(qn("r:embed"))
|
|
28
|
+
if not embed_id:
|
|
29
|
+
return None
|
|
30
|
+
rel = doc.part.rels.get(embed_id)
|
|
31
|
+
if not rel or not hasattr(rel, 'target_part'):
|
|
32
|
+
return None
|
|
33
|
+
ext = ext_from_content_type(rel.target_part.content_type)
|
|
34
|
+
img_idx += 1
|
|
35
|
+
doc_pr = drawing.find(f".//{qn('wp:docPr')}")
|
|
36
|
+
alt = ""
|
|
37
|
+
if doc_pr is not None:
|
|
38
|
+
alt = doc_pr.get("descr", "") or doc_pr.get("title", "")
|
|
39
|
+
images.append({
|
|
40
|
+
"data": rel.target_part.blob,
|
|
41
|
+
"ext": ext,
|
|
42
|
+
"context": alt or "paragraph image",
|
|
43
|
+
})
|
|
44
|
+
return img_idx
|
|
45
|
+
|
|
46
|
+
def _process_paragraph(element):
|
|
47
|
+
para = Paragraph(element, doc)
|
|
48
|
+
style = para.style.name if para.style else ""
|
|
49
|
+
prefix = ""
|
|
50
|
+
if "Heading" in style:
|
|
51
|
+
level = parse_heading_level(style)
|
|
52
|
+
prefix = "#" * (level or 2) + " "
|
|
53
|
+
|
|
54
|
+
parts = []
|
|
22
55
|
for run in para.runs:
|
|
56
|
+
if run.text:
|
|
57
|
+
parts.append(run.text)
|
|
23
58
|
drawings = (run._element.findall(f".//{qn('wp:inline')}") +
|
|
24
59
|
run._element.findall(f".//{qn('wp:anchor')}"))
|
|
25
|
-
for
|
|
26
|
-
|
|
27
|
-
if
|
|
28
|
-
|
|
29
|
-
if embed_id:
|
|
30
|
-
rel = doc.part.rels.get(embed_id)
|
|
31
|
-
if rel and hasattr(rel, 'target_part'):
|
|
32
|
-
ext = ext_from_content_type(rel.target_part.content_type)
|
|
33
|
-
img_idx += 1
|
|
34
|
-
images.append({
|
|
35
|
-
"data": rel.target_part.blob,
|
|
36
|
-
"ext": ext,
|
|
37
|
-
"context": "paragraph image",
|
|
38
|
-
})
|
|
39
|
-
para_img_markers.append(f"[IMG:{img_idx}]")
|
|
40
|
-
|
|
41
|
-
text = para.text.strip()
|
|
42
|
-
if text:
|
|
43
|
-
style = para.style.name if para.style else ""
|
|
44
|
-
prefix = ""
|
|
45
|
-
if "Heading" in style:
|
|
46
|
-
level = parse_heading_level(style)
|
|
47
|
-
if level is not None:
|
|
48
|
-
prefix = "#" * level + " "
|
|
49
|
-
else:
|
|
50
|
-
prefix = "## "
|
|
51
|
-
text_parts.append(f"{prefix}{text}")
|
|
60
|
+
for d in drawings:
|
|
61
|
+
idx = _extract_drawing(d)
|
|
62
|
+
if idx is not None:
|
|
63
|
+
parts.append(f"[IMG:{idx}]")
|
|
52
64
|
|
|
53
|
-
|
|
54
|
-
|
|
65
|
+
line = "".join(parts).strip()
|
|
66
|
+
if line:
|
|
67
|
+
text_parts.append(f"{prefix}{line}")
|
|
55
68
|
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
69
|
+
def _process_table(element):
|
|
70
|
+
table = DocxTable(element, doc)
|
|
71
|
+
rows = list(table.rows)
|
|
72
|
+
if not rows:
|
|
73
|
+
return
|
|
74
|
+
text_parts.append("")
|
|
75
|
+
for r_idx, row in enumerate(rows):
|
|
59
76
|
cells = [normalize_cell(cell.text) for cell in row.cells]
|
|
60
77
|
text_parts.append("| " + " | ".join(cells) + " |")
|
|
78
|
+
if r_idx == 0:
|
|
79
|
+
text_parts.append("|" + "|".join(["---"] * len(cells)) + "|")
|
|
80
|
+
text_parts.append("")
|
|
81
|
+
|
|
82
|
+
# Iterate body elements in document order (paragraphs and tables interleaved)
|
|
83
|
+
for child in doc.element.body:
|
|
84
|
+
tag = child.tag.split('}')[-1] if '}' in child.tag else child.tag
|
|
85
|
+
if tag == 'p':
|
|
86
|
+
_process_paragraph(child)
|
|
87
|
+
elif tag == 'tbl':
|
|
88
|
+
_process_table(child)
|
|
89
|
+
|
|
90
|
+
# Headers and footers
|
|
91
|
+
for sec_idx, section in enumerate(doc.sections):
|
|
92
|
+
h_parts = [p.text.strip() for p in section.header.paragraphs if p.text.strip()]
|
|
93
|
+
f_parts = [p.text.strip() for p in section.footer.paragraphs if p.text.strip()]
|
|
94
|
+
if h_parts or f_parts:
|
|
95
|
+
text_parts.append("")
|
|
96
|
+
text_parts.append(f"[Header/Footer — Section {sec_idx + 1}]")
|
|
97
|
+
if h_parts:
|
|
98
|
+
text_parts.append(f"Header: {' | '.join(h_parts)}")
|
|
99
|
+
if f_parts:
|
|
100
|
+
text_parts.append(f"Footer: {' | '.join(f_parts)}")
|
|
61
101
|
|
|
62
102
|
# OLE embedded objects
|
|
63
103
|
seen = set()
|
|
@@ -37,11 +37,22 @@ def extract(file_path):
|
|
|
37
37
|
if w <= 4 or h <= 4:
|
|
38
38
|
continue
|
|
39
39
|
|
|
40
|
+
# Get image position on page
|
|
41
|
+
try:
|
|
42
|
+
rects = page.get_image_rects(xref)
|
|
43
|
+
if rects:
|
|
44
|
+
r = rects[0]
|
|
45
|
+
bbox_str = f" bbox:({r.x0:.0f},{r.y0:.0f},{r.x1:.0f},{r.y1:.0f})"
|
|
46
|
+
else:
|
|
47
|
+
bbox_str = ""
|
|
48
|
+
except Exception:
|
|
49
|
+
bbox_str = ""
|
|
50
|
+
|
|
40
51
|
img_idx += 1
|
|
41
52
|
images.append({
|
|
42
53
|
"data": data,
|
|
43
54
|
"ext": ext,
|
|
44
|
-
"context": f"Page {page_num}",
|
|
55
|
+
"context": f"Page {page_num}{bbox_str}",
|
|
45
56
|
})
|
|
46
57
|
page_img_indices[page_num].append(img_idx)
|
|
47
58
|
|
|
@@ -1,8 +1,17 @@
|
|
|
1
|
-
"""PPTX handler: extract text
|
|
1
|
+
"""PPTX handler: render slides to PNG via PowerPoint COM, extract text and OLE embedded.
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
Individual image/shape extraction is intentionally omitted — slide screenshots
|
|
4
|
+
contain all visuals including overlay shapes (boxes, arrows, annotations) that
|
|
5
|
+
lose their spatial relationship when decomposed. Requires Windows + Microsoft
|
|
6
|
+
PowerPoint installed.
|
|
7
|
+
"""
|
|
4
8
|
|
|
5
|
-
|
|
9
|
+
import tempfile
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
|
|
12
|
+
from _common import ensure_packages
|
|
13
|
+
|
|
14
|
+
PACKAGES = {"pywin32": "win32com.client", "python-pptx": "pptx"}
|
|
6
15
|
|
|
7
16
|
|
|
8
17
|
def _emu_to_inches(emu):
|
|
@@ -15,37 +24,107 @@ def _pos(shape):
|
|
|
15
24
|
return f"(left={_emu_to_inches(shape.left)}\", top={_emu_to_inches(shape.top)}\")"
|
|
16
25
|
|
|
17
26
|
|
|
27
|
+
def _extract_shapes(shapes, text_parts):
|
|
28
|
+
for shape in shapes:
|
|
29
|
+
if shape.shape_type == 6: # MSO_SHAPE_TYPE.GROUP
|
|
30
|
+
_extract_shapes(shape.shapes, text_parts)
|
|
31
|
+
elif shape.has_table:
|
|
32
|
+
tbl = shape.table
|
|
33
|
+
text_parts.append(f"[TABLE] {_pos(shape)}")
|
|
34
|
+
for r_idx, row in enumerate(tbl.rows):
|
|
35
|
+
cells = [
|
|
36
|
+
cell.text.strip().replace("\\", "\\\\").replace("|", "\\|")
|
|
37
|
+
.replace("\r\n", "<br>").replace("\n", "<br>").replace("\r", "<br>")
|
|
38
|
+
for cell in row.cells
|
|
39
|
+
]
|
|
40
|
+
text_parts.append("| " + " | ".join(cells) + " |")
|
|
41
|
+
if r_idx == 0:
|
|
42
|
+
text_parts.append("|" + "|".join(["---"] * len(cells)) + "|")
|
|
43
|
+
elif hasattr(shape, "text") and shape.text.strip():
|
|
44
|
+
text = shape.text.strip().replace("\n", "\n ")
|
|
45
|
+
text_parts.append(f"[TXT] {_pos(shape)} {text}")
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _render_slides_via_com(file_path: str, tmp_dir: Path, slide_count: int,
                           width: int, height: int) -> list[bytes]:
    """Export every slide of a presentation to PNG through PowerPoint COM.

    Args:
        file_path: Path of the presentation; resolved to an absolute path
            before it is handed to PowerPoint.
        tmp_dir: Directory in which the temporary per-slide PNG files are
            written. Each file is deleted right after it has been read.
        slide_count: Number of slides to export; slides are addressed
            1..slide_count.
        width: Width passed to ``Slide.Export``.
        height: Height passed to ``Slide.Export``.

    Returns:
        The PNG bytes of each slide, in slide order.

    Any exception raised by COM or by the file operations propagates to the
    caller after the presentation has been closed, the application quit and
    COM uninitialised.
    """
    # Imported lazily so the module can be imported without pywin32 present.
    import win32com.client
    import pythoncom

    pythoncom.CoInitialize()
    try:
        app = win32com.client.DispatchEx("PowerPoint.Application")
        try:
            # Best effort: failure to set DisplayAlerts is deliberately ignored.
            try:
                app.DisplayAlerts = 0
            except Exception:
                pass
            abs_path = str(Path(file_path).resolve())
            prs = app.Presentations.Open(abs_path, ReadOnly=True, Untitled=False,
                                         WithWindow=False)
            try:
                results = []
                # Slide indices start at 1 here.
                for i in range(1, slide_count + 1):
                    tmp_path = tmp_dir / f"__tmp_slide_{i}.png"
                    prs.Slides(i).Export(str(tmp_path), "PNG", width, height)
                    results.append(tmp_path.read_bytes())
                    tmp_path.unlink()
                return results
            finally:
                prs.Close()
        finally:
            app.Quit()
    finally:
        pythoncom.CoUninitialize()
|
|
78
|
+
|
|
79
|
+
|
|
18
80
|
def extract(file_path):
|
|
19
81
|
ensure_packages(PACKAGES)
|
|
20
82
|
from pptx import Presentation
|
|
21
|
-
from pptx.enum.shapes import MSO_SHAPE_TYPE
|
|
22
83
|
|
|
23
84
|
prs = Presentation(file_path)
|
|
85
|
+
slide_count = len(prs.slides)
|
|
86
|
+
|
|
87
|
+
target_width = 1920
|
|
88
|
+
if prs.slide_width and prs.slide_height:
|
|
89
|
+
target_height = int(target_width * prs.slide_height / prs.slide_width)
|
|
90
|
+
else:
|
|
91
|
+
target_height = 1080
|
|
92
|
+
|
|
93
|
+
try:
|
|
94
|
+
with tempfile.TemporaryDirectory() as tmpdir:
|
|
95
|
+
slide_pngs = _render_slides_via_com(
|
|
96
|
+
file_path, Path(tmpdir), slide_count, target_width, target_height
|
|
97
|
+
)
|
|
98
|
+
except Exception as e:
|
|
99
|
+
raise RuntimeError(
|
|
100
|
+
f"PowerPoint COM rendering failed: {e}. "
|
|
101
|
+
"This extractor requires Windows with Microsoft PowerPoint installed."
|
|
102
|
+
) from e
|
|
103
|
+
|
|
24
104
|
text_parts = []
|
|
25
|
-
|
|
105
|
+
slide_images = []
|
|
26
106
|
embedded = []
|
|
27
|
-
img_idx = 0
|
|
28
107
|
emb_idx = 0
|
|
29
108
|
|
|
30
109
|
for slide_num, slide in enumerate(prs.slides, 1):
|
|
31
110
|
text_parts.append(f"[Slide {slide_num}]")
|
|
32
111
|
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
112
|
+
slide_images.append({
|
|
113
|
+
"filename": f"slide_{slide_num:03d}.png",
|
|
114
|
+
"data": slide_pngs[slide_num - 1],
|
|
115
|
+
})
|
|
116
|
+
text_parts.append(f"[SLIDE:{slide_num}]")
|
|
117
|
+
|
|
118
|
+
_extract_shapes(slide.shapes, text_parts)
|
|
119
|
+
|
|
120
|
+
# Speaker notes
|
|
121
|
+
if slide.has_notes_slide:
|
|
122
|
+
notes_frame = slide.notes_slide.notes_text_frame
|
|
123
|
+
notes_text = notes_frame.text.strip() if notes_frame else ""
|
|
124
|
+
if notes_text:
|
|
125
|
+
notes_text = notes_text.replace("\n", "\n ")
|
|
126
|
+
text_parts.append(f"[Notes] {notes_text}")
|
|
127
|
+
|
|
49
128
|
seen = set()
|
|
50
129
|
for rel in slide.part.rels.values():
|
|
51
130
|
reltype = rel.reltype or ""
|
|
@@ -69,7 +148,8 @@ def extract(file_path):
|
|
|
69
148
|
|
|
70
149
|
return {
|
|
71
150
|
"text": "\n".join(text_parts),
|
|
72
|
-
"images":
|
|
151
|
+
"images": [],
|
|
73
152
|
"embedded": embedded,
|
|
74
153
|
"metadata": {},
|
|
154
|
+
"slide_images": slide_images,
|
|
75
155
|
}
|
|
@@ -1,8 +1,36 @@
|
|
|
1
|
-
"""XLSB handler: extract cell data from binary Excel format.
|
|
1
|
+
"""XLSB handler: extract cell data and VBA macros from binary Excel format.
|
|
2
|
+
|
|
3
|
+
Output format matches the XLSX handler: per sheet, a markdown table with
|
|
4
|
+
Excel column letters as headers and the original row number in the first
|
|
5
|
+
column. VBA macros are extracted via oletools and appended as fenced code
|
|
6
|
+
blocks.
|
|
7
|
+
"""
|
|
2
8
|
|
|
3
9
|
from _common import ensure_packages
|
|
4
10
|
|
|
5
|
-
PACKAGES = {"pyxlsb": "pyxlsb"}
|
|
11
|
+
PACKAGES = {"pyxlsb": "pyxlsb", "oletools": "oletools"}
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _escape_md(v):
|
|
15
|
+
if v is None:
|
|
16
|
+
return ""
|
|
17
|
+
s = str(v).strip()
|
|
18
|
+
return (
|
|
19
|
+
s.replace("\\", "\\\\")
|
|
20
|
+
.replace("|", "\\|")
|
|
21
|
+
.replace("\r\n", "<br>")
|
|
22
|
+
.replace("\n", "<br>")
|
|
23
|
+
.replace("\r", "<br>")
|
|
24
|
+
)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _col_letter(n):
|
|
28
|
+
# 1-based column index → Excel letter (A, B, ..., Z, AA, AB, ...)
|
|
29
|
+
s = ""
|
|
30
|
+
while n > 0:
|
|
31
|
+
n, r = divmod(n - 1, 26)
|
|
32
|
+
s = chr(65 + r) + s
|
|
33
|
+
return s
|
|
6
34
|
|
|
7
35
|
|
|
8
36
|
def extract(file_path):
|
|
@@ -14,10 +42,71 @@ def extract(file_path):
|
|
|
14
42
|
with open_workbook(file_path) as wb:
|
|
15
43
|
for sheet_name in wb.sheets:
|
|
16
44
|
text_parts.append(f"[Sheet: {sheet_name}]")
|
|
45
|
+
text_parts.append("")
|
|
46
|
+
|
|
17
47
|
with wb.get_sheet(sheet_name) as sheet:
|
|
48
|
+
rows_data = []
|
|
49
|
+
max_col = 0
|
|
18
50
|
for row in sheet.rows():
|
|
19
|
-
|
|
20
|
-
|
|
51
|
+
if not row:
|
|
52
|
+
continue
|
|
53
|
+
row_num = row[0].r + 1 # pyxlsb is 0-based
|
|
54
|
+
cells = [_escape_md(cell.v) for cell in row]
|
|
55
|
+
if len(cells) > max_col:
|
|
56
|
+
max_col = len(cells)
|
|
57
|
+
rows_data.append((row_num, cells))
|
|
58
|
+
|
|
59
|
+
if not rows_data:
|
|
60
|
+
text_parts.append("(empty sheet)")
|
|
61
|
+
text_parts.append("")
|
|
62
|
+
continue
|
|
63
|
+
|
|
64
|
+
headers = ["Row"] + [_col_letter(c) for c in range(1, max_col + 1)]
|
|
65
|
+
text_parts.append("| " + " | ".join(headers) + " |")
|
|
66
|
+
text_parts.append("|" + "|".join(["---"] * len(headers)) + "|")
|
|
67
|
+
for row_num, cells in rows_data:
|
|
68
|
+
padded = list(cells) + [""] * (max_col - len(cells))
|
|
69
|
+
text_parts.append(
|
|
70
|
+
f"| {row_num} | " + " | ".join(padded[:max_col]) + " |"
|
|
71
|
+
)
|
|
72
|
+
text_parts.append("")
|
|
73
|
+
|
|
74
|
+
# --- VBA macro extraction ---
|
|
75
|
+
vba_parts = []
|
|
76
|
+
try:
|
|
77
|
+
from oletools.olevba import VBA_Parser
|
|
78
|
+
|
|
79
|
+
vba_parser = VBA_Parser(file_path)
|
|
80
|
+
if vba_parser.detect_vba_macros():
|
|
81
|
+
vba_parts.append("[VBA Macros]")
|
|
82
|
+
vba_parts.append("")
|
|
83
|
+
for filename, stream_path, vba_filename, vba_code in vba_parser.extract_macros():
|
|
84
|
+
vba_parts.append(f"### {vba_filename}")
|
|
85
|
+
vba_parts.append(f"<!-- stream: {stream_path} -->")
|
|
86
|
+
vba_parts.append("")
|
|
87
|
+
vba_parts.append("```vb")
|
|
88
|
+
vba_parts.append(vba_code)
|
|
89
|
+
vba_parts.append("```")
|
|
90
|
+
vba_parts.append("")
|
|
91
|
+
|
|
92
|
+
analysis = vba_parser.analyze_macros()
|
|
93
|
+
suspicious = [e for e in analysis if e[0] in ("AutoExec", "Suspicious", "IOC")]
|
|
94
|
+
if suspicious:
|
|
95
|
+
vba_parts.append("### Analysis")
|
|
96
|
+
vba_parts.append("")
|
|
97
|
+
vba_parts.append("| Type | Keyword | Description |")
|
|
98
|
+
vba_parts.append("|------|---------|-------------|")
|
|
99
|
+
for entry_type, keyword, description in suspicious:
|
|
100
|
+
vba_parts.append(f"| {entry_type} | `{keyword}` | {description} |")
|
|
101
|
+
vba_parts.append("")
|
|
102
|
+
|
|
103
|
+
vba_parser.close()
|
|
104
|
+
except Exception:
|
|
105
|
+
pass
|
|
106
|
+
|
|
107
|
+
if vba_parts:
|
|
108
|
+
text_parts.append("")
|
|
109
|
+
text_parts.extend(vba_parts)
|
|
21
110
|
|
|
22
111
|
return {
|
|
23
112
|
"text": "\n".join(text_parts),
|
|
@@ -1,4 +1,14 @@
|
|
|
1
|
-
"""XLSX handler: extract cell data, images, and embedded objects.
|
|
1
|
+
"""XLSX handler: extract cell data, images, and embedded objects.
|
|
2
|
+
|
|
3
|
+
Output format: per sheet, cell data is rendered as a markdown table whose
|
|
4
|
+
column headers are Excel column letters (A, B, C, ...) and whose first
|
|
5
|
+
column is the original Excel row number. When an image is anchored to a
|
|
6
|
+
row, the current table chunk is flushed, the [IMG:N] placeholder is
|
|
7
|
+
emitted, and a new table (re-rendering the header) resumes from the next
|
|
8
|
+
row. This preserves the spatial relationship between cell data and
|
|
9
|
+
images while keeping each chunk a valid markdown table that LLMs parse
|
|
10
|
+
natively.
|
|
11
|
+
"""
|
|
2
12
|
|
|
3
13
|
import zipfile
|
|
4
14
|
from _common import ensure_packages
|
|
@@ -6,9 +16,35 @@ from _common import ensure_packages
|
|
|
6
16
|
PACKAGES = {"openpyxl": "openpyxl"}
|
|
7
17
|
|
|
8
18
|
|
|
19
|
+
def _escape_md(v):
|
|
20
|
+
if v is None:
|
|
21
|
+
return ""
|
|
22
|
+
s = str(v).strip()
|
|
23
|
+
return (
|
|
24
|
+
s.replace("\\", "\\\\")
|
|
25
|
+
.replace("|", "\\|")
|
|
26
|
+
.replace("\r\n", "<br>")
|
|
27
|
+
.replace("\n", "<br>")
|
|
28
|
+
.replace("\r", "<br>")
|
|
29
|
+
)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _render_chunk(chunk_rows, max_col, get_col_letter):
|
|
33
|
+
if not chunk_rows:
|
|
34
|
+
return []
|
|
35
|
+
headers = ["Row"] + [get_col_letter(c) for c in range(1, max_col + 1)]
|
|
36
|
+
out = ["| " + " | ".join(headers) + " |",
|
|
37
|
+
"|" + "|".join(["---"] * len(headers)) + "|"]
|
|
38
|
+
for row_num, cells in chunk_rows:
|
|
39
|
+
padded = list(cells) + [""] * (max_col - len(cells))
|
|
40
|
+
out.append(f"| {row_num} | " + " | ".join(padded[:max_col]) + " |")
|
|
41
|
+
return out
|
|
42
|
+
|
|
43
|
+
|
|
9
44
|
def extract(file_path):
|
|
10
45
|
ensure_packages(PACKAGES)
|
|
11
46
|
from openpyxl import load_workbook
|
|
47
|
+
from openpyxl.utils import get_column_letter
|
|
12
48
|
from openpyxl.worksheet.worksheet import Worksheet
|
|
13
49
|
|
|
14
50
|
wb = load_workbook(file_path, data_only=True)
|
|
@@ -21,57 +57,83 @@ def extract(file_path):
|
|
|
21
57
|
for sheet_name in wb.sheetnames:
|
|
22
58
|
ws = wb[sheet_name]
|
|
23
59
|
text_parts.append(f"[Sheet: {sheet_name}]")
|
|
60
|
+
text_parts.append("")
|
|
24
61
|
|
|
25
62
|
if not isinstance(ws, Worksheet):
|
|
26
63
|
text_parts.append(f"({type(ws).__name__} — 데이터 없음)")
|
|
64
|
+
text_parts.append("")
|
|
27
65
|
continue
|
|
28
66
|
|
|
29
67
|
if ws.max_row is None or ws.max_row == 0:
|
|
30
68
|
text_parts.append("(empty sheet)")
|
|
69
|
+
text_parts.append("")
|
|
31
70
|
continue
|
|
32
71
|
|
|
33
|
-
#
|
|
34
|
-
|
|
35
|
-
|
|
72
|
+
# Merged cells annotation
|
|
73
|
+
merged = list(ws.merged_cells.ranges)
|
|
74
|
+
if merged:
|
|
75
|
+
text_parts.append(f"[Merged: {', '.join(str(r) for r in merged)}]")
|
|
76
|
+
text_parts.append("")
|
|
77
|
+
|
|
78
|
+
ws_images = getattr(ws, "_images", [])
|
|
79
|
+
row_img_markers = {}
|
|
36
80
|
for img in ws_images:
|
|
37
|
-
data_fn = getattr(img,
|
|
81
|
+
data_fn = getattr(img, "_data", None)
|
|
38
82
|
blob = data_fn() if callable(data_fn) else b""
|
|
39
|
-
if blob:
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
if
|
|
51
|
-
anchor_row
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
83
|
+
if not blob:
|
|
84
|
+
continue
|
|
85
|
+
img_idx += 1
|
|
86
|
+
anchor = getattr(img, "anchor", None)
|
|
87
|
+
anchor_row = None
|
|
88
|
+
anchor_col = None
|
|
89
|
+
if anchor:
|
|
90
|
+
_from = getattr(anchor, "_from", None)
|
|
91
|
+
if _from:
|
|
92
|
+
anchor_row = getattr(_from, "row", None)
|
|
93
|
+
anchor_col = getattr(_from, "col", None)
|
|
94
|
+
if anchor_row is not None:
|
|
95
|
+
anchor_row += 1
|
|
96
|
+
if anchor_col is not None:
|
|
97
|
+
anchor_col += 1
|
|
98
|
+
if anchor_row is None:
|
|
99
|
+
anchor_row = ws.max_row or 1
|
|
100
|
+
cell_ref = ""
|
|
101
|
+
if anchor_col is not None:
|
|
102
|
+
cell_ref = f" anchor:{get_column_letter(anchor_col)}{anchor_row}"
|
|
103
|
+
else:
|
|
104
|
+
cell_ref = f" anchor:row {anchor_row}"
|
|
105
|
+
images.append({
|
|
106
|
+
"data": blob,
|
|
107
|
+
"ext": "png",
|
|
108
|
+
"context": f"sheet '{sheet_name}'{cell_ref}",
|
|
109
|
+
})
|
|
110
|
+
row_img_markers.setdefault(anchor_row, []).append(img_idx)
|
|
111
|
+
|
|
112
|
+
max_col = ws.max_column or 1
|
|
113
|
+
chunk = []
|
|
114
|
+
|
|
59
115
|
for row in ws.iter_rows(values_only=False):
|
|
60
|
-
cells = []
|
|
61
|
-
for cell in row:
|
|
62
|
-
val = cell.value
|
|
63
|
-
cells.append(str(val).strip() if val is not None else "")
|
|
64
116
|
row_num = row[0].row
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
117
|
+
cells = [_escape_md(c.value) for c in row]
|
|
118
|
+
chunk.append((row_num, cells))
|
|
119
|
+
|
|
120
|
+
if row_num in row_img_markers:
|
|
121
|
+
text_parts.extend(_render_chunk(chunk, max_col, get_column_letter))
|
|
122
|
+
text_parts.append("")
|
|
123
|
+
for idx in row_img_markers[row_num]:
|
|
124
|
+
text_parts.append(f"[IMG:{idx}]")
|
|
125
|
+
text_parts.append("")
|
|
126
|
+
chunk = []
|
|
127
|
+
|
|
128
|
+
if chunk:
|
|
129
|
+
text_parts.extend(_render_chunk(chunk, max_col, get_column_letter))
|
|
130
|
+
text_parts.append("")
|
|
68
131
|
|
|
69
|
-
# Embedded objects from XLSX ZIP
|
|
70
132
|
try:
|
|
71
|
-
with zipfile.ZipFile(file_path,
|
|
133
|
+
with zipfile.ZipFile(file_path, "r") as zf:
|
|
72
134
|
for name in zf.namelist():
|
|
73
|
-
if
|
|
74
|
-
filename = name.split(
|
|
135
|
+
if "embeddings/" in name.lower():
|
|
136
|
+
filename = name.split("/")[-1]
|
|
75
137
|
data = zf.read(name)
|
|
76
138
|
emb_idx += 1
|
|
77
139
|
embedded.append({"filename": filename, "data": data})
|
|
@@ -44,6 +44,13 @@ def extract_recursive(file_path: Path, out_dir: Path):
|
|
|
44
44
|
"context": img.get("context", ""),
|
|
45
45
|
})
|
|
46
46
|
|
|
47
|
+
# Save pre-named slide images (PPTX screenshots)
|
|
48
|
+
saved_slides = []
|
|
49
|
+
for s in result.get("slide_images", []):
|
|
50
|
+
slide_path = out_dir / s["filename"]
|
|
51
|
+
slide_path.write_bytes(s["data"])
|
|
52
|
+
saved_slides.append({"filename": s["filename"], "size": len(s["data"])})
|
|
53
|
+
|
|
47
54
|
# Save embedded/attached files and recurse
|
|
48
55
|
prefix = "attachment" if result.get("metadata", {}).get("email_headers") else "embedded"
|
|
49
56
|
saved_embedded = []
|
|
@@ -68,12 +75,13 @@ def extract_recursive(file_path: Path, out_dir: Path):
|
|
|
68
75
|
saved_embedded.append(entry)
|
|
69
76
|
|
|
70
77
|
# Generate {stem}.md index in parent of out_dir
|
|
71
|
-
_generate_index_md(out_dir, file_path, result, saved_images, saved_embedded)
|
|
78
|
+
_generate_index_md(out_dir, file_path, result, saved_images, saved_embedded, saved_slides)
|
|
72
79
|
|
|
73
80
|
|
|
74
81
|
|
|
75
82
|
def _generate_index_md(out_dir: Path, file_path: Path, result: dict,
|
|
76
|
-
saved_images: list, saved_embedded: list
|
|
83
|
+
saved_images: list, saved_embedded: list,
|
|
84
|
+
saved_slides: list | None = None):
|
|
77
85
|
"""Generate {stem}.md in parent of out_dir, summarizing extraction results.
|
|
78
86
|
|
|
79
87
|
Images and embedded files are placed inline via [IMG:N]/[EMB:N] placeholders
|
|
@@ -120,7 +128,8 @@ def _generate_index_md(out_dir: Path, file_path: Path, result: dict,
|
|
|
120
128
|
referenced_imgs.add(idx)
|
|
121
129
|
if 1 <= idx <= len(saved_images):
|
|
122
130
|
img = saved_images[idx - 1]
|
|
123
|
-
|
|
131
|
+
alt = img.get('context', '') or img['filename']
|
|
132
|
+
return f""
|
|
124
133
|
return m.group(0)
|
|
125
134
|
|
|
126
135
|
def replace_emb(m):
|
|
@@ -136,8 +145,18 @@ def _generate_index_md(out_dir: Path, file_path: Path, result: dict,
|
|
|
136
145
|
return f"> embedded: [{name}]({rel_prefix}/{name})"
|
|
137
146
|
return m.group(0)
|
|
138
147
|
|
|
148
|
+
slides_list = saved_slides or []
|
|
149
|
+
|
|
150
|
+
def replace_slide(m):
|
|
151
|
+
idx = int(m.group(1))
|
|
152
|
+
if 1 <= idx <= len(slides_list):
|
|
153
|
+
fname = slides_list[idx - 1]["filename"]
|
|
154
|
+
return f""
|
|
155
|
+
return m.group(0)
|
|
156
|
+
|
|
139
157
|
text = re.sub(r'\[IMG:(\d+)\]', replace_img, text)
|
|
140
158
|
text = re.sub(r'\[EMB:(\d+)\]', replace_emb, text)
|
|
159
|
+
text = re.sub(r'\[SLIDE:(\d+)\]', replace_slide, text)
|
|
141
160
|
|
|
142
161
|
if len(text) > 10000:
|
|
143
162
|
body_path = out_dir / "body.txt"
|