@simplysm/sd-claude 14.0.46 → 14.0.48

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (127)
  1. package/{claude/references/sd-simplysm14/sd-claude/usage.md → README.md} +2 -2
  2. package/claude/rules/sd-claude-rules.md +27 -9
  3. package/claude/rules/sd-options.md +11 -6
  4. package/claude/sd-subagent-start.sh +6 -0
  5. package/claude/settings.json +1 -12
  6. package/claude/skills/sd-check/SKILL.md +18 -9
  7. package/claude/skills/sd-claude-docs/SKILL.md +29 -58
  8. package/claude/skills/sd-claude-docs/references/package-claudemd.md +12 -0
  9. package/claude/skills/sd-claude-docs/references/package-doc-gen.md +22 -12
  10. package/claude/skills/sd-debug/SKILL.md +5 -3
  11. package/claude/skills/sd-deliverable/SKILL.md +0 -1
  12. package/claude/skills/sd-dev/SKILL.md +14 -9
  13. package/claude/skills/sd-doc-extract/SKILL.md +7 -9
  14. package/claude/skills/sd-doc-extract/_common.py +8 -1
  15. package/claude/skills/sd-doc-extract/_extract_docx.py +74 -34
  16. package/claude/skills/sd-doc-extract/_extract_pdf.py +12 -1
  17. package/claude/skills/sd-doc-extract/_extract_pptx.py +103 -23
  18. package/claude/skills/sd-doc-extract/_extract_xlsb.py +93 -4
  19. package/claude/skills/sd-doc-extract/_extract_xlsx.py +98 -36
  20. package/claude/skills/sd-doc-extract/extract.py +22 -3
  21. package/claude/skills/sd-inner-clarify/SKILL.md +78 -0
  22. package/claude/skills/sd-inner-debug/SKILL.md +1 -1
  23. package/claude/skills/sd-inner-review/SKILL.md +13 -0
  24. package/claude/skills/sd-plan/SKILL.md +50 -17
  25. package/claude/skills/sd-prompt/SKILL.md +180 -178
  26. package/claude/skills/sd-prompt/references/eval-runner.md +5 -31
  27. package/claude/skills/sd-prompt/references/sd-eval-env-template.md +23 -0
  28. package/claude/skills/sd-refactor/SKILL.md +2 -2
  29. package/claude/skills/sd-tdd/SKILL.md +46 -10
  30. package/claude/skills/sd-use/SKILL.md +84 -80
  31. package/claude/skills/sd-wbs/SKILL.md +85 -27
  32. package/{claude/references/sd-simplysm14/sd-claude/docs → docs}/assets.md +2 -3
  33. package/{claude/references/sd-simplysm14/sd-claude/docs → docs}/hooks.md +7 -6
  34. package/{claude/references/sd-simplysm14/sd-claude/docs → docs}/scripts.md +1 -9
  35. package/package.json +3 -2
  36. package/scripts/sync.mjs +4 -2
  37. package/claude/references/sd-simplysm14/angular/docs/bootstrap.md +0 -48
  38. package/claude/references/sd-simplysm14/angular/docs/directives.md +0 -236
  39. package/claude/references/sd-simplysm14/angular/docs/features.md +0 -379
  40. package/claude/references/sd-simplysm14/angular/docs/pipes.md +0 -32
  41. package/claude/references/sd-simplysm14/angular/docs/plugins.md +0 -37
  42. package/claude/references/sd-simplysm14/angular/docs/provider-types.md +0 -283
  43. package/claude/references/sd-simplysm14/angular/docs/providers.md +0 -370
  44. package/claude/references/sd-simplysm14/angular/docs/styling.md +0 -222
  45. package/claude/references/sd-simplysm14/angular/docs/type-utilities.md +0 -250
  46. package/claude/references/sd-simplysm14/angular/docs/ui-data.md +0 -275
  47. package/claude/references/sd-simplysm14/angular/docs/ui-form.md +0 -490
  48. package/claude/references/sd-simplysm14/angular/docs/ui-layout.md +0 -140
  49. package/claude/references/sd-simplysm14/angular/docs/ui-navigation.md +0 -241
  50. package/claude/references/sd-simplysm14/angular/docs/ui-overlay.md +0 -157
  51. package/claude/references/sd-simplysm14/angular/docs/ui-visual.md +0 -127
  52. package/claude/references/sd-simplysm14/angular/docs/utils.md +0 -295
  53. package/claude/references/sd-simplysm14/angular/usage.md +0 -489
  54. package/claude/references/sd-simplysm14/capacitor-plugin-auto-update/usage.md +0 -182
  55. package/claude/references/sd-simplysm14/capacitor-plugin-file-system/docs/file-operations.md +0 -154
  56. package/claude/references/sd-simplysm14/capacitor-plugin-file-system/docs/permissions.md +0 -84
  57. package/claude/references/sd-simplysm14/capacitor-plugin-file-system/docs/storage-paths.md +0 -107
  58. package/claude/references/sd-simplysm14/capacitor-plugin-file-system/docs/types.md +0 -83
  59. package/claude/references/sd-simplysm14/capacitor-plugin-file-system/usage.md +0 -133
  60. package/claude/references/sd-simplysm14/capacitor-plugin-intent/usage.md +0 -203
  61. package/claude/references/sd-simplysm14/capacitor-plugin-usb-storage/usage.md +0 -258
  62. package/claude/references/sd-simplysm14/core-browser/usage.md +0 -306
  63. package/claude/references/sd-simplysm14/core-common/docs/errors.md +0 -82
  64. package/claude/references/sd-simplysm14/core-common/docs/extensions.md +0 -167
  65. package/claude/references/sd-simplysm14/core-common/docs/features.md +0 -136
  66. package/claude/references/sd-simplysm14/core-common/docs/types.md +0 -245
  67. package/claude/references/sd-simplysm14/core-common/docs/utils.md +0 -591
  68. package/claude/references/sd-simplysm14/core-common/usage.md +0 -255
  69. package/claude/references/sd-simplysm14/core-node/docs/child-process.md +0 -182
  70. package/claude/references/sd-simplysm14/core-node/docs/features.md +0 -214
  71. package/claude/references/sd-simplysm14/core-node/docs/file-system.md +0 -509
  72. package/claude/references/sd-simplysm14/core-node/docs/file-watching.md +0 -139
  73. package/claude/references/sd-simplysm14/core-node/docs/logging.md +0 -180
  74. package/claude/references/sd-simplysm14/core-node/docs/path.md +0 -176
  75. package/claude/references/sd-simplysm14/core-node/docs/utilities-cpx.md +0 -194
  76. package/claude/references/sd-simplysm14/core-node/docs/utilities-fsx.md +0 -469
  77. package/claude/references/sd-simplysm14/core-node/docs/utilities-pathx.md +0 -151
  78. package/claude/references/sd-simplysm14/core-node/docs/worker-threads.md +0 -334
  79. package/claude/references/sd-simplysm14/core-node/docs/worker.md +0 -205
  80. package/claude/references/sd-simplysm14/core-node/usage.md +0 -259
  81. package/claude/references/sd-simplysm14/excel/docs/core-classes.md +0 -443
  82. package/claude/references/sd-simplysm14/excel/docs/types.md +0 -455
  83. package/claude/references/sd-simplysm14/excel/docs/utilities.md +0 -194
  84. package/claude/references/sd-simplysm14/excel/docs/wrapper.md +0 -73
  85. package/claude/references/sd-simplysm14/excel/usage.md +0 -134
  86. package/claude/references/sd-simplysm14/lint/usage.md +0 -130
  87. package/claude/references/sd-simplysm14/orm-common/docs/core.md +0 -188
  88. package/claude/references/sd-simplysm14/orm-common/docs/expression.md +0 -190
  89. package/claude/references/sd-simplysm14/orm-common/docs/models.md +0 -17
  90. package/claude/references/sd-simplysm14/orm-common/docs/query-builder.md +0 -97
  91. package/claude/references/sd-simplysm14/orm-common/docs/queryable-executable.md +0 -250
  92. package/claude/references/sd-simplysm14/orm-common/docs/schema-builders.md +0 -364
  93. package/claude/references/sd-simplysm14/orm-common/docs/types.md +0 -522
  94. package/claude/references/sd-simplysm14/orm-common/usage.md +0 -229
  95. package/claude/references/sd-simplysm14/orm-node/docs/connections.md +0 -137
  96. package/claude/references/sd-simplysm14/orm-node/docs/core.md +0 -131
  97. package/claude/references/sd-simplysm14/orm-node/docs/types.md +0 -173
  98. package/claude/references/sd-simplysm14/orm-node/usage.md +0 -143
  99. package/claude/references/sd-simplysm14/sd-cli/usage.md +0 -782
  100. package/claude/references/sd-simplysm14/service-client/docs/features.md +0 -217
  101. package/claude/references/sd-simplysm14/service-client/docs/main.md +0 -148
  102. package/claude/references/sd-simplysm14/service-client/docs/protocol.md +0 -53
  103. package/claude/references/sd-simplysm14/service-client/docs/transport.md +0 -131
  104. package/claude/references/sd-simplysm14/service-client/docs/types.md +0 -129
  105. package/claude/references/sd-simplysm14/service-client/usage.md +0 -202
  106. package/claude/references/sd-simplysm14/service-common/docs/app-structure.md +0 -175
  107. package/claude/references/sd-simplysm14/service-common/docs/events.md +0 -64
  108. package/claude/references/sd-simplysm14/service-common/docs/protocol.md +0 -331
  109. package/claude/references/sd-simplysm14/service-common/docs/service-types.md +0 -90
  110. package/claude/references/sd-simplysm14/service-common/docs/types.md +0 -19
  111. package/claude/references/sd-simplysm14/service-common/usage.md +0 -154
  112. package/claude/references/sd-simplysm14/service-server/docs/auth.md +0 -64
  113. package/claude/references/sd-simplysm14/service-server/docs/core.md +0 -174
  114. package/claude/references/sd-simplysm14/service-server/docs/legacy.md +0 -25
  115. package/claude/references/sd-simplysm14/service-server/docs/main.md +0 -88
  116. package/claude/references/sd-simplysm14/service-server/docs/protocol.md +0 -33
  117. package/claude/references/sd-simplysm14/service-server/docs/services.md +0 -94
  118. package/claude/references/sd-simplysm14/service-server/docs/transport-http.md +0 -93
  119. package/claude/references/sd-simplysm14/service-server/docs/transport-socket.md +0 -119
  120. package/claude/references/sd-simplysm14/service-server/docs/types.md +0 -36
  121. package/claude/references/sd-simplysm14/service-server/docs/utils.md +0 -22
  122. package/claude/references/sd-simplysm14/service-server/usage.md +0 -171
  123. package/claude/references/sd-simplysm14/storage/usage.md +0 -301
  124. package/claude/references/sd-simplysm14.md +0 -35
  125. package/claude/rules/sd-clarify.md +0 -23
  126. package/claude/sd-session-start.sh +0 -10
  127. package/{claude/references/sd-simplysm14/sd-claude/docs → docs}/cli.md +0 -0
@@ -65,7 +65,14 @@ def ext_from_content_type(content_type: str) -> str:
65
65
  def normalize_cell(text) -> str:
66
66
  if text is None:
67
67
  return ""
68
- return str(text).strip().replace("\n", " ")
68
+ return (
69
+ str(text).strip()
70
+ .replace("\\", "\\\\")
71
+ .replace("|", "\\|")
72
+ .replace("\r\n", "<br>")
73
+ .replace("\n", "<br>")
74
+ .replace("\r", "<br>")
75
+ )
69
76
 
70
77
 
71
78
  def parse_heading_level(style_name: str) -> int | None:
@@ -9,6 +9,8 @@ def extract(file_path):
9
9
  ensure_packages(PACKAGES)
10
10
  from docx import Document
11
11
  from docx.oxml.ns import qn
12
+ from docx.table import Table as DocxTable
13
+ from docx.text.paragraph import Paragraph
12
14
 
13
15
  doc = Document(file_path)
14
16
  text_parts = []
@@ -17,47 +19,85 @@ def extract(file_path):
17
19
  img_idx = 0
18
20
  emb_idx = 0
19
21
 
20
- for para in doc.paragraphs:
21
- para_img_markers = []
22
+ def _extract_drawing(drawing):
23
+ nonlocal img_idx
24
+ blip = drawing.find(f".//{qn('a:blip')}")
25
+ if blip is None:
26
+ return None
27
+ embed_id = blip.get(qn("r:embed"))
28
+ if not embed_id:
29
+ return None
30
+ rel = doc.part.rels.get(embed_id)
31
+ if not rel or not hasattr(rel, 'target_part'):
32
+ return None
33
+ ext = ext_from_content_type(rel.target_part.content_type)
34
+ img_idx += 1
35
+ doc_pr = drawing.find(f".//{qn('wp:docPr')}")
36
+ alt = ""
37
+ if doc_pr is not None:
38
+ alt = doc_pr.get("descr", "") or doc_pr.get("title", "")
39
+ images.append({
40
+ "data": rel.target_part.blob,
41
+ "ext": ext,
42
+ "context": alt or "paragraph image",
43
+ })
44
+ return img_idx
45
+
46
+ def _process_paragraph(element):
47
+ para = Paragraph(element, doc)
48
+ style = para.style.name if para.style else ""
49
+ prefix = ""
50
+ if "Heading" in style:
51
+ level = parse_heading_level(style)
52
+ prefix = "#" * (level or 2) + " "
53
+
54
+ parts = []
22
55
  for run in para.runs:
56
+ if run.text:
57
+ parts.append(run.text)
23
58
  drawings = (run._element.findall(f".//{qn('wp:inline')}") +
24
59
  run._element.findall(f".//{qn('wp:anchor')}"))
25
- for drawing in drawings:
26
- blip = drawing.find(f".//{qn('a:blip')}")
27
- if blip is not None:
28
- embed_id = blip.get(qn("r:embed"))
29
- if embed_id:
30
- rel = doc.part.rels.get(embed_id)
31
- if rel and hasattr(rel, 'target_part'):
32
- ext = ext_from_content_type(rel.target_part.content_type)
33
- img_idx += 1
34
- images.append({
35
- "data": rel.target_part.blob,
36
- "ext": ext,
37
- "context": "paragraph image",
38
- })
39
- para_img_markers.append(f"[IMG:{img_idx}]")
40
-
41
- text = para.text.strip()
42
- if text:
43
- style = para.style.name if para.style else ""
44
- prefix = ""
45
- if "Heading" in style:
46
- level = parse_heading_level(style)
47
- if level is not None:
48
- prefix = "#" * level + " "
49
- else:
50
- prefix = "## "
51
- text_parts.append(f"{prefix}{text}")
60
+ for d in drawings:
61
+ idx = _extract_drawing(d)
62
+ if idx is not None:
63
+ parts.append(f"[IMG:{idx}]")
52
64
 
53
- for marker in para_img_markers:
54
- text_parts.append(marker)
65
+ line = "".join(parts).strip()
66
+ if line:
67
+ text_parts.append(f"{prefix}{line}")
55
68
 
56
- for t_idx, table in enumerate(doc.tables):
57
- text_parts.append(f"\n### Table {t_idx + 1}\n")
58
- for row in table.rows:
69
+ def _process_table(element):
70
+ table = DocxTable(element, doc)
71
+ rows = list(table.rows)
72
+ if not rows:
73
+ return
74
+ text_parts.append("")
75
+ for r_idx, row in enumerate(rows):
59
76
  cells = [normalize_cell(cell.text) for cell in row.cells]
60
77
  text_parts.append("| " + " | ".join(cells) + " |")
78
+ if r_idx == 0:
79
+ text_parts.append("|" + "|".join(["---"] * len(cells)) + "|")
80
+ text_parts.append("")
81
+
82
+ # Iterate body elements in document order (paragraphs and tables interleaved)
83
+ for child in doc.element.body:
84
+ tag = child.tag.split('}')[-1] if '}' in child.tag else child.tag
85
+ if tag == 'p':
86
+ _process_paragraph(child)
87
+ elif tag == 'tbl':
88
+ _process_table(child)
89
+
90
+ # Headers and footers
91
+ for sec_idx, section in enumerate(doc.sections):
92
+ h_parts = [p.text.strip() for p in section.header.paragraphs if p.text.strip()]
93
+ f_parts = [p.text.strip() for p in section.footer.paragraphs if p.text.strip()]
94
+ if h_parts or f_parts:
95
+ text_parts.append("")
96
+ text_parts.append(f"[Header/Footer — Section {sec_idx + 1}]")
97
+ if h_parts:
98
+ text_parts.append(f"Header: {' | '.join(h_parts)}")
99
+ if f_parts:
100
+ text_parts.append(f"Footer: {' | '.join(f_parts)}")
61
101
 
62
102
  # OLE embedded objects
63
103
  seen = set()
@@ -37,11 +37,22 @@ def extract(file_path):
37
37
  if w <= 4 or h <= 4:
38
38
  continue
39
39
 
40
+ # Get image position on page
41
+ try:
42
+ rects = page.get_image_rects(xref)
43
+ if rects:
44
+ r = rects[0]
45
+ bbox_str = f" bbox:({r.x0:.0f},{r.y0:.0f},{r.x1:.0f},{r.y1:.0f})"
46
+ else:
47
+ bbox_str = ""
48
+ except Exception:
49
+ bbox_str = ""
50
+
40
51
  img_idx += 1
41
52
  images.append({
42
53
  "data": data,
43
54
  "ext": ext,
44
- "context": f"Page {page_num}",
55
+ "context": f"Page {page_num}{bbox_str}",
45
56
  })
46
57
  page_img_indices[page_num].append(img_idx)
47
58
 
@@ -1,8 +1,17 @@
1
- """PPTX handler: extract text, images, and OLE embedded objects."""
1
+ """PPTX handler: render slides to PNG via PowerPoint COM, extract text and OLE embedded.
2
2
 
3
- from _common import ensure_packages, ext_from_content_type
3
+ Individual image/shape extraction is intentionally omitted — slide screenshots
4
+ contain all visuals including overlay shapes (boxes, arrows, annotations) that
5
+ lose their spatial relationship when decomposed. Requires Windows + Microsoft
6
+ PowerPoint installed.
7
+ """
4
8
 
5
- PACKAGES = {"python-pptx": "pptx"}
9
+ import tempfile
10
+ from pathlib import Path
11
+
12
+ from _common import ensure_packages
13
+
14
+ PACKAGES = {"pywin32": "win32com.client", "python-pptx": "pptx"}
6
15
 
7
16
 
8
17
  def _emu_to_inches(emu):
@@ -15,37 +24,107 @@ def _pos(shape):
15
24
  return f"(left={_emu_to_inches(shape.left)}\", top={_emu_to_inches(shape.top)}\")"
16
25
 
17
26
 
27
+ def _extract_shapes(shapes, text_parts):
28
+ for shape in shapes:
29
+ if shape.shape_type == 6: # MSO_SHAPE_TYPE.GROUP
30
+ _extract_shapes(shape.shapes, text_parts)
31
+ elif shape.has_table:
32
+ tbl = shape.table
33
+ text_parts.append(f"[TABLE] {_pos(shape)}")
34
+ for r_idx, row in enumerate(tbl.rows):
35
+ cells = [
36
+ cell.text.strip().replace("\\", "\\\\").replace("|", "\\|")
37
+ .replace("\r\n", "<br>").replace("\n", "<br>").replace("\r", "<br>")
38
+ for cell in row.cells
39
+ ]
40
+ text_parts.append("| " + " | ".join(cells) + " |")
41
+ if r_idx == 0:
42
+ text_parts.append("|" + "|".join(["---"] * len(cells)) + "|")
43
+ elif hasattr(shape, "text") and shape.text.strip():
44
+ text = shape.text.strip().replace("\n", "\n ")
45
+ text_parts.append(f"[TXT] {_pos(shape)} {text}")
46
+
47
+
48
+ def _render_slides_via_com(file_path: str, tmp_dir: Path, slide_count: int,
49
+ width: int, height: int) -> list[bytes]:
50
+ import win32com.client
51
+ import pythoncom
52
+
53
+ pythoncom.CoInitialize()
54
+ try:
55
+ app = win32com.client.DispatchEx("PowerPoint.Application")
56
+ try:
57
+ try:
58
+ app.DisplayAlerts = 0
59
+ except Exception:
60
+ pass
61
+ abs_path = str(Path(file_path).resolve())
62
+ prs = app.Presentations.Open(abs_path, ReadOnly=True, Untitled=False,
63
+ WithWindow=False)
64
+ try:
65
+ results = []
66
+ for i in range(1, slide_count + 1):
67
+ tmp_path = tmp_dir / f"__tmp_slide_{i}.png"
68
+ prs.Slides(i).Export(str(tmp_path), "PNG", width, height)
69
+ results.append(tmp_path.read_bytes())
70
+ tmp_path.unlink()
71
+ return results
72
+ finally:
73
+ prs.Close()
74
+ finally:
75
+ app.Quit()
76
+ finally:
77
+ pythoncom.CoUninitialize()
78
+
79
+
18
80
  def extract(file_path):
19
81
  ensure_packages(PACKAGES)
20
82
  from pptx import Presentation
21
- from pptx.enum.shapes import MSO_SHAPE_TYPE
22
83
 
23
84
  prs = Presentation(file_path)
85
+ slide_count = len(prs.slides)
86
+
87
+ target_width = 1920
88
+ if prs.slide_width and prs.slide_height:
89
+ target_height = int(target_width * prs.slide_height / prs.slide_width)
90
+ else:
91
+ target_height = 1080
92
+
93
+ try:
94
+ with tempfile.TemporaryDirectory() as tmpdir:
95
+ slide_pngs = _render_slides_via_com(
96
+ file_path, Path(tmpdir), slide_count, target_width, target_height
97
+ )
98
+ except Exception as e:
99
+ raise RuntimeError(
100
+ f"PowerPoint COM rendering failed: {e}. "
101
+ "This extractor requires Windows with Microsoft PowerPoint installed."
102
+ ) from e
103
+
24
104
  text_parts = []
25
- images = []
105
+ slide_images = []
26
106
  embedded = []
27
- img_idx = 0
28
107
  emb_idx = 0
29
108
 
30
109
  for slide_num, slide in enumerate(prs.slides, 1):
31
110
  text_parts.append(f"[Slide {slide_num}]")
32
111
 
33
- for shape in slide.shapes:
34
- if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
35
- ext = ext_from_content_type(shape.image.content_type)
36
- img_idx += 1
37
- images.append({
38
- "data": shape.image.blob,
39
- "ext": ext,
40
- "context": f"Slide {slide_num} {_pos(shape)}",
41
- })
42
- text_parts.append(f"[IMG:{img_idx}]")
43
-
44
- if hasattr(shape, "text") and shape.text.strip():
45
- text = shape.text.strip().replace("\n", "\n ")
46
- text_parts.append(f"[TXT] {_pos(shape)} {text}")
47
-
48
- # OLE embedded objects from slide relationships
112
+ slide_images.append({
113
+ "filename": f"slide_{slide_num:03d}.png",
114
+ "data": slide_pngs[slide_num - 1],
115
+ })
116
+ text_parts.append(f"[SLIDE:{slide_num}]")
117
+
118
+ _extract_shapes(slide.shapes, text_parts)
119
+
120
+ # Speaker notes
121
+ if slide.has_notes_slide:
122
+ notes_frame = slide.notes_slide.notes_text_frame
123
+ notes_text = notes_frame.text.strip() if notes_frame else ""
124
+ if notes_text:
125
+ notes_text = notes_text.replace("\n", "\n ")
126
+ text_parts.append(f"[Notes] {notes_text}")
127
+
49
128
  seen = set()
50
129
  for rel in slide.part.rels.values():
51
130
  reltype = rel.reltype or ""
@@ -69,7 +148,8 @@ def extract(file_path):
69
148
 
70
149
  return {
71
150
  "text": "\n".join(text_parts),
72
- "images": images,
151
+ "images": [],
73
152
  "embedded": embedded,
74
153
  "metadata": {},
154
+ "slide_images": slide_images,
75
155
  }
@@ -1,8 +1,36 @@
1
- """XLSB handler: extract cell data from binary Excel format."""
1
+ """XLSB handler: extract cell data and VBA macros from binary Excel format.
2
+
3
+ Output format matches the XLSX handler: per sheet, a markdown table with
4
+ Excel column letters as headers and the original row number in the first
5
+ column. VBA macros are extracted via oletools and appended as fenced code
6
+ blocks.
7
+ """
2
8
 
3
9
  from _common import ensure_packages
4
10
 
5
- PACKAGES = {"pyxlsb": "pyxlsb"}
11
+ PACKAGES = {"pyxlsb": "pyxlsb", "oletools": "oletools"}
12
+
13
+
14
+ def _escape_md(v):
15
+ if v is None:
16
+ return ""
17
+ s = str(v).strip()
18
+ return (
19
+ s.replace("\\", "\\\\")
20
+ .replace("|", "\\|")
21
+ .replace("\r\n", "<br>")
22
+ .replace("\n", "<br>")
23
+ .replace("\r", "<br>")
24
+ )
25
+
26
+
27
+ def _col_letter(n):
28
+ # 1-based column index → Excel letter (A, B, ..., Z, AA, AB, ...)
29
+ s = ""
30
+ while n > 0:
31
+ n, r = divmod(n - 1, 26)
32
+ s = chr(65 + r) + s
33
+ return s
6
34
 
7
35
 
8
36
  def extract(file_path):
@@ -14,10 +42,71 @@ def extract(file_path):
14
42
  with open_workbook(file_path) as wb:
15
43
  for sheet_name in wb.sheets:
16
44
  text_parts.append(f"[Sheet: {sheet_name}]")
45
+ text_parts.append("")
46
+
17
47
  with wb.get_sheet(sheet_name) as sheet:
48
+ rows_data = []
49
+ max_col = 0
18
50
  for row in sheet.rows():
19
- cells = [str(cell.v) if cell.v is not None else "" for cell in row]
20
- text_parts.append(" | ".join(cells))
51
+ if not row:
52
+ continue
53
+ row_num = row[0].r + 1 # pyxlsb is 0-based
54
+ cells = [_escape_md(cell.v) for cell in row]
55
+ if len(cells) > max_col:
56
+ max_col = len(cells)
57
+ rows_data.append((row_num, cells))
58
+
59
+ if not rows_data:
60
+ text_parts.append("(empty sheet)")
61
+ text_parts.append("")
62
+ continue
63
+
64
+ headers = ["Row"] + [_col_letter(c) for c in range(1, max_col + 1)]
65
+ text_parts.append("| " + " | ".join(headers) + " |")
66
+ text_parts.append("|" + "|".join(["---"] * len(headers)) + "|")
67
+ for row_num, cells in rows_data:
68
+ padded = list(cells) + [""] * (max_col - len(cells))
69
+ text_parts.append(
70
+ f"| {row_num} | " + " | ".join(padded[:max_col]) + " |"
71
+ )
72
+ text_parts.append("")
73
+
74
+ # --- VBA macro extraction ---
75
+ vba_parts = []
76
+ try:
77
+ from oletools.olevba import VBA_Parser
78
+
79
+ vba_parser = VBA_Parser(file_path)
80
+ if vba_parser.detect_vba_macros():
81
+ vba_parts.append("[VBA Macros]")
82
+ vba_parts.append("")
83
+ for filename, stream_path, vba_filename, vba_code in vba_parser.extract_macros():
84
+ vba_parts.append(f"### {vba_filename}")
85
+ vba_parts.append(f"<!-- stream: {stream_path} -->")
86
+ vba_parts.append("")
87
+ vba_parts.append("```vb")
88
+ vba_parts.append(vba_code)
89
+ vba_parts.append("```")
90
+ vba_parts.append("")
91
+
92
+ analysis = vba_parser.analyze_macros()
93
+ suspicious = [e for e in analysis if e[0] in ("AutoExec", "Suspicious", "IOC")]
94
+ if suspicious:
95
+ vba_parts.append("### Analysis")
96
+ vba_parts.append("")
97
+ vba_parts.append("| Type | Keyword | Description |")
98
+ vba_parts.append("|------|---------|-------------|")
99
+ for entry_type, keyword, description in suspicious:
100
+ vba_parts.append(f"| {entry_type} | `{keyword}` | {description} |")
101
+ vba_parts.append("")
102
+
103
+ vba_parser.close()
104
+ except Exception:
105
+ pass
106
+
107
+ if vba_parts:
108
+ text_parts.append("")
109
+ text_parts.extend(vba_parts)
21
110
 
22
111
  return {
23
112
  "text": "\n".join(text_parts),
@@ -1,4 +1,14 @@
1
- """XLSX handler: extract cell data, images, and embedded objects."""
1
+ """XLSX handler: extract cell data, images, and embedded objects.
2
+
3
+ Output format: per sheet, cell data is rendered as a markdown table whose
4
+ column headers are Excel column letters (A, B, C, ...) and whose first
5
+ column is the original Excel row number. When an image is anchored to a
6
+ row, the current table chunk is flushed, the [IMG:N] placeholder is
7
+ emitted, and a new table (re-rendering the header) resumes from the next
8
+ row. This preserves the spatial relationship between cell data and
9
+ images while keeping each chunk a valid markdown table that LLMs parse
10
+ natively.
11
+ """
2
12
 
3
13
  import zipfile
4
14
  from _common import ensure_packages
@@ -6,9 +16,35 @@ from _common import ensure_packages
6
16
  PACKAGES = {"openpyxl": "openpyxl"}
7
17
 
8
18
 
19
+ def _escape_md(v):
20
+ if v is None:
21
+ return ""
22
+ s = str(v).strip()
23
+ return (
24
+ s.replace("\\", "\\\\")
25
+ .replace("|", "\\|")
26
+ .replace("\r\n", "<br>")
27
+ .replace("\n", "<br>")
28
+ .replace("\r", "<br>")
29
+ )
30
+
31
+
32
+ def _render_chunk(chunk_rows, max_col, get_col_letter):
33
+ if not chunk_rows:
34
+ return []
35
+ headers = ["Row"] + [get_col_letter(c) for c in range(1, max_col + 1)]
36
+ out = ["| " + " | ".join(headers) + " |",
37
+ "|" + "|".join(["---"] * len(headers)) + "|"]
38
+ for row_num, cells in chunk_rows:
39
+ padded = list(cells) + [""] * (max_col - len(cells))
40
+ out.append(f"| {row_num} | " + " | ".join(padded[:max_col]) + " |")
41
+ return out
42
+
43
+
9
44
  def extract(file_path):
10
45
  ensure_packages(PACKAGES)
11
46
  from openpyxl import load_workbook
47
+ from openpyxl.utils import get_column_letter
12
48
  from openpyxl.worksheet.worksheet import Worksheet
13
49
 
14
50
  wb = load_workbook(file_path, data_only=True)
@@ -21,57 +57,83 @@ def extract(file_path):
21
57
  for sheet_name in wb.sheetnames:
22
58
  ws = wb[sheet_name]
23
59
  text_parts.append(f"[Sheet: {sheet_name}]")
60
+ text_parts.append("")
24
61
 
25
62
  if not isinstance(ws, Worksheet):
26
63
  text_parts.append(f"({type(ws).__name__} — 데이터 없음)")
64
+ text_parts.append("")
27
65
  continue
28
66
 
29
67
  if ws.max_row is None or ws.max_row == 0:
30
68
  text_parts.append("(empty sheet)")
69
+ text_parts.append("")
31
70
  continue
32
71
 
33
- # Collect images for this sheet with anchor row info
34
- ws_images = getattr(ws, '_images', [])
35
- row_img_markers = {} # row_number -> list of img_idx
72
+ # Merged cells annotation
73
+ merged = list(ws.merged_cells.ranges)
74
+ if merged:
75
+ text_parts.append(f"[Merged: {', '.join(str(r) for r in merged)}]")
76
+ text_parts.append("")
77
+
78
+ ws_images = getattr(ws, "_images", [])
79
+ row_img_markers = {}
36
80
  for img in ws_images:
37
- data_fn = getattr(img, '_data', None)
81
+ data_fn = getattr(img, "_data", None)
38
82
  blob = data_fn() if callable(data_fn) else b""
39
- if blob:
40
- img_idx += 1
41
- images.append({
42
- "data": blob,
43
- "ext": "png",
44
- "context": f"sheet '{sheet_name}'",
45
- })
46
- anchor = getattr(img, 'anchor', None)
47
- anchor_row = None
48
- if anchor:
49
- _from = getattr(anchor, '_from', None)
50
- if _from:
51
- anchor_row = getattr(_from, 'row', None)
52
- if anchor_row is not None:
53
- anchor_row += 1 # openpyxl anchor is 0-based
54
- if anchor_row is None:
55
- anchor_row = ws.max_row or 1
56
- row_img_markers.setdefault(anchor_row, []).append(img_idx)
57
-
58
- # Output rows with inline image markers at anchor positions
83
+ if not blob:
84
+ continue
85
+ img_idx += 1
86
+ anchor = getattr(img, "anchor", None)
87
+ anchor_row = None
88
+ anchor_col = None
89
+ if anchor:
90
+ _from = getattr(anchor, "_from", None)
91
+ if _from:
92
+ anchor_row = getattr(_from, "row", None)
93
+ anchor_col = getattr(_from, "col", None)
94
+ if anchor_row is not None:
95
+ anchor_row += 1
96
+ if anchor_col is not None:
97
+ anchor_col += 1
98
+ if anchor_row is None:
99
+ anchor_row = ws.max_row or 1
100
+ cell_ref = ""
101
+ if anchor_col is not None:
102
+ cell_ref = f" anchor:{get_column_letter(anchor_col)}{anchor_row}"
103
+ else:
104
+ cell_ref = f" anchor:row {anchor_row}"
105
+ images.append({
106
+ "data": blob,
107
+ "ext": "png",
108
+ "context": f"sheet '{sheet_name}'{cell_ref}",
109
+ })
110
+ row_img_markers.setdefault(anchor_row, []).append(img_idx)
111
+
112
+ max_col = ws.max_column or 1
113
+ chunk = []
114
+
59
115
  for row in ws.iter_rows(values_only=False):
60
- cells = []
61
- for cell in row:
62
- val = cell.value
63
- cells.append(str(val).strip() if val is not None else "")
64
116
  row_num = row[0].row
65
- text_parts.append(f"[{row[0].column_letter}{row_num}] " + " | ".join(cells))
66
- for idx in row_img_markers.get(row_num, []):
67
- text_parts.append(f"[IMG:{idx}]")
117
+ cells = [_escape_md(c.value) for c in row]
118
+ chunk.append((row_num, cells))
119
+
120
+ if row_num in row_img_markers:
121
+ text_parts.extend(_render_chunk(chunk, max_col, get_column_letter))
122
+ text_parts.append("")
123
+ for idx in row_img_markers[row_num]:
124
+ text_parts.append(f"[IMG:{idx}]")
125
+ text_parts.append("")
126
+ chunk = []
127
+
128
+ if chunk:
129
+ text_parts.extend(_render_chunk(chunk, max_col, get_column_letter))
130
+ text_parts.append("")
68
131
 
69
- # Embedded objects from XLSX ZIP
70
132
  try:
71
- with zipfile.ZipFile(file_path, 'r') as zf:
133
+ with zipfile.ZipFile(file_path, "r") as zf:
72
134
  for name in zf.namelist():
73
- if 'embeddings/' in name.lower():
74
- filename = name.split('/')[-1]
135
+ if "embeddings/" in name.lower():
136
+ filename = name.split("/")[-1]
75
137
  data = zf.read(name)
76
138
  emb_idx += 1
77
139
  embedded.append({"filename": filename, "data": data})
@@ -44,6 +44,13 @@ def extract_recursive(file_path: Path, out_dir: Path):
44
44
  "context": img.get("context", ""),
45
45
  })
46
46
 
47
+ # Save pre-named slide images (PPTX screenshots)
48
+ saved_slides = []
49
+ for s in result.get("slide_images", []):
50
+ slide_path = out_dir / s["filename"]
51
+ slide_path.write_bytes(s["data"])
52
+ saved_slides.append({"filename": s["filename"], "size": len(s["data"])})
53
+
47
54
  # Save embedded/attached files and recurse
48
55
  prefix = "attachment" if result.get("metadata", {}).get("email_headers") else "embedded"
49
56
  saved_embedded = []
@@ -68,12 +75,13 @@ def extract_recursive(file_path: Path, out_dir: Path):
68
75
  saved_embedded.append(entry)
69
76
 
70
77
  # Generate {stem}.md index in parent of out_dir
71
- _generate_index_md(out_dir, file_path, result, saved_images, saved_embedded)
78
+ _generate_index_md(out_dir, file_path, result, saved_images, saved_embedded, saved_slides)
72
79
 
73
80
 
74
81
 
75
82
  def _generate_index_md(out_dir: Path, file_path: Path, result: dict,
76
- saved_images: list, saved_embedded: list):
83
+ saved_images: list, saved_embedded: list,
84
+ saved_slides: list | None = None):
77
85
  """Generate {stem}.md in parent of out_dir, summarizing extraction results.
78
86
 
79
87
  Images and embedded files are placed inline via [IMG:N]/[EMB:N] placeholders
@@ -120,7 +128,8 @@ def _generate_index_md(out_dir: Path, file_path: Path, result: dict,
120
128
  referenced_imgs.add(idx)
121
129
  if 1 <= idx <= len(saved_images):
122
130
  img = saved_images[idx - 1]
123
- return f"![{img['filename']}]({rel_prefix}/{img['filename']})"
131
+ alt = img.get('context', '') or img['filename']
132
+ return f"![{alt}]({rel_prefix}/{img['filename']})"
124
133
  return m.group(0)
125
134
 
126
135
  def replace_emb(m):
@@ -136,8 +145,18 @@ def _generate_index_md(out_dir: Path, file_path: Path, result: dict,
136
145
  return f"> embedded: [{name}]({rel_prefix}/{name})"
137
146
  return m.group(0)
138
147
 
148
+ slides_list = saved_slides or []
149
+
150
+ def replace_slide(m):
151
+ idx = int(m.group(1))
152
+ if 1 <= idx <= len(slides_list):
153
+ fname = slides_list[idx - 1]["filename"]
154
+ return f"![{fname}]({rel_prefix}/{fname})"
155
+ return m.group(0)
156
+
139
157
  text = re.sub(r'\[IMG:(\d+)\]', replace_img, text)
140
158
  text = re.sub(r'\[EMB:(\d+)\]', replace_emb, text)
159
+ text = re.sub(r'\[SLIDE:(\d+)\]', replace_slide, text)
141
160
 
142
161
  if len(text) > 10000:
143
162
  body_path = out_dir / "body.txt"