@simplysm/sd-claude 14.0.47 → 14.0.49

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (130)
  1. package/{claude/references/sd-simplysm14/sd-claude/usage.md → README.md} +2 -2
  2. package/claude/rules/sd-claude-rules.md +25 -10
  3. package/claude/rules/sd-options.md +11 -6
  4. package/claude/sd-subagent-start.sh +6 -0
  5. package/claude/settings.json +1 -12
  6. package/claude/skills/sd-check/SKILL.md +43 -12
  7. package/claude/skills/sd-claude-docs/SKILL.md +30 -58
  8. package/claude/skills/sd-claude-docs/references/package-claudemd.md +12 -0
  9. package/claude/skills/sd-claude-docs/references/package-doc-gen.md +26 -13
  10. package/claude/skills/sd-commit/SKILL.md +1 -1
  11. package/claude/skills/sd-debug/SKILL.md +5 -3
  12. package/claude/skills/sd-deliverable/SKILL.md +1 -1
  13. package/claude/skills/sd-dev/SKILL.md +14 -9
  14. package/claude/skills/sd-doc-extract/SKILL.md +8 -10
  15. package/claude/skills/sd-doc-extract/_common.py +8 -1
  16. package/claude/skills/sd-doc-extract/_extract_docx.py +74 -34
  17. package/claude/skills/sd-doc-extract/_extract_pdf.py +12 -1
  18. package/claude/skills/sd-doc-extract/_extract_pptx.py +103 -23
  19. package/claude/skills/sd-doc-extract/_extract_xlsb.py +93 -4
  20. package/claude/skills/sd-doc-extract/_extract_xlsx.py +98 -36
  21. package/claude/skills/sd-doc-extract/extract.py +22 -3
  22. package/claude/skills/sd-inner-clarify/SKILL.md +78 -0
  23. package/claude/skills/sd-inner-debug/SKILL.md +1 -1
  24. package/claude/skills/sd-inner-review/SKILL.md +13 -0
  25. package/claude/skills/sd-issue/SKILL.md +1 -1
  26. package/claude/skills/sd-outlook/SKILL.md +1 -1
  27. package/claude/skills/sd-plan/SKILL.md +50 -17
  28. package/claude/skills/sd-prompt/SKILL.md +180 -178
  29. package/claude/skills/sd-prompt/references/eval-runner.md +5 -30
  30. package/claude/skills/sd-prompt/references/sd-eval-env-template.md +23 -0
  31. package/claude/skills/sd-refactor/SKILL.md +2 -2
  32. package/claude/skills/sd-tdd/SKILL.md +45 -16
  33. package/claude/skills/sd-use/SKILL.md +84 -80
  34. package/claude/skills/sd-wbs/SKILL.md +84 -27
  35. package/{claude/references/sd-simplysm14/sd-claude/docs → docs}/assets.md +2 -3
  36. package/{claude/references/sd-simplysm14/sd-claude/docs → docs}/hooks.md +7 -6
  37. package/{claude/references/sd-simplysm14/sd-claude/docs → docs}/scripts.md +1 -9
  38. package/package.json +3 -2
  39. package/scripts/sync.mjs +4 -2
  40. package/claude/references/sd-simplysm14/angular/docs/bootstrap.md +0 -48
  41. package/claude/references/sd-simplysm14/angular/docs/directives.md +0 -236
  42. package/claude/references/sd-simplysm14/angular/docs/features.md +0 -379
  43. package/claude/references/sd-simplysm14/angular/docs/pipes.md +0 -32
  44. package/claude/references/sd-simplysm14/angular/docs/plugins.md +0 -37
  45. package/claude/references/sd-simplysm14/angular/docs/provider-types.md +0 -283
  46. package/claude/references/sd-simplysm14/angular/docs/providers.md +0 -379
  47. package/claude/references/sd-simplysm14/angular/docs/styling.md +0 -222
  48. package/claude/references/sd-simplysm14/angular/docs/type-utilities.md +0 -250
  49. package/claude/references/sd-simplysm14/angular/docs/ui-data.md +0 -275
  50. package/claude/references/sd-simplysm14/angular/docs/ui-form.md +0 -490
  51. package/claude/references/sd-simplysm14/angular/docs/ui-layout.md +0 -140
  52. package/claude/references/sd-simplysm14/angular/docs/ui-navigation.md +0 -273
  53. package/claude/references/sd-simplysm14/angular/docs/ui-overlay.md +0 -157
  54. package/claude/references/sd-simplysm14/angular/docs/ui-visual.md +0 -127
  55. package/claude/references/sd-simplysm14/angular/docs/utils.md +0 -295
  56. package/claude/references/sd-simplysm14/angular/usage.md +0 -489
  57. package/claude/references/sd-simplysm14/capacitor-plugin-auto-update/usage.md +0 -182
  58. package/claude/references/sd-simplysm14/capacitor-plugin-file-system/docs/file-operations.md +0 -154
  59. package/claude/references/sd-simplysm14/capacitor-plugin-file-system/docs/permissions.md +0 -84
  60. package/claude/references/sd-simplysm14/capacitor-plugin-file-system/docs/storage-paths.md +0 -107
  61. package/claude/references/sd-simplysm14/capacitor-plugin-file-system/docs/types.md +0 -83
  62. package/claude/references/sd-simplysm14/capacitor-plugin-file-system/usage.md +0 -133
  63. package/claude/references/sd-simplysm14/capacitor-plugin-intent/usage.md +0 -203
  64. package/claude/references/sd-simplysm14/capacitor-plugin-usb-storage/usage.md +0 -258
  65. package/claude/references/sd-simplysm14/core-browser/usage.md +0 -306
  66. package/claude/references/sd-simplysm14/core-common/docs/errors.md +0 -82
  67. package/claude/references/sd-simplysm14/core-common/docs/extensions.md +0 -167
  68. package/claude/references/sd-simplysm14/core-common/docs/features.md +0 -136
  69. package/claude/references/sd-simplysm14/core-common/docs/types.md +0 -245
  70. package/claude/references/sd-simplysm14/core-common/docs/utils.md +0 -591
  71. package/claude/references/sd-simplysm14/core-common/usage.md +0 -255
  72. package/claude/references/sd-simplysm14/core-node/docs/child-process.md +0 -182
  73. package/claude/references/sd-simplysm14/core-node/docs/features.md +0 -214
  74. package/claude/references/sd-simplysm14/core-node/docs/file-system.md +0 -509
  75. package/claude/references/sd-simplysm14/core-node/docs/file-watching.md +0 -139
  76. package/claude/references/sd-simplysm14/core-node/docs/logging.md +0 -180
  77. package/claude/references/sd-simplysm14/core-node/docs/path.md +0 -176
  78. package/claude/references/sd-simplysm14/core-node/docs/utilities-cpx.md +0 -194
  79. package/claude/references/sd-simplysm14/core-node/docs/utilities-fsx.md +0 -469
  80. package/claude/references/sd-simplysm14/core-node/docs/utilities-pathx.md +0 -151
  81. package/claude/references/sd-simplysm14/core-node/docs/worker-threads.md +0 -334
  82. package/claude/references/sd-simplysm14/core-node/docs/worker.md +0 -205
  83. package/claude/references/sd-simplysm14/core-node/usage.md +0 -259
  84. package/claude/references/sd-simplysm14/excel/docs/core-classes.md +0 -453
  85. package/claude/references/sd-simplysm14/excel/docs/types.md +0 -459
  86. package/claude/references/sd-simplysm14/excel/docs/utilities.md +0 -194
  87. package/claude/references/sd-simplysm14/excel/docs/wrapper.md +0 -73
  88. package/claude/references/sd-simplysm14/excel/usage.md +0 -134
  89. package/claude/references/sd-simplysm14/lint/usage.md +0 -130
  90. package/claude/references/sd-simplysm14/orm-common/docs/core.md +0 -188
  91. package/claude/references/sd-simplysm14/orm-common/docs/expression.md +0 -190
  92. package/claude/references/sd-simplysm14/orm-common/docs/models.md +0 -17
  93. package/claude/references/sd-simplysm14/orm-common/docs/query-builder.md +0 -97
  94. package/claude/references/sd-simplysm14/orm-common/docs/queryable-executable.md +0 -250
  95. package/claude/references/sd-simplysm14/orm-common/docs/schema-builders.md +0 -364
  96. package/claude/references/sd-simplysm14/orm-common/docs/types.md +0 -522
  97. package/claude/references/sd-simplysm14/orm-common/usage.md +0 -229
  98. package/claude/references/sd-simplysm14/orm-node/docs/connections.md +0 -137
  99. package/claude/references/sd-simplysm14/orm-node/docs/core.md +0 -131
  100. package/claude/references/sd-simplysm14/orm-node/docs/types.md +0 -173
  101. package/claude/references/sd-simplysm14/orm-node/usage.md +0 -143
  102. package/claude/references/sd-simplysm14/sd-cli/usage.md +0 -782
  103. package/claude/references/sd-simplysm14/service-client/docs/features.md +0 -217
  104. package/claude/references/sd-simplysm14/service-client/docs/main.md +0 -148
  105. package/claude/references/sd-simplysm14/service-client/docs/protocol.md +0 -53
  106. package/claude/references/sd-simplysm14/service-client/docs/transport.md +0 -131
  107. package/claude/references/sd-simplysm14/service-client/docs/types.md +0 -129
  108. package/claude/references/sd-simplysm14/service-client/usage.md +0 -202
  109. package/claude/references/sd-simplysm14/service-common/docs/app-structure.md +0 -175
  110. package/claude/references/sd-simplysm14/service-common/docs/events.md +0 -64
  111. package/claude/references/sd-simplysm14/service-common/docs/protocol.md +0 -331
  112. package/claude/references/sd-simplysm14/service-common/docs/service-types.md +0 -90
  113. package/claude/references/sd-simplysm14/service-common/docs/types.md +0 -19
  114. package/claude/references/sd-simplysm14/service-common/usage.md +0 -154
  115. package/claude/references/sd-simplysm14/service-server/docs/auth.md +0 -64
  116. package/claude/references/sd-simplysm14/service-server/docs/core.md +0 -174
  117. package/claude/references/sd-simplysm14/service-server/docs/legacy.md +0 -25
  118. package/claude/references/sd-simplysm14/service-server/docs/main.md +0 -88
  119. package/claude/references/sd-simplysm14/service-server/docs/protocol.md +0 -33
  120. package/claude/references/sd-simplysm14/service-server/docs/services.md +0 -94
  121. package/claude/references/sd-simplysm14/service-server/docs/transport-http.md +0 -93
  122. package/claude/references/sd-simplysm14/service-server/docs/transport-socket.md +0 -119
  123. package/claude/references/sd-simplysm14/service-server/docs/types.md +0 -36
  124. package/claude/references/sd-simplysm14/service-server/docs/utils.md +0 -22
  125. package/claude/references/sd-simplysm14/service-server/usage.md +0 -171
  126. package/claude/references/sd-simplysm14/storage/usage.md +0 -301
  127. package/claude/references/sd-simplysm14.md +0 -35
  128. package/claude/rules/sd-clarify.md +0 -23
  129. package/claude/sd-session-start.sh +0 -10
  130. package/{claude/references/sd-simplysm14/sd-claude/docs → docs}/cli.md +0 -0
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  name: sd-doc-extract
3
3
  description: 문서 파일(docx, xlsx, xlsb, pptx, pdf, eml, msg)에서 텍스트, 이미지, 임베디드 파일을 추출하는 스킬. "문서 추출", "문서 분해", "docx 분석", "PDF 내용 뽑아줘", "eml 파일 추출" 등을 요청할 때 사용한다.
4
- model: claude-haiku-4-5
4
+ model: haiku
5
5
  ---
6
6
 
7
7
  # sd-doc-extract: 문서 분해/추출
@@ -15,7 +15,7 @@ model: claude-haiku-4-5
15
15
  | `.docx` | Word |
16
16
  | `.xlsx` | Excel |
17
17
  | `.xlsb` | Excel (Binary) |
18
- | `.pptx` | PowerPoint |
18
+ | `.pptx` | PowerPoint (Windows + PowerPoint 설치 필요) |
19
19
  | `.pdf` | PDF |
20
20
  | `.eml` | Email |
21
21
  | `.msg` | Email (Outlook) |
@@ -57,30 +57,28 @@ python .claude/skills/sd-doc-extract/extract.py "<file_path>"
57
57
 
58
58
  | 포맷 | 이미지 배치 | Embedded 배치 |
59
59
  |------|-----------|--------------|
60
- | PPTX | shape 순회 PICTURE를 만나면 그 자리에 `[IMG:N]` 삽입 | OLE 객체를 만난 슬라이드 내에 `[EMB:N]` 삽입 |
60
+ | PPTX | 슬라이드당 PNG 렌더링(PowerPoint COM) + `[SLIDE:N]` 삽입, 개별 이미지 분해 없음 | OLE 객체를 만난 슬라이드 내에 `[EMB:N]` 삽입 |
61
61
  | DOCX | run 순회 중 drawing/blip을 만나면 그 문단에 `[IMG:N]` 삽입 | OLE 객체를 만난 위치에 `[EMB:N]` 삽입 |
62
62
  | PDF | 페이지별 이미지를 해당 페이지 텍스트 내에 `[IMG:N]` 삽입 | 첨부파일은 문서 끝에 `[EMB:N]` 배치 (PDF 첨부는 페이지 귀속이 아님) |
63
- | XLSX | 이미지의 anchor 좌표 근처에 `[IMG:N]` 삽입 | 시트의 embeddings 디렉토리에서 추출한 객체를 해당 시트 끝에 `[EMB:N]` 배치 |
64
- | XLSB | (이미지 없음) | (embedded 없음) |
63
+ | XLSX | 시트 데이터는 마크다운 테이블(열 헤더=Excel 열 문자 A/B/C…, 첫 열=원본 행 번호)로 렌더링. 이미지 앵커 행에서 테이블을 분리하고 `[IMG:N]` 삽입 후 새 테이블 재개 | 시트의 embeddings 디렉토리에서 추출한 객체를 문서 끝에 `[EMB:N]` 배치 |
64
+ | XLSB | 시트 데이터는 XLSX와 동일한 마크다운 테이블 포맷 (이미지 없음). VBA 매크로가 있으면 모듈별 소스코드를 fenced code block으로 추출하고, 의심 패턴(AutoExec/Suspicious/IOC) 분석 테이블을 첨부 | (embedded 없음) |
65
65
  | EMAIL | HTML 본문의 `cid:` 참조 위치에 `[IMG:N]` 삽입, data URI 이미지도 등장 위치에 삽입 | 첨부파일은 본문 뒤에 `[EMB:N]` 배치 |
66
66
 
67
67
  ### 치환 결과 예시
68
68
 
69
69
  ```markdown
70
70
  [Slide 1]
71
+ ![slide_001.png](scheduling-1/slide_001.png)
71
72
  [TXT] (left=0.4", top=0.4") 1. 일정 및 정보 변경
72
73
  [TXT] (left=0.6", top=0.8") - Case1~5번 공통 적용 사항
73
-
74
- ![img_001](scheduling-1/img_001.png)
75
-
76
74
  [TXT] (left=0.5", top=1.4") 1) 구성
77
75
  [TXT] (left=0.8", top=2.8") 프로세스: BOA 선택 ...
78
76
 
79
- ![img_002](scheduling-1/img_002.png)
80
-
81
77
  > embedded: [embedded_001_worksheet.xlsb](scheduling-1/embedded_001_worksheet.md)
82
78
  ```
83
79
 
80
+ PPTX는 슬라이드별 PNG 렌더링(PowerPoint COM)으로 오버레이 도형·주석 박스의 공간 관계를 보존한다. 개별 이미지 추출은 하지 않는다(스크린샷에 포함되므로 중복). 텍스트 shape는 `[TXT]`로 병행 수록하여 원문 인용 정확도를 확보한다.
81
+
84
82
  ## 주의사항
85
83
 
86
84
  - 바이너리 문서를 Read 도구로 직접 열면 의미 있는 내용을 얻을 수 없다. 반드시 `extract.py`를 통해 추출한다.
@@ -65,7 +65,14 @@ def ext_from_content_type(content_type: str) -> str:
65
65
  def normalize_cell(text) -> str:
66
66
  if text is None:
67
67
  return ""
68
- return str(text).strip().replace("\n", " ")
68
+ return (
69
+ str(text).strip()
70
+ .replace("\\", "\\\\")
71
+ .replace("|", "\\|")
72
+ .replace("\r\n", "<br>")
73
+ .replace("\n", "<br>")
74
+ .replace("\r", "<br>")
75
+ )
69
76
 
70
77
 
71
78
  def parse_heading_level(style_name: str) -> int | None:
@@ -9,6 +9,8 @@ def extract(file_path):
9
9
  ensure_packages(PACKAGES)
10
10
  from docx import Document
11
11
  from docx.oxml.ns import qn
12
+ from docx.table import Table as DocxTable
13
+ from docx.text.paragraph import Paragraph
12
14
 
13
15
  doc = Document(file_path)
14
16
  text_parts = []
@@ -17,47 +19,85 @@ def extract(file_path):
17
19
  img_idx = 0
18
20
  emb_idx = 0
19
21
 
20
- for para in doc.paragraphs:
21
- para_img_markers = []
22
+ def _extract_drawing(drawing):
23
+ nonlocal img_idx
24
+ blip = drawing.find(f".//{qn('a:blip')}")
25
+ if blip is None:
26
+ return None
27
+ embed_id = blip.get(qn("r:embed"))
28
+ if not embed_id:
29
+ return None
30
+ rel = doc.part.rels.get(embed_id)
31
+ if not rel or not hasattr(rel, 'target_part'):
32
+ return None
33
+ ext = ext_from_content_type(rel.target_part.content_type)
34
+ img_idx += 1
35
+ doc_pr = drawing.find(f".//{qn('wp:docPr')}")
36
+ alt = ""
37
+ if doc_pr is not None:
38
+ alt = doc_pr.get("descr", "") or doc_pr.get("title", "")
39
+ images.append({
40
+ "data": rel.target_part.blob,
41
+ "ext": ext,
42
+ "context": alt or "paragraph image",
43
+ })
44
+ return img_idx
45
+
46
+ def _process_paragraph(element):
47
+ para = Paragraph(element, doc)
48
+ style = para.style.name if para.style else ""
49
+ prefix = ""
50
+ if "Heading" in style:
51
+ level = parse_heading_level(style)
52
+ prefix = "#" * (level or 2) + " "
53
+
54
+ parts = []
22
55
  for run in para.runs:
56
+ if run.text:
57
+ parts.append(run.text)
23
58
  drawings = (run._element.findall(f".//{qn('wp:inline')}") +
24
59
  run._element.findall(f".//{qn('wp:anchor')}"))
25
- for drawing in drawings:
26
- blip = drawing.find(f".//{qn('a:blip')}")
27
- if blip is not None:
28
- embed_id = blip.get(qn("r:embed"))
29
- if embed_id:
30
- rel = doc.part.rels.get(embed_id)
31
- if rel and hasattr(rel, 'target_part'):
32
- ext = ext_from_content_type(rel.target_part.content_type)
33
- img_idx += 1
34
- images.append({
35
- "data": rel.target_part.blob,
36
- "ext": ext,
37
- "context": "paragraph image",
38
- })
39
- para_img_markers.append(f"[IMG:{img_idx}]")
40
-
41
- text = para.text.strip()
42
- if text:
43
- style = para.style.name if para.style else ""
44
- prefix = ""
45
- if "Heading" in style:
46
- level = parse_heading_level(style)
47
- if level is not None:
48
- prefix = "#" * level + " "
49
- else:
50
- prefix = "## "
51
- text_parts.append(f"{prefix}{text}")
60
+ for d in drawings:
61
+ idx = _extract_drawing(d)
62
+ if idx is not None:
63
+ parts.append(f"[IMG:{idx}]")
52
64
 
53
- for marker in para_img_markers:
54
- text_parts.append(marker)
65
+ line = "".join(parts).strip()
66
+ if line:
67
+ text_parts.append(f"{prefix}{line}")
55
68
 
56
- for t_idx, table in enumerate(doc.tables):
57
- text_parts.append(f"\n### Table {t_idx + 1}\n")
58
- for row in table.rows:
69
+ def _process_table(element):
70
+ table = DocxTable(element, doc)
71
+ rows = list(table.rows)
72
+ if not rows:
73
+ return
74
+ text_parts.append("")
75
+ for r_idx, row in enumerate(rows):
59
76
  cells = [normalize_cell(cell.text) for cell in row.cells]
60
77
  text_parts.append("| " + " | ".join(cells) + " |")
78
+ if r_idx == 0:
79
+ text_parts.append("|" + "|".join(["---"] * len(cells)) + "|")
80
+ text_parts.append("")
81
+
82
+ # Iterate body elements in document order (paragraphs and tables interleaved)
83
+ for child in doc.element.body:
84
+ tag = child.tag.split('}')[-1] if '}' in child.tag else child.tag
85
+ if tag == 'p':
86
+ _process_paragraph(child)
87
+ elif tag == 'tbl':
88
+ _process_table(child)
89
+
90
+ # Headers and footers
91
+ for sec_idx, section in enumerate(doc.sections):
92
+ h_parts = [p.text.strip() for p in section.header.paragraphs if p.text.strip()]
93
+ f_parts = [p.text.strip() for p in section.footer.paragraphs if p.text.strip()]
94
+ if h_parts or f_parts:
95
+ text_parts.append("")
96
+ text_parts.append(f"[Header/Footer — Section {sec_idx + 1}]")
97
+ if h_parts:
98
+ text_parts.append(f"Header: {' | '.join(h_parts)}")
99
+ if f_parts:
100
+ text_parts.append(f"Footer: {' | '.join(f_parts)}")
61
101
 
62
102
  # OLE embedded objects
63
103
  seen = set()
@@ -37,11 +37,22 @@ def extract(file_path):
37
37
  if w <= 4 or h <= 4:
38
38
  continue
39
39
 
40
+ # Get image position on page
41
+ try:
42
+ rects = page.get_image_rects(xref)
43
+ if rects:
44
+ r = rects[0]
45
+ bbox_str = f" bbox:({r.x0:.0f},{r.y0:.0f},{r.x1:.0f},{r.y1:.0f})"
46
+ else:
47
+ bbox_str = ""
48
+ except Exception:
49
+ bbox_str = ""
50
+
40
51
  img_idx += 1
41
52
  images.append({
42
53
  "data": data,
43
54
  "ext": ext,
44
- "context": f"Page {page_num}",
55
+ "context": f"Page {page_num}{bbox_str}",
45
56
  })
46
57
  page_img_indices[page_num].append(img_idx)
47
58
 
@@ -1,8 +1,17 @@
1
- """PPTX handler: extract text, images, and OLE embedded objects."""
1
+ """PPTX handler: render slides to PNG via PowerPoint COM, extract text and OLE embedded.
2
2
 
3
- from _common import ensure_packages, ext_from_content_type
3
+ Individual image/shape extraction is intentionally omitted — slide screenshots
4
+ contain all visuals including overlay shapes (boxes, arrows, annotations) that
5
+ lose their spatial relationship when decomposed. Requires Windows + Microsoft
6
+ PowerPoint installed.
7
+ """
4
8
 
5
- PACKAGES = {"python-pptx": "pptx"}
9
+ import tempfile
10
+ from pathlib import Path
11
+
12
+ from _common import ensure_packages
13
+
14
+ PACKAGES = {"pywin32": "win32com.client", "python-pptx": "pptx"}
6
15
 
7
16
 
8
17
  def _emu_to_inches(emu):
@@ -15,37 +24,107 @@ def _pos(shape):
15
24
  return f"(left={_emu_to_inches(shape.left)}\", top={_emu_to_inches(shape.top)}\")"
16
25
 
17
26
 
27
+ def _extract_shapes(shapes, text_parts):
28
+ for shape in shapes:
29
+ if shape.shape_type == 6: # MSO_SHAPE_TYPE.GROUP
30
+ _extract_shapes(shape.shapes, text_parts)
31
+ elif shape.has_table:
32
+ tbl = shape.table
33
+ text_parts.append(f"[TABLE] {_pos(shape)}")
34
+ for r_idx, row in enumerate(tbl.rows):
35
+ cells = [
36
+ cell.text.strip().replace("\\", "\\\\").replace("|", "\\|")
37
+ .replace("\r\n", "<br>").replace("\n", "<br>").replace("\r", "<br>")
38
+ for cell in row.cells
39
+ ]
40
+ text_parts.append("| " + " | ".join(cells) + " |")
41
+ if r_idx == 0:
42
+ text_parts.append("|" + "|".join(["---"] * len(cells)) + "|")
43
+ elif hasattr(shape, "text") and shape.text.strip():
44
+ text = shape.text.strip().replace("\n", "\n ")
45
+ text_parts.append(f"[TXT] {_pos(shape)} {text}")
46
+
47
+
48
+ def _render_slides_via_com(file_path: str, tmp_dir: Path, slide_count: int,
49
+ width: int, height: int) -> list[bytes]:
50
+ import win32com.client
51
+ import pythoncom
52
+
53
+ pythoncom.CoInitialize()
54
+ try:
55
+ app = win32com.client.DispatchEx("PowerPoint.Application")
56
+ try:
57
+ try:
58
+ app.DisplayAlerts = 0
59
+ except Exception:
60
+ pass
61
+ abs_path = str(Path(file_path).resolve())
62
+ prs = app.Presentations.Open(abs_path, ReadOnly=True, Untitled=False,
63
+ WithWindow=False)
64
+ try:
65
+ results = []
66
+ for i in range(1, slide_count + 1):
67
+ tmp_path = tmp_dir / f"__tmp_slide_{i}.png"
68
+ prs.Slides(i).Export(str(tmp_path), "PNG", width, height)
69
+ results.append(tmp_path.read_bytes())
70
+ tmp_path.unlink()
71
+ return results
72
+ finally:
73
+ prs.Close()
74
+ finally:
75
+ app.Quit()
76
+ finally:
77
+ pythoncom.CoUninitialize()
78
+
79
+
18
80
  def extract(file_path):
19
81
  ensure_packages(PACKAGES)
20
82
  from pptx import Presentation
21
- from pptx.enum.shapes import MSO_SHAPE_TYPE
22
83
 
23
84
  prs = Presentation(file_path)
85
+ slide_count = len(prs.slides)
86
+
87
+ target_width = 1920
88
+ if prs.slide_width and prs.slide_height:
89
+ target_height = int(target_width * prs.slide_height / prs.slide_width)
90
+ else:
91
+ target_height = 1080
92
+
93
+ try:
94
+ with tempfile.TemporaryDirectory() as tmpdir:
95
+ slide_pngs = _render_slides_via_com(
96
+ file_path, Path(tmpdir), slide_count, target_width, target_height
97
+ )
98
+ except Exception as e:
99
+ raise RuntimeError(
100
+ f"PowerPoint COM rendering failed: {e}. "
101
+ "This extractor requires Windows with Microsoft PowerPoint installed."
102
+ ) from e
103
+
24
104
  text_parts = []
25
- images = []
105
+ slide_images = []
26
106
  embedded = []
27
- img_idx = 0
28
107
  emb_idx = 0
29
108
 
30
109
  for slide_num, slide in enumerate(prs.slides, 1):
31
110
  text_parts.append(f"[Slide {slide_num}]")
32
111
 
33
- for shape in slide.shapes:
34
- if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
35
- ext = ext_from_content_type(shape.image.content_type)
36
- img_idx += 1
37
- images.append({
38
- "data": shape.image.blob,
39
- "ext": ext,
40
- "context": f"Slide {slide_num} {_pos(shape)}",
41
- })
42
- text_parts.append(f"[IMG:{img_idx}]")
43
-
44
- if hasattr(shape, "text") and shape.text.strip():
45
- text = shape.text.strip().replace("\n", "\n ")
46
- text_parts.append(f"[TXT] {_pos(shape)} {text}")
47
-
48
- # OLE embedded objects from slide relationships
112
+ slide_images.append({
113
+ "filename": f"slide_{slide_num:03d}.png",
114
+ "data": slide_pngs[slide_num - 1],
115
+ })
116
+ text_parts.append(f"[SLIDE:{slide_num}]")
117
+
118
+ _extract_shapes(slide.shapes, text_parts)
119
+
120
+ # Speaker notes
121
+ if slide.has_notes_slide:
122
+ notes_frame = slide.notes_slide.notes_text_frame
123
+ notes_text = notes_frame.text.strip() if notes_frame else ""
124
+ if notes_text:
125
+ notes_text = notes_text.replace("\n", "\n ")
126
+ text_parts.append(f"[Notes] {notes_text}")
127
+
49
128
  seen = set()
50
129
  for rel in slide.part.rels.values():
51
130
  reltype = rel.reltype or ""
@@ -69,7 +148,8 @@ def extract(file_path):
69
148
 
70
149
  return {
71
150
  "text": "\n".join(text_parts),
72
- "images": images,
151
+ "images": [],
73
152
  "embedded": embedded,
74
153
  "metadata": {},
154
+ "slide_images": slide_images,
75
155
  }
@@ -1,8 +1,36 @@
1
- """XLSB handler: extract cell data from binary Excel format."""
1
+ """XLSB handler: extract cell data and VBA macros from binary Excel format.
2
+
3
+ Output format matches the XLSX handler: per sheet, a markdown table with
4
+ Excel column letters as headers and the original row number in the first
5
+ column. VBA macros are extracted via oletools and appended as fenced code
6
+ blocks.
7
+ """
2
8
 
3
9
  from _common import ensure_packages
4
10
 
5
- PACKAGES = {"pyxlsb": "pyxlsb"}
11
+ PACKAGES = {"pyxlsb": "pyxlsb", "oletools": "oletools"}
12
+
13
+
14
+ def _escape_md(v):
15
+ if v is None:
16
+ return ""
17
+ s = str(v).strip()
18
+ return (
19
+ s.replace("\\", "\\\\")
20
+ .replace("|", "\\|")
21
+ .replace("\r\n", "<br>")
22
+ .replace("\n", "<br>")
23
+ .replace("\r", "<br>")
24
+ )
25
+
26
+
27
+ def _col_letter(n):
28
+ # 1-based column index → Excel letter (A, B, ..., Z, AA, AB, ...)
29
+ s = ""
30
+ while n > 0:
31
+ n, r = divmod(n - 1, 26)
32
+ s = chr(65 + r) + s
33
+ return s
6
34
 
7
35
 
8
36
  def extract(file_path):
@@ -14,10 +42,71 @@ def extract(file_path):
14
42
  with open_workbook(file_path) as wb:
15
43
  for sheet_name in wb.sheets:
16
44
  text_parts.append(f"[Sheet: {sheet_name}]")
45
+ text_parts.append("")
46
+
17
47
  with wb.get_sheet(sheet_name) as sheet:
48
+ rows_data = []
49
+ max_col = 0
18
50
  for row in sheet.rows():
19
- cells = [str(cell.v) if cell.v is not None else "" for cell in row]
20
- text_parts.append(" | ".join(cells))
51
+ if not row:
52
+ continue
53
+ row_num = row[0].r + 1 # pyxlsb is 0-based
54
+ cells = [_escape_md(cell.v) for cell in row]
55
+ if len(cells) > max_col:
56
+ max_col = len(cells)
57
+ rows_data.append((row_num, cells))
58
+
59
+ if not rows_data:
60
+ text_parts.append("(empty sheet)")
61
+ text_parts.append("")
62
+ continue
63
+
64
+ headers = ["Row"] + [_col_letter(c) for c in range(1, max_col + 1)]
65
+ text_parts.append("| " + " | ".join(headers) + " |")
66
+ text_parts.append("|" + "|".join(["---"] * len(headers)) + "|")
67
+ for row_num, cells in rows_data:
68
+ padded = list(cells) + [""] * (max_col - len(cells))
69
+ text_parts.append(
70
+ f"| {row_num} | " + " | ".join(padded[:max_col]) + " |"
71
+ )
72
+ text_parts.append("")
73
+
74
+ # --- VBA macro extraction ---
75
+ vba_parts = []
76
+ try:
77
+ from oletools.olevba import VBA_Parser
78
+
79
+ vba_parser = VBA_Parser(file_path)
80
+ if vba_parser.detect_vba_macros():
81
+ vba_parts.append("[VBA Macros]")
82
+ vba_parts.append("")
83
+ for filename, stream_path, vba_filename, vba_code in vba_parser.extract_macros():
84
+ vba_parts.append(f"### {vba_filename}")
85
+ vba_parts.append(f"<!-- stream: {stream_path} -->")
86
+ vba_parts.append("")
87
+ vba_parts.append("```vb")
88
+ vba_parts.append(vba_code)
89
+ vba_parts.append("```")
90
+ vba_parts.append("")
91
+
92
+ analysis = vba_parser.analyze_macros()
93
+ suspicious = [e for e in analysis if e[0] in ("AutoExec", "Suspicious", "IOC")]
94
+ if suspicious:
95
+ vba_parts.append("### Analysis")
96
+ vba_parts.append("")
97
+ vba_parts.append("| Type | Keyword | Description |")
98
+ vba_parts.append("|------|---------|-------------|")
99
+ for entry_type, keyword, description in suspicious:
100
+ vba_parts.append(f"| {entry_type} | `{keyword}` | {description} |")
101
+ vba_parts.append("")
102
+
103
+ vba_parser.close()
104
+ except Exception:
105
+ pass
106
+
107
+ if vba_parts:
108
+ text_parts.append("")
109
+ text_parts.extend(vba_parts)
21
110
 
22
111
  return {
23
112
  "text": "\n".join(text_parts),
@@ -1,4 +1,14 @@
1
- """XLSX handler: extract cell data, images, and embedded objects."""
1
+ """XLSX handler: extract cell data, images, and embedded objects.
2
+
3
+ Output format: per sheet, cell data is rendered as a markdown table whose
4
+ column headers are Excel column letters (A, B, C, ...) and whose first
5
+ column is the original Excel row number. When an image is anchored to a
6
+ row, the current table chunk is flushed, the [IMG:N] placeholder is
7
+ emitted, and a new table (re-rendering the header) resumes from the next
8
+ row. This preserves the spatial relationship between cell data and
9
+ images while keeping each chunk a valid markdown table that LLMs parse
10
+ natively.
11
+ """
2
12
 
3
13
  import zipfile
4
14
  from _common import ensure_packages
@@ -6,9 +16,35 @@ from _common import ensure_packages
6
16
  PACKAGES = {"openpyxl": "openpyxl"}
7
17
 
8
18
 
19
+ def _escape_md(v):
20
+ if v is None:
21
+ return ""
22
+ s = str(v).strip()
23
+ return (
24
+ s.replace("\\", "\\\\")
25
+ .replace("|", "\\|")
26
+ .replace("\r\n", "<br>")
27
+ .replace("\n", "<br>")
28
+ .replace("\r", "<br>")
29
+ )
30
+
31
+
32
+ def _render_chunk(chunk_rows, max_col, get_col_letter):
33
+ if not chunk_rows:
34
+ return []
35
+ headers = ["Row"] + [get_col_letter(c) for c in range(1, max_col + 1)]
36
+ out = ["| " + " | ".join(headers) + " |",
37
+ "|" + "|".join(["---"] * len(headers)) + "|"]
38
+ for row_num, cells in chunk_rows:
39
+ padded = list(cells) + [""] * (max_col - len(cells))
40
+ out.append(f"| {row_num} | " + " | ".join(padded[:max_col]) + " |")
41
+ return out
42
+
43
+
9
44
  def extract(file_path):
10
45
  ensure_packages(PACKAGES)
11
46
  from openpyxl import load_workbook
47
+ from openpyxl.utils import get_column_letter
12
48
  from openpyxl.worksheet.worksheet import Worksheet
13
49
 
14
50
  wb = load_workbook(file_path, data_only=True)
@@ -21,57 +57,83 @@ def extract(file_path):
21
57
  for sheet_name in wb.sheetnames:
22
58
  ws = wb[sheet_name]
23
59
  text_parts.append(f"[Sheet: {sheet_name}]")
60
+ text_parts.append("")
24
61
 
25
62
  if not isinstance(ws, Worksheet):
26
63
  text_parts.append(f"({type(ws).__name__} — 데이터 없음)")
64
+ text_parts.append("")
27
65
  continue
28
66
 
29
67
  if ws.max_row is None or ws.max_row == 0:
30
68
  text_parts.append("(empty sheet)")
69
+ text_parts.append("")
31
70
  continue
32
71
 
33
- # Collect images for this sheet with anchor row info
34
- ws_images = getattr(ws, '_images', [])
35
- row_img_markers = {} # row_number -> list of img_idx
72
+ # Merged cells annotation
73
+ merged = list(ws.merged_cells.ranges)
74
+ if merged:
75
+ text_parts.append(f"[Merged: {', '.join(str(r) for r in merged)}]")
76
+ text_parts.append("")
77
+
78
+ ws_images = getattr(ws, "_images", [])
79
+ row_img_markers = {}
36
80
  for img in ws_images:
37
- data_fn = getattr(img, '_data', None)
81
+ data_fn = getattr(img, "_data", None)
38
82
  blob = data_fn() if callable(data_fn) else b""
39
- if blob:
40
- img_idx += 1
41
- images.append({
42
- "data": blob,
43
- "ext": "png",
44
- "context": f"sheet '{sheet_name}'",
45
- })
46
- anchor = getattr(img, 'anchor', None)
47
- anchor_row = None
48
- if anchor:
49
- _from = getattr(anchor, '_from', None)
50
- if _from:
51
- anchor_row = getattr(_from, 'row', None)
52
- if anchor_row is not None:
53
- anchor_row += 1 # openpyxl anchor is 0-based
54
- if anchor_row is None:
55
- anchor_row = ws.max_row or 1
56
- row_img_markers.setdefault(anchor_row, []).append(img_idx)
57
-
58
- # Output rows with inline image markers at anchor positions
83
+ if not blob:
84
+ continue
85
+ img_idx += 1
86
+ anchor = getattr(img, "anchor", None)
87
+ anchor_row = None
88
+ anchor_col = None
89
+ if anchor:
90
+ _from = getattr(anchor, "_from", None)
91
+ if _from:
92
+ anchor_row = getattr(_from, "row", None)
93
+ anchor_col = getattr(_from, "col", None)
94
+ if anchor_row is not None:
95
+ anchor_row += 1
96
+ if anchor_col is not None:
97
+ anchor_col += 1
98
+ if anchor_row is None:
99
+ anchor_row = ws.max_row or 1
100
+ cell_ref = ""
101
+ if anchor_col is not None:
102
+ cell_ref = f" anchor:{get_column_letter(anchor_col)}{anchor_row}"
103
+ else:
104
+ cell_ref = f" anchor:row {anchor_row}"
105
+ images.append({
106
+ "data": blob,
107
+ "ext": "png",
108
+ "context": f"sheet '{sheet_name}'{cell_ref}",
109
+ })
110
+ row_img_markers.setdefault(anchor_row, []).append(img_idx)
111
+
112
+ max_col = ws.max_column or 1
113
+ chunk = []
114
+
59
115
  for row in ws.iter_rows(values_only=False):
60
- cells = []
61
- for cell in row:
62
- val = cell.value
63
- cells.append(str(val).strip() if val is not None else "")
64
116
  row_num = row[0].row
65
- text_parts.append(f"[{row[0].column_letter}{row_num}] " + " | ".join(cells))
66
- for idx in row_img_markers.get(row_num, []):
67
- text_parts.append(f"[IMG:{idx}]")
117
+ cells = [_escape_md(c.value) for c in row]
118
+ chunk.append((row_num, cells))
119
+
120
+ if row_num in row_img_markers:
121
+ text_parts.extend(_render_chunk(chunk, max_col, get_column_letter))
122
+ text_parts.append("")
123
+ for idx in row_img_markers[row_num]:
124
+ text_parts.append(f"[IMG:{idx}]")
125
+ text_parts.append("")
126
+ chunk = []
127
+
128
+ if chunk:
129
+ text_parts.extend(_render_chunk(chunk, max_col, get_column_letter))
130
+ text_parts.append("")
68
131
 
69
- # Embedded objects from XLSX ZIP
70
132
  try:
71
- with zipfile.ZipFile(file_path, 'r') as zf:
133
+ with zipfile.ZipFile(file_path, "r") as zf:
72
134
  for name in zf.namelist():
73
- if 'embeddings/' in name.lower():
74
- filename = name.split('/')[-1]
135
+ if "embeddings/" in name.lower():
136
+ filename = name.split("/")[-1]
75
137
  data = zf.read(name)
76
138
  emb_idx += 1
77
139
  embedded.append({"filename": filename, "data": data})