@cyber-dash-tech/revela 0.2.0 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -37,8 +37,17 @@ Given a research brief specifying your topic and axis, you will:
37
37
  Use the **\`revela-workspace-scan\`** tool in a single call to discover all document
38
38
  files in the workspace (PDF, Word, Excel, PowerPoint, CSV, text).
39
39
 
40
- Then read every relevant file using the \`read\` tool. For PDFs and Office formats,
41
- the Revela plugin extracts text transparently — just call \`read\` normally.
40
+ Then select the files relevant to your research axis.
41
+
42
+ For every selected file, call **\`revela-extract-document-materials\`** first.
43
+ - \`pptx\`, \`docx\`, and \`xlsx\` will produce a manifest plus extracted text and any available embedded materials
44
+ - unsupported file types will be skipped automatically
45
+
46
+ After that, use the \`read\` tool on:
47
+ - the original relevant file when you want the plain extracted text
48
+ - the generated manifest and extracted image/table files when visual or tabular evidence matters
49
+
50
+ For PDFs and Office formats, the Revela plugin extracts text transparently — just call \`read\` normally.
42
51
 
43
52
  ---
44
53
 
@@ -125,6 +134,7 @@ Gaps:
125
134
  - **NEVER** ask the user for information you can find through search or workspace files
126
135
  - **NEVER** use the raw \`write\` tool — always use \`revela-research-save\`
127
136
  - **NEVER** fabricate image URLs — only record URLs you actually found
137
+ - **Always** call \`revela-extract-document-materials\` for every selected workspace file before deciding which extracted materials to read next
128
138
  - **Always** include source attribution on every data point
129
139
  - **Always** use tables for comparative data (more useful than bullets for presentations)
130
140
  - **Preserve** raw data — the primary agent will select what to include in slides
@@ -0,0 +1,695 @@
1
+ import { createHash } from "crypto"
2
+ import { existsSync, mkdirSync, readFileSync, realpathSync, statSync, writeFileSync } from "fs"
3
+ import { basename, dirname, extname, isAbsolute, join, relative, resolve } from "path"
4
+ import { DOMParser } from "@xmldom/xmldom"
5
+ import { unzipSync } from "fflate"
6
+ import { extractDocx } from "../read-hooks/extractors/docx"
7
+ import { extractPptx } from "../read-hooks/extractors/pptx"
8
+ import { extractXlsx } from "../read-hooks/extractors/xlsx"
9
+
10
/** One exported artifact (image or table file) recorded in the manifest. */
export type DocumentMaterial = {
  // Workspace-relative path of the exported file in the materials cache.
  path: string
  // Originating entry inside the Office archive, e.g. "ppt/media/image1.png".
  source_ref: string
  // Location label such as "slide-01" or "sheet-02", when it can be determined.
  page_or_slide?: string
  // Extra context, e.g. why only a document-wide association was possible.
  note?: string
}

/** An embedded asset that was deliberately not exported to the cache. */
export type SkippedAsset = {
  source_ref: string
  page_or_slide?: string
  // Why it was skipped: SVG asset, no slide relationship, or judged low-value.
  reason: "svg_asset" | "unmapped_media" | "low_value_asset"
  // Best-effort visual classification derived from the file name.
  kind?: "svg" | "icon" | "logo" | "overlay" | "decoration"
}

/** One drawable element of a PPTX slide, listed in z-order. */
export type PptxSlideElement = {
  id: string
  kind: "text" | "image" | "shape"
  // 1-based paint order within the slide's shape tree.
  zOrder: number
  // Bounding box in EMUs, when the element carries a transform.
  bbox?: { x: number; y: number; w: number; h: number }
  // Heuristic layout-role guesses (see applyPptxHeuristics); absent = no signal.
  likelyBackground?: boolean
  likelyHeroImage?: boolean
  likelyLogo?: boolean
  likelyOverlayMask?: boolean
  likelyDecoration?: boolean
  text?: string
  // For images: archive path of the referenced media and, if exported, its cache path.
  source_ref?: string
  path?: string
  asset_status?: "kept" | "skipped"
  name?: string
}

/** Structural outline of a single slide. Width/height are in EMUs when known. */
export type PptxSlide = {
  slide: string
  width?: number
  height?: number
  elements: PptxSlideElement[]
}

/** Result returned to the tool caller for one input file. */
export type DocumentMaterialsResult = {
  status: "processed" | "skipped" | "failed"
  // Workspace-relative source path (or the raw input path on failure).
  source: string
  type: "pptx" | "docx" | "xlsx" | "other"
  cache_dir?: string
  manifest_path?: string
  text_path?: string
  images?: DocumentMaterial[]
  skipped_assets?: SkippedAsset[]
  slides?: PptxSlide[]
  tables?: DocumentMaterial[]
  // Populated for "skipped" (machine tag) and "failed" (error message).
  reason?: string
}

/** The Office formats this module can materialize. */
type SupportedType = Exclude<DocumentMaterialsResult["type"], "other">

/** Shape of the manifest.json persisted in the cache directory. */
type CachedManifest = {
  source: string
  type: SupportedType
  // sha1 over absolute path + mtime + size; also the cache directory name.
  fingerprint: string
  cache_dir: string
  manifest_path: string
  text_path: string
  images: DocumentMaterial[]
  skipped_assets: SkippedAsset[]
  slides: PptxSlide[]
  tables: DocumentMaterial[]
}

/** Images kept plus assets skipped while scanning a PPTX archive. */
type PptxImageExtraction = {
  images: DocumentMaterial[]
  skipped_assets: SkippedAsset[]
}

/** Lower-cased file extensions that map to a materializer. */
const SUPPORTED_EXTENSIONS: Record<string, SupportedType> = {
  ".pptx": "pptx",
  ".docx": "docx",
  ".xlsx": "xlsx",
}
87
+
88
+ function normalizeZipTarget(basePath: string, target: string): string {
89
+ const segments = join(dirname(basePath), target).split("/")
90
+ const normalized: string[] = []
91
+
92
+ for (const segment of segments) {
93
+ if (!segment || segment === ".") continue
94
+ if (segment === "..") {
95
+ normalized.pop()
96
+ continue
97
+ }
98
+ normalized.push(segment)
99
+ }
100
+
101
+ return normalized.join("/")
102
+ }
103
+
104
+ function ensureWorkspacePath(filePath: string, workspaceDir: string): string {
105
+ const resolvedWorkspace = realpathSync(resolve(workspaceDir))
106
+ const candidate = isAbsolute(filePath) ? resolve(filePath) : resolve(workspaceDir, filePath)
107
+ const resolvedFile = existsSync(candidate)
108
+ ? realpathSync(candidate)
109
+ : candidate
110
+
111
+ if (resolvedFile !== resolvedWorkspace && !resolvedFile.startsWith(resolvedWorkspace + "/")) {
112
+ throw new Error("file must be within workspace")
113
+ }
114
+
115
+ return resolvedFile
116
+ }
117
+
118
+ function normalizeWorkspaceChild(filePath: string, workspaceDir: string): string {
119
+ const workspaceAlias = resolve(workspaceDir)
120
+ const workspaceReal = realpathSync(workspaceAlias)
121
+ const candidate = resolve(filePath)
122
+
123
+ if (existsSync(candidate)) return realpathSync(candidate)
124
+
125
+ if (candidate === workspaceAlias || candidate.startsWith(workspaceAlias + "/")) {
126
+ return join(workspaceReal, relative(workspaceAlias, candidate))
127
+ }
128
+
129
+ return candidate
130
+ }
131
+
132
+ function workspaceRelative(filePath: string, workspaceDir: string): string {
133
+ const resolvedWorkspace = realpathSync(resolve(workspaceDir))
134
+ const resolvedFile = normalizeWorkspaceChild(filePath, workspaceDir)
135
+ return relative(resolvedWorkspace, resolvedFile).replace(/\\/g, "/")
136
+ }
137
+
138
+ function buildFingerprint(filePath: string): string {
139
+ const stat = statSync(filePath)
140
+ return createHash("sha1")
141
+ .update(`${resolve(filePath)}:${stat.mtimeMs}:${stat.size}`)
142
+ .digest("hex")
143
+ }
144
+
145
+ function writeCachedBuffer(targetPath: string, buf: Uint8Array): void {
146
+ mkdirSync(dirname(targetPath), { recursive: true })
147
+ writeFileSync(targetPath, new Uint8Array(buf))
148
+ }
149
+
150
+ function materialPath(cacheDir: string, workspaceDir: string, ...segments: string[]): string {
151
+ return workspaceRelative(join(cacheDir, ...segments), workspaceDir)
152
+ }
153
+
154
+ function parseXml(files: Record<string, Uint8Array>, path: string): any | null {
155
+ const file = files[path]
156
+ if (!file) return null
157
+ return new DOMParser().parseFromString(new TextDecoder().decode(file), "text/xml")
158
+ }
159
+
160
+ function xmlLocalName(node: any): string {
161
+ return node?.localName ?? String(node?.nodeName ?? "").split(":").pop() ?? ""
162
+ }
163
+
164
+ function xmlElementChildren(node: any): any[] {
165
+ const children: any[] = []
166
+ const childNodes = node?.childNodes ?? []
167
+ for (let i = 0; i < childNodes.length; i++) {
168
+ const child = childNodes[i]
169
+ if (child?.nodeType === 1) children.push(child)
170
+ }
171
+ return children
172
+ }
173
+
174
+ function xmlDescendantsByLocalName(node: any, name: string): any[] {
175
+ const matches: any[] = []
176
+ const walk = (current: any) => {
177
+ for (const child of xmlElementChildren(current)) {
178
+ if (xmlLocalName(child) === name) matches.push(child)
179
+ walk(child)
180
+ }
181
+ }
182
+ walk(node)
183
+ return matches
184
+ }
185
+
186
+ function firstDescendantByLocalName(node: any, name: string): any | null {
187
+ const [match] = xmlDescendantsByLocalName(node, name)
188
+ return match ?? null
189
+ }
190
+
191
+ function extractShapeText(node: any): string | undefined {
192
+ const texts = xmlDescendantsByLocalName(node, "t")
193
+ .map((textNode) => textNode.textContent?.trim())
194
+ .filter(Boolean)
195
+ return texts.length > 0 ? texts.join("\n") : undefined
196
+ }
197
+
198
+ function extractElementName(node: any): string | undefined {
199
+ return firstDescendantByLocalName(node, "cNvPr")?.getAttribute?.("name") || undefined
200
+ }
201
+
202
+ function parseCoordinate(value: string | null | undefined): number | undefined {
203
+ if (value == null || value === "") return undefined
204
+ const parsed = Number(value)
205
+ return Number.isFinite(parsed) ? parsed : undefined
206
+ }
207
+
208
+ function extractElementBBox(node: any): { x: number; y: number; w: number; h: number } | undefined {
209
+ const xfrm = firstDescendantByLocalName(node, "xfrm")
210
+ if (!xfrm) return undefined
211
+
212
+ const off = firstDescendantByLocalName(xfrm, "off")
213
+ const ext = firstDescendantByLocalName(xfrm, "ext")
214
+ if (!off || !ext) return undefined
215
+
216
+ const x = parseCoordinate(off.getAttribute?.("x"))
217
+ const y = parseCoordinate(off.getAttribute?.("y"))
218
+ const w = parseCoordinate(ext.getAttribute?.("cx"))
219
+ const h = parseCoordinate(ext.getAttribute?.("cy"))
220
+ if ([x, y, w, h].some((value) => value == null)) return undefined
221
+
222
+ return { x: x!, y: y!, w: w!, h: h! }
223
+ }
224
+
225
+ function getPptxSlideSize(files: Record<string, Uint8Array>): { width: number; height: number } | undefined {
226
+ const doc = parseXml(files, "ppt/presentation.xml")
227
+ const size = firstDescendantByLocalName(doc, "sldSz")
228
+ if (!size) return undefined
229
+
230
+ const width = parseCoordinate(size.getAttribute?.("cx"))
231
+ const height = parseCoordinate(size.getAttribute?.("cy"))
232
+ if (width == null || height == null) return undefined
233
+ return { width, height }
234
+ }
235
+
236
+ function isNearCorner(
237
+ bbox: { x: number; y: number; w: number; h: number },
238
+ slideWidth: number,
239
+ slideHeight: number,
240
+ ): boolean {
241
+ const thresholdX = slideWidth * 0.12
242
+ const thresholdY = slideHeight * 0.12
243
+ const right = bbox.x + bbox.w
244
+ const bottom = bbox.y + bbox.h
245
+ return (
246
+ (bbox.x <= thresholdX && bbox.y <= thresholdY) ||
247
+ (right >= slideWidth - thresholdX && bbox.y <= thresholdY) ||
248
+ (bbox.x <= thresholdX && bottom >= slideHeight - thresholdY) ||
249
+ (right >= slideWidth - thresholdX && bottom >= slideHeight - thresholdY)
250
+ )
251
+ }
252
+
253
+ function applyPptxHeuristics(
254
+ slide: PptxSlide,
255
+ slideWidth: number | undefined,
256
+ slideHeight: number | undefined,
257
+ ): PptxSlide {
258
+ if (!slideWidth || !slideHeight) return slide
259
+
260
+ const slideArea = slideWidth * slideHeight
261
+ slide.elements = slide.elements.map((element) => {
262
+ if (!element.bbox) return element
263
+
264
+ const areaRatio = (element.bbox.w * element.bbox.h) / slideArea
265
+ const sourceName = `${element.source_ref ?? ""} ${element.name ?? ""}`.toLowerCase()
266
+
267
+ if (element.kind === "image") {
268
+ const flags: Partial<PptxSlideElement> = {}
269
+ if (areaRatio >= 0.75 && element.asset_status === "kept") flags.likelyBackground = true
270
+ else if (areaRatio >= 0.2 && element.asset_status === "kept") flags.likelyHeroImage = true
271
+ if (areaRatio <= 0.03 && isNearCorner(element.bbox, slideWidth, slideHeight)) flags.likelyLogo = true
272
+ if (/(logo|brand)/.test(sourceName)) flags.likelyLogo = true
273
+ if (/(mask|overlay|shadow)/.test(sourceName) || element.asset_status === "skipped") flags.likelyOverlayMask = true
274
+ if (/(arrow|ornament|decoration)/.test(sourceName)) flags.likelyDecoration = true
275
+ return Object.keys(flags).length > 0 ? { ...element, ...flags } : element
276
+ }
277
+
278
+ if (element.kind === "shape") {
279
+ const flags: Partial<PptxSlideElement> = {}
280
+ if (areaRatio >= 0.4) flags.likelyOverlayMask = true
281
+ if (areaRatio <= 0.03 || /(arrow|ornament|decoration)/.test(sourceName)) flags.likelyDecoration = true
282
+ return Object.keys(flags).length > 0 ? { ...element, ...flags } : element
283
+ }
284
+
285
+ return element
286
+ })
287
+
288
+ return slide
289
+ }
290
+
291
+ function getSlideMediaTargets(files: Record<string, Uint8Array>, slidePath: string): Map<string, string> {
292
+ const relPath = slidePath.replace("/slides/", "/slides/_rels/") + ".rels"
293
+ const doc = parseXml(files, relPath)
294
+ const targets = new Map<string, string>()
295
+ if (!doc) return targets
296
+
297
+ const relationships = doc.getElementsByTagName("Relationship")
298
+ for (let i = 0; i < relationships.length; i++) {
299
+ const rel = relationships[i]
300
+ const id = rel.getAttribute("Id")
301
+ const target = rel.getAttribute("Target")
302
+ if (!id || !target) continue
303
+ const normalized = normalizeZipTarget(slidePath, target)
304
+ if (!normalized.startsWith("ppt/media/")) continue
305
+ targets.set(id, normalized)
306
+ }
307
+
308
+ return targets
309
+ }
310
+
311
/**
 * Build a structural outline of every slide: one PptxSlideElement per
 * top-level shape-tree child, in paint order, cross-referenced against the
 * already-exported images and skipped assets so each picture carries its
 * cache path and asset_status. Heuristic role flags are applied when the
 * slide size is known.
 */
function extractPptxSlides(
  files: Record<string, Uint8Array>,
  images: DocumentMaterial[],
  skippedAssets: SkippedAsset[],
): PptxSlide[] {
  const slideSize = getPptxSlideSize(files)
  // NOTE(review): if one media file appears on several slides, these maps keep
  // only one entry per source_ref — later slides reuse the same material.
  const keptBySource = new Map(images.map((image) => [image.source_ref, image]))
  const skippedBySource = new Map(skippedAssets.map((asset) => [asset.source_ref, asset]))
  const slideFiles = Object.keys(files)
    .filter((file) => /^ppt\/slides\/slide\d+\.xml$/.test(file))
    .sort((a, b) => a.localeCompare(b, undefined, { numeric: true })) // slide2 before slide10

  return slideFiles.map((slidePath) => {
    const slideNumber = slidePath.match(/slide(\d+)\.xml$/)?.[1] ?? "0"
    const slideId = `slide-${slideNumber.padStart(2, "0")}`
    const doc = parseXml(files, slidePath)
    const mediaTargets = getSlideMediaTargets(files, slidePath)
    const elements: PptxSlideElement[] = []

    if (!doc) return { slide: slideId, ...(slideSize ?? {}), elements }

    // <p:spTree> holds the slide's drawable children in z (paint) order.
    const spTree = firstDescendantByLocalName(doc, "spTree")
    if (!spTree) return { slide: slideId, ...(slideSize ?? {}), elements }

    for (const node of xmlElementChildren(spTree)) {
      const kind = xmlLocalName(node)
      // Skip the tree's non-visual/group property headers — they are not shapes.
      if (kind === "nvGrpSpPr" || kind === "grpSpPr") continue

      const zOrder = elements.length + 1
      const id = `${slideId}-element-${String(zOrder).padStart(2, "0")}`
      const name = extractElementName(node)
      const bbox = extractElementBBox(node)

      // <p:sp>: a shape — classified as "text" when it contains any run text.
      if (kind === "sp") {
        const text = extractShapeText(node)
        elements.push(text
          ? { id, kind: "text", zOrder, text, ...(bbox ? { bbox } : {}), ...(name ? { name } : {}) }
          : { id, kind: "shape", zOrder, ...(bbox ? { bbox } : {}), ...(name ? { name } : {}) })
        continue
      }

      // <p:pic>: a picture — resolve its r:embed relationship to the media entry,
      // then attach the exported cache path or the skipped marker if known.
      if (kind === "pic") {
        const blip = firstDescendantByLocalName(node, "blip")
        const rid = blip?.getAttribute?.("r:embed") || blip?.getAttribute?.("embed") || undefined
        const sourceRef = rid ? mediaTargets.get(rid) : undefined
        const kept = sourceRef ? keptBySource.get(sourceRef) : undefined
        const skipped = sourceRef ? skippedBySource.get(sourceRef) : undefined

        elements.push({
          id,
          kind: "image",
          zOrder,
          ...(bbox ? { bbox } : {}),
          ...(name ? { name } : {}),
          ...(sourceRef ? { source_ref: sourceRef } : {}),
          ...(kept?.path ? { path: kept.path } : {}),
          ...((kept || skipped) ? { asset_status: kept ? "kept" as const : "skipped" as const } : {}),
        })
        continue
      }

      // Connectors, charts/tables (graphicFrame) and groups are summarized as shapes.
      if (kind === "cxnSp" || kind === "graphicFrame" || kind === "grpSp") {
        elements.push({ id, kind: "shape", zOrder, ...(bbox ? { bbox } : {}), ...(name ? { name } : {}) })
      }
    }

    return applyPptxHeuristics({ slide: slideId, ...(slideSize ?? {}), elements }, slideSize?.width, slideSize?.height)
  })
}
380
+
381
+ const LOW_VALUE_PPTX_ASSET = /(icon|logo|mask|overlay|shadow|decoration|ornament|arrow)/i
382
+
383
+ function classifySkippedAsset(sourceRef: string, reason: SkippedAsset["reason"]): SkippedAsset["kind"] | undefined {
384
+ if (sourceRef.endsWith(".svg")) return "svg"
385
+ if (/icon/i.test(sourceRef)) return "icon"
386
+ if (/logo/i.test(sourceRef)) return "logo"
387
+ if (/(mask|overlay|shadow)/i.test(sourceRef)) return "overlay"
388
+ if (/(decoration|ornament|arrow)/i.test(sourceRef)) return "decoration"
389
+ if (reason === "svg_asset") return "svg"
390
+ return undefined
391
+ }
392
+
393
+ function shouldSkipPptxAsset(sourceRef: string): { reason: SkippedAsset["reason"]; kind?: SkippedAsset["kind"] } | null {
394
+ if (sourceRef.endsWith(".svg")) {
395
+ return { reason: "svg_asset", kind: "svg" }
396
+ }
397
+ if (LOW_VALUE_PPTX_ASSET.test(basename(sourceRef))) {
398
+ return { reason: "low_value_asset", kind: classifySkippedAsset(sourceRef, "low_value_asset") }
399
+ }
400
+ return null
401
+ }
402
+
403
/**
 * Export every media file referenced by a slide relationship to the cache,
 * skipping SVGs and low-value assets (icons, logos, masks, …). Media present
 * in the archive but referenced by no slide is recorded as "unmapped_media".
 */
function extractPptxImages(files: Record<string, Uint8Array>, cacheDir: string, workspaceDir: string): PptxImageExtraction {
  const relFiles = Object.keys(files)
    .filter((file) => /^ppt\/slides\/_rels\/slide\d+\.xml\.rels$/.test(file))
    .sort((a, b) => a.localeCompare(b, undefined, { numeric: true })) // slide2 before slide10

  const images: DocumentMaterial[] = []
  const skipped_assets: SkippedAsset[] = []
  // Every media entry referenced by at least one slide — used for the
  // "unmapped" sweep at the end.
  const seenTargets = new Set<string>()

  for (const relPath of relFiles) {
    const slideMatch = relPath.match(/slide(\d+)\.xml\.rels$/)
    const slideNumber = slideMatch?.[1] ?? "0"
    // The slide XML the .rels file belongs to; relationship targets resolve
    // relative to it.
    const slidePath = relPath.replace("/_rels/", "/").replace(/\.rels$/, "")
    const doc = parseXml(files, relPath)
    if (!doc) continue
    const relationships = doc.getElementsByTagName("Relationship")
    // Per-slide counter used only for kept images, so exported names stay dense.
    let imageIndex = 0

    for (let i = 0; i < relationships.length; i++) {
      const rel = relationships[i]
      const target = rel.getAttribute("Target")
      if (!target) continue
      const normalized = normalizeZipTarget(slidePath, target)
      if (!normalized.startsWith("ppt/media/")) continue
      const media = files[normalized]
      if (!media) continue

      seenTargets.add(normalized)
      const skipped = shouldSkipPptxAsset(normalized)
      if (skipped) {
        skipped_assets.push({
          source_ref: normalized,
          page_or_slide: `slide-${slideNumber.padStart(2, "0")}`,
          reason: skipped.reason,
          kind: skipped.kind,
        })
        continue
      }

      imageIndex += 1
      // e.g. "slide-03-image-01.png" — keeps the original extension.
      const exportedName = `slide-${slideNumber.padStart(2, "0")}-image-${String(imageIndex).padStart(2, "0")}${extname(normalized)}`
      const outputPath = join(cacheDir, "images", exportedName)
      writeCachedBuffer(outputPath, media)

      images.push({
        path: materialPath(cacheDir, workspaceDir, "images", exportedName),
        source_ref: normalized,
        page_or_slide: `slide-${slideNumber.padStart(2, "0")}`,
      })
    }
  }

  // Record media that no slide relationship pointed at (e.g. referenced only
  // from layouts/masters) so the manifest accounts for every archive entry.
  const remainingMedia = Object.keys(files)
    .filter((file) => file.startsWith("ppt/media/") && !seenTargets.has(file))
    .sort()

  for (const mediaPath of remainingMedia) {
    skipped_assets.push({
      source_ref: mediaPath,
      reason: "unmapped_media",
      kind: classifySkippedAsset(mediaPath, "unmapped_media"),
    })
  }

  return { images, skipped_assets }
}
469
+
470
+ function extractDocxImages(files: Record<string, Uint8Array>, cacheDir: string, workspaceDir: string): DocumentMaterial[] {
471
+ return Object.keys(files)
472
+ .filter((file) => file.startsWith("word/media/"))
473
+ .sort()
474
+ .map((mediaPath, index) => {
475
+ const exportedName = `document-image-${String(index + 1).padStart(2, "0")}${extname(mediaPath)}`
476
+ const outputPath = join(cacheDir, "images", exportedName)
477
+ writeCachedBuffer(outputPath, files[mediaPath])
478
+
479
+ return {
480
+ path: materialPath(cacheDir, workspaceDir, "images", exportedName),
481
+ source_ref: mediaPath,
482
+ note: "Document-wide association",
483
+ }
484
+ })
485
+ }
486
+
487
/**
 * Export workbook images, attributing each to its worksheet where possible.
 * Resolution is two-phase: (1) map each drawing part to the media entries its
 * <a:blip r:embed> references point at, (2) walk each sheet's relationships to
 * find which drawing(s) it uses, exporting those media files under the sheet's
 * name. Media reachable by no sheet is still exported, flagged as unmapped.
 */
function extractXlsxImages(files: Record<string, Uint8Array>, cacheDir: string, workspaceDir: string): DocumentMaterial[] {
  // Phase 1: drawing part path -> ordered media entry paths it embeds.
  const drawingToImages = new Map<string, string[]>()
  const drawingRelFiles = Object.keys(files)
    .filter((file) => /^xl\/drawings\/_rels\/drawing\d+\.xml\.rels$/.test(file))
    .sort((a, b) => a.localeCompare(b, undefined, { numeric: true }))

  for (const relPath of drawingRelFiles) {
    const relDoc = parseXml(files, relPath)
    if (!relDoc) continue
    const drawingPath = relPath.replace("/_rels/", "/").replace(/\.rels$/, "")
    const drawingDoc = parseXml(files, drawingPath)
    if (!drawingDoc) continue

    // Relationship id -> normalized "xl/media/…" target for this drawing.
    const targetByRid = new Map<string, string>()
    const relationships = relDoc.getElementsByTagName("Relationship")
    for (let i = 0; i < relationships.length; i++) {
      const rel = relationships[i]
      const id = rel.getAttribute("Id")
      const target = rel.getAttribute("Target")
      if (!id || !target) continue
      const normalized = normalizeZipTarget(drawingPath, target)
      if (normalized.startsWith("xl/media/")) {
        targetByRid.set(id, normalized)
      }
    }

    // Collect the media referenced by the drawing's <a:blip> elements, in
    // document order. NOTE(review): lookup uses the literal "a:blip" tag name,
    // so a non-standard namespace prefix would not be matched here.
    const blips = drawingDoc.getElementsByTagName("a:blip")
    const mediaPaths: string[] = []
    for (let i = 0; i < blips.length; i++) {
      const rid = blips[i].getAttribute("r:embed") || blips[i].getAttribute("embed")
      if (!rid) continue
      const mediaPath = targetByRid.get(rid)
      if (mediaPath) mediaPaths.push(mediaPath)
    }

    if (mediaPaths.length > 0) {
      drawingToImages.set(drawingPath, mediaPaths)
    }
  }

  // Phase 2: attribute drawings to sheets via the sheet .rels files.
  const images: DocumentMaterial[] = []
  const exportedMedia = new Set<string>()
  const sheetRelFiles = Object.keys(files)
    .filter((file) => /^xl\/worksheets\/_rels\/sheet\d+\.xml\.rels$/.test(file))
    .sort((a, b) => a.localeCompare(b, undefined, { numeric: true }))

  for (const relPath of sheetRelFiles) {
    const sheetMatch = relPath.match(/sheet(\d+)\.xml\.rels$/)
    const sheetNumber = sheetMatch?.[1] ?? "0"
    const sheetPath = relPath.replace("/_rels/", "/").replace(/\.rels$/, "")
    const relDoc = parseXml(files, relPath)
    if (!relDoc) continue
    const relationships = relDoc.getElementsByTagName("Relationship")
    // Per-sheet counter so exported names are dense within each sheet.
    let imageIndex = 0

    for (let i = 0; i < relationships.length; i++) {
      const rel = relationships[i]
      const target = rel.getAttribute("Target")
      if (!target) continue
      const normalized = normalizeZipTarget(sheetPath, target)
      const mediaPaths = drawingToImages.get(normalized)
      if (!mediaPaths) continue

      for (const mediaPath of mediaPaths) {
        const media = files[mediaPath]
        if (!media) continue
        imageIndex += 1
        exportedMedia.add(mediaPath)
        // e.g. "sheet-02-image-01.png" — keeps the original extension.
        const exportedName = `sheet-${sheetNumber.padStart(2, "0")}-image-${String(imageIndex).padStart(2, "0")}${extname(mediaPath)}`
        const outputPath = join(cacheDir, "images", exportedName)
        writeCachedBuffer(outputPath, media)

        images.push({
          path: materialPath(cacheDir, workspaceDir, "images", exportedName),
          source_ref: mediaPath,
          page_or_slide: `sheet-${sheetNumber.padStart(2, "0")}`,
        })
      }
    }
  }

  // Export leftover media (no resolvable sheet-level relationship) so nothing
  // embedded in the workbook is silently lost.
  const unmapped = Object.keys(files)
    .filter((file) => file.startsWith("xl/media/") && !exportedMedia.has(file))
    .sort()

  for (const mediaPath of unmapped) {
    const exportedName = `unmapped-${basename(mediaPath)}`
    const outputPath = join(cacheDir, "images", exportedName)
    writeCachedBuffer(outputPath, files[mediaPath])

    images.push({
      path: materialPath(cacheDir, workspaceDir, "images", exportedName),
      source_ref: mediaPath,
      note: "No sheet-level relationship found",
    })
  }

  return images
}
586
+
587
+ function extractTables(type: SupportedType, textPath: string): DocumentMaterial[] {
588
+ if (type !== "xlsx") return []
589
+ return [{ path: textPath, source_ref: "workbook", note: "Sheet text and tables extracted to text file" }]
590
+ }
591
+
592
/**
 * Materialize one supported Office file: extract its plain text, export
 * embedded images (plus a per-slide structural outline for PPTX), then
 * persist a manifest so repeat calls are served from the cache.
 *
 * The cache directory is keyed by the file's fingerprint (absolute path +
 * mtime + size), so a modified file naturally lands in a fresh directory.
 */
async function processOfficeFile(filePath: string, workspaceDir: string, type: SupportedType): Promise<DocumentMaterialsResult> {
  const relativeSource = workspaceRelative(filePath, workspaceDir)
  const fingerprint = buildFingerprint(filePath)
  const cacheDir = join(workspaceDir, ".opencode", "revela", "doc-materials", fingerprint)
  const manifestPath = join(cacheDir, "manifest.json")

  // Cache hit: replay the stored manifest verbatim.
  // NOTE(review): the manifest content is trusted as-is — a corrupt
  // manifest.json would throw here and surface as a "failed" result upstream.
  if (existsSync(manifestPath)) {
    const manifest = JSON.parse(readFileSync(manifestPath, "utf-8")) as CachedManifest
    return {
      status: "processed",
      source: manifest.source,
      type: manifest.type,
      cache_dir: manifest.cache_dir,
      manifest_path: manifest.manifest_path,
      text_path: manifest.text_path,
      images: manifest.images,
      skipped_assets: manifest.skipped_assets,
      slides: manifest.slides,
      tables: manifest.tables,
    }
  }

  mkdirSync(join(cacheDir, "images"), { recursive: true })
  mkdirSync(join(cacheDir, "tables"), { recursive: true })

  // OOXML files are zip archives; the same decoded archive feeds both the
  // text extractors and the image exporters below.
  const buf = readFileSync(filePath)
  const files = unzipSync(new Uint8Array(buf))

  // Full-document plain text via the format-specific extractor.
  const text = type === "pptx"
    ? await extractPptx(buf)
    : type === "docx"
      ? await extractDocx(buf)
      : await extractXlsx(buf)

  const textPath = join(cacheDir, "text.txt")
  writeFileSync(textPath, `[Extracted from: ${basename(filePath)}]\n\n${text}`, "utf-8")

  // PPTX is handled specially: its extraction also reports skipped assets,
  // which the slide outline needs for asset_status cross-referencing.
  const pptxAssets = type === "pptx"
    ? extractPptxImages(files, cacheDir, workspaceDir)
    : null
  const images = type === "pptx"
    ? pptxAssets!.images
    : type === "docx"
      ? extractDocxImages(files, cacheDir, workspaceDir)
      : extractXlsxImages(files, cacheDir, workspaceDir)
  const slides = type === "pptx"
    ? extractPptxSlides(files, images, pptxAssets!.skipped_assets)
    : undefined

  // All paths in the result are workspace-relative so the caller can feed
  // them straight to the read tool.
  const result: DocumentMaterialsResult = {
    status: "processed",
    source: relativeSource,
    type,
    cache_dir: workspaceRelative(cacheDir, workspaceDir),
    manifest_path: workspaceRelative(manifestPath, workspaceDir),
    text_path: workspaceRelative(textPath, workspaceDir),
    images,
    skipped_assets: pptxAssets?.skipped_assets ?? [],
    slides,
    tables: extractTables(type, workspaceRelative(textPath, workspaceDir)),
  }

  // Persist the manifest last so a partially-written cache directory is never
  // mistaken for a completed run on the next call.
  const manifest: CachedManifest = {
    source: result.source,
    type,
    fingerprint,
    cache_dir: result.cache_dir!,
    manifest_path: result.manifest_path!,
    text_path: result.text_path!,
    images: result.images ?? [],
    skipped_assets: result.skipped_assets ?? [],
    slides: result.slides ?? [],
    tables: result.tables ?? [],
  }

  writeFileSync(manifestPath, JSON.stringify(manifest, null, 2), "utf-8")
  return result
}
670
+
671
+ export async function extractDocumentMaterials(filePath: string, workspaceDir: string): Promise<DocumentMaterialsResult> {
672
+ try {
673
+ const resolvedFile = ensureWorkspacePath(filePath, workspaceDir)
674
+ const relativeSource = workspaceRelative(resolvedFile, workspaceDir)
675
+ const type = SUPPORTED_EXTENSIONS[extname(resolvedFile).toLowerCase()]
676
+
677
+ if (!type) {
678
+ return {
679
+ status: "skipped",
680
+ source: relativeSource,
681
+ type: "other",
682
+ reason: "unsupported_file_type",
683
+ }
684
+ }
685
+
686
+ return await processOfficeFile(resolvedFile, workspaceDir, type)
687
+ } catch (e) {
688
+ return {
689
+ status: "failed",
690
+ source: filePath,
691
+ type: "other",
692
+ reason: e instanceof Error ? e.message : String(e),
693
+ }
694
+ }
695
+ }
@@ -0,0 +1,45 @@
1
+ import { basename, extname } from "path"
2
+ export const OFFICE_EXTENSIONS = new Set([".docx", ".pptx", ".xlsx"])
3
+ export const IMAGE_EXTENSIONS = new Set([".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif", ".webp", ".gif"])
4
+
5
+ export type ReadStrategy =
6
+ | "before-materialize-document"
7
+ | "after-extract-text"
8
+ | "after-compress-image"
9
+ | "passthrough"
10
+
11
+ export function classifyReadFile(filePath: string): ReadStrategy {
12
+ const ext = extname(filePath).toLowerCase()
13
+ if (OFFICE_EXTENSIONS.has(ext)) return "before-materialize-document"
14
+ if (ext === ".pdf") return "after-extract-text"
15
+ if (IMAGE_EXTENSIONS.has(ext)) return "after-compress-image"
16
+ return "passthrough"
17
+ }
18
+
19
+ export function formatExtractedText(filePath: string, text: string): string {
20
+ return `[Extracted from: ${basename(filePath)}]\n\n${text}`
21
+ }
22
+
23
+ export function buildOfficeReadView(
24
+ filePath: string,
25
+ text: string,
26
+ images: Array<{ path: string }> | undefined,
27
+ ): string {
28
+ const lines = [
29
+ `# Extracted from: ${basename(filePath)}`,
30
+ "",
31
+ "## Text",
32
+ "",
33
+ text.trim() || "No text extracted.",
34
+ ]
35
+
36
+ lines.push("", "## Images", "")
37
+
38
+ if (!images?.length) {
39
+ lines.push("- None")
40
+ } else {
41
+ for (const image of images) lines.push(`- ${image.path}`)
42
+ }
43
+
44
+ return lines.join("\n")
45
+ }
@@ -4,7 +4,7 @@
4
4
  * Entry point for the read-hooks module.
5
5
  * Exports preRead and postRead for use in plugins/revela.ts hook handlers.
6
6
  *
7
- * preRead → tool.execute.before: redirect binary files (DOCX/PPTX/XLSX) to temp txt
7
+ * preRead → tool.execute.before: materialize Office docs and redirect to temp markdown
8
8
  * postRead → tool.execute.after: transform PDF/image attachments before LLM sees them
9
9
  */
10
10
 
@@ -0,0 +1,77 @@
1
+ import { readFileSync } from "fs"
2
+ import { join } from "path"
3
+ import type { PptxSlide } from "../document-materials/extract"
4
+ import { extractDocumentMaterials } from "../document-materials/extract"
5
+ import { buildOfficeReadView } from "./dispatch"
6
+ import { extractDocx } from "./extractors/docx"
7
+ import { extractPptx } from "./extractors/pptx"
8
+ import { extractXlsx } from "./extractors/xlsx"
9
+ import { formatExtractedText } from "./dispatch"
10
+
11
+ const HANDLERS: Record<string, (buf: Buffer) => Promise<string>> = {
12
+ ".docx": extractDocx,
13
+ ".pptx": extractPptx,
14
+ ".xlsx": extractXlsx,
15
+ }
16
+
17
+ function buildPptxStructureHints(slides: PptxSlide[] | undefined): string {
18
+ if (!slides?.length) return ""
19
+
20
+ const lines = ["", "## Slide Structure", ""]
21
+ for (const slide of slides) {
22
+ const textCount = slide.elements.filter((element) => element.kind === "text").length
23
+ const keptImageCount = slide.elements.filter((element) => element.kind === "image" && element.asset_status === "kept").length
24
+ const skippedImageCount = slide.elements.filter((element) => element.kind === "image" && element.asset_status === "skipped").length
25
+ const shapeCount = slide.elements.filter((element) => element.kind === "shape").length
26
+ const summary = [
27
+ textCount > 0 ? `${textCount} text` : null,
28
+ keptImageCount > 0 ? `${keptImageCount} kept image` : null,
29
+ skippedImageCount > 0 ? `${skippedImageCount} skipped image` : null,
30
+ shapeCount > 0 ? `${shapeCount} shape` : null,
31
+ ].filter(Boolean).join(", ") || "no parsed elements"
32
+ lines.push(`- ${slide.slide}: ${summary}`)
33
+
34
+ const roleSummary = [
35
+ countRole(slide, (element) => element.likelyBackground, "background image"),
36
+ countRole(slide, (element) => element.likelyHeroImage, "hero image"),
37
+ countRole(slide, (element) => element.likelyLogo, "logo"),
38
+ countRole(slide, (element) => element.likelyOverlayMask, "overlay"),
39
+ countRole(slide, (element) => element.likelyDecoration, "decoration"),
40
+ ].filter(Boolean).join(", ")
41
+ if (roleSummary) lines.push(` likely roles: ${roleSummary}`)
42
+ }
43
+
44
+ return lines.join("\n")
45
+ }
46
+
47
+ function countRole(
48
+ slide: PptxSlide,
49
+ predicate: (element: PptxSlide["elements"][number]) => boolean | undefined,
50
+ label: string,
51
+ ): string | null {
52
+ const count = slide.elements.filter(predicate).length
53
+ if (count === 0) return null
54
+ return `${count} ${label}${count === 1 ? "" : "s"}`
55
+ }
56
+
57
+ export async function createOfficeReadView(filePath: string, workspaceDir: string): Promise<string> {
58
+ const ext = filePath.slice(filePath.lastIndexOf(".")).toLowerCase()
59
+ const handler = HANDLERS[ext]
60
+ if (!handler) throw new Error(`unsupported office file type: ${ext}`)
61
+
62
+ const materialized = await extractDocumentMaterials(filePath, workspaceDir)
63
+
64
+ if (materialized.status === "processed" && materialized.text_path) {
65
+ const textPath = join(workspaceDir, materialized.text_path)
66
+ const extracted = readFileSync(textPath, "utf-8")
67
+ const text = extracted.replace(/^\[Extracted from: .*?\]\n\n/, "")
68
+ const view = buildOfficeReadView(filePath, text, materialized.images)
69
+ return filePath.toLowerCase().endsWith(".pptx")
70
+ ? view + buildPptxStructureHints(materialized.slides)
71
+ : view
72
+ }
73
+
74
+ const buf = readFileSync(filePath)
75
+ const text = await handler(buf)
76
+ return formatExtractedText(filePath, text)
77
+ }
@@ -16,11 +16,10 @@
16
16
  * of packages/opencode/src/session/prompt.ts.
17
17
  */
18
18
 
19
- import { extname, basename } from "path"
19
+ import { basename } from "path"
20
20
  import { extractPdfText } from "./extractors/pdf"
21
21
  import { compressImage } from "./image/compress"
22
-
23
- const IMAGE_EXTS = new Set([".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif", ".webp", ".gif"])
22
+ import { classifyReadFile, formatExtractedText } from "./dispatch"
24
23
 
25
24
  interface ReadOutput {
26
25
  title: string
@@ -41,10 +40,10 @@ export async function postRead(
41
40
  ): Promise<void> {
42
41
  if (!output.attachments?.length) return
43
42
 
44
- const ext = extname(args.filePath).toLowerCase()
43
+ const strategy = classifyReadFile(args.filePath)
45
44
 
46
45
  // ── PDF: extract text, drop base64 attachment ───────────────────────────
47
- if (ext === ".pdf") {
46
+ if (strategy === "after-extract-text") {
48
47
  const attachment = output.attachments[0]
49
48
  const base64 = attachment.url.split(",")[1]
50
49
  if (!base64) return
@@ -52,14 +51,14 @@ export async function postRead(
52
51
  const buf = Buffer.from(base64, "base64")
53
52
  const text = await extractPdfText(buf)
54
53
 
55
- output.output = `[Extracted from: ${basename(args.filePath)}]\n\n${text}`
54
+ output.output = formatExtractedText(args.filePath, text)
56
55
  output.title = `Extracted text from ${basename(args.filePath)}`
57
56
  output.attachments.length = 0 // Remove base64 — saves significant tokens
58
57
  return
59
58
  }
60
59
 
61
60
  // ── Images: compress attachment to reduce token cost ────────────────────
62
- if (IMAGE_EXTS.has(ext)) {
61
+ if (strategy === "after-compress-image") {
63
62
  const attachment = output.attachments[0]
64
63
  const base64 = attachment.url.split(",")[1]
65
64
  if (!base64) return
@@ -7,44 +7,33 @@
7
7
  * Handles DOCX, PPTX, XLSX — formats that cause read tool to throw
8
8
  * Effect.fail("Cannot read binary file"), so the after-hook never fires.
9
9
  *
10
- * Strategy: extract text write temp .txt file redirect args.filePath.
10
+ * Strategy: materialize the document into cached text + images, render a
11
+ * markdown read view, then redirect args.filePath to that temp .md file.
11
12
  * The read tool then reads the temp file normally. LLM is unaware of the redirect.
12
13
  */
13
14
 
14
- import { readFileSync, writeFileSync } from "fs"
15
- import { extname, basename, join } from "path"
15
+ import { writeFileSync } from "fs"
16
+ import { join } from "path"
16
17
  import { tmpdir } from "os"
17
18
  import { randomUUID } from "crypto"
18
- import { extractDocx } from "./extractors/docx"
19
- import { extractPptx } from "./extractors/pptx"
20
- import { extractXlsx } from "./extractors/xlsx"
21
-
22
- // Extension → extractor function mapping
23
- const HANDLERS: Record<string, (buf: Buffer) => Promise<string>> = {
24
- ".docx": extractDocx,
25
- ".pptx": extractPptx,
26
- ".xlsx": extractXlsx,
27
- }
19
+ import { classifyReadFile } from "./dispatch"
20
+ import { createOfficeReadView } from "./office-read-view"
28
21
 
29
22
  /**
30
23
  * Intercept read tool args before execution.
31
- * If the file is a supported binary format, extract its text and redirect
32
- * args.filePath to a temp .txt file containing the extracted content.
24
+ * If the file is a supported Office document, materialize it into cached
25
+ * text + images and redirect args.filePath to a temporary markdown read view.
33
26
  *
34
27
  * @param args - Mutable read tool args object (from output.args in before-hook)
35
28
  */
36
29
  export async function preRead(args: { filePath: string; [k: string]: any }): Promise<void> {
37
- const ext = extname(args.filePath).toLowerCase()
38
- const handler = HANDLERS[ext]
39
- if (!handler) return // Not a handled format — let read tool proceed normally
30
+ if (classifyReadFile(args.filePath) !== "before-materialize-document") return
40
31
 
41
- const buf = readFileSync(args.filePath)
42
- const text = await handler(buf)
32
+ const workspaceDir = process.cwd()
33
+ const output = await createOfficeReadView(args.filePath, workspaceDir)
43
34
 
44
- // Write extracted text to a temp file, prefixed with source info
45
- const header = `[Extracted from: ${basename(args.filePath)}]\n\n`
46
- const tmpPath = join(tmpdir(), `revela-${randomUUID()}.txt`)
47
- writeFileSync(tmpPath, header + text, "utf-8")
35
+ const tmpPath = join(tmpdir(), `revela-${randomUUID()}.md`)
36
+ writeFileSync(tmpPath, output, "utf-8")
48
37
 
49
38
  // Redirect read tool to the temp file
50
39
  args.filePath = tmpPath
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@cyber-dash-tech/revela",
3
- "version": "0.2.0",
3
+ "version": "0.3.0",
4
4
  "description": "OpenCode plugin that turns AI into an HTML slide deck generator",
5
5
  "type": "module",
6
6
  "main": "./index.ts",
package/plugin.ts CHANGED
@@ -25,10 +25,9 @@ import { ACTIVE_PROMPT_FILE } from "./lib/config"
25
25
  import { ctx } from "./lib/ctx"
26
26
  import { preRead } from "./lib/read-hooks"
27
27
  import { postRead } from "./lib/read-hooks"
28
- import { extractDocx } from "./lib/read-hooks/extractors/docx"
29
- import { extractPptx } from "./lib/read-hooks/extractors/pptx"
30
- import { extractXlsx } from "./lib/read-hooks/extractors/xlsx"
31
28
  import { extractPdfText } from "./lib/read-hooks/extractors/pdf"
29
+ import { createOfficeReadView } from "./lib/read-hooks/office-read-view"
30
+ import { OFFICE_EXTENSIONS, IMAGE_EXTENSIONS, formatExtractedText } from "./lib/read-hooks/dispatch"
32
31
  import { handleHelp } from "./lib/commands/help"
33
32
  import { handleEnable } from "./lib/commands/enable"
34
33
  import { handleDisable } from "./lib/commands/disable"
@@ -50,6 +49,7 @@ import designsTool from "./tools/designs"
50
49
  import domainsTool from "./tools/domains"
51
50
  import researchSaveTool from "./tools/research-save"
52
51
  import workspaceScanTool from "./tools/workspace-scan"
52
+ import extractDocumentMaterialsTool from "./tools/extract-document-materials"
53
53
  import qaTool from "./tools/qa"
54
54
  import { RESEARCH_PROMPT, RESEARCH_AGENT_SIGNATURE } from "./lib/agents/research-prompt"
55
55
  import { runQA, formatReport } from "./lib/qa"
@@ -225,12 +225,13 @@ const server: Plugin = (async (pluginCtx) => {
225
225
  throw new Error("__REVELA_UNKNOWN_HANDLED__")
226
226
  },
227
227
 
228
- // ── LLM tools: designs, domains, research, qa ─────────────────────────
228
+ // ── LLM tools: designs, domains, research, document materials, qa ─────
229
229
  tool: {
230
230
  "revela-designs": designsTool,
231
231
  "revela-domains": domainsTool,
232
232
  "revela-research-save": researchSaveTool,
233
233
  "revela-workspace-scan": workspaceScanTool,
234
+ "revela-extract-document-materials": extractDocumentMaterialsTool,
234
235
  "revela-qa": qaTool,
235
236
  },
236
237
 
@@ -239,19 +240,11 @@ const server: Plugin = (async (pluginCtx) => {
239
240
  // directly — the read tool is never called, so tool.execute.before/after
240
241
  // hooks don't fire. This hook intercepts FileParts before LLM sees them.
241
242
  //
242
- // DOCX/PPTX/XLSX/PDF → extract text → replace with TextPart
243
+ // DOCX/PPTX/XLSX/PDF → extract text/read view → replace with TextPart
243
244
  // Images → replace with TextPart hint (LLM can use read tool)
244
245
  "chat.message": async (input, output) => {
245
246
  if (!ctx.enabled) return
246
247
 
247
- const IMAGE_EXTS = new Set([".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif", ".webp", ".gif"])
248
- const DOC_HANDLERS: Record<string, (buf: Buffer) => Promise<string>> = {
249
- ".docx": extractDocx,
250
- ".pptx": extractPptx,
251
- ".xlsx": extractXlsx,
252
- ".pdf": extractPdfText,
253
- }
254
-
255
248
  for (let i = 0; i < output.parts.length; i++) {
256
249
  const part = output.parts[i] as any
257
250
  if (part.type !== "file") continue
@@ -262,15 +255,22 @@ const server: Plugin = (async (pluginCtx) => {
262
255
  const name = basename(filePath)
263
256
 
264
257
  try {
265
- if (DOC_HANDLERS[ext]) {
258
+ if (OFFICE_EXTENSIONS.has(ext)) {
259
+ const text = await createOfficeReadView(filePath, process.cwd())
260
+ output.parts[i] = {
261
+ ...part,
262
+ type: "text",
263
+ text,
264
+ } as any
265
+ } else if (ext === ".pdf") {
266
266
  const buf = readFileSync(filePath)
267
- const text = await DOC_HANDLERS[ext](buf)
267
+ const text = await extractPdfText(buf)
268
268
  output.parts[i] = {
269
269
  ...part,
270
270
  type: "text",
271
- text: `[Extracted from: ${name}]\n\n${text}`,
271
+ text: formatExtractedText(filePath, text),
272
272
  } as any
273
- } else if (IMAGE_EXTS.has(ext)) {
273
+ } else if (IMAGE_EXTENSIONS.has(ext)) {
274
274
  output.parts[i] = {
275
275
  ...part,
276
276
  type: "text",
@@ -0,0 +1,18 @@
1
+ import { tool } from "@opencode-ai/plugin"
2
+ import { extractDocumentMaterials } from "../lib/document-materials/extract"
3
+
4
+ export default tool({
5
+ description:
6
+ "Extract reusable materials from a workspace document into a workspace-local cache. " +
7
+ "Supports pptx, docx, and xlsx. Produces a manifest plus extracted text, embedded images, and available slide/sheet mappings. " +
8
+ "Unsupported file types are skipped instead of failing.",
9
+ args: {
10
+ file: tool.schema
11
+ .string()
12
+ .describe("Document path relative to workspace root. Supports pptx, docx, and xlsx; other file types are skipped."),
13
+ },
14
+ async execute(args, context) {
15
+ const workspaceDir = context.directory ?? process.cwd()
16
+ return JSON.stringify(await extractDocumentMaterials(args.file, workspaceDir), null, 2)
17
+ },
18
+ })