@cyber-dash-tech/revela 0.2.1 → 0.3.0

This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
@@ -1,5 +1,5 @@
1
1
  import { createHash } from "crypto"
2
- import { existsSync, mkdirSync, readFileSync, statSync, writeFileSync } from "fs"
2
+ import { existsSync, mkdirSync, readFileSync, realpathSync, statSync, writeFileSync } from "fs"
3
3
  import { basename, dirname, extname, isAbsolute, join, relative, resolve } from "path"
4
4
  import { DOMParser } from "@xmldom/xmldom"
5
5
  import { unzipSync } from "fflate"
@@ -14,6 +14,37 @@ export type DocumentMaterial = {
14
14
  note?: string
15
15
  }
16
16
 
17
+ export type SkippedAsset = {
18
+ source_ref: string
19
+ page_or_slide?: string
20
+ reason: "svg_asset" | "unmapped_media" | "low_value_asset"
21
+ kind?: "svg" | "icon" | "logo" | "overlay" | "decoration"
22
+ }
23
+
24
+ export type PptxSlideElement = {
25
+ id: string
26
+ kind: "text" | "image" | "shape"
27
+ zOrder: number
28
+ bbox?: { x: number; y: number; w: number; h: number }
29
+ likelyBackground?: boolean
30
+ likelyHeroImage?: boolean
31
+ likelyLogo?: boolean
32
+ likelyOverlayMask?: boolean
33
+ likelyDecoration?: boolean
34
+ text?: string
35
+ source_ref?: string
36
+ path?: string
37
+ asset_status?: "kept" | "skipped"
38
+ name?: string
39
+ }
40
+
41
+ export type PptxSlide = {
42
+ slide: string
43
+ width?: number
44
+ height?: number
45
+ elements: PptxSlideElement[]
46
+ }
47
+
17
48
  export type DocumentMaterialsResult = {
18
49
  status: "processed" | "skipped" | "failed"
19
50
  source: string
@@ -22,6 +53,8 @@ export type DocumentMaterialsResult = {
22
53
  manifest_path?: string
23
54
  text_path?: string
24
55
  images?: DocumentMaterial[]
56
+ skipped_assets?: SkippedAsset[]
57
+ slides?: PptxSlide[]
25
58
  tables?: DocumentMaterial[]
26
59
  reason?: string
27
60
  }
@@ -36,9 +69,16 @@ type CachedManifest = {
36
69
  manifest_path: string
37
70
  text_path: string
38
71
  images: DocumentMaterial[]
72
+ skipped_assets: SkippedAsset[]
73
+ slides: PptxSlide[]
39
74
  tables: DocumentMaterial[]
40
75
  }
41
76
 
77
+ type PptxImageExtraction = {
78
+ images: DocumentMaterial[]
79
+ skipped_assets: SkippedAsset[]
80
+ }
81
+
42
82
  const SUPPORTED_EXTENSIONS: Record<string, SupportedType> = {
43
83
  ".pptx": "pptx",
44
84
  ".docx": "docx",
@@ -62,8 +102,11 @@ function normalizeZipTarget(basePath: string, target: string): string {
62
102
  }
63
103
 
64
104
  function ensureWorkspacePath(filePath: string, workspaceDir: string): string {
65
- const resolvedWorkspace = resolve(workspaceDir)
66
- const resolvedFile = isAbsolute(filePath) ? resolve(filePath) : resolve(workspaceDir, filePath)
105
+ const resolvedWorkspace = realpathSync(resolve(workspaceDir))
106
+ const candidate = isAbsolute(filePath) ? resolve(filePath) : resolve(workspaceDir, filePath)
107
+ const resolvedFile = existsSync(candidate)
108
+ ? realpathSync(candidate)
109
+ : candidate
67
110
 
68
111
  if (resolvedFile !== resolvedWorkspace && !resolvedFile.startsWith(resolvedWorkspace + "/")) {
69
112
  throw new Error("file must be within workspace")
@@ -72,8 +115,24 @@ function ensureWorkspacePath(filePath: string, workspaceDir: string): string {
72
115
  return resolvedFile
73
116
  }
74
117
 
118
+ function normalizeWorkspaceChild(filePath: string, workspaceDir: string): string {
119
+ const workspaceAlias = resolve(workspaceDir)
120
+ const workspaceReal = realpathSync(workspaceAlias)
121
+ const candidate = resolve(filePath)
122
+
123
+ if (existsSync(candidate)) return realpathSync(candidate)
124
+
125
+ if (candidate === workspaceAlias || candidate.startsWith(workspaceAlias + "/")) {
126
+ return join(workspaceReal, relative(workspaceAlias, candidate))
127
+ }
128
+
129
+ return candidate
130
+ }
131
+
75
132
  function workspaceRelative(filePath: string, workspaceDir: string): string {
76
- return relative(workspaceDir, filePath).replace(/\\/g, "/")
133
+ const resolvedWorkspace = realpathSync(resolve(workspaceDir))
134
+ const resolvedFile = normalizeWorkspaceChild(filePath, workspaceDir)
135
+ return relative(resolvedWorkspace, resolvedFile).replace(/\\/g, "/")
77
136
  }
78
137
 
79
138
  function buildFingerprint(filePath: string): string {
@@ -98,12 +157,256 @@ function parseXml(files: Record<string, Uint8Array>, path: string): any | null {
98
157
  return new DOMParser().parseFromString(new TextDecoder().decode(file), "text/xml")
99
158
  }
100
159
 
101
- function extractPptxImages(files: Record<string, Uint8Array>, cacheDir: string, workspaceDir: string): DocumentMaterial[] {
160
+ function xmlLocalName(node: any): string {
161
+ return node?.localName ?? String(node?.nodeName ?? "").split(":").pop() ?? ""
162
+ }
163
+
164
+ function xmlElementChildren(node: any): any[] {
165
+ const children: any[] = []
166
+ const childNodes = node?.childNodes ?? []
167
+ for (let i = 0; i < childNodes.length; i++) {
168
+ const child = childNodes[i]
169
+ if (child?.nodeType === 1) children.push(child)
170
+ }
171
+ return children
172
+ }
173
+
174
+ function xmlDescendantsByLocalName(node: any, name: string): any[] {
175
+ const matches: any[] = []
176
+ const walk = (current: any) => {
177
+ for (const child of xmlElementChildren(current)) {
178
+ if (xmlLocalName(child) === name) matches.push(child)
179
+ walk(child)
180
+ }
181
+ }
182
+ walk(node)
183
+ return matches
184
+ }
185
+
186
+ function firstDescendantByLocalName(node: any, name: string): any | null {
187
+ const [match] = xmlDescendantsByLocalName(node, name)
188
+ return match ?? null
189
+ }
190
+
191
+ function extractShapeText(node: any): string | undefined {
192
+ const texts = xmlDescendantsByLocalName(node, "t")
193
+ .map((textNode) => textNode.textContent?.trim())
194
+ .filter(Boolean)
195
+ return texts.length > 0 ? texts.join("\n") : undefined
196
+ }
197
+
198
+ function extractElementName(node: any): string | undefined {
199
+ return firstDescendantByLocalName(node, "cNvPr")?.getAttribute?.("name") || undefined
200
+ }
201
+
202
+ function parseCoordinate(value: string | null | undefined): number | undefined {
203
+ if (value == null || value === "") return undefined
204
+ const parsed = Number(value)
205
+ return Number.isFinite(parsed) ? parsed : undefined
206
+ }
207
+
208
+ function extractElementBBox(node: any): { x: number; y: number; w: number; h: number } | undefined {
209
+ const xfrm = firstDescendantByLocalName(node, "xfrm")
210
+ if (!xfrm) return undefined
211
+
212
+ const off = firstDescendantByLocalName(xfrm, "off")
213
+ const ext = firstDescendantByLocalName(xfrm, "ext")
214
+ if (!off || !ext) return undefined
215
+
216
+ const x = parseCoordinate(off.getAttribute?.("x"))
217
+ const y = parseCoordinate(off.getAttribute?.("y"))
218
+ const w = parseCoordinate(ext.getAttribute?.("cx"))
219
+ const h = parseCoordinate(ext.getAttribute?.("cy"))
220
+ if ([x, y, w, h].some((value) => value == null)) return undefined
221
+
222
+ return { x: x!, y: y!, w: w!, h: h! }
223
+ }
224
+
225
+ function getPptxSlideSize(files: Record<string, Uint8Array>): { width: number; height: number } | undefined {
226
+ const doc = parseXml(files, "ppt/presentation.xml")
227
+ const size = firstDescendantByLocalName(doc, "sldSz")
228
+ if (!size) return undefined
229
+
230
+ const width = parseCoordinate(size.getAttribute?.("cx"))
231
+ const height = parseCoordinate(size.getAttribute?.("cy"))
232
+ if (width == null || height == null) return undefined
233
+ return { width, height }
234
+ }
235
+
236
+ function isNearCorner(
237
+ bbox: { x: number; y: number; w: number; h: number },
238
+ slideWidth: number,
239
+ slideHeight: number,
240
+ ): boolean {
241
+ const thresholdX = slideWidth * 0.12
242
+ const thresholdY = slideHeight * 0.12
243
+ const right = bbox.x + bbox.w
244
+ const bottom = bbox.y + bbox.h
245
+ return (
246
+ (bbox.x <= thresholdX && bbox.y <= thresholdY) ||
247
+ (right >= slideWidth - thresholdX && bbox.y <= thresholdY) ||
248
+ (bbox.x <= thresholdX && bottom >= slideHeight - thresholdY) ||
249
+ (right >= slideWidth - thresholdX && bottom >= slideHeight - thresholdY)
250
+ )
251
+ }
252
+
253
+ function applyPptxHeuristics(
254
+ slide: PptxSlide,
255
+ slideWidth: number | undefined,
256
+ slideHeight: number | undefined,
257
+ ): PptxSlide {
258
+ if (!slideWidth || !slideHeight) return slide
259
+
260
+ const slideArea = slideWidth * slideHeight
261
+ slide.elements = slide.elements.map((element) => {
262
+ if (!element.bbox) return element
263
+
264
+ const areaRatio = (element.bbox.w * element.bbox.h) / slideArea
265
+ const sourceName = `${element.source_ref ?? ""} ${element.name ?? ""}`.toLowerCase()
266
+
267
+ if (element.kind === "image") {
268
+ const flags: Partial<PptxSlideElement> = {}
269
+ if (areaRatio >= 0.75 && element.asset_status === "kept") flags.likelyBackground = true
270
+ else if (areaRatio >= 0.2 && element.asset_status === "kept") flags.likelyHeroImage = true
271
+ if (areaRatio <= 0.03 && isNearCorner(element.bbox, slideWidth, slideHeight)) flags.likelyLogo = true
272
+ if (/(logo|brand)/.test(sourceName)) flags.likelyLogo = true
273
+ if (/(mask|overlay|shadow)/.test(sourceName) || element.asset_status === "skipped") flags.likelyOverlayMask = true
274
+ if (/(arrow|ornament|decoration)/.test(sourceName)) flags.likelyDecoration = true
275
+ return Object.keys(flags).length > 0 ? { ...element, ...flags } : element
276
+ }
277
+
278
+ if (element.kind === "shape") {
279
+ const flags: Partial<PptxSlideElement> = {}
280
+ if (areaRatio >= 0.4) flags.likelyOverlayMask = true
281
+ if (areaRatio <= 0.03 || /(arrow|ornament|decoration)/.test(sourceName)) flags.likelyDecoration = true
282
+ return Object.keys(flags).length > 0 ? { ...element, ...flags } : element
283
+ }
284
+
285
+ return element
286
+ })
287
+
288
+ return slide
289
+ }
290
+
291
+ function getSlideMediaTargets(files: Record<string, Uint8Array>, slidePath: string): Map<string, string> {
292
+ const relPath = slidePath.replace("/slides/", "/slides/_rels/") + ".rels"
293
+ const doc = parseXml(files, relPath)
294
+ const targets = new Map<string, string>()
295
+ if (!doc) return targets
296
+
297
+ const relationships = doc.getElementsByTagName("Relationship")
298
+ for (let i = 0; i < relationships.length; i++) {
299
+ const rel = relationships[i]
300
+ const id = rel.getAttribute("Id")
301
+ const target = rel.getAttribute("Target")
302
+ if (!id || !target) continue
303
+ const normalized = normalizeZipTarget(slidePath, target)
304
+ if (!normalized.startsWith("ppt/media/")) continue
305
+ targets.set(id, normalized)
306
+ }
307
+
308
+ return targets
309
+ }
310
+
311
+ function extractPptxSlides(
312
+ files: Record<string, Uint8Array>,
313
+ images: DocumentMaterial[],
314
+ skippedAssets: SkippedAsset[],
315
+ ): PptxSlide[] {
316
+ const slideSize = getPptxSlideSize(files)
317
+ const keptBySource = new Map(images.map((image) => [image.source_ref, image]))
318
+ const skippedBySource = new Map(skippedAssets.map((asset) => [asset.source_ref, asset]))
319
+ const slideFiles = Object.keys(files)
320
+ .filter((file) => /^ppt\/slides\/slide\d+\.xml$/.test(file))
321
+ .sort((a, b) => a.localeCompare(b, undefined, { numeric: true }))
322
+
323
+ return slideFiles.map((slidePath) => {
324
+ const slideNumber = slidePath.match(/slide(\d+)\.xml$/)?.[1] ?? "0"
325
+ const slideId = `slide-${slideNumber.padStart(2, "0")}`
326
+ const doc = parseXml(files, slidePath)
327
+ const mediaTargets = getSlideMediaTargets(files, slidePath)
328
+ const elements: PptxSlideElement[] = []
329
+
330
+ if (!doc) return { slide: slideId, ...(slideSize ?? {}), elements }
331
+
332
+ const spTree = firstDescendantByLocalName(doc, "spTree")
333
+ if (!spTree) return { slide: slideId, ...(slideSize ?? {}), elements }
334
+
335
+ for (const node of xmlElementChildren(spTree)) {
336
+ const kind = xmlLocalName(node)
337
+ if (kind === "nvGrpSpPr" || kind === "grpSpPr") continue
338
+
339
+ const zOrder = elements.length + 1
340
+ const id = `${slideId}-element-${String(zOrder).padStart(2, "0")}`
341
+ const name = extractElementName(node)
342
+ const bbox = extractElementBBox(node)
343
+
344
+ if (kind === "sp") {
345
+ const text = extractShapeText(node)
346
+ elements.push(text
347
+ ? { id, kind: "text", zOrder, text, ...(bbox ? { bbox } : {}), ...(name ? { name } : {}) }
348
+ : { id, kind: "shape", zOrder, ...(bbox ? { bbox } : {}), ...(name ? { name } : {}) })
349
+ continue
350
+ }
351
+
352
+ if (kind === "pic") {
353
+ const blip = firstDescendantByLocalName(node, "blip")
354
+ const rid = blip?.getAttribute?.("r:embed") || blip?.getAttribute?.("embed") || undefined
355
+ const sourceRef = rid ? mediaTargets.get(rid) : undefined
356
+ const kept = sourceRef ? keptBySource.get(sourceRef) : undefined
357
+ const skipped = sourceRef ? skippedBySource.get(sourceRef) : undefined
358
+
359
+ elements.push({
360
+ id,
361
+ kind: "image",
362
+ zOrder,
363
+ ...(bbox ? { bbox } : {}),
364
+ ...(name ? { name } : {}),
365
+ ...(sourceRef ? { source_ref: sourceRef } : {}),
366
+ ...(kept?.path ? { path: kept.path } : {}),
367
+ ...((kept || skipped) ? { asset_status: kept ? "kept" as const : "skipped" as const } : {}),
368
+ })
369
+ continue
370
+ }
371
+
372
+ if (kind === "cxnSp" || kind === "graphicFrame" || kind === "grpSp") {
373
+ elements.push({ id, kind: "shape", zOrder, ...(bbox ? { bbox } : {}), ...(name ? { name } : {}) })
374
+ }
375
+ }
376
+
377
+ return applyPptxHeuristics({ slide: slideId, ...(slideSize ?? {}), elements }, slideSize?.width, slideSize?.height)
378
+ })
379
+ }
380
+
381
+ const LOW_VALUE_PPTX_ASSET = /(icon|logo|mask|overlay|shadow|decoration|ornament|arrow)/i
382
+
383
+ function classifySkippedAsset(sourceRef: string, reason: SkippedAsset["reason"]): SkippedAsset["kind"] | undefined {
384
+ if (sourceRef.endsWith(".svg")) return "svg"
385
+ if (/icon/i.test(sourceRef)) return "icon"
386
+ if (/logo/i.test(sourceRef)) return "logo"
387
+ if (/(mask|overlay|shadow)/i.test(sourceRef)) return "overlay"
388
+ if (/(decoration|ornament|arrow)/i.test(sourceRef)) return "decoration"
389
+ if (reason === "svg_asset") return "svg"
390
+ return undefined
391
+ }
392
+
393
+ function shouldSkipPptxAsset(sourceRef: string): { reason: SkippedAsset["reason"]; kind?: SkippedAsset["kind"] } | null {
394
+ if (sourceRef.endsWith(".svg")) {
395
+ return { reason: "svg_asset", kind: "svg" }
396
+ }
397
+ if (LOW_VALUE_PPTX_ASSET.test(basename(sourceRef))) {
398
+ return { reason: "low_value_asset", kind: classifySkippedAsset(sourceRef, "low_value_asset") }
399
+ }
400
+ return null
401
+ }
402
+
403
+ function extractPptxImages(files: Record<string, Uint8Array>, cacheDir: string, workspaceDir: string): PptxImageExtraction {
102
404
  const relFiles = Object.keys(files)
103
405
  .filter((file) => /^ppt\/slides\/_rels\/slide\d+\.xml\.rels$/.test(file))
104
406
  .sort((a, b) => a.localeCompare(b, undefined, { numeric: true }))
105
407
 
106
408
  const images: DocumentMaterial[] = []
409
+ const skipped_assets: SkippedAsset[] = []
107
410
  const seenTargets = new Set<string>()
108
411
 
109
412
  for (const relPath of relFiles) {
@@ -124,8 +427,19 @@ function extractPptxImages(files: Record<string, Uint8Array>, cacheDir: string,
124
427
  const media = files[normalized]
125
428
  if (!media) continue
126
429
 
127
- imageIndex += 1
128
430
  seenTargets.add(normalized)
431
+ const skipped = shouldSkipPptxAsset(normalized)
432
+ if (skipped) {
433
+ skipped_assets.push({
434
+ source_ref: normalized,
435
+ page_or_slide: `slide-${slideNumber.padStart(2, "0")}`,
436
+ reason: skipped.reason,
437
+ kind: skipped.kind,
438
+ })
439
+ continue
440
+ }
441
+
442
+ imageIndex += 1
129
443
  const exportedName = `slide-${slideNumber.padStart(2, "0")}-image-${String(imageIndex).padStart(2, "0")}${extname(normalized)}`
130
444
  const outputPath = join(cacheDir, "images", exportedName)
131
445
  writeCachedBuffer(outputPath, media)
@@ -143,18 +457,14 @@ function extractPptxImages(files: Record<string, Uint8Array>, cacheDir: string,
143
457
  .sort()
144
458
 
145
459
  for (const mediaPath of remainingMedia) {
146
- const exportedName = `unmapped-${basename(mediaPath)}`
147
- const outputPath = join(cacheDir, "images", exportedName)
148
- writeCachedBuffer(outputPath, files[mediaPath])
149
-
150
- images.push({
151
- path: materialPath(cacheDir, workspaceDir, "images", exportedName),
460
+ skipped_assets.push({
152
461
  source_ref: mediaPath,
153
- note: "No slide-level relationship found",
462
+ reason: "unmapped_media",
463
+ kind: classifySkippedAsset(mediaPath, "unmapped_media"),
154
464
  })
155
465
  }
156
466
 
157
- return images
467
+ return { images, skipped_assets }
158
468
  }
159
469
 
160
470
  function extractDocxImages(files: Record<string, Uint8Array>, cacheDir: string, workspaceDir: string): DocumentMaterial[] {
@@ -295,6 +605,8 @@ async function processOfficeFile(filePath: string, workspaceDir: string, type: S
295
605
  manifest_path: manifest.manifest_path,
296
606
  text_path: manifest.text_path,
297
607
  images: manifest.images,
608
+ skipped_assets: manifest.skipped_assets,
609
+ slides: manifest.slides,
298
610
  tables: manifest.tables,
299
611
  }
300
612
  }
@@ -314,11 +626,17 @@ async function processOfficeFile(filePath: string, workspaceDir: string, type: S
314
626
  const textPath = join(cacheDir, "text.txt")
315
627
  writeFileSync(textPath, `[Extracted from: ${basename(filePath)}]\n\n${text}`, "utf-8")
316
628
 
317
- const images = type === "pptx"
629
+ const pptxAssets = type === "pptx"
318
630
  ? extractPptxImages(files, cacheDir, workspaceDir)
631
+ : null
632
+ const images = type === "pptx"
633
+ ? pptxAssets!.images
319
634
  : type === "docx"
320
635
  ? extractDocxImages(files, cacheDir, workspaceDir)
321
636
  : extractXlsxImages(files, cacheDir, workspaceDir)
637
+ const slides = type === "pptx"
638
+ ? extractPptxSlides(files, images, pptxAssets!.skipped_assets)
639
+ : undefined
322
640
 
323
641
  const result: DocumentMaterialsResult = {
324
642
  status: "processed",
@@ -328,6 +646,8 @@ async function processOfficeFile(filePath: string, workspaceDir: string, type: S
328
646
  manifest_path: workspaceRelative(manifestPath, workspaceDir),
329
647
  text_path: workspaceRelative(textPath, workspaceDir),
330
648
  images,
649
+ skipped_assets: pptxAssets?.skipped_assets ?? [],
650
+ slides,
331
651
  tables: extractTables(type, workspaceRelative(textPath, workspaceDir)),
332
652
  }
333
653
 
@@ -339,6 +659,8 @@ async function processOfficeFile(filePath: string, workspaceDir: string, type: S
339
659
  manifest_path: result.manifest_path!,
340
660
  text_path: result.text_path!,
341
661
  images: result.images ?? [],
662
+ skipped_assets: result.skipped_assets ?? [],
663
+ slides: result.slides ?? [],
342
664
  tables: result.tables ?? [],
343
665
  }
344
666
 
@@ -0,0 +1,45 @@
1
+ import { basename, extname } from "path"
2
+ export const OFFICE_EXTENSIONS = new Set([".docx", ".pptx", ".xlsx"])
3
+ export const IMAGE_EXTENSIONS = new Set([".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif", ".webp", ".gif"])
4
+
5
+ export type ReadStrategy =
6
+ | "before-materialize-document"
7
+ | "after-extract-text"
8
+ | "after-compress-image"
9
+ | "passthrough"
10
+
11
+ export function classifyReadFile(filePath: string): ReadStrategy {
12
+ const ext = extname(filePath).toLowerCase()
13
+ if (OFFICE_EXTENSIONS.has(ext)) return "before-materialize-document"
14
+ if (ext === ".pdf") return "after-extract-text"
15
+ if (IMAGE_EXTENSIONS.has(ext)) return "after-compress-image"
16
+ return "passthrough"
17
+ }
18
+
19
+ export function formatExtractedText(filePath: string, text: string): string {
20
+ return `[Extracted from: ${basename(filePath)}]\n\n${text}`
21
+ }
22
+
23
+ export function buildOfficeReadView(
24
+ filePath: string,
25
+ text: string,
26
+ images: Array<{ path: string }> | undefined,
27
+ ): string {
28
+ const lines = [
29
+ `# Extracted from: ${basename(filePath)}`,
30
+ "",
31
+ "## Text",
32
+ "",
33
+ text.trim() || "No text extracted.",
34
+ ]
35
+
36
+ lines.push("", "## Images", "")
37
+
38
+ if (!images?.length) {
39
+ lines.push("- None")
40
+ } else {
41
+ for (const image of images) lines.push(`- ${image.path}`)
42
+ }
43
+
44
+ return lines.join("\n")
45
+ }
@@ -4,7 +4,7 @@
4
4
  * Entry point for the read-hooks module.
5
5
  * Exports preRead and postRead for use in plugins/revela.ts hook handlers.
6
6
  *
7
- * preRead → tool.execute.before: redirect binary files (DOCX/PPTX/XLSX) to temp txt
7
+ * preRead → tool.execute.before: materialize Office docs and redirect to temp markdown
8
8
  * postRead → tool.execute.after: transform PDF/image attachments before LLM sees them
9
9
  */
10
10
 
@@ -0,0 +1,77 @@
1
+ import { readFileSync } from "fs"
2
+ import { join } from "path"
3
+ import type { PptxSlide } from "../document-materials/extract"
4
+ import { extractDocumentMaterials } from "../document-materials/extract"
5
+ import { buildOfficeReadView } from "./dispatch"
6
+ import { extractDocx } from "./extractors/docx"
7
+ import { extractPptx } from "./extractors/pptx"
8
+ import { extractXlsx } from "./extractors/xlsx"
9
+ import { formatExtractedText } from "./dispatch"
10
+
11
+ const HANDLERS: Record<string, (buf: Buffer) => Promise<string>> = {
12
+ ".docx": extractDocx,
13
+ ".pptx": extractPptx,
14
+ ".xlsx": extractXlsx,
15
+ }
16
+
17
+ function buildPptxStructureHints(slides: PptxSlide[] | undefined): string {
18
+ if (!slides?.length) return ""
19
+
20
+ const lines = ["", "## Slide Structure", ""]
21
+ for (const slide of slides) {
22
+ const textCount = slide.elements.filter((element) => element.kind === "text").length
23
+ const keptImageCount = slide.elements.filter((element) => element.kind === "image" && element.asset_status === "kept").length
24
+ const skippedImageCount = slide.elements.filter((element) => element.kind === "image" && element.asset_status === "skipped").length
25
+ const shapeCount = slide.elements.filter((element) => element.kind === "shape").length
26
+ const summary = [
27
+ textCount > 0 ? `${textCount} text` : null,
28
+ keptImageCount > 0 ? `${keptImageCount} kept image` : null,
29
+ skippedImageCount > 0 ? `${skippedImageCount} skipped image` : null,
30
+ shapeCount > 0 ? `${shapeCount} shape` : null,
31
+ ].filter(Boolean).join(", ") || "no parsed elements"
32
+ lines.push(`- ${slide.slide}: ${summary}`)
33
+
34
+ const roleSummary = [
35
+ countRole(slide, (element) => element.likelyBackground, "background image"),
36
+ countRole(slide, (element) => element.likelyHeroImage, "hero image"),
37
+ countRole(slide, (element) => element.likelyLogo, "logo"),
38
+ countRole(slide, (element) => element.likelyOverlayMask, "overlay"),
39
+ countRole(slide, (element) => element.likelyDecoration, "decoration"),
40
+ ].filter(Boolean).join(", ")
41
+ if (roleSummary) lines.push(` likely roles: ${roleSummary}`)
42
+ }
43
+
44
+ return lines.join("\n")
45
+ }
46
+
47
+ function countRole(
48
+ slide: PptxSlide,
49
+ predicate: (element: PptxSlide["elements"][number]) => boolean | undefined,
50
+ label: string,
51
+ ): string | null {
52
+ const count = slide.elements.filter(predicate).length
53
+ if (count === 0) return null
54
+ return `${count} ${label}${count === 1 ? "" : "s"}`
55
+ }
56
+
57
+ export async function createOfficeReadView(filePath: string, workspaceDir: string): Promise<string> {
58
+ const ext = filePath.slice(filePath.lastIndexOf(".")).toLowerCase()
59
+ const handler = HANDLERS[ext]
60
+ if (!handler) throw new Error(`unsupported office file type: ${ext}`)
61
+
62
+ const materialized = await extractDocumentMaterials(filePath, workspaceDir)
63
+
64
+ if (materialized.status === "processed" && materialized.text_path) {
65
+ const textPath = join(workspaceDir, materialized.text_path)
66
+ const extracted = readFileSync(textPath, "utf-8")
67
+ const text = extracted.replace(/^\[Extracted from: .*?\]\n\n/, "")
68
+ const view = buildOfficeReadView(filePath, text, materialized.images)
69
+ return filePath.toLowerCase().endsWith(".pptx")
70
+ ? view + buildPptxStructureHints(materialized.slides)
71
+ : view
72
+ }
73
+
74
+ const buf = readFileSync(filePath)
75
+ const text = await handler(buf)
76
+ return formatExtractedText(filePath, text)
77
+ }
@@ -16,11 +16,10 @@
16
16
  * of packages/opencode/src/session/prompt.ts.
17
17
  */
18
18
 
19
- import { extname, basename } from "path"
19
+ import { basename } from "path"
20
20
  import { extractPdfText } from "./extractors/pdf"
21
21
  import { compressImage } from "./image/compress"
22
-
23
- const IMAGE_EXTS = new Set([".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif", ".webp", ".gif"])
22
+ import { classifyReadFile, formatExtractedText } from "./dispatch"
24
23
 
25
24
  interface ReadOutput {
26
25
  title: string
@@ -41,10 +40,10 @@ export async function postRead(
41
40
  ): Promise<void> {
42
41
  if (!output.attachments?.length) return
43
42
 
44
- const ext = extname(args.filePath).toLowerCase()
43
+ const strategy = classifyReadFile(args.filePath)
45
44
 
46
45
  // ── PDF: extract text, drop base64 attachment ───────────────────────────
47
- if (ext === ".pdf") {
46
+ if (strategy === "after-extract-text") {
48
47
  const attachment = output.attachments[0]
49
48
  const base64 = attachment.url.split(",")[1]
50
49
  if (!base64) return
@@ -52,14 +51,14 @@ export async function postRead(
52
51
  const buf = Buffer.from(base64, "base64")
53
52
  const text = await extractPdfText(buf)
54
53
 
55
- output.output = `[Extracted from: ${basename(args.filePath)}]\n\n${text}`
54
+ output.output = formatExtractedText(args.filePath, text)
56
55
  output.title = `Extracted text from ${basename(args.filePath)}`
57
56
  output.attachments.length = 0 // Remove base64 — saves significant tokens
58
57
  return
59
58
  }
60
59
 
61
60
  // ── Images: compress attachment to reduce token cost ────────────────────
62
- if (IMAGE_EXTS.has(ext)) {
61
+ if (strategy === "after-compress-image") {
63
62
  const attachment = output.attachments[0]
64
63
  const base64 = attachment.url.split(",")[1]
65
64
  if (!base64) return
@@ -7,44 +7,33 @@
7
7
  * Handles DOCX, PPTX, XLSX — formats that cause read tool to throw
8
8
  * Effect.fail("Cannot read binary file"), so the after-hook never fires.
9
9
  *
10
- * Strategy: extract text → write temp .txt file → redirect args.filePath.
10
+ * Strategy: materialize the document into cached text + images, render a
11
+ * markdown read view, then redirect args.filePath to that temp .md file.
11
12
  * The read tool then reads the temp file normally. LLM is unaware of the redirect.
12
13
  */
13
14
 
14
- import { readFileSync, writeFileSync } from "fs"
15
- import { extname, basename, join } from "path"
15
+ import { writeFileSync } from "fs"
16
+ import { join } from "path"
16
17
  import { tmpdir } from "os"
17
18
  import { randomUUID } from "crypto"
18
- import { extractDocx } from "./extractors/docx"
19
- import { extractPptx } from "./extractors/pptx"
20
- import { extractXlsx } from "./extractors/xlsx"
21
-
22
- // Extension → extractor function mapping
23
- const HANDLERS: Record<string, (buf: Buffer) => Promise<string>> = {
24
- ".docx": extractDocx,
25
- ".pptx": extractPptx,
26
- ".xlsx": extractXlsx,
27
- }
19
+ import { classifyReadFile } from "./dispatch"
20
+ import { createOfficeReadView } from "./office-read-view"
28
21
 
29
22
  /**
30
23
  * Intercept read tool args before execution.
31
- * If the file is a supported binary format, extract its text and redirect
32
- * args.filePath to a temp .txt file containing the extracted content.
24
+ * If the file is a supported Office document, materialize it into cached
25
+ * text + images and redirect args.filePath to a temporary markdown read view.
33
26
  *
34
27
  * @param args - Mutable read tool args object (from output.args in before-hook)
35
28
  */
36
29
  export async function preRead(args: { filePath: string; [k: string]: any }): Promise<void> {
37
- const ext = extname(args.filePath).toLowerCase()
38
- const handler = HANDLERS[ext]
39
- if (!handler) return // Not a handled format — let read tool proceed normally
30
+ if (classifyReadFile(args.filePath) !== "before-materialize-document") return
40
31
 
41
- const buf = readFileSync(args.filePath)
42
- const text = await handler(buf)
32
+ const workspaceDir = process.cwd()
33
+ const output = await createOfficeReadView(args.filePath, workspaceDir)
43
34
 
44
- // Write extracted text to a temp file, prefixed with source info
45
- const header = `[Extracted from: ${basename(args.filePath)}]\n\n`
46
- const tmpPath = join(tmpdir(), `revela-${randomUUID()}.txt`)
47
- writeFileSync(tmpPath, header + text, "utf-8")
35
+ const tmpPath = join(tmpdir(), `revela-${randomUUID()}.md`)
36
+ writeFileSync(tmpPath, output, "utf-8")
48
37
 
49
38
  // Redirect read tool to the temp file
50
39
  args.filePath = tmpPath
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@cyber-dash-tech/revela",
3
- "version": "0.2.1",
3
+ "version": "0.3.0",
4
4
  "description": "OpenCode plugin that turns AI into an HTML slide deck generator",
5
5
  "type": "module",
6
6
  "main": "./index.ts",
package/plugin.ts CHANGED
@@ -25,10 +25,9 @@ import { ACTIVE_PROMPT_FILE } from "./lib/config"
25
25
  import { ctx } from "./lib/ctx"
26
26
  import { preRead } from "./lib/read-hooks"
27
27
  import { postRead } from "./lib/read-hooks"
28
- import { extractDocx } from "./lib/read-hooks/extractors/docx"
29
- import { extractPptx } from "./lib/read-hooks/extractors/pptx"
30
- import { extractXlsx } from "./lib/read-hooks/extractors/xlsx"
31
28
  import { extractPdfText } from "./lib/read-hooks/extractors/pdf"
29
+ import { createOfficeReadView } from "./lib/read-hooks/office-read-view"
30
+ import { OFFICE_EXTENSIONS, IMAGE_EXTENSIONS, formatExtractedText } from "./lib/read-hooks/dispatch"
32
31
  import { handleHelp } from "./lib/commands/help"
33
32
  import { handleEnable } from "./lib/commands/enable"
34
33
  import { handleDisable } from "./lib/commands/disable"
@@ -241,19 +240,11 @@ const server: Plugin = (async (pluginCtx) => {
241
240
  // directly — the read tool is never called, so tool.execute.before/after
242
241
  // hooks don't fire. This hook intercepts FileParts before LLM sees them.
243
242
  //
244
- // DOCX/PPTX/XLSX/PDF → extract text → replace with TextPart
243
+ // DOCX/PPTX/XLSX/PDF → extract text/read view → replace with TextPart
245
244
  // Images → replace with TextPart hint (LLM can use read tool)
246
245
  "chat.message": async (input, output) => {
247
246
  if (!ctx.enabled) return
248
247
 
249
- const IMAGE_EXTS = new Set([".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif", ".webp", ".gif"])
250
- const DOC_HANDLERS: Record<string, (buf: Buffer) => Promise<string>> = {
251
- ".docx": extractDocx,
252
- ".pptx": extractPptx,
253
- ".xlsx": extractXlsx,
254
- ".pdf": extractPdfText,
255
- }
256
-
257
248
  for (let i = 0; i < output.parts.length; i++) {
258
249
  const part = output.parts[i] as any
259
250
  if (part.type !== "file") continue
@@ -264,15 +255,22 @@ const server: Plugin = (async (pluginCtx) => {
264
255
  const name = basename(filePath)
265
256
 
266
257
  try {
267
- if (DOC_HANDLERS[ext]) {
258
+ if (OFFICE_EXTENSIONS.has(ext)) {
259
+ const text = await createOfficeReadView(filePath, process.cwd())
260
+ output.parts[i] = {
261
+ ...part,
262
+ type: "text",
263
+ text,
264
+ } as any
265
+ } else if (ext === ".pdf") {
268
266
  const buf = readFileSync(filePath)
269
- const text = await DOC_HANDLERS[ext](buf)
267
+ const text = await extractPdfText(buf)
270
268
  output.parts[i] = {
271
269
  ...part,
272
270
  type: "text",
273
- text: `[Extracted from: ${name}]\n\n${text}`,
271
+ text: formatExtractedText(filePath, text),
274
272
  } as any
275
- } else if (IMAGE_EXTS.has(ext)) {
273
+ } else if (IMAGE_EXTENSIONS.has(ext)) {
276
274
  output.parts[i] = {
277
275
  ...part,
278
276
  type: "text",
@@ -3,7 +3,7 @@ import { extractDocumentMaterials } from "../lib/document-materials/extract"
3
3
 
4
4
  export default tool({
5
5
  description:
6
- "Extract research materials from a workspace document into a workspace-local cache. " +
6
+ "Extract reusable materials from a workspace document into a workspace-local cache. " +
7
7
  "Supports pptx, docx, and xlsx. Produces a manifest plus extracted text, embedded images, and available slide/sheet mappings. " +
8
8
  "Unsupported file types are skipped instead of failing.",
9
9
  args: {