npm - @cyber-dash-tech/revela - Versions diffs - 0.2.1 → 0.3.0 - Mend

@cyber-dash-tech/revela 0.2.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

package/lib/document-materials/extract.ts +337 -15
package/lib/read-hooks/dispatch.ts +45 -0
package/lib/read-hooks/index.ts +1 -1
package/lib/read-hooks/office-read-view.ts +77 -0
package/lib/read-hooks/post-read.ts +6 -7
package/lib/read-hooks/pre-read.ts +13 -24
package/package.json +1 -1
package/plugin.ts +14 -16
package/tools/extract-document-materials.ts +1 -1

package/lib/document-materials/extract.ts CHANGED Viewed

@@ -1,5 +1,5 @@
 import { createHash } from "crypto"
-import { existsSync, mkdirSync, readFileSync, statSync, writeFileSync } from "fs"
+import { existsSync, mkdirSync, readFileSync, realpathSync, statSync, writeFileSync } from "fs"
 import { basename, dirname, extname, isAbsolute, join, relative, resolve } from "path"
 import { DOMParser } from "@xmldom/xmldom"
 import { unzipSync } from "fflate"
@@ -14,6 +14,37 @@ export type DocumentMaterial = {
   note?: string
 }
+/**
+ * An embedded asset that was found in the archive but not exported to the
+ * image cache.
+ */
+export type SkippedAsset = {
+  // Path of the asset inside the Office archive (e.g. a `ppt/media/...` entry).
+  source_ref: string
+  // Slide id (`slide-NN`); only set when the asset was reached through a slide relationship.
+  page_or_slide?: string
+  // Why it was skipped: SVG file, media with no slide relationship, or a low-value name match.
+  reason: "svg_asset" | "unmapped_media" | "low_value_asset"
+  // Best-effort category derived from the asset path; absent when nothing matched.
+  kind?: "svg" | "icon" | "logo" | "overlay" | "decoration"
+}
+/**
+ * One top-level element of a slide's shape tree, plus heuristic role flags.
+ */
+export type PptxSlideElement = {
+  // `<slide id>-element-NN`, where NN is the zero-padded zOrder.
+  id: string
+  // "text" = shape with text, "image" = picture, "shape" = everything else that is recorded.
+  kind: "text" | "image" | "shape"
+  // 1-based position among the recorded elements, in shape-tree document order.
+  zOrder: number
+  // Offset and extent read from the element's first `xfrm` (presumably EMUs, as stored in the file — TODO confirm).
+  bbox?: { x: number; y: number; w: number; h: number }
+  // The `likely*` flags are heuristic guesses; they are only ever set to true, never to false.
+  likelyBackground?: boolean
+  likelyHeroImage?: boolean
+  likelyLogo?: boolean
+  likelyOverlayMask?: boolean
+  likelyDecoration?: boolean
+  // Trimmed text runs of the shape, joined with newlines ("text" elements only).
+  text?: string
+  // Archive path of the picture's media file, when its relationship resolves.
+  source_ref?: string
+  // Exported image path, present only for pictures whose media was kept.
+  path?: string
+  // Whether the picture's media was exported ("kept") or recorded as skipped.
+  asset_status?: "kept" | "skipped"
+  // Value of the element's `cNvPr` `name` attribute, when non-empty.
+  name?: string
+}
+/**
+ * Structural summary of a single slide.
+ */
+export type PptxSlide = {
+  // Slide id of the form `slide-NN`, derived from the slide file name.
+  slide: string
+  // Slide size read from `sldSz` in `ppt/presentation.xml`; omitted when unavailable.
+  width?: number
+  height?: number
+  // Recorded top-level elements in shape-tree order.
+  elements: PptxSlideElement[]
+}
 export type DocumentMaterialsResult = {
   status: "processed" | "skipped" | "failed"
   source: string
@@ -22,6 +53,8 @@ export type DocumentMaterialsResult = {
   manifest_path?: string
   text_path?: string
   images?: DocumentMaterial[]
+  skipped_assets?: SkippedAsset[]
+  slides?: PptxSlide[]
   tables?: DocumentMaterial[]
   reason?: string
 }
@@ -36,9 +69,16 @@ type CachedManifest = {
   manifest_path: string
   text_path: string
   images: DocumentMaterial[]
+  skipped_assets: SkippedAsset[]
+  slides: PptxSlide[]
   tables: DocumentMaterial[]
 }
+/**
+ * Result of PPTX image extraction: the images that were exported to the cache
+ * and the assets that were deliberately left out.
+ */
+type PptxImageExtraction = {
+  images: DocumentMaterial[]
+  skipped_assets: SkippedAsset[]
+}
 const SUPPORTED_EXTENSIONS: Record<string, SupportedType> = {
   ".pptx": "pptx",
   ".docx": "docx",
@@ -62,8 +102,11 @@ function normalizeZipTarget(basePath: string, target: string): string {
 }
 function ensureWorkspacePath(filePath: string, workspaceDir: string): string {
-  const resolvedWorkspace = resolve(workspaceDir)
-  const resolvedFile = isAbsolute(filePath) ? resolve(filePath) : resolve(workspaceDir, filePath)
+  const resolvedWorkspace = realpathSync(resolve(workspaceDir))
+  const candidate = isAbsolute(filePath) ? resolve(filePath) : resolve(workspaceDir, filePath)
+  const resolvedFile = existsSync(candidate)
+    ? realpathSync(candidate)
+    : candidate
   if (resolvedFile !== resolvedWorkspace && !resolvedFile.startsWith(resolvedWorkspace + "/")) {
     throw new Error("file must be within workspace")
@@ -72,8 +115,24 @@ function ensureWorkspacePath(filePath: string, workspaceDir: string): string {
   return resolvedFile
 }
+/**
+ * Maps a path onto the real (symlink-resolved) workspace location.
+ *
+ * Existing paths are resolved with `realpathSync`. A path that does not exist
+ * yet but lies under the unresolved workspace path is re-rooted onto the
+ * resolved workspace directory. Anything else is returned as the plain
+ * resolved path.
+ *
+ * @param filePath - Path to normalize (resolved against the current working directory).
+ * @param workspaceDir - Workspace root; must exist, otherwise `realpathSync` throws.
+ * @returns The normalized absolute path.
+ */
+function normalizeWorkspaceChild(filePath: string, workspaceDir: string): string {
+  const aliasRoot = resolve(workspaceDir)
+  const realRoot = realpathSync(aliasRoot)
+  const target = resolve(filePath)
+  if (existsSync(target)) {
+    return realpathSync(target)
+  }
+  // The target is missing on disk, so it cannot be realpath'd; re-root it instead.
+  const insideAlias = target === aliasRoot || target.startsWith(aliasRoot + "/")
+  return insideAlias ? join(realRoot, relative(aliasRoot, target)) : target
+}
 function workspaceRelative(filePath: string, workspaceDir: string): string {
-  return relative(workspaceDir, filePath).replace(/\\/g, "/")
+  const resolvedWorkspace = realpathSync(resolve(workspaceDir))
+  const resolvedFile = normalizeWorkspaceChild(filePath, workspaceDir)
+  return relative(resolvedWorkspace, resolvedFile).replace(/\\/g, "/")
 }
 function buildFingerprint(filePath: string): string {
@@ -98,12 +157,256 @@ function parseXml(files: Record<string, Uint8Array>, path: string): any | null {
   return new DOMParser().parseFromString(new TextDecoder().decode(file), "text/xml")
 }
-function extractPptxImages(files: Record<string, Uint8Array>, cacheDir: string, workspaceDir: string): DocumentMaterial[] {
+/**
+ * Returns a node's name without its namespace prefix.
+ *
+ * Uses `localName` when the node provides one; otherwise takes the part of
+ * `nodeName` after the last colon. Returns "" for a missing node.
+ */
+function xmlLocalName(node: any): string {
+  const local = node?.localName
+  if (local != null) return local
+  const qualified = String(node?.nodeName ?? "")
+  return qualified.split(":").pop() ?? ""
+}
+/**
+ * Lists the direct children of `node` that are element nodes (nodeType 1),
+ * in document order. A missing node or missing `childNodes` yields [].
+ */
+function xmlElementChildren(node: any): any[] {
+  const elements: any[] = []
+  const nodes = node?.childNodes ?? []
+  let index = 0
+  while (index < nodes.length) {
+    const candidate = nodes[index]
+    index += 1
+    if (candidate?.nodeType === 1) elements.push(candidate)
+  }
+  return elements
+}
+/**
+ * Collects every descendant element of `node` whose local name equals `name`,
+ * in document (pre-order, depth-first) order. `node` itself is never included.
+ */
+function xmlDescendantsByLocalName(node: any, name: string): any[] {
+  const found: any[] = []
+  // Explicit stack; children are pushed in reverse so the first child is visited first.
+  const pending: any[] = xmlElementChildren(node).reverse()
+  while (pending.length > 0) {
+    const current = pending.pop()
+    if (xmlLocalName(current) === name) found.push(current)
+    const kids = xmlElementChildren(current)
+    for (let i = kids.length - 1; i >= 0; i -= 1) pending.push(kids[i])
+  }
+  return found
+}
+/**
+ * Finds the first descendant element of `node` (pre-order, depth-first) whose
+ * local name equals `name`.
+ *
+ * The search stops at the first hit instead of collecting every match in the
+ * subtree, which matters because this helper is called several times per
+ * slide element.
+ *
+ * @param node - Root of the search; it is not itself a candidate. May be null.
+ * @param name - Local (prefix-less) element name to look for.
+ * @returns The first matching element, or null when there is none.
+ */
+function firstDescendantByLocalName(node: any, name: string): any | null {
+  for (const child of xmlElementChildren(node)) {
+    if (xmlLocalName(child) === name) return child
+    const nested = firstDescendantByLocalName(child, name)
+    if (nested) return nested
+  }
+  return null
+}
+/**
+ * Gathers the text of all `t` descendants of `node`, trimmed, dropping empty
+ * ones, and joins them with newlines. Returns undefined when nothing remains.
+ */
+function extractShapeText(node: any): string | undefined {
+  const pieces: string[] = []
+  for (const textNode of xmlDescendantsByLocalName(node, "t")) {
+    const trimmed = textNode.textContent?.trim()
+    if (trimmed) pieces.push(trimmed)
+  }
+  if (pieces.length === 0) return undefined
+  return pieces.join("\n")
+}
+/**
+ * Reads the `name` attribute of the element's first `cNvPr` descendant.
+ * Returns undefined when the descendant or a non-empty name is missing.
+ */
+function extractElementName(node: any): string | undefined {
+  const properties = firstDescendantByLocalName(node, "cNvPr")
+  const label = properties?.getAttribute?.("name")
+  return label ? label : undefined
+}
+/**
+ * Converts an attribute value to a finite number.
+ *
+ * @param value - Raw attribute text; null, undefined and "" mean "absent".
+ * @returns The parsed number, or undefined when absent or not finite.
+ */
+function parseCoordinate(value: string | null | undefined): number | undefined {
+  if (value === null || value === undefined || value === "") return undefined
+  const numeric = Number(value)
+  if (!Number.isFinite(numeric)) return undefined
+  return numeric
+}
+/**
+ * Reads an element's bounding box from its first `xfrm` descendant: `off`
+ * supplies x/y and `ext` supplies cx/cy (reported as w/h).
+ *
+ * @returns The box, or undefined when the transform, its offset/extent, or
+ *          any of the four numeric attributes is missing or not a number.
+ */
+function extractElementBBox(node: any): { x: number; y: number; w: number; h: number } | undefined {
+  const transform = firstDescendantByLocalName(node, "xfrm")
+  const offset = transform ? firstDescendantByLocalName(transform, "off") : null
+  const extent = transform ? firstDescendantByLocalName(transform, "ext") : null
+  if (!offset || !extent) return undefined
+  const x = parseCoordinate(offset.getAttribute?.("x"))
+  const y = parseCoordinate(offset.getAttribute?.("y"))
+  const w = parseCoordinate(extent.getAttribute?.("cx"))
+  const h = parseCoordinate(extent.getAttribute?.("cy"))
+  if (x === undefined || y === undefined || w === undefined || h === undefined) {
+    return undefined
+  }
+  return { x, y, w, h }
+}
+/**
+ * Reads the slide dimensions from the `sldSz` element of
+ * `ppt/presentation.xml` (attributes `cx` and `cy`).
+ *
+ * @returns `{ width, height }`, or undefined when the part, the element, or
+ *          either attribute is missing or not numeric.
+ */
+function getPptxSlideSize(files: Record<string, Uint8Array>): { width: number; height: number } | undefined {
+  const presentation = parseXml(files, "ppt/presentation.xml")
+  const sizeNode = firstDescendantByLocalName(presentation, "sldSz")
+  if (!sizeNode) return undefined
+  const width = parseCoordinate(sizeNode.getAttribute?.("cx"))
+  const height = parseCoordinate(sizeNode.getAttribute?.("cy"))
+  return width == null || height == null ? undefined : { width, height }
+}
+/**
+ * Tells whether a box touches one of the four corner zones of the slide.
+ *
+ * A corner zone extends 12% of the slide width horizontally and 12% of the
+ * slide height vertically from each corner. The box counts as "near" a corner
+ * when its left or right edge lies in a horizontal margin and its top or
+ * bottom edge lies in a vertical margin.
+ */
+function isNearCorner(
+  bbox: { x: number; y: number; w: number; h: number },
+  slideWidth: number,
+  slideHeight: number,
+): boolean {
+  const marginX = slideWidth * 0.12
+  const marginY = slideHeight * 0.12
+  const nearLeft = bbox.x <= marginX
+  const nearRight = bbox.x + bbox.w >= slideWidth - marginX
+  const nearTop = bbox.y <= marginY
+  const nearBottom = bbox.y + bbox.h >= slideHeight - marginY
+  return (nearLeft || nearRight) && (nearTop || nearBottom)
+}
+/**
+ * Annotates a slide's elements with best-effort `likely*` role flags, based on
+ * each element's share of the slide area, its position, and its name/source path.
+ *
+ * The slide is returned untouched when either slide dimension is missing or 0.
+ * Otherwise `slide.elements` is replaced on the object that was passed in and
+ * that same object is returned. Elements without a bbox and "text" elements
+ * are left as they are.
+ *
+ * @param slide - Slide whose elements should be annotated (mutated in place).
+ * @param slideWidth - Slide width; compared directly with bbox values, so it must use the same units.
+ * @param slideHeight - Slide height; same unit requirement as `slideWidth`.
+ * @returns The same `slide` object.
+ */
+function applyPptxHeuristics(
+  slide: PptxSlide,
+  slideWidth: number | undefined,
+  slideHeight: number | undefined,
+): PptxSlide {
+  if (!slideWidth || !slideHeight) return slide
+  const slideArea = slideWidth * slideHeight
+  slide.elements = slide.elements.map((element) => {
+    if (!element.bbox) return element
+    // Fraction of the slide covered by the element's bounding box.
+    const areaRatio = (element.bbox.w * element.bbox.h) / slideArea
+    // Lower-cased media path plus element name, used for keyword matching below.
+    const sourceName = `${element.source_ref ?? ""} ${element.name ?? ""}`.toLowerCase()
+    if (element.kind === "image") {
+      const flags: Partial<PptxSlideElement> = {}
+      // Background and hero are mutually exclusive and only apply to exported ("kept") images.
+      if (areaRatio >= 0.75 && element.asset_status === "kept") flags.likelyBackground = true
+      else if (areaRatio >= 0.2 && element.asset_status === "kept") flags.likelyHeroImage = true
+      // Logo: a small image in a corner zone, or a logo/brand keyword in its name.
+      if (areaRatio <= 0.03 && isNearCorner(element.bbox, slideWidth, slideHeight)) flags.likelyLogo = true
+      if (/(logo|brand)/.test(sourceName)) flags.likelyLogo = true
+      // Every skipped image is flagged as an overlay mask, whatever the skip reason was.
+      if (/(mask|overlay|shadow)/.test(sourceName) || element.asset_status === "skipped") flags.likelyOverlayMask = true
+      if (/(arrow|ornament|decoration)/.test(sourceName)) flags.likelyDecoration = true
+      // Only allocate a new object when at least one flag was set.
+      return Object.keys(flags).length > 0 ? { ...element, ...flags } : element
+    }
+    if (element.kind === "shape") {
+      const flags: Partial<PptxSlideElement> = {}
+      // Large text-less shapes are treated as overlays; tiny or keyword-named ones as decoration.
+      if (areaRatio >= 0.4) flags.likelyOverlayMask = true
+      if (areaRatio <= 0.03 || /(arrow|ornament|decoration)/.test(sourceName)) flags.likelyDecoration = true
+      return Object.keys(flags).length > 0 ? { ...element, ...flags } : element
+    }
+    return element
+  })
+  return slide
+}
+/**
+ * Builds a map from relationship id to archive media path for one slide.
+ *
+ * Reads the slide's `.rels` part, normalizes every relationship target
+ * relative to the slide, and keeps only targets under `ppt/media/`.
+ * Returns an empty map when the `.rels` part cannot be parsed.
+ */
+function getSlideMediaTargets(files: Record<string, Uint8Array>, slidePath: string): Map<string, string> {
+  const mediaById = new Map<string, string>()
+  const relsDoc = parseXml(files, `${slidePath.replace("/slides/", "/slides/_rels/")}.rels`)
+  if (!relsDoc) return mediaById
+  const relNodes = relsDoc.getElementsByTagName("Relationship")
+  for (let index = 0; index < relNodes.length; index += 1) {
+    const relNode = relNodes[index]
+    const id = relNode.getAttribute("Id")
+    const target = relNode.getAttribute("Target")
+    if (id && target) {
+      const zipPath = normalizeZipTarget(slidePath, target)
+      if (zipPath.startsWith("ppt/media/")) mediaById.set(id, zipPath)
+    }
+  }
+  return mediaById
+}
+/**
+ * Builds a structural summary of every slide in the archive.
+ *
+ * For each `ppt/slides/slideN.xml` (in numeric order) the direct children of
+ * the slide's `spTree` are walked in document order and recorded as text,
+ * image or shape elements. Group contents are not descended into; a group is
+ * recorded as a single shape. Role heuristics are applied to each slide that
+ * has a shape tree.
+ *
+ * @param files - Unzipped archive entries keyed by path.
+ * @param images - Images that were exported; used to mark pictures as "kept" and attach their path.
+ * @param skippedAssets - Assets that were skipped; used to mark pictures as "skipped".
+ * @returns One entry per slide file, in slide order.
+ */
+function extractPptxSlides(
+  files: Record<string, Uint8Array>,
+  images: DocumentMaterial[],
+  skippedAssets: SkippedAsset[],
+): PptxSlide[] {
+  const slideSize = getPptxSlideSize(files)
+  // Lookups from archive media path to the exported / skipped record.
+  const keptBySource = new Map(images.map((image) => [image.source_ref, image]))
+  const skippedBySource = new Map(skippedAssets.map((asset) => [asset.source_ref, asset]))
+  const slideFiles = Object.keys(files)
+    .filter((file) => /^ppt\/slides\/slide\d+\.xml$/.test(file))
+    .sort((a, b) => a.localeCompare(b, undefined, { numeric: true }))
+  return slideFiles.map((slidePath) => {
+    const slideNumber = slidePath.match(/slide(\d+)\.xml$/)?.[1] ?? "0"
+    const slideId = `slide-${slideNumber.padStart(2, "0")}`
+    const doc = parseXml(files, slidePath)
+    const mediaTargets = getSlideMediaTargets(files, slidePath)
+    const elements: PptxSlideElement[] = []
+    // Unparseable slide or missing shape tree: report the slide with no elements (heuristics are not run).
+    if (!doc) return { slide: slideId, ...(slideSize ?? {}), elements }
+    const spTree = firstDescendantByLocalName(doc, "spTree")
+    if (!spTree) return { slide: slideId, ...(slideSize ?? {}), elements }
+    for (const node of xmlElementChildren(spTree)) {
+      const kind = xmlLocalName(node)
+      // Skip the shape tree's own property containers; they are not slide elements.
+      if (kind === "nvGrpSpPr" || kind === "grpSpPr") continue
+      // zOrder counts recorded elements only, so unrecognized node kinds do not leave gaps.
+      const zOrder = elements.length + 1
+      const id = `${slideId}-element-${String(zOrder).padStart(2, "0")}`
+      const name = extractElementName(node)
+      const bbox = extractElementBBox(node)
+      if (kind === "sp") {
+        // A shape with any text is reported as "text", otherwise as a plain "shape".
+        const text = extractShapeText(node)
+        elements.push(text
+          ? { id, kind: "text", zOrder, text, ...(bbox ? { bbox } : {}), ...(name ? { name } : {}) }
+          : { id, kind: "shape", zOrder, ...(bbox ? { bbox } : {}), ...(name ? { name } : {}) })
+        continue
+      }
+      if (kind === "pic") {
+        // Resolve the picture's relationship id to its media path, then to its kept/skipped record.
+        const blip = firstDescendantByLocalName(node, "blip")
+        const rid = blip?.getAttribute?.("r:embed") || blip?.getAttribute?.("embed") || undefined
+        const sourceRef = rid ? mediaTargets.get(rid) : undefined
+        const kept = sourceRef ? keptBySource.get(sourceRef) : undefined
+        const skipped = sourceRef ? skippedBySource.get(sourceRef) : undefined
+        elements.push({
+          id,
+          kind: "image",
+          zOrder,
+          ...(bbox ? { bbox } : {}),
+          ...(name ? { name } : {}),
+          ...(sourceRef ? { source_ref: sourceRef } : {}),
+          ...(kept?.path ? { path: kept.path } : {}),
+          // asset_status is omitted when the media is in neither list; "kept" wins if it is in both.
+          ...((kept || skipped) ? { asset_status: kept ? "kept" as const : "skipped" as const } : {}),
+        })
+        continue
+      }
+      // Connectors, graphic frames and groups are all reported as generic shapes.
+      if (kind === "cxnSp" || kind === "graphicFrame" || kind === "grpSp") {
+        elements.push({ id, kind: "shape", zOrder, ...(bbox ? { bbox } : {}), ...(name ? { name } : {}) })
+      }
+    }
+    return applyPptxHeuristics({ slide: slideId, ...(slideSize ?? {}), elements }, slideSize?.width, slideSize?.height)
+  })
+}
+const LOW_VALUE_PPTX_ASSET = /(icon|logo|mask|overlay|shadow|decoration|ornament|arrow)/i
+/**
+ * Derives a coarse category for a skipped asset from its archive path.
+ *
+ * Checks run in priority order: SVG extension, then icon, logo, overlay-like
+ * and decoration-like keywords, and finally the skip reason itself.
+ *
+ * @param sourceRef - Path of the asset inside the archive.
+ * @param reason - Why the asset was skipped.
+ * @returns The category, or undefined when nothing matches.
+ */
+function classifySkippedAsset(sourceRef: string, reason: SkippedAsset["reason"]): SkippedAsset["kind"] | undefined {
+  // Compare the extension case-insensitively so "image1.SVG" is recognised like
+  // "image1.svg", matching the case-insensitive keyword checks below.
+  if (sourceRef.toLowerCase().endsWith(".svg")) return "svg"
+  if (/icon/i.test(sourceRef)) return "icon"
+  if (/logo/i.test(sourceRef)) return "logo"
+  if (/(mask|overlay|shadow)/i.test(sourceRef)) return "overlay"
+  if (/(decoration|ornament|arrow)/i.test(sourceRef)) return "decoration"
+  if (reason === "svg_asset") return "svg"
+  return undefined
+}
+/**
+ * Decides whether a slide-referenced media file should be left out of the
+ * exported images.
+ *
+ * SVG files are always skipped. Other files are skipped when their base name
+ * matches the low-value keyword pattern (icon, logo, mask, ...).
+ *
+ * @param sourceRef - Path of the media file inside the archive.
+ * @returns The skip reason (and category when known), or null to keep the asset.
+ */
+function shouldSkipPptxAsset(sourceRef: string): { reason: SkippedAsset["reason"]; kind?: SkippedAsset["kind"] } | null {
+  // Case-insensitive so an upper-case ".SVG" extension is not exported as a raster image.
+  if (sourceRef.toLowerCase().endsWith(".svg")) {
+    return { reason: "svg_asset", kind: "svg" }
+  }
+  if (LOW_VALUE_PPTX_ASSET.test(basename(sourceRef))) {
+    return { reason: "low_value_asset", kind: classifySkippedAsset(sourceRef, "low_value_asset") }
+  }
+  return null
+}
+function extractPptxImages(files: Record<string, Uint8Array>, cacheDir: string, workspaceDir: string): PptxImageExtraction {
   const relFiles = Object.keys(files)
     .filter((file) => /^ppt\/slides\/_rels\/slide\d+\.xml\.rels$/.test(file))
     .sort((a, b) => a.localeCompare(b, undefined, { numeric: true }))
   const images: DocumentMaterial[] = []
+  const skipped_assets: SkippedAsset[] = []
   const seenTargets = new Set<string>()
   for (const relPath of relFiles) {
@@ -124,8 +427,19 @@ function extractPptxImages(files: Record<string, Uint8Array>, cacheDir: string,
       const media = files[normalized]
       if (!media) continue
-      imageIndex += 1
       seenTargets.add(normalized)
+      const skipped = shouldSkipPptxAsset(normalized)
+      if (skipped) {
+        skipped_assets.push({
+          source_ref: normalized,
+          page_or_slide: `slide-${slideNumber.padStart(2, "0")}`,
+          reason: skipped.reason,
+          kind: skipped.kind,
+        })
+        continue
+      }
+      imageIndex += 1
       const exportedName = `slide-${slideNumber.padStart(2, "0")}-image-${String(imageIndex).padStart(2, "0")}${extname(normalized)}`
       const outputPath = join(cacheDir, "images", exportedName)
       writeCachedBuffer(outputPath, media)
@@ -143,18 +457,14 @@ function extractPptxImages(files: Record<string, Uint8Array>, cacheDir: string,
     .sort()
   for (const mediaPath of remainingMedia) {
-    const exportedName = `unmapped-${basename(mediaPath)}`
-    const outputPath = join(cacheDir, "images", exportedName)
-    writeCachedBuffer(outputPath, files[mediaPath])
-    images.push({
-      path: materialPath(cacheDir, workspaceDir, "images", exportedName),
+    skipped_assets.push({
       source_ref: mediaPath,
-      note: "No slide-level relationship found",
+      reason: "unmapped_media",
+      kind: classifySkippedAsset(mediaPath, "unmapped_media"),
     })
   }
-  return images
+  return { images, skipped_assets }
 }
 function extractDocxImages(files: Record<string, Uint8Array>, cacheDir: string, workspaceDir: string): DocumentMaterial[] {
@@ -295,6 +605,8 @@ async function processOfficeFile(filePath: string, workspaceDir: string, type: S
       manifest_path: manifest.manifest_path,
       text_path: manifest.text_path,
       images: manifest.images,
+      skipped_assets: manifest.skipped_assets,
+      slides: manifest.slides,
       tables: manifest.tables,
     }
   }
@@ -314,11 +626,17 @@ async function processOfficeFile(filePath: string, workspaceDir: string, type: S
   const textPath = join(cacheDir, "text.txt")
   writeFileSync(textPath, `[Extracted from: ${basename(filePath)}]\n\n${text}`, "utf-8")
-  const images = type === "pptx"
+  const pptxAssets = type === "pptx"
     ? extractPptxImages(files, cacheDir, workspaceDir)
+    : null
+  const images = type === "pptx"
+    ? pptxAssets!.images
     : type === "docx"
       ? extractDocxImages(files, cacheDir, workspaceDir)
       : extractXlsxImages(files, cacheDir, workspaceDir)
+  const slides = type === "pptx"
+    ? extractPptxSlides(files, images, pptxAssets!.skipped_assets)
+    : undefined
   const result: DocumentMaterialsResult = {
     status: "processed",
@@ -328,6 +646,8 @@ async function processOfficeFile(filePath: string, workspaceDir: string, type: S
     manifest_path: workspaceRelative(manifestPath, workspaceDir),
     text_path: workspaceRelative(textPath, workspaceDir),
     images,
+    skipped_assets: pptxAssets?.skipped_assets ?? [],
+    slides,
     tables: extractTables(type, workspaceRelative(textPath, workspaceDir)),
   }
@@ -339,6 +659,8 @@ async function processOfficeFile(filePath: string, workspaceDir: string, type: S
     manifest_path: result.manifest_path!,
     text_path: result.text_path!,
     images: result.images ?? [],
+    skipped_assets: result.skipped_assets ?? [],
+    slides: result.slides ?? [],
     tables: result.tables ?? [],
   }

package/lib/read-hooks/dispatch.ts ADDED Viewed

@@ -0,0 +1,45 @@
+import { basename, extname } from "path"
+// Lower-case extensions (with leading dot) that are treated as Office documents.
+export const OFFICE_EXTENSIONS = new Set([".docx", ".pptx", ".xlsx"])
+// Lower-case extensions (with leading dot) that are treated as images.
+export const IMAGE_EXTENSIONS = new Set([".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif", ".webp", ".gif"])
+/**
+ * How a file should be handled, as decided by `classifyReadFile`:
+ * Office documents, PDFs and images each get their own strategy, and every
+ * other file is "passthrough".
+ */
+export type ReadStrategy =
+  | "before-materialize-document"
+  | "after-extract-text"
+  | "after-compress-image"
+  | "passthrough"
+/**
+ * Picks the read strategy for a file from its extension (case-insensitive).
+ *
+ * @param filePath - Path or file name to classify.
+ * @returns The strategy for Office documents, PDFs or images, else "passthrough".
+ */
+export function classifyReadFile(filePath: string): ReadStrategy {
+  const extension = extname(filePath).toLowerCase()
+  return OFFICE_EXTENSIONS.has(extension)
+    ? "before-materialize-document"
+    : extension === ".pdf"
+      ? "after-extract-text"
+      : IMAGE_EXTENSIONS.has(extension)
+        ? "after-compress-image"
+        : "passthrough"
+}
+/**
+ * Prefixes extracted text with a one-line header naming the source file
+ * (base name only), followed by a blank line.
+ */
+export function formatExtractedText(filePath: string, text: string): string {
+  const header = "[Extracted from: " + basename(filePath) + "]"
+  return [header, "", text].join("\n")
+}
+/**
+ * Renders the markdown view shown in place of an Office document: a title
+ * with the file's base name, a "Text" section, and an "Images" section
+ * listing one image path per bullet.
+ *
+ * @param filePath - Path of the original document (only the base name is shown).
+ * @param text - Extracted text; blank text is replaced by a placeholder line.
+ * @param images - Extracted images; missing or empty renders "- None".
+ * @returns The markdown document, lines joined with "\n".
+ */
+export function buildOfficeReadView(
+  filePath: string,
+  text: string,
+  images: Array<{ path: string }> | undefined,
+): string {
+  const body = text.trim() || "No text extracted."
+  const imageBullets = images?.length
+    ? images.map((image) => `- ${image.path}`)
+    : ["- None"]
+  return [
+    `# Extracted from: ${basename(filePath)}`,
+    "",
+    "## Text",
+    "",
+    body,
+    "",
+    "## Images",
+    "",
+    ...imageBullets,
+  ].join("\n")
+}

package/lib/read-hooks/index.ts CHANGED Viewed

@@ -4,7 +4,7 @@
  * Entry point for the read-hooks module.
  * Exports preRead and postRead for use in plugins/revela.ts hook handlers.
  *
- * preRead  → tool.execute.before: redirect binary files (DOCX/PPTX/XLSX) to temp txt
+ * preRead  → tool.execute.before: materialize Office docs and redirect to temp markdown
  * postRead → tool.execute.after:  transform PDF/image attachments before LLM sees them
  */

package/lib/read-hooks/office-read-view.ts ADDED Viewed

@@ -0,0 +1,77 @@
+import { readFileSync } from "fs"
+import { extname, join } from "path"
+import type { PptxSlide } from "../document-materials/extract"
+import { extractDocumentMaterials } from "../document-materials/extract"
+import { buildOfficeReadView } from "./dispatch"
+import { extractDocx } from "./extractors/docx"
+import { extractPptx } from "./extractors/pptx"
+import { extractXlsx } from "./extractors/xlsx"
+import { formatExtractedText } from "./dispatch"
+// Extension → plain-text extractor, used when document materialization does not produce a result.
+const HANDLERS: Record<string, (buf: Buffer) => Promise<string>> = {
+  ".docx": extractDocx,
+  ".pptx": extractPptx,
+  ".xlsx": extractXlsx,
+}
+/**
+ * Renders a "Slide Structure" markdown section: one bullet per slide with
+ * element counts by kind, optionally followed by a line of likely roles.
+ *
+ * @param slides - Slide summaries; missing or empty yields "".
+ * @returns The section text, starting with a blank line so it can be appended to a view.
+ */
+function buildPptxStructureHints(slides: PptxSlide[] | undefined): string {
+  if (!slides?.length) return ""
+  const output = ["", "## Slide Structure", ""]
+  for (const slide of slides) {
+    // Count all element kinds in a single pass.
+    let texts = 0
+    let keptImages = 0
+    let skippedImages = 0
+    let shapes = 0
+    for (const element of slide.elements) {
+      if (element.kind === "text") texts += 1
+      else if (element.kind === "shape") shapes += 1
+      else if (element.kind === "image") {
+        if (element.asset_status === "kept") keptImages += 1
+        else if (element.asset_status === "skipped") skippedImages += 1
+      }
+    }
+    const parts: string[] = []
+    if (texts > 0) parts.push(`${texts} text`)
+    if (keptImages > 0) parts.push(`${keptImages} kept image`)
+    if (skippedImages > 0) parts.push(`${skippedImages} skipped image`)
+    if (shapes > 0) parts.push(`${shapes} shape`)
+    output.push(`- ${slide.slide}: ${parts.join(", ") || "no parsed elements"}`)
+    const roles: Array<string | null> = [
+      countRole(slide, (element) => element.likelyBackground, "background image"),
+      countRole(slide, (element) => element.likelyHeroImage, "hero image"),
+      countRole(slide, (element) => element.likelyLogo, "logo"),
+      countRole(slide, (element) => element.likelyOverlayMask, "overlay"),
+      countRole(slide, (element) => element.likelyDecoration, "decoration"),
+    ]
+    const roleLine = roles.filter(Boolean).join(", ")
+    if (roleLine) output.push(`  likely roles: ${roleLine}`)
+  }
+  return output.join("\n")
+}
+/**
+ * Counts the slide elements matching `predicate` and formats the count with
+ * a label that gets an "s" suffix unless the count is exactly 1.
+ *
+ * @returns e.g. "1 logo" / "2 logos", or null when nothing matches.
+ */
+function countRole(
+  slide: PptxSlide,
+  predicate: (element: PptxSlide["elements"][number]) => boolean | undefined,
+  label: string,
+): string | null {
+  let matches = 0
+  for (const element of slide.elements) {
+    if (predicate(element)) matches += 1
+  }
+  if (matches === 0) return null
+  const suffix = matches === 1 ? "" : "s"
+  return `${matches} ${label}${suffix}`
+}
+/**
+ * Produces the text shown to the model in place of an Office document.
+ *
+ * The document is first materialized into the workspace cache. On success the
+ * cached text and extracted image paths are rendered as a markdown view, with
+ * slide structure hints appended for PPTX files. Otherwise the file is read
+ * directly and run through the plain-text extractor for its extension.
+ *
+ * @param filePath - Path of the .docx/.pptx/.xlsx file.
+ * @param workspaceDir - Workspace root that cached paths are relative to.
+ * @returns The markdown read view, or the plain extracted text with a source header.
+ * @throws Error when the file extension has no extractor.
+ */
+export async function createOfficeReadView(filePath: string, workspaceDir: string): Promise<string> {
+  // Use extname (as classifyReadFile does) rather than slicing at the last ".":
+  // slicing returns the last character for dot-less paths and a path fragment
+  // when the only dot is in a directory name.
+  const ext = extname(filePath).toLowerCase()
+  const handler = HANDLERS[ext]
+  if (!handler) throw new Error(`unsupported office file type: ${ext}`)
+  const materialized = await extractDocumentMaterials(filePath, workspaceDir)
+  if (materialized.status === "processed" && materialized.text_path) {
+    const textPath = join(workspaceDir, materialized.text_path)
+    const extracted = readFileSync(textPath, "utf-8")
+    // The cached text file starts with a source header; strip it, since the view adds its own title.
+    const text = extracted.replace(/^\[Extracted from: .*?\]\n\n/, "")
+    const view = buildOfficeReadView(filePath, text, materialized.images)
+    return ext === ".pptx"
+      ? view + buildPptxStructureHints(materialized.slides)
+      : view
+  }
+  // Materialization was skipped or failed: fall back to plain text extraction.
+  const buf = readFileSync(filePath)
+  const text = await handler(buf)
+  return formatExtractedText(filePath, text)
+}

package/lib/read-hooks/post-read.ts CHANGED Viewed

@@ -16,11 +16,10 @@
  * of packages/opencode/src/session/prompt.ts.
  */
-import { extname, basename } from "path"
+import { basename } from "path"
 import { extractPdfText } from "./extractors/pdf"
 import { compressImage } from "./image/compress"
-const IMAGE_EXTS = new Set([".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif", ".webp", ".gif"])
+import { classifyReadFile, formatExtractedText } from "./dispatch"
 interface ReadOutput {
   title: string
@@ -41,10 +40,10 @@ export async function postRead(
 ): Promise<void> {
   if (!output.attachments?.length) return
-  const ext = extname(args.filePath).toLowerCase()
+  const strategy = classifyReadFile(args.filePath)
   // ── PDF: extract text, drop base64 attachment ───────────────────────────
-  if (ext === ".pdf") {
+  if (strategy === "after-extract-text") {
     const attachment = output.attachments[0]
     const base64 = attachment.url.split(",")[1]
     if (!base64) return
@@ -52,14 +51,14 @@ export async function postRead(
     const buf = Buffer.from(base64, "base64")
     const text = await extractPdfText(buf)
-    output.output = `[Extracted from: ${basename(args.filePath)}]\n\n${text}`
+    output.output = formatExtractedText(args.filePath, text)
     output.title = `Extracted text from ${basename(args.filePath)}`
     output.attachments.length = 0 // Remove base64 — saves significant tokens
     return
   }
   // ── Images: compress attachment to reduce token cost ────────────────────
-  if (IMAGE_EXTS.has(ext)) {
+  if (strategy === "after-compress-image") {
     const attachment = output.attachments[0]
     const base64 = attachment.url.split(",")[1]
     if (!base64) return

package/lib/read-hooks/pre-read.ts CHANGED Viewed

@@ -7,44 +7,33 @@
  * Handles DOCX, PPTX, XLSX — formats that cause read tool to throw
  * Effect.fail("Cannot read binary file"), so the after-hook never fires.
  *
- * Strategy: extract text → write temp .txt file → redirect args.filePath.
+ * Strategy: materialize the document into cached text + images, render a
+ * markdown read view, then redirect args.filePath to that temp .md file.
  * The read tool then reads the temp file normally. LLM is unaware of the redirect.
  */
-import { readFileSync, writeFileSync } from "fs"
-import { extname, basename, join } from "path"
+import { writeFileSync } from "fs"
+import { join } from "path"
 import { tmpdir } from "os"
 import { randomUUID } from "crypto"
-import { extractDocx } from "./extractors/docx"
-import { extractPptx } from "./extractors/pptx"
-import { extractXlsx } from "./extractors/xlsx"
-// Extension → extractor function mapping
-const HANDLERS: Record<string, (buf: Buffer) => Promise<string>> = {
-  ".docx": extractDocx,
-  ".pptx": extractPptx,
-  ".xlsx": extractXlsx,
-}
+import { classifyReadFile } from "./dispatch"
+import { createOfficeReadView } from "./office-read-view"
 /**
  * Intercept read tool args before execution.
- * If the file is a supported binary format, extract its text and redirect
- * args.filePath to a temp .txt file containing the extracted content.
+ * If the file is a supported Office document, materialize it into cached
+ * text + images and redirect args.filePath to a temporary markdown read view.
  *
  * @param args - Mutable read tool args object (from output.args in before-hook)
  */
 export async function preRead(args: { filePath: string; [k: string]: any }): Promise<void> {
-  const ext = extname(args.filePath).toLowerCase()
-  const handler = HANDLERS[ext]
-  if (!handler) return // Not a handled format — let read tool proceed normally
+  if (classifyReadFile(args.filePath) !== "before-materialize-document") return
-  const buf = readFileSync(args.filePath)
-  const text = await handler(buf)
+  const workspaceDir = process.cwd()
+  const output = await createOfficeReadView(args.filePath, workspaceDir)
-  // Write extracted text to a temp file, prefixed with source info
-  const header = `[Extracted from: ${basename(args.filePath)}]\n\n`
-  const tmpPath = join(tmpdir(), `revela-${randomUUID()}.txt`)
-  writeFileSync(tmpPath, header + text, "utf-8")
+  const tmpPath = join(tmpdir(), `revela-${randomUUID()}.md`)
+  writeFileSync(tmpPath, output, "utf-8")
   // Redirect read tool to the temp file
   args.filePath = tmpPath

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@cyber-dash-tech/revela",
-  "version": "0.2.1",
+  "version": "0.3.0",
   "description": "OpenCode plugin that turns AI into an HTML slide deck generator",
   "type": "module",
   "main": "./index.ts",

package/plugin.ts CHANGED Viewed

@@ -25,10 +25,9 @@ import { ACTIVE_PROMPT_FILE } from "./lib/config"
 import { ctx } from "./lib/ctx"
 import { preRead } from "./lib/read-hooks"
 import { postRead } from "./lib/read-hooks"
-import { extractDocx } from "./lib/read-hooks/extractors/docx"
-import { extractPptx } from "./lib/read-hooks/extractors/pptx"
-import { extractXlsx } from "./lib/read-hooks/extractors/xlsx"
 import { extractPdfText } from "./lib/read-hooks/extractors/pdf"
+import { createOfficeReadView } from "./lib/read-hooks/office-read-view"
+import { OFFICE_EXTENSIONS, IMAGE_EXTENSIONS, formatExtractedText } from "./lib/read-hooks/dispatch"
 import { handleHelp } from "./lib/commands/help"
 import { handleEnable } from "./lib/commands/enable"
 import { handleDisable } from "./lib/commands/disable"
@@ -241,19 +240,11 @@ const server: Plugin = (async (pluginCtx) => {
     // directly — the read tool is never called, so tool.execute.before/after
     // hooks don't fire. This hook intercepts FileParts before LLM sees them.
     //
-    // DOCX/PPTX/XLSX/PDF → extract text → replace with TextPart
+    // DOCX/PPTX/XLSX/PDF → extract text/read view → replace with TextPart
     // Images              → replace with TextPart hint (LLM can use read tool)
     "chat.message": async (input, output) => {
       if (!ctx.enabled) return
-      const IMAGE_EXTS = new Set([".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif", ".webp", ".gif"])
-      const DOC_HANDLERS: Record<string, (buf: Buffer) => Promise<string>> = {
-        ".docx": extractDocx,
-        ".pptx": extractPptx,
-        ".xlsx": extractXlsx,
-        ".pdf": extractPdfText,
-      }
       for (let i = 0; i < output.parts.length; i++) {
         const part = output.parts[i] as any
         if (part.type !== "file") continue
@@ -264,15 +255,22 @@ const server: Plugin = (async (pluginCtx) => {
         const name = basename(filePath)
         try {
-          if (DOC_HANDLERS[ext]) {
+          if (OFFICE_EXTENSIONS.has(ext)) {
+            const text = await createOfficeReadView(filePath, process.cwd())
+            output.parts[i] = {
+              ...part,
+              type: "text",
+              text,
+            } as any
+          } else if (ext === ".pdf") {
             const buf = readFileSync(filePath)
-            const text = await DOC_HANDLERS[ext](buf)
+            const text = await extractPdfText(buf)
             output.parts[i] = {
               ...part,
               type: "text",
-              text: `[Extracted from: ${name}]\n\n${text}`,
+              text: formatExtractedText(filePath, text),
             } as any
-          } else if (IMAGE_EXTS.has(ext)) {
+          } else if (IMAGE_EXTENSIONS.has(ext)) {
             output.parts[i] = {
               ...part,
               type: "text",

package/tools/extract-document-materials.ts CHANGED Viewed

@@ -3,7 +3,7 @@ import { extractDocumentMaterials } from "../lib/document-materials/extract"
 export default tool({
   description:
-    "Extract research materials from a workspace document into a workspace-local cache. " +
+    "Extract reusable materials from a workspace document into a workspace-local cache. " +
     "Supports pptx, docx, and xlsx. Produces a manifest plus extracted text, embedded images, and available slide/sheet mappings. " +
     "Unsupported file types are skipped instead of failing.",
   args: {