@cyber-dash-tech/revela 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/agents/research-prompt.ts +12 -2
- package/lib/document-materials/extract.ts +695 -0
- package/lib/read-hooks/dispatch.ts +45 -0
- package/lib/read-hooks/index.ts +1 -1
- package/lib/read-hooks/office-read-view.ts +77 -0
- package/lib/read-hooks/post-read.ts +6 -7
- package/lib/read-hooks/pre-read.ts +13 -24
- package/package.json +1 -1
- package/plugin.ts +17 -17
- package/tools/extract-document-materials.ts +18 -0
|
@@ -37,8 +37,17 @@ Given a research brief specifying your topic and axis, you will:
|
|
|
37
37
|
Use the **\`revela-workspace-scan\`** tool in a single call to discover all document
|
|
38
38
|
files in the workspace (PDF, Word, Excel, PowerPoint, CSV, text).
|
|
39
39
|
|
|
40
|
-
Then
|
|
41
|
-
|
|
40
|
+
Then select the files relevant to your research axis.
|
|
41
|
+
|
|
42
|
+
For every selected file, call **\`revela-extract-document-materials\`** first.
|
|
43
|
+
- \`pptx\`, \`docx\`, and \`xlsx\` will produce a manifest plus extracted text and any available embedded materials
|
|
44
|
+
- unsupported file types will be skipped automatically
|
|
45
|
+
|
|
46
|
+
After that, use the \`read\` tool on:
|
|
47
|
+
- the original relevant file when you want the plain extracted text
|
|
48
|
+
- the generated manifest and extracted image/table files when visual or tabular evidence matters
|
|
49
|
+
|
|
50
|
+
For PDFs and Office formats, the Revela plugin extracts text transparently — just call \`read\` normally.
|
|
42
51
|
|
|
43
52
|
---
|
|
44
53
|
|
|
@@ -125,6 +134,7 @@ Gaps:
|
|
|
125
134
|
- **NEVER** ask the user for information you can find through search or workspace files
|
|
126
135
|
- **NEVER** use the raw \`write\` tool — always use \`revela-research-save\`
|
|
127
136
|
- **NEVER** fabricate image URLs — only record URLs you actually found
|
|
137
|
+
- **Always** call \`revela-extract-document-materials\` for every selected workspace file before deciding which extracted materials to read next
|
|
128
138
|
- **Always** include source attribution on every data point
|
|
129
139
|
- **Always** use tables for comparative data (more useful than bullets for presentations)
|
|
130
140
|
- **Preserve** raw data — the primary agent will select what to include in slides
|
|
@@ -0,0 +1,695 @@
|
|
|
1
|
+
import { createHash } from "crypto"
|
|
2
|
+
import { existsSync, mkdirSync, readFileSync, realpathSync, statSync, writeFileSync } from "fs"
|
|
3
|
+
import { basename, dirname, extname, isAbsolute, join, relative, resolve } from "path"
|
|
4
|
+
import { DOMParser } from "@xmldom/xmldom"
|
|
5
|
+
import { unzipSync } from "fflate"
|
|
6
|
+
import { extractDocx } from "../read-hooks/extractors/docx"
|
|
7
|
+
import { extractPptx } from "../read-hooks/extractors/pptx"
|
|
8
|
+
import { extractXlsx } from "../read-hooks/extractors/xlsx"
|
|
9
|
+
|
|
10
|
+
export type DocumentMaterial = {
|
|
11
|
+
path: string
|
|
12
|
+
source_ref: string
|
|
13
|
+
page_or_slide?: string
|
|
14
|
+
note?: string
|
|
15
|
+
}
|
|
16
|
+
|
|
17
|
+
export type SkippedAsset = {
|
|
18
|
+
source_ref: string
|
|
19
|
+
page_or_slide?: string
|
|
20
|
+
reason: "svg_asset" | "unmapped_media" | "low_value_asset"
|
|
21
|
+
kind?: "svg" | "icon" | "logo" | "overlay" | "decoration"
|
|
22
|
+
}
|
|
23
|
+
|
|
24
|
+
export type PptxSlideElement = {
|
|
25
|
+
id: string
|
|
26
|
+
kind: "text" | "image" | "shape"
|
|
27
|
+
zOrder: number
|
|
28
|
+
bbox?: { x: number; y: number; w: number; h: number }
|
|
29
|
+
likelyBackground?: boolean
|
|
30
|
+
likelyHeroImage?: boolean
|
|
31
|
+
likelyLogo?: boolean
|
|
32
|
+
likelyOverlayMask?: boolean
|
|
33
|
+
likelyDecoration?: boolean
|
|
34
|
+
text?: string
|
|
35
|
+
source_ref?: string
|
|
36
|
+
path?: string
|
|
37
|
+
asset_status?: "kept" | "skipped"
|
|
38
|
+
name?: string
|
|
39
|
+
}
|
|
40
|
+
|
|
41
|
+
export type PptxSlide = {
|
|
42
|
+
slide: string
|
|
43
|
+
width?: number
|
|
44
|
+
height?: number
|
|
45
|
+
elements: PptxSlideElement[]
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
export type DocumentMaterialsResult = {
|
|
49
|
+
status: "processed" | "skipped" | "failed"
|
|
50
|
+
source: string
|
|
51
|
+
type: "pptx" | "docx" | "xlsx" | "other"
|
|
52
|
+
cache_dir?: string
|
|
53
|
+
manifest_path?: string
|
|
54
|
+
text_path?: string
|
|
55
|
+
images?: DocumentMaterial[]
|
|
56
|
+
skipped_assets?: SkippedAsset[]
|
|
57
|
+
slides?: PptxSlide[]
|
|
58
|
+
tables?: DocumentMaterial[]
|
|
59
|
+
reason?: string
|
|
60
|
+
}
|
|
61
|
+
|
|
62
|
+
type SupportedType = Exclude<DocumentMaterialsResult["type"], "other">
|
|
63
|
+
|
|
64
|
+
type CachedManifest = {
|
|
65
|
+
source: string
|
|
66
|
+
type: SupportedType
|
|
67
|
+
fingerprint: string
|
|
68
|
+
cache_dir: string
|
|
69
|
+
manifest_path: string
|
|
70
|
+
text_path: string
|
|
71
|
+
images: DocumentMaterial[]
|
|
72
|
+
skipped_assets: SkippedAsset[]
|
|
73
|
+
slides: PptxSlide[]
|
|
74
|
+
tables: DocumentMaterial[]
|
|
75
|
+
}
|
|
76
|
+
|
|
77
|
+
type PptxImageExtraction = {
|
|
78
|
+
images: DocumentMaterial[]
|
|
79
|
+
skipped_assets: SkippedAsset[]
|
|
80
|
+
}
|
|
81
|
+
|
|
82
|
+
const SUPPORTED_EXTENSIONS: Record<string, SupportedType> = {
|
|
83
|
+
".pptx": "pptx",
|
|
84
|
+
".docx": "docx",
|
|
85
|
+
".xlsx": "xlsx",
|
|
86
|
+
}
|
|
87
|
+
|
|
88
|
+
function normalizeZipTarget(basePath: string, target: string): string {
|
|
89
|
+
const segments = join(dirname(basePath), target).split("/")
|
|
90
|
+
const normalized: string[] = []
|
|
91
|
+
|
|
92
|
+
for (const segment of segments) {
|
|
93
|
+
if (!segment || segment === ".") continue
|
|
94
|
+
if (segment === "..") {
|
|
95
|
+
normalized.pop()
|
|
96
|
+
continue
|
|
97
|
+
}
|
|
98
|
+
normalized.push(segment)
|
|
99
|
+
}
|
|
100
|
+
|
|
101
|
+
return normalized.join("/")
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
function ensureWorkspacePath(filePath: string, workspaceDir: string): string {
|
|
105
|
+
const resolvedWorkspace = realpathSync(resolve(workspaceDir))
|
|
106
|
+
const candidate = isAbsolute(filePath) ? resolve(filePath) : resolve(workspaceDir, filePath)
|
|
107
|
+
const resolvedFile = existsSync(candidate)
|
|
108
|
+
? realpathSync(candidate)
|
|
109
|
+
: candidate
|
|
110
|
+
|
|
111
|
+
if (resolvedFile !== resolvedWorkspace && !resolvedFile.startsWith(resolvedWorkspace + "/")) {
|
|
112
|
+
throw new Error("file must be within workspace")
|
|
113
|
+
}
|
|
114
|
+
|
|
115
|
+
return resolvedFile
|
|
116
|
+
}
|
|
117
|
+
|
|
118
|
+
function normalizeWorkspaceChild(filePath: string, workspaceDir: string): string {
|
|
119
|
+
const workspaceAlias = resolve(workspaceDir)
|
|
120
|
+
const workspaceReal = realpathSync(workspaceAlias)
|
|
121
|
+
const candidate = resolve(filePath)
|
|
122
|
+
|
|
123
|
+
if (existsSync(candidate)) return realpathSync(candidate)
|
|
124
|
+
|
|
125
|
+
if (candidate === workspaceAlias || candidate.startsWith(workspaceAlias + "/")) {
|
|
126
|
+
return join(workspaceReal, relative(workspaceAlias, candidate))
|
|
127
|
+
}
|
|
128
|
+
|
|
129
|
+
return candidate
|
|
130
|
+
}
|
|
131
|
+
|
|
132
|
+
function workspaceRelative(filePath: string, workspaceDir: string): string {
|
|
133
|
+
const resolvedWorkspace = realpathSync(resolve(workspaceDir))
|
|
134
|
+
const resolvedFile = normalizeWorkspaceChild(filePath, workspaceDir)
|
|
135
|
+
return relative(resolvedWorkspace, resolvedFile).replace(/\\/g, "/")
|
|
136
|
+
}
|
|
137
|
+
|
|
138
|
+
function buildFingerprint(filePath: string): string {
|
|
139
|
+
const stat = statSync(filePath)
|
|
140
|
+
return createHash("sha1")
|
|
141
|
+
.update(`${resolve(filePath)}:${stat.mtimeMs}:${stat.size}`)
|
|
142
|
+
.digest("hex")
|
|
143
|
+
}
|
|
144
|
+
|
|
145
|
+
function writeCachedBuffer(targetPath: string, buf: Uint8Array): void {
|
|
146
|
+
mkdirSync(dirname(targetPath), { recursive: true })
|
|
147
|
+
writeFileSync(targetPath, new Uint8Array(buf))
|
|
148
|
+
}
|
|
149
|
+
|
|
150
|
+
function materialPath(cacheDir: string, workspaceDir: string, ...segments: string[]): string {
|
|
151
|
+
return workspaceRelative(join(cacheDir, ...segments), workspaceDir)
|
|
152
|
+
}
|
|
153
|
+
|
|
154
|
+
// Parse a zip entry as XML. Returns null when the entry is absent. The return
// type is `any` because documents are traversed via the loose xml* helpers
// below rather than full DOM typings.
function parseXml(files: Record<string, Uint8Array>, path: string): any | null {
  const file = files[path]
  if (!file) return null
  return new DOMParser().parseFromString(new TextDecoder().decode(file), "text/xml")
}
|
|
159
|
+
|
|
160
|
+
function xmlLocalName(node: any): string {
|
|
161
|
+
return node?.localName ?? String(node?.nodeName ?? "").split(":").pop() ?? ""
|
|
162
|
+
}
|
|
163
|
+
|
|
164
|
+
function xmlElementChildren(node: any): any[] {
|
|
165
|
+
const children: any[] = []
|
|
166
|
+
const childNodes = node?.childNodes ?? []
|
|
167
|
+
for (let i = 0; i < childNodes.length; i++) {
|
|
168
|
+
const child = childNodes[i]
|
|
169
|
+
if (child?.nodeType === 1) children.push(child)
|
|
170
|
+
}
|
|
171
|
+
return children
|
|
172
|
+
}
|
|
173
|
+
|
|
174
|
+
function xmlDescendantsByLocalName(node: any, name: string): any[] {
|
|
175
|
+
const matches: any[] = []
|
|
176
|
+
const walk = (current: any) => {
|
|
177
|
+
for (const child of xmlElementChildren(current)) {
|
|
178
|
+
if (xmlLocalName(child) === name) matches.push(child)
|
|
179
|
+
walk(child)
|
|
180
|
+
}
|
|
181
|
+
}
|
|
182
|
+
walk(node)
|
|
183
|
+
return matches
|
|
184
|
+
}
|
|
185
|
+
|
|
186
|
+
function firstDescendantByLocalName(node: any, name: string): any | null {
|
|
187
|
+
const [match] = xmlDescendantsByLocalName(node, name)
|
|
188
|
+
return match ?? null
|
|
189
|
+
}
|
|
190
|
+
|
|
191
|
+
function extractShapeText(node: any): string | undefined {
|
|
192
|
+
const texts = xmlDescendantsByLocalName(node, "t")
|
|
193
|
+
.map((textNode) => textNode.textContent?.trim())
|
|
194
|
+
.filter(Boolean)
|
|
195
|
+
return texts.length > 0 ? texts.join("\n") : undefined
|
|
196
|
+
}
|
|
197
|
+
|
|
198
|
+
function extractElementName(node: any): string | undefined {
|
|
199
|
+
return firstDescendantByLocalName(node, "cNvPr")?.getAttribute?.("name") || undefined
|
|
200
|
+
}
|
|
201
|
+
|
|
202
|
+
function parseCoordinate(value: string | null | undefined): number | undefined {
|
|
203
|
+
if (value == null || value === "") return undefined
|
|
204
|
+
const parsed = Number(value)
|
|
205
|
+
return Number.isFinite(parsed) ? parsed : undefined
|
|
206
|
+
}
|
|
207
|
+
|
|
208
|
+
function extractElementBBox(node: any): { x: number; y: number; w: number; h: number } | undefined {
|
|
209
|
+
const xfrm = firstDescendantByLocalName(node, "xfrm")
|
|
210
|
+
if (!xfrm) return undefined
|
|
211
|
+
|
|
212
|
+
const off = firstDescendantByLocalName(xfrm, "off")
|
|
213
|
+
const ext = firstDescendantByLocalName(xfrm, "ext")
|
|
214
|
+
if (!off || !ext) return undefined
|
|
215
|
+
|
|
216
|
+
const x = parseCoordinate(off.getAttribute?.("x"))
|
|
217
|
+
const y = parseCoordinate(off.getAttribute?.("y"))
|
|
218
|
+
const w = parseCoordinate(ext.getAttribute?.("cx"))
|
|
219
|
+
const h = parseCoordinate(ext.getAttribute?.("cy"))
|
|
220
|
+
if ([x, y, w, h].some((value) => value == null)) return undefined
|
|
221
|
+
|
|
222
|
+
return { x: x!, y: y!, w: w!, h: h! }
|
|
223
|
+
}
|
|
224
|
+
|
|
225
|
+
// Slide canvas size in EMUs from ppt/presentation.xml (<p:sldSz cx cy>).
// Returns undefined when the presentation part or size element is missing.
// NOTE: parseXml may return null here; firstDescendantByLocalName tolerates a
// null root (its child walk yields nothing), so no extra guard is needed.
function getPptxSlideSize(files: Record<string, Uint8Array>): { width: number; height: number } | undefined {
  const doc = parseXml(files, "ppt/presentation.xml")
  const size = firstDescendantByLocalName(doc, "sldSz")
  if (!size) return undefined

  const width = parseCoordinate(size.getAttribute?.("cx"))
  const height = parseCoordinate(size.getAttribute?.("cy"))
  if (width == null || height == null) return undefined
  return { width, height }
}
|
|
235
|
+
|
|
236
|
+
function isNearCorner(
|
|
237
|
+
bbox: { x: number; y: number; w: number; h: number },
|
|
238
|
+
slideWidth: number,
|
|
239
|
+
slideHeight: number,
|
|
240
|
+
): boolean {
|
|
241
|
+
const thresholdX = slideWidth * 0.12
|
|
242
|
+
const thresholdY = slideHeight * 0.12
|
|
243
|
+
const right = bbox.x + bbox.w
|
|
244
|
+
const bottom = bbox.y + bbox.h
|
|
245
|
+
return (
|
|
246
|
+
(bbox.x <= thresholdX && bbox.y <= thresholdY) ||
|
|
247
|
+
(right >= slideWidth - thresholdX && bbox.y <= thresholdY) ||
|
|
248
|
+
(bbox.x <= thresholdX && bottom >= slideHeight - thresholdY) ||
|
|
249
|
+
(right >= slideWidth - thresholdX && bottom >= slideHeight - thresholdY)
|
|
250
|
+
)
|
|
251
|
+
}
|
|
252
|
+
|
|
253
|
+
/**
 * Annotate slide elements with layout heuristics (background, hero, logo,
 * overlay, decoration) based on the element's area relative to the slide
 * canvas, corner proximity, and keywords in its media path / shape name.
 * No-op when the slide size is unknown. Replaces `slide.elements` in place
 * and returns the same slide object.
 */
function applyPptxHeuristics(
  slide: PptxSlide,
  slideWidth: number | undefined,
  slideHeight: number | undefined,
): PptxSlide {
  if (!slideWidth || !slideHeight) return slide

  const slideArea = slideWidth * slideHeight
  slide.elements = slide.elements.map((element) => {
    if (!element.bbox) return element

    const areaRatio = (element.bbox.w * element.bbox.h) / slideArea
    // Keyword matching looks at both the media source path and the shape name.
    const sourceName = `${element.source_ref ?? ""} ${element.name ?? ""}`.toLowerCase()

    if (element.kind === "image") {
      const flags: Partial<PptxSlideElement> = {}
      // Only kept (exported) assets can be background/hero candidates; the
      // two branches are mutually exclusive with background taking priority.
      if (areaRatio >= 0.75 && element.asset_status === "kept") flags.likelyBackground = true
      else if (areaRatio >= 0.2 && element.asset_status === "kept") flags.likelyHeroImage = true
      // Small images pinned near a corner are likely logos.
      if (areaRatio <= 0.03 && isNearCorner(element.bbox, slideWidth, slideHeight)) flags.likelyLogo = true
      if (/(logo|brand)/.test(sourceName)) flags.likelyLogo = true
      if (/(mask|overlay|shadow)/.test(sourceName) || element.asset_status === "skipped") flags.likelyOverlayMask = true
      if (/(arrow|ornament|decoration)/.test(sourceName)) flags.likelyDecoration = true
      return Object.keys(flags).length > 0 ? { ...element, ...flags } : element
    }

    if (element.kind === "shape") {
      const flags: Partial<PptxSlideElement> = {}
      // Large shapes often act as full-bleed color overlays/masks.
      if (areaRatio >= 0.4) flags.likelyOverlayMask = true
      if (areaRatio <= 0.03 || /(arrow|ornament|decoration)/.test(sourceName)) flags.likelyDecoration = true
      return Object.keys(flags).length > 0 ? { ...element, ...flags } : element
    }

    return element
  })

  return slide
}
|
|
290
|
+
|
|
291
|
+
/**
 * Map a slide's relationship ids (r:embed values) to normalized ppt/media/*
 * entry names, read from the slide's .rels part. Non-media relationships are
 * ignored. Returns an empty map when the .rels part is missing.
 */
function getSlideMediaTargets(files: Record<string, Uint8Array>, slidePath: string): Map<string, string> {
  // e.g. ppt/slides/slide1.xml -> ppt/slides/_rels/slide1.xml.rels
  const relPath = slidePath.replace("/slides/", "/slides/_rels/") + ".rels"
  const doc = parseXml(files, relPath)
  const targets = new Map<string, string>()
  if (!doc) return targets

  const relationships = doc.getElementsByTagName("Relationship")
  for (let i = 0; i < relationships.length; i++) {
    const rel = relationships[i]
    const id = rel.getAttribute("Id")
    const target = rel.getAttribute("Target")
    if (!id || !target) continue
    // Targets are relative to the slide part; normalize against it.
    const normalized = normalizeZipTarget(slidePath, target)
    if (!normalized.startsWith("ppt/media/")) continue
    targets.set(id, normalized)
  }

  return targets
}
|
|
310
|
+
|
|
311
|
+
function extractPptxSlides(
|
|
312
|
+
files: Record<string, Uint8Array>,
|
|
313
|
+
images: DocumentMaterial[],
|
|
314
|
+
skippedAssets: SkippedAsset[],
|
|
315
|
+
): PptxSlide[] {
|
|
316
|
+
const slideSize = getPptxSlideSize(files)
|
|
317
|
+
const keptBySource = new Map(images.map((image) => [image.source_ref, image]))
|
|
318
|
+
const skippedBySource = new Map(skippedAssets.map((asset) => [asset.source_ref, asset]))
|
|
319
|
+
const slideFiles = Object.keys(files)
|
|
320
|
+
.filter((file) => /^ppt\/slides\/slide\d+\.xml$/.test(file))
|
|
321
|
+
.sort((a, b) => a.localeCompare(b, undefined, { numeric: true }))
|
|
322
|
+
|
|
323
|
+
return slideFiles.map((slidePath) => {
|
|
324
|
+
const slideNumber = slidePath.match(/slide(\d+)\.xml$/)?.[1] ?? "0"
|
|
325
|
+
const slideId = `slide-${slideNumber.padStart(2, "0")}`
|
|
326
|
+
const doc = parseXml(files, slidePath)
|
|
327
|
+
const mediaTargets = getSlideMediaTargets(files, slidePath)
|
|
328
|
+
const elements: PptxSlideElement[] = []
|
|
329
|
+
|
|
330
|
+
if (!doc) return { slide: slideId, ...(slideSize ?? {}), elements }
|
|
331
|
+
|
|
332
|
+
const spTree = firstDescendantByLocalName(doc, "spTree")
|
|
333
|
+
if (!spTree) return { slide: slideId, ...(slideSize ?? {}), elements }
|
|
334
|
+
|
|
335
|
+
for (const node of xmlElementChildren(spTree)) {
|
|
336
|
+
const kind = xmlLocalName(node)
|
|
337
|
+
if (kind === "nvGrpSpPr" || kind === "grpSpPr") continue
|
|
338
|
+
|
|
339
|
+
const zOrder = elements.length + 1
|
|
340
|
+
const id = `${slideId}-element-${String(zOrder).padStart(2, "0")}`
|
|
341
|
+
const name = extractElementName(node)
|
|
342
|
+
const bbox = extractElementBBox(node)
|
|
343
|
+
|
|
344
|
+
if (kind === "sp") {
|
|
345
|
+
const text = extractShapeText(node)
|
|
346
|
+
elements.push(text
|
|
347
|
+
? { id, kind: "text", zOrder, text, ...(bbox ? { bbox } : {}), ...(name ? { name } : {}) }
|
|
348
|
+
: { id, kind: "shape", zOrder, ...(bbox ? { bbox } : {}), ...(name ? { name } : {}) })
|
|
349
|
+
continue
|
|
350
|
+
}
|
|
351
|
+
|
|
352
|
+
if (kind === "pic") {
|
|
353
|
+
const blip = firstDescendantByLocalName(node, "blip")
|
|
354
|
+
const rid = blip?.getAttribute?.("r:embed") || blip?.getAttribute?.("embed") || undefined
|
|
355
|
+
const sourceRef = rid ? mediaTargets.get(rid) : undefined
|
|
356
|
+
const kept = sourceRef ? keptBySource.get(sourceRef) : undefined
|
|
357
|
+
const skipped = sourceRef ? skippedBySource.get(sourceRef) : undefined
|
|
358
|
+
|
|
359
|
+
elements.push({
|
|
360
|
+
id,
|
|
361
|
+
kind: "image",
|
|
362
|
+
zOrder,
|
|
363
|
+
...(bbox ? { bbox } : {}),
|
|
364
|
+
...(name ? { name } : {}),
|
|
365
|
+
...(sourceRef ? { source_ref: sourceRef } : {}),
|
|
366
|
+
...(kept?.path ? { path: kept.path } : {}),
|
|
367
|
+
...((kept || skipped) ? { asset_status: kept ? "kept" as const : "skipped" as const } : {}),
|
|
368
|
+
})
|
|
369
|
+
continue
|
|
370
|
+
}
|
|
371
|
+
|
|
372
|
+
if (kind === "cxnSp" || kind === "graphicFrame" || kind === "grpSp") {
|
|
373
|
+
elements.push({ id, kind: "shape", zOrder, ...(bbox ? { bbox } : {}), ...(name ? { name } : {}) })
|
|
374
|
+
}
|
|
375
|
+
}
|
|
376
|
+
|
|
377
|
+
return applyPptxHeuristics({ slide: slideId, ...(slideSize ?? {}), elements }, slideSize?.width, slideSize?.height)
|
|
378
|
+
})
|
|
379
|
+
}
|
|
380
|
+
|
|
381
|
+
const LOW_VALUE_PPTX_ASSET = /(icon|logo|mask|overlay|shadow|decoration|ornament|arrow)/i
|
|
382
|
+
|
|
383
|
+
function classifySkippedAsset(sourceRef: string, reason: SkippedAsset["reason"]): SkippedAsset["kind"] | undefined {
|
|
384
|
+
if (sourceRef.endsWith(".svg")) return "svg"
|
|
385
|
+
if (/icon/i.test(sourceRef)) return "icon"
|
|
386
|
+
if (/logo/i.test(sourceRef)) return "logo"
|
|
387
|
+
if (/(mask|overlay|shadow)/i.test(sourceRef)) return "overlay"
|
|
388
|
+
if (/(decoration|ornament|arrow)/i.test(sourceRef)) return "decoration"
|
|
389
|
+
if (reason === "svg_asset") return "svg"
|
|
390
|
+
return undefined
|
|
391
|
+
}
|
|
392
|
+
|
|
393
|
+
function shouldSkipPptxAsset(sourceRef: string): { reason: SkippedAsset["reason"]; kind?: SkippedAsset["kind"] } | null {
|
|
394
|
+
if (sourceRef.endsWith(".svg")) {
|
|
395
|
+
return { reason: "svg_asset", kind: "svg" }
|
|
396
|
+
}
|
|
397
|
+
if (LOW_VALUE_PPTX_ASSET.test(basename(sourceRef))) {
|
|
398
|
+
return { reason: "low_value_asset", kind: classifySkippedAsset(sourceRef, "low_value_asset") }
|
|
399
|
+
}
|
|
400
|
+
return null
|
|
401
|
+
}
|
|
402
|
+
|
|
403
|
+
function extractPptxImages(files: Record<string, Uint8Array>, cacheDir: string, workspaceDir: string): PptxImageExtraction {
|
|
404
|
+
const relFiles = Object.keys(files)
|
|
405
|
+
.filter((file) => /^ppt\/slides\/_rels\/slide\d+\.xml\.rels$/.test(file))
|
|
406
|
+
.sort((a, b) => a.localeCompare(b, undefined, { numeric: true }))
|
|
407
|
+
|
|
408
|
+
const images: DocumentMaterial[] = []
|
|
409
|
+
const skipped_assets: SkippedAsset[] = []
|
|
410
|
+
const seenTargets = new Set<string>()
|
|
411
|
+
|
|
412
|
+
for (const relPath of relFiles) {
|
|
413
|
+
const slideMatch = relPath.match(/slide(\d+)\.xml\.rels$/)
|
|
414
|
+
const slideNumber = slideMatch?.[1] ?? "0"
|
|
415
|
+
const slidePath = relPath.replace("/_rels/", "/").replace(/\.rels$/, "")
|
|
416
|
+
const doc = parseXml(files, relPath)
|
|
417
|
+
if (!doc) continue
|
|
418
|
+
const relationships = doc.getElementsByTagName("Relationship")
|
|
419
|
+
let imageIndex = 0
|
|
420
|
+
|
|
421
|
+
for (let i = 0; i < relationships.length; i++) {
|
|
422
|
+
const rel = relationships[i]
|
|
423
|
+
const target = rel.getAttribute("Target")
|
|
424
|
+
if (!target) continue
|
|
425
|
+
const normalized = normalizeZipTarget(slidePath, target)
|
|
426
|
+
if (!normalized.startsWith("ppt/media/")) continue
|
|
427
|
+
const media = files[normalized]
|
|
428
|
+
if (!media) continue
|
|
429
|
+
|
|
430
|
+
seenTargets.add(normalized)
|
|
431
|
+
const skipped = shouldSkipPptxAsset(normalized)
|
|
432
|
+
if (skipped) {
|
|
433
|
+
skipped_assets.push({
|
|
434
|
+
source_ref: normalized,
|
|
435
|
+
page_or_slide: `slide-${slideNumber.padStart(2, "0")}`,
|
|
436
|
+
reason: skipped.reason,
|
|
437
|
+
kind: skipped.kind,
|
|
438
|
+
})
|
|
439
|
+
continue
|
|
440
|
+
}
|
|
441
|
+
|
|
442
|
+
imageIndex += 1
|
|
443
|
+
const exportedName = `slide-${slideNumber.padStart(2, "0")}-image-${String(imageIndex).padStart(2, "0")}${extname(normalized)}`
|
|
444
|
+
const outputPath = join(cacheDir, "images", exportedName)
|
|
445
|
+
writeCachedBuffer(outputPath, media)
|
|
446
|
+
|
|
447
|
+
images.push({
|
|
448
|
+
path: materialPath(cacheDir, workspaceDir, "images", exportedName),
|
|
449
|
+
source_ref: normalized,
|
|
450
|
+
page_or_slide: `slide-${slideNumber.padStart(2, "0")}`,
|
|
451
|
+
})
|
|
452
|
+
}
|
|
453
|
+
}
|
|
454
|
+
|
|
455
|
+
const remainingMedia = Object.keys(files)
|
|
456
|
+
.filter((file) => file.startsWith("ppt/media/") && !seenTargets.has(file))
|
|
457
|
+
.sort()
|
|
458
|
+
|
|
459
|
+
for (const mediaPath of remainingMedia) {
|
|
460
|
+
skipped_assets.push({
|
|
461
|
+
source_ref: mediaPath,
|
|
462
|
+
reason: "unmapped_media",
|
|
463
|
+
kind: classifySkippedAsset(mediaPath, "unmapped_media"),
|
|
464
|
+
})
|
|
465
|
+
}
|
|
466
|
+
|
|
467
|
+
return { images, skipped_assets }
|
|
468
|
+
}
|
|
469
|
+
|
|
470
|
+
function extractDocxImages(files: Record<string, Uint8Array>, cacheDir: string, workspaceDir: string): DocumentMaterial[] {
|
|
471
|
+
return Object.keys(files)
|
|
472
|
+
.filter((file) => file.startsWith("word/media/"))
|
|
473
|
+
.sort()
|
|
474
|
+
.map((mediaPath, index) => {
|
|
475
|
+
const exportedName = `document-image-${String(index + 1).padStart(2, "0")}${extname(mediaPath)}`
|
|
476
|
+
const outputPath = join(cacheDir, "images", exportedName)
|
|
477
|
+
writeCachedBuffer(outputPath, files[mediaPath])
|
|
478
|
+
|
|
479
|
+
return {
|
|
480
|
+
path: materialPath(cacheDir, workspaceDir, "images", exportedName),
|
|
481
|
+
source_ref: mediaPath,
|
|
482
|
+
note: "Document-wide association",
|
|
483
|
+
}
|
|
484
|
+
})
|
|
485
|
+
}
|
|
486
|
+
|
|
487
|
+
function extractXlsxImages(files: Record<string, Uint8Array>, cacheDir: string, workspaceDir: string): DocumentMaterial[] {
|
|
488
|
+
const drawingToImages = new Map<string, string[]>()
|
|
489
|
+
const drawingRelFiles = Object.keys(files)
|
|
490
|
+
.filter((file) => /^xl\/drawings\/_rels\/drawing\d+\.xml\.rels$/.test(file))
|
|
491
|
+
.sort((a, b) => a.localeCompare(b, undefined, { numeric: true }))
|
|
492
|
+
|
|
493
|
+
for (const relPath of drawingRelFiles) {
|
|
494
|
+
const relDoc = parseXml(files, relPath)
|
|
495
|
+
if (!relDoc) continue
|
|
496
|
+
const drawingPath = relPath.replace("/_rels/", "/").replace(/\.rels$/, "")
|
|
497
|
+
const drawingDoc = parseXml(files, drawingPath)
|
|
498
|
+
if (!drawingDoc) continue
|
|
499
|
+
|
|
500
|
+
const targetByRid = new Map<string, string>()
|
|
501
|
+
const relationships = relDoc.getElementsByTagName("Relationship")
|
|
502
|
+
for (let i = 0; i < relationships.length; i++) {
|
|
503
|
+
const rel = relationships[i]
|
|
504
|
+
const id = rel.getAttribute("Id")
|
|
505
|
+
const target = rel.getAttribute("Target")
|
|
506
|
+
if (!id || !target) continue
|
|
507
|
+
const normalized = normalizeZipTarget(drawingPath, target)
|
|
508
|
+
if (normalized.startsWith("xl/media/")) {
|
|
509
|
+
targetByRid.set(id, normalized)
|
|
510
|
+
}
|
|
511
|
+
}
|
|
512
|
+
|
|
513
|
+
const blips = drawingDoc.getElementsByTagName("a:blip")
|
|
514
|
+
const mediaPaths: string[] = []
|
|
515
|
+
for (let i = 0; i < blips.length; i++) {
|
|
516
|
+
const rid = blips[i].getAttribute("r:embed") || blips[i].getAttribute("embed")
|
|
517
|
+
if (!rid) continue
|
|
518
|
+
const mediaPath = targetByRid.get(rid)
|
|
519
|
+
if (mediaPath) mediaPaths.push(mediaPath)
|
|
520
|
+
}
|
|
521
|
+
|
|
522
|
+
if (mediaPaths.length > 0) {
|
|
523
|
+
drawingToImages.set(drawingPath, mediaPaths)
|
|
524
|
+
}
|
|
525
|
+
}
|
|
526
|
+
|
|
527
|
+
const images: DocumentMaterial[] = []
|
|
528
|
+
const exportedMedia = new Set<string>()
|
|
529
|
+
const sheetRelFiles = Object.keys(files)
|
|
530
|
+
.filter((file) => /^xl\/worksheets\/_rels\/sheet\d+\.xml\.rels$/.test(file))
|
|
531
|
+
.sort((a, b) => a.localeCompare(b, undefined, { numeric: true }))
|
|
532
|
+
|
|
533
|
+
for (const relPath of sheetRelFiles) {
|
|
534
|
+
const sheetMatch = relPath.match(/sheet(\d+)\.xml\.rels$/)
|
|
535
|
+
const sheetNumber = sheetMatch?.[1] ?? "0"
|
|
536
|
+
const sheetPath = relPath.replace("/_rels/", "/").replace(/\.rels$/, "")
|
|
537
|
+
const relDoc = parseXml(files, relPath)
|
|
538
|
+
if (!relDoc) continue
|
|
539
|
+
const relationships = relDoc.getElementsByTagName("Relationship")
|
|
540
|
+
let imageIndex = 0
|
|
541
|
+
|
|
542
|
+
for (let i = 0; i < relationships.length; i++) {
|
|
543
|
+
const rel = relationships[i]
|
|
544
|
+
const target = rel.getAttribute("Target")
|
|
545
|
+
if (!target) continue
|
|
546
|
+
const normalized = normalizeZipTarget(sheetPath, target)
|
|
547
|
+
const mediaPaths = drawingToImages.get(normalized)
|
|
548
|
+
if (!mediaPaths) continue
|
|
549
|
+
|
|
550
|
+
for (const mediaPath of mediaPaths) {
|
|
551
|
+
const media = files[mediaPath]
|
|
552
|
+
if (!media) continue
|
|
553
|
+
imageIndex += 1
|
|
554
|
+
exportedMedia.add(mediaPath)
|
|
555
|
+
const exportedName = `sheet-${sheetNumber.padStart(2, "0")}-image-${String(imageIndex).padStart(2, "0")}${extname(mediaPath)}`
|
|
556
|
+
const outputPath = join(cacheDir, "images", exportedName)
|
|
557
|
+
writeCachedBuffer(outputPath, media)
|
|
558
|
+
|
|
559
|
+
images.push({
|
|
560
|
+
path: materialPath(cacheDir, workspaceDir, "images", exportedName),
|
|
561
|
+
source_ref: mediaPath,
|
|
562
|
+
page_or_slide: `sheet-${sheetNumber.padStart(2, "0")}`,
|
|
563
|
+
})
|
|
564
|
+
}
|
|
565
|
+
}
|
|
566
|
+
}
|
|
567
|
+
|
|
568
|
+
const unmapped = Object.keys(files)
|
|
569
|
+
.filter((file) => file.startsWith("xl/media/") && !exportedMedia.has(file))
|
|
570
|
+
.sort()
|
|
571
|
+
|
|
572
|
+
for (const mediaPath of unmapped) {
|
|
573
|
+
const exportedName = `unmapped-${basename(mediaPath)}`
|
|
574
|
+
const outputPath = join(cacheDir, "images", exportedName)
|
|
575
|
+
writeCachedBuffer(outputPath, files[mediaPath])
|
|
576
|
+
|
|
577
|
+
images.push({
|
|
578
|
+
path: materialPath(cacheDir, workspaceDir, "images", exportedName),
|
|
579
|
+
source_ref: mediaPath,
|
|
580
|
+
note: "No sheet-level relationship found",
|
|
581
|
+
})
|
|
582
|
+
}
|
|
583
|
+
|
|
584
|
+
return images
|
|
585
|
+
}
|
|
586
|
+
|
|
587
|
+
function extractTables(type: SupportedType, textPath: string): DocumentMaterial[] {
|
|
588
|
+
if (type !== "xlsx") return []
|
|
589
|
+
return [{ path: textPath, source_ref: "workbook", note: "Sheet text and tables extracted to text file" }]
|
|
590
|
+
}
|
|
591
|
+
|
|
592
|
+
async function processOfficeFile(filePath: string, workspaceDir: string, type: SupportedType): Promise<DocumentMaterialsResult> {
|
|
593
|
+
const relativeSource = workspaceRelative(filePath, workspaceDir)
|
|
594
|
+
const fingerprint = buildFingerprint(filePath)
|
|
595
|
+
const cacheDir = join(workspaceDir, ".opencode", "revela", "doc-materials", fingerprint)
|
|
596
|
+
const manifestPath = join(cacheDir, "manifest.json")
|
|
597
|
+
|
|
598
|
+
if (existsSync(manifestPath)) {
|
|
599
|
+
const manifest = JSON.parse(readFileSync(manifestPath, "utf-8")) as CachedManifest
|
|
600
|
+
return {
|
|
601
|
+
status: "processed",
|
|
602
|
+
source: manifest.source,
|
|
603
|
+
type: manifest.type,
|
|
604
|
+
cache_dir: manifest.cache_dir,
|
|
605
|
+
manifest_path: manifest.manifest_path,
|
|
606
|
+
text_path: manifest.text_path,
|
|
607
|
+
images: manifest.images,
|
|
608
|
+
skipped_assets: manifest.skipped_assets,
|
|
609
|
+
slides: manifest.slides,
|
|
610
|
+
tables: manifest.tables,
|
|
611
|
+
}
|
|
612
|
+
}
|
|
613
|
+
|
|
614
|
+
mkdirSync(join(cacheDir, "images"), { recursive: true })
|
|
615
|
+
mkdirSync(join(cacheDir, "tables"), { recursive: true })
|
|
616
|
+
|
|
617
|
+
const buf = readFileSync(filePath)
|
|
618
|
+
const files = unzipSync(new Uint8Array(buf))
|
|
619
|
+
|
|
620
|
+
const text = type === "pptx"
|
|
621
|
+
? await extractPptx(buf)
|
|
622
|
+
: type === "docx"
|
|
623
|
+
? await extractDocx(buf)
|
|
624
|
+
: await extractXlsx(buf)
|
|
625
|
+
|
|
626
|
+
const textPath = join(cacheDir, "text.txt")
|
|
627
|
+
writeFileSync(textPath, `[Extracted from: ${basename(filePath)}]\n\n${text}`, "utf-8")
|
|
628
|
+
|
|
629
|
+
const pptxAssets = type === "pptx"
|
|
630
|
+
? extractPptxImages(files, cacheDir, workspaceDir)
|
|
631
|
+
: null
|
|
632
|
+
const images = type === "pptx"
|
|
633
|
+
? pptxAssets!.images
|
|
634
|
+
: type === "docx"
|
|
635
|
+
? extractDocxImages(files, cacheDir, workspaceDir)
|
|
636
|
+
: extractXlsxImages(files, cacheDir, workspaceDir)
|
|
637
|
+
const slides = type === "pptx"
|
|
638
|
+
? extractPptxSlides(files, images, pptxAssets!.skipped_assets)
|
|
639
|
+
: undefined
|
|
640
|
+
|
|
641
|
+
const result: DocumentMaterialsResult = {
|
|
642
|
+
status: "processed",
|
|
643
|
+
source: relativeSource,
|
|
644
|
+
type,
|
|
645
|
+
cache_dir: workspaceRelative(cacheDir, workspaceDir),
|
|
646
|
+
manifest_path: workspaceRelative(manifestPath, workspaceDir),
|
|
647
|
+
text_path: workspaceRelative(textPath, workspaceDir),
|
|
648
|
+
images,
|
|
649
|
+
skipped_assets: pptxAssets?.skipped_assets ?? [],
|
|
650
|
+
slides,
|
|
651
|
+
tables: extractTables(type, workspaceRelative(textPath, workspaceDir)),
|
|
652
|
+
}
|
|
653
|
+
|
|
654
|
+
const manifest: CachedManifest = {
|
|
655
|
+
source: result.source,
|
|
656
|
+
type,
|
|
657
|
+
fingerprint,
|
|
658
|
+
cache_dir: result.cache_dir!,
|
|
659
|
+
manifest_path: result.manifest_path!,
|
|
660
|
+
text_path: result.text_path!,
|
|
661
|
+
images: result.images ?? [],
|
|
662
|
+
skipped_assets: result.skipped_assets ?? [],
|
|
663
|
+
slides: result.slides ?? [],
|
|
664
|
+
tables: result.tables ?? [],
|
|
665
|
+
}
|
|
666
|
+
|
|
667
|
+
writeFileSync(manifestPath, JSON.stringify(manifest, null, 2), "utf-8")
|
|
668
|
+
return result
|
|
669
|
+
}
|
|
670
|
+
|
|
671
|
+
export async function extractDocumentMaterials(filePath: string, workspaceDir: string): Promise<DocumentMaterialsResult> {
|
|
672
|
+
try {
|
|
673
|
+
const resolvedFile = ensureWorkspacePath(filePath, workspaceDir)
|
|
674
|
+
const relativeSource = workspaceRelative(resolvedFile, workspaceDir)
|
|
675
|
+
const type = SUPPORTED_EXTENSIONS[extname(resolvedFile).toLowerCase()]
|
|
676
|
+
|
|
677
|
+
if (!type) {
|
|
678
|
+
return {
|
|
679
|
+
status: "skipped",
|
|
680
|
+
source: relativeSource,
|
|
681
|
+
type: "other",
|
|
682
|
+
reason: "unsupported_file_type",
|
|
683
|
+
}
|
|
684
|
+
}
|
|
685
|
+
|
|
686
|
+
return await processOfficeFile(resolvedFile, workspaceDir, type)
|
|
687
|
+
} catch (e) {
|
|
688
|
+
return {
|
|
689
|
+
status: "failed",
|
|
690
|
+
source: filePath,
|
|
691
|
+
type: "other",
|
|
692
|
+
reason: e instanceof Error ? e.message : String(e),
|
|
693
|
+
}
|
|
694
|
+
}
|
|
695
|
+
}
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
import { basename, extname } from "path"
|
|
2
|
+
export const OFFICE_EXTENSIONS = new Set([".docx", ".pptx", ".xlsx"])
|
|
3
|
+
export const IMAGE_EXTENSIONS = new Set([".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif", ".webp", ".gif"])
|
|
4
|
+
|
|
5
|
+
export type ReadStrategy =
|
|
6
|
+
| "before-materialize-document"
|
|
7
|
+
| "after-extract-text"
|
|
8
|
+
| "after-compress-image"
|
|
9
|
+
| "passthrough"
|
|
10
|
+
|
|
11
|
+
export function classifyReadFile(filePath: string): ReadStrategy {
|
|
12
|
+
const ext = extname(filePath).toLowerCase()
|
|
13
|
+
if (OFFICE_EXTENSIONS.has(ext)) return "before-materialize-document"
|
|
14
|
+
if (ext === ".pdf") return "after-extract-text"
|
|
15
|
+
if (IMAGE_EXTENSIONS.has(ext)) return "after-compress-image"
|
|
16
|
+
return "passthrough"
|
|
17
|
+
}
|
|
18
|
+
|
|
19
|
+
export function formatExtractedText(filePath: string, text: string): string {
|
|
20
|
+
return `[Extracted from: ${basename(filePath)}]\n\n${text}`
|
|
21
|
+
}
|
|
22
|
+
|
|
23
|
+
export function buildOfficeReadView(
|
|
24
|
+
filePath: string,
|
|
25
|
+
text: string,
|
|
26
|
+
images: Array<{ path: string }> | undefined,
|
|
27
|
+
): string {
|
|
28
|
+
const lines = [
|
|
29
|
+
`# Extracted from: ${basename(filePath)}`,
|
|
30
|
+
"",
|
|
31
|
+
"## Text",
|
|
32
|
+
"",
|
|
33
|
+
text.trim() || "No text extracted.",
|
|
34
|
+
]
|
|
35
|
+
|
|
36
|
+
lines.push("", "## Images", "")
|
|
37
|
+
|
|
38
|
+
if (!images?.length) {
|
|
39
|
+
lines.push("- None")
|
|
40
|
+
} else {
|
|
41
|
+
for (const image of images) lines.push(`- ${image.path}`)
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
return lines.join("\n")
|
|
45
|
+
}
|
package/lib/read-hooks/index.ts
CHANGED
|
@@ -4,7 +4,7 @@
|
|
|
4
4
|
* Entry point for the read-hooks module.
|
|
5
5
|
* Exports preRead and postRead for use in plugins/revela.ts hook handlers.
|
|
6
6
|
*
|
|
7
|
-
* preRead → tool.execute.before:
|
|
7
|
+
* preRead → tool.execute.before: materialize Office docs and redirect to temp markdown
|
|
8
8
|
* postRead → tool.execute.after: transform PDF/image attachments before LLM sees them
|
|
9
9
|
*/
|
|
10
10
|
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
import { readFileSync } from "fs"
|
|
2
|
+
import { join } from "path"
|
|
3
|
+
import type { PptxSlide } from "../document-materials/extract"
|
|
4
|
+
import { extractDocumentMaterials } from "../document-materials/extract"
|
|
5
|
+
import { buildOfficeReadView } from "./dispatch"
|
|
6
|
+
import { extractDocx } from "./extractors/docx"
|
|
7
|
+
import { extractPptx } from "./extractors/pptx"
|
|
8
|
+
import { extractXlsx } from "./extractors/xlsx"
|
|
9
|
+
import { formatExtractedText } from "./dispatch"
|
|
10
|
+
|
|
11
|
+
// Lowercased file extension → extractor turning the raw file buffer into text.
// Used by createOfficeReadView both as the "is this supported?" gate and as
// the fallback extraction path when materialization yields no text file.
const HANDLERS: Record<string, (buf: Buffer) => Promise<string>> = {
  ".docx": extractDocx,
  ".pptx": extractPptx,
  ".xlsx": extractXlsx,
}
|
|
16
|
+
|
|
17
|
+
/**
 * Append a "## Slide Structure" markdown section summarizing each slide:
 * counts of text / kept-image / skipped-image / shape elements, plus a
 * second line of heuristic role counts (background, hero, logo, overlay,
 * decoration) when any element carries those flags.
 *
 * Returns "" when there are no slides, so callers can concatenate
 * unconditionally.
 */
function buildPptxStructureHints(slides: PptxSlide[] | undefined): string {
  if (!slides?.length) return ""

  // Leading "" yields a blank separator line before the heading once joined.
  const lines = ["", "## Slide Structure", ""]
  for (const slide of slides) {
    const textCount = slide.elements.filter((element) => element.kind === "text").length
    const keptImageCount = slide.elements.filter((element) => element.kind === "image" && element.asset_status === "kept").length
    const skippedImageCount = slide.elements.filter((element) => element.kind === "image" && element.asset_status === "skipped").length
    const shapeCount = slide.elements.filter((element) => element.kind === "shape").length
    // Only non-zero counts appear; an element-free slide gets an explicit marker.
    const summary = [
      textCount > 0 ? `${textCount} text` : null,
      keptImageCount > 0 ? `${keptImageCount} kept image` : null,
      skippedImageCount > 0 ? `${skippedImageCount} skipped image` : null,
      shapeCount > 0 ? `${shapeCount} shape` : null,
    ].filter(Boolean).join(", ") || "no parsed elements"
    lines.push(`- ${slide.slide}: ${summary}`)

    // Heuristic role flags set during extraction; omitted entirely when none match.
    const roleSummary = [
      countRole(slide, (element) => element.likelyBackground, "background image"),
      countRole(slide, (element) => element.likelyHeroImage, "hero image"),
      countRole(slide, (element) => element.likelyLogo, "logo"),
      countRole(slide, (element) => element.likelyOverlayMask, "overlay"),
      countRole(slide, (element) => element.likelyDecoration, "decoration"),
    ].filter(Boolean).join(", ")
    if (roleSummary) lines.push(`  likely roles: ${roleSummary}`)
  }

  return lines.join("\n")
}

/**
 * Count elements of `slide` matching `predicate` and render "<n> <label>"
 * with a naive plural "s"; returns null when the count is zero so callers
 * can filter it out of joined summaries.
 */
function countRole(
  slide: PptxSlide,
  predicate: (element: PptxSlide["elements"][number]) => boolean | undefined,
  label: string,
): string | null {
  const count = slide.elements.filter(predicate).length
  if (count === 0) return null
  return `${count} ${label}${count === 1 ? "" : "s"}`
}
|
|
56
|
+
|
|
57
|
+
/**
 * Build the markdown read view for an Office document (docx/pptx/xlsx).
 *
 * Preferred path: materialize the document via extractDocumentMaterials and
 * render its cached text + image list (plus slide-structure hints for pptx).
 * Fallback path: when materialization did not produce a text file, extract
 * text directly from the raw buffer with the per-extension handler.
 *
 * @param filePath - Path to the Office document on disk.
 * @param workspaceDir - Workspace root; cached text_path is relative to it.
 * @throws Error when the file extension has no registered handler.
 */
export async function createOfficeReadView(filePath: string, workspaceDir: string): Promise<string> {
  // NOTE(review): a path with no "." makes lastIndexOf(".") return -1, so
  // slice(-1) yields the last character rather than an extension — presumably
  // callers gate on extension first (preRead uses classifyReadFile); confirm.
  const ext = filePath.slice(filePath.lastIndexOf(".")).toLowerCase()
  const handler = HANDLERS[ext]
  if (!handler) throw new Error(`unsupported office file type: ${ext}`)

  const materialized = await extractDocumentMaterials(filePath, workspaceDir)

  if (materialized.status === "processed" && materialized.text_path) {
    const textPath = join(workspaceDir, materialized.text_path)
    const extracted = readFileSync(textPath, "utf-8")
    // Strip the "[Extracted from: …]" provenance header the cache writer prepends;
    // buildOfficeReadView adds its own markdown header instead.
    const text = extracted.replace(/^\[Extracted from: .*?\]\n\n/, "")
    const view = buildOfficeReadView(filePath, text, materialized.images)
    // Only pptx carries slide-structure hints (slides is undefined otherwise).
    return filePath.toLowerCase().endsWith(".pptx")
      ? view + buildPptxStructureHints(materialized.slides)
      : view
  }

  // Materialization skipped/failed: fall back to direct in-memory text extraction.
  const buf = readFileSync(filePath)
  const text = await handler(buf)
  return formatExtractedText(filePath, text)
}
|
|
@@ -16,11 +16,10 @@
|
|
|
16
16
|
* of packages/opencode/src/session/prompt.ts.
|
|
17
17
|
*/
|
|
18
18
|
|
|
19
|
-
import {
|
|
19
|
+
import { basename } from "path"
|
|
20
20
|
import { extractPdfText } from "./extractors/pdf"
|
|
21
21
|
import { compressImage } from "./image/compress"
|
|
22
|
-
|
|
23
|
-
const IMAGE_EXTS = new Set([".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif", ".webp", ".gif"])
|
|
22
|
+
import { classifyReadFile, formatExtractedText } from "./dispatch"
|
|
24
23
|
|
|
25
24
|
interface ReadOutput {
|
|
26
25
|
title: string
|
|
@@ -41,10 +40,10 @@ export async function postRead(
|
|
|
41
40
|
): Promise<void> {
|
|
42
41
|
if (!output.attachments?.length) return
|
|
43
42
|
|
|
44
|
-
const
|
|
43
|
+
const strategy = classifyReadFile(args.filePath)
|
|
45
44
|
|
|
46
45
|
// ── PDF: extract text, drop base64 attachment ───────────────────────────
|
|
47
|
-
if (
|
|
46
|
+
if (strategy === "after-extract-text") {
|
|
48
47
|
const attachment = output.attachments[0]
|
|
49
48
|
const base64 = attachment.url.split(",")[1]
|
|
50
49
|
if (!base64) return
|
|
@@ -52,14 +51,14 @@ export async function postRead(
|
|
|
52
51
|
const buf = Buffer.from(base64, "base64")
|
|
53
52
|
const text = await extractPdfText(buf)
|
|
54
53
|
|
|
55
|
-
output.output =
|
|
54
|
+
output.output = formatExtractedText(args.filePath, text)
|
|
56
55
|
output.title = `Extracted text from ${basename(args.filePath)}`
|
|
57
56
|
output.attachments.length = 0 // Remove base64 — saves significant tokens
|
|
58
57
|
return
|
|
59
58
|
}
|
|
60
59
|
|
|
61
60
|
// ── Images: compress attachment to reduce token cost ────────────────────
|
|
62
|
-
if (
|
|
61
|
+
if (strategy === "after-compress-image") {
|
|
63
62
|
const attachment = output.attachments[0]
|
|
64
63
|
const base64 = attachment.url.split(",")[1]
|
|
65
64
|
if (!base64) return
|
|
@@ -7,44 +7,33 @@
|
|
|
7
7
|
* Handles DOCX, PPTX, XLSX — formats that cause read tool to throw
|
|
8
8
|
* Effect.fail("Cannot read binary file"), so the after-hook never fires.
|
|
9
9
|
*
|
|
10
|
-
* Strategy:
|
|
10
|
+
* Strategy: materialize the document into cached text + images, render a
|
|
11
|
+
* markdown read view, then redirect args.filePath to that temp .md file.
|
|
11
12
|
* The read tool then reads the temp file normally. LLM is unaware of the redirect.
|
|
12
13
|
*/
|
|
13
14
|
|
|
14
|
-
import {
|
|
15
|
-
import {
|
|
15
|
+
import { writeFileSync } from "fs"
|
|
16
|
+
import { join } from "path"
|
|
16
17
|
import { tmpdir } from "os"
|
|
17
18
|
import { randomUUID } from "crypto"
|
|
18
|
-
import {
|
|
19
|
-
import {
|
|
20
|
-
import { extractXlsx } from "./extractors/xlsx"
|
|
21
|
-
|
|
22
|
-
// Extension → extractor function mapping
|
|
23
|
-
const HANDLERS: Record<string, (buf: Buffer) => Promise<string>> = {
|
|
24
|
-
".docx": extractDocx,
|
|
25
|
-
".pptx": extractPptx,
|
|
26
|
-
".xlsx": extractXlsx,
|
|
27
|
-
}
|
|
19
|
+
import { classifyReadFile } from "./dispatch"
|
|
20
|
+
import { createOfficeReadView } from "./office-read-view"
|
|
28
21
|
|
|
29
22
|
/**
|
|
30
23
|
* Intercept read tool args before execution.
|
|
31
|
-
* If the file is a supported
|
|
32
|
-
* args.filePath to a
|
|
24
|
+
* If the file is a supported Office document, materialize it into cached
|
|
25
|
+
* text + images and redirect args.filePath to a temporary markdown read view.
|
|
33
26
|
*
|
|
34
27
|
* @param args - Mutable read tool args object (from output.args in before-hook)
|
|
35
28
|
*/
|
|
36
29
|
export async function preRead(args: { filePath: string; [k: string]: any }): Promise<void> {
|
|
37
|
-
|
|
38
|
-
const handler = HANDLERS[ext]
|
|
39
|
-
if (!handler) return // Not a handled format — let read tool proceed normally
|
|
30
|
+
if (classifyReadFile(args.filePath) !== "before-materialize-document") return
|
|
40
31
|
|
|
41
|
-
const
|
|
42
|
-
const
|
|
32
|
+
const workspaceDir = process.cwd()
|
|
33
|
+
const output = await createOfficeReadView(args.filePath, workspaceDir)
|
|
43
34
|
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
const tmpPath = join(tmpdir(), `revela-${randomUUID()}.txt`)
|
|
47
|
-
writeFileSync(tmpPath, header + text, "utf-8")
|
|
35
|
+
const tmpPath = join(tmpdir(), `revela-${randomUUID()}.md`)
|
|
36
|
+
writeFileSync(tmpPath, output, "utf-8")
|
|
48
37
|
|
|
49
38
|
// Redirect read tool to the temp file
|
|
50
39
|
args.filePath = tmpPath
|
package/package.json
CHANGED
package/plugin.ts
CHANGED
|
@@ -25,10 +25,9 @@ import { ACTIVE_PROMPT_FILE } from "./lib/config"
|
|
|
25
25
|
import { ctx } from "./lib/ctx"
|
|
26
26
|
import { preRead } from "./lib/read-hooks"
|
|
27
27
|
import { postRead } from "./lib/read-hooks"
|
|
28
|
-
import { extractDocx } from "./lib/read-hooks/extractors/docx"
|
|
29
|
-
import { extractPptx } from "./lib/read-hooks/extractors/pptx"
|
|
30
|
-
import { extractXlsx } from "./lib/read-hooks/extractors/xlsx"
|
|
31
28
|
import { extractPdfText } from "./lib/read-hooks/extractors/pdf"
|
|
29
|
+
import { createOfficeReadView } from "./lib/read-hooks/office-read-view"
|
|
30
|
+
import { OFFICE_EXTENSIONS, IMAGE_EXTENSIONS, formatExtractedText } from "./lib/read-hooks/dispatch"
|
|
32
31
|
import { handleHelp } from "./lib/commands/help"
|
|
33
32
|
import { handleEnable } from "./lib/commands/enable"
|
|
34
33
|
import { handleDisable } from "./lib/commands/disable"
|
|
@@ -50,6 +49,7 @@ import designsTool from "./tools/designs"
|
|
|
50
49
|
import domainsTool from "./tools/domains"
|
|
51
50
|
import researchSaveTool from "./tools/research-save"
|
|
52
51
|
import workspaceScanTool from "./tools/workspace-scan"
|
|
52
|
+
import extractDocumentMaterialsTool from "./tools/extract-document-materials"
|
|
53
53
|
import qaTool from "./tools/qa"
|
|
54
54
|
import { RESEARCH_PROMPT, RESEARCH_AGENT_SIGNATURE } from "./lib/agents/research-prompt"
|
|
55
55
|
import { runQA, formatReport } from "./lib/qa"
|
|
@@ -225,12 +225,13 @@ const server: Plugin = (async (pluginCtx) => {
|
|
|
225
225
|
throw new Error("__REVELA_UNKNOWN_HANDLED__")
|
|
226
226
|
},
|
|
227
227
|
|
|
228
|
-
// ── LLM tools: designs, domains, research, qa
|
|
228
|
+
// ── LLM tools: designs, domains, research, document materials, qa ─────
|
|
229
229
|
tool: {
|
|
230
230
|
"revela-designs": designsTool,
|
|
231
231
|
"revela-domains": domainsTool,
|
|
232
232
|
"revela-research-save": researchSaveTool,
|
|
233
233
|
"revela-workspace-scan": workspaceScanTool,
|
|
234
|
+
"revela-extract-document-materials": extractDocumentMaterialsTool,
|
|
234
235
|
"revela-qa": qaTool,
|
|
235
236
|
},
|
|
236
237
|
|
|
@@ -239,19 +240,11 @@ const server: Plugin = (async (pluginCtx) => {
|
|
|
239
240
|
// directly — the read tool is never called, so tool.execute.before/after
|
|
240
241
|
// hooks don't fire. This hook intercepts FileParts before LLM sees them.
|
|
241
242
|
//
|
|
242
|
-
// DOCX/PPTX/XLSX/PDF → extract text → replace with TextPart
|
|
243
|
+
// DOCX/PPTX/XLSX/PDF → extract text/read view → replace with TextPart
|
|
243
244
|
// Images → replace with TextPart hint (LLM can use read tool)
|
|
244
245
|
"chat.message": async (input, output) => {
|
|
245
246
|
if (!ctx.enabled) return
|
|
246
247
|
|
|
247
|
-
const IMAGE_EXTS = new Set([".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif", ".webp", ".gif"])
|
|
248
|
-
const DOC_HANDLERS: Record<string, (buf: Buffer) => Promise<string>> = {
|
|
249
|
-
".docx": extractDocx,
|
|
250
|
-
".pptx": extractPptx,
|
|
251
|
-
".xlsx": extractXlsx,
|
|
252
|
-
".pdf": extractPdfText,
|
|
253
|
-
}
|
|
254
|
-
|
|
255
248
|
for (let i = 0; i < output.parts.length; i++) {
|
|
256
249
|
const part = output.parts[i] as any
|
|
257
250
|
if (part.type !== "file") continue
|
|
@@ -262,15 +255,22 @@ const server: Plugin = (async (pluginCtx) => {
|
|
|
262
255
|
const name = basename(filePath)
|
|
263
256
|
|
|
264
257
|
try {
|
|
265
|
-
if (
|
|
258
|
+
if (OFFICE_EXTENSIONS.has(ext)) {
|
|
259
|
+
const text = await createOfficeReadView(filePath, process.cwd())
|
|
260
|
+
output.parts[i] = {
|
|
261
|
+
...part,
|
|
262
|
+
type: "text",
|
|
263
|
+
text,
|
|
264
|
+
} as any
|
|
265
|
+
} else if (ext === ".pdf") {
|
|
266
266
|
const buf = readFileSync(filePath)
|
|
267
|
-
const text = await
|
|
267
|
+
const text = await extractPdfText(buf)
|
|
268
268
|
output.parts[i] = {
|
|
269
269
|
...part,
|
|
270
270
|
type: "text",
|
|
271
|
-
text:
|
|
271
|
+
text: formatExtractedText(filePath, text),
|
|
272
272
|
} as any
|
|
273
|
-
} else if (
|
|
273
|
+
} else if (IMAGE_EXTENSIONS.has(ext)) {
|
|
274
274
|
output.parts[i] = {
|
|
275
275
|
...part,
|
|
276
276
|
type: "text",
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
import { tool } from "@opencode-ai/plugin"
|
|
2
|
+
import { extractDocumentMaterials } from "../lib/document-materials/extract"
|
|
3
|
+
|
|
4
|
+
export default tool({
|
|
5
|
+
description:
|
|
6
|
+
"Extract reusable materials from a workspace document into a workspace-local cache. " +
|
|
7
|
+
"Supports pptx, docx, and xlsx. Produces a manifest plus extracted text, embedded images, and available slide/sheet mappings. " +
|
|
8
|
+
"Unsupported file types are skipped instead of failing.",
|
|
9
|
+
args: {
|
|
10
|
+
file: tool.schema
|
|
11
|
+
.string()
|
|
12
|
+
.describe("Document path relative to workspace root. Supports pptx, docx, and xlsx; other file types are skipped."),
|
|
13
|
+
},
|
|
14
|
+
async execute(args, context) {
|
|
15
|
+
const workspaceDir = context.directory ?? process.cwd()
|
|
16
|
+
return JSON.stringify(await extractDocumentMaterials(args.file, workspaceDir), null, 2)
|
|
17
|
+
},
|
|
18
|
+
})
|