@cyber-dash-tech/revela 0.2.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/document-materials/extract.ts +337 -15
- package/lib/read-hooks/dispatch.ts +45 -0
- package/lib/read-hooks/index.ts +1 -1
- package/lib/read-hooks/office-read-view.ts +77 -0
- package/lib/read-hooks/post-read.ts +6 -7
- package/lib/read-hooks/pre-read.ts +13 -24
- package/package.json +1 -1
- package/plugin.ts +14 -16
- package/tools/extract-document-materials.ts +1 -1
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import { createHash } from "crypto"
|
|
2
|
-
import { existsSync, mkdirSync, readFileSync, statSync, writeFileSync } from "fs"
|
|
2
|
+
import { existsSync, mkdirSync, readFileSync, realpathSync, statSync, writeFileSync } from "fs"
|
|
3
3
|
import { basename, dirname, extname, isAbsolute, join, relative, resolve } from "path"
|
|
4
4
|
import { DOMParser } from "@xmldom/xmldom"
|
|
5
5
|
import { unzipSync } from "fflate"
|
|
@@ -14,6 +14,37 @@ export type DocumentMaterial = {
|
|
|
14
14
|
note?: string
|
|
15
15
|
}
|
|
16
16
|
|
|
17
|
+
/**
 * An embedded media entry that was found in the document archive but
 * deliberately not exported to the image cache.
 */
export type SkippedAsset = {
  /** Path of the media entry inside the archive (e.g. `ppt/media/...`). */
  source_ref: string
  /** Slide label (`slide-NN`) when the asset was reached through a slide relationship. */
  page_or_slide?: string
  /** Why the asset was not exported. */
  reason: "svg_asset" | "unmapped_media" | "low_value_asset"
  /** Best-effort category derived from the asset's name or extension. */
  kind?: "svg" | "icon" | "logo" | "overlay" | "decoration"
}
|
|
23
|
+
|
|
24
|
+
/**
 * One parsed child of a slide's shape tree, annotated with optional
 * layout information and heuristic role flags.
 */
export type PptxSlideElement = {
  /** Stable identifier of the form `<slide>-element-NN`. */
  id: string
  /** `text` for shapes carrying text, `image` for pictures, `shape` for everything else that is kept. */
  kind: "text" | "image" | "shape"
  /** 1-based position among the parsed elements of the slide. */
  zOrder: number
  /** Offset and extent read from the element's transform (presumably EMU — TODO confirm units). */
  bbox?: { x: number; y: number; w: number; h: number }

  // Heuristic role flags; each is only present when the heuristic fired.
  likelyBackground?: boolean
  likelyHeroImage?: boolean
  likelyLogo?: boolean
  likelyOverlayMask?: boolean
  likelyDecoration?: boolean

  /** Text runs of the element joined with newlines (text elements only). */
  text?: string
  /** Archive path of the referenced media (image elements only). */
  source_ref?: string
  /** Path of the exported image when the media was kept. */
  path?: string
  /** Whether the referenced media was exported (`kept`) or left out (`skipped`). */
  asset_status?: "kept" | "skipped"
  /** Value of the element's `name` attribute, when non-empty. */
  name?: string
}
|
|
40
|
+
|
|
41
|
+
/** Structural summary of a single slide. */
export type PptxSlide = {
  /** Slide label of the form `slide-NN`. */
  slide: string
  /** Slide width from the presentation's slide size, when available. */
  width?: number
  /** Slide height from the presentation's slide size, when available. */
  height?: number
  /** Parsed elements in shape-tree order. */
  elements: PptxSlideElement[]
}
|
|
47
|
+
|
|
17
48
|
export type DocumentMaterialsResult = {
|
|
18
49
|
status: "processed" | "skipped" | "failed"
|
|
19
50
|
source: string
|
|
@@ -22,6 +53,8 @@ export type DocumentMaterialsResult = {
|
|
|
22
53
|
manifest_path?: string
|
|
23
54
|
text_path?: string
|
|
24
55
|
images?: DocumentMaterial[]
|
|
56
|
+
skipped_assets?: SkippedAsset[]
|
|
57
|
+
slides?: PptxSlide[]
|
|
25
58
|
tables?: DocumentMaterial[]
|
|
26
59
|
reason?: string
|
|
27
60
|
}
|
|
@@ -36,9 +69,16 @@ type CachedManifest = {
|
|
|
36
69
|
manifest_path: string
|
|
37
70
|
text_path: string
|
|
38
71
|
images: DocumentMaterial[]
|
|
72
|
+
skipped_assets: SkippedAsset[]
|
|
73
|
+
slides: PptxSlide[]
|
|
39
74
|
tables: DocumentMaterial[]
|
|
40
75
|
}
|
|
41
76
|
|
|
77
|
+
/** Result of scanning a PPTX archive for media: what was exported and what was left out. */
type PptxImageExtraction = {
  /** Media entries written to the image cache. */
  images: DocumentMaterial[]
  /** Media entries intentionally not exported, with the reason. */
  skipped_assets: SkippedAsset[]
}
|
|
81
|
+
|
|
42
82
|
const SUPPORTED_EXTENSIONS: Record<string, SupportedType> = {
|
|
43
83
|
".pptx": "pptx",
|
|
44
84
|
".docx": "docx",
|
|
@@ -62,8 +102,11 @@ function normalizeZipTarget(basePath: string, target: string): string {
|
|
|
62
102
|
}
|
|
63
103
|
|
|
64
104
|
function ensureWorkspacePath(filePath: string, workspaceDir: string): string {
|
|
65
|
-
const resolvedWorkspace = resolve(workspaceDir)
|
|
66
|
-
const
|
|
105
|
+
const resolvedWorkspace = realpathSync(resolve(workspaceDir))
|
|
106
|
+
const candidate = isAbsolute(filePath) ? resolve(filePath) : resolve(workspaceDir, filePath)
|
|
107
|
+
const resolvedFile = existsSync(candidate)
|
|
108
|
+
? realpathSync(candidate)
|
|
109
|
+
: candidate
|
|
67
110
|
|
|
68
111
|
if (resolvedFile !== resolvedWorkspace && !resolvedFile.startsWith(resolvedWorkspace + "/")) {
|
|
69
112
|
throw new Error("file must be within workspace")
|
|
@@ -72,8 +115,24 @@ function ensureWorkspacePath(filePath: string, workspaceDir: string): string {
|
|
|
72
115
|
return resolvedFile
|
|
73
116
|
}
|
|
74
117
|
|
|
118
|
+
/**
 * Expresses `filePath` in terms of the workspace's real (symlink-resolved)
 * location.
 *
 * - An existing path is canonicalized with `realpathSync`.
 * - A path that does not exist yet but lies under the workspace as given
 *   is re-rooted onto the workspace's real path.
 * - Anything else is returned as the plain resolved path.
 *
 * @param filePath - Path to normalize; relative paths are resolved by `resolve`.
 * @param workspaceDir - Workspace root; must exist, since it is passed to `realpathSync`.
 */
function normalizeWorkspaceChild(filePath: string, workspaceDir: string): string {
  const aliasRoot = resolve(workspaceDir)
  const realRoot = realpathSync(aliasRoot)
  const target = resolve(filePath)

  if (existsSync(target)) {
    return realpathSync(target)
  }

  // Not on disk yet: translate the alias-rooted path onto the real root.
  const underAlias = target === aliasRoot || target.startsWith(aliasRoot + "/")
  return underAlias ? join(realRoot, relative(aliasRoot, target)) : target
}
|
|
131
|
+
|
|
75
132
|
/**
 * Returns `filePath` relative to the workspace's real path, always using
 * forward slashes as separators.
 */
function workspaceRelative(filePath: string, workspaceDir: string): string {
  const realRoot = realpathSync(resolve(workspaceDir))
  const realTarget = normalizeWorkspaceChild(filePath, workspaceDir)
  const relativePath = relative(realRoot, realTarget)
  // Convert any backslash separators to forward slashes.
  return relativePath.split("\\").join("/")
}
|
|
78
137
|
|
|
79
138
|
function buildFingerprint(filePath: string): string {
|
|
@@ -98,12 +157,256 @@ function parseXml(files: Record<string, Uint8Array>, path: string): any | null {
|
|
|
98
157
|
return new DOMParser().parseFromString(new TextDecoder().decode(file), "text/xml")
|
|
99
158
|
}
|
|
100
159
|
|
|
101
|
-
function
|
|
160
|
+
/**
 * Returns the namespace-free name of an XML node: its `localName` when
 * set, otherwise the part of `nodeName` after the last colon. Yields an
 * empty string for a missing node.
 */
function xmlLocalName(node: any): string {
  const local = node?.localName
  if (local !== undefined && local !== null) return local

  const segments = String(node?.nodeName ?? "").split(":")
  return segments[segments.length - 1] ?? ""
}
|
|
163
|
+
|
|
164
|
+
/**
 * Collects the direct children of `node` whose `nodeType` is 1
 * (element nodes), in document order. A missing node yields `[]`.
 */
function xmlElementChildren(node: any): any[] {
  const ELEMENT_NODE = 1
  const nodes = node?.childNodes ?? []
  const elements: any[] = []

  let index = 0
  while (index < nodes.length) {
    const candidate = nodes[index]
    index += 1
    if (candidate?.nodeType !== ELEMENT_NODE) continue
    elements.push(candidate)
  }

  return elements
}
|
|
173
|
+
|
|
174
|
+
/**
 * Finds every element below `node` (excluding `node` itself) whose local
 * name equals `name`, returned in document (pre-order) order.
 */
function xmlDescendantsByLocalName(node: any, name: string): any[] {
  const found: any[] = []
  // Explicit stack; children are pushed in reverse so they pop in document order.
  const pending: any[] = xmlElementChildren(node).reverse()

  while (pending.length > 0) {
    const current = pending.pop()
    if (xmlLocalName(current) === name) found.push(current)

    const nested = xmlElementChildren(current)
    for (let i = nested.length - 1; i >= 0; i -= 1) {
      pending.push(nested[i])
    }
  }

  return found
}
|
|
185
|
+
|
|
186
|
+
/**
 * Returns the first descendant of `node` (in document order) with the
 * given local name, or `null` when there is none.
 */
function firstDescendantByLocalName(node: any, name: string): any | null {
  const matches = xmlDescendantsByLocalName(node, name)
  const first = matches[0]
  return first === undefined || first === null ? null : first
}
|
|
190
|
+
|
|
191
|
+
/**
 * Gathers the trimmed, non-empty content of every `t` descendant of
 * `node` and joins the pieces with newlines.
 *
 * @returns The joined text, or `undefined` when no non-empty text exists.
 */
function extractShapeText(node: any): string | undefined {
  const pieces: string[] = []
  for (const textNode of xmlDescendantsByLocalName(node, "t")) {
    const content = textNode.textContent?.trim()
    if (content) pieces.push(content)
  }

  if (pieces.length === 0) return undefined
  return pieces.join("\n")
}
|
|
197
|
+
|
|
198
|
+
/**
 * Reads the `name` attribute of the first `cNvPr` descendant of `node`.
 *
 * @returns The name, or `undefined` when it is absent or empty.
 */
function extractElementName(node: any): string | undefined {
  const properties = firstDescendantByLocalName(node, "cNvPr")
  const label = properties?.getAttribute?.("name")
  return label ? label : undefined
}
|
|
201
|
+
|
|
202
|
+
/**
 * Parses a numeric XML attribute value.
 *
 * @param value - Raw attribute text; may be `null`/`undefined` when the attribute is absent.
 * @returns The finite number, or `undefined` when the value is missing,
 *          blank (including whitespace-only), or not a finite number.
 */
function parseCoordinate(value: string | null | undefined): number | undefined {
  if (value == null) return undefined

  // `Number("   ")` is 0, so blank strings must be rejected explicitly;
  // otherwise a whitespace-only attribute would be read as coordinate 0.
  const trimmed = value.trim()
  if (trimmed === "") return undefined

  const parsed = Number(trimmed)
  return Number.isFinite(parsed) ? parsed : undefined
}
|
|
207
|
+
|
|
208
|
+
/**
 * Reads an element's bounding box from its first `xfrm` descendant:
 * position from the `off` child's `x`/`y`, size from the `ext` child's
 * `cx`/`cy`.
 *
 * @returns The box, or `undefined` when the transform, either child, or
 *          any of the four numeric attributes is missing or unparsable.
 */
function extractElementBBox(node: any): { x: number; y: number; w: number; h: number } | undefined {
  const transform = firstDescendantByLocalName(node, "xfrm")
  if (!transform) return undefined

  const offset = firstDescendantByLocalName(transform, "off")
  const extent = firstDescendantByLocalName(transform, "ext")
  if (!offset || !extent) return undefined

  const x = parseCoordinate(offset.getAttribute?.("x"))
  const y = parseCoordinate(offset.getAttribute?.("y"))
  const w = parseCoordinate(extent.getAttribute?.("cx"))
  const h = parseCoordinate(extent.getAttribute?.("cy"))

  // Explicit checks let the compiler narrow each value to `number`.
  if (x === undefined || y === undefined || w === undefined || h === undefined) {
    return undefined
  }
  return { x, y, w, h }
}
|
|
224
|
+
|
|
225
|
+
/**
 * Reads the slide dimensions from the `sldSz` element of
 * `ppt/presentation.xml` (`cx` → width, `cy` → height).
 *
 * @returns The size, or `undefined` when the element or either attribute
 *          is missing or not numeric.
 */
function getPptxSlideSize(files: Record<string, Uint8Array>): { width: number; height: number } | undefined {
  const presentation = parseXml(files, "ppt/presentation.xml")
  const sizeNode = firstDescendantByLocalName(presentation, "sldSz")
  if (!sizeNode) return undefined

  const width = parseCoordinate(sizeNode.getAttribute?.("cx"))
  const height = parseCoordinate(sizeNode.getAttribute?.("cy"))
  if (width === undefined || height === undefined) return undefined

  return { width, height }
}
|
|
235
|
+
|
|
236
|
+
/**
 * Tells whether a box reaches into one of the slide's four corner zones.
 *
 * A corner zone extends 12% of the slide width/height inward from each
 * edge. The box counts as "near a corner" when it touches a horizontal
 * margin (left or right) and a vertical margin (top or bottom).
 */
function isNearCorner(
  bbox: { x: number; y: number; w: number; h: number },
  slideWidth: number,
  slideHeight: number,
): boolean {
  const marginX = slideWidth * 0.12
  const marginY = slideHeight * 0.12

  const touchesLeft = bbox.x <= marginX
  const touchesTop = bbox.y <= marginY
  const touchesRight = bbox.x + bbox.w >= slideWidth - marginX
  const touchesBottom = bbox.y + bbox.h >= slideHeight - marginY

  return (touchesLeft || touchesRight) && (touchesTop || touchesBottom)
}
|
|
252
|
+
|
|
253
|
+
/**
 * Annotates a slide's image and shape elements with heuristic role flags
 * based on how much of the slide they cover, where they sit, and keywords
 * in their media path / element name.
 *
 * The slide is returned unchanged when its dimensions are unknown.
 * Otherwise `slide.elements` is replaced in place and the same slide
 * object is returned. Elements without a bounding box, and text elements,
 * are never flagged.
 *
 * @param slide - Slide to annotate (its `elements` array is reassigned).
 * @param slideWidth - Slide width, same units as the element boxes.
 * @param slideHeight - Slide height, same units as the element boxes.
 */
function applyPptxHeuristics(
  slide: PptxSlide,
  slideWidth: number | undefined,
  slideHeight: number | undefined,
): PptxSlide {
  if (!slideWidth || !slideHeight) return slide

  const width = slideWidth
  const height = slideHeight
  const totalArea = width * height

  const annotate = (element: PptxSlideElement): PptxSlideElement => {
    const box = element.bbox
    if (!box) return element
    if (element.kind !== "image" && element.kind !== "shape") return element

    const coverage = (box.w * box.h) / totalArea
    const label = `${element.source_ref ?? ""} ${element.name ?? ""}`.toLowerCase()
    const flags: Partial<PptxSlideElement> = {}

    if (element.kind === "image") {
      const kept = element.asset_status === "kept"
      // Background and hero are mutually exclusive and only apply to exported images.
      if (coverage >= 0.75 && kept) {
        flags.likelyBackground = true
      } else if (coverage >= 0.2 && kept) {
        flags.likelyHeroImage = true
      }

      const smallInCorner = coverage <= 0.03 && isNearCorner(box, width, height)
      if (smallInCorner || /(logo|brand)/.test(label)) flags.likelyLogo = true

      const skipped = element.asset_status === "skipped"
      if (/(mask|overlay|shadow)/.test(label) || skipped) flags.likelyOverlayMask = true

      if (/(arrow|ornament|decoration)/.test(label)) flags.likelyDecoration = true
    } else {
      // Shapes: large ones look like overlays, tiny or keyword-named ones like decoration.
      if (coverage >= 0.4) flags.likelyOverlayMask = true
      if (coverage <= 0.03 || /(arrow|ornament|decoration)/.test(label)) flags.likelyDecoration = true
    }

    const hasFlags = Object.keys(flags).length > 0
    return hasFlags ? { ...element, ...flags } : element
  }

  slide.elements = slide.elements.map(annotate)
  return slide
}
|
|
290
|
+
|
|
291
|
+
/**
 * Builds a map from relationship id to media path for one slide.
 *
 * Reads the slide's `.rels` part (the slide path with `/slides/` replaced
 * by `/slides/_rels/` and `.rels` appended), normalizes every relationship
 * target against the slide path, and keeps only targets under
 * `ppt/media/`.
 *
 * @returns The id → media path map; empty when the `.rels` part cannot be parsed.
 */
function getSlideMediaTargets(files: Record<string, Uint8Array>, slidePath: string): Map<string, string> {
  const targets = new Map<string, string>()

  const relsPath = slidePath.replace("/slides/", "/slides/_rels/") + ".rels"
  const relsDoc = parseXml(files, relsPath)
  if (!relsDoc) return targets

  const relationships = relsDoc.getElementsByTagName("Relationship")
  for (let index = 0; index < relationships.length; index += 1) {
    const relationship = relationships[index]
    const id = relationship.getAttribute("Id")
    const target = relationship.getAttribute("Target")

    if (id && target) {
      const zipPath = normalizeZipTarget(slidePath, target)
      if (zipPath.startsWith("ppt/media/")) targets.set(id, zipPath)
    }
  }

  return targets
}
|
|
310
|
+
|
|
311
|
+
/**
 * Produces a structural summary of every `ppt/slides/slideN.xml` part,
 * ordered by slide number.
 *
 * For each slide the direct children of its `spTree` are classified:
 * - `sp` → `text` when it carries text, otherwise `shape`
 * - `pic` → `image`, linked to its media path and to whether that media
 *   was kept or skipped during image extraction
 * - `cxnSp`, `graphicFrame`, `grpSp` → `shape`
 * - `nvGrpSpPr`, `grpSpPr` and any other child are ignored
 *
 * The resulting slide is then passed through `applyPptxHeuristics`.
 * Slides whose XML or shape tree cannot be found are returned with an
 * empty element list.
 *
 * @param files - Unzipped archive entries keyed by path.
 * @param images - Exported images, matched to pictures via `source_ref`.
 * @param skippedAssets - Skipped media, matched to pictures via `source_ref`.
 */
function extractPptxSlides(
  files: Record<string, Uint8Array>,
  images: DocumentMaterial[],
  skippedAssets: SkippedAsset[],
): PptxSlide[] {
  const slideSize = getPptxSlideSize(files)
  const sizeFields = slideSize ?? {}
  const keptBySource = new Map(images.map((image) => [image.source_ref, image]))
  const skippedBySource = new Map(skippedAssets.map((asset) => [asset.source_ref, asset]))

  const slidePaths = Object.keys(files)
    .filter((file) => /^ppt\/slides\/slide\d+\.xml$/.test(file))
    .sort((a, b) => a.localeCompare(b, undefined, { numeric: true }))

  const slides: PptxSlide[] = []

  for (const slidePath of slidePaths) {
    const slideNumber = slidePath.match(/slide(\d+)\.xml$/)?.[1] ?? "0"
    const slideId = `slide-${slideNumber.padStart(2, "0")}`
    const doc = parseXml(files, slidePath)
    const mediaTargets = getSlideMediaTargets(files, slidePath)
    const elements: PptxSlideElement[] = []

    const spTree = doc ? firstDescendantByLocalName(doc, "spTree") : null
    if (!spTree) {
      // Nothing parsable: emit the slide with no elements and no heuristics.
      slides.push({ slide: slideId, ...sizeFields, elements })
      continue
    }

    for (const node of xmlElementChildren(spTree)) {
      const tag = xmlLocalName(node)
      if (tag === "nvGrpSpPr" || tag === "grpSpPr") continue

      const zOrder = elements.length + 1
      const id = `${slideId}-element-${String(zOrder).padStart(2, "0")}`
      const name = extractElementName(node)
      const bbox = extractElementBBox(node)
      // Optional fields are only added when present; key order is kept stable.
      const geometry = bbox ? { bbox } : {}
      const label = name ? { name } : {}

      switch (tag) {
        case "sp": {
          const text = extractShapeText(node)
          if (text) {
            elements.push({ id, kind: "text", zOrder, text, ...geometry, ...label })
          } else {
            elements.push({ id, kind: "shape", zOrder, ...geometry, ...label })
          }
          break
        }

        case "pic": {
          const blip = firstDescendantByLocalName(node, "blip")
          const rid = blip?.getAttribute?.("r:embed") || blip?.getAttribute?.("embed") || undefined
          const sourceRef = rid ? mediaTargets.get(rid) : undefined
          const kept = sourceRef ? keptBySource.get(sourceRef) : undefined
          const skipped = sourceRef ? skippedBySource.get(sourceRef) : undefined
          const status: PptxSlideElement["asset_status"] = kept ? "kept" : skipped ? "skipped" : undefined

          elements.push({
            id,
            kind: "image",
            zOrder,
            ...geometry,
            ...label,
            ...(sourceRef ? { source_ref: sourceRef } : {}),
            ...(kept?.path ? { path: kept.path } : {}),
            ...(status ? { asset_status: status } : {}),
          })
          break
        }

        case "cxnSp":
        case "graphicFrame":
        case "grpSp":
          elements.push({ id, kind: "shape", zOrder, ...geometry, ...label })
          break

        default:
          // Unrecognized shape-tree children are not recorded.
          break
      }
    }

    slides.push(
      applyPptxHeuristics({ slide: slideId, ...sizeFields, elements }, slideSize?.width, slideSize?.height),
    )
  }

  return slides
}
|
|
380
|
+
|
|
381
|
+
// Case-insensitive keywords that mark a PPTX media file as low value
// (icons, logos, masks/overlays/shadows, decorations, arrows).
const LOW_VALUE_PPTX_ASSET = /(icon|logo|mask|overlay|shadow|decoration|ornament|arrow)/i
|
|
382
|
+
|
|
383
|
+
/**
 * Derives a best-effort category for a skipped asset from its path.
 *
 * Checks run in priority order: SVG extension, then the keywords
 * `icon`, `logo`, `mask|overlay|shadow`, `decoration|ornament|arrow`.
 * If nothing matches and the skip reason is `svg_asset`, the asset is
 * still classified as `svg`.
 *
 * All matching is case-insensitive, including the `.svg` extension, so
 * `image1.SVG` is classified the same way as `image1.svg`.
 *
 * @param sourceRef - Archive path of the media entry.
 * @param reason - Why the asset was skipped.
 * @returns The category, or `undefined` when none applies.
 */
function classifySkippedAsset(sourceRef: string, reason: SkippedAsset["reason"]): SkippedAsset["kind"] | undefined {
  if (/\.svg$/i.test(sourceRef)) return "svg"
  if (/icon/i.test(sourceRef)) return "icon"
  if (/logo/i.test(sourceRef)) return "logo"
  if (/(mask|overlay|shadow)/i.test(sourceRef)) return "overlay"
  if (/(decoration|ornament|arrow)/i.test(sourceRef)) return "decoration"
  if (reason === "svg_asset") return "svg"
  return undefined
}
|
|
392
|
+
|
|
393
|
+
/**
 * Decides whether a PPTX media entry should be left out of the exported
 * images.
 *
 * - SVG files (extension matched case-insensitively, so `.SVG` counts)
 *   are skipped with reason `svg_asset`.
 * - Files whose base name matches `LOW_VALUE_PPTX_ASSET` are skipped with
 *   reason `low_value_asset` and a category from `classifySkippedAsset`.
 *
 * @param sourceRef - Archive path of the media entry.
 * @returns Skip details, or `null` when the asset should be exported.
 */
function shouldSkipPptxAsset(sourceRef: string): { reason: SkippedAsset["reason"]; kind?: SkippedAsset["kind"] } | null {
  if (/\.svg$/i.test(sourceRef)) {
    return { reason: "svg_asset", kind: "svg" }
  }

  // Only the file name is inspected, so directory names cannot trigger a skip.
  if (LOW_VALUE_PPTX_ASSET.test(basename(sourceRef))) {
    return { reason: "low_value_asset", kind: classifySkippedAsset(sourceRef, "low_value_asset") }
  }

  return null
}
|
|
402
|
+
|
|
403
|
+
function extractPptxImages(files: Record<string, Uint8Array>, cacheDir: string, workspaceDir: string): PptxImageExtraction {
|
|
102
404
|
const relFiles = Object.keys(files)
|
|
103
405
|
.filter((file) => /^ppt\/slides\/_rels\/slide\d+\.xml\.rels$/.test(file))
|
|
104
406
|
.sort((a, b) => a.localeCompare(b, undefined, { numeric: true }))
|
|
105
407
|
|
|
106
408
|
const images: DocumentMaterial[] = []
|
|
409
|
+
const skipped_assets: SkippedAsset[] = []
|
|
107
410
|
const seenTargets = new Set<string>()
|
|
108
411
|
|
|
109
412
|
for (const relPath of relFiles) {
|
|
@@ -124,8 +427,19 @@ function extractPptxImages(files: Record<string, Uint8Array>, cacheDir: string,
|
|
|
124
427
|
const media = files[normalized]
|
|
125
428
|
if (!media) continue
|
|
126
429
|
|
|
127
|
-
imageIndex += 1
|
|
128
430
|
seenTargets.add(normalized)
|
|
431
|
+
const skipped = shouldSkipPptxAsset(normalized)
|
|
432
|
+
if (skipped) {
|
|
433
|
+
skipped_assets.push({
|
|
434
|
+
source_ref: normalized,
|
|
435
|
+
page_or_slide: `slide-${slideNumber.padStart(2, "0")}`,
|
|
436
|
+
reason: skipped.reason,
|
|
437
|
+
kind: skipped.kind,
|
|
438
|
+
})
|
|
439
|
+
continue
|
|
440
|
+
}
|
|
441
|
+
|
|
442
|
+
imageIndex += 1
|
|
129
443
|
const exportedName = `slide-${slideNumber.padStart(2, "0")}-image-${String(imageIndex).padStart(2, "0")}${extname(normalized)}`
|
|
130
444
|
const outputPath = join(cacheDir, "images", exportedName)
|
|
131
445
|
writeCachedBuffer(outputPath, media)
|
|
@@ -143,18 +457,14 @@ function extractPptxImages(files: Record<string, Uint8Array>, cacheDir: string,
|
|
|
143
457
|
.sort()
|
|
144
458
|
|
|
145
459
|
for (const mediaPath of remainingMedia) {
|
|
146
|
-
|
|
147
|
-
const outputPath = join(cacheDir, "images", exportedName)
|
|
148
|
-
writeCachedBuffer(outputPath, files[mediaPath])
|
|
149
|
-
|
|
150
|
-
images.push({
|
|
151
|
-
path: materialPath(cacheDir, workspaceDir, "images", exportedName),
|
|
460
|
+
skipped_assets.push({
|
|
152
461
|
source_ref: mediaPath,
|
|
153
|
-
|
|
462
|
+
reason: "unmapped_media",
|
|
463
|
+
kind: classifySkippedAsset(mediaPath, "unmapped_media"),
|
|
154
464
|
})
|
|
155
465
|
}
|
|
156
466
|
|
|
157
|
-
return images
|
|
467
|
+
return { images, skipped_assets }
|
|
158
468
|
}
|
|
159
469
|
|
|
160
470
|
function extractDocxImages(files: Record<string, Uint8Array>, cacheDir: string, workspaceDir: string): DocumentMaterial[] {
|
|
@@ -295,6 +605,8 @@ async function processOfficeFile(filePath: string, workspaceDir: string, type: S
|
|
|
295
605
|
manifest_path: manifest.manifest_path,
|
|
296
606
|
text_path: manifest.text_path,
|
|
297
607
|
images: manifest.images,
|
|
608
|
+
skipped_assets: manifest.skipped_assets,
|
|
609
|
+
slides: manifest.slides,
|
|
298
610
|
tables: manifest.tables,
|
|
299
611
|
}
|
|
300
612
|
}
|
|
@@ -314,11 +626,17 @@ async function processOfficeFile(filePath: string, workspaceDir: string, type: S
|
|
|
314
626
|
const textPath = join(cacheDir, "text.txt")
|
|
315
627
|
writeFileSync(textPath, `[Extracted from: ${basename(filePath)}]\n\n${text}`, "utf-8")
|
|
316
628
|
|
|
317
|
-
const
|
|
629
|
+
const pptxAssets = type === "pptx"
|
|
318
630
|
? extractPptxImages(files, cacheDir, workspaceDir)
|
|
631
|
+
: null
|
|
632
|
+
const images = type === "pptx"
|
|
633
|
+
? pptxAssets!.images
|
|
319
634
|
: type === "docx"
|
|
320
635
|
? extractDocxImages(files, cacheDir, workspaceDir)
|
|
321
636
|
: extractXlsxImages(files, cacheDir, workspaceDir)
|
|
637
|
+
const slides = type === "pptx"
|
|
638
|
+
? extractPptxSlides(files, images, pptxAssets!.skipped_assets)
|
|
639
|
+
: undefined
|
|
322
640
|
|
|
323
641
|
const result: DocumentMaterialsResult = {
|
|
324
642
|
status: "processed",
|
|
@@ -328,6 +646,8 @@ async function processOfficeFile(filePath: string, workspaceDir: string, type: S
|
|
|
328
646
|
manifest_path: workspaceRelative(manifestPath, workspaceDir),
|
|
329
647
|
text_path: workspaceRelative(textPath, workspaceDir),
|
|
330
648
|
images,
|
|
649
|
+
skipped_assets: pptxAssets?.skipped_assets ?? [],
|
|
650
|
+
slides,
|
|
331
651
|
tables: extractTables(type, workspaceRelative(textPath, workspaceDir)),
|
|
332
652
|
}
|
|
333
653
|
|
|
@@ -339,6 +659,8 @@ async function processOfficeFile(filePath: string, workspaceDir: string, type: S
|
|
|
339
659
|
manifest_path: result.manifest_path!,
|
|
340
660
|
text_path: result.text_path!,
|
|
341
661
|
images: result.images ?? [],
|
|
662
|
+
skipped_assets: result.skipped_assets ?? [],
|
|
663
|
+
slides: result.slides ?? [],
|
|
342
664
|
tables: result.tables ?? [],
|
|
343
665
|
}
|
|
344
666
|
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
import { basename, extname } from "path"
|
|
2
|
+
/** Lower-case file extensions (with leading dot) treated as Office documents. */
export const OFFICE_EXTENSIONS = new Set([".docx", ".pptx", ".xlsx"])
|
|
3
|
+
/** Lower-case file extensions (with leading dot) treated as images. */
export const IMAGE_EXTENSIONS = new Set([".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif", ".webp", ".gif"])
|
|
4
|
+
|
|
5
|
+
/**
 * How a file passed to the read tool is handled, as decided by
 * `classifyReadFile` from the file extension:
 * - `before-materialize-document` — Office documents (.docx/.pptx/.xlsx)
 * - `after-extract-text` — PDF files
 * - `after-compress-image` — image files
 * - `passthrough` — everything else
 */
export type ReadStrategy =
  | "before-materialize-document"
  | "after-extract-text"
  | "after-compress-image"
  | "passthrough"
|
|
10
|
+
|
|
11
|
+
/**
 * Chooses the read strategy for a file from its extension
 * (case-insensitive).
 *
 * @param filePath - Path of the file being read.
 * @returns The matching strategy, or `"passthrough"` for unrecognized extensions.
 */
export function classifyReadFile(filePath: string): ReadStrategy {
  const extension = extname(filePath).toLowerCase()

  let strategy: ReadStrategy = "passthrough"
  if (OFFICE_EXTENSIONS.has(extension)) {
    strategy = "before-materialize-document"
  } else if (extension === ".pdf") {
    strategy = "after-extract-text"
  } else if (IMAGE_EXTENSIONS.has(extension)) {
    strategy = "after-compress-image"
  }
  return strategy
}
|
|
18
|
+
|
|
19
|
+
/**
 * Prefixes extracted text with a header line naming the source file
 * (base name only), followed by a blank line.
 */
export function formatExtractedText(filePath: string, text: string): string {
  const header = `[Extracted from: ${basename(filePath)}]`
  return [header, "", text].join("\n")
}
|
|
22
|
+
|
|
23
|
+
/**
 * Renders the markdown read view for an Office document: a title with the
 * file's base name, a "Text" section, and an "Images" section listing one
 * image path per bullet.
 *
 * @param filePath - Source document path (only the base name is shown).
 * @param text - Extracted text; blank text is replaced by a placeholder.
 * @param images - Extracted images; a missing or empty list renders as "- None".
 */
export function buildOfficeReadView(
  filePath: string,
  text: string,
  images: Array<{ path: string }> | undefined,
): string {
  const body = text.trim() || "No text extracted."
  const imageBullets =
    images && images.length > 0 ? images.map((image) => `- ${image.path}`) : ["- None"]

  const sections = [
    `# Extracted from: ${basename(filePath)}`,
    "",
    "## Text",
    "",
    body,
    "",
    "## Images",
    "",
    ...imageBullets,
  ]
  return sections.join("\n")
}
|
package/lib/read-hooks/index.ts
CHANGED
|
@@ -4,7 +4,7 @@
|
|
|
4
4
|
* Entry point for the read-hooks module.
|
|
5
5
|
* Exports preRead and postRead for use in plugins/revela.ts hook handlers.
|
|
6
6
|
*
|
|
7
|
-
* preRead → tool.execute.before:
|
|
7
|
+
* preRead → tool.execute.before: materialize Office docs and redirect to temp markdown
|
|
8
8
|
* postRead → tool.execute.after: transform PDF/image attachments before LLM sees them
|
|
9
9
|
*/
|
|
10
10
|
|
|
@@ -0,0 +1,77 @@
|
|
|
1
|
+
import { readFileSync } from "fs"
|
|
2
|
+
import { join } from "path"
|
|
3
|
+
import type { PptxSlide } from "../document-materials/extract"
|
|
4
|
+
import { extractDocumentMaterials } from "../document-materials/extract"
|
|
5
|
+
import { buildOfficeReadView } from "./dispatch"
|
|
6
|
+
import { extractDocx } from "./extractors/docx"
|
|
7
|
+
import { extractPptx } from "./extractors/pptx"
|
|
8
|
+
import { extractXlsx } from "./extractors/xlsx"
|
|
9
|
+
import { formatExtractedText } from "./dispatch"
|
|
10
|
+
|
|
11
|
+
// Plain-text extractors keyed by lower-case extension. Used as the fallback
// when document materialization does not yield a cached text file.
const HANDLERS: Record<string, (buf: Buffer) => Promise<string>> = {
  ".docx": extractDocx,
  ".pptx": extractPptx,
  ".xlsx": extractXlsx,
}
|
|
16
|
+
|
|
17
|
+
/**
 * Renders a "Slide Structure" markdown section: one bullet per slide with
 * counts of text, kept-image, skipped-image and shape elements, plus an
 * indented "likely roles" line when any heuristic flag is set.
 *
 * @param slides - Parsed slides; a missing or empty list yields an empty string.
 * @returns The section text, starting with a blank line so it can be appended to a view.
 */
function buildPptxStructureHints(slides: PptxSlide[] | undefined): string {
  if (!slides?.length) return ""

  const lines = ["", "## Slide Structure", ""]

  for (const slide of slides) {
    // Tally element kinds in a single pass.
    let texts = 0
    let keptImages = 0
    let skippedImages = 0
    let shapes = 0
    for (const element of slide.elements) {
      if (element.kind === "text") {
        texts += 1
      } else if (element.kind === "shape") {
        shapes += 1
      } else if (element.kind === "image") {
        if (element.asset_status === "kept") keptImages += 1
        else if (element.asset_status === "skipped") skippedImages += 1
      }
    }

    const parts: string[] = []
    if (texts > 0) parts.push(`${texts} text`)
    if (keptImages > 0) parts.push(`${keptImages} kept image`)
    if (skippedImages > 0) parts.push(`${skippedImages} skipped image`)
    if (shapes > 0) parts.push(`${shapes} shape`)
    const summary = parts.join(", ") || "no parsed elements"
    lines.push(`- ${slide.slide}: ${summary}`)

    const roles: Array<string | null> = [
      countRole(slide, (element) => element.likelyBackground, "background image"),
      countRole(slide, (element) => element.likelyHeroImage, "hero image"),
      countRole(slide, (element) => element.likelyLogo, "logo"),
      countRole(slide, (element) => element.likelyOverlayMask, "overlay"),
      countRole(slide, (element) => element.likelyDecoration, "decoration"),
    ]
    const roleSummary = roles.filter(Boolean).join(", ")
    if (roleSummary) lines.push(`  likely roles: ${roleSummary}`)
  }

  return lines.join("\n")
}
|
|
46
|
+
|
|
47
|
+
/**
 * Counts the slide elements matching `predicate` and formats the count
 * with a label, adding an "s" when the count is not exactly one.
 *
 * @returns e.g. `"1 logo"` / `"2 logos"`, or `null` when nothing matches.
 */
function countRole(
  slide: PptxSlide,
  predicate: (element: PptxSlide["elements"][number]) => boolean | undefined,
  label: string,
): string | null {
  let matches = 0
  for (const element of slide.elements) {
    if (predicate(element)) matches += 1
  }

  if (matches === 0) return null
  const suffix = matches === 1 ? "" : "s"
  return `${matches} ${label}${suffix}`
}
|
|
56
|
+
|
|
57
|
+
/**
 * Builds the text shown to the model for an Office document.
 *
 * The document is first materialized via `extractDocumentMaterials`. When
 * that succeeds and reports a cached text file, the cached text (minus its
 * "[Extracted from: …]" header) is rendered as a markdown read view with
 * the extracted image list; PPTX files additionally get slide-structure
 * hints. Otherwise the file is read directly and run through the plain
 * extractor for its extension.
 *
 * @param filePath - Path of the Office document.
 * @param workspaceDir - Workspace root that the cached text path is relative to.
 * @throws Error when the extension has no registered handler.
 */
export async function createOfficeReadView(filePath: string, workspaceDir: string): Promise<string> {
  const ext = filePath.slice(filePath.lastIndexOf(".")).toLowerCase()
  const handler = HANDLERS[ext]
  if (!handler) throw new Error(`unsupported office file type: ${ext}`)

  const materialized = await extractDocumentMaterials(filePath, workspaceDir)
  const cachedTextPath = materialized.status === "processed" ? materialized.text_path : undefined

  if (!cachedTextPath) {
    // Materialization unavailable: fall back to direct text extraction.
    const raw = readFileSync(filePath)
    const plainText = await handler(raw)
    return formatExtractedText(filePath, plainText)
  }

  const cached = readFileSync(join(workspaceDir, cachedTextPath), "utf-8")
  // Drop the header line the cache file starts with; the view adds its own title.
  const text = cached.replace(/^\[Extracted from: .*?\]\n\n/, "")
  const view = buildOfficeReadView(filePath, text, materialized.images)

  if (ext !== ".pptx") return view
  return view + buildPptxStructureHints(materialized.slides)
}
|
|
@@ -16,11 +16,10 @@
|
|
|
16
16
|
* of packages/opencode/src/session/prompt.ts.
|
|
17
17
|
*/
|
|
18
18
|
|
|
19
|
-
import {
|
|
19
|
+
import { basename } from "path"
|
|
20
20
|
import { extractPdfText } from "./extractors/pdf"
|
|
21
21
|
import { compressImage } from "./image/compress"
|
|
22
|
-
|
|
23
|
-
const IMAGE_EXTS = new Set([".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif", ".webp", ".gif"])
|
|
22
|
+
import { classifyReadFile, formatExtractedText } from "./dispatch"
|
|
24
23
|
|
|
25
24
|
interface ReadOutput {
|
|
26
25
|
title: string
|
|
@@ -41,10 +40,10 @@ export async function postRead(
|
|
|
41
40
|
): Promise<void> {
|
|
42
41
|
if (!output.attachments?.length) return
|
|
43
42
|
|
|
44
|
-
const
|
|
43
|
+
const strategy = classifyReadFile(args.filePath)
|
|
45
44
|
|
|
46
45
|
// ── PDF: extract text, drop base64 attachment ───────────────────────────
|
|
47
|
-
if (
|
|
46
|
+
if (strategy === "after-extract-text") {
|
|
48
47
|
const attachment = output.attachments[0]
|
|
49
48
|
const base64 = attachment.url.split(",")[1]
|
|
50
49
|
if (!base64) return
|
|
@@ -52,14 +51,14 @@ export async function postRead(
|
|
|
52
51
|
const buf = Buffer.from(base64, "base64")
|
|
53
52
|
const text = await extractPdfText(buf)
|
|
54
53
|
|
|
55
|
-
output.output =
|
|
54
|
+
output.output = formatExtractedText(args.filePath, text)
|
|
56
55
|
output.title = `Extracted text from ${basename(args.filePath)}`
|
|
57
56
|
output.attachments.length = 0 // Remove base64 — saves significant tokens
|
|
58
57
|
return
|
|
59
58
|
}
|
|
60
59
|
|
|
61
60
|
// ── Images: compress attachment to reduce token cost ────────────────────
|
|
62
|
-
if (
|
|
61
|
+
if (strategy === "after-compress-image") {
|
|
63
62
|
const attachment = output.attachments[0]
|
|
64
63
|
const base64 = attachment.url.split(",")[1]
|
|
65
64
|
if (!base64) return
|
|
@@ -7,44 +7,33 @@
|
|
|
7
7
|
* Handles DOCX, PPTX, XLSX — formats that cause read tool to throw
|
|
8
8
|
* Effect.fail("Cannot read binary file"), so the after-hook never fires.
|
|
9
9
|
*
|
|
10
|
-
* Strategy:
|
|
10
|
+
* Strategy: materialize the document into cached text + images, render a
|
|
11
|
+
* markdown read view, then redirect args.filePath to that temp .md file.
|
|
11
12
|
* The read tool then reads the temp file normally. LLM is unaware of the redirect.
|
|
12
13
|
*/
|
|
13
14
|
|
|
14
|
-
import {
|
|
15
|
-
import {
|
|
15
|
+
import { writeFileSync } from "fs"
|
|
16
|
+
import { join } from "path"
|
|
16
17
|
import { tmpdir } from "os"
|
|
17
18
|
import { randomUUID } from "crypto"
|
|
18
|
-
import {
|
|
19
|
-
import {
|
|
20
|
-
import { extractXlsx } from "./extractors/xlsx"
|
|
21
|
-
|
|
22
|
-
// Extension → extractor function mapping
|
|
23
|
-
const HANDLERS: Record<string, (buf: Buffer) => Promise<string>> = {
|
|
24
|
-
".docx": extractDocx,
|
|
25
|
-
".pptx": extractPptx,
|
|
26
|
-
".xlsx": extractXlsx,
|
|
27
|
-
}
|
|
19
|
+
import { classifyReadFile } from "./dispatch"
|
|
20
|
+
import { createOfficeReadView } from "./office-read-view"
|
|
28
21
|
|
|
29
22
|
/**
|
|
30
23
|
* Intercept read tool args before execution.
|
|
31
|
-
* If the file is a supported
|
|
32
|
-
* args.filePath to a
|
|
24
|
+
* If the file is a supported Office document, materialize it into cached
|
|
25
|
+
* text + images and redirect args.filePath to a temporary markdown read view.
|
|
33
26
|
*
|
|
34
27
|
* @param args - Mutable read tool args object (from output.args in before-hook)
|
|
35
28
|
*/
|
|
36
29
|
export async function preRead(args: { filePath: string; [k: string]: any }): Promise<void> {
|
|
37
|
-
|
|
38
|
-
const handler = HANDLERS[ext]
|
|
39
|
-
if (!handler) return // Not a handled format — let read tool proceed normally
|
|
30
|
+
if (classifyReadFile(args.filePath) !== "before-materialize-document") return
|
|
40
31
|
|
|
41
|
-
const
|
|
42
|
-
const
|
|
32
|
+
const workspaceDir = process.cwd()
|
|
33
|
+
const output = await createOfficeReadView(args.filePath, workspaceDir)
|
|
43
34
|
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
const tmpPath = join(tmpdir(), `revela-${randomUUID()}.txt`)
|
|
47
|
-
writeFileSync(tmpPath, header + text, "utf-8")
|
|
35
|
+
const tmpPath = join(tmpdir(), `revela-${randomUUID()}.md`)
|
|
36
|
+
writeFileSync(tmpPath, output, "utf-8")
|
|
48
37
|
|
|
49
38
|
// Redirect read tool to the temp file
|
|
50
39
|
args.filePath = tmpPath
|
package/package.json
CHANGED
package/plugin.ts
CHANGED
|
@@ -25,10 +25,9 @@ import { ACTIVE_PROMPT_FILE } from "./lib/config"
|
|
|
25
25
|
import { ctx } from "./lib/ctx"
|
|
26
26
|
import { preRead } from "./lib/read-hooks"
|
|
27
27
|
import { postRead } from "./lib/read-hooks"
|
|
28
|
-
import { extractDocx } from "./lib/read-hooks/extractors/docx"
|
|
29
|
-
import { extractPptx } from "./lib/read-hooks/extractors/pptx"
|
|
30
|
-
import { extractXlsx } from "./lib/read-hooks/extractors/xlsx"
|
|
31
28
|
import { extractPdfText } from "./lib/read-hooks/extractors/pdf"
|
|
29
|
+
import { createOfficeReadView } from "./lib/read-hooks/office-read-view"
|
|
30
|
+
import { OFFICE_EXTENSIONS, IMAGE_EXTENSIONS, formatExtractedText } from "./lib/read-hooks/dispatch"
|
|
32
31
|
import { handleHelp } from "./lib/commands/help"
|
|
33
32
|
import { handleEnable } from "./lib/commands/enable"
|
|
34
33
|
import { handleDisable } from "./lib/commands/disable"
|
|
@@ -241,19 +240,11 @@ const server: Plugin = (async (pluginCtx) => {
|
|
|
241
240
|
// directly — the read tool is never called, so tool.execute.before/after
|
|
242
241
|
// hooks don't fire. This hook intercepts FileParts before LLM sees them.
|
|
243
242
|
//
|
|
244
|
-
// DOCX/PPTX/XLSX/PDF → extract text → replace with TextPart
|
|
243
|
+
// DOCX/PPTX/XLSX/PDF → extract text/read view → replace with TextPart
|
|
245
244
|
// Images → replace with TextPart hint (LLM can use read tool)
|
|
246
245
|
"chat.message": async (input, output) => {
|
|
247
246
|
if (!ctx.enabled) return
|
|
248
247
|
|
|
249
|
-
const IMAGE_EXTS = new Set([".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif", ".webp", ".gif"])
|
|
250
|
-
const DOC_HANDLERS: Record<string, (buf: Buffer) => Promise<string>> = {
|
|
251
|
-
".docx": extractDocx,
|
|
252
|
-
".pptx": extractPptx,
|
|
253
|
-
".xlsx": extractXlsx,
|
|
254
|
-
".pdf": extractPdfText,
|
|
255
|
-
}
|
|
256
|
-
|
|
257
248
|
for (let i = 0; i < output.parts.length; i++) {
|
|
258
249
|
const part = output.parts[i] as any
|
|
259
250
|
if (part.type !== "file") continue
|
|
@@ -264,15 +255,22 @@ const server: Plugin = (async (pluginCtx) => {
|
|
|
264
255
|
const name = basename(filePath)
|
|
265
256
|
|
|
266
257
|
try {
|
|
267
|
-
if (
|
|
258
|
+
if (OFFICE_EXTENSIONS.has(ext)) {
|
|
259
|
+
const text = await createOfficeReadView(filePath, process.cwd())
|
|
260
|
+
output.parts[i] = {
|
|
261
|
+
...part,
|
|
262
|
+
type: "text",
|
|
263
|
+
text,
|
|
264
|
+
} as any
|
|
265
|
+
} else if (ext === ".pdf") {
|
|
268
266
|
const buf = readFileSync(filePath)
|
|
269
|
-
const text = await
|
|
267
|
+
const text = await extractPdfText(buf)
|
|
270
268
|
output.parts[i] = {
|
|
271
269
|
...part,
|
|
272
270
|
type: "text",
|
|
273
|
-
text:
|
|
271
|
+
text: formatExtractedText(filePath, text),
|
|
274
272
|
} as any
|
|
275
|
-
} else if (
|
|
273
|
+
} else if (IMAGE_EXTENSIONS.has(ext)) {
|
|
276
274
|
output.parts[i] = {
|
|
277
275
|
...part,
|
|
278
276
|
type: "text",
|
|
@@ -3,7 +3,7 @@ import { extractDocumentMaterials } from "../lib/document-materials/extract"
|
|
|
3
3
|
|
|
4
4
|
export default tool({
|
|
5
5
|
description:
|
|
6
|
-
"Extract
|
|
6
|
+
"Extract reusable materials from a workspace document into a workspace-local cache. " +
|
|
7
7
|
"Supports pptx, docx, and xlsx. Produces a manifest plus extracted text, embedded images, and available slide/sheet mappings. " +
|
|
8
8
|
"Unsupported file types are skipped instead of failing.",
|
|
9
9
|
args: {
|