@cyber-dash-tech/revela 0.2.1 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -2
- package/README.zh-CN.md +2 -2
- package/lib/agents/research-prompt.ts +1 -1
- package/lib/document-materials/extract.ts +476 -17
- package/lib/read-hooks/dispatch.ts +45 -0
- package/lib/read-hooks/index.ts +1 -1
- package/lib/read-hooks/office-read-view.ts +77 -0
- package/lib/read-hooks/post-read.ts +6 -7
- package/lib/read-hooks/pre-read.ts +13 -24
- package/package.json +1 -1
- package/plugin.ts +14 -16
- package/tools/extract-document-materials.ts +3 -3
package/README.md
CHANGED

@@ -2,7 +2,7 @@

  **English** | [中文](README.zh-CN.md)

- [](https://www.npmjs.com/package/@cyber-dash-tech/revela) [](LICENSE)
+ [](https://www.npmjs.com/package/@cyber-dash-tech/revela) [](LICENSE) [](tests/) [](https://opencode.ai) [](https://bun.sh)

  <p align="center">
    <img src="assets/img/logo.png" alt="Revela" width="800" />

@@ -19,7 +19,7 @@ Enable it for the current session, assign a presentation task, and the agent can

  - injects a presentation-specific system prompt into your current agent with `/revela enable`
  - builds that prompt from 3 layers: core skill, active domain, active design
- - supports workspace document discovery
+ - supports workspace document discovery, transparent text extraction for `.pdf`, `.docx`, `.pptx`, and `.xlsx`, and cached embedded-material extraction for those formats
  - runs automatic layout QA whenever the agent writes `decks/*.html`
  - exports finished decks to PDF and editable PPTX
  - switches designs and domains locally with zero LLM cost

package/README.zh-CN.md
CHANGED

@@ -2,7 +2,7 @@

  [English](README.md) | **中文**

- [](https://www.npmjs.com/package/@cyber-dash-tech/revela) [](LICENSE)
+ [](https://www.npmjs.com/package/@cyber-dash-tech/revela) [](LICENSE) [](tests/) [](https://opencode.ai) [](https://bun.sh)

  <p align="center">
    <img src="assets/img/logo.png" alt="Revela" width="800" />

@@ -19,7 +19,7 @@ Revela 是一个 [OpenCode](https://opencode.ai) 插件,可以把你当前使

  - 通过 `/revela enable` 向当前 agent 注入演示文稿专用 system prompt
  - prompt 由 3 层组成:核心 skill、当前 domain、当前 design
- - 支持工作区文档扫描,以及 `.pdf`、`.docx`、`.pptx`、`.xlsx`
+ - 支持工作区文档扫描,以及 `.pdf`、`.docx`、`.pptx`、`.xlsx` 的透明文本提取和嵌入素材缓存提取
  - agent 每次写入 `decks/*.html` 时自动执行布局 QA
  - 支持导出成 PDF 和可编辑 PPTX
  - design 和 domain 的切换都在本地完成,不消耗 LLM token

package/lib/agents/research-prompt.ts
CHANGED

@@ -40,7 +40,7 @@ files in the workspace (PDF, Word, Excel, PowerPoint, CSV, text).
  Then select the files relevant to your research axis.

  For every selected file, call **\`revela-extract-document-materials\`** first.
- - \`pptx\`, \`docx\`, and \`xlsx\` will produce a manifest plus extracted text and any available embedded materials
+ - \`pdf\`, \`pptx\`, \`docx\`, and \`xlsx\` will produce a manifest plus extracted text and any available embedded materials
  - unsupported file types will be skipped automatically

  After that, use the \`read\` tool on:

package/lib/document-materials/extract.ts
CHANGED

@@ -1,9 +1,12 @@
  import { createHash } from "crypto"
- import { existsSync, mkdirSync, readFileSync, statSync, writeFileSync } from "fs"
+ import { existsSync, mkdirSync, readFileSync, realpathSync, statSync, writeFileSync } from "fs"
  import { basename, dirname, extname, isAbsolute, join, relative, resolve } from "path"
  import { DOMParser } from "@xmldom/xmldom"
  import { unzipSync } from "fflate"
+ import { Jimp } from "jimp"
+ import { extractImages, getDocumentProxy } from "unpdf"
  import { extractDocx } from "../read-hooks/extractors/docx"
+ import { extractPdfText } from "../read-hooks/extractors/pdf"
  import { extractPptx } from "../read-hooks/extractors/pptx"
  import { extractXlsx } from "../read-hooks/extractors/xlsx"

@@ -14,14 +17,47 @@ export type DocumentMaterial = {
    note?: string
  }

+ export type SkippedAsset = {
+   source_ref: string
+   page_or_slide?: string
+   reason: "svg_asset" | "unmapped_media" | "low_value_asset"
+   kind?: "svg" | "icon" | "logo" | "overlay" | "decoration"
+ }
+
+ export type PptxSlideElement = {
+   id: string
+   kind: "text" | "image" | "shape"
+   zOrder: number
+   bbox?: { x: number; y: number; w: number; h: number }
+   likelyBackground?: boolean
+   likelyHeroImage?: boolean
+   likelyLogo?: boolean
+   likelyOverlayMask?: boolean
+   likelyDecoration?: boolean
+   text?: string
+   source_ref?: string
+   path?: string
+   asset_status?: "kept" | "skipped"
+   name?: string
+ }
+
+ export type PptxSlide = {
+   slide: string
+   width?: number
+   height?: number
+   elements: PptxSlideElement[]
+ }
+
  export type DocumentMaterialsResult = {
    status: "processed" | "skipped" | "failed"
    source: string
-   type: "pptx" | "docx" | "xlsx" | "other"
+   type: "pptx" | "docx" | "xlsx" | "pdf" | "other"
    cache_dir?: string
    manifest_path?: string
    text_path?: string
    images?: DocumentMaterial[]
+   skipped_assets?: SkippedAsset[]
+   slides?: PptxSlide[]
    tables?: DocumentMaterial[]
    reason?: string
  }
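The new manifest fields surface PPTX structure and skip decisions to downstream consumers. Below is a minimal sketch of reading them back from the cache, assuming the workspace-relative cache layout used later in this file; the local `Manifest` type is a trimmed-down stand-in for the package's own `CachedManifest`, not its real definition.

```ts
import { readFileSync } from "fs"
import { join } from "path"

// Trimmed-down stand-in for CachedManifest; only the fields used here.
type Manifest = {
  source: string
  skipped_assets: { source_ref: string; reason: string; kind?: string }[]
  slides: { slide: string; elements: { kind: string; likelyHeroImage?: boolean; path?: string }[] }[]
}

// List cached hero-image paths for one processed document.
function listHeroImages(workspaceDir: string, fingerprint: string): string[] {
  const manifestPath = join(workspaceDir, ".opencode", "revela", "doc-materials", fingerprint, "manifest.json")
  const manifest = JSON.parse(readFileSync(manifestPath, "utf-8")) as Manifest
  return manifest.slides.flatMap((slide) =>
    slide.elements
      .filter((el) => el.kind === "image" && el.likelyHeroImage && el.path)
      .map((el) => el.path!),
  )
}
```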
@@ -36,15 +72,25 @@ type CachedManifest = {
    manifest_path: string
    text_path: string
    images: DocumentMaterial[]
+   skipped_assets: SkippedAsset[]
+   slides: PptxSlide[]
    tables: DocumentMaterial[]
  }

+ type PptxImageExtraction = {
+   images: DocumentMaterial[]
+   skipped_assets: SkippedAsset[]
+ }
+
  const SUPPORTED_EXTENSIONS: Record<string, SupportedType> = {
    ".pptx": "pptx",
    ".docx": "docx",
    ".xlsx": "xlsx",
+   ".pdf": "pdf",
  }

+ type PdfImageData = Awaited<ReturnType<typeof extractImages>>[number]
+
  function normalizeZipTarget(basePath: string, target: string): string {
    const segments = join(dirname(basePath), target).split("/")
    const normalized: string[] = []
@@ -62,8 +108,11 @@ function normalizeZipTarget(basePath: string, target: string): string {
  }

  function ensureWorkspacePath(filePath: string, workspaceDir: string): string {
-   const resolvedWorkspace = resolve(workspaceDir)
-   const
+   const resolvedWorkspace = realpathSync(resolve(workspaceDir))
+   const candidate = isAbsolute(filePath) ? resolve(filePath) : resolve(workspaceDir, filePath)
+   const resolvedFile = existsSync(candidate)
+     ? realpathSync(candidate)
+     : candidate

    if (resolvedFile !== resolvedWorkspace && !resolvedFile.startsWith(resolvedWorkspace + "/")) {
      throw new Error("file must be within workspace")

@@ -72,8 +121,24 @@ function ensureWorkspacePath(filePath: string, workspaceDir: string): string {
    return resolvedFile
  }

+ function normalizeWorkspaceChild(filePath: string, workspaceDir: string): string {
+   const workspaceAlias = resolve(workspaceDir)
+   const workspaceReal = realpathSync(workspaceAlias)
+   const candidate = resolve(filePath)
+
+   if (existsSync(candidate)) return realpathSync(candidate)
+
+   if (candidate === workspaceAlias || candidate.startsWith(workspaceAlias + "/")) {
+     return join(workspaceReal, relative(workspaceAlias, candidate))
+   }
+
+   return candidate
+ }
+
  function workspaceRelative(filePath: string, workspaceDir: string): string {
-
+   const resolvedWorkspace = realpathSync(resolve(workspaceDir))
+   const resolvedFile = normalizeWorkspaceChild(filePath, workspaceDir)
+   return relative(resolvedWorkspace, resolvedFile).replace(/\\/g, "/")
  }

  function buildFingerprint(filePath: string): string {
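The switch to realpathSync matters when the workspace path itself goes through a symlink (for example /tmp on macOS): comparing plain resolve() results would reject files that really live inside the workspace. A standalone sketch of the same containment check, assuming nothing beyond Node's fs and path modules:

```ts
import { existsSync, realpathSync } from "fs"
import { isAbsolute, resolve } from "path"

// Mirrors ensureWorkspacePath's check: resolve symlinks on both sides,
// then require the file to be the workspace root or live underneath it.
function isInsideWorkspace(filePath: string, workspaceDir: string): boolean {
  const workspace = realpathSync(resolve(workspaceDir))
  const candidate = isAbsolute(filePath) ? resolve(filePath) : resolve(workspaceDir, filePath)
  const file = existsSync(candidate) ? realpathSync(candidate) : candidate
  return file === workspace || file.startsWith(workspace + "/")
}
```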
@@ -92,18 +157,303 @@ function materialPath(cacheDir: string, workspaceDir: string, ...segments: strin
    return workspaceRelative(join(cacheDir, ...segments), workspaceDir)
  }

+ function toRgbaBuffer(image: PdfImageData): Buffer {
+   const pixelCount = image.width * image.height
+
+   if (image.channels === 4) {
+     return Buffer.from(image.data.buffer, image.data.byteOffset, image.data.byteLength)
+   }
+
+   const rgba = Buffer.alloc(pixelCount * 4)
+
+   for (let i = 0; i < pixelCount; i++) {
+     const dest = i * 4
+     if (image.channels === 3) {
+       const src = i * 3
+       rgba[dest] = image.data[src]!
+       rgba[dest + 1] = image.data[src + 1]!
+       rgba[dest + 2] = image.data[src + 2]!
+       rgba[dest + 3] = 255
+       continue
+     }
+
+     const value = image.data[i]!
+     rgba[dest] = value
+     rgba[dest + 1] = value
+     rgba[dest + 2] = value
+     rgba[dest + 3] = 255
+   }
+
+   return rgba
+ }
+
+ async function encodePdfImageAsPng(image: PdfImageData): Promise<Buffer> {
+   const bitmap = {
+     data: toRgbaBuffer(image),
+     width: image.width,
+     height: image.height,
+   }
+
+   const png = Jimp.fromBitmap(bitmap)
+   return await png.getBuffer("image/png")
+ }
+
  function parseXml(files: Record<string, Uint8Array>, path: string): any | null {
    const file = files[path]
    if (!file) return null
    return new DOMParser().parseFromString(new TextDecoder().decode(file), "text/xml")
  }

- function
+ function xmlLocalName(node: any): string {
+   return node?.localName ?? String(node?.nodeName ?? "").split(":").pop() ?? ""
+ }
+
+ function xmlElementChildren(node: any): any[] {
+   const children: any[] = []
+   const childNodes = node?.childNodes ?? []
+   for (let i = 0; i < childNodes.length; i++) {
+     const child = childNodes[i]
+     if (child?.nodeType === 1) children.push(child)
+   }
+   return children
+ }
+
+ function xmlDescendantsByLocalName(node: any, name: string): any[] {
+   const matches: any[] = []
+   const walk = (current: any) => {
+     for (const child of xmlElementChildren(current)) {
+       if (xmlLocalName(child) === name) matches.push(child)
+       walk(child)
+     }
+   }
+   walk(node)
+   return matches
+ }
+
+ function firstDescendantByLocalName(node: any, name: string): any | null {
+   const [match] = xmlDescendantsByLocalName(node, name)
+   return match ?? null
+ }
+
+ function extractShapeText(node: any): string | undefined {
+   const texts = xmlDescendantsByLocalName(node, "t")
+     .map((textNode) => textNode.textContent?.trim())
+     .filter(Boolean)
+   return texts.length > 0 ? texts.join("\n") : undefined
+ }
+
+ function extractElementName(node: any): string | undefined {
+   return firstDescendantByLocalName(node, "cNvPr")?.getAttribute?.("name") || undefined
+ }
+
+ function parseCoordinate(value: string | null | undefined): number | undefined {
+   if (value == null || value === "") return undefined
+   const parsed = Number(value)
+   return Number.isFinite(parsed) ? parsed : undefined
+ }
+
+ function extractElementBBox(node: any): { x: number; y: number; w: number; h: number } | undefined {
+   const xfrm = firstDescendantByLocalName(node, "xfrm")
+   if (!xfrm) return undefined
+
+   const off = firstDescendantByLocalName(xfrm, "off")
+   const ext = firstDescendantByLocalName(xfrm, "ext")
+   if (!off || !ext) return undefined
+
+   const x = parseCoordinate(off.getAttribute?.("x"))
+   const y = parseCoordinate(off.getAttribute?.("y"))
+   const w = parseCoordinate(ext.getAttribute?.("cx"))
+   const h = parseCoordinate(ext.getAttribute?.("cy"))
+   if ([x, y, w, h].some((value) => value == null)) return undefined
+
+   return { x: x!, y: y!, w: w!, h: h! }
+ }
+
+ function getPptxSlideSize(files: Record<string, Uint8Array>): { width: number; height: number } | undefined {
+   const doc = parseXml(files, "ppt/presentation.xml")
+   const size = firstDescendantByLocalName(doc, "sldSz")
+   if (!size) return undefined
+
+   const width = parseCoordinate(size.getAttribute?.("cx"))
+   const height = parseCoordinate(size.getAttribute?.("cy"))
+   if (width == null || height == null) return undefined
+   return { width, height }
+ }
+
+ function isNearCorner(
+   bbox: { x: number; y: number; w: number; h: number },
+   slideWidth: number,
+   slideHeight: number,
+ ): boolean {
+   const thresholdX = slideWidth * 0.12
+   const thresholdY = slideHeight * 0.12
+   const right = bbox.x + bbox.w
+   const bottom = bbox.y + bbox.h
+   return (
+     (bbox.x <= thresholdX && bbox.y <= thresholdY) ||
+     (right >= slideWidth - thresholdX && bbox.y <= thresholdY) ||
+     (bbox.x <= thresholdX && bottom >= slideHeight - thresholdY) ||
+     (right >= slideWidth - thresholdX && bottom >= slideHeight - thresholdY)
+   )
+ }
+
+ function applyPptxHeuristics(
+   slide: PptxSlide,
+   slideWidth: number | undefined,
+   slideHeight: number | undefined,
+ ): PptxSlide {
+   if (!slideWidth || !slideHeight) return slide
+
+   const slideArea = slideWidth * slideHeight
+   slide.elements = slide.elements.map((element) => {
+     if (!element.bbox) return element
+
+     const areaRatio = (element.bbox.w * element.bbox.h) / slideArea
+     const sourceName = `${element.source_ref ?? ""} ${element.name ?? ""}`.toLowerCase()
+
+     if (element.kind === "image") {
+       const flags: Partial<PptxSlideElement> = {}
+       if (areaRatio >= 0.75 && element.asset_status === "kept") flags.likelyBackground = true
+       else if (areaRatio >= 0.2 && element.asset_status === "kept") flags.likelyHeroImage = true
+       if (areaRatio <= 0.03 && isNearCorner(element.bbox, slideWidth, slideHeight)) flags.likelyLogo = true
+       if (/(logo|brand)/.test(sourceName)) flags.likelyLogo = true
+       if (/(mask|overlay|shadow)/.test(sourceName) || element.asset_status === "skipped") flags.likelyOverlayMask = true
+       if (/(arrow|ornament|decoration)/.test(sourceName)) flags.likelyDecoration = true
+       return Object.keys(flags).length > 0 ? { ...element, ...flags } : element
+     }
+
+     if (element.kind === "shape") {
+       const flags: Partial<PptxSlideElement> = {}
+       if (areaRatio >= 0.4) flags.likelyOverlayMask = true
+       if (areaRatio <= 0.03 || /(arrow|ornament|decoration)/.test(sourceName)) flags.likelyDecoration = true
+       return Object.keys(flags).length > 0 ? { ...element, ...flags } : element
+     }

+     return element
+   })
+
+   return slide
+ }
+
+ function getSlideMediaTargets(files: Record<string, Uint8Array>, slidePath: string): Map<string, string> {
+   const relPath = slidePath.replace("/slides/", "/slides/_rels/") + ".rels"
+   const doc = parseXml(files, relPath)
+   const targets = new Map<string, string>()
+   if (!doc) return targets
+
+   const relationships = doc.getElementsByTagName("Relationship")
+   for (let i = 0; i < relationships.length; i++) {
+     const rel = relationships[i]
+     const id = rel.getAttribute("Id")
+     const target = rel.getAttribute("Target")
+     if (!id || !target) continue
+     const normalized = normalizeZipTarget(slidePath, target)
+     if (!normalized.startsWith("ppt/media/")) continue
+     targets.set(id, normalized)
+   }
+
+   return targets
+ }
+
+ function extractPptxSlides(
+   files: Record<string, Uint8Array>,
+   images: DocumentMaterial[],
+   skippedAssets: SkippedAsset[],
+ ): PptxSlide[] {
+   const slideSize = getPptxSlideSize(files)
+   const keptBySource = new Map(images.map((image) => [image.source_ref, image]))
+   const skippedBySource = new Map(skippedAssets.map((asset) => [asset.source_ref, asset]))
+   const slideFiles = Object.keys(files)
+     .filter((file) => /^ppt\/slides\/slide\d+\.xml$/.test(file))
+     .sort((a, b) => a.localeCompare(b, undefined, { numeric: true }))
+
+   return slideFiles.map((slidePath) => {
+     const slideNumber = slidePath.match(/slide(\d+)\.xml$/)?.[1] ?? "0"
+     const slideId = `slide-${slideNumber.padStart(2, "0")}`
+     const doc = parseXml(files, slidePath)
+     const mediaTargets = getSlideMediaTargets(files, slidePath)
+     const elements: PptxSlideElement[] = []
+
+     if (!doc) return { slide: slideId, ...(slideSize ?? {}), elements }
+
+     const spTree = firstDescendantByLocalName(doc, "spTree")
+     if (!spTree) return { slide: slideId, ...(slideSize ?? {}), elements }
+
+     for (const node of xmlElementChildren(spTree)) {
+       const kind = xmlLocalName(node)
+       if (kind === "nvGrpSpPr" || kind === "grpSpPr") continue
+
+       const zOrder = elements.length + 1
+       const id = `${slideId}-element-${String(zOrder).padStart(2, "0")}`
+       const name = extractElementName(node)
+       const bbox = extractElementBBox(node)
+
+       if (kind === "sp") {
+         const text = extractShapeText(node)
+         elements.push(text
+           ? { id, kind: "text", zOrder, text, ...(bbox ? { bbox } : {}), ...(name ? { name } : {}) }
+           : { id, kind: "shape", zOrder, ...(bbox ? { bbox } : {}), ...(name ? { name } : {}) })
+         continue
+       }
+
+       if (kind === "pic") {
+         const blip = firstDescendantByLocalName(node, "blip")
+         const rid = blip?.getAttribute?.("r:embed") || blip?.getAttribute?.("embed") || undefined
+         const sourceRef = rid ? mediaTargets.get(rid) : undefined
+         const kept = sourceRef ? keptBySource.get(sourceRef) : undefined
+         const skipped = sourceRef ? skippedBySource.get(sourceRef) : undefined
+
+         elements.push({
+           id,
+           kind: "image",
+           zOrder,
+           ...(bbox ? { bbox } : {}),
+           ...(name ? { name } : {}),
+           ...(sourceRef ? { source_ref: sourceRef } : {}),
+           ...(kept?.path ? { path: kept.path } : {}),
+           ...((kept || skipped) ? { asset_status: kept ? "kept" as const : "skipped" as const } : {}),
+         })
+         continue
+       }
+
+       if (kind === "cxnSp" || kind === "graphicFrame" || kind === "grpSp") {
+         elements.push({ id, kind: "shape", zOrder, ...(bbox ? { bbox } : {}), ...(name ? { name } : {}) })
+       }
+     }
+
+     return applyPptxHeuristics({ slide: slideId, ...(slideSize ?? {}), elements }, slideSize?.width, slideSize?.height)
+   })
+ }
+
+ const LOW_VALUE_PPTX_ASSET = /(icon|logo|mask|overlay|shadow|decoration|ornament|arrow)/i
+
+ function classifySkippedAsset(sourceRef: string, reason: SkippedAsset["reason"]): SkippedAsset["kind"] | undefined {
+   if (sourceRef.endsWith(".svg")) return "svg"
+   if (/icon/i.test(sourceRef)) return "icon"
+   if (/logo/i.test(sourceRef)) return "logo"
+   if (/(mask|overlay|shadow)/i.test(sourceRef)) return "overlay"
+   if (/(decoration|ornament|arrow)/i.test(sourceRef)) return "decoration"
+   if (reason === "svg_asset") return "svg"
+   return undefined
+ }
+
+ function shouldSkipPptxAsset(sourceRef: string): { reason: SkippedAsset["reason"]; kind?: SkippedAsset["kind"] } | null {
+   if (sourceRef.endsWith(".svg")) {
+     return { reason: "svg_asset", kind: "svg" }
+   }
+   if (LOW_VALUE_PPTX_ASSET.test(basename(sourceRef))) {
+     return { reason: "low_value_asset", kind: classifySkippedAsset(sourceRef, "low_value_asset") }
+   }
+   return null
+ }
+
+ function extractPptxImages(files: Record<string, Uint8Array>, cacheDir: string, workspaceDir: string): PptxImageExtraction {
    const relFiles = Object.keys(files)
      .filter((file) => /^ppt\/slides\/_rels\/slide\d+\.xml\.rels$/.test(file))
      .sort((a, b) => a.localeCompare(b, undefined, { numeric: true }))

    const images: DocumentMaterial[] = []
+   const skipped_assets: SkippedAsset[] = []
    const seenTargets = new Set<string>()

    for (const relPath of relFiles) {
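The heuristics above work purely on bounding-box area ratios and file names. A small sketch of the thresholds in action; the slide size (12192000 x 6858000 EMU, a common 16:9 size) and the two element boxes are assumed example values, while the 0.75 / 0.03 / 12% numbers are the ones used in applyPptxHeuristics and isNearCorner.

```ts
const slideWidth = 12192000
const slideHeight = 6858000
const slideArea = slideWidth * slideHeight

// Full-bleed picture: area ratio 1.0 >= 0.75, so it would be tagged likelyBackground.
const background = { w: 12192000, h: 6858000 }
console.log((background.w * background.h) / slideArea >= 0.75) // true

// Small picture in the top-right corner: ratio ~0.004 <= 0.03 and inside the 12%
// corner margin, so it would be tagged likelyLogo.
const logo = { x: 11200000, y: 200000, w: 800000, h: 400000 }
console.log((logo.w * logo.h) / slideArea <= 0.03) // true
console.log(logo.x + logo.w >= slideWidth * 0.88 && logo.y <= slideHeight * 0.12) // true
```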
@@ -124,8 +474,19 @@ function extractPptxImages(files: Record<string, Uint8Array>, cacheDir: string,
        const media = files[normalized]
        if (!media) continue

-       imageIndex += 1
        seenTargets.add(normalized)
+       const skipped = shouldSkipPptxAsset(normalized)
+       if (skipped) {
+         skipped_assets.push({
+           source_ref: normalized,
+           page_or_slide: `slide-${slideNumber.padStart(2, "0")}`,
+           reason: skipped.reason,
+           kind: skipped.kind,
+         })
+         continue
+       }
+
+       imageIndex += 1
        const exportedName = `slide-${slideNumber.padStart(2, "0")}-image-${String(imageIndex).padStart(2, "0")}${extname(normalized)}`
        const outputPath = join(cacheDir, "images", exportedName)
        writeCachedBuffer(outputPath, media)

@@ -143,18 +504,14 @@ function extractPptxImages(files: Record<string, Uint8Array>, cacheDir: string,
      .sort()

    for (const mediaPath of remainingMedia) {
-
-     const outputPath = join(cacheDir, "images", exportedName)
-     writeCachedBuffer(outputPath, files[mediaPath])
-
-     images.push({
-       path: materialPath(cacheDir, workspaceDir, "images", exportedName),
+     skipped_assets.push({
        source_ref: mediaPath,
-
+       reason: "unmapped_media",
+       kind: classifySkippedAsset(mediaPath, "unmapped_media"),
      })
    }

-   return images
+   return { images, skipped_assets }
  }

  function extractDocxImages(files: Record<string, Uint8Array>, cacheDir: string, workspaceDir: string): DocumentMaterial[] {
@@ -279,6 +636,94 @@ function extractTables(type: SupportedType, textPath: string): DocumentMaterial[
    return [{ path: textPath, source_ref: "workbook", note: "Sheet text and tables extracted to text file" }]
  }

+ async function extractPdfImages(buf: Buffer, cacheDir: string, workspaceDir: string): Promise<DocumentMaterial[]> {
+   const pdf = await getDocumentProxy(new Uint8Array(buf))
+   const images: DocumentMaterial[] = []
+
+   for (let pageNumber = 1; pageNumber <= pdf.numPages; pageNumber++) {
+     const extracted = await extractImages(pdf, pageNumber)
+
+     for (let index = 0; index < extracted.length; index++) {
+       const image = extracted[index]!
+       const exportedName = `page-${String(pageNumber).padStart(2, "0")}-image-${String(index + 1).padStart(2, "0")}.png`
+       const outputPath = join(cacheDir, "images", exportedName)
+       const png = await encodePdfImageAsPng(image)
+       writeFileSync(outputPath, new Uint8Array(png))
+
+       images.push({
+         path: materialPath(cacheDir, workspaceDir, "images", exportedName),
+         source_ref: `pdf/page-${String(pageNumber).padStart(2, "0")}/${image.key}`,
+         page_or_slide: `page-${String(pageNumber).padStart(2, "0")}`,
+         note: `Embedded PDF image (${image.width}x${image.height}, ${image.channels} channel${image.channels === 1 ? "" : "s"})`,
+       })
+     }
+   }
+
+   return images
+ }
+
+ async function processPdfFile(filePath: string, workspaceDir: string): Promise<DocumentMaterialsResult> {
+   const relativeSource = workspaceRelative(filePath, workspaceDir)
+   const fingerprint = buildFingerprint(filePath)
+   const cacheDir = join(workspaceDir, ".opencode", "revela", "doc-materials", fingerprint)
+   const manifestPath = join(cacheDir, "manifest.json")
+
+   if (existsSync(manifestPath)) {
+     const manifest = JSON.parse(readFileSync(manifestPath, "utf-8")) as CachedManifest
+     return {
+       status: "processed",
+       source: manifest.source,
+       type: manifest.type,
+       cache_dir: manifest.cache_dir,
+       manifest_path: manifest.manifest_path,
+       text_path: manifest.text_path,
+       images: manifest.images,
+       skipped_assets: manifest.skipped_assets,
+       slides: manifest.slides,
+       tables: manifest.tables,
+     }
+   }
+
+   mkdirSync(join(cacheDir, "images"), { recursive: true })
+   mkdirSync(join(cacheDir, "tables"), { recursive: true })
+
+   const buf = readFileSync(filePath)
+   const text = await extractPdfText(buf)
+   const textPath = join(cacheDir, "text.txt")
+   writeFileSync(textPath, `[Extracted from: ${basename(filePath)}]\n\n${text}`, "utf-8")
+
+   const images = await extractPdfImages(buf, cacheDir, workspaceDir)
+
+   const result: DocumentMaterialsResult = {
+     status: "processed",
+     source: relativeSource,
+     type: "pdf",
+     cache_dir: workspaceRelative(cacheDir, workspaceDir),
+     manifest_path: workspaceRelative(manifestPath, workspaceDir),
+     text_path: workspaceRelative(textPath, workspaceDir),
+     images,
+     skipped_assets: [],
+     slides: [],
+     tables: [],
+   }
+
+   const manifest: CachedManifest = {
+     source: result.source,
+     type: "pdf",
+     fingerprint,
+     cache_dir: result.cache_dir!,
+     manifest_path: result.manifest_path!,
+     text_path: result.text_path!,
+     images: result.images ?? [],
+     skipped_assets: [],
+     slides: [],
+     tables: [],
+   }
+
+   writeFileSync(manifestPath, JSON.stringify(manifest, null, 2), "utf-8")
+   return result
+ }
+
  async function processOfficeFile(filePath: string, workspaceDir: string, type: SupportedType): Promise<DocumentMaterialsResult> {
    const relativeSource = workspaceRelative(filePath, workspaceDir)
    const fingerprint = buildFingerprint(filePath)
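A sketch of what the new PDF branch produces when called from inside the package; the document path is a made-up example and the import is shown relative to the package root.

```ts
import { extractDocumentMaterials } from "./lib/document-materials/extract"

async function demo() {
  // Text goes to <cache>/text.txt; each embedded image is re-encoded to PNG via Jimp
  // and listed with a page-NN source reference.
  const result = await extractDocumentMaterials("docs/report.pdf", process.cwd())
  if (result.status === "processed") {
    console.log(result.text_path)
    for (const image of result.images ?? []) console.log(image.path, image.note)
  }
}
```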
@@ -295,6 +740,8 @@ async function processOfficeFile(filePath: string, workspaceDir: string, type: S
      manifest_path: manifest.manifest_path,
      text_path: manifest.text_path,
      images: manifest.images,
+     skipped_assets: manifest.skipped_assets,
+     slides: manifest.slides,
      tables: manifest.tables,
    }
  }

@@ -314,11 +761,17 @@ async function processOfficeFile(filePath: string, workspaceDir: string, type: S
    const textPath = join(cacheDir, "text.txt")
    writeFileSync(textPath, `[Extracted from: ${basename(filePath)}]\n\n${text}`, "utf-8")

-   const
+   const pptxAssets = type === "pptx"
      ? extractPptxImages(files, cacheDir, workspaceDir)
+     : null
+   const images = type === "pptx"
+     ? pptxAssets!.images
      : type === "docx"
        ? extractDocxImages(files, cacheDir, workspaceDir)
        : extractXlsxImages(files, cacheDir, workspaceDir)
+   const slides = type === "pptx"
+     ? extractPptxSlides(files, images, pptxAssets!.skipped_assets)
+     : undefined

    const result: DocumentMaterialsResult = {
      status: "processed",

@@ -328,6 +781,8 @@ async function processOfficeFile(filePath: string, workspaceDir: string, type: S
      manifest_path: workspaceRelative(manifestPath, workspaceDir),
      text_path: workspaceRelative(textPath, workspaceDir),
      images,
+     skipped_assets: pptxAssets?.skipped_assets ?? [],
+     slides,
      tables: extractTables(type, workspaceRelative(textPath, workspaceDir)),
    }

@@ -339,6 +794,8 @@ async function processOfficeFile(filePath: string, workspaceDir: string, type: S
      manifest_path: result.manifest_path!,
      text_path: result.text_path!,
      images: result.images ?? [],
+     skipped_assets: result.skipped_assets ?? [],
+     slides: result.slides ?? [],
      tables: result.tables ?? [],
    }

@@ -361,7 +818,9 @@ export async function extractDocumentMaterials(filePath: string, workspaceDir: s
      }
    }

-   return
+   return type === "pdf"
+     ? await processPdfFile(resolvedFile, workspaceDir)
+     : await processOfficeFile(resolvedFile, workspaceDir, type)
  } catch (e) {
    return {
      status: "failed",

package/lib/read-hooks/dispatch.ts
ADDED

@@ -0,0 +1,45 @@
+ import { basename, extname } from "path"
+ export const OFFICE_EXTENSIONS = new Set([".docx", ".pptx", ".xlsx"])
+ export const IMAGE_EXTENSIONS = new Set([".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif", ".webp", ".gif"])
+
+ export type ReadStrategy =
+   | "before-materialize-document"
+   | "after-extract-text"
+   | "after-compress-image"
+   | "passthrough"
+
+ export function classifyReadFile(filePath: string): ReadStrategy {
+   const ext = extname(filePath).toLowerCase()
+   if (OFFICE_EXTENSIONS.has(ext)) return "before-materialize-document"
+   if (ext === ".pdf") return "after-extract-text"
+   if (IMAGE_EXTENSIONS.has(ext)) return "after-compress-image"
+   return "passthrough"
+ }
+
+ export function formatExtractedText(filePath: string, text: string): string {
+   return `[Extracted from: ${basename(filePath)}]\n\n${text}`
+ }
+
+ export function buildOfficeReadView(
+   filePath: string,
+   text: string,
+   images: Array<{ path: string }> | undefined,
+ ): string {
+   const lines = [
+     `# Extracted from: ${basename(filePath)}`,
+     "",
+     "## Text",
+     "",
+     text.trim() || "No text extracted.",
+   ]
+
+   lines.push("", "## Images", "")
+
+   if (!images?.length) {
+     lines.push("- None")
+   } else {
+     for (const image of images) lines.push(`- ${image.path}`)
+   }
+
+   return lines.join("\n")
+ }
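dispatch.ts is the shared routing table for both read hooks and the chat.message handler. A short usage sketch (file names are illustrative):

```ts
import { classifyReadFile, buildOfficeReadView } from "./lib/read-hooks/dispatch"

classifyReadFile("decks/q3.pptx")   // "before-materialize-document" -> handled by preRead
classifyReadFile("notes/brief.pdf") // "after-extract-text"          -> handled by postRead
classifyReadFile("assets/hero.png") // "after-compress-image"        -> handled by postRead
classifyReadFile("README.md")       // "passthrough"                 -> untouched

// Render the markdown view the read tool will ultimately see.
const view = buildOfficeReadView("brief.docx", "Quarterly summary...", [
  { path: ".opencode/revela/doc-materials/abc123/images/image-01.png" },
])
```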
package/lib/read-hooks/index.ts
CHANGED

@@ -4,7 +4,7 @@
   * Entry point for the read-hooks module.
   * Exports preRead and postRead for use in plugins/revela.ts hook handlers.
   *
-  * preRead → tool.execute.before:
+  * preRead → tool.execute.before: materialize Office docs and redirect to temp markdown
   * postRead → tool.execute.after: transform PDF/image attachments before LLM sees them
   */

package/lib/read-hooks/office-read-view.ts
ADDED

@@ -0,0 +1,77 @@
+ import { readFileSync } from "fs"
+ import { join } from "path"
+ import type { PptxSlide } from "../document-materials/extract"
+ import { extractDocumentMaterials } from "../document-materials/extract"
+ import { buildOfficeReadView } from "./dispatch"
+ import { extractDocx } from "./extractors/docx"
+ import { extractPptx } from "./extractors/pptx"
+ import { extractXlsx } from "./extractors/xlsx"
+ import { formatExtractedText } from "./dispatch"
+
+ const HANDLERS: Record<string, (buf: Buffer) => Promise<string>> = {
+   ".docx": extractDocx,
+   ".pptx": extractPptx,
+   ".xlsx": extractXlsx,
+ }
+
+ function buildPptxStructureHints(slides: PptxSlide[] | undefined): string {
+   if (!slides?.length) return ""
+
+   const lines = ["", "## Slide Structure", ""]
+   for (const slide of slides) {
+     const textCount = slide.elements.filter((element) => element.kind === "text").length
+     const keptImageCount = slide.elements.filter((element) => element.kind === "image" && element.asset_status === "kept").length
+     const skippedImageCount = slide.elements.filter((element) => element.kind === "image" && element.asset_status === "skipped").length
+     const shapeCount = slide.elements.filter((element) => element.kind === "shape").length
+     const summary = [
+       textCount > 0 ? `${textCount} text` : null,
+       keptImageCount > 0 ? `${keptImageCount} kept image` : null,
+       skippedImageCount > 0 ? `${skippedImageCount} skipped image` : null,
+       shapeCount > 0 ? `${shapeCount} shape` : null,
+     ].filter(Boolean).join(", ") || "no parsed elements"
+     lines.push(`- ${slide.slide}: ${summary}`)
+
+     const roleSummary = [
+       countRole(slide, (element) => element.likelyBackground, "background image"),
+       countRole(slide, (element) => element.likelyHeroImage, "hero image"),
+       countRole(slide, (element) => element.likelyLogo, "logo"),
+       countRole(slide, (element) => element.likelyOverlayMask, "overlay"),
+       countRole(slide, (element) => element.likelyDecoration, "decoration"),
+     ].filter(Boolean).join(", ")
+     if (roleSummary) lines.push(` likely roles: ${roleSummary}`)
+   }
+
+   return lines.join("\n")
+ }
+
+ function countRole(
+   slide: PptxSlide,
+   predicate: (element: PptxSlide["elements"][number]) => boolean | undefined,
+   label: string,
+ ): string | null {
+   const count = slide.elements.filter(predicate).length
+   if (count === 0) return null
+   return `${count} ${label}${count === 1 ? "" : "s"}`
+ }
+
+ export async function createOfficeReadView(filePath: string, workspaceDir: string): Promise<string> {
+   const ext = filePath.slice(filePath.lastIndexOf(".")).toLowerCase()
+   const handler = HANDLERS[ext]
+   if (!handler) throw new Error(`unsupported office file type: ${ext}`)
+
+   const materialized = await extractDocumentMaterials(filePath, workspaceDir)
+
+   if (materialized.status === "processed" && materialized.text_path) {
+     const textPath = join(workspaceDir, materialized.text_path)
+     const extracted = readFileSync(textPath, "utf-8")
+     const text = extracted.replace(/^\[Extracted from: .*?\]\n\n/, "")
+     const view = buildOfficeReadView(filePath, text, materialized.images)
+     return filePath.toLowerCase().endsWith(".pptx")
+       ? view + buildPptxStructureHints(materialized.slides)
+       : view
+   }
+
+   const buf = readFileSync(filePath)
+   const text = await handler(buf)
+   return formatExtractedText(filePath, text)
+ }
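A usage sketch of the read view for a PPTX, assuming a deck at an example path; the appended Slide Structure section is built from the slides array in the manifest.

```ts
import { createOfficeReadView } from "./lib/read-hooks/office-read-view"

async function demo() {
  const view = await createOfficeReadView("decks/pitch.pptx", process.cwd())
  // The view ends with lines roughly like:
  //   ## Slide Structure
  //   - slide-01: 3 text, 1 kept image, 2 shape
  //    likely roles: 1 background image, 1 logo
  console.log(view)
}
```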
package/lib/read-hooks/post-read.ts
CHANGED

@@ -16,11 +16,10 @@
   * of packages/opencode/src/session/prompt.ts.
   */

- import {
+ import { basename } from "path"
  import { extractPdfText } from "./extractors/pdf"
  import { compressImage } from "./image/compress"
-
- const IMAGE_EXTS = new Set([".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif", ".webp", ".gif"])
+ import { classifyReadFile, formatExtractedText } from "./dispatch"

  interface ReadOutput {
    title: string

@@ -41,10 +40,10 @@ export async function postRead(
  ): Promise<void> {
    if (!output.attachments?.length) return

-   const
+   const strategy = classifyReadFile(args.filePath)

    // ── PDF: extract text, drop base64 attachment ───────────────────────────
-   if (
+   if (strategy === "after-extract-text") {
      const attachment = output.attachments[0]
      const base64 = attachment.url.split(",")[1]
      if (!base64) return

@@ -52,14 +51,14 @@
      const buf = Buffer.from(base64, "base64")
      const text = await extractPdfText(buf)

-     output.output =
+     output.output = formatExtractedText(args.filePath, text)
      output.title = `Extracted text from ${basename(args.filePath)}`
      output.attachments.length = 0 // Remove base64 — saves significant tokens
      return
    }

    // ── Images: compress attachment to reduce token cost ────────────────────
-   if (
+   if (strategy === "after-compress-image") {
      const attachment = output.attachments[0]
      const base64 = attachment.url.split(",")[1]
      if (!base64) return

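For PDFs the attachment arrives as a data: URL, so postRead only has to split off the base64 payload before extraction. A minimal sketch of that step; the payload below is a truncated placeholder.

```ts
// Same split postRead performs before calling extractPdfText.
const attachment = { url: "data:application/pdf;base64,JVBERi0xLjQK..." }
const base64 = attachment.url.split(",")[1]
if (base64) {
  const buf = Buffer.from(base64, "base64")
  console.log(buf.subarray(0, 5).toString()) // "%PDF-" for a real document
}
```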
package/lib/read-hooks/pre-read.ts
CHANGED

@@ -7,44 +7,33 @@
   * Handles DOCX, PPTX, XLSX — formats that cause read tool to throw
   * Effect.fail("Cannot read binary file"), so the after-hook never fires.
   *
-  * Strategy:
+  * Strategy: materialize the document into cached text + images, render a
+  * markdown read view, then redirect args.filePath to that temp .md file.
   * The read tool then reads the temp file normally. LLM is unaware of the redirect.
   */

- import {
- import {
+ import { writeFileSync } from "fs"
+ import { join } from "path"
  import { tmpdir } from "os"
  import { randomUUID } from "crypto"
- import {
- import {
- import { extractXlsx } from "./extractors/xlsx"
-
- // Extension → extractor function mapping
- const HANDLERS: Record<string, (buf: Buffer) => Promise<string>> = {
-   ".docx": extractDocx,
-   ".pptx": extractPptx,
-   ".xlsx": extractXlsx,
- }
+ import { classifyReadFile } from "./dispatch"
+ import { createOfficeReadView } from "./office-read-view"

  /**
   * Intercept read tool args before execution.
-  * If the file is a supported
-  * args.filePath to a
+  * If the file is a supported Office document, materialize it into cached
+  * text + images and redirect args.filePath to a temporary markdown read view.
   *
   * @param args - Mutable read tool args object (from output.args in before-hook)
   */
  export async function preRead(args: { filePath: string; [k: string]: any }): Promise<void> {
-
-   const handler = HANDLERS[ext]
-   if (!handler) return // Not a handled format — let read tool proceed normally
+   if (classifyReadFile(args.filePath) !== "before-materialize-document") return

-   const
-   const
+   const workspaceDir = process.cwd()
+   const output = await createOfficeReadView(args.filePath, workspaceDir)

-
-
-   const tmpPath = join(tmpdir(), `revela-${randomUUID()}.txt`)
-   writeFileSync(tmpPath, header + text, "utf-8")
+   const tmpPath = join(tmpdir(), `revela-${randomUUID()}.md`)
+   writeFileSync(tmpPath, output, "utf-8")

    // Redirect read tool to the temp file
    args.filePath = tmpPath

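End to end, the hook mutates the read tool's arguments in place; everything else in the read pipeline stays unchanged. A sketch with an assumed example path:

```ts
import { preRead } from "./lib/read-hooks"

async function demo() {
  const args = { filePath: "reports/roadmap.docx" }
  await preRead(args)
  // args.filePath now points at something like /tmp/revela-<uuid>.md containing the
  // "# Extracted from: roadmap.docx" view; non-Office paths are left untouched.
  console.log(args.filePath)
}
```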
package/package.json
CHANGED
package/plugin.ts
CHANGED

@@ -25,10 +25,9 @@ import { ACTIVE_PROMPT_FILE } from "./lib/config"
  import { ctx } from "./lib/ctx"
  import { preRead } from "./lib/read-hooks"
  import { postRead } from "./lib/read-hooks"
- import { extractDocx } from "./lib/read-hooks/extractors/docx"
- import { extractPptx } from "./lib/read-hooks/extractors/pptx"
- import { extractXlsx } from "./lib/read-hooks/extractors/xlsx"
  import { extractPdfText } from "./lib/read-hooks/extractors/pdf"
+ import { createOfficeReadView } from "./lib/read-hooks/office-read-view"
+ import { OFFICE_EXTENSIONS, IMAGE_EXTENSIONS, formatExtractedText } from "./lib/read-hooks/dispatch"
  import { handleHelp } from "./lib/commands/help"
  import { handleEnable } from "./lib/commands/enable"
  import { handleDisable } from "./lib/commands/disable"

@@ -241,19 +240,11 @@ const server: Plugin = (async (pluginCtx) => {
    // directly — the read tool is never called, so tool.execute.before/after
    // hooks don't fire. This hook intercepts FileParts before LLM sees them.
    //
-   // DOCX/PPTX/XLSX/PDF → extract text → replace with TextPart
+   // DOCX/PPTX/XLSX/PDF → extract text/read view → replace with TextPart
    // Images → replace with TextPart hint (LLM can use read tool)
    "chat.message": async (input, output) => {
      if (!ctx.enabled) return

-     const IMAGE_EXTS = new Set([".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif", ".webp", ".gif"])
-     const DOC_HANDLERS: Record<string, (buf: Buffer) => Promise<string>> = {
-       ".docx": extractDocx,
-       ".pptx": extractPptx,
-       ".xlsx": extractXlsx,
-       ".pdf": extractPdfText,
-     }
-
      for (let i = 0; i < output.parts.length; i++) {
        const part = output.parts[i] as any
        if (part.type !== "file") continue

@@ -264,15 +255,22 @@
        const name = basename(filePath)

        try {
-         if (
+         if (OFFICE_EXTENSIONS.has(ext)) {
+           const text = await createOfficeReadView(filePath, process.cwd())
+           output.parts[i] = {
+             ...part,
+             type: "text",
+             text,
+           } as any
+         } else if (ext === ".pdf") {
            const buf = readFileSync(filePath)
-           const text = await
+           const text = await extractPdfText(buf)
            output.parts[i] = {
              ...part,
              type: "text",
-             text:
+             text: formatExtractedText(filePath, text),
            } as any
-         } else if (
+         } else if (IMAGE_EXTENSIONS.has(ext)) {
            output.parts[i] = {
              ...part,
              type: "text",

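The chat.message branch swaps a file part for a text part while keeping the rest of the part object intact. A rough sketch of the replacement shape; the part object here is a simplified stand-in for OpenCode's FilePart, not its real type.

```ts
const part: any = { type: "file", url: "file:///workspace/brief.docx" } // simplified stand-in
const text = "materialized read view..." // e.g. from createOfficeReadView(filePath, process.cwd())
const replaced = { ...part, type: "text", text }
```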
package/tools/extract-document-materials.ts
CHANGED

@@ -3,13 +3,13 @@ import { extractDocumentMaterials } from "../lib/document-materials/extract"

  export default tool({
    description:
-     "Extract
-     "Supports pptx, docx, and xlsx. Produces a manifest plus extracted text, embedded images, and available slide/sheet mappings. " +
+     "Extract reusable materials from a workspace document into a workspace-local cache. " +
+     "Supports pdf, pptx, docx, and xlsx. Produces a manifest plus extracted text, embedded images, and available page/slide/sheet mappings. " +
      "Unsupported file types are skipped instead of failing.",
    args: {
      file: tool.schema
        .string()
-       .describe("Document path relative to workspace root. Supports pptx, docx, and xlsx; other file types are skipped."),
+       .describe("Document path relative to workspace root. Supports pdf, pptx, docx, and xlsx; other file types are skipped."),
    },
    async execute(args, context) {
      const workspaceDir = context.directory ?? process.cwd()
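For reference, the tool result for a PDF would look roughly like the object below; the fingerprint and file name are illustrative, while the field set is the one assembled in processPdfFile.

```ts
const exampleResult = {
  status: "processed",
  source: "docs/report.pdf",
  type: "pdf",
  cache_dir: ".opencode/revela/doc-materials/3f9c0a",
  manifest_path: ".opencode/revela/doc-materials/3f9c0a/manifest.json",
  text_path: ".opencode/revela/doc-materials/3f9c0a/text.txt",
  images: [{ path: ".opencode/revela/doc-materials/3f9c0a/images/page-01-image-01.png", page_or_slide: "page-01" }],
  skipped_assets: [],
  slides: [],
  tables: [],
}
```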