@cyber-dash-tech/revela 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +21 -0
- package/README.md +239 -0
- package/README.zh-CN.md +270 -0
- package/designs/default/DESIGN.md +1100 -0
- package/designs/editorial-ribbon/DESIGN.md +1092 -0
- package/designs/minimal/DESIGN.md +1079 -0
- package/domains/consulting/INDUSTRY.md +230 -0
- package/domains/deeptech-investment/INDUSTRY.md +160 -0
- package/domains/general/INDUSTRY.md +6 -0
- package/index.ts +1 -0
- package/lib/agents/research-prompt.ts +129 -0
- package/lib/commands/designs.ts +59 -0
- package/lib/commands/disable.ts +14 -0
- package/lib/commands/domains.ts +59 -0
- package/lib/commands/enable.ts +48 -0
- package/lib/commands/help.ts +35 -0
- package/lib/config.ts +65 -0
- package/lib/ctx.ts +27 -0
- package/lib/design/designs.ts +389 -0
- package/lib/domain/domains.ts +258 -0
- package/lib/frontmatter.ts +63 -0
- package/lib/log.ts +35 -0
- package/lib/prompt-builder.ts +194 -0
- package/lib/qa/checks.ts +594 -0
- package/lib/qa/index.ts +38 -0
- package/lib/qa/measure.ts +287 -0
- package/lib/read-hooks/extractors/docx.ts +16 -0
- package/lib/read-hooks/extractors/pdf.ts +19 -0
- package/lib/read-hooks/extractors/pptx.ts +53 -0
- package/lib/read-hooks/extractors/xlsx.ts +81 -0
- package/lib/read-hooks/image/compress.ts +36 -0
- package/lib/read-hooks/index.ts +12 -0
- package/lib/read-hooks/post-read.ts +74 -0
- package/lib/read-hooks/pre-read.ts +51 -0
- package/package.json +65 -0
- package/plugin.ts +365 -0
- package/skill/SKILL.md +676 -0
- package/tools/designs.ts +126 -0
- package/tools/domains.ts +73 -0
- package/tools/qa.ts +61 -0
- package/tools/research-save.ts +96 -0
- package/tools/workspace-scan.ts +154 -0
|
@@ -0,0 +1,287 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* lib/qa/measure.ts
|
|
3
|
+
*
|
|
4
|
+
* Puppeteer-based slide layout measurement.
|
|
5
|
+
* Opens the HTML file with a headless Chrome, navigates to each slide,
|
|
6
|
+
* and records the bounding boxes of all visible elements inside the
|
|
7
|
+
* slide canvas (1920×1080).
|
|
8
|
+
*
|
|
9
|
+
* Returns raw per-slide geometry data consumed by checks.ts.
|
|
10
|
+
*/
|
|
11
|
+
|
|
12
|
+
import puppeteer from "puppeteer-core"
|
|
13
|
+
import { pathToFileURL } from "url"
|
|
14
|
+
|
|
15
|
+
// ── Constants ────────────────────────────────────────────────────────────────
|
|
16
|
+
|
|
17
|
+
/**
 * The canonical slide canvas size (matches the design system).
 * measureSlides() sets the Chrome viewport to exactly these dimensions so
 * that the page renders at scale === 1 and no CSS-transform math is needed
 * when converting DOMRects into canvas coordinates.
 */
export const CANVAS_W = 1920
export const CANVAS_H = 1080
|
|
20
|
+
|
|
21
|
+
/**
 * Candidate Chrome/Chromium executable locations, probed in order by
 * findChromePath(). macOS app-bundle paths come first, then common Linux
 * install paths. The first path that exists on disk wins.
 */
const CHROME_PATHS = [
  "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
  "/Applications/Chromium.app/Contents/MacOS/Chromium",
  "/usr/bin/google-chrome-stable",
  "/usr/bin/google-chrome",
  "/usr/bin/chromium-browser",
  "/usr/bin/chromium",
]
|
|
30
|
+
|
|
31
|
+
// ── Types ────────────────────────────────────────────────────────────────────
|
|
32
|
+
|
|
33
|
+
/**
 * Axis-aligned bounding box in canvas coordinates (pixels, origin at the
 * slide canvas top-left). `right`/`bottom` are derivable from left+width /
 * top+height but are stored precomputed so downstream checks can compare
 * edges without repeating the arithmetic.
 */
export interface Rect {
  left: number
  top: number
  right: number
  bottom: number
  width: number
  height: number
}
|
|
41
|
+
|
|
42
|
+
/**
 * Geometry record for one visible element inside a slide canvas.
 * Forms a tree mirroring the DOM (up to the depth limit used during
 * collection); decorative layers are filtered out before this is built.
 */
export interface ElementInfo {
  /** CSS selector path (tag + nth-child chain), for human-readable reports */
  selector: string
  /** bounding box relative to the slide canvas origin */
  rect: Rect
  /** true if element is considered "visible" (non-zero size, not hidden) */
  visible: boolean
  /** direct children that are also visible */
  children: ElementInfo[]
}
|
|
51
|
+
|
|
52
|
+
/**
 * Raw geometry snapshot for a single slide, produced by measureSlides()
 * and consumed by checks.ts. All rects are in canvas coordinates.
 */
export interface SlideMetrics {
  /** 0-based slide index */
  index: number
  /** slide title extracted from the first h1/h2 inside the slide */
  title: string
  /**
   * Structural role from the slide's `data-slide-type` attribute.
   * Valid values: "cover", "toc", "content", "closing", "divider", "summary".
   * Undefined when the attribute is absent (old/third-party HTML).
   */
  slideType?: string
  /** bounding box of the slide-canvas element itself (post-scale) */
  canvasRect: Rect
  /** top-level visible children of .slide-canvas */
  elements: ElementInfo[]
  /** union bounding box of all visible leaf elements */
  contentRect: Rect
}
|
|
70
|
+
|
|
71
|
+
// ── Helpers ──────────────────────────────────────────────────────────────────
|
|
72
|
+
|
|
73
|
+
function findChromePath(): string {
|
|
74
|
+
const { existsSync } = require("fs") as typeof import("fs")
|
|
75
|
+
for (const p of CHROME_PATHS) {
|
|
76
|
+
if (existsSync(p)) return p
|
|
77
|
+
}
|
|
78
|
+
throw new Error(
|
|
79
|
+
"Could not find a Chrome/Chromium installation. " +
|
|
80
|
+
"Tried: " + CHROME_PATHS.join(", ")
|
|
81
|
+
)
|
|
82
|
+
}
|
|
83
|
+
|
|
84
|
+
// ── Main export ──────────────────────────────────────────────────────────────
|
|
85
|
+
|
|
86
|
+
/**
 * Open `htmlFilePath` in a headless Chrome at 1920×1080, measure each slide,
 * and return an array of SlideMetrics (one per .slide element).
 *
 * Flow: launch Chrome → load the file → for each .slide: scroll it into
 * view, force `.reveal` elements visible, wait for transitions, then run an
 * in-page script that records bounding boxes relative to the slide canvas.
 * The browser is always closed, even when measurement throws.
 *
 * @param htmlFilePath - path to the slide-deck HTML file on disk
 * @returns one SlideMetrics per .slide element; slides lacking a
 *          .slide-canvas child are silently skipped
 * @throws when no Chrome/Chromium is found or navigation times out (30s)
 */
export async function measureSlides(htmlFilePath: string): Promise<SlideMetrics[]> {
  const executablePath = findChromePath()
  const fileUrl = pathToFileURL(htmlFilePath).href

  const browser = await puppeteer.launch({
    executablePath,
    headless: true,
    // Sandbox flags let this run in containers/CI; the dev-shm flag avoids
    // Chrome crashes on hosts with a small /dev/shm partition.
    args: [
      "--no-sandbox",
      "--disable-setuid-sandbox",
      "--disable-dev-shm-usage",
      "--window-size=1920,1080",
    ],
  })

  try {
    const page = await browser.newPage()

    // Set viewport to exact canvas size so scale === 1 (no CSS transform needed).
    await page.setViewport({ width: CANVAS_W, height: CANVAS_H })
    await page.goto(fileUrl, { waitUntil: "networkidle0", timeout: 30000 })

    // Wait for any entrance animations / intersection observers to fire.
    await new Promise((r) => setTimeout(r, 600))

    // Measure slides one-by-one: scroll each into view, wait for animations,
    // then collect geometry relative to the canvas coordinate system.
    const slideCount: number = await page.evaluate(
      () => document.querySelectorAll(".slide").length
    )

    const metrics: SlideMetrics[] = []

    for (let idx = 0; idx < slideCount; idx++) {
      // Scroll the slide into view and wait for intersection observers / animations
      await page.evaluate((i: number) => {
        const slides = document.querySelectorAll(".slide")
        const slide = slides[i] as HTMLElement
        if (slide) {
          slide.scrollIntoView({ behavior: "instant" })
          // Force all .reveal elements visible (in case IO didn't fire)
          slide.querySelectorAll(".reveal").forEach((el) => el.classList.add("visible"))
        }
      }, idx)

      // Wait for CSS transitions + any JS rendering (ECharts, bar animations, etc.)
      await new Promise((r) => setTimeout(r, 800))

      const slideData = await page.evaluate(
        (slideIdx: number) => {
          // ── In-browser helpers ───────────────────────────────────────────
          // NOTE: this entire callback is serialized and executed inside the
          // page. It must not reference anything from the Node scope except
          // the `slideIdx` argument, hence the types/helpers redeclared here.

          // An element counts as visible when it has non-zero size, is not
          // display:none / visibility:hidden, and is not fully transparent.
          function isVisible(el: Element): boolean {
            const r = el.getBoundingClientRect()
            if (r.width === 0 || r.height === 0) return false
            const style = window.getComputedStyle(el)
            if (style.visibility === "hidden") return false
            if (style.display === "none") return false
            if (parseFloat(style.opacity) < 0.01) return false
            return true
          }

          // Translate a viewport-relative DOMRect into canvas coordinates.
          function toRectRelative(r: DOMRect, offsetTop: number, offsetLeft: number) {
            return {
              left: r.left - offsetLeft,
              top: r.top - offsetTop,
              right: r.right - offsetLeft,
              bottom: r.bottom - offsetTop,
              width: r.width,
              height: r.height,
            }
          }

          // Compact human-readable selector: the last 3 ancestors, each as
          // tag name plus up to 2 of its classes.
          function selectorOf(el: Element): string {
            const parts: string[] = []
            let cur: Element | null = el
            while (cur && cur !== document.body) {
              const tag = cur.tagName.toLowerCase()
              const cls = Array.from(cur.classList)
                .slice(0, 2)
                .map((c) => "." + c)
                .join("")
              parts.unshift(tag + cls)
              cur = cur.parentElement
            }
            return parts.slice(-3).join(" > ")
          }

          // In-page mirror of the ElementInfo interface (the outer type is
          // not available inside the serialized callback).
          type EI = {
            selector: string
            rect: ReturnType<typeof toRectRelative>
            visible: boolean
            children: EI[]
          }

          // Recursively collect visible children; recursion stops once depth
          // exceeds 4. Purely decorative layers (aurora/stars/progress/
          // nav-dot/deco-blob) are excluded so they don't skew layout checks.
          function collectChildren(
            el: Element,
            offsetTop: number,
            offsetLeft: number,
            depth = 0
          ): EI[] {
            if (depth > 4) return []
            const result: EI[] = []
            for (const child of Array.from(el.children)) {
              if (!isVisible(child)) continue
              const rawR = child.getBoundingClientRect()
              // className is not a plain string on SVG elements, hence the guard.
              const cls = child.className || ""
              if (
                typeof cls === "string" &&
                (cls.includes("aurora") ||
                  cls.includes("stars") ||
                  cls.includes("progress") ||
                  cls.includes("nav-dot") ||
                  cls.includes("deco-blob"))
              ) continue
              const relR = toRectRelative(rawR, offsetTop, offsetLeft)
              result.push({
                selector: selectorOf(child),
                rect: relR,
                visible: true,
                children: collectChildren(child, offsetTop, offsetLeft, depth + 1),
              })
            }
            return result
          }

          // Union bounding box over visible *leaf* elements only — nodes
          // with children are descended into, not measured directly, so
          // container padding doesn't inflate the content box.
          function unionRect(els: EI[]): ReturnType<typeof toRectRelative> {
            let left = Infinity, top = Infinity, right = -Infinity, bottom = -Infinity
            function walk(list: EI[]) {
              for (const e of list) {
                if (!e.visible) continue
                if (e.children.length > 0) {
                  walk(e.children)
                } else {
                  left = Math.min(left, e.rect.left)
                  top = Math.min(top, e.rect.top)
                  right = Math.max(right, e.rect.right)
                  bottom = Math.max(bottom, e.rect.bottom)
                }
              }
            }
            walk(els)
            // No visible leaves at all → degenerate zero rect.
            if (left === Infinity) return { left: 0, top: 0, right: 0, bottom: 0, width: 0, height: 0 }
            return { left, top, right, bottom, width: right - left, height: bottom - top }
          }

          // ── Per-slide measurement ────────────────────────────────────────

          const slide = document.querySelectorAll(".slide")[slideIdx]
          if (!slide) return null

          // Read the semantic slide type if the author provided it
          const slideType = (slide as HTMLElement).dataset.slideType || slide.getAttribute("data-slide-type") || undefined

          const canvas = slide.querySelector(".slide-canvas") as HTMLElement | null
          if (!canvas) return null

          const canvasRaw = canvas.getBoundingClientRect()
          // Use canvas top-left as the coordinate origin
          const offsetTop = canvasRaw.top
          const offsetLeft = canvasRaw.left

          // Canvas rect is reported in its own coordinate system: origin 0,0.
          const canvasRect = {
            left: 0,
            top: 0,
            right: canvasRaw.width,
            bottom: canvasRaw.height,
            width: canvasRaw.width,
            height: canvasRaw.height,
          }

          const elements = collectChildren(canvas, offsetTop, offsetLeft)

          // Title: first h1/h2 text, whitespace-collapsed, capped at 80 chars.
          const titleEl = canvas.querySelector("h1, h2")
          const title = titleEl
            ? (titleEl.textContent || "").replace(/\s+/g, " ").trim().slice(0, 80)
            : `Slide ${slideIdx + 1}`

          return {
            index: slideIdx,
            title,
            slideType,
            canvasRect,
            elements,
            contentRect: unionRect(elements),
          }
        },
        idx
      )

      // Slides without a .slide-canvas return null and are skipped.
      if (slideData) metrics.push(slideData as SlideMetrics)
    }

    return metrics
  } finally {
    // Always release the browser, even when measurement throws.
    await browser.close()
  }
}
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* lib/read-hooks/extractors/docx.ts
|
|
3
|
+
*
|
|
4
|
+
* DOCX text extraction using mammoth.js (pure JS, 6k+ stars).
|
|
5
|
+
* Extracts raw text without formatting — suitable for LLM context.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
import mammoth from "mammoth"
|
|
9
|
+
|
|
10
|
+
/**
|
|
11
|
+
* Extract plain text from a DOCX buffer.
|
|
12
|
+
*/
|
|
13
|
+
export async function extractDocx(buf: Buffer): Promise<string> {
|
|
14
|
+
const result = await mammoth.extractRawText({ buffer: buf })
|
|
15
|
+
return result.value
|
|
16
|
+
}
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* lib/read-hooks/extractors/pdf.ts
|
|
3
|
+
*
|
|
4
|
+
* PDF text extraction using unpdf (zero-dependency, pure JS, serverless PDF.js).
|
|
5
|
+
* Only extracts text — image extraction from PDFs requires native deps (@napi-rs/canvas)
|
|
6
|
+
* and is intentionally excluded.
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
import { getDocumentProxy, extractText } from "unpdf"
|
|
10
|
+
|
|
11
|
+
/**
|
|
12
|
+
* Extract all text from a PDF buffer.
|
|
13
|
+
* Pages are merged into a single string with double newlines between them.
|
|
14
|
+
*/
|
|
15
|
+
export async function extractPdfText(buf: Buffer): Promise<string> {
|
|
16
|
+
const pdf = await getDocumentProxy(new Uint8Array(buf))
|
|
17
|
+
const { text } = await extractText(pdf, { mergePages: true })
|
|
18
|
+
return text
|
|
19
|
+
}
|
|
@@ -0,0 +1,53 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* lib/read-hooks/extractors/pptx.ts
|
|
3
|
+
*
|
|
4
|
+
* PPTX text extraction using fflate (ZIP decompression) + @xmldom/xmldom (XML parsing).
|
|
5
|
+
* Pure JS, zero native dependencies.
|
|
6
|
+
*
|
|
7
|
+
* PPTX is a ZIP archive containing slide XML files at ppt/slides/slideN.xml.
|
|
8
|
+
* Text content is stored in <a:t> elements under the DrawingML namespace.
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
import { unzipSync } from "fflate"
|
|
12
|
+
import { DOMParser } from "@xmldom/xmldom"
|
|
13
|
+
|
|
14
|
+
// DrawingML namespace URI — <a:t> text runs in slide XML live under this namespace.
const DRAWINGML_NS = "http://schemas.openxmlformats.org/drawingml/2006/main"
|
|
15
|
+
|
|
16
|
+
/**
|
|
17
|
+
* Extract text from all slides in a PPTX buffer.
|
|
18
|
+
* Returns slides in order, each prefixed with "--- Slide N ---".
|
|
19
|
+
*/
|
|
20
|
+
export async function extractPptx(buf: Buffer): Promise<string> {
|
|
21
|
+
const files = unzipSync(new Uint8Array(buf))
|
|
22
|
+
const parser = new DOMParser()
|
|
23
|
+
const slides: string[] = []
|
|
24
|
+
|
|
25
|
+
// Collect and sort slide files by slide number
|
|
26
|
+
const slideFiles = Object.keys(files)
|
|
27
|
+
.filter((f) => /^ppt\/slides\/slide\d+\.xml$/.test(f))
|
|
28
|
+
.sort((a, b) => {
|
|
29
|
+
const na = parseInt(a.match(/\d+/)![0], 10)
|
|
30
|
+
const nb = parseInt(b.match(/\d+/)![0], 10)
|
|
31
|
+
return na - nb
|
|
32
|
+
})
|
|
33
|
+
|
|
34
|
+
for (const path of slideFiles) {
|
|
35
|
+
const xml = new TextDecoder().decode(files[path])
|
|
36
|
+
const doc = parser.parseFromString(xml, "text/xml")
|
|
37
|
+
|
|
38
|
+
// Extract all <a:t> text nodes
|
|
39
|
+
const textNodes = doc.getElementsByTagNameNS(DRAWINGML_NS, "t")
|
|
40
|
+
const texts: string[] = []
|
|
41
|
+
for (let i = 0; i < textNodes.length; i++) {
|
|
42
|
+
const t = textNodes[i].textContent?.trim()
|
|
43
|
+
if (t) texts.push(t)
|
|
44
|
+
}
|
|
45
|
+
|
|
46
|
+
if (texts.length) {
|
|
47
|
+
const slideNum = path.match(/\d+/)![0]
|
|
48
|
+
slides.push(`--- Slide ${slideNum} ---\n${texts.join("\n")}`)
|
|
49
|
+
}
|
|
50
|
+
}
|
|
51
|
+
|
|
52
|
+
return slides.join("\n\n")
|
|
53
|
+
}
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* lib/read-hooks/extractors/xlsx.ts
|
|
3
|
+
*
|
|
4
|
+
* XLSX text extraction using fflate (ZIP decompression) + @xmldom/xmldom (XML parsing).
|
|
5
|
+
* Pure JS, zero native dependencies.
|
|
6
|
+
*
|
|
7
|
+
* XLSX is a ZIP archive. Text values are stored in xl/sharedStrings.xml;
|
|
8
|
+
* cell references index into that shared table. Sheet data lives in
|
|
9
|
+
* xl/worksheets/sheetN.xml.
|
|
10
|
+
*/
|
|
11
|
+
|
|
12
|
+
import { unzipSync } from "fflate"
|
|
13
|
+
import { DOMParser } from "@xmldom/xmldom"
|
|
14
|
+
|
|
15
|
+
/**
|
|
16
|
+
* Extract tabular text from all sheets in an XLSX buffer.
|
|
17
|
+
* Returns sheets in order, each prefixed with "--- Sheet N ---".
|
|
18
|
+
* Cells are tab-separated, rows are newline-separated.
|
|
19
|
+
*/
|
|
20
|
+
export async function extractXlsx(buf: Buffer): Promise<string> {
|
|
21
|
+
const files = unzipSync(new Uint8Array(buf))
|
|
22
|
+
const parser = new DOMParser()
|
|
23
|
+
|
|
24
|
+
// 1. Parse sharedStrings.xml — all string values are stored here by index
|
|
25
|
+
const sharedStrings: string[] = []
|
|
26
|
+
const ssFile = files["xl/sharedStrings.xml"]
|
|
27
|
+
if (ssFile) {
|
|
28
|
+
const doc = parser.parseFromString(new TextDecoder().decode(ssFile), "text/xml")
|
|
29
|
+
const siNodes = doc.getElementsByTagName("si")
|
|
30
|
+
for (let i = 0; i < siNodes.length; i++) {
|
|
31
|
+
const tNodes = siNodes[i].getElementsByTagName("t")
|
|
32
|
+
const parts: string[] = []
|
|
33
|
+
for (let j = 0; j < tNodes.length; j++) {
|
|
34
|
+
parts.push(tNodes[j].textContent ?? "")
|
|
35
|
+
}
|
|
36
|
+
sharedStrings.push(parts.join(""))
|
|
37
|
+
}
|
|
38
|
+
}
|
|
39
|
+
|
|
40
|
+
// 2. Parse each worksheet
|
|
41
|
+
const sheets: string[] = []
|
|
42
|
+
const sheetFiles = Object.keys(files)
|
|
43
|
+
.filter((f) => /^xl\/worksheets\/sheet\d+\.xml$/.test(f))
|
|
44
|
+
.sort((a, b) => {
|
|
45
|
+
const na = parseInt(a.match(/\d+/)![0], 10)
|
|
46
|
+
const nb = parseInt(b.match(/\d+/)![0], 10)
|
|
47
|
+
return na - nb
|
|
48
|
+
})
|
|
49
|
+
|
|
50
|
+
for (const path of sheetFiles) {
|
|
51
|
+
const xml = new TextDecoder().decode(files[path])
|
|
52
|
+
const doc = parser.parseFromString(xml, "text/xml")
|
|
53
|
+
const rows = doc.getElementsByTagName("row")
|
|
54
|
+
const rowTexts: string[] = []
|
|
55
|
+
|
|
56
|
+
for (let r = 0; r < rows.length; r++) {
|
|
57
|
+
const cells = rows[r].getElementsByTagName("c")
|
|
58
|
+
const cellValues: string[] = []
|
|
59
|
+
|
|
60
|
+
for (let c = 0; c < cells.length; c++) {
|
|
61
|
+
const cell = cells[c]
|
|
62
|
+
const type = cell.getAttribute("t")
|
|
63
|
+
const vNode = cell.getElementsByTagName("v")[0]
|
|
64
|
+
const v = vNode?.textContent ?? ""
|
|
65
|
+
// type="s" → shared string index; otherwise use raw value
|
|
66
|
+
cellValues.push(type === "s" ? (sharedStrings[parseInt(v, 10)] ?? v) : v)
|
|
67
|
+
}
|
|
68
|
+
|
|
69
|
+
if (cellValues.some(Boolean)) {
|
|
70
|
+
rowTexts.push(cellValues.join("\t"))
|
|
71
|
+
}
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
if (rowTexts.length) {
|
|
75
|
+
const sheetNum = path.match(/\d+/)![0]
|
|
76
|
+
sheets.push(`--- Sheet ${sheetNum} ---\n${rowTexts.join("\n")}`)
|
|
77
|
+
}
|
|
78
|
+
}
|
|
79
|
+
|
|
80
|
+
return sheets.join("\n\n")
|
|
81
|
+
}
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* lib/read-hooks/image/compress.ts
|
|
3
|
+
*
|
|
4
|
+
* Image compression using jimp (pure JS, zero native dependencies, 14k+ stars).
|
|
5
|
+
* Goal: reduce base64 attachment size to save LLM context tokens.
|
|
6
|
+
*
|
|
7
|
+
* Strategy:
|
|
8
|
+
* - Resize to max 1024px on longest side (preserving aspect ratio)
|
|
9
|
+
* - Convert to JPEG at 60% quality
|
|
10
|
+
* - This typically achieves 60-80% size reduction
|
|
11
|
+
*/
|
|
12
|
+
|
|
13
|
+
import { Jimp } from "jimp"
|
|
14
|
+
|
|
15
|
+
// Longest-side cap applied before JPEG encoding (aspect ratio preserved).
const MAX_DIMENSION = 1024
// JPEG encode quality (0-100); 60 trades fidelity for a big size reduction.
const JPEG_QUALITY = 60
|
|
17
|
+
|
|
18
|
+
/**
|
|
19
|
+
* Compress an image buffer.
|
|
20
|
+
* Returns a JPEG buffer regardless of input format.
|
|
21
|
+
*/
|
|
22
|
+
export async function compressImage(buf: Buffer): Promise<Buffer> {
|
|
23
|
+
const image = await Jimp.read(buf)
|
|
24
|
+
const { width, height } = image.bitmap
|
|
25
|
+
|
|
26
|
+
// Proportional resize if either dimension exceeds MAX_DIMENSION
|
|
27
|
+
if (width > MAX_DIMENSION || height > MAX_DIMENSION) {
|
|
28
|
+
if (width >= height) {
|
|
29
|
+
image.resize({ w: MAX_DIMENSION })
|
|
30
|
+
} else {
|
|
31
|
+
image.resize({ h: MAX_DIMENSION })
|
|
32
|
+
}
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
return await image.getBuffer("image/jpeg", { quality: JPEG_QUALITY })
|
|
36
|
+
}
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* lib/read-hooks/index.ts
|
|
3
|
+
*
|
|
4
|
+
* Entry point for the read-hooks module.
|
|
5
|
+
* Exports preRead and postRead for use in plugins/revela.ts hook handlers.
|
|
6
|
+
*
|
|
7
|
+
* preRead → tool.execute.before: redirect binary files (DOCX/PPTX/XLSX) to temp txt
|
|
8
|
+
* postRead → tool.execute.after: transform PDF/image attachments before LLM sees them
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
export { preRead } from "./pre-read"
|
|
12
|
+
export { postRead } from "./post-read"
|
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* lib/read-hooks/post-read.ts
|
|
3
|
+
*
|
|
4
|
+
* After-hook handler for the OpenCode `read` tool.
|
|
5
|
+
* Called from `tool.execute.after` in plugin.ts.
|
|
6
|
+
*
|
|
7
|
+
* Handles PDF and images — formats where read tool succeeds and returns
|
|
8
|
+
* a base64 attachment. The after-hook fires after execution but before
|
|
9
|
+
* the result reaches the LLM, so we can replace the output here.
|
|
10
|
+
*
|
|
11
|
+
* PDF strategy: extract text from base64 → replace output string → remove attachment
|
|
12
|
+
* Image strategy: decompress base64 → jimp compress → re-encode → replace attachment
|
|
13
|
+
*
|
|
14
|
+
* Note: `output.attachments` is present at runtime despite not being in the
|
|
15
|
+
* TypeScript type definition for tool.execute.after. Confirmed via source inspection
|
|
16
|
+
* of packages/opencode/src/session/prompt.ts.
|
|
17
|
+
*/
|
|
18
|
+
|
|
19
|
+
import { extname, basename } from "path"
|
|
20
|
+
import { extractPdfText } from "./extractors/pdf"
|
|
21
|
+
import { compressImage } from "./image/compress"
|
|
22
|
+
|
|
23
|
+
// Raster image extensions whose read-tool attachments get re-compressed by postRead.
const IMAGE_EXTS = new Set([".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif", ".webp", ".gif"])
|
|
24
|
+
|
|
25
|
+
/**
 * Shape of the `read` tool result that postRead mutates in place.
 * NOTE(review): `attachments` is present at runtime but absent from the
 * published hook type — per the module header this was confirmed against
 * opencode's session/prompt.ts; re-verify on opencode upgrades.
 */
interface ReadOutput {
  title: string
  output: string
  metadata: any
  attachments?: Array<{ url: string; mime: string; [k: string]: any }>
}
|
|
31
|
+
|
|
32
|
+
/**
|
|
33
|
+
* Post-process read tool output for PDF and image files.
|
|
34
|
+
*
|
|
35
|
+
* @param args - Read tool args (input.args in after-hook)
|
|
36
|
+
* @param output - Mutable read tool output (output in after-hook)
|
|
37
|
+
*/
|
|
38
|
+
export async function postRead(
|
|
39
|
+
args: { filePath: string; [k: string]: any },
|
|
40
|
+
output: ReadOutput,
|
|
41
|
+
): Promise<void> {
|
|
42
|
+
if (!output.attachments?.length) return
|
|
43
|
+
|
|
44
|
+
const ext = extname(args.filePath).toLowerCase()
|
|
45
|
+
|
|
46
|
+
// ── PDF: extract text, drop base64 attachment ───────────────────────────
|
|
47
|
+
if (ext === ".pdf") {
|
|
48
|
+
const attachment = output.attachments[0]
|
|
49
|
+
const base64 = attachment.url.split(",")[1]
|
|
50
|
+
if (!base64) return
|
|
51
|
+
|
|
52
|
+
const buf = Buffer.from(base64, "base64")
|
|
53
|
+
const text = await extractPdfText(buf)
|
|
54
|
+
|
|
55
|
+
output.output = `[Extracted from: ${basename(args.filePath)}]\n\n${text}`
|
|
56
|
+
output.title = `Extracted text from ${basename(args.filePath)}`
|
|
57
|
+
output.attachments.length = 0 // Remove base64 — saves significant tokens
|
|
58
|
+
return
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
// ── Images: compress attachment to reduce token cost ────────────────────
|
|
62
|
+
if (IMAGE_EXTS.has(ext)) {
|
|
63
|
+
const attachment = output.attachments[0]
|
|
64
|
+
const base64 = attachment.url.split(",")[1]
|
|
65
|
+
if (!base64) return
|
|
66
|
+
|
|
67
|
+
const buf = Buffer.from(base64, "base64")
|
|
68
|
+
const compressed = await compressImage(buf)
|
|
69
|
+
|
|
70
|
+
// Replace with compressed JPEG
|
|
71
|
+
attachment.url = `data:image/jpeg;base64,${compressed.toString("base64")}`
|
|
72
|
+
attachment.mime = "image/jpeg"
|
|
73
|
+
}
|
|
74
|
+
}
|
|
@@ -0,0 +1,51 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* lib/read-hooks/pre-read.ts
|
|
3
|
+
*
|
|
4
|
+
* Before-hook handler for the OpenCode `read` tool.
|
|
5
|
+
* Called from `tool.execute.before` in plugin.ts.
|
|
6
|
+
*
|
|
7
|
+
* Handles DOCX, PPTX, XLSX — formats that cause read tool to throw
|
|
8
|
+
* Effect.fail("Cannot read binary file"), so the after-hook never fires.
|
|
9
|
+
*
|
|
10
|
+
* Strategy: extract text → write temp .txt file → redirect args.filePath.
|
|
11
|
+
* The read tool then reads the temp file normally. LLM is unaware of the redirect.
|
|
12
|
+
*/
|
|
13
|
+
|
|
14
|
+
import { readFileSync, writeFileSync } from "fs"
|
|
15
|
+
import { extname, basename, join } from "path"
|
|
16
|
+
import { tmpdir } from "os"
|
|
17
|
+
import { randomUUID } from "crypto"
|
|
18
|
+
import { extractDocx } from "./extractors/docx"
|
|
19
|
+
import { extractPptx } from "./extractors/pptx"
|
|
20
|
+
import { extractXlsx } from "./extractors/xlsx"
|
|
21
|
+
|
|
22
|
+
// Extension → extractor function mapping.
// Each extractor takes the raw file bytes and resolves to plain text;
// extensions not listed here fall through to the normal read tool.
const HANDLERS: Record<string, (buf: Buffer) => Promise<string>> = {
  ".docx": extractDocx,
  ".pptx": extractPptx,
  ".xlsx": extractXlsx,
}
|
|
28
|
+
|
|
29
|
+
/**
|
|
30
|
+
* Intercept read tool args before execution.
|
|
31
|
+
* If the file is a supported binary format, extract its text and redirect
|
|
32
|
+
* args.filePath to a temp .txt file containing the extracted content.
|
|
33
|
+
*
|
|
34
|
+
* @param args - Mutable read tool args object (from output.args in before-hook)
|
|
35
|
+
*/
|
|
36
|
+
export async function preRead(args: { filePath: string; [k: string]: any }): Promise<void> {
|
|
37
|
+
const ext = extname(args.filePath).toLowerCase()
|
|
38
|
+
const handler = HANDLERS[ext]
|
|
39
|
+
if (!handler) return // Not a handled format — let read tool proceed normally
|
|
40
|
+
|
|
41
|
+
const buf = readFileSync(args.filePath)
|
|
42
|
+
const text = await handler(buf)
|
|
43
|
+
|
|
44
|
+
// Write extracted text to a temp file, prefixed with source info
|
|
45
|
+
const header = `[Extracted from: ${basename(args.filePath)}]\n\n`
|
|
46
|
+
const tmpPath = join(tmpdir(), `revela-${randomUUID()}.txt`)
|
|
47
|
+
writeFileSync(tmpPath, header + text, "utf-8")
|
|
48
|
+
|
|
49
|
+
// Redirect read tool to the temp file
|
|
50
|
+
args.filePath = tmpPath
|
|
51
|
+
}
|