@cyber-dash-tech/revela 0.1.16 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,373 @@
1
+ import { createHash } from "crypto"
2
+ import { existsSync, mkdirSync, readFileSync, statSync, writeFileSync } from "fs"
3
+ import { basename, dirname, extname, isAbsolute, join, relative, resolve } from "path"
4
+ import { DOMParser } from "@xmldom/xmldom"
5
+ import { unzipSync } from "fflate"
6
+ import { extractDocx } from "../read-hooks/extractors/docx"
7
+ import { extractPptx } from "../read-hooks/extractors/pptx"
8
+ import { extractXlsx } from "../read-hooks/extractors/xlsx"
9
+
10
/**
 * A single artifact (image or table text) exported from an Office document.
 */
export type DocumentMaterial = {
  /** Location of the exported artifact; the extractors in this file store a workspace-relative path with "/" separators. */
  path: string
  /** Origin inside the document: the zip entry name for images, or the literal "workbook" for spreadsheet tables. */
  source_ref: string
  /** Slide or sheet label such as "slide-03" or "sheet-01"; omitted when no such association was found. */
  page_or_slide?: string
  /** Optional free-form remark, e.g. why no slide/sheet association is available. */
  note?: string
}
16
+
17
/**
 * Outcome of `extractDocumentMaterials` for one input file.
 */
export type DocumentMaterialsResult = {
  /**
   * "processed" when materials were extracted or served from cache,
   * "skipped" when the file extension is not supported,
   * "failed" when an error was thrown while handling the file.
   */
  status: "processed" | "skipped" | "failed"
  /** Workspace-relative path of the input; on failure, the path exactly as the caller supplied it. */
  source: string
  /** Detected document kind; "other" is used for skipped and failed results. */
  type: "pptx" | "docx" | "xlsx" | "other"
  /** Workspace-relative cache directory holding the extracted materials (processed results only). */
  cache_dir?: string
  /** Workspace-relative path of the cache manifest (processed results only). */
  manifest_path?: string
  /** Workspace-relative path of the extracted text file (processed results only). */
  text_path?: string
  /** Exported images (processed results only). */
  images?: DocumentMaterial[]
  /** Exported table references (processed results only). */
  tables?: DocumentMaterial[]
  /** Why the file was skipped, or the error message when it failed. */
  reason?: string
}
28
+
29
/** Document kinds this module can unpack: every result `type` except the "other" fallback. */
type SupportedType = Exclude<DocumentMaterialsResult["type"], "other">
30
+
31
/**
 * Shape of the `manifest.json` written next to the extracted materials.
 * A later run with the same fingerprint replays these fields instead of
 * extracting the document again.
 */
type CachedManifest = {
  /** Workspace-relative path of the source document. */
  source: string
  /** Kind of document that was processed. */
  type: SupportedType
  /** Hash that names the cache directory (see `buildFingerprint`). */
  fingerprint: string
  /** Workspace-relative cache directory. */
  cache_dir: string
  /** Workspace-relative path of this manifest file. */
  manifest_path: string
  /** Workspace-relative path of the extracted text file. */
  text_path: string
  /** Images exported from the document. */
  images: DocumentMaterial[]
  /** Table references exported from the document. */
  tables: DocumentMaterial[]
}
41
+
42
/**
 * Lookup from file extension (lower-case, including the leading dot) to the
 * document kind this module can process. Extensions that are absent here are
 * reported as unsupported by `extractDocumentMaterials`.
 */
const SUPPORTED_EXTENSIONS: Record<string, SupportedType> = {
  ".pptx": "pptx",
  ".docx": "docx",
  ".xlsx": "xlsx",
}
47
+
48
/**
 * Resolves a relationship `Target` against the part that declares it and
 * returns the resulting zip entry name ("/"-separated, no leading slash).
 *
 * Zip entry names always use "/" regardless of platform, so the path is
 * handled as plain "/"-separated segments rather than through the
 * OS-dependent `path` helpers (which emit "\" on Windows).
 *
 * @param basePath Zip entry name of the part that owns the relationship,
 *   e.g. "ppt/slides/slide1.xml".
 * @param target   The relationship's `Target` attribute. A target starting
 *   with "/" is resolved from the package root; anything else is resolved
 *   relative to the directory of `basePath`.
 * @returns The normalized entry name with "." and ".." segments collapsed.
 *   ".." segments that would climb above the package root are dropped.
 */
function normalizeZipTarget(basePath: string, target: string): string {
  // Package-root-absolute targets ignore the base part's directory entirely.
  const baseSegments = target.startsWith("/") ? [] : basePath.split("/").slice(0, -1)
  const normalized: string[] = []

  for (const segment of [...baseSegments, ...target.split("/")]) {
    if (!segment || segment === ".") continue
    if (segment === "..") {
      // pop() on an empty stack is a no-op, which clamps at the package root.
      normalized.pop()
      continue
    }
    normalized.push(segment)
  }

  return normalized.join("/")
}
63
+
64
/**
 * Resolves `filePath` to an absolute path and verifies that it lies inside
 * (or is equal to) the workspace directory.
 *
 * Containment is decided from `path.relative`, so it does not depend on the
 * platform's separator and also works when the workspace is a filesystem
 * root. The check is purely lexical; symlinks are not resolved.
 *
 * @param filePath     Absolute path, or a path relative to `workspaceDir`.
 * @param workspaceDir Directory the file must live in.
 * @returns The absolute, normalized path of the file.
 * @throws Error("file must be within workspace") when the resolved path is
 *   outside the workspace.
 */
function ensureWorkspacePath(filePath: string, workspaceDir: string): string {
  const resolvedWorkspace = resolve(workspaceDir)
  const resolvedFile = isAbsolute(filePath) ? resolve(filePath) : resolve(workspaceDir, filePath)

  const fromWorkspace = relative(resolvedWorkspace, resolvedFile)
  // Outside means: climbs out through a ".." segment, or (on Windows) sits on
  // another drive, in which case `relative` returns an absolute path.
  // A name that merely begins with two dots (e.g. "..notes") is still inside.
  const escapesWorkspace =
    fromWorkspace === ".." || /^\.\.[\\/]/.test(fromWorkspace) || isAbsolute(fromWorkspace)

  if (escapesWorkspace) {
    throw new Error("file must be within workspace")
  }

  return resolvedFile
}
74
+
75
/**
 * Expresses `filePath` relative to `workspaceDir`, using "/" as the separator
 * on every platform (each backslash in the relative path becomes "/").
 */
function workspaceRelative(filePath: string, workspaceDir: string): string {
  const nativeRelative = relative(workspaceDir, filePath)
  return nativeRelative.split("\\").join("/")
}
78
+
79
/**
 * Derives a cache key for a file from its absolute path, modification time
 * and size. The file contents are not read; the three values are joined
 * with ":" and hashed with SHA-1.
 *
 * @returns The hex-encoded digest.
 */
function buildFingerprint(filePath: string): string {
  const { mtimeMs, size } = statSync(filePath)
  const identity = [resolve(filePath), mtimeMs, size].join(":")
  const hasher = createHash("sha1")
  hasher.update(identity)
  return hasher.digest("hex")
}
85
+
86
/**
 * Writes `buf` to `targetPath`, creating any missing parent directories first.
 */
function writeCachedBuffer(targetPath: string, buf: Uint8Array): void {
  const parentDir = dirname(targetPath)
  mkdirSync(parentDir, { recursive: true })

  // The bytes are copied into a fresh Uint8Array before being written.
  const bytes = new Uint8Array(buf)
  writeFileSync(targetPath, bytes)
}
90
+
91
/**
 * Builds the workspace-relative, "/"-separated path of an item stored under
 * the cache directory.
 *
 * @param segments Path components appended to `cacheDir`, e.g. "images", "a.png".
 */
function materialPath(cacheDir: string, workspaceDir: string, ...segments: string[]): string {
  const absoluteTarget = join(cacheDir, ...segments)
  return workspaceRelative(absoluteTarget, workspaceDir)
}
94
+
95
/**
 * Parses one entry of an unzipped archive as XML.
 *
 * @param files Unzipped archive contents keyed by entry name.
 * @param path  Entry name to parse.
 * @returns The parsed document, or `null` when the entry is absent.
 */
function parseXml(files: Record<string, Uint8Array>, path: string): any | null {
  const bytes = files[path]
  if (bytes) {
    const xmlText = new TextDecoder().decode(bytes)
    return new DOMParser().parseFromString(xmlText, "text/xml")
  }
  return null
}
100
+
101
/**
 * Exports the media referenced by a presentation's slides into
 * `<cacheDir>/images` and describes every exported file.
 *
 * Pass 1 walks each `ppt/slides/_rels/slideN.xml.rels` file in numeric slide
 * order and exports every relationship target that resolves to an existing
 * `ppt/media/` entry as `slide-NN-image-MM<ext>`. Media referenced by several
 * slides is exported once per slide.
 *
 * Pass 2 exports any `ppt/media/` entry that no slide referenced as
 * `unmapped-<original name>`, flagged with a note.
 *
 * @param files        Unzipped .pptx contents keyed by entry name.
 * @param cacheDir     Absolute cache directory for this document.
 * @param workspaceDir Workspace root used to make returned paths relative.
 * @returns One record per exported file, slide-mapped media first.
 */
function extractPptxImages(files: Record<string, Uint8Array>, cacheDir: string, workspaceDir: string): DocumentMaterial[] {
  const numericOrder = (a: string, b: string): number => a.localeCompare(b, undefined, { numeric: true })

  const relFiles = Object.keys(files)
    .filter((file) => /^ppt\/slides\/_rels\/slide\d+\.xml\.rels$/.test(file))
    .sort(numericOrder)

  const images: DocumentMaterial[] = []
  const seenTargets = new Set<string>()

  for (const relPath of relFiles) {
    const slideNumber = relPath.match(/slide(\d+)\.xml\.rels$/)?.[1] ?? "0"
    const slideLabel = `slide-${slideNumber.padStart(2, "0")}`
    // "ppt/slides/_rels/slide1.xml.rels" -> "ppt/slides/slide1.xml": the part
    // that relationship targets are resolved against.
    const slidePath = relPath.replace("/_rels/", "/").replace(/\.rels$/, "")
    const doc = parseXml(files, relPath)
    if (!doc) continue
    const relationships = doc.getElementsByTagName("Relationship")
    let imageIndex = 0

    for (let i = 0; i < relationships.length; i++) {
      const target = relationships[i].getAttribute("Target")
      if (!target) continue
      const normalized = normalizeZipTarget(slidePath, target)
      if (!normalized.startsWith("ppt/media/")) continue
      const media = files[normalized]
      if (!media) continue

      imageIndex += 1
      seenTargets.add(normalized)
      const exportedName = `${slideLabel}-image-${String(imageIndex).padStart(2, "0")}${extname(normalized)}`
      writeCachedBuffer(join(cacheDir, "images", exportedName), media)

      images.push({
        path: materialPath(cacheDir, workspaceDir, "images", exportedName),
        source_ref: normalized,
        page_or_slide: slideLabel,
      })
    }
  }

  const remainingMedia = Object.keys(files)
    // Entry names ending in "/" are zip directory entries, not media; without
    // this guard a "ppt/media/" entry would be exported as an empty file.
    .filter((file) => file.startsWith("ppt/media/") && !file.endsWith("/") && !seenTargets.has(file))
    // Numeric collation (image2 before image10), matching the slide ordering above.
    .sort(numericOrder)

  for (const mediaPath of remainingMedia) {
    const exportedName = `unmapped-${basename(mediaPath)}`
    writeCachedBuffer(join(cacheDir, "images", exportedName), files[mediaPath])

    images.push({
      path: materialPath(cacheDir, workspaceDir, "images", exportedName),
      source_ref: mediaPath,
      note: "No slide-level relationship found",
    })
  }

  return images
}
159
+
160
/**
 * Exports every file under `word/media/` into `<cacheDir>/images` as
 * `document-image-NN<ext>`.
 *
 * Entries are numbered in numeric name order, so "image2" is exported before
 * "image10" and the NN suffix follows the numbering of the media entries.
 * No per-page association is attempted; every record carries a
 * document-wide note.
 *
 * @param files        Unzipped .docx contents keyed by entry name.
 * @param cacheDir     Absolute cache directory for this document.
 * @param workspaceDir Workspace root used to make returned paths relative.
 * @returns One record per exported media file.
 */
function extractDocxImages(files: Record<string, Uint8Array>, cacheDir: string, workspaceDir: string): DocumentMaterial[] {
  const mediaPaths = Object.keys(files)
    // Entry names ending in "/" are zip directory entries, not media; without
    // this guard a "word/media/" entry would be exported as an empty file.
    .filter((file) => file.startsWith("word/media/") && !file.endsWith("/"))
    // Same numeric collation the pptx/xlsx extractors use for their part lists.
    .sort((a, b) => a.localeCompare(b, undefined, { numeric: true }))

  return mediaPaths.map((mediaPath, index) => {
    const exportedName = `document-image-${String(index + 1).padStart(2, "0")}${extname(mediaPath)}`
    writeCachedBuffer(join(cacheDir, "images", exportedName), files[mediaPath])

    return {
      path: materialPath(cacheDir, workspaceDir, "images", exportedName),
      source_ref: mediaPath,
      note: "Document-wide association",
    }
  })
}
176
+
177
/**
 * Exports the images embedded in a workbook into `<cacheDir>/images` and
 * describes every exported file.
 *
 * Pass 1 reads each `xl/drawings/drawingN.xml` together with its `.rels`
 * file and records, per drawing, the `xl/media/` entries its `a:blip`
 * elements embed.
 *
 * Pass 2 walks each `xl/worksheets/_rels/sheetN.xml.rels` in numeric order;
 * for every relationship that points at a drawing from pass 1, that
 * drawing's media is exported as `sheet-NN-image-MM<ext>`.
 *
 * Pass 3 exports any `xl/media/` entry not exported in pass 2 as
 * `unmapped-<original name>`, flagged with a note.
 *
 * @param files        Unzipped .xlsx contents keyed by entry name.
 * @param cacheDir     Absolute cache directory for this document.
 * @param workspaceDir Workspace root used to make returned paths relative.
 * @returns One record per exported file, sheet-mapped media first.
 */
function extractXlsxImages(files: Record<string, Uint8Array>, cacheDir: string, workspaceDir: string): DocumentMaterial[] {
  const numericOrder = (a: string, b: string): number => a.localeCompare(b, undefined, { numeric: true })

  // Pass 1: drawing part -> media entries it embeds.
  const drawingToImages = new Map<string, string[]>()
  const drawingRelFiles = Object.keys(files)
    .filter((file) => /^xl\/drawings\/_rels\/drawing\d+\.xml\.rels$/.test(file))
    .sort(numericOrder)

  for (const relPath of drawingRelFiles) {
    const relDoc = parseXml(files, relPath)
    if (!relDoc) continue
    // "xl/drawings/_rels/drawing1.xml.rels" -> "xl/drawings/drawing1.xml"
    const drawingPath = relPath.replace("/_rels/", "/").replace(/\.rels$/, "")
    const drawingDoc = parseXml(files, drawingPath)
    if (!drawingDoc) continue

    // Relationship id -> media entry, limited to targets under xl/media/.
    const targetByRid = new Map<string, string>()
    const relationships = relDoc.getElementsByTagName("Relationship")
    for (let i = 0; i < relationships.length; i++) {
      const rel = relationships[i]
      const id = rel.getAttribute("Id")
      const target = rel.getAttribute("Target")
      if (!id || !target) continue
      const normalized = normalizeZipTarget(drawingPath, target)
      if (normalized.startsWith("xl/media/")) {
        targetByRid.set(id, normalized)
      }
    }

    const blips = drawingDoc.getElementsByTagName("a:blip")
    const mediaPaths: string[] = []
    for (let i = 0; i < blips.length; i++) {
      // Prefer the prefixed attribute; fall back to an unprefixed "embed".
      const rid = blips[i].getAttribute("r:embed") || blips[i].getAttribute("embed")
      if (!rid) continue
      const mediaPath = targetByRid.get(rid)
      if (mediaPath) mediaPaths.push(mediaPath)
    }

    if (mediaPaths.length > 0) {
      drawingToImages.set(drawingPath, mediaPaths)
    }
  }

  // Pass 2: attribute each drawing's media to the sheet that references it.
  const images: DocumentMaterial[] = []
  const exportedMedia = new Set<string>()
  const sheetRelFiles = Object.keys(files)
    .filter((file) => /^xl\/worksheets\/_rels\/sheet\d+\.xml\.rels$/.test(file))
    .sort(numericOrder)

  for (const relPath of sheetRelFiles) {
    const sheetNumber = relPath.match(/sheet(\d+)\.xml\.rels$/)?.[1] ?? "0"
    const sheetLabel = `sheet-${sheetNumber.padStart(2, "0")}`
    const sheetPath = relPath.replace("/_rels/", "/").replace(/\.rels$/, "")
    const relDoc = parseXml(files, relPath)
    if (!relDoc) continue
    const relationships = relDoc.getElementsByTagName("Relationship")
    let imageIndex = 0

    for (let i = 0; i < relationships.length; i++) {
      const target = relationships[i].getAttribute("Target")
      if (!target) continue
      const mediaPaths = drawingToImages.get(normalizeZipTarget(sheetPath, target))
      if (!mediaPaths) continue

      for (const mediaPath of mediaPaths) {
        const media = files[mediaPath]
        if (!media) continue
        imageIndex += 1
        exportedMedia.add(mediaPath)
        const exportedName = `${sheetLabel}-image-${String(imageIndex).padStart(2, "0")}${extname(mediaPath)}`
        writeCachedBuffer(join(cacheDir, "images", exportedName), media)

        images.push({
          path: materialPath(cacheDir, workspaceDir, "images", exportedName),
          source_ref: mediaPath,
          page_or_slide: sheetLabel,
        })
      }
    }
  }

  // Pass 3: media that no sheet reached.
  const unmapped = Object.keys(files)
    // Entry names ending in "/" are zip directory entries, not media; without
    // this guard an "xl/media/" entry would be exported as an empty file.
    .filter((file) => file.startsWith("xl/media/") && !file.endsWith("/") && !exportedMedia.has(file))
    // Numeric collation (image2 before image10), matching the part ordering above.
    .sort(numericOrder)

  for (const mediaPath of unmapped) {
    const exportedName = `unmapped-${basename(mediaPath)}`
    writeCachedBuffer(join(cacheDir, "images", exportedName), files[mediaPath])

    images.push({
      path: materialPath(cacheDir, workspaceDir, "images", exportedName),
      source_ref: mediaPath,
      note: "No sheet-level relationship found",
    })
  }

  return images
}
276
+
277
/**
 * Lists the table materials for a document. Only workbooks yield an entry:
 * a single record pointing at the extracted text file. Every other type
 * gives an empty list.
 *
 * @param type     Kind of document being processed.
 * @param textPath Path of the extracted text file to reference.
 */
function extractTables(type: SupportedType, textPath: string): DocumentMaterial[] {
  const tables: DocumentMaterial[] = []
  if (type === "xlsx") {
    tables.push({
      path: textPath,
      source_ref: "workbook",
      note: "Sheet text and tables extracted to text file",
    })
  }
  return tables
}
281
+
282
/**
 * Extracts text and images from one Office file into a per-file cache
 * directory and returns a description of what was produced.
 *
 * Cache layout: `<workspaceDir>/.opencode/revela/doc-materials/<fingerprint>/`
 * containing `manifest.json`, `text.txt`, `images/` and `tables/`.
 * When a readable manifest already exists for the file's fingerprint its
 * contents are returned without extracting again. A manifest that cannot be
 * parsed, or that lacks the expected lists, is treated as a cache miss and
 * rewritten rather than failing the file on every later call.
 *
 * @param filePath     Absolute path of the document.
 * @param workspaceDir Workspace root; all returned paths are relative to it.
 * @param type         Document kind, selecting the text and image extractors.
 * @returns A "processed" result. Filesystem, unzip or extractor errors
 *   propagate to the caller as a rejected promise.
 */
async function processOfficeFile(filePath: string, workspaceDir: string, type: SupportedType): Promise<DocumentMaterialsResult> {
  const relativeSource = workspaceRelative(filePath, workspaceDir)
  const fingerprint = buildFingerprint(filePath)
  const cacheDir = join(workspaceDir, ".opencode", "revela", "doc-materials", fingerprint)
  const manifestPath = join(cacheDir, "manifest.json")

  if (existsSync(manifestPath)) {
    try {
      const parsed: unknown = JSON.parse(readFileSync(manifestPath, "utf-8"))
      const manifest = parsed as CachedManifest | null
      if (manifest && typeof manifest === "object" && Array.isArray(manifest.images) && Array.isArray(manifest.tables)) {
        return {
          status: "processed",
          source: manifest.source,
          type: manifest.type,
          cache_dir: manifest.cache_dir,
          manifest_path: manifest.manifest_path,
          text_path: manifest.text_path,
          images: manifest.images,
          tables: manifest.tables,
        }
      }
    } catch {
      // Unreadable or truncated manifest (e.g. an interrupted earlier run):
      // fall through and rebuild the cache entry from the source document.
    }
  }

  mkdirSync(join(cacheDir, "images"), { recursive: true })
  mkdirSync(join(cacheDir, "tables"), { recursive: true })

  const buf = readFileSync(filePath)
  const files = unzipSync(new Uint8Array(buf))

  const text = type === "pptx"
    ? await extractPptx(buf)
    : type === "docx"
      ? await extractDocx(buf)
      : await extractXlsx(buf)

  const textPath = join(cacheDir, "text.txt")
  writeFileSync(textPath, `[Extracted from: ${basename(filePath)}]\n\n${text}`, "utf-8")

  const images = type === "pptx"
    ? extractPptxImages(files, cacheDir, workspaceDir)
    : type === "docx"
      ? extractDocxImages(files, cacheDir, workspaceDir)
      : extractXlsxImages(files, cacheDir, workspaceDir)

  // Shared by the result and the manifest so the two cannot drift apart.
  const relativeCacheDir = workspaceRelative(cacheDir, workspaceDir)
  const relativeManifestPath = workspaceRelative(manifestPath, workspaceDir)
  const relativeTextPath = workspaceRelative(textPath, workspaceDir)
  const tables = extractTables(type, relativeTextPath)

  const manifest: CachedManifest = {
    source: relativeSource,
    type,
    fingerprint,
    cache_dir: relativeCacheDir,
    manifest_path: relativeManifestPath,
    text_path: relativeTextPath,
    images,
    tables,
  }

  // Written last: the manifest's presence is what marks the cache entry as usable.
  writeFileSync(manifestPath, JSON.stringify(manifest, null, 2), "utf-8")

  return {
    status: "processed",
    source: relativeSource,
    type,
    cache_dir: relativeCacheDir,
    manifest_path: relativeManifestPath,
    text_path: relativeTextPath,
    images,
    tables,
  }
}
348
+
349
/**
 * Entry point: extracts text and images from a .pptx, .docx or .xlsx file
 * located inside the workspace.
 *
 * This function does not reject on failure; every outcome is reported
 * through the returned object:
 * - "processed" — materials were extracted or served from cache;
 * - "skipped"   — the extension is not supported (reason "unsupported_file_type");
 * - "failed"    — an error was thrown (for example the path is outside the
 *   workspace); `reason` carries the message and `source` is the path
 *   exactly as supplied by the caller.
 *
 * @param filePath     Absolute path, or a path relative to `workspaceDir`.
 * @param workspaceDir Workspace root directory.
 */
export async function extractDocumentMaterials(filePath: string, workspaceDir: string): Promise<DocumentMaterialsResult> {
  try {
    const resolvedFile = ensureWorkspacePath(filePath, workspaceDir)
    const extension = extname(resolvedFile).toLowerCase()
    const type = SUPPORTED_EXTENSIONS[extension]

    if (type) {
      return await processOfficeFile(resolvedFile, workspaceDir, type)
    }

    return {
      status: "skipped",
      source: workspaceRelative(resolvedFile, workspaceDir),
      type: "other",
      reason: "unsupported_file_type",
    }
  } catch (e) {
    const reason = e instanceof Error ? e.message : String(e)
    return {
      status: "failed",
      source: filePath,
      type: "other",
      reason,
    }
  }
}