gnosys 4.4.7 → 5.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.js +140 -0
- package/dist/cli.js.map +1 -1
- package/dist/index.js +65 -0
- package/dist/index.js.map +1 -1
- package/dist/lib/attachments.d.ts +43 -0
- package/dist/lib/attachments.d.ts.map +1 -0
- package/dist/lib/attachments.js +154 -0
- package/dist/lib/attachments.js.map +1 -0
- package/dist/lib/audioExtract.d.ts +39 -0
- package/dist/lib/audioExtract.d.ts.map +1 -0
- package/dist/lib/audioExtract.js +220 -0
- package/dist/lib/audioExtract.js.map +1 -0
- package/dist/lib/chunkSplitter.d.ts +46 -0
- package/dist/lib/chunkSplitter.d.ts.map +1 -0
- package/dist/lib/chunkSplitter.js +233 -0
- package/dist/lib/chunkSplitter.js.map +1 -0
- package/dist/lib/config.d.ts +70 -1
- package/dist/lib/config.d.ts.map +1 -1
- package/dist/lib/config.js +24 -0
- package/dist/lib/config.js.map +1 -1
- package/dist/lib/db.d.ts +7 -1
- package/dist/lib/db.d.ts.map +1 -1
- package/dist/lib/db.js +31 -4
- package/dist/lib/db.js.map +1 -1
- package/dist/lib/dbWrite.d.ts.map +1 -1
- package/dist/lib/dbWrite.js +11 -0
- package/dist/lib/dbWrite.js.map +1 -1
- package/dist/lib/docxExtract.d.ts +27 -0
- package/dist/lib/docxExtract.d.ts.map +1 -0
- package/dist/lib/docxExtract.js +80 -0
- package/dist/lib/docxExtract.js.map +1 -0
- package/dist/lib/fileDetect.d.ts +20 -0
- package/dist/lib/fileDetect.d.ts.map +1 -0
- package/dist/lib/fileDetect.js +124 -0
- package/dist/lib/fileDetect.js.map +1 -0
- package/dist/lib/imageExtract.d.ts +26 -0
- package/dist/lib/imageExtract.d.ts.map +1 -0
- package/dist/lib/imageExtract.js +113 -0
- package/dist/lib/imageExtract.js.map +1 -0
- package/dist/lib/llm.d.ts +9 -0
- package/dist/lib/llm.d.ts.map +1 -1
- package/dist/lib/llm.js +102 -0
- package/dist/lib/llm.js.map +1 -1
- package/dist/lib/multimodalIngest.d.ts +68 -0
- package/dist/lib/multimodalIngest.d.ts.map +1 -0
- package/dist/lib/multimodalIngest.js +463 -0
- package/dist/lib/multimodalIngest.js.map +1 -0
- package/dist/lib/pdfExtract.d.ts +29 -0
- package/dist/lib/pdfExtract.d.ts.map +1 -0
- package/dist/lib/pdfExtract.js +163 -0
- package/dist/lib/pdfExtract.js.map +1 -0
- package/dist/lib/store.d.ts +3 -0
- package/dist/lib/store.d.ts.map +1 -1
- package/dist/lib/store.js.map +1 -1
- package/dist/lib/videoExtract.d.ts +30 -0
- package/dist/lib/videoExtract.d.ts.map +1 -0
- package/dist/lib/videoExtract.js +92 -0
- package/dist/lib/videoExtract.js.map +1 -0
- package/package.json +3 -1
|
@@ -0,0 +1,463 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Gnosys Multimodal Ingestion Orchestrator — Ties together file detection,
|
|
3
|
+
* extraction, chunking, attachment management, and memory creation.
|
|
4
|
+
*
|
|
5
|
+
* Supports PDF, DOCX, plain text/Markdown, image, audio, and video files
|
|
6
|
+
* (images via a vision LLM; audio and video via transcription).
|
|
7
|
+
*/
|
|
8
|
+
import * as fs from "fs/promises";
|
|
9
|
+
import * as path from "path";
|
|
10
|
+
import { detectFileType } from "./fileDetect.js";
|
|
11
|
+
import { storeAttachment, linkMemoryToAttachment } from "./attachments.js";
|
|
12
|
+
import { extractPdfText } from "./pdfExtract.js";
|
|
13
|
+
import { extractDocxText } from "./docxExtract.js";
|
|
14
|
+
import { extractImageDescription } from "./imageExtract.js";
|
|
15
|
+
import { transcribeAudio } from "./audioExtract.js";
|
|
16
|
+
import { transcribeVideo } from "./videoExtract.js";
|
|
17
|
+
import { splitSegments, splitIntoChunks } from "./chunkSplitter.js";
|
|
18
|
+
import { GnosysIngestion } from "./ingest.js";
|
|
19
|
+
import { GnosysTagRegistry } from "./tags.js";
|
|
20
|
+
import { GnosysStore } from "./store.js";
|
|
21
|
+
import { createProvider } from "./llm.js";
|
|
22
|
+
import { loadConfig, DEFAULT_CONFIG, getProviderModel } from "./config.js";
|
|
23
|
+
// ─── Helpers ────────────────────────────────────────────────────────────
|
|
24
|
+
/**
 * Build a simple keyword-based relevance string from a chunk of text.
 * Used in "structured" mode when no LLM is available.
 *
 * The result is the cleaned source file name (extension removed) followed by
 * up to 20 of the chunk's most frequent words, de-duplicated and joined with
 * single spaces.
 *
 * @param text Chunk text to mine for keywords.
 * @param sourceFile Path or name of the file the chunk came from.
 * @returns Space-separated keyword string; empty when nothing usable is found.
 */
function buildRelevance(text, sourceFile) {
    // Tokenize on anything that is not a letter, digit, whitespace or hyphen.
    // Unicode property escapes keep accented and non-Latin words intact
    // instead of shredding them into short fragments.
    const words = text
        .toLowerCase()
        .replace(/[^\p{L}\p{N}\s-]/gu, " ")
        .split(/\s+/)
        // Keep words longer than 3 characters that contain at least one
        // letter or digit (drops hyphen-only runs such as "----").
        .filter((w) => w.length > 3 && /[\p{L}\p{N}]/u.test(w));
    // Count word frequency (basic term-frequency approach).
    const counts = new Map();
    for (const w of words) {
        counts.set(w, (counts.get(w) ?? 0) + 1);
    }
    // Most frequent first, capped at 20. Array.prototype.sort is stable, so
    // equally frequent words keep their first-seen order.
    const topWords = [...counts.entries()]
        .sort((a, b) => b[1] - a[1])
        .slice(0, 20)
        .map(([w]) => w);
    // Add the source filename (without extension) for searchability.
    const baseName = path
        .basename(sourceFile, path.extname(sourceFile))
        .replace(/[^\p{L}\p{N}]+/gu, " ")
        .trim();
    // Skip an empty file-name component so the result never starts with a space.
    const keywords = baseName ? [baseName, ...topWords] : topWords;
    return [...new Set(keywords)].join(" ");
}
|
|
51
|
+
/**
 * Generate a title for a chunk of text.
 *
 * Uses the first non-blank line (with any leading Markdown `#` markers
 * removed) when it is between 6 and 119 characters long. Otherwise falls back
 * to a descriptive title of the form "<file name> p<page> — chunk <n>".
 *
 * @param text Chunk text.
 * @param chunkIndex Zero-based index of the chunk; shown one-based in the fallback title.
 * @param sourceFile Path or name of the source file; its extension is dropped.
 * @param page Optional page label; omitted from the fallback title when falsy.
 * @returns The chosen title.
 */
function generateTitle(text, chunkIndex, sourceFile, page) {
    const baseName = path.basename(sourceFile, path.extname(sourceFile));
    // Look at the first line that has content, so leading blank lines or
    // indentation do not prevent a heading from being picked up.
    const firstLine = text
        .split("\n")
        .map((line) => line.trim())
        .find((line) => line.length > 0) ?? "";
    const heading = firstLine.replace(/^#+\s*/, "").trim();
    // Only accept it if it looks like a heading: not trivially short, not a paragraph.
    if (heading.length > 5 && heading.length < 120) {
        return heading;
    }
    // Fall back to a descriptive title.
    const pageLabel = page ? ` p${page}` : "";
    return `${baseName}${pageLabel} — chunk ${chunkIndex + 1}`;
}
|
|
66
|
+
// ─── Audio / Video helpers ───────────────────────────────────────────────
|
|
67
|
+
/**
 * Format a duration in seconds as "HH:MM:SS" for display in source_timerange.
 *
 * Fractional seconds are truncated. Hours are not capped, so durations of 100
 * hours or more produce more than two hour digits. Values that are missing,
 * not finite, or negative are rendered as "00:00:00" rather than as
 * "NaN:NaN:NaN" or a malformed negative field.
 *
 * @param seconds Offset in seconds.
 * @returns The zero-padded "HH:MM:SS" string.
 */
function formatTime(seconds) {
    const numeric = Number(seconds);
    const total = Number.isFinite(numeric) && numeric > 0 ? Math.floor(numeric) : 0;
    const pad = (value) => String(value).padStart(2, "0");
    const hours = Math.floor(total / 3600);
    const minutes = Math.floor((total % 3600) / 60);
    const secs = total % 60;
    return `${pad(hours)}:${pad(minutes)}:${pad(secs)}`;
}
|
|
76
|
+
/**
 * Derive the options object handed to the audio/video transcribers from the
 * loaded Gnosys configuration.
 *
 * @param config Gnosys config; its `multimodal` section may be absent.
 * @returns `{ provider, model }` read from `multimodal.transcriptionProvider`
 *   and `multimodal.whisperModel`; either is `undefined` when not configured.
 */
function buildTranscriptionOptions(config) {
    const multimodal = config.multimodal;
    const provider = multimodal?.transcriptionProvider;
    const model = multimodal?.whisperModel;
    return { provider, model };
}
|
|
85
|
+
/**
|
|
86
|
+
* Group transcript segments into ~2-minute time windows and convert
|
|
87
|
+
* them into TextChunks with timerange metadata.
|
|
88
|
+
*
|
|
89
|
+
* Each chunk gets a sourceTimerange like "00:01:23-00:03:15" so the
|
|
90
|
+
* resulting memory can reference back to the original audio/video.
|
|
91
|
+
*/
|
|
92
|
+
function buildTranscriptChunks(segments, targetSize) {
|
|
93
|
+
if (segments.length === 0)
|
|
94
|
+
return [];
|
|
95
|
+
const TARGET_WINDOW_SECONDS = 120; // ~2-minute windows
|
|
96
|
+
// Group segments into time windows
|
|
97
|
+
const windows = [];
|
|
98
|
+
let currentWindow = null;
|
|
99
|
+
for (const seg of segments) {
|
|
100
|
+
if (!seg.text.trim())
|
|
101
|
+
continue;
|
|
102
|
+
if (!currentWindow ||
|
|
103
|
+
seg.startTime - currentWindow.startTime >= TARGET_WINDOW_SECONDS) {
|
|
104
|
+
// Start a new window
|
|
105
|
+
if (currentWindow)
|
|
106
|
+
windows.push(currentWindow);
|
|
107
|
+
currentWindow = {
|
|
108
|
+
texts: [seg.text.trim()],
|
|
109
|
+
startTime: seg.startTime,
|
|
110
|
+
endTime: seg.endTime,
|
|
111
|
+
};
|
|
112
|
+
}
|
|
113
|
+
else {
|
|
114
|
+
// Add to the current window
|
|
115
|
+
currentWindow.texts.push(seg.text.trim());
|
|
116
|
+
currentWindow.endTime = seg.endTime;
|
|
117
|
+
}
|
|
118
|
+
}
|
|
119
|
+
if (currentWindow)
|
|
120
|
+
windows.push(currentWindow);
|
|
121
|
+
// Convert time windows into segments for splitSegments
|
|
122
|
+
const timedSegments = windows.map((w) => ({
|
|
123
|
+
text: w.texts.join(" "),
|
|
124
|
+
timerange: `${formatTime(w.startTime)}-${formatTime(w.endTime)}`,
|
|
125
|
+
}));
|
|
126
|
+
return splitSegments(timedSegments, { targetSize });
|
|
127
|
+
}
|
|
128
|
+
// ─── Public API ─────────────────────────────────────────────────────────
|
|
129
|
+
/**
 * Turn a title into a lowercase, hyphen-separated file name stem of at most
 * 60 characters, using `fallback` when the title contains no ASCII letters or
 * digits (which would otherwise yield an empty name).
 */
function slugifyForFilename(value, fallback) {
    const slug = value
        .toLowerCase()
        .replace(/[^a-z0-9]+/g, "-")
        .replace(/^-|-$/g, "")
        .substring(0, 60);
    return slug || fallback;
}
/**
 * Ingest a file into Gnosys memory.
 *
 * Steps:
 * 1. Detect file type (PDF, DOCX, text, image, audio, video)
 * 2. Load config (falling back to defaults) and validate file size
 * 3. Store the file as an attachment (skipped in dry run)
 * 4. Extract text based on file type
 * 5. Split text into memory-sized chunks
 * 6. Create a memory for each chunk (via LLM or structured mode)
 * 7. Link each memory back to the attachment
 *
 * Images take a separate path (ingestImage) and produce a single memory.
 * Failures on individual chunks are collected in `errors` and do not abort
 * the remaining chunks.
 *
 * @param options Ingestion options:
 *   - `filePath`: file to ingest.
 *   - `storePath`: Gnosys store location used for config, attachments and memories.
 *   - `mode`: "llm" (default) uses the LLM when available; anything else, or an
 *     unavailable LLM, uses structured mode.
 *   - `author` (default "human") and `authority` (default "imported"): written
 *     to each memory's frontmatter.
 *   - `dryRun` (default false): report what would be created without storing
 *     the attachment or writing memories.
 *   - `onProgress`: optional callback invoked with `{ current, total, title }`.
 * @returns `{ attachment, memories, errors, duration, fileType }`, where
 *   `duration` is elapsed milliseconds.
 * @throws Error when the file type is unknown or unsupported, or when the file
 *   exceeds `multimodal.maxFileSizeMb` (default 100). Errors from detection,
 *   stat, attachment storage and extraction also propagate.
 */
export async function ingestFile(options) {
    const startTime = Date.now();
    const {
        filePath,
        storePath,
        mode = "llm",
        author = "human",
        authority = "imported",
        dryRun = false,
        onProgress,
    } = options;
    // Step 1: Detect file type and reject unsupported types early.
    const fileInfo = await detectFileType(filePath);
    if (fileInfo.type === "unknown") {
        throw new Error(`Unsupported file type: ${path.extname(filePath) || "unknown"}. ` +
            `Supported: PDF, DOCX, TXT, MD, PNG, JPG, GIF, WEBP, MP3, WAV, M4A, MP4, MKV, MOV.`);
    }
    // Step 2: Load config and validate file size.
    let config = DEFAULT_CONFIG;
    try {
        config = await loadConfig(storePath);
    }
    catch {
        // Deliberate best-effort: a missing or unreadable config means defaults.
    }
    const stat = await fs.stat(filePath);
    const fileSizeMb = stat.size / (1024 * 1024);
    const maxSizeMb = config.multimodal?.maxFileSizeMb ?? 100;
    if (fileSizeMb > maxSizeMb) {
        throw new Error(`File is ${fileSizeMb.toFixed(1)}MB, which exceeds the ${maxSizeMb}MB limit. ` +
            `Adjust multimodal.maxFileSizeMb in gnosys.json to increase.`);
    }
    // Step 3: Store as attachment. A dry run uses a placeholder record instead.
    const attachment = dryRun
        ? {
            uuid: "dry-run-uuid",
            originalName: path.basename(filePath),
            extension: fileInfo.extension,
            mimeType: fileInfo.mimeType,
            sizeBytes: stat.size,
            contentHash: "dry-run-hash",
            createdAt: new Date().toISOString(),
            memoryIds: [],
        }
        : await storeAttachment(storePath, filePath);
    // Step 4: Extract text based on file type and split it into chunks.
    let chunks;
    const chunkSize = config.multimodal?.chunkSize ?? 1500;
    const sourceFileName = path.basename(filePath);
    switch (fileInfo.type) {
        case "pdf": {
            // Convert PDF sections into segments — pages tracked as metadata.
            const pdfChunks = await extractPdfText(filePath);
            const segments = pdfChunks.map((pc) => ({ text: pc.text, page: pc.pages }));
            chunks = splitSegments(segments, { targetSize: chunkSize });
            break;
        }
        case "docx": {
            // Each DOCX section becomes a segment; the section heading takes
            // the place of a page number.
            const docxChunks = await extractDocxText(filePath);
            const segments = docxChunks.map((dc) => ({ text: dc.text, page: dc.sectionHeading }));
            chunks = splitSegments(segments, { targetSize: chunkSize });
            break;
        }
        case "text": {
            // Plain text / markdown — read raw and split into chunks.
            const rawText = await fs.readFile(filePath, "utf-8");
            chunks = splitIntoChunks(rawText, { targetSize: chunkSize });
            break;
        }
        case "image": {
            // Images are described by a vision LLM and produce one memory,
            // not chunks, so they return through a separate path.
            return await ingestImage(filePath, storePath, config, attachment, {
                author,
                authority,
                dryRun,
                mode,
                onProgress,
                startTime,
                sourceFileName,
            });
        }
        case "audio": {
            // Transcribe, then chunk by time windows.
            const transcript = await transcribeAudio(filePath, buildTranscriptionOptions(config));
            chunks = buildTranscriptChunks(transcript.segments, chunkSize);
            break;
        }
        case "video": {
            // Transcribe the video's audio, then chunk by time windows.
            const videoTranscript = await transcribeVideo(filePath, buildTranscriptionOptions(config));
            chunks = buildTranscriptChunks(videoTranscript.segments, chunkSize);
            break;
        }
        default:
            throw new Error(`File type "${fileInfo.type}" is not yet supported for text extraction.`);
    }
    if (chunks.length === 0) {
        return {
            attachment,
            memories: [],
            errors: [{ chunk: 0, error: "No text content could be extracted from the file." }],
            duration: Date.now() - startTime,
            fileType: fileInfo.type,
        };
    }
    // Step 5: Initialize the store and ingestion pipeline.
    const gnosysStore = new GnosysStore(storePath);
    const tagRegistry = new GnosysTagRegistry(storePath);
    await tagRegistry.load();
    const ingestion = new GnosysIngestion(gnosysStore, tagRegistry, config);
    const today = new Date().toISOString().split("T")[0];
    const memories = [];
    const errors = [];
    // Step 6: Process each chunk into a memory.
    for (let i = 0; i < chunks.length; i++) {
        const chunk = chunks[i];
        if (onProgress) {
            onProgress({
                current: i + 1,
                total: chunks.length,
                title: `Processing chunk ${i + 1}/${chunks.length}`,
            });
        }
        try {
            let title;
            let category;
            let tags;
            let relevance;
            let content;
            let confidence;
            let filename;
            if (mode === "llm" && ingestion.isLLMAvailable) {
                // LLM-powered structuring.
                ({ title, category, tags, relevance, content, confidence, filename } =
                    await ingestion.ingest(chunk.text));
            }
            else {
                // Structured mode — no LLM needed.
                title = generateTitle(chunk.text, i, sourceFileName, chunk.sourcePage);
                category = "imported";
                tags = { type: ["imported"], source: ["document"] };
                relevance = buildRelevance(chunk.text, sourceFileName);
                content = chunk.text;
                confidence = 0.7;
                // Titles without ASCII letters/digits would slug to "" and be
                // written as ".md"; fall back to a per-chunk name instead.
                filename = slugifyForFilename(title, `chunk-${i + 1}`);
            }
            if (dryRun) {
                // Report what would be written without touching the store.
                memories.push({
                    id: `dry-run-${i}`,
                    title,
                    path: `${category}/${filename}.md`,
                    page: chunk.sourcePage,
                    timerange: chunk.sourceTimerange,
                });
                continue;
            }
            // Generate a unique ID and write the memory.
            const id = await gnosysStore.generateId(category);
            const frontmatter = {
                id,
                title,
                category,
                tags,
                relevance,
                author: author,
                authority: authority,
                confidence,
                created: today,
                modified: today,
                last_reviewed: today,
                status: "active",
                supersedes: null,
                // v5.0: Source tracking
                source_file: sourceFileName,
                source_page: chunk.sourcePage || null,
                source_timerange: chunk.sourceTimerange || null,
            };
            const memoryContent = `# ${title}\n\n${content}`;
            const relPath = await gnosysStore.writeMemory(category, `${filename}.md`, frontmatter, memoryContent);
            // Step 7: Link the memory to its attachment.
            await linkMemoryToAttachment(storePath, attachment.uuid, id);
            memories.push({
                id,
                title,
                path: relPath,
                page: chunk.sourcePage,
                timerange: chunk.sourceTimerange,
            });
        }
        catch (err) {
            // Record the failure and carry on with the remaining chunks.
            errors.push({
                chunk: i,
                error: err instanceof Error ? err.message : String(err),
            });
        }
    }
    return {
        attachment,
        memories,
        errors,
        duration: Date.now() - startTime,
        fileType: fileInfo.type,
    };
}
|
|
363
|
+
// ─── Image Ingestion ────────────────────────────────────────────────────
|
|
364
|
+
/**
|
|
365
|
+
* Ingest an image file into a single memory using a vision LLM.
|
|
366
|
+
* Resolves the vision provider from config: multimodal.visionProvider
|
|
367
|
+
* falls back to llm.defaultProvider. Similarly for the model.
|
|
368
|
+
*/
|
|
369
|
+
async function ingestImage(filePath, storePath, config, attachment, opts) {
|
|
370
|
+
const { author, authority, dryRun, onProgress, startTime, sourceFileName } = opts;
|
|
371
|
+
if (onProgress) {
|
|
372
|
+
onProgress({ current: 1, total: 1, title: "Analyzing image with vision LLM" });
|
|
373
|
+
}
|
|
374
|
+
// Resolve vision provider: config.multimodal.visionProvider > config.llm.defaultProvider
|
|
375
|
+
const visionProviderName = config.multimodal?.visionProvider || config.llm.defaultProvider;
|
|
376
|
+
const visionModel = config.multimodal?.visionModel || getProviderModel(config, visionProviderName);
|
|
377
|
+
const provider = createProvider(visionProviderName, visionModel, config);
|
|
378
|
+
// Extract description from the image
|
|
379
|
+
const imageDesc = await extractImageDescription(filePath, provider);
|
|
380
|
+
if (!imageDesc.text || imageDesc.text.trim().length === 0) {
|
|
381
|
+
return {
|
|
382
|
+
attachment,
|
|
383
|
+
memories: [],
|
|
384
|
+
errors: [{ chunk: 0, error: "Vision LLM returned an empty description for the image." }],
|
|
385
|
+
duration: Date.now() - startTime,
|
|
386
|
+
fileType: "image",
|
|
387
|
+
};
|
|
388
|
+
}
|
|
389
|
+
// Build the memory
|
|
390
|
+
const baseName = path.basename(filePath, path.extname(filePath));
|
|
391
|
+
const title = imageDesc.description
|
|
392
|
+
? (imageDesc.description.length > 100
|
|
393
|
+
? imageDesc.description.slice(0, 97) + "..."
|
|
394
|
+
: imageDesc.description)
|
|
395
|
+
: `Image: ${baseName}`;
|
|
396
|
+
const category = "imported";
|
|
397
|
+
const tags = {
|
|
398
|
+
type: ["imported", "image"],
|
|
399
|
+
source: ["image"],
|
|
400
|
+
...(imageDesc.topics.length > 0 ? { topic: imageDesc.topics.slice(0, 5) } : {}),
|
|
401
|
+
};
|
|
402
|
+
const relevance = [
|
|
403
|
+
baseName.replace(/[^a-zA-Z0-9]+/g, " ").trim(),
|
|
404
|
+
...imageDesc.topics,
|
|
405
|
+
]
|
|
406
|
+
.filter(Boolean)
|
|
407
|
+
.join(" ");
|
|
408
|
+
const confidence = 0.7;
|
|
409
|
+
const filename = baseName
|
|
410
|
+
.toLowerCase()
|
|
411
|
+
.replace(/[^a-z0-9]+/g, "-")
|
|
412
|
+
.replace(/^-|-$/g, "")
|
|
413
|
+
.substring(0, 60);
|
|
414
|
+
if (dryRun) {
|
|
415
|
+
return {
|
|
416
|
+
attachment,
|
|
417
|
+
memories: [
|
|
418
|
+
{
|
|
419
|
+
id: "dry-run-0",
|
|
420
|
+
title,
|
|
421
|
+
path: `${category}/${filename}.md`,
|
|
422
|
+
},
|
|
423
|
+
],
|
|
424
|
+
errors: [],
|
|
425
|
+
duration: Date.now() - startTime,
|
|
426
|
+
fileType: "image",
|
|
427
|
+
};
|
|
428
|
+
}
|
|
429
|
+
// Write the memory
|
|
430
|
+
const gnosysStore = new GnosysStore(storePath);
|
|
431
|
+
const today = new Date().toISOString().split("T")[0];
|
|
432
|
+
const id = await gnosysStore.generateId(category);
|
|
433
|
+
const frontmatter = {
|
|
434
|
+
id,
|
|
435
|
+
title,
|
|
436
|
+
category,
|
|
437
|
+
tags,
|
|
438
|
+
relevance,
|
|
439
|
+
author,
|
|
440
|
+
authority,
|
|
441
|
+
confidence,
|
|
442
|
+
created: today,
|
|
443
|
+
modified: today,
|
|
444
|
+
last_reviewed: today,
|
|
445
|
+
status: "active",
|
|
446
|
+
supersedes: null,
|
|
447
|
+
source_file: sourceFileName,
|
|
448
|
+
source_page: null,
|
|
449
|
+
source_timerange: null,
|
|
450
|
+
};
|
|
451
|
+
const memoryContent = `# ${title}\n\n${imageDesc.text}`;
|
|
452
|
+
const relPath = await gnosysStore.writeMemory(category, `${filename}.md`, frontmatter, memoryContent);
|
|
453
|
+
// Link the memory to its attachment
|
|
454
|
+
await linkMemoryToAttachment(storePath, attachment.uuid, id);
|
|
455
|
+
return {
|
|
456
|
+
attachment,
|
|
457
|
+
memories: [{ id, title, path: relPath }],
|
|
458
|
+
errors: [],
|
|
459
|
+
duration: Date.now() - startTime,
|
|
460
|
+
fileType: "image",
|
|
461
|
+
};
|
|
462
|
+
}
|
|
463
|
+
//# sourceMappingURL=multimodalIngest.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"multimodalIngest.js","sourceRoot":"","sources":["../../src/lib/multimodalIngest.ts"],"names":[],"mappings":"AAAA;;;;;;GAMG;AAEH,OAAO,KAAK,EAAE,MAAM,aAAa,CAAC;AAClC,OAAO,KAAK,IAAI,MAAM,MAAM,CAAC;AAC7B,OAAO,EAAE,cAAc,EAAY,MAAM,iBAAiB,CAAC;AAC3D,OAAO,EAAE,eAAe,EAAE,sBAAsB,EAAyB,MAAM,kBAAkB,CAAC;AAClG,OAAO,EAAE,cAAc,EAAE,MAAM,iBAAiB,CAAC;AACjD,OAAO,EAAE,eAAe,EAAE,MAAM,kBAAkB,CAAC;AACnD,OAAO,EAAE,uBAAuB,EAAE,MAAM,mBAAmB,CAAC;AAC5D,OAAO,EAAE,eAAe,EAAqD,MAAM,mBAAmB,CAAC;AACvG,OAAO,EAAE,eAAe,EAAE,MAAM,mBAAmB,CAAC;AACpD,OAAO,EAAE,aAAa,EAAE,eAAe,EAAkB,MAAM,oBAAoB,CAAC;AACpF,OAAO,EAAE,eAAe,EAAE,MAAM,aAAa,CAAC;AAC9C,OAAO,EAAE,iBAAiB,EAAE,MAAM,WAAW,CAAC;AAC9C,OAAO,EAAE,WAAW,EAAE,MAAM,YAAY,CAAC;AACzC,OAAO,EAAE,cAAc,EAAE,MAAM,UAAU,CAAC;AAC1C,OAAO,EAAE,UAAU,EAAE,cAAc,EAAE,gBAAgB,EAAqB,MAAM,aAAa,CAAC;AA4C9F,2EAA2E;AAE3E;;;GAGG;AACH,SAAS,cAAc,CAAC,IAAY,EAAE,UAAkB;IACtD,iEAAiE;IACjE,MAAM,KAAK,GAAG,IAAI;SACf,WAAW,EAAE;SACb,OAAO,CAAC,eAAe,EAAE,GAAG,CAAC;SAC7B,KAAK,CAAC,KAAK,CAAC;SACZ,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;IAE/B,uBAAuB;IACvB,MAAM,IAAI,GAAG,IAAI,GAAG,EAAkB,CAAC;IACvC,KAAK,MAAM,CAAC,IAAI,KAAK,EAAE,CAAC;QACtB,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IACtC,CAAC;IAED,iCAAiC;IACjC,MAAM,QAAQ,GAAG,CAAC,GAAG,IAAI,CAAC,OAAO,EAAE,CAAC;SACjC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;SAC3B,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC;SACZ,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC;IAEnB,gEAAgE;IAChE,MAAM,QAAQ,GAAG,IAAI,CAAC,QAAQ,CAAC,UAAU,EAAE,IAAI,CAAC,OAAO,CAAC,UAAU,CAAC,CAAC;SACjE,OAAO,CAAC,gBAAgB,EAAE,GAAG,CAAC;SAC9B,IAAI,EAAE,CAAC;IAEV,OAAO,CAAC,GAAG,IAAI,GAAG,CAAC,CAAC,QAAQ,EAAE,GAAG,QAAQ,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC;AACzD,CAAC;AAED;;;GAGG;AACH,SAAS,aAAa,CAAC,IAAY,EAAE,UAAkB,EAAE,UAAkB,EAAE,IAAa;IACxF,MAAM,QAAQ,GAAG,IAAI,CAAC,QAAQ,CAAC,UAAU,EAAE,IAAI,CAAC,OAAO,CAAC,UAAU,CAAC,CAAC,CAAC;IAErE,uDAAuD;I
ACvD,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,QAAQ,EAAE,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;IACnE,IAAI,SAAS,IAAI,SAAS,CAAC,MAAM,GAAG,CAAC,IAAI,SAAS,CAAC,MAAM,GAAG,GAAG,EAAE,CAAC;QAChE,OAAO,SAAS,CAAC;IACnB,CAAC;IAED,mCAAmC;IACnC,MAAM,SAAS,GAAG,IAAI,CAAC,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;IAC1C,OAAO,GAAG,QAAQ,GAAG,SAAS,YAAY,UAAU,GAAG,CAAC,EAAE,CAAC;AAC7D,CAAC;AAED,4EAA4E;AAE5E;;GAEG;AACH,SAAS,UAAU,CAAC,OAAe;IACjC,MAAM,CAAC,GAAG,IAAI,CAAC,KAAK,CAAC,OAAO,GAAG,IAAI,CAAC,CAAC;IACrC,MAAM,CAAC,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,OAAO,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC,CAAC;IAC5C,MAAM,CAAC,GAAG,IAAI,CAAC,KAAK,CAAC,OAAO,GAAG,EAAE,CAAC,CAAC;IACnC,OAAO,GAAG,CAAC,CAAC,QAAQ,EAAE,CAAC,QAAQ,CAAC,CAAC,EAAE,GAAG,CAAC,IAAI,CAAC,CAAC,QAAQ,EAAE,CAAC,QAAQ,CAAC,CAAC,EAAE,GAAG,CAAC,IAAI,CAAC,CAAC,QAAQ,EAAE,CAAC,QAAQ,CAAC,CAAC,EAAE,GAAG,CAAC,EAAE,CAAC;AAC9G,CAAC;AAED;;GAEG;AACH,SAAS,yBAAyB,CAAC,MAAoB;IACrD,OAAO;QACL,QAAQ,EAAE,MAAM,CAAC,UAAU,EAAE,qBAAqB;QAClD,KAAK,EAAE,MAAM,CAAC,UAAU,EAAE,YAAY;KACvC,CAAC;AACJ,CAAC;AAED;;;;;;GAMG;AACH,SAAS,qBAAqB,CAC5B,QAA6B,EAC7B,UAAkB;IAElB,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,CAAC;IAErC,MAAM,qBAAqB,GAAG,GAAG,CAAC,CAAC,oBAAoB;IAEvD,mCAAmC;IACnC,MAAM,OAAO,GAIR,EAAE,CAAC;IAER,IAAI,aAAa,GAAmE,IAAI,CAAC;IAEzF,KAAK,MAAM,GAAG,IAAI,QAAQ,EAAE,CAAC;QAC3B,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,IAAI,EAAE;YAAE,SAAS;QAE/B,IACE,CAAC,aAAa;YACd,GAAG,CAAC,SAAS,GAAG,aAAa,CAAC,SAAS,IAAI,qBAAqB,EAChE,CAAC;YACD,qBAAqB;YACrB,IAAI,aAAa;gBAAE,OAAO,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC;YAC/C,aAAa,GAAG;gBACd,KAAK,EAAE,CAAC,GAAG,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC;gBACxB,SAAS,EAAE,GAAG,CAAC,SAAS;gBACxB,OAAO,EAAE,GAAG,CAAC,OAAO;aACrB,CAAC;QACJ,CAAC;aAAM,CAAC;YACN,4BAA4B;YAC5B,aAAa,CAAC,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC,CAAC;YAC1C,aAAa,CAAC,OAAO,GAAG,GAAG,CAAC,OAAO,CAAC;QACtC,CAAC;IACH,CAAC;IAED,IAAI,aAAa;QAAE,OAAO,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC;IAE/C,uDAAuD;IACvD,MAAM,aAAa,GAAG,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;QACxC,IAAI,EAAE,CAAC,
CAAC,KAAK,CAAC,IAAI,CAAC,GAAG,CAAC;QACvB,SAAS,EAAE,GAAG,UAAU,CAAC,CAAC,CAAC,SAAS,CAAC,IAAI,UAAU,CAAC,CAAC,CAAC,OAAO,CAAC,EAAE;KACjE,CAAC,CAAC,CAAC;IAEJ,OAAO,aAAa,CAAC,aAAa,EAAE,EAAE,UAAU,EAAE,CAAC,CAAC;AACtD,CAAC;AAED,2EAA2E;AAE3E;;;;;;;;;;;GAWG;AACH,MAAM,CAAC,KAAK,UAAU,UAAU,CAAC,OAAgC;IAC/D,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;IAC7B,MAAM,EACJ,QAAQ,EACR,SAAS,EACT,IAAI,GAAG,KAAK,EACZ,MAAM,GAAG,OAAO,EAChB,SAAS,GAAG,UAAU,EACtB,MAAM,GAAG,KAAK,EACd,WAAW,EACX,UAAU,GACX,GAAG,OAAO,CAAC;IAEZ,2BAA2B;IAC3B,MAAM,QAAQ,GAAG,MAAM,cAAc,CAAC,QAAQ,CAAC,CAAC;IAEhD,iCAAiC;IACjC,IAAI,QAAQ,CAAC,IAAI,KAAK,SAAS,EAAE,CAAC;QAChC,MAAM,IAAI,KAAK,CACb,0BAA0B,IAAI,CAAC,OAAO,CAAC,QAAQ,CAAC,IAAI,SAAS,IAAI;YACjE,mFAAmF,CACpF,CAAC;IACJ,CAAC;IAED,6BAA6B;IAC7B,IAAI,MAAM,GAAiB,cAAc,CAAC;IAC1C,IAAI,CAAC;QACH,MAAM,GAAG,MAAM,UAAU,CAAC,SAAS,CAAC,CAAC;IACvC,CAAC;IAAC,MAAM,CAAC;QACP,eAAe;IACjB,CAAC;IAED,MAAM,IAAI,GAAG,MAAM,EAAE,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;IACrC,MAAM,UAAU,GAAG,IAAI,CAAC,IAAI,GAAG,CAAC,IAAI,GAAG,IAAI,CAAC,CAAC;IAC7C,MAAM,SAAS,GAAG,MAAM,CAAC,UAAU,EAAE,aAAa,IAAI,GAAG,CAAC;IAE1D,IAAI,UAAU,GAAG,SAAS,EAAE,CAAC;QAC3B,MAAM,IAAI,KAAK,CACb,WAAW,UAAU,CAAC,OAAO,CAAC,CAAC,CAAC,yBAAyB,SAAS,YAAY;YAC9E,6DAA6D,CAC9D,CAAC;IACJ,CAAC;IAED,gDAAgD;IAChD,IAAI,UAA4B,CAAC;IACjC,IAAI,MAAM,EAAE,CAAC;QACX,UAAU,GAAG;YACX,IAAI,EAAE,cAAc;YACpB,YAAY,EAAE,IAAI,CAAC,QAAQ,CAAC,QAAQ,CAAC;YACrC,SAAS,EAAE,QAAQ,CAAC,SAAS;YAC7B,QAAQ,EAAE,QAAQ,CAAC,QAAQ;YAC3B,SAAS,EAAE,IAAI,CAAC,IAAI;YACpB,WAAW,EAAE,cAAc;YAC3B,SAAS,EAAE,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE;YACnC,SAAS,EAAE,EAAE;SACd,CAAC;IACJ,CAAC;SAAM,CAAC;QACN,UAAU,GAAG,MAAM,eAAe,CAAC,SAAS,EAAE,QAAQ,CAAC,CAAC;IAC1D,CAAC;IAED,0CAA0C;IAC1C,IAAI,MAAmB,CAAC;IACxB,MAAM,SAAS,GAAG,MAAM,CAAC,UAAU,EAAE,SAAS,IAAI,IAAI,CAAC;IACvD,MAAM,cAAc,GAAG,IAAI,CAAC,QAAQ,CAAC,QAAQ,CAAC,CAAC;IAE/C,QAAQ,QAAQ,CAAC,IAAI,EAAE,CAAC;QACtB,KAAK,KAAK,CAAC,CAAC,CAAC;YACX,MAAM,SAAS,GAAG,MAAM,cAAc,CAAC,QAAQ,CAAC,CAAC;YACjD,iEAAiE;YACjE,MAAM,QAAQ,GAAG,SAAS,CAAC,GAAG,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,CAAC;gBACtC,IAAI,EAAE,E
AAE,CAAC,IAAI;gBACb,IAAI,EAAE,EAAE,CAAC,KAAK;aACf,CAAC,CAAC,CAAC;YACJ,MAAM,GAAG,aAAa,CAAC,QAAQ,EAAE,EAAE,UAAU,EAAE,SAAS,EAAE,CAAC,CAAC;YAC5D,MAAM;QACR,CAAC;QAED,KAAK,MAAM,CAAC,CAAC,CAAC;YACZ,MAAM,UAAU,GAAG,MAAM,eAAe,CAAC,QAAQ,CAAC,CAAC;YACnD,iEAAiE;YACjE,MAAM,QAAQ,GAAG,UAAU,CAAC,GAAG,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,CAAC;gBACvC,IAAI,EAAE,EAAE,CAAC,IAAI;gBACb,IAAI,EAAE,EAAE,CAAC,cAAc;aACxB,CAAC,CAAC,CAAC;YACJ,MAAM,GAAG,aAAa,CAAC,QAAQ,EAAE,EAAE,UAAU,EAAE,SAAS,EAAE,CAAC,CAAC;YAC5D,MAAM;QACR,CAAC;QAED,KAAK,MAAM,CAAC,CAAC,CAAC;YACZ,yDAAyD;YACzD,MAAM,OAAO,GAAG,MAAM,EAAE,CAAC,QAAQ,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;YACrD,MAAM,GAAG,eAAe,CAAC,OAAO,EAAE,EAAE,UAAU,EAAE,SAAS,EAAE,CAAC,CAAC;YAC7D,MAAM;QACR,CAAC;QAED,KAAK,OAAO,CAAC,CAAC,CAAC;YACb,2DAA2D;YAC3D,oDAAoD;YACpD,mEAAmE;YACnE,OAAO,MAAM,WAAW,CAAC,QAAQ,EAAE,SAAS,EAAE,MAAM,EAAE,UAAU,EAAE;gBAChE,MAAM;gBACN,SAAS;gBACT,MAAM;gBACN,IAAI;gBACJ,UAAU;gBACV,SAAS;gBACT,cAAc;aACf,CAAC,CAAC;QACL,CAAC;QAED,KAAK,OAAO,CAAC,CAAC,CAAC;YACb,uEAAuE;YACvE,MAAM,YAAY,GAAG,yBAAyB,CAAC,MAAM,CAAC,CAAC;YACvD,MAAM,UAAU,GAAG,MAAM,eAAe,CAAC,QAAQ,EAAE,YAAY,CAAC,CAAC;YACjE,MAAM,GAAG,qBAAqB,CAAC,UAAU,CAAC,QAAQ,EAAE,SAAS,CAAC,CAAC;YAC/D,MAAM;QACR,CAAC;QAED,KAAK,OAAO,CAAC,CAAC,CAAC;YACb,oEAAoE;YACpE,MAAM,YAAY,GAAG,yBAAyB,CAAC,MAAM,CAAC,CAAC;YACvD,MAAM,eAAe,GAAG,MAAM,eAAe,CAAC,QAAQ,EAAE,YAAY,CAAC,CAAC;YACtE,MAAM,GAAG,qBAAqB,CAAC,eAAe,CAAC,QAAQ,EAAE,SAAS,CAAC,CAAC;YACpE,MAAM;QACR,CAAC;QAED;YACE,MAAM,IAAI,KAAK,CAAC,cAAc,QAAQ,CAAC,IAAI,6CAA6C,CAAC,CAAC;IAC9F,CAAC;IAED,IAAI,MAAM,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACxB,OAAO;YACL,UAAU;YACV,QAAQ,EAAE,EAAE;YACZ,MAAM,EAAE,CAAC,EAAE,KAAK,EAAE,CAAC,EAAE,KAAK,EAAE,mDAAmD,EAAE,CAAC;YAClF,QAAQ,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS;YAChC,QAAQ,EAAE,QAAQ,CAAC,IAAI;SACxB,CAAC;IACJ,CAAC;IAED,sDAAsD;IACtD,MAAM,WAAW,GAAG,IAAI,WAAW,CAAC,SAAS,CAAC,CAAC;IAC/C,MAAM,WAAW,GAAG,IAAI,iBAAiB,CAAC,SAAS,CAAC,CAAC;IACrD,MAAM,WAAW,CAAC,IAAI,EAAE,CAAC;IACzB,MAAM,SAAS,GAAG,IAAI,eAAe,CAAC,WAAW,EAAE,WAAW,EAAE,MAAM,CAAC,CAAC;IAExE,MAAM,KAAK,GAAG,IAAI,IAAI,EAAE,
CAAC,WAAW,EAAE,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;IACrD,MAAM,QAAQ,GAAuC,EAAE,CAAC;IACxD,MAAM,MAAM,GAAqC,EAAE,CAAC;IAEpD,2CAA2C;IAC3C,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACvC,MAAM,KAAK,GAAG,MAAM,CAAC,CAAC,CAAC,CAAC;QAExB,IAAI,UAAU,EAAE,CAAC;YACf,UAAU,CAAC;gBACT,OAAO,EAAE,CAAC,GAAG,CAAC;gBACd,KAAK,EAAE,MAAM,CAAC,MAAM;gBACpB,KAAK,EAAE,oBAAoB,CAAC,GAAG,CAAC,IAAI,MAAM,CAAC,MAAM,EAAE;aACpD,CAAC,CAAC;QACL,CAAC;QAED,IAAI,CAAC;YACH,IAAI,KAAa,CAAC;YAClB,IAAI,QAAgB,CAAC;YACrB,IAAI,IAA8B,CAAC;YACnC,IAAI,SAAiB,CAAC;YACtB,IAAI,OAAe,CAAC;YACpB,IAAI,UAAkB,CAAC;YACvB,IAAI,QAAgB,CAAC;YAErB,IAAI,IAAI,KAAK,KAAK,IAAI,SAAS,CAAC,cAAc,EAAE,CAAC;gBAC/C,0BAA0B;gBAC1B,MAAM,MAAM,GAAG,MAAM,SAAS,CAAC,MAAM,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;gBAClD,KAAK,GAAG,MAAM,CAAC,KAAK,CAAC;gBACrB,QAAQ,GAAG,MAAM,CAAC,QAAQ,CAAC;gBAC3B,IAAI,GAAG,MAAM,CAAC,IAAI,CAAC;gBACnB,SAAS,GAAG,MAAM,CAAC,SAAS,CAAC;gBAC7B,OAAO,GAAG,MAAM,CAAC,OAAO,CAAC;gBACzB,UAAU,GAAG,MAAM,CAAC,UAAU,CAAC;gBAC/B,QAAQ,GAAG,MAAM,CAAC,QAAQ,CAAC;YAC7B,CAAC;iBAAM,CAAC;gBACN,kCAAkC;gBAClC,KAAK,GAAG,aAAa,CAAC,KAAK,CAAC,IAAI,EAAE,CAAC,EAAE,cAAc,EAAE,KAAK,CAAC,UAAU,CAAC,CAAC;gBACvE,QAAQ,GAAG,UAAU,CAAC;gBACtB,IAAI,GAAG,EAAE,IAAI,EAAE,CAAC,UAAU,CAAC,EAAE,MAAM,EAAE,CAAC,UAAU,CAAC,EAAE,CAAC;gBACpD,SAAS,GAAG,cAAc,CAAC,KAAK,CAAC,IAAI,EAAE,cAAc,CAAC,CAAC;gBACvD,OAAO,GAAG,KAAK,CAAC,IAAI,CAAC;gBACrB,UAAU,GAAG,GAAG,CAAC;gBACjB,QAAQ,GAAG,KAAK;qBACb,WAAW,EAAE;qBACb,OAAO,CAAC,aAAa,EAAE,GAAG,CAAC;qBAC3B,OAAO,CAAC,QAAQ,EAAE,EAAE,CAAC;qBACrB,SAAS,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;YACtB,CAAC;YAED,IAAI,MAAM,EAAE,CAAC;gBACX,QAAQ,CAAC,IAAI,CAAC;oBACZ,EAAE,EAAE,WAAW,CAAC,EAAE;oBAClB,KAAK;oBACL,IAAI,EAAE,GAAG,QAAQ,IAAI,QAAQ,KAAK;oBAClC,IAAI,EAAE,KAAK,CAAC,UAAU;oBACtB,SAAS,EAAE,KAAK,CAAC,eAAe;iBACjC,CAAC,CAAC;gBACH,SAAS;YACX,CAAC;YAED,4CAA4C;YAC5C,MAAM,EAAE,GAAG,MAAM,WAAW,CAAC,UAAU,CAAC,QAAQ,CAAC,CAAC;YAElD,MAAM,WAAW,GAAG;gBAClB,EAAE;gBACF,KAAK;gBACL,QAAQ;gBACR,IAAI;gBACJ,SAAS;gBACT,MAAM,EAAE,MAAqC;gBAC7C,SAAS,EAAE,SAA8D;gBACzE,U
AAU;gBACV,OAAO,EAAE,KAAK;gBACd,QAAQ,EAAE,KAAK;gBACf,aAAa,EAAE,KAAK;gBACpB,MAAM,EAAE,QAAiB;gBACzB,UAAU,EAAE,IAAI;gBAChB,wBAAwB;gBACxB,WAAW,EAAE,cAAc;gBAC3B,WAAW,EAAE,KAAK,CAAC,UAAU,IAAI,IAAI;gBACrC,gBAAgB,EAAE,KAAK,CAAC,eAAe,IAAI,IAAI;aAChD,CAAC;YAEF,MAAM,aAAa,GAAG,KAAK,KAAK,OAAO,OAAO,EAAE,CAAC;YACjD,MAAM,OAAO,GAAG,MAAM,WAAW,CAAC,WAAW,CAC3C,QAAQ,EACR,GAAG,QAAQ,KAAK,EAChB,WAAW,EACX,aAAa,CACd,CAAC;YAEF,oCAAoC;YACpC,MAAM,sBAAsB,CAAC,SAAS,EAAE,UAAU,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC;YAE7D,QAAQ,CAAC,IAAI,CAAC;gBACZ,EAAE;gBACF,KAAK;gBACL,IAAI,EAAE,OAAO;gBACb,IAAI,EAAE,KAAK,CAAC,UAAU;gBACtB,SAAS,EAAE,KAAK,CAAC,eAAe;aACjC,CAAC,CAAC;QACL,CAAC;QAAC,OAAO,GAAG,EAAE,CAAC;YACb,MAAM,CAAC,IAAI,CAAC;gBACV,KAAK,EAAE,CAAC;gBACR,KAAK,EAAE,GAAG,YAAY,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC;aACxD,CAAC,CAAC;QACL,CAAC;IACH,CAAC;IAED,OAAO;QACL,UAAU;QACV,QAAQ;QACR,MAAM;QACN,QAAQ,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS;QAChC,QAAQ,EAAE,QAAQ,CAAC,IAAI;KACxB,CAAC;AACJ,CAAC;AAED,2EAA2E;AAE3E;;;;GAIG;AACH,KAAK,UAAU,WAAW,CACxB,QAAgB,EAChB,SAAiB,EACjB,MAAoB,EACpB,UAA4B,EAC5B,IAQC;IAED,MAAM,EAAE,MAAM,EAAE,SAAS,EAAE,MAAM,EAAE,UAAU,EAAE,SAAS,EAAE,cAAc,EAAE,GAAG,IAAI,CAAC;IAElF,IAAI,UAAU,EAAE,CAAC;QACf,UAAU,CAAC,EAAE,OAAO,EAAE,CAAC,EAAE,KAAK,EAAE,CAAC,EAAE,KAAK,EAAE,iCAAiC,EAAE,CAAC,CAAC;IACjF,CAAC;IAED,yFAAyF;IACzF,MAAM,kBAAkB,GAAG,MAAM,CAAC,UAAU,EAAE,cAAc,IAAI,MAAM,CAAC,GAAG,CAAC,eAAe,CAAC;IAC3F,MAAM,WAAW,GAAG,MAAM,CAAC,UAAU,EAAE,WAAW,IAAI,gBAAgB,CAAC,MAAM,EAAE,kBAAkB,CAAC,CAAC;IACnG,MAAM,QAAQ,GAAG,cAAc,CAAC,kBAAkB,EAAE,WAAW,EAAE,MAAM,CAAC,CAAC;IAEzE,qCAAqC;IACrC,MAAM,SAAS,GAAG,MAAM,uBAAuB,CAAC,QAAQ,EAAE,QAAQ,CAAC,CAAC;IAEpE,IAAI,CAAC,SAAS,CAAC,IAAI,IAAI,SAAS,CAAC,IAAI,CAAC,IAAI,EAAE,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC1D,OAAO;YACL,UAAU;YACV,QAAQ,EAAE,EAAE;YACZ,MAAM,EAAE,CAAC,EAAE,KAAK,EAAE,CAAC,EAAE,KAAK,EAAE,yDAAyD,EAAE,CAAC;YACxF,QAAQ,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS;YAChC,QAAQ,EAAE,OAAO;SAClB,CAAC;IACJ,CAAC;IAED,mBAAmB;IACnB,MAAM,QAAQ,GAAG,IAAI,CAAC,QAAQ,CAAC,QAAQ,EAAE,IAAI,CAAC,O
AAO,CAAC,QAAQ,CAAC,CAAC,CAAC;IACjE,MAAM,KAAK,GAAG,SAAS,CAAC,WAAW;QACjC,CAAC,CAAC,CAAC,SAAS,CAAC,WAAW,CAAC,MAAM,GAAG,GAAG;YACjC,CAAC,CAAC,SAAS,CAAC,WAAW,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,GAAG,KAAK;YAC5C,CAAC,CAAC,SAAS,CAAC,WAAW,CAAC;QAC5B,CAAC,CAAC,UAAU,QAAQ,EAAE,CAAC;IACzB,MAAM,QAAQ,GAAG,UAAU,CAAC;IAC5B,MAAM,IAAI,GAA6B;QACrC,IAAI,EAAE,CAAC,UAAU,EAAE,OAAO,CAAC;QAC3B,MAAM,EAAE,CAAC,OAAO,CAAC;QACjB,GAAG,CAAC,SAAS,CAAC,MAAM,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,EAAE,KAAK,EAAE,SAAS,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;KAChF,CAAC;IACF,MAAM,SAAS,GAAG;QAChB,QAAQ,CAAC,OAAO,CAAC,gBAAgB,EAAE,GAAG,CAAC,CAAC,IAAI,EAAE;QAC9C,GAAG,SAAS,CAAC,MAAM;KACpB;SACE,MAAM,CAAC,OAAO,CAAC;SACf,IAAI,CAAC,GAAG,CAAC,CAAC;IACb,MAAM,UAAU,GAAG,GAAG,CAAC;IACvB,MAAM,QAAQ,GAAG,QAAQ;SACtB,WAAW,EAAE;SACb,OAAO,CAAC,aAAa,EAAE,GAAG,CAAC;SAC3B,OAAO,CAAC,QAAQ,EAAE,EAAE,CAAC;SACrB,SAAS,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;IAEpB,IAAI,MAAM,EAAE,CAAC;QACX,OAAO;YACL,UAAU;YACV,QAAQ,EAAE;gBACR;oBACE,EAAE,EAAE,WAAW;oBACf,KAAK;oBACL,IAAI,EAAE,GAAG,QAAQ,IAAI,QAAQ,KAAK;iBACnC;aACF;YACD,MAAM,EAAE,EAAE;YACV,QAAQ,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS;YAChC,QAAQ,EAAE,OAAO;SAClB,CAAC;IACJ,CAAC;IAED,mBAAmB;IACnB,MAAM,WAAW,GAAG,IAAI,WAAW,CAAC,SAAS,CAAC,CAAC;IAC/C,MAAM,KAAK,GAAG,IAAI,IAAI,EAAE,CAAC,WAAW,EAAE,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;IACrD,MAAM,EAAE,GAAG,MAAM,WAAW,CAAC,UAAU,CAAC,QAAQ,CAAC,CAAC;IAElD,MAAM,WAAW,GAAG;QAClB,EAAE;QACF,KAAK;QACL,QAAQ;QACR,IAAI;QACJ,SAAS;QACT,MAAM;QACN,SAAS;QACT,UAAU;QACV,OAAO,EAAE,KAAK;QACd,QAAQ,EAAE,KAAK;QACf,aAAa,EAAE,KAAK;QACpB,MAAM,EAAE,QAAiB;QACzB,UAAU,EAAE,IAAI;QAChB,WAAW,EAAE,cAAc;QAC3B,WAAW,EAAE,IAAI;QACjB,gBAAgB,EAAE,IAAI;KACvB,CAAC;IAEF,MAAM,aAAa,GAAG,KAAK,KAAK,OAAO,SAAS,CAAC,IAAI,EAAE,CAAC;IACxD,MAAM,OAAO,GAAG,MAAM,WAAW,CAAC,WAAW,CAC3C,QAAQ,EACR,GAAG,QAAQ,KAAK,EAChB,WAAW,EACX,aAAa,CACd,CAAC;IAEF,oCAAoC;IACpC,MAAM,sBAAsB,CAAC,SAAS,EAAE,UAAU,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC;IAE7D,OAAO;QACL,UAAU;QACV,QAAQ,EAAE,CAAC,EAAE,EAAE,EAAE,KAAK,EAAE,IAAI,EAAE,O
AAO,EAAE,CAAC;QACxC,MAAM,EAAE,EAAE;QACV,QAAQ,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS;QAChC,QAAQ,EAAE,OAAO;KAClB,CAAC;AACJ,CAAC"}
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Gnosys PDF Extraction — Extract text from PDF files by section.
|
|
3
|
+
*
|
|
4
|
+
* Splits on content structure (headings, topic boundaries) rather than page
|
|
5
|
+
* breaks. Page numbers are tracked as metadata so memories know which pages
|
|
6
|
+
* they came from. This mirrors DOCX extraction behavior.
|
|
7
|
+
*
|
|
8
|
+
* Uses pdf-parse v2 (class-based API wrapping Mozilla's pdf.js) via dynamic
|
|
9
|
+
* import so it stays optional and doesn't break builds that don't need it.
|
|
10
|
+
*/
|
|
11
|
+
export interface PdfChunk {
|
|
12
|
+
/** Extracted text content for this section */
|
|
13
|
+
text: string;
|
|
14
|
+
/** Section heading if detected */
|
|
15
|
+
sectionHeading?: string;
|
|
16
|
+
/** Page range where this section appears, e.g. "3" or "3-5" */
|
|
17
|
+
pages: string;
|
|
18
|
+
/** Total number of pages in the PDF */
|
|
19
|
+
pageCount: number;
|
|
20
|
+
}
|
|
21
|
+
/**
|
|
22
|
+
* Extract text from a PDF file, splitting by sections/topics.
|
|
23
|
+
*
|
|
24
|
+
* First extracts all text with page tracking, then splits on detected
|
|
25
|
+
* headings or structural boundaries. Each chunk includes the page range
|
|
26
|
+
* where the content appears.
|
|
27
|
+
*/
|
|
28
|
+
export declare function extractPdfText(filePath: string): Promise<PdfChunk[]>;
|
|
29
|
+
//# sourceMappingURL=pdfExtract.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"pdfExtract.d.ts","sourceRoot":"","sources":["../../src/lib/pdfExtract.ts"],"names":[],"mappings":"AAAA;;;;;;;;;GASG;AAMH,MAAM,WAAW,QAAQ;IACvB,8CAA8C;IAC9C,IAAI,EAAE,MAAM,CAAC;IACb,kCAAkC;IAClC,cAAc,CAAC,EAAE,MAAM,CAAC;IACxB,+DAA+D;IAC/D,KAAK,EAAE,MAAM,CAAC;IACd,uCAAuC;IACvC,SAAS,EAAE,MAAM,CAAC;CACnB;AAgBD;;;;;;GAMG;AACH,wBAAsB,cAAc,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,QAAQ,EAAE,CAAC,CAiE1E"}
|