vectra-js 0.9.6 → 0.9.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/FUNDING.yml +4 -0
- package/.github/workflows/npm-publish.yml +3 -4
- package/README.md +392 -537
- package/RELEASE_NOTES.md +15 -0
- package/docs/assets/vectraArch.png +0 -0
- package/examples/chromadb.js +96 -0
- package/examples/pg-prisma.js +119 -0
- package/examples/postgress.js +115 -0
- package/package.json +4 -3
- package/src/backends/gemini.js +15 -8
- package/src/backends/openrouter.js +2 -2
- package/src/backends/postgres_store.js +191 -0
- package/src/config.js +1 -1
- package/src/core.js +174 -130
- package/src/observability.js +0 -6
- package/src/processor.js +32 -2
- package/src/webconfig_server.js +1 -1
package/src/core.js
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
const fs = require('fs');
|
|
2
2
|
const path = require('path');
|
|
3
|
-
const { RAGConfigSchema, ProviderType,
|
|
3
|
+
const { RAGConfigSchema, ProviderType, RetrievalStrategy } = require('./config');
|
|
4
4
|
const crypto = require('crypto');
|
|
5
5
|
const { DocumentProcessor } = require('./processor');
|
|
6
6
|
const { OpenAIBackend } = require('./backends/openai');
|
|
@@ -10,6 +10,7 @@ const { OpenRouterBackend } = require('./backends/openrouter');
|
|
|
10
10
|
const { HuggingFaceBackend } = require('./backends/huggingface');
|
|
11
11
|
const { PrismaVectorStore } = require('./backends/prisma_store');
|
|
12
12
|
const { ChromaVectorStore } = require('./backends/chroma_store');
|
|
13
|
+
const { PostgresVectorStore } = require('./backends/postgres_store');
|
|
13
14
|
const { QdrantVectorStore } = require('./backends/qdrant_store');
|
|
14
15
|
const { MilvusVectorStore } = require('./backends/milvus_store');
|
|
15
16
|
const { LLMReranker } = require('./reranker');
|
|
@@ -19,6 +20,18 @@ const { v5: uuidv5 } = require('uuid');
|
|
|
19
20
|
const { v4: uuidv4 } = require('uuid');
|
|
20
21
|
const SQLiteLogger = require('./observability');
|
|
21
22
|
|
|
23
|
+
const DEFAULT_TOKEN_BUDGET = 2048;
|
|
24
|
+
const DEFAULT_PREFER_SUMMARY_BELOW = 1024;
|
|
25
|
+
const DEFAULT_SUMMARY_LENGTH = 800;
|
|
26
|
+
const DEFAULT_CHUNK_LENGTH = 1200;
|
|
27
|
+
const DEFAULT_FALLBACK_SUMMARY_LENGTH = 300;
|
|
28
|
+
const DEFAULT_KEYWORD_COUNT = 10;
|
|
29
|
+
const DEFAULT_MEMORY_MESSAGES = 20;
|
|
30
|
+
const DEFAULT_CONCURRENCY_LIMIT = 5;
|
|
31
|
+
const DEFAULT_RETRY_ATTEMPTS = 3;
|
|
32
|
+
const DEFAULT_INITIAL_RETRY_DELAY = 500;
|
|
33
|
+
const DEFAULT_MAX_RETRY_DELAY = 4000;
|
|
34
|
+
|
|
22
35
|
class VectraClient {
|
|
23
36
|
constructor(config) {
|
|
24
37
|
const parsed = RAGConfigSchema.parse(config);
|
|
@@ -49,7 +62,7 @@ class VectraClient {
|
|
|
49
62
|
this.vectorStore = this.createVectorStore(this.config.database);
|
|
50
63
|
this._embeddingCache = new Map();
|
|
51
64
|
this._metadataEnrichmentEnabled = !!(this.config.metadata && this.config.metadata.enrichment);
|
|
52
|
-
const mm = this.config.memory?.maxMessages ||
|
|
65
|
+
const mm = this.config.memory?.maxMessages || DEFAULT_MEMORY_MESSAGES;
|
|
53
66
|
if (this.config.memory && this.config.memory.enabled) {
|
|
54
67
|
if (this.config.memory.type === 'in-memory') {
|
|
55
68
|
this.history = new InMemoryHistory(mm);
|
|
@@ -98,6 +111,7 @@ class VectraClient {
|
|
|
98
111
|
if (!dbConfig || !dbConfig.type) throw new Error('Database config missing type');
|
|
99
112
|
const t = dbConfig.type.toLowerCase();
|
|
100
113
|
if (t === 'prisma') return new PrismaVectorStore(dbConfig);
|
|
114
|
+
if (t === 'postgres') return new PostgresVectorStore(dbConfig);
|
|
101
115
|
if (t === 'chroma') return new ChromaVectorStore(dbConfig);
|
|
102
116
|
if (t === 'qdrant') return new QdrantVectorStore(dbConfig);
|
|
103
117
|
if (t === 'milvus') return new MilvusVectorStore(dbConfig);
|
|
@@ -127,14 +141,149 @@ class VectraClient {
|
|
|
127
141
|
const words = c.toLowerCase().replace(/[^a-z0-9\s]/g, ' ').split(/\s+/).filter(w => w.length > 3);
|
|
128
142
|
const freq = {};
|
|
129
143
|
for (const w of words) freq[w] = (freq[w] || 0) + 1;
|
|
130
|
-
const top = Object.entries(freq).sort((a,b)=>b[1]-a[1]).slice(0,
|
|
131
|
-
const summary = c.slice(0,
|
|
144
|
+
const top = Object.entries(freq).sort((a,b)=>b[1]-a[1]).slice(0,DEFAULT_KEYWORD_COUNT).map(([w])=>w);
|
|
145
|
+
const summary = c.slice(0, DEFAULT_FALLBACK_SUMMARY_LENGTH);
|
|
132
146
|
enriched.push({ summary, keywords: top, hypothetical_questions: [] });
|
|
133
147
|
}
|
|
134
148
|
}
|
|
135
149
|
return enriched;
|
|
136
150
|
}
|
|
137
151
|
|
|
152
|
+
async _batchEmbedChunks(toEmbed, mapIndex, hashes) {
|
|
153
|
+
const newEmbeds = [];
|
|
154
|
+
if (toEmbed.length > 0) {
|
|
155
|
+
const enabled = !!(this.config.ingestion && this.config.ingestion.rateLimitEnabled);
|
|
156
|
+
const defaultLimit = (this.config.ingestion && typeof this.config.ingestion.concurrencyLimit === 'number') ? this.config.ingestion.concurrencyLimit : DEFAULT_CONCURRENCY_LIMIT;
|
|
157
|
+
const limit = enabled ? defaultLimit : toEmbed.length;
|
|
158
|
+
const batches = [];
|
|
159
|
+
for (let i = 0; i < toEmbed.length; i += limit) batches.push(toEmbed.slice(i, i + limit));
|
|
160
|
+
for (const batch of batches) {
|
|
161
|
+
let attempt = 0; let delay = DEFAULT_INITIAL_RETRY_DELAY;
|
|
162
|
+
while (true) {
|
|
163
|
+
try {
|
|
164
|
+
const out = await this.embedder.embedDocuments(batch);
|
|
165
|
+
newEmbeds.push(...out);
|
|
166
|
+
break;
|
|
167
|
+
} catch (err) {
|
|
168
|
+
attempt++;
|
|
169
|
+
if (attempt >= DEFAULT_RETRY_ATTEMPTS) throw err;
|
|
170
|
+
await new Promise(r => setTimeout(r, delay));
|
|
171
|
+
delay = Math.min(DEFAULT_MAX_RETRY_DELAY, delay * 2);
|
|
172
|
+
}
|
|
173
|
+
}
|
|
174
|
+
}
|
|
175
|
+
newEmbeds.forEach((vec, j) => {
|
|
176
|
+
const h = hashes[mapIndex[j]];
|
|
177
|
+
this._embeddingCache.set(h, vec);
|
|
178
|
+
});
|
|
179
|
+
}
|
|
180
|
+
}
|
|
181
|
+
|
|
182
|
+
async _processDirectory(filePath) {
|
|
183
|
+
const files = await fs.promises.readdir(filePath);
|
|
184
|
+
const summary = { processed: 0, succeeded: 0, failed: 0, errors: [] };
|
|
185
|
+
for (const file of files) {
|
|
186
|
+
const full = path.join(filePath, file);
|
|
187
|
+
if (this._isTemporaryFile(full)) continue;
|
|
188
|
+
summary.processed++;
|
|
189
|
+
try {
|
|
190
|
+
await this.ingestDocuments(full);
|
|
191
|
+
summary.succeeded++;
|
|
192
|
+
} catch (err) {
|
|
193
|
+
summary.failed++;
|
|
194
|
+
summary.errors.push({ file: full, message: err?.message || String(err) });
|
|
195
|
+
this.trigger('onError', err);
|
|
196
|
+
}
|
|
197
|
+
}
|
|
198
|
+
this.trigger('onIngestSummary', summary);
|
|
199
|
+
}
|
|
200
|
+
|
|
201
|
+
async _validateFile(filePath, stats) {
|
|
202
|
+
const absPath = path.resolve(filePath);
|
|
203
|
+
const size = stats.size || 0;
|
|
204
|
+
const mtime = Math.floor(stats.mtimeMs || Date.now());
|
|
205
|
+
const md5 = crypto.createHash('md5');
|
|
206
|
+
const sha = crypto.createHash('sha256');
|
|
207
|
+
await new Promise((resolve, reject) => {
|
|
208
|
+
const s = fs.createReadStream(filePath);
|
|
209
|
+
s.on('data', (chunk) => { md5.update(chunk); sha.update(chunk); });
|
|
210
|
+
s.on('error', reject);
|
|
211
|
+
s.on('end', resolve);
|
|
212
|
+
});
|
|
213
|
+
const fileMD5 = md5.digest('hex');
|
|
214
|
+
const fileSHA256 = sha.digest('hex');
|
|
215
|
+
return { absolutePath: absPath, fileMD5, fileSHA256, fileSize: size, lastModified: mtime, timestamp: Date.now() };
|
|
216
|
+
}
|
|
217
|
+
|
|
218
|
+
async _prepareDocuments(filePath, rawText, chunks, embeddings, hashes, validation) {
|
|
219
|
+
const metas = this.processor.computeChunkMetadata(filePath, rawText, chunks);
|
|
220
|
+
const idNamespace = uuidv5('vectra-js', uuidv5.DNS);
|
|
221
|
+
let documents = chunks.map((content, i) => ({
|
|
222
|
+
id: uuidv5(`${validation.fileSHA256}:${i}`, idNamespace),
|
|
223
|
+
content,
|
|
224
|
+
embedding: embeddings[i],
|
|
225
|
+
metadata: {
|
|
226
|
+
docId: uuidv5(`${validation.fileSHA256}:${i}`, idNamespace),
|
|
227
|
+
source: filePath,
|
|
228
|
+
absolutePath: validation.absolutePath,
|
|
229
|
+
fileMD5: validation.fileMD5,
|
|
230
|
+
fileSHA256: validation.fileSHA256,
|
|
231
|
+
fileSize: validation.fileSize,
|
|
232
|
+
lastModified: validation.lastModified,
|
|
233
|
+
chunkIndex: i,
|
|
234
|
+
sha256: hashes[i],
|
|
235
|
+
fileType: metas[i]?.fileType,
|
|
236
|
+
docTitle: metas[i]?.docTitle,
|
|
237
|
+
pageFrom: metas[i]?.pageFrom,
|
|
238
|
+
pageTo: metas[i]?.pageTo,
|
|
239
|
+
section: metas[i]?.section
|
|
240
|
+
}
|
|
241
|
+
}));
|
|
242
|
+
|
|
243
|
+
if (this._metadataEnrichmentEnabled) {
|
|
244
|
+
const extra = await this._enrichChunkMetadata(chunks);
|
|
245
|
+
documents = documents.map((d, i) => ({
|
|
246
|
+
...d,
|
|
247
|
+
metadata: {
|
|
248
|
+
...d.metadata,
|
|
249
|
+
summary: extra[i]?.summary,
|
|
250
|
+
keywords: extra[i]?.keywords,
|
|
251
|
+
hypothetical_questions: extra[i]?.hypothetical_questions,
|
|
252
|
+
}
|
|
253
|
+
}));
|
|
254
|
+
}
|
|
255
|
+
return documents;
|
|
256
|
+
}
|
|
257
|
+
|
|
258
|
+
async _storeDocuments(documents, mode, absPath) {
|
|
259
|
+
if (this.vectorStore && typeof this.vectorStore.ensureIndexes === 'function') {
|
|
260
|
+
try { await this.vectorStore.ensureIndexes(); } catch (_) {}
|
|
261
|
+
}
|
|
262
|
+
|
|
263
|
+
if (mode === 'replace' && this.vectorStore && typeof this.vectorStore.deleteDocuments === 'function') {
|
|
264
|
+
try {
|
|
265
|
+
await this.vectorStore.deleteDocuments({ filter: { absolutePath: absPath } });
|
|
266
|
+
} catch (_) {}
|
|
267
|
+
}
|
|
268
|
+
|
|
269
|
+
let attempt = 0; let delay = DEFAULT_INITIAL_RETRY_DELAY;
|
|
270
|
+
while (true) {
|
|
271
|
+
try {
|
|
272
|
+
if (mode === 'replace' && this.vectorStore && typeof this.vectorStore.upsertDocuments === 'function') {
|
|
273
|
+
await this.vectorStore.upsertDocuments(documents);
|
|
274
|
+
} else {
|
|
275
|
+
await this.vectorStore.addDocuments(documents);
|
|
276
|
+
}
|
|
277
|
+
break;
|
|
278
|
+
} catch (err) {
|
|
279
|
+
attempt++;
|
|
280
|
+
if (attempt >= DEFAULT_RETRY_ATTEMPTS) throw err;
|
|
281
|
+
await new Promise(r => setTimeout(r, delay));
|
|
282
|
+
delay = Math.min(DEFAULT_MAX_RETRY_DELAY, delay * 2);
|
|
283
|
+
}
|
|
284
|
+
}
|
|
285
|
+
}
|
|
286
|
+
|
|
138
287
|
async ingestDocuments(filePath) {
|
|
139
288
|
const traceId = uuidv4();
|
|
140
289
|
const rootSpanId = uuidv4();
|
|
@@ -146,58 +295,33 @@ class VectraClient {
|
|
|
146
295
|
const stats = await fs.promises.stat(filePath);
|
|
147
296
|
|
|
148
297
|
if (stats.isDirectory()) {
|
|
149
|
-
|
|
150
|
-
const summary = { processed: 0, succeeded: 0, failed: 0, errors: [] };
|
|
151
|
-
for (const file of files) {
|
|
152
|
-
const full = path.join(filePath, file);
|
|
153
|
-
if (this._isTemporaryFile(full)) continue;
|
|
154
|
-
summary.processed++;
|
|
155
|
-
try {
|
|
156
|
-
await this.ingestDocuments(full);
|
|
157
|
-
summary.succeeded++;
|
|
158
|
-
} catch (err) {
|
|
159
|
-
summary.failed++;
|
|
160
|
-
summary.errors.push({ file: full, message: err?.message || String(err) });
|
|
161
|
-
this.trigger('onError', err);
|
|
162
|
-
}
|
|
163
|
-
}
|
|
164
|
-
this.trigger('onIngestSummary', summary);
|
|
298
|
+
await this._processDirectory(filePath);
|
|
165
299
|
return;
|
|
166
300
|
}
|
|
167
301
|
|
|
168
302
|
const t0 = Date.now();
|
|
169
303
|
this.trigger('onIngestStart', filePath);
|
|
170
|
-
|
|
171
|
-
const
|
|
172
|
-
const mtime = Math.floor(stats.mtimeMs || Date.now());
|
|
173
|
-
const md5 = crypto.createHash('md5');
|
|
174
|
-
const sha = crypto.createHash('sha256');
|
|
175
|
-
await new Promise((resolve, reject) => {
|
|
176
|
-
const s = fs.createReadStream(filePath);
|
|
177
|
-
s.on('data', (chunk) => { md5.update(chunk); sha.update(chunk); });
|
|
178
|
-
s.on('error', reject);
|
|
179
|
-
s.on('end', resolve);
|
|
180
|
-
});
|
|
181
|
-
const fileMD5 = md5.digest('hex');
|
|
182
|
-
const fileSHA256 = sha.digest('hex');
|
|
183
|
-
const validation = { absolutePath: absPath, fileMD5, fileSHA256, fileSize: size, lastModified: mtime, timestamp: Date.now() };
|
|
304
|
+
|
|
305
|
+
const validation = await this._validateFile(filePath, stats);
|
|
184
306
|
this.trigger('onPreIngestionValidation', validation);
|
|
307
|
+
|
|
185
308
|
const mode = (this.config.ingestion && this.config.ingestion.mode) ? this.config.ingestion.mode : 'skip';
|
|
186
309
|
let exists = false;
|
|
187
310
|
if (this.vectorStore && typeof this.vectorStore.fileExists === 'function') {
|
|
188
|
-
try { exists = await this.vectorStore.fileExists(fileSHA256,
|
|
311
|
+
try { exists = await this.vectorStore.fileExists(validation.fileSHA256, validation.fileSize, validation.lastModified); } catch { exists = false; }
|
|
189
312
|
}
|
|
190
313
|
if (mode === 'skip' && exists) {
|
|
191
314
|
this.trigger('onIngestSkipped', validation);
|
|
192
315
|
return;
|
|
193
316
|
}
|
|
317
|
+
|
|
194
318
|
const rawText = await this.processor.loadDocument(filePath);
|
|
195
319
|
|
|
196
320
|
this.trigger('onChunkingStart', this.config.chunking.strategy);
|
|
197
321
|
const chunks = await this.processor.process(rawText);
|
|
198
322
|
|
|
199
323
|
this.trigger('onEmbeddingStart', chunks.length);
|
|
200
|
-
|
|
324
|
+
|
|
201
325
|
const hashes = chunks.map(c => crypto.createHash('sha256').update(c).digest('hex'));
|
|
202
326
|
const toEmbed = [];
|
|
203
327
|
const mapIndex = [];
|
|
@@ -206,104 +330,24 @@ class VectraClient {
|
|
|
206
330
|
toEmbed.push(chunks[i]);
|
|
207
331
|
mapIndex.push(i);
|
|
208
332
|
});
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
const defaultLimit = (this.config.ingestion && typeof this.config.ingestion.concurrencyLimit === 'number') ? this.config.ingestion.concurrencyLimit : 5;
|
|
213
|
-
const limit = enabled ? defaultLimit : toEmbed.length;
|
|
214
|
-
const batches = [];
|
|
215
|
-
for (let i = 0; i < toEmbed.length; i += limit) batches.push(toEmbed.slice(i, i + limit));
|
|
216
|
-
for (const batch of batches) {
|
|
217
|
-
let attempt = 0; let delay = 500;
|
|
218
|
-
while (true) {
|
|
219
|
-
try {
|
|
220
|
-
const out = await this.embedder.embedDocuments(batch);
|
|
221
|
-
newEmbeds.push(...out);
|
|
222
|
-
break;
|
|
223
|
-
} catch (err) {
|
|
224
|
-
attempt++;
|
|
225
|
-
if (attempt >= 3) throw err;
|
|
226
|
-
await new Promise(r => setTimeout(r, delay));
|
|
227
|
-
delay = Math.min(4000, delay * 2);
|
|
228
|
-
}
|
|
229
|
-
}
|
|
230
|
-
}
|
|
231
|
-
newEmbeds.forEach((vec, j) => {
|
|
232
|
-
const h = hashes[mapIndex[j]];
|
|
233
|
-
this._embeddingCache.set(h, vec);
|
|
234
|
-
});
|
|
235
|
-
}
|
|
333
|
+
|
|
334
|
+
await this._batchEmbedChunks(toEmbed, mapIndex, hashes);
|
|
335
|
+
|
|
236
336
|
const embeddings = hashes.map((h) => this._embeddingCache.get(h));
|
|
237
337
|
|
|
238
|
-
const
|
|
239
|
-
|
|
240
|
-
let documents = chunks.map((content, i) => ({
|
|
241
|
-
id: uuidv5(`${fileSHA256}:${i}`, idNamespace),
|
|
242
|
-
content,
|
|
243
|
-
embedding: embeddings[i],
|
|
244
|
-
metadata: {
|
|
245
|
-
docId: uuidv5(`${fileSHA256}:${i}`, idNamespace),
|
|
246
|
-
source: filePath,
|
|
247
|
-
absolutePath: absPath,
|
|
248
|
-
fileMD5,
|
|
249
|
-
fileSHA256,
|
|
250
|
-
fileSize: size,
|
|
251
|
-
lastModified: mtime,
|
|
252
|
-
chunkIndex: i,
|
|
253
|
-
sha256: hashes[i],
|
|
254
|
-
fileType: metas[i]?.fileType,
|
|
255
|
-
docTitle: metas[i]?.docTitle,
|
|
256
|
-
pageFrom: metas[i]?.pageFrom,
|
|
257
|
-
pageTo: metas[i]?.pageTo,
|
|
258
|
-
section: metas[i]?.section
|
|
259
|
-
}
|
|
260
|
-
}));
|
|
261
|
-
|
|
262
|
-
if (this._metadataEnrichmentEnabled) {
|
|
263
|
-
const extra = await this._enrichChunkMetadata(chunks);
|
|
264
|
-
documents = documents.map((d, i) => ({
|
|
265
|
-
...d,
|
|
266
|
-
metadata: {
|
|
267
|
-
...d.metadata,
|
|
268
|
-
summary: extra[i]?.summary,
|
|
269
|
-
keywords: extra[i]?.keywords,
|
|
270
|
-
hypothetical_questions: extra[i]?.hypothetical_questions,
|
|
271
|
-
}
|
|
272
|
-
}));
|
|
273
|
-
}
|
|
274
|
-
|
|
275
|
-
if (this.vectorStore && typeof this.vectorStore.ensureIndexes === 'function') {
|
|
276
|
-
try { await this.vectorStore.ensureIndexes(); } catch (_) {}
|
|
277
|
-
}
|
|
338
|
+
const documents = await this._prepareDocuments(filePath, rawText, chunks, embeddings, hashes, validation);
|
|
339
|
+
|
|
278
340
|
let existsServer = false;
|
|
279
341
|
if (this.vectorStore && typeof this.vectorStore.fileExists === 'function') {
|
|
280
|
-
try { existsServer = await this.vectorStore.fileExists(fileSHA256,
|
|
342
|
+
try { existsServer = await this.vectorStore.fileExists(validation.fileSHA256, validation.fileSize, validation.lastModified); } catch { existsServer = false; }
|
|
281
343
|
}
|
|
282
344
|
if (mode === 'skip' && existsServer) {
|
|
283
345
|
this.trigger('onIngestSkipped', validation);
|
|
284
346
|
return;
|
|
285
347
|
}
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
} catch (_) {}
|
|
290
|
-
}
|
|
291
|
-
let attempt = 0; let delay = 500;
|
|
292
|
-
while (true) {
|
|
293
|
-
try {
|
|
294
|
-
if (mode === 'replace' && this.vectorStore && typeof this.vectorStore.upsertDocuments === 'function') {
|
|
295
|
-
await this.vectorStore.upsertDocuments(documents);
|
|
296
|
-
} else {
|
|
297
|
-
await this.vectorStore.addDocuments(documents);
|
|
298
|
-
}
|
|
299
|
-
break;
|
|
300
|
-
} catch (err) {
|
|
301
|
-
attempt++;
|
|
302
|
-
if (attempt >= 3) throw err;
|
|
303
|
-
await new Promise(r => setTimeout(r, delay));
|
|
304
|
-
delay = Math.min(4000, delay * 2);
|
|
305
|
-
}
|
|
306
|
-
}
|
|
348
|
+
|
|
349
|
+
await this._storeDocuments(documents, mode, validation.absolutePath);
|
|
350
|
+
|
|
307
351
|
const durationMs = Date.now() - t0;
|
|
308
352
|
this.trigger('onIngestEnd', filePath, chunks.length, durationMs);
|
|
309
353
|
|
|
@@ -315,7 +359,7 @@ class VectraClient {
|
|
|
315
359
|
endTime: Date.now(),
|
|
316
360
|
input: { filePath },
|
|
317
361
|
output: { chunks: chunks.length, durationMs },
|
|
318
|
-
attributes: { fileSize:
|
|
362
|
+
attributes: { fileSize: validation.fileSize },
|
|
319
363
|
provider,
|
|
320
364
|
modelName
|
|
321
365
|
});
|
|
@@ -393,16 +437,16 @@ class VectraClient {
|
|
|
393
437
|
}
|
|
394
438
|
|
|
395
439
|
buildContextParts(docs, query) {
|
|
396
|
-
const budget = (this.config.queryPlanning && this.config.queryPlanning.tokenBudget) ? this.config.queryPlanning.tokenBudget :
|
|
397
|
-
const preferSumm = (this.config.queryPlanning && this.config.queryPlanning.preferSummariesBelow) ? this.config.queryPlanning.preferSummariesBelow :
|
|
440
|
+
const budget = (this.config.queryPlanning && this.config.queryPlanning.tokenBudget) ? this.config.queryPlanning.tokenBudget : DEFAULT_TOKEN_BUDGET;
|
|
441
|
+
const preferSumm = (this.config.queryPlanning && this.config.queryPlanning.preferSummariesBelow) ? this.config.queryPlanning.preferSummariesBelow : DEFAULT_PREFER_SUMMARY_BELOW;
|
|
398
442
|
const parts = [];
|
|
399
443
|
let used = 0;
|
|
400
444
|
for (const d of docs) {
|
|
401
445
|
const t = d.metadata?.docTitle || '';
|
|
402
446
|
const sec = d.metadata?.section || '';
|
|
403
447
|
const pages = (d.metadata?.pageFrom && d.metadata?.pageTo) ? `pages ${d.metadata.pageFrom}-${d.metadata.pageTo}` : '';
|
|
404
|
-
const sum = d.metadata?.summary ? d.metadata.summary : d.content.slice(0,
|
|
405
|
-
const chosen = (this.tokenEstimate(sum) <= preferSumm) ? sum : d.content.slice(0,
|
|
448
|
+
const sum = d.metadata?.summary ? d.metadata.summary : d.content.slice(0, DEFAULT_SUMMARY_LENGTH);
|
|
449
|
+
const chosen = (this.tokenEstimate(sum) <= preferSumm) ? sum : d.content.slice(0, DEFAULT_CHUNK_LENGTH);
|
|
406
450
|
const part = `${t} ${sec} ${pages}\n${chosen}`;
|
|
407
451
|
const est = this.tokenEstimate(part);
|
|
408
452
|
if (used + est > budget) break;
|
package/src/observability.js
CHANGED
|
@@ -17,26 +17,20 @@ class SQLiteLogger {
|
|
|
17
17
|
const dbPath = path.isAbsolute(rawPath) ? rawPath : path.resolve(process.cwd(), rawPath);
|
|
18
18
|
// Ensure directory exists
|
|
19
19
|
const dbDir = path.dirname(dbPath);
|
|
20
|
-
console.log(`[SQLiteLogger] dbPath: ${dbPath}, dbDir: ${dbDir}`);
|
|
21
20
|
|
|
22
21
|
const fs = require('fs');
|
|
23
22
|
if (!fs.existsSync(dbDir)) {
|
|
24
|
-
console.log(`[SQLiteLogger] Creating directory: ${dbDir}`);
|
|
25
23
|
fs.mkdirSync(dbDir, { recursive: true });
|
|
26
|
-
} else {
|
|
27
|
-
console.log(`[SQLiteLogger] Directory exists: ${dbDir}`);
|
|
28
24
|
}
|
|
29
25
|
|
|
30
26
|
const sqlite3 = require('sqlite3').verbose();
|
|
31
27
|
this.db = new sqlite3.Database(dbPath, (err) => {
|
|
32
28
|
if (err) {
|
|
33
|
-
console.error('Failed to connect to SQLite database:', err);
|
|
34
29
|
throw err;
|
|
35
30
|
}
|
|
36
31
|
});
|
|
37
32
|
this.initializeSchema();
|
|
38
33
|
} catch (error) {
|
|
39
|
-
console.error('Failed to initialize SQLite logger:', error);
|
|
40
34
|
throw error;
|
|
41
35
|
}
|
|
42
36
|
}
|
package/src/processor.js
CHANGED
|
@@ -16,8 +16,38 @@ class DocumentProcessor {
|
|
|
16
16
|
const ext = path.extname(filePath).toLowerCase();
|
|
17
17
|
const buffer = await fs.promises.readFile(filePath);
|
|
18
18
|
if (ext === '.pdf') {
|
|
19
|
+
let PDFParse = pdf.PDFParse;
|
|
20
|
+
if (!PDFParse && pdf.default && pdf.default.PDFParse) {
|
|
21
|
+
PDFParse = pdf.default.PDFParse;
|
|
22
|
+
}
|
|
23
|
+
|
|
24
|
+
if (PDFParse) {
|
|
25
|
+
// Handle pdf-parse v2
|
|
26
|
+
const parser = new PDFParse({ data: buffer });
|
|
27
|
+
const info = await parser.getInfo();
|
|
28
|
+
const total = info.total;
|
|
29
|
+
const pages = [];
|
|
30
|
+
let fullText = '';
|
|
31
|
+
|
|
32
|
+
for (let i = 1; i <= total; i++) {
|
|
33
|
+
const pageRes = await parser.getText({ partial: [i] });
|
|
34
|
+
const pageText = pageRes.text || '';
|
|
35
|
+
pages.push(pageText);
|
|
36
|
+
fullText += pageText + '\n';
|
|
37
|
+
}
|
|
38
|
+
await parser.destroy();
|
|
39
|
+
this._lastPages = pages;
|
|
40
|
+
return fullText;
|
|
41
|
+
}
|
|
42
|
+
|
|
43
|
+
// Fallback for v1 (or if PDFParse class not found)
|
|
44
|
+
let pdfFunc = pdf;
|
|
45
|
+
if (typeof pdfFunc !== 'function' && pdfFunc.default) {
|
|
46
|
+
pdfFunc = pdfFunc.default;
|
|
47
|
+
}
|
|
48
|
+
|
|
19
49
|
const pages = [];
|
|
20
|
-
const res = await
|
|
50
|
+
const res = await pdfFunc(buffer, {
|
|
21
51
|
pagerender: pageData => pageData.getTextContent().then(tc => {
|
|
22
52
|
const s = tc.items.map(it => it.str).join(' ');
|
|
23
53
|
pages.push(s);
|
|
@@ -95,7 +125,7 @@ class DocumentProcessor {
|
|
|
95
125
|
} else {
|
|
96
126
|
finalChunks.push(window);
|
|
97
127
|
}
|
|
98
|
-
} catch (
|
|
128
|
+
} catch (_) {
|
|
99
129
|
// Fallback to window if parsing fails
|
|
100
130
|
finalChunks.push(window);
|
|
101
131
|
}
|
package/src/webconfig_server.js
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
const http = require('http');
|
|
2
2
|
const fs = require('fs');
|
|
3
3
|
const path = require('path');
|
|
4
|
-
const {
|
|
4
|
+
const { ProviderType, ChunkingStrategy, RetrievalStrategy } = require('./config');
|
|
5
5
|
const sqlite3 = require('sqlite3').verbose();
|
|
6
6
|
|
|
7
7
|
|