@robthepcguy/rag-vault 1.7.0 → 1.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +28 -9
- package/dist/chunker/semantic-chunker.d.ts +0 -1
- package/dist/chunker/semantic-chunker.d.ts.map +1 -1
- package/dist/chunker/semantic-chunker.js +1 -1
- package/dist/chunker/semantic-chunker.js.map +1 -1
- package/dist/embedder/index.d.ts +5 -0
- package/dist/embedder/index.d.ts.map +1 -1
- package/dist/embedder/index.js +40 -5
- package/dist/embedder/index.js.map +1 -1
- package/dist/errors/index.d.ts +1 -1
- package/dist/errors/index.d.ts.map +1 -1
- package/dist/flywheel/feedback.d.ts +1 -1
- package/dist/flywheel/feedback.d.ts.map +1 -1
- package/dist/flywheel/feedback.js +1 -1
- package/dist/flywheel/feedback.js.map +1 -1
- package/dist/parser/pdf-filter.d.ts +3 -5
- package/dist/parser/pdf-filter.d.ts.map +1 -1
- package/dist/parser/pdf-filter.js +1 -1
- package/dist/parser/pdf-filter.js.map +1 -1
- package/dist/query/parser.d.ts +2 -6
- package/dist/query/parser.d.ts.map +1 -1
- package/dist/query/parser.js +14 -22
- package/dist/query/parser.js.map +1 -1
- package/dist/server/index.d.ts.map +1 -1
- package/dist/server/index.js +37 -34
- package/dist/server/index.js.map +1 -1
- package/dist/server/raw-data-utils.d.ts +0 -40
- package/dist/server/raw-data-utils.d.ts.map +1 -1
- package/dist/server/raw-data-utils.js +9 -8
- package/dist/server/raw-data-utils.js.map +1 -1
- package/dist/server/remote-transport.d.ts +2 -1
- package/dist/server/remote-transport.d.ts.map +1 -1
- package/dist/server/remote-transport.js +27 -7
- package/dist/server/remote-transport.js.map +1 -1
- package/dist/server/schemas.d.ts +5 -29
- package/dist/server/schemas.d.ts.map +1 -1
- package/dist/server/schemas.js +6 -6
- package/dist/server/schemas.js.map +1 -1
- package/dist/vectordb/index.d.ts +12 -9
- package/dist/vectordb/index.d.ts.map +1 -1
- package/dist/vectordb/index.js +242 -163
- package/dist/vectordb/index.js.map +1 -1
- package/dist/web/api-routes.d.ts.map +1 -1
- package/dist/web/api-routes.js +23 -10
- package/dist/web/api-routes.js.map +1 -1
- package/dist/web/database-manager.d.ts.map +1 -1
- package/dist/web/database-manager.js +32 -25
- package/dist/web/database-manager.js.map +1 -1
- package/dist/web/http-server.d.ts +0 -5
- package/dist/web/http-server.d.ts.map +1 -1
- package/dist/web/http-server.js +4 -8
- package/dist/web/http-server.js.map +1 -1
- package/dist/web/index.js +4 -2
- package/dist/web/index.js.map +1 -1
- package/dist/web/middleware/async-handler.d.ts +2 -1
- package/dist/web/middleware/async-handler.d.ts.map +1 -1
- package/dist/web/middleware/rate-limit.d.ts +2 -1
- package/dist/web/middleware/rate-limit.d.ts.map +1 -1
- package/dist/web/middleware/request-logger.d.ts +1 -1
- package/dist/web/middleware/request-logger.d.ts.map +1 -1
- package/package.json +1 -1
- package/web-ui/dist/assets/{CollectionsPage-BDmEfv3V.js → CollectionsPage-CjLs8_5j.js} +1 -1
- package/web-ui/dist/assets/{FilesPage-pG9HmpgQ.js → FilesPage-Bw9x9aMr.js} +1 -1
- package/web-ui/dist/assets/ReaderPage-JPNiOF-x.js +28 -0
- package/web-ui/dist/assets/{ReaderSettingsContext-CkSjqsRh.js → ReaderSettingsContext-BLFJnEne.js} +1 -1
- package/web-ui/dist/assets/{SearchPage-DAltjnLL.js → SearchPage-D3_Vtbdw.js} +1 -1
- package/web-ui/dist/assets/{SettingsPage-C6J5BITP.js → SettingsPage-BAxB2264.js} +1 -1
- package/web-ui/dist/assets/{StatusPage-powRGmW3.js → StatusPage-CzJZW8Gs.js} +1 -1
- package/web-ui/dist/assets/{UploadPage-eyfSjL4u.js → UploadPage-DW8OujeJ.js} +5 -5
- package/web-ui/dist/assets/index-ANt8Xo4z.js +6 -0
- package/web-ui/dist/assets/index-DovQIIL4.css +1 -0
- package/web-ui/dist/assets/motion-Brxs0UET.js +9 -0
- package/web-ui/dist/assets/vendor-DSXQOR6A.js +10 -0
- package/web-ui/dist/index.html +3 -3
- package/web-ui/dist/assets/ReaderPage-CwMN03NU.js +0 -28
- package/web-ui/dist/assets/index-BpwaiuGL.css +0 -1
- package/web-ui/dist/assets/index-D068MV_o.js +0 -6
- package/web-ui/dist/assets/motion-CKwJwI3J.js +0 -9
- package/web-ui/dist/assets/vendor-C2QPsZ3S.js +0 -10
package/dist/vectordb/index.d.ts
CHANGED
|
@@ -10,12 +10,6 @@ export { DatabaseError } from '../errors/index.js';
|
|
|
10
10
|
* @returns true if path is safe for use in queries
|
|
11
11
|
*/
|
|
12
12
|
export declare function isValidFilePath(filePath: string): boolean;
|
|
13
|
-
/**
|
|
14
|
-
* Generate a content-based fingerprint for a chunk.
|
|
15
|
-
* Uses SHA-256 hash of normalized text (first 16 hex chars for compactness).
|
|
16
|
-
* This enables stable chunk identification across re-indexing.
|
|
17
|
-
*/
|
|
18
|
-
export declare function generateChunkFingerprint(text: string): string;
|
|
19
13
|
/**
|
|
20
14
|
* Grouping mode for quality filtering
|
|
21
15
|
* - 'similar': Only return the most similar group (stops at first distance jump)
|
|
@@ -25,7 +19,7 @@ export type GroupingMode = 'similar' | 'related';
|
|
|
25
19
|
/**
|
|
26
20
|
* VectorStore configuration
|
|
27
21
|
*/
|
|
28
|
-
|
|
22
|
+
interface VectorStoreConfig {
|
|
29
23
|
/** LanceDB database path */
|
|
30
24
|
dbPath: string;
|
|
31
25
|
/** Table name */
|
|
@@ -44,7 +38,7 @@ export interface VectorStoreConfig {
|
|
|
44
38
|
/**
|
|
45
39
|
* Document metadata
|
|
46
40
|
*/
|
|
47
|
-
|
|
41
|
+
interface DocumentMetadata {
|
|
48
42
|
/** File name */
|
|
49
43
|
fileName: string;
|
|
50
44
|
/** File size in bytes */
|
|
@@ -78,7 +72,7 @@ export interface VectorChunk {
|
|
|
78
72
|
/**
|
|
79
73
|
* Search result
|
|
80
74
|
*/
|
|
81
|
-
|
|
75
|
+
interface SearchResult {
|
|
82
76
|
/** File path */
|
|
83
77
|
filePath: string;
|
|
84
78
|
/** Chunk index */
|
|
@@ -183,6 +177,15 @@ export declare class VectorStore {
|
|
|
183
177
|
* @param filePath - File path (absolute)
|
|
184
178
|
*/
|
|
185
179
|
deleteChunks(filePath: string): Promise<void>;
|
|
180
|
+
/**
|
|
181
|
+
* Delete chunks for a file, excluding a set of IDs.
|
|
182
|
+
* Used by insert-then-delete re-ingestion to remove old vectors
|
|
183
|
+
* while keeping newly inserted ones.
|
|
184
|
+
*
|
|
185
|
+
* @param filePath - File path whose old chunks should be removed
|
|
186
|
+
* @param excludeIds - Set of chunk IDs to keep (the new batch)
|
|
187
|
+
*/
|
|
188
|
+
deleteChunksExcluding(filePath: string, excludeIds: Set<string>): Promise<void>;
|
|
186
189
|
/**
|
|
187
190
|
* Batch insert vector chunks
|
|
188
191
|
*
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/vectordb/index.ts"],"names":[],"mappings":"AAOA,OAAO,EAAE,aAAa,EAAE,MAAM,oBAAoB,CAAA;
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/vectordb/index.ts"],"names":[],"mappings":"AAOA,OAAO,EAAE,aAAa,EAAE,MAAM,oBAAoB,CAAA;AA0HlD;;;;GAIG;AACH;;;;GAIG;AACH,wBAAgB,eAAe,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAOzD;AAgCD;;;;GAIG;AACH,MAAM,MAAM,YAAY,GAAG,SAAS,GAAG,SAAS,CAAA;AAEhD;;GAEG;AACH,UAAU,iBAAiB;IACzB,4BAA4B;IAC5B,MAAM,EAAE,MAAM,CAAA;IACd,iBAAiB;IACjB,SAAS,EAAE,MAAM,CAAA;IACjB,kEAAkE;IAClE,WAAW,CAAC,EAAE,MAAM,CAAA;IACpB,qDAAqD;IACrD,QAAQ,CAAC,EAAE,YAAY,CAAA;IACvB;;;;OAIG;IACH,YAAY,CAAC,EAAE,MAAM,CAAA;CACtB;AAED;;GAEG;AACH,UAAU,gBAAgB;IACxB,gBAAgB;IAChB,QAAQ,EAAE,MAAM,CAAA;IAChB,yBAAyB;IACzB,QAAQ,EAAE,MAAM,CAAA;IAChB,4BAA4B;IAC5B,QAAQ,EAAE,MAAM,CAAA;IAChB,mEAAmE;IACnE,MAAM,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAA;CAChC;AAED;;GAEG;AACH,MAAM,WAAW,WAAW;IAC1B,sBAAsB;IACtB,EAAE,EAAE,MAAM,CAAA;IACV,2BAA2B;IAC3B,QAAQ,EAAE,MAAM,CAAA;IAChB,+BAA+B;IAC/B,UAAU,EAAE,MAAM,CAAA;IAClB,iBAAiB;IACjB,IAAI,EAAE,MAAM,CAAA;IACZ,oDAAoD;IACpD,MAAM,EAAE,MAAM,EAAE,CAAA;IAChB,eAAe;IACf,QAAQ,EAAE,gBAAgB,CAAA;IAC1B,4CAA4C;IAC5C,SAAS,EAAE,MAAM,CAAA;IACjB,uEAAuE;IACvE,WAAW,CAAC,EAAE,MAAM,CAAA;CACrB;AAED;;GAEG;AACH,UAAU,YAAY;IACpB,gBAAgB;IAChB,QAAQ,EAAE,MAAM,CAAA;IAChB,kBAAkB;IAClB,UAAU,EAAE,MAAM,CAAA;IAClB,iBAAiB;IACjB,IAAI,EAAE,MAAM,CAAA;IACZ;;;;OAIG;IACH,KAAK,EAAE,MAAM,CAAA;IACb,eAAe;IACf,QAAQ,EAAE,gBAAgB,CAAA;IAC1B,sDAAsD;IACtD,WAAW,CAAC,EAAE,MAAM,CAAA;CACrB;AAuGD;;;;;;;;;;;;GAYG;AACH,qBAAa,WAAW;IACtB,OAAO,CAAC,EAAE,CAA0B;IACpC,OAAO,CAAC,KAAK,CAAqB;IAClC,OAAO,CAAC,QAAQ,CAAC,MAAM,CAAmB;IAC1C,OAAO,CAAC,UAAU,CAAQ;IAC1B,OAAO,CAAC,eAAe,CAAI;IAC3B,OAAO,CAAC,cAAc,CAAsB;IAC5C,2DAA2D;IAC3D,OAAO,CAAC,kBAAkB,CAAgC;IAC1D,qEAAqE;IACrE,OAAO,CAAC,oBAAoB,CAAsB;IAClD,6EAA6E;IAC7E,OAAO,CAAC,oBAAoB,CAA6B;gBAE7C,MAAM,EAAE,iBAAiB;IAIrC;;OAEG;IACH,eAAe,IAAI,MAAM;IAIzB;;;OAGG;IACH,eAAe,CAAC,MAAM,EAAE,MAAM,GAAG,IAAI;IAOrC;;;;;OAKG;IACH,OAAO,CAAC,gBAAgB;IAqCxB;;;;OAIG;IACH,OAAO,CAAC,gBAAgB;IAcxB;;OAEG;IACH,OAAO,CAAC,gBAAgB;IAQxB;;;;;;;OAOG;IACH,OAAO,CAAC,qCAAqC;IAU7C;;;;;;OAMG;IACH,OAAO,CAAC,8BAA8B;IAoDtC;;;;;OAKG;YACW,2BAA2B;IA+CzC;;OAEG;IACG,UAAU,IAAI,OAAO,CAAC,IAAI,CAAC;IA6CjC;;;;OAIG;IACG,YAAY,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC;IAkDnD;;;;;;;OAOG;IACG,qBAAqB,CAAC,QAAQ,EAAE,MAAM,EAAE,UAAU,EAAE,GAAG,CAAC,MAAM,CAAC,GAAG,OAAO,CAAC,IAAI,CAAC;IA8CrF;;;;OAIG;IACG,YAAY,CAAC,MAAM,EAAE,WAAW,EAAE,GAAG,OAAO,CAAC,IAAI,CAAC;IA4DxD;;;;OAIG;YACW,cAAc;IAyC5B;;;;OAIG;YACW,eAAe;IAY7B;;;;;;;;;;OAUG;IACH,OAAO,CAAC,aAAa;IA2CrB;;;;;;;;;;;;OAYG;IACG,MAAM,CAAC,WAAW,EAAE,MAAM,EAAE,EAAE,SAAS,CAAC,EAAE,MAAM,EAAE,KAAK,SAAK,GAAG,OAAO,CAAC,YAAY,EAAE,CAAC;IA6E5F;;;;;;;;;;;;OAYG;IACH,OAAO,CAAC,iBAAiB;IA0CzB;;;;;;;OAOG;IACG,SAAS,CAAC,OAAO,CAAC,EAAE;QACxB,KAAK,CAAC,EAAE,MAAM,CAAA;QACd,MAAM,CAAC,EAAE,MAAM,CAAA;KAChB,GAAG,OAAO,CAAC;QAAE,QAAQ,EAAE,MAAM,CAAC;QAAC,UAAU,EAAE,MAAM,CAAC;QAAC,SAAS,EAAE,MAAM,CAAA;KAAE,EAAE,CAAC;IA8D1E;;;;;;;;;OASG;IACG,KAAK,IAAI,OAAO,CAAC,IAAI,CAAC;IAuC5B;;;;;OAKG;IACG,iBAAiB,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,YAAY,EAAE,CAAC;IAsClE;;;;;;;;OAQG;IACG,iBAAiB,CACrB,QAAQ,EAAE,MAAM,EAChB,UAAU,EAAE,MAAM,EAClB,KAAK,SAAI,EACT,mBAAmB,UAAO,GACzB,OAAO,CAAC,YAAY,EAAE,CAAC;IAgF1B;;;;OAIG;IACG,SAAS,IAAI,OAAO,CAAC;QACzB,aAAa,EAAE,MAAM,CAAA;QACrB,UAAU,EAAE,MAAM,CAAA;QAClB,WAAW,EAAE,MAAM,CAAA;QACnB,MAAM,EAAE,MAAM,CAAA;QACd,eAAe,EAAE,OAAO,CAAA;QACxB,UAAU,EAAE,QAAQ,GAAG,aAAa,CAAA;KACrC,CAAC;CA2CH"}
|
package/dist/vectordb/index.js
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
// VectorStore implementation with LanceDB integration
|
|
2
2
|
import { createHash } from 'node:crypto';
|
|
3
|
-
import {
|
|
3
|
+
import { connect, Index } from '@lancedb/lancedb';
|
|
4
4
|
import { DatabaseError } from '../errors/index.js';
|
|
5
5
|
// Re-export error class for backwards compatibility
|
|
6
6
|
export { DatabaseError } from '../errors/index.js';
|
|
@@ -15,7 +15,9 @@ function parseEnvNumber(envVar, defaultValue) {
|
|
|
15
15
|
if (!value)
|
|
16
16
|
return defaultValue;
|
|
17
17
|
const parsed = Number.parseFloat(value);
|
|
18
|
-
|
|
18
|
+
if (Number.isNaN(parsed) || !Number.isFinite(parsed))
|
|
19
|
+
return defaultValue;
|
|
20
|
+
return parsed;
|
|
19
21
|
}
|
|
20
22
|
/**
|
|
21
23
|
* Parse an integer environment variable with fallback
|
|
@@ -25,7 +27,9 @@ function parseEnvInt(envVar, defaultValue) {
|
|
|
25
27
|
if (!value)
|
|
26
28
|
return defaultValue;
|
|
27
29
|
const parsed = Number.parseInt(value, 10);
|
|
28
|
-
|
|
30
|
+
if (Number.isNaN(parsed) || !Number.isFinite(parsed))
|
|
31
|
+
return defaultValue;
|
|
32
|
+
return parsed;
|
|
29
33
|
}
|
|
30
34
|
/**
|
|
31
35
|
* Standard deviation multiplier for detecting group boundaries.
|
|
@@ -81,6 +85,27 @@ const CUSTOM_METADATA_ALL_FIELDS = '__all__';
|
|
|
81
85
|
* Rejects paths with SQL injection attempts or path traversal.
|
|
82
86
|
*/
|
|
83
87
|
const SAFE_PATH_REGEX = /^[a-zA-Z0-9\\/_.:\- ]+$/;
|
|
88
|
+
/**
|
|
89
|
+
* Retry a read-only async operation with exponential backoff.
|
|
90
|
+
* Used for transient disk/IO errors on VectorStore reads.
|
|
91
|
+
*/
|
|
92
|
+
async function withRetry(fn, label, maxAttempts = 3, baseDelayMs = 100) {
|
|
93
|
+
let lastError;
|
|
94
|
+
for (let attempt = 1; attempt <= maxAttempts; attempt++) {
|
|
95
|
+
try {
|
|
96
|
+
return await fn();
|
|
97
|
+
}
|
|
98
|
+
catch (error) {
|
|
99
|
+
lastError = error;
|
|
100
|
+
if (attempt < maxAttempts) {
|
|
101
|
+
const delayMs = baseDelayMs * 2 ** (attempt - 1);
|
|
102
|
+
console.warn(`${label}: attempt ${attempt}/${maxAttempts} failed (${lastError.message}), retrying in ${delayMs}ms...`);
|
|
103
|
+
await new Promise((resolve) => setTimeout(resolve, delayMs));
|
|
104
|
+
}
|
|
105
|
+
}
|
|
106
|
+
}
|
|
107
|
+
throw lastError;
|
|
108
|
+
}
|
|
84
109
|
/**
|
|
85
110
|
* Validate file path to prevent SQL injection and path traversal attacks.
|
|
86
111
|
* @param filePath - The file path to validate
|
|
@@ -121,7 +146,7 @@ function normalizeTextForFingerprint(text) {
|
|
|
121
146
|
* Uses SHA-256 hash of normalized text (first 16 hex chars for compactness).
|
|
122
147
|
* This enables stable chunk identification across re-indexing.
|
|
123
148
|
*/
|
|
124
|
-
|
|
149
|
+
function generateChunkFingerprint(text) {
|
|
125
150
|
const normalized = normalizeTextForFingerprint(text);
|
|
126
151
|
const hash = createHash('sha256').update(normalized, 'utf8').digest('hex');
|
|
127
152
|
// Use first 16 characters (64 bits) - sufficient for practical uniqueness
|
|
@@ -507,6 +532,48 @@ export class VectorStore {
|
|
|
507
532
|
// Ignorable errors (no matching records) are logged but not thrown
|
|
508
533
|
}
|
|
509
534
|
}
|
|
535
|
+
/**
|
|
536
|
+
* Delete chunks for a file, excluding a set of IDs.
|
|
537
|
+
* Used by insert-then-delete re-ingestion to remove old vectors
|
|
538
|
+
* while keeping newly inserted ones.
|
|
539
|
+
*
|
|
540
|
+
* @param filePath - File path whose old chunks should be removed
|
|
541
|
+
* @param excludeIds - Set of chunk IDs to keep (the new batch)
|
|
542
|
+
*/
|
|
543
|
+
async deleteChunksExcluding(filePath, excludeIds) {
|
|
544
|
+
if (!this.table || excludeIds.size === 0) {
|
|
545
|
+
return;
|
|
546
|
+
}
|
|
547
|
+
if (!isValidFilePath(filePath)) {
|
|
548
|
+
throw new DatabaseError('Invalid file path: contains disallowed characters or patterns');
|
|
549
|
+
}
|
|
550
|
+
const escapedFilePath = filePath.replace(/'/g, "''");
|
|
551
|
+
try {
|
|
552
|
+
// Query existing chunks for this file to find old IDs
|
|
553
|
+
const existing = await this.table
|
|
554
|
+
.query()
|
|
555
|
+
.where(`\`filePath\` = '${escapedFilePath}'`)
|
|
556
|
+
.select(['id'])
|
|
557
|
+
.toArray();
|
|
558
|
+
const oldIds = existing.map((row) => row.id).filter((id) => !excludeIds.has(id));
|
|
559
|
+
if (oldIds.length === 0) {
|
|
560
|
+
return;
|
|
561
|
+
}
|
|
562
|
+
// Delete old chunks by ID
|
|
563
|
+
const idList = oldIds.map((id) => `'${id.replace(/'/g, "''")}'`).join(', ');
|
|
564
|
+
await this.table.delete(`\`id\` IN (${idList})`);
|
|
565
|
+
console.error(`VectorStore: Removed ${oldIds.length} old chunks for "${filePath}"`);
|
|
566
|
+
await this.rebuildFtsIndex();
|
|
567
|
+
}
|
|
568
|
+
catch (error) {
|
|
569
|
+
// Non-fatal: temporary duplicates are acceptable
|
|
570
|
+
const errorMessage = error.message.toLowerCase();
|
|
571
|
+
const isIgnorable = DELETE_IGNORABLE_PATTERNS.some((pattern) => errorMessage.includes(pattern));
|
|
572
|
+
if (!isIgnorable) {
|
|
573
|
+
throw new DatabaseError(`Failed to clean up old chunks for file: ${filePath}`, error);
|
|
574
|
+
}
|
|
575
|
+
}
|
|
576
|
+
}
|
|
510
577
|
/**
|
|
511
578
|
* Batch insert vector chunks
|
|
512
579
|
*
|
|
@@ -688,55 +755,61 @@ export class VectorStore {
|
|
|
688
755
|
if (limit < 1 || limit > 20) {
|
|
689
756
|
throw new DatabaseError(`Invalid limit: expected 1-20, got ${limit}`);
|
|
690
757
|
}
|
|
691
|
-
|
|
692
|
-
|
|
693
|
-
|
|
694
|
-
|
|
695
|
-
|
|
696
|
-
|
|
697
|
-
|
|
698
|
-
|
|
699
|
-
|
|
700
|
-
|
|
701
|
-
// Convert to SearchResult format with type validation
|
|
702
|
-
let results = vectorResults.map((result) => toSearchResult(result));
|
|
703
|
-
// Step 2: Apply grouping filter on vector distances (before keyword boost)
|
|
704
|
-
// Grouping is meaningful only on semantic distances, not after keyword boost
|
|
705
|
-
if (this.config.grouping && results.length > 1) {
|
|
706
|
-
results = this.applyGrouping(results, this.config.grouping);
|
|
707
|
-
}
|
|
708
|
-
// Step 3: Apply keyword boost if enabled (with circuit breaker)
|
|
709
|
-
const hybridWeight = this.getHybridWeight();
|
|
710
|
-
if (this.shouldAttemptFts() && queryText && queryText.trim().length > 0 && hybridWeight > 0) {
|
|
711
|
-
try {
|
|
712
|
-
// Get unique filePaths from vector results to filter FTS search
|
|
713
|
-
const uniqueFilePaths = [...new Set(results.map((r) => r.filePath))];
|
|
714
|
-
// Build WHERE clause with IN for targeted FTS search
|
|
715
|
-
// Use backticks for column name (required for camelCase in LanceDB)
|
|
716
|
-
const escapedPaths = uniqueFilePaths.map((p) => `'${p.replace(/'/g, "''")}'`);
|
|
717
|
-
const whereClause = `\`filePath\` IN (${escapedPaths.join(', ')})`;
|
|
718
|
-
const ftsResults = await this.table
|
|
719
|
-
.search(queryText, 'fts', 'text')
|
|
720
|
-
.where(whereClause)
|
|
721
|
-
.select(['filePath', 'chunkIndex', 'text', 'metadata', '_score'])
|
|
722
|
-
.limit(results.length * 2) // Enough to cover all vector results
|
|
723
|
-
.toArray();
|
|
724
|
-
results = this.applyKeywordBoost(results, ftsResults, hybridWeight);
|
|
725
|
-
// FTS succeeded - reset circuit breaker
|
|
726
|
-
this.recordFtsSuccess();
|
|
758
|
+
const table = this.table;
|
|
759
|
+
return withRetry(async () => {
|
|
760
|
+
try {
|
|
761
|
+
// Step 1: Semantic (vector) search - always the primary search
|
|
762
|
+
const candidateLimit = limit * HYBRID_SEARCH_CANDIDATE_MULTIPLIER;
|
|
763
|
+
// Assumes normalized embeddings so dot behaves like cosine distance (lower is better, [0,2]).
|
|
764
|
+
let query = table.vectorSearch(queryVector).distanceType('dot').limit(candidateLimit);
|
|
765
|
+
// Apply distance threshold at query level
|
|
766
|
+
if (this.config.maxDistance !== undefined) {
|
|
767
|
+
query = query.distanceRange(undefined, this.config.maxDistance);
|
|
727
768
|
}
|
|
728
|
-
|
|
729
|
-
|
|
730
|
-
|
|
731
|
-
|
|
769
|
+
const vectorResults = await query.toArray();
|
|
770
|
+
// Convert to SearchResult format with type validation
|
|
771
|
+
let results = vectorResults.map((result) => toSearchResult(result));
|
|
772
|
+
// Step 2: Apply grouping filter on vector distances (before keyword boost)
|
|
773
|
+
// Grouping is meaningful only on semantic distances, not after keyword boost
|
|
774
|
+
if (this.config.grouping && results.length > 1) {
|
|
775
|
+
results = this.applyGrouping(results, this.config.grouping);
|
|
732
776
|
}
|
|
777
|
+
// Step 3: Apply keyword boost if enabled (with circuit breaker)
|
|
778
|
+
const hybridWeight = this.getHybridWeight();
|
|
779
|
+
if (this.shouldAttemptFts() &&
|
|
780
|
+
queryText &&
|
|
781
|
+
queryText.trim().length > 0 &&
|
|
782
|
+
hybridWeight > 0) {
|
|
783
|
+
try {
|
|
784
|
+
// Get unique filePaths from vector results to filter FTS search
|
|
785
|
+
const uniqueFilePaths = [...new Set(results.map((r) => r.filePath))];
|
|
786
|
+
// Build WHERE clause with IN for targeted FTS search
|
|
787
|
+
// Use backticks for column name (required for camelCase in LanceDB)
|
|
788
|
+
const escapedPaths = uniqueFilePaths.map((p) => `'${p.replace(/'/g, "''")}'`);
|
|
789
|
+
const whereClause = `\`filePath\` IN (${escapedPaths.join(', ')})`;
|
|
790
|
+
const ftsResults = await table
|
|
791
|
+
.search(queryText, 'fts', 'text')
|
|
792
|
+
.where(whereClause)
|
|
793
|
+
.select(['filePath', 'chunkIndex', 'text', 'metadata', '_score'])
|
|
794
|
+
.limit(results.length * 2) // Enough to cover all vector results
|
|
795
|
+
.toArray();
|
|
796
|
+
results = this.applyKeywordBoost(results, ftsResults, hybridWeight);
|
|
797
|
+
// FTS succeeded - reset circuit breaker
|
|
798
|
+
this.recordFtsSuccess();
|
|
799
|
+
}
|
|
800
|
+
catch (ftsError) {
|
|
801
|
+
// Record failure for circuit breaker (will auto-recover after cooldown)
|
|
802
|
+
this.recordFtsFailure(ftsError);
|
|
803
|
+
// Continue with vector-only results
|
|
804
|
+
}
|
|
805
|
+
}
|
|
806
|
+
// Return top results after all filtering and boosting
|
|
807
|
+
return results.slice(0, limit);
|
|
733
808
|
}
|
|
734
|
-
|
|
735
|
-
|
|
736
|
-
|
|
737
|
-
|
|
738
|
-
throw new DatabaseError('Failed to search vectors', error);
|
|
739
|
-
}
|
|
809
|
+
catch (error) {
|
|
810
|
+
throw new DatabaseError('Failed to search vectors', error);
|
|
811
|
+
}
|
|
812
|
+
}, 'VectorStore.search');
|
|
740
813
|
}
|
|
741
814
|
/**
|
|
742
815
|
* Apply keyword boost to rerank vector search results
|
|
@@ -798,52 +871,55 @@ export class VectorStore {
|
|
|
798
871
|
if (!this.table) {
|
|
799
872
|
return []; // Return empty array if table doesn't exist
|
|
800
873
|
}
|
|
801
|
-
|
|
802
|
-
|
|
803
|
-
|
|
804
|
-
|
|
805
|
-
|
|
806
|
-
|
|
807
|
-
|
|
808
|
-
|
|
809
|
-
const
|
|
810
|
-
const
|
|
811
|
-
|
|
812
|
-
const
|
|
813
|
-
if (
|
|
814
|
-
fileInfo
|
|
815
|
-
|
|
816
|
-
|
|
817
|
-
|
|
874
|
+
const table = this.table;
|
|
875
|
+
return withRetry(async () => {
|
|
876
|
+
try {
|
|
877
|
+
// Retrieve all records - LanceDB doesn't support GROUP BY aggregation,
|
|
878
|
+
// so we must fetch records and group in memory
|
|
879
|
+
// TODO(perf): Consider caching file list or using incremental updates for very large datasets
|
|
880
|
+
const allRecords = await table.query().toArray();
|
|
881
|
+
// Group by file path
|
|
882
|
+
const fileMap = new Map();
|
|
883
|
+
for (const record of allRecords) {
|
|
884
|
+
const filePath = record.filePath;
|
|
885
|
+
const timestamp = record.timestamp;
|
|
886
|
+
if (fileMap.has(filePath)) {
|
|
887
|
+
const fileInfo = fileMap.get(filePath);
|
|
888
|
+
if (fileInfo) {
|
|
889
|
+
fileInfo.chunkCount += 1;
|
|
890
|
+
// Keep most recent timestamp
|
|
891
|
+
if (timestamp > fileInfo.timestamp) {
|
|
892
|
+
fileInfo.timestamp = timestamp;
|
|
893
|
+
}
|
|
818
894
|
}
|
|
819
895
|
}
|
|
896
|
+
else {
|
|
897
|
+
fileMap.set(filePath, { chunkCount: 1, timestamp });
|
|
898
|
+
}
|
|
820
899
|
}
|
|
821
|
-
|
|
822
|
-
|
|
900
|
+
// Convert Map to array of objects
|
|
901
|
+
let results = Array.from(fileMap.entries()).map(([filePath, info]) => ({
|
|
902
|
+
filePath,
|
|
903
|
+
chunkCount: info.chunkCount,
|
|
904
|
+
timestamp: info.timestamp,
|
|
905
|
+
}));
|
|
906
|
+
// Sort by timestamp descending (most recent first)
|
|
907
|
+
results.sort((a, b) => b.timestamp.localeCompare(a.timestamp));
|
|
908
|
+
// Apply pagination if provided
|
|
909
|
+
const offset = options?.offset ?? 0;
|
|
910
|
+
const limit = options?.limit;
|
|
911
|
+
if (offset > 0) {
|
|
912
|
+
results = results.slice(offset);
|
|
823
913
|
}
|
|
914
|
+
if (limit !== undefined && limit > 0) {
|
|
915
|
+
results = results.slice(0, limit);
|
|
916
|
+
}
|
|
917
|
+
return results;
|
|
824
918
|
}
|
|
825
|
-
|
|
826
|
-
|
|
827
|
-
filePath,
|
|
828
|
-
chunkCount: info.chunkCount,
|
|
829
|
-
timestamp: info.timestamp,
|
|
830
|
-
}));
|
|
831
|
-
// Sort by timestamp descending (most recent first)
|
|
832
|
-
results.sort((a, b) => b.timestamp.localeCompare(a.timestamp));
|
|
833
|
-
// Apply pagination if provided
|
|
834
|
-
const offset = options?.offset ?? 0;
|
|
835
|
-
const limit = options?.limit;
|
|
836
|
-
if (offset > 0) {
|
|
837
|
-
results = results.slice(offset);
|
|
838
|
-
}
|
|
839
|
-
if (limit !== undefined && limit > 0) {
|
|
840
|
-
results = results.slice(0, limit);
|
|
919
|
+
catch (error) {
|
|
920
|
+
throw new DatabaseError('Failed to list files', error);
|
|
841
921
|
}
|
|
842
|
-
|
|
843
|
-
}
|
|
844
|
-
catch (error) {
|
|
845
|
-
throw new DatabaseError('Failed to list files', error);
|
|
846
|
-
}
|
|
922
|
+
}, 'VectorStore.listFiles');
|
|
847
923
|
}
|
|
848
924
|
/**
|
|
849
925
|
* Close the database connection and release resources
|
|
@@ -902,30 +978,30 @@ export class VectorStore {
|
|
|
902
978
|
if (!isValidFilePath(filePath)) {
|
|
903
979
|
throw new DatabaseError(`Invalid file path: contains disallowed characters or patterns`);
|
|
904
980
|
}
|
|
905
|
-
|
|
906
|
-
|
|
907
|
-
|
|
908
|
-
.
|
|
909
|
-
.where(`\`filePath\` = '${escapedFilePath}'`)
|
|
910
|
-
|
|
911
|
-
|
|
912
|
-
|
|
913
|
-
|
|
914
|
-
|
|
915
|
-
|
|
916
|
-
|
|
917
|
-
|
|
918
|
-
|
|
919
|
-
|
|
920
|
-
|
|
921
|
-
|
|
922
|
-
};
|
|
923
|
-
|
|
924
|
-
|
|
925
|
-
|
|
926
|
-
|
|
927
|
-
|
|
928
|
-
}
|
|
981
|
+
const table = this.table;
|
|
982
|
+
return withRetry(async () => {
|
|
983
|
+
try {
|
|
984
|
+
const escapedFilePath = filePath.replace(/'/g, "''");
|
|
985
|
+
const results = await table.query().where(`\`filePath\` = '${escapedFilePath}'`).toArray();
|
|
986
|
+
// Convert to SearchResult format and sort by chunkIndex
|
|
987
|
+
const chunks = results.map((record) => {
|
|
988
|
+
const text = record.text;
|
|
989
|
+
return {
|
|
990
|
+
filePath: record.filePath,
|
|
991
|
+
chunkIndex: record.chunkIndex,
|
|
992
|
+
text,
|
|
993
|
+
score: 0, // No distance score for direct retrieval
|
|
994
|
+
metadata: record.metadata,
|
|
995
|
+
// Include fingerprint - generate if not stored (backwards compatibility)
|
|
996
|
+
fingerprint: record.fingerprint || generateChunkFingerprint(text),
|
|
997
|
+
};
|
|
998
|
+
});
|
|
999
|
+
return chunks.sort((a, b) => a.chunkIndex - b.chunkIndex);
|
|
1000
|
+
}
|
|
1001
|
+
catch (error) {
|
|
1002
|
+
throw new DatabaseError(`Failed to get document chunks for: ${filePath}`, error);
|
|
1003
|
+
}
|
|
1004
|
+
}, 'VectorStore.getDocumentChunks');
|
|
929
1005
|
}
|
|
930
1006
|
/**
|
|
931
1007
|
* Find related chunks using a chunk's stored embedding
|
|
@@ -944,59 +1020,62 @@ export class VectorStore {
|
|
|
944
1020
|
if (!isValidFilePath(filePath)) {
|
|
945
1021
|
throw new DatabaseError(`Invalid file path: contains disallowed characters or patterns`);
|
|
946
1022
|
}
|
|
947
|
-
|
|
948
|
-
|
|
949
|
-
|
|
950
|
-
|
|
951
|
-
.
|
|
952
|
-
|
|
953
|
-
|
|
954
|
-
|
|
955
|
-
|
|
956
|
-
|
|
957
|
-
|
|
958
|
-
const rawVector = sourceChunk?.vector;
|
|
959
|
-
// LanceDB may return vectors as Arrow Vector or Float32Array, not plain Array
|
|
960
|
-
// Convert to number[] for compatibility
|
|
961
|
-
let sourceVector;
|
|
962
|
-
if (rawVector) {
|
|
963
|
-
if (Array.isArray(rawVector)) {
|
|
964
|
-
sourceVector = rawVector;
|
|
1023
|
+
const table = this.table;
|
|
1024
|
+
return withRetry(async () => {
|
|
1025
|
+
try {
|
|
1026
|
+
// First, fetch the source chunk to get its vector
|
|
1027
|
+
const escapedFilePath = filePath.replace(/'/g, "''");
|
|
1028
|
+
const sourceResults = await table
|
|
1029
|
+
.query()
|
|
1030
|
+
.where(`\`filePath\` = '${escapedFilePath}' AND \`chunkIndex\` = ${chunkIndex}`)
|
|
1031
|
+
.toArray();
|
|
1032
|
+
if (sourceResults.length === 0) {
|
|
1033
|
+
return [];
|
|
965
1034
|
}
|
|
966
|
-
|
|
967
|
-
|
|
968
|
-
|
|
1035
|
+
const sourceChunk = sourceResults[0];
|
|
1036
|
+
const rawVector = sourceChunk?.vector;
|
|
1037
|
+
// LanceDB may return vectors as Arrow Vector or Float32Array, not plain Array
|
|
1038
|
+
// Convert to number[] for compatibility
|
|
1039
|
+
let sourceVector;
|
|
1040
|
+
if (rawVector) {
|
|
1041
|
+
if (Array.isArray(rawVector)) {
|
|
1042
|
+
sourceVector = rawVector;
|
|
1043
|
+
}
|
|
1044
|
+
else if (typeof rawVector === 'object' && 'length' in rawVector) {
|
|
1045
|
+
// Handle Arrow Vector, Float32Array, or other array-like objects
|
|
1046
|
+
sourceVector = Array.from(rawVector);
|
|
1047
|
+
}
|
|
969
1048
|
}
|
|
1049
|
+
if (!sourceVector || sourceVector.length === 0) {
|
|
1050
|
+
// Chunk exists but has no embedding (e.g., upload timed out mid-process)
|
|
1051
|
+
// Return empty results instead of throwing - allows batch operations to continue
|
|
1052
|
+
console.warn(`Chunk ${filePath}:${chunkIndex} has no valid vector (possibly corrupted)`);
|
|
1053
|
+
return [];
|
|
1054
|
+
}
|
|
1055
|
+
// Search for similar chunks using the source vector
|
|
1056
|
+
// Request more candidates to allow for filtering
|
|
1057
|
+
const candidateLimit = excludeSameDocument ? limit * 3 : limit + 1;
|
|
1058
|
+
let query = table.vectorSearch(sourceVector).distanceType('dot').limit(candidateLimit);
|
|
1059
|
+
// Apply distance threshold if configured
|
|
1060
|
+
if (this.config.maxDistance !== undefined) {
|
|
1061
|
+
query = query.distanceRange(undefined, this.config.maxDistance);
|
|
1062
|
+
}
|
|
1063
|
+
const vectorResults = await query.toArray();
|
|
1064
|
+
// Convert to SearchResult format with type validation
|
|
1065
|
+
let results = vectorResults.map((result) => toSearchResult(result));
|
|
1066
|
+
// Filter out the source chunk itself
|
|
1067
|
+
results = results.filter((r) => !(r.filePath === filePath && r.chunkIndex === chunkIndex));
|
|
1068
|
+
// Optionally filter out same-document chunks
|
|
1069
|
+
if (excludeSameDocument) {
|
|
1070
|
+
results = results.filter((r) => r.filePath !== filePath);
|
|
1071
|
+
}
|
|
1072
|
+
return results.slice(0, limit);
|
|
970
1073
|
}
|
|
971
|
-
|
|
972
|
-
|
|
973
|
-
|
|
974
|
-
console.warn(`Chunk ${filePath}:${chunkIndex} has no valid vector (possibly corrupted)`);
|
|
975
|
-
return [];
|
|
976
|
-
}
|
|
977
|
-
// Search for similar chunks using the source vector
|
|
978
|
-
// Request more candidates to allow for filtering
|
|
979
|
-
const candidateLimit = excludeSameDocument ? limit * 3 : limit + 1;
|
|
980
|
-
let query = this.table.vectorSearch(sourceVector).distanceType('dot').limit(candidateLimit);
|
|
981
|
-
// Apply distance threshold if configured
|
|
982
|
-
if (this.config.maxDistance !== undefined) {
|
|
983
|
-
query = query.distanceRange(undefined, this.config.maxDistance);
|
|
984
|
-
}
|
|
985
|
-
const vectorResults = await query.toArray();
|
|
986
|
-
// Convert to SearchResult format with type validation
|
|
987
|
-
let results = vectorResults.map((result) => toSearchResult(result));
|
|
988
|
-
// Filter out the source chunk itself
|
|
989
|
-
results = results.filter((r) => !(r.filePath === filePath && r.chunkIndex === chunkIndex));
|
|
990
|
-
// Optionally filter out same-document chunks
|
|
991
|
-
if (excludeSameDocument) {
|
|
992
|
-
results = results.filter((r) => r.filePath !== filePath);
|
|
1074
|
+
catch (error) {
|
|
1075
|
+
const cause = error instanceof Error ? error.message : String(error);
|
|
1076
|
+
throw new DatabaseError(`Failed to find related chunks for: ${filePath}:${chunkIndex}: ${cause}`, error);
|
|
993
1077
|
}
|
|
994
|
-
|
|
995
|
-
}
|
|
996
|
-
catch (error) {
|
|
997
|
-
const cause = error instanceof Error ? error.message : String(error);
|
|
998
|
-
throw new DatabaseError(`Failed to find related chunks for: ${filePath}:${chunkIndex}: ${cause}`, error);
|
|
999
|
-
}
|
|
1078
|
+
}, 'VectorStore.findRelatedChunks');
|
|
1000
1079
|
}
|
|
1001
1080
|
/**
|
|
1002
1081
|
* Get system status
|