@robthepcguy/rag-vault 1.7.2 → 1.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +92 -40
- package/dist/chunker/semantic-chunker.d.ts +0 -1
- package/dist/chunker/semantic-chunker.d.ts.map +1 -1
- package/dist/chunker/semantic-chunker.js +1 -1
- package/dist/chunker/semantic-chunker.js.map +1 -1
- package/dist/embedder/index.d.ts +5 -0
- package/dist/embedder/index.d.ts.map +1 -1
- package/dist/embedder/index.js +40 -5
- package/dist/embedder/index.js.map +1 -1
- package/dist/errors/index.d.ts +1 -1
- package/dist/errors/index.d.ts.map +1 -1
- package/dist/flywheel/feedback.d.ts +1 -1
- package/dist/flywheel/feedback.d.ts.map +1 -1
- package/dist/flywheel/feedback.js +1 -1
- package/dist/flywheel/feedback.js.map +1 -1
- package/dist/hyde/index.d.ts +47 -0
- package/dist/hyde/index.d.ts.map +1 -0
- package/dist/hyde/index.js +203 -0
- package/dist/hyde/index.js.map +1 -0
- package/dist/parser/pdf-filter.d.ts +3 -5
- package/dist/parser/pdf-filter.d.ts.map +1 -1
- package/dist/parser/pdf-filter.js +1 -1
- package/dist/parser/pdf-filter.js.map +1 -1
- package/dist/query/parser.d.ts +2 -6
- package/dist/query/parser.d.ts.map +1 -1
- package/dist/query/parser.js +14 -22
- package/dist/query/parser.js.map +1 -1
- package/dist/reranker/index.d.ts +76 -0
- package/dist/reranker/index.d.ts.map +1 -0
- package/dist/reranker/index.js +199 -0
- package/dist/reranker/index.js.map +1 -0
- package/dist/server/index.d.ts +25 -0
- package/dist/server/index.d.ts.map +1 -1
- package/dist/server/index.js +140 -48
- package/dist/server/index.js.map +1 -1
- package/dist/server/raw-data-utils.d.ts +0 -40
- package/dist/server/raw-data-utils.d.ts.map +1 -1
- package/dist/server/raw-data-utils.js +9 -8
- package/dist/server/raw-data-utils.js.map +1 -1
- package/dist/server/remote-transport.d.ts +2 -1
- package/dist/server/remote-transport.d.ts.map +1 -1
- package/dist/server/remote-transport.js +26 -6
- package/dist/server/remote-transport.js.map +1 -1
- package/dist/server/schemas.d.ts +26 -129
- package/dist/server/schemas.d.ts.map +1 -1
- package/dist/server/schemas.js +9 -9
- package/dist/server/schemas.js.map +1 -1
- package/dist/utils/config-parsers.d.ts +14 -0
- package/dist/utils/config-parsers.d.ts.map +1 -1
- package/dist/utils/config-parsers.js +26 -0
- package/dist/utils/config-parsers.js.map +1 -1
- package/dist/utils/config.d.ts +23 -0
- package/dist/utils/config.d.ts.map +1 -1
- package/dist/utils/config.js +39 -1
- package/dist/utils/config.js.map +1 -1
- package/dist/utils/file-utils.d.ts.map +1 -1
- package/dist/utils/file-utils.js +17 -1
- package/dist/utils/file-utils.js.map +1 -1
- package/dist/vectordb/index.d.ts +45 -16
- package/dist/vectordb/index.d.ts.map +1 -1
- package/dist/vectordb/index.js +363 -170
- package/dist/vectordb/index.js.map +1 -1
- package/dist/web/api-routes.d.ts.map +1 -1
- package/dist/web/api-routes.js +23 -10
- package/dist/web/api-routes.js.map +1 -1
- package/dist/web/database-manager.d.ts.map +1 -1
- package/dist/web/database-manager.js +32 -25
- package/dist/web/database-manager.js.map +1 -1
- package/dist/web/http-server.d.ts +0 -5
- package/dist/web/http-server.d.ts.map +1 -1
- package/dist/web/http-server.js +3 -7
- package/dist/web/http-server.js.map +1 -1
- package/dist/web/middleware/async-handler.d.ts +2 -1
- package/dist/web/middleware/async-handler.d.ts.map +1 -1
- package/dist/web/middleware/rate-limit.d.ts +2 -1
- package/dist/web/middleware/rate-limit.d.ts.map +1 -1
- package/dist/web/middleware/request-logger.d.ts +1 -1
- package/dist/web/middleware/request-logger.d.ts.map +1 -1
- package/package.json +8 -7
- package/skills/rag-vault/SKILL.md +3 -3
- package/skills/rag-vault/references/html-ingestion.md +1 -1
- package/web-ui/dist/assets/{CollectionsPage-BDmEfv3V.js → CollectionsPage-wbfgYFTw.js} +1 -1
- package/web-ui/dist/assets/{FilesPage-pG9HmpgQ.js → FilesPage-D6TlldaR.js} +1 -1
- package/web-ui/dist/assets/ReaderPage-Sgy0vMZ6.js +28 -0
- package/web-ui/dist/assets/{ReaderSettingsContext-CkSjqsRh.js → ReaderSettingsContext-DsvLXuaf.js} +1 -1
- package/web-ui/dist/assets/{SearchPage-DAltjnLL.js → SearchPage-mPKXZEyq.js} +1 -1
- package/web-ui/dist/assets/{SettingsPage-C6J5BITP.js → SettingsPage-DXeWwfvd.js} +1 -1
- package/web-ui/dist/assets/{StatusPage-powRGmW3.js → StatusPage-AirpfsGF.js} +1 -1
- package/web-ui/dist/assets/{UploadPage-eyfSjL4u.js → UploadPage-Cob25kDa.js} +5 -5
- package/web-ui/dist/assets/index-BZMzEssr.js +6 -0
- package/web-ui/dist/assets/index-DovQIIL4.css +1 -0
- package/web-ui/dist/assets/motion-DdHBXDWx.js +9 -0
- package/web-ui/dist/assets/query-DbAD_nLW.js +1 -0
- package/web-ui/dist/assets/vendor-DNJ-hWNb.js +10 -0
- package/web-ui/dist/index.html +4 -4
- package/web-ui/dist/assets/ReaderPage-CwMN03NU.js +0 -28
- package/web-ui/dist/assets/index-BpwaiuGL.css +0 -1
- package/web-ui/dist/assets/index-D068MV_o.js +0 -6
- package/web-ui/dist/assets/motion-CKwJwI3J.js +0 -9
- package/web-ui/dist/assets/query-DPt-uCb6.js +0 -1
- package/web-ui/dist/assets/vendor-C2QPsZ3S.js +0 -10
package/dist/vectordb/index.js
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
// VectorStore implementation with LanceDB integration
|
|
2
2
|
import { createHash } from 'node:crypto';
|
|
3
|
-
import {
|
|
3
|
+
import { connect, Index } from '@lancedb/lancedb';
|
|
4
4
|
import { DatabaseError } from '../errors/index.js';
|
|
5
5
|
// Re-export error class for backwards compatibility
|
|
6
6
|
export { DatabaseError } from '../errors/index.js';
|
|
@@ -15,7 +15,9 @@ function parseEnvNumber(envVar, defaultValue) {
|
|
|
15
15
|
if (!value)
|
|
16
16
|
return defaultValue;
|
|
17
17
|
const parsed = Number.parseFloat(value);
|
|
18
|
-
|
|
18
|
+
if (Number.isNaN(parsed) || !Number.isFinite(parsed))
|
|
19
|
+
return defaultValue;
|
|
20
|
+
return parsed;
|
|
19
21
|
}
|
|
20
22
|
/**
|
|
21
23
|
* Parse an integer environment variable with fallback
|
|
@@ -25,7 +27,9 @@ function parseEnvInt(envVar, defaultValue) {
|
|
|
25
27
|
if (!value)
|
|
26
28
|
return defaultValue;
|
|
27
29
|
const parsed = Number.parseInt(value, 10);
|
|
28
|
-
|
|
30
|
+
if (Number.isNaN(parsed) || !Number.isFinite(parsed))
|
|
31
|
+
return defaultValue;
|
|
32
|
+
return parsed;
|
|
29
33
|
}
|
|
30
34
|
/**
|
|
31
35
|
* Standard deviation multiplier for detecting group boundaries.
|
|
@@ -39,6 +43,17 @@ const GROUPING_BOUNDARY_STD_MULTIPLIER = parseEnvNumber('RAG_GROUPING_STD_MULTIP
|
|
|
39
43
|
* Configure via RAG_HYBRID_CANDIDATE_MULTIPLIER environment variable.
|
|
40
44
|
*/
|
|
41
45
|
const HYBRID_SEARCH_CANDIDATE_MULTIPLIER = parseEnvInt('RAG_HYBRID_CANDIDATE_MULTIPLIER', 2);
|
|
46
|
+
/**
|
|
47
|
+
* RRF smoothing constant (k). Higher values produce smoother rank fusion.
|
|
48
|
+
* Standard value is 60. Configure via RAG_RRF_K environment variable.
|
|
49
|
+
*/
|
|
50
|
+
const RRF_K = parseEnvInt('RAG_RRF_K', 60);
|
|
51
|
+
const SEARCH_MODE = (() => {
|
|
52
|
+
const mode = process.env['RAG_SEARCH_MODE']?.toLowerCase().trim();
|
|
53
|
+
if (mode === 'rrf')
|
|
54
|
+
return 'rrf';
|
|
55
|
+
return 'boost'; // default: backward-compatible
|
|
56
|
+
})();
|
|
42
57
|
/** FTS index name (bump version when changing tokenizer settings) */
|
|
43
58
|
const FTS_INDEX_NAME = 'fts_index_v2';
|
|
44
59
|
/** Threshold for cleaning up old index versions (1 minute) */
|
|
@@ -81,6 +96,27 @@ const CUSTOM_METADATA_ALL_FIELDS = '__all__';
|
|
|
81
96
|
* Rejects paths with SQL injection attempts or path traversal.
|
|
82
97
|
*/
|
|
83
98
|
const SAFE_PATH_REGEX = /^[a-zA-Z0-9\\/_.:\- ]+$/;
|
|
99
|
+
/**
|
|
100
|
+
* Retry a read-only async operation with exponential backoff.
|
|
101
|
+
* Used for transient disk/IO errors on VectorStore reads.
|
|
102
|
+
*/
|
|
103
|
+
async function withRetry(fn, label, maxAttempts = 3, baseDelayMs = 100) {
|
|
104
|
+
let lastError;
|
|
105
|
+
for (let attempt = 1; attempt <= maxAttempts; attempt++) {
|
|
106
|
+
try {
|
|
107
|
+
return await fn();
|
|
108
|
+
}
|
|
109
|
+
catch (error) {
|
|
110
|
+
lastError = error;
|
|
111
|
+
if (attempt < maxAttempts) {
|
|
112
|
+
const delayMs = baseDelayMs * 2 ** (attempt - 1);
|
|
113
|
+
console.warn(`${label}: attempt ${attempt}/${maxAttempts} failed (${lastError.message}), retrying in ${delayMs}ms...`);
|
|
114
|
+
await new Promise((resolve) => setTimeout(resolve, delayMs));
|
|
115
|
+
}
|
|
116
|
+
}
|
|
117
|
+
}
|
|
118
|
+
throw lastError;
|
|
119
|
+
}
|
|
84
120
|
/**
|
|
85
121
|
* Validate file path to prevent SQL injection and path traversal attacks.
|
|
86
122
|
* @param filePath - The file path to validate
|
|
@@ -121,7 +157,7 @@ function normalizeTextForFingerprint(text) {
|
|
|
121
157
|
* Uses SHA-256 hash of normalized text (first 16 hex chars for compactness).
|
|
122
158
|
* This enables stable chunk identification across re-indexing.
|
|
123
159
|
*/
|
|
124
|
-
|
|
160
|
+
function generateChunkFingerprint(text) {
|
|
125
161
|
const normalized = normalizeTextForFingerprint(text);
|
|
126
162
|
const hash = createHash('sha256').update(normalized, 'utf8').digest('hex');
|
|
127
163
|
// Use first 16 characters (64 bits) - sufficient for practical uniqueness
|
|
@@ -223,6 +259,7 @@ export class VectorStore {
|
|
|
223
259
|
db = null;
|
|
224
260
|
table = null;
|
|
225
261
|
config;
|
|
262
|
+
rrfMaxDistanceWarned = false;
|
|
226
263
|
ftsEnabled = false;
|
|
227
264
|
ftsFailureCount = 0;
|
|
228
265
|
ftsLastFailure = null;
|
|
@@ -507,6 +544,48 @@ export class VectorStore {
|
|
|
507
544
|
// Ignorable errors (no matching records) are logged but not thrown
|
|
508
545
|
}
|
|
509
546
|
}
|
|
547
|
+
/**
|
|
548
|
+
* Delete chunks for a file, excluding a set of IDs.
|
|
549
|
+
* Used by insert-then-delete re-ingestion to remove old vectors
|
|
550
|
+
* while keeping newly inserted ones.
|
|
551
|
+
*
|
|
552
|
+
* @param filePath - File path whose old chunks should be removed
|
|
553
|
+
* @param excludeIds - Set of chunk IDs to keep (the new batch)
|
|
554
|
+
*/
|
|
555
|
+
async deleteChunksExcluding(filePath, excludeIds) {
|
|
556
|
+
if (!this.table || excludeIds.size === 0) {
|
|
557
|
+
return;
|
|
558
|
+
}
|
|
559
|
+
if (!isValidFilePath(filePath)) {
|
|
560
|
+
throw new DatabaseError('Invalid file path: contains disallowed characters or patterns');
|
|
561
|
+
}
|
|
562
|
+
const escapedFilePath = filePath.replace(/'/g, "''");
|
|
563
|
+
try {
|
|
564
|
+
// Query existing chunks for this file to find old IDs
|
|
565
|
+
const existing = await this.table
|
|
566
|
+
.query()
|
|
567
|
+
.where(`\`filePath\` = '${escapedFilePath}'`)
|
|
568
|
+
.select(['id'])
|
|
569
|
+
.toArray();
|
|
570
|
+
const oldIds = existing.map((row) => row.id).filter((id) => !excludeIds.has(id));
|
|
571
|
+
if (oldIds.length === 0) {
|
|
572
|
+
return;
|
|
573
|
+
}
|
|
574
|
+
// Delete old chunks by ID
|
|
575
|
+
const idList = oldIds.map((id) => `'${id.replace(/'/g, "''")}'`).join(', ');
|
|
576
|
+
await this.table.delete(`\`id\` IN (${idList})`);
|
|
577
|
+
console.error(`VectorStore: Removed ${oldIds.length} old chunks for "${filePath}"`);
|
|
578
|
+
await this.rebuildFtsIndex();
|
|
579
|
+
}
|
|
580
|
+
catch (error) {
|
|
581
|
+
// Non-fatal: temporary duplicates are acceptable
|
|
582
|
+
const errorMessage = error.message.toLowerCase();
|
|
583
|
+
const isIgnorable = DELETE_IGNORABLE_PATTERNS.some((pattern) => errorMessage.includes(pattern));
|
|
584
|
+
if (!isIgnorable) {
|
|
585
|
+
throw new DatabaseError(`Failed to clean up old chunks for file: ${filePath}`, error);
|
|
586
|
+
}
|
|
587
|
+
}
|
|
588
|
+
}
|
|
510
589
|
/**
|
|
511
590
|
* Batch insert vector chunks
|
|
512
591
|
*
|
|
@@ -669,18 +748,22 @@ export class VectorStore {
|
|
|
669
748
|
}
|
|
670
749
|
/**
|
|
671
750
|
* Execute vector search with quality filtering
|
|
672
|
-
* Architecture: Semantic search → Filter (maxDistance, grouping) → Keyword boost
|
|
673
751
|
*
|
|
674
|
-
*
|
|
675
|
-
* -
|
|
676
|
-
*
|
|
752
|
+
* Supports two search modes (configured via RAG_SEARCH_MODE):
|
|
753
|
+
* - 'rrf': Reciprocal Rank Fusion — vector and BM25 are independent voters,
|
|
754
|
+
* results are fused by rank position. Recommended for most use cases.
|
|
755
|
+
* - 'boost': Legacy mode — BM25 multiplicatively boosts vector distances.
|
|
756
|
+
*
|
|
757
|
+
* Architecture (RRF mode): Vector search + FTS search → RRF fusion → Grouping → Limit
|
|
758
|
+
* Architecture (Boost mode): Vector search → Grouping → Keyword boost → Limit
|
|
677
759
|
*
|
|
678
760
|
* @param queryVector - Query vector (dimension depends on model)
|
|
679
|
-
* @param queryText - Optional query text for keyword
|
|
761
|
+
* @param queryText - Optional query text for keyword matching (BM25)
|
|
680
762
|
* @param limit - Number of results to retrieve (default 10)
|
|
763
|
+
* @param additionalVectors - Optional additional query vectors with weights (for HyDE)
|
|
681
764
|
* @returns Array of search results (sorted by distance ascending, filtered by quality settings)
|
|
682
765
|
*/
|
|
683
|
-
async search(queryVector, queryText, limit = 10) {
|
|
766
|
+
async search(queryVector, queryText, limit = 10, additionalVectors) {
|
|
684
767
|
if (!this.table) {
|
|
685
768
|
console.error('VectorStore: Returning empty results as table does not exist');
|
|
686
769
|
return [];
|
|
@@ -688,55 +771,153 @@ export class VectorStore {
|
|
|
688
771
|
if (limit < 1 || limit > 20) {
|
|
689
772
|
throw new DatabaseError(`Invalid limit: expected 1-20, got ${limit}`);
|
|
690
773
|
}
|
|
691
|
-
|
|
692
|
-
|
|
693
|
-
|
|
694
|
-
|
|
695
|
-
|
|
696
|
-
|
|
697
|
-
|
|
698
|
-
|
|
699
|
-
|
|
700
|
-
|
|
701
|
-
|
|
702
|
-
|
|
703
|
-
// Step 2: Apply grouping filter on vector distances (before keyword boost)
|
|
704
|
-
// Grouping is meaningful only on semantic distances, not after keyword boost
|
|
705
|
-
if (this.config.grouping && results.length > 1) {
|
|
706
|
-
results = this.applyGrouping(results, this.config.grouping);
|
|
707
|
-
}
|
|
708
|
-
// Step 3: Apply keyword boost if enabled (with circuit breaker)
|
|
709
|
-
const hybridWeight = this.getHybridWeight();
|
|
710
|
-
if (this.shouldAttemptFts() && queryText && queryText.trim().length > 0 && hybridWeight > 0) {
|
|
711
|
-
try {
|
|
712
|
-
// Get unique filePaths from vector results to filter FTS search
|
|
713
|
-
const uniqueFilePaths = [...new Set(results.map((r) => r.filePath))];
|
|
714
|
-
// Build WHERE clause with IN for targeted FTS search
|
|
715
|
-
// Use backticks for column name (required for camelCase in LanceDB)
|
|
716
|
-
const escapedPaths = uniqueFilePaths.map((p) => `'${p.replace(/'/g, "''")}'`);
|
|
717
|
-
const whereClause = `\`filePath\` IN (${escapedPaths.join(', ')})`;
|
|
718
|
-
const ftsResults = await this.table
|
|
719
|
-
.search(queryText, 'fts', 'text')
|
|
720
|
-
.where(whereClause)
|
|
721
|
-
.select(['filePath', 'chunkIndex', 'text', 'metadata', '_score'])
|
|
722
|
-
.limit(results.length * 2) // Enough to cover all vector results
|
|
723
|
-
.toArray();
|
|
724
|
-
results = this.applyKeywordBoost(results, ftsResults, hybridWeight);
|
|
725
|
-
// FTS succeeded - reset circuit breaker
|
|
726
|
-
this.recordFtsSuccess();
|
|
774
|
+
const table = this.table;
|
|
775
|
+
return withRetry(async () => {
|
|
776
|
+
try {
|
|
777
|
+
const candidateLimit = limit * HYBRID_SEARCH_CANDIDATE_MULTIPLIER;
|
|
778
|
+
const hybridWeight = this.getHybridWeight();
|
|
779
|
+
const useRRF = (this.config.searchMode ?? SEARCH_MODE) === 'rrf';
|
|
780
|
+
if (useRRF && this.config.maxDistance !== undefined && !this.rrfMaxDistanceWarned) {
|
|
781
|
+
this.rrfMaxDistanceWarned = true;
|
|
782
|
+
console.error(`Warning: maxDistance (${this.config.maxDistance}) is set with RRF search mode. ` +
|
|
783
|
+
'In RRF mode, scores are rank-based pseudo-distances, not vector distances. ' +
|
|
784
|
+
'Your threshold may not behave as expected. Consider using top-k limiting instead, ' +
|
|
785
|
+
'or set RAG_SEARCH_MODE=boost for distance-based filtering.');
|
|
727
786
|
}
|
|
728
|
-
|
|
729
|
-
|
|
730
|
-
|
|
731
|
-
|
|
787
|
+
// Step 1: Primary vector search
|
|
788
|
+
let vectorQuery = table.vectorSearch(queryVector).distanceType('dot').limit(candidateLimit);
|
|
789
|
+
if (!useRRF && this.config.maxDistance !== undefined) {
|
|
790
|
+
vectorQuery = vectorQuery.distanceRange(undefined, this.config.maxDistance);
|
|
732
791
|
}
|
|
792
|
+
const vectorRaw = await vectorQuery.toArray();
|
|
793
|
+
let results = vectorRaw.map((result) => toSearchResult(result));
|
|
794
|
+
if (useRRF) {
|
|
795
|
+
// ===== RRF MODE =====
|
|
796
|
+
// Collect all vector results into a single RRF candidate pool
|
|
797
|
+
// Primary query has weight 1.0; additional vectors (HyDE) have their own weights
|
|
798
|
+
const allVectorResults = [
|
|
799
|
+
{ results, weight: 1.0 },
|
|
800
|
+
];
|
|
801
|
+
// Run additional vector searches (HyDE expansions) in parallel if provided
|
|
802
|
+
if (additionalVectors && additionalVectors.length > 0) {
|
|
803
|
+
const additionalSearches = additionalVectors.map(async ({ vector, weight }) => {
|
|
804
|
+
const addlQuery = table.vectorSearch(vector).distanceType('dot').limit(candidateLimit);
|
|
805
|
+
const addlRaw = await addlQuery.toArray();
|
|
806
|
+
return {
|
|
807
|
+
results: addlRaw.map((r) => toSearchResult(r)),
|
|
808
|
+
weight,
|
|
809
|
+
};
|
|
810
|
+
});
|
|
811
|
+
allVectorResults.push(...(await Promise.all(additionalSearches)));
|
|
812
|
+
}
|
|
813
|
+
// Build unified candidate map with RRF scoring across all vector queries
|
|
814
|
+
const candidates = new Map();
|
|
815
|
+
const k = this.config.rrfK ?? RRF_K;
|
|
816
|
+
for (const { results: vecResults, weight: queryWeight } of allVectorResults) {
|
|
817
|
+
const vectorWeight = queryWeight * (1 - hybridWeight);
|
|
818
|
+
for (let rank = 0; rank < vecResults.length; rank++) {
|
|
819
|
+
const r = vecResults[rank];
|
|
820
|
+
if (!r)
|
|
821
|
+
continue;
|
|
822
|
+
const key = `${r.filePath}:${r.chunkIndex}`;
|
|
823
|
+
const contribution = vectorWeight / (k + rank + 1);
|
|
824
|
+
const existing = candidates.get(key);
|
|
825
|
+
if (existing) {
|
|
826
|
+
existing.rrfScore += contribution;
|
|
827
|
+
}
|
|
828
|
+
else {
|
|
829
|
+
candidates.set(key, { result: r, rrfScore: contribution });
|
|
830
|
+
}
|
|
831
|
+
}
|
|
832
|
+
}
|
|
833
|
+
// FTS channel (BM25) — run independently, no pre-filtering by vector results
|
|
834
|
+
if (this.shouldAttemptFts() &&
|
|
835
|
+
queryText &&
|
|
836
|
+
queryText.trim().length > 0 &&
|
|
837
|
+
hybridWeight > 0) {
|
|
838
|
+
try {
|
|
839
|
+
const ftsResults = await table
|
|
840
|
+
.search(queryText, 'fts', 'text')
|
|
841
|
+
.select(['filePath', 'chunkIndex', 'text', 'metadata', '_score'])
|
|
842
|
+
.limit(candidateLimit)
|
|
843
|
+
.toArray();
|
|
844
|
+
// BM25 channel votes
|
|
845
|
+
const bm25Weight = hybridWeight;
|
|
846
|
+
for (let rank = 0; rank < ftsResults.length; rank++) {
|
|
847
|
+
const ftsResult = ftsResults[rank];
|
|
848
|
+
if (!ftsResult)
|
|
849
|
+
continue;
|
|
850
|
+
const key = `${ftsResult['filePath']}:${ftsResult['chunkIndex']}`;
|
|
851
|
+
const contribution = bm25Weight / (k + rank + 1);
|
|
852
|
+
const existing = candidates.get(key);
|
|
853
|
+
if (existing) {
|
|
854
|
+
existing.rrfScore += contribution;
|
|
855
|
+
}
|
|
856
|
+
else {
|
|
857
|
+
try {
|
|
858
|
+
const searchResult = toSearchResult(ftsResult);
|
|
859
|
+
candidates.set(key, { result: searchResult, rrfScore: contribution });
|
|
860
|
+
}
|
|
861
|
+
catch { }
|
|
862
|
+
}
|
|
863
|
+
}
|
|
864
|
+
this.recordFtsSuccess();
|
|
865
|
+
}
|
|
866
|
+
catch (ftsError) {
|
|
867
|
+
this.recordFtsFailure(ftsError);
|
|
868
|
+
}
|
|
869
|
+
}
|
|
870
|
+
// Sort by RRF score descending, convert to pseudo-distance
|
|
871
|
+
const sorted = Array.from(candidates.values()).sort((a, b) => b.rrfScore - a.rrfScore);
|
|
872
|
+
results = sorted.map(({ result, rrfScore }) => ({
|
|
873
|
+
...result,
|
|
874
|
+
score: 1 / (1 + rrfScore),
|
|
875
|
+
}));
|
|
876
|
+
// Apply maxDistance filter on pseudo-distances
|
|
877
|
+
if (this.config.maxDistance !== undefined) {
|
|
878
|
+
results = results.filter((r) => r.score <= this.config.maxDistance);
|
|
879
|
+
}
|
|
880
|
+
// Apply grouping on RRF pseudo-distances
|
|
881
|
+
if (this.config.grouping && results.length > 1) {
|
|
882
|
+
results = this.applyGrouping(results, this.config.grouping);
|
|
883
|
+
}
|
|
884
|
+
}
|
|
885
|
+
else {
|
|
886
|
+
// ===== LEGACY BOOST MODE =====
|
|
887
|
+
// Step 2: Apply grouping on vector distances (before keyword boost)
|
|
888
|
+
if (this.config.grouping && results.length > 1) {
|
|
889
|
+
results = this.applyGrouping(results, this.config.grouping);
|
|
890
|
+
}
|
|
891
|
+
// Step 3: Apply keyword boost if enabled (with circuit breaker)
|
|
892
|
+
if (this.shouldAttemptFts() &&
|
|
893
|
+
queryText &&
|
|
894
|
+
queryText.trim().length > 0 &&
|
|
895
|
+
hybridWeight > 0) {
|
|
896
|
+
try {
|
|
897
|
+
const uniqueFilePaths = [...new Set(results.map((r) => r.filePath))];
|
|
898
|
+
const escapedPaths = uniqueFilePaths.map((p) => `'${p.replace(/'/g, "''")}'`);
|
|
899
|
+
const whereClause = `\`filePath\` IN (${escapedPaths.join(', ')})`;
|
|
900
|
+
const ftsResults = await table
|
|
901
|
+
.search(queryText, 'fts', 'text')
|
|
902
|
+
.where(whereClause)
|
|
903
|
+
.select(['filePath', 'chunkIndex', 'text', 'metadata', '_score'])
|
|
904
|
+
.limit(results.length * 2)
|
|
905
|
+
.toArray();
|
|
906
|
+
results = this.applyKeywordBoost(results, ftsResults, hybridWeight);
|
|
907
|
+
this.recordFtsSuccess();
|
|
908
|
+
}
|
|
909
|
+
catch (ftsError) {
|
|
910
|
+
this.recordFtsFailure(ftsError);
|
|
911
|
+
}
|
|
912
|
+
}
|
|
913
|
+
}
|
|
914
|
+
// Return top results after all filtering and fusion
|
|
915
|
+
return results.slice(0, limit);
|
|
733
916
|
}
|
|
734
|
-
|
|
735
|
-
|
|
736
|
-
|
|
737
|
-
|
|
738
|
-
throw new DatabaseError('Failed to search vectors', error);
|
|
739
|
-
}
|
|
917
|
+
catch (error) {
|
|
918
|
+
throw new DatabaseError('Failed to search vectors', error);
|
|
919
|
+
}
|
|
920
|
+
}, 'VectorStore.search');
|
|
740
921
|
}
|
|
741
922
|
/**
|
|
742
923
|
* Apply keyword boost to rerank vector search results
|
|
@@ -798,52 +979,55 @@ export class VectorStore {
|
|
|
798
979
|
if (!this.table) {
|
|
799
980
|
return []; // Return empty array if table doesn't exist
|
|
800
981
|
}
|
|
801
|
-
|
|
802
|
-
|
|
803
|
-
|
|
804
|
-
|
|
805
|
-
|
|
806
|
-
|
|
807
|
-
|
|
808
|
-
|
|
809
|
-
const
|
|
810
|
-
const
|
|
811
|
-
|
|
812
|
-
const
|
|
813
|
-
if (
|
|
814
|
-
fileInfo
|
|
815
|
-
|
|
816
|
-
|
|
817
|
-
|
|
982
|
+
const table = this.table;
|
|
983
|
+
return withRetry(async () => {
|
|
984
|
+
try {
|
|
985
|
+
// Retrieve all records - LanceDB doesn't support GROUP BY aggregation,
|
|
986
|
+
// so we must fetch records and group in memory
|
|
987
|
+
// TODO(perf): Consider caching file list or using incremental updates for very large datasets
|
|
988
|
+
const allRecords = await table.query().toArray();
|
|
989
|
+
// Group by file path
|
|
990
|
+
const fileMap = new Map();
|
|
991
|
+
for (const record of allRecords) {
|
|
992
|
+
const filePath = record.filePath;
|
|
993
|
+
const timestamp = record.timestamp;
|
|
994
|
+
if (fileMap.has(filePath)) {
|
|
995
|
+
const fileInfo = fileMap.get(filePath);
|
|
996
|
+
if (fileInfo) {
|
|
997
|
+
fileInfo.chunkCount += 1;
|
|
998
|
+
// Keep most recent timestamp
|
|
999
|
+
if (timestamp > fileInfo.timestamp) {
|
|
1000
|
+
fileInfo.timestamp = timestamp;
|
|
1001
|
+
}
|
|
818
1002
|
}
|
|
819
1003
|
}
|
|
1004
|
+
else {
|
|
1005
|
+
fileMap.set(filePath, { chunkCount: 1, timestamp });
|
|
1006
|
+
}
|
|
820
1007
|
}
|
|
821
|
-
|
|
822
|
-
|
|
1008
|
+
// Convert Map to array of objects
|
|
1009
|
+
let results = Array.from(fileMap.entries()).map(([filePath, info]) => ({
|
|
1010
|
+
filePath,
|
|
1011
|
+
chunkCount: info.chunkCount,
|
|
1012
|
+
timestamp: info.timestamp,
|
|
1013
|
+
}));
|
|
1014
|
+
// Sort by timestamp descending (most recent first)
|
|
1015
|
+
results.sort((a, b) => b.timestamp.localeCompare(a.timestamp));
|
|
1016
|
+
// Apply pagination if provided
|
|
1017
|
+
const offset = options?.offset ?? 0;
|
|
1018
|
+
const limit = options?.limit;
|
|
1019
|
+
if (offset > 0) {
|
|
1020
|
+
results = results.slice(offset);
|
|
823
1021
|
}
|
|
1022
|
+
if (limit !== undefined && limit > 0) {
|
|
1023
|
+
results = results.slice(0, limit);
|
|
1024
|
+
}
|
|
1025
|
+
return results;
|
|
824
1026
|
}
|
|
825
|
-
|
|
826
|
-
|
|
827
|
-
filePath,
|
|
828
|
-
chunkCount: info.chunkCount,
|
|
829
|
-
timestamp: info.timestamp,
|
|
830
|
-
}));
|
|
831
|
-
// Sort by timestamp descending (most recent first)
|
|
832
|
-
results.sort((a, b) => b.timestamp.localeCompare(a.timestamp));
|
|
833
|
-
// Apply pagination if provided
|
|
834
|
-
const offset = options?.offset ?? 0;
|
|
835
|
-
const limit = options?.limit;
|
|
836
|
-
if (offset > 0) {
|
|
837
|
-
results = results.slice(offset);
|
|
838
|
-
}
|
|
839
|
-
if (limit !== undefined && limit > 0) {
|
|
840
|
-
results = results.slice(0, limit);
|
|
1027
|
+
catch (error) {
|
|
1028
|
+
throw new DatabaseError('Failed to list files', error);
|
|
841
1029
|
}
|
|
842
|
-
|
|
843
|
-
}
|
|
844
|
-
catch (error) {
|
|
845
|
-
throw new DatabaseError('Failed to list files', error);
|
|
846
|
-
}
|
|
1030
|
+
}, 'VectorStore.listFiles');
|
|
847
1031
|
}
|
|
848
1032
|
/**
|
|
849
1033
|
* Close the database connection and release resources
|
|
@@ -902,30 +1086,30 @@ export class VectorStore {
|
|
|
902
1086
|
if (!isValidFilePath(filePath)) {
|
|
903
1087
|
throw new DatabaseError(`Invalid file path: contains disallowed characters or patterns`);
|
|
904
1088
|
}
|
|
905
|
-
|
|
906
|
-
|
|
907
|
-
|
|
908
|
-
.
|
|
909
|
-
.where(`\`filePath\` = '${escapedFilePath}'`)
|
|
910
|
-
|
|
911
|
-
|
|
912
|
-
|
|
913
|
-
|
|
914
|
-
|
|
915
|
-
|
|
916
|
-
|
|
917
|
-
|
|
918
|
-
|
|
919
|
-
|
|
920
|
-
|
|
921
|
-
|
|
922
|
-
};
|
|
923
|
-
|
|
924
|
-
|
|
925
|
-
|
|
926
|
-
|
|
927
|
-
|
|
928
|
-
}
|
|
1089
|
+
const table = this.table;
|
|
1090
|
+
return withRetry(async () => {
|
|
1091
|
+
try {
|
|
1092
|
+
const escapedFilePath = filePath.replace(/'/g, "''");
|
|
1093
|
+
const results = await table.query().where(`\`filePath\` = '${escapedFilePath}'`).toArray();
|
|
1094
|
+
// Convert to SearchResult format and sort by chunkIndex
|
|
1095
|
+
const chunks = results.map((record) => {
|
|
1096
|
+
const text = record.text;
|
|
1097
|
+
return {
|
|
1098
|
+
filePath: record.filePath,
|
|
1099
|
+
chunkIndex: record.chunkIndex,
|
|
1100
|
+
text,
|
|
1101
|
+
score: 0, // No distance score for direct retrieval
|
|
1102
|
+
metadata: record.metadata,
|
|
1103
|
+
// Include fingerprint - generate if not stored (backwards compatibility)
|
|
1104
|
+
fingerprint: record.fingerprint || generateChunkFingerprint(text),
|
|
1105
|
+
};
|
|
1106
|
+
});
|
|
1107
|
+
return chunks.sort((a, b) => a.chunkIndex - b.chunkIndex);
|
|
1108
|
+
}
|
|
1109
|
+
catch (error) {
|
|
1110
|
+
throw new DatabaseError(`Failed to get document chunks for: ${filePath}`, error);
|
|
1111
|
+
}
|
|
1112
|
+
}, 'VectorStore.getDocumentChunks');
|
|
929
1113
|
}
|
|
930
1114
|
/**
|
|
931
1115
|
* Find related chunks using a chunk's stored embedding
|
|
@@ -944,59 +1128,62 @@ export class VectorStore {
|
|
|
944
1128
|
if (!isValidFilePath(filePath)) {
|
|
945
1129
|
throw new DatabaseError(`Invalid file path: contains disallowed characters or patterns`);
|
|
946
1130
|
}
|
|
947
|
-
|
|
948
|
-
|
|
949
|
-
|
|
950
|
-
|
|
951
|
-
.
|
|
952
|
-
|
|
953
|
-
|
|
954
|
-
|
|
955
|
-
|
|
956
|
-
|
|
957
|
-
|
|
958
|
-
const rawVector = sourceChunk?.vector;
|
|
959
|
-
// LanceDB may return vectors as Arrow Vector or Float32Array, not plain Array
|
|
960
|
-
// Convert to number[] for compatibility
|
|
961
|
-
let sourceVector;
|
|
962
|
-
if (rawVector) {
|
|
963
|
-
if (Array.isArray(rawVector)) {
|
|
964
|
-
sourceVector = rawVector;
|
|
1131
|
+
const table = this.table;
|
|
1132
|
+
return withRetry(async () => {
|
|
1133
|
+
try {
|
|
1134
|
+
// First, fetch the source chunk to get its vector
|
|
1135
|
+
const escapedFilePath = filePath.replace(/'/g, "''");
|
|
1136
|
+
const sourceResults = await table
|
|
1137
|
+
.query()
|
|
1138
|
+
.where(`\`filePath\` = '${escapedFilePath}' AND \`chunkIndex\` = ${chunkIndex}`)
|
|
1139
|
+
.toArray();
|
|
1140
|
+
if (sourceResults.length === 0) {
|
|
1141
|
+
return [];
|
|
965
1142
|
}
|
|
966
|
-
|
|
967
|
-
|
|
968
|
-
|
|
1143
|
+
const sourceChunk = sourceResults[0];
|
|
1144
|
+
const rawVector = sourceChunk?.vector;
|
|
1145
|
+
// LanceDB may return vectors as Arrow Vector or Float32Array, not plain Array
|
|
1146
|
+
// Convert to number[] for compatibility
|
|
1147
|
+
let sourceVector;
|
|
1148
|
+
if (rawVector) {
|
|
1149
|
+
if (Array.isArray(rawVector)) {
|
|
1150
|
+
sourceVector = rawVector;
|
|
1151
|
+
}
|
|
1152
|
+
else if (typeof rawVector === 'object' && 'length' in rawVector) {
|
|
1153
|
+
// Handle Arrow Vector, Float32Array, or other array-like objects
|
|
1154
|
+
sourceVector = Array.from(rawVector);
|
|
1155
|
+
}
|
|
969
1156
|
}
|
|
1157
|
+
if (!sourceVector || sourceVector.length === 0) {
|
|
1158
|
+
// Chunk exists but has no embedding (e.g., upload timed out mid-process)
|
|
1159
|
+
// Return empty results instead of throwing - allows batch operations to continue
|
|
1160
|
+
console.warn(`Chunk ${filePath}:${chunkIndex} has no valid vector (possibly corrupted)`);
|
|
1161
|
+
return [];
|
|
1162
|
+
}
|
|
1163
|
+
// Search for similar chunks using the source vector
|
|
1164
|
+
// Request more candidates to allow for filtering
|
|
1165
|
+
const candidateLimit = excludeSameDocument ? limit * 3 : limit + 1;
|
|
1166
|
+
let query = table.vectorSearch(sourceVector).distanceType('dot').limit(candidateLimit);
|
|
1167
|
+
// Apply distance threshold if configured
|
|
1168
|
+
if (this.config.maxDistance !== undefined) {
|
|
1169
|
+
query = query.distanceRange(undefined, this.config.maxDistance);
|
|
1170
|
+
}
|
|
1171
|
+
const vectorResults = await query.toArray();
|
|
1172
|
+
// Convert to SearchResult format with type validation
|
|
1173
|
+
let results = vectorResults.map((result) => toSearchResult(result));
|
|
1174
|
+
// Filter out the source chunk itself
|
|
1175
|
+
results = results.filter((r) => !(r.filePath === filePath && r.chunkIndex === chunkIndex));
|
|
1176
|
+
// Optionally filter out same-document chunks
|
|
1177
|
+
if (excludeSameDocument) {
|
|
1178
|
+
results = results.filter((r) => r.filePath !== filePath);
|
|
1179
|
+
}
|
|
1180
|
+
return results.slice(0, limit);
|
|
970
1181
|
}
|
|
971
|
-
|
|
972
|
-
|
|
973
|
-
|
|
974
|
-
console.warn(`Chunk ${filePath}:${chunkIndex} has no valid vector (possibly corrupted)`);
|
|
975
|
-
return [];
|
|
976
|
-
}
|
|
977
|
-
// Search for similar chunks using the source vector
|
|
978
|
-
// Request more candidates to allow for filtering
|
|
979
|
-
const candidateLimit = excludeSameDocument ? limit * 3 : limit + 1;
|
|
980
|
-
let query = this.table.vectorSearch(sourceVector).distanceType('dot').limit(candidateLimit);
|
|
981
|
-
// Apply distance threshold if configured
|
|
982
|
-
if (this.config.maxDistance !== undefined) {
|
|
983
|
-
query = query.distanceRange(undefined, this.config.maxDistance);
|
|
984
|
-
}
|
|
985
|
-
const vectorResults = await query.toArray();
|
|
986
|
-
// Convert to SearchResult format with type validation
|
|
987
|
-
let results = vectorResults.map((result) => toSearchResult(result));
|
|
988
|
-
// Filter out the source chunk itself
|
|
989
|
-
results = results.filter((r) => !(r.filePath === filePath && r.chunkIndex === chunkIndex));
|
|
990
|
-
// Optionally filter out same-document chunks
|
|
991
|
-
if (excludeSameDocument) {
|
|
992
|
-
results = results.filter((r) => r.filePath !== filePath);
|
|
1182
|
+
catch (error) {
|
|
1183
|
+
const cause = error instanceof Error ? error.message : String(error);
|
|
1184
|
+
throw new DatabaseError(`Failed to find related chunks for: ${filePath}:${chunkIndex}: ${cause}`, error);
|
|
993
1185
|
}
|
|
994
|
-
|
|
995
|
-
}
|
|
996
|
-
catch (error) {
|
|
997
|
-
const cause = error instanceof Error ? error.message : String(error);
|
|
998
|
-
throw new DatabaseError(`Failed to find related chunks for: ${filePath}:${chunkIndex}: ${cause}`, error);
|
|
999
|
-
}
|
|
1186
|
+
}, 'VectorStore.findRelatedChunks');
|
|
1000
1187
|
}
|
|
1001
1188
|
/**
|
|
1002
1189
|
* Get system status
|
|
@@ -1034,7 +1221,13 @@ export class VectorStore {
|
|
|
1034
1221
|
memoryUsage,
|
|
1035
1222
|
uptime,
|
|
1036
1223
|
ftsIndexEnabled: this.ftsEnabled,
|
|
1037
|
-
searchMode:
|
|
1224
|
+
searchMode: (this.config.searchMode ?? SEARCH_MODE) === 'rrf'
|
|
1225
|
+
? ftsEffectivelyEnabled && this.getHybridWeight() > 0
|
|
1226
|
+
? 'hybrid-rrf'
|
|
1227
|
+
: 'vector-rrf'
|
|
1228
|
+
: ftsEffectivelyEnabled && this.getHybridWeight() > 0
|
|
1229
|
+
? 'hybrid'
|
|
1230
|
+
: 'vector-only',
|
|
1038
1231
|
};
|
|
1039
1232
|
}
|
|
1040
1233
|
catch (error) {
|