@cosmocoder/mcp-web-docs 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +22 -0
- package/README.md +368 -0
- package/build/__mocks__/embeddings.d.ts +17 -0
- package/build/__mocks__/embeddings.js +66 -0
- package/build/__mocks__/embeddings.js.map +1 -0
- package/build/config.d.ts +44 -0
- package/build/config.js +158 -0
- package/build/config.js.map +1 -0
- package/build/config.test.d.ts +1 -0
- package/build/config.test.js +165 -0
- package/build/config.test.js.map +1 -0
- package/build/crawler/auth.d.ts +128 -0
- package/build/crawler/auth.js +546 -0
- package/build/crawler/auth.js.map +1 -0
- package/build/crawler/auth.test.d.ts +1 -0
- package/build/crawler/auth.test.js +174 -0
- package/build/crawler/auth.test.js.map +1 -0
- package/build/crawler/base.d.ts +24 -0
- package/build/crawler/base.js +149 -0
- package/build/crawler/base.js.map +1 -0
- package/build/crawler/base.test.d.ts +1 -0
- package/build/crawler/base.test.js +234 -0
- package/build/crawler/base.test.js.map +1 -0
- package/build/crawler/browser-config.d.ts +2 -0
- package/build/crawler/browser-config.js +29 -0
- package/build/crawler/browser-config.js.map +1 -0
- package/build/crawler/browser-config.test.d.ts +1 -0
- package/build/crawler/browser-config.test.js +56 -0
- package/build/crawler/browser-config.test.js.map +1 -0
- package/build/crawler/cheerio.d.ts +11 -0
- package/build/crawler/cheerio.js +134 -0
- package/build/crawler/cheerio.js.map +1 -0
- package/build/crawler/chromium.d.ts +21 -0
- package/build/crawler/chromium.js +596 -0
- package/build/crawler/chromium.js.map +1 -0
- package/build/crawler/content-extractor-types.d.ts +25 -0
- package/build/crawler/content-extractor-types.js +2 -0
- package/build/crawler/content-extractor-types.js.map +1 -0
- package/build/crawler/content-extractors.d.ts +9 -0
- package/build/crawler/content-extractors.js +9 -0
- package/build/crawler/content-extractors.js.map +1 -0
- package/build/crawler/content-utils.d.ts +2 -0
- package/build/crawler/content-utils.js +22 -0
- package/build/crawler/content-utils.js.map +1 -0
- package/build/crawler/content-utils.test.d.ts +1 -0
- package/build/crawler/content-utils.test.js +99 -0
- package/build/crawler/content-utils.test.js.map +1 -0
- package/build/crawler/crawlee-crawler.d.ts +63 -0
- package/build/crawler/crawlee-crawler.js +342 -0
- package/build/crawler/crawlee-crawler.js.map +1 -0
- package/build/crawler/crawlee-crawler.test.d.ts +1 -0
- package/build/crawler/crawlee-crawler.test.js +280 -0
- package/build/crawler/crawlee-crawler.test.js.map +1 -0
- package/build/crawler/default-extractor.d.ts +4 -0
- package/build/crawler/default-extractor.js +26 -0
- package/build/crawler/default-extractor.js.map +1 -0
- package/build/crawler/default-extractor.test.d.ts +1 -0
- package/build/crawler/default-extractor.test.js +200 -0
- package/build/crawler/default-extractor.test.js.map +1 -0
- package/build/crawler/default.d.ts +11 -0
- package/build/crawler/default.js +138 -0
- package/build/crawler/default.js.map +1 -0
- package/build/crawler/docs-crawler.d.ts +26 -0
- package/build/crawler/docs-crawler.js +97 -0
- package/build/crawler/docs-crawler.js.map +1 -0
- package/build/crawler/docs-crawler.test.d.ts +1 -0
- package/build/crawler/docs-crawler.test.js +185 -0
- package/build/crawler/docs-crawler.test.js.map +1 -0
- package/build/crawler/factory.d.ts +6 -0
- package/build/crawler/factory.js +83 -0
- package/build/crawler/factory.js.map +1 -0
- package/build/crawler/github-pages-extractor.d.ts +4 -0
- package/build/crawler/github-pages-extractor.js +33 -0
- package/build/crawler/github-pages-extractor.js.map +1 -0
- package/build/crawler/github-pages-extractor.test.d.ts +1 -0
- package/build/crawler/github-pages-extractor.test.js +184 -0
- package/build/crawler/github-pages-extractor.test.js.map +1 -0
- package/build/crawler/github.d.ts +20 -0
- package/build/crawler/github.js +181 -0
- package/build/crawler/github.js.map +1 -0
- package/build/crawler/github.test.d.ts +1 -0
- package/build/crawler/github.test.js +326 -0
- package/build/crawler/github.test.js.map +1 -0
- package/build/crawler/puppeteer.d.ts +16 -0
- package/build/crawler/puppeteer.js +191 -0
- package/build/crawler/puppeteer.js.map +1 -0
- package/build/crawler/queue-manager.d.ts +43 -0
- package/build/crawler/queue-manager.js +169 -0
- package/build/crawler/queue-manager.js.map +1 -0
- package/build/crawler/queue-manager.test.d.ts +1 -0
- package/build/crawler/queue-manager.test.js +509 -0
- package/build/crawler/queue-manager.test.js.map +1 -0
- package/build/crawler/site-rules.d.ts +11 -0
- package/build/crawler/site-rules.js +104 -0
- package/build/crawler/site-rules.js.map +1 -0
- package/build/crawler/site-rules.test.d.ts +1 -0
- package/build/crawler/site-rules.test.js +139 -0
- package/build/crawler/site-rules.test.js.map +1 -0
- package/build/crawler/storybook-extractor.d.ts +34 -0
- package/build/crawler/storybook-extractor.js +767 -0
- package/build/crawler/storybook-extractor.js.map +1 -0
- package/build/crawler/storybook-extractor.test.d.ts +1 -0
- package/build/crawler/storybook-extractor.test.js +491 -0
- package/build/crawler/storybook-extractor.test.js.map +1 -0
- package/build/embeddings/fastembed.d.ts +25 -0
- package/build/embeddings/fastembed.js +188 -0
- package/build/embeddings/fastembed.js.map +1 -0
- package/build/embeddings/fastembed.test.d.ts +1 -0
- package/build/embeddings/fastembed.test.js +307 -0
- package/build/embeddings/fastembed.test.js.map +1 -0
- package/build/embeddings/openai.d.ts +8 -0
- package/build/embeddings/openai.js +56 -0
- package/build/embeddings/openai.js.map +1 -0
- package/build/embeddings/types.d.ts +4 -0
- package/build/embeddings/types.js +2 -0
- package/build/embeddings/types.js.map +1 -0
- package/build/index.d.ts +2 -0
- package/build/index.js +1007 -0
- package/build/index.js.map +1 -0
- package/build/index.test.d.ts +1 -0
- package/build/index.test.js +364 -0
- package/build/index.test.js.map +1 -0
- package/build/indexing/queue-manager.d.ts +36 -0
- package/build/indexing/queue-manager.js +86 -0
- package/build/indexing/queue-manager.js.map +1 -0
- package/build/indexing/queue-manager.test.d.ts +1 -0
- package/build/indexing/queue-manager.test.js +257 -0
- package/build/indexing/queue-manager.test.js.map +1 -0
- package/build/indexing/status.d.ts +39 -0
- package/build/indexing/status.js +207 -0
- package/build/indexing/status.js.map +1 -0
- package/build/indexing/status.test.d.ts +1 -0
- package/build/indexing/status.test.js +246 -0
- package/build/indexing/status.test.js.map +1 -0
- package/build/processor/content.d.ts +16 -0
- package/build/processor/content.js +286 -0
- package/build/processor/content.js.map +1 -0
- package/build/processor/content.test.d.ts +1 -0
- package/build/processor/content.test.js +369 -0
- package/build/processor/content.test.js.map +1 -0
- package/build/processor/markdown.d.ts +11 -0
- package/build/processor/markdown.js +256 -0
- package/build/processor/markdown.js.map +1 -0
- package/build/processor/markdown.test.d.ts +1 -0
- package/build/processor/markdown.test.js +312 -0
- package/build/processor/markdown.test.js.map +1 -0
- package/build/processor/metadata-parser.d.ts +37 -0
- package/build/processor/metadata-parser.js +245 -0
- package/build/processor/metadata-parser.js.map +1 -0
- package/build/processor/metadata-parser.test.d.ts +1 -0
- package/build/processor/metadata-parser.test.js +357 -0
- package/build/processor/metadata-parser.test.js.map +1 -0
- package/build/processor/processor.d.ts +8 -0
- package/build/processor/processor.js +190 -0
- package/build/processor/processor.js.map +1 -0
- package/build/processor/processor.test.d.ts +1 -0
- package/build/processor/processor.test.js +357 -0
- package/build/processor/processor.test.js.map +1 -0
- package/build/rag/cache.d.ts +10 -0
- package/build/rag/cache.js +10 -0
- package/build/rag/cache.js.map +1 -0
- package/build/rag/code-generator.d.ts +11 -0
- package/build/rag/code-generator.js +30 -0
- package/build/rag/code-generator.js.map +1 -0
- package/build/rag/context-assembler.d.ts +23 -0
- package/build/rag/context-assembler.js +113 -0
- package/build/rag/context-assembler.js.map +1 -0
- package/build/rag/docs-search.d.ts +55 -0
- package/build/rag/docs-search.js +380 -0
- package/build/rag/docs-search.js.map +1 -0
- package/build/rag/pipeline.d.ts +26 -0
- package/build/rag/pipeline.js +91 -0
- package/build/rag/pipeline.js.map +1 -0
- package/build/rag/query-processor.d.ts +14 -0
- package/build/rag/query-processor.js +57 -0
- package/build/rag/query-processor.js.map +1 -0
- package/build/rag/reranker.d.ts +55 -0
- package/build/rag/reranker.js +210 -0
- package/build/rag/reranker.js.map +1 -0
- package/build/rag/response-generator.d.ts +20 -0
- package/build/rag/response-generator.js +101 -0
- package/build/rag/response-generator.js.map +1 -0
- package/build/rag/retriever.d.ts +19 -0
- package/build/rag/retriever.js +111 -0
- package/build/rag/retriever.js.map +1 -0
- package/build/rag/validator.d.ts +22 -0
- package/build/rag/validator.js +128 -0
- package/build/rag/validator.js.map +1 -0
- package/build/rag/version-manager.d.ts +23 -0
- package/build/rag/version-manager.js +98 -0
- package/build/rag/version-manager.js.map +1 -0
- package/build/setupTests.d.ts +4 -0
- package/build/setupTests.js +50 -0
- package/build/setupTests.js.map +1 -0
- package/build/storage/storage.d.ts +38 -0
- package/build/storage/storage.js +700 -0
- package/build/storage/storage.js.map +1 -0
- package/build/storage/storage.test.d.ts +1 -0
- package/build/storage/storage.test.js +338 -0
- package/build/storage/storage.test.js.map +1 -0
- package/build/types/rag.d.ts +27 -0
- package/build/types/rag.js +2 -0
- package/build/types/rag.js.map +1 -0
- package/build/types.d.ts +120 -0
- package/build/types.js +2 -0
- package/build/types.js.map +1 -0
- package/build/util/content-utils.d.ts +31 -0
- package/build/util/content-utils.js +120 -0
- package/build/util/content-utils.js.map +1 -0
- package/build/util/content.d.ts +1 -0
- package/build/util/content.js +16 -0
- package/build/util/content.js.map +1 -0
- package/build/util/docs.d.ts +1 -0
- package/build/util/docs.js +26 -0
- package/build/util/docs.js.map +1 -0
- package/build/util/docs.test.d.ts +1 -0
- package/build/util/docs.test.js +49 -0
- package/build/util/docs.test.js.map +1 -0
- package/build/util/favicon.d.ts +6 -0
- package/build/util/favicon.js +88 -0
- package/build/util/favicon.js.map +1 -0
- package/build/util/favicon.test.d.ts +1 -0
- package/build/util/favicon.test.js +140 -0
- package/build/util/favicon.test.js.map +1 -0
- package/build/util/logger.d.ts +17 -0
- package/build/util/logger.js +72 -0
- package/build/util/logger.js.map +1 -0
- package/build/util/logger.test.d.ts +1 -0
- package/build/util/logger.test.js +46 -0
- package/build/util/logger.test.js.map +1 -0
- package/build/util/security.d.ts +312 -0
- package/build/util/security.js +719 -0
- package/build/util/security.js.map +1 -0
- package/build/util/security.test.d.ts +1 -0
- package/build/util/security.test.js +524 -0
- package/build/util/security.test.js.map +1 -0
- package/build/util/site-detector.d.ts +22 -0
- package/build/util/site-detector.js +42 -0
- package/build/util/site-detector.js.map +1 -0
- package/package.json +112 -0
|
@@ -0,0 +1,700 @@
|
|
|
1
|
+
import sqlite3 from 'sqlite3';
|
|
2
|
+
import { open } from 'sqlite';
|
|
3
|
+
import * as lancedb from '@lancedb/lancedb';
|
|
4
|
+
import { PhraseQuery, MatchQuery, BooleanQuery, Occur } from '@lancedb/lancedb';
|
|
5
|
+
import { Field, FixedSizeList, Float32, Schema, Utf8, Int32 } from 'apache-arrow';
|
|
6
|
+
import QuickLRU from 'quick-lru';
|
|
7
|
+
import { mkdir } from 'fs/promises';
|
|
8
|
+
import { dirname } from 'path';
|
|
9
|
+
import { logger } from '../util/logger.js';
|
|
10
|
+
import { escapeFilterValue } from '../util/security.js';
|
|
11
|
+
/**
 * Preprocesses a search query while staying generic for any documentation type.
 *
 * Only explicitly double-quoted segments are extracted as phrases; everything
 * else is passed through untouched so the search backend's own tokenization
 * can deal with it.
 *
 * @param {string} query - Raw query text as typed by the user.
 * @returns {{ phrases: string[], cleanedQuery: string, original: string }}
 *   `phrases` holds the trimmed, non-blank contents of each quoted segment,
 *   `cleanedQuery` is the query with the quotes around those segments removed
 *   and surrounding whitespace trimmed, and `original` is the input unchanged.
 */
function preprocessQuery(query) {
    // Matches a pair of double quotes with at least one character between them.
    const quotedPattern = /"([^"]+)"/g;
    const phrases = [];
    for (const [, quotedText] of query.matchAll(quotedPattern)) {
        const phrase = quotedText.trim();
        // A segment such as `" "` has nothing to match exactly; recording it
        // would make callers build a mandatory phrase query with no terms.
        if (phrase.length > 0) {
            phrases.push(phrase);
        }
    }
    const result = {
        phrases,
        // Drop the quote characters but keep the quoted words in the general query.
        cleanedQuery: query.replace(quotedPattern, '$1').trim(),
        original: query,
    };
    logger.debug('[QueryPreprocess] Processed query:', result);
    return result;
}
|
|
33
|
+
export class DocumentStore {
|
|
34
|
+
dbPath;
|
|
35
|
+
vectorDbPath;
|
|
36
|
+
embeddings;
|
|
37
|
+
sqliteDb;
|
|
38
|
+
lanceConn;
|
|
39
|
+
lanceTable;
|
|
40
|
+
searchCache;
|
|
41
|
+
ftsIndexCreated = false;
|
|
42
|
+
constructor(dbPath, vectorDbPath, embeddings, maxCacheSize = 1000) {
|
|
43
|
+
this.dbPath = dbPath;
|
|
44
|
+
this.vectorDbPath = vectorDbPath;
|
|
45
|
+
this.embeddings = embeddings;
|
|
46
|
+
logger.debug(`[DocumentStore] Initializing with paths:`, {
|
|
47
|
+
dbPath,
|
|
48
|
+
vectorDbPath,
|
|
49
|
+
maxCacheSize,
|
|
50
|
+
});
|
|
51
|
+
this.searchCache = new QuickLRU({ maxSize: maxCacheSize });
|
|
52
|
+
}
|
|
53
|
+
async initialize() {
|
|
54
|
+
logger.debug(`[DocumentStore] Starting initialization with paths:`, {
|
|
55
|
+
dbPath: this.dbPath,
|
|
56
|
+
vectorDbPath: this.vectorDbPath,
|
|
57
|
+
});
|
|
58
|
+
try {
|
|
59
|
+
// Create directories with error handling
|
|
60
|
+
try {
|
|
61
|
+
logger.debug(`[DocumentStore] Creating SQLite directory: ${dirname(this.dbPath)}`);
|
|
62
|
+
await mkdir(dirname(this.dbPath), { recursive: true });
|
|
63
|
+
logger.debug(`[DocumentStore] Creating LanceDB directory: ${this.vectorDbPath}`);
|
|
64
|
+
await mkdir(this.vectorDbPath, { recursive: true });
|
|
65
|
+
}
|
|
66
|
+
catch (error) {
|
|
67
|
+
logger.error('[DocumentStore] Error creating directories:', error);
|
|
68
|
+
throw new Error(`Failed to create storage directories: ${error instanceof Error ? error.message : String(error)}`);
|
|
69
|
+
}
|
|
70
|
+
// Initialize SQLite with error handling
|
|
71
|
+
try {
|
|
72
|
+
logger.debug(`[DocumentStore] Opening SQLite database at ${this.dbPath}`);
|
|
73
|
+
this.sqliteDb = await open({
|
|
74
|
+
filename: this.dbPath,
|
|
75
|
+
driver: sqlite3.Database,
|
|
76
|
+
});
|
|
77
|
+
logger.debug(`[DocumentStore] Configuring SQLite database`);
|
|
78
|
+
await this.sqliteDb.exec('PRAGMA busy_timeout = 5000;');
|
|
79
|
+
await this.sqliteDb.exec('PRAGMA journal_mode = WAL;');
|
|
80
|
+
}
|
|
81
|
+
catch (error) {
|
|
82
|
+
logger.error('[DocumentStore] Error initializing SQLite:', error);
|
|
83
|
+
throw new Error(`Failed to initialize SQLite: ${error instanceof Error ? error.message : String(error)}`);
|
|
84
|
+
}
|
|
85
|
+
// Create tables if they don't exist
|
|
86
|
+
await this.sqliteDb.exec(`
|
|
87
|
+
CREATE TABLE IF NOT EXISTS documents (
|
|
88
|
+
url TEXT PRIMARY KEY,
|
|
89
|
+
title TEXT NOT NULL,
|
|
90
|
+
favicon TEXT,
|
|
91
|
+
last_indexed DATETIME NOT NULL
|
|
92
|
+
);
|
|
93
|
+
CREATE INDEX IF NOT EXISTS idx_last_indexed ON documents(last_indexed);
|
|
94
|
+
`);
|
|
95
|
+
// Initialize LanceDB with error handling
|
|
96
|
+
try {
|
|
97
|
+
logger.debug(`[DocumentStore] Connecting to LanceDB at ${this.vectorDbPath}`);
|
|
98
|
+
this.lanceConn = await lancedb.connect(this.vectorDbPath);
|
|
99
|
+
logger.debug(`[DocumentStore] Getting table list`);
|
|
100
|
+
const tableNames = await this.lanceConn.tableNames();
|
|
101
|
+
logger.debug(`[DocumentStore] Existing tables:`, tableNames);
|
|
102
|
+
// Only create the table if it doesn't exist
|
|
103
|
+
if (!tableNames.includes('chunks')) {
|
|
104
|
+
logger.debug(`[DocumentStore] Creating chunks table with dimensions: ${this.embeddings.dimensions}`);
|
|
105
|
+
// Define schema using Apache Arrow
|
|
106
|
+
const vectorType = new FixedSizeList(this.embeddings.dimensions, new Field('item', new Float32(), true));
|
|
107
|
+
const schema = new Schema([
|
|
108
|
+
new Field('url', new Utf8(), false),
|
|
109
|
+
new Field('title', new Utf8(), false),
|
|
110
|
+
new Field('content', new Utf8(), false),
|
|
111
|
+
new Field('path', new Utf8(), false),
|
|
112
|
+
new Field('startLine', new Int32(), false),
|
|
113
|
+
new Field('endLine', new Int32(), false),
|
|
114
|
+
new Field('vector', vectorType, false),
|
|
115
|
+
new Field('type', new Utf8(), false),
|
|
116
|
+
new Field('lastUpdated', new Utf8(), false),
|
|
117
|
+
new Field('version', new Utf8(), true),
|
|
118
|
+
new Field('framework', new Utf8(), true),
|
|
119
|
+
new Field('language', new Utf8(), true),
|
|
120
|
+
// Flatten arrays to simple strings for better FTS support
|
|
121
|
+
new Field('codeBlocks', new Utf8(), true),
|
|
122
|
+
new Field('props', new Utf8(), true),
|
|
123
|
+
]);
|
|
124
|
+
// Create empty table with schema
|
|
125
|
+
this.lanceTable = await this.lanceConn.createEmptyTable('chunks', schema, { mode: 'create' });
|
|
126
|
+
logger.debug(`[DocumentStore] New chunks table created successfully`);
|
|
127
|
+
// Create FTS index for better text search
|
|
128
|
+
await this.createFTSIndex();
|
|
129
|
+
}
|
|
130
|
+
else {
|
|
131
|
+
logger.debug(`[DocumentStore] Using existing chunks table`);
|
|
132
|
+
this.lanceTable = await this.lanceConn.openTable('chunks');
|
|
133
|
+
// Try to create FTS index if it doesn't exist
|
|
134
|
+
await this.createFTSIndex();
|
|
135
|
+
}
|
|
136
|
+
// Verify table is accessible
|
|
137
|
+
const rowCount = await this.lanceTable.countRows();
|
|
138
|
+
logger.debug(`[DocumentStore] Chunks table initialized, contains ${rowCount} rows`);
|
|
139
|
+
}
|
|
140
|
+
catch (error) {
|
|
141
|
+
logger.error('[DocumentStore] Error initializing LanceDB:', error);
|
|
142
|
+
throw new Error(`Failed to initialize LanceDB: ${error instanceof Error ? error.message : String(error)}`);
|
|
143
|
+
}
|
|
144
|
+
logger.debug(`[DocumentStore] All storage components initialized successfully`);
|
|
145
|
+
}
|
|
146
|
+
catch (error) {
|
|
147
|
+
logger.error('[DocumentStore] Error initializing storage:', error);
|
|
148
|
+
throw error;
|
|
149
|
+
}
|
|
150
|
+
}
|
|
151
|
+
async addDocument(doc) {
|
|
152
|
+
logger.debug(`[DocumentStore] Starting addDocument for:`, {
|
|
153
|
+
url: doc.metadata.url,
|
|
154
|
+
title: doc.metadata.title,
|
|
155
|
+
chunks: doc.chunks.length,
|
|
156
|
+
});
|
|
157
|
+
// Add diagnostic logging for vector dimensions
|
|
158
|
+
if (doc.chunks.length > 0) {
|
|
159
|
+
logger.debug(`[DocumentStore] Sample vector dimensions: ${doc.chunks[0].vector.length}`);
|
|
160
|
+
logger.debug(`[DocumentStore] Sample vector first 5 values: ${doc.chunks[0].vector.slice(0, 5)}`);
|
|
161
|
+
}
|
|
162
|
+
// Validate storage initialization
|
|
163
|
+
if (!this.sqliteDb) {
|
|
164
|
+
logger.debug('[DocumentStore] SQLite not initialized during addDocument');
|
|
165
|
+
throw new Error('SQLite storage not initialized');
|
|
166
|
+
}
|
|
167
|
+
if (!this.lanceTable) {
|
|
168
|
+
logger.debug('[DocumentStore] LanceDB not initialized during addDocument');
|
|
169
|
+
throw new Error('LanceDB storage not initialized');
|
|
170
|
+
}
|
|
171
|
+
try {
|
|
172
|
+
// Check if document already exists
|
|
173
|
+
const existing = await this.getDocument(doc.metadata.url);
|
|
174
|
+
if (existing) {
|
|
175
|
+
logger.debug(`[DocumentStore] Existing document found, will update:`, existing);
|
|
176
|
+
}
|
|
177
|
+
logger.debug(`[DocumentStore] Starting SQLite transaction`);
|
|
178
|
+
await this.sqliteDb.run('BEGIN TRANSACTION');
|
|
179
|
+
// Add metadata to SQLite
|
|
180
|
+
await this.sqliteDb.run('INSERT OR REPLACE INTO documents (url, title, favicon, last_indexed) VALUES (?, ?, ?, ?)', [
|
|
181
|
+
doc.metadata.url,
|
|
182
|
+
doc.metadata.title,
|
|
183
|
+
doc.metadata.favicon,
|
|
184
|
+
doc.metadata.lastIndexed.toISOString(),
|
|
185
|
+
]);
|
|
186
|
+
logger.debug(`[DocumentStore] Added metadata to SQLite`);
|
|
187
|
+
// Delete existing chunks for this document (using escaped value to prevent injection)
|
|
188
|
+
await this.lanceTable.delete(`url = '${escapeFilterValue(doc.metadata.url)}'`);
|
|
189
|
+
logger.debug(`[DocumentStore] Deleted existing chunks`);
|
|
190
|
+
// Add new chunks to LanceDB
|
|
191
|
+
const rows = doc.chunks.map((chunk) => ({
|
|
192
|
+
url: doc.metadata.url,
|
|
193
|
+
title: doc.metadata.title,
|
|
194
|
+
content: chunk.content,
|
|
195
|
+
path: chunk.path,
|
|
196
|
+
startLine: chunk.startLine,
|
|
197
|
+
endLine: chunk.endLine,
|
|
198
|
+
vector: chunk.vector,
|
|
199
|
+
type: chunk.metadata.type,
|
|
200
|
+
lastUpdated: new Date().toISOString(),
|
|
201
|
+
version: '',
|
|
202
|
+
framework: '',
|
|
203
|
+
language: '',
|
|
204
|
+
// Serialize code blocks and props as JSON strings
|
|
205
|
+
codeBlocks: JSON.stringify(chunk.metadata.codeBlocks || []),
|
|
206
|
+
props: JSON.stringify(chunk.metadata.props || []),
|
|
207
|
+
}));
|
|
208
|
+
logger.debug(`[DocumentStore] Adding ${rows.length} chunks to LanceDB`);
|
|
209
|
+
await this.lanceTable.add(rows);
|
|
210
|
+
// Verify data was added
|
|
211
|
+
const rowCount = await this.lanceTable.countRows();
|
|
212
|
+
logger.debug(`[DocumentStore] Table now contains ${rowCount} rows`);
|
|
213
|
+
// Commit transaction
|
|
214
|
+
await this.sqliteDb.run('COMMIT');
|
|
215
|
+
logger.debug(`[DocumentStore] Committed transaction`);
|
|
216
|
+
// Clear search cache for this URL
|
|
217
|
+
this.clearCacheForUrl(doc.metadata.url);
|
|
218
|
+
}
|
|
219
|
+
catch (error) {
|
|
220
|
+
// Rollback on error
|
|
221
|
+
if (this.sqliteDb) {
|
|
222
|
+
await this.sqliteDb.run('ROLLBACK');
|
|
223
|
+
}
|
|
224
|
+
logger.error('[DocumentStore] Error adding document:', error);
|
|
225
|
+
throw error;
|
|
226
|
+
}
|
|
227
|
+
}
|
|
228
|
+
async searchDocuments(queryVector, options = {}) {
|
|
229
|
+
if (!this.lanceTable) {
|
|
230
|
+
throw new Error('Storage not initialized');
|
|
231
|
+
}
|
|
232
|
+
const { limit = 10, includeVectors = false, filterByType, textQuery } = options;
|
|
233
|
+
logger.debug(`[DocumentStore] Searching documents with vector:`, {
|
|
234
|
+
dimensions: queryVector.length,
|
|
235
|
+
limit,
|
|
236
|
+
includeVectors,
|
|
237
|
+
filterByType,
|
|
238
|
+
hasTextQuery: !!textQuery,
|
|
239
|
+
});
|
|
240
|
+
// Add validation for query vector
|
|
241
|
+
if (queryVector.length === 0 && !textQuery) {
|
|
242
|
+
logger.debug('[DocumentStore] Empty query vector and no text query provided');
|
|
243
|
+
return [];
|
|
244
|
+
}
|
|
245
|
+
// Log search parameters
|
|
246
|
+
logger.debug(`[DocumentStore] Search parameters:`, {
|
|
247
|
+
vectorDimensions: queryVector.length,
|
|
248
|
+
expectedDimensions: this.embeddings.dimensions,
|
|
249
|
+
limit,
|
|
250
|
+
filterType: filterByType,
|
|
251
|
+
});
|
|
252
|
+
// Ensure vector dimensions match if provided
|
|
253
|
+
if (queryVector.length > 0 && queryVector.length !== this.embeddings.dimensions) {
|
|
254
|
+
logger.debug(`[DocumentStore] Vector dimension mismatch: got ${queryVector.length}, expected ${this.embeddings.dimensions}`);
|
|
255
|
+
// Consider padding or truncating the vector to match expected dimensions
|
|
256
|
+
if (queryVector.length < this.embeddings.dimensions) {
|
|
257
|
+
// Pad the vector with zeros
|
|
258
|
+
queryVector = [...queryVector, ...new Array(this.embeddings.dimensions - queryVector.length).fill(0)];
|
|
259
|
+
logger.debug(`[DocumentStore] Padded vector to ${queryVector.length} dimensions`);
|
|
260
|
+
}
|
|
261
|
+
else {
|
|
262
|
+
// Truncate the vector
|
|
263
|
+
queryVector = queryVector.slice(0, this.embeddings.dimensions);
|
|
264
|
+
logger.debug(`[DocumentStore] Truncated vector to ${queryVector.length} dimensions`);
|
|
265
|
+
}
|
|
266
|
+
}
|
|
267
|
+
try {
|
|
268
|
+
// Log query vector for debugging
|
|
269
|
+
logger.debug(`[DocumentStore] Query vector first 5 values: ${queryVector.slice(0, 5)}`);
|
|
270
|
+
// Ensure we have a valid query vector
|
|
271
|
+
if (queryVector.length === 0) {
|
|
272
|
+
logger.debug('[DocumentStore] Empty query vector provided for search');
|
|
273
|
+
// Use a default vector of the correct dimension instead of an empty array
|
|
274
|
+
queryVector = new Array(this.embeddings.dimensions).fill(0);
|
|
275
|
+
logger.debug(`[DocumentStore] Using default zero vector with ${queryVector.length} dimensions`);
|
|
276
|
+
}
|
|
277
|
+
// Create search query
|
|
278
|
+
let query = this.lanceTable.search(queryVector).limit(limit);
|
|
279
|
+
if (filterByType) {
|
|
280
|
+
query = query.where(`type = '${escapeFilterValue(filterByType)}'`);
|
|
281
|
+
}
|
|
282
|
+
const results = await query.toArray();
|
|
283
|
+
logger.debug(`[DocumentStore] Found ${results.length} results`);
|
|
284
|
+
// Log the first result for debugging if available
|
|
285
|
+
if (results.length > 0) {
|
|
286
|
+
logger.debug(`[DocumentStore] First result:`, {
|
|
287
|
+
id: results[0].id,
|
|
288
|
+
score: results[0].score,
|
|
289
|
+
hasVector: 'vector' in results[0],
|
|
290
|
+
vectorType: typeof results[0].vector,
|
|
291
|
+
vectorLength: Array.isArray(results[0].vector) ? results[0].vector.length : 'not an array',
|
|
292
|
+
});
|
|
293
|
+
}
|
|
294
|
+
const searchResults = results.map((result) => {
|
|
295
|
+
// Log the raw result for debugging
|
|
296
|
+
logger.debug(`[DocumentStore] Raw search result:`, {
|
|
297
|
+
id: result.id,
|
|
298
|
+
url: result.url,
|
|
299
|
+
hasVector: !!result.vector,
|
|
300
|
+
vectorType: result.vector ? typeof result.vector : 'undefined',
|
|
301
|
+
vectorLength: result.vector ? (Array.isArray(result.vector) ? result.vector.length : 'not an array') : 0,
|
|
302
|
+
});
|
|
303
|
+
// Parse JSON fields
|
|
304
|
+
let codeBlocks;
|
|
305
|
+
let props;
|
|
306
|
+
try {
|
|
307
|
+
codeBlocks = result.codeBlocks ? JSON.parse(result.codeBlocks) : undefined;
|
|
308
|
+
}
|
|
309
|
+
catch {
|
|
310
|
+
codeBlocks = undefined;
|
|
311
|
+
}
|
|
312
|
+
try {
|
|
313
|
+
props = result.props ? JSON.parse(result.props) : undefined;
|
|
314
|
+
}
|
|
315
|
+
catch {
|
|
316
|
+
props = undefined;
|
|
317
|
+
}
|
|
318
|
+
return {
|
|
319
|
+
id: String(result.id || result.url),
|
|
320
|
+
content: String(result.content),
|
|
321
|
+
url: String(result.url),
|
|
322
|
+
title: String(result.title),
|
|
323
|
+
score: result._distance != null ? 1 - result._distance : (result.score ?? 0),
|
|
324
|
+
...(includeVectors && { vector: result.vector }),
|
|
325
|
+
metadata: {
|
|
326
|
+
type: (result.type || 'overview'),
|
|
327
|
+
path: String(result.path),
|
|
328
|
+
lastUpdated: new Date(result.lastUpdated ? String(result.lastUpdated) : Date.now()),
|
|
329
|
+
version: result.version,
|
|
330
|
+
framework: result.framework,
|
|
331
|
+
language: result.language,
|
|
332
|
+
codeBlocks,
|
|
333
|
+
props,
|
|
334
|
+
},
|
|
335
|
+
};
|
|
336
|
+
});
|
|
337
|
+
return searchResults;
|
|
338
|
+
}
|
|
339
|
+
catch (error) {
|
|
340
|
+
logger.error('[DocumentStore] Error searching documents:', error);
|
|
341
|
+
throw error;
|
|
342
|
+
}
|
|
343
|
+
}
|
|
344
|
+
/**
|
|
345
|
+
* Create full-text search index on the content field
|
|
346
|
+
*/
|
|
347
|
+
async createFTSIndex() {
|
|
348
|
+
if (!this.lanceTable || this.ftsIndexCreated) {
|
|
349
|
+
return;
|
|
350
|
+
}
|
|
351
|
+
try {
|
|
352
|
+
logger.debug('[DocumentStore] Creating FTS index on content field...');
|
|
353
|
+
await this.lanceTable.createIndex('content', {
|
|
354
|
+
config: lancedb.Index.fts(),
|
|
355
|
+
});
|
|
356
|
+
this.ftsIndexCreated = true;
|
|
357
|
+
logger.debug('[DocumentStore] FTS index created successfully');
|
|
358
|
+
}
|
|
359
|
+
catch (error) {
|
|
360
|
+
const err = error;
|
|
361
|
+
if (err.message?.toLowerCase().includes('already exists')) {
|
|
362
|
+
logger.debug('[DocumentStore] FTS index already exists');
|
|
363
|
+
this.ftsIndexCreated = true;
|
|
364
|
+
}
|
|
365
|
+
else {
|
|
366
|
+
logger.warn('[DocumentStore] Failed to create FTS index:', err.message);
|
|
367
|
+
// Don't throw - FTS is optional, we can fall back to vector search
|
|
368
|
+
}
|
|
369
|
+
}
|
|
370
|
+
}
|
|
371
|
+
async searchByText(query, options = {}) {
|
|
372
|
+
logger.debug(`[DocumentStore] Searching documents by text:`, { query, options });
|
|
373
|
+
const cacheKey = `text:${query}:${JSON.stringify(options)}`;
|
|
374
|
+
const cached = this.searchCache.get(cacheKey);
|
|
375
|
+
if (cached) {
|
|
376
|
+
logger.debug(`[DocumentStore] Returning cached results`);
|
|
377
|
+
return cached;
|
|
378
|
+
}
|
|
379
|
+
const { limit = 10, filterByType, filterUrl } = options;
|
|
380
|
+
// Build WHERE clause for filtering (using escaped values to prevent injection)
|
|
381
|
+
const buildWhereClause = () => {
|
|
382
|
+
const conditions = [];
|
|
383
|
+
if (filterByType) {
|
|
384
|
+
conditions.push(`type = '${escapeFilterValue(filterByType)}'`);
|
|
385
|
+
}
|
|
386
|
+
if (filterUrl) {
|
|
387
|
+
// Filter by base URL - use LIKE to match URLs that start with the base URL
|
|
388
|
+
// Escape the filterUrl and also escape LIKE wildcards within the value
|
|
389
|
+
const escapedUrl = escapeFilterValue(filterUrl).replace(/%/g, '\\%').replace(/_/g, '\\_');
|
|
390
|
+
conditions.push(`url LIKE '${escapedUrl}%'`);
|
|
391
|
+
}
|
|
392
|
+
return conditions.length > 0 ? conditions.join(' AND ') : undefined;
|
|
393
|
+
};
|
|
394
|
+
const whereClause = buildWhereClause();
|
|
395
|
+
try {
|
|
396
|
+
if (!this.lanceTable) {
|
|
397
|
+
throw new Error('Storage not initialized');
|
|
398
|
+
}
|
|
399
|
+
// Preprocess query - only extracts quoted phrases, keeps everything else generic
|
|
400
|
+
const processedQuery = preprocessQuery(query);
|
|
401
|
+
// Generate embedding for vector search
|
|
402
|
+
const queryVector = await this.embeddings.embed(query);
|
|
403
|
+
logger.debug('[DocumentStore] Attempting hybrid search (FTS + vector with RRF)');
|
|
404
|
+
// Strategy 1: If user provided quoted phrases, use phrase matching
|
|
405
|
+
if (this.ftsIndexCreated && processedQuery.phrases.length > 0) {
|
|
406
|
+
try {
|
|
407
|
+
logger.debug('[DocumentStore] Using phrase-based search for quoted terms:', processedQuery.phrases);
|
|
408
|
+
// Build boolean query: phrase matches (must) + general terms (should)
|
|
409
|
+
const queries = [];
|
|
410
|
+
// Add phrase queries for quoted phrases (exact match)
|
|
411
|
+
for (const phrase of processedQuery.phrases) {
|
|
412
|
+
queries.push([Occur.Must, new PhraseQuery(phrase, 'content', { slop: 0 })]);
|
|
413
|
+
}
|
|
414
|
+
// Add fuzzy match for the overall cleaned query
|
|
415
|
+
if (processedQuery.cleanedQuery) {
|
|
416
|
+
queries.push([Occur.Should, new MatchQuery(processedQuery.cleanedQuery, 'content', { fuzziness: 1 })]);
|
|
417
|
+
}
|
|
418
|
+
const boolQuery = new BooleanQuery(queries);
|
|
419
|
+
let ftsQuery = this.lanceTable
|
|
420
|
+
.query()
|
|
421
|
+
.fullTextSearch(boolQuery)
|
|
422
|
+
.limit(limit * 2);
|
|
423
|
+
if (whereClause) {
|
|
424
|
+
ftsQuery = ftsQuery.where(whereClause);
|
|
425
|
+
}
|
|
426
|
+
const ftsResults = await ftsQuery.toArray();
|
|
427
|
+
logger.debug(`[DocumentStore] Phrase-based FTS returned ${ftsResults.length} results`);
|
|
428
|
+
if (ftsResults.length > 0) {
|
|
429
|
+
// Combine with vector search for semantic relevance
|
|
430
|
+
let vectorQuery = this.lanceTable.search(queryVector).limit(limit * 2);
|
|
431
|
+
if (whereClause) {
|
|
432
|
+
vectorQuery = vectorQuery.where(whereClause);
|
|
433
|
+
}
|
|
434
|
+
const vectorResults = await vectorQuery.toArray();
|
|
435
|
+
const mergedResults = this.mergeAndRankResults(ftsResults, vectorResults, limit);
|
|
436
|
+
const searchResults = this.formatSearchResults(mergedResults);
|
|
437
|
+
this.searchCache.set(cacheKey, searchResults);
|
|
438
|
+
return searchResults;
|
|
439
|
+
}
|
|
440
|
+
}
|
|
441
|
+
catch (phraseError) {
|
|
442
|
+
const err = phraseError;
|
|
443
|
+
logger.debug('[DocumentStore] Phrase-based search failed:', err.message);
|
|
444
|
+
}
|
|
445
|
+
}
|
|
446
|
+
// Strategy 2: Standard hybrid search - FTS with fuzziness + vector search
|
|
447
|
+
if (this.ftsIndexCreated) {
|
|
448
|
+
try {
|
|
449
|
+
// LanceDB's FTS already handles stop words and stemming
|
|
450
|
+
// Add fuzziness for typo tolerance
|
|
451
|
+
const matchQuery = new MatchQuery(processedQuery.cleanedQuery, 'content', { fuzziness: 1 });
|
|
452
|
+
let ftsQuery = this.lanceTable
|
|
453
|
+
.query()
|
|
454
|
+
.fullTextSearch(matchQuery)
|
|
455
|
+
.limit(limit * 2);
|
|
456
|
+
if (whereClause) {
|
|
457
|
+
ftsQuery = ftsQuery.where(whereClause);
|
|
458
|
+
}
|
|
459
|
+
const ftsResults = await ftsQuery.toArray();
|
|
460
|
+
logger.debug(`[DocumentStore] FTS returned ${ftsResults.length} results`);
|
|
461
|
+
// Always combine with vector search for best results
|
|
462
|
+
let vectorQuery = this.lanceTable.search(queryVector).limit(limit * 2);
|
|
463
|
+
if (whereClause) {
|
|
464
|
+
vectorQuery = vectorQuery.where(whereClause);
|
|
465
|
+
}
|
|
466
|
+
const vectorResults = await vectorQuery.toArray();
|
|
467
|
+
logger.debug(`[DocumentStore] Vector search returned ${vectorResults.length} results`);
|
|
468
|
+
// Merge using RRF even if one is empty - ensures we get results
|
|
469
|
+
const mergedResults = this.mergeAndRankResults(ftsResults, vectorResults, limit);
|
|
470
|
+
if (mergedResults.length > 0) {
|
|
471
|
+
const searchResults = this.formatSearchResults(mergedResults);
|
|
472
|
+
this.searchCache.set(cacheKey, searchResults);
|
|
473
|
+
return searchResults;
|
|
474
|
+
}
|
|
475
|
+
}
|
|
476
|
+
catch (ftsError) {
|
|
477
|
+
const err = ftsError;
|
|
478
|
+
logger.debug('[DocumentStore] FTS search failed, falling back to vector search:', err.message);
|
|
479
|
+
}
|
|
480
|
+
}
|
|
481
|
+
// Strategy 3: Fallback to pure vector search (semantic similarity)
|
|
482
|
+
logger.debug('[DocumentStore] Falling back to pure vector search');
|
|
483
|
+
const results = await this.searchDocuments(queryVector, options);
|
|
484
|
+
this.searchCache.set(cacheKey, results);
|
|
485
|
+
return results;
|
|
486
|
+
}
|
|
487
|
+
catch (error) {
|
|
488
|
+
logger.error('[DocumentStore] Error searching documents by text:', error);
|
|
489
|
+
throw error;
|
|
490
|
+
}
|
|
491
|
+
}
|
|
492
|
+
/**
|
|
493
|
+
* Merge FTS and vector results using Reciprocal Rank Fusion (RRF)
|
|
494
|
+
*/
|
|
495
|
+
mergeAndRankResults(ftsResults, vectorResults, limit) {
|
|
496
|
+
const k = 60; // RRF constant
|
|
497
|
+
const scores = new Map();
|
|
498
|
+
// Score FTS results
|
|
499
|
+
ftsResults.forEach((result, rank) => {
|
|
500
|
+
const key = `${result.url}:${result.path}:${result.startLine}`;
|
|
501
|
+
const rrfScore = 1 / (k + rank + 1);
|
|
502
|
+
scores.set(key, { result, score: rrfScore });
|
|
503
|
+
});
|
|
504
|
+
// Add/combine vector results
|
|
505
|
+
vectorResults.forEach((result, rank) => {
|
|
506
|
+
const key = `${result.url}:${result.path}:${result.startLine}`;
|
|
507
|
+
const rrfScore = 1 / (k + rank + 1);
|
|
508
|
+
if (scores.has(key)) {
|
|
509
|
+
// Combine scores if result appears in both
|
|
510
|
+
const existing = scores.get(key);
|
|
511
|
+
existing.score += rrfScore;
|
|
512
|
+
}
|
|
513
|
+
else {
|
|
514
|
+
scores.set(key, { result, score: rrfScore });
|
|
515
|
+
}
|
|
516
|
+
});
|
|
517
|
+
// Sort by combined RRF score and return top results
|
|
518
|
+
return Array.from(scores.values())
|
|
519
|
+
.sort((a, b) => b.score - a.score)
|
|
520
|
+
.slice(0, limit)
|
|
521
|
+
.map((item) => ({ ...item.result, _rrfScore: item.score }));
|
|
522
|
+
}
|
|
523
|
+
/**
|
|
524
|
+
* Format raw LanceDB results into SearchResult objects
|
|
525
|
+
*/
|
|
526
|
+
formatSearchResults(results) {
|
|
527
|
+
return results.map((result) => {
|
|
528
|
+
let codeBlocks, props;
|
|
529
|
+
try {
|
|
530
|
+
codeBlocks = result.codeBlocks ? JSON.parse(result.codeBlocks) : undefined;
|
|
531
|
+
}
|
|
532
|
+
catch {
|
|
533
|
+
codeBlocks = undefined;
|
|
534
|
+
}
|
|
535
|
+
try {
|
|
536
|
+
props = result.props ? JSON.parse(result.props) : undefined;
|
|
537
|
+
}
|
|
538
|
+
catch {
|
|
539
|
+
props = undefined;
|
|
540
|
+
}
|
|
541
|
+
return {
|
|
542
|
+
id: String(result.url),
|
|
543
|
+
content: String(result.content),
|
|
544
|
+
url: String(result.url),
|
|
545
|
+
title: String(result.title),
|
|
546
|
+
score: result._rrfScore ?? (result._distance != null ? 1 - result._distance : (result._score ?? 0)),
|
|
547
|
+
metadata: {
|
|
548
|
+
type: (result.type || 'overview'),
|
|
549
|
+
path: String(result.path),
|
|
550
|
+
lastUpdated: new Date(result.lastUpdated ? String(result.lastUpdated) : Date.now()),
|
|
551
|
+
version: result.version,
|
|
552
|
+
framework: result.framework,
|
|
553
|
+
language: result.language,
|
|
554
|
+
codeBlocks,
|
|
555
|
+
props,
|
|
556
|
+
},
|
|
557
|
+
};
|
|
558
|
+
});
|
|
559
|
+
}
|
|
560
|
+
async listDocuments() {
|
|
561
|
+
if (!this.sqliteDb) {
|
|
562
|
+
throw new Error('Storage not initialized');
|
|
563
|
+
}
|
|
564
|
+
logger.debug(`[DocumentStore] Listing documents`);
|
|
565
|
+
try {
|
|
566
|
+
const rows = await this.sqliteDb.all('SELECT url, title, favicon, last_indexed FROM documents ORDER BY last_indexed DESC');
|
|
567
|
+
logger.debug(`[DocumentStore] Found ${rows.length} documents`);
|
|
568
|
+
return rows.map((row) => ({
|
|
569
|
+
url: row.url,
|
|
570
|
+
title: row.title,
|
|
571
|
+
favicon: row.favicon ?? undefined,
|
|
572
|
+
lastIndexed: new Date(row.last_indexed),
|
|
573
|
+
}));
|
|
574
|
+
}
|
|
575
|
+
catch (error) {
|
|
576
|
+
logger.error('[DocumentStore] Error listing documents:', error);
|
|
577
|
+
throw error;
|
|
578
|
+
}
|
|
579
|
+
}
|
|
580
|
+
async deleteDocument(url) {
|
|
581
|
+
if (!this.sqliteDb || !this.lanceTable) {
|
|
582
|
+
throw new Error('Storage not initialized');
|
|
583
|
+
}
|
|
584
|
+
logger.debug(`[DocumentStore] Deleting document: ${url}`);
|
|
585
|
+
try {
|
|
586
|
+
await this.sqliteDb.run('BEGIN TRANSACTION');
|
|
587
|
+
await this.sqliteDb.run('DELETE FROM documents WHERE url = ?', [url]);
|
|
588
|
+
await this.lanceTable.delete(`url = '${escapeFilterValue(url)}'`);
|
|
589
|
+
await this.sqliteDb.run('COMMIT');
|
|
590
|
+
// Clear cache for this URL
|
|
591
|
+
this.clearCacheForUrl(url);
|
|
592
|
+
logger.debug(`[DocumentStore] Document deleted successfully`);
|
|
593
|
+
}
|
|
594
|
+
catch (error) {
|
|
595
|
+
if (this.sqliteDb) {
|
|
596
|
+
await this.sqliteDb.run('ROLLBACK');
|
|
597
|
+
}
|
|
598
|
+
logger.error('[DocumentStore] Error deleting document:', error);
|
|
599
|
+
throw error;
|
|
600
|
+
}
|
|
601
|
+
}
|
|
602
|
+
async getDocument(url) {
|
|
603
|
+
if (!this.sqliteDb) {
|
|
604
|
+
throw new Error('Storage not initialized');
|
|
605
|
+
}
|
|
606
|
+
logger.debug(`[DocumentStore] Getting document: ${url}`);
|
|
607
|
+
try {
|
|
608
|
+
// Check if SQLite is properly initialized
|
|
609
|
+
if (!this.sqliteDb) {
|
|
610
|
+
logger.debug('[DocumentStore] SQLite not initialized during getDocument');
|
|
611
|
+
throw new Error('Storage not initialized');
|
|
612
|
+
}
|
|
613
|
+
// Log the query being executed
|
|
614
|
+
logger.debug(`[DocumentStore] Executing SQLite query for URL: ${url}`);
|
|
615
|
+
const row = await this.sqliteDb.get('SELECT url, title, favicon, last_indexed FROM documents WHERE url = ?', [url]);
|
|
616
|
+
if (!row) {
|
|
617
|
+
logger.debug(`[DocumentStore] Document not found in SQLite: ${url}`);
|
|
618
|
+
return null;
|
|
619
|
+
}
|
|
620
|
+
// Check if LanceDB has any chunks for this document
|
|
621
|
+
if (this.lanceTable) {
|
|
622
|
+
const chunks = await this.lanceTable.countRows(`url = '${escapeFilterValue(url)}'`);
|
|
623
|
+
logger.debug(`[DocumentStore] Found ${chunks} chunks in LanceDB for ${url}`);
|
|
624
|
+
}
|
|
625
|
+
logger.debug(`[DocumentStore] Document found in SQLite:`, row);
|
|
626
|
+
return {
|
|
627
|
+
url: row.url,
|
|
628
|
+
title: row.title,
|
|
629
|
+
favicon: row.favicon ?? undefined,
|
|
630
|
+
lastIndexed: new Date(row.last_indexed),
|
|
631
|
+
};
|
|
632
|
+
}
|
|
633
|
+
catch (error) {
|
|
634
|
+
logger.error('[DocumentStore] Error getting document:', error);
|
|
635
|
+
throw error;
|
|
636
|
+
}
|
|
637
|
+
}
|
|
638
|
+
clearCacheForUrl(url) {
|
|
639
|
+
// Clear all cache entries that might contain results for this URL
|
|
640
|
+
for (const key of this.searchCache.keys()) {
|
|
641
|
+
const results = this.searchCache.get(key);
|
|
642
|
+
if (results?.some((result) => result.url === url)) {
|
|
643
|
+
this.searchCache.delete(key);
|
|
644
|
+
}
|
|
645
|
+
}
|
|
646
|
+
}
|
|
647
|
+
/**
|
|
648
|
+
* Validates that vectors are properly stored and retrievable from LanceDB
|
|
649
|
+
* @returns Promise<boolean> True if vectors are valid, false otherwise
|
|
650
|
+
*/
|
|
651
|
+
async validateVectors() {
|
|
652
|
+
if (!this.lanceTable) {
|
|
653
|
+
logger.debug('[DocumentStore] Cannot validate vectors: Storage not initialized');
|
|
654
|
+
throw new Error('Storage not initialized');
|
|
655
|
+
}
|
|
656
|
+
try {
|
|
657
|
+
// Get total row count
|
|
658
|
+
const rowCount = await this.lanceTable.countRows();
|
|
659
|
+
logger.debug(`[DocumentStore] Vector validation: Table contains ${rowCount} rows`);
|
|
660
|
+
if (rowCount === 0) {
|
|
661
|
+
logger.debug('[DocumentStore] Vector validation: No rows found in vector table');
|
|
662
|
+
return false;
|
|
663
|
+
}
|
|
664
|
+
// Get a sample row using a query
|
|
665
|
+
const sample = await this.lanceTable.query().limit(1).toArray();
|
|
666
|
+
if (sample.length === 0) {
|
|
667
|
+
logger.debug('[DocumentStore] Vector validation: No rows returned from query');
|
|
668
|
+
return false;
|
|
669
|
+
}
|
|
670
|
+
// Log detailed information about the sample
|
|
671
|
+
logger.debug('[DocumentStore] Vector validation sample:', {
|
|
672
|
+
hasVector: 'vector' in sample[0],
|
|
673
|
+
vectorType: typeof sample[0].vector,
|
|
674
|
+
isArray: Array.isArray(sample[0].vector),
|
|
675
|
+
length: Array.isArray(sample[0].vector) ? sample[0].vector.length : 'N/A',
|
|
676
|
+
sample: Array.isArray(sample[0].vector) ? sample[0].vector.slice(0, 5) : sample[0].vector,
|
|
677
|
+
});
|
|
678
|
+
// Try a simple vector search with a random vector
|
|
679
|
+
const testVector = new Array(this.embeddings.dimensions).fill(0).map(() => Math.random());
|
|
680
|
+
logger.debug(`[DocumentStore] Testing vector search with random vector of length ${testVector.length}`);
|
|
681
|
+
const searchResults = await this.lanceTable.search(testVector).limit(1).toArray();
|
|
682
|
+
logger.debug(`[DocumentStore] Vector search test returned ${searchResults.length} results`);
|
|
683
|
+
if (searchResults.length > 0) {
|
|
684
|
+
logger.debug('[DocumentStore] Vector search test result:', {
|
|
685
|
+
score: searchResults[0].score,
|
|
686
|
+
hasVector: 'vector' in searchResults[0],
|
|
687
|
+
vectorLength: Array.isArray(searchResults[0].vector) ? searchResults[0].vector.length : 'N/A',
|
|
688
|
+
});
|
|
689
|
+
}
|
|
690
|
+
// Consider vectors valid if we have rows and can perform a search
|
|
691
|
+
// Even if scores are null, the search is still working
|
|
692
|
+
return rowCount > 0 && sample.length > 0 && searchResults.length > 0;
|
|
693
|
+
}
|
|
694
|
+
catch (error) {
|
|
695
|
+
logger.error('[DocumentStore] Error validating vectors:', error);
|
|
696
|
+
return false;
|
|
697
|
+
}
|
|
698
|
+
}
|
|
699
|
+
}
|
|
700
|
+
//# sourceMappingURL=storage.js.map
|