@cosmocoder/mcp-web-docs 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (240) hide show
  1. package/LICENSE +22 -0
  2. package/README.md +368 -0
  3. package/build/__mocks__/embeddings.d.ts +17 -0
  4. package/build/__mocks__/embeddings.js +66 -0
  5. package/build/__mocks__/embeddings.js.map +1 -0
  6. package/build/config.d.ts +44 -0
  7. package/build/config.js +158 -0
  8. package/build/config.js.map +1 -0
  9. package/build/config.test.d.ts +1 -0
  10. package/build/config.test.js +165 -0
  11. package/build/config.test.js.map +1 -0
  12. package/build/crawler/auth.d.ts +128 -0
  13. package/build/crawler/auth.js +546 -0
  14. package/build/crawler/auth.js.map +1 -0
  15. package/build/crawler/auth.test.d.ts +1 -0
  16. package/build/crawler/auth.test.js +174 -0
  17. package/build/crawler/auth.test.js.map +1 -0
  18. package/build/crawler/base.d.ts +24 -0
  19. package/build/crawler/base.js +149 -0
  20. package/build/crawler/base.js.map +1 -0
  21. package/build/crawler/base.test.d.ts +1 -0
  22. package/build/crawler/base.test.js +234 -0
  23. package/build/crawler/base.test.js.map +1 -0
  24. package/build/crawler/browser-config.d.ts +2 -0
  25. package/build/crawler/browser-config.js +29 -0
  26. package/build/crawler/browser-config.js.map +1 -0
  27. package/build/crawler/browser-config.test.d.ts +1 -0
  28. package/build/crawler/browser-config.test.js +56 -0
  29. package/build/crawler/browser-config.test.js.map +1 -0
  30. package/build/crawler/cheerio.d.ts +11 -0
  31. package/build/crawler/cheerio.js +134 -0
  32. package/build/crawler/cheerio.js.map +1 -0
  33. package/build/crawler/chromium.d.ts +21 -0
  34. package/build/crawler/chromium.js +596 -0
  35. package/build/crawler/chromium.js.map +1 -0
  36. package/build/crawler/content-extractor-types.d.ts +25 -0
  37. package/build/crawler/content-extractor-types.js +2 -0
  38. package/build/crawler/content-extractor-types.js.map +1 -0
  39. package/build/crawler/content-extractors.d.ts +9 -0
  40. package/build/crawler/content-extractors.js +9 -0
  41. package/build/crawler/content-extractors.js.map +1 -0
  42. package/build/crawler/content-utils.d.ts +2 -0
  43. package/build/crawler/content-utils.js +22 -0
  44. package/build/crawler/content-utils.js.map +1 -0
  45. package/build/crawler/content-utils.test.d.ts +1 -0
  46. package/build/crawler/content-utils.test.js +99 -0
  47. package/build/crawler/content-utils.test.js.map +1 -0
  48. package/build/crawler/crawlee-crawler.d.ts +63 -0
  49. package/build/crawler/crawlee-crawler.js +342 -0
  50. package/build/crawler/crawlee-crawler.js.map +1 -0
  51. package/build/crawler/crawlee-crawler.test.d.ts +1 -0
  52. package/build/crawler/crawlee-crawler.test.js +280 -0
  53. package/build/crawler/crawlee-crawler.test.js.map +1 -0
  54. package/build/crawler/default-extractor.d.ts +4 -0
  55. package/build/crawler/default-extractor.js +26 -0
  56. package/build/crawler/default-extractor.js.map +1 -0
  57. package/build/crawler/default-extractor.test.d.ts +1 -0
  58. package/build/crawler/default-extractor.test.js +200 -0
  59. package/build/crawler/default-extractor.test.js.map +1 -0
  60. package/build/crawler/default.d.ts +11 -0
  61. package/build/crawler/default.js +138 -0
  62. package/build/crawler/default.js.map +1 -0
  63. package/build/crawler/docs-crawler.d.ts +26 -0
  64. package/build/crawler/docs-crawler.js +97 -0
  65. package/build/crawler/docs-crawler.js.map +1 -0
  66. package/build/crawler/docs-crawler.test.d.ts +1 -0
  67. package/build/crawler/docs-crawler.test.js +185 -0
  68. package/build/crawler/docs-crawler.test.js.map +1 -0
  69. package/build/crawler/factory.d.ts +6 -0
  70. package/build/crawler/factory.js +83 -0
  71. package/build/crawler/factory.js.map +1 -0
  72. package/build/crawler/github-pages-extractor.d.ts +4 -0
  73. package/build/crawler/github-pages-extractor.js +33 -0
  74. package/build/crawler/github-pages-extractor.js.map +1 -0
  75. package/build/crawler/github-pages-extractor.test.d.ts +1 -0
  76. package/build/crawler/github-pages-extractor.test.js +184 -0
  77. package/build/crawler/github-pages-extractor.test.js.map +1 -0
  78. package/build/crawler/github.d.ts +20 -0
  79. package/build/crawler/github.js +181 -0
  80. package/build/crawler/github.js.map +1 -0
  81. package/build/crawler/github.test.d.ts +1 -0
  82. package/build/crawler/github.test.js +326 -0
  83. package/build/crawler/github.test.js.map +1 -0
  84. package/build/crawler/puppeteer.d.ts +16 -0
  85. package/build/crawler/puppeteer.js +191 -0
  86. package/build/crawler/puppeteer.js.map +1 -0
  87. package/build/crawler/queue-manager.d.ts +43 -0
  88. package/build/crawler/queue-manager.js +169 -0
  89. package/build/crawler/queue-manager.js.map +1 -0
  90. package/build/crawler/queue-manager.test.d.ts +1 -0
  91. package/build/crawler/queue-manager.test.js +509 -0
  92. package/build/crawler/queue-manager.test.js.map +1 -0
  93. package/build/crawler/site-rules.d.ts +11 -0
  94. package/build/crawler/site-rules.js +104 -0
  95. package/build/crawler/site-rules.js.map +1 -0
  96. package/build/crawler/site-rules.test.d.ts +1 -0
  97. package/build/crawler/site-rules.test.js +139 -0
  98. package/build/crawler/site-rules.test.js.map +1 -0
  99. package/build/crawler/storybook-extractor.d.ts +34 -0
  100. package/build/crawler/storybook-extractor.js +767 -0
  101. package/build/crawler/storybook-extractor.js.map +1 -0
  102. package/build/crawler/storybook-extractor.test.d.ts +1 -0
  103. package/build/crawler/storybook-extractor.test.js +491 -0
  104. package/build/crawler/storybook-extractor.test.js.map +1 -0
  105. package/build/embeddings/fastembed.d.ts +25 -0
  106. package/build/embeddings/fastembed.js +188 -0
  107. package/build/embeddings/fastembed.js.map +1 -0
  108. package/build/embeddings/fastembed.test.d.ts +1 -0
  109. package/build/embeddings/fastembed.test.js +307 -0
  110. package/build/embeddings/fastembed.test.js.map +1 -0
  111. package/build/embeddings/openai.d.ts +8 -0
  112. package/build/embeddings/openai.js +56 -0
  113. package/build/embeddings/openai.js.map +1 -0
  114. package/build/embeddings/types.d.ts +4 -0
  115. package/build/embeddings/types.js +2 -0
  116. package/build/embeddings/types.js.map +1 -0
  117. package/build/index.d.ts +2 -0
  118. package/build/index.js +1007 -0
  119. package/build/index.js.map +1 -0
  120. package/build/index.test.d.ts +1 -0
  121. package/build/index.test.js +364 -0
  122. package/build/index.test.js.map +1 -0
  123. package/build/indexing/queue-manager.d.ts +36 -0
  124. package/build/indexing/queue-manager.js +86 -0
  125. package/build/indexing/queue-manager.js.map +1 -0
  126. package/build/indexing/queue-manager.test.d.ts +1 -0
  127. package/build/indexing/queue-manager.test.js +257 -0
  128. package/build/indexing/queue-manager.test.js.map +1 -0
  129. package/build/indexing/status.d.ts +39 -0
  130. package/build/indexing/status.js +207 -0
  131. package/build/indexing/status.js.map +1 -0
  132. package/build/indexing/status.test.d.ts +1 -0
  133. package/build/indexing/status.test.js +246 -0
  134. package/build/indexing/status.test.js.map +1 -0
  135. package/build/processor/content.d.ts +16 -0
  136. package/build/processor/content.js +286 -0
  137. package/build/processor/content.js.map +1 -0
  138. package/build/processor/content.test.d.ts +1 -0
  139. package/build/processor/content.test.js +369 -0
  140. package/build/processor/content.test.js.map +1 -0
  141. package/build/processor/markdown.d.ts +11 -0
  142. package/build/processor/markdown.js +256 -0
  143. package/build/processor/markdown.js.map +1 -0
  144. package/build/processor/markdown.test.d.ts +1 -0
  145. package/build/processor/markdown.test.js +312 -0
  146. package/build/processor/markdown.test.js.map +1 -0
  147. package/build/processor/metadata-parser.d.ts +37 -0
  148. package/build/processor/metadata-parser.js +245 -0
  149. package/build/processor/metadata-parser.js.map +1 -0
  150. package/build/processor/metadata-parser.test.d.ts +1 -0
  151. package/build/processor/metadata-parser.test.js +357 -0
  152. package/build/processor/metadata-parser.test.js.map +1 -0
  153. package/build/processor/processor.d.ts +8 -0
  154. package/build/processor/processor.js +190 -0
  155. package/build/processor/processor.js.map +1 -0
  156. package/build/processor/processor.test.d.ts +1 -0
  157. package/build/processor/processor.test.js +357 -0
  158. package/build/processor/processor.test.js.map +1 -0
  159. package/build/rag/cache.d.ts +10 -0
  160. package/build/rag/cache.js +10 -0
  161. package/build/rag/cache.js.map +1 -0
  162. package/build/rag/code-generator.d.ts +11 -0
  163. package/build/rag/code-generator.js +30 -0
  164. package/build/rag/code-generator.js.map +1 -0
  165. package/build/rag/context-assembler.d.ts +23 -0
  166. package/build/rag/context-assembler.js +113 -0
  167. package/build/rag/context-assembler.js.map +1 -0
  168. package/build/rag/docs-search.d.ts +55 -0
  169. package/build/rag/docs-search.js +380 -0
  170. package/build/rag/docs-search.js.map +1 -0
  171. package/build/rag/pipeline.d.ts +26 -0
  172. package/build/rag/pipeline.js +91 -0
  173. package/build/rag/pipeline.js.map +1 -0
  174. package/build/rag/query-processor.d.ts +14 -0
  175. package/build/rag/query-processor.js +57 -0
  176. package/build/rag/query-processor.js.map +1 -0
  177. package/build/rag/reranker.d.ts +55 -0
  178. package/build/rag/reranker.js +210 -0
  179. package/build/rag/reranker.js.map +1 -0
  180. package/build/rag/response-generator.d.ts +20 -0
  181. package/build/rag/response-generator.js +101 -0
  182. package/build/rag/response-generator.js.map +1 -0
  183. package/build/rag/retriever.d.ts +19 -0
  184. package/build/rag/retriever.js +111 -0
  185. package/build/rag/retriever.js.map +1 -0
  186. package/build/rag/validator.d.ts +22 -0
  187. package/build/rag/validator.js +128 -0
  188. package/build/rag/validator.js.map +1 -0
  189. package/build/rag/version-manager.d.ts +23 -0
  190. package/build/rag/version-manager.js +98 -0
  191. package/build/rag/version-manager.js.map +1 -0
  192. package/build/setupTests.d.ts +4 -0
  193. package/build/setupTests.js +50 -0
  194. package/build/setupTests.js.map +1 -0
  195. package/build/storage/storage.d.ts +38 -0
  196. package/build/storage/storage.js +700 -0
  197. package/build/storage/storage.js.map +1 -0
  198. package/build/storage/storage.test.d.ts +1 -0
  199. package/build/storage/storage.test.js +338 -0
  200. package/build/storage/storage.test.js.map +1 -0
  201. package/build/types/rag.d.ts +27 -0
  202. package/build/types/rag.js +2 -0
  203. package/build/types/rag.js.map +1 -0
  204. package/build/types.d.ts +120 -0
  205. package/build/types.js +2 -0
  206. package/build/types.js.map +1 -0
  207. package/build/util/content-utils.d.ts +31 -0
  208. package/build/util/content-utils.js +120 -0
  209. package/build/util/content-utils.js.map +1 -0
  210. package/build/util/content.d.ts +1 -0
  211. package/build/util/content.js +16 -0
  212. package/build/util/content.js.map +1 -0
  213. package/build/util/docs.d.ts +1 -0
  214. package/build/util/docs.js +26 -0
  215. package/build/util/docs.js.map +1 -0
  216. package/build/util/docs.test.d.ts +1 -0
  217. package/build/util/docs.test.js +49 -0
  218. package/build/util/docs.test.js.map +1 -0
  219. package/build/util/favicon.d.ts +6 -0
  220. package/build/util/favicon.js +88 -0
  221. package/build/util/favicon.js.map +1 -0
  222. package/build/util/favicon.test.d.ts +1 -0
  223. package/build/util/favicon.test.js +140 -0
  224. package/build/util/favicon.test.js.map +1 -0
  225. package/build/util/logger.d.ts +17 -0
  226. package/build/util/logger.js +72 -0
  227. package/build/util/logger.js.map +1 -0
  228. package/build/util/logger.test.d.ts +1 -0
  229. package/build/util/logger.test.js +46 -0
  230. package/build/util/logger.test.js.map +1 -0
  231. package/build/util/security.d.ts +312 -0
  232. package/build/util/security.js +719 -0
  233. package/build/util/security.js.map +1 -0
  234. package/build/util/security.test.d.ts +1 -0
  235. package/build/util/security.test.js +524 -0
  236. package/build/util/security.test.js.map +1 -0
  237. package/build/util/site-detector.d.ts +22 -0
  238. package/build/util/site-detector.js +42 -0
  239. package/build/util/site-detector.js.map +1 -0
  240. package/package.json +112 -0
@@ -0,0 +1,700 @@
1
+ import sqlite3 from 'sqlite3';
2
+ import { open } from 'sqlite';
3
+ import * as lancedb from '@lancedb/lancedb';
4
+ import { PhraseQuery, MatchQuery, BooleanQuery, Occur } from '@lancedb/lancedb';
5
+ import { Field, FixedSizeList, Float32, Schema, Utf8, Int32 } from 'apache-arrow';
6
+ import QuickLRU from 'quick-lru';
7
+ import { mkdir } from 'fs/promises';
8
+ import { dirname } from 'path';
9
+ import { logger } from '../util/logger.js';
10
+ import { escapeFilterValue } from '../util/security.js';
11
/**
 * Preprocesses a search query while staying generic for any documentation type.
 *
 * Only explicitly double-quoted phrases are pulled out; everything else is
 * left as-is so the full-text engine can apply its own tokenization.
 *
 * @param {string} query - Raw query text as entered by the user.
 * @returns {{ phrases: string[], cleanedQuery: string, original: string }}
 *   `phrases` holds the trimmed, non-blank quoted phrases in order of appearance,
 *   `cleanedQuery` is the query with the quotes around those phrases removed,
 *   and `original` is the untouched input.
 */
function preprocessQuery(query) {
    const quotedPattern = /"([^"]+)"/g;
    const phrases = [];
    for (const match of query.matchAll(quotedPattern)) {
        const phrase = match[1].trim();
        // Skip quotes that contain only whitespace: they carry no searchable
        // terms and would otherwise be reported as an (empty) exact phrase.
        if (phrase.length > 0) {
            phrases.push(phrase);
        }
    }
    const result = {
        phrases,
        // Drop the quote characters but keep the quoted words in the query text
        cleanedQuery: query.replace(quotedPattern, '$1').trim(),
        original: query,
    };
    logger.debug('[QueryPreprocess] Processed query:', result);
    return result;
}
33
+ export class DocumentStore {
34
+ dbPath;
35
+ vectorDbPath;
36
+ embeddings;
37
+ sqliteDb;
38
+ lanceConn;
39
+ lanceTable;
40
+ searchCache;
41
+ ftsIndexCreated = false;
42
+ constructor(dbPath, vectorDbPath, embeddings, maxCacheSize = 1000) {
43
+ this.dbPath = dbPath;
44
+ this.vectorDbPath = vectorDbPath;
45
+ this.embeddings = embeddings;
46
+ logger.debug(`[DocumentStore] Initializing with paths:`, {
47
+ dbPath,
48
+ vectorDbPath,
49
+ maxCacheSize,
50
+ });
51
+ this.searchCache = new QuickLRU({ maxSize: maxCacheSize });
52
+ }
53
+ async initialize() {
54
+ logger.debug(`[DocumentStore] Starting initialization with paths:`, {
55
+ dbPath: this.dbPath,
56
+ vectorDbPath: this.vectorDbPath,
57
+ });
58
+ try {
59
+ // Create directories with error handling
60
+ try {
61
+ logger.debug(`[DocumentStore] Creating SQLite directory: ${dirname(this.dbPath)}`);
62
+ await mkdir(dirname(this.dbPath), { recursive: true });
63
+ logger.debug(`[DocumentStore] Creating LanceDB directory: ${this.vectorDbPath}`);
64
+ await mkdir(this.vectorDbPath, { recursive: true });
65
+ }
66
+ catch (error) {
67
+ logger.error('[DocumentStore] Error creating directories:', error);
68
+ throw new Error(`Failed to create storage directories: ${error instanceof Error ? error.message : String(error)}`);
69
+ }
70
+ // Initialize SQLite with error handling
71
+ try {
72
+ logger.debug(`[DocumentStore] Opening SQLite database at ${this.dbPath}`);
73
+ this.sqliteDb = await open({
74
+ filename: this.dbPath,
75
+ driver: sqlite3.Database,
76
+ });
77
+ logger.debug(`[DocumentStore] Configuring SQLite database`);
78
+ await this.sqliteDb.exec('PRAGMA busy_timeout = 5000;');
79
+ await this.sqliteDb.exec('PRAGMA journal_mode = WAL;');
80
+ }
81
+ catch (error) {
82
+ logger.error('[DocumentStore] Error initializing SQLite:', error);
83
+ throw new Error(`Failed to initialize SQLite: ${error instanceof Error ? error.message : String(error)}`);
84
+ }
85
+ // Create tables if they don't exist
86
+ await this.sqliteDb.exec(`
87
+ CREATE TABLE IF NOT EXISTS documents (
88
+ url TEXT PRIMARY KEY,
89
+ title TEXT NOT NULL,
90
+ favicon TEXT,
91
+ last_indexed DATETIME NOT NULL
92
+ );
93
+ CREATE INDEX IF NOT EXISTS idx_last_indexed ON documents(last_indexed);
94
+ `);
95
+ // Initialize LanceDB with error handling
96
+ try {
97
+ logger.debug(`[DocumentStore] Connecting to LanceDB at ${this.vectorDbPath}`);
98
+ this.lanceConn = await lancedb.connect(this.vectorDbPath);
99
+ logger.debug(`[DocumentStore] Getting table list`);
100
+ const tableNames = await this.lanceConn.tableNames();
101
+ logger.debug(`[DocumentStore] Existing tables:`, tableNames);
102
+ // Only create the table if it doesn't exist
103
+ if (!tableNames.includes('chunks')) {
104
+ logger.debug(`[DocumentStore] Creating chunks table with dimensions: ${this.embeddings.dimensions}`);
105
+ // Define schema using Apache Arrow
106
+ const vectorType = new FixedSizeList(this.embeddings.dimensions, new Field('item', new Float32(), true));
107
+ const schema = new Schema([
108
+ new Field('url', new Utf8(), false),
109
+ new Field('title', new Utf8(), false),
110
+ new Field('content', new Utf8(), false),
111
+ new Field('path', new Utf8(), false),
112
+ new Field('startLine', new Int32(), false),
113
+ new Field('endLine', new Int32(), false),
114
+ new Field('vector', vectorType, false),
115
+ new Field('type', new Utf8(), false),
116
+ new Field('lastUpdated', new Utf8(), false),
117
+ new Field('version', new Utf8(), true),
118
+ new Field('framework', new Utf8(), true),
119
+ new Field('language', new Utf8(), true),
120
+ // Flatten arrays to simple strings for better FTS support
121
+ new Field('codeBlocks', new Utf8(), true),
122
+ new Field('props', new Utf8(), true),
123
+ ]);
124
+ // Create empty table with schema
125
+ this.lanceTable = await this.lanceConn.createEmptyTable('chunks', schema, { mode: 'create' });
126
+ logger.debug(`[DocumentStore] New chunks table created successfully`);
127
+ // Create FTS index for better text search
128
+ await this.createFTSIndex();
129
+ }
130
+ else {
131
+ logger.debug(`[DocumentStore] Using existing chunks table`);
132
+ this.lanceTable = await this.lanceConn.openTable('chunks');
133
+ // Try to create FTS index if it doesn't exist
134
+ await this.createFTSIndex();
135
+ }
136
+ // Verify table is accessible
137
+ const rowCount = await this.lanceTable.countRows();
138
+ logger.debug(`[DocumentStore] Chunks table initialized, contains ${rowCount} rows`);
139
+ }
140
+ catch (error) {
141
+ logger.error('[DocumentStore] Error initializing LanceDB:', error);
142
+ throw new Error(`Failed to initialize LanceDB: ${error instanceof Error ? error.message : String(error)}`);
143
+ }
144
+ logger.debug(`[DocumentStore] All storage components initialized successfully`);
145
+ }
146
+ catch (error) {
147
+ logger.error('[DocumentStore] Error initializing storage:', error);
148
+ throw error;
149
+ }
150
+ }
151
+ async addDocument(doc) {
152
+ logger.debug(`[DocumentStore] Starting addDocument for:`, {
153
+ url: doc.metadata.url,
154
+ title: doc.metadata.title,
155
+ chunks: doc.chunks.length,
156
+ });
157
+ // Add diagnostic logging for vector dimensions
158
+ if (doc.chunks.length > 0) {
159
+ logger.debug(`[DocumentStore] Sample vector dimensions: ${doc.chunks[0].vector.length}`);
160
+ logger.debug(`[DocumentStore] Sample vector first 5 values: ${doc.chunks[0].vector.slice(0, 5)}`);
161
+ }
162
+ // Validate storage initialization
163
+ if (!this.sqliteDb) {
164
+ logger.debug('[DocumentStore] SQLite not initialized during addDocument');
165
+ throw new Error('SQLite storage not initialized');
166
+ }
167
+ if (!this.lanceTable) {
168
+ logger.debug('[DocumentStore] LanceDB not initialized during addDocument');
169
+ throw new Error('LanceDB storage not initialized');
170
+ }
171
+ try {
172
+ // Check if document already exists
173
+ const existing = await this.getDocument(doc.metadata.url);
174
+ if (existing) {
175
+ logger.debug(`[DocumentStore] Existing document found, will update:`, existing);
176
+ }
177
+ logger.debug(`[DocumentStore] Starting SQLite transaction`);
178
+ await this.sqliteDb.run('BEGIN TRANSACTION');
179
+ // Add metadata to SQLite
180
+ await this.sqliteDb.run('INSERT OR REPLACE INTO documents (url, title, favicon, last_indexed) VALUES (?, ?, ?, ?)', [
181
+ doc.metadata.url,
182
+ doc.metadata.title,
183
+ doc.metadata.favicon,
184
+ doc.metadata.lastIndexed.toISOString(),
185
+ ]);
186
+ logger.debug(`[DocumentStore] Added metadata to SQLite`);
187
+ // Delete existing chunks for this document (using escaped value to prevent injection)
188
+ await this.lanceTable.delete(`url = '${escapeFilterValue(doc.metadata.url)}'`);
189
+ logger.debug(`[DocumentStore] Deleted existing chunks`);
190
+ // Add new chunks to LanceDB
191
+ const rows = doc.chunks.map((chunk) => ({
192
+ url: doc.metadata.url,
193
+ title: doc.metadata.title,
194
+ content: chunk.content,
195
+ path: chunk.path,
196
+ startLine: chunk.startLine,
197
+ endLine: chunk.endLine,
198
+ vector: chunk.vector,
199
+ type: chunk.metadata.type,
200
+ lastUpdated: new Date().toISOString(),
201
+ version: '',
202
+ framework: '',
203
+ language: '',
204
+ // Serialize code blocks and props as JSON strings
205
+ codeBlocks: JSON.stringify(chunk.metadata.codeBlocks || []),
206
+ props: JSON.stringify(chunk.metadata.props || []),
207
+ }));
208
+ logger.debug(`[DocumentStore] Adding ${rows.length} chunks to LanceDB`);
209
+ await this.lanceTable.add(rows);
210
+ // Verify data was added
211
+ const rowCount = await this.lanceTable.countRows();
212
+ logger.debug(`[DocumentStore] Table now contains ${rowCount} rows`);
213
+ // Commit transaction
214
+ await this.sqliteDb.run('COMMIT');
215
+ logger.debug(`[DocumentStore] Committed transaction`);
216
+ // Clear search cache for this URL
217
+ this.clearCacheForUrl(doc.metadata.url);
218
+ }
219
+ catch (error) {
220
+ // Rollback on error
221
+ if (this.sqliteDb) {
222
+ await this.sqliteDb.run('ROLLBACK');
223
+ }
224
+ logger.error('[DocumentStore] Error adding document:', error);
225
+ throw error;
226
+ }
227
+ }
228
+ async searchDocuments(queryVector, options = {}) {
229
+ if (!this.lanceTable) {
230
+ throw new Error('Storage not initialized');
231
+ }
232
+ const { limit = 10, includeVectors = false, filterByType, textQuery } = options;
233
+ logger.debug(`[DocumentStore] Searching documents with vector:`, {
234
+ dimensions: queryVector.length,
235
+ limit,
236
+ includeVectors,
237
+ filterByType,
238
+ hasTextQuery: !!textQuery,
239
+ });
240
+ // Add validation for query vector
241
+ if (queryVector.length === 0 && !textQuery) {
242
+ logger.debug('[DocumentStore] Empty query vector and no text query provided');
243
+ return [];
244
+ }
245
+ // Log search parameters
246
+ logger.debug(`[DocumentStore] Search parameters:`, {
247
+ vectorDimensions: queryVector.length,
248
+ expectedDimensions: this.embeddings.dimensions,
249
+ limit,
250
+ filterType: filterByType,
251
+ });
252
+ // Ensure vector dimensions match if provided
253
+ if (queryVector.length > 0 && queryVector.length !== this.embeddings.dimensions) {
254
+ logger.debug(`[DocumentStore] Vector dimension mismatch: got ${queryVector.length}, expected ${this.embeddings.dimensions}`);
255
+ // Consider padding or truncating the vector to match expected dimensions
256
+ if (queryVector.length < this.embeddings.dimensions) {
257
+ // Pad the vector with zeros
258
+ queryVector = [...queryVector, ...new Array(this.embeddings.dimensions - queryVector.length).fill(0)];
259
+ logger.debug(`[DocumentStore] Padded vector to ${queryVector.length} dimensions`);
260
+ }
261
+ else {
262
+ // Truncate the vector
263
+ queryVector = queryVector.slice(0, this.embeddings.dimensions);
264
+ logger.debug(`[DocumentStore] Truncated vector to ${queryVector.length} dimensions`);
265
+ }
266
+ }
267
+ try {
268
+ // Log query vector for debugging
269
+ logger.debug(`[DocumentStore] Query vector first 5 values: ${queryVector.slice(0, 5)}`);
270
+ // Ensure we have a valid query vector
271
+ if (queryVector.length === 0) {
272
+ logger.debug('[DocumentStore] Empty query vector provided for search');
273
+ // Use a default vector of the correct dimension instead of an empty array
274
+ queryVector = new Array(this.embeddings.dimensions).fill(0);
275
+ logger.debug(`[DocumentStore] Using default zero vector with ${queryVector.length} dimensions`);
276
+ }
277
+ // Create search query
278
+ let query = this.lanceTable.search(queryVector).limit(limit);
279
+ if (filterByType) {
280
+ query = query.where(`type = '${escapeFilterValue(filterByType)}'`);
281
+ }
282
+ const results = await query.toArray();
283
+ logger.debug(`[DocumentStore] Found ${results.length} results`);
284
+ // Log the first result for debugging if available
285
+ if (results.length > 0) {
286
+ logger.debug(`[DocumentStore] First result:`, {
287
+ id: results[0].id,
288
+ score: results[0].score,
289
+ hasVector: 'vector' in results[0],
290
+ vectorType: typeof results[0].vector,
291
+ vectorLength: Array.isArray(results[0].vector) ? results[0].vector.length : 'not an array',
292
+ });
293
+ }
294
+ const searchResults = results.map((result) => {
295
+ // Log the raw result for debugging
296
+ logger.debug(`[DocumentStore] Raw search result:`, {
297
+ id: result.id,
298
+ url: result.url,
299
+ hasVector: !!result.vector,
300
+ vectorType: result.vector ? typeof result.vector : 'undefined',
301
+ vectorLength: result.vector ? (Array.isArray(result.vector) ? result.vector.length : 'not an array') : 0,
302
+ });
303
+ // Parse JSON fields
304
+ let codeBlocks;
305
+ let props;
306
+ try {
307
+ codeBlocks = result.codeBlocks ? JSON.parse(result.codeBlocks) : undefined;
308
+ }
309
+ catch {
310
+ codeBlocks = undefined;
311
+ }
312
+ try {
313
+ props = result.props ? JSON.parse(result.props) : undefined;
314
+ }
315
+ catch {
316
+ props = undefined;
317
+ }
318
+ return {
319
+ id: String(result.id || result.url),
320
+ content: String(result.content),
321
+ url: String(result.url),
322
+ title: String(result.title),
323
+ score: result._distance != null ? 1 - result._distance : (result.score ?? 0),
324
+ ...(includeVectors && { vector: result.vector }),
325
+ metadata: {
326
+ type: (result.type || 'overview'),
327
+ path: String(result.path),
328
+ lastUpdated: new Date(result.lastUpdated ? String(result.lastUpdated) : Date.now()),
329
+ version: result.version,
330
+ framework: result.framework,
331
+ language: result.language,
332
+ codeBlocks,
333
+ props,
334
+ },
335
+ };
336
+ });
337
+ return searchResults;
338
+ }
339
+ catch (error) {
340
+ logger.error('[DocumentStore] Error searching documents:', error);
341
+ throw error;
342
+ }
343
+ }
344
+ /**
345
+ * Create full-text search index on the content field
346
+ */
347
+ async createFTSIndex() {
348
+ if (!this.lanceTable || this.ftsIndexCreated) {
349
+ return;
350
+ }
351
+ try {
352
+ logger.debug('[DocumentStore] Creating FTS index on content field...');
353
+ await this.lanceTable.createIndex('content', {
354
+ config: lancedb.Index.fts(),
355
+ });
356
+ this.ftsIndexCreated = true;
357
+ logger.debug('[DocumentStore] FTS index created successfully');
358
+ }
359
+ catch (error) {
360
+ const err = error;
361
+ if (err.message?.toLowerCase().includes('already exists')) {
362
+ logger.debug('[DocumentStore] FTS index already exists');
363
+ this.ftsIndexCreated = true;
364
+ }
365
+ else {
366
+ logger.warn('[DocumentStore] Failed to create FTS index:', err.message);
367
+ // Don't throw - FTS is optional, we can fall back to vector search
368
+ }
369
+ }
370
+ }
371
+ async searchByText(query, options = {}) {
372
+ logger.debug(`[DocumentStore] Searching documents by text:`, { query, options });
373
+ const cacheKey = `text:${query}:${JSON.stringify(options)}`;
374
+ const cached = this.searchCache.get(cacheKey);
375
+ if (cached) {
376
+ logger.debug(`[DocumentStore] Returning cached results`);
377
+ return cached;
378
+ }
379
+ const { limit = 10, filterByType, filterUrl } = options;
380
+ // Build WHERE clause for filtering (using escaped values to prevent injection)
381
+ const buildWhereClause = () => {
382
+ const conditions = [];
383
+ if (filterByType) {
384
+ conditions.push(`type = '${escapeFilterValue(filterByType)}'`);
385
+ }
386
+ if (filterUrl) {
387
+ // Filter by base URL - use LIKE to match URLs that start with the base URL
388
+ // Escape the filterUrl and also escape LIKE wildcards within the value
389
+ const escapedUrl = escapeFilterValue(filterUrl).replace(/%/g, '\\%').replace(/_/g, '\\_');
390
+ conditions.push(`url LIKE '${escapedUrl}%'`);
391
+ }
392
+ return conditions.length > 0 ? conditions.join(' AND ') : undefined;
393
+ };
394
+ const whereClause = buildWhereClause();
395
+ try {
396
+ if (!this.lanceTable) {
397
+ throw new Error('Storage not initialized');
398
+ }
399
+ // Preprocess query - only extracts quoted phrases, keeps everything else generic
400
+ const processedQuery = preprocessQuery(query);
401
+ // Generate embedding for vector search
402
+ const queryVector = await this.embeddings.embed(query);
403
+ logger.debug('[DocumentStore] Attempting hybrid search (FTS + vector with RRF)');
404
+ // Strategy 1: If user provided quoted phrases, use phrase matching
405
+ if (this.ftsIndexCreated && processedQuery.phrases.length > 0) {
406
+ try {
407
+ logger.debug('[DocumentStore] Using phrase-based search for quoted terms:', processedQuery.phrases);
408
+ // Build boolean query: phrase matches (must) + general terms (should)
409
+ const queries = [];
410
+ // Add phrase queries for quoted phrases (exact match)
411
+ for (const phrase of processedQuery.phrases) {
412
+ queries.push([Occur.Must, new PhraseQuery(phrase, 'content', { slop: 0 })]);
413
+ }
414
+ // Add fuzzy match for the overall cleaned query
415
+ if (processedQuery.cleanedQuery) {
416
+ queries.push([Occur.Should, new MatchQuery(processedQuery.cleanedQuery, 'content', { fuzziness: 1 })]);
417
+ }
418
+ const boolQuery = new BooleanQuery(queries);
419
+ let ftsQuery = this.lanceTable
420
+ .query()
421
+ .fullTextSearch(boolQuery)
422
+ .limit(limit * 2);
423
+ if (whereClause) {
424
+ ftsQuery = ftsQuery.where(whereClause);
425
+ }
426
+ const ftsResults = await ftsQuery.toArray();
427
+ logger.debug(`[DocumentStore] Phrase-based FTS returned ${ftsResults.length} results`);
428
+ if (ftsResults.length > 0) {
429
+ // Combine with vector search for semantic relevance
430
+ let vectorQuery = this.lanceTable.search(queryVector).limit(limit * 2);
431
+ if (whereClause) {
432
+ vectorQuery = vectorQuery.where(whereClause);
433
+ }
434
+ const vectorResults = await vectorQuery.toArray();
435
+ const mergedResults = this.mergeAndRankResults(ftsResults, vectorResults, limit);
436
+ const searchResults = this.formatSearchResults(mergedResults);
437
+ this.searchCache.set(cacheKey, searchResults);
438
+ return searchResults;
439
+ }
440
+ }
441
+ catch (phraseError) {
442
+ const err = phraseError;
443
+ logger.debug('[DocumentStore] Phrase-based search failed:', err.message);
444
+ }
445
+ }
446
+ // Strategy 2: Standard hybrid search - FTS with fuzziness + vector search
447
+ if (this.ftsIndexCreated) {
448
+ try {
449
+ // LanceDB's FTS already handles stop words and stemming
450
+ // Add fuzziness for typo tolerance
451
+ const matchQuery = new MatchQuery(processedQuery.cleanedQuery, 'content', { fuzziness: 1 });
452
+ let ftsQuery = this.lanceTable
453
+ .query()
454
+ .fullTextSearch(matchQuery)
455
+ .limit(limit * 2);
456
+ if (whereClause) {
457
+ ftsQuery = ftsQuery.where(whereClause);
458
+ }
459
+ const ftsResults = await ftsQuery.toArray();
460
+ logger.debug(`[DocumentStore] FTS returned ${ftsResults.length} results`);
461
+ // Always combine with vector search for best results
462
+ let vectorQuery = this.lanceTable.search(queryVector).limit(limit * 2);
463
+ if (whereClause) {
464
+ vectorQuery = vectorQuery.where(whereClause);
465
+ }
466
+ const vectorResults = await vectorQuery.toArray();
467
+ logger.debug(`[DocumentStore] Vector search returned ${vectorResults.length} results`);
468
+ // Merge using RRF even if one is empty - ensures we get results
469
+ const mergedResults = this.mergeAndRankResults(ftsResults, vectorResults, limit);
470
+ if (mergedResults.length > 0) {
471
+ const searchResults = this.formatSearchResults(mergedResults);
472
+ this.searchCache.set(cacheKey, searchResults);
473
+ return searchResults;
474
+ }
475
+ }
476
+ catch (ftsError) {
477
+ const err = ftsError;
478
+ logger.debug('[DocumentStore] FTS search failed, falling back to vector search:', err.message);
479
+ }
480
+ }
481
+ // Strategy 3: Fallback to pure vector search (semantic similarity)
482
+ logger.debug('[DocumentStore] Falling back to pure vector search');
483
+ const results = await this.searchDocuments(queryVector, options);
484
+ this.searchCache.set(cacheKey, results);
485
+ return results;
486
+ }
487
+ catch (error) {
488
+ logger.error('[DocumentStore] Error searching documents by text:', error);
489
+ throw error;
490
+ }
491
+ }
492
+ /**
493
+ * Merge FTS and vector results using Reciprocal Rank Fusion (RRF)
494
+ */
495
+ mergeAndRankResults(ftsResults, vectorResults, limit) {
496
+ const k = 60; // RRF constant
497
+ const scores = new Map();
498
+ // Score FTS results
499
+ ftsResults.forEach((result, rank) => {
500
+ const key = `${result.url}:${result.path}:${result.startLine}`;
501
+ const rrfScore = 1 / (k + rank + 1);
502
+ scores.set(key, { result, score: rrfScore });
503
+ });
504
+ // Add/combine vector results
505
+ vectorResults.forEach((result, rank) => {
506
+ const key = `${result.url}:${result.path}:${result.startLine}`;
507
+ const rrfScore = 1 / (k + rank + 1);
508
+ if (scores.has(key)) {
509
+ // Combine scores if result appears in both
510
+ const existing = scores.get(key);
511
+ existing.score += rrfScore;
512
+ }
513
+ else {
514
+ scores.set(key, { result, score: rrfScore });
515
+ }
516
+ });
517
+ // Sort by combined RRF score and return top results
518
+ return Array.from(scores.values())
519
+ .sort((a, b) => b.score - a.score)
520
+ .slice(0, limit)
521
+ .map((item) => ({ ...item.result, _rrfScore: item.score }));
522
+ }
523
+ /**
524
+ * Format raw LanceDB results into SearchResult objects
525
+ */
526
+ formatSearchResults(results) {
527
+ return results.map((result) => {
528
+ let codeBlocks, props;
529
+ try {
530
+ codeBlocks = result.codeBlocks ? JSON.parse(result.codeBlocks) : undefined;
531
+ }
532
+ catch {
533
+ codeBlocks = undefined;
534
+ }
535
+ try {
536
+ props = result.props ? JSON.parse(result.props) : undefined;
537
+ }
538
+ catch {
539
+ props = undefined;
540
+ }
541
+ return {
542
+ id: String(result.url),
543
+ content: String(result.content),
544
+ url: String(result.url),
545
+ title: String(result.title),
546
+ score: result._rrfScore ?? (result._distance != null ? 1 - result._distance : (result._score ?? 0)),
547
+ metadata: {
548
+ type: (result.type || 'overview'),
549
+ path: String(result.path),
550
+ lastUpdated: new Date(result.lastUpdated ? String(result.lastUpdated) : Date.now()),
551
+ version: result.version,
552
+ framework: result.framework,
553
+ language: result.language,
554
+ codeBlocks,
555
+ props,
556
+ },
557
+ };
558
+ });
559
+ }
560
+ async listDocuments() {
561
+ if (!this.sqliteDb) {
562
+ throw new Error('Storage not initialized');
563
+ }
564
+ logger.debug(`[DocumentStore] Listing documents`);
565
+ try {
566
+ const rows = await this.sqliteDb.all('SELECT url, title, favicon, last_indexed FROM documents ORDER BY last_indexed DESC');
567
+ logger.debug(`[DocumentStore] Found ${rows.length} documents`);
568
+ return rows.map((row) => ({
569
+ url: row.url,
570
+ title: row.title,
571
+ favicon: row.favicon ?? undefined,
572
+ lastIndexed: new Date(row.last_indexed),
573
+ }));
574
+ }
575
+ catch (error) {
576
+ logger.error('[DocumentStore] Error listing documents:', error);
577
+ throw error;
578
+ }
579
+ }
580
+ async deleteDocument(url) {
581
+ if (!this.sqliteDb || !this.lanceTable) {
582
+ throw new Error('Storage not initialized');
583
+ }
584
+ logger.debug(`[DocumentStore] Deleting document: ${url}`);
585
+ try {
586
+ await this.sqliteDb.run('BEGIN TRANSACTION');
587
+ await this.sqliteDb.run('DELETE FROM documents WHERE url = ?', [url]);
588
+ await this.lanceTable.delete(`url = '${escapeFilterValue(url)}'`);
589
+ await this.sqliteDb.run('COMMIT');
590
+ // Clear cache for this URL
591
+ this.clearCacheForUrl(url);
592
+ logger.debug(`[DocumentStore] Document deleted successfully`);
593
+ }
594
+ catch (error) {
595
+ if (this.sqliteDb) {
596
+ await this.sqliteDb.run('ROLLBACK');
597
+ }
598
+ logger.error('[DocumentStore] Error deleting document:', error);
599
+ throw error;
600
+ }
601
+ }
602
+ async getDocument(url) {
603
+ if (!this.sqliteDb) {
604
+ throw new Error('Storage not initialized');
605
+ }
606
+ logger.debug(`[DocumentStore] Getting document: ${url}`);
607
+ try {
608
+ // Check if SQLite is properly initialized
609
+ if (!this.sqliteDb) {
610
+ logger.debug('[DocumentStore] SQLite not initialized during getDocument');
611
+ throw new Error('Storage not initialized');
612
+ }
613
+ // Log the query being executed
614
+ logger.debug(`[DocumentStore] Executing SQLite query for URL: ${url}`);
615
+ const row = await this.sqliteDb.get('SELECT url, title, favicon, last_indexed FROM documents WHERE url = ?', [url]);
616
+ if (!row) {
617
+ logger.debug(`[DocumentStore] Document not found in SQLite: ${url}`);
618
+ return null;
619
+ }
620
+ // Check if LanceDB has any chunks for this document
621
+ if (this.lanceTable) {
622
+ const chunks = await this.lanceTable.countRows(`url = '${escapeFilterValue(url)}'`);
623
+ logger.debug(`[DocumentStore] Found ${chunks} chunks in LanceDB for ${url}`);
624
+ }
625
+ logger.debug(`[DocumentStore] Document found in SQLite:`, row);
626
+ return {
627
+ url: row.url,
628
+ title: row.title,
629
+ favicon: row.favicon ?? undefined,
630
+ lastIndexed: new Date(row.last_indexed),
631
+ };
632
+ }
633
+ catch (error) {
634
+ logger.error('[DocumentStore] Error getting document:', error);
635
+ throw error;
636
+ }
637
+ }
638
+ clearCacheForUrl(url) {
639
+ // Clear all cache entries that might contain results for this URL
640
+ for (const key of this.searchCache.keys()) {
641
+ const results = this.searchCache.get(key);
642
+ if (results?.some((result) => result.url === url)) {
643
+ this.searchCache.delete(key);
644
+ }
645
+ }
646
+ }
647
+ /**
648
+ * Validates that vectors are properly stored and retrievable from LanceDB
649
+ * @returns Promise<boolean> True if vectors are valid, false otherwise
650
+ */
651
+ async validateVectors() {
652
+ if (!this.lanceTable) {
653
+ logger.debug('[DocumentStore] Cannot validate vectors: Storage not initialized');
654
+ throw new Error('Storage not initialized');
655
+ }
656
+ try {
657
+ // Get total row count
658
+ const rowCount = await this.lanceTable.countRows();
659
+ logger.debug(`[DocumentStore] Vector validation: Table contains ${rowCount} rows`);
660
+ if (rowCount === 0) {
661
+ logger.debug('[DocumentStore] Vector validation: No rows found in vector table');
662
+ return false;
663
+ }
664
+ // Get a sample row using a query
665
+ const sample = await this.lanceTable.query().limit(1).toArray();
666
+ if (sample.length === 0) {
667
+ logger.debug('[DocumentStore] Vector validation: No rows returned from query');
668
+ return false;
669
+ }
670
+ // Log detailed information about the sample
671
+ logger.debug('[DocumentStore] Vector validation sample:', {
672
+ hasVector: 'vector' in sample[0],
673
+ vectorType: typeof sample[0].vector,
674
+ isArray: Array.isArray(sample[0].vector),
675
+ length: Array.isArray(sample[0].vector) ? sample[0].vector.length : 'N/A',
676
+ sample: Array.isArray(sample[0].vector) ? sample[0].vector.slice(0, 5) : sample[0].vector,
677
+ });
678
+ // Try a simple vector search with a random vector
679
+ const testVector = new Array(this.embeddings.dimensions).fill(0).map(() => Math.random());
680
+ logger.debug(`[DocumentStore] Testing vector search with random vector of length ${testVector.length}`);
681
+ const searchResults = await this.lanceTable.search(testVector).limit(1).toArray();
682
+ logger.debug(`[DocumentStore] Vector search test returned ${searchResults.length} results`);
683
+ if (searchResults.length > 0) {
684
+ logger.debug('[DocumentStore] Vector search test result:', {
685
+ score: searchResults[0].score,
686
+ hasVector: 'vector' in searchResults[0],
687
+ vectorLength: Array.isArray(searchResults[0].vector) ? searchResults[0].vector.length : 'N/A',
688
+ });
689
+ }
690
+ // Consider vectors valid if we have rows and can perform a search
691
+ // Even if scores are null, the search is still working
692
+ return rowCount > 0 && sample.length > 0 && searchResults.length > 0;
693
+ }
694
+ catch (error) {
695
+ logger.error('[DocumentStore] Error validating vectors:', error);
696
+ return false;
697
+ }
698
+ }
699
+ }
700
+ //# sourceMappingURL=storage.js.map