@lon-ask/dockit 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78):
  1. package/LICENSE +674 -0
  2. package/README.md +496 -0
  3. package/SKILL.md +154 -0
  4. package/apps/client/dist/assets/index-CqOXxsEZ.js +240 -0
  5. package/apps/client/dist/assets/index-DwvaANnI.css +1 -0
  6. package/apps/client/dist/index.html +13 -0
  7. package/apps/server/src/core/domain/entry.ts +22 -0
  8. package/apps/server/src/core/domain/errors.ts +27 -0
  9. package/apps/server/src/core/domain/knowledge-graph.ts +51 -0
  10. package/apps/server/src/core/domain/types.ts +168 -0
  11. package/apps/server/src/core/ports/IBuildRepository.ts +7 -0
  12. package/apps/server/src/core/ports/IDocumentNormalizer.ts +6 -0
  13. package/apps/server/src/core/ports/IDocumentStore.ts +4 -0
  14. package/apps/server/src/core/ports/IEntryReadModel.ts +9 -0
  15. package/apps/server/src/core/ports/IEntryRepository.ts +11 -0
  16. package/apps/server/src/core/ports/IKnowledgeGraph.ts +10 -0
  17. package/apps/server/src/core/ports/IPathResolver.ts +3 -0
  18. package/apps/server/src/core/ports/ISearchEngine.ts +9 -0
  19. package/apps/server/src/core/ports/ISourceProcessor.ts +7 -0
  20. package/apps/server/src/core/ports/ISourceRepository.ts +11 -0
  21. package/apps/server/src/core/usecases/BuildUseCase.ts +98 -0
  22. package/apps/server/src/core/usecases/ConfigUseCase.ts +64 -0
  23. package/apps/server/src/core/usecases/SearchUseCase.ts +16 -0
  24. package/apps/server/src/index.ts +98 -0
  25. package/apps/server/src/infrastructure/filesystem/FileSystemDocumentStore.ts +27 -0
  26. package/apps/server/src/infrastructure/graph/GraphSearchDecorator.ts +53 -0
  27. package/apps/server/src/infrastructure/graph/GraphifyKnowledgeGraph.ts +172 -0
  28. package/apps/server/src/infrastructure/graph/index.ts +2 -0
  29. package/apps/server/src/infrastructure/persistence/sqlite/SqliteBuildRepository.ts +34 -0
  30. package/apps/server/src/infrastructure/persistence/sqlite/SqliteEntryReadModel.ts +17 -0
  31. package/apps/server/src/infrastructure/persistence/sqlite/SqliteEntryRepository.ts +81 -0
  32. package/apps/server/src/infrastructure/persistence/sqlite/SqliteSourceRepository.ts +65 -0
  33. package/apps/server/src/infrastructure/persistence/sqlite/connection.ts +52 -0
  34. package/apps/server/src/infrastructure/search/SearchEngineFactory.ts +43 -0
  35. package/apps/server/src/infrastructure/search/json/JsonSearchEngine.ts +164 -0
  36. package/apps/server/src/infrastructure/search/vector/EmbeddingService.ts +23 -0
  37. package/apps/server/src/infrastructure/search/vector/VectorSearchEngine.ts +480 -0
  38. package/apps/server/src/infrastructure/source-processors/AntoraSourceProcessor.ts +14 -0
  39. package/apps/server/src/infrastructure/source-processors/AsciidocSourceProcessor.ts +12 -0
  40. package/apps/server/src/infrastructure/source-processors/DocumentNormalizer.ts +16 -0
  41. package/apps/server/src/infrastructure/source-processors/GithubMarkdownSourceProcessor.ts +12 -0
  42. package/apps/server/src/infrastructure/source-processors/MavenSourceProcessor.ts +12 -0
  43. package/apps/server/src/infrastructure/source-processors/PathResolver.ts +6 -0
  44. package/apps/server/src/infrastructure/source-processors/SourceCodeSourceProcessor.ts +260 -0
  45. package/apps/server/src/infrastructure/source-processors/ZipSourceProcessor.ts +12 -0
  46. package/apps/server/src/mcp-http.ts +102 -0
  47. package/apps/server/src/mcp.ts +432 -0
  48. package/apps/server/src/routes/build.ts +105 -0
  49. package/apps/server/src/routes/entries.ts +62 -0
  50. package/apps/server/src/routes/graph.ts +57 -0
  51. package/apps/server/src/routes/search.ts +28 -0
  52. package/apps/server/src/routes/sources.ts +105 -0
  53. package/apps/server/src/routes/viewer.ts +28 -0
  54. package/apps/server/src/services/antora.ts +238 -0
  55. package/apps/server/src/services/asciidoc.ts +221 -0
  56. package/apps/server/src/services/configLoader.ts +207 -0
  57. package/apps/server/src/services/githubMarkdown.ts +236 -0
  58. package/apps/server/src/services/maven.ts +178 -0
  59. package/apps/server/src/services/normalizer.ts +63 -0
  60. package/apps/server/src/services/paths.ts +5 -0
  61. package/apps/server/src/services/textExtractor.ts +49 -0
  62. package/apps/server/src/services/zip.ts +84 -0
  63. package/bin/commands/build.ts +85 -0
  64. package/bin/commands/dev.ts +36 -0
  65. package/bin/commands/get.ts +36 -0
  66. package/bin/commands/graph.ts +153 -0
  67. package/bin/commands/init.ts +170 -0
  68. package/bin/commands/list.ts +47 -0
  69. package/bin/commands/mcp.ts +32 -0
  70. package/bin/commands/search.ts +185 -0
  71. package/bin/commands/serve.ts +23 -0
  72. package/bin/commands/status.ts +46 -0
  73. package/bin/dockit-cli.ts +92 -0
  74. package/bin/dockit.js +17 -0
  75. package/bin/utils.ts +85 -0
  76. package/dockit.yaml +154 -0
  77. package/package.json +60 -0
  78. package/scripts/mcp-wrapper.sh +44 -0
@@ -0,0 +1,164 @@
1
+ import path from 'node:path';
2
+ import fs from 'node:fs';
3
+ import { parse } from 'node-html-parser';
4
+ import type { ISearchEngine } from '../../../core/ports/ISearchEngine.js';
5
+ import type { IEntryReadModel } from '../../../core/ports/IEntryReadModel.js';
6
+ import type { SearchResult, GlobalSearchResult, HtmlFile } from '../../../core/domain/types.js';
7
+ import { DATA_ROOT } from '../../../services/paths.js';
8
+
9
/**
 * Lower-case words that are removed from a query before scoring; a query made
 * up only of these words is treated as having no search terms.
 *
 * NOTE(review): besides articles/pronouns/prepositions the list also contains
 * verbs that are plausible documentation queries ('create', 'run', 'set',
 * 'find', 'use', ...) — confirm that dropping them is intended.
 */
+ const STOP_WORDS = new Set([
10
+ 'a', 'an', 'the', 'and', 'or', 'but', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
11
+ 'do', 'does', 'did', 'doing', 'have', 'has', 'had', 'having', 'will', 'would', 'shall',
12
+ 'should', 'can', 'could', 'may', 'might', 'must', 'to', 'of', 'in', 'for', 'on', 'with',
13
+ 'at', 'by', 'from', 'as', 'into', 'through', 'during', 'before', 'after', 'above', 'below',
14
+ 'between', 'out', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here',
15
+ 'there', 'when', 'where', 'why', 'how', 'what', 'which', 'who', 'whom', 'this', 'that',
16
+ 'these', 'those', 'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you',
17
+ 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her',
18
+ 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves',
19
+ 'not', 'no', 'nor', 'so', 'if', 'about', 'up', 'down', 'just', 'only', 'own', 'same',
20
+ 'than', 'too', 'very', 'some', 'any', 'each', 'every', 'all', 'both', 'few', 'more',
21
+ 'most', 'other', 'such', 'also', 'get', 'got', 'like', 'make', 'made', 'use', 'used',
22
+ 'using', 'create', 'new', 'way', 'need', 'want', 'know', 'tell', 'say', 'said', 'go',
23
+ 'went', 'come', 'see', 'look', 'find', 'give', 'take', 'put', 'set', 'let', 'keep',
24
+ 'work', 'call', 'try', 'ask', 'show', 'think', 'help', 'run', 'move', 'live', 'believe',
25
+ ]);
26
+
27
/**
 * Counts non-overlapping occurrences of `term` in `text`.
 *
 * Matching is case-sensitive. The scan runs left to right and resumes after
 * the end of each match, so `'aaaa'` contains `'aa'` twice, not three times.
 *
 * @param text - The string to scan.
 * @param term - The substring to look for.
 * @returns The number of matches; `0` when `term` is empty.
 */
function countOccurrences(text: string, term: string): number {
  // An empty needle matches at every index without advancing the cursor,
  // which would make the scan below loop forever.
  if (term.length === 0) return 0;

  let count = 0;
  for (let idx = text.indexOf(term); idx !== -1; idx = text.indexOf(term, idx + term.length)) {
    count++;
  }
  return count;
}
36
+
37
/**
 * Keyword search engine backed by one `index.json` file per entry.
 *
 * `buildIndex` extracts the title, the h1–h4 headings and a 300-character body
 * snippet from every HTML file and writes them to `<DATA_ROOT>/<entryId>/index.json`.
 * Searching loads that file and ranks its records with a TF-IDF style score in
 * which title matches weigh more than heading matches, which weigh more than
 * snippet matches.
 */
export class JsonSearchEngine implements ISearchEngine {
  readonly capability = 'json' as const;

  constructor(private readonly entryReadModel: IEntryReadModel) {}

  /**
   * Parses every HTML file and (re)writes the JSON index of the entry.
   *
   * Files that cannot be read or parsed are reported through `log` and skipped.
   *
   * @param entryId - Entry whose index is written.
   * @param htmlFiles - Files to index.
   * @param log - Progress/warning sink.
   */
  async buildIndex(entryId: string, htmlFiles: HtmlFile[], log: (msg: string) => void): Promise<void> {
    log(`Building search index for ${htmlFiles.length} files`);
    const indexPath = this.indexPathFor(entryId);
    const index: SearchResult[] = [];

    for (const file of htmlFiles) {
      try {
        const root = parse(fs.readFileSync(file.fullPath, 'utf-8'));

        // Fall back from <title> to the first <h1> to the file name.
        const title = root.querySelector('title')?.text.trim()
          || root.querySelector('h1')?.text.trim()
          || path.basename(file.relativePath, '.html');

        const headings = root
          .querySelectorAll('h1, h2, h3, h4')
          .map((el) => el.text.trim())
          .filter((text) => text.length > 0);

        const bodyText = root.querySelector('body')?.text.replace(/\s+/g, ' ').trim() ?? '';

        index.push({
          path: file.relativePath,
          title,
          headings,
          snippet: bodyText.slice(0, 300),
        });
      } catch (err) {
        const reason = err instanceof Error ? err.message : String(err);
        log(` Warning: could not parse ${file.relativePath}: ${reason}`);
      }
    }

    fs.mkdirSync(path.dirname(indexPath), { recursive: true });
    fs.writeFileSync(indexPath, JSON.stringify(index, null, 2), 'utf-8');
    log(`Search index written to ${indexPath} with ${index.length} entries`);
  }

  /**
   * Searches the index of a single entry.
   *
   * @returns Up to `limit` records, best match first; `[]` when the entry has no index file.
   * @throws When the index file exists but cannot be read or is not valid JSON.
   */
  async search(entryId: string, query: string, limit = 20): Promise<SearchResult[]> {
    return this.scoreAndFilter(this.loadIndex(entryId), query, limit);
  }

  /**
   * Searches every ready entry and merges the hits into one list ordered by score.
   *
   * At most 10 hits are taken from each entry before merging.
   */
  async globalSearch(query: string, limit = 30): Promise<GlobalSearchResult[]> {
    const readyEntries = await this.entryReadModel.listReadyEntries();

    const merged: Array<{ result: GlobalSearchResult; score: number }> = [];
    for (const entry of readyEntries) {
      // Rank with scores attached: the public `search` result carries no score,
      // so sorting its output across entries would have nothing to sort on.
      for (const { item, score } of this.rank(this.loadIndex(entry.id), query, 10)) {
        merged.push({
          result: {
            ...item,
            entryId: entry.id,
            entryName: entry.name,
            entryVersion: entry.version,
          },
          score,
        });
      }
    }

    // Array.prototype.sort is stable, so equally scored hits keep entry order.
    merged.sort((a, b) => b.score - a.score);
    return merged.slice(0, limit).map(({ result }) => result);
  }

  /** Ranks `index` against `query` and returns the records without their scores. */
  private scoreAndFilter(index: SearchResult[], query: string, maxResults: number): SearchResult[] {
    return this.rank(index, query, maxResults).map(({ item }) => item);
  }

  /**
   * Scores every record against the non-stop-word terms of `query`.
   *
   * When the query contains no usable term, the first `maxResults` records are
   * returned unranked with a score of 0. Otherwise only records with a positive
   * score are returned, best first.
   */
  private rank(
    index: SearchResult[],
    query: string,
    maxResults: number,
  ): Array<{ item: SearchResult; score: number }> {
    const terms = query
      .toLowerCase()
      .split(/\s+/)
      .filter((t) => t.length > 0 && !STOP_WORDS.has(t));

    if (terms.length === 0) {
      return index.slice(0, maxResults).map((item) => ({ item, score: 0 }));
    }

    // Lower-case every record once instead of once per term.
    const docs = index.map((item) => ({
      item,
      title: item.title.toLowerCase(),
      headings: item.headings.map((h) => h.toLowerCase()),
      snippet: item.snippet.toLowerCase(),
    }));

    const totalDocs = docs.length;
    const docFreq = new Map<string, number>();
    for (const term of terms) {
      let matches = 0;
      for (const doc of docs) {
        if (
          doc.title.includes(term) ||
          doc.headings.some((h) => h.includes(term)) ||
          doc.snippet.includes(term)
        ) {
          matches++;
        }
      }
      docFreq.set(term, matches);
    }

    return docs
      .map((doc) => {
        let score = 0;

        for (const term of terms) {
          // Smoothed IDF that stays strictly positive. The unsmoothed form
          // log(N / (1 + df)) is <= 0 whenever a term occurs in (almost) every
          // document, which made matches in small indexes score <= 0 and vanish.
          const idf = Math.log(1 + totalDocs / (1 + (docFreq.get(term) ?? 0)));

          const titleCount = countOccurrences(doc.title, term);
          if (titleCount > 0) score += titleCount * idf * 10;

          const headingCount = doc.headings.filter((h) => h.includes(term)).length;
          if (headingCount > 0) score += Math.min(headingCount, 5) * idf * 3;

          const snippetCount = countOccurrences(doc.snippet, term);
          if (snippetCount > 0) score += Math.log(1 + snippetCount) * idf;
        }

        // Bonus when the title alone covers the whole query.
        if (terms.every((t) => doc.title.includes(t))) score += 20 * terms.length;

        return { item: doc.item, score };
      })
      .filter(({ score }) => score > 0)
      .sort((a, b) => b.score - a.score)
      .slice(0, maxResults);
  }

  /** Location of the JSON index file of an entry. */
  private indexPathFor(entryId: string): string {
    return path.join(DATA_ROOT, entryId, 'index.json');
  }

  /** Reads the index of an entry; a missing file yields an empty index. */
  private loadIndex(entryId: string): SearchResult[] {
    let raw: string;
    try {
      raw = fs.readFileSync(this.indexPathFor(entryId), 'utf-8');
    } catch (err) {
      if ((err as { code?: string }).code === 'ENOENT') return [];
      throw err;
    }
    return JSON.parse(raw) as SearchResult[];
  }
}
@@ -0,0 +1,23 @@
1
/**
 * Lazy wrapper around the `@dockit/embeddings` module.
 *
 * The module is imported and configured on the first call to `embed`, so
 * constructing the service is free. Initialisation is shared between
 * concurrent callers and retried on the next call if it fails.
 */
export class EmbeddingService {
  private initialized = false;
  private embedFn: ((texts: string[]) => Promise<number[][]>) | null = null;
  /** In-flight (or completed) initialisation; `null` before the first call and after a failure. */
  private initPromise: Promise<void> | null = null;

  /**
   * Ensures the embeddings module is loaded exactly once.
   *
   * The promise is stored before any `await`, so callers that arrive while the
   * import is still pending join it instead of importing and configuring again.
   */
  private init(): Promise<void> {
    if (this.initialized) return Promise.resolve();
    if (this.initPromise) return this.initPromise;

    const pending = this.load().catch((err: unknown) => {
      // Forget the failed attempt so a later call can try again.
      this.initPromise = null;
      throw err;
    });
    this.initPromise = pending;
    return pending;
  }

  /** Imports and configures the embeddings module and captures its `embed` function. */
  private async load(): Promise<void> {
    const mod = await import('@dockit/embeddings');
    // Configure for bundled offline mode by default.
    // env.cacheDir defaults to <package>/model/; allowRemoteModels defaults to true
    // (permits download if model not yet cached). For air-gapped environments,
    // call mod.configure({ offline: true }) before first embed().
    mod.configure();
    this.embedFn = mod.embed;
    this.initialized = true;
  }

  /**
   * Embeds a batch of texts.
   *
   * @param texts - Texts to embed.
   * @returns One vector per input text, in input order (as produced by the
   *   underlying module); `[]` for an empty batch, without loading the module.
   * @throws When the embeddings module cannot be loaded.
   */
  async embed(texts: string[]): Promise<number[][]> {
    if (texts.length === 0) return [];

    await this.init();
    if (!this.embedFn) throw new Error('Embedding service not initialized');
    return this.embedFn(texts);
  }
}
@@ -0,0 +1,480 @@
1
+ import path from 'node:path';
2
+ import fs from 'node:fs';
3
+ import { parse, HTMLElement } from 'node-html-parser';
4
+ import type { ISearchEngine } from '../../../core/ports/ISearchEngine.js';
5
+ import type { IEntryReadModel } from '../../../core/ports/IEntryReadModel.js';
6
+ import type { SearchResult, GlobalSearchResult, HtmlFile } from '../../../core/domain/types.js';
7
+ import { DATA_ROOT } from '../../../services/paths.js';
8
+ import { EmbeddingService } from './EmbeddingService.js';
9
+ import type { Connection, Table } from '@lancedb/lancedb';
10
+
11
// Directory passed to `lancedb.connect`; lives under the shared data root.
+ const LANCE_DB_DIR = path.join(DATA_ROOT, '.lancedb');
12
// Length of the zero-filled placeholder vector each row gets before embedding.
// NOTE(review): presumably must equal the embedding model's output size — confirm.
+ const VECTOR_DIM = 384;
13
// Maximum characters of text kept per chunk and per `embedText`.
+ const MAX_EMBED_CHARS = 2000;
14
// Maximum characters of the snippet stored in a row's `content` column.
+ const MAX_SNIPPET_CHARS = 500;
15
// Sections with less accumulated text than this are not emitted as chunks.
+ const MIN_CHUNK_CHARS = 50;
16
// Rank offset in the reciprocal-rank term `1 / (RRF_K + rank + 1)`.
+ const RRF_K = 25;
17
// Multiplier for full-text reciprocal-rank scores when the full-text ranking is not judged confident.
+ const FTS_WEAK_WEIGHT = 0.7;
18
// Multiplier used instead when the full-text ranking is judged confident.
+ const FTS_STRONG_WEIGHT = 2.0;
19
// Full-text hits scoring below this fraction of the top hit's score are dropped.
+ const FTS_MIN_SCORE_RATIO = 0.3;
20
// Top/second full-text score ratio above which the strong weight is used.
+ const FTS_CONFIDENCE_RATIO = 1.3;
21
// Row limit applied to each of the vector and full-text queries.
+ const PARALLEL_QUERY_LIMIT = 40;
22
+
23
/**
 * One heading-delimited section of an HTML page, as produced by `chunkDocument`.
 *
 * - `primaryTitle`: the page title passed to `chunkDocument`.
 * - `sectionTitle`: text of the most recent non-empty heading, or the page
 *   title for content that precedes the first heading.
 * - `text`: the section's text, whitespace-collapsed and truncated to
 *   `MAX_EMBED_CHARS`.
 * - `headingPath`: copy of the heading stack at the moment the chunk was emitted.
 */
+ interface Chunk {
24
+ primaryTitle: string;
25
+ sectionTitle: string;
26
+ text: string;
27
+ headingPath: string[];
28
+ }
29
+
30
/**
 * Row written to the per-entry LanceDB table by `buildIndex`.
 *
 * - `path`: the file's `relativePath`.
 * - `primaryTitle` / `sectionTitle`: page title and section heading of the chunk.
 * - `content`: snippet of the chunk text, at most `MAX_SNIPPET_CHARS` long.
 * - `searchText`: title (twice), section title and chunk text; the column the
 *   full-text index is built on.
 * - `embedText`: same composition truncated to `MAX_EMBED_CHARS`; the text sent
 *   to the embedding service.
 * - `headings`: heading trail joined with ' | '.
 * - `entryId`: entry the row belongs to.
 * - `vector`: zero-filled at creation, then replaced with the embedding.
 */
+ interface LanceDoc {
31
+ path: string;
32
+ primaryTitle: string;
33
+ sectionTitle: string;
34
+ content: string;
35
+ searchText: string;
36
+ embedText: string;
37
+ headings: string;
38
+ entryId: string;
39
+ vector: Float32Array;
40
+ }
41
+
42
/**
 * Row shape read back from the vector and full-text queries in this file.
 *
 * The stored columns are always present. The underscore-prefixed fields are
 * query metadata and depend on the kind of query that produced the row, so all
 * of them are optional; consumers already default them (`?? Infinity`, `?? 0`).
 */
interface LanceDbQueryResult {
  path: string;
  primaryTitle: string;
  sectionTitle: string;
  content: string;
  headings: string;
  entryId: string;
  vector: Float32Array;
  /** Distance to the query vector; expected on nearest-neighbour rows only. */
  _distance?: number;
  /** Relevance score; expected on full-text rows only. */
  _score?: number;
  /** NOTE(review): nothing in this file populates this field — confirm whether any producer sets it. */
  _query?: string;
}
54
+
55
/** Search hit plus the fused score accumulated for it while ranking. */
type FusedResult = { path: string; title: string; headings: string[]; snippet: string; score: number };

/**
 * Hybrid (vector + full-text) search engine backed by LanceDB.
 *
 * Each entry gets its own table. `buildIndex` splits every HTML page into
 * heading-delimited chunks, embeds them and stores one row per chunk. Searching
 * runs a nearest-neighbour query and a full-text query side by side and merges
 * the two rankings with weighted Reciprocal Rank Fusion, returning at most one
 * hit per page.
 */
export class VectorSearchEngine implements ISearchEngine {
  readonly capability = 'vector' as const;
  private embeddingService: EmbeddingService;
  private dbPromise: Promise<Connection> | null = null;
  private entryReadModel: IEntryReadModel;

  constructor(entryReadModel: IEntryReadModel, embeddingService?: EmbeddingService) {
    this.entryReadModel = entryReadModel;
    this.embeddingService = embeddingService ?? new EmbeddingService();
  }

  /**
   * Returns the shared LanceDB connection, opening it on first use.
   *
   * The promise is stored before anything is awaited so concurrent callers
   * share one connection attempt; a failed attempt is forgotten so that the
   * next call can retry.
   */
  private getDb(): Promise<Connection> {
    if (this.dbPromise) return this.dbPromise;

    const pending = import('@lancedb/lancedb').then((lancedb) => lancedb.connect(LANCE_DB_DIR));
    this.dbPromise = pending;
    pending.catch(() => {
      if (this.dbPromise === pending) this.dbPromise = null;
    });
    return pending;
  }

  /**
   * Rebuilds the table of an entry from its HTML files.
   *
   * Files that cannot be read or parsed are logged and skipped. Failure to
   * create the vector or full-text index is logged as a warning and does not
   * fail the build.
   *
   * @param entryId - Entry whose table is rebuilt.
   * @param htmlFiles - Files to index.
   * @param log - Progress/warning sink.
   * @throws When embedding fails or the table cannot be created.
   */
  async buildIndex(entryId: string, htmlFiles: HtmlFile[], log: (msg: string) => void): Promise<void> {
    log(`Building vector search index for ${htmlFiles.length} files`);
    const db = await this.getDb();
    const tableName = this.sanitizeTableName(entryId);

    try {
      const names = await db.tableNames();
      if (names.includes(tableName)) {
        await db.dropTable(tableName);
        log(`Dropped existing table ${tableName}`);
      }
    } catch (err) {
      // Best effort: the table is created in 'overwrite' mode below anyway.
      log(` Warning: could not drop existing table ${tableName}: ${this.describeError(err)}`);
    }

    const docs: LanceDoc[] = [];
    for (const file of htmlFiles) {
      try {
        for (const doc of this.buildDocsForFile(entryId, file)) docs.push(doc);
      } catch (err) {
        log(` Warning: could not parse ${file.relativePath}: ${this.describeError(err)}`);
      }
    }

    if (docs.length === 0) {
      log('No documents to index');
      return;
    }

    log(`Created ${docs.length} chunks across ${htmlFiles.length} files`);

    // Embed in batches and replace the placeholder vectors in place.
    const batchSize = 32;
    const totalChunks = docs.length;
    for (let start = 0; start < totalChunks; start += batchSize) {
      const batch = docs.slice(start, start + batchSize);
      const embeddings = await this.embeddingService.embed(batch.map((d) => d.embedText));
      if (embeddings.length !== batch.length) {
        // Writing rows with missing vectors would corrupt the table; fail loudly instead.
        throw new Error(`Embedding service returned ${embeddings.length} vectors for ${batch.length} texts`);
      }
      batch.forEach((doc, j) => {
        doc.vector = new Float32Array(embeddings[j]);
      });
      if (start % 128 === 0 || start + batchSize >= totalChunks) {
        log(`Embedded ${Math.min(start + batchSize, totalChunks)}/${totalChunks} chunks`);
      }
    }

    const lancedb = await import('@lancedb/lancedb');

    // LanceDB types require Record<string, unknown> for createTable; Float32Array vectors don't satisfy this
    const table = await db.createTable(tableName, docs as any[], {
      mode: 'overwrite',
    });
    log(`Created table ${tableName} with ${docs.length} rows`);

    // Vector index with cosine distance.
    try {
      await table.createIndex('vector', {
        config: lancedb.Index.ivfPq({ distanceType: 'cosine' }),
      });
      log(`Created vector index (cosine) on ${tableName}`);
    } catch (err) {
      log(` Warning: could not create vector index: ${this.describeError(err)}`);
    }

    // Full-text index on searchText (which includes the title for better keyword matching).
    try {
      await table.createIndex('searchText', {
        config: lancedb.Index.fts(),
      });
      log(`Created FTS index on ${tableName}`);
    } catch (err) {
      log(` Warning: could not create FTS index: ${this.describeError(err)}`);
    }
  }

  /**
   * Searches the table of a single entry.
   *
   * @returns Up to `limit` hits, one per page, best first; `[]` when the
   *   entry's table cannot be opened.
   */
  async search(entryId: string, query: string, limit = 20): Promise<SearchResult[]> {
    const db = await this.getDb();

    let table: Table;
    try {
      table = await db.openTable(this.sanitizeTableName(entryId));
    } catch {
      return [];
    }

    return this.hybridSearch(table, query, limit);
  }

  /**
   * Searches every ready entry in parallel and merges the hits.
   *
   * Per-entry rankings are interleaved by rank (every entry's best hit first,
   * then every entry's second hit, ...) because fused scores are not exposed
   * on `SearchResult`. Entries whose search fails contribute no hits.
   *
   * @returns Up to `limit` hits, unique per entry and page.
   */
  async globalSearch(query: string, limit = 30): Promise<GlobalSearchResult[]> {
    const db = await this.getDb();
    const readyEntries = await this.entryReadModel.listReadyEntries();

    if (readyEntries.length === 0) return [];

    // At least 5 hits per entry, and enough to fill `limit` when there are few entries.
    const perEntry = Math.max(5, Math.ceil(limit / readyEntries.length));

    // Embed the query once and share the vector between all entries.
    const queryVectorPromise = this.embedQuery(query);

    const entryResults = await Promise.all(
      readyEntries.map(async (entry): Promise<GlobalSearchResult[]> => {
        try {
          // Awaited first so that every entry observes an embedding failure
          // inside its own try block.
          const queryVector = await queryVectorPromise;
          const table = await db.openTable(this.sanitizeTableName(entry.id));
          const results = await this.hybridSearch(table, query, perEntry, queryVector);
          return results.map((r) => ({
            ...r,
            entryId: entry.id,
            entryName: entry.name,
            entryVersion: entry.version,
          }));
        } catch {
          return [];
        }
      })
    );

    const merged: GlobalSearchResult[] = [];
    const seen = new Set<string>();
    const deepest = Math.max(0, ...entryResults.map((results) => results.length));
    for (let rank = 0; rank < deepest && merged.length < limit; rank++) {
      for (const results of entryResults) {
        if (rank >= results.length || merged.length >= limit) continue;
        const hit = results[rank];
        // The same relative path can exist in several entries, so the entry id is part of the key.
        const key = `${hit.entryId}\u0000${hit.path}`;
        if (seen.has(key)) continue;
        seen.add(key);
        merged.push(hit);
      }
    }
    return merged;
  }

  /**
   * Runs the vector and full-text queries against one table and merges them.
   *
   * A query that fails is treated as having returned no rows, so the other
   * query's ranking is used on its own.
   *
   * @param queryVector - Pre-computed embedding of `query`; computed here when omitted.
   */
  private async hybridSearch(
    table: Table,
    query: string,
    limit: number,
    queryVector?: Float32Array,
  ): Promise<SearchResult[]> {
    const vector = queryVector ?? (await this.embedQuery(query));

    const [vecOutcome, ftsOutcome] = await Promise.allSettled([
      table
        .query()
        .nearestTo(vector)
        .distanceType('cosine')
        .limit(PARALLEL_QUERY_LIMIT)
        .toArray(),
      table
        .query()
        .fullTextSearch(query, { columns: ['searchText'] })
        .limit(PARALLEL_QUERY_LIMIT)
        .toArray(),
    ]);

    const vec = vecOutcome.status === 'fulfilled' ? (vecOutcome.value as LanceDbQueryResult[]) : [];
    const fts = ftsOutcome.status === 'fulfilled' ? (ftsOutcome.value as LanceDbQueryResult[]) : [];

    if (vec.length === 0 && fts.length === 0) return [];

    // Only one side produced rows: use its ranking directly.
    if (vec.length === 0 || fts.length === 0) {
      const rows = vec.length === 0 ? fts : vec;
      return this.deduplicateByPath(rows.map((r) => this.toSearchResult(r))).slice(0, limit);
    }

    return this.hybridFuse(vec, fts, limit, query);
  }

  /**
   * Merges a vector ranking and a full-text ranking with weighted Reciprocal Rank Fusion.
   *
   * @param query - Original query text, used for the title-match boost.
   */
  private hybridFuse(
    vecResults: LanceDbQueryResult[],
    ftsResults: LanceDbQueryResult[],
    limit: number,
    query = '',
  ): SearchResult[] {
    // Keep only the best chunk per page BEFORE fusing, best first.
    const dedupVec = this.dedupBest(vecResults, (r) => r._distance ?? Infinity, 'asc');
    let dedupFts = this.dedupBest(ftsResults, (r) => r._score ?? 0, 'desc');

    // Drop full-text hits that are far weaker than the top hit.
    if (dedupFts.length > 0) {
      const minScore = (dedupFts[0]._score ?? 0) * FTS_MIN_SCORE_RATIO;
      dedupFts = dedupFts.filter((r) => (r._score ?? 0) >= minScore);
    }

    // Dynamic full-text weight: a clear gap between #1 and #2 (or a single
    // surviving hit) means the keyword match is confident; otherwise rely
    // more on the vector ranking.
    let ftsWeight = FTS_WEAK_WEIGHT;
    if (dedupFts.length >= 2) {
      const maxScore = dedupFts[0]._score ?? 0;
      const secondScore = dedupFts[1]._score ?? 0;
      if (secondScore > 0 && maxScore / secondScore > FTS_CONFIDENCE_RATIO) {
        ftsWeight = FTS_STRONG_WEIGHT;
      }
    } else if (dedupFts.length === 1) {
      ftsWeight = FTS_STRONG_WEIGHT;
    }

    const toTerms = (text: string): string[] =>
      text.toLowerCase().split(/\s+/).filter((t) => t.length > 2);
    const queryTerms = toTerms(query);

    const scores = new Map<string, FusedResult>();

    dedupVec.forEach((r, rank) => {
      this.addScore(scores, r.path, r.primaryTitle || r.sectionTitle, r.headings, r.content, 1 / (RRF_K + rank + 1));
    });

    dedupFts.forEach((r, rank) => {
      let rrfScore = ftsWeight / (RRF_K + rank + 1);

      // Title match boost: +50% when any query term (longer than 2 chars)
      // appears in the section or page title. The terms come from the query
      // itself; the row's `_query` field is only a fallback.
      const terms = queryTerms.length > 0 ? queryTerms : toTerms(r._query ?? '');
      const sectionTitle = (r.sectionTitle || '').toLowerCase();
      const primaryTitle = (r.primaryTitle || '').toLowerCase();
      if (terms.some((t) => sectionTitle.includes(t) || primaryTitle.includes(t))) {
        rrfScore *= 1.5;
      }

      this.addScore(scores, r.path, r.primaryTitle || r.sectionTitle, r.headings, r.content, rrfScore);
    });

    return [...scores.values()]
      .sort((a, b) => b.score - a.score)
      .slice(0, limit)
      .map((fused) => ({
        path: fused.path,
        title: fused.title,
        headings: fused.headings,
        snippet: fused.snippet,
      }));
  }

  /**
   * Adds `score` to the page's fused score, creating the record on first sight.
   *
   * When the page is already present, the later contribution (the full-text
   * pass runs after the vector pass) also replaces title, headings and snippet,
   * so the displayed chunk is the one more likely to contain the keywords.
   */
  private addScore(
    map: Map<string, FusedResult>,
    path: string,
    title: string,
    headings: string,
    snippet: string,
    score: number,
  ): void {
    const headingList = headings ? headings.split(' | ') : [];
    const current = map.get(path);
    if (!current) {
      map.set(path, { path, title, headings: headingList, snippet, score });
      return;
    }

    current.score += score;
    if (score > 0 && snippet) {
      current.snippet = snippet;
      current.title = title;
      // Keep the heading trail consistent with the chunk the snippet came from.
      current.headings = headingList;
    }
  }

  /**
   * Keeps the best-scoring item per `path` and returns the survivors best first.
   *
   * @param order - `'asc'` when lower scores are better, `'desc'` when higher are.
   */
  private dedupBest<T extends { path: string }>(
    results: T[],
    scoreFn: (r: T) => number,
    order: 'asc' | 'desc',
  ): T[] {
    const best = new Map<string, { item: T; score: number }>();
    for (const r of results) {
      const s = scoreFn(r);
      const existing = best.get(r.path);
      const improves = existing === undefined || (order === 'asc' ? s < existing.score : s > existing.score);
      if (improves) best.set(r.path, { item: r, score: s });
    }

    // Sort explicitly so callers may rely on index 0 being the best hit even
    // if the input was not ordered. Comparison avoids subtraction because
    // scores may be Infinity.
    const direction = order === 'asc' ? 1 : -1;
    return [...best.values()]
      .sort((a, b) => (a.score === b.score ? 0 : a.score < b.score ? -direction : direction))
      .map((v) => v.item);
  }

  /** Keeps the first item seen for every `path`, preserving order. */
  private deduplicateByPath<T extends { path: string }>(results: T[]): T[] {
    const seen = new Set<string>();
    return results.filter((r) => {
      if (seen.has(r.path)) return false;
      seen.add(r.path);
      return true;
    });
  }

  /**
   * Maps an entry id to a table name by replacing every character outside
   * `[a-zA-Z0-9_]` with `_`. Ids that differ only in such characters map to
   * the same table name.
   */
  private sanitizeTableName(entryId: string): string {
    return entryId.replace(/[^a-zA-Z0-9_]/g, '_');
  }

  /** Embeds the query text and returns it as a `Float32Array`. */
  private async embedQuery(query: string): Promise<Float32Array> {
    const [embedding] = await this.embeddingService.embed([query]);
    return new Float32Array(embedding ?? []);
  }

  /** Converts a table row into the public search result shape. */
  private toSearchResult(r: LanceDbQueryResult): SearchResult {
    return {
      path: r.path,
      title: r.primaryTitle || r.sectionTitle,
      headings: r.headings ? r.headings.split(' | ') : [],
      snippet: r.content,
    };
  }

  /** Human-readable message for a caught value of unknown type. */
  private describeError(err: unknown): string {
    return err instanceof Error ? err.message : String(err);
  }

  /**
   * Reads one HTML file and turns it into table rows with placeholder vectors:
   * one row per chunk, or a single whole-page row when no chunk was produced.
   */
  private buildDocsForFile(entryId: string, file: HtmlFile): LanceDoc[] {
    const root = parse(fs.readFileSync(file.fullPath, 'utf-8'));
    const collapse = (text: string): string => text.replace(/\s+/g, ' ').trim();

    const primaryTitle = root.querySelector('title')?.text.trim()
      || root.querySelector('h1')?.text.trim()
      || path.basename(file.relativePath, '.html');

    const chunks = chunkDocument(root, primaryTitle);

    if (chunks.length === 0) {
      // No sections found, treat whole document as one chunk.
      const bodyText = collapse(root.querySelector('body')?.text ?? '');
      const searchText = `${primaryTitle}. ${primaryTitle}. ${bodyText}`;
      return [
        {
          path: file.relativePath,
          primaryTitle,
          sectionTitle: primaryTitle,
          content: bodyText.substring(0, MAX_SNIPPET_CHARS),
          searchText,
          embedText: searchText.substring(0, MAX_EMBED_CHARS),
          headings: primaryTitle,
          entryId,
          vector: new Float32Array(VECTOR_DIM),
        },
      ];
    }

    return chunks.map((chunk) => {
      const text = collapse(chunk.text);
      const searchText = `${primaryTitle}. ${primaryTitle}. ${chunk.sectionTitle}. ${text}`;

      // The heading path may already end with the section's own heading;
      // append the section title only when it does not, to avoid "A | B | B".
      const lastHeading = chunk.headingPath[chunk.headingPath.length - 1];
      const trail = lastHeading === chunk.sectionTitle
        ? chunk.headingPath
        : [...chunk.headingPath, chunk.sectionTitle];

      return {
        path: file.relativePath,
        primaryTitle,
        sectionTitle: chunk.sectionTitle,
        content: text.substring(0, MAX_SNIPPET_CHARS),
        searchText,
        embedText: searchText.substring(0, MAX_EMBED_CHARS),
        headings: trail.join(' | '),
        entryId,
        vector: new Float32Array(VECTOR_DIM),
      };
    });
  }
}
418
+
419
/**
 * Splits the `<body>` of a parsed HTML page into heading-delimited chunks.
 *
 * The body is walked in document order. Every h1–h4 closes the current section
 * and opens a new one. Any other element that contains a heading is descended
 * into, so its content is attributed to the right section; an element without
 * headings contributes its whole text exactly once. Sections whose text is
 * shorter than `MIN_CHUNK_CHARS` are dropped.
 *
 * @param root - Parsed document.
 * @param pageTitle - Page title; used as the section title for content that
 *   precedes the first heading and as a stand-in for headings without text.
 * @returns Chunks in document order; `[]` when the document has no `<body>`.
 */
function chunkDocument(root: ReturnType<typeof parse>, pageTitle: string): Chunk[] {
  const chunks: Chunk[] = [];
  const body = root.querySelector('body');
  if (!body) return chunks;

  const headingSelector = 'h1, h2, h3, h4';
  const headingTag = /^h([1-4])$/;
  // Non-content subtrees that are skipped when met during the walk.
  const skippedTags = new Set(['script', 'style', 'noscript', 'template']);
  // nodeType of text nodes in node-html-parser.
  const TEXT_NODE = 3;

  // Open headings with their levels, outermost first. Levels are stored so
  // that popping is correct even when the document skips levels (h1 -> h3).
  const headingStack: Array<{ level: number; text: string }> = [];
  let currentSectionHeading = pageTitle;
  let currentText = '';

  /** Emits the accumulated section if it is long enough, then resets the buffer. */
  const flush = (): void => {
    if (currentText.trim().length >= MIN_CHUNK_CHARS) {
      chunks.push({
        primaryTitle: pageTitle,
        sectionTitle: currentSectionHeading,
        text: currentText.replace(/\s+/g, ' ').trim().substring(0, MAX_EMBED_CHARS),
        headingPath: headingStack.map((h) => h.text),
      });
    }
    currentText = '';
  };

  const append = (raw: string | undefined): void => {
    const text = raw?.trim();
    if (text) currentText += ' ' + text;
  };

  const visit = (parent: HTMLElement): void => {
    for (const node of parent.childNodes) {
      if (!(node instanceof HTMLElement)) {
        // Bare text between elements belongs to the current section; comments are ignored.
        if (node.nodeType === TEXT_NODE) append(node.text);
        continue;
      }

      const tagName = node.tagName?.toLowerCase() ?? '';
      if (skippedTags.has(tagName)) continue;

      const headingMatch = headingTag.exec(tagName);
      if (headingMatch) {
        flush();

        const level = Number(headingMatch[1]);
        const headingText = node.text.trim();
        currentSectionHeading = headingText || currentSectionHeading;

        // Close every open heading at the same or a deeper level.
        while (headingStack.length > 0 && headingStack[headingStack.length - 1].level >= level) {
          headingStack.pop();
        }
        headingStack.push({ level, text: headingText || pageTitle });
        continue;
      }

      if (node.querySelector(headingSelector)) {
        // The element spans several sections: handle its children one by one
        // instead of taking its text as a whole.
        visit(node);
      } else {
        append(node.text);
      }
    }
  };

  visit(body);
  flush();

  return chunks;
}
@@ -0,0 +1,14 @@
1
+ import path from 'node:path';
2
+ import type { ISourceProcessor } from '../../core/ports/ISourceProcessor.js';
3
+ import type { Source, AntoraSourceConfig } from '../../core/domain/types.js';
4
+ import { buildAntoraSource } from '../../services/antora.js';
5
+
6
/**
 * Source processor for sources of type `antora`.
 *
 * Delegates the build to `buildAntoraSource`, giving it a working directory of
 * `<entryDir>/antora/<source.id>`, and returns whatever that call resolves to.
 */
export class AntoraSourceProcessor implements ISourceProcessor {
  readonly sourceType = 'antora' as const;

  /**
   * @param source - Source whose `config` is treated as an Antora configuration.
   * @param _sourceDir - Unused by this processor.
   * @param entryDir - Directory of the entry; the working directory is created beneath it.
   * @param entryId - Forwarded to `buildAntoraSource`.
   * @param log - Forwarded to `buildAntoraSource`.
   */
  async process(
    source: Source,
    _sourceDir: string,
    entryDir: string,
    entryId: string,
    log: (msg: string) => void,
  ): Promise<string> {
    const antoraConfig = source.config as AntoraSourceConfig;
    const antoraWorkDir = path.join(entryDir, 'antora', source.id);

    return buildAntoraSource(antoraConfig, entryId, antoraWorkDir, log);
  }
}
@@ -0,0 +1,12 @@
1
+ import type { ISourceProcessor } from '../../core/ports/ISourceProcessor.js';
2
+ import type { Source, AsciidocSourceConfig } from '../../core/domain/types.js';
3
+ import { buildAsciidocSource } from '../../services/asciidoc.js';
4
+
5
/**
 * Source processor for sources of type `asciidoc`.
 *
 * Runs `buildAsciidocSource` against the source directory and then reports
 * that same directory as the result.
 */
export class AsciidocSourceProcessor implements ISourceProcessor {
  readonly sourceType = 'asciidoc' as const;

  /**
   * @param source - Source whose `config` is treated as an AsciiDoc configuration.
   * @param sourceDir - Directory handed to `buildAsciidocSource`; also the return value.
   * @param _entryDir - Unused by this processor.
   * @param _entryId - Unused by this processor.
   * @param log - Forwarded to `buildAsciidocSource`.
   */
  async process(
    source: Source,
    sourceDir: string,
    _entryDir: string,
    _entryId: string,
    log: (msg: string) => void,
  ): Promise<string> {
    const asciidocConfig = source.config as AsciidocSourceConfig;
    await buildAsciidocSource(asciidocConfig, sourceDir, log);

    return sourceDir;
  }
}