@lon-ask/dockit 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +674 -0
- package/README.md +496 -0
- package/SKILL.md +154 -0
- package/apps/client/dist/assets/index-CqOXxsEZ.js +240 -0
- package/apps/client/dist/assets/index-DwvaANnI.css +1 -0
- package/apps/client/dist/index.html +13 -0
- package/apps/server/src/core/domain/entry.ts +22 -0
- package/apps/server/src/core/domain/errors.ts +27 -0
- package/apps/server/src/core/domain/knowledge-graph.ts +51 -0
- package/apps/server/src/core/domain/types.ts +168 -0
- package/apps/server/src/core/ports/IBuildRepository.ts +7 -0
- package/apps/server/src/core/ports/IDocumentNormalizer.ts +6 -0
- package/apps/server/src/core/ports/IDocumentStore.ts +4 -0
- package/apps/server/src/core/ports/IEntryReadModel.ts +9 -0
- package/apps/server/src/core/ports/IEntryRepository.ts +11 -0
- package/apps/server/src/core/ports/IKnowledgeGraph.ts +10 -0
- package/apps/server/src/core/ports/IPathResolver.ts +3 -0
- package/apps/server/src/core/ports/ISearchEngine.ts +9 -0
- package/apps/server/src/core/ports/ISourceProcessor.ts +7 -0
- package/apps/server/src/core/ports/ISourceRepository.ts +11 -0
- package/apps/server/src/core/usecases/BuildUseCase.ts +98 -0
- package/apps/server/src/core/usecases/ConfigUseCase.ts +64 -0
- package/apps/server/src/core/usecases/SearchUseCase.ts +16 -0
- package/apps/server/src/index.ts +98 -0
- package/apps/server/src/infrastructure/filesystem/FileSystemDocumentStore.ts +27 -0
- package/apps/server/src/infrastructure/graph/GraphSearchDecorator.ts +53 -0
- package/apps/server/src/infrastructure/graph/GraphifyKnowledgeGraph.ts +172 -0
- package/apps/server/src/infrastructure/graph/index.ts +2 -0
- package/apps/server/src/infrastructure/persistence/sqlite/SqliteBuildRepository.ts +34 -0
- package/apps/server/src/infrastructure/persistence/sqlite/SqliteEntryReadModel.ts +17 -0
- package/apps/server/src/infrastructure/persistence/sqlite/SqliteEntryRepository.ts +81 -0
- package/apps/server/src/infrastructure/persistence/sqlite/SqliteSourceRepository.ts +65 -0
- package/apps/server/src/infrastructure/persistence/sqlite/connection.ts +52 -0
- package/apps/server/src/infrastructure/search/SearchEngineFactory.ts +43 -0
- package/apps/server/src/infrastructure/search/json/JsonSearchEngine.ts +164 -0
- package/apps/server/src/infrastructure/search/vector/EmbeddingService.ts +23 -0
- package/apps/server/src/infrastructure/search/vector/VectorSearchEngine.ts +480 -0
- package/apps/server/src/infrastructure/source-processors/AntoraSourceProcessor.ts +14 -0
- package/apps/server/src/infrastructure/source-processors/AsciidocSourceProcessor.ts +12 -0
- package/apps/server/src/infrastructure/source-processors/DocumentNormalizer.ts +16 -0
- package/apps/server/src/infrastructure/source-processors/GithubMarkdownSourceProcessor.ts +12 -0
- package/apps/server/src/infrastructure/source-processors/MavenSourceProcessor.ts +12 -0
- package/apps/server/src/infrastructure/source-processors/PathResolver.ts +6 -0
- package/apps/server/src/infrastructure/source-processors/SourceCodeSourceProcessor.ts +260 -0
- package/apps/server/src/infrastructure/source-processors/ZipSourceProcessor.ts +12 -0
- package/apps/server/src/mcp-http.ts +102 -0
- package/apps/server/src/mcp.ts +432 -0
- package/apps/server/src/routes/build.ts +105 -0
- package/apps/server/src/routes/entries.ts +62 -0
- package/apps/server/src/routes/graph.ts +57 -0
- package/apps/server/src/routes/search.ts +28 -0
- package/apps/server/src/routes/sources.ts +105 -0
- package/apps/server/src/routes/viewer.ts +28 -0
- package/apps/server/src/services/antora.ts +238 -0
- package/apps/server/src/services/asciidoc.ts +221 -0
- package/apps/server/src/services/configLoader.ts +207 -0
- package/apps/server/src/services/githubMarkdown.ts +236 -0
- package/apps/server/src/services/maven.ts +178 -0
- package/apps/server/src/services/normalizer.ts +63 -0
- package/apps/server/src/services/paths.ts +5 -0
- package/apps/server/src/services/textExtractor.ts +49 -0
- package/apps/server/src/services/zip.ts +84 -0
- package/bin/commands/build.ts +85 -0
- package/bin/commands/dev.ts +36 -0
- package/bin/commands/get.ts +36 -0
- package/bin/commands/graph.ts +153 -0
- package/bin/commands/init.ts +170 -0
- package/bin/commands/list.ts +47 -0
- package/bin/commands/mcp.ts +32 -0
- package/bin/commands/search.ts +185 -0
- package/bin/commands/serve.ts +23 -0
- package/bin/commands/status.ts +46 -0
- package/bin/dockit-cli.ts +92 -0
- package/bin/dockit.js +17 -0
- package/bin/utils.ts +85 -0
- package/dockit.yaml +154 -0
- package/package.json +60 -0
- package/scripts/mcp-wrapper.sh +44 -0
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
import path from 'node:path';
|
|
2
|
+
import fs from 'node:fs';
|
|
3
|
+
import { parse } from 'node-html-parser';
|
|
4
|
+
import type { ISearchEngine } from '../../../core/ports/ISearchEngine.js';
|
|
5
|
+
import type { IEntryReadModel } from '../../../core/ports/IEntryReadModel.js';
|
|
6
|
+
import type { SearchResult, GlobalSearchResult, HtmlFile } from '../../../core/domain/types.js';
|
|
7
|
+
import { DATA_ROOT } from '../../../services/paths.js';
|
|
8
|
+
|
|
9
|
+
/**
 * Lower-case words that are removed from a query before scoring.
 *
 * The list is kept as a whitespace-separated block so that it is easy to scan
 * and extend; it is split into a `Set` once at module load for O(1) lookups.
 */
const STOP_WORDS = new Set<string>(
  `
  a an the and or but is are was were be been being
  do does did doing have has had having will would shall
  should can could may might must to of in for on with
  at by from as into through during before after above below
  between out off over under again further then once here
  there when where why how what which who whom this that
  these those i me my myself we our ours ourselves you
  your yours yourself yourselves he him his himself she her
  hers herself it its itself they them their theirs themselves
  not no nor so if about up down just only own same
  than too very some any each every all both few more
  most other such also get got like make made use used
  using create new way need want know tell say said go
  went come see look find give take put set let keep
  work call try ask show think help run move live believe
  `
    .split(/\s+/)
    .filter((word) => word.length > 0),
);
|
|
26
|
+
|
|
27
|
+
/**
 * Counts the non-overlapping occurrences of `term` inside `text`.
 *
 * Matching is case-sensitive and scans left to right; after each hit the scan
 * resumes immediately past the matched substring, so `'aaaa'` contains `'aa'`
 * twice, not three times.
 *
 * @param text - The string to scan.
 * @param term - The substring to look for.
 * @returns The number of non-overlapping matches; `0` when `term` is empty.
 */
function countOccurrences(text: string, term: string): number {
  // An empty needle matches at every index and never advances the cursor,
  // which would otherwise spin forever.
  if (term.length === 0) return 0;

  let count = 0;
  for (
    let idx = text.indexOf(term);
    idx !== -1;
    idx = text.indexOf(term, idx + term.length)
  ) {
    count++;
  }
  return count;
}
|
|
36
|
+
|
|
37
|
+
/**
 * Keyword search engine backed by one `index.json` file per entry, stored at
 * `<DATA_ROOT>/<entryId>/index.json`.
 *
 * Each indexed page contributes its title, its h1-h4 headings and the first
 * 300 characters of body text; only those fields are searched.
 */
export class JsonSearchEngine implements ISearchEngine {
  readonly capability = 'json' as const;

  constructor(private readonly entryReadModel: IEntryReadModel) {}

  /**
   * Parses every HTML file and writes the entry's `index.json`.
   *
   * Files that cannot be read or parsed are reported through `log` and
   * skipped; they do not abort the build.
   *
   * @param entryId - Entry whose index is (re)built.
   * @param htmlFiles - Pages to index.
   * @param log - Progress/warning sink.
   */
  async buildIndex(entryId: string, htmlFiles: HtmlFile[], log: (msg: string) => void): Promise<void> {
    log(`Building search index for ${htmlFiles.length} files`);
    const indexPath = path.join(DATA_ROOT, entryId, 'index.json');
    const index: SearchResult[] = [];

    for (const file of htmlFiles) {
      try {
        const root = parse(fs.readFileSync(file.fullPath, 'utf-8'));

        // `||` (not `??`) on purpose: an empty <title> must fall through to <h1>.
        const title = root.querySelector('title')?.text.trim()
          || root.querySelector('h1')?.text.trim()
          || path.basename(file.relativePath, '.html');

        const headings = root
          .querySelectorAll('h1, h2, h3, h4')
          .map((el) => el.text.trim())
          .filter((text) => text.length > 0);

        const bodyEl = root.querySelector('body');
        const bodyText = bodyEl ? bodyEl.text.replace(/\s+/g, ' ').trim() : '';

        index.push({
          path: file.relativePath,
          title,
          headings,
          snippet: bodyText.slice(0, 300),
        });
      } catch (err) {
        const reason = err instanceof Error ? err.message : String(err);
        log(` Warning: could not parse ${file.relativePath}: ${reason}`);
      }
    }

    fs.mkdirSync(path.dirname(indexPath), { recursive: true });
    fs.writeFileSync(indexPath, JSON.stringify(index, null, 2), 'utf-8');
    log(`Search index written to ${indexPath} with ${index.length} entries`);
  }

  /**
   * Searches a single entry.
   *
   * @param entryId - Entry to search.
   * @param query - Free-text query; stop words are ignored.
   * @param limit - Maximum number of results (default 20).
   * @returns Matching pages, best first; `[]` when the entry has no index.
   * @throws SyntaxError if the entry's `index.json` is not valid JSON.
   */
  async search(entryId: string, query: string, limit = 20): Promise<SearchResult[]> {
    return this.scoreAndFilter(this.loadIndex(entryId), query, limit);
  }

  /**
   * Searches every ready entry and merges the hits into one ranking.
   *
   * Up to 10 hits are taken per entry. The merged list is ordered by the
   * per-entry relevance score; ties keep the entry order returned by the read
   * model.
   *
   * @param query - Free-text query.
   * @param limit - Maximum number of merged results (default 30).
   */
  async globalSearch(query: string, limit = 30): Promise<GlobalSearchResult[]> {
    const readyEntries = await this.entryReadModel.listReadyEntries();

    const scored: Array<{ result: GlobalSearchResult; score: number }> = [];
    for (const entry of readyEntries) {
      // Rank directly instead of going through search(): search() drops the
      // score, which left the cross-entry sort below with nothing to sort on.
      for (const { item, score } of this.rank(this.loadIndex(entry.id), query, 10)) {
        scored.push({
          result: {
            ...item,
            entryId: entry.id,
            entryName: entry.name,
            entryVersion: entry.version,
          },
          score,
        });
      }
    }

    scored.sort((a, b) => b.score - a.score);
    return scored.slice(0, limit).map(({ result }) => result);
  }

  /** Reads an entry's index from disk; a missing or non-array index yields `[]`. */
  private loadIndex(entryId: string): SearchResult[] {
    const indexPath = path.join(DATA_ROOT, entryId, 'index.json');
    if (!fs.existsSync(indexPath)) return [];

    const parsed: unknown = JSON.parse(fs.readFileSync(indexPath, 'utf-8'));
    return Array.isArray(parsed) ? (parsed as SearchResult[]) : [];
  }

  /** Ranks `index` against `query` and returns the top `maxResults` items without scores. */
  private scoreAndFilter(index: SearchResult[], query: string, maxResults: number): SearchResult[] {
    return this.rank(index, query, maxResults).map(({ item }) => item);
  }

  /**
   * Scores every indexed page against the query.
   *
   * Per term: title hits weigh 10x, heading hits 3x (capped at 5 headings) and
   * snippet hits are log-damped, each scaled by the term's IDF. Pages whose
   * title contains every term get an extra `20 * terms.length`.
   *
   * When the query consists only of stop words there is nothing to rank, so
   * the first `maxResults` pages are returned with a score of 0.
   */
  private rank(
    index: SearchResult[],
    query: string,
    maxResults: number,
  ): Array<{ item: SearchResult; score: number }> {
    const terms = query
      .toLowerCase()
      .split(/\s+/)
      .filter((t) => t.length > 0 && !STOP_WORDS.has(t));

    if (terms.length === 0) {
      return index.slice(0, maxResults).map((item) => ({ item, score: 0 }));
    }

    // Lower-case each page once instead of once per term.
    const docs = index.map((item) => {
      const title = item.title.toLowerCase();
      const headings = item.headings.map((h) => h.toLowerCase());
      const snippet = item.snippet.toLowerCase();
      return { item, title, headings, snippet, haystack: [title, ...headings, snippet].join(' ') };
    });

    // Smoothed IDF: log(1 + N / (1 + df)) is always positive. The unsmoothed
    // log(N / (1 + df)) is <= 0 once a term occurs in most pages, which made
    // genuine matches fail the `score > 0` filter (e.g. any one-page index).
    // A Map is used because terms are user input and may collide with
    // Object.prototype keys.
    const totalDocs = docs.length;
    const idfByTerm = new Map<string, number>();
    for (const term of terms) {
      const docFreq = docs.reduce((n, doc) => n + (doc.haystack.includes(term) ? 1 : 0), 0);
      idfByTerm.set(term, Math.log(1 + totalDocs / (1 + docFreq)));
    }

    return docs
      .map(({ item, title, headings, snippet }) => {
        let score = 0;

        for (const term of terms) {
          const idf = idfByTerm.get(term) ?? 0;

          const titleCount = countOccurrences(title, term);
          if (titleCount > 0) score += titleCount * idf * 10;

          const headingCount = headings.filter((h) => h.includes(term)).length;
          if (headingCount > 0) score += Math.min(headingCount, 5) * idf * 3;

          const snippetCount = countOccurrences(snippet, term);
          if (snippetCount > 0) score += Math.log(1 + snippetCount) * idf;
        }

        if (terms.every((t) => title.includes(t))) score += 20 * terms.length;

        return { item, score };
      })
      .filter(({ score }) => score > 0)
      .sort((a, b) => b.score - a.score)
      .slice(0, maxResults);
  }
}
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
/**
 * Lazy wrapper around the `@dockit/embeddings` package.
 *
 * The package is imported and configured on the first `embed()` call, so
 * constructing the service is cheap and does not load the model.
 */
export class EmbeddingService {
  private initialized = false;
  private embedFn: ((texts: string[]) => Promise<number[][]>) | null = null;
  /** In-flight initialisation, shared by concurrent first callers. */
  private initPromise: Promise<void> | null = null;

  /**
   * Ensures the embedding module is loaded exactly once.
   *
   * Concurrent callers await the same promise instead of each importing and
   * configuring the module. A failed attempt is forgotten so that the next
   * call can retry.
   */
  private async init(): Promise<void> {
    if (this.initialized) return;

    if (!this.initPromise) {
      this.initPromise = this.load();
    }

    try {
      await this.initPromise;
    } catch (err) {
      this.initPromise = null;
      throw err;
    }
  }

  /** Imports and configures the embedding module, then records its `embed` function. */
  private async load(): Promise<void> {
    const mod = await import('@dockit/embeddings');
    // Configure for bundled offline mode by default.
    // env.cacheDir defaults to <package>/model/; allowRemoteModels defaults to true
    // (permits download if model not yet cached). For air-gapped environments,
    // call mod.configure({ offline: true }) before first embed().
    mod.configure();
    this.embedFn = mod.embed;
    this.initialized = true;
  }

  /**
   * Embeds a batch of texts.
   *
   * @param texts - Texts to embed.
   * @returns One vector per input text, as returned by the embedding module.
   * @throws Error if the module loaded but exposed no `embed` function, or
   *   whatever the dynamic import / `configure()` call throws.
   */
  async embed(texts: string[]): Promise<number[][]> {
    await this.init();
    if (!this.embedFn) throw new Error('Embedding service not initialized');
    return this.embedFn(texts);
  }
}
|
|
@@ -0,0 +1,480 @@
|
|
|
1
|
+
import path from 'node:path';
|
|
2
|
+
import fs from 'node:fs';
|
|
3
|
+
import { parse, HTMLElement } from 'node-html-parser';
|
|
4
|
+
import type { ISearchEngine } from '../../../core/ports/ISearchEngine.js';
|
|
5
|
+
import type { IEntryReadModel } from '../../../core/ports/IEntryReadModel.js';
|
|
6
|
+
import type { SearchResult, GlobalSearchResult, HtmlFile } from '../../../core/domain/types.js';
|
|
7
|
+
import { DATA_ROOT } from '../../../services/paths.js';
|
|
8
|
+
import { EmbeddingService } from './EmbeddingService.js';
|
|
9
|
+
import type { Connection, Table } from '@lancedb/lancedb';
|
|
10
|
+
|
|
11
|
+
/** Directory of the LanceDB database, a hidden folder under the data root. */
const LANCE_DB_DIR = path.join(DATA_ROOT, '.lancedb');

/** Length of the zero-filled placeholder vector each row gets before embedding. */
const VECTOR_DIM = 384;

/** Upper bound, in characters, on the text sent to the embedding service per chunk. */
const MAX_EMBED_CHARS = 2000;

/** Upper bound, in characters, on the snippet stored in the `content` column. */
const MAX_SNIPPET_CHARS = 500;

/** Sections whose trimmed text is shorter than this are not emitted as chunks. */
const MIN_CHUNK_CHARS = 50;

/** Rank offset `k` in the reciprocal-rank term `1 / (k + rank)`. */
const RRF_K = 25;

/** Weight applied to full-text hits when the top full-text scores are close together. */
const FTS_WEAK_WEIGHT = 0.7;

/** Weight applied to full-text hits when the top hit clearly stands out (or is the only one). */
const FTS_STRONG_WEIGHT = 2.0;

/** Full-text hits scoring below this fraction of the best score are discarded. */
const FTS_MIN_SCORE_RATIO = 0.3;

/** Minimum best/second-best score ratio for full-text results to count as confident. */
const FTS_CONFIDENCE_RATIO = 1.3;

/** Row limit requested from each of the vector and full-text queries. */
const PARALLEL_QUERY_LIMIT = 40;
|
|
22
|
+
|
|
23
|
+
/** One heading-delimited section of a page, as produced by `chunkDocument`. */
interface Chunk {
  /** Title of the page the section belongs to. */
  primaryTitle: string;
  /** Text of the heading that opened the section (the page title before any heading). */
  sectionTitle: string;
  /** Whitespace-collapsed section text, truncated to `MAX_EMBED_CHARS`. */
  text: string;
  /** Snapshot of the heading stack at the moment the section was emitted. */
  headingPath: string[];
}
|
|
29
|
+
|
|
30
|
+
/** Row shape written to an entry's LanceDB table (one row per chunk). */
interface LanceDoc {
  /** Page path relative to the entry bundle. */
  path: string;
  /** Title of the page. */
  primaryTitle: string;
  /** Heading of the section this row represents. */
  sectionTitle: string;
  /** Snippet returned to callers, capped at `MAX_SNIPPET_CHARS`. */
  content: string;
  /** Untruncated title + section text; the column the full-text index is built on. */
  searchText: string;
  /** Text sent to the embedding service, capped at `MAX_EMBED_CHARS`. */
  embedText: string;
  /** Heading trail joined with `' | '`. */
  headings: string;
  /** Entry the page belongs to. */
  entryId: string;
  /** Embedding of `embedText`; zero-filled until the embedding pass overwrites it. */
  vector: Float32Array;
}
|
|
41
|
+
|
|
42
|
+
/** Row shape this module expects back from LanceDB vector and full-text queries. */
interface LanceDbQueryResult {
  /** Page path relative to the entry bundle. */
  path: string;
  /** Title of the page. */
  primaryTitle: string;
  /** Heading of the matched section. */
  sectionTitle: string;
  /** Stored snippet for the matched section. */
  content: string;
  /** Heading trail joined with `' | '`. */
  headings: string;
  /** Entry the page belongs to. */
  entryId: string;
  /** Stored embedding of the row. */
  vector: Float32Array;
  /** Distance to the query vector; read with an `Infinity` fallback, so it may be absent on full-text rows. */
  _distance: number;
  /** Full-text relevance score; treated as 0 when absent. */
  _score?: number;
  /** Query text attached to the row; read by the fusion step, but nothing in this module sets it. */
  _query?: string;
}
|
|
54
|
+
|
|
55
|
+
/** A fused search hit together with its accumulated reciprocal-rank score. */
interface ScoredHit {
  path: string;
  title: string;
  headings: string[];
  snippet: string;
  score: number;
}

/**
 * Hybrid (vector + full-text) search engine backed by LanceDB.
 *
 * Every entry gets its own table. Pages are split into heading-delimited
 * chunks, each chunk is embedded, and queries combine nearest-neighbour and
 * full-text results with weighted Reciprocal Rank Fusion.
 */
export class VectorSearchEngine implements ISearchEngine {
  readonly capability = 'vector' as const;
  private embeddingService: EmbeddingService;
  private dbPromise: Promise<Connection> | null = null;
  private entryReadModel: IEntryReadModel;

  /**
   * @param entryReadModel - Source of the entries that `globalSearch` covers.
   * @param embeddingService - Optional embedding backend; a default
   *   `EmbeddingService` is created when omitted.
   */
  constructor(entryReadModel: IEntryReadModel, embeddingService?: EmbeddingService) {
    this.entryReadModel = entryReadModel;
    this.embeddingService = embeddingService ?? new EmbeddingService();
  }

  /**
   * Returns the shared LanceDB connection, opening it on first use.
   *
   * The promise is stored before any `await`, so concurrent callers share one
   * connection attempt. A failed attempt is cleared so a later call can retry
   * instead of receiving the same rejection forever.
   */
  private getDb(): Promise<Connection> {
    if (!this.dbPromise) {
      const pending: Promise<Connection> = import('@lancedb/lancedb').then((lancedb) =>
        lancedb.connect(LANCE_DB_DIR),
      );
      pending.catch(() => {
        if (this.dbPromise === pending) this.dbPromise = null;
      });
      this.dbPromise = pending;
    }
    return this.dbPromise;
  }

  /**
   * Rebuilds the LanceDB table for an entry.
   *
   * Steps: drop any existing table, chunk every HTML file, embed the chunks in
   * batches of 32, write the table, then create a cosine vector index and a
   * full-text index on `searchText`. Unparseable files and index-creation
   * failures are logged as warnings and do not abort the build.
   *
   * @param entryId - Entry to index; also determines the table name.
   * @param htmlFiles - Pages to index.
   * @param log - Progress/warning sink.
   * @throws Error if the embedding service returns a different number of
   *   vectors than texts, or if embedding / table creation fails.
   */
  async buildIndex(entryId: string, htmlFiles: HtmlFile[], log: (msg: string) => void): Promise<void> {
    log(`Building vector search index for ${htmlFiles.length} files`);
    const db = await this.getDb();
    const tableName = this.sanitizeTableName(entryId);

    try {
      const names = await db.tableNames();
      if (names.includes(tableName)) {
        await db.dropTable(tableName);
        log(`Dropped existing table ${tableName}`);
      }
    } catch (err) {
      // Best effort: createTable below overwrites an existing table anyway.
      log(` Warning: could not drop existing table ${tableName}: ${this.errorText(err)}`);
    }

    const allChunks: LanceDoc[] = [];
    for (const file of htmlFiles) {
      try {
        allChunks.push(...this.rowsForFile(entryId, file));
      } catch (err) {
        log(` Warning: could not parse ${file.relativePath}: ${this.errorText(err)}`);
      }
    }

    if (allChunks.length === 0) {
      log('No documents to index');
      return;
    }

    log(`Created ${allChunks.length} chunks across ${htmlFiles.length} files`);

    // Batch embed all chunks.
    const batchSize = 32;
    const totalChunks = allChunks.length;
    for (let i = 0; i < totalChunks; i += batchSize) {
      const batch = allChunks.slice(i, i + batchSize);
      const embeddings = await this.embeddingService.embed(batch.map((doc) => doc.embedText));
      if (embeddings.length !== batch.length) {
        // A short result would silently leave rows with empty vectors.
        throw new Error(
          `Embedding service returned ${embeddings.length} vectors for ${batch.length} texts`,
        );
      }
      batch.forEach((doc, j) => {
        doc.vector = new Float32Array(embeddings[j]);
      });
      if (i % 128 === 0 || i + batchSize >= totalChunks) {
        log(`Embedded ${Math.min(i + batchSize, totalChunks)}/${totalChunks} chunks`);
      }
    }

    // LanceDB types require Record<string, unknown> for createTable; Float32Array vectors don't satisfy this
    const table = await db.createTable(tableName, allChunks as any[], {
      mode: 'overwrite',
    });
    log(`Created table ${tableName} with ${allChunks.length} rows`);

    const lancedb = await import('@lancedb/lancedb');

    // Vector index with cosine distance.
    try {
      await table.createIndex('vector', {
        config: lancedb.Index.ivfPq({ distanceType: 'cosine' }),
      });
      log(`Created vector index (cosine) on ${tableName}`);
    } catch (err) {
      log(` Warning: could not create vector index: ${this.errorText(err)}`);
    }

    // FTS index on searchText (includes the title for better keyword matching).
    try {
      await table.createIndex('searchText', {
        config: lancedb.Index.fts(),
      });
      log(`Created FTS index on ${tableName}`);
    } catch (err) {
      log(` Warning: could not create FTS index: ${this.errorText(err)}`);
    }
  }

  /**
   * Searches a single entry.
   *
   * @param entryId - Entry to search.
   * @param query - Free-text query.
   * @param limit - Maximum number of results (default 20).
   * @returns At most one hit per page, best first; `[]` when the entry's table
   *   cannot be opened.
   */
  async search(entryId: string, query: string, limit = 20): Promise<SearchResult[]> {
    const db = await this.getDb();

    let table: Table;
    try {
      table = await db.openTable(this.sanitizeTableName(entryId));
    } catch {
      return [];
    }

    return this.hybridSearch(table, query, limit);
  }

  /**
   * Searches all ready entries in parallel and merges their results.
   *
   * Hits carry no comparable score across entries, so the per-entry rankings
   * are interleaved by rank (every entry's best hit first, then every entry's
   * second hit, and so on). An entry whose search fails contributes nothing.
   *
   * @param query - Free-text query.
   * @param limit - Maximum number of merged results (default 30).
   */
  async globalSearch(query: string, limit = 30): Promise<GlobalSearchResult[]> {
    const db = await this.getDb();
    const readyEntries = await this.entryReadModel.listReadyEntries();

    if (readyEntries.length === 0) return [];

    // At least 5 per entry, more when few entries must fill a large limit.
    const perEntry = Math.max(5, Math.ceil(limit / readyEntries.length));

    const entryResults = await Promise.all(
      readyEntries.map(async (entry): Promise<GlobalSearchResult[]> => {
        try {
          const table = await db.openTable(this.sanitizeTableName(entry.id));
          const results = await this.hybridSearch(table, query, perEntry);
          return results.map((r) => ({
            ...r,
            entryId: entry.id,
            entryName: entry.name,
            entryVersion: entry.version,
          }));
        } catch {
          return [];
        }
      }),
    );

    const merged: GlobalSearchResult[] = [];
    const seen = new Set<string>();
    const deepest = entryResults.reduce((max, list) => Math.max(max, list.length), 0);

    for (let rank = 0; rank < deepest && merged.length < limit; rank++) {
      for (const list of entryResults) {
        if (rank >= list.length) continue;
        const hit = list[rank];
        // Key on entry + path: different entries commonly share page paths.
        const key = `${hit.entryId}\u0000${hit.path}`;
        if (seen.has(key)) continue;
        seen.add(key);
        merged.push(hit);
        if (merged.length >= limit) break;
      }
    }

    return merged;
  }

  /**
   * Runs the vector and full-text queries against one table and fuses them.
   *
   * If one of the two queries fails or returns nothing, the other one's
   * ordering is used as is.
   */
  private async hybridSearch(table: Table, query: string, limit: number): Promise<SearchResult[]> {
    const queryEmbedding = await this.embeddingService.embed([query]);
    const queryVector = new Float32Array(queryEmbedding[0]);

    // allSettled: a missing FTS or vector index must not sink the other query.
    const [vecOutcome, ftsOutcome] = await Promise.allSettled([
      table
        .query()
        .nearestTo(queryVector)
        .distanceType('cosine')
        .limit(PARALLEL_QUERY_LIMIT)
        .toArray(),
      table
        .query()
        .fullTextSearch(query, { columns: ['searchText'] })
        .limit(PARALLEL_QUERY_LIMIT)
        .toArray(),
    ]);

    const vec = vecOutcome.status === 'fulfilled' ? (vecOutcome.value as LanceDbQueryResult[]) : [];
    const fts = ftsOutcome.status === 'fulfilled' ? (ftsOutcome.value as LanceDbQueryResult[]) : [];

    if (vec.length === 0 && fts.length === 0) return [];

    if (vec.length === 0 || fts.length === 0) {
      const only = vec.length === 0 ? fts : vec;
      return this.deduplicateByPath(only.map((r) => this.toSearchResult(r))).slice(0, limit);
    }

    return this.hybridFuse(vec, fts, limit, query);
  }

  /**
   * Fuses vector and full-text rankings with weighted Reciprocal Rank Fusion.
   *
   * Full-text hits are first cut at `FTS_MIN_SCORE_RATIO` of the best score,
   * then weighted strongly when the best hit clearly leads (or is alone) and
   * weakly otherwise. A full-text hit whose page or section title contains a
   * query term longer than two characters gets a further 50% boost.
   *
   * @param vecResults - Rows from the nearest-neighbour query.
   * @param ftsResults - Rows from the full-text query.
   * @param limit - Maximum number of fused results.
   * @param query - Original query text, used for the title boost.
   */
  private hybridFuse(
    vecResults: LanceDbQueryResult[],
    ftsResults: LanceDbQueryResult[],
    limit: number,
    query = '',
  ): SearchResult[] {
    // Keep only the best chunk per page before fusing.
    const dedupVec = this.dedupBest(vecResults, (r) => r._distance ?? Infinity, 'asc');
    let dedupFts = this.dedupBest(ftsResults, (r) => r._score ?? 0, 'desc');

    if (dedupFts.length > 0) {
      const minScore = (dedupFts[0]._score ?? 0) * FTS_MIN_SCORE_RATIO;
      dedupFts = dedupFts.filter((r) => (r._score ?? 0) >= minScore);
    }

    // A clear gap between #1 and #2 means FTS is confident; similar scores
    // mean it is not, so the vector ranking should dominate.
    let ftsWeight = FTS_WEAK_WEIGHT;
    if (dedupFts.length >= 2) {
      const maxScore = dedupFts[0]._score ?? 0;
      const secondScore = dedupFts[1]._score ?? 0;
      if (secondScore > 0 && maxScore / secondScore > FTS_CONFIDENCE_RATIO) {
        ftsWeight = FTS_STRONG_WEIGHT;
      }
    } else if (dedupFts.length === 1) {
      ftsWeight = FTS_STRONG_WEIGHT; // Single result = high confidence
    }

    // Taken from the caller's query: rows do not carry the query text, so
    // reading it from the row left the title boost permanently disabled.
    const queryTerms = query
      .toLowerCase()
      .split(/\s+/)
      .filter((t) => t.length > 2);

    const scores = new Map<string, ScoredHit>();

    dedupVec.forEach((r, i) => {
      const rrfScore = 1 / (RRF_K + i + 1);
      this.addScore(scores, r.path, r.primaryTitle || r.sectionTitle, r.headings, r.content, rrfScore);
    });

    dedupFts.forEach((r, i) => {
      let rrfScore = ftsWeight / (RRF_K + i + 1);

      const sectionTitle = (r.sectionTitle || '').toLowerCase();
      const primaryTitle = (r.primaryTitle || '').toLowerCase();
      const titleMatch = queryTerms.some((t) => sectionTitle.includes(t) || primaryTitle.includes(t));
      if (titleMatch) {
        rrfScore *= 1.5;
      }

      this.addScore(scores, r.path, r.primaryTitle || r.sectionTitle, r.headings, r.content, rrfScore);
    });

    return [...scores.values()]
      .sort((a, b) => b.score - a.score)
      .slice(0, limit)
      .map((hit) => ({
        path: hit.path,
        title: hit.title,
        headings: hit.headings,
        snippet: hit.snippet,
      }));
  }

  /**
   * Adds `score` to the page's running total, creating the record on first
   * sight. When a page is seen again (the full-text pass runs second), its
   * title and snippet are replaced, since the full-text chunk is more likely
   * to show the matched keywords.
   */
  private addScore(
    map: Map<string, ScoredHit>,
    docPath: string,
    title: string,
    headings: string,
    snippet: string,
    score: number,
  ): void {
    const current = map.get(docPath);
    if (!current) {
      map.set(docPath, {
        path: docPath,
        title,
        headings: headings ? headings.split(' | ') : [],
        snippet,
        score,
      });
      return;
    }

    current.score += score;
    if (score > 0 && snippet) {
      current.snippet = snippet;
      current.title = title;
    }
  }

  /**
   * Keeps the best-scoring row per page and returns the survivors best first.
   *
   * @param results - Rows to reduce.
   * @param scoreFn - Extracts the comparison key of a row.
   * @param order - `'asc'` when lower is better (distance), `'desc'` when
   *   higher is better (relevance).
   */
  private dedupBest<T extends { path: string }>(
    results: T[],
    scoreFn: (r: T) => number,
    order: 'asc' | 'desc',
  ): T[] {
    const best = new Map<string, { item: T; score: number }>();
    for (const r of results) {
      const s = scoreFn(r);
      const existing = best.get(r.path);
      const isBetter = !existing || (order === 'asc' ? s < existing.score : s > existing.score);
      if (isBetter) best.set(r.path, { item: r, score: s });
    }

    // Callers read element 0 as "the best"; sort so that holds for any input
    // order (the sort is stable, so already-ranked input is left untouched).
    const direction = order === 'asc' ? 1 : -1;
    return [...best.values()]
      .sort((a, b) => {
        if (a.score === b.score) return 0;
        return a.score < b.score ? -direction : direction;
      })
      .map((v) => v.item);
  }

  /** Drops every row whose `path` was already seen, keeping first occurrences in order. */
  private deduplicateByPath<T extends { path: string }>(results: T[]): T[] {
    const seen = new Set<string>();
    return results.filter((r) => {
      if (seen.has(r.path)) return false;
      seen.add(r.path);
      return true;
    });
  }

  /** Maps an entry id to a table name by replacing every character outside `[a-zA-Z0-9_]` with `_`. */
  private sanitizeTableName(entryId: string): string {
    return entryId.replace(/[^a-zA-Z0-9_]/g, '_');
  }

  /** Converts a raw LanceDB row into the public search-result shape. */
  private toSearchResult(r: LanceDbQueryResult): SearchResult {
    return {
      path: r.path,
      title: r.primaryTitle || r.sectionTitle,
      headings: r.headings ? r.headings.split(' | ') : [],
      snippet: r.content,
    };
  }

  /** Extracts a printable message from a caught value. */
  private errorText(err: unknown): string {
    return err instanceof Error ? err.message : String(err);
  }

  /**
   * Reads one HTML file and turns it into table rows with placeholder vectors.
   *
   * A page with no usable sections is indexed as a single row covering the
   * whole body.
   */
  private rowsForFile(entryId: string, file: HtmlFile): LanceDoc[] {
    const root = parse(fs.readFileSync(file.fullPath, 'utf-8'));

    // `||` (not `??`) on purpose: an empty <title> must fall through to <h1>.
    const primaryTitle = root.querySelector('title')?.text.trim()
      || root.querySelector('h1')?.text.trim()
      || path.basename(file.relativePath, '.html');

    const chunks = chunkDocument(root, primaryTitle);

    if (chunks.length === 0) {
      const bodyEl = root.querySelector('body');
      const bodyText = bodyEl ? bodyEl.text.replace(/\s+/g, ' ').trim() : '';
      // The title is repeated to give it extra weight in both indexes.
      const searchText = `${primaryTitle}. ${primaryTitle}. ${bodyText}`;

      return [
        {
          path: file.relativePath,
          primaryTitle,
          sectionTitle: primaryTitle,
          content: bodyText.substring(0, MAX_SNIPPET_CHARS),
          searchText,
          embedText: searchText.substring(0, MAX_EMBED_CHARS),
          headings: primaryTitle,
          entryId,
          vector: new Float32Array(VECTOR_DIM),
        },
      ];
    }

    return chunks.map((chunk) => {
      const text = chunk.text.replace(/\s+/g, ' ').trim();
      const searchText = `${primaryTitle}. ${primaryTitle}. ${chunk.sectionTitle}. ${text}`;

      // The heading path may already end with the section's own heading;
      // append the section title only when it is not there yet.
      const lastHeading = chunk.headingPath[chunk.headingPath.length - 1];
      const trail = lastHeading === chunk.sectionTitle
        ? chunk.headingPath
        : [...chunk.headingPath, chunk.sectionTitle];

      return {
        path: file.relativePath,
        primaryTitle,
        sectionTitle: chunk.sectionTitle,
        content: text.substring(0, MAX_SNIPPET_CHARS),
        searchText,
        embedText: searchText.substring(0, MAX_EMBED_CHARS),
        headings: trail.join(' | '),
        entryId,
        vector: new Float32Array(VECTOR_DIM),
      };
    });
  }
}
|
|
418
|
+
|
|
419
|
+
/**
 * Splits a parsed HTML page into heading-delimited chunks.
 *
 * The body is walked in document order. Every h1-h4 closes the current
 * section and opens a new one; all other content is appended to the current
 * section exactly once. Sections whose trimmed text is shorter than
 * `MIN_CHUNK_CHARS` are dropped.
 *
 * @param root - Parsed document.
 * @param pageTitle - Page title; used as the section title before the first
 *   heading and as a stand-in for headings with no text.
 * @returns The chunks in document order; `[]` when the page has no `<body>`.
 */
function chunkDocument(root: ReturnType<typeof parse>, pageTitle: string): Chunk[] {
  const chunks: Chunk[] = [];
  const body = root.querySelector('body');
  if (!body) return chunks;

  const headingSelector = 'h1, h2, h3, h4';
  // Non-content elements whose text must never reach the index.
  const skippedTags = new Set(['script', 'style', 'noscript', 'template']);

  // Headings currently "open", outermost first, each with its level.
  const openHeadings: Array<{ level: number; text: string }> = [];
  let currentSectionHeading = pageTitle;
  let currentText = '';

  /** Emits the current section if it has enough text, then starts a fresh buffer. */
  const flush = (): void => {
    if (currentText.trim().length >= MIN_CHUNK_CHARS) {
      chunks.push({
        primaryTitle: pageTitle,
        sectionTitle: currentSectionHeading,
        text: currentText.replace(/\s+/g, ' ').trim().substring(0, MAX_EMBED_CHARS),
        headingPath: openHeadings.map((h) => h.text),
      });
    }
    currentText = '';
  };

  /** Appends non-empty text to the current section, separated by a space. */
  const append = (text: string | undefined): void => {
    const trimmed = text?.trim();
    if (trimmed) currentText += ' ' + trimmed;
  };

  /**
   * Walks the direct children of `parent`.
   *
   * Selecting every block element with one query returns containers as well
   * as their descendants, which duplicated text and pulled the text of later
   * sections into earlier ones. Here a subtree with no heading inside is
   * consumed whole, and a subtree that contains a heading is descended into
   * so the heading can split it.
   */
  const visit = (parent: ReturnType<typeof parse>): void => {
    for (const child of parent.childNodes) {
      // nodeType 3 = text node, 1 = element; comments and others are ignored.
      if (child.nodeType === 3) {
        append(child.text);
        continue;
      }
      if (child.nodeType !== 1) continue;

      const el = child as ReturnType<typeof parse>;
      const tagName = el.tagName?.toLowerCase() ?? '';
      if (skippedTags.has(tagName)) continue;

      const headingMatch = /^h([1-4])$/.exec(tagName);
      if (headingMatch) {
        flush();

        const level = parseInt(headingMatch[1], 10);
        const headingText = el.text.trim();
        currentSectionHeading = headingText || currentSectionHeading;

        // Close every open heading at the same or a deeper level. Comparing
        // levels rather than stack depth keeps the ancestry correct when a
        // page starts at h2 or skips a level.
        while (openHeadings.length > 0 && openHeadings[openHeadings.length - 1].level >= level) {
          openHeadings.pop();
        }
        openHeadings.push({ level, text: headingText || pageTitle });
      } else if (el.querySelector(headingSelector)) {
        visit(el);
      } else {
        append(el.text);
      }
    }
  };

  visit(body);
  flush();

  return chunks;
}
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
import path from 'node:path';
|
|
2
|
+
import type { ISourceProcessor } from '../../core/ports/ISourceProcessor.js';
|
|
3
|
+
import type { Source, AntoraSourceConfig } from '../../core/domain/types.js';
|
|
4
|
+
import { buildAntoraSource } from '../../services/antora.js';
|
|
5
|
+
|
|
6
|
+
/** Source processor for sources of type `'antora'`; delegates the build to `buildAntoraSource`. */
export class AntoraSourceProcessor implements ISourceProcessor {
  readonly sourceType = 'antora' as const;

  /**
   * Builds an Antora source inside a per-source working directory.
   *
   * @param source - Source definition; its `config` is treated as an `AntoraSourceConfig`.
   * @param _sourceDir - Unused by this processor.
   * @param entryDir - Entry directory; the build runs in `<entryDir>/antora/<source.id>`.
   * @param entryId - Entry the source belongs to, forwarded to the builder.
   * @param log - Progress sink, forwarded to the builder.
   * @returns The string `buildAntoraSource` resolves to.
   */
  async process(source: Source, _sourceDir: string, entryDir: string, entryId: string, log: (msg: string) => void): Promise<string> {
    return buildAntoraSource(
      source.config as AntoraSourceConfig,
      entryId,
      path.join(entryDir, 'antora', source.id),
      log,
    );
  }
}
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
import type { ISourceProcessor } from '../../core/ports/ISourceProcessor.js';
|
|
2
|
+
import type { Source, AsciidocSourceConfig } from '../../core/domain/types.js';
|
|
3
|
+
import { buildAsciidocSource } from '../../services/asciidoc.js';
|
|
4
|
+
|
|
5
|
+
/** Source processor for sources of type `'asciidoc'`; delegates the build to `buildAsciidocSource`. */
export class AsciidocSourceProcessor implements ISourceProcessor {
  readonly sourceType = 'asciidoc' as const;

  /**
   * Runs the AsciiDoc build against the source directory.
   *
   * @param source - Source definition; its `config` is treated as an `AsciidocSourceConfig`.
   * @param sourceDir - Directory handed to the builder.
   * @param _entryDir - Unused by this processor.
   * @param _entryId - Unused by this processor.
   * @param log - Progress sink, forwarded to the builder.
   * @returns `sourceDir`, unchanged, once the build has finished.
   */
  async process(source: Source, sourceDir: string, _entryDir: string, _entryId: string, log: (msg: string) => void): Promise<string> {
    const asciidocConfig = source.config as AsciidocSourceConfig;
    await buildAsciidocSource(asciidocConfig, sourceDir, log);
    return sourceDir;
  }
}
|