gitmem-mcp 1.4.4 → 1.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +27 -0
- package/README.md +21 -4
- package/bin/gitmem.js +10 -0
- package/dist/commands/activate.d.ts +20 -0
- package/dist/commands/activate.js +562 -0
- package/dist/commands/deactivate.d.ts +10 -0
- package/dist/commands/deactivate.js +95 -0
- package/dist/commands/migrate-local.d.ts +53 -0
- package/dist/commands/migrate-local.js +177 -0
- package/dist/hooks/format-utils.js +4 -0
- package/dist/schemas/log.d.ts +2 -2
- package/dist/schemas/search.d.ts +2 -2
- package/dist/schemas/session-close.d.ts +12 -12
- package/dist/server.js +33 -2
- package/dist/services/analytics.d.ts +22 -0
- package/dist/services/analytics.js +68 -0
- package/dist/services/doc-chunker.d.ts +45 -0
- package/dist/services/doc-chunker.js +208 -0
- package/dist/services/doc-index.d.ts +88 -0
- package/dist/services/doc-index.js +328 -0
- package/dist/services/license.d.ts +57 -0
- package/dist/services/license.js +200 -0
- package/dist/services/supabase-client.d.ts +6 -0
- package/dist/services/supabase-client.js +75 -22
- package/dist/services/tier.d.ts +13 -3
- package/dist/services/tier.js +38 -7
- package/dist/tools/definitions.d.ts +688 -0
- package/dist/tools/definitions.js +87 -0
- package/dist/tools/index-docs.d.ts +30 -0
- package/dist/tools/index-docs.js +163 -0
- package/dist/tools/prepare-context.js +7 -0
- package/dist/tools/recall.js +25 -4
- package/dist/tools/search-docs.d.ts +38 -0
- package/dist/tools/search-docs.js +94 -0
- package/dist/tools/search.js +11 -1
- package/dist/tools/session-close.js +76 -7
- package/dist/tools/session-start.js +57 -5
- package/package.json +1 -1
- package/schema/setup.sql +489 -25
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Document Chunker — Split markdown files into searchable chunks
|
|
3
|
+
*
|
|
4
|
+
* Strategy:
|
|
5
|
+
* 1. Split on H2 headers first (natural semantic boundaries)
|
|
6
|
+
* 2. If a section exceeds target size, split on paragraph boundaries
|
|
7
|
+
* 3. Each chunk carries metadata: file path, title, category, chunk index
|
|
8
|
+
*
|
|
9
|
+
* Target chunk size: 500-800 tokens (~2000-3200 chars)
|
|
10
|
+
*/
|
|
11
|
+
/** One searchable segment of a markdown file, as produced by `chunkDocument`. */
export interface DocChunk {
    /** Path of the source file; copied from `DocFile.relative_path`. */
    file_path: string;
    /** Zero-based position of this chunk among the chunks emitted for its file. */
    chunk_index: number;
    /** Document title: text of the first H1, or the file name with `-`/`_` replaced by spaces. */
    title: string;
    /** Text of the H2 heading this chunk falls under; empty for text before the first H2. */
    section_title: string;
    /** First segment of the relative path, or `"root"` when the file sits directly in the scanned directory. */
    category: string;
    /** Chunk text. */
    content: string;
    /** Hash of the whole source file; copied from `DocFile.hash`. */
    file_hash: string;
}
|
|
20
|
+
/** A markdown file read from disk by `scanDirectory`. */
export interface DocFile {
    /** Path built by joining the scanned directory with the entry names below it. */
    absolute_path: string;
    /** Path relative to the directory passed to `scanDirectory`. */
    relative_path: string;
    /** File contents, read as UTF-8. */
    content: string;
    /** SHA-256 hex digest of `content`. */
    hash: string;
}
|
|
26
|
+
/**
 * Chunk a single markdown file into searchable segments.
 *
 * The file is split on H2 headings; a section longer than the maximum chunk
 * size is split further on paragraph boundaries. Pieces shorter than the
 * minimum chunk size are not emitted, so the result may be empty.
 *
 * @param doc File contents plus path and hash metadata.
 * @returns Chunks in document order, with `chunk_index` counting from 0.
 */
export declare function chunkDocument(doc: DocFile): DocChunk[];
|
|
30
|
+
/**
 * Recursively scan a directory for markdown files (names ending in `.md`).
 *
 * Directories and files that cannot be read are skipped without an error.
 *
 * @param dirPath Directory to walk; `relative_path` values are relative to it.
 * @param options `exclude` lists directory names that are not descended into.
 *   When omitted, `_archive`, `node_modules` and `.git` are excluded.
 * @returns One entry per markdown file found.
 */
export declare function scanDirectory(dirPath: string, options?: {
    exclude?: string[];
}): DocFile[];
|
|
36
|
+
/**
 * Chunk all markdown files in a directory.
 *
 * @param dirPath Directory to scan (see `scanDirectory`).
 * @param options Forwarded unchanged to `scanDirectory`.
 * @returns The scanned files and the concatenation of their chunks, in scan order.
 */
export declare function chunkDirectory(dirPath: string, options?: {
    exclude?: string[];
}): {
    files: DocFile[];
    chunks: DocChunk[];
};
|
|
45
|
+
//# sourceMappingURL=doc-chunker.d.ts.map
|
|
@@ -0,0 +1,208 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Document Chunker — Split markdown files into searchable chunks
|
|
3
|
+
*
|
|
4
|
+
* Strategy:
|
|
5
|
+
* 1. Split on H2 headers first (natural semantic boundaries)
|
|
6
|
+
* 2. If a section exceeds target size, split on paragraph boundaries
|
|
7
|
+
* 3. Each chunk carries metadata: file path, title, category, chunk index
|
|
8
|
+
*
|
|
9
|
+
* Target chunk size: 500-800 tokens (~2000-3200 chars)
|
|
10
|
+
*/
|
|
11
|
+
import * as fs from "fs";
|
|
12
|
+
import * as path from "path";
|
|
13
|
+
import * as crypto from "crypto";
|
|
14
|
+
// Size thresholds in characters; the token figures assume roughly 4 characters per token.
const TARGET_CHUNK_CHARS = 2400; // ~600 tokens; size budget used when an oversized section is split by paragraphs
const MAX_CHUNK_CHARS = 3600; // ~900 tokens hard limit; sections above this are split by paragraphs
const MIN_CHUNK_CHARS = 200; // Don't create tiny chunks; shorter pieces are not emitted
|
|
17
|
+
/**
 * Derive a document title.
 *
 * @param content Markdown text of the document.
 * @param filePath Path of the document; only its base name is used.
 * @returns The trimmed text of the first H1 line, or, when there is none,
 *   the file name without its `.md` suffix and with `-`/`_` turned into spaces.
 */
function extractTitle(content, filePath) {
    const heading = content.match(/^#\s+(.+)$/m);
    if (!heading) {
        // No H1 anywhere in the document: build a readable title from the file name.
        const stem = path.basename(filePath, ".md");
        return stem.replace(/[-_]/g, " ");
    }
    return heading[1].trim();
}
|
|
27
|
+
/**
 * Derive a category from the directory structure.
 *
 * @param relativePath Path of the document relative to the scanned directory,
 *   using the platform separator (`path.sep`).
 * @returns The first path segment when the path has more than one segment,
 *   otherwise `"root"`.
 */
function extractCategory(relativePath) {
    const [topLevel, ...deeper] = relativePath.split(path.sep);
    return deeper.length > 0 ? topLevel : "root";
}
|
|
36
|
+
/**
 * Split markdown into sections by H2 headers.
 *
 * Text before the first H2 becomes a section with an empty title. The heading
 * line itself is not part of the section content. Sections whose trimmed
 * content is empty are dropped.
 *
 * Lines are split on LF or CRLF, so Windows-style files are sectioned too
 * (section content is joined back with "\n"). Lines inside fenced code blocks
 * (``` or ~~~) are never treated as headings.
 *
 * @param content Markdown text.
 * @returns Sections in document order as `{ title, content }` objects.
 */
function splitByH2(content) {
    const sections = [];
    let currentTitle = "";
    let currentLines = [];
    // Marker that opened the current fenced code block; "" when outside a fence.
    let openFence = "";
    const flush = () => {
        const text = currentLines.join("\n").trim();
        if (text.length > 0) {
            sections.push({ title: currentTitle, content: text });
        }
    };
    for (const line of content.split(/\r?\n/)) {
        const fence = line.match(/^ {0,3}(`{3,}|~{3,})/);
        if (fence) {
            const marker = fence[1];
            if (openFence === "") {
                openFence = marker;
            }
            else if (marker[0] === openFence[0] &&
                marker.length >= openFence.length &&
                line.trim() === marker) {
                // A closing fence uses the same character, is at least as long, and carries no info string.
                openFence = "";
            }
            currentLines.push(line);
            continue;
        }
        const h2Match = openFence === "" ? line.match(/^##\s+(.+)$/) : null;
        if (h2Match) {
            // A new heading closes the section collected so far.
            flush();
            currentTitle = h2Match[1].trim();
            currentLines = [];
        }
        else {
            currentLines.push(line);
        }
    }
    // Don't forget the last section
    flush();
    return sections;
}
|
|
70
|
+
/**
 * Split a text blob on paragraph boundaries (blank lines) so pieces stay
 * within a size budget.
 *
 * Paragraphs are packed greedily: a paragraph is appended to the current piece
 * unless that would exceed `maxChars`, in which case the piece is closed and
 * the paragraph starts a new one. A single paragraph longer than `maxChars`
 * is kept whole, so a piece can exceed the budget.
 *
 * @param text Text to split.
 * @param maxChars Size budget per piece, in characters.
 * @returns `[text]` unchanged when it already fits, otherwise the trimmed pieces in order.
 */
function splitByParagraphs(text, maxChars) {
    if (text.length <= maxChars) {
        return [text];
    }
    const pieces = [];
    let buffer = "";
    text.split(/\n\n+/).forEach((paragraph) => {
        // "+ 2" accounts for the blank-line separator that joining would add.
        const wouldOverflow = buffer.length > 0 && buffer.length + paragraph.length + 2 > maxChars;
        if (wouldOverflow) {
            pieces.push(buffer.trim());
            buffer = paragraph;
            return;
        }
        buffer = buffer === "" ? paragraph : `${buffer}\n\n${paragraph}`;
    });
    const tail = buffer.trim();
    if (tail.length > 0) {
        pieces.push(tail);
    }
    return pieces;
}
|
|
93
|
+
/**
 * Compute the SHA-256 digest of a piece of content.
 *
 * @param content Data to hash.
 * @returns The digest as a lowercase hex string.
 */
function hashContent(content) {
    const hasher = crypto.createHash("sha256");
    hasher.update(content);
    return hasher.digest("hex");
}
|
|
99
|
+
/**
 * Chunk a single markdown file into searchable segments.
 *
 * Each H2 section becomes one chunk when it fits within MAX_CHUNK_CHARS;
 * a larger section is split on paragraph boundaries using TARGET_CHUNK_CHARS.
 * Pieces shorter than MIN_CHUNK_CHARS are not emitted.
 *
 * NOTE(review): because short pieces are skipped, a short section in a
 * document that also has longer sections never reaches the index.
 *
 * @param doc Object with `content`, `relative_path` and `hash`.
 * @returns Chunks in document order, with `chunk_index` counting from 0.
 */
export function chunkDocument(doc) {
    const title = extractTitle(doc.content, doc.relative_path);
    const category = extractCategory(doc.relative_path);
    const result = [];
    // Append a chunk unless it is too small; the index is the count emitted so far.
    const emit = (sectionTitle, text) => {
        if (text.length < MIN_CHUNK_CHARS) {
            return;
        }
        result.push({
            file_path: doc.relative_path,
            chunk_index: result.length,
            title,
            section_title: sectionTitle,
            category,
            content: text,
            file_hash: doc.hash,
        });
    };
    for (const { title: sectionTitle, content: body } of splitByH2(doc.content)) {
        const pieces = body.length > MAX_CHUNK_CHARS
            ? splitByParagraphs(body, TARGET_CHUNK_CHARS)
            : [body];
        for (const piece of pieces) {
            emit(sectionTitle, piece);
        }
    }
    // Fallback: nothing was emitted above, but the document as a whole is big
    // enough, so store it as a single chunk truncated to the hard limit.
    if (result.length === 0) {
        const whole = doc.content.trim();
        if (whole.length >= MIN_CHUNK_CHARS) {
            result.push({
                file_path: doc.relative_path,
                chunk_index: 0,
                title,
                section_title: "",
                category,
                content: whole.slice(0, MAX_CHUNK_CHARS),
                file_hash: doc.hash,
            });
        }
    }
    return result;
}
|
|
156
|
+
/**
 * Recursively scan a directory for markdown files (names ending in ".md").
 *
 * @param dirPath Directory to walk; relative paths are computed against it.
 * @param options `exclude`: directory names that are not descended into
 *   (defaults to "_archive", "node_modules", ".git").
 * @returns One `{ absolute_path, relative_path, content, hash }` entry per
 *   file, in traversal order. Unreadable directories and files are skipped.
 */
export function scanDirectory(dirPath, options = {}) {
    const skippedDirs = options.exclude || ["_archive", "node_modules", ".git"];
    const found = [];
    const visit = (dir) => {
        let listing;
        try {
            listing = fs.readdirSync(dir, { withFileTypes: true });
        }
        catch {
            return; // Permission denied or inaccessible
        }
        for (const item of listing) {
            const itemPath = path.join(dir, item.name);
            if (item.isDirectory()) {
                if (!skippedDirs.includes(item.name)) {
                    visit(itemPath);
                }
                continue;
            }
            if (!(item.isFile() && item.name.endsWith(".md"))) {
                continue;
            }
            try {
                const text = fs.readFileSync(itemPath, "utf-8");
                found.push({
                    absolute_path: itemPath,
                    relative_path: path.relative(dirPath, itemPath),
                    content: text,
                    hash: hashContent(text),
                });
            }
            catch {
                // Skip unreadable files
            }
        }
    };
    visit(dirPath);
    return found;
}
|
|
197
|
+
/**
 * Chunk all markdown files in a directory.
 *
 * @param dirPath Directory to scan.
 * @param options Passed through to `scanDirectory`.
 * @returns `{ files, chunks }`: the scanned files and all their chunks,
 *   concatenated in scan order.
 */
export function chunkDirectory(dirPath, options = {}) {
    const files = scanDirectory(dirPath, options);
    const chunks = files.flatMap((file) => chunkDocument(file));
    return { files, chunks };
}
|
|
208
|
+
//# sourceMappingURL=doc-chunker.js.map
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Document Index — Storage and search for indexed doc chunks
|
|
3
|
+
*
|
|
4
|
+
* Supports two backends:
|
|
5
|
+
* - Free tier: Local JSON file with BM25 keyword search
|
|
6
|
+
* - Pro/dev tier: In-memory vector index with embeddings
|
|
7
|
+
*
|
|
8
|
+
* Follows the same patterns as local-vector-search.ts and local-file-storage.ts
|
|
9
|
+
*/
|
|
10
|
+
import type { DocChunk } from "./doc-chunker.js";
|
|
11
|
+
/** A doc chunk as stored in the index: the `DocChunk` fields plus indexing metadata. */
export interface IndexedDocChunk {
    /** UUID assigned when the chunk is indexed. */
    id: string;
    file_path: string;
    chunk_index: number;
    title: string;
    section_title: string;
    category: string;
    content: string;
    file_hash: string;
    /** Project the chunk was indexed under. */
    project: string;
    /** Embedding vector, when one was generated. It is removed before the index is written to disk. */
    embedding?: number[];
    /** ISO-8601 timestamp of the `indexChunks` call that stored the chunk. */
    indexed_at: string;
}
|
|
24
|
+
/** One hit returned by `searchDocs`. */
export interface DocSearchResult {
    id: string;
    file_path: string;
    chunk_index: number;
    title: string;
    section_title: string;
    category: string;
    content: string;
    /**
     * Relevance score. For vector search this is the dot product of the query
     * and chunk embeddings rounded to three decimals; for keyword search it is
     * the score reported by the BM25 helper.
     */
    similarity: number;
    project: string;
}
|
|
35
|
+
/** Summary of the doc index, as returned by `getIndexStats`. */
export interface IndexStats {
    /** Number of chunks counted. */
    total_chunks: number;
    /** Number of distinct file paths among those chunks. */
    total_files: number;
    /** The distinct file paths, sorted. */
    files_indexed: string[];
    /** Chunk count per category. */
    categories: Record<string, number>;
    /** The project the stats were filtered by, or `"all"` when no project was given. */
    project: string;
    /** True when the in-memory vector index holds at least one entry (not filtered by project). */
    has_embeddings: boolean;
}
|
|
43
|
+
/**
 * Index doc chunks into storage.
 *
 * - Removes old chunks for the same project + file_path
 * - Generates embeddings if available (pro/dev tier)
 * - Stores to local JSON file
 * - Loads into in-memory vector index if embeddings present
 *
 * @param chunks Chunks to store.
 * @param project Project the chunks belong to.
 * @param options `batchSize`: number of chunks handled per batch (default 10).
 * @returns `indexed`: chunks stored; `embedded`: chunks that received an
 *   embedding; `errors`: chunks whose embedding call threw.
 */
export declare function indexChunks(chunks: DocChunk[], project: string, options?: {
    batchSize?: number;
}): Promise<{
    indexed: number;
    embedded: number;
    errors: number;
}>;
|
|
60
|
+
/**
 * Search indexed docs using semantic similarity (pro/dev) or BM25 (free).
 *
 * @param query Search text.
 * @param options `project` / `category`: restrict results to exact matches;
 *   `match_count`: maximum number of results (default 5).
 * @returns Matching chunks with a relevance score.
 */
export declare function searchDocs(query: string, options?: {
    project?: string;
    category?: string;
    match_count?: number;
}): Promise<DocSearchResult[]>;
|
|
68
|
+
/**
 * Get index statistics.
 *
 * @param project When given, only chunks of this project are counted.
 * @returns Chunk, file and per-category counts read from the local index file.
 */
export declare function getIndexStats(project?: string): IndexStats;
|
|
72
|
+
/**
 * Check which files have changed since last index (by hash).
 *
 * @param fileHashes Map from file path to the file's current hash.
 * @param project Project whose indexed chunks are compared against.
 * @returns `changed`: indexed under a different hash; `unchanged`: indexed
 *   under the same hash; `new_files`: not present in the index.
 */
export declare function getChangedFiles(fileHashes: Map<string, string>, project: string): {
    changed: string[];
    unchanged: string[];
    new_files: string[];
};
|
|
80
|
+
/**
 * Initialize vector index from local storage on startup.
 *
 * Reads the local index file and loads the chunks that carry an embedding
 * into the in-memory vector index.
 */
export declare function initDocVectorIndex(): void;
|
|
84
|
+
/**
 * Clear the doc index for a project (or all).
 *
 * @param project Project to remove; when omitted, the whole index is cleared.
 * @returns Number of chunks removed.
 */
export declare function clearDocIndex(project?: string): number;
|
|
88
|
+
//# sourceMappingURL=doc-index.d.ts.map
|
|
@@ -0,0 +1,328 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Document Index — Storage and search for indexed doc chunks
|
|
3
|
+
*
|
|
4
|
+
* Supports two backends:
|
|
5
|
+
* - Free tier: Local JSON file with BM25 keyword search
|
|
6
|
+
* - Pro/dev tier: In-memory vector index with embeddings
|
|
7
|
+
*
|
|
8
|
+
* Follows the same patterns as local-vector-search.ts and local-file-storage.ts
|
|
9
|
+
*/
|
|
10
|
+
import * as fs from "fs";
|
|
11
|
+
import * as path from "path";
|
|
12
|
+
import { v4 as uuidv4 } from "uuid";
|
|
13
|
+
import { getGitmemDir } from "./gitmem-dir.js";
|
|
14
|
+
import { bm25Search } from "./bm25.js";
|
|
15
|
+
import { embed as generateEmbedding, isEmbeddingAvailable } from "./embedding.js";
|
|
16
|
+
// --- Local File Index ---
|
|
17
|
+
const INDEX_FILE = "docs-index.json"; // file name of the index inside the gitmem directory
const MAX_INDEX_SIZE = 20 * 1024 * 1024; // 20MB; exceeding it only logs a warning, the file is still written
|
|
19
|
+
/**
 * Resolve the location of the local docs index file.
 *
 * @returns INDEX_FILE joined onto the directory returned by `getGitmemDir()`.
 */
function getIndexPath() {
    const baseDir = getGitmemDir();
    return path.join(baseDir, INDEX_FILE);
}
|
|
25
|
+
/**
 * Read the local index from disk.
 *
 * Never throws: a missing file, unreadable file, invalid JSON, or JSON that is
 * not an array all yield an empty index (the last three are logged).
 *
 * @returns The array of stored chunks, or `[]`.
 */
function readLocalIndex() {
    const indexPath = getIndexPath();
    if (!fs.existsSync(indexPath))
        return [];
    try {
        const parsed = JSON.parse(fs.readFileSync(indexPath, "utf-8"));
        // Callers immediately call .filter/.map on the result, so anything
        // other than an array must be rejected here.
        if (!Array.isArray(parsed)) {
            console.error("[doc-index] docs-index.json does not contain an array, starting fresh");
            return [];
        }
        return parsed;
    }
    catch {
        console.error("[doc-index] Failed to read docs-index.json, starting fresh");
        return [];
    }
}
|
|
41
|
+
/**
 * Write the local index to disk.
 *
 * Embeddings are removed from every chunk before serialising. The JSON is
 * written to a temporary file next to the index and then renamed over it, so
 * an interrupted write cannot leave a truncated index behind.
 *
 * @param chunks Chunks to persist.
 * @throws Whatever the underlying `fs` calls throw (the temp file is removed first).
 */
function writeLocalIndex(chunks) {
    const indexPath = getIndexPath();
    const dir = path.dirname(indexPath);
    if (!fs.existsSync(dir)) {
        fs.mkdirSync(dir, { recursive: true });
    }
    // Strip embeddings from local file to save space
    const stripped = chunks.map(({ embedding: _e, ...rest }) => rest);
    const json = JSON.stringify(stripped, null, 2);
    if (Buffer.byteLength(json, "utf-8") > MAX_INDEX_SIZE) {
        console.error("[doc-index] Warning: docs-index.json exceeds 20MB");
    }
    // Same directory as the target so the rename stays on one filesystem.
    const tmpPath = `${indexPath}.${process.pid}.tmp`;
    try {
        fs.writeFileSync(tmpPath, json, "utf-8");
        fs.renameSync(tmpPath, indexPath);
    }
    catch (err) {
        try {
            fs.unlinkSync(tmpPath);
        }
        catch {
            // Temp file was never created or is already gone.
        }
        throw err;
    }
}
|
|
58
|
+
// In-memory vector index: { chunk, embedding } entries, replaced wholesale by rebuildVectorIndex().
let vectorIndex = [];
|
|
59
|
+
/**
 * Similarity of two vectors, computed as their dot product.
 *
 * This equals cosine similarity only when both vectors are already
 * normalized; no normalization is done here.
 *
 * @param a First vector.
 * @param b Second vector.
 * @returns The dot product, or 0 when the lengths differ.
 */
function cosineSimilarity(a, b) {
    if (a.length !== b.length) {
        return 0;
    }
    return a.reduce((sum, value, i) => sum + value * b[i], 0);
}
|
|
71
|
+
// --- Public API ---
|
|
72
|
+
/**
 * Index doc chunks into storage.
 *
 * - Removes old chunks for the same project + file_path
 * - Generates embeddings if available (pro/dev tier)
 * - Stores to local JSON file (without embeddings)
 * - Rebuilds the in-memory vector index, carrying over the embeddings that
 *   earlier calls loaded for chunks that are kept
 *
 * NOTE(review): old chunks are only removed for file paths that appear in
 * `chunks`, so a file that now yields no chunks keeps its stale entries.
 *
 * @param chunks Chunks to index.
 * @param project Project the chunks are stored under.
 * @param options `batchSize`: chunks per batch; values that are not a
 *   positive number fall back to 10.
 * @returns `{ indexed, embedded, errors }`: chunks stored, chunks that got an
 *   embedding, and embedding calls that threw.
 */
export async function indexChunks(chunks, project, options = {}) {
    // A non-positive or non-numeric step would make the batch loop below misbehave.
    const requestedBatch = Math.floor(Number(options.batchSize));
    const batchSize = requestedBatch > 0 ? requestedBatch : 10;
    const now = new Date().toISOString();
    let embedded = 0;
    let errors = 0;
    // Read existing index
    const existing = readLocalIndex();
    // Build set of file paths being re-indexed
    const reindexedPaths = new Set(chunks.map((c) => `${project}:${c.file_path}`));
    // Chunks read back from disk carry no embedding (they are stripped on
    // write), so recover the ones currently held in memory; otherwise the
    // rebuild below would drop every previously embedded chunk.
    const liveEmbeddings = new Map(vectorIndex.map((entry) => [entry.chunk.id, entry.embedding]));
    // Remove old chunks for files being re-indexed
    const kept = existing
        .filter((c) => !reindexedPaths.has(`${c.project}:${c.file_path}`))
        .map((c) => {
        if (c.embedding)
            return c;
        const embedding = liveEmbeddings.get(c.id);
        return embedding ? { ...c, embedding } : c;
    });
    // Create new indexed chunks
    const newChunks = [];
    // Process in batches for embedding
    for (let i = 0; i < chunks.length; i += batchSize) {
        const batch = chunks.slice(i, i + batchSize);
        for (const chunk of batch) {
            const indexed = {
                id: uuidv4(),
                file_path: chunk.file_path,
                chunk_index: chunk.chunk_index,
                title: chunk.title,
                section_title: chunk.section_title,
                category: chunk.category,
                content: chunk.content,
                file_hash: chunk.file_hash,
                project,
                indexed_at: now,
            };
            // Generate embedding if available
            if (isEmbeddingAvailable()) {
                try {
                    // Embed title + section + content for richer representation
                    const textToEmbed = [
                        indexed.title,
                        indexed.section_title,
                        indexed.content,
                    ]
                        .filter(Boolean)
                        .join(" | ");
                    const embedding = await generateEmbedding(textToEmbed);
                    if (embedding) {
                        indexed.embedding = embedding;
                        embedded++;
                    }
                }
                catch (err) {
                    // A failed embedding is counted, but the chunk is still stored for keyword search.
                    console.error(`[doc-index] Embedding failed for ${chunk.file_path}:${chunk.chunk_index}:`, err instanceof Error ? err.message : err);
                    errors++;
                }
            }
            newChunks.push(indexed);
        }
        // Progress logging for large batches (every ~100 chunks)
        if (chunks.length > 50 && i + batchSize < chunks.length && (i + batchSize) % 100 < batchSize) {
            console.error(`[doc-index] Progress: ${Math.min(i + batchSize, chunks.length)}/${chunks.length} chunks`);
        }
    }
    // Merge and write
    const merged = [...kept, ...newChunks];
    writeLocalIndex(merged);
    // Update in-memory vector index
    rebuildVectorIndex(merged);
    return { indexed: newChunks.length, embedded, errors };
}
|
|
147
|
+
/**
 * Replace the in-memory vector index with the embedded chunks from `chunks`.
 *
 * Only chunks whose `embedding` is a non-empty array are kept; the resulting
 * entry count is logged to stderr.
 *
 * @param chunks Stored chunks, with or without embeddings.
 */
function rebuildVectorIndex(chunks) {
    const entries = [];
    for (const chunk of chunks) {
        const { embedding } = chunk;
        if (embedding && Array.isArray(embedding) && embedding.length > 0) {
            entries.push({ chunk, embedding });
        }
    }
    vectorIndex = entries;
    console.error(`[doc-index] Vector index rebuilt: ${vectorIndex.length} entries with embeddings`);
}
|
|
159
|
+
/**
 * Search indexed docs using semantic similarity (pro/dev) or BM25 (free).
 *
 * Vector search is used when embeddings are available and the in-memory
 * vector index is non-empty; otherwise the query goes to BM25 keyword search.
 *
 * @param query Search text.
 * @param options Optional `project`, `category` and `match_count`; passed on
 *   unchanged to the chosen backend.
 * @returns Promise of the matching chunks.
 */
export async function searchDocs(query, options = {}) {
    // Try vector search first (pro/dev tier with embeddings)
    if (isEmbeddingAvailable() && vectorIndex.length > 0) {
        return vectorSearchDocs(query, options);
    }
    // Fall back to BM25 keyword search
    return bm25SearchDocs(query, options);
}
|
|
171
|
+
/**
 * Vector-based semantic search over doc chunks.
 *
 * Falls back to BM25 keyword search when no embedded chunk matches the
 * project/category filter, or when the query embedding cannot be generated.
 *
 * @param query Search text.
 * @param options `project` / `category`: exact-match filters;
 *   `match_count`: maximum number of results (default 5).
 * @returns Top matches ordered by descending similarity, rounded to three decimals.
 */
async function vectorSearchDocs(query, options) {
    const matchCount = options.match_count || 5;
    // Filter candidates first: it is cheap and may make the embedding call unnecessary.
    let candidates = vectorIndex;
    if (options.project) {
        candidates = candidates.filter((e) => e.chunk.project === options.project);
    }
    if (options.category) {
        candidates = candidates.filter((e) => e.chunk.category === options.category);
    }
    if (candidates.length === 0) {
        // Matching chunks may still exist on disk without embeddings.
        return bm25SearchDocs(query, options);
    }
    // Generate query embedding
    const queryEmbedding = await generateEmbedding(query);
    if (!queryEmbedding) {
        console.error("[doc-index] Query embedding failed, falling back to BM25");
        return bm25SearchDocs(query, options);
    }
    // Score by cosine similarity
    const scored = candidates.map((entry) => ({
        chunk: entry.chunk,
        similarity: cosineSimilarity(queryEmbedding, entry.embedding),
    }));
    // Sort and take top k
    scored.sort((a, b) => b.similarity - a.similarity);
    const topK = scored.slice(0, matchCount);
    return topK.map(({ chunk, similarity }) => ({
        id: chunk.id,
        file_path: chunk.file_path,
        chunk_index: chunk.chunk_index,
        title: chunk.title,
        section_title: chunk.section_title,
        category: chunk.category,
        content: chunk.content,
        similarity: Math.round(similarity * 1000) / 1000,
        project: chunk.project,
    }));
}
|
|
210
|
+
/**
 * BM25 keyword search over doc chunks (free tier).
 *
 * Reads the index from disk, applies the optional project/category filters,
 * and ranks the remaining chunks with `bm25Search`. Title, section title and
 * category are boosted over body text (3, 2, 1.5 vs 1).
 *
 * @param query Search text.
 * @param options `project` / `category`: exact-match filters;
 *   `match_count`: maximum number of results (default 5).
 * @returns Ranked results; `similarity` is the score reported by `bm25Search`.
 */
function bm25SearchDocs(query, options) {
    const limit = options.match_count || 5;
    // Filter by project and category
    let pool = readLocalIndex();
    if (options.project) {
        pool = pool.filter((c) => c.project === options.project);
    }
    if (options.category) {
        pool = pool.filter((c) => c.category === options.category);
    }
    if (pool.length === 0) {
        return [];
    }
    // Build BM25 documents with field boosting, plus an id lookup for mapping hits back.
    const lookup = new Map();
    const docs = [];
    for (const c of pool) {
        lookup.set(c.id, c);
        docs.push({
            id: c.id,
            fields: [
                { text: c.title, boost: 3 },
                { text: c.section_title || "", boost: 2 },
                { text: c.category, boost: 1.5 },
                { text: c.content, boost: 1 },
            ],
        });
    }
    const hits = bm25Search(query, docs, limit);
    // Map back to DocSearchResult, skipping ids that are not in the pool.
    const results = [];
    for (const hit of hits) {
        const c = lookup.get(hit.id);
        if (!c) {
            continue;
        }
        results.push({
            id: c.id,
            file_path: c.file_path,
            chunk_index: c.chunk_index,
            title: c.title,
            section_title: c.section_title,
            category: c.category,
            content: c.content,
            similarity: hit.similarity,
            project: c.project,
        });
    }
    return results;
}
|
|
258
|
+
/**
 * Get index statistics.
 *
 * @param project When given, only chunks of this project are counted.
 * @returns Chunk count, distinct file count, sorted file list, per-category
 *   chunk counts, the project label (`"all"` when no project was given), and
 *   whether the in-memory vector index holds any entry (not filtered by project).
 */
export function getIndexStats(project) {
    const all = readLocalIndex();
    const scoped = project ? all.filter((c) => c.project === project) : all;
    const paths = new Set();
    const categories = {};
    for (const { file_path: filePath, category } of scoped) {
        paths.add(filePath);
        categories[category] = (categories[category] || 0) + 1;
    }
    return {
        total_chunks: scoped.length,
        total_files: paths.size,
        files_indexed: [...paths].sort(),
        categories,
        project: project || "all",
        has_embeddings: vectorIndex.length > 0,
    };
}
|
|
280
|
+
/**
 * Check which files have changed since last index (by hash).
 *
 * @param fileHashes Iterable of `[filePath, hash]` pairs for the current files.
 * @param project Project whose indexed chunks are compared against.
 * @returns `changed`: indexed under a different hash; `unchanged`: indexed
 *   under the same hash; `new_files`: no indexed hash found for the path.
 */
export function getChangedFiles(fileHashes, project) {
    // Last hash recorded per file path for this project.
    const knownHashes = new Map();
    for (const c of readLocalIndex()) {
        if (c.project === project) {
            knownHashes.set(c.file_path, c.file_hash);
        }
    }
    const result = { changed: [], unchanged: [], new_files: [] };
    for (const [filePath, hash] of fileHashes) {
        const priorHash = knownHashes.get(filePath);
        let bucket;
        if (!priorHash) {
            bucket = result.new_files;
        }
        else if (priorHash !== hash) {
            bucket = result.changed;
        }
        else {
            bucket = result.unchanged;
        }
        bucket.push(filePath);
    }
    return result;
}
|
|
306
|
+
/**
 * Initialize vector index from local storage on startup.
 *
 * NOTE(review): writeLocalIndex() strips embeddings before saving, and the
 * rebuild keeps only chunks that have one, so an index file written by this
 * module yields an empty vector index here. Confirm whether that is intended.
 */
export function initDocVectorIndex() {
    rebuildVectorIndex(readLocalIndex());
}
|
|
313
|
+
/**
 * Clear the doc index for a project (or all).
 *
 * Updates both the index file and the in-memory vector index. When a project
 * is given, the embeddings held in memory for other projects are kept.
 *
 * @param project Project to remove; when omitted, everything is cleared.
 * @returns Number of chunks removed from the index file.
 */
export function clearDocIndex(project) {
    const existing = readLocalIndex();
    if (!project) {
        writeLocalIndex([]);
        vectorIndex = [];
        return existing.length;
    }
    const kept = existing.filter((c) => c.project !== project);
    writeLocalIndex(kept);
    // Chunks read back from disk carry no embedding (they are stripped on
    // write), so rebuilding from `kept` would also wipe the other projects'
    // vectors. Prune the in-memory entries of the cleared project instead.
    vectorIndex = vectorIndex.filter((entry) => entry.chunk.project !== project);
    console.error(`[doc-index] Vector index rebuilt: ${vectorIndex.length} entries with embeddings`);
    return existing.length - kept.length;
}
|
|
328
|
+
//# sourceMappingURL=doc-index.js.map
|