@lojban/semantic-search-mcp 1.0.15 → 1.0.18
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +3 -1
- package/package.json +3 -2
- package/src/index.ts +51 -7
- package/src/markdown-chunks.test.ts +106 -0
- package/src/markdown-chunks.ts +408 -0
- package/src/scanner.ts +26 -6
- package/src/storage.ts +127 -18
package/README.md
CHANGED
|
@@ -12,7 +12,7 @@ Use it in **Cursor**, **Claude Code**, or any IDE that supports MCP to search th
|
|
|
12
12
|
|
|
13
13
|
## How it works
|
|
14
14
|
|
|
15
|
-
- **Indexing**: On startup, the server indexes content in the background. If **`SEMANTIC_SEARCH_INDEX_DIRS`** is set (comma-separated paths), it scans those directories. If it is *not* set, the server downloads the [lojban/sampu_vlaste](https://github.com/lojban/sampu_vlaste) repository from GitHub and indexes that instead. In both cases, the server looks for `.txt`, `.md`, `.tsv`, `.csv` files.
|
|
15
|
+
- **Indexing**: On startup, the server indexes content in the background. If **`SEMANTIC_SEARCH_INDEX_DIRS`** is set (comma-separated paths), it scans those directories. If it is *not* set, the server downloads the [lojban/sampu_vlaste](https://github.com/lojban/sampu_vlaste) repository from GitHub and indexes that instead. In both cases, the server looks for `.txt`, `.md`, `.tsv`, `.csv` files. **`.txt`, `.tsv`, `.csv`**: each non-empty line is one record. **`.md`**: chunks by paragraphs and blocks—merged multi-line `>` blockquotes (e.g. Lojban + glosses), whole HTML `<table>...</table>` blocks, and blank-line-separated prose (including consecutive list items). Latest `##` / `###` titles are prepended as `Context: …` on each chunk for better retrieval. Each chunk gets one embedding (via [Hugging Face Transformers.js](https://huggingface.co/docs/transformers.js), model `Xenova/all-MiniLM-L6-v2`) and is stored in a local SQLite database with [@dao-xyz/sqlite3-vec](https://www.npmjs.com/package/@dao-xyz/sqlite3-vec) (SQLite + sqlite-vec for Node and browser). The `line` field in search results is the **start line** of that chunk in the file. After upgrading to a version that changes chunking, restart the server so files are re-indexed (mtime/content hash refresh).
|
|
16
16
|
- **Search**: You send a natural-language query; the server embeds it and returns the closest indexed chunks (single lines for `.txt`/`.tsv`/`.csv`, merged blocks for `.md`) by cosine similarity.
|
|
17
17
|
- **Storage**: Index is stored in `.semantic-search/data/` under your home directory (or set `SEMANTIC_SEARCH_DATA_DIR`), so the same index is reused across MCP version upgrades. No cloud, no API keys.
|
|
18
18
|
|
|
@@ -98,6 +98,8 @@ The server is **not built to JavaScript**; it runs via **`npx tsx src/index.ts`*
|
|
|
98
98
|
|
|
99
99
|
To run the server from the repo: `npm run dev` or `npx tsx src/index.ts`.
|
|
100
100
|
|
|
101
|
+
Run tests: `npm test`.
|
|
102
|
+
|
|
101
103
|
## License
|
|
102
104
|
|
|
103
105
|
MIT
|
package/package.json
CHANGED
|
@@ -1,10 +1,11 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@lojban/semantic-search-mcp",
|
|
3
|
-
"version": "1.0.15",
|
|
3
|
+
"version": "1.0.18",
|
|
4
4
|
"description": "Local-first MCP server for semantic search using transformers.js and SQLite",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"scripts": {
|
|
7
|
-
"dev": "tsx src/index.ts"
|
|
7
|
+
"dev": "tsx src/index.ts",
|
|
8
|
+
"test": "node --import tsx --test src/*.test.ts"
|
|
8
9
|
},
|
|
9
10
|
"dependencies": {
|
|
10
11
|
"@dao-xyz/sqlite3-vec": "^0.0.19",
|
package/src/index.ts
CHANGED
|
@@ -6,16 +6,17 @@ import {
|
|
|
6
6
|
ListToolsRequestSchema,
|
|
7
7
|
} from '@modelcontextprotocol/sdk/types.js';
|
|
8
8
|
import path from 'path';
|
|
9
|
+
import os from 'os';
|
|
9
10
|
import { getEmbedding, getBatchEmbeddings } from './embeddings.js';
|
|
10
11
|
import { createVectorStorage, type SearchResult } from './storage.js';
|
|
11
12
|
import { normalizePath } from './path-util.js';
|
|
12
13
|
import { listFilesInDirectories, readFileWithMetadata } from './scanner.js';
|
|
13
14
|
import { getSampuVlasteDir } from './download-sampu-vlaste.js';
|
|
14
15
|
|
|
15
|
-
// Data dir: use env, or
|
|
16
|
+
// Data dir: use env, or $HOME/.semantic-search/data so the same index is used across MCP version upgrades (npx cwd can change per version)
|
|
16
17
|
const dataDir =
|
|
17
18
|
process.env.SEMANTIC_SEARCH_DATA_DIR ||
|
|
18
|
-
path.join(
|
|
19
|
+
path.join(os.homedir(), '.semantic-search', 'data');
|
|
19
20
|
const DB_PATH = path.join(dataDir, 'vectors.db');
|
|
20
21
|
|
|
21
22
|
// Background indexing state (progress for get_index_stats)
|
|
@@ -58,6 +59,7 @@ async function runBackgroundIndexing(
|
|
|
58
59
|
storage.getFileMetadata(),
|
|
59
60
|
storage.getIndexedFilePaths(),
|
|
60
61
|
]);
|
|
62
|
+
const indexedPathSet = new Set(indexedPaths.map((p) => normalizePath(p)));
|
|
61
63
|
|
|
62
64
|
// Remove from DB any file whose directory is no longer in SEMANTIC_SEARCH_INDEX_DIRS
|
|
63
65
|
for (const filePath of indexedPaths) {
|
|
@@ -67,7 +69,15 @@ async function runBackgroundIndexing(
|
|
|
67
69
|
}
|
|
68
70
|
|
|
69
71
|
const processBatch = async (
|
|
70
|
-
batch: Array<{
|
|
72
|
+
batch: Array<{
|
|
73
|
+
filePath: string;
|
|
74
|
+
lineNumber: number;
|
|
75
|
+
content: string;
|
|
76
|
+
chunkType?: 'prose' | 'list' | 'quote' | 'example' | 'table';
|
|
77
|
+
chapter?: string | null;
|
|
78
|
+
section?: string | null;
|
|
79
|
+
exampleId?: string | null;
|
|
80
|
+
}>
|
|
71
81
|
): Promise<void> => {
|
|
72
82
|
if (batch.length === 0) return;
|
|
73
83
|
const contents = batch.map((l) => l.content);
|
|
@@ -77,6 +87,10 @@ async function runBackgroundIndexing(
|
|
|
77
87
|
lineNumber: line.lineNumber,
|
|
78
88
|
content: line.content,
|
|
79
89
|
embedding: embeddings[idx],
|
|
90
|
+
chunkType: line.chunkType ?? null,
|
|
91
|
+
chapter: line.chapter ?? null,
|
|
92
|
+
section: line.section ?? null,
|
|
93
|
+
exampleId: line.exampleId ?? null,
|
|
80
94
|
}));
|
|
81
95
|
await storage.upsertLinesBatch(batchData);
|
|
82
96
|
indexingState.linesIndexed += batch.length;
|
|
@@ -88,14 +102,32 @@ async function runBackgroundIndexing(
|
|
|
88
102
|
new Promise((resolve) => setImmediate(resolve));
|
|
89
103
|
|
|
90
104
|
const currentFilesOnDisk = new Set<string>();
|
|
91
|
-
const toIndex: Array<{
|
|
105
|
+
const toIndex: Array<{
|
|
106
|
+
filePath: string;
|
|
107
|
+
mtimeMs: number;
|
|
108
|
+
contentHash: string;
|
|
109
|
+
lines: Array<{
|
|
110
|
+
filePath: string;
|
|
111
|
+
lineNumber: number;
|
|
112
|
+
content: string;
|
|
113
|
+
chunkType?: 'prose' | 'list' | 'quote' | 'example' | 'table';
|
|
114
|
+
chapter?: string | null;
|
|
115
|
+
section?: string | null;
|
|
116
|
+
exampleId?: string | null;
|
|
117
|
+
}>;
|
|
118
|
+
}> = [];
|
|
92
119
|
|
|
93
120
|
for await (const { filePath, mtimeMs } of listFilesInDirectories(directories)) {
|
|
94
121
|
currentFilesOnDisk.add(filePath);
|
|
95
122
|
const meta = fileMetadata.get(filePath);
|
|
96
123
|
const content = readFileWithMetadata(filePath);
|
|
97
124
|
if (!content) continue;
|
|
98
|
-
if (
|
|
125
|
+
if (
|
|
126
|
+
indexedPathSet.has(normalizePath(filePath)) &&
|
|
127
|
+
meta &&
|
|
128
|
+
meta.mtimeMs === content.mtimeMs &&
|
|
129
|
+
meta.contentHash === content.contentHash
|
|
130
|
+
) {
|
|
99
131
|
continue; // unchanged, skip
|
|
100
132
|
}
|
|
101
133
|
toIndex.push({
|
|
@@ -113,7 +145,15 @@ async function runBackgroundIndexing(
|
|
|
113
145
|
}
|
|
114
146
|
}
|
|
115
147
|
|
|
116
|
-
let currentBatch: Array<{
|
|
148
|
+
let currentBatch: Array<{
|
|
149
|
+
filePath: string;
|
|
150
|
+
lineNumber: number;
|
|
151
|
+
content: string;
|
|
152
|
+
chunkType?: 'prose' | 'list' | 'quote' | 'example' | 'table';
|
|
153
|
+
chapter?: string | null;
|
|
154
|
+
section?: string | null;
|
|
155
|
+
exampleId?: string | null;
|
|
156
|
+
}> = [];
|
|
117
157
|
let processingPromise: Promise<void> | null = null;
|
|
118
158
|
|
|
119
159
|
for (const entry of toIndex) {
|
|
@@ -207,7 +247,7 @@ async function main() {
|
|
|
207
247
|
const limit = (args as { query: string; limit?: number }).limit ?? 10;
|
|
208
248
|
|
|
209
249
|
const queryEmbedding = await getEmbedding(query);
|
|
210
|
-
const results = await storage.search(queryEmbedding, limit);
|
|
250
|
+
const results = await storage.search(queryEmbedding, limit, query);
|
|
211
251
|
|
|
212
252
|
return {
|
|
213
253
|
content: [
|
|
@@ -220,6 +260,10 @@ async function main() {
|
|
|
220
260
|
line: r.line_number,
|
|
221
261
|
content: r.content,
|
|
222
262
|
score: Math.round(r.score * 1000) / 1000,
|
|
263
|
+
chunk_type: r.chunk_type ?? undefined,
|
|
264
|
+
chapter: r.chapter ?? undefined,
|
|
265
|
+
section: r.section ?? undefined,
|
|
266
|
+
example_id: r.example_id ?? undefined,
|
|
223
267
|
})),
|
|
224
268
|
}),
|
|
225
269
|
},
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
import { describe, it } from 'node:test';
|
|
2
|
+
import assert from 'node:assert/strict';
|
|
3
|
+
import { extractMdChunks } from './markdown-chunks.js';
|
|
4
|
+
|
|
5
|
+
const fp = '/books/cll/5.md';
|
|
6
|
+
|
|
7
|
+
describe('extractMdChunks', () => {
|
|
8
|
+
it('merges example span + multiline blockquote and prefixes ex id', () => {
|
|
9
|
+
const md = `## Chapter 5: Selbri
|
|
10
|
+
|
|
11
|
+
### Brivla
|
|
12
|
+
|
|
13
|
+
<span id="ex-5-1"></span><span id="example-x"></span>
|
|
14
|
+
> **do mamta mi**
|
|
15
|
+
> _You are-a-mother-of me_
|
|
16
|
+
> _You are my mother_
|
|
17
|
+
|
|
18
|
+
The next paragraph after a blank line.
|
|
19
|
+
|
|
20
|
+
`;
|
|
21
|
+
const chunks = extractMdChunks(md, fp);
|
|
22
|
+
const ex = chunks.find((c) => c.content.includes('Example ex-5-1'));
|
|
23
|
+
assert.ok(ex, 'expected chunk with ex anchor');
|
|
24
|
+
assert.match(ex!.content, /do mamta mi/);
|
|
25
|
+
assert.match(ex!.content, /You are my mother/);
|
|
26
|
+
assert.ok(ex!.content.includes('Context:'), 'heading context prepended');
|
|
27
|
+
assert.ok(ex!.content.includes('Chapter 5: Selbri'));
|
|
28
|
+
assert.ok(ex!.content.includes('Brivla'));
|
|
29
|
+
assert.equal(ex!.lineNumber, 6);
|
|
30
|
+
assert.equal(ex!.chunkType, 'example');
|
|
31
|
+
assert.equal(ex!.exampleId, 'ex-5-1');
|
|
32
|
+
assert.equal(ex!.chapter, 'Chapter 5: Selbri');
|
|
33
|
+
assert.equal(ex!.section, 'Brivla');
|
|
34
|
+
});
|
|
35
|
+
|
|
36
|
+
it('flattens HTML table into one chunk', () => {
|
|
37
|
+
const md = `## Ch 7
|
|
38
|
+
|
|
39
|
+
<table>
|
|
40
|
+
<tbody>
|
|
41
|
+
<tr>
|
|
42
|
+
<td>**mi**</td>
|
|
43
|
+
<td>\`KOhA\`</td>
|
|
44
|
+
<td>I, me</td>
|
|
45
|
+
</tr>
|
|
46
|
+
<tr>
|
|
47
|
+
<td>**do**</td>
|
|
48
|
+
<td>\`KOhA\`</td>
|
|
49
|
+
<td>you</td>
|
|
50
|
+
</tr>
|
|
51
|
+
</tbody>
|
|
52
|
+
</table>
|
|
53
|
+
|
|
54
|
+
`;
|
|
55
|
+
const chunks = extractMdChunks(md, fp);
|
|
56
|
+
const t = chunks.find((c) => c.content.includes('**mi**'));
|
|
57
|
+
assert.ok(!t, 'old markdown-formatted variant should not remain');
|
|
58
|
+
const cleaned = chunks.find((c) => c.content.includes('mi') && c.content.includes('do'));
|
|
59
|
+
assert.ok(cleaned);
|
|
60
|
+
assert.match(cleaned!.content, /\bmi\b/);
|
|
61
|
+
assert.match(cleaned!.content, /\bdo\b/);
|
|
62
|
+
assert.ok(cleaned!.content.includes('I, me'));
|
|
63
|
+
});
|
|
64
|
+
|
|
65
|
+
it('merges consecutive list items without blank lines into one chunk', () => {
|
|
66
|
+
const md = `## Lists
|
|
67
|
+
|
|
68
|
+
Intro line.
|
|
69
|
+
|
|
70
|
+
- **mi'o** speaker and listener
|
|
71
|
+
- **mi'a** speaker and others
|
|
72
|
+
|
|
73
|
+
After blank.
|
|
74
|
+
`;
|
|
75
|
+
const chunks = extractMdChunks(md, fp);
|
|
76
|
+
const list = chunks.find((c) => c.content.includes("mi'o") && c.content.includes("mi'a"));
|
|
77
|
+
assert.ok(list);
|
|
78
|
+
});
|
|
79
|
+
|
|
80
|
+
it('merges orphan blockquotes without leading span', () => {
|
|
81
|
+
const md = `> **ta bloti**
|
|
82
|
+
> _That is a boat._
|
|
83
|
+
|
|
84
|
+
Done.
|
|
85
|
+
`;
|
|
86
|
+
const chunks = extractMdChunks(md, fp);
|
|
87
|
+
const q = chunks.find((c) => c.content.includes('ta bloti'));
|
|
88
|
+
assert.ok(q);
|
|
89
|
+
assert.match(q!.content, /That is a boat/);
|
|
90
|
+
assert.equal(q!.chunkType, 'quote');
|
|
91
|
+
});
|
|
92
|
+
|
|
93
|
+
it('skips non-example span-only anchor lines', () => {
|
|
94
|
+
const md = `## Chapter
|
|
95
|
+
|
|
96
|
+
<span id="sec-5-1"></span><span id="section-content-words-brivla"></span>
|
|
97
|
+
|
|
98
|
+
Paragraph with real content.
|
|
99
|
+
`;
|
|
100
|
+
const chunks = extractMdChunks(md, fp);
|
|
101
|
+
const anchorOnly = chunks.find((c) => c.content.includes('sec-5-1'));
|
|
102
|
+
assert.equal(anchorOnly, undefined);
|
|
103
|
+
const prose = chunks.find((c) => c.content.includes('Paragraph with real content.'));
|
|
104
|
+
assert.ok(prose);
|
|
105
|
+
});
|
|
106
|
+
});
|
|
@@ -0,0 +1,408 @@
|
|
|
1
|
+
import type { FileLine } from './scanner.js'; // type-only: no runtime cycle with scanner importing this module
|
|
2
|
+
|
|
3
|
+
/** Min chunk length after merge (book mode); keeps short examples, drops noise */
|
|
4
|
+
export const MD_MIN_CHUNK_LENGTH = 12;
|
|
5
|
+
|
|
6
|
+
/** Rough max characters per indexed chunk before sentence split */
|
|
7
|
+
export const MD_MAX_CHUNK_CHARS = 1800;
|
|
8
|
+
|
|
9
|
+
/** Overlap when splitting long chunks (characters) */
|
|
10
|
+
export const MD_SPLIT_OVERLAP = 100;
|
|
11
|
+
|
|
12
|
+
const HEADING_RE = /^(#{1,6})\s+(.+)$/;
|
|
13
|
+
const BLOCKQUOTE_RE = /^\s*>[ \t]?(.*)$/;
|
|
14
|
+
const EX_ID_RE = /id="(ex-[^"]+)"/;
|
|
15
|
+
const SPAN_TAG_RE = /<span\b[^>]*>\s*<\/span>/gi;
|
|
16
|
+
type ChunkType = NonNullable<FileLine['chunkType']>;
|
|
17
|
+
|
|
18
|
+
function isBlank(trimmed: string): boolean {
|
|
19
|
+
return trimmed.length === 0;
|
|
20
|
+
}
|
|
21
|
+
|
|
22
|
+
function isHtmlCommentLine(trimmed: string): boolean {
|
|
23
|
+
return /^<!--.*-->$/.test(trimmed);
|
|
24
|
+
}
|
|
25
|
+
|
|
26
|
+
function isMarkdownHeading(trimmed: string): boolean {
|
|
27
|
+
return HEADING_RE.test(trimmed);
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
function parseHeading(trimmed: string): { level: number; title: string } | null {
|
|
31
|
+
const m = trimmed.match(HEADING_RE);
|
|
32
|
+
if (!m) return null;
|
|
33
|
+
return { level: m[1].length, title: m[2].trim() };
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
/** CLL: line of span(s) before blockquote, with an example id */
|
|
37
|
+
function isExampleAnchorLine(trimmed: string): boolean {
|
|
38
|
+
if (trimmed.startsWith('>')) return false;
|
|
39
|
+
if (!trimmed.includes('id="ex-')) return false;
|
|
40
|
+
// Mostly HTML span markup (allows multiple <span>...</span>)
|
|
41
|
+
const withoutTags = trimmed.replace(/<span\b[^>]*>[\s\S]*?<\/span>/gi, '').replace(/\s+/g, '');
|
|
42
|
+
return withoutTags.length === 0;
|
|
43
|
+
}
|
|
44
|
+
|
|
45
|
+
function extractFirstExId(trimmed: string): string | null {
|
|
46
|
+
const m = trimmed.match(EX_ID_RE);
|
|
47
|
+
return m ? m[1] : null;
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
function isSpanOnlyLine(trimmed: string): boolean {
|
|
51
|
+
const withoutSpans = trimmed.replace(SPAN_TAG_RE, '').replace(/\s+/g, '');
|
|
52
|
+
return withoutSpans.length === 0 && trimmed.includes('<span');
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
function stripBlockquotePrefix(line: string): string {
|
|
56
|
+
const m = line.match(BLOCKQUOTE_RE);
|
|
57
|
+
return m ? m[1] : line.trim();
|
|
58
|
+
}
|
|
59
|
+
|
|
60
|
+
function isBlockquoteLine(trimmed: string): boolean {
|
|
61
|
+
return /^\s*>/.test(trimmed);
|
|
62
|
+
}
|
|
63
|
+
|
|
64
|
+
interface HeadingState {
|
|
65
|
+
byLevel: Array<string | null>;
|
|
66
|
+
}
|
|
67
|
+
|
|
68
|
+
function contextPrefix(state: HeadingState): string {
|
|
69
|
+
const parts: string[] = [];
|
|
70
|
+
for (let level = 2; level <= 4; level++) {
|
|
71
|
+
const title = state.byLevel[level];
|
|
72
|
+
if (title) parts.push(title);
|
|
73
|
+
}
|
|
74
|
+
if (parts.length === 0) return '';
|
|
75
|
+
return `Context: ${parts.join(' / ')}\n\n`;
|
|
76
|
+
}
|
|
77
|
+
|
|
78
|
+
function updateHeadings(state: HeadingState, level: number, title: string): void {
|
|
79
|
+
if (level < 2) return;
|
|
80
|
+
const clamped = Math.min(level, 6);
|
|
81
|
+
state.byLevel[clamped] = title;
|
|
82
|
+
for (let l = clamped + 1; l <= 6; l++) {
|
|
83
|
+
state.byLevel[l] = null;
|
|
84
|
+
}
|
|
85
|
+
}
|
|
86
|
+
|
|
87
|
+
function normalizeInlineMarkdown(s: string): string {
|
|
88
|
+
let out = s;
|
|
89
|
+
// Keep visible text, drop link URLs.
|
|
90
|
+
out = out.replace(/\[([^\]]+)\]\([^)]+\)/g, '$1');
|
|
91
|
+
// Remove common inline formatting wrappers.
|
|
92
|
+
out = out
|
|
93
|
+
.replace(/\*\*([^*]+)\*\*/g, '$1')
|
|
94
|
+
.replace(/__([^_]+)__/g, '$1')
|
|
95
|
+
.replace(/`([^`]+)`/g, '$1')
|
|
96
|
+
.replace(/\*([^*]+)\*/g, '$1')
|
|
97
|
+
.replace(/_([^_]+)_/g, '$1');
|
|
98
|
+
// Drop remaining HTML tags.
|
|
99
|
+
out = out.replace(/<[^>]+>/g, ' ');
|
|
100
|
+
// Decode common entities.
|
|
101
|
+
out = out
|
|
102
|
+
.replace(/&nbsp;/gi, ' ')
|
|
103
|
+
.replace(/&amp;/gi, '&')
|
|
104
|
+
.replace(/&lt;/gi, '<')
|
|
105
|
+
.replace(/&gt;/gi, '>')
|
|
106
|
+
.replace(/&quot;/gi, '"')
|
|
107
|
+
.replace(/&#39;/gi, "'");
|
|
108
|
+
return out.replace(/\s+/g, ' ').trim();
|
|
109
|
+
}
|
|
110
|
+
|
|
111
|
+
function proseChunkTypeFromPieces(pieces: Array<{ line: number; text: string }>): ChunkType {
|
|
112
|
+
if (pieces.length > 0 && pieces.every((p) => /^([-*+]\s+|\d+[.)]\s+)/.test(p.text))) return 'list';
|
|
113
|
+
return 'prose';
|
|
114
|
+
}
|
|
115
|
+
|
|
116
|
+
function flattenTableLines(tableLines: string[]): string {
|
|
117
|
+
const joined = tableLines.join('\n');
|
|
118
|
+
let s = joined
|
|
119
|
+
.replace(/<tbody[^>]*>|<\/tbody>|<thead[^>]*>|<\/thead>|<table[^>]*>|<\/table>/gi, '')
|
|
120
|
+
.replace(/<\/tr>/gi, '\n')
|
|
121
|
+
.replace(/<tr[^>]*>/gi, '')
|
|
122
|
+
.replace(/<\/td>/gi, ' | ')
|
|
123
|
+
.replace(/<\/th>/gi, ' | ')
|
|
124
|
+
.replace(/<t[dh][^>]*>/gi, '');
|
|
125
|
+
s = s.replace(/<[^>]+>/g, '');
|
|
126
|
+
return s
|
|
127
|
+
.split('\n')
|
|
128
|
+
.map((l) => normalizeInlineMarkdown(l.replace(/\s*\|\s*$/, '').trim()))
|
|
129
|
+
.filter((l) => l.length > 0)
|
|
130
|
+
.join('\n');
|
|
131
|
+
}
|
|
132
|
+
|
|
133
|
+
function bodyJoined(pieces: Array<{ line: number; text: string }>): string {
|
|
134
|
+
return pieces.map((p) => p.text).join('\n');
|
|
135
|
+
}
|
|
136
|
+
|
|
137
|
+
/** Char index in `prefix + bodyJoined` (no context) -> source line */
|
|
138
|
+
function lineInBody(
|
|
139
|
+
prefixLen: number,
|
|
140
|
+
pieces: Array<{ line: number; text: string }>,
|
|
141
|
+
idx: number
|
|
142
|
+
): number {
|
|
143
|
+
if (idx < prefixLen || pieces.length === 0) return pieces[0]?.line ?? 1;
|
|
144
|
+
let j = idx - prefixLen;
|
|
145
|
+
let o = 0;
|
|
146
|
+
for (let i = 0; i < pieces.length; i++) {
|
|
147
|
+
const seg = pieces[i].text + (i < pieces.length - 1 ? '\n' : '');
|
|
148
|
+
if (j < o + seg.length) return pieces[i].line;
|
|
149
|
+
o += seg.length;
|
|
150
|
+
}
|
|
151
|
+
return pieces[pieces.length - 1]?.line ?? 1;
|
|
152
|
+
}
|
|
153
|
+
|
|
154
|
+
function lineInFull(ctxLen: number, prefixLen: number, pieces: Array<{ line: number; text: string }>, idx: number): number {
|
|
155
|
+
if (idx < ctxLen) return pieces[0]?.line ?? 1;
|
|
156
|
+
return lineInBody(prefixLen, pieces, idx - ctxLen);
|
|
157
|
+
}
|
|
158
|
+
|
|
159
|
+
function firstNonWsIndex(s: string, from: number, to: number): number {
|
|
160
|
+
let i = from;
|
|
161
|
+
while (i < to && i < s.length && /\s/.test(s[i])) i++;
|
|
162
|
+
return Math.min(i, s.length);
|
|
163
|
+
}
|
|
164
|
+
|
|
165
|
+
function splitLongText(
|
|
166
|
+
full: string,
|
|
167
|
+
ctxLen: number,
|
|
168
|
+
prefixLen: number,
|
|
169
|
+
pieces: Array<{ line: number; text: string }>
|
|
170
|
+
): Array<{ startLine: number; content: string }> {
|
|
171
|
+
if (full.length <= MD_MAX_CHUNK_CHARS) {
|
|
172
|
+
const idx = firstNonWsIndex(full, 0, full.length);
|
|
173
|
+
return [{ startLine: lineInFull(ctxLen, prefixLen, pieces, idx), content: full.trim() }];
|
|
174
|
+
}
|
|
175
|
+
|
|
176
|
+
const out: Array<{ startLine: number; content: string }> = [];
|
|
177
|
+
let pos = 0;
|
|
178
|
+
const n = full.length;
|
|
179
|
+
while (pos < n) {
|
|
180
|
+
let end = Math.min(pos + MD_MAX_CHUNK_CHARS, n);
|
|
181
|
+
if (end < n) {
|
|
182
|
+
const window = full.slice(pos, end);
|
|
183
|
+
const lastSentence = Math.max(
|
|
184
|
+
window.lastIndexOf('. '),
|
|
185
|
+
window.lastIndexOf('? '),
|
|
186
|
+
window.lastIndexOf('! '),
|
|
187
|
+
window.lastIndexOf('.\n'),
|
|
188
|
+
window.lastIndexOf('?\n'),
|
|
189
|
+
window.lastIndexOf('!\n')
|
|
190
|
+
);
|
|
191
|
+
if (lastSentence >= MD_MAX_CHUNK_CHARS * 0.35) {
|
|
192
|
+
end = pos + lastSentence + 2;
|
|
193
|
+
} else {
|
|
194
|
+
const lastNl = window.lastIndexOf('\n');
|
|
195
|
+
if (lastNl > MD_MAX_CHUNK_CHARS * 0.28) end = pos + lastNl + 1;
|
|
196
|
+
}
|
|
197
|
+
}
|
|
198
|
+
const sliceStart = firstNonWsIndex(full, pos, end);
|
|
199
|
+
const part = full.slice(pos, end).trim();
|
|
200
|
+
if (part.length >= MD_MIN_CHUNK_LENGTH) {
|
|
201
|
+
out.push({ startLine: lineInFull(ctxLen, prefixLen, pieces, sliceStart), content: part });
|
|
202
|
+
}
|
|
203
|
+
if (end >= n) break;
|
|
204
|
+
const nextPos = end - MD_SPLIT_OVERLAP;
|
|
205
|
+
pos = nextPos > pos ? nextPos : end;
|
|
206
|
+
}
|
|
207
|
+
if (out.length === 0 && full.trim().length > 0) {
|
|
208
|
+
const idx = firstNonWsIndex(full, 0, n);
|
|
209
|
+
out.push({ startLine: lineInFull(ctxLen, prefixLen, pieces, idx), content: full.trim().slice(0, MD_MAX_CHUNK_CHARS) });
|
|
210
|
+
}
|
|
211
|
+
if (out.length > 1) {
|
|
212
|
+
const lines = new Set(out.map((o) => o.startLine));
|
|
213
|
+
if (lines.size === 1) {
|
|
214
|
+
const idx = firstNonWsIndex(full, 0, n);
|
|
215
|
+
return [{ startLine: lineInFull(ctxLen, prefixLen, pieces, idx), content: full.trim().slice(0, MD_MAX_CHUNK_CHARS) }];
|
|
216
|
+
}
|
|
217
|
+
}
|
|
218
|
+
return out;
|
|
219
|
+
}
|
|
220
|
+
|
|
221
|
+
function emitChunk(
|
|
222
|
+
filePath: string,
|
|
223
|
+
pieces: Array<{ line: number; text: string }>,
|
|
224
|
+
prefix: string,
|
|
225
|
+
headingState: HeadingState,
|
|
226
|
+
chunkType: ChunkType,
|
|
227
|
+
exampleId: string | null,
|
|
228
|
+
out: FileLine[]
|
|
229
|
+
): void {
|
|
230
|
+
if (pieces.length === 0) return;
|
|
231
|
+
const body = bodyJoined(pieces);
|
|
232
|
+
const ctx = contextPrefix(headingState);
|
|
233
|
+
const inner = prefix + body;
|
|
234
|
+
const full = ctx + inner;
|
|
235
|
+
const ctxLen = ctx.length;
|
|
236
|
+
const prefixLen = prefix.length;
|
|
237
|
+
const splits = splitLongText(full, ctxLen, prefixLen, pieces);
|
|
238
|
+
const chapter = headingState.byLevel[2] ?? null;
|
|
239
|
+
const sectionParts: string[] = [];
|
|
240
|
+
for (let level = 3; level <= 4; level++) {
|
|
241
|
+
const t = headingState.byLevel[level];
|
|
242
|
+
if (t) sectionParts.push(t);
|
|
243
|
+
}
|
|
244
|
+
const section = sectionParts.length > 0 ? sectionParts.join(' / ') : null;
|
|
245
|
+
for (const s of splits) {
|
|
246
|
+
if (s.content.length >= MD_MIN_CHUNK_LENGTH) {
|
|
247
|
+
out.push({
|
|
248
|
+
filePath,
|
|
249
|
+
lineNumber: s.startLine,
|
|
250
|
+
content: s.content,
|
|
251
|
+
chunkType,
|
|
252
|
+
chapter,
|
|
253
|
+
section,
|
|
254
|
+
exampleId,
|
|
255
|
+
});
|
|
256
|
+
}
|
|
257
|
+
}
|
|
258
|
+
}
|
|
259
|
+
|
|
260
|
+
/**
|
|
261
|
+
* Book-style Markdown chunks: merged blockquotes (Lojban + glosses), whole HTML tables,
|
|
262
|
+
* blank-line paragraphs, consecutive list items as one block. Heading trail prepended for context.
|
|
263
|
+
*/
|
|
264
|
+
export function extractMdChunks(text: string, filePath: string): FileLine[] {
|
|
265
|
+
const rawLines = text.split(/\r?\n/);
|
|
266
|
+
const lines: Array<{ num: number; raw: string }> = [];
|
|
267
|
+
let n = 0;
|
|
268
|
+
for (const raw of rawLines) {
|
|
269
|
+
n++;
|
|
270
|
+
lines.push({ num: n, raw });
|
|
271
|
+
}
|
|
272
|
+
|
|
273
|
+
const out: FileLine[] = [];
|
|
274
|
+
const headingState: HeadingState = { byLevel: Array<string | null>(7).fill(null) };
|
|
275
|
+
let i = 0;
|
|
276
|
+
|
|
277
|
+
while (i < lines.length) {
|
|
278
|
+
const { num, raw } = lines[i];
|
|
279
|
+
const trimmed = raw.trim();
|
|
280
|
+
|
|
281
|
+
if (isBlank(trimmed)) {
|
|
282
|
+
i++;
|
|
283
|
+
continue;
|
|
284
|
+
}
|
|
285
|
+
if (isHtmlCommentLine(trimmed)) {
|
|
286
|
+
i++;
|
|
287
|
+
continue;
|
|
288
|
+
}
|
|
289
|
+
|
|
290
|
+
if (isMarkdownHeading(trimmed)) {
|
|
291
|
+
const ph = parseHeading(trimmed);
|
|
292
|
+
if (ph) updateHeadings(headingState, ph.level, normalizeInlineMarkdown(ph.title));
|
|
293
|
+
i++;
|
|
294
|
+
continue;
|
|
295
|
+
}
|
|
296
|
+
|
|
297
|
+
if (trimmed.includes('<table')) {
|
|
298
|
+
const startLine = num;
|
|
299
|
+
const tableLines: string[] = [];
|
|
300
|
+
let j = i;
|
|
301
|
+
while (j < lines.length) {
|
|
302
|
+
tableLines.push(lines[j].raw);
|
|
303
|
+
if (lines[j].raw.includes('</table>')) {
|
|
304
|
+
j++;
|
|
305
|
+
break;
|
|
306
|
+
}
|
|
307
|
+
j++;
|
|
308
|
+
}
|
|
309
|
+
const flat = flattenTableLines(tableLines).trim();
|
|
310
|
+
i = j;
|
|
311
|
+
if (flat.length >= MD_MIN_CHUNK_LENGTH) {
|
|
312
|
+
emitChunk(filePath, [{ line: startLine, text: flat }], '', headingState, 'table', null, out);
|
|
313
|
+
}
|
|
314
|
+
continue;
|
|
315
|
+
}
|
|
316
|
+
|
|
317
|
+
if (isExampleAnchorLine(trimmed)) {
|
|
318
|
+
const exId = extractFirstExId(trimmed);
|
|
319
|
+
const prefix = exId ? `Example ${exId}\n` : 'Example\n';
|
|
320
|
+
let j = i + 1;
|
|
321
|
+
const quotePieces: Array<{ line: number; text: string }> = [];
|
|
322
|
+
const anchorLine = num;
|
|
323
|
+
while (j < lines.length) {
|
|
324
|
+
const t = lines[j].raw.trim();
|
|
325
|
+
if (isBlank(t)) break;
|
|
326
|
+
if (isBlockquoteLine(t)) {
|
|
327
|
+
const cleaned = normalizeInlineMarkdown(stripBlockquotePrefix(lines[j].raw));
|
|
328
|
+
if (cleaned.length > 0) quotePieces.push({ line: lines[j].num, text: cleaned });
|
|
329
|
+
j++;
|
|
330
|
+
continue;
|
|
331
|
+
}
|
|
332
|
+
if (isExampleAnchorLine(t)) break;
|
|
333
|
+
break;
|
|
334
|
+
}
|
|
335
|
+
if (quotePieces.length > 0) {
|
|
336
|
+
emitChunk(filePath, quotePieces, prefix, headingState, 'example', exId, out);
|
|
337
|
+
i = j;
|
|
338
|
+
continue;
|
|
339
|
+
}
|
|
340
|
+
i++;
|
|
341
|
+
if ((prefix + trimmed).length >= MD_MIN_CHUNK_LENGTH) {
|
|
342
|
+
emitChunk(
|
|
343
|
+
filePath,
|
|
344
|
+
[{ line: anchorLine, text: normalizeInlineMarkdown(trimmed) }],
|
|
345
|
+
'',
|
|
346
|
+
headingState,
|
|
347
|
+
'example',
|
|
348
|
+
exId,
|
|
349
|
+
out
|
|
350
|
+
);
|
|
351
|
+
}
|
|
352
|
+
continue;
|
|
353
|
+
}
|
|
354
|
+
|
|
355
|
+
if (isSpanOnlyLine(trimmed)) {
|
|
356
|
+
i++;
|
|
357
|
+
continue;
|
|
358
|
+
}
|
|
359
|
+
|
|
360
|
+
if (isBlockquoteLine(trimmed)) {
|
|
361
|
+
const quotePieces: Array<{ line: number; text: string }> = [];
|
|
362
|
+
let j = i;
|
|
363
|
+
while (j < lines.length) {
|
|
364
|
+
const t = lines[j].raw.trim();
|
|
365
|
+
if (isBlockquoteLine(t)) {
|
|
366
|
+
const cleaned = normalizeInlineMarkdown(stripBlockquotePrefix(lines[j].raw));
|
|
367
|
+
if (cleaned.length > 0) quotePieces.push({ line: lines[j].num, text: cleaned });
|
|
368
|
+
j++;
|
|
369
|
+
} else break;
|
|
370
|
+
}
|
|
371
|
+
emitChunk(filePath, quotePieces, '', headingState, 'quote', null, out);
|
|
372
|
+
i = j;
|
|
373
|
+
continue;
|
|
374
|
+
}
|
|
375
|
+
|
|
376
|
+
const prosePieces: Array<{ line: number; text: string }> = [];
|
|
377
|
+
let j = i;
|
|
378
|
+
while (j < lines.length) {
|
|
379
|
+
const t = lines[j].raw.trim();
|
|
380
|
+
if (isBlank(t)) break;
|
|
381
|
+
if (isHtmlCommentLine(t)) {
|
|
382
|
+
j++;
|
|
383
|
+
continue;
|
|
384
|
+
}
|
|
385
|
+
if (isMarkdownHeading(t)) break;
|
|
386
|
+
if (t.includes('<table')) break;
|
|
387
|
+
if (isExampleAnchorLine(t)) break;
|
|
388
|
+
if (isBlockquoteLine(t)) break;
|
|
389
|
+
if (isSpanOnlyLine(t)) {
|
|
390
|
+
j++;
|
|
391
|
+
continue;
|
|
392
|
+
}
|
|
393
|
+
const cleaned = normalizeInlineMarkdown(lines[j].raw.trim());
|
|
394
|
+
if (cleaned.length > 0) prosePieces.push({ line: lines[j].num, text: cleaned });
|
|
395
|
+
j++;
|
|
396
|
+
}
|
|
397
|
+
|
|
398
|
+
if (prosePieces.length > 0) {
|
|
399
|
+
emitChunk(filePath, prosePieces, '', headingState, proseChunkTypeFromPieces(prosePieces), null, out);
|
|
400
|
+
i = j;
|
|
401
|
+
continue;
|
|
402
|
+
}
|
|
403
|
+
|
|
404
|
+
i++;
|
|
405
|
+
}
|
|
406
|
+
|
|
407
|
+
return out;
|
|
408
|
+
}
|
package/src/scanner.ts
CHANGED
|
@@ -4,11 +4,16 @@ import path from 'path';
|
|
|
4
4
|
import readline from 'readline';
|
|
5
5
|
import { createHash } from 'crypto';
|
|
6
6
|
import { normalizePath } from './path-util.js';
|
|
7
|
+
import { extractMdChunks } from './markdown-chunks.js';
|
|
7
8
|
|
|
8
9
|
export interface FileLine {
|
|
9
10
|
filePath: string;
|
|
10
11
|
lineNumber: number;
|
|
11
12
|
content: string;
|
|
13
|
+
chunkType?: 'prose' | 'list' | 'quote' | 'example' | 'table';
|
|
14
|
+
chapter?: string | null;
|
|
15
|
+
section?: string | null;
|
|
16
|
+
exampleId?: string | null;
|
|
12
17
|
}
|
|
13
18
|
|
|
14
19
|
// File extensions to index
|
|
@@ -50,6 +55,16 @@ export async function* scanDirectory(dirPath: string): AsyncGenerator<FileLine>
|
|
|
50
55
|
continue;
|
|
51
56
|
}
|
|
52
57
|
|
|
58
|
+
const ext = path.extname(filePath).toLowerCase();
|
|
59
|
+
if (ext === '.md') {
|
|
60
|
+
const normalizedPath = normalizePath(filePath);
|
|
61
|
+
const text = readFileSync(filePath, 'utf8');
|
|
62
|
+
for (const chunk of extractMdChunks(text, normalizedPath)) {
|
|
63
|
+
yield chunk;
|
|
64
|
+
}
|
|
65
|
+
continue;
|
|
66
|
+
}
|
|
67
|
+
|
|
53
68
|
fileStream = createReadStream(filePath);
|
|
54
69
|
rl = readline.createInterface({
|
|
55
70
|
input: fileStream,
|
|
@@ -136,12 +151,17 @@ export function readFileWithMetadata(filePath: string): FileContentWithMetadata
|
|
|
136
151
|
const text = raw.toString('utf8');
|
|
137
152
|
const lines: FileLine[] = [];
|
|
138
153
|
const normalizedPath = normalizePath(filePath);
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
154
|
+
const ext = path.extname(filePath).toLowerCase();
|
|
155
|
+
if (ext === '.md') {
|
|
156
|
+
lines.push(...extractMdChunks(text, normalizedPath));
|
|
157
|
+
} else {
|
|
158
|
+
let lineNumber = 0;
|
|
159
|
+
for (const rawLine of text.split(/\r?\n/)) {
|
|
160
|
+
lineNumber++;
|
|
161
|
+
const trimmed = rawLine.trim();
|
|
162
|
+
if (trimmed.length >= MIN_LINE_LENGTH) {
|
|
163
|
+
lines.push({ filePath: normalizedPath, lineNumber, content: trimmed });
|
|
164
|
+
}
|
|
145
165
|
}
|
|
146
166
|
}
|
|
147
167
|
return { filePath: normalizedPath, mtimeMs, contentHash, lines };
|
package/src/storage.ts
CHANGED
|
@@ -17,6 +17,10 @@ export interface LineRecord {
|
|
|
17
17
|
line_number: number;
|
|
18
18
|
content: string;
|
|
19
19
|
embedding: Float32Array;
|
|
20
|
+
chunk_type?: 'prose' | 'list' | 'quote' | 'example' | 'table' | null;
|
|
21
|
+
chapter?: string | null;
|
|
22
|
+
section?: string | null;
|
|
23
|
+
example_id?: string | null;
|
|
20
24
|
}
|
|
21
25
|
|
|
22
26
|
export interface SearchResult {
|
|
@@ -24,6 +28,10 @@ export interface SearchResult {
|
|
|
24
28
|
line_number: number;
|
|
25
29
|
content: string;
|
|
26
30
|
score: number;
|
|
31
|
+
chunk_type?: 'prose' | 'list' | 'quote' | 'example' | 'table' | null;
|
|
32
|
+
chapter?: string | null;
|
|
33
|
+
section?: string | null;
|
|
34
|
+
example_id?: string | null;
|
|
27
35
|
}
|
|
28
36
|
|
|
29
37
|
type DB = Awaited<ReturnType<typeof createDatabase>>;
|
|
@@ -47,10 +55,28 @@ export class VectorStorage {
|
|
|
47
55
|
file_path TEXT NOT NULL,
|
|
48
56
|
line_number INTEGER NOT NULL,
|
|
49
57
|
content TEXT NOT NULL,
|
|
58
|
+
chunk_type TEXT,
|
|
59
|
+
chapter TEXT,
|
|
60
|
+
section TEXT,
|
|
61
|
+
example_id TEXT,
|
|
50
62
|
UNIQUE(file_path, line_number)
|
|
51
63
|
);
|
|
52
64
|
CREATE INDEX IF NOT EXISTS idx_file ON lines(file_path);
|
|
65
|
+
CREATE INDEX IF NOT EXISTS idx_example_id ON lines(example_id);
|
|
53
66
|
`);
|
|
67
|
+
// Forward-only schema evolution for existing DBs.
|
|
68
|
+
try {
|
|
69
|
+
this.db.exec('ALTER TABLE lines ADD COLUMN chunk_type TEXT');
|
|
70
|
+
} catch {}
|
|
71
|
+
try {
|
|
72
|
+
this.db.exec('ALTER TABLE lines ADD COLUMN chapter TEXT');
|
|
73
|
+
} catch {}
|
|
74
|
+
try {
|
|
75
|
+
this.db.exec('ALTER TABLE lines ADD COLUMN section TEXT');
|
|
76
|
+
} catch {}
|
|
77
|
+
try {
|
|
78
|
+
this.db.exec('ALTER TABLE lines ADD COLUMN example_id TEXT');
|
|
79
|
+
} catch {}
|
|
54
80
|
this.db.exec(`
|
|
55
81
|
CREATE VIRTUAL TABLE IF NOT EXISTS vec_lines USING vec0(
|
|
56
82
|
line_id INTEGER PRIMARY KEY,
|
|
@@ -69,13 +95,37 @@ export class VectorStorage {
|
|
|
69
95
|
/**
|
|
70
96
|
* Insert or update a line with its embedding
|
|
71
97
|
*/
|
|
72
|
-
async upsertLine(
|
|
98
|
+
async upsertLine(
|
|
99
|
+
filePath: string,
|
|
100
|
+
lineNumber: number,
|
|
101
|
+
content: string,
|
|
102
|
+
embedding: Float32Array,
|
|
103
|
+
meta?: {
|
|
104
|
+
chunkType?: string | null;
|
|
105
|
+
chapter?: string | null;
|
|
106
|
+
section?: string | null;
|
|
107
|
+
exampleId?: string | null;
|
|
108
|
+
}
|
|
109
|
+
): Promise<void> {
|
|
73
110
|
const insertLine = await this.db.prepare(
|
|
74
|
-
`INSERT INTO lines (file_path, line_number, content)
|
|
75
|
-
VALUES (?, ?, ?)
|
|
76
|
-
ON CONFLICT(file_path, line_number) DO UPDATE SET
|
|
111
|
+
`INSERT INTO lines (file_path, line_number, content, chunk_type, chapter, section, example_id)
|
|
112
|
+
VALUES (?, ?, ?, ?, ?, ?, ?)
|
|
113
|
+
ON CONFLICT(file_path, line_number) DO UPDATE SET
|
|
114
|
+
content = excluded.content,
|
|
115
|
+
chunk_type = excluded.chunk_type,
|
|
116
|
+
chapter = excluded.chapter,
|
|
117
|
+
section = excluded.section,
|
|
118
|
+
example_id = excluded.example_id`
|
|
77
119
|
);
|
|
78
|
-
insertLine.run([
|
|
120
|
+
insertLine.run([
|
|
121
|
+
filePath,
|
|
122
|
+
lineNumber,
|
|
123
|
+
content,
|
|
124
|
+
meta?.chunkType ?? null,
|
|
125
|
+
meta?.chapter ?? null,
|
|
126
|
+
meta?.section ?? null,
|
|
127
|
+
meta?.exampleId ?? null,
|
|
128
|
+
]);
|
|
79
129
|
|
|
80
130
|
const sel = await this.db.prepare('SELECT id FROM lines WHERE file_path = ? AND line_number = ?');
|
|
81
131
|
const row = sel.get([filePath, lineNumber]) as { id: number } | undefined;
|
|
@@ -90,14 +140,28 @@ export class VectorStorage {
|
|
|
90
140
|
* Batch insert lines for efficiency (single transaction)
|
|
91
141
|
*/
|
|
92
142
|
async upsertLinesBatch(
|
|
93
|
-
lines: Array<{
|
|
143
|
+
lines: Array<{
|
|
144
|
+
filePath: string;
|
|
145
|
+
lineNumber: number;
|
|
146
|
+
content: string;
|
|
147
|
+
embedding: Float32Array;
|
|
148
|
+
chunkType?: string | null;
|
|
149
|
+
chapter?: string | null;
|
|
150
|
+
section?: string | null;
|
|
151
|
+
exampleId?: string | null;
|
|
152
|
+
}>
|
|
94
153
|
): Promise<void> {
|
|
95
154
|
if (lines.length === 0) return;
|
|
96
155
|
|
|
97
156
|
const insertLine = await this.db.prepare(
|
|
98
|
-
`INSERT INTO lines (file_path, line_number, content)
|
|
99
|
-
VALUES (?, ?, ?)
|
|
100
|
-
ON CONFLICT(file_path, line_number) DO UPDATE SET
|
|
157
|
+
`INSERT INTO lines (file_path, line_number, content, chunk_type, chapter, section, example_id)
|
|
158
|
+
VALUES (?, ?, ?, ?, ?, ?, ?)
|
|
159
|
+
ON CONFLICT(file_path, line_number) DO UPDATE SET
|
|
160
|
+
content = excluded.content,
|
|
161
|
+
chunk_type = excluded.chunk_type,
|
|
162
|
+
chapter = excluded.chapter,
|
|
163
|
+
section = excluded.section,
|
|
164
|
+
example_id = excluded.example_id`
|
|
101
165
|
);
|
|
102
166
|
const selId = await this.db.prepare('SELECT id FROM lines WHERE file_path = ? AND line_number = ?');
|
|
103
167
|
const deleteVec = await this.db.prepare('DELETE FROM vec_lines WHERE line_id = ?');
|
|
@@ -106,7 +170,15 @@ export class VectorStorage {
|
|
|
106
170
|
this.db.exec('BEGIN');
|
|
107
171
|
try {
|
|
108
172
|
for (const item of lines) {
|
|
109
|
-
insertLine.run([
|
|
173
|
+
insertLine.run([
|
|
174
|
+
item.filePath,
|
|
175
|
+
item.lineNumber,
|
|
176
|
+
item.content,
|
|
177
|
+
item.chunkType ?? null,
|
|
178
|
+
item.chapter ?? null,
|
|
179
|
+
item.section ?? null,
|
|
180
|
+
item.exampleId ?? null,
|
|
181
|
+
]);
|
|
110
182
|
const row = selId.get([item.filePath, item.lineNumber]) as { id: number };
|
|
111
183
|
const idInt = BigInt(row.id);
|
|
112
184
|
deleteVec.run([idInt]);
|
|
@@ -122,30 +194,67 @@ export class VectorStorage {
|
|
|
122
194
|
/**
|
|
123
195
|
* Search for similar lines using sqlite-vec cosine distance
|
|
124
196
|
*/
|
|
125
|
-
async search(queryEmbedding: Float32Array, limit: number = 10): Promise<SearchResult[]> {
|
|
197
|
+
async search(queryEmbedding: Float32Array, limit: number = 10, queryText: string = ''): Promise<SearchResult[]> {
|
|
198
|
+
const candidateLimit = Math.max(limit * 4, limit);
|
|
126
199
|
const stmt = await this.db.prepare(`
|
|
127
200
|
SELECT
|
|
128
201
|
l.file_path,
|
|
129
202
|
l.line_number,
|
|
130
203
|
l.content,
|
|
204
|
+
l.chunk_type,
|
|
205
|
+
l.chapter,
|
|
206
|
+
l.section,
|
|
207
|
+
l.example_id,
|
|
131
208
|
vec_distance_cosine(v.embedding, ?) AS distance
|
|
132
209
|
FROM vec_lines v
|
|
133
210
|
INNER JOIN lines l ON v.line_id = l.id
|
|
134
211
|
ORDER BY distance
|
|
135
212
|
LIMIT ?
|
|
136
213
|
`);
|
|
137
|
-
const rows = stmt.all([queryEmbedding.buffer,
|
|
214
|
+
const rows = stmt.all([queryEmbedding.buffer, candidateLimit]) as Array<{
|
|
138
215
|
file_path: string;
|
|
139
216
|
line_number: number;
|
|
140
217
|
content: string;
|
|
218
|
+
chunk_type?: 'prose' | 'list' | 'quote' | 'example' | 'table' | null;
|
|
219
|
+
chapter?: string | null;
|
|
220
|
+
section?: string | null;
|
|
221
|
+
example_id?: string | null;
|
|
141
222
|
distance: number;
|
|
142
223
|
}>;
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
224
|
+
const queryLower = queryText.toLowerCase();
|
|
225
|
+
const queryExample = queryLower.match(/\bex-\d+(?:-\d+)+\b/)?.[0] ?? null;
|
|
226
|
+
const queryTokens = new Set((queryLower.match(/[a-z0-9]{3,}/g) ?? []).slice(0, 32));
|
|
227
|
+
const wantsExamples = /\b(example|translation|translate|gloss|lojban)\b/.test(queryLower);
|
|
228
|
+
const wantsTable = /\b(table|cmavo|selma'o|series)\b/.test(queryLower);
|
|
229
|
+
|
|
230
|
+
const scored = rows.map((row) => {
|
|
231
|
+
let score = 1 - row.distance;
|
|
232
|
+
if (queryExample && row.example_id && row.example_id.toLowerCase() === queryExample) {
|
|
233
|
+
score += 0.25;
|
|
234
|
+
}
|
|
235
|
+
if (wantsExamples && row.chunk_type === 'example') score += 0.09;
|
|
236
|
+
if (wantsTable && row.chunk_type === 'table') score += 0.07;
|
|
237
|
+
if (queryTokens.size > 0) {
|
|
238
|
+
const titleText = `${row.chapter ?? ''} ${row.section ?? ''}`.toLowerCase();
|
|
239
|
+
const titleTokens = new Set(titleText.match(/[a-z0-9]{3,}/g) ?? []);
|
|
240
|
+
let overlap = 0;
|
|
241
|
+
for (const t of queryTokens) if (titleTokens.has(t)) overlap++;
|
|
242
|
+
if (overlap > 0) score += Math.min(0.08, overlap * 0.02);
|
|
243
|
+
}
|
|
244
|
+
return {
|
|
245
|
+
file_path: row.file_path,
|
|
246
|
+
line_number: row.line_number,
|
|
247
|
+
content: row.content,
|
|
248
|
+
score: Math.round(score * 1000) / 1000,
|
|
249
|
+
chunk_type: row.chunk_type ?? null,
|
|
250
|
+
chapter: row.chapter ?? null,
|
|
251
|
+
section: row.section ?? null,
|
|
252
|
+
example_id: row.example_id ?? null,
|
|
253
|
+
};
|
|
254
|
+
});
|
|
255
|
+
|
|
256
|
+
scored.sort((a, b) => b.score - a.score);
|
|
257
|
+
return scored.slice(0, limit);
|
|
149
258
|
}
|
|
150
259
|
|
|
151
260
|
async getStats(): Promise<{ totalFiles: number; totalLines: number }> {
|