@lojban/semantic-search-mcp 1.0.15 → 1.0.18

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -12,7 +12,7 @@ Use it in **Cursor**, **Claude Code**, or any IDE that supports MCP to search th
12
12
 
13
13
  ## How it works
14
14
 
15
- - **Indexing**: On startup, the server indexes content in the background. If **`SEMANTIC_SEARCH_INDEX_DIRS`** is set (comma-separated paths), it scans those directories. If it is *not* set, the server downloads the [lojban/sampu_vlaste](https://github.com/lojban/sampu_vlaste) repository from GitHub and indexes that instead. In both cases, the server looks for `.txt`, `.md`, `.tsv`, `.csv` files. Each non-empty line gets a vector embedding (via [Hugging Face Transformers.js](https://huggingface.co/docs/transformers.js), model `Xenova/all-MiniLM-L6-v2`) and is stored in a local SQLite database with [@dao-xyz/sqlite3-vec](https://www.npmjs.com/package/@dao-xyz/sqlite3-vec) (SQLite + sqlite-vec for Node and browser). Indexing runs asynchronously so the server stays responsive and uses bounded memory.
15
+ - **Indexing**: On startup, the server indexes content in the background. If **`SEMANTIC_SEARCH_INDEX_DIRS`** is set (comma-separated paths), it scans those directories. If it is *not* set, the server downloads the [lojban/sampu_vlaste](https://github.com/lojban/sampu_vlaste) repository from GitHub and indexes that instead. In both cases, the server looks for `.txt`, `.md`, `.tsv`, `.csv` files. **`.txt`, `.tsv`, `.csv`**: each non-empty line is one record. **`.md`**: chunks by paragraphs and blocks—merged multi-line `>` blockquotes (e.g. Lojban + glosses), whole HTML `<table>...</table>` blocks, and blank-line-separated prose (including consecutive list items). The latest `##`, `###`, and `####` titles are prepended as `Context: …` on each chunk for better retrieval. Each chunk gets one embedding (via [Hugging Face Transformers.js](https://huggingface.co/docs/transformers.js), model `Xenova/all-MiniLM-L6-v2`) and is stored in a local SQLite database with [@dao-xyz/sqlite3-vec](https://www.npmjs.com/package/@dao-xyz/sqlite3-vec) (SQLite + sqlite-vec for Node and browser). The `line` field in search results is the **start line** of that chunk in the file. After upgrading to a version that changes chunking, restart the server so files are re-indexed (mtime/content hash refresh).
16
16
  - **Search**: You send a natural-language query; the server embeds it and returns the closest lines by cosine similarity.
17
17
  - **Storage**: Index is stored in your project's `.semantic-search/data/` (or set `SEMANTIC_SEARCH_DATA_DIR`). No cloud, no API keys.
18
18
 
@@ -98,6 +98,8 @@ The server is **not built to JavaScript**; it runs via **`npx tsx src/index.ts`*
98
98
 
99
99
  To run the server from the repo: `npm run dev` or `npx tsx src/index.ts`.
100
100
 
101
+ Run tests: `npm test`.
102
+
101
103
  ## License
102
104
 
103
105
  MIT
package/package.json CHANGED
@@ -1,10 +1,11 @@
1
1
  {
2
2
  "name": "@lojban/semantic-search-mcp",
3
- "version": "1.0.15",
3
+ "version": "1.0.18",
4
4
  "description": "Local-first MCP server for semantic search using transformers.js and SQLite",
5
5
  "type": "module",
6
6
  "scripts": {
7
- "dev": "tsx src/index.ts"
7
+ "dev": "tsx src/index.ts",
8
+ "test": "node --import tsx --test src/*.test.ts"
8
9
  },
9
10
  "dependencies": {
10
11
  "@dao-xyz/sqlite3-vec": "^0.0.19",
package/src/index.ts CHANGED
@@ -6,16 +6,17 @@ import {
6
6
  ListToolsRequestSchema,
7
7
  } from '@modelcontextprotocol/sdk/types.js';
8
8
  import path from 'path';
9
+ import os from 'os';
9
10
  import { getEmbedding, getBatchEmbeddings } from './embeddings.js';
10
11
  import { createVectorStorage, type SearchResult } from './storage.js';
11
12
  import { normalizePath } from './path-util.js';
12
13
  import { listFilesInDirectories, readFileWithMetadata } from './scanner.js';
13
14
  import { getSampuVlasteDir } from './download-sampu-vlaste.js';
14
15
 
15
- // Data dir: use env, or project cwd so each workspace has its own index when run via npx from Cursor
16
+ // Data dir: use env, or $HOME/.semantic-search/data so the same index is used across MCP version upgrades (npx cwd can change per version)
16
17
  const dataDir =
17
18
  process.env.SEMANTIC_SEARCH_DATA_DIR ||
18
- path.join(process.cwd(), '.semantic-search', 'data');
19
+ path.join(os.homedir(), '.semantic-search', 'data');
19
20
  const DB_PATH = path.join(dataDir, 'vectors.db');
20
21
 
21
22
  // Background indexing state (progress for get_index_stats)
@@ -58,6 +59,7 @@ async function runBackgroundIndexing(
58
59
  storage.getFileMetadata(),
59
60
  storage.getIndexedFilePaths(),
60
61
  ]);
62
+ const indexedPathSet = new Set(indexedPaths.map((p) => normalizePath(p)));
61
63
 
62
64
  // Remove from DB any file whose directory is no longer in SEMANTIC_SEARCH_INDEX_DIRS
63
65
  for (const filePath of indexedPaths) {
@@ -67,7 +69,15 @@ async function runBackgroundIndexing(
67
69
  }
68
70
 
69
71
  const processBatch = async (
70
- batch: Array<{ filePath: string; lineNumber: number; content: string }>
72
+ batch: Array<{
73
+ filePath: string;
74
+ lineNumber: number;
75
+ content: string;
76
+ chunkType?: 'prose' | 'list' | 'quote' | 'example' | 'table';
77
+ chapter?: string | null;
78
+ section?: string | null;
79
+ exampleId?: string | null;
80
+ }>
71
81
  ): Promise<void> => {
72
82
  if (batch.length === 0) return;
73
83
  const contents = batch.map((l) => l.content);
@@ -77,6 +87,10 @@ async function runBackgroundIndexing(
77
87
  lineNumber: line.lineNumber,
78
88
  content: line.content,
79
89
  embedding: embeddings[idx],
90
+ chunkType: line.chunkType ?? null,
91
+ chapter: line.chapter ?? null,
92
+ section: line.section ?? null,
93
+ exampleId: line.exampleId ?? null,
80
94
  }));
81
95
  await storage.upsertLinesBatch(batchData);
82
96
  indexingState.linesIndexed += batch.length;
@@ -88,14 +102,32 @@ async function runBackgroundIndexing(
88
102
  new Promise((resolve) => setImmediate(resolve));
89
103
 
90
104
  const currentFilesOnDisk = new Set<string>();
91
- const toIndex: Array<{ filePath: string; mtimeMs: number; contentHash: string; lines: Array<{ filePath: string; lineNumber: number; content: string }> }> = [];
105
+ const toIndex: Array<{
106
+ filePath: string;
107
+ mtimeMs: number;
108
+ contentHash: string;
109
+ lines: Array<{
110
+ filePath: string;
111
+ lineNumber: number;
112
+ content: string;
113
+ chunkType?: 'prose' | 'list' | 'quote' | 'example' | 'table';
114
+ chapter?: string | null;
115
+ section?: string | null;
116
+ exampleId?: string | null;
117
+ }>;
118
+ }> = [];
92
119
 
93
120
  for await (const { filePath, mtimeMs } of listFilesInDirectories(directories)) {
94
121
  currentFilesOnDisk.add(filePath);
95
122
  const meta = fileMetadata.get(filePath);
96
123
  const content = readFileWithMetadata(filePath);
97
124
  if (!content) continue;
98
- if (meta && meta.mtimeMs === content.mtimeMs && meta.contentHash === content.contentHash) {
125
+ if (
126
+ indexedPathSet.has(normalizePath(filePath)) &&
127
+ meta &&
128
+ meta.mtimeMs === content.mtimeMs &&
129
+ meta.contentHash === content.contentHash
130
+ ) {
99
131
  continue; // unchanged, skip
100
132
  }
101
133
  toIndex.push({
@@ -113,7 +145,15 @@ async function runBackgroundIndexing(
113
145
  }
114
146
  }
115
147
 
116
- let currentBatch: Array<{ filePath: string; lineNumber: number; content: string }> = [];
148
+ let currentBatch: Array<{
149
+ filePath: string;
150
+ lineNumber: number;
151
+ content: string;
152
+ chunkType?: 'prose' | 'list' | 'quote' | 'example' | 'table';
153
+ chapter?: string | null;
154
+ section?: string | null;
155
+ exampleId?: string | null;
156
+ }> = [];
117
157
  let processingPromise: Promise<void> | null = null;
118
158
 
119
159
  for (const entry of toIndex) {
@@ -207,7 +247,7 @@ async function main() {
207
247
  const limit = (args as { query: string; limit?: number }).limit ?? 10;
208
248
 
209
249
  const queryEmbedding = await getEmbedding(query);
210
- const results = await storage.search(queryEmbedding, limit);
250
+ const results = await storage.search(queryEmbedding, limit, query);
211
251
 
212
252
  return {
213
253
  content: [
@@ -220,6 +260,10 @@ async function main() {
220
260
  line: r.line_number,
221
261
  content: r.content,
222
262
  score: Math.round(r.score * 1000) / 1000,
263
+ chunk_type: r.chunk_type ?? undefined,
264
+ chapter: r.chapter ?? undefined,
265
+ section: r.section ?? undefined,
266
+ example_id: r.example_id ?? undefined,
223
267
  })),
224
268
  }),
225
269
  },
@@ -0,0 +1,106 @@
1
+ import { describe, it } from 'node:test';
2
+ import assert from 'node:assert/strict';
3
+ import { extractMdChunks } from './markdown-chunks.js';
4
+
5
const fp = '/books/cll/5.md';

type Chunk = ReturnType<typeof extractMdChunks>[number];

/**
 * Looks up the first chunk whose content contains every needle.
 * Returns `undefined` when no chunk matches, mirroring `Array.prototype.find`.
 */
function chunkWith(chunks: Chunk[], ...needles: string[]): Chunk | undefined {
  return chunks.find((chunk) => needles.every((needle) => chunk.content.includes(needle)));
}

describe('extractMdChunks', () => {
  it('merges example span + multiline blockquote and prefixes ex id', () => {
    const md = `## Chapter 5: Selbri

### Brivla

<span id="ex-5-1"></span><span id="example-x"></span>
> **do mamta mi**
> _You are-a-mother-of me_
> _You are my mother_

The next paragraph after a blank line.

`;
    const ex = chunkWith(extractMdChunks(md, fp), 'Example ex-5-1');
    // assert.ok narrows `ex`, so no non-null assertions are needed below.
    assert.ok(ex, 'expected chunk with ex anchor');
    assert.match(ex.content, /do mamta mi/);
    assert.match(ex.content, /You are my mother/);
    assert.ok(ex.content.includes('Context:'), 'heading context prepended');
    assert.ok(ex.content.includes('Chapter 5: Selbri'));
    assert.ok(ex.content.includes('Brivla'));
    // The reported line is the first blockquote line, not the anchor span above it.
    assert.equal(ex.lineNumber, 6);
    assert.equal(ex.chunkType, 'example');
    assert.equal(ex.exampleId, 'ex-5-1');
    assert.equal(ex.chapter, 'Chapter 5: Selbri');
    assert.equal(ex.section, 'Brivla');
  });

  it('flattens HTML table into one chunk', () => {
    const md = `## Ch 7

<table>
<tbody>
<tr>
<td>**mi**</td>
<td>\`KOhA\`</td>
<td>I, me</td>
</tr>
<tr>
<td>**do**</td>
<td>\`KOhA\`</td>
<td>you</td>
</tr>
</tbody>
</table>

`;
    const chunks = extractMdChunks(md, fp);
    const t = chunkWith(chunks, '**mi**');
    assert.ok(!t, 'old markdown-formatted variant should not remain');
    const cleaned = chunkWith(chunks, 'mi', 'do');
    assert.ok(cleaned);
    assert.match(cleaned.content, /\bmi\b/);
    assert.match(cleaned.content, /\bdo\b/);
    assert.ok(cleaned.content.includes('I, me'));
  });

  it('merges consecutive list items without blank lines into one chunk', () => {
    const md = `## Lists

Intro line.

- **mi'o** speaker and listener
- **mi'a** speaker and others

After blank.
`;
    const list = chunkWith(extractMdChunks(md, fp), "mi'o", "mi'a");
    assert.ok(list);
  });

  it('merges orphan blockquotes without leading span', () => {
    const md = `> **ta bloti**
> _That is a boat._

Done.
`;
    const q = chunkWith(extractMdChunks(md, fp), 'ta bloti');
    assert.ok(q);
    assert.match(q.content, /That is a boat/);
    assert.equal(q.chunkType, 'quote');
  });

  it('skips non-example span-only anchor lines', () => {
    const md = `## Chapter

<span id="sec-5-1"></span><span id="section-content-words-brivla"></span>

Paragraph with real content.
`;
    const chunks = extractMdChunks(md, fp);
    const anchorOnly = chunkWith(chunks, 'sec-5-1');
    assert.equal(anchorOnly, undefined);
    const prose = chunkWith(chunks, 'Paragraph with real content.');
    assert.ok(prose);
  });
});
@@ -0,0 +1,408 @@
1
+ import type { FileLine } from './scanner.js'; // type-only: no runtime cycle with scanner importing this module
2
+
3
/**
 * Smallest chunk, in characters, that is still indexed once lines have been
 * merged. Short examples survive; stray fragments are dropped.
 */
export const MD_MIN_CHUNK_LENGTH = 12;

/** Approximate upper bound, in characters, for one chunk before it is split. */
export const MD_MAX_CHUNK_CHARS = 1800;

/** Number of characters repeated between consecutive parts of a split chunk. */
export const MD_SPLIT_OVERLAP = 100;

// ATX heading: one to six '#', whitespace, then the title text.
const HEADING_RE = /^(#{1,6})\s+(.+)$/;
// Blockquote line: optional indent, '>', at most one space/tab, then the quoted text.
const BLOCKQUOTE_RE = /^\s*>[ \t]?(.*)$/;
// First id attribute whose value starts with "ex-".
const EX_ID_RE = /id="(ex-[^"]+)"/;
// An empty <span ...></span> pair (anchor markup with no visible text).
const SPAN_TAG_RE = /<span\b[^>]*>\s*<\/span>/gi;
16
+ type ChunkType = NonNullable<FileLine['chunkType']>;
17
+
18
/** True when an already-trimmed line has no characters left. */
function isBlank(trimmed: string): boolean {
  return trimmed === '';
}
21
+
22
/**
 * True when the trimmed line is one complete HTML comment that opens and
 * closes on the same line. Multi-line comments are not recognised here.
 */
function isHtmlCommentLine(trimmed: string): boolean {
  const singleLineComment = /^<!--.*-->$/;
  return singleLineComment.test(trimmed);
}
25
+
26
/** True when the trimmed line is an ATX heading (`#` through `######` followed by a title). */
function isMarkdownHeading(trimmed: string): boolean {
  return trimmed.match(HEADING_RE) !== null;
}
29
+
30
/**
 * Splits an ATX heading into its level (number of `#`) and trimmed title.
 * Returns `null` when the line is not a heading.
 */
function parseHeading(trimmed: string): { level: number; title: string } | null {
  const match = HEADING_RE.exec(trimmed);
  if (match === null) return null;
  const hashes = match[1];
  const rawTitle = match[2];
  return { level: hashes.length, title: rawTitle.trim() };
}
35
+
36
/**
 * CLL-style example anchor: a line made up solely of `<span>…</span>` markup,
 * at least one of which carries an `id="ex-…"` attribute. Blockquote lines
 * never count as anchors.
 */
function isExampleAnchorLine(trimmed: string): boolean {
  const isQuote = trimmed.startsWith('>');
  const mentionsExampleId = trimmed.includes('id="ex-');
  if (isQuote || !mentionsExampleId) return false;
  // Whatever remains after removing every span (and whitespace) is non-anchor content.
  const residue = trimmed.replace(/<span\b[^>]*>[\s\S]*?<\/span>/gi, '').replace(/\s+/g, '');
  return residue === '';
}
44
+
45
/** Returns the first `ex-…` id attribute value on the line, or `null` when there is none. */
function extractFirstExId(trimmed: string): string | null {
  const found = EX_ID_RE.exec(trimmed);
  return found === null ? null : found[1];
}
49
+
50
/** True when the line contains `<span` and nothing but empty `<span></span>` pairs and whitespace. */
function isSpanOnlyLine(trimmed: string): boolean {
  if (!trimmed.includes('<span')) return false;
  const leftover = trimmed.replace(SPAN_TAG_RE, '').replace(/\s+/g, '');
  return leftover === '';
}
54
+
55
/**
 * Removes one leading `>` marker (plus at most one following space or tab)
 * from a raw line. Lines that are not blockquotes are returned trimmed.
 */
function stripBlockquotePrefix(line: string): string {
  const quoted = BLOCKQUOTE_RE.exec(line);
  if (quoted === null) return line.trim();
  return quoted[1];
}
59
+
60
/** True when the first non-whitespace character of the line is `>`. */
function isBlockquoteLine(trimmed: string): boolean {
  return trimmed.trimStart().startsWith('>');
}
63
+
64
/** Most recent heading title seen at each level; the array is indexed by heading level. */
interface HeadingState {
  byLevel: Array<string | null>;
}

/**
 * Builds the `Context: A / B / C` prefix from the current level 2-4 headings.
 * Returns an empty string when none of those levels has a title.
 */
function contextPrefix(state: HeadingState): string {
  const trail: string[] = [];
  for (let level = 2; level <= 4; level++) {
    const title = state.byLevel[level];
    if (title) trail.push(title);
  }
  return trail.length === 0 ? '' : `Context: ${trail.join(' / ')}\n\n`;
}

/**
 * Records a heading and invalidates every deeper level.
 *
 * Level-1 titles are not stored (they never appear in the context prefix), but
 * a level-1 heading still starts a new scope: without clearing the deeper
 * levels, `##`/`###` titles from an earlier part of the file would keep being
 * attached to chunks that follow the new `#` heading.
 */
function updateHeadings(state: HeadingState, level: number, title: string): void {
  if (level < 1) return;
  const clamped = Math.min(level, 6);
  if (clamped >= 2) state.byLevel[clamped] = title;
  for (let deeper = clamped + 1; deeper <= 6; deeper++) {
    state.byLevel[deeper] = null;
  }
}
86
+
87
/**
 * Reduces one line of Markdown/HTML to plain searchable text.
 *
 * Steps, in order: links and images collapse to their visible text; bold,
 * code and emphasis wrappers are removed; remaining HTML tags become spaces;
 * a small set of entities is decoded; whitespace runs collapse to one space.
 *
 * @param s Raw inline text (a single line or table row).
 * @returns The cleaned text, trimmed.
 */
function normalizeInlineMarkdown(s: string): string {
  const unwrapped = s
    // Keep visible text, drop link URLs; `!?` also covers image syntax so no stray '!' is left.
    .replace(/!?\[([^\]]+)\]\([^)]+\)/g, '$1')
    .replace(/\*\*([^*]+)\*\*/g, '$1')
    // Underscore wrappers only count at word edges, so identifiers such as
    // SEMANTIC_SEARCH_INDEX_DIRS or my_var_name keep their underscores.
    .replace(/(?<![\p{L}\p{N}_])__([^_]+)__(?![\p{L}\p{N}_])/gu, '$1')
    .replace(/`([^`]+)`/g, '$1')
    // A '*' followed by whitespace (list bullet, "2 * 3") does not open emphasis,
    // and one preceded by whitespace does not close it.
    .replace(/\*(?=\S)([^*]+?)(?<=\S)\*/g, '$1')
    .replace(/(?<![\p{L}\p{N}_])_([^_]+)_(?![\p{L}\p{N}_])/gu, '$1')
    // Drop remaining HTML tags.
    .replace(/<[^>]+>/g, ' ');

  const decoded = unwrapped
    .replace(/&nbsp;/gi, ' ')
    .replace(/&lt;/gi, '<')
    .replace(/&gt;/gi, '>')
    .replace(/&quot;/gi, '"')
    .replace(/&#39;/gi, "'")
    // '&amp;' goes last so that "&amp;lt;" decodes to "&lt;" rather than "<".
    .replace(/&amp;/gi, '&');

  return decoded.replace(/\s+/g, ' ').trim();
}
110
+
111
/**
 * Classifies a paragraph: `'list'` when every piece starts with a bullet
 * (`-`, `*`, `+`) or an ordered marker (`1.` / `1)`), otherwise `'prose'`.
 * An empty paragraph is `'prose'`.
 */
function proseChunkTypeFromPieces(pieces: Array<{ line: number; text: string }>): ChunkType {
  const listMarker = /^([-*+]\s+|\d+[.)]\s+)/;
  if (pieces.length === 0) return 'prose';
  for (const piece of pieces) {
    if (!listMarker.test(piece.text)) return 'prose';
  }
  return 'list';
}
115
+
116
/**
 * Converts the raw lines of an HTML `<table>` block into plain text with one
 * table row per output line and cells separated by ` | `.
 *
 * Source line breaks are joined with a space before parsing, so only `</tr>`
 * ends a row. This keeps a row together even when the markup places every
 * `<td>` on its own line.
 *
 * @param tableLines Raw source lines from the opening `<table` line through `</table>`.
 * @returns The flattened rows joined with `\n`; empty rows are dropped.
 */
function flattenTableLines(tableLines: string[]): string {
  const rowsText = tableLines
    .join(' ')
    .replace(/<tbody[^>]*>|<\/tbody>|<thead[^>]*>|<\/thead>|<table[^>]*>|<\/table>/gi, ' ')
    .replace(/<\/tr>/gi, '\n')
    .replace(/<tr\b[^>]*>/gi, ' ')
    .replace(/<\/t[dh]>/gi, ' | ')
    .replace(/<t[dh]\b[^>]*>/gi, ' ')
    // Any other tag becomes a space so "foo<br>bar" does not fuse into "foobar".
    .replace(/<[^>]+>/g, ' ');

  return rowsText
    .split('\n')
    // Drop the trailing cell separator(s) left by the final </td> of each row.
    .map((row) => normalizeInlineMarkdown(row.replace(/(?:\s*\|)+\s*$/, '').trim()))
    .filter((row) => row.length > 0)
    .join('\n');
}
132
+
133
/** Concatenates the text of every piece, separated by single newlines. */
function bodyJoined(pieces: Array<{ line: number; text: string }>): string {
  let joined = '';
  pieces.forEach((piece, index) => {
    if (index > 0) joined += '\n';
    joined += piece.text;
  });
  return joined;
}
136
+
137
/**
 * Maps a character index within `prefix + bodyJoined(pieces)` (no context
 * prefix) back to the source line of the piece that contains it.
 *
 * Indexes inside the prefix resolve to the first piece; indexes past the end
 * resolve to the last piece; with no pieces at all the result is line 1. The
 * newline joining two pieces is attributed to the piece before it.
 */
function lineInBody(
  prefixLen: number,
  pieces: Array<{ line: number; text: string }>,
  idx: number
): number {
  const first = pieces[0];
  if (first === undefined) return 1;
  if (idx < prefixLen) return first.line;

  const target = idx - prefixLen;
  const lastIndex = pieces.length - 1;
  let consumed = 0;
  for (const [position, piece] of pieces.entries()) {
    // Every piece except the last is followed by one joining newline.
    consumed += piece.text.length + (position < lastIndex ? 1 : 0);
    if (target < consumed) return piece.line;
  }
  return pieces[lastIndex].line;
}
153
+
154
/**
 * Maps a character index within `context + prefix + body` to a source line.
 * Indexes that fall inside the context prefix resolve to the first piece's
 * line (or 1 when there are no pieces).
 */
function lineInFull(ctxLen: number, prefixLen: number, pieces: Array<{ line: number; text: string }>, idx: number): number {
  const insideContext = idx < ctxLen;
  return insideContext ? (pieces[0]?.line ?? 1) : lineInBody(prefixLen, pieces, idx - ctxLen);
}
158
+
159
/**
 * Returns the index of the first non-whitespace character of `s` in
 * `[from, to)`. When the range holds only whitespace the scan stops at the
 * end of the range; the result never exceeds `s.length`.
 */
function firstNonWsIndex(s: string, from: number, to: number): number {
  const limit = Math.min(to, s.length);
  let cursor = from;
  for (; cursor < limit; cursor++) {
    if (!/\s/.test(s[cursor])) break;
  }
  return Math.min(cursor, s.length);
}
164
+
165
/**
 * Splits an assembled chunk (`context + prefix + body`) into parts of roughly
 * `MD_MAX_CHUNK_CHARS`, each tagged with the source line where it starts.
 *
 * Text that already fits is returned as a single trimmed part. Longer text is
 * cut preferably after a sentence end, otherwise after a newline, otherwise
 * at the hard limit; consecutive parts overlap by `MD_SPLIT_OVERLAP`
 * characters. Parts shorter than `MD_MIN_CHUNK_LENGTH` are dropped.
 *
 * @param full      The complete chunk text.
 * @param ctxLen    Length of the leading context prefix inside `full`.
 * @param prefixLen Length of the label prefix that follows the context.
 * @param pieces    Source pieces the body was built from (for line mapping).
 */
function splitLongText(
  full: string,
  ctxLen: number,
  prefixLen: number,
  pieces: Array<{ line: number; text: string }>
): Array<{ startLine: number; content: string }> {
  const total = full.length;
  const lineAt = (charIndex: number): number => lineInFull(ctxLen, prefixLen, pieces, charIndex);

  if (total <= MD_MAX_CHUNK_CHARS) {
    return [{ startLine: lineAt(firstNonWsIndex(full, 0, total)), content: full.trim() }];
  }

  // Single-part fallback: the whole text, trimmed and cut at the size limit.
  const truncatedWhole = (): { startLine: number; content: string } => ({
    startLine: lineAt(firstNonWsIndex(full, 0, total)),
    content: full.trim().slice(0, MD_MAX_CHUNK_CHARS),
  });

  const sentenceEnds = ['. ', '? ', '! ', '.\n', '?\n', '!\n'];
  const parts: Array<{ startLine: number; content: string }> = [];
  let start = 0;

  while (start < total) {
    let stop = Math.min(start + MD_MAX_CHUNK_CHARS, total);
    if (stop < total) {
      const window = full.slice(start, stop);
      const sentenceBreak = Math.max(...sentenceEnds.map((mark) => window.lastIndexOf(mark)));
      if (sentenceBreak >= MD_MAX_CHUNK_CHARS * 0.35) {
        // Keep the punctuation and the following separator in this part.
        stop = start + sentenceBreak + 2;
      } else {
        const lineBreak = window.lastIndexOf('\n');
        if (lineBreak > MD_MAX_CHUNK_CHARS * 0.28) stop = start + lineBreak + 1;
      }
    }

    const text = full.slice(start, stop).trim();
    if (text.length >= MD_MIN_CHUNK_LENGTH) {
      parts.push({ startLine: lineAt(firstNonWsIndex(full, start, stop)), content: text });
    }

    if (stop >= total) break;
    // Step back for overlap, but always make forward progress.
    const rewound = stop - MD_SPLIT_OVERLAP;
    start = rewound > start ? rewound : stop;
  }

  if (parts.length === 0) {
    if (full.trim().length > 0) parts.push(truncatedWhole());
    return parts;
  }

  // When every part maps to the same start line, only one truncated part is kept.
  // NOTE(review): presumably because the index stores one row per (file, line) — confirm
  // against the storage layer. Content beyond MD_MAX_CHUNK_CHARS is dropped in that case.
  if (parts.length > 1 && new Set(parts.map((part) => part.startLine)).size === 1) {
    return [truncatedWhole()];
  }
  return parts;
}
220
+
221
/**
 * Assembles one logical block into indexable chunks and appends them to `out`.
 *
 * The chunk text is `context prefix + prefix + joined pieces`; it is split when
 * too long, and every resulting part carries the same chunk type, chapter
 * (current level-2 heading), section (level-3/4 headings joined with ` / `)
 * and example id.
 *
 * @param filePath     Path stored on every emitted chunk.
 * @param pieces       Cleaned source lines with their line numbers.
 * @param prefix       Label inserted before the body (e.g. `Example ex-5-1\n`), or `''`.
 * @param headingState Current heading trail used for context and metadata.
 * @param chunkType    Kind of block being emitted.
 * @param exampleId    Example anchor id, or `null`.
 * @param out          Destination array; mutated in place.
 */
function emitChunk(
  filePath: string,
  pieces: Array<{ line: number; text: string }>,
  prefix: string,
  headingState: HeadingState,
  chunkType: ChunkType,
  exampleId: string | null,
  out: FileLine[]
): void {
  if (pieces.length === 0) return;
  const body = bodyJoined(pieces);
  // A body with no visible text would otherwise be padded by the heading context
  // past the minimum length and indexed as a chunk that says only "Context: …".
  if (body.trim().length === 0) return;

  const ctx = contextPrefix(headingState);
  const full = ctx + prefix + body;
  const splits = splitLongText(full, ctx.length, prefix.length, pieces);

  const chapter = headingState.byLevel[2] ?? null;
  const sectionTitles = [headingState.byLevel[3], headingState.byLevel[4]].filter(
    (title): title is string => Boolean(title)
  );
  const section = sectionTitles.length > 0 ? sectionTitles.join(' / ') : null;

  for (const split of splits) {
    if (split.content.length < MD_MIN_CHUNK_LENGTH) continue;
    out.push({
      filePath,
      lineNumber: split.startLine,
      content: split.content,
      chunkType,
      chapter,
      section,
      exampleId,
    });
  }
}
259
+
260
/**
 * Book-style Markdown chunks: merged blockquotes (Lojban + glosses), whole HTML tables,
 * blank-line paragraphs, consecutive list items as one block. Heading trail prepended for context.
 *
 * Lines are classified in this order: blank / single-line HTML comment (skipped),
 * heading (updates the context trail), `<table` block, example anchor followed by
 * a blockquote, span-only anchor (skipped), blockquote, and finally prose.
 *
 * @param text     Full file contents; both `\n` and `\r\n` line endings are accepted.
 * @param filePath Path recorded on every chunk.
 * @returns Chunks in document order; `lineNumber` is the 1-based start line of each chunk.
 */
export function extractMdChunks(text: string, filePath: string): FileLine[] {
  const lines = text.split(/\r?\n/).map((raw, index) => ({ num: index + 1, raw }));
  const out: FileLine[] = [];
  const headingState: HeadingState = { byLevel: Array<string | null>(7).fill(null) };
  let i = 0;

  while (i < lines.length) {
    const { num, raw } = lines[i];
    const trimmed = raw.trim();

    if (isBlank(trimmed) || isHtmlCommentLine(trimmed)) {
      i++;
      continue;
    }

    if (isMarkdownHeading(trimmed)) {
      const ph = parseHeading(trimmed);
      if (ph) updateHeadings(headingState, ph.level, normalizeInlineMarkdown(ph.title));
      i++;
      continue;
    }

    if (trimmed.includes('<table')) {
      let close = i;
      while (close < lines.length && !lines[close].raw.includes('</table>')) close++;

      if (close < lines.length) {
        const tableLines = lines.slice(i, close + 1).map((l) => l.raw);
        const flat = flattenTableLines(tableLines).trim();
        i = close + 1;
        if (flat.length >= MD_MIN_CHUNK_LENGTH) {
          emitChunk(filePath, [{ line: num, text: flat }], '', headingState, 'table', null, out);
        }
        continue;
      }

      // Unterminated table: do not swallow the rest of the file into one "table"
      // chunk. Index any visible text on the opening line and resume normal parsing.
      const openingText = normalizeInlineMarkdown(trimmed);
      if (openingText.length > 0) {
        emitChunk(filePath, [{ line: num, text: openingText }], '', headingState, 'prose', null, out);
      }
      i++;
      continue;
    }

    if (isExampleAnchorLine(trimmed)) {
      const exId = extractFirstExId(trimmed);
      const prefix = exId ? `Example ${exId}\n` : 'Example\n';
      const quotePieces: Array<{ line: number; text: string }> = [];
      let j = i + 1;
      // The example body is the run of blockquote lines directly under the anchor.
      while (j < lines.length && isBlockquoteLine(lines[j].raw.trim())) {
        const cleaned = normalizeInlineMarkdown(stripBlockquotePrefix(lines[j].raw));
        if (cleaned.length > 0) quotePieces.push({ line: lines[j].num, text: cleaned });
        j++;
      }
      if (quotePieces.length > 0) {
        emitChunk(filePath, quotePieces, prefix, headingState, 'example', exId, out);
        i = j;
        continue;
      }
      // No quoted body: keep the anchor only if its spans carry visible text.
      // The check is made on the cleaned text, because the raw markup is always
      // long enough to pass a length test even when nothing visible remains.
      i++;
      const anchorText = normalizeInlineMarkdown(trimmed);
      if (anchorText.length > 0) {
        emitChunk(filePath, [{ line: num, text: anchorText }], '', headingState, 'example', exId, out);
      }
      continue;
    }

    if (isSpanOnlyLine(trimmed)) {
      i++;
      continue;
    }

    if (isBlockquoteLine(trimmed)) {
      const quotePieces: Array<{ line: number; text: string }> = [];
      let j = i;
      while (j < lines.length && isBlockquoteLine(lines[j].raw.trim())) {
        const cleaned = normalizeInlineMarkdown(stripBlockquotePrefix(lines[j].raw));
        if (cleaned.length > 0) quotePieces.push({ line: lines[j].num, text: cleaned });
        j++;
      }
      emitChunk(filePath, quotePieces, '', headingState, 'quote', null, out);
      i = j;
      continue;
    }

    // Prose paragraph: runs until a blank line or the start of any other block kind.
    const prosePieces: Array<{ line: number; text: string }> = [];
    let j = i;
    while (j < lines.length) {
      const t = lines[j].raw.trim();
      if (isBlank(t)) break;
      if (isHtmlCommentLine(t)) {
        j++;
        continue;
      }
      if (isMarkdownHeading(t)) break;
      if (t.includes('<table')) break;
      if (isExampleAnchorLine(t)) break;
      if (isBlockquoteLine(t)) break;
      if (isSpanOnlyLine(t)) {
        j++;
        continue;
      }
      const cleaned = normalizeInlineMarkdown(t);
      if (cleaned.length > 0) prosePieces.push({ line: lines[j].num, text: cleaned });
      j++;
    }

    if (prosePieces.length > 0) {
      emitChunk(filePath, prosePieces, '', headingState, proseChunkTypeFromPieces(prosePieces), null, out);
      i = j;
      continue;
    }

    // Nothing indexable was collected; advance one line to guarantee progress.
    i++;
  }

  return out;
}
package/src/scanner.ts CHANGED
@@ -4,11 +4,16 @@ import path from 'path';
4
4
  import readline from 'readline';
5
5
  import { createHash } from 'crypto';
6
6
  import { normalizePath } from './path-util.js';
7
+ import { extractMdChunks } from './markdown-chunks.js';
7
8
 
8
9
/** Kind of Markdown block an indexed chunk was built from. */
export type ChunkType = 'prose' | 'list' | 'quote' | 'example' | 'table';

/** One indexable record: a single line of a plain-text file, or one Markdown chunk. */
export interface FileLine {
  /** Path of the source file. */
  filePath: string;
  /** 1-based line number; for Markdown chunks, the line where the chunk starts. */
  lineNumber: number;
  /** Text that is embedded and stored. */
  content: string;
  /** Block kind; only set for Markdown chunks. */
  chunkType?: ChunkType;
  /** Title of the enclosing `##` heading, when there is one. */
  chapter?: string | null;
  /** Titles of the enclosing `###` / `####` headings joined with ` / `, when there are any. */
  section?: string | null;
  /** `ex-…` anchor id for example chunks. */
  exampleId?: string | null;
}
13
18
 
14
19
  // File extensions to index
@@ -50,6 +55,16 @@ export async function* scanDirectory(dirPath: string): AsyncGenerator<FileLine>
50
55
  continue;
51
56
  }
52
57
 
58
+ const ext = path.extname(filePath).toLowerCase();
59
+ if (ext === '.md') {
60
+ const normalizedPath = normalizePath(filePath);
61
+ const text = readFileSync(filePath, 'utf8');
62
+ for (const chunk of extractMdChunks(text, normalizedPath)) {
63
+ yield chunk;
64
+ }
65
+ continue;
66
+ }
67
+
53
68
  fileStream = createReadStream(filePath);
54
69
  rl = readline.createInterface({
55
70
  input: fileStream,
@@ -136,12 +151,17 @@ export function readFileWithMetadata(filePath: string): FileContentWithMetadata
136
151
  const text = raw.toString('utf8');
137
152
  const lines: FileLine[] = [];
138
153
  const normalizedPath = normalizePath(filePath);
139
- let lineNumber = 0;
140
- for (const rawLine of text.split(/\r?\n/)) {
141
- lineNumber++;
142
- const trimmed = rawLine.trim();
143
- if (trimmed.length >= MIN_LINE_LENGTH) {
144
- lines.push({ filePath: normalizedPath, lineNumber, content: trimmed });
154
+ const ext = path.extname(filePath).toLowerCase();
155
+ if (ext === '.md') {
156
+ lines.push(...extractMdChunks(text, normalizedPath));
157
+ } else {
158
+ let lineNumber = 0;
159
+ for (const rawLine of text.split(/\r?\n/)) {
160
+ lineNumber++;
161
+ const trimmed = rawLine.trim();
162
+ if (trimmed.length >= MIN_LINE_LENGTH) {
163
+ lines.push({ filePath: normalizedPath, lineNumber, content: trimmed });
164
+ }
145
165
  }
146
166
  }
147
167
  return { filePath: normalizedPath, mtimeMs, contentHash, lines };
package/src/storage.ts CHANGED
@@ -17,6 +17,10 @@ export interface LineRecord {
17
17
  line_number: number;
18
18
  content: string;
19
19
  embedding: Float32Array;
20
+ chunk_type?: 'prose' | 'list' | 'quote' | 'example' | 'table' | null;
21
+ chapter?: string | null;
22
+ section?: string | null;
23
+ example_id?: string | null;
20
24
  }
21
25
 
22
26
  export interface SearchResult {
@@ -24,6 +28,10 @@ export interface SearchResult {
24
28
  line_number: number;
25
29
  content: string;
26
30
  score: number;
31
+ chunk_type?: 'prose' | 'list' | 'quote' | 'example' | 'table' | null;
32
+ chapter?: string | null;
33
+ section?: string | null;
34
+ example_id?: string | null;
27
35
  }
28
36
 
29
37
  type DB = Awaited<ReturnType<typeof createDatabase>>;
@@ -47,10 +55,28 @@ export class VectorStorage {
47
55
  file_path TEXT NOT NULL,
48
56
  line_number INTEGER NOT NULL,
49
57
  content TEXT NOT NULL,
58
+ chunk_type TEXT,
59
+ chapter TEXT,
60
+ section TEXT,
61
+ example_id TEXT,
50
62
  UNIQUE(file_path, line_number)
51
63
  );
52
64
  CREATE INDEX IF NOT EXISTS idx_file ON lines(file_path);
65
+ CREATE INDEX IF NOT EXISTS idx_example_id ON lines(example_id);
53
66
  `);
67
+ // Forward-only schema evolution for existing DBs.
68
+ try {
69
+ this.db.exec('ALTER TABLE lines ADD COLUMN chunk_type TEXT');
70
+ } catch {}
71
+ try {
72
+ this.db.exec('ALTER TABLE lines ADD COLUMN chapter TEXT');
73
+ } catch {}
74
+ try {
75
+ this.db.exec('ALTER TABLE lines ADD COLUMN section TEXT');
76
+ } catch {}
77
+ try {
78
+ this.db.exec('ALTER TABLE lines ADD COLUMN example_id TEXT');
79
+ } catch {}
54
80
  this.db.exec(`
55
81
  CREATE VIRTUAL TABLE IF NOT EXISTS vec_lines USING vec0(
56
82
  line_id INTEGER PRIMARY KEY,
@@ -69,13 +95,37 @@ export class VectorStorage {
69
95
  /**
70
96
  * Insert or update a line with its embedding
71
97
  */
72
- async upsertLine(filePath: string, lineNumber: number, content: string, embedding: Float32Array): Promise<void> {
98
+ async upsertLine(
99
+ filePath: string,
100
+ lineNumber: number,
101
+ content: string,
102
+ embedding: Float32Array,
103
+ meta?: {
104
+ chunkType?: string | null;
105
+ chapter?: string | null;
106
+ section?: string | null;
107
+ exampleId?: string | null;
108
+ }
109
+ ): Promise<void> {
73
110
  const insertLine = await this.db.prepare(
74
- `INSERT INTO lines (file_path, line_number, content)
75
- VALUES (?, ?, ?)
76
- ON CONFLICT(file_path, line_number) DO UPDATE SET content = excluded.content`
111
+ `INSERT INTO lines (file_path, line_number, content, chunk_type, chapter, section, example_id)
112
+ VALUES (?, ?, ?, ?, ?, ?, ?)
113
+ ON CONFLICT(file_path, line_number) DO UPDATE SET
114
+ content = excluded.content,
115
+ chunk_type = excluded.chunk_type,
116
+ chapter = excluded.chapter,
117
+ section = excluded.section,
118
+ example_id = excluded.example_id`
77
119
  );
78
- insertLine.run([filePath, lineNumber, content]);
120
+ insertLine.run([
121
+ filePath,
122
+ lineNumber,
123
+ content,
124
+ meta?.chunkType ?? null,
125
+ meta?.chapter ?? null,
126
+ meta?.section ?? null,
127
+ meta?.exampleId ?? null,
128
+ ]);
79
129
 
80
130
  const sel = await this.db.prepare('SELECT id FROM lines WHERE file_path = ? AND line_number = ?');
81
131
  const row = sel.get([filePath, lineNumber]) as { id: number } | undefined;
@@ -90,14 +140,28 @@ export class VectorStorage {
90
140
  * Batch insert lines for efficiency (single transaction)
91
141
  */
92
142
  async upsertLinesBatch(
93
- lines: Array<{ filePath: string; lineNumber: number; content: string; embedding: Float32Array }>
143
+ lines: Array<{
144
+ filePath: string;
145
+ lineNumber: number;
146
+ content: string;
147
+ embedding: Float32Array;
148
+ chunkType?: string | null;
149
+ chapter?: string | null;
150
+ section?: string | null;
151
+ exampleId?: string | null;
152
+ }>
94
153
  ): Promise<void> {
95
154
  if (lines.length === 0) return;
96
155
 
97
156
  const insertLine = await this.db.prepare(
98
- `INSERT INTO lines (file_path, line_number, content)
99
- VALUES (?, ?, ?)
100
- ON CONFLICT(file_path, line_number) DO UPDATE SET content = excluded.content`
157
+ `INSERT INTO lines (file_path, line_number, content, chunk_type, chapter, section, example_id)
158
+ VALUES (?, ?, ?, ?, ?, ?, ?)
159
+ ON CONFLICT(file_path, line_number) DO UPDATE SET
160
+ content = excluded.content,
161
+ chunk_type = excluded.chunk_type,
162
+ chapter = excluded.chapter,
163
+ section = excluded.section,
164
+ example_id = excluded.example_id`
101
165
  );
102
166
  const selId = await this.db.prepare('SELECT id FROM lines WHERE file_path = ? AND line_number = ?');
103
167
  const deleteVec = await this.db.prepare('DELETE FROM vec_lines WHERE line_id = ?');
@@ -106,7 +170,15 @@ export class VectorStorage {
106
170
  this.db.exec('BEGIN');
107
171
  try {
108
172
  for (const item of lines) {
109
- insertLine.run([item.filePath, item.lineNumber, item.content]);
173
+ insertLine.run([
174
+ item.filePath,
175
+ item.lineNumber,
176
+ item.content,
177
+ item.chunkType ?? null,
178
+ item.chapter ?? null,
179
+ item.section ?? null,
180
+ item.exampleId ?? null,
181
+ ]);
110
182
  const row = selId.get([item.filePath, item.lineNumber]) as { id: number };
111
183
  const idInt = BigInt(row.id);
112
184
  deleteVec.run([idInt]);
@@ -122,30 +194,67 @@ export class VectorStorage {
122
194
  /**
123
195
  * Search for similar lines using sqlite-vec cosine distance
124
196
  */
125
- async search(queryEmbedding: Float32Array, limit: number = 10): Promise<SearchResult[]> {
197
+ async search(queryEmbedding: Float32Array, limit: number = 10, queryText: string = ''): Promise<SearchResult[]> {
198
+ const candidateLimit = Math.max(limit * 4, limit);
126
199
  const stmt = await this.db.prepare(`
127
200
  SELECT
128
201
  l.file_path,
129
202
  l.line_number,
130
203
  l.content,
204
+ l.chunk_type,
205
+ l.chapter,
206
+ l.section,
207
+ l.example_id,
131
208
  vec_distance_cosine(v.embedding, ?) AS distance
132
209
  FROM vec_lines v
133
210
  INNER JOIN lines l ON v.line_id = l.id
134
211
  ORDER BY distance
135
212
  LIMIT ?
136
213
  `);
137
- const rows = stmt.all([queryEmbedding.buffer, limit]) as Array<{
214
+ const rows = stmt.all([queryEmbedding.buffer, candidateLimit]) as Array<{
138
215
  file_path: string;
139
216
  line_number: number;
140
217
  content: string;
218
+ chunk_type?: 'prose' | 'list' | 'quote' | 'example' | 'table' | null;
219
+ chapter?: string | null;
220
+ section?: string | null;
221
+ example_id?: string | null;
141
222
  distance: number;
142
223
  }>;
143
- return rows.map((row) => ({
144
- file_path: row.file_path,
145
- line_number: row.line_number,
146
- content: row.content,
147
- score: 1 - row.distance,
148
- }));
224
+ const queryLower = queryText.toLowerCase();
225
+ const queryExample = queryLower.match(/\bex-\d+(?:-\d+)+\b/)?.[0] ?? null;
226
+ const queryTokens = new Set((queryLower.match(/[a-z0-9]{3,}/g) ?? []).slice(0, 32));
227
+ const wantsExamples = /\b(example|translation|translate|gloss|lojban)\b/.test(queryLower);
228
+ const wantsTable = /\b(table|cmavo|selma'o|series)\b/.test(queryLower);
229
+
230
+ const scored = rows.map((row) => {
231
+ let score = 1 - row.distance;
232
+ if (queryExample && row.example_id && row.example_id.toLowerCase() === queryExample) {
233
+ score += 0.25;
234
+ }
235
+ if (wantsExamples && row.chunk_type === 'example') score += 0.09;
236
+ if (wantsTable && row.chunk_type === 'table') score += 0.07;
237
+ if (queryTokens.size > 0) {
238
+ const titleText = `${row.chapter ?? ''} ${row.section ?? ''}`.toLowerCase();
239
+ const titleTokens = new Set(titleText.match(/[a-z0-9]{3,}/g) ?? []);
240
+ let overlap = 0;
241
+ for (const t of queryTokens) if (titleTokens.has(t)) overlap++;
242
+ if (overlap > 0) score += Math.min(0.08, overlap * 0.02);
243
+ }
244
+ return {
245
+ file_path: row.file_path,
246
+ line_number: row.line_number,
247
+ content: row.content,
248
+ score: Math.round(score * 1000) / 1000,
249
+ chunk_type: row.chunk_type ?? null,
250
+ chapter: row.chapter ?? null,
251
+ section: row.section ?? null,
252
+ example_id: row.example_id ?? null,
253
+ };
254
+ });
255
+
256
+ scored.sort((a, b) => b.score - a.score);
257
+ return scored.slice(0, limit);
149
258
  }
150
259
 
151
260
  async getStats(): Promise<{ totalFiles: number; totalLines: number }> {