npm - @lojban/semantic-search-mcp - Versions diffs - 1.0.15 → 1.0.18 - Mend

@lojban/semantic-search-mcp 1.0.15 → 1.0.18

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

package/README.md +3 -1
package/package.json +3 -2
package/src/index.ts +51 -7
package/src/markdown-chunks.test.ts +106 -0
package/src/markdown-chunks.ts +408 -0
package/src/scanner.ts +26 -6
package/src/storage.ts +127 -18

package/README.md CHANGED Viewed

@@ -12,7 +12,7 @@ Use it in **Cursor**, **Claude Code**, or any IDE that supports MCP to search th
 ## How it works
-- **Indexing**: On startup, the server indexes content in the background. If **`SEMANTIC_SEARCH_INDEX_DIRS`** is set (comma-separated paths), it scans those directories. If it is *not* set, the server downloads the [lojban/sampu_vlaste](https://github.com/lojban/sampu_vlaste) repository from GitHub and indexes that instead. In both cases, the server looks for `.txt`, `.md`, `.tsv`, `.csv` files. Each non-empty line gets a vector embedding (via [Hugging Face Transformers.js](https://huggingface.co/docs/transformers.js), model `Xenova/all-MiniLM-L6-v2`) and is stored in a local SQLite database with [@dao-xyz/sqlite3-vec](https://www.npmjs.com/package/@dao-xyz/sqlite3-vec) (SQLite + sqlite-vec for Node and browser). Indexing runs asynchronously so the server stays responsive and uses bounded memory.
+- **Indexing**: On startup, the server indexes content in the background. If **`SEMANTIC_SEARCH_INDEX_DIRS`** is set (comma-separated paths), it scans those directories. If it is *not* set, the server downloads the [lojban/sampu_vlaste](https://github.com/lojban/sampu_vlaste) repository from GitHub and indexes that instead. In both cases, the server looks for `.txt`, `.md`, `.tsv`, `.csv` files. **`.txt`, `.tsv`, `.csv`**: each non-empty line is one record. **`.md`**: chunks by paragraphs and blocks—merged multi-line `>` blockquotes (e.g. Lojban + glosses), whole HTML `<table>...</table>` blocks, and blank-line-separated prose (including consecutive list items). Latest `##` / `###` titles are prepended as `Context: …` on each chunk for better retrieval. Each chunk gets one embedding (via [Hugging Face Transformers.js](https://huggingface.co/docs/transformers.js), model `Xenova/all-MiniLM-L6-v2`) and is stored in a local SQLite database with [@dao-xyz/sqlite3-vec](https://www.npmjs.com/package/@dao-xyz/sqlite3-vec) (SQLite + sqlite-vec for Node and browser). The `line` field in search results is the **start line** of that chunk in the file. After upgrading to a version that changes chunking, restart the server so files are re-indexed (mtime/content hash refresh).
 - **Search**: You send a natural-language query; the server embeds it and returns the closest lines by cosine similarity.
 - **Storage**: Index is stored in your project's `.semantic-search/data/` (or set `SEMANTIC_SEARCH_DATA_DIR`). No cloud, no API keys.
@@ -98,6 +98,8 @@ The server is **not built to JavaScript**; it runs via **`npx tsx src/index.ts`*
 To run the server from the repo: `npm run dev` or `npx tsx src/index.ts`.
+Run tests: `npm test`.
 ## License
 MIT

package/package.json CHANGED Viewed

@@ -1,10 +1,11 @@
 {
   "name": "@lojban/semantic-search-mcp",
-  "version": "1.0.15",
+  "version": "1.0.18",
   "description": "Local-first MCP server for semantic search using transformers.js and SQLite",
   "type": "module",
   "scripts": {
-    "dev": "tsx src/index.ts"
+    "dev": "tsx src/index.ts",
+    "test": "node --import tsx --test src/*.test.ts"
   },
   "dependencies": {
     "@dao-xyz/sqlite3-vec": "^0.0.19",

package/src/index.ts CHANGED Viewed

@@ -6,16 +6,17 @@ import {
   ListToolsRequestSchema,
 } from '@modelcontextprotocol/sdk/types.js';
 import path from 'path';
+import os from 'os';
 import { getEmbedding, getBatchEmbeddings } from './embeddings.js';
 import { createVectorStorage, type SearchResult } from './storage.js';
 import { normalizePath } from './path-util.js';
 import { listFilesInDirectories, readFileWithMetadata } from './scanner.js';
 import { getSampuVlasteDir } from './download-sampu-vlaste.js';
-// Data dir: use env, or project cwd so each workspace has its own index when run via npx from Cursor
+// Data dir: use env, or $HOME/.semantic-search/data so the same index is used across MCP version upgrades (npx cwd can change per version)
 const dataDir =
   process.env.SEMANTIC_SEARCH_DATA_DIR ||
-  path.join(process.cwd(), '.semantic-search', 'data');
+  path.join(os.homedir(), '.semantic-search', 'data');
 const DB_PATH = path.join(dataDir, 'vectors.db');
 // Background indexing state (progress for get_index_stats)
@@ -58,6 +59,7 @@ async function runBackgroundIndexing(
       storage.getFileMetadata(),
       storage.getIndexedFilePaths(),
     ]);
+    const indexedPathSet = new Set(indexedPaths.map((p) => normalizePath(p)));
     // Remove from DB any file whose directory is no longer in SEMANTIC_SEARCH_INDEX_DIRS
     for (const filePath of indexedPaths) {
@@ -67,7 +69,15 @@ async function runBackgroundIndexing(
     }
     const processBatch = async (
-      batch: Array<{ filePath: string; lineNumber: number; content: string }>
+      batch: Array<{
+        filePath: string;
+        lineNumber: number;
+        content: string;
+        chunkType?: 'prose' | 'list' | 'quote' | 'example' | 'table';
+        chapter?: string | null;
+        section?: string | null;
+        exampleId?: string | null;
+      }>
     ): Promise<void> => {
       if (batch.length === 0) return;
       const contents = batch.map((l) => l.content);
@@ -77,6 +87,10 @@ async function runBackgroundIndexing(
         lineNumber: line.lineNumber,
         content: line.content,
         embedding: embeddings[idx],
+        chunkType: line.chunkType ?? null,
+        chapter: line.chapter ?? null,
+        section: line.section ?? null,
+        exampleId: line.exampleId ?? null,
       }));
       await storage.upsertLinesBatch(batchData);
       indexingState.linesIndexed += batch.length;
@@ -88,14 +102,32 @@ async function runBackgroundIndexing(
       new Promise((resolve) => setImmediate(resolve));
     const currentFilesOnDisk = new Set<string>();
-    const toIndex: Array<{ filePath: string; mtimeMs: number; contentHash: string; lines: Array<{ filePath: string; lineNumber: number; content: string }> }> = [];
+    const toIndex: Array<{
+      filePath: string;
+      mtimeMs: number;
+      contentHash: string;
+      lines: Array<{
+        filePath: string;
+        lineNumber: number;
+        content: string;
+        chunkType?: 'prose' | 'list' | 'quote' | 'example' | 'table';
+        chapter?: string | null;
+        section?: string | null;
+        exampleId?: string | null;
+      }>;
+    }> = [];
     for await (const { filePath, mtimeMs } of listFilesInDirectories(directories)) {
       currentFilesOnDisk.add(filePath);
       const meta = fileMetadata.get(filePath);
       const content = readFileWithMetadata(filePath);
       if (!content) continue;
-      if (meta && meta.mtimeMs === content.mtimeMs && meta.contentHash === content.contentHash) {
+      if (
+        indexedPathSet.has(normalizePath(filePath)) &&
+        meta &&
+        meta.mtimeMs === content.mtimeMs &&
+        meta.contentHash === content.contentHash
+      ) {
         continue; // unchanged, skip
       }
       toIndex.push({
@@ -113,7 +145,15 @@ async function runBackgroundIndexing(
       }
     }
-    let currentBatch: Array<{ filePath: string; lineNumber: number; content: string }> = [];
+    let currentBatch: Array<{
+      filePath: string;
+      lineNumber: number;
+      content: string;
+      chunkType?: 'prose' | 'list' | 'quote' | 'example' | 'table';
+      chapter?: string | null;
+      section?: string | null;
+      exampleId?: string | null;
+    }> = [];
     let processingPromise: Promise<void> | null = null;
     for (const entry of toIndex) {
@@ -207,7 +247,7 @@ async function main() {
           const limit = (args as { query: string; limit?: number }).limit ?? 10;
           const queryEmbedding = await getEmbedding(query);
-          const results = await storage.search(queryEmbedding, limit);
+          const results = await storage.search(queryEmbedding, limit, query);
           return {
             content: [
@@ -220,6 +260,10 @@ async function main() {
                     line: r.line_number,
                     content: r.content,
                     score: Math.round(r.score * 1000) / 1000,
+                    chunk_type: r.chunk_type ?? undefined,
+                    chapter: r.chapter ?? undefined,
+                    section: r.section ?? undefined,
+                    example_id: r.example_id ?? undefined,
                   })),
                 }),
               },

package/src/markdown-chunks.test.ts ADDED Viewed

@@ -0,0 +1,106 @@
+import { describe, it } from 'node:test';
+import assert from 'node:assert/strict';
+import { extractMdChunks } from './markdown-chunks.js';
+const fp = '/books/cll/5.md';
+// Behavioural tests for extractMdChunks, using small inline Markdown fixtures.
+// NOTE(review): the first test expects lineNumber 6 for the first blockquote line, which implies
+// the fixtures contain blank lines that are not visible in this rendering — confirm against the
+// published file before editing the template literals.
+describe('extractMdChunks', () => {
+  // An example anchor (<span id="ex-…">) followed by a blockquote becomes one 'example' chunk
+  // carrying the example id, the heading context, and chapter/section metadata.
+  it('merges example span + multiline blockquote and prefixes ex id', () => {
+    const md = `## Chapter 5: Selbri
+### Brivla
+<span id="ex-5-1"></span><span id="example-x"></span>
+> **do mamta mi**
+> _You are-a-mother-of me_
+> _You are my mother_
+The next paragraph after a blank line.
+`;
+    const chunks = extractMdChunks(md, fp);
+    const ex = chunks.find((c) => c.content.includes('Example ex-5-1'));
+    assert.ok(ex, 'expected chunk with ex anchor');
+    assert.match(ex!.content, /do mamta mi/);
+    assert.match(ex!.content, /You are my mother/);
+    assert.ok(ex!.content.includes('Context:'), 'heading context prepended');
+    assert.ok(ex!.content.includes('Chapter 5: Selbri'));
+    assert.ok(ex!.content.includes('Brivla'));
+    assert.equal(ex!.lineNumber, 6);
+    assert.equal(ex!.chunkType, 'example');
+    assert.equal(ex!.exampleId, 'ex-5-1');
+    assert.equal(ex!.chapter, 'Chapter 5: Selbri');
+    assert.equal(ex!.section, 'Brivla');
+  });
+  // An HTML <table> is flattened to plain text; inline Markdown markers (** and `) must be gone.
+  it('flattens HTML table into one chunk', () => {
+    const md = `## Ch 7
+<table>
+<tbody>
+<tr>
+<td>**mi**</td>
+<td>\`KOhA\`</td>
+<td>I, me</td>
+</tr>
+<tr>
+<td>**do**</td>
+<td>\`KOhA\`</td>
+<td>you</td>
+</tr>
+</tbody>
+</table>
+`;
+    const chunks = extractMdChunks(md, fp);
+    const t = chunks.find((c) => c.content.includes('**mi**'));
+    assert.ok(!t, 'old markdown-formatted variant should not remain');
+    const cleaned = chunks.find((c) => c.content.includes('mi') && c.content.includes('do'));
+    assert.ok(cleaned);
+    assert.match(cleaned!.content, /\bmi\b/);
+    assert.match(cleaned!.content, /\bdo\b/);
+    assert.ok(cleaned!.content.includes('I, me'));
+  });
+  // Adjacent list items end up together in a single chunk.
+  it('merges consecutive list items without blank lines into one chunk', () => {
+    const md = `## Lists
+Intro line.
+- **mi'o** speaker and listener
+- **mi'a** speaker and others
+After blank.
+`;
+    const chunks = extractMdChunks(md, fp);
+    const list = chunks.find((c) => c.content.includes("mi'o") && c.content.includes("mi'a"));
+    assert.ok(list);
+  });
+  // A blockquote with no preceding example anchor is still merged, but typed 'quote'.
+  it('merges orphan blockquotes without leading span', () => {
+    const md = `> **ta bloti**
+> _That is a boat._
+Done.
+`;
+    const chunks = extractMdChunks(md, fp);
+    const q = chunks.find((c) => c.content.includes('ta bloti'));
+    assert.ok(q);
+    assert.match(q!.content, /That is a boat/);
+    assert.equal(q!.chunkType, 'quote');
+  });
+  // Span-only anchor lines whose ids are not "ex-…" produce no chunk of their own.
+  it('skips non-example span-only anchor lines', () => {
+    const md = `## Chapter
+<span id="sec-5-1"></span><span id="section-content-words-brivla"></span>
+Paragraph with real content.
+`;
+    const chunks = extractMdChunks(md, fp);
+    const anchorOnly = chunks.find((c) => c.content.includes('sec-5-1'));
+    assert.equal(anchorOnly, undefined);
+    const prose = chunks.find((c) => c.content.includes('Paragraph with real content.'));
+    assert.ok(prose);
+  });
+});

package/src/markdown-chunks.ts ADDED Viewed

@@ -0,0 +1,408 @@
+import type { FileLine } from './scanner.js'; // type-only: no runtime cycle with scanner importing this module
+/** Min chunk length after merge (book mode); keeps short examples, drops noise */
+export const MD_MIN_CHUNK_LENGTH = 12;
+/** Rough max characters per indexed chunk before sentence split */
+export const MD_MAX_CHUNK_CHARS = 1800;
+/** Overlap when splitting long chunks (characters) */
+export const MD_SPLIT_OVERLAP = 100;
+/** ATX heading: group 1 is the run of 1-6 `#` characters, group 2 the title text. */
+const HEADING_RE = /^(#{1,6})\s+(.+)$/;
+/** Blockquote line: optional indent, `>`, at most one space/tab; group 1 is the quoted text. */
+const BLOCKQUOTE_RE = /^\s*>[ \t]?(.*)$/;
+/** Captures the first `id="ex-…"` attribute value on a line (group 1). */
+const EX_ID_RE = /id="(ex-[^"]+)"/;
+/** Matches an empty `<span …></span>` element (only whitespace between the tags). */
+const SPAN_TAG_RE = /<span\b[^>]*>\s*<\/span>/gi;
+/** Chunk kinds, derived from the optional `chunkType` field of `FileLine`. */
+type ChunkType = NonNullable<FileLine['chunkType']>;
+/** True when the (already trimmed) line has no characters at all. */
+function isBlank(trimmed: string): boolean {
+  return trimmed === '';
+}
+/** True when the trimmed line is exactly one single-line HTML comment (`<!-- … -->`). */
+function isHtmlCommentLine(trimmed: string): boolean {
+  const singleLineComment = /^<!--.*-->$/;
+  return singleLineComment.test(trimmed);
+}
+/** True when the trimmed line is an ATX heading (`#` … `######` followed by a title). */
+function isMarkdownHeading(trimmed: string): boolean {
+  return trimmed.match(HEADING_RE) !== null;
+}
+/**
+ * Split an ATX heading into its level (number of `#`) and trimmed title.
+ * Returns null when the line is not a heading.
+ */
+function parseHeading(trimmed: string): { level: number; title: string } | null {
+  const match = HEADING_RE.exec(trimmed);
+  if (match === null) return null;
+  const [, hashes, rawTitle] = match;
+  return { level: hashes.length, title: rawTitle.trim() };
+}
+/**
+ * CLL example anchor: a line that is not a blockquote, mentions an `id="ex-…"` attribute,
+ * and consists of nothing but `<span>…</span>` elements (several are allowed).
+ */
+function isExampleAnchorLine(trimmed: string): boolean {
+  const looksLikeAnchor = !trimmed.startsWith('>') && trimmed.includes('id="ex-');
+  if (!looksLikeAnchor) return false;
+  // Whatever survives after removing every span (and all whitespace) is non-anchor content.
+  const residue = trimmed.replace(/<span\b[^>]*>[\s\S]*?<\/span>/gi, '').replace(/\s+/g, '');
+  return residue === '';
+}
+/** Return the first `ex-…` id found in an `id="…"` attribute on the line, or null if none. */
+function extractFirstExId(trimmed: string): string | null {
+  return EX_ID_RE.exec(trimmed)?.[1] ?? null;
+}
+/** True when the line contains `<span` and nothing remains once empty span elements are removed. */
+function isSpanOnlyLine(trimmed: string): boolean {
+  if (!trimmed.includes('<span')) return false;
+  const remainder = trimmed.replace(SPAN_TAG_RE, '').replace(/\s+/g, '');
+  return remainder.length === 0;
+}
+/**
+ * Remove the leading `>` marker (and at most one following space/tab) from a blockquote line.
+ * Lines that are not blockquotes are returned trimmed.
+ */
+function stripBlockquotePrefix(line: string): string {
+  const quoted = BLOCKQUOTE_RE.exec(line);
+  if (quoted === null) return line.trim();
+  return quoted[1];
+}
+/** True when the first non-whitespace character of the line is `>`. */
+function isBlockquoteLine(trimmed: string): boolean {
+  return trimmed.trimStart().startsWith('>');
+}
+/** Most recent heading title seen at each Markdown heading level. */
+interface HeadingState {
+  // Indexed by heading level; null means no heading is currently active at that level.
+  // updateHeadings only writes levels 2-6 (level-1 headings are ignored).
+  byLevel: Array<string | null>;
+}
+/**
+ * Build the `Context: …` header from the active level 2-4 headings, joined with ` / `
+ * and followed by a blank line. Returns '' when no such heading is active.
+ */
+function contextPrefix(state: HeadingState): string {
+  const trail = [2, 3, 4]
+    .map((level) => state.byLevel[level])
+    .filter((title): title is string => Boolean(title));
+  return trail.length > 0 ? `Context: ${trail.join(' / ')}\n\n` : '';
+}
+/**
+ * Record a new heading: store its title at its level (capped at 6) and clear every deeper
+ * level, since those belonged to the previous section. Level-1 headings are ignored.
+ */
+function updateHeadings(state: HeadingState, level: number, title: string): void {
+  if (level < 2) return;
+  const depth = level > 6 ? 6 : level;
+  state.byLevel[depth] = title;
+  let deeper = depth + 1;
+  while (deeper <= 6) {
+    state.byLevel[deeper] = null;
+    deeper += 1;
+  }
+}
+/**
+ * Reduce a line of Markdown/HTML to plain searchable text.
+ *
+ * Steps, in order: links keep only their visible text; bold/italic/code wrappers are removed;
+ * HTML tags and comments are replaced by a space; a small set of character entities is decoded;
+ * runs of whitespace collapse to a single space and the result is trimmed.
+ *
+ * @param s Raw line (or fragment) of Markdown.
+ * @returns The plain-text form; may be the empty string.
+ */
+function normalizeInlineMarkdown(s: string): string {
+  let out = s;
+  // Keep visible text, drop link URLs.
+  out = out.replace(/\[([^\]]+)\]\([^)]+\)/g, '$1');
+  // Remove common inline formatting wrappers.
+  // Underscore emphasis only counts when the underscores are NOT inside a word, so identifiers
+  // and place notation such as `x_1 ... x_2` or `SNAKE_CASE_NAME` keep their underscores.
+  out = out
+    .replace(/\*\*([^*]+)\*\*/g, '$1')
+    .replace(/(?<![A-Za-z0-9_])__([^_]+)__(?![A-Za-z0-9_])/g, '$1')
+    .replace(/`([^`]+)`/g, '$1')
+    .replace(/\*([^*]+)\*/g, '$1')
+    .replace(/(?<![A-Za-z0-9_])_([^_]+)_(?![A-Za-z0-9_])/g, '$1');
+  // Drop remaining HTML tags and comments. A tag must start with a letter (or `/` + letter),
+  // so prose like "a < b and c > d" is not mistaken for markup and deleted.
+  out = out.replace(/<(?:\/?[A-Za-z][^>]*|!--[\s\S]*?--)>/g, ' ');
+  // Decode common entities. `&amp;` goes last so that `&amp;lt;` yields the literal text
+  // `&lt;` rather than being decoded twice into `<`.
+  out = out
+    .replace(/&nbsp;/gi, ' ')
+    .replace(/&lt;/gi, '<')
+    .replace(/&gt;/gi, '>')
+    .replace(/&quot;/gi, '"')
+    .replace(/&#39;/gi, "'")
+    .replace(/&amp;/gi, '&');
+  return out.replace(/\s+/g, ' ').trim();
+}
+/**
+ * Classify a paragraph: 'list' when every piece starts with a bullet (`-`, `*`, `+`) or an
+ * ordered marker (`1.` / `1)`) followed by whitespace, otherwise 'prose'.
+ */
+function proseChunkTypeFromPieces(pieces: Array<{ line: number; text: string }>): ChunkType {
+  if (pieces.length === 0) return 'prose';
+  const allListItems = pieces.every((piece) => /^([-*+]\s+|\d+[.)]\s+)/.test(piece.text));
+  return allListItems ? 'list' : 'prose';
+}
+/**
+ * Turn the raw lines of an HTML table into plain text: table/section tags are dropped, each
+ * `</tr>` ends a line, each closing cell tag becomes ` | `, any other tag is removed, and every
+ * resulting line is passed through normalizeInlineMarkdown. Empty lines are discarded.
+ */
+function flattenTableLines(tableLines: string[]): string {
+  // Applied strictly in this order; later patterns rely on earlier ones having run.
+  const structuralRewrites: Array<[RegExp, string]> = [
+    [/<tbody[^>]*>|<\/tbody>|<thead[^>]*>|<\/thead>|<table[^>]*>|<\/table>/gi, ''],
+    [/<\/tr>/gi, '\n'],
+    [/<tr[^>]*>/gi, ''],
+    [/<\/td>/gi, ' | '],
+    [/<\/th>/gi, ' | '],
+    [/<t[dh][^>]*>/gi, ''],
+    [/<[^>]+>/g, ''],
+  ];
+  let markup = tableLines.join('\n');
+  for (const [pattern, replacement] of structuralRewrites) {
+    markup = markup.replace(pattern, replacement);
+  }
+  const rows: string[] = [];
+  for (const rawRow of markup.split('\n')) {
+    // Drop the dangling cell separator left by the last cell of a row.
+    const row = normalizeInlineMarkdown(rawRow.replace(/\s*\|\s*$/, '').trim());
+    if (row.length > 0) rows.push(row);
+  }
+  return rows.join('\n');
+}
+/** Concatenate the text of all pieces, one piece per line. */
+function bodyJoined(pieces: Array<{ line: number; text: string }>): string {
+  const texts: string[] = [];
+  for (const { text } of pieces) texts.push(text);
+  return texts.join('\n');
+}
+/**
+ * Map a character index in `prefix + bodyJoined(pieces)` (no heading context) to the source
+ * line of the piece containing it. Indices inside the prefix map to the first piece; indices
+ * past the end map to the last piece; with no pieces the answer is line 1.
+ */
+function lineInBody(
+  prefixLen: number,
+  pieces: Array<{ line: number; text: string }>,
+  idx: number
+): number {
+  if (pieces.length === 0) return 1;
+  if (idx < prefixLen) return pieces[0].line;
+  const offset = idx - prefixLen;
+  const lastIndex = pieces.length - 1;
+  let segmentEnd = 0;
+  for (const [k, piece] of pieces.entries()) {
+    // Every piece except the last owns the '\n' that follows it in the joined body.
+    segmentEnd += piece.text.length + (k < lastIndex ? 1 : 0);
+    if (offset < segmentEnd) return piece.line;
+  }
+  return pieces[lastIndex].line;
+}
+/**
+ * Map a character index in the full chunk text (heading context + prefix + body) to a source
+ * line. Indices inside the context map to the first piece's line (1 when there are no pieces).
+ */
+function lineInFull(ctxLen: number, prefixLen: number, pieces: Array<{ line: number; text: string }>, idx: number): number {
+  const insideContext = idx < ctxLen;
+  if (!insideContext) return lineInBody(prefixLen, pieces, idx - ctxLen);
+  const first = pieces[0];
+  return first === undefined ? 1 : first.line;
+}
+/**
+ * Index of the first non-whitespace character of `s` in [from, to), or the point where the
+ * scan stopped if there is none. The result is never greater than `s.length`.
+ */
+function firstNonWsIndex(s: string, from: number, to: number): number {
+  const limit = Math.min(to, s.length);
+  let cursor = from;
+  for (; cursor < limit; cursor++) {
+    if (!/\s/.test(s[cursor])) break;
+  }
+  return cursor > s.length ? s.length : cursor;
+}
+/**
+ * Split an assembled chunk into slices of at most MD_MAX_CHUNK_CHARS characters, each tagged
+ * with the source line on which its first non-whitespace character falls.
+ *
+ * @param full      Complete chunk text: heading context, then prefix, then the joined body.
+ * @param ctxLen    Length of the heading-context part at the start of `full`.
+ * @param prefixLen Length of the prefix that follows the context.
+ * @param pieces    Source lines forming the body; used to map character offsets to line numbers.
+ * @returns A single entry when `full` already fits, otherwise the (overlapping) slices.
+ */
+function splitLongText(
+  full: string,
+  ctxLen: number,
+  prefixLen: number,
+  pieces: Array<{ line: number; text: string }>
+): Array<{ startLine: number; content: string }> {
+  // Fast path: the whole text fits in one chunk.
+  if (full.length <= MD_MAX_CHUNK_CHARS) {
+    const idx = firstNonWsIndex(full, 0, full.length);
+    return [{ startLine: lineInFull(ctxLen, prefixLen, pieces, idx), content: full.trim() }];
+  }
+  const out: Array<{ startLine: number; content: string }> = [];
+  let pos = 0;
+  const n = full.length;
+  while (pos < n) {
+    let end = Math.min(pos + MD_MAX_CHUNK_CHARS, n);
+    if (end < n) {
+      // Not the final slice: look for a natural break inside the window.
+      const window = full.slice(pos, end);
+      const lastSentence = Math.max(
+        window.lastIndexOf('. '),
+        window.lastIndexOf('? '),
+        window.lastIndexOf('! '),
+        window.lastIndexOf('.\n'),
+        window.lastIndexOf('?\n'),
+        window.lastIndexOf('!\n')
+      );
+      // Prefer the last sentence end if it lies at least 35% into the window (+2 keeps the
+      // punctuation and the following space/newline in this slice) ...
+      if (lastSentence >= MD_MAX_CHUNK_CHARS * 0.35) {
+        end = pos + lastSentence + 2;
+      } else {
+        // ... otherwise fall back to the last newline beyond 28% of the window; if neither
+        // exists the slice is cut at the hard character limit.
+        const lastNl = window.lastIndexOf('\n');
+        if (lastNl > MD_MAX_CHUNK_CHARS * 0.28) end = pos + lastNl + 1;
+      }
+    }
+    const sliceStart = firstNonWsIndex(full, pos, end);
+    const part = full.slice(pos, end).trim();
+    // Slices shorter than the minimum are dropped.
+    if (part.length >= MD_MIN_CHUNK_LENGTH) {
+      out.push({ startLine: lineInFull(ctxLen, prefixLen, pieces, sliceStart), content: part });
+    }
+    if (end >= n) break;
+    // Step back by the overlap so adjacent slices share text, but always make forward progress.
+    const nextPos = end - MD_SPLIT_OVERLAP;
+    pos = nextPos > pos ? nextPos : end;
+  }
+  // Every slice was too short: keep a single, length-capped chunk rather than losing the text.
+  if (out.length === 0 && full.trim().length > 0) {
+    const idx = firstNonWsIndex(full, 0, n);
+    out.push({ startLine: lineInFull(ctxLen, prefixLen, pieces, idx), content: full.trim().slice(0, MD_MAX_CHUNK_CHARS) });
+  }
+  // If all slices map to the same source line, collapse them into one chunk capped at
+  // MD_MAX_CHUNK_CHARS.
+  // NOTE(review): this discards the text beyond the cap (e.g. a very long single-line paragraph
+  // or table); presumably done because stored rows are keyed by (file path, line number) —
+  // confirm. Slices that only partly share a start line are not collapsed.
+  if (out.length > 1) {
+    const lines = new Set(out.map((o) => o.startLine));
+    if (lines.size === 1) {
+      const idx = firstNonWsIndex(full, 0, n);
+      return [{ startLine: lineInFull(ctxLen, prefixLen, pieces, idx), content: full.trim().slice(0, MD_MAX_CHUNK_CHARS) }];
+    }
+  }
+  return out;
+}
+/**
+ * Assemble one logical chunk and append the resulting record(s) to `out`.
+ *
+ * The chunk text is `heading context + prefix + joined pieces`; it is passed through
+ * splitLongText, and every resulting slice of at least MD_MIN_CHUNK_LENGTH characters is
+ * pushed with the same chunk type, chapter, section and example id.
+ *
+ * @param filePath     Path recorded on every emitted record.
+ * @param pieces       Source lines (line number + cleaned text) forming the chunk body.
+ * @param prefix       Text placed between the heading context and the body (may be '').
+ * @param headingState Currently active headings; level 2 becomes `chapter`, levels 3-4 `section`.
+ * @param chunkType    Kind of chunk being emitted.
+ * @param exampleId    Example id for example chunks, otherwise null.
+ * @param out          Output array, mutated in place.
+ */
+function emitChunk(
+  filePath: string,
+  pieces: Array<{ line: number; text: string }>,
+  prefix: string,
+  headingState: HeadingState,
+  chunkType: ChunkType,
+  exampleId: string | null,
+  out: FileLine[]
+): void {
+  if (pieces.length === 0) return;
+  const body = bodyJoined(pieces);
+  // A body with no visible text must not be indexed: once the heading context is prepended it
+  // would pass the length filter and produce a "Context: ..." record with no content (this
+  // happens for an example anchor line whose spans are empty and which has no blockquote).
+  if (body.trim().length === 0) return;
+  const ctx = contextPrefix(headingState);
+  const full = ctx + prefix + body;
+  const splits = splitLongText(full, ctx.length, prefix.length, pieces);
+  const chapter = headingState.byLevel[2] ?? null;
+  const sectionParts: string[] = [];
+  for (let level = 3; level <= 4; level++) {
+    const t = headingState.byLevel[level];
+    if (t) sectionParts.push(t);
+  }
+  const section = sectionParts.length > 0 ? sectionParts.join(' / ') : null;
+  for (const s of splits) {
+    if (s.content.length >= MD_MIN_CHUNK_LENGTH) {
+      out.push({
+        filePath,
+        lineNumber: s.startLine,
+        content: s.content,
+        chunkType,
+        chapter,
+        section,
+        exampleId,
+      });
+    }
+  }
+}
+/**
+ * Book-style Markdown chunks: merged blockquotes (Lojban + glosses), whole HTML tables,
+ * blank-line paragraphs, consecutive list items as one block. Heading trail prepended for context.
+ *
+ * The text is scanned line by line; each line is dispatched, in this order, to: blank line,
+ * single-line HTML comment, heading, table, example anchor, span-only anchor, blockquote,
+ * and finally prose. Line numbers on the returned records are 1-based.
+ *
+ * @param text     Full Markdown source of one file.
+ * @param filePath Path recorded on every returned chunk.
+ * @returns The chunks in document order.
+ */
+export function extractMdChunks(text: string, filePath: string): FileLine[] {
+  const rawLines = text.split(/\r?\n/);
+  // Pair every raw line with its 1-based line number.
+  const lines: Array<{ num: number; raw: string }> = [];
+  let n = 0;
+  for (const raw of rawLines) {
+    n++;
+    lines.push({ num: n, raw });
+  }
+  const out: FileLine[] = [];
+  // Slots 0-6 so a heading level can be used directly as the index.
+  const headingState: HeadingState = { byLevel: Array<string | null>(7).fill(null) };
+  let i = 0;
+  while (i < lines.length) {
+    const { num, raw } = lines[i];
+    const trimmed = raw.trim();
+    // Blank lines and single-line HTML comments carry no content.
+    if (isBlank(trimmed)) {
+      i++;
+      continue;
+    }
+    if (isHtmlCommentLine(trimmed)) {
+      i++;
+      continue;
+    }
+    // Headings are not emitted themselves; they only update the context for later chunks.
+    if (isMarkdownHeading(trimmed)) {
+      const ph = parseHeading(trimmed);
+      if (ph) updateHeadings(headingState, ph.level, normalizeInlineMarkdown(ph.title));
+      i++;
+      continue;
+    }
+    // HTML table: collect lines up to and including the one containing </table>.
+    // NOTE(review): if no closing </table> is found, every remaining line of the file is
+    // absorbed into this table chunk.
+    if (trimmed.includes('<table')) {
+      const startLine = num;
+      const tableLines: string[] = [];
+      let j = i;
+      while (j < lines.length) {
+        tableLines.push(lines[j].raw);
+        if (lines[j].raw.includes('</table>')) {
+          j++;
+          break;
+        }
+        j++;
+      }
+      const flat = flattenTableLines(tableLines).trim();
+      i = j;
+      if (flat.length >= MD_MIN_CHUNK_LENGTH) {
+        emitChunk(filePath, [{ line: startLine, text: flat }], '', headingState, 'table', null, out);
+      }
+      continue;
+    }
+    // Example anchor: attach the blockquote lines that follow it directly (a blank line or any
+    // non-blockquote line ends the run) and prefix the chunk with the example id.
+    if (isExampleAnchorLine(trimmed)) {
+      const exId = extractFirstExId(trimmed);
+      const prefix = exId ? `Example ${exId}\n` : 'Example\n';
+      let j = i + 1;
+      const quotePieces: Array<{ line: number; text: string }> = [];
+      const anchorLine = num;
+      while (j < lines.length) {
+        const t = lines[j].raw.trim();
+        if (isBlank(t)) break;
+        if (isBlockquoteLine(t)) {
+          const cleaned = normalizeInlineMarkdown(stripBlockquotePrefix(lines[j].raw));
+          if (cleaned.length > 0) quotePieces.push({ line: lines[j].num, text: cleaned });
+          j++;
+          continue;
+        }
+        if (isExampleAnchorLine(t)) break;
+        break;
+      }
+      if (quotePieces.length > 0) {
+        emitChunk(filePath, quotePieces, prefix, headingState, 'example', exId, out);
+        i = j;
+        continue;
+      }
+      // No blockquote followed: fall back to the anchor line itself.
+      // NOTE(review): normalizeInlineMarkdown removes the span tags, so for empty spans the
+      // piece text passed here is ''.
+      i++;
+      if ((prefix + trimmed).length >= MD_MIN_CHUNK_LENGTH) {
+        emitChunk(
+          filePath,
+          [{ line: anchorLine, text: normalizeInlineMarkdown(trimmed) }],
+          '',
+          headingState,
+          'example',
+          exId,
+          out
+        );
+      }
+      continue;
+    }
+    // Span-only lines that are not example anchors (e.g. section anchors) are skipped.
+    if (isSpanOnlyLine(trimmed)) {
+      i++;
+      continue;
+    }
+    // Blockquote without an anchor: merge the consecutive quote lines into one 'quote' chunk.
+    if (isBlockquoteLine(trimmed)) {
+      const quotePieces: Array<{ line: number; text: string }> = [];
+      let j = i;
+      while (j < lines.length) {
+        const t = lines[j].raw.trim();
+        if (isBlockquoteLine(t)) {
+          const cleaned = normalizeInlineMarkdown(stripBlockquotePrefix(lines[j].raw));
+          if (cleaned.length > 0) quotePieces.push({ line: lines[j].num, text: cleaned });
+          j++;
+        } else break;
+      }
+      emitChunk(filePath, quotePieces, '', headingState, 'quote', null, out);
+      i = j;
+      continue;
+    }
+    // Prose / list paragraph: gather lines until a blank line or the start of another block
+    // kind; comment lines and span-only lines inside the paragraph are skipped.
+    const prosePieces: Array<{ line: number; text: string }> = [];
+    let j = i;
+    while (j < lines.length) {
+      const t = lines[j].raw.trim();
+      if (isBlank(t)) break;
+      if (isHtmlCommentLine(t)) {
+        j++;
+        continue;
+      }
+      if (isMarkdownHeading(t)) break;
+      if (t.includes('<table')) break;
+      if (isExampleAnchorLine(t)) break;
+      if (isBlockquoteLine(t)) break;
+      if (isSpanOnlyLine(t)) {
+        j++;
+        continue;
+      }
+      const cleaned = normalizeInlineMarkdown(lines[j].raw.trim());
+      if (cleaned.length > 0) prosePieces.push({ line: lines[j].num, text: cleaned });
+      j++;
+    }
+    if (prosePieces.length > 0) {
+      emitChunk(filePath, prosePieces, '', headingState, proseChunkTypeFromPieces(prosePieces), null, out);
+      i = j;
+      continue;
+    }
+    // The current line yielded no text after cleaning: advance by one line to guarantee progress.
+    i++;
+  }
+  return out;
+}

package/src/scanner.ts CHANGED Viewed

@@ -4,11 +4,16 @@ import path from 'path';
 import readline from 'readline';
 import { createHash } from 'crypto';
 import { normalizePath } from './path-util.js';
+import { extractMdChunks } from './markdown-chunks.js';
 export interface FileLine {
   filePath: string;
   lineNumber: number;
   content: string;
+  chunkType?: 'prose' | 'list' | 'quote' | 'example' | 'table';
+  chapter?: string | null;
+  section?: string | null;
+  exampleId?: string | null;
 }
 // File extensions to index
@@ -50,6 +55,16 @@ export async function* scanDirectory(dirPath: string): AsyncGenerator<FileLine>
         continue;
       }
+      const ext = path.extname(filePath).toLowerCase();
+      if (ext === '.md') {
+        const normalizedPath = normalizePath(filePath);
+        const text = readFileSync(filePath, 'utf8');
+        for (const chunk of extractMdChunks(text, normalizedPath)) {
+          yield chunk;
+        }
+        continue;
+      }
       fileStream = createReadStream(filePath);
       rl = readline.createInterface({
         input: fileStream,
@@ -136,12 +151,17 @@ export function readFileWithMetadata(filePath: string): FileContentWithMetadata
     const text = raw.toString('utf8');
     const lines: FileLine[] = [];
     const normalizedPath = normalizePath(filePath);
-    let lineNumber = 0;
-    for (const rawLine of text.split(/\r?\n/)) {
-      lineNumber++;
-      const trimmed = rawLine.trim();
-      if (trimmed.length >= MIN_LINE_LENGTH) {
-        lines.push({ filePath: normalizedPath, lineNumber, content: trimmed });
+    const ext = path.extname(filePath).toLowerCase();
+    if (ext === '.md') {
+      lines.push(...extractMdChunks(text, normalizedPath));
+    } else {
+      let lineNumber = 0;
+      for (const rawLine of text.split(/\r?\n/)) {
+        lineNumber++;
+        const trimmed = rawLine.trim();
+        if (trimmed.length >= MIN_LINE_LENGTH) {
+          lines.push({ filePath: normalizedPath, lineNumber, content: trimmed });
+        }
       }
     }
     return { filePath: normalizedPath, mtimeMs, contentHash, lines };

package/src/storage.ts CHANGED Viewed

@@ -17,6 +17,10 @@ export interface LineRecord {
   line_number: number;
   content: string;
   embedding: Float32Array;
+  chunk_type?: 'prose' | 'list' | 'quote' | 'example' | 'table' | null;
+  chapter?: string | null;
+  section?: string | null;
+  example_id?: string | null;
 }
 export interface SearchResult {
@@ -24,6 +28,10 @@ export interface SearchResult {
   line_number: number;
   content: string;
   score: number;
+  chunk_type?: 'prose' | 'list' | 'quote' | 'example' | 'table' | null;
+  chapter?: string | null;
+  section?: string | null;
+  example_id?: string | null;
 }
 type DB = Awaited<ReturnType<typeof createDatabase>>;
@@ -47,10 +55,28 @@ export class VectorStorage {
         file_path TEXT NOT NULL,
         line_number INTEGER NOT NULL,
         content TEXT NOT NULL,
+        chunk_type TEXT,
+        chapter TEXT,
+        section TEXT,
+        example_id TEXT,
         UNIQUE(file_path, line_number)
       );
       CREATE INDEX IF NOT EXISTS idx_file ON lines(file_path);
+      CREATE INDEX IF NOT EXISTS idx_example_id ON lines(example_id);
     `);
+    // Forward-only schema evolution for existing DBs.
+    try {
+      this.db.exec('ALTER TABLE lines ADD COLUMN chunk_type TEXT');
+    } catch {}
+    try {
+      this.db.exec('ALTER TABLE lines ADD COLUMN chapter TEXT');
+    } catch {}
+    try {
+      this.db.exec('ALTER TABLE lines ADD COLUMN section TEXT');
+    } catch {}
+    try {
+      this.db.exec('ALTER TABLE lines ADD COLUMN example_id TEXT');
+    } catch {}
     this.db.exec(`
       CREATE VIRTUAL TABLE IF NOT EXISTS vec_lines USING vec0(
         line_id INTEGER PRIMARY KEY,
@@ -69,13 +95,37 @@ export class VectorStorage {
   /**
    * Insert or update a line with its embedding
    */
-  async upsertLine(filePath: string, lineNumber: number, content: string, embedding: Float32Array): Promise<void> {
+  async upsertLine(
+    filePath: string,
+    lineNumber: number,
+    content: string,
+    embedding: Float32Array,
+    meta?: {
+      chunkType?: string | null;
+      chapter?: string | null;
+      section?: string | null;
+      exampleId?: string | null;
+    }
+  ): Promise<void> {
     const insertLine = await this.db.prepare(
-      `INSERT INTO lines (file_path, line_number, content)
-       VALUES (?, ?, ?)
-       ON CONFLICT(file_path, line_number) DO UPDATE SET content = excluded.content`
+      `INSERT INTO lines (file_path, line_number, content, chunk_type, chapter, section, example_id)
+       VALUES (?, ?, ?, ?, ?, ?, ?)
+       ON CONFLICT(file_path, line_number) DO UPDATE SET
+         content = excluded.content,
+         chunk_type = excluded.chunk_type,
+         chapter = excluded.chapter,
+         section = excluded.section,
+         example_id = excluded.example_id`
     );
-    insertLine.run([filePath, lineNumber, content]);
+    insertLine.run([
+      filePath,
+      lineNumber,
+      content,
+      meta?.chunkType ?? null,
+      meta?.chapter ?? null,
+      meta?.section ?? null,
+      meta?.exampleId ?? null,
+    ]);
     const sel = await this.db.prepare('SELECT id FROM lines WHERE file_path = ? AND line_number = ?');
     const row = sel.get([filePath, lineNumber]) as { id: number } | undefined;
@@ -90,14 +140,28 @@ export class VectorStorage {
    * Batch insert lines for efficiency (single transaction)
    */
   async upsertLinesBatch(
-    lines: Array<{ filePath: string; lineNumber: number; content: string; embedding: Float32Array }>
+    lines: Array<{
+      filePath: string;
+      lineNumber: number;
+      content: string;
+      embedding: Float32Array;
+      chunkType?: string | null;
+      chapter?: string | null;
+      section?: string | null;
+      exampleId?: string | null;
+    }>
   ): Promise<void> {
     if (lines.length === 0) return;
     const insertLine = await this.db.prepare(
-      `INSERT INTO lines (file_path, line_number, content)
-       VALUES (?, ?, ?)
-       ON CONFLICT(file_path, line_number) DO UPDATE SET content = excluded.content`
+      `INSERT INTO lines (file_path, line_number, content, chunk_type, chapter, section, example_id)
+       VALUES (?, ?, ?, ?, ?, ?, ?)
+       ON CONFLICT(file_path, line_number) DO UPDATE SET
+         content = excluded.content,
+         chunk_type = excluded.chunk_type,
+         chapter = excluded.chapter,
+         section = excluded.section,
+         example_id = excluded.example_id`
     );
     const selId = await this.db.prepare('SELECT id FROM lines WHERE file_path = ? AND line_number = ?');
     const deleteVec = await this.db.prepare('DELETE FROM vec_lines WHERE line_id = ?');
@@ -106,7 +170,15 @@ export class VectorStorage {
     this.db.exec('BEGIN');
     try {
       for (const item of lines) {
-        insertLine.run([item.filePath, item.lineNumber, item.content]);
+        insertLine.run([
+          item.filePath,
+          item.lineNumber,
+          item.content,
+          item.chunkType ?? null,
+          item.chapter ?? null,
+          item.section ?? null,
+          item.exampleId ?? null,
+        ]);
         const row = selId.get([item.filePath, item.lineNumber]) as { id: number };
         const idInt = BigInt(row.id);
         deleteVec.run([idInt]);
@@ -122,30 +194,67 @@ export class VectorStorage {
   /**
    * Search for similar lines using sqlite-vec cosine distance
    */
-  async search(queryEmbedding: Float32Array, limit: number = 10): Promise<SearchResult[]> {
+  async search(queryEmbedding: Float32Array, limit: number = 10, queryText: string = ''): Promise<SearchResult[]> {
+    const candidateLimit = Math.max(limit * 4, limit);
     const stmt = await this.db.prepare(`
       SELECT
         l.file_path,
         l.line_number,
         l.content,
+        l.chunk_type,
+        l.chapter,
+        l.section,
+        l.example_id,
         vec_distance_cosine(v.embedding, ?) AS distance
       FROM vec_lines v
       INNER JOIN lines l ON v.line_id = l.id
       ORDER BY distance
       LIMIT ?
     `);
-    const rows = stmt.all([queryEmbedding.buffer, limit]) as Array<{
+    const rows = stmt.all([queryEmbedding.buffer, candidateLimit]) as Array<{
       file_path: string;
       line_number: number;
       content: string;
+      chunk_type?: 'prose' | 'list' | 'quote' | 'example' | 'table' | null;
+      chapter?: string | null;
+      section?: string | null;
+      example_id?: string | null;
       distance: number;
     }>;
-    return rows.map((row) => ({
-      file_path: row.file_path,
-      line_number: row.line_number,
-      content: row.content,
-      score: 1 - row.distance,
-    }));
+    const queryLower = queryText.toLowerCase();
+    const queryExample = queryLower.match(/\bex-\d+(?:-\d+)+\b/)?.[0] ?? null;
+    const queryTokens = new Set((queryLower.match(/[a-z0-9]{3,}/g) ?? []).slice(0, 32));
+    const wantsExamples = /\b(example|translation|translate|gloss|lojban)\b/.test(queryLower);
+    const wantsTable = /\b(table|cmavo|selma'o|series)\b/.test(queryLower);
+    const scored = rows.map((row) => {
+      let score = 1 - row.distance;
+      if (queryExample && row.example_id && row.example_id.toLowerCase() === queryExample) {
+        score += 0.25;
+      }
+      if (wantsExamples && row.chunk_type === 'example') score += 0.09;
+      if (wantsTable && row.chunk_type === 'table') score += 0.07;
+      if (queryTokens.size > 0) {
+        const titleText = `${row.chapter ?? ''} ${row.section ?? ''}`.toLowerCase();
+        const titleTokens = new Set(titleText.match(/[a-z0-9]{3,}/g) ?? []);
+        let overlap = 0;
+        for (const t of queryTokens) if (titleTokens.has(t)) overlap++;
+        if (overlap > 0) score += Math.min(0.08, overlap * 0.02);
+      }
+      return {
+        file_path: row.file_path,
+        line_number: row.line_number,
+        content: row.content,
+        score: Math.round(score * 1000) / 1000,
+        chunk_type: row.chunk_type ?? null,
+        chapter: row.chapter ?? null,
+        section: row.section ?? null,
+        example_id: row.example_id ?? null,
+      };
+    });
+    scored.sort((a, b) => b.score - a.score);
+    return scored.slice(0, limit);
   }
   async getStats(): Promise<{ totalFiles: number; totalLines: number }> {