ralph-hero-knowledge-index 0.1.21 → 0.1.24

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. package/.claude-plugin/plugin.json +1 -1
  2. package/.mcp.json +1 -1
  3. package/README.md +109 -0
  4. package/dist/config.d.ts +32 -0
  5. package/dist/config.js +75 -0
  6. package/dist/config.js.map +1 -0
  7. package/dist/db.d.ts +7 -0
  8. package/dist/db.js +17 -0
  9. package/dist/db.js.map +1 -1
  10. package/dist/embedder.d.ts +27 -0
  11. package/dist/embedder.js +43 -4
  12. package/dist/embedder.js.map +1 -1
  13. package/dist/file-scanner.d.ts +13 -1
  14. package/dist/file-scanner.js +30 -3
  15. package/dist/file-scanner.js.map +1 -1
  16. package/dist/hybrid-search.d.ts +12 -0
  17. package/dist/hybrid-search.js +74 -5
  18. package/dist/hybrid-search.js.map +1 -1
  19. package/dist/ignore.d.ts +29 -0
  20. package/dist/ignore.js +65 -0
  21. package/dist/ignore.js.map +1 -0
  22. package/dist/index.d.ts +9 -1
  23. package/dist/index.js +166 -6
  24. package/dist/index.js.map +1 -1
  25. package/dist/llm-client.d.ts +41 -0
  26. package/dist/llm-client.js +98 -0
  27. package/dist/llm-client.js.map +1 -0
  28. package/dist/reindex.d.ts +22 -3
  29. package/dist/reindex.js +85 -13
  30. package/dist/reindex.js.map +1 -1
  31. package/dist/search.d.ts +12 -0
  32. package/dist/search.js +15 -1
  33. package/dist/search.js.map +1 -1
  34. package/dist/vector-search.d.ts +10 -0
  35. package/dist/vector-search.js +15 -0
  36. package/dist/vector-search.js.map +1 -1
  37. package/package.json +2 -1
  38. package/src/__tests__/config.test.ts +173 -0
  39. package/src/__tests__/embedder.test.ts +103 -4
  40. package/src/__tests__/file-scanner.test.ts +88 -0
  41. package/src/__tests__/hybrid-search.test.ts +107 -0
  42. package/src/__tests__/ignore.test.ts +86 -0
  43. package/src/__tests__/index.test.ts +450 -0
  44. package/src/__tests__/llm-client.test.ts +349 -0
  45. package/src/__tests__/memory-stats.test.ts +204 -0
  46. package/src/__tests__/reindex.test.ts +187 -11
  47. package/src/__tests__/search.test.ts +37 -0
  48. package/src/config.ts +105 -0
  49. package/src/db.ts +17 -0
  50. package/src/embedder.ts +61 -4
  51. package/src/file-scanner.ts +28 -3
  52. package/src/hybrid-search.ts +88 -5
  53. package/src/ignore.ts +82 -0
  54. package/src/index.ts +202 -7
  55. package/src/llm-client.ts +136 -0
  56. package/src/reindex.ts +115 -14
  57. package/src/search.ts +27 -1
  58. package/src/vector-search.ts +16 -0
package/src/__tests__/reindex.test.ts CHANGED
@@ -4,21 +4,41 @@ import { join, resolve } from "node:path";
 import { tmpdir } from "node:os";
 import { findMarkdownFiles } from "../file-scanner.js";
 import { FtsSearch } from "../search.js";
+import { VectorSearch } from "../vector-search.js";
+
+// Mock embedder so we don't load the real transformer model during tests.
+// embedDocument returns one DocumentChunk per call with a constant 384-dim
+// embedding; this matches the new chunk-aware reindex flow.
+vi.mock("../embedder.js", async () => {
+  // Import the real chunker so the mock chunks content the same way as prod.
+  const { chunkText } = await import("../chunker.js");
+  return {
+    embed: vi.fn(async () => new Float32Array(384)),
+    embedDocument: vi.fn(async (_title: string, _tags: string[], content: string) => {
+      const chunks = content.length === 0
+        ? [{ index: 0, content: "", charStart: 0, charEnd: 0 }]
+        : chunkText(content);
+      return chunks.map(c => ({
+        index: c.index,
+        content: c.content,
+        charStart: c.charStart,
+        charEnd: c.charEnd,
+        embedding: new Float32Array(384),
+      }));
+    }),
+    prepareTextForEmbedding: vi.fn((title: string, tags: string[], content: string) => {
+      const tagLine = tags.length > 0 ? tags.join(", ") : "";
+      const parts = [title, tagLine, content].filter(p => p.length > 0);
+      return parts.join("\n");
+    }),
+  };
+});
 
-vi.mock("../embedder.js", () => ({
-  embed: vi.fn(async () => new Float32Array(384)),
-  prepareTextForEmbedding: vi.fn((title: string, tags: string[], content: string) => {
-    const tagLine = tags.length > 0 ? tags.join(", ") : "";
-    const parts = [title, tagLine, content].filter(p => p.length > 0);
-    return parts.join("\n").slice(0, 500);
-  }),
-}));
-
-import { embed } from "../embedder.js";
+import { embedDocument } from "../embedder.js";
 import { reindex } from "../reindex.js";
 import { KnowledgeDB } from "../db.js";
 
-const mockedEmbed = vi.mocked(embed);
+const mockedEmbed = vi.mocked(embedDocument);
 
 function makeDoc(title: string): string {
   return `---\ndate: 2026-03-24\ntype: research\nstatus: draft\n---\n\n# ${title}\n\nContent for ${title}.`;
@@ -353,4 +373,160 @@ describe("incremental reindex", () => {
     expect(results.some(r => r.id === "fresh-doc")).toBe(true);
     db1.close();
   });
+
+  it("scenario 13: 8K-char document produces >= 4 chunk rows", async () => {
+    const longBody = "A".repeat(8000);
+    writeFileSync(
+      join(dir, "long-doc.md"),
+      `---\ndate: 2026-03-24\ntype: research\nstatus: draft\n---\n\n# Long Doc\n\n${longBody}`,
+    );
+
+    await reindex([dir], dbPath);
+
+    const db = new KnowledgeDB(dbPath);
+    const row = db.db
+      .prepare("SELECT COUNT(*) as n FROM chunks WHERE document_id = ?")
+      .get("long-doc") as { n: number };
+    expect(row.n).toBeGreaterThanOrEqual(4);
+    db.close();
+  });
+
+  it("scenario 14: documents_vec row count equals total chunk count", async () => {
+    writeFileSync(join(dir, "doc-a.md"), makeDoc("Doc A"));
+    writeFileSync(join(dir, "doc-b.md"), makeDoc("Doc B"));
+    const longBody = "A".repeat(6000);
+    writeFileSync(
+      join(dir, "long-doc.md"),
+      `---\ndate: 2026-03-24\ntype: research\nstatus: draft\n---\n\n# Long Doc\n\n${longBody}`,
+    );
+
+    await reindex([dir], dbPath);
+
+    const db = new KnowledgeDB(dbPath);
+    // Instantiating VectorSearch loads sqlite-vec so documents_vec is queryable.
+    new VectorSearch(db).createIndex();
+    const chunksRow = db.db.prepare("SELECT COUNT(*) as n FROM chunks").get() as {
+      n: number;
+    };
+    const vecRow = db.db
+      .prepare("SELECT COUNT(*) as n FROM documents_vec")
+      .get() as { n: number };
+    expect(vecRow.n).toBe(chunksRow.n);
+    expect(chunksRow.n).toBeGreaterThanOrEqual(3); // at least one per doc
+    db.close();
+  });
+
+  it("scenario 15: chunk ids follow pattern {docId}#c{index}", async () => {
+    const longBody = "A".repeat(6000);
+    writeFileSync(
+      join(dir, "long-doc.md"),
+      `---\ndate: 2026-03-24\ntype: research\nstatus: draft\n---\n\n# Long Doc\n\n${longBody}`,
+    );
+
+    await reindex([dir], dbPath);
+
+    const db = new KnowledgeDB(dbPath);
+    new VectorSearch(db).createIndex();
+    const rows = db.db
+      .prepare("SELECT id, chunk_index FROM chunks WHERE document_id = ? ORDER BY chunk_index")
+      .all("long-doc") as Array<{ id: string; chunk_index: number }>;
+    expect(rows.length).toBeGreaterThan(1);
+    const idPattern = /^long-doc#c\d+$/;
+    for (const r of rows) {
+      expect(r.id).toMatch(idPattern);
+      expect(r.id).toBe(`long-doc#c${r.chunk_index}`);
+    }
+    // Verify documents_vec ids also follow the pattern for this doc.
+    const vecRows = db.db
+      .prepare("SELECT id FROM documents_vec WHERE id GLOB ?")
+      .all("long-doc#c*") as Array<{ id: string }>;
+    expect(vecRows.length).toBe(rows.length);
+    for (const v of vecRows) {
+      expect(v.id).toMatch(idPattern);
+    }
+    db.close();
+  });
+
+  it("scenario 16: deleting source file removes its chunks and vec rows", async () => {
+    const filePath = join(dir, "disposable.md");
+    const longBody = "A".repeat(6000);
+    writeFileSync(
+      filePath,
+      `---\ndate: 2026-03-24\ntype: research\nstatus: draft\n---\n\n# Disposable\n\n${longBody}`,
+    );
+    writeFileSync(join(dir, "keeper.md"), makeDoc("Keeper"));
+
+    await reindex([dir], dbPath);
+
+    const db1 = new KnowledgeDB(dbPath);
+    new VectorSearch(db1).createIndex();
+    const chunksBefore = db1.db
+      .prepare("SELECT COUNT(*) as n FROM chunks WHERE document_id = ?")
+      .get("disposable") as { n: number };
+    expect(chunksBefore.n).toBeGreaterThan(1);
+    const vecsBefore = db1.db
+      .prepare("SELECT COUNT(*) as n FROM documents_vec WHERE id GLOB ?")
+      .get("disposable#c*") as { n: number };
+    expect(vecsBefore.n).toBe(chunksBefore.n);
+    db1.close();
+
+    unlinkSync(filePath);
+    await reindex([dir], dbPath);
+
+    const db2 = new KnowledgeDB(dbPath);
+    new VectorSearch(db2).createIndex();
+    // Document gone -> chunks cascaded.
+    expect(db2.getDocument("disposable")).toBeUndefined();
+    const chunksAfter = db2.db
+      .prepare("SELECT COUNT(*) as n FROM chunks WHERE document_id = ?")
+      .get("disposable") as { n: number };
+    expect(chunksAfter.n).toBe(0);
+    // Vec rows for the deleted doc are gone (GLOB-based cleanup).
+    const vecsAfter = db2.db
+      .prepare("SELECT COUNT(*) as n FROM documents_vec WHERE id GLOB ?")
+      .get("disposable#c*") as { n: number };
+    expect(vecsAfter.n).toBe(0);
+    // The kept doc still has its chunks.
+    const keeperChunks = db2.db
+      .prepare("SELECT COUNT(*) as n FROM chunks WHERE document_id = ?")
+      .get("keeper") as { n: number };
+    expect(keeperChunks.n).toBeGreaterThanOrEqual(1);
+    db2.close();
+  });
+
+  it("scenario 17: re-indexing same file does not duplicate chunks", async () => {
+    const filePath = join(dir, "stable.md");
+    const body = "A".repeat(6000);
+    writeFileSync(
+      filePath,
+      `---\ndate: 2026-03-24\ntype: research\nstatus: draft\n---\n\n# Stable\n\n${body}`,
+    );
+
+    await reindex([dir], dbPath);
+    const db1 = new KnowledgeDB(dbPath);
+    const firstCount = (db1.db
+      .prepare("SELECT COUNT(*) as n FROM chunks WHERE document_id = ?")
+      .get("stable") as { n: number }).n;
+    db1.close();
+    expect(firstCount).toBeGreaterThan(1);
+
+    // Bump mtime to force re-embed.
+    const future = Date.now() / 1000 + 2;
+    utimesSync(filePath, future, future);
+
+    await reindex([dir], dbPath);
+    const db2 = new KnowledgeDB(dbPath);
+    new VectorSearch(db2).createIndex();
+    const secondCount = (db2.db
+      .prepare("SELECT COUNT(*) as n FROM chunks WHERE document_id = ?")
+      .get("stable") as { n: number }).n;
+    // Stale deletion before insert means chunk count stays the same, not 2x.
+    expect(secondCount).toBe(firstCount);
+    // And vec rows should match.
+    const vecCount = (db2.db
+      .prepare("SELECT COUNT(*) as n FROM documents_vec WHERE id GLOB ?")
+      .get("stable#c*") as { n: number }).n;
+    expect(vecCount).toBe(secondCount);
+    db2.close();
+  });
 });
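
Note: the scenarios above lean on chunkText from the new chunker.ts, which is not part of this diff. A minimal sketch of the contract the tests appear to assume, with a hypothetical 2000-char window (chosen so the 8000-char body in scenario 13 yields the asserted >= 4 chunks; the shipped chunker's window size and overlap may differ):

// Hypothetical chunker contract; the window size is an assumption, not the shipped value.
export interface Chunk {
  index: number;      // 0-based position of the chunk within the document
  content: string;    // raw chunk text
  charStart: number;  // inclusive offset into the original content
  charEnd: number;    // exclusive offset into the original content
}

export interface ChunkerOptions {
  chunkSize?: number; // assumed default: 2000 chars
}

export function chunkText(text: string, opts: ChunkerOptions = {}): Chunk[] {
  const size = opts.chunkSize ?? 2000;
  const chunks: Chunk[] = [];
  for (let start = 0, i = 0; start < text.length; start += size, i++) {
    const end = Math.min(start + size, text.length);
    chunks.push({ index: i, content: text.slice(start, end), charStart: start, charEnd: end });
  }
  return chunks;
}

Under this contract the 6000-char fixtures split into three chunks, which is consistent with the "greater than 1" assertions in scenarios 14 through 17.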
package/src/__tests__/search.test.ts CHANGED
@@ -205,6 +205,43 @@ describe("FtsSearch", () => {
     });
   });
 
+  describe("memory_tier filter", () => {
+    it("filters by memory_tier when schema has the column", () => {
+      db.db
+        .prepare("UPDATE documents SET memory_tier = ? WHERE id = ?")
+        .run("reflection", "auth-doc");
+      fts.rebuildIndex();
+
+      // auth-doc is "reflection", so search for terms in auth-doc should hit.
+      const reflectionHits = fts.search("authentication", { memoryTier: "reflection" });
+      const ids = reflectionHits.map((r) => r.id);
+      expect(ids).toContain("auth-doc");
+
+      // A "doc" filter should omit the reflection-tagged doc.
+      const docHits = fts.search("authentication", { memoryTier: "doc" });
+      expect(docHits.some((r) => r.id === "auth-doc")).toBe(false);
+    });
+
+    it("ignores memory_tier silently when column is absent (v2 schema)", () => {
+      // beforeEach gives us a v2 schema — column does not exist.
+      const results = fts.search("cache", { memoryTier: "reflection" });
+      // Filter is a no-op on v2; regular FTS results come through.
+      expect(Array.isArray(results)).toBe(true);
+    });
+
+    it("returns all tiers when memoryTier='any'", () => {
+      db.db
+        .prepare("UPDATE documents SET memory_tier = ? WHERE id = ?")
+        .run("reflection", "auth-doc");
+      fts.rebuildIndex();
+
+      const authHits = fts.search("authentication", { memoryTier: "any" });
+      expect(authHits.some((r) => r.id === "auth-doc")).toBe(true);
+      const cacheHits = fts.search("cache", { memoryTier: "any" });
+      expect(cacheHits.some((r) => r.id === "cache-doc")).toBe(true);
+    });
+  });
+
   describe("ensureTable", () => {
     it("creates FTS table if it does not exist", () => {
       // Create a fresh DB without FTS table
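
FtsSearch.search itself is outside this hunk, so the tests only pin down behavior: filter when the column exists, silent no-op on v2 schemas, pass-through for "any". One plausible shape for that schema-aware clause, sketched against the same PRAGMA column probe db.ts uses below (helper name and table alias are illustrative, not the shipped code):

// Hypothetical helper; the real filter lives inside FtsSearch.search.
type MinimalDb = { prepare(sql: string): { all(): unknown[] } };

function memoryTierClause(db: MinimalDb, memoryTier?: string): string {
  if (!memoryTier || memoryTier === "any") return ""; // "any" disables the filter
  const cols = db.prepare("PRAGMA table_info(documents)").all() as Array<{ name: string }>;
  if (!cols.some((c) => c.name === "memory_tier")) return ""; // silent no-op on v2
  return " AND d.memory_tier = ?"; // caller binds the tier value
}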
package/src/config.ts ADDED
@@ -0,0 +1,105 @@
+import { readFileSync, existsSync } from "node:fs";
+import { join } from "node:path";
+import { homedir } from "node:os";
+
+/**
+ * Shape of the optional `~/.ralph/knowledge.config.json` file.
+ *
+ * All fields are optional. Unknown fields are preserved at parse time but are
+ * not surfaced through this interface — callers should treat the file as
+ * forward-compatible.
+ */
+export interface KnowledgeConfig {
+  /** Absolute or `~`-prefixed directories to index. */
+  roots?: string[];
+  /** Extra gitignore-syntax patterns layered on top of per-root `.ralphignore`. */
+  ignorePatterns?: string[];
+  /** Override for the SQLite database path. */
+  dbPath?: string;
+}
+
+/**
+ * Expand a leading `~` or `~/` segment in a path to the user's home directory.
+ * Paths that do not begin with `~` are returned unchanged.
+ */
+export function expandHome(p: string): string {
+  if (!p) return p;
+  if (p === "~") return homedir();
+  if (p.startsWith("~/") || p.startsWith("~\\")) {
+    return join(homedir(), p.slice(2));
+  }
+  return p;
+}
+
+/**
+ * Resolve the knowledge config file path. Precedence:
+ *   1. `process.env.RALPH_KNOWLEDGE_CONFIG`
+ *   2. `~/.ralph/knowledge.config.json`
+ */
+export function resolveConfigPath(): string {
+  const envPath = process.env.RALPH_KNOWLEDGE_CONFIG;
+  if (envPath && envPath.trim().length > 0) {
+    return expandHome(envPath);
+  }
+  return join(homedir(), ".ralph", "knowledge.config.json");
+}
+
+/**
+ * Load the optional `knowledge.config.json` file. Returns an empty object when
+ * the file is missing or malformed. Tilde-prefixed paths inside `roots` and
+ * `dbPath` are expanded eagerly so callers receive absolute paths.
+ */
+export function loadConfig(): KnowledgeConfig {
+  const configPath = resolveConfigPath();
+  if (!existsSync(configPath)) {
+    return {};
+  }
+
+  let raw: string;
+  try {
+    raw = readFileSync(configPath, "utf-8");
+  } catch (e) {
+    console.warn(
+      `Failed to read knowledge config at ${configPath}: ${(e as Error).message}`,
+    );
+    return {};
+  }
+
+  let parsed: unknown;
+  try {
+    parsed = JSON.parse(raw);
+  } catch (e) {
+    console.warn(
+      `Malformed JSON in knowledge config at ${configPath}: ${(e as Error).message}`,
+    );
+    return {};
+  }
+
+  if (!parsed || typeof parsed !== "object" || Array.isArray(parsed)) {
+    console.warn(
+      `Knowledge config at ${configPath} is not a JSON object; ignoring.`,
+    );
+    return {};
+  }
+
+  const obj = parsed as Record<string, unknown>;
+  const out: KnowledgeConfig = {};
+
+  if (Array.isArray(obj.roots)) {
+    out.roots = obj.roots
+      .filter((r): r is string => typeof r === "string" && r.length > 0)
+      .map(expandHome);
+  }
+
+  if (Array.isArray(obj.ignorePatterns)) {
+    out.ignorePatterns = obj.ignorePatterns.filter(
+      (p): p is string => typeof p === "string" && p.length > 0,
+    );
+  }
+
+  if (typeof obj.dbPath === "string" && obj.dbPath.length > 0) {
+    out.dbPath = expandHome(obj.dbPath);
+  }
+
+  return out;
+}
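
For reference, a config exercising all three fields might look like this (values illustrative), together with what loadConfig() would return for HOME=/home/me:

// ~/.ralph/knowledge.config.json (illustrative):
// {
//   "roots": ["~/notes", "/srv/wiki"],
//   "ignorePatterns": ["drafts/", "*.tmp.md"],
//   "dbPath": "~/.ralph/knowledge.db"
// }

import { loadConfig } from "./config.js";

const cfg = loadConfig();
// cfg.roots          -> ["/home/me/notes", "/srv/wiki"]   (~ expanded by expandHome)
// cfg.ignorePatterns -> ["drafts/", "*.tmp.md"]           (passed through verbatim)
// cfg.dbPath         -> "/home/me/.ralph/knowledge.db"
// Non-string array entries are filtered out; unknown top-level keys are not surfaced.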
package/src/db.ts CHANGED
@@ -468,6 +468,23 @@ export class KnowledgeDB {
     return row !== undefined;
   }
 
+  /**
+   * Returns the `memory_tier` for the given document id. Returns `undefined`
+   * when the document does not exist OR when the `memory_tier` column is
+   * absent from the schema (pre-v3 databases). Used by MCP `knowledge_*`
+   * tools that need to post-filter result sets by tier.
+   */
+  getMemoryTier(id: string): string | undefined {
+    const columns = this.db
+      .prepare("PRAGMA table_info(documents)")
+      .all() as Array<{ name: string }>;
+    if (!columns.some((c) => c.name === "memory_tier")) return undefined;
+    const row = this.db
+      .prepare("SELECT memory_tier AS memoryTier FROM documents WHERE id = ?")
+      .get(id) as { memoryTier: string } | undefined;
+    return row?.memoryTier;
+  }
+
   deleteDocument(id: string): void {
     this.db.prepare("DELETE FROM documents WHERE id = ?").run(id);
   }
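
A short usage sketch of the probe-then-select pattern, in the style of the MCP knowledge_* post-filtering the doc comment mentions (the hit shape is illustrative; only the id field matters):

import { KnowledgeDB } from "./db.js";

// Tier-aware post-filter: getMemoryTier returns undefined both for missing
// documents and for pre-v3 schemas without the column, so undefined folds
// into the default "doc" tier, mirroring HybridSearch's post-filter below.
function filterByTier(db: KnowledgeDB, hits: Array<{ id: string }>, tier: string) {
  if (tier === "any") return hits;
  return hits.filter((h) => (db.getMemoryTier(h.id) ?? "doc") === tier);
}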
package/src/embedder.ts CHANGED
@@ -2,9 +2,9 @@ import {
   pipeline,
   type FeatureExtractionPipeline,
 } from "@huggingface/transformers";
+import { chunkText, type Chunk, type ChunkerOptions } from "./chunker.js";
 
 const MODEL_ID = "Xenova/all-MiniLM-L6-v2";
-const MAX_CHARS = 500;
 
 let embedderInstance: FeatureExtractionPipeline | null = null;
 
@@ -21,14 +21,71 @@ export async function getEmbedder(): Promise<FeatureExtractionPipeline> {
 
 export async function embed(text: string): Promise<Float32Array> {
   const embedder = await getEmbedder();
-  const truncated = text.slice(0, MAX_CHARS);
-  const output = await embedder(truncated, {
+  // Pass text directly — the transformer's own 512-token window handles overflow.
+  const output = await embedder(text, {
     pooling: "mean",
     normalize: true,
   });
   return new Float32Array(output.data as ArrayLike<number>);
 }
 
+/**
+ * A chunk paired with the embedding of its (contextualized) content.
+ * Extends the base Chunk from the chunker module with an embedding vector
+ * and an optional contextPrefix (populated by Phase 6 — contextual retrieval).
+ */
+export interface DocumentChunk extends Chunk {
+  embedding: Float32Array;
+  contextPrefix?: string;
+}
+
+/**
+ * Embed a document by splitting it into chunks and emitting one embedding
+ * per chunk. The embedded text for each chunk is
+ * `${title}\n${tagLine}\n${chunk.content}` so the semantic anchors (title +
+ * tags) travel with every chunk embedding — matching the shape of the legacy
+ * `prepareTextForEmbedding()` but without the 500-char truncation.
+ *
+ * Short documents (<= chunkSize) produce exactly one chunk covering the whole
+ * content. Empty content yields a single chunk with empty content (so callers
+ * still get a title/tag-only embedding for stub documents).
+ */
+export async function embedDocument(
+  title: string,
+  tags: string[],
+  content: string,
+  opts?: ChunkerOptions,
+): Promise<DocumentChunk[]> {
+  const tagLine = tags.length > 0 ? tags.join(", ") : "";
+
+  // If content is empty, still emit one chunk so the document has a searchable
+  // embedding anchored on title + tags (preserves legacy behavior for
+  // frontmatter-only / stub documents).
+  const chunks: Chunk[] = content.length === 0
+    ? [{ index: 0, content: "", charStart: 0, charEnd: 0 }]
+    : chunkText(content, opts);
+
+  const out: DocumentChunk[] = [];
+  for (const chunk of chunks) {
+    const parts = [title, tagLine, chunk.content].filter(p => p.length > 0);
+    const embedText = parts.join("\n");
+    const embedding = await embed(embedText);
+    out.push({
+      index: chunk.index,
+      content: chunk.content,
+      charStart: chunk.charStart,
+      charEnd: chunk.charEnd,
+      embedding,
+    });
+  }
+  return out;
+}
+
+/**
+ * Back-compat shim: kept so callers outside the reindex path can still build
+ * a title/tags/first-paragraph string. No longer used by `embedDocument` (the
+ * per-chunk flow prepends title + tags directly).
+ */
 export function prepareTextForEmbedding(
   title: string,
   tags: string[],
@@ -39,5 +96,5 @@ export function prepareTextForEmbedding(
   const paragraphs = content.split(/\n\n+/);
   const firstParagraph = paragraphs.find(p => p.trim().length > 0)?.trim() ?? "";
   const parts = [title, tagLine, firstParagraph].filter(p => p.length > 0);
-  return parts.join("\n").slice(0, MAX_CHARS);
+  return parts.join("\n");
 }
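
Usage sketch for the new chunk-aware entry point (fixture values are illustrative; the resulting chunk count depends on chunker.ts defaults, which this diff does not show):

import { embedDocument } from "./embedder.js";

// A long body fans out into several DocumentChunks; each chunk's embedding is
// computed over "title\ntagLine\nchunk" so title and tags anchor every vector.
const body = "A".repeat(8000);
const chunks = await embedDocument("Auth Notes", ["auth", "security"], body); // ESM top-level await
for (const c of chunks) {
  console.log(c.index, c.charStart, c.charEnd, c.embedding.length); // 384 dims each
}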
package/src/file-scanner.ts CHANGED
@@ -1,14 +1,39 @@
 import { readdirSync } from "node:fs";
-import { join } from "node:path";
+import { join, relative } from "node:path";
+import type { IgnoreMatcher } from "./ignore.js";
 
-export function findMarkdownFiles(dir: string): string[] {
+/**
+ * Recursively find all `.md` files under `dir`.
+ *
+ * Directory names beginning with `.` or `_` and file names beginning with `_`
+ * are always skipped (fast-path). When an {@link IgnoreMatcher} is supplied,
+ * each remaining path is additionally tested against it via its root-relative
+ * form; matches are skipped.
+ *
+ * @param dir     root directory to walk
+ * @param matcher optional matcher built via `loadIgnoreForRoot(dir, …)`
+ */
+export function findMarkdownFiles(dir: string, matcher?: IgnoreMatcher): string[] {
   const results: string[] = [];
   function walk(d: string) {
     for (const entry of readdirSync(d, { withFileTypes: true })) {
       const fullPath = join(d, entry.name);
-      if (entry.isDirectory() && !entry.name.startsWith(".") && !entry.name.startsWith("_")) {
+      if (entry.isDirectory()) {
+        // Fast-path: hidden/underscored directories are always skipped.
+        if (entry.name.startsWith(".") || entry.name.startsWith("_")) continue;
+        if (matcher) {
+          // Test both bare and trailing-slash forms so gitignore-style
+          // directory-only patterns (e.g., `dist/`) match even when the
+          // directory itself has not yet been descended.
+          const rel = relative(dir, fullPath);
+          if (matcher.isIgnored(rel) || matcher.isIgnored(`${rel}/`)) continue;
+        }
         walk(fullPath);
       } else if (entry.isFile() && entry.name.endsWith(".md") && !entry.name.startsWith("_")) {
+        if (matcher) {
+          const rel = relative(dir, fullPath);
+          if (matcher.isIgnored(rel)) continue;
+        }
         results.push(fullPath);
       }
     }
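
Usage sketch wiring the scanner to a per-root matcher. The doc comment names loadIgnoreForRoot(dir, …) from the new ignore.ts; its remaining parameters are elided in this diff, so only the first argument is assumed here:

import { findMarkdownFiles } from "./file-scanner.js";
import { loadIgnoreForRoot } from "./ignore.js";

// Hidden/underscored entries are skipped unconditionally; the matcher then
// drops whatever .ralphignore (plus config ignorePatterns) rules out.
const root = "/home/me/notes";
const matcher = loadIgnoreForRoot(root); // further args elided in this diff
const files = findMarkdownFiles(root, matcher);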
package/src/hybrid-search.ts CHANGED
@@ -4,6 +4,16 @@ import type { VectorSearch } from "./vector-search.js";
 
 export type EmbedFn = (text: string) => Promise<Float32Array>;
 
+interface ChunkRow {
+  id: string;
+  document_id: string;
+  chunk_index: number;
+  char_start: number;
+  char_end: number;
+  context_prefix: string;
+  content: string;
+}
+
 export class HybridSearch {
   private static readonly RRF_K = 60;
 
@@ -14,23 +24,64 @@ export class HybridSearch {
     private readonly embedFn: EmbedFn,
   ) {}
 
+  /**
+   * Returns true when the `chunks` table exists (schema v3+). When absent we
+   * behave as if all vector ids are doc ids (pre-chunking behavior).
+   */
+  private chunksTableExists(): boolean {
+    const row = this.db.db
+      .prepare(
+        "SELECT name FROM sqlite_master WHERE type='table' AND name='chunks'",
+      )
+      .get();
+    return row !== undefined;
+  }
+
+  /**
+   * Given a vector-search id, return the `document_id` portion. Chunk ids
+   * follow the pattern `{doc_id}#c{index}` per Shared Constraint #6 of the
+   * GH-0761 plan. Legacy non-chunk ids pass through unchanged.
+   */
+  private docIdFromVecId(vecId: string): string {
+    const marker = vecId.lastIndexOf("#c");
+    if (marker === -1) return vecId;
+    const suffix = vecId.slice(marker + 2);
+    if (suffix.length === 0 || !/^\d+$/.test(suffix)) return vecId;
+    return vecId.slice(0, marker);
+  }
+
+  private fetchChunk(chunkId: string): ChunkRow | undefined {
+    if (!this.chunksTableExists()) return undefined;
+    return this.db.db
+      .prepare(
+        `SELECT id, document_id, chunk_index, char_start, char_end, context_prefix, content
+         FROM chunks WHERE id = ?`,
+      )
+      .get(chunkId) as ChunkRow | undefined;
+  }
+
   async search(
     query: string,
     options: SearchOptions = {},
   ): Promise<SearchResult[]> {
-    const { type, tags, includeSuperseded = false, limit = 20 } = options;
+    const { type, tags, includeSuperseded = false, limit = 20, memoryTier } = options;
 
-    // Run FTS and vector search
+    // Run FTS and vector search (FTS already applies memoryTier filter in SQL
+    // when the schema supports it).
     const ftsResults = this.fts.search(query, {
       includeSuperseded: true,
       limit: limit * 2,
+      memoryTier,
     });
 
     const queryEmbedding = await this.embedFn(query);
    const vecResults = this.vec.search(queryEmbedding, limit * 2);
 
-    // Build RRF score map
+    // Build RRF score map, keyed by document_id. When vec ids are chunk ids
+    // like `{doc}#c{n}`, we collapse to the parent doc for scoring but
+    // remember the best-scoring chunk id per doc for later meta enrichment.
     const scores = new Map<string, number>();
+    const bestChunkByDoc = new Map<string, { chunkId: string; rank: number }>();
 
     for (let i = 0; i < ftsResults.length; i++) {
       const id = ftsResults[i].id;
@@ -39,9 +90,16 @@ export class HybridSearch {
     }
 
     for (let i = 0; i < vecResults.length; i++) {
-      const id = vecResults[i].id;
+      const vecId = vecResults[i].id;
+      const docId = this.docIdFromVecId(vecId);
       const rrfScore = 1 / (HybridSearch.RRF_K + i + 1);
-      scores.set(id, (scores.get(id) ?? 0) + rrfScore);
+      scores.set(docId, (scores.get(docId) ?? 0) + rrfScore);
+      if (vecId !== docId) {
+        const existing = bestChunkByDoc.get(docId);
+        if (!existing || i < existing.rank) {
+          bestChunkByDoc.set(docId, { chunkId: vecId, rank: i });
+        }
+      }
     }
 
     // Build a lookup of FTS results by id for quick access
@@ -98,6 +156,31 @@ export class HybridSearch {
       });
     }
 
+    // Post-filter: memory_tier for vector-only hits that bypassed the FTS
+    // SQL filter. Also covers the case where the FTS stage returned 0 rows
+    // but vec returned chunks from a doc in another tier.
+    if (memoryTier && memoryTier !== "any") {
+      filtered = filtered.filter((r) => {
+        const tier = this.db.getMemoryTier(r.id);
+        // When column absent (v2 schema) treat as "doc"
+        return (tier ?? "doc") === memoryTier;
+      });
+    }
+
+    // Enrich with chunk meta when chunk data is available (best-scoring
+    // chunk per doc).
+    for (const r of filtered) {
+      const best = bestChunkByDoc.get(r.id);
+      if (!best) continue;
+      const chunk = this.fetchChunk(best.chunkId);
+      if (!chunk) continue;
+      r.bestChunkId = chunk.id;
+      r.chunkIndex = chunk.chunk_index;
+      r.charStart = chunk.char_start;
+      r.charEnd = chunk.char_end;
+      r.contextPrefix = chunk.context_prefix;
+    }
+
     return filtered.slice(0, limit);
   }
 }
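
Worked example of the fusion above, with RRF_K = 60 and 0-based ranks (rank i contributes 1 / (60 + i + 1)); chunk ids collapse to their parent document exactly as docIdFromVecId does:

// Standalone RRF sketch mirroring HybridSearch.search's scoring (example data).
const K = 60;
const ftsRanked = ["auth-doc", "cache-doc"];       // FTS order, best first
const vecRanked = ["auth-doc#c0", "cache-doc#c2"]; // vector order, chunk ids

const scores = new Map<string, number>();
ftsRanked.forEach((id, i) => scores.set(id, (scores.get(id) ?? 0) + 1 / (K + i + 1)));
vecRanked.forEach((vecId, i) => {
  const docId = vecId.replace(/#c\d+$/, ""); // same collapse as docIdFromVecId
  scores.set(docId, (scores.get(docId) ?? 0) + 1 / (K + i + 1));
});
// auth-doc:  1/61 + 1/61 ≈ 0.0328
// cache-doc: 1/62 + 1/62 ≈ 0.0323  -> auth-doc wins the fused ranking, and
// bestChunkByDoc would remember "auth-doc#c0" for chunk-meta enrichment.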