ralph-hero-knowledge-index 0.1.21 → 0.1.24

Files changed (58)
  1. package/.claude-plugin/plugin.json +1 -1
  2. package/.mcp.json +1 -1
  3. package/README.md +109 -0
  4. package/dist/config.d.ts +32 -0
  5. package/dist/config.js +75 -0
  6. package/dist/config.js.map +1 -0
  7. package/dist/db.d.ts +7 -0
  8. package/dist/db.js +17 -0
  9. package/dist/db.js.map +1 -1
  10. package/dist/embedder.d.ts +27 -0
  11. package/dist/embedder.js +43 -4
  12. package/dist/embedder.js.map +1 -1
  13. package/dist/file-scanner.d.ts +13 -1
  14. package/dist/file-scanner.js +30 -3
  15. package/dist/file-scanner.js.map +1 -1
  16. package/dist/hybrid-search.d.ts +12 -0
  17. package/dist/hybrid-search.js +74 -5
  18. package/dist/hybrid-search.js.map +1 -1
  19. package/dist/ignore.d.ts +29 -0
  20. package/dist/ignore.js +65 -0
  21. package/dist/ignore.js.map +1 -0
  22. package/dist/index.d.ts +9 -1
  23. package/dist/index.js +166 -6
  24. package/dist/index.js.map +1 -1
  25. package/dist/llm-client.d.ts +41 -0
  26. package/dist/llm-client.js +98 -0
  27. package/dist/llm-client.js.map +1 -0
  28. package/dist/reindex.d.ts +22 -3
  29. package/dist/reindex.js +85 -13
  30. package/dist/reindex.js.map +1 -1
  31. package/dist/search.d.ts +12 -0
  32. package/dist/search.js +15 -1
  33. package/dist/search.js.map +1 -1
  34. package/dist/vector-search.d.ts +10 -0
  35. package/dist/vector-search.js +15 -0
  36. package/dist/vector-search.js.map +1 -1
  37. package/package.json +2 -1
  38. package/src/__tests__/config.test.ts +173 -0
  39. package/src/__tests__/embedder.test.ts +103 -4
  40. package/src/__tests__/file-scanner.test.ts +88 -0
  41. package/src/__tests__/hybrid-search.test.ts +107 -0
  42. package/src/__tests__/ignore.test.ts +86 -0
  43. package/src/__tests__/index.test.ts +450 -0
  44. package/src/__tests__/llm-client.test.ts +349 -0
  45. package/src/__tests__/memory-stats.test.ts +204 -0
  46. package/src/__tests__/reindex.test.ts +187 -11
  47. package/src/__tests__/search.test.ts +37 -0
  48. package/src/config.ts +105 -0
  49. package/src/db.ts +17 -0
  50. package/src/embedder.ts +61 -4
  51. package/src/file-scanner.ts +28 -3
  52. package/src/hybrid-search.ts +88 -5
  53. package/src/ignore.ts +82 -0
  54. package/src/index.ts +202 -7
  55. package/src/llm-client.ts +136 -0
  56. package/src/reindex.ts +115 -14
  57. package/src/search.ts +27 -1
  58. package/src/vector-search.ts +16 -0
package/src/ignore.ts ADDED
@@ -0,0 +1,82 @@
+ import { readFileSync, existsSync } from "node:fs";
+ import { join } from "node:path";
+ import ignorePkg, { type Ignore } from "ignore";
+
+ // The `ignore` CJS module exposes the factory via `module.exports = factory`
+ // with `factory.default = factory` attached. Under `NodeNext` + ESM, depending
+ // on the interop mode, the default import can resolve to either the factory
+ // itself or the whole namespace. Probe and pick the callable form.
+ const ignore: (options?: { ignorecase?: boolean }) => Ignore = (
+   typeof (ignorePkg as unknown) === "function"
+     ? (ignorePkg as unknown as (options?: { ignorecase?: boolean }) => Ignore)
+     : ((ignorePkg as unknown as { default: (options?: { ignorecase?: boolean }) => Ignore }).default)
+ );
+
+ /**
+  * Default ignore patterns applied to every root even when no `.ralphignore`
+  * file or caller-supplied globals are provided. These target directories and
+  * files that should virtually never be indexed.
+  */
+ export const DEFAULT_IGNORE_PATTERNS: string[] = [
+   ".claude/",
+   "node_modules/",
+   "dist/",
+   ".git/",
+   "*.log",
+ ];
+
+ /**
+  * Opaque matcher returned by {@link loadIgnoreForRoot}. Given a path relative
+  * to the root used to construct the matcher, {@link isIgnored} reports whether
+  * the path should be skipped by the scanner.
+  */
+ export interface IgnoreMatcher {
+   isIgnored(relativePath: string): boolean;
+ }
+
+ /**
+  * Build an {@link IgnoreMatcher} for a given root directory. The matcher
+  * combines (in order):
+  *   1. {@link DEFAULT_IGNORE_PATTERNS} — always applied.
+  *   2. `globalPatterns` — caller-supplied patterns (typically from
+  *      `knowledge.config.json`'s `ignorePatterns`).
+  *   3. Contents of `<rootDir>/.ralphignore`, if present.
+  *
+  * All patterns follow gitignore syntax via the `ignore` package.
+  *
+  * @param rootDir absolute path of the root being scanned
+  * @param globalPatterns optional extra patterns applied before the per-root
+  *   `.ralphignore` file
+  */
+ export function loadIgnoreForRoot(
+   rootDir: string,
+   globalPatterns?: string[],
+ ): IgnoreMatcher {
+   const ign: Ignore = ignore();
+   ign.add(DEFAULT_IGNORE_PATTERNS);
+   if (globalPatterns && globalPatterns.length > 0) {
+     ign.add(globalPatterns);
+   }
+
+   const ralphIgnorePath = join(rootDir, ".ralphignore");
+   if (existsSync(ralphIgnorePath)) {
+     try {
+       const contents = readFileSync(ralphIgnorePath, "utf-8");
+       ign.add(contents);
+     } catch (e) {
+       console.warn(
+         `Failed to read .ralphignore at ${ralphIgnorePath}: ${(e as Error).message}`,
+       );
+     }
+   }
+
+   return {
+     isIgnored(relativePath: string): boolean {
+       if (!relativePath) return false;
+       // `ignore` package requires forward-slash paths with no leading slash.
+       const normalized = relativePath.replace(/\\/g, "/").replace(/^\/+/, "");
+       if (!normalized) return false;
+       return ign.ignores(normalized);
+     },
+   };
+ }
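
For orientation, a minimal usage sketch of the matcher this file defines; the root path, extra pattern, and candidate paths are invented for illustration:

import { loadIgnoreForRoot } from "./ignore.js";

// Defaults + caller-supplied patterns + any .ralphignore under the root.
const matcher = loadIgnoreForRoot("/home/me/notes", ["drafts/"]);

matcher.isIgnored("node_modules/pkg/index.js"); // true (DEFAULT_IGNORE_PATTERNS)
matcher.isIgnored("drafts/idea.md");            // true (caller-supplied pattern)
matcher.isIgnored("plans\\phase-8.md");         // false (backslashes normalized first)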
package/src/index.ts CHANGED
@@ -22,12 +22,60 @@ function resolveEnv(name: string): string | undefined {
    return val;
  }
 
- export function createServer(dbPath: string) {
+ /**
+  * True when the `chunks` table exists in the schema (v3+). When absent,
+  * `knowledge_memory_stats` reports 0 chunks-per-doc percentiles.
+  */
+ function chunksTableExists(db: KnowledgeDB): boolean {
+   const row = db.db
+     .prepare(
+       "SELECT name FROM sqlite_master WHERE type='table' AND name='chunks'",
+     )
+     .get();
+   return row !== undefined;
+ }
+
+ /**
+  * True when the `documents.memory_tier` column exists (v3+). Used to decide
+  * whether tier-level stats can be produced from the schema at all.
+  */
+ function memoryTierColumnExists(db: KnowledgeDB): boolean {
+   const rows = db.db
+     .prepare("PRAGMA table_info(documents)")
+     .all() as Array<{ name: string }>;
+   return rows.some((r) => r.name === "memory_tier");
+ }
+
+ /**
+  * Percentile helper using nearest-rank. For n sorted values returns the value
+  * at index `floor(n * p)` clamped to [0, n-1]. Returns 0 on empty input.
+  * Matches the spec in Phase 8 Task 8.4: "pick index at floor(n*0.5)".
+  */
+ function percentile(sortedValues: number[], p: number): number {
+   if (sortedValues.length === 0) return 0;
+   const idx = Math.min(
+     sortedValues.length - 1,
+     Math.max(0, Math.floor(sortedValues.length * p)),
+   );
+   return sortedValues[idx];
+ }
+
+ /**
+  * Options for `createServer`. When `embedFn` is provided it replaces the
+  * production `embed` import, allowing tests to bypass the HuggingFace model
+  * download.
+  */
+ export interface CreateServerOptions {
+   embedFn?: (text: string) => Promise<Float32Array>;
+ }
+
+ export function createServer(dbPath: string, opts: CreateServerOptions = {}) {
    const server = new McpServer({ name: "ralph-hero-knowledge", version: "0.1.0" });
    const db = new KnowledgeDB(dbPath);
    const fts = new FtsSearch(db);
    const vec = new VectorSearch(db);
-   const hybrid = new HybridSearch(db, fts, vec, embed);
+   const embedImpl = opts.embedFn ?? embed;
+   const hybrid = new HybridSearch(db, fts, vec, embedImpl);
    const traverser = new Traverser(db);
 
    server.tool(
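
The `embedFn` seam added above exists so tests can construct the server without downloading the embedding model. A test-style sketch; the ":memory:" path and the 384-dimension zero vector are assumptions for illustration, not taken from this diff:

import { createServer } from "./index.js";

// Deterministic stub embedder; the dimension must match what the DB schema expects.
const server = createServer(":memory:", {
  embedFn: async (_text) => new Float32Array(384),
});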
@@ -40,6 +88,16 @@ export function createServer(dbPath: string) {
        limit: z.number().optional().describe("Max results (default: 10)"),
        includeSuperseded: z.boolean().optional().describe("Include superseded documents (default: false)"),
        brief: z.boolean().optional().describe("Return minimal metadata only (default: false)"),
+       memory_tier: z
+         .enum(["doc", "raw", "reflection", "any"])
+         .optional()
+         .default("any")
+         .describe("Filter by memory tier: 'doc' (curated), 'raw' (dream-loop ingest), 'reflection' (synthesized), 'any' (default)"),
+       return_chunk_meta: z
+         .boolean()
+         .optional()
+         .default(false)
+         .describe("Include chunk_index/char_start/char_end/context_prefix in each hit when chunk data is available"),
      },
      async (args) => {
        try {
@@ -48,18 +106,33 @@ export function createServer(dbPath: string) {
            type: args.type,
            limit: args.limit ?? 10,
            includeSuperseded: args.includeSuperseded,
+           memoryTier: args.memory_tier,
          });
-         const enriched = results.map(r => {
-           const base = { ...r, tags: db.getTags(r.id) };
+         const enriched = results.map((r) => {
+           // Start with the camelCase SearchResult shape so existing callers
+           // keep working, then optionally add snake_case aliases for new
+           // chunk fields and strip them when callers didn't opt in.
+           const { chunkIndex, charStart, charEnd, contextPrefix, bestChunkId, ...rest } = r;
+           const base: Record<string, unknown> = { ...rest, tags: db.getTags(r.id) };
+           if (args.return_chunk_meta) {
+             if (chunkIndex !== undefined) base.chunk_index = chunkIndex;
+             if (charStart !== undefined) base.char_start = charStart;
+             if (charEnd !== undefined) base.char_end = charEnd;
+             if (contextPrefix !== undefined) base.context_prefix = contextPrefix;
+             if (bestChunkId !== undefined) base.best_chunk_id = bestChunkId;
+           }
            // SearchResult does not carry githubIssue — fetch from documents table
            const doc = db.getDocument(r.id);
            if (doc?.githubIssue) {
              const outcomes = db.getOutcomeSummary(doc.githubIssue);
-             if (outcomes) return { ...base, outcomes_summary: outcomes };
+             if (outcomes) base.outcomes_summary = outcomes;
            }
            return base;
          });
-         const formatted = formatSearchResults(enriched, args.brief ?? false);
+         const formatted = formatSearchResults(
+           enriched as unknown as Parameters<typeof formatSearchResults>[0],
+           args.brief ?? false,
+         );
          return { content: [{ type: "text" as const, text: JSON.stringify(formatted, null, 2) }] };
        } catch (e) {
          return { content: [{ type: "text" as const, text: `Error: ${(e as Error).message}` }], isError: true };
@@ -76,13 +149,26 @@ export function createServer(dbPath: string) {
        depth: z.number().optional().describe("Max traversal depth (default: 3)"),
        direction: z.enum(["outgoing", "incoming"]).optional().describe("Edge direction (default: outgoing)"),
        brief: z.boolean().optional().describe("Return minimal metadata only (default: false)"),
+       memory_tier: z
+         .enum(["doc", "raw", "reflection", "any"])
+         .optional()
+         .default("any")
+         .describe("Filter traversed nodes by memory tier (default: 'any')"),
      },
      async (args) => {
        try {
          const opts = { type: args.type, depth: args.depth ?? 3 };
-         const results = args.direction === "incoming"
+         let results = args.direction === "incoming"
            ? traverser.traverseIncoming(args.from, opts)
            : traverser.traverse(args.from, opts);
+         if (args.memory_tier && args.memory_tier !== "any") {
+           const wantedTier = args.memory_tier;
+           results = results.filter((r) => {
+             const tier = db.getMemoryTier(r.targetId);
+             // When memory_tier column is absent (pre-v3 DB) treat as "doc"
+             return (tier ?? "doc") === wantedTier;
+           });
+         }
          const formatted = formatTraverseResults(results, (id) => db.getTags(id), args.brief ?? false);
          return { content: [{ type: "text" as const, text: JSON.stringify(formatted, null, 2) }] };
        } catch (e) {
@@ -91,6 +177,115 @@ export function createServer(dbPath: string) {
    },
  );
 
+   server.tool(
+     "knowledge_memory_stats",
+     "Return counts of documents by memory_tier plus chunk percentiles and last-reflection timestamp. Used by the dream-loop to confirm ingest/reflection completion.",
+     {
+       since: z
+         .string()
+         .optional()
+         .describe("ISO timestamp — counts for 'new_since' are computed against this. Defaults to 24 hours ago."),
+     },
+     async (args) => {
+       try {
+         const since = args.since ?? new Date(Date.now() - 24 * 3600 * 1000).toISOString();
+         const hasTier = memoryTierColumnExists(db);
+         const hasChunks = chunksTableExists(db);
+
+         const totalRow = db.db
+           .prepare("SELECT COUNT(*) AS c FROM documents")
+           .get() as { c: number };
+         const totalDocuments = totalRow.c;
+
+         const byTier: Record<"doc" | "raw" | "reflection", number> = {
+           doc: 0,
+           raw: 0,
+           reflection: 0,
+         };
+         const newSince: Record<"doc" | "raw" | "reflection", number> = {
+           doc: 0,
+           raw: 0,
+           reflection: 0,
+         };
+
+         if (hasTier) {
+           const rows = db.db
+             .prepare(
+               `SELECT memory_tier AS tier, COUNT(*) AS c
+                FROM documents GROUP BY memory_tier`,
+             )
+             .all() as Array<{ tier: string; c: number }>;
+           for (const r of rows) {
+             if (r.tier === "doc" || r.tier === "raw" || r.tier === "reflection") {
+               byTier[r.tier] = r.c;
+             }
+           }
+           const newRows = db.db
+             .prepare(
+               `SELECT memory_tier AS tier, COUNT(*) AS c
+                FROM documents
+                WHERE date IS NOT NULL AND date >= @since
+                GROUP BY memory_tier`,
+             )
+             .all({ since }) as Array<{ tier: string; c: number }>;
+           for (const r of newRows) {
+             if (r.tier === "doc" || r.tier === "raw" || r.tier === "reflection") {
+               newSince[r.tier] = r.c;
+             }
+           }
+         } else {
+           // v2 schema — everything treated as "doc"
+           byTier.doc = totalDocuments;
+           const newDocRow = db.db
+             .prepare(
+               "SELECT COUNT(*) AS c FROM documents WHERE date IS NOT NULL AND date >= ?",
+             )
+             .get(since) as { c: number };
+           newSince.doc = newDocRow.c;
+         }
+
+         let chunksPerDocP50 = 0;
+         let chunksPerDocP90 = 0;
+         if (hasChunks) {
+           const perDoc = db.db
+             .prepare(
+               `SELECT COUNT(*) AS c FROM chunks GROUP BY document_id`,
+             )
+             .all() as Array<{ c: number }>;
+           const counts = perDoc.map((r) => r.c).sort((a, b) => a - b);
+           chunksPerDocP50 = percentile(counts, 0.5);
+           chunksPerDocP90 = percentile(counts, 0.9);
+         }
+
+         let lastReflectionAt: string | null = null;
+         if (hasTier) {
+           const row = db.db
+             .prepare(
+               `SELECT date FROM documents
+                WHERE memory_tier = 'reflection' AND date IS NOT NULL
+                ORDER BY date DESC LIMIT 1`,
+             )
+             .get() as { date: string } | undefined;
+           lastReflectionAt = row?.date ?? null;
+         }
+
+         const payload = {
+           total_documents: totalDocuments,
+           by_tier: byTier,
+           new_since: newSince,
+           chunks_per_doc_p50: chunksPerDocP50,
+           chunks_per_doc_p90: chunksPerDocP90,
+           last_reflection_at: lastReflectionAt,
+           since,
+         };
+
+         return { content: [{ type: "text" as const, text: JSON.stringify(payload, null, 2) }] };
+       } catch (e) {
+         return { content: [{ type: "text" as const, text: `Error: ${(e as Error).message}` }], isError: true };
+       }
+     },
+   );
+
    server.tool(
      "knowledge_record_outcome",
      "Record a pipeline outcome event (research, plan, phase, validation, etc.)",
package/src/llm-client.ts ADDED
@@ -0,0 +1,136 @@
+ /**
+  * Minimal OpenAI-compatible LLM client for Contextual Retrieval.
+  *
+  * Probes `${baseUrl}/v1/models` for availability and calls
+  * `${baseUrl}/v1/chat/completions` to generate short context prefixes for
+  * document chunks. Uses native `fetch` + `AbortController` — no SDK dependency.
+  *
+  * Fail-open semantics: network errors, timeouts, non-200 responses, and
+  * malformed JSON all resolve without throwing. `available()` returns `false`;
+  * `contextualize()` returns an empty string. The caller is expected to treat
+  * an empty context prefix as "no context available" and continue.
+  *
+  * Defaults target `gemma-lab` at `http://localhost:8000` with the Gemma 4 26B
+  * MXFP8 model. Override via `RALPH_LLM_URL` / `RALPH_LLM_MODEL` env vars or
+  * explicit options.
+  */
+
+ export interface LlmClientOptions {
+   /** Base URL for the OpenAI-compatible endpoint. Default: RALPH_LLM_URL env or http://localhost:8000. */
+   baseUrl?: string;
+   /** Model identifier sent in chat completion requests. Default: RALPH_LLM_MODEL env or mlx-community/gemma-4-26b-a4b-it-mxfp8. */
+   model?: string;
+   /** Timeout for contextualize() in milliseconds. Default: 30000. */
+   timeoutMs?: number;
+ }
+
+ export interface LlmClient {
+   /**
+    * Probe the endpoint for availability.
+    * Returns `true` iff `${baseUrl}/v1/models` responds with HTTP 200 within 2000ms.
+    * Returns `false` on timeout, connection refused, non-200, or any thrown exception.
+    */
+   available(): Promise<boolean>;
+
+   /**
+    * Generate a short (≤100 token) context prefix situating `chunkContent`
+    * within `fullDocument`, using the Anthropic Contextual Retrieval prompt.
+    *
+    * Returns the trimmed content string on success, or `""` on any error
+    * (network failure, timeout, non-2xx response, missing choices, malformed JSON).
+    */
+   contextualize(fullDocument: string, chunkContent: string): Promise<string>;
+ }
+
+ const DEFAULT_BASE_URL = "http://localhost:8000";
+ const DEFAULT_MODEL = "mlx-community/gemma-4-26b-a4b-it-mxfp8";
+ const DEFAULT_TIMEOUT_MS = 30000;
+ const AVAILABLE_PROBE_TIMEOUT_MS = 2000;
+ const MAX_CONTEXT_TOKENS = 120;
+
+ /**
+  * Anthropic Contextual Retrieval prompt, verbatim from the parent plan Phase 2.
+  * Placeholders `{fullDocument}` and `{chunkContent}` are filled at call time.
+  */
+ function buildContextualizePrompt(fullDocument: string, chunkContent: string): string {
+   return `<document>
+ ${fullDocument}
+ </document>
+
+ Here is the chunk we want to situate within the whole document:
+
+ <chunk>
+ ${chunkContent}
+ </chunk>
+
+ Please give a short succinct context to situate this chunk within the overall
+ document for the purposes of improving search retrieval of the chunk. Answer only
+ with the succinct context and nothing else.`;
+ }
+
+ interface ChatCompletionResponse {
+   choices?: Array<{
+     message?: {
+       content?: string;
+     };
+   }>;
+ }
+
+ export function createLlmClient(opts: LlmClientOptions = {}): LlmClient {
+   const baseUrl = opts.baseUrl ?? process.env.RALPH_LLM_URL ?? DEFAULT_BASE_URL;
+   const model = opts.model ?? process.env.RALPH_LLM_MODEL ?? DEFAULT_MODEL;
+   const timeoutMs = opts.timeoutMs ?? DEFAULT_TIMEOUT_MS;
+
+   async function available(): Promise<boolean> {
+     const controller = new AbortController();
+     const timer = setTimeout(() => controller.abort(), AVAILABLE_PROBE_TIMEOUT_MS);
+     try {
+       const response = await fetch(`${baseUrl}/v1/models`, {
+         method: "GET",
+         signal: controller.signal,
+       });
+       return response.status === 200;
+     } catch {
+       return false;
+     } finally {
+       clearTimeout(timer);
+     }
+   }
+
+   async function contextualize(fullDocument: string, chunkContent: string): Promise<string> {
+     const controller = new AbortController();
+     const timer = setTimeout(() => controller.abort(), timeoutMs);
+     try {
+       const prompt = buildContextualizePrompt(fullDocument, chunkContent);
+       const response = await fetch(`${baseUrl}/v1/chat/completions`, {
+         method: "POST",
+         headers: {
+           "Content-Type": "application/json",
+         },
+         body: JSON.stringify({
+           model,
+           messages: [{ role: "user", content: prompt }],
+           max_tokens: MAX_CONTEXT_TOKENS,
+         }),
+         signal: controller.signal,
+       });
+
+       if (!response.ok) {
+         return "";
+       }
+
+       const data = (await response.json()) as ChatCompletionResponse;
+       const content = data?.choices?.[0]?.message?.content;
+       if (typeof content !== "string") {
+         return "";
+       }
+       return content.trim();
+     } catch {
+       return "";
+     } finally {
+       clearTimeout(timer);
+     }
+   }
+
+   return { available, contextualize };
+ }
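
A short consumption sketch of the client above; `fullDocumentText` and `chunkText` are placeholders, not code from this package:

import { createLlmClient } from "./llm-client.js";

const llm = createLlmClient(); // honors RALPH_LLM_URL / RALPH_LLM_MODEL overrides
if (await llm.available()) {
  const prefix = await llm.contextualize(fullDocumentText, chunkText);
  // Fail-open: an empty string means "no context"; neither call ever throws.
  if (prefix !== "") console.log(`context: ${prefix}`);
}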
package/src/reindex.ts CHANGED
@@ -4,12 +4,19 @@ import { homedir } from "node:os";
  import { KnowledgeDB } from "./db.js";
  import { FtsSearch } from "./search.js";
  import { VectorSearch } from "./vector-search.js";
- import { embed, prepareTextForEmbedding } from "./embedder.js";
+ import { embedDocument } from "./embedder.js";
  import { parseDocument, type ParsedDocument } from "./parser.js";
  import { findMarkdownFiles } from "./file-scanner.js";
  import { generateIndexes } from "./generate-indexes.js";
+ import { loadConfig, type KnowledgeConfig } from "./config.js";
+ import { loadIgnoreForRoot } from "./ignore.js";
 
- export async function reindex(dirs: string[], dbPath: string, generate: boolean = false): Promise<void> {
+ export async function reindex(
+   dirs: string[],
+   dbPath: string,
+   generate: boolean = false,
+   ignorePatterns?: string[],
+ ): Promise<void> {
    console.log(`Indexing ${dirs.join(", ")} -> ${dbPath}`);
 
    const db = new KnowledgeDB(dbPath);
@@ -32,7 +39,8 @@ export async function reindex(dirs: string[], dbPath: string, generate: boolean
    // Phase 1: Discover files on disk
    const filesOnDisk: string[] = [];
    for (const dir of dirs) {
-     const found = findMarkdownFiles(dir);
+     const matcher = loadIgnoreForRoot(dir, ignorePatterns);
+     const found = findMarkdownFiles(dir, matcher);
      console.log(` ${dir}: ${found.length} files`);
      filesOnDisk.push(...found);
    }
@@ -40,7 +48,10 @@ export async function reindex(dirs: string[], dbPath: string, generate: boolean
 
    const filesOnDiskSet = new Set(filesOnDisk.map(f => resolve(f)));
 
-   // Phase 1: Delete stale entries for files no longer on disk
+   // Phase 1: Delete stale entries for files no longer on disk.
+   // Chunk rows cascade from documents via ON DELETE CASCADE on chunks.document_id,
+   // but the vec0 virtual table does not participate in FK cascades — we must
+   // explicitly delete chunk-level vec rows via GLOB pattern.
    const syncedPaths = db.getAllSyncPaths();
    let deleted = 0;
    for (const syncedPath of syncedPaths) {
@@ -48,6 +59,8 @@ export async function reindex(dirs: string[], dbPath: string, generate: boolean
        const id = basename(syncedPath, ".md");
        fts.deleteFtsEntry(id);
        db.deleteDocument(id);
+       vec.deleteChunkVecsByDoc(id);
+       // Also delete any legacy doc-level vec row (pre-chunks schema).
        vec.deleteEmbedding(id);
        db.deleteSyncRecord(syncedPath);
        deleted++;
@@ -130,10 +143,35 @@ export async function reindex(dirs: string[], dbPath: string, generate: boolean
        db.addRelationship(edge.sourceId, edge.targetId, "untyped", edge.context);
      }
 
-     const text = prepareTextForEmbedding(parsed.title, parsed.tags, parsed.content);
+     // Chunk-aware embedding: emit one embedding per chunk, persist to both
+     // the `chunks` table and the `documents_vec` virtual table with chunk ids
+     // of the form `${doc.id}#c${index}`.
+     //
+     // We first clear any stale chunk rows for this doc_id (the document
+     // body may have shrunk across re-indexes) and stale chunk vec rows (which
+     // don't cascade from the `chunks` table because vec0 is a virtual table).
+     db.db.prepare("DELETE FROM chunks WHERE document_id = ?").run(parsed.id);
+     vec.deleteChunkVecsByDoc(parsed.id);
+     // Drop any pre-chunks schema vec row that used the bare doc id.
+     vec.deleteEmbedding(parsed.id);
+
      try {
-       const embedding = await embed(text);
-       vec.upsertEmbedding(parsed.id, embedding);
+       const chunks = await embedDocument(parsed.title, parsed.tags, parsed.content);
+       const insertChunk = db.db.prepare(
+         "INSERT INTO chunks (id, document_id, chunk_index, content, char_start, char_end) VALUES (?, ?, ?, ?, ?, ?)"
+       );
+       for (const chunk of chunks) {
+         const chunkId = `${parsed.id}#c${chunk.index}`;
+         insertChunk.run(
+           chunkId,
+           parsed.id,
+           chunk.index,
+           chunk.content,
+           chunk.charStart,
+           chunk.charEnd,
+         );
+         vec.upsertEmbedding(chunkId, chunk.embedding);
+       }
      } catch (e) {
        console.warn(`Failed to embed ${id}: ${(e as Error).message}`);
      }
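
Concretely, the id scheme introduced above means a document with id `plan-042` (an invented example) that splits into three chunks yields these row ids:

const chunkIds = ["plan-042#c0", "plan-042#c1", "plan-042#c2"];
// A GLOB such as "plan-042#c*" matches exactly these rows, which is how
// deleteChunkVecsByDoc can purge vec0 entries that FK cascades cannot reach.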
@@ -182,31 +220,94 @@ export async function reindex(dirs: string[], dbPath: string, generate: boolean
 
  const DEFAULT_DB_PATH = join(homedir(), ".ralph-hero", "knowledge.db");
 
- export function resolveDirs(): { dirs: string[]; dbPath: string; generate: boolean } {
+ export type ResolvedDirsSource = "cli" | "env" | "config" | "fallback";
+
+ export interface ResolvedDirs {
+   dirs: string[];
+   dbPath: string;
+   generate: boolean;
+   source: ResolvedDirsSource;
+   config: KnowledgeConfig;
+ }
+
+ /**
+  * Resolve the set of roots, database path, and generate flag for a reindex
+  * run. Precedence (highest to lowest):
+  *   1. CLI positional args
+  *   2. `RALPH_KNOWLEDGE_DIRS` environment variable
+  *   3. `config.roots` from `~/.ralph/knowledge.config.json`
+  *   4. `"../../thoughts"` fallback
+  *
+  * `dbPath` precedence is independent: CLI `.db` positional > `RALPH_KNOWLEDGE_DB`
+  * env var > `config.dbPath` > {@link DEFAULT_DB_PATH}.
+  *
+  * The returned `config` is forwarded to the caller so `ignorePatterns` can be
+  * threaded into {@link reindex}.
+  */
+ export function resolveDirs(): ResolvedDirs {
    const cliArgs = process.argv.slice(2);
    const noGenerate = cliArgs.includes("--no-generate");
    const positional = cliArgs.filter(a => !a.startsWith("--"));
    const cliDb = positional.find(a => a.endsWith(".db"));
    const cliDirs = positional.filter(a => !a.endsWith(".db"));
 
+   const config = loadConfig();
+
+   const resolveDbPath = (): string =>
+     cliDb ??
+     process.env.RALPH_KNOWLEDGE_DB ??
+     config.dbPath ??
+     DEFAULT_DB_PATH;
+
    if (cliDirs.length > 0) {
-     return { dirs: cliDirs, dbPath: cliDb ?? DEFAULT_DB_PATH, generate: !noGenerate };
+     console.log("Using roots from: CLI");
+     return {
+       dirs: cliDirs,
+       dbPath: resolveDbPath(),
+       generate: !noGenerate,
+       source: "cli",
+       config,
+     };
    }
 
    const envDirs = process.env.RALPH_KNOWLEDGE_DIRS;
    if (envDirs) {
+     const parsed = envDirs.split(",").map(d => d.trim()).filter(Boolean);
+     if (parsed.length > 0) {
+       console.log("Using roots from: env");
+       return {
+         dirs: parsed,
+         dbPath: resolveDbPath(),
+         generate: !noGenerate,
+         source: "env",
+         config,
+       };
+     }
+   }
+
+   if (config.roots && config.roots.length > 0) {
+     console.log("Using roots from: config");
      return {
-       dirs: envDirs.split(",").map(d => d.trim()).filter(Boolean),
-       dbPath: cliDb ?? process.env.RALPH_KNOWLEDGE_DB ?? DEFAULT_DB_PATH,
+       dirs: config.roots,
+       dbPath: resolveDbPath(),
        generate: !noGenerate,
+       source: "config",
+       config,
      };
    }
 
-   return { dirs: ["../../thoughts"], dbPath: cliDb ?? DEFAULT_DB_PATH, generate: !noGenerate };
+   console.log("Using roots from: fallback");
+   return {
+     dirs: ["../../thoughts"],
+     dbPath: resolveDbPath(),
+     generate: !noGenerate,
+     source: "fallback",
+     config,
+   };
  }
 
  const isMain = process.argv[1]?.endsWith("reindex.js");
  if (isMain) {
-   const { dirs, dbPath, generate } = resolveDirs();
-   reindex(dirs, dbPath, generate).catch(console.error);
+   const { dirs, dbPath, generate, config } = resolveDirs();
+   reindex(dirs, dbPath, generate, config.ignorePatterns).catch(console.error);
  }
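
As a worked illustration of the new precedence rules (the paths and env values below are invented):

// Hypothetical invocation:
//   RALPH_KNOWLEDGE_DB=/tmp/kb.db node dist/reindex.js /home/me/notes --no-generate
//
// resolveDirs() would then return:
//   dirs:     ["/home/me/notes"]  (CLI positionals beat env and config roots)
//   dbPath:   "/tmp/kb.db"        (no CLI .db arg, so RALPH_KNOWLEDGE_DB wins)
//   generate: false               (--no-generate)
//   source:   "cli"
//   config:   parsed ~/.ralph/knowledge.config.json, which supplies ignorePatterns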