ralph-hero-knowledge-index 0.1.25 → 0.1.26

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "ralph-knowledge",
3
- "version": "0.1.25",
3
+ "version": "0.1.26",
4
4
  "description": "Knowledge graph for ralph-hero: semantic search, relationship traversal, and document indexing across thoughts/ documents. Optional companion to ralph-hero.",
5
5
  "author": {
6
6
  "name": "Chad Dubiel",
package/.mcp.json CHANGED
@@ -2,7 +2,7 @@
2
2
  "mcpServers": {
3
3
  "ralph-knowledge": {
4
4
  "command": "npx",
5
- "args": ["-y", "ralph-hero-knowledge-index@0.1.25"]
5
+ "args": ["-y", "ralph-hero-knowledge-index@0.1.26"]
6
6
  }
7
7
  }
8
8
  }
@@ -1,5 +1,6 @@
1
1
  import { type FeatureExtractionPipeline } from "@huggingface/transformers";
2
2
  import { type Chunk, type ChunkerOptions } from "./chunker.js";
3
+ import type { LlmClient } from "./llm-client.js";
3
4
  export declare function getEmbedder(): Promise<FeatureExtractionPipeline>;
4
5
  export declare function embed(text: string): Promise<Float32Array>;
5
6
  /**
@@ -11,6 +12,24 @@ export interface DocumentChunk extends Chunk {
11
12
  embedding: Float32Array;
12
13
  contextPrefix?: string;
13
14
  }
15
+ /**
16
+ * Options accepted by `embedDocument`. Extends `ChunkerOptions` with optional
17
+ * Contextual Retrieval inputs:
18
+ *
19
+ * - `llm`: when present, each chunk is run through `llm.contextualize(fullDoc, chunkContent)`
20
+ * and the returned string is prepended to the embed text (and persisted on the
21
+ * resulting `DocumentChunk.contextPrefix`). Empty-string returns (fail-open from
22
+ * the LLM client) cause the embed text to fall back to the legacy
23
+ * `${title}\n${tagLine}\n${chunk.content}` shape.
24
+ * - `cachedPrefixes`: optional `Map<chunkIndex, contextPrefix>` from a prior run.
25
+ * When a chunk's index has a cached prefix, the LLM call is skipped and the
26
+ * cached string is reused verbatim. Used by the reindex content-hash cache
27
+ * fast-path (Task 6.4) so unchanged docs don't re-contact the LLM endpoint.
28
+ */
29
+ export interface EmbedDocumentOptions extends ChunkerOptions {
30
+ llm?: LlmClient;
31
+ cachedPrefixes?: Map<number, string>;
32
+ }
14
33
  /**
15
34
  * Embed a document by splitting it into chunks and emitting one embedding
16
35
  * per chunk. The embedded text for each chunk is
@@ -18,11 +37,17 @@ export interface DocumentChunk extends Chunk {
18
37
  * tags) travel with every chunk embedding — matching the shape of the legacy
19
38
  * `prepareTextForEmbedding()` but without the 500-char truncation.
20
39
  *
40
+ * When `opts.llm` is provided (Phase 6 — Contextual Retrieval), a short
41
+ * context prefix is generated per chunk via `opts.llm.contextualize(content, chunk.content)`
42
+ * and prepended to the embed text as `${contextPrefix}\n${title}\n${tagLine}\n${chunk.content}`.
43
+ * If `contextualize` returns `""` (fail-open path), the embed text reverts to the
44
+ * no-context shape so we never emit a leading blank line.
45
+ *
21
46
  * Short documents (<= chunkSize) produce exactly one chunk covering the whole
22
47
  * content. Empty content yields a single chunk with empty content (so callers
23
48
  * still get a title/tag-only embedding for stub documents).
24
49
  */
25
- export declare function embedDocument(title: string, tags: string[], content: string, opts?: ChunkerOptions): Promise<DocumentChunk[]>;
50
+ export declare function embedDocument(title: string, tags: string[], content: string, opts?: EmbedDocumentOptions): Promise<DocumentChunk[]>;
26
51
  /**
27
52
  * Back-compat shim: kept so callers outside the reindex path can still build
28
53
  * a title/tags/first-paragraph string. No longer used by `embedDocument` (the
package/dist/embedder.js CHANGED
@@ -25,6 +25,12 @@ export async function embed(text) {
25
25
  * tags) travel with every chunk embedding — matching the shape of the legacy
26
26
  * `prepareTextForEmbedding()` but without the 500-char truncation.
27
27
  *
28
+ * When `opts.llm` is provided (Phase 6 — Contextual Retrieval), a short
29
+ * context prefix is generated per chunk via `opts.llm.contextualize(content, chunk.content)`
30
+ * and prepended to the embed text as `${contextPrefix}\n${title}\n${tagLine}\n${chunk.content}`.
31
+ * If `contextualize` returns `""` (fail-open path), the embed text reverts to the
32
+ * no-context shape so we never emit a leading blank line.
33
+ *
28
34
  * Short documents (<= chunkSize) produce exactly one chunk covering the whole
29
35
  * content. Empty content yields a single chunk with empty content (so callers
30
36
  * still get a title/tag-only embedding for stub documents).
@@ -37,10 +43,29 @@ export async function embedDocument(title, tags, content, opts) {
37
43
  const chunks = content.length === 0
38
44
  ? [{ index: 0, content: "", charStart: 0, charEnd: 0 }]
39
45
  : chunkText(content, opts);
46
+ const llm = opts?.llm;
47
+ const cached = opts?.cachedPrefixes;
40
48
  const out = [];
41
49
  for (const chunk of chunks) {
42
- const parts = [title, tagLine, chunk.content].filter(p => p.length > 0);
43
- const embedText = parts.join("\n");
50
+ let contextPrefix = "";
51
+ if (llm) {
52
+ // Cache hit: reuse prior context_prefix when the caller supplied a map
53
+ // keyed by chunk.index. Avoids an LLM round-trip per unchanged chunk.
54
+ if (cached && cached.has(chunk.index)) {
55
+ contextPrefix = cached.get(chunk.index) ?? "";
56
+ }
57
+ else {
58
+ // `contextualize` is fail-open: it returns "" on any network/timeout/
59
+ // malformed-response error. That empty string propagates into the
60
+ // returned `DocumentChunk.contextPrefix` (persisted by the caller) and
61
+ // causes the embed text to skip the leading blank line below.
62
+ contextPrefix = await llm.contextualize(content, chunk.content);
63
+ }
64
+ }
65
+ const parts = contextPrefix.length > 0
66
+ ? [contextPrefix, title, tagLine, chunk.content]
67
+ : [title, tagLine, chunk.content];
68
+ const embedText = parts.filter(p => p.length > 0).join("\n");
44
69
  const embedding = await embed(embedText);
45
70
  out.push({
46
71
  index: chunk.index,
@@ -48,6 +73,7 @@ export async function embedDocument(title, tags, content, opts) {
48
73
  charStart: chunk.charStart,
49
74
  charEnd: chunk.charEnd,
50
75
  embedding,
76
+ contextPrefix,
51
77
  });
52
78
  }
53
79
  return out;
@@ -1 +1 @@
1
- {"version":3,"file":"embedder.js","sourceRoot":"","sources":["../src/embedder.ts"],"names":[],"mappings":"AAAA,OAAO,EACL,QAAQ,GAET,MAAM,2BAA2B,CAAC;AACnC,OAAO,EAAE,SAAS,EAAmC,MAAM,cAAc,CAAC;AAE1E,MAAM,QAAQ,GAAG,yBAAyB,CAAC;AAE3C,IAAI,gBAAgB,GAAqC,IAAI,CAAC;AAE9D,MAAM,CAAC,KAAK,UAAU,WAAW;IAC/B,IAAI,CAAC,gBAAgB,EAAE,CAAC;QACtB,mEAAmE;QACnE,gBAAgB,GAAG,CAAC,MAAM,QAAQ,CAChC,oBAAoB,EACpB,QAAQ,CACT,CAA8B,CAAC;IAClC,CAAC;IACD,OAAO,gBAAgB,CAAC;AAC1B,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,KAAK,CAAC,IAAY;IACtC,MAAM,QAAQ,GAAG,MAAM,WAAW,EAAE,CAAC;IACrC,gFAAgF;IAChF,MAAM,MAAM,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE;QAClC,OAAO,EAAE,MAAM;QACf,SAAS,EAAE,IAAI;KAChB,CAAC,CAAC;IACH,OAAO,IAAI,YAAY,CAAC,MAAM,CAAC,IAAyB,CAAC,CAAC;AAC5D,CAAC;AAYD;;;;;;;;;;GAUG;AACH,MAAM,CAAC,KAAK,UAAU,aAAa,CACjC,KAAa,EACb,IAAc,EACd,OAAe,EACf,IAAqB;IAErB,MAAM,OAAO,GAAG,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;IAEvD,6EAA6E;IAC7E,oEAAoE;IACpE,sCAAsC;IACtC,MAAM,MAAM,GAAY,OAAO,CAAC,MAAM,KAAK,CAAC;QAC1C,CAAC,CAAC,CAAC,EAAE,KAAK,EAAE,CAAC,EAAE,OAAO,EAAE,EAAE,EAAE,SAAS,EAAE,CAAC,EAAE,OAAO,EAAE,CAAC,EAAE,CAAC;QACvD,CAAC,CAAC,SAAS,CAAC,OAAO,EAAE,IAAI,CAAC,CAAC;IAE7B,MAAM,GAAG,GAAoB,EAAE,CAAC;IAChC,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;QAC3B,MAAM,KAAK,GAAG,CAAC,KAAK,EAAE,OAAO,EAAE,KAAK,CAAC,OAAO,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;QACxE,MAAM,SAAS,GAAG,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACnC,MAAM,SAAS,GAAG,MAAM,KAAK,CAAC,SAAS,CAAC,CAAC;QACzC,GAAG,CAAC,IAAI,CAAC;YACP,KAAK,EAAE,KAAK,CAAC,KAAK;YAClB,OAAO,EAAE,KAAK,CAAC,OAAO;YACtB,SAAS,EAAE,KAAK,CAAC,SAAS;YAC1B,OAAO,EAAE,KAAK,CAAC,OAAO;YACtB,SAAS;SACV,CAAC,CAAC;IACL,CAAC;IACD,OAAO,GAAG,CAAC;AACb,CAAC;AAED;;;;GAIG;AACH,MAAM,UAAU,uBAAuB,CACrC,KAAa,EACb,IAAc,EACd,OAAe;IAEf,MAAM,OAAO,GAAG,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;IACvD,8EAA8E;IAC9E,MAAM,UAAU,GAAG,OAAO,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;IAC1C,MAAM,cAAc,GAAG,UAAU,CAAC,IAAI,CAAC,CAAC,CAAC,EA
AE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,MAAM,GAAG,CAAC,CAAC,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;IAC/E,MAAM,KAAK,GAAG,CAAC,KAAK,EAAE,OAAO,EAAE,cAAc,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;IACzE,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAC1B,CAAC"}
1
+ {"version":3,"file":"embedder.js","sourceRoot":"","sources":["../src/embedder.ts"],"names":[],"mappings":"AAAA,OAAO,EACL,QAAQ,GAET,MAAM,2BAA2B,CAAC;AACnC,OAAO,EAAE,SAAS,EAAmC,MAAM,cAAc,CAAC;AAG1E,MAAM,QAAQ,GAAG,yBAAyB,CAAC;AAE3C,IAAI,gBAAgB,GAAqC,IAAI,CAAC;AAE9D,MAAM,CAAC,KAAK,UAAU,WAAW;IAC/B,IAAI,CAAC,gBAAgB,EAAE,CAAC;QACtB,mEAAmE;QACnE,gBAAgB,GAAG,CAAC,MAAM,QAAQ,CAChC,oBAAoB,EACpB,QAAQ,CACT,CAA8B,CAAC;IAClC,CAAC;IACD,OAAO,gBAAgB,CAAC;AAC1B,CAAC;AAED,MAAM,CAAC,KAAK,UAAU,KAAK,CAAC,IAAY;IACtC,MAAM,QAAQ,GAAG,MAAM,WAAW,EAAE,CAAC;IACrC,gFAAgF;IAChF,MAAM,MAAM,GAAG,MAAM,QAAQ,CAAC,IAAI,EAAE;QAClC,OAAO,EAAE,MAAM;QACf,SAAS,EAAE,IAAI;KAChB,CAAC,CAAC;IACH,OAAO,IAAI,YAAY,CAAC,MAAM,CAAC,IAAyB,CAAC,CAAC;AAC5D,CAAC;AA+BD;;;;;;;;;;;;;;;;GAgBG;AACH,MAAM,CAAC,KAAK,UAAU,aAAa,CACjC,KAAa,EACb,IAAc,EACd,OAAe,EACf,IAA2B;IAE3B,MAAM,OAAO,GAAG,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;IAEvD,6EAA6E;IAC7E,oEAAoE;IACpE,sCAAsC;IACtC,MAAM,MAAM,GAAY,OAAO,CAAC,MAAM,KAAK,CAAC;QAC1C,CAAC,CAAC,CAAC,EAAE,KAAK,EAAE,CAAC,EAAE,OAAO,EAAE,EAAE,EAAE,SAAS,EAAE,CAAC,EAAE,OAAO,EAAE,CAAC,EAAE,CAAC;QACvD,CAAC,CAAC,SAAS,CAAC,OAAO,EAAE,IAAI,CAAC,CAAC;IAE7B,MAAM,GAAG,GAAG,IAAI,EAAE,GAAG,CAAC;IACtB,MAAM,MAAM,GAAG,IAAI,EAAE,cAAc,CAAC;IAEpC,MAAM,GAAG,GAAoB,EAAE,CAAC;IAChC,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;QAC3B,IAAI,aAAa,GAAG,EAAE,CAAC;QACvB,IAAI,GAAG,EAAE,CAAC;YACR,uEAAuE;YACvE,sEAAsE;YACtE,IAAI,MAAM,IAAI,MAAM,CAAC,GAAG,CAAC,KAAK,CAAC,KAAK,CAAC,EAAE,CAAC;gBACtC,aAAa,GAAG,MAAM,CAAC,GAAG,CAAC,KAAK,CAAC,KAAK,CAAC,IAAI,EAAE,CAAC;YAChD,CAAC;iBAAM,CAAC;gBACN,sEAAsE;gBACtE,kEAAkE;gBAClE,uEAAuE;gBACvE,8DAA8D;gBAC9D,aAAa,GAAG,MAAM,GAAG,CAAC,aAAa,CAAC,OAAO,EAAE,KAAK,CAAC,OAAO,CAAC,CAAC;YAClE,CAAC;QACH,CAAC;QAED,MAAM,KAAK,GAAG,aAAa,CAAC,MAAM,GAAG,CAAC;YACpC,CAAC,CAAC,CAAC,aAAa,EAAE,KAAK,EAAE,OAAO,EAAE,KAAK,CAAC,OAAO,CAAC;YAChD,CAAC,CAAC,CAAC,KAAK,EAAE,OAAO,EAAE,KAAK,CAAC,OAAO,CAAC,CAAC;QACpC,MAAM,SAAS,GAAG,KAAK,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAA
C,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAC7D,MAAM,SAAS,GAAG,MAAM,KAAK,CAAC,SAAS,CAAC,CAAC;QACzC,GAAG,CAAC,IAAI,CAAC;YACP,KAAK,EAAE,KAAK,CAAC,KAAK;YAClB,OAAO,EAAE,KAAK,CAAC,OAAO;YACtB,SAAS,EAAE,KAAK,CAAC,SAAS;YAC1B,OAAO,EAAE,KAAK,CAAC,OAAO;YACtB,SAAS;YACT,aAAa;SACd,CAAC,CAAC;IACL,CAAC;IACD,OAAO,GAAG,CAAC;AACb,CAAC;AAED;;;;GAIG;AACH,MAAM,UAAU,uBAAuB,CACrC,KAAa,EACb,IAAc,EACd,OAAe;IAEf,MAAM,OAAO,GAAG,IAAI,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;IACvD,8EAA8E;IAC9E,MAAM,UAAU,GAAG,OAAO,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC;IAC1C,MAAM,cAAc,GAAG,UAAU,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,MAAM,GAAG,CAAC,CAAC,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC;IAC/E,MAAM,KAAK,GAAG,CAAC,KAAK,EAAE,OAAO,EAAE,cAAc,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;IACzE,OAAO,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAC1B,CAAC"}
package/dist/reindex.js CHANGED
@@ -1,6 +1,7 @@
1
1
  import { readFileSync, statSync } from "node:fs";
2
2
  import { join, relative, resolve, basename } from "node:path";
3
3
  import { homedir } from "node:os";
4
+ import { createHash } from "node:crypto";
4
5
  import { KnowledgeDB } from "./db.js";
5
6
  import { FtsSearch } from "./search.js";
6
7
  import { VectorSearch } from "./vector-search.js";
@@ -10,6 +11,7 @@ import { findMarkdownFiles } from "./file-scanner.js";
10
11
  import { generateIndexes } from "./generate-indexes.js";
11
12
  import { loadConfig } from "./config.js";
12
13
  import { loadIgnoreForRoot } from "./ignore.js";
14
+ import { createLlmClient } from "./llm-client.js";
13
15
  export async function reindex(dirs, dbPath, generate = false, ignorePatterns) {
14
16
  console.log(`Indexing ${dirs.join(", ")} -> ${dbPath}`);
15
17
  const db = new KnowledgeDB(dbPath);
@@ -17,6 +19,26 @@ export async function reindex(dirs, dbPath, generate = false, ignorePatterns) {
17
19
  fts.ensureTable();
18
20
  const vec = new VectorSearch(db);
19
21
  vec.createIndex();
22
+ // Phase 6 (GH-767): Contextual Retrieval wiring.
23
+ // `RALPH_CONTEXTUAL_RETRIEVAL` gates the whole feature. Default on; treat
24
+ // literal "0" / "false" as disabled. When enabled we probe the endpoint once
25
+ // and fail open on unreachable — all downstream chunks then embed without a
26
+ // context prefix and we log a single warning so the operator knows why.
27
+ const flagRaw = process.env.RALPH_CONTEXTUAL_RETRIEVAL;
28
+ const contextualEnabled = flagRaw !== "0" && flagRaw !== "false";
29
+ let llm;
30
+ if (contextualEnabled) {
31
+ const llmUrl = process.env.RALPH_LLM_URL ?? "http://localhost:8000";
32
+ const candidate = createLlmClient();
33
+ const llmReady = await candidate.available();
34
+ if (llmReady) {
35
+ llm = candidate;
36
+ }
37
+ else {
38
+ console.warn(`LLM endpoint unreachable at ${llmUrl}, contextual retrieval disabled for this run`);
39
+ llm = undefined;
40
+ }
41
+ }
20
42
  // Schema version check — force full re-embed when embedding algorithm changes
21
43
  const SCHEMA_VERSION = "3";
22
44
  const currentVersion = db.getMeta("schema_version");
@@ -62,6 +84,7 @@ export async function reindex(dirs, dbPath, generate = false, ignorePatterns) {
62
84
  const parsedDocs = [];
63
85
  let indexed = 0;
64
86
  let skipped = 0;
87
+ let totalChunks = 0;
65
88
  for (const filePath of filesOnDisk) {
66
89
  const absPath = resolve(filePath);
67
90
  const mtime = Math.trunc(statSync(absPath).mtimeMs);
@@ -122,6 +145,29 @@ export async function reindex(dirs, dbPath, generate = false, ignorePatterns) {
122
145
  db.upsertStubDocument(edge.targetId);
123
146
  db.addRelationship(edge.sourceId, edge.targetId, "untyped", edge.context);
124
147
  }
148
+ // Content-hash cache for Contextual Retrieval prefixes. The outer mtime
149
+ // skip at line ~75 already short-circuits the overwhelming majority of
150
+ // unchanged docs (no embedder or LLM calls). This inner hash check is
151
+ // specifically for the rare case where mtime differs but content is
152
+ // byte-identical (e.g., git checkout touching the file). When hash matches
153
+ // AND we have a live LLM AND chunks already exist, we reuse the prior
154
+ // context_prefix map and skip the per-chunk LLM round-trips.
155
+ //
156
+ // Simpler alternative considered: rely entirely on mtime. Rejected because
157
+ // the feature spec (Task 6.4 acceptance) explicitly requires re-running
158
+ // reindex without content changes to reuse existing context_prefix.
159
+ const contentHash = createHash("sha256").update(parsed.content).digest("hex").slice(0, 16);
160
+ const hashKey = `content_hash:${parsed.id}`;
161
+ const priorHash = db.getMeta(hashKey);
162
+ let cachedPrefixes;
163
+ if (llm && priorHash === contentHash) {
164
+ const priorChunks = db.db
165
+ .prepare("SELECT chunk_index, context_prefix FROM chunks WHERE document_id = ? ORDER BY chunk_index")
166
+ .all(parsed.id);
167
+ if (priorChunks.length > 0) {
168
+ cachedPrefixes = new Map(priorChunks.map(r => [r.chunk_index, r.context_prefix ?? ""]));
169
+ }
170
+ }
125
171
  // Chunk-aware embedding: emit one embedding per chunk, persist to both
126
172
  // the `chunks` table and the `documents_vec` virtual table with chunk ids
127
173
  // of the form `${doc.id}#c${index}`.
@@ -134,13 +180,22 @@ export async function reindex(dirs, dbPath, generate = false, ignorePatterns) {
134
180
  // Drop any pre-chunks schema vec row that used the bare doc id.
135
181
  vec.deleteEmbedding(parsed.id);
136
182
  try {
137
- const chunks = await embedDocument(parsed.title, parsed.tags, parsed.content);
138
- const insertChunk = db.db.prepare("INSERT INTO chunks (id, document_id, chunk_index, content, char_start, char_end) VALUES (?, ?, ?, ?, ?, ?)");
183
+ const chunks = await embedDocument(parsed.title, parsed.tags, parsed.content, {
184
+ llm,
185
+ cachedPrefixes,
186
+ });
187
+ const insertChunk = db.db.prepare("INSERT INTO chunks (id, document_id, chunk_index, content, char_start, char_end, context_prefix) VALUES (?, ?, ?, ?, ?, ?, ?)");
139
188
  for (const chunk of chunks) {
140
189
  const chunkId = `${parsed.id}#c${chunk.index}`;
141
- insertChunk.run(chunkId, parsed.id, chunk.index, chunk.content, chunk.charStart, chunk.charEnd);
190
+ insertChunk.run(chunkId, parsed.id, chunk.index, chunk.content, chunk.charStart, chunk.charEnd, chunk.contextPrefix ?? "");
142
191
  vec.upsertEmbedding(chunkId, chunk.embedding);
192
+ totalChunks++;
193
+ if (totalChunks % 50 === 0) {
194
+ console.log(` ${totalChunks} chunks embedded`);
195
+ }
143
196
  }
197
+ // Record the content hash for the next reindex cache check.
198
+ db.setMeta(hashKey, contentHash);
144
199
  }
145
200
  catch (e) {
146
201
  console.warn(`Failed to embed ${id}: ${e.message}`);
@@ -1 +1 @@
1
- {"version":3,"file":"reindex.js","sourceRoot":"","sources":["../src/reindex.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,YAAY,EAAE,QAAQ,EAAE,MAAM,SAAS,CAAC;AACjD,OAAO,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,QAAQ,EAAE,MAAM,WAAW,CAAC;AAC9D,OAAO,EAAE,OAAO,EAAE,MAAM,SAAS,CAAC;AAClC,OAAO,EAAE,WAAW,EAAE,MAAM,SAAS,CAAC;AACtC,OAAO,EAAE,SAAS,EAAE,MAAM,aAAa,CAAC;AACxC,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAClD,OAAO,EAAE,aAAa,EAAE,MAAM,eAAe,CAAC;AAC9C,OAAO,EAAE,aAAa,EAAuB,MAAM,aAAa,CAAC;AACjE,OAAO,EAAE,iBAAiB,EAAE,MAAM,mBAAmB,CAAC;AACtD,OAAO,EAAE,eAAe,EAAE,MAAM,uBAAuB,CAAC;AACxD,OAAO,EAAE,UAAU,EAAwB,MAAM,aAAa,CAAC;AAC/D,OAAO,EAAE,iBAAiB,EAAE,MAAM,aAAa,CAAC;AAEhD,MAAM,CAAC,KAAK,UAAU,OAAO,CAC3B,IAAc,EACd,MAAc,EACd,WAAoB,KAAK,EACzB,cAAyB;IAEzB,OAAO,CAAC,GAAG,CAAC,YAAY,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,OAAO,MAAM,EAAE,CAAC,CAAC;IAExD,MAAM,EAAE,GAAG,IAAI,WAAW,CAAC,MAAM,CAAC,CAAC;IACnC,MAAM,GAAG,GAAG,IAAI,SAAS,CAAC,EAAE,CAAC,CAAC;IAC9B,GAAG,CAAC,WAAW,EAAE,CAAC;IAClB,MAAM,GAAG,GAAG,IAAI,YAAY,CAAC,EAAE,CAAC,CAAC;IACjC,GAAG,CAAC,WAAW,EAAE,CAAC;IAElB,8EAA8E;IAC9E,MAAM,cAAc,GAAG,GAAG,CAAC;IAC3B,MAAM,cAAc,GAAG,EAAE,CAAC,OAAO,CAAC,gBAAgB,CAAC,CAAC;IACpD,IAAI,mBAAmB,GAAG,KAAK,CAAC;IAChC,IAAI,cAAc,KAAK,cAAc,EAAE,CAAC;QACtC,OAAO,CAAC,GAAG,CAAC,uEAAuE,CAAC,CAAC;QACrF,EAAE,CAAC,gBAAgB,EAAE,CAAC;QACtB,EAAE,CAAC,OAAO,CAAC,gBAAgB,EAAE,cAAc,CAAC,CAAC;QAC7C,mBAAmB,GAAG,IAAI,CAAC;IAC7B,CAAC;IAED,kCAAkC;IAClC,MAAM,WAAW,GAAa,EAAE,CAAC;IACjC,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;QACvB,MAAM,OAAO,GAAG,iBAAiB,CAAC,GAAG,EAAE,cAAc,CAAC,CAAC;QACvD,MAAM,KAAK,GAAG,iBAAiB,CAAC,GAAG,EAAE,OAAO,CAAC,CAAC;QAC9C,OAAO,CAAC,GAAG,CAAC,KAAK,GAAG,KAAK,KAAK,CAAC,MAAM,QAAQ,CAAC,CAAC;QAC/C,WAAW,CAAC,IAAI,CAAC,GAAG,KAAK,CAAC,CAAC;IAC7B,CAAC;IACD,OAAO,CAAC,GAAG,CAAC,SAAS,WAAW,CAAC,MAAM,uBAAuB,CAAC,CAAC;IAEhE,MAAM,cAAc,GAAG,IAAI,GAAG,CAAC,WAAW,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IAEjE,6DAA6D;IAC7D,iFAAiF;IACjF,2EAA2E;IAC3E,2DAA2D;IAC3D,MAAM,WAAW,GAAG,EAAE,CAAC,eAAe,EAAE,CAAC;IACzC,IAAI,OAAO,GAAG,CAAC,CAAC;I
AChB,KAAK,MAAM,UAAU,IAAI,WAAW,EAAE,CAAC;QACrC,IAAI,CAAC,cAAc,CAAC,GAAG,CAAC,UAAU,CAAC,EAAE,CAAC;YACpC,MAAM,EAAE,GAAG,QAAQ,CAAC,UAAU,EAAE,KAAK,CAAC,CAAC;YACvC,GAAG,CAAC,cAAc,CAAC,EAAE,CAAC,CAAC;YACvB,EAAE,CAAC,cAAc,CAAC,EAAE,CAAC,CAAC;YACtB,GAAG,CAAC,oBAAoB,CAAC,EAAE,CAAC,CAAC;YAC7B,gEAAgE;YAChE,GAAG,CAAC,eAAe,CAAC,EAAE,CAAC,CAAC;YACxB,EAAE,CAAC,gBAAgB,CAAC,UAAU,CAAC,CAAC;YAChC,OAAO,EAAE,CAAC;QACZ,CAAC;IACH,CAAC;IACD,IAAI,OAAO,GAAG,CAAC,EAAE,CAAC;QAChB,OAAO,CAAC,GAAG,CAAC,aAAa,OAAO,gBAAgB,CAAC,CAAC;IACpD,CAAC;IAED,yCAAyC;IACzC,MAAM,UAAU,GAAqB,EAAE,CAAC;IACxC,IAAI,OAAO,GAAG,CAAC,CAAC;IAChB,IAAI,OAAO,GAAG,CAAC,CAAC;IAChB,KAAK,MAAM,QAAQ,IAAI,WAAW,EAAE,CAAC;QACnC,MAAM,OAAO,GAAG,OAAO,CAAC,QAAQ,CAAC,CAAC;QAClC,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC,OAAO,CAAC,CAAC;QAEpD,8CAA8C;QAC9C,MAAM,UAAU,GAAG,EAAE,CAAC,aAAa,CAAC,OAAO,CAAC,CAAC;QAC7C,IAAI,UAAU,IAAI,UAAU,CAAC,KAAK,KAAK,KAAK,EAAE,CAAC;YAC7C,OAAO,EAAE,CAAC;YACV,SAAS;QACX,CAAC;QAED,MAAM,GAAG,GAAG,YAAY,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;QAC5C,MAAM,SAAS,GAAG,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,OAAO,CAAC,UAAU,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;QACjE,MAAM,OAAO,GAAG,SAAS;YACvB,CAAC,CAAC,QAAQ,CAAC,OAAO,CAAC,SAAS,EAAE,IAAI,CAAC,EAAE,OAAO,CAAC;YAC7C,CAAC,CAAC,QAAQ,CAAC;QACb,MAAM,EAAE,GAAG,QAAQ,CAAC,QAAQ,EAAE,KAAK,CAAC,CAAC;QAErC,MAAM,MAAM,GAAG,aAAa,CAAC,EAAE,EAAE,OAAO,EAAE,GAAG,CAAC,CAAC;QAC/C,UAAU,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;QAExB,MAAM,OAAO,GAAa,EAAE,CAAC;QAC7B,IAAI,CAAC,MAAM,CAAC,IAAI;YAAE,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;QACvC,IAAI,CAAC,MAAM,CAAC,IAAI;YAAE,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;QACvC,IAAI,CAAC,MAAM,CAAC,MAAM;YAAE,OAAO,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;QAC3C,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACvB,OAAO,CAAC,IAAI,CAAC,cAAc,EAAE,yBAAyB,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QAC9E,CAAC;QAED,uEAAuE;QACvE,IAAI,EAAE,CAAC,cAAc,CAAC,MAAM,CAAC,EAAE,CAAC,EAAE,CAAC;YACjC,GAAG,CAAC,cAAc,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC;QAChC,CAAC;QAED,EAAE,CAAC,cAAc,CAAC;YAChB,EAAE,EAAE,MAAM,CAAC,EAAE;YACb,IAAI,
EAAE,MAAM,CAAC,IAAI;YACjB,KAAK,EAAE,MAAM,CAAC,KAAK;YACnB,IAAI,EAAE,MAAM,CAAC,IAAI;YACjB,IAAI,EAAE,MAAM,CAAC,IAAI;YACjB,MAAM,EAAE,MAAM,CAAC,MAAM;YACrB,WAAW,EAAE,MAAM,CAAC,WAAW;YAC/B,OAAO,EAAE,MAAM,CAAC,OAAO;SACxB,CAAC,CAAC;QAEH,oCAAoC;QACpC,GAAG,CAAC,cAAc,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC;QAE9B,IAAI,MAAM,CAAC,IAAI,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC3B,EAAE,CAAC,OAAO,CAAC,MAAM,CAAC,EAAE,EAAE,MAAM,CAAC,IAAI,CAAC,CAAC;QACrC,CAAC;QAED,4EAA4E;QAC5E,EAAE,CAAC,EAAE,CAAC,OAAO,CAAC,+CAA+C,CAAC,CAAC,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC;QAE9E,6EAA6E;QAC7E,gFAAgF;QAChF,8EAA8E;QAC9E,8CAA8C;QAC9C,KAAK,MAAM,GAAG,IAAI,MAAM,CAAC,aAAa,EAAE,CAAC;YACvC,EAAE,CAAC,kBAAkB,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC;YACpC,EAAE,CAAC,eAAe,CAAC,GAAG,CAAC,QAAQ,EAAE,GAAG,CAAC,QAAQ,EAAE,GAAG,CAAC,IAAI,CAAC,CAAC;QAC3D,CAAC;QAED,KAAK,MAAM,IAAI,IAAI,MAAM,CAAC,YAAY,EAAE,CAAC;YACvC,EAAE,CAAC,kBAAkB,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;YACrC,EAAE,CAAC,eAAe,CAAC,IAAI,CAAC,QAAQ,EAAE,IAAI,CAAC,QAAQ,EAAE,SAAS,EAAE,IAAI,CAAC,OAAO,CAAC,CAAC;QAC5E,CAAC;QAED,uEAAuE;QACvE,0EAA0E;QAC1E,qCAAqC;QACrC,EAAE;QACF,oEAAoE;QACpE,0EAA0E;QAC1E,0EAA0E;QAC1E,EAAE,CAAC,EAAE,CAAC,OAAO,CAAC,0CAA0C,CAAC,CAAC,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC;QACzE,GAAG,CAAC,oBAAoB,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC;QACpC,gEAAgE;QAChE,GAAG,CAAC,eAAe,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC;QAE/B,IAAI,CAAC;YACH,MAAM,MAAM,GAAG,MAAM,aAAa,CAAC,MAAM,CAAC,KAAK,EAAE,MAAM,CAAC,IAAI,EAAE,MAAM,CAAC,OAAO,CAAC,CAAC;YAC9E,MAAM,WAAW,GAAG,EAAE,CAAC,EAAE,CAAC,OAAO,CAC/B,4GAA4G,CAC7G,CAAC;YACF,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;gBAC3B,MAAM,OAAO,GAAG,GAAG,MAAM,CAAC,EAAE,KAAK,KAAK,CAAC,KAAK,EAAE,CAAC;gBAC/C,WAAW,CAAC,GAAG,CACb,OAAO,EACP,MAAM,CAAC,EAAE,EACT,KAAK,CAAC,KAAK,EACX,KAAK,CAAC,OAAO,EACb,KAAK,CAAC,SAAS,EACf,KAAK,CAAC,OAAO,CACd,CAAC;gBACF,GAAG,CAAC,eAAe,CAAC,OAAO,EAAE,KAAK,CAAC,SAAS,CAAC,CAAC;YAChD,CAAC;QACH,CAAC;QAAC,OAAO,CAAC,EAAE,CAAC;YACX,OAAO,CAAC,IAAI,CAAC,mBAAmB,EAAE,KAAM,CAAW,CAAC,OAAO,EAAE,CAAC,CAAC;QACjE,CAAC;QAED,EAAE,CAAC,gBAAgB,CAAC,OAAO,EAAE,KAAK,CAAC,CAAC;QAEpC,OAAO,EAAE,CAAC;QACV,IAAI
,OAAO,GAAG,EAAE,KAAK,CAAC,EAAE,CAAC;YACvB,OAAO,CAAC,GAAG,CAAC,KAAK,OAAO,IAAI,WAAW,CAAC,MAAM,UAAU,CAAC,CAAC;QAC5D,CAAC;IACH,CAAC;IAED,qFAAqF;IACrF,8FAA8F;IAC9F,IAAI,mBAAmB,EAAE,CAAC;QACxB,GAAG,CAAC,YAAY,EAAE,CAAC;IACrB,CAAC;IAED,gGAAgG;IAChG,MAAM,YAAY,GAAG,IAAI,GAAG,CACzB,EAAE,CAAC,EAAE,CAAC,OAAO,CAAC,8CAA8C,CAAC,CAAC,GAAG,EAAmC;SAClG,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,SAAS,CAAC,CACzB,CAAC;IAEF,uEAAuE;IACvE,IAAI,SAAS,GAAG,CAAC,CAAC;IAClB,KAAK,MAAM,QAAQ,IAAI,YAAY,EAAE,CAAC;QACpC,IAAI,CAAC,EAAE,CAAC,cAAc,CAAC,QAAQ,CAAC,EAAE,CAAC;YACjC,EAAE,CAAC,kBAAkB,CAAC,QAAQ,CAAC,CAAC;YAChC,SAAS,EAAE,CAAC;QACd,CAAC;IACH,CAAC;IACD,OAAO,CAAC,GAAG,CAAC,aAAa,SAAS,sCAAsC,CAAC,CAAC;IAE1E,IAAI,CAAC;QACH,IAAI,QAAQ,IAAI,IAAI,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAChC,OAAO,CAAC,GAAG,CAAC,2BAA2B,CAAC,CAAC;YACzC,eAAe,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,UAAU,CAAC,CAAC;YACrC,OAAO,CAAC,GAAG,CAAC,wBAAwB,CAAC,CAAC;QACxC,CAAC;IACH,CAAC;YAAS,CAAC;QACT,OAAO,CAAC,GAAG,CAAC,SAAS,OAAO,uBAAuB,OAAO,uBAAuB,CAAC,CAAC;QACnF,EAAE,CAAC,KAAK,EAAE,CAAC;IACb,CAAC;AACH,CAAC;AAED,MAAM,eAAe,GAAG,IAAI,CAAC,OAAO,EAAE,EAAE,aAAa,EAAE,cAAc,CAAC,CAAC;AAYvE;;;;;;;;;;;;;GAaG;AACH,MAAM,UAAU,WAAW;IACzB,MAAM,OAAO,GAAG,OAAO,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;IACtC,MAAM,UAAU,GAAG,OAAO,CAAC,QAAQ,CAAC,eAAe,CAAC,CAAC;IACrD,MAAM,UAAU,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,IAAI,CAAC,CAAC,CAAC;IAC5D,MAAM,KAAK,GAAG,UAAU,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAC;IACtD,MAAM,OAAO,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAC;IAE3D,MAAM,MAAM,GAAG,UAAU,EAAE,CAAC;IAE5B,MAAM,aAAa,GAAG,GAAW,EAAE,CACjC,KAAK;QACL,OAAO,CAAC,GAAG,CAAC,kBAAkB;QAC9B,MAAM,CAAC,MAAM;QACb,eAAe,CAAC;IAElB,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACvB,OAAO,CAAC,GAAG,CAAC,uBAAuB,CAAC,CAAC;QACrC,OAAO;YACL,IAAI,EAAE,OAAO;YACb,MAAM,EAAE,aAAa,EAAE;YACvB,QAAQ,EAAE,CAAC,UAAU;YACrB,MAAM,EAAE,KAAK;YACb,MAAM;SACP,CAAC;IACJ,CAAC;IAED,MAAM,OAAO,GAAG,OAAO,CAAC,GAAG,CAAC,oBAAoB,CAAC;
IACjD,IAAI,OAAO,EAAE,CAAC;QACZ,MAAM,MAAM,GAAG,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;QACrE,IAAI,MAAM,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACtB,OAAO,CAAC,GAAG,CAAC,uBAAuB,CAAC,CAAC;YACrC,OAAO;gBACL,IAAI,EAAE,MAAM;gBACZ,MAAM,EAAE,aAAa,EAAE;gBACvB,QAAQ,EAAE,CAAC,UAAU;gBACrB,MAAM,EAAE,KAAK;gBACb,MAAM;aACP,CAAC;QACJ,CAAC;IACH,CAAC;IAED,IAAI,MAAM,CAAC,KAAK,IAAI,MAAM,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC5C,OAAO,CAAC,GAAG,CAAC,0BAA0B,CAAC,CAAC;QACxC,OAAO;YACL,IAAI,EAAE,MAAM,CAAC,KAAK;YAClB,MAAM,EAAE,aAAa,EAAE;YACvB,QAAQ,EAAE,CAAC,UAAU;YACrB,MAAM,EAAE,QAAQ;YAChB,MAAM;SACP,CAAC;IACJ,CAAC;IAED,OAAO,CAAC,GAAG,CAAC,4BAA4B,CAAC,CAAC;IAC1C,OAAO;QACL,IAAI,EAAE,CAAC,gBAAgB,CAAC;QACxB,MAAM,EAAE,aAAa,EAAE;QACvB,QAAQ,EAAE,CAAC,UAAU;QACrB,MAAM,EAAE,UAAU;QAClB,MAAM;KACP,CAAC;AACJ,CAAC;AAED,MAAM,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,QAAQ,CAAC,YAAY,CAAC,CAAC;AACvD,IAAI,MAAM,EAAE,CAAC;IACX,MAAM,EAAE,IAAI,EAAE,MAAM,EAAE,QAAQ,EAAE,MAAM,EAAE,GAAG,WAAW,EAAE,CAAC;IACzD,OAAO,CAAC,IAAI,EAAE,MAAM,EAAE,QAAQ,EAAE,MAAM,CAAC,cAAc,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC;AAC9E,CAAC"}
1
+ {"version":3,"file":"reindex.js","sourceRoot":"","sources":["../src/reindex.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,YAAY,EAAE,QAAQ,EAAE,MAAM,SAAS,CAAC;AACjD,OAAO,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,QAAQ,EAAE,MAAM,WAAW,CAAC;AAC9D,OAAO,EAAE,OAAO,EAAE,MAAM,SAAS,CAAC;AAClC,OAAO,EAAE,UAAU,EAAE,MAAM,aAAa,CAAC;AACzC,OAAO,EAAE,WAAW,EAAE,MAAM,SAAS,CAAC;AACtC,OAAO,EAAE,SAAS,EAAE,MAAM,aAAa,CAAC;AACxC,OAAO,EAAE,YAAY,EAAE,MAAM,oBAAoB,CAAC;AAClD,OAAO,EAAE,aAAa,EAAE,MAAM,eAAe,CAAC;AAC9C,OAAO,EAAE,aAAa,EAAuB,MAAM,aAAa,CAAC;AACjE,OAAO,EAAE,iBAAiB,EAAE,MAAM,mBAAmB,CAAC;AACtD,OAAO,EAAE,eAAe,EAAE,MAAM,uBAAuB,CAAC;AACxD,OAAO,EAAE,UAAU,EAAwB,MAAM,aAAa,CAAC;AAC/D,OAAO,EAAE,iBAAiB,EAAE,MAAM,aAAa,CAAC;AAChD,OAAO,EAAE,eAAe,EAAkB,MAAM,iBAAiB,CAAC;AAClE,MAAM,CAAC,KAAK,UAAU,OAAO,CAC3B,IAAc,EACd,MAAc,EACd,WAAoB,KAAK,EACzB,cAAyB;IAEzB,OAAO,CAAC,GAAG,CAAC,YAAY,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,OAAO,MAAM,EAAE,CAAC,CAAC;IAExD,MAAM,EAAE,GAAG,IAAI,WAAW,CAAC,MAAM,CAAC,CAAC;IACnC,MAAM,GAAG,GAAG,IAAI,SAAS,CAAC,EAAE,CAAC,CAAC;IAC9B,GAAG,CAAC,WAAW,EAAE,CAAC;IAClB,MAAM,GAAG,GAAG,IAAI,YAAY,CAAC,EAAE,CAAC,CAAC;IACjC,GAAG,CAAC,WAAW,EAAE,CAAC;IAElB,iDAAiD;IACjD,0EAA0E;IAC1E,6EAA6E;IAC7E,4EAA4E;IAC5E,wEAAwE;IACxE,MAAM,OAAO,GAAG,OAAO,CAAC,GAAG,CAAC,0BAA0B,CAAC;IACvD,MAAM,iBAAiB,GAAG,OAAO,KAAK,GAAG,IAAI,OAAO,KAAK,OAAO,CAAC;IACjE,IAAI,GAA0B,CAAC;IAC/B,IAAI,iBAAiB,EAAE,CAAC;QACtB,MAAM,MAAM,GAAG,OAAO,CAAC,GAAG,CAAC,aAAa,IAAI,uBAAuB,CAAC;QACpE,MAAM,SAAS,GAAG,eAAe,EAAE,CAAC;QACpC,MAAM,QAAQ,GAAG,MAAM,SAAS,CAAC,SAAS,EAAE,CAAC;QAC7C,IAAI,QAAQ,EAAE,CAAC;YACb,GAAG,GAAG,SAAS,CAAC;QAClB,CAAC;aAAM,CAAC;YACN,OAAO,CAAC,IAAI,CACV,+BAA+B,MAAM,8CAA8C,CACpF,CAAC;YACF,GAAG,GAAG,SAAS,CAAC;QAClB,CAAC;IACH,CAAC;IAED,8EAA8E;IAC9E,MAAM,cAAc,GAAG,GAAG,CAAC;IAC3B,MAAM,cAAc,GAAG,EAAE,CAAC,OAAO,CAAC,gBAAgB,CAAC,CAAC;IACpD,IAAI,mBAAmB,GAAG,KAAK,CAAC;IAChC,IAAI,cAAc,KAAK,cAAc,EAAE,CAAC;QACtC,OAAO,CAAC,GAAG,CAAC,uEAAuE,CAAC,CAAC;QACrF,EAAE,CAAC,gBAAgB,EAAE,CAAC;QACtB,EAAE,CAAC,OAAO,CAAC,gBAAgB,EAAE,cAAc,CAAC,CAAC;QAC7C,mBAAmB,GAAG,IAAI,CAAC;IAC7
B,CAAC;IAED,kCAAkC;IAClC,MAAM,WAAW,GAAa,EAAE,CAAC;IACjC,KAAK,MAAM,GAAG,IAAI,IAAI,EAAE,CAAC;QACvB,MAAM,OAAO,GAAG,iBAAiB,CAAC,GAAG,EAAE,cAAc,CAAC,CAAC;QACvD,MAAM,KAAK,GAAG,iBAAiB,CAAC,GAAG,EAAE,OAAO,CAAC,CAAC;QAC9C,OAAO,CAAC,GAAG,CAAC,KAAK,GAAG,KAAK,KAAK,CAAC,MAAM,QAAQ,CAAC,CAAC;QAC/C,WAAW,CAAC,IAAI,CAAC,GAAG,KAAK,CAAC,CAAC;IAC7B,CAAC;IACD,OAAO,CAAC,GAAG,CAAC,SAAS,WAAW,CAAC,MAAM,uBAAuB,CAAC,CAAC;IAEhE,MAAM,cAAc,GAAG,IAAI,GAAG,CAAC,WAAW,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IAEjE,6DAA6D;IAC7D,iFAAiF;IACjF,2EAA2E;IAC3E,2DAA2D;IAC3D,MAAM,WAAW,GAAG,EAAE,CAAC,eAAe,EAAE,CAAC;IACzC,IAAI,OAAO,GAAG,CAAC,CAAC;IAChB,KAAK,MAAM,UAAU,IAAI,WAAW,EAAE,CAAC;QACrC,IAAI,CAAC,cAAc,CAAC,GAAG,CAAC,UAAU,CAAC,EAAE,CAAC;YACpC,MAAM,EAAE,GAAG,QAAQ,CAAC,UAAU,EAAE,KAAK,CAAC,CAAC;YACvC,GAAG,CAAC,cAAc,CAAC,EAAE,CAAC,CAAC;YACvB,EAAE,CAAC,cAAc,CAAC,EAAE,CAAC,CAAC;YACtB,GAAG,CAAC,oBAAoB,CAAC,EAAE,CAAC,CAAC;YAC7B,gEAAgE;YAChE,GAAG,CAAC,eAAe,CAAC,EAAE,CAAC,CAAC;YACxB,EAAE,CAAC,gBAAgB,CAAC,UAAU,CAAC,CAAC;YAChC,OAAO,EAAE,CAAC;QACZ,CAAC;IACH,CAAC;IACD,IAAI,OAAO,GAAG,CAAC,EAAE,CAAC;QAChB,OAAO,CAAC,GAAG,CAAC,aAAa,OAAO,gBAAgB,CAAC,CAAC;IACpD,CAAC;IAED,yCAAyC;IACzC,MAAM,UAAU,GAAqB,EAAE,CAAC;IACxC,IAAI,OAAO,GAAG,CAAC,CAAC;IAChB,IAAI,OAAO,GAAG,CAAC,CAAC;IAChB,IAAI,WAAW,GAAG,CAAC,CAAC;IACpB,KAAK,MAAM,QAAQ,IAAI,WAAW,EAAE,CAAC;QACnC,MAAM,OAAO,GAAG,OAAO,CAAC,QAAQ,CAAC,CAAC;QAClC,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC,OAAO,CAAC,CAAC;QAEpD,8CAA8C;QAC9C,MAAM,UAAU,GAAG,EAAE,CAAC,aAAa,CAAC,OAAO,CAAC,CAAC;QAC7C,IAAI,UAAU,IAAI,UAAU,CAAC,KAAK,KAAK,KAAK,EAAE,CAAC;YAC7C,OAAO,EAAE,CAAC;YACV,SAAS;QACX,CAAC;QAED,MAAM,GAAG,GAAG,YAAY,CAAC,QAAQ,EAAE,OAAO,CAAC,CAAC;QAC5C,MAAM,SAAS,GAAG,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,OAAO,CAAC,UAAU,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;QACjE,MAAM,OAAO,GAAG,SAAS;YACvB,CAAC,CAAC,QAAQ,CAAC,OAAO,CAAC,SAAS,EAAE,IAAI,CAAC,EAAE,OAAO,CAAC;YAC7C,CAAC,CAAC,QAAQ,CAAC;QACb,MAAM,EAAE,GAAG,QAAQ,CAAC,QAAQ,EAAE,KAAK,CAAC,CAAC;QAErC,MAAM,MAAM,GAAG,aAAa,CA
AC,EAAE,EAAE,OAAO,EAAE,GAAG,CAAC,CAAC;QAC/C,UAAU,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;QAExB,MAAM,OAAO,GAAa,EAAE,CAAC;QAC7B,IAAI,CAAC,MAAM,CAAC,IAAI;YAAE,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;QACvC,IAAI,CAAC,MAAM,CAAC,IAAI;YAAE,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;QACvC,IAAI,CAAC,MAAM,CAAC,MAAM;YAAE,OAAO,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;QAC3C,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACvB,OAAO,CAAC,IAAI,CAAC,cAAc,EAAE,yBAAyB,OAAO,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;QAC9E,CAAC;QAED,uEAAuE;QACvE,IAAI,EAAE,CAAC,cAAc,CAAC,MAAM,CAAC,EAAE,CAAC,EAAE,CAAC;YACjC,GAAG,CAAC,cAAc,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC;QAChC,CAAC;QAED,EAAE,CAAC,cAAc,CAAC;YAChB,EAAE,EAAE,MAAM,CAAC,EAAE;YACb,IAAI,EAAE,MAAM,CAAC,IAAI;YACjB,KAAK,EAAE,MAAM,CAAC,KAAK;YACnB,IAAI,EAAE,MAAM,CAAC,IAAI;YACjB,IAAI,EAAE,MAAM,CAAC,IAAI;YACjB,MAAM,EAAE,MAAM,CAAC,MAAM;YACrB,WAAW,EAAE,MAAM,CAAC,WAAW;YAC/B,OAAO,EAAE,MAAM,CAAC,OAAO;SACxB,CAAC,CAAC;QAEH,oCAAoC;QACpC,GAAG,CAAC,cAAc,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC;QAE9B,IAAI,MAAM,CAAC,IAAI,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC3B,EAAE,CAAC,OAAO,CAAC,MAAM,CAAC,EAAE,EAAE,MAAM,CAAC,IAAI,CAAC,CAAC;QACrC,CAAC;QAED,4EAA4E;QAC5E,EAAE,CAAC,EAAE,CAAC,OAAO,CAAC,+CAA+C,CAAC,CAAC,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC;QAE9E,6EAA6E;QAC7E,gFAAgF;QAChF,8EAA8E;QAC9E,8CAA8C;QAC9C,KAAK,MAAM,GAAG,IAAI,MAAM,CAAC,aAAa,EAAE,CAAC;YACvC,EAAE,CAAC,kBAAkB,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC;YACpC,EAAE,CAAC,eAAe,CAAC,GAAG,CAAC,QAAQ,EAAE,GAAG,CAAC,QAAQ,EAAE,GAAG,CAAC,IAAI,CAAC,CAAC;QAC3D,CAAC;QAED,KAAK,MAAM,IAAI,IAAI,MAAM,CAAC,YAAY,EAAE,CAAC;YACvC,EAAE,CAAC,kBAAkB,CAAC,IAAI,CAAC,QAAQ,CAAC,CAAC;YACrC,EAAE,CAAC,eAAe,CAAC,IAAI,CAAC,QAAQ,EAAE,IAAI,CAAC,QAAQ,EAAE,SAAS,EAAE,IAAI,CAAC,OAAO,CAAC,CAAC;QAC5E,CAAC;QAED,wEAAwE;QACxE,uEAAuE;QACvE,sEAAsE;QACtE,oEAAoE;QACpE,2EAA2E;QAC3E,sEAAsE;QACtE,6DAA6D;QAC7D,EAAE;QACF,2EAA2E;QAC3E,wEAAwE;QACxE,oEAAoE;QACpE,MAAM,WAAW,GAAG,UAAU,CAAC,QAAQ,CAAC,CAAC,MAAM,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;QAC3F,MAAM,OAAO,GAAG,gBAAgB,MAAM,CAAC,EAAE,EAAE,CAAC;QA
C5C,MAAM,SAAS,GAAG,EAAE,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;QAEtC,IAAI,cAA+C,CAAC;QACpD,IAAI,GAAG,IAAI,SAAS,KAAK,WAAW,EAAE,CAAC;YACrC,MAAM,WAAW,GAAG,EAAE,CAAC,EAAE;iBACtB,OAAO,CACN,2FAA2F,CAC5F;iBACA,GAAG,CAAC,MAAM,CAAC,EAAE,CAA2D,CAAC;YAC5E,IAAI,WAAW,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBAC3B,cAAc,GAAG,IAAI,GAAG,CACtB,WAAW,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,WAAW,EAAE,CAAC,CAAC,cAAc,IAAI,EAAE,CAAqB,CAAC,CAClF,CAAC;YACJ,CAAC;QACH,CAAC;QAED,uEAAuE;QACvE,0EAA0E;QAC1E,qCAAqC;QACrC,EAAE;QACF,oEAAoE;QACpE,0EAA0E;QAC1E,0EAA0E;QAC1E,EAAE,CAAC,EAAE,CAAC,OAAO,CAAC,0CAA0C,CAAC,CAAC,GAAG,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC;QACzE,GAAG,CAAC,oBAAoB,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC;QACpC,gEAAgE;QAChE,GAAG,CAAC,eAAe,CAAC,MAAM,CAAC,EAAE,CAAC,CAAC;QAE/B,IAAI,CAAC;YACH,MAAM,MAAM,GAAG,MAAM,aAAa,CAAC,MAAM,CAAC,KAAK,EAAE,MAAM,CAAC,IAAI,EAAE,MAAM,CAAC,OAAO,EAAE;gBAC5E,GAAG;gBACH,cAAc;aACf,CAAC,CAAC;YACH,MAAM,WAAW,GAAG,EAAE,CAAC,EAAE,CAAC,OAAO,CAC/B,+HAA+H,CAChI,CAAC;YACF,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;gBAC3B,MAAM,OAAO,GAAG,GAAG,MAAM,CAAC,EAAE,KAAK,KAAK,CAAC,KAAK,EAAE,CAAC;gBAC/C,WAAW,CAAC,GAAG,CACb,OAAO,EACP,MAAM,CAAC,EAAE,EACT,KAAK,CAAC,KAAK,EACX,KAAK,CAAC,OAAO,EACb,KAAK,CAAC,SAAS,EACf,KAAK,CAAC,OAAO,EACb,KAAK,CAAC,aAAa,IAAI,EAAE,CAC1B,CAAC;gBACF,GAAG,CAAC,eAAe,CAAC,OAAO,EAAE,KAAK,CAAC,SAAS,CAAC,CAAC;gBAC9C,WAAW,EAAE,CAAC;gBACd,IAAI,WAAW,GAAG,EAAE,KAAK,CAAC,EAAE,CAAC;oBAC3B,OAAO,CAAC,GAAG,CAAC,KAAK,WAAW,kBAAkB,CAAC,CAAC;gBAClD,CAAC;YACH,CAAC;YACD,4DAA4D;YAC5D,EAAE,CAAC,OAAO,CAAC,OAAO,EAAE,WAAW,CAAC,CAAC;QACnC,CAAC;QAAC,OAAO,CAAC,EAAE,CAAC;YACX,OAAO,CAAC,IAAI,CAAC,mBAAmB,EAAE,KAAM,CAAW,CAAC,OAAO,EAAE,CAAC,CAAC;QACjE,CAAC;QAED,EAAE,CAAC,gBAAgB,CAAC,OAAO,EAAE,KAAK,CAAC,CAAC;QAEpC,OAAO,EAAE,CAAC;QACV,IAAI,OAAO,GAAG,EAAE,KAAK,CAAC,EAAE,CAAC;YACvB,OAAO,CAAC,GAAG,CAAC,KAAK,OAAO,IAAI,WAAW,CAAC,MAAM,UAAU,CAAC,CAAC;QAC5D,CAAC;IACH,CAAC;IAED,qFAAqF;IACrF,8FAA8F;IAC9F,IAAI,mBAAmB,EAAE,CAAC;QACxB,GAAG,CAAC,YAAY,EAAE,CAAC;IACrB,CAAC;IAED,gGAAgG;IAChG,MAAM,YAAY,GAAG,IAAI,GAAG,CACzB,EAAE,CAAC,EAAE,CAAC,OAA
O,CAAC,8CAA8C,CAAC,CAAC,GAAG,EAAmC;SAClG,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,SAAS,CAAC,CACzB,CAAC;IAEF,uEAAuE;IACvE,IAAI,SAAS,GAAG,CAAC,CAAC;IAClB,KAAK,MAAM,QAAQ,IAAI,YAAY,EAAE,CAAC;QACpC,IAAI,CAAC,EAAE,CAAC,cAAc,CAAC,QAAQ,CAAC,EAAE,CAAC;YACjC,EAAE,CAAC,kBAAkB,CAAC,QAAQ,CAAC,CAAC;YAChC,SAAS,EAAE,CAAC;QACd,CAAC;IACH,CAAC;IACD,OAAO,CAAC,GAAG,CAAC,aAAa,SAAS,sCAAsC,CAAC,CAAC;IAE1E,IAAI,CAAC;QACH,IAAI,QAAQ,IAAI,IAAI,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAChC,OAAO,CAAC,GAAG,CAAC,2BAA2B,CAAC,CAAC;YACzC,eAAe,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,UAAU,CAAC,CAAC;YACrC,OAAO,CAAC,GAAG,CAAC,wBAAwB,CAAC,CAAC;QACxC,CAAC;IACH,CAAC;YAAS,CAAC;QACT,OAAO,CAAC,GAAG,CAAC,SAAS,OAAO,uBAAuB,OAAO,uBAAuB,CAAC,CAAC;QACnF,EAAE,CAAC,KAAK,EAAE,CAAC;IACb,CAAC;AACH,CAAC;AAED,MAAM,eAAe,GAAG,IAAI,CAAC,OAAO,EAAE,EAAE,aAAa,EAAE,cAAc,CAAC,CAAC;AAYvE;;;;;;;;;;;;;GAaG;AACH,MAAM,UAAU,WAAW;IACzB,MAAM,OAAO,GAAG,OAAO,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC;IACtC,MAAM,UAAU,GAAG,OAAO,CAAC,QAAQ,CAAC,eAAe,CAAC,CAAC;IACrD,MAAM,UAAU,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,UAAU,CAAC,IAAI,CAAC,CAAC,CAAC;IAC5D,MAAM,KAAK,GAAG,UAAU,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAC;IACtD,MAAM,OAAO,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAC;IAE3D,MAAM,MAAM,GAAG,UAAU,EAAE,CAAC;IAE5B,MAAM,aAAa,GAAG,GAAW,EAAE,CACjC,KAAK;QACL,OAAO,CAAC,GAAG,CAAC,kBAAkB;QAC9B,MAAM,CAAC,MAAM;QACb,eAAe,CAAC;IAElB,IAAI,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACvB,OAAO,CAAC,GAAG,CAAC,uBAAuB,CAAC,CAAC;QACrC,OAAO;YACL,IAAI,EAAE,OAAO;YACb,MAAM,EAAE,aAAa,EAAE;YACvB,QAAQ,EAAE,CAAC,UAAU;YACrB,MAAM,EAAE,KAAK;YACb,MAAM;SACP,CAAC;IACJ,CAAC;IAED,MAAM,OAAO,GAAG,OAAO,CAAC,GAAG,CAAC,oBAAoB,CAAC;IACjD,IAAI,OAAO,EAAE,CAAC;QACZ,MAAM,MAAM,GAAG,OAAO,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC;QACrE,IAAI,MAAM,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACtB,OAAO,CAAC,GAAG,CAAC,uBAAuB,CAAC,CAAC;YACrC,OAAO;gBACL,IAAI,EAAE,MAAM;gBACZ,MA
AM,EAAE,aAAa,EAAE;gBACvB,QAAQ,EAAE,CAAC,UAAU;gBACrB,MAAM,EAAE,KAAK;gBACb,MAAM;aACP,CAAC;QACJ,CAAC;IACH,CAAC;IAED,IAAI,MAAM,CAAC,KAAK,IAAI,MAAM,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC5C,OAAO,CAAC,GAAG,CAAC,0BAA0B,CAAC,CAAC;QACxC,OAAO;YACL,IAAI,EAAE,MAAM,CAAC,KAAK;YAClB,MAAM,EAAE,aAAa,EAAE;YACvB,QAAQ,EAAE,CAAC,UAAU;YACrB,MAAM,EAAE,QAAQ;YAChB,MAAM;SACP,CAAC;IACJ,CAAC;IAED,OAAO,CAAC,GAAG,CAAC,4BAA4B,CAAC,CAAC;IAC1C,OAAO;QACL,IAAI,EAAE,CAAC,gBAAgB,CAAC;QACxB,MAAM,EAAE,aAAa,EAAE;QACvB,QAAQ,EAAE,CAAC,UAAU;QACrB,MAAM,EAAE,UAAU;QAClB,MAAM;KACP,CAAC;AACJ,CAAC;AAED,MAAM,MAAM,GAAG,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,QAAQ,CAAC,YAAY,CAAC,CAAC;AACvD,IAAI,MAAM,EAAE,CAAC;IACX,MAAM,EAAE,IAAI,EAAE,MAAM,EAAE,QAAQ,EAAE,MAAM,EAAE,GAAG,WAAW,EAAE,CAAC;IACzD,OAAO,CAAC,IAAI,EAAE,MAAM,EAAE,QAAQ,EAAE,MAAM,CAAC,cAAc,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC;AAC9E,CAAC"}
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "ralph-hero-knowledge-index",
3
- "version": "0.1.25",
3
+ "version": "0.1.26",
4
4
  "type": "module",
5
5
  "main": "dist/index.js",
6
6
  "bin": {
@@ -15,6 +15,14 @@ vi.mock("@huggingface/transformers", () => {
15
15
  });
16
16
 
17
17
  import { prepareTextForEmbedding, embedDocument } from "../embedder.js";
18
+ import type { LlmClient } from "../llm-client.js";
19
+
20
+ function makeMockLlm(contextualize: LlmClient["contextualize"]): LlmClient {
21
+ return {
22
+ available: vi.fn(async () => true),
23
+ contextualize: vi.fn(contextualize),
24
+ };
25
+ }
18
26
 
19
27
  describe("prepareTextForEmbedding", () => {
20
28
  it("includes title, tags, and first paragraph", () => {
@@ -196,4 +204,121 @@ describe("embedDocument", () => {
196
204
  // With chunkSize=100 over 500 chars, we expect multiple chunks.
197
205
  expect(result.length).toBeGreaterThan(1);
198
206
  });
207
+
208
+ // Phase 6 (GH-767): Contextual Retrieval integration.
209
+ describe("contextual retrieval", () => {
210
+ it("calls llm.contextualize once per chunk when llm is provided", async () => {
211
+ const content = "A".repeat(500);
212
+ const mockLlm = makeMockLlm(async () => "CTX");
213
+ const result = await embedDocument("T", [], content, {
214
+ llm: mockLlm,
215
+ chunkSize: 100,
216
+ chunkOverlap: 10,
217
+ });
218
+ expect(result.length).toBeGreaterThan(1);
219
+ expect(mockLlm.contextualize).toHaveBeenCalledTimes(result.length);
220
+ });
221
+
222
+ it("stores non-empty contextPrefix on every returned chunk", async () => {
223
+ const mockLlm = makeMockLlm(async () => "THIS IS CONTEXT");
224
+ const result = await embedDocument("Title", ["tag"], "body text", { llm: mockLlm });
225
+ expect(result).toHaveLength(1);
226
+ expect(result[0]!.contextPrefix).toBe("THIS IS CONTEXT");
227
+ // Embed text prepends contextPrefix ahead of title/tags/content.
228
+ expect(embedCalls[0]).toBe("THIS IS CONTEXT\nTitle\ntag\nbody text");
229
+ });
230
+
231
+ it("passes the full document (not the chunk) as the first contextualize arg", async () => {
232
+ const longContent = "A".repeat(500);
233
+ const mockLlm = makeMockLlm(async () => "CTX");
234
+ await embedDocument("T", [], longContent, {
235
+ llm: mockLlm,
236
+ chunkSize: 100,
237
+ chunkOverlap: 10,
238
+ });
239
+ const calls = (mockLlm.contextualize as ReturnType<typeof vi.fn>).mock.calls;
240
+ expect(calls.length).toBeGreaterThan(1);
241
+ for (const [fullDoc] of calls) {
242
+ expect(fullDoc).toBe(longContent);
243
+ }
244
+ });
245
+
246
+ it("fail-open (empty string) yields contextPrefix: '' and omits leading blank line", async () => {
247
+ const mockLlm = makeMockLlm(async () => "");
248
+ const result = await embedDocument("Title", ["tag"], "body text", { llm: mockLlm });
249
+ expect(result[0]!.contextPrefix).toBe("");
250
+ // No leading blank line from an empty contextPrefix — falls back to the
251
+ // no-context embed shape.
252
+ expect(embedCalls[0]).toBe("Title\ntag\nbody text");
253
+ expect(embedCalls[0]!.startsWith("\n")).toBe(false);
254
+ });
255
+
256
+ it("uses cachedPrefixes entry for a chunk and skips llm.contextualize for that index", async () => {
257
+ const content = "A".repeat(500);
258
+ const mockLlm = makeMockLlm(async () => "LIVE CTX");
259
+ // First run to discover the chunk layout.
260
+ const baseline = await embedDocument("T", [], content, {
261
+ llm: mockLlm,
262
+ chunkSize: 100,
263
+ chunkOverlap: 10,
264
+ });
265
+ const chunkCount = baseline.length;
266
+ expect(chunkCount).toBeGreaterThan(1);
267
+
268
+ (mockLlm.contextualize as ReturnType<typeof vi.fn>).mockClear();
269
+ embedCalls.length = 0;
270
+
271
+ // Cache all but the last chunk index.
272
+ const cached = new Map<number, string>();
273
+ for (let i = 0; i < chunkCount - 1; i++) {
274
+ cached.set(i, `CACHED-${i}`);
275
+ }
276
+
277
+ const result = await embedDocument("T", [], content, {
278
+ llm: mockLlm,
279
+ cachedPrefixes: cached,
280
+ chunkSize: 100,
281
+ chunkOverlap: 10,
282
+ });
283
+
284
+ expect(result).toHaveLength(chunkCount);
285
+ // Only the last (uncached) chunk triggered a live LLM call.
286
+ expect(mockLlm.contextualize).toHaveBeenCalledTimes(1);
287
+ // Cached chunks preserve the cached prefix verbatim.
288
+ for (let i = 0; i < chunkCount - 1; i++) {
289
+ expect(result[i]!.contextPrefix).toBe(`CACHED-${i}`);
290
+ }
291
+ expect(result[chunkCount - 1]!.contextPrefix).toBe("LIVE CTX");
292
+ });
293
+
294
+ it("with no llm, contextPrefix is '' on every chunk and no LLM calls occur", async () => {
295
+ const content = "A".repeat(500);
296
+ const mockLlm = makeMockLlm(async () => "SHOULD NOT CALL");
297
+ // Note: do NOT pass `llm` into opts — this is the flag-off path.
298
+ const result = await embedDocument("T", [], content, {
299
+ chunkSize: 100,
300
+ chunkOverlap: 10,
301
+ });
302
+ expect(result.length).toBeGreaterThan(1);
303
+ expect(mockLlm.contextualize).not.toHaveBeenCalled();
304
+ for (const chunk of result) {
305
+ expect(chunk.contextPrefix).toBe("");
306
+ }
307
+ // Embed text uses the no-context shape.
308
+ expect(embedCalls[0]!.startsWith("T\n")).toBe(true);
309
+ });
310
+
311
+ it("cachedPrefixes without llm has no effect (no LLM, caching is moot)", async () => {
312
+ const content = "short content";
313
+ const mockLlm = makeMockLlm(async () => "LIVE");
314
+ const cached = new Map<number, string>([[0, "CACHED"]]);
315
+ const result = await embedDocument("T", [], content, {
316
+ cachedPrefixes: cached,
317
+ });
318
+ // Without llm, no contextualize calls happen and no cached prefix is applied
319
+ // (since caching is only a fast-path on the LLM branch).
320
+ expect(mockLlm.contextualize).not.toHaveBeenCalled();
321
+ expect(result[0]!.contextPrefix).toBe("");
322
+ });
323
+ });
199
324
  });
@@ -1,4 +1,4 @@
1
- import { describe, it, expect, vi, beforeEach } from "vitest";
1
+ import { describe, it, expect, vi, beforeEach, afterEach } from "vitest";
2
2
  import { mkdtempSync, writeFileSync, mkdirSync, unlinkSync, utimesSync } from "node:fs";
3
3
  import { join, resolve } from "node:path";
4
4
  import { tmpdir } from "node:os";
@@ -7,24 +7,46 @@ import { FtsSearch } from "../search.js";
7
7
  import { VectorSearch } from "../vector-search.js";
8
8
 
9
9
  // Mock embedder so we don't load the real transformer model during tests.
10
- // embedDocument returns one DocumentChunk per call with a constant 384-dim
11
- // embedding; this matches the new chunk-aware reindex flow.
10
+ // embedDocument honors the Phase 6 `opts.llm` contract: when present it calls
11
+ // `llm.contextualize(fullDoc, chunk.content)` and surfaces the returned string
12
+ // on `DocumentChunk.contextPrefix` (empty on fail-open). When `cachedPrefixes`
13
+ // is provided and has a value for a chunk's index, the live LLM call is skipped.
12
14
  vi.mock("../embedder.js", async () => {
13
15
  // Import the real chunker so the mock chunks content the same way as prod.
14
16
  const { chunkText } = await import("../chunker.js");
17
+ type LlmLike = { contextualize: (fullDoc: string, chunk: string) => Promise<string> };
18
+ type EmbedOpts = { llm?: LlmLike; cachedPrefixes?: Map<number, string> };
15
19
  return {
16
20
  embed: vi.fn(async () => new Float32Array(384)),
17
- embedDocument: vi.fn(async (_title: string, _tags: string[], content: string) => {
21
+ embedDocument: vi.fn(async (
22
+ _title: string,
23
+ _tags: string[],
24
+ content: string,
25
+ opts?: EmbedOpts,
26
+ ) => {
18
27
  const chunks = content.length === 0
19
28
  ? [{ index: 0, content: "", charStart: 0, charEnd: 0 }]
20
29
  : chunkText(content);
21
- return chunks.map(c => ({
22
- index: c.index,
23
- content: c.content,
24
- charStart: c.charStart,
25
- charEnd: c.charEnd,
26
- embedding: new Float32Array(384),
27
- }));
30
+ const out = [];
31
+ for (const c of chunks) {
32
+ let contextPrefix = "";
33
+ if (opts?.llm) {
34
+ if (opts.cachedPrefixes && opts.cachedPrefixes.has(c.index)) {
35
+ contextPrefix = opts.cachedPrefixes.get(c.index) ?? "";
36
+ } else {
37
+ contextPrefix = await opts.llm.contextualize(content, c.content);
38
+ }
39
+ }
40
+ out.push({
41
+ index: c.index,
42
+ content: c.content,
43
+ charStart: c.charStart,
44
+ charEnd: c.charEnd,
45
+ embedding: new Float32Array(384),
46
+ contextPrefix,
47
+ });
48
+ }
49
+ return out;
28
50
  }),
29
51
  prepareTextForEmbedding: vi.fn((title: string, tags: string[], content: string) => {
30
52
  const tagLine = tags.length > 0 ? tags.join(", ") : "";
@@ -34,6 +56,17 @@ vi.mock("../embedder.js", async () => {
34
56
  };
35
57
  });
36
58
 
59
+ // Mock the LLM client so tests can deterministically control availability and
60
+ // contextualize() return values without touching the network.
61
+ const mockLlmAvailable = vi.fn(async () => true);
62
+ const mockLlmContextualize = vi.fn(async (_fullDoc: string, _chunk: string) => "");
63
+ vi.mock("../llm-client.js", () => ({
64
+ createLlmClient: vi.fn(() => ({
65
+ available: mockLlmAvailable,
66
+ contextualize: mockLlmContextualize,
67
+ })),
68
+ }));
69
+
37
70
  import { embedDocument } from "../embedder.js";
38
71
  import { reindex } from "../reindex.js";
39
72
  import { KnowledgeDB } from "../db.js";
@@ -77,13 +110,32 @@ describe("findMarkdownFiles", () => {
77
110
  describe("incremental reindex", () => {
78
111
  let dir: string;
79
112
  let dbPath: string;
113
+ const originalFlag = process.env.RALPH_CONTEXTUAL_RETRIEVAL;
80
114
 
81
115
  beforeEach(() => {
82
116
  mockedEmbed.mockClear();
117
+ // Reset LLM mocks to defaults: available returns true, contextualize returns "".
118
+ // Individual tests override these before calling `reindex(...)`.
119
+ mockLlmAvailable.mockReset();
120
+ mockLlmAvailable.mockResolvedValue(true);
121
+ mockLlmContextualize.mockReset();
122
+ mockLlmContextualize.mockResolvedValue("");
123
+ // Default the flag to disabled for legacy tests so the existing 17 scenarios
124
+ // don't accidentally call the mocked LLM — the Phase 6 tests opt back in
125
+ // explicitly via `process.env.RALPH_CONTEXTUAL_RETRIEVAL = "1"`.
126
+ process.env.RALPH_CONTEXTUAL_RETRIEVAL = "0";
83
127
  dir = mkdtempSync(join(tmpdir(), "knowledge-reindex-"));
84
128
  dbPath = join(dir, "test.db");
85
129
  });
86
130
 
131
+ afterEach(() => {
132
+ if (originalFlag === undefined) {
133
+ delete process.env.RALPH_CONTEXTUAL_RETRIEVAL;
134
+ } else {
135
+ process.env.RALPH_CONTEXTUAL_RETRIEVAL = originalFlag;
136
+ }
137
+ });
138
+
87
139
  it("scenario 1: unchanged files are skipped on second run", async () => {
88
140
  writeFileSync(join(dir, "doc-a.md"), makeDoc("Doc A"));
89
141
  writeFileSync(join(dir, "doc-b.md"), makeDoc("Doc B"));
@@ -529,4 +581,157 @@ describe("incremental reindex", () => {
529
581
  expect(vecCount).toBe(secondCount);
530
582
  db2.close();
531
583
  });
584
+
585
+ // ---- Phase 6 (GH-767): Contextual Retrieval wiring ----
586
+
587
+ it("scenario 18: RALPH_CONTEXTUAL_RETRIEVAL=0 skips LLM entirely", async () => {
588
+ process.env.RALPH_CONTEXTUAL_RETRIEVAL = "0";
589
+ mockLlmContextualize.mockResolvedValue("SHOULD NOT APPEAR");
590
+
591
+ writeFileSync(join(dir, "doc-a.md"), makeDoc("Doc A"));
592
+
593
+ await reindex([dir], dbPath);
594
+
595
+ // Zero LLM activity when the flag is off.
596
+ expect(mockLlmAvailable).not.toHaveBeenCalled();
597
+ expect(mockLlmContextualize).not.toHaveBeenCalled();
598
+
599
+ // All chunks should have empty context_prefix.
600
+ const db = new KnowledgeDB(dbPath);
601
+ const rows = db.db
602
+ .prepare("SELECT context_prefix FROM chunks WHERE document_id = ?")
603
+ .all("doc-a") as Array<{ context_prefix: string }>;
604
+ expect(rows.length).toBeGreaterThan(0);
605
+ for (const r of rows) {
606
+ expect(r.context_prefix).toBe("");
607
+ }
608
+ db.close();
609
+ });
610
+
611
+ it("scenario 19: flag on + LLM unreachable -> empty context_prefix + single warning", async () => {
612
+ process.env.RALPH_CONTEXTUAL_RETRIEVAL = "1";
613
+ mockLlmAvailable.mockResolvedValue(false);
614
+ // contextualize should never be called because available() returned false.
615
+ mockLlmContextualize.mockResolvedValue("UNREACHED");
616
+
617
+ const warnSpy = vi.spyOn(console, "warn").mockImplementation(() => {});
618
+
619
+ try {
620
+ writeFileSync(join(dir, "doc-a.md"), makeDoc("Doc A"));
621
+ writeFileSync(join(dir, "doc-b.md"), makeDoc("Doc B"));
622
+
623
+ await reindex([dir], dbPath);
624
+
625
+ // available() probed exactly once per reindex call.
626
+ expect(mockLlmAvailable).toHaveBeenCalledTimes(1);
627
+ // contextualize() never invoked on the fail-open path.
628
+ expect(mockLlmContextualize).not.toHaveBeenCalled();
629
+
630
+ // Exactly one "unreachable" warning (other warnings like frontmatter are allowed).
631
+ const unreachableWarnings = warnSpy.mock.calls.filter(args =>
632
+ args.some(a => typeof a === "string" && /LLM endpoint unreachable/.test(a)),
633
+ );
634
+ expect(unreachableWarnings).toHaveLength(1);
635
+
636
+ const db = new KnowledgeDB(dbPath);
637
+ const rows = db.db
638
+ .prepare("SELECT context_prefix FROM chunks")
639
+ .all() as Array<{ context_prefix: string }>;
640
+ expect(rows.length).toBeGreaterThan(0);
641
+ for (const r of rows) {
642
+ expect(r.context_prefix).toBe("");
643
+ }
644
+ db.close();
645
+ } finally {
646
+ warnSpy.mockRestore();
647
+ }
648
+ });
649
+
650
+ it("scenario 20: flag on + reachable LLM -> non-empty context_prefix persisted", async () => {
651
+ process.env.RALPH_CONTEXTUAL_RETRIEVAL = "1";
652
+ mockLlmAvailable.mockResolvedValue(true);
653
+ mockLlmContextualize.mockResolvedValue("GENERATED CONTEXT");
654
+
655
+ writeFileSync(join(dir, "doc-a.md"), makeDoc("Doc A"));
656
+
657
+ await reindex([dir], dbPath);
658
+
659
+ expect(mockLlmAvailable).toHaveBeenCalledTimes(1);
660
+ expect(mockLlmContextualize).toHaveBeenCalled();
661
+
662
+ const db = new KnowledgeDB(dbPath);
663
+ const rows = db.db
664
+ .prepare("SELECT context_prefix FROM chunks WHERE document_id = ?")
665
+ .all("doc-a") as Array<{ context_prefix: string }>;
666
+ expect(rows.length).toBeGreaterThan(0);
667
+ for (const r of rows) {
668
+ expect(r.context_prefix).toBe("GENERATED CONTEXT");
669
+ }
670
+ db.close();
671
+ });
672
+
673
+ it("scenario 21: flag defaults on (undefined env) and probes LLM", async () => {
674
+ delete process.env.RALPH_CONTEXTUAL_RETRIEVAL;
675
+ mockLlmAvailable.mockResolvedValue(true);
676
+ mockLlmContextualize.mockResolvedValue("DEFAULT ON");
677
+
678
+ writeFileSync(join(dir, "doc-a.md"), makeDoc("Doc A"));
679
+
680
+ await reindex([dir], dbPath);
681
+
682
+ // available() probed because flag was not "0" / "false".
683
+ expect(mockLlmAvailable).toHaveBeenCalledTimes(1);
684
+ expect(mockLlmContextualize).toHaveBeenCalled();
685
+ });
686
+
687
+ it("scenario 22: 'false' also disables contextual retrieval", async () => {
688
+ process.env.RALPH_CONTEXTUAL_RETRIEVAL = "false";
689
+ mockLlmContextualize.mockResolvedValue("SHOULD NOT APPEAR");
690
+
691
+ writeFileSync(join(dir, "doc-a.md"), makeDoc("Doc A"));
692
+
693
+ await reindex([dir], dbPath);
694
+
695
+ expect(mockLlmAvailable).not.toHaveBeenCalled();
696
+ expect(mockLlmContextualize).not.toHaveBeenCalled();
697
+ });
698
+
699
+ it("scenario 23: re-running with unchanged content reuses cached context_prefix (no new LLM calls)", async () => {
700
+ process.env.RALPH_CONTEXTUAL_RETRIEVAL = "1";
701
+ mockLlmAvailable.mockResolvedValue(true);
702
+ mockLlmContextualize.mockResolvedValue("INITIAL CTX");
703
+
704
+ const filePath = join(dir, "doc-a.md");
705
+ writeFileSync(filePath, makeDoc("Doc A"));
706
+
707
+ await reindex([dir], dbPath);
708
+ const firstCallCount = mockLlmContextualize.mock.calls.length;
709
+ expect(firstCallCount).toBeGreaterThan(0);
710
+
711
+ // Bump mtime without changing content — this defeats the outer mtime skip
712
+ // and forces the inner content-hash cache check to fire.
713
+ const future = Date.now() / 1000 + 2;
714
+ utimesSync(filePath, future, future);
715
+
716
+ mockLlmContextualize.mockClear();
717
+ // Swap the mock return so we can prove cached prefixes were reused: if the
718
+ // cache missed and a live call happened, the new return value would show up
719
+ // in the DB.
720
+ mockLlmContextualize.mockResolvedValue("LIVE (SHOULD NOT OCCUR)");
721
+
722
+ await reindex([dir], dbPath);
723
+
724
+ // Zero fresh calls because content hash matched the meta cache.
725
+ expect(mockLlmContextualize).not.toHaveBeenCalled();
726
+
727
+ const db = new KnowledgeDB(dbPath);
728
+ const rows = db.db
729
+ .prepare("SELECT context_prefix FROM chunks WHERE document_id = ?")
730
+ .all("doc-a") as Array<{ context_prefix: string }>;
731
+ expect(rows.length).toBeGreaterThan(0);
732
+ for (const r of rows) {
733
+ expect(r.context_prefix).toBe("INITIAL CTX");
734
+ }
735
+ db.close();
736
+ });
532
737
  });
package/src/embedder.ts CHANGED
@@ -3,6 +3,7 @@ import {
3
3
  type FeatureExtractionPipeline,
4
4
  } from "@huggingface/transformers";
5
5
  import { chunkText, type Chunk, type ChunkerOptions } from "./chunker.js";
6
+ import type { LlmClient } from "./llm-client.js";
6
7
 
7
8
  const MODEL_ID = "Xenova/all-MiniLM-L6-v2";
8
9
 
@@ -39,6 +40,25 @@ export interface DocumentChunk extends Chunk {
39
40
  contextPrefix?: string;
40
41
  }
41
42
 
43
+ /**
44
+ * Options accepted by `embedDocument`. Extends `ChunkerOptions` with optional
45
+ * Contextual Retrieval inputs:
46
+ *
47
+ * - `llm`: when present, each chunk is run through `llm.contextualize(fullDoc, chunkContent)`
48
+ * and the returned string is prepended to the embed text (and persisted on the
49
+ * resulting `DocumentChunk.contextPrefix`). Empty-string returns (fail-open from
50
+ * the LLM client) cause the embed text to fall back to the legacy
51
+ * `${title}\n${tagLine}\n${chunk.content}` shape.
52
+ * - `cachedPrefixes`: optional `Map<chunkIndex, contextPrefix>` from a prior run.
53
+ * When a chunk's index has a cached prefix, the LLM call is skipped and the
54
+ * cached string is reused verbatim (even when it is empty). Used by the reindex content-hash cache
55
+ * fast-path (Task 6.4) so unchanged docs don't re-contact the LLM endpoint.
56
+ */
57
+ export interface EmbedDocumentOptions extends ChunkerOptions {
58
+ llm?: LlmClient;
59
+ cachedPrefixes?: Map<number, string>;
60
+ }
61
+
42
62
  /**
43
63
  * Embed a document by splitting it into chunks and emitting one embedding
44
64
  * per chunk. The embedded text for each chunk is
@@ -46,6 +66,12 @@ export interface DocumentChunk extends Chunk {
46
66
  * tags) travel with every chunk embedding — matching the shape of the legacy
47
67
  * `prepareTextForEmbedding()` but without the 500-char truncation.
48
68
  *
69
+ * When `opts.llm` is provided (Phase 6 — Contextual Retrieval), a short
70
+ * context prefix is generated per chunk via `opts.llm.contextualize(content, chunk.content)`
71
+ * and prepended to the embed text as `${contextPrefix}\n${title}\n${tagLine}\n${chunk.content}`.
72
+ * If `contextualize` returns `""` (fail-open path), the embed text reverts to the
73
+ * no-context shape so we never emit a leading blank line.
74
+ *
49
75
  * Short documents (<= chunkSize) produce exactly one chunk covering the whole
50
76
  * content. Empty content yields a single chunk with empty content (so callers
51
77
  * still get a title/tag-only embedding for stub documents).
@@ -54,7 +80,7 @@ export async function embedDocument(
54
80
  title: string,
55
81
  tags: string[],
56
82
  content: string,
57
- opts?: ChunkerOptions,
83
+ opts?: EmbedDocumentOptions,
58
84
  ): Promise<DocumentChunk[]> {
59
85
  const tagLine = tags.length > 0 ? tags.join(", ") : "";
60
86
 
@@ -65,10 +91,30 @@ export async function embedDocument(
65
91
  ? [{ index: 0, content: "", charStart: 0, charEnd: 0 }]
66
92
  : chunkText(content, opts);
67
93
 
94
+ const llm = opts?.llm;
95
+ const cached = opts?.cachedPrefixes;
96
+
68
97
  const out: DocumentChunk[] = [];
69
98
  for (const chunk of chunks) {
70
- const parts = [title, tagLine, chunk.content].filter(p => p.length > 0);
71
- const embedText = parts.join("\n");
99
+ let contextPrefix = "";
100
+ if (llm) {
101
+ // Cache hit: reuse prior context_prefix when the caller supplied a map
102
+ // keyed by chunk.index. Avoids an LLM round-trip per unchanged chunk.
103
+ if (cached && cached.has(chunk.index)) {
104
+ contextPrefix = cached.get(chunk.index) ?? "";
105
+ } else {
106
+ // `contextualize` is fail-open: it returns "" on any network/timeout/
107
+ // malformed-response error. That empty string propagates into the
108
+ // returned `DocumentChunk.contextPrefix` (persisted by the caller) and
109
+ // causes the embed text to skip the leading blank line below.
110
+ contextPrefix = await llm.contextualize(content, chunk.content);
111
+ }
112
+ }
113
+
114
+ const parts = contextPrefix.length > 0
115
+ ? [contextPrefix, title, tagLine, chunk.content]
116
+ : [title, tagLine, chunk.content];
117
+ const embedText = parts.filter(p => p.length > 0).join("\n");
72
118
  const embedding = await embed(embedText);
73
119
  out.push({
74
120
  index: chunk.index,
@@ -76,6 +122,7 @@ export async function embedDocument(
76
122
  charStart: chunk.charStart,
77
123
  charEnd: chunk.charEnd,
78
124
  embedding,
125
+ contextPrefix,
79
126
  });
80
127
  }
81
128
  return out;
package/src/reindex.ts CHANGED
@@ -1,6 +1,7 @@
1
1
  import { readFileSync, statSync } from "node:fs";
2
2
  import { join, relative, resolve, basename } from "node:path";
3
3
  import { homedir } from "node:os";
4
+ import { createHash } from "node:crypto";
4
5
  import { KnowledgeDB } from "./db.js";
5
6
  import { FtsSearch } from "./search.js";
6
7
  import { VectorSearch } from "./vector-search.js";
@@ -10,7 +11,7 @@ import { findMarkdownFiles } from "./file-scanner.js";
10
11
  import { generateIndexes } from "./generate-indexes.js";
11
12
  import { loadConfig, type KnowledgeConfig } from "./config.js";
12
13
  import { loadIgnoreForRoot } from "./ignore.js";
13
-
14
+ import { createLlmClient, type LlmClient } from "./llm-client.js";
14
15
  export async function reindex(
15
16
  dirs: string[],
16
17
  dbPath: string,
@@ -25,6 +26,28 @@ export async function reindex(
25
26
  const vec = new VectorSearch(db);
26
27
  vec.createIndex();
27
28
 
29
+ // Phase 6 (GH-767): Contextual Retrieval wiring.
30
+ // `RALPH_CONTEXTUAL_RETRIEVAL` gates the whole feature. Default on; treat
31
+ // literal "0" / "false" as disabled. When enabled we probe the endpoint once
32
+ // and fail open on unreachable — all downstream chunks then embed without a
33
+ // context prefix and we log a single warning so the operator knows why.
34
+ const flagRaw = process.env.RALPH_CONTEXTUAL_RETRIEVAL;
35
+ const contextualEnabled = flagRaw !== "0" && flagRaw !== "false";
36
+ let llm: LlmClient | undefined;
37
+ if (contextualEnabled) {
38
+ const llmUrl = process.env.RALPH_LLM_URL ?? "http://localhost:8000";
39
+ const candidate = createLlmClient();
40
+ const llmReady = await candidate.available();
41
+ if (llmReady) {
42
+ llm = candidate;
43
+ } else {
44
+ console.warn(
45
+ `LLM endpoint unreachable at ${llmUrl}, contextual retrieval disabled for this run`
46
+ );
47
+ llm = undefined;
48
+ }
49
+ }
50
+
28
51
  // Schema version check — force full re-embed when embedding algorithm changes
29
52
  const SCHEMA_VERSION = "3";
30
53
  const currentVersion = db.getMeta("schema_version");
@@ -74,6 +97,7 @@ export async function reindex(
74
97
  const parsedDocs: ParsedDocument[] = [];
75
98
  let indexed = 0;
76
99
  let skipped = 0;
100
+ let totalChunks = 0;
77
101
  for (const filePath of filesOnDisk) {
78
102
  const absPath = resolve(filePath);
79
103
  const mtime = Math.trunc(statSync(absPath).mtimeMs);
@@ -143,6 +167,35 @@ export async function reindex(
143
167
  db.addRelationship(edge.sourceId, edge.targetId, "untyped", edge.context);
144
168
  }
145
169
 
170
+ // Content-hash cache for Contextual Retrieval prefixes. The outer mtime
171
+ // skip earlier in reindex() already short-circuits the overwhelming majority of
172
+ // unchanged docs (no embedder or LLM calls). This inner hash check is
173
+ // specifically for the rare case where mtime differs but content is
174
+ // byte-identical (e.g., git checkout touching the file). When hash matches
175
+ // AND we have a live LLM AND chunks already exist, we reuse the prior
176
+ // context_prefix map and skip the per-chunk LLM round-trips.
177
+ //
178
+ // Simpler alternative considered: rely entirely on mtime. Rejected because
179
+ // the feature spec (Task 6.4 acceptance) explicitly requires re-running
180
+ // reindex without content changes to reuse existing context_prefix.
181
+ const contentHash = createHash("sha256").update(parsed.content).digest("hex").slice(0, 16);
182
+ const hashKey = `content_hash:${parsed.id}`;
183
+ const priorHash = db.getMeta(hashKey);
184
+
185
+ let cachedPrefixes: Map<number, string> | undefined;
186
+ if (llm && priorHash === contentHash) {
187
+ const priorChunks = db.db
188
+ .prepare(
189
+ "SELECT chunk_index, context_prefix FROM chunks WHERE document_id = ? ORDER BY chunk_index"
190
+ )
191
+ .all(parsed.id) as Array<{ chunk_index: number; context_prefix: string }>;
192
+ if (priorChunks.length > 0) {
193
+ cachedPrefixes = new Map(
194
+ priorChunks.map(r => [r.chunk_index, r.context_prefix ?? ""] as [number, string])
195
+ );
196
+ }
197
+ }
198
+
146
199
  // Chunk-aware embedding: emit one embedding per chunk, persist to both
147
200
  // the `chunks` table and the `documents_vec` virtual table with chunk ids
148
201
  // of the form `${doc.id}#c${index}`.
@@ -156,9 +209,12 @@ export async function reindex(
156
209
  vec.deleteEmbedding(parsed.id);
157
210
 
158
211
  try {
159
- const chunks = await embedDocument(parsed.title, parsed.tags, parsed.content);
212
+ const chunks = await embedDocument(parsed.title, parsed.tags, parsed.content, {
213
+ llm,
214
+ cachedPrefixes,
215
+ });
160
216
  const insertChunk = db.db.prepare(
161
- "INSERT INTO chunks (id, document_id, chunk_index, content, char_start, char_end) VALUES (?, ?, ?, ?, ?, ?)"
217
+ "INSERT INTO chunks (id, document_id, chunk_index, content, char_start, char_end, context_prefix) VALUES (?, ?, ?, ?, ?, ?, ?)"
162
218
  );
163
219
  for (const chunk of chunks) {
164
220
  const chunkId = `${parsed.id}#c${chunk.index}`;
@@ -169,9 +225,16 @@ export async function reindex(
169
225
  chunk.content,
170
226
  chunk.charStart,
171
227
  chunk.charEnd,
228
+ chunk.contextPrefix ?? "",
172
229
  );
173
230
  vec.upsertEmbedding(chunkId, chunk.embedding);
231
+ totalChunks++;
232
+ if (totalChunks % 50 === 0) {
233
+ console.log(` ${totalChunks} chunks embedded`);
234
+ }
174
235
  }
236
+ // Record the content hash for the next reindex cache check.
237
+ db.setMeta(hashKey, contentHash);
175
238
  } catch (e) {
176
239
  console.warn(`Failed to embed ${id}: ${(e as Error).message}`);
177
240
  }