@hasna/knowledge 0.2.18 → 0.2.19

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -229,6 +229,13 @@ freshness and permission notes, graph evidence, and final rerank scores. The
229
229
  local SQLite index can later move to pgvector or a managed hosted vector store
230
230
  without changing CLI/MCP result shape.
231
231
 
232
+ Index freshness is explicit. `reindex_queue` tracks missing or stale embedding
233
+ work, `open-knowledge reindex status|enqueue|embeddings` operates the local
234
+ queue, and MCP exposes the same controls through `ok_reindex_status`,
235
+ `ok_reindex_enqueue`, and `ok_reindex_embeddings`. Hosted mode can map the same
236
+ contract to worker queues, S3/object artifact sync, Postgres/pgvector, or a
237
+ managed vector index while preserving the local command shape.
238
+
232
239
  ## Agent Workflow
233
240
 
234
241
  The target user flow is:
@@ -35,6 +35,8 @@ Local mode starts with SQLite:
35
35
  - `vector_index_entries` stores searchable embedding rows with provider/model,
36
36
  dimensions, source revision/hash, chunk offsets, status, timestamps, and
37
37
  provenance metadata.
38
+ - `reindex_queue` stores idempotent refresh jobs for missing/stale embedding
39
+ work.
38
40
  - `wiki_pages`, `wiki_backlinks`, and `citations` provide graph and provenance
39
41
  signals.
40
42
  - `knowledge_indexes` tracks generated machine-readable shards.
@@ -50,6 +52,8 @@ The current local command surface is:
50
52
  open-knowledge search "company wiki policy" --scope project --json
51
53
  open-knowledge search "company wiki policy" --scope project --semantic --json
52
54
  open-knowledge search "company wiki policy" --scope project --context --json
55
+ open-knowledge reindex status --scope project --json
56
+ open-knowledge reindex embeddings --scope project --fake --json
53
57
  open-knowledge embeddings index --scope project --model openai:text-embedding-3-small
54
58
  open-knowledge embeddings search "company wiki policy" --scope project --json
55
59
  ```
@@ -57,7 +61,8 @@ open-knowledge embeddings search "company wiki policy" --scope project --json
57
61
  `search` is the structured hybrid layer for agents. `embeddings search` is the
58
62
  lower-level vector-only command. MCP exposes the same capability through
59
63
  `ok_search`, `knowledge_search`, `ok_embeddings_status`, `ok_embeddings_index`,
60
- and `ok_semantic_search`. Deterministic `--fake` embeddings exist for tests and
64
+ `ok_semantic_search`, `ok_reindex_status`, `ok_reindex_enqueue`, and
65
+ `ok_reindex_embeddings`. Deterministic `--fake` embeddings exist for tests and
61
66
  offline verification only.
62
67
 
63
68
  ## Hosted Indexes
@@ -158,6 +163,10 @@ Reindexing is driven by source revisions:
158
163
  - Local outbox consumption deletes stale `chunk_embeddings` and
159
164
  `vector_index_entries` for deleted revisions, so semantic search cannot return
160
165
  removed source chunks.
166
+ - `reindex status` reports missing embeddings, stale revisions, queued work, and
167
+ vector counts; `reindex enqueue` records missing work in `reindex_queue`.
168
+ - `reindex embeddings` performs incremental refreshes, while `--full` clears and
169
+ rebuilds `chunk_embeddings` and `vector_index_entries`.
161
170
  - Wiki pages should track the source revisions they cite so lint can flag stale
162
171
  pages.
163
172
  - Embedding refresh jobs should be idempotent and checkpointed in `runs` and
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@hasna/knowledge",
3
- "version": "0.2.18",
3
+ "version": "0.2.19",
4
4
  "description": "Agent-friendly local knowledge CLI with JSON output, pagination, and safe destructive actions",
5
5
  "type": "module",
6
6
  "bin": {
package/src/cli.ts CHANGED
@@ -57,6 +57,7 @@ interface Flags {
57
57
  provider?: string;
58
58
  domain?: string[];
59
59
  fileResults?: boolean;
60
+ full?: boolean;
60
61
  noColor?: boolean;
61
62
  scope?: string;
62
63
  olderThan?: number;
@@ -117,6 +118,7 @@ function parseArgs(argv: string[]): ParseResult {
117
118
  case '--provider': flags.provider = argv[i + 1]; i += 1; break;
118
119
  case '--domain': flags.domain = [...(flags.domain ?? []), argv[i + 1]]; i += 1; break;
119
120
  case '--file-results': flags.fileResults = true; break;
121
+ case '--full': flags.full = true; break;
120
122
  case '--fake': flags.fake = true; break;
121
123
  case '--no-color': flags.noColor = true; break;
122
124
  case '--scope': flags.scope = argv[i + 1]; i += 1; break;
@@ -194,7 +196,7 @@ Commands:
194
196
  source resolve <source-ref> Resolve read-only source content and citation evidence
195
197
  ingest manifest <file|s3://> Ingest an open-files manifest into knowledge.db
196
198
  ingest source <source-ref> Ingest a read-only source ref into knowledge.db
197
- reindex outbox <file|s3://> Consume open-files change events and invalidate chunks
199
+ reindex status|enqueue|embeddings|outbox Inspect/refresh search indexes
198
200
  search <query> Hybrid search sources, wiki pages, indexes, or context
199
201
  web search <query> Provider-native web search with citations
200
202
  ask|build <prompt> Build a read-only citation answer/context pack
@@ -216,6 +218,7 @@ Global Options:
216
218
  --provider <name> Provider override for web search
217
219
  --domain <domain> Restrict provider web search to a domain
218
220
  --file-results File web snippets as web source refs
221
+ --full Force full embedding index rebuild
219
222
  --fake Use deterministic fake embeddings for local tests
220
223
  --scope local|global|project Store scope (default: global ~/.hasna/apps/knowledge/)
221
224
  --no-color Disable color output
@@ -276,7 +279,7 @@ function printCommandHelp(command: string): void {
276
279
  if (command === 'wiki') { console.log('Usage: open-knowledge wiki init [--scope local|global|project] [--json]'); return; }
277
280
  if (command === 'source') { console.log('Usage: open-knowledge source resolve <source-ref> [--purpose knowledge_answer|knowledge_index] [--limit <n>] [--scope local|global|project] [--json]'); return; }
278
281
  if (command === 'ingest') { console.log('Usage: open-knowledge ingest manifest <file|s3://bucket/key> | source <source-ref> [--purpose knowledge_index] [--scope local|global|project] [--json]'); return; }
279
- if (command === 'reindex') { console.log('Usage: open-knowledge reindex outbox <file|s3://bucket/key> [--scope local|global|project] [--json]'); return; }
282
+ if (command === 'reindex') { console.log('Usage: open-knowledge reindex status|enqueue|embeddings|outbox [file|s3://bucket/key] [--full] [--fake] [--scope local|global|project] [--json]'); return; }
280
283
  if (command === 'search') { console.log('Usage: open-knowledge search <query> [--context] [--semantic] [--model openai:text-embedding-3-small] [--limit <n>] [--dimensions <n>] [--fake] [--scope local|global|project] [--json]'); return; }
281
284
  if (command === 'web') { console.log('Usage: open-knowledge web search <query> [--provider openai|anthropic] [--model provider:model] [--domain <domain>] [--file-results] [--fake] [--scope local|global|project] [--json]'); return; }
282
285
  if (command === 'ask' || command === 'build' || command === 'knowledge') { console.log('Usage: open-knowledge ask|build <prompt> [--generate] [--semantic] [--model default|provider:model] [--approve-write] [--scope local|global|project] [--json]'); return; }
@@ -327,11 +330,11 @@ async function run(argv: string[]): Promise<void> {
327
330
  if (flags.completions) {
328
331
  const shell = flags.completions;
329
332
  if (shell === 'bash') {
330
- console.log(`_open_knowledge() { local cur; cur="${"$"}{COMP_WORDS[COMP_CWORD]}"; COMPREPLY=($(compgen -W "add list get update archive restore upsert untag delete export prune dedupe stats paths storage db wiki source ingest reindex search web ask build embeddings providers safety help ls rm edit unarchive knowledge --json --yes --help --version --desc --page --limit --search --sort --id --store --title --content --url --tag --format --completions --purpose --model --dimensions --semantic --context --generate --approve-write --provider --domain --file-results --fake --no-color --scope --archived --include-archived" -- "$cur")); }; complete -F _open_knowledge open-knowledge`);
333
+ console.log(`_open_knowledge() { local cur; cur="${"$"}{COMP_WORDS[COMP_CWORD]}"; COMPREPLY=($(compgen -W "add list get update archive restore upsert untag delete export prune dedupe stats paths storage db wiki source ingest reindex search web ask build embeddings providers safety help ls rm edit unarchive knowledge --json --yes --help --version --desc --page --limit --search --sort --id --store --title --content --url --tag --format --completions --purpose --model --dimensions --semantic --context --generate --approve-write --provider --domain --file-results --full --fake --no-color --scope --archived --include-archived" -- "$cur")); }; complete -F _open_knowledge open-knowledge`);
331
334
  } else if (shell === 'zsh') {
332
- console.log(`#compdef open-knowledge\n_open_knowledge() { _arguments -C "1: :(add list get update archive restore upsert untag delete export prune dedupe stats paths storage db wiki source ingest reindex search web ask build embeddings providers safety help ls rm edit unarchive knowledge)" "(--json)--json" "(--yes)-y" "(--help)--help" "(--version)--version" "(--desc)--desc" "(--archived)--archived" "(--include-archived)--include-archived" "(--semantic)--semantic" "(--context)--context" "(--generate)--generate" "(--approve-write)--approve-write" "(--file-results)--file-results" "(--fake)--fake" "(-p --page)"{-p,--page}"[page number]:number:" "(-l --limit)"{-l,--limit}"[items per page]:number:" "(-s --search)"{-s,--search}"[search text]:text:" "(--sort)--sort"\{created,title\}:" "(--id)--id[item id]:id:" "(--store)--store[store path]:path:" "(--title)--title[new title]:" "(--content)--content[new content]:" "(--url)--url[source url]:" "(-t --tag)"{-t,--tag}"[tag]:tag:" "(--format)--format[json|jsonl]:" "(--completions)--completions[output completions]:shell:(bash zsh fish):" "(--purpose)--purpose[purpose]:" "(--model)--model[model ref]:" "(--dimensions)--dimensions[embedding dimensions]:number:" "(--provider)--provider[provider]:" "(--domain)--domain[domain]:" "(--no-color)--no-color[disable color]" "(--scope)--scope"\{local,global,project\}:" }; _open_knowledge`);
335
+ console.log(`#compdef open-knowledge\n_open_knowledge() { _arguments -C "1: :(add list get update archive restore upsert untag delete export prune dedupe stats paths storage db wiki source ingest reindex search web ask build embeddings providers safety help ls rm edit unarchive knowledge)" "(--json)--json" "(--yes)-y" "(--help)--help" "(--version)--version" "(--desc)--desc" "(--archived)--archived" "(--include-archived)--include-archived" "(--semantic)--semantic" "(--context)--context" "(--generate)--generate" "(--approve-write)--approve-write" "(--file-results)--file-results" "(--full)--full" "(--fake)--fake" "(-p --page)"{-p,--page}"[page number]:number:" "(-l --limit)"{-l,--limit}"[items per page]:number:" "(-s --search)"{-s,--search}"[search text]:text:" "(--sort)--sort"\{created,title\}:" "(--id)--id[item id]:id:" "(--store)--store[store path]:path:" "(--title)--title[new title]:" "(--content)--content[new content]:" "(--url)--url[source url]:" "(-t --tag)"{-t,--tag}"[tag]:tag:" "(--format)--format[json|jsonl]:" "(--completions)--completions[output completions]:shell:(bash zsh fish):" "(--purpose)--purpose[purpose]:" "(--model)--model[model ref]:" "(--dimensions)--dimensions[embedding dimensions]:number:" "(--provider)--provider[provider]:" "(--domain)--domain[domain]:" "(--no-color)--no-color[disable color]" "(--scope)--scope"\{local,global,project\}:" }; _open_knowledge`);
333
336
  } else if (shell === 'fish') {
334
- console.log(`complete -c open-knowledge -f; complete -c open-knowledge -a "add list get update archive restore upsert untag delete export prune dedupe stats paths storage db wiki source ingest reindex search web ask build embeddings providers safety help ls rm edit unarchive knowledge"; complete -c open-knowledge -l json; complete -c open-knowledge -l yes -s y; complete -c open-knowledge -l help -s h; complete -c open-knowledge -l version -s v; complete -c open-knowledge -l desc; complete -c open-knowledge -l archived; complete -c open-knowledge -l include-archived; complete -c open-knowledge -l semantic; complete -c open-knowledge -l context; complete -c open-knowledge -l generate; complete -c open-knowledge -l approve-write; complete -c open-knowledge -l provider; complete -c open-knowledge -l domain; complete -c open-knowledge -l file-results; complete -c open-knowledge -l fake; complete -c open-knowledge -s p -l page; complete -c open-knowledge -s l -l limit; complete -c open-knowledge -s s -l search; complete -c open-knowledge -l sort; complete -c open-knowledge -l id; complete -c open-knowledge -l store; complete -c open-knowledge -l title; complete -c open-knowledge -l content; complete -c open-knowledge -l url; complete -c open-knowledge -s t -l tag; complete -c open-knowledge -l format; complete -c open-knowledge -l completions; complete -c open-knowledge -l purpose; complete -c open-knowledge -l model; complete -c open-knowledge -l dimensions; complete -c open-knowledge -l no-color; complete -c open-knowledge -l scope -a "local global project"`);
337
+ console.log(`complete -c open-knowledge -f; complete -c open-knowledge -a "add list get update archive restore upsert untag delete export prune dedupe stats paths storage db wiki source ingest reindex search web ask build embeddings providers safety help ls rm edit unarchive knowledge"; complete -c open-knowledge -l json; complete -c open-knowledge -l yes -s y; complete -c open-knowledge -l help -s h; complete -c open-knowledge -l version -s v; complete -c open-knowledge -l desc; complete -c open-knowledge -l archived; complete -c open-knowledge -l include-archived; complete -c open-knowledge -l semantic; complete -c open-knowledge -l context; complete -c open-knowledge -l generate; complete -c open-knowledge -l approve-write; complete -c open-knowledge -l provider; complete -c open-knowledge -l domain; complete -c open-knowledge -l file-results; complete -c open-knowledge -l full; complete -c open-knowledge -l fake; complete -c open-knowledge -s p -l page; complete -c open-knowledge -s l -l limit; complete -c open-knowledge -s s -l search; complete -c open-knowledge -l sort; complete -c open-knowledge -l id; complete -c open-knowledge -l store; complete -c open-knowledge -l title; complete -c open-knowledge -l content; complete -c open-knowledge -l url; complete -c open-knowledge -s t -l tag; complete -c open-knowledge -l format; complete -c open-knowledge -l completions; complete -c open-knowledge -l purpose; complete -c open-knowledge -l model; complete -c open-knowledge -l dimensions; complete -c open-knowledge -l no-color; complete -c open-knowledge -l scope -a "local global project"`);
335
338
  } else {
336
339
  throw new Error("Invalid --completions value. Use 'bash', 'zsh', or 'fish'.");
337
340
  }
@@ -574,13 +577,44 @@ async function run(argv: string[]): Promise<void> {
574
577
  }
575
578
 
576
579
  if (command === 'reindex') {
577
- const action = positional[1] ?? '';
578
- if (action !== 'outbox') throw new Error("Invalid reindex action. Use 'outbox'.");
579
- const input = positional[2];
580
- if (!input) throw new Error('Usage: open-knowledge reindex outbox <file|s3://bucket/key>');
581
- const result = await service.consumeOutbox(input);
582
- output({ ok: true, ...result, message: `Consumed ${result.events_seen} outbox event(s)` }, flags.json);
583
- return;
580
+ const action = positional[1] ?? 'status';
581
+ if (action === 'status') {
582
+ const result = service.reindexHealth({
583
+ modelRef: flags.model,
584
+ dimensions: flags.dimensions,
585
+ fake: flags.fake,
586
+ });
587
+ output({ ok: true, ...result, message: `${result.missing_embeddings} chunk(s) missing embeddings` }, flags.json);
588
+ return;
589
+ }
590
+ if (action === 'enqueue') {
591
+ const result = service.enqueueReindex({
592
+ modelRef: flags.model,
593
+ dimensions: flags.dimensions,
594
+ fake: flags.fake,
595
+ });
596
+ output({ ok: true, ...result, message: `Queued ${result.enqueued} embedding refresh item(s)` }, flags.json);
597
+ return;
598
+ }
599
+ if (action === 'embeddings') {
600
+ const result = await service.refreshEmbeddings({
601
+ full: flags.full,
602
+ limit: flags.limit,
603
+ modelRef: flags.model,
604
+ dimensions: flags.dimensions,
605
+ fake: flags.fake,
606
+ });
607
+ output({ ok: true, ...result, message: `Embedded ${result.indexed.chunks_embedded} chunk(s)` }, flags.json);
608
+ return;
609
+ }
610
+ if (action === 'outbox') {
611
+ const input = positional[2];
612
+ if (!input) throw new Error('Usage: open-knowledge reindex outbox <file|s3://bucket/key>');
613
+ const result = await service.consumeOutbox(input);
614
+ output({ ok: true, ...result, message: `Consumed ${result.events_seen} outbox event(s)` }, flags.json);
615
+ return;
616
+ }
617
+ throw new Error("Invalid reindex action. Use 'status', 'enqueue', 'embeddings', or 'outbox'.");
584
618
  }
585
619
 
586
620
  if (command === 'embeddings') {
@@ -1,7 +1,7 @@
1
1
  import { Database } from 'bun:sqlite';
2
2
  import { ensureParentDir } from './workspace';
3
3
 
4
- export const CURRENT_SCHEMA_VERSION = 4;
4
+ export const CURRENT_SCHEMA_VERSION = 5;
5
5
 
6
6
  export interface KnowledgeDbStats {
7
7
  schema_version: number;
@@ -19,6 +19,7 @@ export interface KnowledgeDbStats {
19
19
  storage_objects: number;
20
20
  embeddings: number;
21
21
  vector_entries: number;
22
+ reindex_queue: number;
22
23
  }
23
24
 
24
25
  const MIGRATION_1 = `
@@ -271,6 +272,29 @@ INSERT OR IGNORE INTO schema_versions(version, applied_at)
271
272
  VALUES (4, datetime('now'));
272
273
  `;
273
274
 
275
+ const MIGRATION_5 = `
276
+ CREATE TABLE IF NOT EXISTS reindex_queue (
277
+ id TEXT PRIMARY KEY,
278
+ kind TEXT NOT NULL,
279
+ target_id TEXT NOT NULL,
280
+ source_uri TEXT,
281
+ reason TEXT NOT NULL,
282
+ status TEXT NOT NULL DEFAULT 'pending',
283
+ attempts INTEGER NOT NULL DEFAULT 0,
284
+ metadata_json TEXT NOT NULL DEFAULT '{}',
285
+ created_at TEXT NOT NULL,
286
+ updated_at TEXT NOT NULL,
287
+ UNIQUE(kind, target_id, reason)
288
+ );
289
+
290
+ CREATE INDEX IF NOT EXISTS idx_reindex_queue_status ON reindex_queue(status);
291
+ CREATE INDEX IF NOT EXISTS idx_reindex_queue_kind_target ON reindex_queue(kind, target_id);
292
+ CREATE INDEX IF NOT EXISTS idx_reindex_queue_source_uri ON reindex_queue(source_uri);
293
+
294
+ INSERT OR IGNORE INTO schema_versions(version, applied_at)
295
+ VALUES (5, datetime('now'));
296
+ `;
297
+
274
298
  export function openKnowledgeDb(path: string): Database {
275
299
  ensureParentDir(path);
276
300
  const db = new Database(path);
@@ -286,6 +310,7 @@ export function migrateKnowledgeDb(path: string): { path: string; schema_version
286
310
  if (getSchemaVersion(db) < 2) db.exec(MIGRATION_2);
287
311
  if (getSchemaVersion(db) < 3) db.exec(MIGRATION_3);
288
312
  if (getSchemaVersion(db) < 4) db.exec(MIGRATION_4);
313
+ if (getSchemaVersion(db) < 5) db.exec(MIGRATION_5);
289
314
  return { path, schema_version: getSchemaVersion(db) };
290
315
  } finally {
291
316
  db.close();
@@ -321,6 +346,7 @@ export function getKnowledgeDbStats(path: string): KnowledgeDbStats {
321
346
  storage_objects: count(db, 'storage_objects'),
322
347
  embeddings: count(db, 'chunk_embeddings'),
323
348
  vector_entries: count(db, 'vector_index_entries'),
349
+ reindex_queue: count(db, 'reindex_queue'),
324
350
  };
325
351
  } finally {
326
352
  db.close();
package/src/mcp.js CHANGED
@@ -153,6 +153,50 @@ export function buildServer() {
153
153
  }
154
154
  });
155
155
 
156
+ registerTool(server, 'ok_reindex_status', 'Reindex status', 'Inspect missing embeddings, queued jobs, stale revisions, and vector index health', {
157
+ scope: scopeField,
158
+ model: z.string().optional().describe('Embedding model ref, default openai:text-embedding-3-small'),
159
+ dimensions: z.number().optional().describe('Embedding dimensions for deterministic fake mode'),
160
+ fake: z.boolean().optional().describe('Use deterministic fake embeddings for local tests'),
161
+ }, async ({ scope, model, dimensions, fake }) => {
162
+ const service = createKnowledgeService({ scope });
163
+ try {
164
+ return jsonText({ ok: true, ...service.reindexHealth({ modelRef: model, dimensions, fake }) });
165
+ } catch (error) {
166
+ return errorText(error instanceof Error ? error.message : String(error));
167
+ }
168
+ });
169
+
170
+ registerTool(server, 'ok_reindex_enqueue', 'Enqueue reindex work', 'Queue missing embedding refresh jobs for indexed source chunks', {
171
+ scope: scopeField,
172
+ model: z.string().optional().describe('Embedding model ref, default openai:text-embedding-3-small'),
173
+ dimensions: z.number().optional().describe('Embedding dimensions for deterministic fake mode'),
174
+ fake: z.boolean().optional().describe('Use deterministic fake embeddings for local tests'),
175
+ }, async ({ scope, model, dimensions, fake }) => {
176
+ const service = createKnowledgeService({ scope });
177
+ try {
178
+ return jsonText({ ok: true, ...service.enqueueReindex({ modelRef: model, dimensions, fake }) });
179
+ } catch (error) {
180
+ return errorText(error instanceof Error ? error.message : String(error));
181
+ }
182
+ });
183
+
184
+ registerTool(server, 'ok_reindex_embeddings', 'Refresh embedding index', 'Run incremental or full embedding refresh jobs with run-ledger tracking', {
185
+ scope: scopeField,
186
+ full: z.boolean().optional().describe('Delete and rebuild all embedding/vector rows first'),
187
+ limit: z.number().optional().describe('Maximum chunks to embed'),
188
+ model: z.string().optional().describe('Embedding model ref, default openai:text-embedding-3-small'),
189
+ dimensions: z.number().optional().describe('Embedding dimensions for deterministic fake mode'),
190
+ fake: z.boolean().optional().describe('Use deterministic fake embeddings for local tests'),
191
+ }, async ({ scope, full, limit, model, dimensions, fake }) => {
192
+ const service = createKnowledgeService({ scope });
193
+ try {
194
+ return jsonText({ ok: true, ...await service.refreshEmbeddings({ full, limit, modelRef: model, dimensions, fake }) });
195
+ } catch (error) {
196
+ return errorText(error instanceof Error ? error.message : String(error));
197
+ }
198
+ });
199
+
156
200
  registerTool(server, 'ok_semantic_search', 'Semantic search', 'Search the local vector index and return cited chunks with provenance', {
157
201
  scope: scopeField,
158
202
  query: z.string().describe('Semantic query'),
package/src/reindex.ts ADDED
@@ -0,0 +1,260 @@
1
+ import { createHash, randomUUID } from 'node:crypto';
2
+ import { indexKnowledgeEmbeddings, resolveEmbeddingModelRef, type EmbeddingRuntimeOptions } from './embeddings';
3
+ import { migrateKnowledgeDb, openKnowledgeDb } from './knowledge-db';
4
+ import { parseModelRef } from './providers';
5
+ import type { KnowledgeConfig } from './workspace';
6
+
7
/** Options shared by every reindex entry point in this module. */
export interface ReindexRuntimeOptions extends EmbeddingRuntimeOptions {
  /** Path of the knowledge database that is migrated, read, and written. */
  dbPath: string;
  /** Workspace configuration handed to `resolveEmbeddingModelRef`. */
  config?: KnowledgeConfig;
  /** Clock override used for stored timestamps; the current time is used when omitted. */
  now?: Date;
}

/** Snapshot returned by `reindexHealth`. */
export interface ReindexHealthResult {
  /** Highest version recorded in `schema_versions` (0 when none). */
  schema_version: number;
  /** Total rows in `chunks`. */
  chunks: number;
  /** Total rows in `vector_index_entries`, across all providers and models. */
  vector_entries: number;
  /** Chunks with no `vector_index_entries` row for the resolved provider/model. */
  missing_embeddings: number;
  /** Number of `reindex_queue` rows keyed by their status. */
  queued: Record<string, number>;
  /** Source revisions whose metadata flags them as stale or requiring a reindex. */
  stale_revisions: number;
}

/** Outcome of one enqueue pass over the chunks that lack embeddings. */
export interface ReindexEnqueueResult {
  /** Queue rows created by this pass. */
  enqueued: number;
  /** Chunks that already had a queue row for the same kind and reason. */
  already_queued: number;
  /** Reason label stored on the queue rows. */
  reason: string;
}

/** Summary returned by `refreshEmbeddingIndex`. */
export interface ReindexEmbeddingsResult {
  /** Identifier of the `runs` row that tracks this refresh. */
  run_id: string;
  /** Whether the existing index was cleared before re-embedding. */
  full: boolean;
  /** `chunk_embeddings` rows counted before a full clear (0 for incremental runs). */
  deleted_embeddings: number;
  /** `vector_index_entries` rows counted before a full clear (0 for incremental runs). */
  deleted_vector_entries: number;
  /** Result of the enqueue pass performed at the start of the refresh. */
  queued: ReindexEnqueueResult;
  /** Whatever `indexKnowledgeEmbeddings` resolved with. */
  indexed: Awaited<ReturnType<typeof indexKnowledgeEmbeddings>>;
  /** Pending queue rows that were flipped to `completed` after indexing. */
  completed_queue_items: number;
}

/** Row shape produced by the missing-embedding query. */
interface MissingChunkRow {
  chunk_id: string;
  source_revision_id: string | null;
  source_uri: string | null;
}
43
+
44
/**
 * Derives a deterministic identifier from `value`.
 *
 * @param prefix - Label placed before the underscore separator.
 * @param value - Text that is hashed with SHA-256.
 * @returns `prefix`, an underscore, and the first 20 hex characters of the digest.
 */
function stableId(prefix: string, value: string): string {
  const digest = createHash('sha256').update(value).digest('hex');
  return [prefix, digest.substring(0, 20)].join('_');
}
47
+
48
/**
 * Tallies `reindex_queue` rows by status.
 *
 * Opens its own connection to the database at `dbPath` and closes it before returning.
 *
 * @param dbPath - Path of the knowledge database to inspect.
 * @returns A map from queue status to the number of rows in that status.
 */
function queueCounts(dbPath: string): Record<string, number> {
  const db = openKnowledgeDb(dbPath);
  try {
    const statement = db.query<{ status: string; n: number }, []>(
      `SELECT status, COUNT(*) AS n FROM reindex_queue GROUP BY status ORDER BY status`,
    );
    const pairs = statement.all().map(({ status, n }) => [status, n] as const);
    return Object.fromEntries(pairs);
  } finally {
    db.close();
  }
}
59
+
60
/**
 * Lists chunks that have no vector index entry for the resolved embedding model.
 *
 * The model reference is resolved from `options.modelRef` and `options.config`, then split into
 * provider and model; a chunk counts as missing when no `vector_index_entries` row matches both.
 *
 * @param dbPath - Path of the knowledge database to query.
 * @param options - Runtime options supplying the model reference and configuration.
 * @returns One row per missing chunk, ordered by chunk creation time and then ordinal.
 */
function missingEmbeddingRows(dbPath: string, options: ReindexRuntimeOptions): MissingChunkRow[] {
  const { provider, model } = parseModelRef(resolveEmbeddingModelRef(options.modelRef, options.config));
  const db = openKnowledgeDb(dbPath);
  try {
    const statement = db.query<MissingChunkRow, [string, string]>(
      `SELECT c.id AS chunk_id, c.source_revision_id, s.uri AS source_uri
       FROM chunks c
       LEFT JOIN source_revisions sr ON sr.id = c.source_revision_id
       LEFT JOIN sources s ON s.id = sr.source_id
       LEFT JOIN vector_index_entries v ON v.chunk_id = c.id AND v.provider = ? AND v.model = ?
       WHERE v.id IS NULL
       ORDER BY c.created_at ASC, c.ordinal ASC`,
    );
    return statement.all(provider, model);
  } finally {
    db.close();
  }
}
78
+
79
/**
 * Reports the current state of the embedding index.
 *
 * Applies pending migrations first, then gathers the schema version, chunk and vector-entry
 * totals, the number of chunks missing an embedding for the resolved model, the per-status
 * queue tally, and the number of source revisions flagged stale in their metadata.
 *
 * @param options - Database path plus embedding model selection.
 * @returns The collected health figures.
 */
export function reindexHealth(options: ReindexRuntimeOptions): ReindexHealthResult {
  migrateKnowledgeDb(options.dbPath);
  const db = openKnowledgeDb(options.dbPath);
  // Runs a single-row COUNT(*) query aliased as `n`; falls back to 0 when no row comes back.
  const countOf = (sql: string): number => db.query<{ n: number }, []>(sql).get()?.n ?? 0;
  try {
    const versionRow = db.query<{ version: number }, []>('SELECT MAX(version) AS version FROM schema_versions').get();
    const schemaVersion = versionRow?.version ?? 0;
    const chunkTotal = countOf('SELECT COUNT(*) AS n FROM chunks');
    const vectorTotal = countOf('SELECT COUNT(*) AS n FROM vector_index_entries');
    const missingTotal = missingEmbeddingRows(options.dbPath, options).length;
    // Staleness is detected by substring match on the serialized metadata JSON.
    const staleTotal = countOf(
      `SELECT COUNT(*) AS n FROM source_revisions
       WHERE metadata_json LIKE '%"reindex_required":true%' OR metadata_json LIKE '%"status":"stale"%'`,
    );
    const queued = queueCounts(options.dbPath);
    return {
      schema_version: schemaVersion,
      chunks: chunkTotal,
      vector_entries: vectorTotal,
      missing_embeddings: missingTotal,
      queued,
      stale_revisions: staleTotal,
    };
  } finally {
    db.close();
  }
}
103
+
104
/**
 * Records a queue item for every chunk that lacks an embedding for the resolved model.
 *
 * The operation is idempotent per `(kind, target_id, reason)`:
 * - no existing row: a `pending` row is inserted and counted as enqueued;
 * - existing row that is `completed`: the chunk is missing its embedding again (for example
 *   after a full clear), so the row is moved back to `pending` and counted as enqueued;
 * - existing row in any other status: left untouched and counted as already queued.
 *
 * All writes happen inside one transaction.
 *
 * @param options - Database path, model selection, optional clock override, and an optional
 *   `reason` label (defaults to `missing_embedding`).
 * @returns How many items were (re-)queued, how many were already queued, and the reason used.
 */
export function enqueueMissingEmbeddings(options: ReindexRuntimeOptions & { reason?: string }): ReindexEnqueueResult {
  migrateKnowledgeDb(options.dbPath);
  const now = (options.now ?? new Date()).toISOString();
  const reason = options.reason ?? 'missing_embedding';
  const rows = missingEmbeddingRows(options.dbPath, options);
  const db = openKnowledgeDb(options.dbPath);
  let enqueued = 0;
  let alreadyQueued = 0;
  try {
    // Prepared once and reused for every chunk instead of being rebuilt per iteration.
    const findExisting = db.query<{ id: string; status: string }, [string, string, string]>(
      'SELECT id, status FROM reindex_queue WHERE kind = ? AND target_id = ? AND reason = ?',
    );
    const write = db.transaction(() => {
      for (const row of rows) {
        const existing = findExisting.get('embedding', row.chunk_id, reason);
        if (existing) {
          if (existing.status === 'completed') {
            // The chunk is in `rows`, so its embedding is missing even though the queue says
            // the work finished; reopen the item so queue status reflects outstanding work.
            db.run(
              'UPDATE reindex_queue SET status = ?, updated_at = ? WHERE id = ?',
              ['pending', now, existing.id],
            );
            enqueued += 1;
          } else {
            alreadyQueued += 1;
          }
          continue;
        }
        db.run(
          `INSERT INTO reindex_queue (id, kind, target_id, source_uri, reason, status, metadata_json, created_at, updated_at)
           VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`,
          [
            // Deterministic id so the same chunk/reason always maps to the same queue row.
            stableId('rq', `embedding\u0000${row.chunk_id}\u0000${reason}`),
            'embedding',
            row.chunk_id,
            row.source_uri,
            reason,
            'pending',
            JSON.stringify({ source_revision_id: row.source_revision_id }),
            now,
            now,
          ],
        );
        enqueued += 1;
      }
    });
    write();
  } finally {
    db.close();
  }
  return { enqueued, already_queued: alreadyQueued, reason };
}
147
+
148
/**
 * Removes every row from `vector_index_entries` and `chunk_embeddings`.
 *
 * The counts and both deletes run inside a single transaction, so the reported numbers match
 * what was removed and a failure cannot leave one table cleared while the other is intact.
 *
 * @param dbPath - Path of the knowledge database to clear.
 * @returns The number of embedding rows and vector entries that existed before the clear.
 */
function clearEmbeddingIndex(dbPath: string): { embeddings: number; vectorEntries: number } {
  const db = openKnowledgeDb(dbPath);
  try {
    const clear = db.transaction(() => {
      const embeddings = db.query<{ n: number }, []>('SELECT COUNT(*) AS n FROM chunk_embeddings').get()?.n ?? 0;
      const vectorEntries = db.query<{ n: number }, []>('SELECT COUNT(*) AS n FROM vector_index_entries').get()?.n ?? 0;
      db.run('DELETE FROM vector_index_entries');
      db.run('DELETE FROM chunk_embeddings');
      return { embeddings, vectorEntries };
    });
    return clear();
  } finally {
    db.close();
  }
}
160
+
161
/**
 * Marks pending embedding queue items as completed once their chunk has a vector index entry
 * for the resolved provider and model.
 *
 * @param dbPath - Path of the knowledge database to update.
 * @param options - Runtime options supplying the model reference and configuration.
 * @param now - ISO timestamp written to `updated_at` on every row that changes.
 * @returns The number of queue rows that were updated.
 */
function completeIndexedQueueItems(dbPath: string, options: ReindexRuntimeOptions, now: string): number {
  const { provider, model } = parseModelRef(resolveEmbeddingModelRef(options.modelRef, options.config));
  const db = openKnowledgeDb(dbPath);
  try {
    // Bind order follows the placeholders: new status, timestamp, kind, current status, provider, model.
    const bindings = ['completed', now, 'embedding', 'pending', provider, model];
    const outcome = db.run(
      `UPDATE reindex_queue
       SET status = ?, updated_at = ?
       WHERE kind = ?
         AND status = ?
         AND EXISTS (
           SELECT 1 FROM vector_index_entries v
           WHERE v.chunk_id = reindex_queue.target_id
             AND v.provider = ?
             AND v.model = ?
         )`,
      bindings,
    );
    return outcome.changes;
  } finally {
    db.close();
  }
}
184
+
185
/**
 * Best-effort bookkeeping for a refresh that threw: flips the run row to `failed` and stores the
 * error message in its metadata. Any error raised while recording is swallowed on purpose so the
 * caller can rethrow the original failure.
 *
 * NOTE(review): assumes the `runs` table accepts `failed` as a status value; if it does not, the
 * update is skipped silently and the run stays `running` as before.
 */
function markEmbeddingRunFailed(
  dbPath: string,
  runId: string,
  failure: unknown,
  at: string,
  metadata: Record<string, unknown>,
): void {
  try {
    const db = openKnowledgeDb(dbPath);
    try {
      const message = failure instanceof Error ? failure.message : String(failure);
      db.run(
        `UPDATE runs SET status = ?, metadata_json = ?, updated_at = ? WHERE id = ?`,
        ['failed', JSON.stringify({ ...metadata, error: message }), at, runId],
      );
    } finally {
      db.close();
    }
  } catch {
    // Recording the failure must never mask the error that caused it.
  }
}

/**
 * Runs an incremental or full embedding refresh and tracks it in the run ledger.
 *
 * Steps, in order:
 * 1. apply pending migrations;
 * 2. when `full` is set, clear `chunk_embeddings` and `vector_index_entries`;
 * 3. enqueue every chunk missing an embedding (reason `full_embedding_rebuild` or
 *    `missing_embedding`);
 * 4. insert a `running` row into `runs`;
 * 5. call `indexKnowledgeEmbeddings`;
 * 6. mark matching pending queue items completed, set the run to `completed`, and append an
 *    `embedding_refresh_completed` event.
 *
 * If indexing throws, the run row is marked `failed` (best effort) and the error is rethrown,
 * so a crashed refresh no longer stays `running` forever.
 *
 * @param options - Database path, model selection, optional `full` rebuild flag, optional
 *   `limit` forwarded to the indexer, and an optional clock override. Without `options.now`,
 *   completion timestamps are taken when indexing finishes rather than when the run started.
 * @returns Run id, deletion counts, enqueue result, indexer result, and completed queue count.
 * @throws Whatever `indexKnowledgeEmbeddings` or the database calls throw.
 */
export async function refreshEmbeddingIndex(options: ReindexRuntimeOptions & { full?: boolean; limit?: number }): Promise<ReindexEmbeddingsResult> {
  migrateKnowledgeDb(options.dbPath);
  const startedAt = (options.now ?? new Date()).toISOString();
  const runId = `run_${randomUUID()}`;
  const full = options.full === true;
  const deleted = options.full ? clearEmbeddingIndex(options.dbPath) : { embeddings: 0, vectorEntries: 0 };
  const queued = enqueueMissingEmbeddings({ ...options, reason: options.full ? 'full_embedding_rebuild' : 'missing_embedding' });

  const db = openKnowledgeDb(options.dbPath);
  try {
    db.run(
      `INSERT INTO runs (id, type, prompt, status, provider, model, metadata_json, created_at, updated_at)
       VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`,
      [
        runId,
        'embedding-refresh',
        options.full ? 'full' : 'incremental',
        'running',
        'local',
        resolveEmbeddingModelRef(options.modelRef, options.config),
        JSON.stringify({ full, queued }),
        startedAt,
        startedAt,
      ],
    );
  } finally {
    db.close();
  }

  let indexed: Awaited<ReturnType<typeof indexKnowledgeEmbeddings>>;
  try {
    indexed = await indexKnowledgeEmbeddings({
      dbPath: options.dbPath,
      config: options.config,
      env: options.env,
      modelRef: options.modelRef,
      dimensions: options.dimensions,
      fake: options.fake,
      limit: options.limit,
      now: options.now,
    });
  } catch (error) {
    const failedAt = (options.now ?? new Date()).toISOString();
    markEmbeddingRunFailed(options.dbPath, runId, error, failedAt, { full, queued });
    throw error;
  }

  // Taken after indexing so `updated_at` reflects completion; an injected clock keeps it fixed.
  const finishedAt = (options.now ?? new Date()).toISOString();
  const completedQueueItems = completeIndexedQueueItems(options.dbPath, options, finishedAt);
  const doneDb = openKnowledgeDb(options.dbPath);
  try {
    doneDb.run(
      `UPDATE runs SET status = ?, metadata_json = ?, updated_at = ? WHERE id = ?`,
      [
        'completed',
        JSON.stringify({ full, queued, indexed, completed_queue_items: completedQueueItems }),
        finishedAt,
        runId,
      ],
    );
    doneDb.run(
      `INSERT INTO run_events (id, run_id, level, event, metadata_json, created_at)
       VALUES (?, ?, ?, ?, ?, ?)`,
      [
        `evt_${randomUUID()}`,
        runId,
        'info',
        'embedding_refresh_completed',
        JSON.stringify({ queued, indexed, completed_queue_items: completedQueueItems }),
        finishedAt,
      ],
    );
  } finally {
    doneDb.close();
  }

  return {
    run_id: runId,
    full,
    deleted_embeddings: deleted.embeddings,
    deleted_vector_entries: deleted.vectorEntries,
    queued,
    indexed,
    completed_queue_items: completedQueueItems,
  };
}
package/src/service.ts CHANGED
@@ -13,6 +13,7 @@ import { ingestOpenFilesManifest } from './manifest-ingest';
13
13
  import { ingestSourceRef } from './source-ingest';
14
14
  import { resolveOpenFilesSource } from './source-resolver';
15
15
  import { providerStatus, listModelRegistry, type ProviderStatusResult, type ModelRegistryEntry } from './providers';
16
+ import { enqueueMissingEmbeddings, refreshEmbeddingIndex, reindexHealth, type ReindexRuntimeOptions } from './reindex';
16
17
  import { retrieveKnowledgeContext, type RetrievalOptions } from './retrieval';
17
18
  import { hybridSearch, type HybridSearchOptions } from './search';
18
19
  import { resolveSafetyPolicy } from './safety';
@@ -188,6 +189,33 @@ export class KnowledgeService {
188
189
  });
189
190
  }
190
191
 
192
/**
 * Reports embedding index health for this service's workspace.
 *
 * Ensures the workspace exists, then delegates to the module-level `reindexHealth` with the
 * workspace database path and the service configuration filled in.
 */
reindexHealth(options: Omit<ReindexRuntimeOptions, 'dbPath' | 'config'> = {}) {
  const { knowledgeDbPath } = this.ensureWorkspace();
  const runtime = { ...options, dbPath: knowledgeDbPath, config: this.config() };
  return reindexHealth(runtime);
}
200
+
201
/**
 * Queues refresh work for chunks that lack embeddings in this service's workspace.
 *
 * Ensures the workspace exists, then delegates to `enqueueMissingEmbeddings` with the workspace
 * database path and the service configuration filled in.
 */
enqueueReindex(options: Omit<ReindexRuntimeOptions, 'dbPath' | 'config'> = {}) {
  const { knowledgeDbPath } = this.ensureWorkspace();
  const runtime = { ...options, dbPath: knowledgeDbPath, config: this.config() };
  return enqueueMissingEmbeddings(runtime);
}
209
+
210
/**
 * Runs an incremental or full embedding refresh for this service's workspace.
 *
 * Ensures the workspace exists, then delegates to `refreshEmbeddingIndex` with the workspace
 * database path and the service configuration filled in.
 */
async refreshEmbeddings(options: Omit<ReindexRuntimeOptions & { full?: boolean; limit?: number }, 'dbPath' | 'config'> = {}) {
  const { knowledgeDbPath } = this.ensureWorkspace();
  const runtime = { ...options, dbPath: knowledgeDbPath, config: this.config() };
  return refreshEmbeddingIndex(runtime);
}
218
+
191
219
  providerStatus(env: Record<string, string | undefined> = process.env): ProviderStatusResult {
192
220
  return providerStatus(this.config(), env);
193
221
  }