@hasna/knowledge 0.2.17 → 0.2.19

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -229,6 +229,13 @@ freshness and permission notes, graph evidence, and final rerank scores. The
229
229
  local SQLite index can later move to pgvector or a managed hosted vector store
230
230
  without changing CLI/MCP result shape.
231
231
 
232
+ Index freshness is explicit. `reindex_queue` tracks missing or stale embedding
233
+ work, `open-knowledge reindex status|enqueue|embeddings` operates the local
234
+ queue, and MCP exposes the same controls through `ok_reindex_status`,
235
+ `ok_reindex_enqueue`, and `ok_reindex_embeddings`. Hosted mode can map the same
236
+ contract to worker queues, S3/object artifact sync, Postgres/pgvector, or a
237
+ managed vector index while preserving the local command shape.
238
+
232
239
  ## Agent Workflow
233
240
 
234
241
  The target user flow is:
@@ -255,6 +262,13 @@ returns a local citation draft by default, optionally calls AI SDK generation vi
255
262
  `--generate`, records `runs`, `run_events`, and `provider_usage`, and only
256
263
  proposes durable wiki updates until the wiki compile/write task owns writes.
257
264
 
265
+ Provider-native web search is exposed separately as
266
+ `open-knowledge web search <query>` and MCP `ok_web_search`. Real network access
267
+ is safety-gated; OpenAI and Anthropic use provider web-search tools through the AI
268
+ SDK, while DeepSeek remains a future fallback/external-search path. Returned web
269
+ snippets can optionally be filed as read-only `web` source refs for later local
270
+ search and citation.
271
+
258
272
  ## Provider Registry
259
273
 
260
274
  AI provider setup is BYOK and AI SDK v6 based. The local provider layer tracks:
@@ -35,6 +35,8 @@ Local mode starts with SQLite:
35
35
  - `vector_index_entries` stores searchable embedding rows with provider/model,
36
36
  dimensions, source revision/hash, chunk offsets, status, timestamps, and
37
37
  provenance metadata.
38
+ - `reindex_queue` stores idempotent refresh jobs for missing/stale embedding
39
+ work.
38
40
  - `wiki_pages`, `wiki_backlinks`, and `citations` provide graph and provenance
39
41
  signals.
40
42
  - `knowledge_indexes` tracks generated machine-readable shards.
@@ -50,6 +52,8 @@ The current local command surface is:
50
52
  open-knowledge search "company wiki policy" --scope project --json
51
53
  open-knowledge search "company wiki policy" --scope project --semantic --json
52
54
  open-knowledge search "company wiki policy" --scope project --context --json
55
+ open-knowledge reindex status --scope project --json
56
+ open-knowledge reindex embeddings --scope project --fake --json
53
57
  open-knowledge embeddings index --scope project --model openai:text-embedding-3-small
54
58
  open-knowledge embeddings search "company wiki policy" --scope project --json
55
59
  ```
@@ -57,7 +61,8 @@ open-knowledge embeddings search "company wiki policy" --scope project --json
57
61
  `search` is the structured hybrid layer for agents. `embeddings search` is the
58
62
  lower-level vector-only command. MCP exposes the same capability through
59
63
  `ok_search`, `knowledge_search`, `ok_embeddings_status`, `ok_embeddings_index`,
60
- and `ok_semantic_search`. Deterministic `--fake` embeddings exist for tests and
64
+ `ok_semantic_search`, `ok_reindex_status`, `ok_reindex_enqueue`, and
65
+ `ok_reindex_embeddings`. Deterministic `--fake` embeddings exist for tests and
61
66
  offline verification only.
62
67
 
63
68
  ## Hosted Indexes
@@ -142,6 +147,11 @@ source refs, artifact URIs, revision/hash metadata, offsets, and provenance.
142
147
  and MCP `knowledge_ask` wrap this context pack in a run ledger and return a
143
148
  citation draft or explicit AI SDK generated answer.
144
149
 
150
+ Provider-native web search lives beside local retrieval. `open-knowledge web
151
+ search` and MCP `ok_web_search` are safety-gated, capture provider sources, and
152
+ can file snippets as read-only `web` source refs so later local retrieval treats
153
+ them like other cited sources.
154
+
145
155
  ## Reindexing
146
156
 
147
157
  Reindexing is driven by source revisions:
@@ -153,6 +163,10 @@ Reindexing is driven by source revisions:
153
163
  - Local outbox consumption deletes stale `chunk_embeddings` and
154
164
  `vector_index_entries` for deleted revisions, so semantic search cannot return
155
165
  removed source chunks.
166
+ - `reindex status` reports missing embeddings, stale revisions, queued work, and
167
+ vector counts; `reindex enqueue` records missing work in `reindex_queue`.
168
+ - `reindex embeddings` performs incremental refreshes by default; passing `--full`
169
+ first clears `chunk_embeddings` and `vector_index_entries`, then rebuilds them.
156
170
  - Wiki pages should track the source revisions they cite so lint can flag stale
157
171
  pages.
158
172
  - Embedding refresh jobs should be idempotent and checkpointed in `runs` and
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@hasna/knowledge",
3
- "version": "0.2.17",
3
+ "version": "0.2.19",
4
4
  "description": "Agent-friendly local knowledge CLI with JSON output, pagination, and safe destructive actions",
5
5
  "type": "module",
6
6
  "bin": {
package/src/cli.ts CHANGED
@@ -54,6 +54,10 @@ interface Flags {
54
54
  context?: boolean;
55
55
  generate?: boolean;
56
56
  approveWrite?: boolean;
57
+ provider?: string;
58
+ domain?: string[];
59
+ fileResults?: boolean;
60
+ full?: boolean;
57
61
  noColor?: boolean;
58
62
  scope?: string;
59
63
  olderThan?: number;
@@ -68,7 +72,7 @@ interface ParseResult {
68
72
  flags: Flags;
69
73
  }
70
74
 
71
- const COMMANDS = ['add', 'list', 'get', 'delete', 'update', 'archive', 'restore', 'upsert', 'untag', 'export', 'prune', 'dedupe', 'stats', 'paths', 'storage', 'db', 'wiki', 'source', 'ingest', 'reindex', 'search', 'ask', 'build', 'embeddings', 'providers', 'safety', 'help'];
75
+ const COMMANDS = ['add', 'list', 'get', 'delete', 'update', 'archive', 'restore', 'upsert', 'untag', 'export', 'prune', 'dedupe', 'stats', 'paths', 'storage', 'db', 'wiki', 'source', 'ingest', 'reindex', 'search', 'web', 'ask', 'build', 'embeddings', 'providers', 'safety', 'help'];
72
76
  const COMMAND_ALIASES: Record<string, string> = {
73
77
  ls: 'list',
74
78
  rm: 'delete',
@@ -111,6 +115,10 @@ function parseArgs(argv: string[]): ParseResult {
111
115
  case '--context': flags.context = true; break;
112
116
  case '--generate': flags.generate = true; break;
113
117
  case '--approve-write': flags.approveWrite = true; break;
118
+ case '--provider': flags.provider = argv[i + 1]; i += 1; break;
119
+ case '--domain': flags.domain = [...(flags.domain ?? []), argv[i + 1]]; i += 1; break;
120
+ case '--file-results': flags.fileResults = true; break;
121
+ case '--full': flags.full = true; break;
114
122
  case '--fake': flags.fake = true; break;
115
123
  case '--no-color': flags.noColor = true; break;
116
124
  case '--scope': flags.scope = argv[i + 1]; i += 1; break;
@@ -188,8 +196,9 @@ Commands:
188
196
  source resolve <source-ref> Resolve read-only source content and citation evidence
189
197
  ingest manifest <file|s3://> Ingest an open-files manifest into knowledge.db
190
198
  ingest source <source-ref> Ingest a read-only source ref into knowledge.db
191
- reindex outbox <file|s3://> Consume open-files change events and invalidate chunks
199
+ reindex status|enqueue|embeddings|outbox Inspect/refresh search indexes
192
200
  search <query> Hybrid search sources, wiki pages, indexes, or context
201
+ web search <query> Provider-native web search with citations
193
202
  ask|build <prompt> Build a read-only citation answer/context pack
194
203
  embeddings status|index|search Build/query local vector embeddings
195
204
  providers status|models|check Inspect AI SDK provider config and credentials
@@ -206,6 +215,10 @@ Global Options:
206
215
  --context Return a reranked citation context pack for search
207
216
  --generate Call AI SDK text generation for ask/build
208
217
  --approve-write Record approval intent for future durable wiki writes
218
+ --provider <name> Provider override for web search
219
+ --domain <domain> Restrict provider web search to a domain
220
+ --file-results File web snippets as web source refs
221
+ --full Force full embedding index rebuild
209
222
  --fake Use deterministic fake embeddings for local tests
210
223
  --scope local|global|project Store scope (default: global ~/.hasna/apps/knowledge/)
211
224
  --no-color Disable color output
@@ -266,8 +279,9 @@ function printCommandHelp(command: string): void {
266
279
  if (command === 'wiki') { console.log('Usage: open-knowledge wiki init [--scope local|global|project] [--json]'); return; }
267
280
  if (command === 'source') { console.log('Usage: open-knowledge source resolve <source-ref> [--purpose knowledge_answer|knowledge_index] [--limit <n>] [--scope local|global|project] [--json]'); return; }
268
281
  if (command === 'ingest') { console.log('Usage: open-knowledge ingest manifest <file|s3://bucket/key> | source <source-ref> [--purpose knowledge_index] [--scope local|global|project] [--json]'); return; }
269
- if (command === 'reindex') { console.log('Usage: open-knowledge reindex outbox <file|s3://bucket/key> [--scope local|global|project] [--json]'); return; }
282
+ if (command === 'reindex') { console.log('Usage: open-knowledge reindex status|enqueue|embeddings|outbox [file|s3://bucket/key] [--full] [--fake] [--scope local|global|project] [--json]'); return; }
270
283
  if (command === 'search') { console.log('Usage: open-knowledge search <query> [--context] [--semantic] [--model openai:text-embedding-3-small] [--limit <n>] [--dimensions <n>] [--fake] [--scope local|global|project] [--json]'); return; }
284
+ if (command === 'web') { console.log('Usage: open-knowledge web search <query> [--provider openai|anthropic] [--model provider:model] [--domain <domain>] [--file-results] [--fake] [--scope local|global|project] [--json]'); return; }
271
285
  if (command === 'ask' || command === 'build' || command === 'knowledge') { console.log('Usage: open-knowledge ask|build <prompt> [--generate] [--semantic] [--model default|provider:model] [--approve-write] [--scope local|global|project] [--json]'); return; }
272
286
  if (command === 'embeddings') { console.log('Usage: open-knowledge embeddings status|index|search [query] [--model openai:text-embedding-3-small] [--limit <n>] [--dimensions <n>] [--fake] [--scope local|global|project] [--json]'); return; }
273
287
  if (command === 'providers') { console.log('Usage: open-knowledge providers status|models|check [provider|model-alias] [--scope local|global|project] [--json]'); return; }
@@ -316,11 +330,11 @@ async function run(argv: string[]): Promise<void> {
316
330
  if (flags.completions) {
317
331
  const shell = flags.completions;
318
332
  if (shell === 'bash') {
319
- console.log(`_open_knowledge() { local cur; cur="${"$"}{COMP_WORDS[COMP_CWORD]}"; COMPREPLY=($(compgen -W "add list get update archive restore upsert untag delete export prune dedupe stats paths storage db wiki source ingest reindex search ask build embeddings providers safety help ls rm edit unarchive knowledge --json --yes --help --version --desc --page --limit --search --sort --id --store --title --content --url --tag --format --completions --purpose --model --dimensions --semantic --context --generate --approve-write --fake --no-color --scope --archived --include-archived" -- "$cur")); }; complete -F _open_knowledge open-knowledge`);
333
+ console.log(`_open_knowledge() { local cur; cur="${"$"}{COMP_WORDS[COMP_CWORD]}"; COMPREPLY=($(compgen -W "add list get update archive restore upsert untag delete export prune dedupe stats paths storage db wiki source ingest reindex search web ask build embeddings providers safety help ls rm edit unarchive knowledge --json --yes --help --version --desc --page --limit --search --sort --id --store --title --content --url --tag --format --completions --purpose --model --dimensions --semantic --context --generate --approve-write --provider --domain --file-results --full --fake --no-color --scope --archived --include-archived" -- "$cur")); }; complete -F _open_knowledge open-knowledge`);
320
334
  } else if (shell === 'zsh') {
321
- console.log(`#compdef open-knowledge\n_open_knowledge() { _arguments -C "1: :(add list get update archive restore upsert untag delete export prune dedupe stats paths storage db wiki source ingest reindex search ask build embeddings providers safety help ls rm edit unarchive knowledge)" "(--json)--json" "(--yes)-y" "(--help)--help" "(--version)--version" "(--desc)--desc" "(--archived)--archived" "(--include-archived)--include-archived" "(--semantic)--semantic" "(--context)--context" "(--generate)--generate" "(--approve-write)--approve-write" "(--fake)--fake" "(-p --page)"{-p,--page}"[page number]:number:" "(-l --limit)"{-l,--limit}"[items per page]:number:" "(-s --search)"{-s,--search}"[search text]:text:" "(--sort)--sort"\{created,title\}:" "(--id)--id[item id]:id:" "(--store)--store[store path]:path:" "(--title)--title[new title]:" "(--content)--content[new content]:" "(--url)--url[source url]:" "(-t --tag)"{-t,--tag}"[tag]:tag:" "(--format)--format[json|jsonl]:" "(--completions)--completions[output completions]:shell:(bash zsh fish):" "(--purpose)--purpose[purpose]:" "(--model)--model[model ref]:" "(--dimensions)--dimensions[embedding dimensions]:number:" "(--no-color)--no-color[disable color]" "(--scope)--scope"\{local,global,project\}:" }; _open_knowledge`);
335
+ console.log(`#compdef open-knowledge\n_open_knowledge() { _arguments -C "1: :(add list get update archive restore upsert untag delete export prune dedupe stats paths storage db wiki source ingest reindex search web ask build embeddings providers safety help ls rm edit unarchive knowledge)" "(--json)--json" "(--yes)-y" "(--help)--help" "(--version)--version" "(--desc)--desc" "(--archived)--archived" "(--include-archived)--include-archived" "(--semantic)--semantic" "(--context)--context" "(--generate)--generate" "(--approve-write)--approve-write" "(--file-results)--file-results" "(--full)--full" "(--fake)--fake" "(-p --page)"{-p,--page}"[page number]:number:" "(-l --limit)"{-l,--limit}"[items per page]:number:" "(-s --search)"{-s,--search}"[search text]:text:" "(--sort)--sort"\{created,title\}:" "(--id)--id[item id]:id:" "(--store)--store[store path]:path:" "(--title)--title[new title]:" "(--content)--content[new content]:" "(--url)--url[source url]:" "(-t --tag)"{-t,--tag}"[tag]:tag:" "(--format)--format[json|jsonl]:" "(--completions)--completions[output completions]:shell:(bash zsh fish):" "(--purpose)--purpose[purpose]:" "(--model)--model[model ref]:" "(--dimensions)--dimensions[embedding dimensions]:number:" "(--provider)--provider[provider]:" "(--domain)--domain[domain]:" "(--no-color)--no-color[disable color]" "(--scope)--scope"\{local,global,project\}:" }; _open_knowledge`);
322
336
  } else if (shell === 'fish') {
323
- console.log(`complete -c open-knowledge -f; complete -c open-knowledge -a "add list get update archive restore upsert untag delete export prune dedupe stats paths storage db wiki source ingest reindex search ask build embeddings providers safety help ls rm edit unarchive knowledge"; complete -c open-knowledge -l json; complete -c open-knowledge -l yes -s y; complete -c open-knowledge -l help -s h; complete -c open-knowledge -l version -s v; complete -c open-knowledge -l desc; complete -c open-knowledge -l archived; complete -c open-knowledge -l include-archived; complete -c open-knowledge -l semantic; complete -c open-knowledge -l context; complete -c open-knowledge -l generate; complete -c open-knowledge -l approve-write; complete -c open-knowledge -l fake; complete -c open-knowledge -s p -l page; complete -c open-knowledge -s l -l limit; complete -c open-knowledge -s s -l search; complete -c open-knowledge -l sort; complete -c open-knowledge -l id; complete -c open-knowledge -l store; complete -c open-knowledge -l title; complete -c open-knowledge -l content; complete -c open-knowledge -l url; complete -c open-knowledge -s t -l tag; complete -c open-knowledge -l format; complete -c open-knowledge -l completions; complete -c open-knowledge -l purpose; complete -c open-knowledge -l model; complete -c open-knowledge -l dimensions; complete -c open-knowledge -l no-color; complete -c open-knowledge -l scope -a "local global project"`);
337
+ console.log(`complete -c open-knowledge -f; complete -c open-knowledge -a "add list get update archive restore upsert untag delete export prune dedupe stats paths storage db wiki source ingest reindex search web ask build embeddings providers safety help ls rm edit unarchive knowledge"; complete -c open-knowledge -l json; complete -c open-knowledge -l yes -s y; complete -c open-knowledge -l help -s h; complete -c open-knowledge -l version -s v; complete -c open-knowledge -l desc; complete -c open-knowledge -l archived; complete -c open-knowledge -l include-archived; complete -c open-knowledge -l semantic; complete -c open-knowledge -l context; complete -c open-knowledge -l generate; complete -c open-knowledge -l approve-write; complete -c open-knowledge -l provider; complete -c open-knowledge -l domain; complete -c open-knowledge -l file-results; complete -c open-knowledge -l full; complete -c open-knowledge -l fake; complete -c open-knowledge -s p -l page; complete -c open-knowledge -s l -l limit; complete -c open-knowledge -s s -l search; complete -c open-knowledge -l sort; complete -c open-knowledge -l id; complete -c open-knowledge -l store; complete -c open-knowledge -l title; complete -c open-knowledge -l content; complete -c open-knowledge -l url; complete -c open-knowledge -s t -l tag; complete -c open-knowledge -l format; complete -c open-knowledge -l completions; complete -c open-knowledge -l purpose; complete -c open-knowledge -l model; complete -c open-knowledge -l dimensions; complete -c open-knowledge -l no-color; complete -c open-knowledge -l scope -a "local global project"`);
324
338
  } else {
325
339
  throw new Error("Invalid --completions value. Use 'bash', 'zsh', or 'fish'.");
326
340
  }
@@ -563,13 +577,44 @@ async function run(argv: string[]): Promise<void> {
563
577
  }
564
578
 
565
579
  if (command === 'reindex') {
566
- const action = positional[1] ?? '';
567
- if (action !== 'outbox') throw new Error("Invalid reindex action. Use 'outbox'.");
568
- const input = positional[2];
569
- if (!input) throw new Error('Usage: open-knowledge reindex outbox <file|s3://bucket/key>');
570
- const result = await service.consumeOutbox(input);
571
- output({ ok: true, ...result, message: `Consumed ${result.events_seen} outbox event(s)` }, flags.json);
572
- return;
580
+ const action = positional[1] ?? 'status';
581
+ if (action === 'status') {
582
+ const result = service.reindexHealth({
583
+ modelRef: flags.model,
584
+ dimensions: flags.dimensions,
585
+ fake: flags.fake,
586
+ });
587
+ output({ ok: true, ...result, message: `${result.missing_embeddings} chunk(s) missing embeddings` }, flags.json);
588
+ return;
589
+ }
590
+ if (action === 'enqueue') {
591
+ const result = service.enqueueReindex({
592
+ modelRef: flags.model,
593
+ dimensions: flags.dimensions,
594
+ fake: flags.fake,
595
+ });
596
+ output({ ok: true, ...result, message: `Queued ${result.enqueued} embedding refresh item(s)` }, flags.json);
597
+ return;
598
+ }
599
+ if (action === 'embeddings') {
600
+ const result = await service.refreshEmbeddings({
601
+ full: flags.full,
602
+ limit: flags.limit,
603
+ modelRef: flags.model,
604
+ dimensions: flags.dimensions,
605
+ fake: flags.fake,
606
+ });
607
+ output({ ok: true, ...result, message: `Embedded ${result.indexed.chunks_embedded} chunk(s)` }, flags.json);
608
+ return;
609
+ }
610
+ if (action === 'outbox') {
611
+ const input = positional[2];
612
+ if (!input) throw new Error('Usage: open-knowledge reindex outbox <file|s3://bucket/key>');
613
+ const result = await service.consumeOutbox(input);
614
+ output({ ok: true, ...result, message: `Consumed ${result.events_seen} outbox event(s)` }, flags.json);
615
+ return;
616
+ }
617
+ throw new Error("Invalid reindex action. Use 'status', 'enqueue', 'embeddings', or 'outbox'.");
573
618
  }
574
619
 
575
620
  if (command === 'embeddings') {
@@ -632,6 +677,24 @@ async function run(argv: string[]): Promise<void> {
632
677
  return;
633
678
  }
634
679
 
680
+ if (command === 'web') {
681
+ const action = positional[1] ?? 'search';
682
+ if (action !== 'search') throw new Error("Invalid web action. Use 'search'.");
683
+ const query = positional.slice(2).join(' ');
684
+ if (!query) throw new Error('Usage: open-knowledge web search <query>');
685
+ const result = await service.webSearch({
686
+ query,
687
+ limit: flags.limit,
688
+ modelRef: flags.model,
689
+ provider: flags.provider as AiProviderId | undefined,
690
+ domains: flags.domain,
691
+ fake: flags.fake,
692
+ fileResults: flags.fileResults,
693
+ });
694
+ output({ ok: true, ...result, message: `${result.sources.length} web source(s)` }, flags.json);
695
+ return;
696
+ }
697
+
635
698
  if (command === 'ask' || command === 'build') {
636
699
  const prompt = positional.slice(commandArgOffset).join(' ');
637
700
  if (!prompt) throw new Error('Usage: open-knowledge ask <prompt>');
@@ -1,7 +1,7 @@
1
1
  import { Database } from 'bun:sqlite';
2
2
  import { ensureParentDir } from './workspace';
3
3
 
4
- export const CURRENT_SCHEMA_VERSION = 4;
4
+ export const CURRENT_SCHEMA_VERSION = 5;
5
5
 
6
6
  export interface KnowledgeDbStats {
7
7
  schema_version: number;
@@ -19,6 +19,7 @@ export interface KnowledgeDbStats {
19
19
  storage_objects: number;
20
20
  embeddings: number;
21
21
  vector_entries: number;
22
+ reindex_queue: number;
22
23
  }
23
24
 
24
25
  const MIGRATION_1 = `
@@ -271,6 +272,29 @@ INSERT OR IGNORE INTO schema_versions(version, applied_at)
271
272
  VALUES (4, datetime('now'));
272
273
  `;
273
274
 
275
+ const MIGRATION_5 = `
276
+ CREATE TABLE IF NOT EXISTS reindex_queue (
277
+ id TEXT PRIMARY KEY,
278
+ kind TEXT NOT NULL,
279
+ target_id TEXT NOT NULL,
280
+ source_uri TEXT,
281
+ reason TEXT NOT NULL,
282
+ status TEXT NOT NULL DEFAULT 'pending',
283
+ attempts INTEGER NOT NULL DEFAULT 0,
284
+ metadata_json TEXT NOT NULL DEFAULT '{}',
285
+ created_at TEXT NOT NULL,
286
+ updated_at TEXT NOT NULL,
287
+ UNIQUE(kind, target_id, reason)
288
+ );
289
+
290
+ CREATE INDEX IF NOT EXISTS idx_reindex_queue_status ON reindex_queue(status);
291
+ CREATE INDEX IF NOT EXISTS idx_reindex_queue_kind_target ON reindex_queue(kind, target_id);
292
+ CREATE INDEX IF NOT EXISTS idx_reindex_queue_source_uri ON reindex_queue(source_uri);
293
+
294
+ INSERT OR IGNORE INTO schema_versions(version, applied_at)
295
+ VALUES (5, datetime('now'));
296
+ `;
297
+
274
298
  export function openKnowledgeDb(path: string): Database {
275
299
  ensureParentDir(path);
276
300
  const db = new Database(path);
@@ -286,6 +310,7 @@ export function migrateKnowledgeDb(path: string): { path: string; schema_version
286
310
  if (getSchemaVersion(db) < 2) db.exec(MIGRATION_2);
287
311
  if (getSchemaVersion(db) < 3) db.exec(MIGRATION_3);
288
312
  if (getSchemaVersion(db) < 4) db.exec(MIGRATION_4);
313
+ if (getSchemaVersion(db) < 5) db.exec(MIGRATION_5);
289
314
  return { path, schema_version: getSchemaVersion(db) };
290
315
  } finally {
291
316
  db.close();
@@ -321,6 +346,7 @@ export function getKnowledgeDbStats(path: string): KnowledgeDbStats {
321
346
  storage_objects: count(db, 'storage_objects'),
322
347
  embeddings: count(db, 'chunk_embeddings'),
323
348
  vector_entries: count(db, 'vector_index_entries'),
349
+ reindex_queue: count(db, 'reindex_queue'),
324
350
  };
325
351
  } finally {
326
352
  db.close();
package/src/mcp.js CHANGED
@@ -153,6 +153,50 @@ export function buildServer() {
153
153
  }
154
154
  });
155
155
 
156
+ registerTool(server, 'ok_reindex_status', 'Reindex status', 'Inspect missing embeddings, queued jobs, stale revisions, and vector index health', {
157
+ scope: scopeField,
158
+ model: z.string().optional().describe('Embedding model ref, default openai:text-embedding-3-small'),
159
+ dimensions: z.number().optional().describe('Embedding dimensions for deterministic fake mode'),
160
+ fake: z.boolean().optional().describe('Use deterministic fake embeddings for local tests'),
161
+ }, async ({ scope, model, dimensions, fake }) => {
162
+ const service = createKnowledgeService({ scope });
163
+ try {
164
+ return jsonText({ ok: true, ...service.reindexHealth({ modelRef: model, dimensions, fake }) });
165
+ } catch (error) {
166
+ return errorText(error instanceof Error ? error.message : String(error));
167
+ }
168
+ });
169
+
170
+ registerTool(server, 'ok_reindex_enqueue', 'Enqueue reindex work', 'Queue missing embedding refresh jobs for indexed source chunks', {
171
+ scope: scopeField,
172
+ model: z.string().optional().describe('Embedding model ref, default openai:text-embedding-3-small'),
173
+ dimensions: z.number().optional().describe('Embedding dimensions for deterministic fake mode'),
174
+ fake: z.boolean().optional().describe('Use deterministic fake embeddings for local tests'),
175
+ }, async ({ scope, model, dimensions, fake }) => {
176
+ const service = createKnowledgeService({ scope });
177
+ try {
178
+ return jsonText({ ok: true, ...service.enqueueReindex({ modelRef: model, dimensions, fake }) });
179
+ } catch (error) {
180
+ return errorText(error instanceof Error ? error.message : String(error));
181
+ }
182
+ });
183
+
184
+ registerTool(server, 'ok_reindex_embeddings', 'Refresh embedding index', 'Run incremental or full embedding refresh jobs with run-ledger tracking', {
185
+ scope: scopeField,
186
+ full: z.boolean().optional().describe('Delete and rebuild all embedding/vector rows first'),
187
+ limit: z.number().optional().describe('Maximum chunks to embed'),
188
+ model: z.string().optional().describe('Embedding model ref, default openai:text-embedding-3-small'),
189
+ dimensions: z.number().optional().describe('Embedding dimensions for deterministic fake mode'),
190
+ fake: z.boolean().optional().describe('Use deterministic fake embeddings for local tests'),
191
+ }, async ({ scope, full, limit, model, dimensions, fake }) => {
192
+ const service = createKnowledgeService({ scope });
193
+ try {
194
+ return jsonText({ ok: true, ...await service.refreshEmbeddings({ full, limit, modelRef: model, dimensions, fake }) });
195
+ } catch (error) {
196
+ return errorText(error instanceof Error ? error.message : String(error));
197
+ }
198
+ });
199
+
156
200
  registerTool(server, 'ok_semantic_search', 'Semantic search', 'Search the local vector index and return cited chunks with provenance', {
157
201
  scope: scopeField,
158
202
  query: z.string().describe('Semantic query'),
@@ -222,6 +266,24 @@ export function buildServer() {
222
266
  }
223
267
  });
224
268
 
269
+ registerTool(server, 'ok_web_search', 'Provider web search', 'Run safety-gated provider-native web search and return citations/sources', {
270
+ scope: scopeField,
271
+ query: z.string().describe('Web search query'),
272
+ limit: z.number().optional().describe('Maximum sources'),
273
+ provider: z.enum(['openai', 'anthropic', 'deepseek']).optional().describe('Provider override'),
274
+ model: z.string().optional().describe('Model alias/ref'),
275
+ domains: z.array(z.string()).optional().describe('Allowed domains'),
276
+ fake: z.boolean().optional().describe('Use deterministic fake web results'),
277
+ file_results: z.boolean().optional().describe('File web snippets as web source refs'),
278
+ }, async ({ scope, query, limit, provider, model, domains, fake, file_results }) => {
279
+ const service = createKnowledgeService({ scope });
280
+ try {
281
+ return jsonText({ ok: true, ...await service.webSearch({ query, limit, provider, modelRef: model, domains, fake, fileResults: file_results }) });
282
+ } catch (error) {
283
+ return errorText(error instanceof Error ? error.message : String(error));
284
+ }
285
+ });
286
+
225
287
  registerTool(server, 'ok_add', 'Add a knowledge item', 'Add a new item to the knowledge store', {
226
288
  title: z.string().describe('Item title'),
227
289
  content: z.string().describe('Item content/body'),
package/src/reindex.ts ADDED
@@ -0,0 +1,260 @@
1
+ import { createHash, randomUUID } from 'node:crypto';
2
+ import { indexKnowledgeEmbeddings, resolveEmbeddingModelRef, type EmbeddingRuntimeOptions } from './embeddings';
3
+ import { migrateKnowledgeDb, openKnowledgeDb } from './knowledge-db';
4
+ import { parseModelRef } from './providers';
5
+ import type { KnowledgeConfig } from './workspace';
6
+
7
/**
 * Options shared by the reindex entry points in this module, layered on top of
 * the embedding runtime options.
 */
export interface ReindexRuntimeOptions extends EmbeddingRuntimeOptions {
  /** Path handed to `openKnowledgeDb` / `migrateKnowledgeDb`. */
  dbPath: string;
  /** Workspace configuration consulted when resolving the embedding model ref. */
  config?: KnowledgeConfig;
  /** Clock override; when omitted the functions use `new Date()`. */
  now?: Date;
}
12
+
13
/** Snapshot returned by `reindexHealth`. */
export interface ReindexHealthResult {
  /** Highest `version` in `schema_versions`, or 0 when there is none. */
  schema_version: number;
  /** Row count of `chunks`. */
  chunks: number;
  /** Row count of `vector_index_entries` (all providers/models). */
  vector_entries: number;
  /** Chunks with no vector index entry for the resolved provider/model. */
  missing_embeddings: number;
  /** `reindex_queue` row counts keyed by status. */
  queued: Record<string, number>;
  /** Source revisions whose metadata marks them as needing a reindex or as stale. */
  stale_revisions: number;
}
21
+
22
/** Outcome of `enqueueMissingEmbeddings`. */
export interface ReindexEnqueueResult {
  /** Number of queue rows put into the queue by this call. */
  enqueued: number;
  /** Number of chunks that already had a queue row for the same reason. */
  already_queued: number;
  /** Reason label stored on the queue rows. */
  reason: string;
}
27
+
28
/** Summary returned by `refreshEmbeddingIndex`. */
export interface ReindexEmbeddingsResult {
  /** Identifier of the `runs` row created for this refresh. */
  run_id: string;
  /** True when a full rebuild was requested. */
  full: boolean;
  /** `chunk_embeddings` rows counted before a full rebuild cleared them (0 otherwise). */
  deleted_embeddings: number;
  /** `vector_index_entries` rows counted before a full rebuild cleared them (0 otherwise). */
  deleted_vector_entries: number;
  /** Result of enqueueing the missing embeddings before indexing. */
  queued: ReindexEnqueueResult;
  /** Whatever `indexKnowledgeEmbeddings` resolved to. */
  indexed: Awaited<ReturnType<typeof indexKnowledgeEmbeddings>>;
  /** Pending queue rows flipped to `completed` after indexing. */
  completed_queue_items: number;
}
37
+
38
/** Row shape produced by the missing-embedding query. */
interface MissingChunkRow {
  /** `chunks.id`. */
  chunk_id: string;
  /** `chunks.source_revision_id`; may be null. */
  source_revision_id: string | null;
  /** `sources.uri` reached through LEFT JOINs, so null when no source matches. */
  source_uri: string | null;
}
43
+
44
/**
 * Derives a deterministic identifier from `value`.
 *
 * @param prefix Label placed before the underscore.
 * @param value Text that is hashed with SHA-256.
 * @returns `<prefix>_` followed by the first 20 hex characters of the digest.
 */
function stableId(prefix: string, value: string): string {
  const hasher = createHash('sha256');
  hasher.update(value);
  const shortDigest = hasher.digest('hex').substring(0, 20);
  return [prefix, shortDigest].join('_');
}
47
+
48
/**
 * Counts `reindex_queue` rows per status.
 *
 * @param dbPath Database path passed to `openKnowledgeDb`.
 * @returns An object mapping each status found in the queue to its row count.
 */
function queueCounts(dbPath: string): Record<string, number> {
  const connection = openKnowledgeDb(dbPath);
  try {
    const grouped = connection
      .query<{ status: string; n: number }, []>(
        `SELECT status, COUNT(*) AS n FROM reindex_queue GROUP BY status ORDER BY status`,
      )
      .all();
    const pairs: Array<[string, number]> = [];
    for (const { status, n } of grouped) {
      pairs.push([status, n]);
    }
    return Object.fromEntries(pairs);
  } finally {
    // The connection is opened per call, so always release it.
    connection.close();
  }
}
59
+
60
/**
 * Lists chunks that have no `vector_index_entries` row for the resolved
 * embedding provider/model.
 *
 * @param dbPath Database path passed to `openKnowledgeDb`.
 * @param options Supplies `modelRef` and `config` for resolving the model.
 * @returns Matching chunks ordered by creation time, then ordinal.
 */
function missingEmbeddingRows(dbPath: string, options: ReindexRuntimeOptions): MissingChunkRow[] {
  const resolvedRef = resolveEmbeddingModelRef(options.modelRef, options.config);
  const { provider, model } = parseModelRef(resolvedRef);
  const connection = openKnowledgeDb(dbPath);
  try {
    // The LEFT JOIN on vector_index_entries plus `v.id IS NULL` is an anti-join:
    // it keeps only chunks without an entry for this provider/model pair.
    const statement = connection.query<MissingChunkRow, [string, string]>(
      `SELECT c.id AS chunk_id, c.source_revision_id, s.uri AS source_uri
       FROM chunks c
       LEFT JOIN source_revisions sr ON sr.id = c.source_revision_id
       LEFT JOIN sources s ON s.id = sr.source_id
       LEFT JOIN vector_index_entries v ON v.chunk_id = c.id AND v.provider = ? AND v.model = ?
       WHERE v.id IS NULL
       ORDER BY c.created_at ASC, c.ordinal ASC`,
    );
    return statement.all(provider, model);
  } finally {
    connection.close();
  }
}
78
+
79
/**
 * Reports the state of the local embedding index.
 *
 * Runs migrations first, then gathers the schema version, table counts, the
 * number of chunks missing an embedding for the resolved model, per-status
 * queue counts, and the number of source revisions flagged for reindexing.
 *
 * @param options Database path plus model/config used to resolve the embedding model.
 * @returns The collected health snapshot.
 */
export function reindexHealth(options: ReindexRuntimeOptions): ReindexHealthResult {
  const { dbPath } = options;
  migrateKnowledgeDb(dbPath);
  const connection = openKnowledgeDb(dbPath);
  // Every count query returns a single row with column `n`.
  const countOf = (sql: string): number => connection.query<{ n: number }, []>(sql).get()?.n ?? 0;
  try {
    const versionRow = connection
      .query<{ version: number }, []>('SELECT MAX(version) AS version FROM schema_versions')
      .get();
    const schemaVersion = versionRow?.version ?? 0;
    const chunkTotal = countOf('SELECT COUNT(*) AS n FROM chunks');
    const vectorTotal = countOf('SELECT COUNT(*) AS n FROM vector_index_entries');
    const missingTotal = missingEmbeddingRows(dbPath, options).length;
    // Staleness is detected by substring match on the serialized metadata JSON.
    const staleTotal = countOf(
      `SELECT COUNT(*) AS n FROM source_revisions
       WHERE metadata_json LIKE '%"reindex_required":true%' OR metadata_json LIKE '%"status":"stale"%'`,
    );
    const queued = queueCounts(dbPath);
    return {
      schema_version: schemaVersion,
      chunks: chunkTotal,
      vector_entries: vectorTotal,
      missing_embeddings: missingTotal,
      queued,
      stale_revisions: staleTotal,
    };
  } finally {
    connection.close();
  }
}
103
+
104
/**
 * Queues embedding work for every chunk that lacks a vector index entry for
 * the resolved provider/model.
 *
 * Queue ids are derived from the chunk id and reason, so each (chunk, reason)
 * pair has at most one row. A chunk that is missing its embedding but whose
 * row is already `completed` (for example after a second full rebuild, or
 * after switching embedding models) is put back to `pending` and counted as
 * enqueued; previously such rows were reported as already queued and never
 * became pending again.
 *
 * @param options Database path, model/config for resolving the embedding
 *   model, an optional clock override, and an optional `reason` label
 *   (defaults to `'missing_embedding'`).
 * @returns How many rows were enqueued (inserted or reactivated), how many
 *   were already waiting in the queue, and the reason used.
 */
export function enqueueMissingEmbeddings(options: ReindexRuntimeOptions & { reason?: string }): ReindexEnqueueResult {
  migrateKnowledgeDb(options.dbPath);
  const now = (options.now ?? new Date()).toISOString();
  const reason = options.reason ?? 'missing_embedding';
  const rows = missingEmbeddingRows(options.dbPath, options);
  const db = openKnowledgeDb(options.dbPath);
  let enqueued = 0;
  let alreadyQueued = 0;
  try {
    // Prepared once and reused for every chunk instead of per iteration.
    const findExisting = db.query<{ id: string; status: string }, [string, string, string]>(
      'SELECT id, status FROM reindex_queue WHERE kind = ? AND target_id = ? AND reason = ?',
    );
    // One transaction so the queue is updated all-or-nothing.
    const write = db.transaction(() => {
      for (const row of rows) {
        const existing = findExisting.get('embedding', row.chunk_id, reason);
        if (existing) {
          if (existing.status === 'completed') {
            // The chunk is missing its embedding again, so the finished row is stale.
            // NOTE(review): only 'pending' and 'completed' statuses are visible in this
            // module; rows in any other status are left untouched — confirm that is desired.
            db.run('UPDATE reindex_queue SET status = ?, updated_at = ? WHERE id = ?', ['pending', now, existing.id]);
            enqueued += 1;
          } else {
            alreadyQueued += 1;
          }
          continue;
        }
        const id = stableId('rq', `embedding\u0000${row.chunk_id}\u0000${reason}`);
        db.run(
          `INSERT INTO reindex_queue (id, kind, target_id, source_uri, reason, status, metadata_json, created_at, updated_at)
           VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`,
          [
            id,
            'embedding',
            row.chunk_id,
            row.source_uri,
            reason,
            'pending',
            JSON.stringify({ source_revision_id: row.source_revision_id }),
            now,
            now,
          ],
        );
        enqueued += 1;
      }
    });
    write();
  } finally {
    db.close();
  }
  return { enqueued, already_queued: alreadyQueued, reason };
}
147
+
148
/**
 * Deletes every row from `vector_index_entries` and `chunk_embeddings`.
 *
 * The counts and both deletes run in a single transaction, so a failure
 * part-way through rolls back instead of leaving the index half-cleared, and
 * the reported counts match what was actually removed.
 *
 * @param dbPath Database path passed to `openKnowledgeDb`.
 * @returns The row counts of both tables as they were just before deletion.
 */
function clearEmbeddingIndex(dbPath: string): { embeddings: number; vectorEntries: number } {
  const db = openKnowledgeDb(dbPath);
  try {
    let counts = { embeddings: 0, vectorEntries: 0 };
    const clear = db.transaction(() => {
      const embeddings = db.query<{ n: number }, []>('SELECT COUNT(*) AS n FROM chunk_embeddings').get()?.n ?? 0;
      const vectorEntries = db.query<{ n: number }, []>('SELECT COUNT(*) AS n FROM vector_index_entries').get()?.n ?? 0;
      // Same delete order as before: index entries first, then the embeddings.
      db.run('DELETE FROM vector_index_entries');
      db.run('DELETE FROM chunk_embeddings');
      counts = { embeddings, vectorEntries };
    });
    clear();
    return counts;
  } finally {
    db.close();
  }
}
160
+
161
/**
 * Marks pending embedding queue rows as completed once their chunk has a
 * vector index entry for the resolved provider/model.
 *
 * @param dbPath Database path passed to `openKnowledgeDb`.
 * @param options Supplies `modelRef` and `config` for resolving the model.
 * @param now ISO timestamp written to `updated_at`.
 * @returns The number of queue rows changed by the update.
 */
function completeIndexedQueueItems(dbPath: string, options: ReindexRuntimeOptions, now: string): number {
  const { provider, model } = parseModelRef(resolveEmbeddingModelRef(options.modelRef, options.config));
  const connection = openKnowledgeDb(dbPath);
  try {
    // The correlated EXISTS restricts the update to rows whose target chunk is
    // actually present in the vector index for this provider/model.
    const sql = `UPDATE reindex_queue
       SET status = ?, updated_at = ?
       WHERE kind = ?
       AND status = ?
       AND EXISTS (
       SELECT 1 FROM vector_index_entries v
       WHERE v.chunk_id = reindex_queue.target_id
       AND v.provider = ?
       AND v.model = ?
       )`;
    const bindings = ['completed', now, 'embedding', 'pending', provider, model];
    const { changes } = connection.run(sql, bindings);
    return changes;
  } finally {
    connection.close();
  }
}
184
+
185
/**
 * Best-effort bookkeeping for a refresh that threw: marks the run as failed
 * and appends an error event carrying the message.
 *
 * Any error raised while recording the failure is swallowed on purpose so the
 * caller can rethrow the original error unchanged.
 * NOTE(review): assumes the `runs` schema accepts the status 'failed' — confirm.
 */
function recordEmbeddingRefreshFailure(dbPath: string, runId: string, error: unknown, failedAt: string): void {
  try {
    const db = openKnowledgeDb(dbPath);
    try {
      const message = error instanceof Error ? error.message : String(error);
      db.run(`UPDATE runs SET status = ?, updated_at = ? WHERE id = ?`, ['failed', failedAt, runId]);
      db.run(
        `INSERT INTO run_events (id, run_id, level, event, metadata_json, created_at)
         VALUES (?, ?, ?, ?, ?, ?)`,
        [`evt_${randomUUID()}`, runId, 'error', 'embedding_refresh_failed', JSON.stringify({ error: message }), failedAt],
      );
    } finally {
      db.close();
    }
  } catch {
    // Deliberately ignored: the original failure is what the caller must see.
  }
}

/**
 * Rebuilds or tops up the embedding index and records the work as a run.
 *
 * Steps: run migrations; on a full refresh clear the existing embeddings and
 * vector entries; enqueue every chunk missing an embedding; insert a `runs`
 * row with status `running`; call `indexKnowledgeEmbeddings`; mark the queue
 * rows that are now indexed as completed; finally set the run to `completed`
 * and append an `embedding_refresh_completed` event.
 *
 * If anything after the run row is created throws, the run is marked `failed`
 * (best effort) and the original error is rethrown, so a run is not left in
 * `running` forever. Completion timestamps are taken after indexing unless
 * `options.now` pins the clock.
 *
 * @param options Runtime options; `full` requests a rebuild from scratch and
 *   `limit` is forwarded to `indexKnowledgeEmbeddings`.
 * @returns Run id, deletion counts, queue result, indexing result, and the
 *   number of queue rows completed.
 * @throws Whatever the migration, queueing, indexing, or database calls throw.
 */
export async function refreshEmbeddingIndex(options: ReindexRuntimeOptions & { full?: boolean; limit?: number }): Promise<ReindexEmbeddingsResult> {
  migrateKnowledgeDb(options.dbPath);
  const now = (options.now ?? new Date()).toISOString();
  const runId = `run_${randomUUID()}`;
  const deleted = options.full ? clearEmbeddingIndex(options.dbPath) : { embeddings: 0, vectorEntries: 0 };
  const queued = enqueueMissingEmbeddings({ ...options, reason: options.full ? 'full_embedding_rebuild' : 'missing_embedding' });
  const db = openKnowledgeDb(options.dbPath);
  try {
    db.run(
      `INSERT INTO runs (id, type, prompt, status, provider, model, metadata_json, created_at, updated_at)
       VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`,
      [
        runId,
        'embedding-refresh',
        options.full ? 'full' : 'incremental',
        'running',
        'local',
        resolveEmbeddingModelRef(options.modelRef, options.config),
        JSON.stringify({ full: options.full === true, queued }),
        now,
        now,
      ],
    );
  } finally {
    db.close();
  }

  try {
    const indexed = await indexKnowledgeEmbeddings({
      dbPath: options.dbPath,
      config: options.config,
      env: options.env,
      modelRef: options.modelRef,
      dimensions: options.dimensions,
      fake: options.fake,
      limit: options.limit,
      now: options.now,
    });

    // Indexing may take a while; stamp completion with the current time unless
    // the caller pinned the clock through options.now.
    const finishedAt = (options.now ?? new Date()).toISOString();
    const completedQueueItems = completeIndexedQueueItems(options.dbPath, options, finishedAt);
    const doneDb = openKnowledgeDb(options.dbPath);
    try {
      // Status change and completion event are written together or not at all.
      const finish = doneDb.transaction(() => {
        doneDb.run(
          `UPDATE runs SET status = ?, metadata_json = ?, updated_at = ? WHERE id = ?`,
          [
            'completed',
            JSON.stringify({ full: options.full === true, queued, indexed, completed_queue_items: completedQueueItems }),
            finishedAt,
            runId,
          ],
        );
        doneDb.run(
          `INSERT INTO run_events (id, run_id, level, event, metadata_json, created_at)
           VALUES (?, ?, ?, ?, ?, ?)`,
          [
            `evt_${randomUUID()}`,
            runId,
            'info',
            'embedding_refresh_completed',
            JSON.stringify({ queued, indexed, completed_queue_items: completedQueueItems }),
            finishedAt,
          ],
        );
      });
      finish();
    } finally {
      doneDb.close();
    }

    return {
      run_id: runId,
      full: options.full === true,
      deleted_embeddings: deleted.embeddings,
      deleted_vector_entries: deleted.vectorEntries,
      queued,
      indexed,
      completed_queue_items: completedQueueItems,
    };
  } catch (error) {
    recordEmbeddingRefreshFailure(options.dbPath, runId, error, (options.now ?? new Date()).toISOString());
    throw error;
  }
}