@hasna/knowledge 0.2.18 → 0.2.20
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +27 -9
- package/bin/open-knowledge-mcp.js +292 -15
- package/bin/open-knowledge.js +99 -58
- package/docs/architecture/ai-native-knowledge-base.md +7 -0
- package/docs/architecture/hybrid-semantic-search.md +19 -1
- package/package.json +1 -1
- package/src/cli.ts +46 -12
- package/src/knowledge-db.ts +27 -1
- package/src/mcp.js +44 -0
- package/src/reindex.ts +260 -0
- package/src/service.ts +28 -0
|
@@ -229,6 +229,13 @@ freshness and permission notes, graph evidence, and final rerank scores. The
|
|
|
229
229
|
local SQLite index can later move to pgvector or a managed hosted vector store
|
|
230
230
|
without changing CLI/MCP result shape.
|
|
231
231
|
|
|
232
|
+
Index freshness is explicit. `reindex_queue` tracks missing or stale embedding
|
|
233
|
+
work, `open-knowledge reindex status|enqueue|embeddings` operates the local
|
|
234
|
+
queue, and MCP exposes the same controls through `ok_reindex_status`,
|
|
235
|
+
`ok_reindex_enqueue`, and `ok_reindex_embeddings`. Hosted mode can map the same
|
|
236
|
+
contract to worker queues, S3/object artifact sync, Postgres/pgvector, or a
|
|
237
|
+
managed vector index while preserving the local command shape.
|
|
238
|
+
|
|
232
239
|
## Agent Workflow
|
|
233
240
|
|
|
234
241
|
The target user flow is:
|
|
@@ -35,6 +35,8 @@ Local mode starts with SQLite:
|
|
|
35
35
|
- `vector_index_entries` stores searchable embedding rows with provider/model,
|
|
36
36
|
dimensions, source revision/hash, chunk offsets, status, timestamps, and
|
|
37
37
|
provenance metadata.
|
|
38
|
+
- `reindex_queue` stores idempotent refresh jobs for missing/stale embedding
|
|
39
|
+
work.
|
|
38
40
|
- `wiki_pages`, `wiki_backlinks`, and `citations` provide graph and provenance
|
|
39
41
|
signals.
|
|
40
42
|
- `knowledge_indexes` tracks generated machine-readable shards.
|
|
@@ -50,6 +52,8 @@ The current local command surface is:
|
|
|
50
52
|
open-knowledge search "company wiki policy" --scope project --json
|
|
51
53
|
open-knowledge search "company wiki policy" --scope project --semantic --json
|
|
52
54
|
open-knowledge search "company wiki policy" --scope project --context --json
|
|
55
|
+
open-knowledge reindex status --scope project --json
|
|
56
|
+
open-knowledge reindex embeddings --scope project --fake --json
|
|
53
57
|
open-knowledge embeddings index --scope project --model openai:text-embedding-3-small
|
|
54
58
|
open-knowledge embeddings search "company wiki policy" --scope project --json
|
|
55
59
|
```
|
|
@@ -57,7 +61,8 @@ open-knowledge embeddings search "company wiki policy" --scope project --json
|
|
|
57
61
|
`search` is the structured hybrid layer for agents. `embeddings search` is the
|
|
58
62
|
lower-level vector-only command. MCP exposes the same capability through
|
|
59
63
|
`ok_search`, `knowledge_search`, `ok_embeddings_status`, `ok_embeddings_index`,
|
|
60
|
-
|
|
64
|
+
`ok_semantic_search`, `ok_reindex_status`, `ok_reindex_enqueue`, and
|
|
65
|
+
`ok_reindex_embeddings`. Deterministic `--fake` embeddings exist for tests and
|
|
61
66
|
offline verification only.
|
|
62
67
|
|
|
63
68
|
## Hosted Indexes
|
|
@@ -158,6 +163,10 @@ Reindexing is driven by source revisions:
|
|
|
158
163
|
- Local outbox consumption deletes stale `chunk_embeddings` and
|
|
159
164
|
`vector_index_entries` for deleted revisions, so semantic search cannot return
|
|
160
165
|
removed source chunks.
|
|
166
|
+
- `reindex status` reports missing embeddings, stale revisions, queued work, and
|
|
167
|
+
vector counts; `reindex enqueue` records missing work in `reindex_queue`.
|
|
168
|
+
- `reindex embeddings` performs incremental refreshes, while `--full` clears and
|
|
169
|
+
rebuilds `chunk_embeddings` and `vector_index_entries`.
|
|
161
170
|
- Wiki pages should track the source revisions they cite so lint can flag stale
|
|
162
171
|
pages.
|
|
163
172
|
- Embedding refresh jobs should be idempotent and checkpointed in `runs` and
|
|
@@ -171,3 +180,12 @@ Reindexing is driven by source revisions:
|
|
|
171
180
|
- Permission filters run before model context assembly.
|
|
172
181
|
- Retrieval internals can swap from JSON vectors to pgvector or managed vector
|
|
173
182
|
stores without changing CLI/MCP result contracts.
|
|
183
|
+
|
|
184
|
+
## Evaluation Fixtures
|
|
185
|
+
|
|
186
|
+
`tests/semantic-evals.test.ts` and `tests/fixtures/semantic-eval-fixtures.ts`
|
|
187
|
+
seed no-network corpora for retrieval quality checks. The fixtures cover
|
|
188
|
+
keyword hits, deterministic fake-vector fallback for synonym-style prompts,
|
|
189
|
+
citation correctness, stale revision filtering, non-read-only provenance
|
|
190
|
+
filtering, rerank ordering, generated wiki-page retrieval, missing-source
|
|
191
|
+
answers, and `knowledge <prompt>` context assembly.
|
package/package.json
CHANGED
package/src/cli.ts
CHANGED
|
@@ -57,6 +57,7 @@ interface Flags {
|
|
|
57
57
|
provider?: string;
|
|
58
58
|
domain?: string[];
|
|
59
59
|
fileResults?: boolean;
|
|
60
|
+
full?: boolean;
|
|
60
61
|
noColor?: boolean;
|
|
61
62
|
scope?: string;
|
|
62
63
|
olderThan?: number;
|
|
@@ -117,6 +118,7 @@ function parseArgs(argv: string[]): ParseResult {
|
|
|
117
118
|
case '--provider': flags.provider = argv[i + 1]; i += 1; break;
|
|
118
119
|
case '--domain': flags.domain = [...(flags.domain ?? []), argv[i + 1]]; i += 1; break;
|
|
119
120
|
case '--file-results': flags.fileResults = true; break;
|
|
121
|
+
case '--full': flags.full = true; break;
|
|
120
122
|
case '--fake': flags.fake = true; break;
|
|
121
123
|
case '--no-color': flags.noColor = true; break;
|
|
122
124
|
case '--scope': flags.scope = argv[i + 1]; i += 1; break;
|
|
@@ -194,7 +196,7 @@ Commands:
|
|
|
194
196
|
source resolve <source-ref> Resolve read-only source content and citation evidence
|
|
195
197
|
ingest manifest <file|s3://> Ingest an open-files manifest into knowledge.db
|
|
196
198
|
ingest source <source-ref> Ingest a read-only source ref into knowledge.db
|
|
197
|
-
reindex outbox
|
|
199
|
+
reindex status|enqueue|embeddings|outbox Inspect/refresh search indexes
|
|
198
200
|
search <query> Hybrid search sources, wiki pages, indexes, or context
|
|
199
201
|
web search <query> Provider-native web search with citations
|
|
200
202
|
ask|build <prompt> Build a read-only citation answer/context pack
|
|
@@ -216,6 +218,7 @@ Global Options:
|
|
|
216
218
|
--provider <name> Provider override for web search
|
|
217
219
|
--domain <domain> Restrict provider web search to a domain
|
|
218
220
|
--file-results File web snippets as web source refs
|
|
221
|
+
--full Force full embedding index rebuild
|
|
219
222
|
--fake Use deterministic fake embeddings for local tests
|
|
220
223
|
--scope local|global|project Store scope (default: global ~/.hasna/apps/knowledge/)
|
|
221
224
|
--no-color Disable color output
|
|
@@ -276,7 +279,7 @@ function printCommandHelp(command: string): void {
|
|
|
276
279
|
if (command === 'wiki') { console.log('Usage: open-knowledge wiki init [--scope local|global|project] [--json]'); return; }
|
|
277
280
|
if (command === 'source') { console.log('Usage: open-knowledge source resolve <source-ref> [--purpose knowledge_answer|knowledge_index] [--limit <n>] [--scope local|global|project] [--json]'); return; }
|
|
278
281
|
if (command === 'ingest') { console.log('Usage: open-knowledge ingest manifest <file|s3://bucket/key> | source <source-ref> [--purpose knowledge_index] [--scope local|global|project] [--json]'); return; }
|
|
279
|
-
if (command === 'reindex') { console.log('Usage: open-knowledge reindex outbox
|
|
282
|
+
if (command === 'reindex') { console.log('Usage: open-knowledge reindex status|enqueue|embeddings|outbox [file|s3://bucket/key] [--full] [--fake] [--scope local|global|project] [--json]'); return; }
|
|
280
283
|
if (command === 'search') { console.log('Usage: open-knowledge search <query> [--context] [--semantic] [--model openai:text-embedding-3-small] [--limit <n>] [--dimensions <n>] [--fake] [--scope local|global|project] [--json]'); return; }
|
|
281
284
|
if (command === 'web') { console.log('Usage: open-knowledge web search <query> [--provider openai|anthropic] [--model provider:model] [--domain <domain>] [--file-results] [--fake] [--scope local|global|project] [--json]'); return; }
|
|
282
285
|
if (command === 'ask' || command === 'build' || command === 'knowledge') { console.log('Usage: open-knowledge ask|build <prompt> [--generate] [--semantic] [--model default|provider:model] [--approve-write] [--scope local|global|project] [--json]'); return; }
|
|
@@ -327,11 +330,11 @@ async function run(argv: string[]): Promise<void> {
|
|
|
327
330
|
if (flags.completions) {
|
|
328
331
|
const shell = flags.completions;
|
|
329
332
|
if (shell === 'bash') {
|
|
330
|
-
console.log(`_open_knowledge() { local cur; cur="${"$"}{COMP_WORDS[COMP_CWORD]}"; COMPREPLY=($(compgen -W "add list get update archive restore upsert untag delete export prune dedupe stats paths storage db wiki source ingest reindex search web ask build embeddings providers safety help ls rm edit unarchive knowledge --json --yes --help --version --desc --page --limit --search --sort --id --store --title --content --url --tag --format --completions --purpose --model --dimensions --semantic --context --generate --approve-write --provider --domain --file-results --fake --no-color --scope --archived --include-archived" -- "$cur")); }; complete -F _open_knowledge open-knowledge`);
|
|
333
|
+
console.log(`_open_knowledge() { local cur; cur="${"$"}{COMP_WORDS[COMP_CWORD]}"; COMPREPLY=($(compgen -W "add list get update archive restore upsert untag delete export prune dedupe stats paths storage db wiki source ingest reindex search web ask build embeddings providers safety help ls rm edit unarchive knowledge --json --yes --help --version --desc --page --limit --search --sort --id --store --title --content --url --tag --format --completions --purpose --model --dimensions --semantic --context --generate --approve-write --provider --domain --file-results --full --fake --no-color --scope --archived --include-archived" -- "$cur")); }; complete -F _open_knowledge open-knowledge`);
|
|
331
334
|
} else if (shell === 'zsh') {
|
|
332
|
-
console.log(`#compdef open-knowledge\n_open_knowledge() { _arguments -C "1: :(add list get update archive restore upsert untag delete export prune dedupe stats paths storage db wiki source ingest reindex search web ask build embeddings providers safety help ls rm edit unarchive knowledge)" "(--json)--json" "(--yes)-y" "(--help)--help" "(--version)--version" "(--desc)--desc" "(--archived)--archived" "(--include-archived)--include-archived" "(--semantic)--semantic" "(--context)--context" "(--generate)--generate" "(--approve-write)--approve-write" "(--file-results)--file-results" "(--fake)--fake" "(-p --page)"{-p,--page}"[page number]:number:" "(-l --limit)"{-l,--limit}"[items per page]:number:" "(-s --search)"{-s,--search}"[search text]:text:" "(--sort)--sort"\{created,title\}:" "(--id)--id[item id]:id:" "(--store)--store[store path]:path:" "(--title)--title[new title]:" "(--content)--content[new content]:" "(--url)--url[source url]:" "(-t --tag)"{-t,--tag}"[tag]:tag:" "(--format)--format[json|jsonl]:" "(--completions)--completions[output completions]:shell:(bash zsh fish):" "(--purpose)--purpose[purpose]:" "(--model)--model[model ref]:" "(--dimensions)--dimensions[embedding dimensions]:number:" "(--provider)--provider[provider]:" "(--domain)--domain[domain]:" "(--no-color)--no-color[disable color]" "(--scope)--scope"\{local,global,project\}:" }; _open_knowledge`);
|
|
335
|
+
console.log(`#compdef open-knowledge\n_open_knowledge() { _arguments -C "1: :(add list get update archive restore upsert untag delete export prune dedupe stats paths storage db wiki source ingest reindex search web ask build embeddings providers safety help ls rm edit unarchive knowledge)" "(--json)--json" "(--yes)-y" "(--help)--help" "(--version)--version" "(--desc)--desc" "(--archived)--archived" "(--include-archived)--include-archived" "(--semantic)--semantic" "(--context)--context" "(--generate)--generate" "(--approve-write)--approve-write" "(--file-results)--file-results" "(--full)--full" "(--fake)--fake" "(-p --page)"{-p,--page}"[page number]:number:" "(-l --limit)"{-l,--limit}"[items per page]:number:" "(-s --search)"{-s,--search}"[search text]:text:" "(--sort)--sort"\{created,title\}:" "(--id)--id[item id]:id:" "(--store)--store[store path]:path:" "(--title)--title[new title]:" "(--content)--content[new content]:" "(--url)--url[source url]:" "(-t --tag)"{-t,--tag}"[tag]:tag:" "(--format)--format[json|jsonl]:" "(--completions)--completions[output completions]:shell:(bash zsh fish):" "(--purpose)--purpose[purpose]:" "(--model)--model[model ref]:" "(--dimensions)--dimensions[embedding dimensions]:number:" "(--provider)--provider[provider]:" "(--domain)--domain[domain]:" "(--no-color)--no-color[disable color]" "(--scope)--scope"\{local,global,project\}:" }; _open_knowledge`);
|
|
333
336
|
} else if (shell === 'fish') {
|
|
334
|
-
console.log(`complete -c open-knowledge -f; complete -c open-knowledge -a "add list get update archive restore upsert untag delete export prune dedupe stats paths storage db wiki source ingest reindex search web ask build embeddings providers safety help ls rm edit unarchive knowledge"; complete -c open-knowledge -l json; complete -c open-knowledge -l yes -s y; complete -c open-knowledge -l help -s h; complete -c open-knowledge -l version -s v; complete -c open-knowledge -l desc; complete -c open-knowledge -l archived; complete -c open-knowledge -l include-archived; complete -c open-knowledge -l semantic; complete -c open-knowledge -l context; complete -c open-knowledge -l generate; complete -c open-knowledge -l approve-write; complete -c open-knowledge -l provider; complete -c open-knowledge -l domain; complete -c open-knowledge -l file-results; complete -c open-knowledge -l fake; complete -c open-knowledge -s p -l page; complete -c open-knowledge -s l -l limit; complete -c open-knowledge -s s -l search; complete -c open-knowledge -l sort; complete -c open-knowledge -l id; complete -c open-knowledge -l store; complete -c open-knowledge -l title; complete -c open-knowledge -l content; complete -c open-knowledge -l url; complete -c open-knowledge -s t -l tag; complete -c open-knowledge -l format; complete -c open-knowledge -l completions; complete -c open-knowledge -l purpose; complete -c open-knowledge -l model; complete -c open-knowledge -l dimensions; complete -c open-knowledge -l no-color; complete -c open-knowledge -l scope -a "local global project"`);
|
|
337
|
+
console.log(`complete -c open-knowledge -f; complete -c open-knowledge -a "add list get update archive restore upsert untag delete export prune dedupe stats paths storage db wiki source ingest reindex search web ask build embeddings providers safety help ls rm edit unarchive knowledge"; complete -c open-knowledge -l json; complete -c open-knowledge -l yes -s y; complete -c open-knowledge -l help -s h; complete -c open-knowledge -l version -s v; complete -c open-knowledge -l desc; complete -c open-knowledge -l archived; complete -c open-knowledge -l include-archived; complete -c open-knowledge -l semantic; complete -c open-knowledge -l context; complete -c open-knowledge -l generate; complete -c open-knowledge -l approve-write; complete -c open-knowledge -l provider; complete -c open-knowledge -l domain; complete -c open-knowledge -l file-results; complete -c open-knowledge -l full; complete -c open-knowledge -l fake; complete -c open-knowledge -s p -l page; complete -c open-knowledge -s l -l limit; complete -c open-knowledge -s s -l search; complete -c open-knowledge -l sort; complete -c open-knowledge -l id; complete -c open-knowledge -l store; complete -c open-knowledge -l title; complete -c open-knowledge -l content; complete -c open-knowledge -l url; complete -c open-knowledge -s t -l tag; complete -c open-knowledge -l format; complete -c open-knowledge -l completions; complete -c open-knowledge -l purpose; complete -c open-knowledge -l model; complete -c open-knowledge -l dimensions; complete -c open-knowledge -l no-color; complete -c open-knowledge -l scope -a "local global project"`);
|
|
335
338
|
} else {
|
|
336
339
|
throw new Error("Invalid --completions value. Use 'bash', 'zsh', or 'fish'.");
|
|
337
340
|
}
|
|
@@ -574,13 +577,44 @@ async function run(argv: string[]): Promise<void> {
|
|
|
574
577
|
}
|
|
575
578
|
|
|
576
579
|
if (command === 'reindex') {
|
|
577
|
-
const action = positional[1] ?? '';
|
|
578
|
-
if (action
|
|
579
|
-
|
|
580
|
-
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
580
|
+
const action = positional[1] ?? 'status';
|
|
581
|
+
if (action === 'status') {
|
|
582
|
+
const result = service.reindexHealth({
|
|
583
|
+
modelRef: flags.model,
|
|
584
|
+
dimensions: flags.dimensions,
|
|
585
|
+
fake: flags.fake,
|
|
586
|
+
});
|
|
587
|
+
output({ ok: true, ...result, message: `${result.missing_embeddings} chunk(s) missing embeddings` }, flags.json);
|
|
588
|
+
return;
|
|
589
|
+
}
|
|
590
|
+
if (action === 'enqueue') {
|
|
591
|
+
const result = service.enqueueReindex({
|
|
592
|
+
modelRef: flags.model,
|
|
593
|
+
dimensions: flags.dimensions,
|
|
594
|
+
fake: flags.fake,
|
|
595
|
+
});
|
|
596
|
+
output({ ok: true, ...result, message: `Queued ${result.enqueued} embedding refresh item(s)` }, flags.json);
|
|
597
|
+
return;
|
|
598
|
+
}
|
|
599
|
+
if (action === 'embeddings') {
|
|
600
|
+
const result = await service.refreshEmbeddings({
|
|
601
|
+
full: flags.full,
|
|
602
|
+
limit: flags.limit,
|
|
603
|
+
modelRef: flags.model,
|
|
604
|
+
dimensions: flags.dimensions,
|
|
605
|
+
fake: flags.fake,
|
|
606
|
+
});
|
|
607
|
+
output({ ok: true, ...result, message: `Embedded ${result.indexed.chunks_embedded} chunk(s)` }, flags.json);
|
|
608
|
+
return;
|
|
609
|
+
}
|
|
610
|
+
if (action === 'outbox') {
|
|
611
|
+
const input = positional[2];
|
|
612
|
+
if (!input) throw new Error('Usage: open-knowledge reindex outbox <file|s3://bucket/key>');
|
|
613
|
+
const result = await service.consumeOutbox(input);
|
|
614
|
+
output({ ok: true, ...result, message: `Consumed ${result.events_seen} outbox event(s)` }, flags.json);
|
|
615
|
+
return;
|
|
616
|
+
}
|
|
617
|
+
throw new Error("Invalid reindex action. Use 'status', 'enqueue', 'embeddings', or 'outbox'.");
|
|
584
618
|
}
|
|
585
619
|
|
|
586
620
|
if (command === 'embeddings') {
|
package/src/knowledge-db.ts
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import { Database } from 'bun:sqlite';
|
|
2
2
|
import { ensureParentDir } from './workspace';
|
|
3
3
|
|
|
4
|
-
export const CURRENT_SCHEMA_VERSION = 4;
|
|
4
|
+
export const CURRENT_SCHEMA_VERSION = 5;
|
|
5
5
|
|
|
6
6
|
export interface KnowledgeDbStats {
|
|
7
7
|
schema_version: number;
|
|
@@ -19,6 +19,7 @@ export interface KnowledgeDbStats {
|
|
|
19
19
|
storage_objects: number;
|
|
20
20
|
embeddings: number;
|
|
21
21
|
vector_entries: number;
|
|
22
|
+
reindex_queue: number;
|
|
22
23
|
}
|
|
23
24
|
|
|
24
25
|
const MIGRATION_1 = `
|
|
@@ -271,6 +272,29 @@ INSERT OR IGNORE INTO schema_versions(version, applied_at)
|
|
|
271
272
|
VALUES (4, datetime('now'));
|
|
272
273
|
`;
|
|
273
274
|
|
|
275
|
+
const MIGRATION_5 = `
|
|
276
|
+
CREATE TABLE IF NOT EXISTS reindex_queue (
|
|
277
|
+
id TEXT PRIMARY KEY,
|
|
278
|
+
kind TEXT NOT NULL,
|
|
279
|
+
target_id TEXT NOT NULL,
|
|
280
|
+
source_uri TEXT,
|
|
281
|
+
reason TEXT NOT NULL,
|
|
282
|
+
status TEXT NOT NULL DEFAULT 'pending',
|
|
283
|
+
attempts INTEGER NOT NULL DEFAULT 0,
|
|
284
|
+
metadata_json TEXT NOT NULL DEFAULT '{}',
|
|
285
|
+
created_at TEXT NOT NULL,
|
|
286
|
+
updated_at TEXT NOT NULL,
|
|
287
|
+
UNIQUE(kind, target_id, reason)
|
|
288
|
+
);
|
|
289
|
+
|
|
290
|
+
CREATE INDEX IF NOT EXISTS idx_reindex_queue_status ON reindex_queue(status);
|
|
291
|
+
CREATE INDEX IF NOT EXISTS idx_reindex_queue_kind_target ON reindex_queue(kind, target_id);
|
|
292
|
+
CREATE INDEX IF NOT EXISTS idx_reindex_queue_source_uri ON reindex_queue(source_uri);
|
|
293
|
+
|
|
294
|
+
INSERT OR IGNORE INTO schema_versions(version, applied_at)
|
|
295
|
+
VALUES (5, datetime('now'));
|
|
296
|
+
`;
|
|
297
|
+
|
|
274
298
|
export function openKnowledgeDb(path: string): Database {
|
|
275
299
|
ensureParentDir(path);
|
|
276
300
|
const db = new Database(path);
|
|
@@ -286,6 +310,7 @@ export function migrateKnowledgeDb(path: string): { path: string; schema_version
|
|
|
286
310
|
if (getSchemaVersion(db) < 2) db.exec(MIGRATION_2);
|
|
287
311
|
if (getSchemaVersion(db) < 3) db.exec(MIGRATION_3);
|
|
288
312
|
if (getSchemaVersion(db) < 4) db.exec(MIGRATION_4);
|
|
313
|
+
if (getSchemaVersion(db) < 5) db.exec(MIGRATION_5);
|
|
289
314
|
return { path, schema_version: getSchemaVersion(db) };
|
|
290
315
|
} finally {
|
|
291
316
|
db.close();
|
|
@@ -321,6 +346,7 @@ export function getKnowledgeDbStats(path: string): KnowledgeDbStats {
|
|
|
321
346
|
storage_objects: count(db, 'storage_objects'),
|
|
322
347
|
embeddings: count(db, 'chunk_embeddings'),
|
|
323
348
|
vector_entries: count(db, 'vector_index_entries'),
|
|
349
|
+
reindex_queue: count(db, 'reindex_queue'),
|
|
324
350
|
};
|
|
325
351
|
} finally {
|
|
326
352
|
db.close();
|
package/src/mcp.js
CHANGED
|
@@ -153,6 +153,50 @@ export function buildServer() {
|
|
|
153
153
|
}
|
|
154
154
|
});
|
|
155
155
|
|
|
156
|
+
registerTool(server, 'ok_reindex_status', 'Reindex status', 'Inspect missing embeddings, queued jobs, stale revisions, and vector index health', {
|
|
157
|
+
scope: scopeField,
|
|
158
|
+
model: z.string().optional().describe('Embedding model ref, default openai:text-embedding-3-small'),
|
|
159
|
+
dimensions: z.number().optional().describe('Embedding dimensions for deterministic fake mode'),
|
|
160
|
+
fake: z.boolean().optional().describe('Use deterministic fake embeddings for local tests'),
|
|
161
|
+
}, async ({ scope, model, dimensions, fake }) => {
|
|
162
|
+
const service = createKnowledgeService({ scope });
|
|
163
|
+
try {
|
|
164
|
+
return jsonText({ ok: true, ...service.reindexHealth({ modelRef: model, dimensions, fake }) });
|
|
165
|
+
} catch (error) {
|
|
166
|
+
return errorText(error instanceof Error ? error.message : String(error));
|
|
167
|
+
}
|
|
168
|
+
});
|
|
169
|
+
|
|
170
|
+
registerTool(server, 'ok_reindex_enqueue', 'Enqueue reindex work', 'Queue missing embedding refresh jobs for indexed source chunks', {
|
|
171
|
+
scope: scopeField,
|
|
172
|
+
model: z.string().optional().describe('Embedding model ref, default openai:text-embedding-3-small'),
|
|
173
|
+
dimensions: z.number().optional().describe('Embedding dimensions for deterministic fake mode'),
|
|
174
|
+
fake: z.boolean().optional().describe('Use deterministic fake embeddings for local tests'),
|
|
175
|
+
}, async ({ scope, model, dimensions, fake }) => {
|
|
176
|
+
const service = createKnowledgeService({ scope });
|
|
177
|
+
try {
|
|
178
|
+
return jsonText({ ok: true, ...service.enqueueReindex({ modelRef: model, dimensions, fake }) });
|
|
179
|
+
} catch (error) {
|
|
180
|
+
return errorText(error instanceof Error ? error.message : String(error));
|
|
181
|
+
}
|
|
182
|
+
});
|
|
183
|
+
|
|
184
|
+
registerTool(server, 'ok_reindex_embeddings', 'Refresh embedding index', 'Run incremental or full embedding refresh jobs with run-ledger tracking', {
|
|
185
|
+
scope: scopeField,
|
|
186
|
+
full: z.boolean().optional().describe('Delete and rebuild all embedding/vector rows first'),
|
|
187
|
+
limit: z.number().optional().describe('Maximum chunks to embed'),
|
|
188
|
+
model: z.string().optional().describe('Embedding model ref, default openai:text-embedding-3-small'),
|
|
189
|
+
dimensions: z.number().optional().describe('Embedding dimensions for deterministic fake mode'),
|
|
190
|
+
fake: z.boolean().optional().describe('Use deterministic fake embeddings for local tests'),
|
|
191
|
+
}, async ({ scope, full, limit, model, dimensions, fake }) => {
|
|
192
|
+
const service = createKnowledgeService({ scope });
|
|
193
|
+
try {
|
|
194
|
+
return jsonText({ ok: true, ...await service.refreshEmbeddings({ full, limit, modelRef: model, dimensions, fake }) });
|
|
195
|
+
} catch (error) {
|
|
196
|
+
return errorText(error instanceof Error ? error.message : String(error));
|
|
197
|
+
}
|
|
198
|
+
});
|
|
199
|
+
|
|
156
200
|
registerTool(server, 'ok_semantic_search', 'Semantic search', 'Search the local vector index and return cited chunks with provenance', {
|
|
157
201
|
scope: scopeField,
|
|
158
202
|
query: z.string().describe('Semantic query'),
|
package/src/reindex.ts
ADDED
|
@@ -0,0 +1,260 @@
|
|
|
1
|
+
import { createHash, randomUUID } from 'node:crypto';
|
|
2
|
+
import { indexKnowledgeEmbeddings, resolveEmbeddingModelRef, type EmbeddingRuntimeOptions } from './embeddings';
|
|
3
|
+
import { migrateKnowledgeDb, openKnowledgeDb } from './knowledge-db';
|
|
4
|
+
import { parseModelRef } from './providers';
|
|
5
|
+
import type { KnowledgeConfig } from './workspace';
|
|
6
|
+
|
|
7
|
+
export interface ReindexRuntimeOptions extends EmbeddingRuntimeOptions {
|
|
8
|
+
dbPath: string;
|
|
9
|
+
config?: KnowledgeConfig;
|
|
10
|
+
now?: Date;
|
|
11
|
+
}
|
|
12
|
+
|
|
13
|
+
export interface ReindexHealthResult {
|
|
14
|
+
schema_version: number;
|
|
15
|
+
chunks: number;
|
|
16
|
+
vector_entries: number;
|
|
17
|
+
missing_embeddings: number;
|
|
18
|
+
queued: Record<string, number>;
|
|
19
|
+
stale_revisions: number;
|
|
20
|
+
}
|
|
21
|
+
|
|
22
|
+
export interface ReindexEnqueueResult {
|
|
23
|
+
enqueued: number;
|
|
24
|
+
already_queued: number;
|
|
25
|
+
reason: string;
|
|
26
|
+
}
|
|
27
|
+
|
|
28
|
+
export interface ReindexEmbeddingsResult {
|
|
29
|
+
run_id: string;
|
|
30
|
+
full: boolean;
|
|
31
|
+
deleted_embeddings: number;
|
|
32
|
+
deleted_vector_entries: number;
|
|
33
|
+
queued: ReindexEnqueueResult;
|
|
34
|
+
indexed: Awaited<ReturnType<typeof indexKnowledgeEmbeddings>>;
|
|
35
|
+
completed_queue_items: number;
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
interface MissingChunkRow {
|
|
39
|
+
chunk_id: string;
|
|
40
|
+
source_revision_id: string | null;
|
|
41
|
+
source_uri: string | null;
|
|
42
|
+
}
|
|
43
|
+
|
|
44
|
+
function stableId(prefix: string, value: string): string {
|
|
45
|
+
return `${prefix}_${createHash('sha256').update(value).digest('hex').slice(0, 20)}`;
|
|
46
|
+
}
|
|
47
|
+
|
|
48
|
+
function queueCounts(dbPath: string): Record<string, number> {
|
|
49
|
+
const db = openKnowledgeDb(dbPath);
|
|
50
|
+
try {
|
|
51
|
+
const rows = db.query<{ status: string; n: number }, []>(
|
|
52
|
+
`SELECT status, COUNT(*) AS n FROM reindex_queue GROUP BY status ORDER BY status`,
|
|
53
|
+
).all();
|
|
54
|
+
return Object.fromEntries(rows.map((row) => [row.status, row.n]));
|
|
55
|
+
} finally {
|
|
56
|
+
db.close();
|
|
57
|
+
}
|
|
58
|
+
}
|
|
59
|
+
|
|
60
|
+
function missingEmbeddingRows(dbPath: string, options: ReindexRuntimeOptions): MissingChunkRow[] {
|
|
61
|
+
const modelRef = resolveEmbeddingModelRef(options.modelRef, options.config);
|
|
62
|
+
const parsed = parseModelRef(modelRef);
|
|
63
|
+
const db = openKnowledgeDb(dbPath);
|
|
64
|
+
try {
|
|
65
|
+
return db.query<MissingChunkRow, [string, string]>(
|
|
66
|
+
`SELECT c.id AS chunk_id, c.source_revision_id, s.uri AS source_uri
|
|
67
|
+
FROM chunks c
|
|
68
|
+
LEFT JOIN source_revisions sr ON sr.id = c.source_revision_id
|
|
69
|
+
LEFT JOIN sources s ON s.id = sr.source_id
|
|
70
|
+
LEFT JOIN vector_index_entries v ON v.chunk_id = c.id AND v.provider = ? AND v.model = ?
|
|
71
|
+
WHERE v.id IS NULL
|
|
72
|
+
ORDER BY c.created_at ASC, c.ordinal ASC`,
|
|
73
|
+
).all(parsed.provider, parsed.model);
|
|
74
|
+
} finally {
|
|
75
|
+
db.close();
|
|
76
|
+
}
|
|
77
|
+
}
|
|
78
|
+
|
|
79
|
+
export function reindexHealth(options: ReindexRuntimeOptions): ReindexHealthResult {
|
|
80
|
+
migrateKnowledgeDb(options.dbPath);
|
|
81
|
+
const db = openKnowledgeDb(options.dbPath);
|
|
82
|
+
try {
|
|
83
|
+
const version = db.query<{ version: number }, []>('SELECT MAX(version) AS version FROM schema_versions').get()?.version ?? 0;
|
|
84
|
+
const chunks = db.query<{ n: number }, []>('SELECT COUNT(*) AS n FROM chunks').get()?.n ?? 0;
|
|
85
|
+
const vectorEntries = db.query<{ n: number }, []>('SELECT COUNT(*) AS n FROM vector_index_entries').get()?.n ?? 0;
|
|
86
|
+
const missing = missingEmbeddingRows(options.dbPath, options).length;
|
|
87
|
+
const stale = db.query<{ n: number }, []>(
|
|
88
|
+
`SELECT COUNT(*) AS n FROM source_revisions
|
|
89
|
+
WHERE metadata_json LIKE '%"reindex_required":true%' OR metadata_json LIKE '%"status":"stale"%'`,
|
|
90
|
+
).get()?.n ?? 0;
|
|
91
|
+
return {
|
|
92
|
+
schema_version: version,
|
|
93
|
+
chunks,
|
|
94
|
+
vector_entries: vectorEntries,
|
|
95
|
+
missing_embeddings: missing,
|
|
96
|
+
queued: queueCounts(options.dbPath),
|
|
97
|
+
stale_revisions: stale,
|
|
98
|
+
};
|
|
99
|
+
} finally {
|
|
100
|
+
db.close();
|
|
101
|
+
}
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
export function enqueueMissingEmbeddings(options: ReindexRuntimeOptions & { reason?: string }): ReindexEnqueueResult {
|
|
105
|
+
migrateKnowledgeDb(options.dbPath);
|
|
106
|
+
const now = (options.now ?? new Date()).toISOString();
|
|
107
|
+
const reason = options.reason ?? 'missing_embedding';
|
|
108
|
+
const rows = missingEmbeddingRows(options.dbPath, options);
|
|
109
|
+
const db = openKnowledgeDb(options.dbPath);
|
|
110
|
+
let enqueued = 0;
|
|
111
|
+
let alreadyQueued = 0;
|
|
112
|
+
try {
|
|
113
|
+
const write = db.transaction(() => {
|
|
114
|
+
for (const row of rows) {
|
|
115
|
+
const id = stableId('rq', `embedding\u0000${row.chunk_id}\u0000${reason}`);
|
|
116
|
+
const before = db.query<{ id: string }, [string, string, string]>(
|
|
117
|
+
'SELECT id FROM reindex_queue WHERE kind = ? AND target_id = ? AND reason = ?',
|
|
118
|
+
).get('embedding', row.chunk_id, reason);
|
|
119
|
+
if (before) {
|
|
120
|
+
alreadyQueued += 1;
|
|
121
|
+
continue;
|
|
122
|
+
}
|
|
123
|
+
db.run(
|
|
124
|
+
`INSERT INTO reindex_queue (id, kind, target_id, source_uri, reason, status, metadata_json, created_at, updated_at)
|
|
125
|
+
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`,
|
|
126
|
+
[
|
|
127
|
+
id,
|
|
128
|
+
'embedding',
|
|
129
|
+
row.chunk_id,
|
|
130
|
+
row.source_uri,
|
|
131
|
+
reason,
|
|
132
|
+
'pending',
|
|
133
|
+
JSON.stringify({ source_revision_id: row.source_revision_id }),
|
|
134
|
+
now,
|
|
135
|
+
now,
|
|
136
|
+
],
|
|
137
|
+
);
|
|
138
|
+
enqueued += 1;
|
|
139
|
+
}
|
|
140
|
+
});
|
|
141
|
+
write();
|
|
142
|
+
} finally {
|
|
143
|
+
db.close();
|
|
144
|
+
}
|
|
145
|
+
return { enqueued, already_queued: alreadyQueued, reason };
|
|
146
|
+
}
|
|
147
|
+
|
|
148
|
+
function clearEmbeddingIndex(dbPath: string): { embeddings: number; vectorEntries: number } {
|
|
149
|
+
const db = openKnowledgeDb(dbPath);
|
|
150
|
+
try {
|
|
151
|
+
const embeddings = db.query<{ n: number }, []>('SELECT COUNT(*) AS n FROM chunk_embeddings').get()?.n ?? 0;
|
|
152
|
+
const vectorEntries = db.query<{ n: number }, []>('SELECT COUNT(*) AS n FROM vector_index_entries').get()?.n ?? 0;
|
|
153
|
+
db.run('DELETE FROM vector_index_entries');
|
|
154
|
+
db.run('DELETE FROM chunk_embeddings');
|
|
155
|
+
return { embeddings, vectorEntries };
|
|
156
|
+
} finally {
|
|
157
|
+
db.close();
|
|
158
|
+
}
|
|
159
|
+
}
|
|
160
|
+
|
|
161
|
+
/**
 * Flips pending `embedding` queue rows to `completed` when a
 * `vector_index_entries` row exists for the same chunk under the resolved
 * embedding provider and model.
 *
 * @param dbPath - Path of the knowledge database to open.
 * @param options - Runtime options; `modelRef` and `config` select the model.
 * @param now - ISO timestamp written to `updated_at` on every completed row.
 * @returns The number of queue rows that were updated.
 */
function completeIndexedQueueItems(dbPath: string, options: ReindexRuntimeOptions, now: string): number {
  // Resolve the model before opening the database so a bad model reference
  // fails without holding a handle.
  const { provider, model } = parseModelRef(resolveEmbeddingModelRef(options.modelRef, options.config));
  const markCompletedSql = `UPDATE reindex_queue
       SET status = ?, updated_at = ?
       WHERE kind = ?
         AND status = ?
         AND EXISTS (
           SELECT 1 FROM vector_index_entries v
           WHERE v.chunk_id = reindex_queue.target_id
             AND v.provider = ?
             AND v.model = ?
         )`;

  const handle = openKnowledgeDb(dbPath);
  try {
    const { changes } = handle.run(markCompletedSql, ['completed', now, 'embedding', 'pending', provider, model]);
    return changes;
  } finally {
    handle.close();
  }
}
|
|
184
|
+
|
|
185
|
+
/**
 * Writes the terminal state of an embedding-refresh run: updates the `runs`
 * row and appends one `run_events` row describing the outcome.
 */
function finalizeEmbeddingRefreshRun(
  dbPath: string,
  outcome: {
    runId: string;
    status: string;
    level: string;
    event: string;
    runMetadata: Record<string, unknown>;
    eventMetadata: Record<string, unknown>;
    at: string;
  },
): void {
  const db = openKnowledgeDb(dbPath);
  try {
    db.run(
      `UPDATE runs SET status = ?, metadata_json = ?, updated_at = ? WHERE id = ?`,
      [outcome.status, JSON.stringify(outcome.runMetadata), outcome.at, outcome.runId],
    );
    db.run(
      `INSERT INTO run_events (id, run_id, level, event, metadata_json, created_at)
       VALUES (?, ?, ?, ?, ?, ?)`,
      [
        `evt_${randomUUID()}`,
        outcome.runId,
        outcome.level,
        outcome.event,
        JSON.stringify(outcome.eventMetadata),
        outcome.at,
      ],
    );
  } finally {
    db.close();
  }
}

/**
 * Runs one embedding refresh: migrates the database, optionally clears the
 * existing index (`full`), enqueues embedding work, records a `runs` row,
 * indexes embeddings, completes the matching queue items, and records the
 * outcome of the run.
 *
 * If indexing or queue completion throws, the run is marked as failed on a
 * best-effort basis (instead of being left in `running` forever) and the
 * original error is rethrown unchanged.
 *
 * @param options - Runtime options. `full` clears the index before
 *   re-enqueueing; `limit` is forwarded to the indexer; `now` is an injectable
 *   clock used for every timestamp written by this function.
 * @returns A summary of the run: its id, deleted row counts, the enqueue
 *   result, the indexer result, and the number of completed queue items.
 * @throws Whatever the indexer or queue completion throws.
 */
export async function refreshEmbeddingIndex(options: ReindexRuntimeOptions & { full?: boolean; limit?: number }): Promise<ReindexEmbeddingsResult> {
  migrateKnowledgeDb(options.dbPath);
  // With an injected clock every timestamp is identical (deterministic);
  // otherwise each call reflects the real wall-clock time.
  const timestamp = (): string => (options.now ?? new Date()).toISOString();
  const now = timestamp();
  const full = options.full === true;
  const runId = `run_${randomUUID()}`;
  const deleted = options.full ? clearEmbeddingIndex(options.dbPath) : { embeddings: 0, vectorEntries: 0 };
  const queued = enqueueMissingEmbeddings({ ...options, reason: options.full ? 'full_embedding_rebuild' : 'missing_embedding' });

  const db = openKnowledgeDb(options.dbPath);
  try {
    db.run(
      `INSERT INTO runs (id, type, prompt, status, provider, model, metadata_json, created_at, updated_at)
       VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`,
      [
        runId,
        'embedding-refresh',
        options.full ? 'full' : 'incremental',
        'running',
        'local',
        resolveEmbeddingModelRef(options.modelRef, options.config),
        JSON.stringify({ full, queued }),
        now,
        now,
      ],
    );
  } finally {
    db.close();
  }

  let indexed: Awaited<ReturnType<typeof indexKnowledgeEmbeddings>>;
  let completedQueueItems: number;
  let finishedAt: string;
  try {
    indexed = await indexKnowledgeEmbeddings({
      dbPath: options.dbPath,
      config: options.config,
      env: options.env,
      modelRef: options.modelRef,
      dimensions: options.dimensions,
      fake: options.fake,
      limit: options.limit,
      now: options.now,
    });
    // Stamp completion after indexing so `updated_at` differs from
    // `created_at` when no clock is injected.
    finishedAt = timestamp();
    completedQueueItems = completeIndexedQueueItems(options.dbPath, options, finishedAt);
  } catch (error: unknown) {
    // NOTE(review): 'failed' / 'error' / 'embedding_refresh_failed' are assumed
    // to be acceptable values for runs.status, run_events.level and
    // run_events.event — confirm against the schema.
    const message = error instanceof Error ? error.message : String(error);
    try {
      finalizeEmbeddingRefreshRun(options.dbPath, {
        runId,
        status: 'failed',
        level: 'error',
        event: 'embedding_refresh_failed',
        runMetadata: { full, queued, error: message },
        eventMetadata: { queued, error: message },
        at: timestamp(),
      });
    } catch {
      // Recording the failure is best-effort; the indexing error below is the
      // one the caller needs to see.
    }
    throw error;
  }

  finalizeEmbeddingRefreshRun(options.dbPath, {
    runId,
    status: 'completed',
    level: 'info',
    event: 'embedding_refresh_completed',
    runMetadata: { full, queued, indexed, completed_queue_items: completedQueueItems },
    eventMetadata: { queued, indexed, completed_queue_items: completedQueueItems },
    at: finishedAt,
  });

  return {
    run_id: runId,
    full,
    deleted_embeddings: deleted.embeddings,
    deleted_vector_entries: deleted.vectorEntries,
    queued,
    indexed,
    completed_queue_items: completedQueueItems,
  };
}
|
package/src/service.ts
CHANGED
|
@@ -13,6 +13,7 @@ import { ingestOpenFilesManifest } from './manifest-ingest';
|
|
|
13
13
|
import { ingestSourceRef } from './source-ingest';
|
|
14
14
|
import { resolveOpenFilesSource } from './source-resolver';
|
|
15
15
|
import { providerStatus, listModelRegistry, type ProviderStatusResult, type ModelRegistryEntry } from './providers';
|
|
16
|
+
import { enqueueMissingEmbeddings, refreshEmbeddingIndex, reindexHealth, type ReindexRuntimeOptions } from './reindex';
|
|
16
17
|
import { retrieveKnowledgeContext, type RetrievalOptions } from './retrieval';
|
|
17
18
|
import { hybridSearch, type HybridSearchOptions } from './search';
|
|
18
19
|
import { resolveSafetyPolicy } from './safety';
|
|
@@ -188,6 +189,33 @@ export class KnowledgeService {
|
|
|
188
189
|
});
|
|
189
190
|
}
|
|
190
191
|
|
|
192
|
+
/**
 * Delegates to the module-level `reindexHealth`, supplying this service's
 * workspace knowledge-database path and configuration.
 *
 * @param options - Reindex runtime options other than `dbPath` and `config`.
 * @returns Whatever the module-level `reindexHealth` returns.
 */
reindexHealth(options: Omit<ReindexRuntimeOptions, 'dbPath' | 'config'> = {}) {
  const { knowledgeDbPath } = this.ensureWorkspace();
  const config = this.config();
  return reindexHealth({ ...options, dbPath: knowledgeDbPath, config });
}
|
|
200
|
+
|
|
201
|
+
/**
 * Delegates to `enqueueMissingEmbeddings`, supplying this service's
 * workspace knowledge-database path and configuration.
 *
 * @param options - Reindex runtime options other than `dbPath` and `config`.
 * @returns Whatever `enqueueMissingEmbeddings` returns.
 */
enqueueReindex(options: Omit<ReindexRuntimeOptions, 'dbPath' | 'config'> = {}) {
  const { knowledgeDbPath } = this.ensureWorkspace();
  const config = this.config();
  return enqueueMissingEmbeddings({ ...options, dbPath: knowledgeDbPath, config });
}
|
|
209
|
+
|
|
210
|
+
/**
 * Delegates to `refreshEmbeddingIndex`, supplying this service's workspace
 * knowledge-database path and configuration.
 *
 * @param options - Refresh options (including `full` and `limit`) other than
 *   `dbPath` and `config`.
 * @returns The promise produced by `refreshEmbeddingIndex`.
 */
async refreshEmbeddings(options: Omit<ReindexRuntimeOptions & { full?: boolean; limit?: number }, 'dbPath' | 'config'> = {}) {
  const { knowledgeDbPath } = this.ensureWorkspace();
  const config = this.config();
  return refreshEmbeddingIndex({ ...options, dbPath: knowledgeDbPath, config });
}
|
|
218
|
+
|
|
191
219
|
/**
 * Delegates to the module-level `providerStatus` using this service's
 * configuration.
 *
 * @param env - Environment variables to pass through; defaults to `process.env`.
 * @returns The result of the module-level `providerStatus`.
 */
providerStatus(env: Record<string, string | undefined> = process.env): ProviderStatusResult {
  const config = this.config();
  return providerStatus(config, env);
}
|