@hasna/knowledge 0.2.12 → 0.2.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +36 -3
- package/bin/open-knowledge-mcp.js +763 -166
- package/bin/open-knowledge.js +161 -31
- package/docs/architecture/ai-native-knowledge-base.md +24 -0
- package/docs/architecture/hybrid-semantic-search.md +17 -0
- package/package.json +1 -1
- package/src/cli.ts +48 -4
- package/src/embeddings.ts +516 -0
- package/src/knowledge-db.ts +40 -1
- package/src/manifest-ingest.ts +19 -2
- package/src/mcp.js +38 -0
- package/src/outbox-consume.ts +11 -2
- package/src/provenance.ts +93 -0
- package/src/service.ts +32 -1
- package/src/source-resolver.ts +18 -0
- package/src/wiki-layout.ts +91 -0
- package/src/workspace.ts +12 -0
|
@@ -89,6 +89,21 @@ file revisions, hashes, extraction state, permissions, and storage metadata.
|
|
|
89
89
|
Direct `s3://`, `file://`, and `https://` refs are useful for bootstrap and
|
|
90
90
|
interop, but should be normalized into source records when possible.
|
|
91
91
|
|
|
92
|
+
## Provenance Contract
|
|
93
|
+
|
|
94
|
+
Every durable search/wiki artifact should carry a provenance object in metadata:
|
|
95
|
+
`source_owner`, `source_ref`, `source_uri`, `source_kind`, `source_revision_id`,
|
|
96
|
+
`revision`, `hash`, optional `chunk_id`, offsets, `read_only`,
|
|
97
|
+
`citation_required`, resolver name, and stale status. For generated artifacts
|
|
98
|
+
that are not source-backed yet, metadata still records that `open-files` owns
|
|
99
|
+
source bytes and that citations are required before durable facts are filed.
|
|
100
|
+
|
|
101
|
+
`wiki init` now catalogs the starter `wiki/README.md` and `indexes/root.md`
|
|
102
|
+
records with generated-artifact provenance. Source ingestion stores source
|
|
103
|
+
provenance on every chunk, and source resolution returns that provenance with
|
|
104
|
+
chunks and citations so semantic search can pass through trustworthy evidence
|
|
105
|
+
without reconstructing it later.
|
|
106
|
+
|
|
92
107
|
## Resolver Boundary
|
|
93
108
|
|
|
94
109
|
The local resolver is exposed through:
|
|
@@ -200,6 +215,15 @@ Local mode should start with SQLite FTS and a local vector-index option. Hosted
|
|
|
200
215
|
mode can use Postgres with pgvector or a managed vector index. Permission
|
|
201
216
|
filters must be applied before agent context is assembled.
|
|
202
217
|
|
|
218
|
+
The first local semantic-search implementation indexes derived chunks with
|
|
219
|
+
`open-knowledge embeddings index` and queries them with
|
|
220
|
+
`open-knowledge embeddings search`. It stores OpenAI embedding vectors as
|
|
221
|
+
generated metadata rows, not raw source bytes, and pins each row to
|
|
222
|
+
`open-files` provenance: source ref/URI, revision/hash, chunk offsets, token
|
|
223
|
+
count, provider, model, dimensions, status, and timestamps. This lets the local
|
|
224
|
+
SQLite index later move to pgvector or a managed hosted vector store without
|
|
225
|
+
changing the CLI/MCP result contract.
|
|
226
|
+
|
|
203
227
|
## Agent Workflow
|
|
204
228
|
|
|
205
229
|
The target user flow is:
|
|
@@ -32,6 +32,9 @@ Local mode starts with SQLite:
|
|
|
32
32
|
- `chunks_fts` provides keyword search.
|
|
33
33
|
- `chunk_embeddings` stores embedding vectors as JSON until a local vector
|
|
34
34
|
extension is chosen.
|
|
35
|
+
- `vector_index_entries` stores searchable embedding rows with provider/model,
|
|
36
|
+
dimensions, source revision/hash, chunk offsets, status, timestamps, and
|
|
37
|
+
provenance metadata.
|
|
35
38
|
- `wiki_pages`, `wiki_backlinks`, and `citations` provide graph and provenance
|
|
36
39
|
signals.
|
|
37
40
|
- `knowledge_indexes` tracks generated machine-readable shards.
|
|
@@ -41,6 +44,17 @@ implementation. The retrieval interface should hide it so a later vector
|
|
|
41
44
|
extension or pgvector backend can replace storage without changing CLI/MCP
|
|
42
45
|
contracts.
|
|
43
46
|
|
|
47
|
+
The current local command surface is:
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
open-knowledge embeddings index --scope project --model openai:text-embedding-3-small
|
|
51
|
+
open-knowledge embeddings search "company wiki policy" --scope project --json
|
|
52
|
+
```
|
|
53
|
+
|
|
54
|
+
MCP exposes the same capability through `ok_embeddings_status`,
|
|
55
|
+
`ok_embeddings_index`, and `ok_semantic_search`. Deterministic `--fake`
|
|
56
|
+
embeddings exist for tests and offline verification only.
|
|
57
|
+
|
|
44
58
|
## Hosted Indexes
|
|
45
59
|
|
|
46
60
|
Hosted mode may use:
|
|
@@ -120,6 +134,9 @@ Reindexing is driven by source revisions:
|
|
|
120
134
|
stale.
|
|
121
135
|
- If a source is deleted or access changes, affected chunks must be hidden or
|
|
122
136
|
removed before future retrieval.
|
|
137
|
+
- Local outbox consumption deletes stale `chunk_embeddings` and
|
|
138
|
+
`vector_index_entries` for deleted revisions, so semantic search cannot return
|
|
139
|
+
removed source chunks.
|
|
123
140
|
- Wiki pages should track the source revisions they cite so lint can flag stale
|
|
124
141
|
pages.
|
|
125
142
|
- Embedding refresh jobs should be idempotent and checkpointed in `runs` and
|
package/package.json
CHANGED
package/src/cli.ts
CHANGED
|
@@ -47,10 +47,13 @@ interface Flags {
|
|
|
47
47
|
format?: string;
|
|
48
48
|
completions?: string;
|
|
49
49
|
purpose?: string;
|
|
50
|
+
model?: string;
|
|
51
|
+
dimensions?: number;
|
|
50
52
|
noColor?: boolean;
|
|
51
53
|
scope?: string;
|
|
52
54
|
olderThan?: number;
|
|
53
55
|
empty?: boolean;
|
|
56
|
+
fake?: boolean;
|
|
54
57
|
archived?: boolean;
|
|
55
58
|
includeArchived?: boolean;
|
|
56
59
|
}
|
|
@@ -60,7 +63,7 @@ interface ParseResult {
|
|
|
60
63
|
flags: Flags;
|
|
61
64
|
}
|
|
62
65
|
|
|
63
|
-
const COMMANDS = ['add', 'list', 'get', 'delete', 'update', 'archive', 'restore', 'upsert', 'untag', 'export', 'prune', 'dedupe', 'stats', 'paths', 'storage', 'db', 'wiki', 'source', 'ingest', 'reindex', 'providers', 'safety', 'help'];
|
|
66
|
+
const COMMANDS = ['add', 'list', 'get', 'delete', 'update', 'archive', 'restore', 'upsert', 'untag', 'export', 'prune', 'dedupe', 'stats', 'paths', 'storage', 'db', 'wiki', 'source', 'ingest', 'reindex', 'embeddings', 'providers', 'safety', 'help'];
|
|
64
67
|
const COMMAND_ALIASES: Record<string, string> = {
|
|
65
68
|
ls: 'list',
|
|
66
69
|
rm: 'delete',
|
|
@@ -96,6 +99,9 @@ function parseArgs(argv: string[]): ParseResult {
|
|
|
96
99
|
case '--format': flags.format = argv[i + 1]; i += 1; break;
|
|
97
100
|
case '--completions': flags.completions = argv[i + 1]; i += 1; break;
|
|
98
101
|
case '--purpose': flags.purpose = argv[i + 1]; i += 1; break;
|
|
102
|
+
case '--model': flags.model = argv[i + 1]; i += 1; break;
|
|
103
|
+
case '--dimensions': flags.dimensions = Number(argv[i + 1]); i += 1; break;
|
|
104
|
+
case '--fake': flags.fake = true; break;
|
|
99
105
|
case '--no-color': flags.noColor = true; break;
|
|
100
106
|
case '--scope': flags.scope = argv[i + 1]; i += 1; break;
|
|
101
107
|
case '--older-than': flags.olderThan = Number(argv[i + 1]); i += 1; break;
|
|
@@ -169,6 +175,7 @@ Commands:
|
|
|
169
175
|
ingest manifest <file|s3://> Ingest an open-files manifest into knowledge.db
|
|
170
176
|
ingest source <source-ref> Ingest a read-only source ref into knowledge.db
|
|
171
177
|
reindex outbox <file|s3://> Consume open-files change events and invalidate chunks
|
|
178
|
+
embeddings status|index|search Build/query local vector embeddings
|
|
172
179
|
providers status|models|check Inspect AI SDK provider config and credentials
|
|
173
180
|
safety status|check|approve|audit|redact
|
|
174
181
|
help [command] Show help
|
|
@@ -177,6 +184,9 @@ Global Options:
|
|
|
177
184
|
--json Output JSON
|
|
178
185
|
--store <path> Override store path
|
|
179
186
|
--purpose <name> Read-only source purpose (default: knowledge_answer)
|
|
187
|
+
--model <provider:model> AI/embedding model ref
|
|
188
|
+
--dimensions <n> Embedding dimensions for local/fake providers
|
|
189
|
+
--fake Use deterministic fake embeddings for local tests
|
|
180
190
|
--scope local|global|project Store scope (default: global ~/.hasna/apps/knowledge/)
|
|
181
191
|
--no-color Disable color output
|
|
182
192
|
--completions <shell> Output completions for bash|zsh|fish
|
|
@@ -237,6 +247,7 @@ function printCommandHelp(command: string): void {
|
|
|
237
247
|
if (command === 'source') { console.log('Usage: open-knowledge source resolve <source-ref> [--purpose knowledge_answer|knowledge_index] [--limit <n>] [--scope local|global|project] [--json]'); return; }
|
|
238
248
|
if (command === 'ingest') { console.log('Usage: open-knowledge ingest manifest <file|s3://bucket/key> | source <source-ref> [--purpose knowledge_index] [--scope local|global|project] [--json]'); return; }
|
|
239
249
|
if (command === 'reindex') { console.log('Usage: open-knowledge reindex outbox <file|s3://bucket/key> [--scope local|global|project] [--json]'); return; }
|
|
250
|
+
if (command === 'embeddings') { console.log('Usage: open-knowledge embeddings status|index|search [query] [--model openai:text-embedding-3-small] [--limit <n>] [--dimensions <n>] [--fake] [--scope local|global|project] [--json]'); return; }
|
|
240
251
|
if (command === 'providers') { console.log('Usage: open-knowledge providers status|models|check [provider|model-alias] [--scope local|global|project] [--json]'); return; }
|
|
241
252
|
if (command === 'safety') { console.log('Usage: open-knowledge safety status|check|approve|audit|redact [args] [--scope local|global|project] [--json]'); return; }
|
|
242
253
|
printGlobalHelp();
|
|
@@ -283,11 +294,11 @@ async function run(argv: string[]): Promise<void> {
|
|
|
283
294
|
if (flags.completions) {
|
|
284
295
|
const shell = flags.completions;
|
|
285
296
|
if (shell === 'bash') {
|
|
286
|
-
console.log(`_open_knowledge() { local cur; cur="${"$"}{COMP_WORDS[COMP_CWORD]}"; COMPREPLY=($(compgen -W "add list get update archive restore upsert untag delete export prune dedupe stats paths storage db wiki source ingest reindex providers safety help ls rm edit unarchive --json --yes --help --version --desc --page --limit --search --sort --id --store --title --content --url --tag --format --completions --purpose --no-color --scope --archived --include-archived" -- "$cur")); }; complete -F _open_knowledge open-knowledge`);
|
|
297
|
+
console.log(`_open_knowledge() { local cur; cur="${"$"}{COMP_WORDS[COMP_CWORD]}"; COMPREPLY=($(compgen -W "add list get update archive restore upsert untag delete export prune dedupe stats paths storage db wiki source ingest reindex embeddings providers safety help ls rm edit unarchive --json --yes --help --version --desc --page --limit --search --sort --id --store --title --content --url --tag --format --completions --purpose --model --dimensions --fake --no-color --scope --archived --include-archived" -- "$cur")); }; complete -F _open_knowledge open-knowledge`);
|
|
287
298
|
} else if (shell === 'zsh') {
|
|
288
|
-
console.log(`#compdef open-knowledge\n_open_knowledge() { _arguments -C "1: :(add list get update archive restore upsert untag delete export prune dedupe stats paths storage db wiki source ingest reindex providers safety help ls rm edit unarchive)" "(--json)--json" "(--yes)-y" "(--help)--help" "(--version)--version" "(--desc)--desc" "(--archived)--archived" "(--include-archived)--include-archived" "(-p --page)"{-p,--page}"[page number]:number:" "(-l --limit)"{-l,--limit}"[items per page]:number:" "(-s --search)"{-s,--search}"[search text]:text:" "(--sort)--sort"\{created,title\}:" "(--id)--id[item id]:id:" "(--store)--store[store path]:path:" "(--title)--title[new title]:" "(--content)--content[new content]:" "(--url)--url[source url]:" "(-t --tag)"{-t,--tag}"[tag]:tag:" "(--format)--format[json|jsonl]:" "(--completions)--completions[output completions]:shell:(bash zsh fish):" "(--purpose)--purpose[purpose]:" "(--no-color)--no-color[disable color]" "(--scope)--scope"\{local,global,project\}:" }; _open_knowledge`);
|
|
299
|
+
console.log(`#compdef open-knowledge\n_open_knowledge() { _arguments -C "1: :(add list get update archive restore upsert untag delete export prune dedupe stats paths storage db wiki source ingest reindex embeddings providers safety help ls rm edit unarchive)" "(--json)--json" "(--yes)-y" "(--help)--help" "(--version)--version" "(--desc)--desc" "(--archived)--archived" "(--include-archived)--include-archived" "(--fake)--fake" "(-p --page)"{-p,--page}"[page number]:number:" "(-l --limit)"{-l,--limit}"[items per page]:number:" "(-s --search)"{-s,--search}"[search text]:text:" "(--sort)--sort"\{created,title\}:" "(--id)--id[item id]:id:" "(--store)--store[store path]:path:" "(--title)--title[new title]:" "(--content)--content[new content]:" "(--url)--url[source url]:" "(-t --tag)"{-t,--tag}"[tag]:tag:" "(--format)--format[json|jsonl]:" "(--completions)--completions[output completions]:shell:(bash zsh fish):" "(--purpose)--purpose[purpose]:" "(--model)--model[model ref]:" "(--dimensions)--dimensions[embedding dimensions]:number:" "(--no-color)--no-color[disable color]" "(--scope)--scope"\{local,global,project\}:" }; _open_knowledge`);
|
|
289
300
|
} else if (shell === 'fish') {
|
|
290
|
-
console.log(`complete -c open-knowledge -f; complete -c open-knowledge -a "add list get update archive restore upsert untag delete export prune dedupe stats paths storage db wiki source ingest reindex providers safety help ls rm edit unarchive"; complete -c open-knowledge -l json; complete -c open-knowledge -l yes -s y; complete -c open-knowledge -l help -s h; complete -c open-knowledge -l version -s v; complete -c open-knowledge -l desc; complete -c open-knowledge -l archived; complete -c open-knowledge -l include-archived; complete -c open-knowledge -s p -l page; complete -c open-knowledge -s l -l limit; complete -c open-knowledge -s s -l search; complete -c open-knowledge -l sort; complete -c open-knowledge -l id; complete -c open-knowledge -l store; complete -c open-knowledge -l title; complete -c open-knowledge -l content; complete -c open-knowledge -l url; complete -c open-knowledge -s t -l tag; complete -c open-knowledge -l format; complete -c open-knowledge -l completions; complete -c open-knowledge -l purpose; complete -c open-knowledge -l no-color; complete -c open-knowledge -l scope -a "local global project"`);
|
|
301
|
+
console.log(`complete -c open-knowledge -f; complete -c open-knowledge -a "add list get update archive restore upsert untag delete export prune dedupe stats paths storage db wiki source ingest reindex embeddings providers safety help ls rm edit unarchive"; complete -c open-knowledge -l json; complete -c open-knowledge -l yes -s y; complete -c open-knowledge -l help -s h; complete -c open-knowledge -l version -s v; complete -c open-knowledge -l desc; complete -c open-knowledge -l archived; complete -c open-knowledge -l include-archived; complete -c open-knowledge -l fake; complete -c open-knowledge -s p -l page; complete -c open-knowledge -s l -l limit; complete -c open-knowledge -s s -l search; complete -c open-knowledge -l sort; complete -c open-knowledge -l id; complete -c open-knowledge -l store; complete -c open-knowledge -l title; complete -c open-knowledge -l content; complete -c open-knowledge -l url; complete -c open-knowledge -s t -l tag; complete -c open-knowledge -l format; complete -c open-knowledge -l completions; complete -c open-knowledge -l purpose; complete -c open-knowledge -l model; complete -c open-knowledge -l dimensions; complete -c open-knowledge -l no-color; complete -c open-knowledge -l scope -a "local global project"`);
|
|
291
302
|
} else {
|
|
292
303
|
throw new Error("Invalid --completions value. Use 'bash', 'zsh', or 'fish'.");
|
|
293
304
|
}
|
|
@@ -534,6 +545,39 @@ async function run(argv: string[]): Promise<void> {
|
|
|
534
545
|
return;
|
|
535
546
|
}
|
|
536
547
|
|
|
548
|
+
if (command === 'embeddings') {
|
|
549
|
+
const action = positional[1] ?? 'status';
|
|
550
|
+
if (action === 'status') {
|
|
551
|
+
const result = service.embeddingStatus();
|
|
552
|
+
output({ ok: true, ...result, message: `${result.total_vector_entries} vector index entries` }, flags.json);
|
|
553
|
+
return;
|
|
554
|
+
}
|
|
555
|
+
if (action === 'index') {
|
|
556
|
+
const result = await service.indexEmbeddings({
|
|
557
|
+
limit: flags.limit,
|
|
558
|
+
modelRef: flags.model,
|
|
559
|
+
dimensions: flags.dimensions,
|
|
560
|
+
fake: flags.fake,
|
|
561
|
+
});
|
|
562
|
+
output({ ok: true, ...result, message: `Embedded ${result.chunks_embedded} chunk(s)` }, flags.json);
|
|
563
|
+
return;
|
|
564
|
+
}
|
|
565
|
+
if (action === 'search') {
|
|
566
|
+
const query = positional.slice(2).join(' ');
|
|
567
|
+
if (!query) throw new Error('Usage: open-knowledge embeddings search <query>');
|
|
568
|
+
const result = await service.semanticSearch({
|
|
569
|
+
query,
|
|
570
|
+
limit: flags.limit,
|
|
571
|
+
modelRef: flags.model,
|
|
572
|
+
dimensions: flags.dimensions,
|
|
573
|
+
fake: flags.fake,
|
|
574
|
+
});
|
|
575
|
+
output({ ok: true, ...result, message: `${result.results.length} semantic result(s)` }, flags.json);
|
|
576
|
+
return;
|
|
577
|
+
}
|
|
578
|
+
throw new Error("Invalid embeddings action. Use 'status', 'index', or 'search'.");
|
|
579
|
+
}
|
|
580
|
+
|
|
537
581
|
if (command === 'providers') {
|
|
538
582
|
const action = positional[1] ?? 'status';
|
|
539
583
|
if (action === 'status') {
|