@hasna/knowledge 0.2.13 → 0.2.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +45 -3
- package/bin/open-knowledge-mcp.js +1106 -227
- package/bin/open-knowledge.js +192 -35
- package/docs/architecture/ai-native-knowledge-base.md +11 -0
- package/docs/architecture/hybrid-semantic-search.md +31 -8
- package/package.json +1 -1
- package/src/cli.ts +68 -4
- package/src/embeddings.ts +516 -0
- package/src/knowledge-db.ts +39 -1
- package/src/mcp.js +55 -0
- package/src/outbox-consume.ts +11 -2
- package/src/search.ts +510 -0
- package/src/service.ts +40 -0
- package/src/wiki-layout.ts +41 -1
- package/src/workspace.ts +12 -0
|
@@ -215,6 +215,17 @@ Local mode should start with SQLite FTS and a local vector-index option. Hosted
|
|
|
215
215
|
mode can use Postgres with pgvector or a managed vector index. Permission
|
|
216
216
|
filters must be applied before agent context is assembled.
|
|
217
217
|
|
|
218
|
+
The first local semantic-search implementation indexes derived chunks with
|
|
219
|
+
`open-knowledge embeddings index` and queries them with
|
|
220
|
+
`open-knowledge search --semantic` or the lower-level
|
|
221
|
+
`open-knowledge embeddings search`. It stores OpenAI embedding vectors as
|
|
222
|
+
generated metadata rows, not raw source bytes, and pins each row to `open-files`
|
|
223
|
+
provenance: source ref/URI, revision/hash, chunk offsets, token count, provider,
|
|
224
|
+
model, dimensions, status, and timestamps. The structured `search` contract
|
|
225
|
+
merges keyword FTS, wiki/index catalog hits, generated wiki chunks, and optional
|
|
226
|
+
vector results so the local SQLite index can later move to pgvector or a managed
|
|
227
|
+
hosted vector store without changing CLI/MCP result shape.
|
|
228
|
+
|
|
218
229
|
## Agent Workflow
|
|
219
230
|
|
|
220
231
|
The target user flow is:
|
|
@@ -32,6 +32,9 @@ Local mode starts with SQLite:
|
|
|
32
32
|
- `chunks_fts` provides keyword search.
|
|
33
33
|
- `chunk_embeddings` stores embedding vectors as JSON until a local vector
|
|
34
34
|
extension is chosen.
|
|
35
|
+
- `vector_index_entries` stores searchable embedding rows with provider/model,
|
|
36
|
+
dimensions, source revision/hash, chunk offsets, status, timestamps, and
|
|
37
|
+
provenance metadata.
|
|
35
38
|
- `wiki_pages`, `wiki_backlinks`, and `citations` provide graph and provenance
|
|
36
39
|
signals.
|
|
37
40
|
- `knowledge_indexes` tracks generated machine-readable shards.
|
|
@@ -41,6 +44,21 @@ implementation. The retrieval interface should hide it so a later vector
|
|
|
41
44
|
extension or pgvector backend can replace storage without changing CLI/MCP
|
|
42
45
|
contracts.
|
|
43
46
|
|
|
47
|
+
The current local command surface is:
|
|
48
|
+
|
|
49
|
+
```bash
|
|
50
|
+
open-knowledge search "company wiki policy" --scope project --json
|
|
51
|
+
open-knowledge search "company wiki policy" --scope project --semantic --json
|
|
52
|
+
open-knowledge embeddings index --scope project --model openai:text-embedding-3-small
|
|
53
|
+
open-knowledge embeddings search "company wiki policy" --scope project --json
|
|
54
|
+
```
|
|
55
|
+
|
|
56
|
+
`search` is the structured hybrid layer for agents. `embeddings search` is the
|
|
57
|
+
lower-level vector-only command. MCP exposes the same capability through
|
|
58
|
+
`ok_search`, `ok_embeddings_status`, `ok_embeddings_index`, and
|
|
59
|
+
`ok_semantic_search`. Deterministic `--fake` embeddings exist for tests and
|
|
60
|
+
offline verification only.
|
|
61
|
+
|
|
44
62
|
## Hosted Indexes
|
|
45
63
|
|
|
46
64
|
Hosted mode may use:
|
|
@@ -58,16 +76,18 @@ unauthorized content.
|
|
|
58
76
|
|
|
59
77
|
1. Normalize the query.
|
|
60
78
|
2. Embed the query if a semantic-capable provider is configured.
|
|
61
|
-
3. Run keyword FTS over source chunks and wiki chunks.
|
|
62
|
-
4.
|
|
63
|
-
5.
|
|
64
|
-
|
|
79
|
+
3. Run keyword FTS over source chunks and generated wiki chunks.
|
|
80
|
+
4. Search wiki page and machine-readable index catalog rows.
|
|
81
|
+
5. Run vector search over source chunks and wiki pages when semantic mode is
|
|
82
|
+
requested.
|
|
83
|
+
6. Expand candidate pages through backlinks and citations.
|
|
84
|
+
7. Drop stale candidates whose source revision/hash no longer matches
|
|
65
85
|
`open-files`.
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
86
|
+
8. Apply permission filters.
|
|
87
|
+
9. Merge and dedupe by source revision, wiki page, citation, and text hash.
|
|
88
|
+
10. Rerank by relevance, exact-match score, semantic score, freshness, citation
|
|
69
89
|
quality, and wiki authority.
|
|
70
|
-
|
|
90
|
+
11. Return structured results with source refs, citation spans, page refs,
|
|
71
91
|
scores, and reason codes.
|
|
72
92
|
|
|
73
93
|
## Result Shape
|
|
@@ -120,6 +140,9 @@ Reindexing is driven by source revisions:
|
|
|
120
140
|
stale.
|
|
121
141
|
- If a source is deleted or access changes, affected chunks must be hidden or
|
|
122
142
|
removed before future retrieval.
|
|
143
|
+
- Local outbox consumption deletes stale `chunk_embeddings` and
|
|
144
|
+
`vector_index_entries` for deleted revisions, so semantic search cannot return
|
|
145
|
+
removed source chunks.
|
|
123
146
|
- Wiki pages should track the source revisions they cite so lint can flag stale
|
|
124
147
|
pages.
|
|
125
148
|
- Embedding refresh jobs should be idempotent and checkpointed in `runs` and
|
package/package.json
CHANGED
package/src/cli.ts
CHANGED
|
@@ -47,10 +47,14 @@ interface Flags {
|
|
|
47
47
|
format?: string;
|
|
48
48
|
completions?: string;
|
|
49
49
|
purpose?: string;
|
|
50
|
+
model?: string;
|
|
51
|
+
dimensions?: number;
|
|
52
|
+
semantic?: boolean;
|
|
50
53
|
noColor?: boolean;
|
|
51
54
|
scope?: string;
|
|
52
55
|
olderThan?: number;
|
|
53
56
|
empty?: boolean;
|
|
57
|
+
fake?: boolean;
|
|
54
58
|
archived?: boolean;
|
|
55
59
|
includeArchived?: boolean;
|
|
56
60
|
}
|
|
@@ -60,7 +64,7 @@ interface ParseResult {
|
|
|
60
64
|
flags: Flags;
|
|
61
65
|
}
|
|
62
66
|
|
|
63
|
-
const COMMANDS = ['add', 'list', 'get', 'delete', 'update', 'archive', 'restore', 'upsert', 'untag', 'export', 'prune', 'dedupe', 'stats', 'paths', 'storage', 'db', 'wiki', 'source', 'ingest', 'reindex', 'providers', 'safety', 'help'];
|
|
67
|
+
const COMMANDS = ['add', 'list', 'get', 'delete', 'update', 'archive', 'restore', 'upsert', 'untag', 'export', 'prune', 'dedupe', 'stats', 'paths', 'storage', 'db', 'wiki', 'source', 'ingest', 'reindex', 'search', 'embeddings', 'providers', 'safety', 'help'];
|
|
64
68
|
const COMMAND_ALIASES: Record<string, string> = {
|
|
65
69
|
ls: 'list',
|
|
66
70
|
rm: 'delete',
|
|
@@ -96,6 +100,10 @@ function parseArgs(argv: string[]): ParseResult {
|
|
|
96
100
|
case '--format': flags.format = argv[i + 1]; i += 1; break;
|
|
97
101
|
case '--completions': flags.completions = argv[i + 1]; i += 1; break;
|
|
98
102
|
case '--purpose': flags.purpose = argv[i + 1]; i += 1; break;
|
|
103
|
+
case '--model': flags.model = argv[i + 1]; i += 1; break;
|
|
104
|
+
case '--dimensions': flags.dimensions = Number(argv[i + 1]); i += 1; break;
|
|
105
|
+
case '--semantic': flags.semantic = true; break;
|
|
106
|
+
case '--fake': flags.fake = true; break;
|
|
99
107
|
case '--no-color': flags.noColor = true; break;
|
|
100
108
|
case '--scope': flags.scope = argv[i + 1]; i += 1; break;
|
|
101
109
|
case '--older-than': flags.olderThan = Number(argv[i + 1]); i += 1; break;
|
|
@@ -169,6 +177,8 @@ Commands:
|
|
|
169
177
|
ingest manifest <file|s3://> Ingest an open-files manifest into knowledge.db
|
|
170
178
|
ingest source <source-ref> Ingest a read-only source ref into knowledge.db
|
|
171
179
|
reindex outbox <file|s3://> Consume open-files change events and invalidate chunks
|
|
180
|
+
search <query> Hybrid search sources, wiki pages, and indexes
|
|
181
|
+
embeddings status|index|search Build/query local vector embeddings
|
|
172
182
|
providers status|models|check Inspect AI SDK provider config and credentials
|
|
173
183
|
safety status|check|approve|audit|redact
|
|
174
184
|
help [command] Show help
|
|
@@ -177,6 +187,10 @@ Global Options:
|
|
|
177
187
|
--json Output JSON
|
|
178
188
|
--store <path> Override store path
|
|
179
189
|
--purpose <name> Read-only source purpose (default: knowledge_answer)
|
|
190
|
+
--model <provider:model> AI/embedding model ref
|
|
191
|
+
--dimensions <n> Embedding dimensions for local/fake providers
|
|
192
|
+
--semantic Include vector semantic results in search
|
|
193
|
+
--fake Use deterministic fake embeddings for local tests
|
|
180
194
|
--scope local|global|project Store scope (default: global ~/.hasna/apps/knowledge/)
|
|
181
195
|
--no-color Disable color output
|
|
182
196
|
--completions <shell> Output completions for bash|zsh|fish
|
|
@@ -237,6 +251,8 @@ function printCommandHelp(command: string): void {
|
|
|
237
251
|
if (command === 'source') { console.log('Usage: open-knowledge source resolve <source-ref> [--purpose knowledge_answer|knowledge_index] [--limit <n>] [--scope local|global|project] [--json]'); return; }
|
|
238
252
|
if (command === 'ingest') { console.log('Usage: open-knowledge ingest manifest <file|s3://bucket/key> | source <source-ref> [--purpose knowledge_index] [--scope local|global|project] [--json]'); return; }
|
|
239
253
|
if (command === 'reindex') { console.log('Usage: open-knowledge reindex outbox <file|s3://bucket/key> [--scope local|global|project] [--json]'); return; }
|
|
254
|
+
if (command === 'search') { console.log('Usage: open-knowledge search <query> [--semantic] [--model openai:text-embedding-3-small] [--limit <n>] [--dimensions <n>] [--fake] [--scope local|global|project] [--json]'); return; }
|
|
255
|
+
if (command === 'embeddings') { console.log('Usage: open-knowledge embeddings status|index|search [query] [--model openai:text-embedding-3-small] [--limit <n>] [--dimensions <n>] [--fake] [--scope local|global|project] [--json]'); return; }
|
|
240
256
|
if (command === 'providers') { console.log('Usage: open-knowledge providers status|models|check [provider|model-alias] [--scope local|global|project] [--json]'); return; }
|
|
241
257
|
if (command === 'safety') { console.log('Usage: open-knowledge safety status|check|approve|audit|redact [args] [--scope local|global|project] [--json]'); return; }
|
|
242
258
|
printGlobalHelp();
|
|
@@ -283,11 +299,11 @@ async function run(argv: string[]): Promise<void> {
|
|
|
283
299
|
if (flags.completions) {
|
|
284
300
|
const shell = flags.completions;
|
|
285
301
|
if (shell === 'bash') {
|
|
286
|
-
console.log(`_open_knowledge() { local cur; cur="${"$"}{COMP_WORDS[COMP_CWORD]}"; COMPREPLY=($(compgen -W "add list get update archive restore upsert untag delete export prune dedupe stats paths storage db wiki source ingest reindex providers safety help ls rm edit unarchive --json --yes --help --version --desc --page --limit --search --sort --id --store --title --content --url --tag --format --completions --purpose --no-color --scope --archived --include-archived" -- "$cur")); }; complete -F _open_knowledge open-knowledge`);
|
|
302
|
+
console.log(`_open_knowledge() { local cur; cur="${"$"}{COMP_WORDS[COMP_CWORD]}"; COMPREPLY=($(compgen -W "add list get update archive restore upsert untag delete export prune dedupe stats paths storage db wiki source ingest reindex search embeddings providers safety help ls rm edit unarchive --json --yes --help --version --desc --page --limit --search --sort --id --store --title --content --url --tag --format --completions --purpose --model --dimensions --semantic --fake --no-color --scope --archived --include-archived" -- "$cur")); }; complete -F _open_knowledge open-knowledge`);
|
|
287
303
|
} else if (shell === 'zsh') {
|
|
288
|
-
console.log(`#compdef open-knowledge\n_open_knowledge() { _arguments -C "1: :(add list get update archive restore upsert untag delete export prune dedupe stats paths storage db wiki source ingest reindex providers safety help ls rm edit unarchive)" "(--json)--json" "(--yes)-y" "(--help)--help" "(--version)--version" "(--desc)--desc" "(--archived)--archived" "(--include-archived)--include-archived" "(-p --page)"{-p,--page}"[page number]:number:" "(-l --limit)"{-l,--limit}"[items per page]:number:" "(-s --search)"{-s,--search}"[search text]:text:" "(--sort)--sort"\{created,title\}:" "(--id)--id[item id]:id:" "(--store)--store[store path]:path:" "(--title)--title[new title]:" "(--content)--content[new content]:" "(--url)--url[source url]:" "(-t --tag)"{-t,--tag}"[tag]:tag:" "(--format)--format[json|jsonl]:" "(--completions)--completions[output completions]:shell:(bash zsh fish):" "(--purpose)--purpose[purpose]:" "(--no-color)--no-color[disable color]" "(--scope)--scope"\{local,global,project\}:" }; _open_knowledge`);
|
|
304
|
+
console.log(`#compdef open-knowledge\n_open_knowledge() { _arguments -C "1: :(add list get update archive restore upsert untag delete export prune dedupe stats paths storage db wiki source ingest reindex search embeddings providers safety help ls rm edit unarchive)" "(--json)--json" "(--yes)-y" "(--help)--help" "(--version)--version" "(--desc)--desc" "(--archived)--archived" "(--include-archived)--include-archived" "(--semantic)--semantic" "(--fake)--fake" "(-p --page)"{-p,--page}"[page number]:number:" "(-l --limit)"{-l,--limit}"[items per page]:number:" "(-s --search)"{-s,--search}"[search text]:text:" "(--sort)--sort"\{created,title\}:" "(--id)--id[item id]:id:" "(--store)--store[store path]:path:" "(--title)--title[new title]:" "(--content)--content[new content]:" "(--url)--url[source url]:" "(-t --tag)"{-t,--tag}"[tag]:tag:" "(--format)--format[json|jsonl]:" "(--completions)--completions[output completions]:shell:(bash zsh fish):" "(--purpose)--purpose[purpose]:" "(--model)--model[model ref]:" "(--dimensions)--dimensions[embedding dimensions]:number:" "(--no-color)--no-color[disable color]" "(--scope)--scope"\{local,global,project\}:" }; _open_knowledge`);
|
|
289
305
|
} else if (shell === 'fish') {
|
|
290
|
-
console.log(`complete -c open-knowledge -f; complete -c open-knowledge -a "add list get update archive restore upsert untag delete export prune dedupe stats paths storage db wiki source ingest reindex providers safety help ls rm edit unarchive"; complete -c open-knowledge -l json; complete -c open-knowledge -l yes -s y; complete -c open-knowledge -l help -s h; complete -c open-knowledge -l version -s v; complete -c open-knowledge -l desc; complete -c open-knowledge -l archived; complete -c open-knowledge -l include-archived; complete -c open-knowledge -s p -l page; complete -c open-knowledge -s l -l limit; complete -c open-knowledge -s s -l search; complete -c open-knowledge -l sort; complete -c open-knowledge -l id; complete -c open-knowledge -l store; complete -c open-knowledge -l title; complete -c open-knowledge -l content; complete -c open-knowledge -l url; complete -c open-knowledge -s t -l tag; complete -c open-knowledge -l format; complete -c open-knowledge -l completions; complete -c open-knowledge -l purpose; complete -c open-knowledge -l no-color; complete -c open-knowledge -l scope -a "local global project"`);
|
|
306
|
+
console.log(`complete -c open-knowledge -f; complete -c open-knowledge -a "add list get update archive restore upsert untag delete export prune dedupe stats paths storage db wiki source ingest reindex search embeddings providers safety help ls rm edit unarchive"; complete -c open-knowledge -l json; complete -c open-knowledge -l yes -s y; complete -c open-knowledge -l help -s h; complete -c open-knowledge -l version -s v; complete -c open-knowledge -l desc; complete -c open-knowledge -l archived; complete -c open-knowledge -l include-archived; complete -c open-knowledge -l semantic; complete -c open-knowledge -l fake; complete -c open-knowledge -s p -l page; complete -c open-knowledge -s l -l limit; complete -c open-knowledge -s s -l search; complete -c open-knowledge -l sort; complete -c open-knowledge -l id; complete -c open-knowledge -l store; complete -c open-knowledge -l title; complete -c open-knowledge -l content; complete -c open-knowledge -l url; complete -c open-knowledge -s t -l tag; complete -c open-knowledge -l format; complete -c open-knowledge -l completions; complete -c open-knowledge -l purpose; complete -c open-knowledge -l model; complete -c open-knowledge -l dimensions; complete -c open-knowledge -l no-color; complete -c open-knowledge -l scope -a "local global project"`);
|
|
291
307
|
} else {
|
|
292
308
|
throw new Error("Invalid --completions value. Use 'bash', 'zsh', or 'fish'.");
|
|
293
309
|
}
|
|
@@ -534,6 +550,54 @@ async function run(argv: string[]): Promise<void> {
|
|
|
534
550
|
return;
|
|
535
551
|
}
|
|
536
552
|
|
|
553
|
+
if (command === 'embeddings') {
|
|
554
|
+
const action = positional[1] ?? 'status';
|
|
555
|
+
if (action === 'status') {
|
|
556
|
+
const result = service.embeddingStatus();
|
|
557
|
+
output({ ok: true, ...result, message: `${result.total_vector_entries} vector index entries` }, flags.json);
|
|
558
|
+
return;
|
|
559
|
+
}
|
|
560
|
+
if (action === 'index') {
|
|
561
|
+
const result = await service.indexEmbeddings({
|
|
562
|
+
limit: flags.limit,
|
|
563
|
+
modelRef: flags.model,
|
|
564
|
+
dimensions: flags.dimensions,
|
|
565
|
+
fake: flags.fake,
|
|
566
|
+
});
|
|
567
|
+
output({ ok: true, ...result, message: `Embedded ${result.chunks_embedded} chunk(s)` }, flags.json);
|
|
568
|
+
return;
|
|
569
|
+
}
|
|
570
|
+
if (action === 'search') {
|
|
571
|
+
const query = positional.slice(2).join(' ');
|
|
572
|
+
if (!query) throw new Error('Usage: open-knowledge embeddings search <query>');
|
|
573
|
+
const result = await service.semanticSearch({
|
|
574
|
+
query,
|
|
575
|
+
limit: flags.limit,
|
|
576
|
+
modelRef: flags.model,
|
|
577
|
+
dimensions: flags.dimensions,
|
|
578
|
+
fake: flags.fake,
|
|
579
|
+
});
|
|
580
|
+
output({ ok: true, ...result, message: `${result.results.length} semantic result(s)` }, flags.json);
|
|
581
|
+
return;
|
|
582
|
+
}
|
|
583
|
+
throw new Error("Invalid embeddings action. Use 'status', 'index', or 'search'.");
|
|
584
|
+
}
|
|
585
|
+
|
|
586
|
+
if (command === 'search') {
|
|
587
|
+
const query = positional.slice(1).join(' ');
|
|
588
|
+
if (!query) throw new Error('Usage: open-knowledge search <query>');
|
|
589
|
+
const result = await service.search({
|
|
590
|
+
query,
|
|
591
|
+
limit: flags.limit,
|
|
592
|
+
semantic: flags.semantic,
|
|
593
|
+
modelRef: flags.model,
|
|
594
|
+
dimensions: flags.dimensions,
|
|
595
|
+
fake: flags.fake,
|
|
596
|
+
});
|
|
597
|
+
output({ ok: true, ...result, message: `${result.results.length} search result(s)` }, flags.json);
|
|
598
|
+
return;
|
|
599
|
+
}
|
|
600
|
+
|
|
537
601
|
if (command === 'providers') {
|
|
538
602
|
const action = positional[1] ?? 'status';
|
|
539
603
|
if (action === 'status') {
|