@hasna/knowledge 0.2.14 → 0.2.16
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +26 -2
- package/bin/open-knowledge-mcp.js +681 -8
- package/bin/open-knowledge.js +92 -40
- package/docs/architecture/ai-native-knowledge-base.md +10 -5
- package/docs/architecture/hybrid-semantic-search.md +24 -11
- package/package.json +1 -1
- package/src/cli.ts +39 -4
- package/src/mcp.js +34 -0
- package/src/retrieval.ts +326 -0
- package/src/search.ts +510 -0
- package/src/service.ts +20 -0
- package/src/wiki-layout.ts +41 -1
|
@@ -217,12 +217,17 @@ filters must be applied before agent context is assembled.
|
|
|
217
217
|
|
|
218
218
|
The first local semantic-search implementation indexes derived chunks with
|
|
219
219
|
`open-knowledge embeddings index` and queries them with
|
|
220
|
+
`open-knowledge search --semantic` or the lower-level
|
|
220
221
|
`open-knowledge embeddings search`. It stores OpenAI embedding vectors as
|
|
221
|
-
generated metadata rows, not raw source bytes, and pins each row to
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
222
|
+
generated metadata rows, not raw source bytes, and pins each row to `open-files`
|
|
223
|
+
provenance: source ref/URI, revision/hash, chunk offsets, token count, provider,
|
|
224
|
+
model, dimensions, status, and timestamps. The structured `search` contract
|
|
225
|
+
merges keyword FTS, wiki/index catalog hits, generated wiki chunks, and optional
|
|
226
|
+
vector results. `open-knowledge search --context` and MCP `knowledge_search`
|
|
227
|
+
turn those rows into reranked citation context packs with selected excerpts,
|
|
228
|
+
freshness and permission notes, graph evidence, and final rerank scores. The
|
|
229
|
+
local SQLite index can later move to pgvector or a managed hosted vector store
|
|
230
|
+
without changing CLI/MCP result shape.
|
|
226
231
|
|
|
227
232
|
## Agent Workflow
|
|
228
233
|
|
|
@@ -47,13 +47,18 @@ contracts.
|
|
|
47
47
|
The current local command surface is:
|
|
48
48
|
|
|
49
49
|
```bash
|
|
50
|
+
open-knowledge search "company wiki policy" --scope project --json
|
|
51
|
+
open-knowledge search "company wiki policy" --scope project --semantic --json
|
|
52
|
+
open-knowledge search "company wiki policy" --scope project --context --json
|
|
50
53
|
open-knowledge embeddings index --scope project --model openai:text-embedding-3-small
|
|
51
54
|
open-knowledge embeddings search "company wiki policy" --scope project --json
|
|
52
55
|
```
|
|
53
56
|
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
+
`search` is the structured hybrid layer for agents. `embeddings search` is the
|
|
58
|
+
lower-level vector-only command. MCP exposes the same capability through
|
|
59
|
+
`ok_search`, `knowledge_search`, `ok_embeddings_status`, `ok_embeddings_index`,
|
|
60
|
+
and `ok_semantic_search`. Deterministic `--fake` embeddings exist for tests and
|
|
61
|
+
offline verification only.
|
|
57
62
|
|
|
58
63
|
## Hosted Indexes
|
|
59
64
|
|
|
@@ -72,16 +77,18 @@ unauthorized content.
|
|
|
72
77
|
|
|
73
78
|
1. Normalize the query.
|
|
74
79
|
2. Embed the query if a semantic-capable provider is configured.
|
|
75
|
-
3. Run keyword FTS over source chunks and wiki chunks.
|
|
76
|
-
4.
|
|
77
|
-
5.
|
|
78
|
-
|
|
80
|
+
3. Run keyword FTS over source chunks and generated wiki chunks.
|
|
81
|
+
4. Search wiki page and machine-readable index catalog rows.
|
|
82
|
+
5. Run vector search over source chunks and wiki pages when semantic mode is
|
|
83
|
+
requested.
|
|
84
|
+
6. Expand candidate pages through backlinks and citations.
|
|
85
|
+
7. Drop stale candidates whose source revision/hash no longer matches
|
|
79
86
|
`open-files`.
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
87
|
+
8. Apply permission filters.
|
|
88
|
+
9. Merge and dedupe by source revision, wiki page, citation, and text hash.
|
|
89
|
+
10. Rerank by relevance, exact-match score, semantic score, freshness, citation
|
|
83
90
|
quality, and wiki authority.
|
|
84
|
-
|
|
91
|
+
11. Return structured results with source refs, citation spans, page refs,
|
|
85
92
|
scores, and reason codes.
|
|
86
93
|
|
|
87
94
|
## Result Shape
|
|
@@ -126,6 +133,12 @@ They should receive context packs:
|
|
|
126
133
|
|
|
127
134
|
This keeps agent prompts stable while the retrieval internals evolve.
|
|
128
135
|
|
|
136
|
+
The local context-pack implementation is available through
|
|
137
|
+
`open-knowledge search --context` and MCP `knowledge_search`. It reranks merged
|
|
138
|
+
search rows using exact-term coverage, citation availability, source freshness,
|
|
139
|
+
and source/wiki authority, then emits excerpts and citation objects that preserve
|
|
140
|
+
source refs, artifact URIs, revision/hash metadata, offsets, and provenance.
|
|
141
|
+
|
|
129
142
|
## Reindexing
|
|
130
143
|
|
|
131
144
|
Reindexing is driven by source revisions:
|
package/package.json
CHANGED
package/src/cli.ts
CHANGED
|
@@ -49,6 +49,8 @@ interface Flags {
|
|
|
49
49
|
purpose?: string;
|
|
50
50
|
model?: string;
|
|
51
51
|
dimensions?: number;
|
|
52
|
+
semantic?: boolean;
|
|
53
|
+
context?: boolean;
|
|
52
54
|
noColor?: boolean;
|
|
53
55
|
scope?: string;
|
|
54
56
|
olderThan?: number;
|
|
@@ -63,7 +65,7 @@ interface ParseResult {
|
|
|
63
65
|
flags: Flags;
|
|
64
66
|
}
|
|
65
67
|
|
|
66
|
-
const COMMANDS = ['add', 'list', 'get', 'delete', 'update', 'archive', 'restore', 'upsert', 'untag', 'export', 'prune', 'dedupe', 'stats', 'paths', 'storage', 'db', 'wiki', 'source', 'ingest', 'reindex', 'embeddings', 'providers', 'safety', 'help'];
|
|
68
|
+
const COMMANDS = ['add', 'list', 'get', 'delete', 'update', 'archive', 'restore', 'upsert', 'untag', 'export', 'prune', 'dedupe', 'stats', 'paths', 'storage', 'db', 'wiki', 'source', 'ingest', 'reindex', 'search', 'embeddings', 'providers', 'safety', 'help'];
|
|
67
69
|
const COMMAND_ALIASES: Record<string, string> = {
|
|
68
70
|
ls: 'list',
|
|
69
71
|
rm: 'delete',
|
|
@@ -101,6 +103,8 @@ function parseArgs(argv: string[]): ParseResult {
|
|
|
101
103
|
case '--purpose': flags.purpose = argv[i + 1]; i += 1; break;
|
|
102
104
|
case '--model': flags.model = argv[i + 1]; i += 1; break;
|
|
103
105
|
case '--dimensions': flags.dimensions = Number(argv[i + 1]); i += 1; break;
|
|
106
|
+
case '--semantic': flags.semantic = true; break;
|
|
107
|
+
case '--context': flags.context = true; break;
|
|
104
108
|
case '--fake': flags.fake = true; break;
|
|
105
109
|
case '--no-color': flags.noColor = true; break;
|
|
106
110
|
case '--scope': flags.scope = argv[i + 1]; i += 1; break;
|
|
@@ -175,6 +179,7 @@ Commands:
|
|
|
175
179
|
ingest manifest <file|s3://> Ingest an open-files manifest into knowledge.db
|
|
176
180
|
ingest source <source-ref> Ingest a read-only source ref into knowledge.db
|
|
177
181
|
reindex outbox <file|s3://> Consume open-files change events and invalidate chunks
|
|
182
|
+
search <query> Hybrid search sources, wiki pages, indexes, or context
|
|
178
183
|
embeddings status|index|search Build/query local vector embeddings
|
|
179
184
|
providers status|models|check Inspect AI SDK provider config and credentials
|
|
180
185
|
safety status|check|approve|audit|redact
|
|
@@ -186,6 +191,8 @@ Global Options:
|
|
|
186
191
|
--purpose <name> Read-only source purpose (default: knowledge_answer)
|
|
187
192
|
--model <provider:model> AI/embedding model ref
|
|
188
193
|
--dimensions <n> Embedding dimensions for local/fake providers
|
|
194
|
+
--semantic Include vector semantic results in search
|
|
195
|
+
--context Return a reranked citation context pack for search
|
|
189
196
|
--fake Use deterministic fake embeddings for local tests
|
|
190
197
|
--scope local|global|project Store scope (default: global ~/.hasna/apps/knowledge/)
|
|
191
198
|
--no-color Disable color output
|
|
@@ -247,6 +254,7 @@ function printCommandHelp(command: string): void {
|
|
|
247
254
|
if (command === 'source') { console.log('Usage: open-knowledge source resolve <source-ref> [--purpose knowledge_answer|knowledge_index] [--limit <n>] [--scope local|global|project] [--json]'); return; }
|
|
248
255
|
if (command === 'ingest') { console.log('Usage: open-knowledge ingest manifest <file|s3://bucket/key> | source <source-ref> [--purpose knowledge_index] [--scope local|global|project] [--json]'); return; }
|
|
249
256
|
if (command === 'reindex') { console.log('Usage: open-knowledge reindex outbox <file|s3://bucket/key> [--scope local|global|project] [--json]'); return; }
|
|
257
|
+
if (command === 'search') { console.log('Usage: open-knowledge search <query> [--context] [--semantic] [--model openai:text-embedding-3-small] [--limit <n>] [--dimensions <n>] [--fake] [--scope local|global|project] [--json]'); return; }
|
|
250
258
|
if (command === 'embeddings') { console.log('Usage: open-knowledge embeddings status|index|search [query] [--model openai:text-embedding-3-small] [--limit <n>] [--dimensions <n>] [--fake] [--scope local|global|project] [--json]'); return; }
|
|
251
259
|
if (command === 'providers') { console.log('Usage: open-knowledge providers status|models|check [provider|model-alias] [--scope local|global|project] [--json]'); return; }
|
|
252
260
|
if (command === 'safety') { console.log('Usage: open-knowledge safety status|check|approve|audit|redact [args] [--scope local|global|project] [--json]'); return; }
|
|
@@ -294,11 +302,11 @@ async function run(argv: string[]): Promise<void> {
|
|
|
294
302
|
if (flags.completions) {
|
|
295
303
|
const shell = flags.completions;
|
|
296
304
|
if (shell === 'bash') {
|
|
297
|
-
console.log(`_open_knowledge() { local cur; cur="${"$"}{COMP_WORDS[COMP_CWORD]}"; COMPREPLY=($(compgen -W "add list get update archive restore upsert untag delete export prune dedupe stats paths storage db wiki source ingest reindex embeddings providers safety help ls rm edit unarchive --json --yes --help --version --desc --page --limit --search --sort --id --store --title --content --url --tag --format --completions --purpose --model --dimensions --fake --no-color --scope --archived --include-archived" -- "$cur")); }; complete -F _open_knowledge open-knowledge`);
|
|
305
|
+
console.log(`_open_knowledge() { local cur; cur="${"$"}{COMP_WORDS[COMP_CWORD]}"; COMPREPLY=($(compgen -W "add list get update archive restore upsert untag delete export prune dedupe stats paths storage db wiki source ingest reindex search embeddings providers safety help ls rm edit unarchive --json --yes --help --version --desc --page --limit --search --sort --id --store --title --content --url --tag --format --completions --purpose --model --dimensions --semantic --context --fake --no-color --scope --archived --include-archived" -- "$cur")); }; complete -F _open_knowledge open-knowledge`);
|
|
298
306
|
} else if (shell === 'zsh') {
|
|
299
|
-
console.log(`#compdef open-knowledge\n_open_knowledge() { _arguments -C "1: :(add list get update archive restore upsert untag delete export prune dedupe stats paths storage db wiki source ingest reindex embeddings providers safety help ls rm edit unarchive)" "(--json)--json" "(--yes)-y" "(--help)--help" "(--version)--version" "(--desc)--desc" "(--archived)--archived" "(--include-archived)--include-archived" "(--fake)--fake" "(-p --page)"{-p,--page}"[page number]:number:" "(-l --limit)"{-l,--limit}"[items per page]:number:" "(-s --search)"{-s,--search}"[search text]:text:" "(--sort)--sort"\{created,title\}:" "(--id)--id[item id]:id:" "(--store)--store[store path]:path:" "(--title)--title[new title]:" "(--content)--content[new content]:" "(--url)--url[source url]:" "(-t --tag)"{-t,--tag}"[tag]:tag:" "(--format)--format[json|jsonl]:" "(--completions)--completions[output completions]:shell:(bash zsh fish):" "(--purpose)--purpose[purpose]:" "(--model)--model[model ref]:" "(--dimensions)--dimensions[embedding dimensions]:number:" "(--no-color)--no-color[disable color]" "(--scope)--scope"\{local,global,project\}:" }; _open_knowledge`);
|
|
307
|
+
console.log(`#compdef open-knowledge\n_open_knowledge() { _arguments -C "1: :(add list get update archive restore upsert untag delete export prune dedupe stats paths storage db wiki source ingest reindex search embeddings providers safety help ls rm edit unarchive)" "(--json)--json" "(--yes)-y" "(--help)--help" "(--version)--version" "(--desc)--desc" "(--archived)--archived" "(--include-archived)--include-archived" "(--semantic)--semantic" "(--context)--context" "(--fake)--fake" "(-p --page)"{-p,--page}"[page number]:number:" "(-l --limit)"{-l,--limit}"[items per page]:number:" "(-s --search)"{-s,--search}"[search text]:text:" "(--sort)--sort"\{created,title\}:" "(--id)--id[item id]:id:" "(--store)--store[store path]:path:" "(--title)--title[new title]:" "(--content)--content[new content]:" "(--url)--url[source url]:" "(-t --tag)"{-t,--tag}"[tag]:tag:" "(--format)--format[json|jsonl]:" "(--completions)--completions[output completions]:shell:(bash zsh fish):" "(--purpose)--purpose[purpose]:" "(--model)--model[model ref]:" "(--dimensions)--dimensions[embedding dimensions]:number:" "(--no-color)--no-color[disable color]" "(--scope)--scope"\{local,global,project\}:" }; _open_knowledge`);
|
|
300
308
|
} else if (shell === 'fish') {
|
|
301
|
-
console.log(`complete -c open-knowledge -f; complete -c open-knowledge -a "add list get update archive restore upsert untag delete export prune dedupe stats paths storage db wiki source ingest reindex embeddings providers safety help ls rm edit unarchive"; complete -c open-knowledge -l json; complete -c open-knowledge -l yes -s y; complete -c open-knowledge -l help -s h; complete -c open-knowledge -l version -s v; complete -c open-knowledge -l desc; complete -c open-knowledge -l archived; complete -c open-knowledge -l include-archived; complete -c open-knowledge -l fake; complete -c open-knowledge -s p -l page; complete -c open-knowledge -s l -l limit; complete -c open-knowledge -s s -l search; complete -c open-knowledge -l sort; complete -c open-knowledge -l id; complete -c open-knowledge -l store; complete -c open-knowledge -l title; complete -c open-knowledge -l content; complete -c open-knowledge -l url; complete -c open-knowledge -s t -l tag; complete -c open-knowledge -l format; complete -c open-knowledge -l completions; complete -c open-knowledge -l purpose; complete -c open-knowledge -l model; complete -c open-knowledge -l dimensions; complete -c open-knowledge -l no-color; complete -c open-knowledge -l scope -a "local global project"`);
|
|
309
|
+
console.log(`complete -c open-knowledge -f; complete -c open-knowledge -a "add list get update archive restore upsert untag delete export prune dedupe stats paths storage db wiki source ingest reindex search embeddings providers safety help ls rm edit unarchive"; complete -c open-knowledge -l json; complete -c open-knowledge -l yes -s y; complete -c open-knowledge -l help -s h; complete -c open-knowledge -l version -s v; complete -c open-knowledge -l desc; complete -c open-knowledge -l archived; complete -c open-knowledge -l include-archived; complete -c open-knowledge -l semantic; complete -c open-knowledge -l context; complete -c open-knowledge -l fake; complete -c open-knowledge -s p -l page; complete -c open-knowledge -s l -l limit; complete -c open-knowledge -s s -l search; complete -c open-knowledge -l sort; complete -c open-knowledge -l id; complete -c open-knowledge -l store; complete -c open-knowledge -l title; complete -c open-knowledge -l content; complete -c open-knowledge -l url; complete -c open-knowledge -s t -l tag; complete -c open-knowledge -l format; complete -c open-knowledge -l completions; complete -c open-knowledge -l purpose; complete -c open-knowledge -l model; complete -c open-knowledge -l dimensions; complete -c open-knowledge -l no-color; complete -c open-knowledge -l scope -a "local global project"`);
|
|
302
310
|
} else {
|
|
303
311
|
throw new Error("Invalid --completions value. Use 'bash', 'zsh', or 'fish'.");
|
|
304
312
|
}
|
|
@@ -578,6 +586,33 @@ async function run(argv: string[]): Promise<void> {
|
|
|
578
586
|
throw new Error("Invalid embeddings action. Use 'status', 'index', or 'search'.");
|
|
579
587
|
}
|
|
580
588
|
|
|
589
|
+
if (command === 'search') {
|
|
590
|
+
const query = positional.slice(1).join(' ');
|
|
591
|
+
if (!query) throw new Error('Usage: open-knowledge search <query>');
|
|
592
|
+
if (flags.context) {
|
|
593
|
+
const context = await service.retrieveContext({
|
|
594
|
+
query,
|
|
595
|
+
limit: flags.limit,
|
|
596
|
+
semantic: flags.semantic,
|
|
597
|
+
modelRef: flags.model,
|
|
598
|
+
dimensions: flags.dimensions,
|
|
599
|
+
fake: flags.fake,
|
|
600
|
+
});
|
|
601
|
+
output({ ok: true, ...context, message: `${context.excerpts.length} context excerpt(s)` }, flags.json);
|
|
602
|
+
return;
|
|
603
|
+
}
|
|
604
|
+
const result = await service.search({
|
|
605
|
+
query,
|
|
606
|
+
limit: flags.limit,
|
|
607
|
+
semantic: flags.semantic,
|
|
608
|
+
modelRef: flags.model,
|
|
609
|
+
dimensions: flags.dimensions,
|
|
610
|
+
fake: flags.fake,
|
|
611
|
+
});
|
|
612
|
+
output({ ok: true, ...result, message: `${result.results.length} search result(s)` }, flags.json);
|
|
613
|
+
return;
|
|
614
|
+
}
|
|
615
|
+
|
|
581
616
|
if (command === 'providers') {
|
|
582
617
|
const action = positional[1] ?? 'status';
|
|
583
618
|
if (action === 'status') {
|
package/src/mcp.js
CHANGED
|
@@ -169,6 +169,40 @@ export function buildServer() {
|
|
|
169
169
|
}
|
|
170
170
|
});
|
|
171
171
|
|
|
172
|
+
registerTool(server, 'ok_search', 'Hybrid knowledge search', 'Search source chunks, generated wiki pages, sharded indexes, and optional semantic vectors', {
|
|
173
|
+
scope: scopeField,
|
|
174
|
+
query: z.string().describe('Search query'),
|
|
175
|
+
limit: z.number().optional().describe('Maximum results'),
|
|
176
|
+
semantic: z.boolean().optional().describe('Include vector semantic results'),
|
|
177
|
+
model: z.string().optional().describe('Embedding model ref, default openai:text-embedding-3-small'),
|
|
178
|
+
dimensions: z.number().optional().describe('Embedding dimensions for deterministic fake mode'),
|
|
179
|
+
fake: z.boolean().optional().describe('Use deterministic fake embeddings for local tests'),
|
|
180
|
+
}, async ({ scope, query, limit, semantic, model, dimensions, fake }) => {
|
|
181
|
+
const service = createKnowledgeService({ scope });
|
|
182
|
+
try {
|
|
183
|
+
return jsonText({ ok: true, ...await service.search({ query, limit, semantic, modelRef: model, dimensions, fake }) });
|
|
184
|
+
} catch (error) {
|
|
185
|
+
return errorText(error instanceof Error ? error.message : String(error));
|
|
186
|
+
}
|
|
187
|
+
});
|
|
188
|
+
|
|
189
|
+
registerTool(server, 'knowledge_search', 'Knowledge context search', 'Return a reranked citation context pack for agent prompts', {
|
|
190
|
+
scope: scopeField,
|
|
191
|
+
query: z.string().describe('Search query or prompt'),
|
|
192
|
+
limit: z.number().optional().describe('Maximum context results'),
|
|
193
|
+
semantic: z.boolean().optional().describe('Include vector semantic results'),
|
|
194
|
+
model: z.string().optional().describe('Embedding model ref, default openai:text-embedding-3-small'),
|
|
195
|
+
dimensions: z.number().optional().describe('Embedding dimensions for deterministic fake mode'),
|
|
196
|
+
fake: z.boolean().optional().describe('Use deterministic fake embeddings for local tests'),
|
|
197
|
+
}, async ({ scope, query, limit, semantic, model, dimensions, fake }) => {
|
|
198
|
+
const service = createKnowledgeService({ scope });
|
|
199
|
+
try {
|
|
200
|
+
return jsonText({ ok: true, ...await service.retrieveContext({ query, limit, semantic, modelRef: model, dimensions, fake }) });
|
|
201
|
+
} catch (error) {
|
|
202
|
+
return errorText(error instanceof Error ? error.message : String(error));
|
|
203
|
+
}
|
|
204
|
+
});
|
|
205
|
+
|
|
172
206
|
registerTool(server, 'ok_add', 'Add a knowledge item', 'Add a new item to the knowledge store', {
|
|
173
207
|
title: z.string().describe('Item title'),
|
|
174
208
|
content: z.string().describe('Item content/body'),
|
package/src/retrieval.ts
ADDED
|
@@ -0,0 +1,326 @@
|
|
|
1
|
+
import { createHash } from 'node:crypto';
|
|
2
|
+
import { openKnowledgeDb } from './knowledge-db';
|
|
3
|
+
import { isStaleStatus } from './provenance';
|
|
4
|
+
import { hybridSearch, type HybridSearchEntry, type HybridSearchOptions, type HybridSearchResult, type SearchProvenance } from './search';
|
|
5
|
+
|
|
6
|
+
/**
 * Options accepted by `retrieveKnowledgeContext`: every hybrid-search option
 * plus an optional per-excerpt character budget.
 */
export interface RetrievalOptions extends HybridSearchOptions {
  /**
   * Requested excerpt length in characters. `retrieveKnowledgeContext`
   * defaults this to 1200 and clamps it to the range 200..4000.
   */
  contextChars?: number;
}
|
|
9
|
+
|
|
10
|
+
/**
 * A hybrid search entry after reranking. The inherited `score` carries the
 * final blended score; `rerank` exposes the individual components.
 */
export interface RerankedSearchEntry extends HybridSearchEntry {
  /** Score components produced by `rerank`. */
  rerank: {
    /** Score reported by the hybrid search before reranking. */
    base_score: number;
    /** Weighted blend of all components, capped at 1 and rounded to 6 decimals. */
    final_score: number;
    /** Fraction of distinct query terms found in the entry's title/text. */
    exact_score: number;
    /** Citation-availability component. */
    citation_score: number;
    /** Freshness component (0 for stale provenance). */
    freshness_score: number;
    /** Authority component derived from the entry kind. */
    authority_score: number;
  };
}
|
|
20
|
+
|
|
21
|
+
/**
 * Citation object emitted for each reranked result. Fields the result does
 * not provide are `null`.
 */
export interface RetrievalCitation {
  /** Stable `cite_…` identifier hashed from kind, result id, and source/artifact URIs. */
  id: string;
  /** Id of the search result this citation describes. */
  result_id: string;
  /** Kind of the search result this citation describes. */
  kind: HybridSearchEntry['kind'];

  /** `source.uri` of the result. */
  source_uri: string | null;
  /** `source.ref` of the result. */
  source_ref: string | null;
  /** `artifact.uri` of the result. */
  artifact_uri: string | null;
  /** `artifact.path` of the result. */
  artifact_path: string | null;

  /** `source.revision` of the result. */
  revision: string | null;
  /** `source.hash`, falling back to `artifact.hash`. */
  hash: string | null;

  /** Chunk id from the result's citation span. */
  chunk_id: string | null;
  /** Start offset from the result's citation span. */
  start_offset: number | null;
  /** End offset from the result's citation span. */
  end_offset: number | null;

  /** Whitespace-collapsed quote taken from the result text or title (500-character budget). */
  quote: string | null;
  /** Provenance record copied from the result unchanged. */
  provenance: SearchProvenance | null;
}
|
|
37
|
+
|
|
38
|
+
/** Excerpt of a reranked result, linked to its citation. */
export interface RetrievalExcerpt {
  /** Stable `excerpt_…` identifier hashed from the result kind and id. */
  id: string;
  /** Id of the search result the excerpt was cut from. */
  result_id: string;
  /** Id of the matching `RetrievalCitation`, when there is one. */
  citation_id: string | null;
  /** Kind of the search result the excerpt was cut from. */
  kind: HybridSearchEntry['kind'];
  /** Whitespace-collapsed excerpt text. */
  text: string;
  /** Final rerank score of the underlying result. */
  score: number;
}
|
|
46
|
+
|
|
47
|
+
/**
 * Citation and backlink rows read from the knowledge database for the
 * results of a context pack.
 */
export interface RetrievalGraphEvidence {
  /** Rows from the `citations` table matching result chunk ids or wiki page ids. */
  citations: {
    id: string;
    chunk_id: string | null;
    wiki_page_id: string | null;
    source_uri: string;
    quote: string | null;
    start_offset: number | null;
    end_offset: number | null;
  }[];
  /** Rows from the `wiki_backlinks` table touching any wiki page in the results. */
  backlinks: {
    from_page_id: string;
    to_page_id: string;
    label: string | null;
  }[];
}
|
|
63
|
+
|
|
64
|
+
/** Result of `retrieveKnowledgeContext`: reranked results plus citation context. */
export interface KnowledgeContextPack {
  /** Query string as reported by the hybrid search. */
  query: string;
  /** NFKC-normalized, whitespace-collapsed, lower-cased form of `query`. */
  normalized_query: string;
  /** ISO-8601 timestamp taken when the pack was assembled. */
  created_at: string;
  /** Search mode reported by the hybrid search. */
  mode: HybridSearchResult['mode'];
  /** Hybrid-search warnings followed by `permission_filtered:` / `stale_filtered:` entries. */
  warnings: string[];
  /** Result counts reported by the hybrid search (before filtering and reranking). */
  search_counts: HybridSearchResult['counts'];

  /** Filtered results, reranked, sorted by descending score, and cut to the search limit. */
  results: RerankedSearchEntry[];
  /** One citation per entry in `results`, in the same order. */
  citations: RetrievalCitation[];
  /** Excerpts for the results that have quotable text. */
  excerpts: RetrievalExcerpt[];
  /** Citation and backlink rows related to `results`. */
  graph: RetrievalGraphEvidence;

  /** Human-readable notes collected while filtering and reranking. */
  notes: {
    permissions: string[];
    freshness: string[];
  };
}
|
|
80
|
+
|
|
81
|
+
/** Column shape selected from the `citations` table by `loadGraphEvidence`. */
interface CitationRow {
  id: string;
  /** Wiki page the citation belongs to, if any. */
  wiki_page_id: string | null;
  /** Chunk the citation points at, if any. */
  chunk_id: string | null;
  source_uri: string;
  quote: string | null;
  start_offset: number | null;
  end_offset: number | null;
}
|
|
90
|
+
|
|
91
|
+
/** Column shape selected from the `wiki_backlinks` table by `loadGraphEvidence`. */
interface BacklinkRow {
  /** Page id stored in the `from_page_id` column. */
  from_page_id: string;
  /** Page id stored in the `to_page_id` column. */
  to_page_id: string;
  /** Optional label stored with the link. */
  label: string | null;
}
|
|
96
|
+
|
|
97
|
+
/**
 * Builds a deterministic identifier of the form `<prefix>_<hex>`, where
 * `<hex>` is the first 20 hex characters of the SHA-256 digest of `value`.
 *
 * @param prefix Label placed before the underscore.
 * @param value  Text that is hashed.
 * @returns The prefixed, truncated digest.
 */
function stableId(prefix: string, value: string): string {
  const hasher = createHash('sha256');
  hasher.update(value);
  const shortHex = hasher.digest('hex').slice(0, 20);
  return `${prefix}_${shortHex}`;
}
|
|
100
|
+
|
|
101
|
+
/**
 * Canonicalizes a query: Unicode NFKC normalization, surrounding whitespace
 * removed, inner whitespace runs collapsed to one space, then lower-cased.
 *
 * @param query Raw query text.
 * @returns The canonical form of the query.
 */
function normalizeQuery(query: string): string {
  const compatible = query.normalize('NFKC');
  const singleSpaced = compatible.trim().replace(/\s+/g, ' ');
  return singleSpaced.toLowerCase();
}
|
|
104
|
+
|
|
105
|
+
/**
 * Splits a query into its distinct terms. A term is a run of Unicode letters,
 * digits, or underscores in the normalized query. Terms keep first-seen order
 * and at most 16 are returned.
 *
 * @param query Raw query text.
 * @returns Up to 16 unique terms; empty when the query has none.
 */
function queryTerms(query: string): string[] {
  const tokens = normalizeQuery(query).match(/[\p{L}\p{N}_]+/gu);
  if (tokens === null) return [];
  const unique = new Set(tokens);
  return Array.from(unique).slice(0, 16);
}
|
|
108
|
+
|
|
109
|
+
/**
 * Builds the searchable haystack for a result: the non-empty title and text
 * joined by a space, NFKC-normalized and lower-cased.
 *
 * The NFKC step mirrors `normalizeQuery`, which applies NFKC to the query
 * before terms are extracted. Without it, compatibility forms in the indexed
 * text (full-width letters, ligatures) could never match their normalized
 * query terms.
 *
 * @param result Search entry whose title/text should be matched against.
 * @returns The normalized haystack, or an empty string when both are missing.
 */
function textForResult(result: HybridSearchEntry): string {
  return [result.title, result.text]
    .filter(Boolean)
    .join(' ')
    .normalize('NFKC')
    .toLowerCase();
}
|
|
112
|
+
|
|
113
|
+
/**
 * Measures exact-term coverage: the share of `terms` that occur as substrings
 * of the result's title/text haystack, rounded to 6 decimals.
 *
 * @param result Search entry being scored.
 * @param terms  Distinct query terms.
 * @returns A value in 0..1; 0 when there are no terms.
 */
function exactScore(result: HybridSearchEntry, terms: string[]): number {
  const total = terms.length;
  if (total === 0) return 0;

  const haystack = textForResult(result);
  let hits = 0;
  for (const term of terms) {
    if (haystack.includes(term)) hits += 1;
  }
  return Number((hits / total).toFixed(6));
}
|
|
119
|
+
|
|
120
|
+
/**
 * Decides whether a result may be used as read-only context.
 *
 * Missing provenance, or provenance carrying neither flag, is accepted. When
 * a `read_only` flag is present it decides alone; otherwise a
 * `read_only_sources` flag decides. Only the literal `true` counts.
 *
 * @param provenance Provenance record of a search result, or `null`.
 * @returns `false` only when a present flag is not exactly `true`.
 */
function hasReadOnlyProvenance(provenance: SearchProvenance | null): boolean {
  if (provenance) {
    if ('read_only' in provenance) {
      return provenance.read_only === true;
    }
    if ('read_only_sources' in provenance) {
      return provenance.read_only_sources === true;
    }
  }
  return true;
}
|
|
126
|
+
|
|
127
|
+
/**
 * Reports whether provenance marks a result as stale: either a truthy
 * `stale` flag, or a `status` that `isStaleStatus` classifies as stale.
 *
 * @param provenance Provenance record of a search result, or `null`.
 * @returns `true` when the result should be treated as stale.
 */
function isStale(provenance: SearchProvenance | null): boolean {
  if (!provenance) return false;

  const flaggedStale = 'stale' in provenance && Boolean(provenance.stale);
  if (flaggedStale) return true;

  return 'status' in provenance ? isStaleStatus(provenance.status) : false;
}
|
|
133
|
+
|
|
134
|
+
/**
 * Scores how well a result is pinned to a known source state.
 *
 * - 0    stale provenance
 * - 1    source hash or revision present
 * - 0.85 artifact hash present
 * - 0.75 provenance lists at least one source ref
 * - 0.55 otherwise
 *
 * @param result Search entry being scored.
 * @returns The freshness component used by `rerank`.
 */
function freshnessScore(result: HybridSearchEntry): number {
  const provenance = result.provenance;
  if (isStale(provenance)) return 0;

  const pinnedToSource = Boolean(result.source?.hash || result.source?.revision);
  if (pinnedToSource) return 1;
  if (result.artifact?.hash) return 0.85;

  return provenance && 'source_refs' in provenance && provenance.source_refs.length > 0
    ? 0.75
    : 0.55;
}
|
|
141
|
+
|
|
142
|
+
/**
 * Scores how citable a result is.
 *
 * - 1    citation chunk id plus a source or artifact URI
 * - 0.75 provenance marks the result as citation-required
 * - 0.65 artifact URI only
 * - 0.35 otherwise
 *
 * @param result Search entry being scored.
 * @returns The citation component used by `rerank`.
 */
function citationScore(result: HybridSearchEntry): number {
  const artifactUri = result.artifact?.uri;
  const hasLocation = Boolean(result.source?.uri || artifactUri);
  if (result.citation?.chunk_id && hasLocation) return 1;

  const provenance = result.provenance;
  if (provenance && 'citation_required' in provenance && provenance.citation_required) {
    return 0.75;
  }
  return artifactUri ? 0.65 : 0.35;
}
|
|
148
|
+
|
|
149
|
+
/**
 * Maps the result kind to a fixed authority weight: wiki chunks 0.85, source
 * chunks 0.8, wiki pages 0.65, and every other kind 0.55.
 *
 * @param result Search entry being scored.
 * @returns The authority component used by `rerank`.
 */
function authorityScore(result: HybridSearchEntry): number {
  switch (result.kind) {
    case 'wiki_chunk':
      return 0.85;
    case 'source_chunk':
      return 0.8;
    case 'wiki_page':
      return 0.65;
    default:
      return 0.55;
  }
}
|
|
155
|
+
|
|
156
|
+
/**
 * Blends the hybrid-search score with exact-term, citation, freshness, and
 * authority components into a final score.
 *
 * Weights are 0.65 (base), 0.1 (exact), 0.1 (citation), 0.1 (freshness), and
 * 0.05 (authority). The blend is capped at 1 and rounded to 6 decimals.
 * Reason codes `exact_term`, `cited_source`, and `fresh_source` are added when
 * the matching component clears its threshold.
 *
 * @param result Search entry to rerank; it is not mutated.
 * @param terms  Distinct query terms used for exact-term coverage.
 * @returns A copy of the entry with updated `score`, `reasons`, and a `rerank` breakdown.
 */
function rerank(result: HybridSearchEntry, terms: string[]): RerankedSearchEntry {
  const baseScore = result.score;
  const exact = exactScore(result, terms);
  const citation = citationScore(result);
  const freshness = freshnessScore(result);
  const authority = authorityScore(result);

  const blended =
    baseScore * 0.65 +
    exact * 0.1 +
    citation * 0.1 +
    freshness * 0.1 +
    authority * 0.05;
  const finalScore = Number(Math.min(1, blended).toFixed(6));

  // A Set keeps the incoming reason codes unique while new ones are appended.
  const reasonCodes = new Set(result.reasons);
  if (exact > 0.5) reasonCodes.add('exact_term');
  if (citation >= 0.75) reasonCodes.add('cited_source');
  if (freshness >= 0.85) reasonCodes.add('fresh_source');

  return {
    ...result,
    score: finalScore,
    reasons: [...reasonCodes],
    rerank: {
      base_score: baseScore,
      exact_score: exact,
      citation_score: citation,
      freshness_score: freshness,
      authority_score: authority,
      final_score: finalScore,
    },
  };
}
|
|
185
|
+
|
|
186
|
+
/**
 * Produces a whitespace-collapsed quote for a result, at most `maxChars`
 * characters long.
 *
 * The result text is preferred; the title is used when the text is missing,
 * empty, or whitespace-only. When the quote must be cut, room is reserved for
 * the `...` suffix so the returned string never exceeds `maxChars`.
 *
 * @param result   Search entry to quote.
 * @param maxChars Maximum length of the returned string.
 * @returns The quote, or `null` when neither text nor title has content.
 */
function quoteFor(result: HybridSearchEntry, maxChars: number): string | null {
  const ellipsis = '...';
  const collapse = (value: string | null | undefined): string =>
    (value ?? '').replace(/\s+/g, ' ').trim();

  // `||` rather than `??`: an empty or blank text must still fall back to the title.
  const normalized = collapse(result.text) || collapse(result.title);
  if (!normalized) return null;
  if (normalized.length <= maxChars) return normalized;

  // Budget too small to fit the suffix at all: hard-cut without it.
  if (maxChars < ellipsis.length) return normalized.slice(0, Math.max(0, maxChars));

  const budget = maxChars - ellipsis.length;
  return `${normalized.slice(0, budget).trimEnd()}${ellipsis}`;
}
|
|
192
|
+
|
|
193
|
+
/**
 * Builds the citation object for a reranked result.
 *
 * The id is hashed from the NUL-separated kind, result id, source URI, and
 * artifact URI. Missing source/artifact/citation fields become `null`; the
 * quote is cut from the result with a 500-character budget.
 *
 * @param result Reranked search entry.
 * @returns The citation describing where the result came from.
 */
function citationFor(result: RerankedSearchEntry): RetrievalCitation {
  const { source, artifact, citation: span } = result;
  const idSeed = [result.kind, result.id, source?.uri ?? '', artifact?.uri ?? ''].join('\u0000');

  const citation: RetrievalCitation = {
    id: stableId('cite', idSeed),
    result_id: result.id,
    kind: result.kind,
    source_uri: source?.uri ?? null,
    source_ref: source?.ref ?? null,
    artifact_uri: artifact?.uri ?? null,
    artifact_path: artifact?.path ?? null,
    revision: source?.revision ?? null,
    hash: source?.hash ?? artifact?.hash ?? null,
    chunk_id: span?.chunk_id ?? null,
    start_offset: span?.start_offset ?? null,
    end_offset: span?.end_offset ?? null,
    quote: quoteFor(result, 500),
    provenance: result.provenance,
  };
  return citation;
}
|
|
212
|
+
|
|
213
|
+
/**
 * Builds the excerpt for a reranked result.
 *
 * @param result       Reranked search entry to excerpt.
 * @param citation     Citation already built for the same result.
 * @param contextChars Character budget for the excerpt text.
 * @returns The excerpt, or `null` when the result has no quotable text.
 */
function excerptFor(result: RerankedSearchEntry, citation: RetrievalCitation, contextChars: number): RetrievalExcerpt | null {
  const body = quoteFor(result, contextChars);
  if (!body) return null;

  const excerpt: RetrievalExcerpt = {
    id: stableId('excerpt', [result.kind, result.id].join('\u0000')),
    result_id: result.id,
    citation_id: citation.id,
    kind: result.kind,
    text: body,
    score: result.score,
  };
  return excerpt;
}
|
|
225
|
+
|
|
226
|
+
/**
 * Renders one SQL `?` placeholder per value, separated by `, `, for use
 * inside an `IN (...)` clause.
 *
 * @param values Values that will be bound to the placeholders.
 * @returns For example `?, ?, ?` for three values; an empty string for none.
 */
function placeholders(values: unknown[]): string {
  const marks = values.map((): string => '?');
  return marks.join(', ');
}
|
|
229
|
+
|
|
230
|
+
/**
 * Loads citation and backlink rows related to the given results.
 *
 * Citations are looked up twice: by the chunk ids referenced in result
 * citations, and by the ids of `wiki_page` results. A row matching both
 * lookups is returned once, keyed by its `id`. Ids are de-duplicated before
 * binding so repeated results do not inflate the SQL parameter list.
 * Backlinks are loaded for `wiki_page` results in either direction. Each
 * query is limited to 50 rows.
 *
 * The database is opened only when there is something to look up, and is
 * always closed again.
 *
 * @param dbPath  Path passed to `openKnowledgeDb`.
 * @param results Reranked results whose graph evidence is wanted.
 * @returns Unique citation rows and backlink rows; both empty when no result
 *          carries a chunk id or is a wiki page.
 */
function loadGraphEvidence(dbPath: string, results: RerankedSearchEntry[]): RetrievalGraphEvidence {
  const chunkIds = Array.from(new Set(
    results
      .map((result) => result.citation?.chunk_id)
      .filter((id): id is string => Boolean(id)),
  ));
  const wikiPageIds = Array.from(new Set(
    results
      .filter((result) => result.kind === 'wiki_page')
      .map((result) => result.id),
  ));

  const citationsById = new Map<string, CitationRow>();
  const backlinks: BacklinkRow[] = [];
  if (chunkIds.length === 0 && wikiPageIds.length === 0) return { citations: [], backlinks };

  // First occurrence wins, so chunk-based hits keep their position.
  const collect = (rows: CitationRow[]): void => {
    for (const row of rows) {
      if (!citationsById.has(row.id)) citationsById.set(row.id, row);
    }
  };

  const db = openKnowledgeDb(dbPath);
  try {
    if (chunkIds.length > 0) {
      collect(db.query<CitationRow, string[]>(
        `SELECT id, wiki_page_id, chunk_id, source_uri, quote, start_offset, end_offset
         FROM citations
         WHERE chunk_id IN (${placeholders(chunkIds)})
         ORDER BY created_at DESC
         LIMIT 50`,
      ).all(...chunkIds));
    }
    if (wikiPageIds.length > 0) {
      collect(db.query<CitationRow, string[]>(
        `SELECT id, wiki_page_id, chunk_id, source_uri, quote, start_offset, end_offset
         FROM citations
         WHERE wiki_page_id IN (${placeholders(wikiPageIds)})
         ORDER BY created_at DESC
         LIMIT 50`,
      ).all(...wikiPageIds));
      // The id list is bound twice: once for each side of the OR.
      backlinks.push(...db.query<BacklinkRow, string[]>(
        `SELECT from_page_id, to_page_id, label
         FROM wiki_backlinks
         WHERE from_page_id IN (${placeholders(wikiPageIds)}) OR to_page_id IN (${placeholders(wikiPageIds)})
         LIMIT 50`,
      ).all(...wikiPageIds, ...wikiPageIds));
    }
  } finally {
    db.close();
  }
  return { citations: Array.from(citationsById.values()), backlinks };
}
|
|
268
|
+
|
|
269
|
+
/**
 * Runs a hybrid search and turns the results into a citation context pack.
 *
 * Steps: run `hybridSearch`; drop results whose provenance is not read-only
 * or is stale (recording a warning and a note for each); rerank, sort by
 * descending score (ties broken by id), and cut to the search limit; build
 * one citation and, where text exists, one excerpt per result; load related
 * graph evidence from the knowledge database.
 *
 * @param options Hybrid-search options plus an optional `contextChars`
 *                excerpt budget (default 1200, clamped to 200..4000).
 * @returns The assembled context pack.
 */
export async function retrieveKnowledgeContext(options: RetrievalOptions): Promise<KnowledgeContextPack> {
  const requestedChars = options.contextChars ?? 1200;
  const contextChars = Math.max(200, Math.min(requestedChars, 4000));

  const search = await hybridSearch(options);
  const terms = queryTerms(search.query);
  const warnings = [...search.warnings];
  const permissionNotes = new Set<string>();
  const freshnessNotes = new Set<string>();

  // Permission and staleness filtering happens before reranking so dropped
  // rows never compete for a slot within the limit.
  const eligible: HybridSearchEntry[] = [];
  for (const candidate of search.results) {
    if (!hasReadOnlyProvenance(candidate.provenance)) {
      warnings.push(`permission_filtered: ${candidate.kind}:${candidate.id}`);
      permissionNotes.add('Dropped a result because provenance was not read-only.');
      continue;
    }
    if (isStale(candidate.provenance)) {
      warnings.push(`stale_filtered: ${candidate.kind}:${candidate.id}`);
      freshnessNotes.add('Dropped a stale result whose source status requires reindexing.');
      continue;
    }
    eligible.push(candidate);
  }

  const results = eligible
    .map((entry) => rerank(entry, terms))
    .sort((left, right) => right.score - left.score || left.id.localeCompare(right.id))
    .slice(0, search.limit);

  const citations: RetrievalCitation[] = [];
  const excerpts: RetrievalExcerpt[] = [];
  for (const result of results) {
    const citation = citationFor(result);
    citations.push(citation);

    const excerpt = excerptFor(result, citation, contextChars);
    if (excerpt) excerpts.push(excerpt);

    const provenance = result.provenance;
    if (provenance && 'read_only' in provenance && provenance.read_only) {
      permissionNotes.add('All source-backed excerpts are read-only and citation-required.');
    }
    if (result.rerank.freshness_score >= 0.85) {
      freshnessNotes.add('Fresh source revision/hash or artifact hash is present for top context.');
    }
  }

  return {
    query: search.query,
    normalized_query: normalizeQuery(search.query),
    created_at: new Date().toISOString(),
    mode: search.mode,
    warnings,
    search_counts: search.counts,
    results,
    citations,
    excerpts,
    graph: loadGraphEvidence(options.dbPath, results),
    notes: {
      permissions: [...permissionNotes],
      freshness: [...freshnessNotes],
    },
  };
}
|