@hasna/knowledge 0.2.15 → 0.2.16
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +10 -1
- package/bin/open-knowledge-mcp.js +252 -9
- package/bin/open-knowledge.js +54 -42
- package/docs/architecture/ai-native-knowledge-base.md +5 -2
- package/docs/architecture/hybrid-semantic-search.md +9 -2
- package/package.json +1 -1
- package/src/cli.ts +20 -5
- package/src/mcp.js +17 -0
- package/src/retrieval.ts +326 -0
- package/src/service.ts +10 -0
|
@@ -223,8 +223,11 @@ generated metadata rows, not raw source bytes, and pins each row to `open-files`
|
|
|
223
223
|
provenance: source ref/URI, revision/hash, chunk offsets, token count, provider,
|
|
224
224
|
model, dimensions, status, and timestamps. The structured `search` contract
|
|
225
225
|
merges keyword FTS, wiki/index catalog hits, generated wiki chunks, and optional
|
|
226
|
-
vector results
|
|
227
|
-
|
|
226
|
+
vector results. `open-knowledge search --context` and MCP `knowledge_search`
|
|
227
|
+
turn those rows into reranked citation context packs with selected excerpts,
|
|
228
|
+
freshness and permission notes, graph evidence, and final rerank scores. The
|
|
229
|
+
local SQLite index can later move to pgvector or a managed hosted vector store
|
|
230
|
+
without changing CLI/MCP result shape.
|
|
228
231
|
|
|
229
232
|
## Agent Workflow
|
|
230
233
|
|
|
@@ -49,14 +49,15 @@ The current local command surface is:
|
|
|
49
49
|
```bash
|
|
50
50
|
open-knowledge search "company wiki policy" --scope project --json
|
|
51
51
|
open-knowledge search "company wiki policy" --scope project --semantic --json
|
|
52
|
+
open-knowledge search "company wiki policy" --scope project --context --json
|
|
52
53
|
open-knowledge embeddings index --scope project --model openai:text-embedding-3-small
|
|
53
54
|
open-knowledge embeddings search "company wiki policy" --scope project --json
|
|
54
55
|
```
|
|
55
56
|
|
|
56
57
|
`search` is the structured hybrid layer for agents. `embeddings search` is the
|
|
57
58
|
lower-level vector-only command. MCP exposes the same capability through
|
|
58
|
-
`ok_search`, `ok_embeddings_status`, `ok_embeddings_index`,
|
|
59
|
-
`ok_semantic_search`. Deterministic `--fake` embeddings exist for tests and
|
|
59
|
+
`ok_search`, `knowledge_search`, `ok_embeddings_status`, `ok_embeddings_index`,
|
|
60
|
+
and `ok_semantic_search`. Deterministic `--fake` embeddings exist for tests and
|
|
60
61
|
offline verification only.
|
|
61
62
|
|
|
62
63
|
## Hosted Indexes
|
|
@@ -132,6 +133,12 @@ They should receive context packs:
|
|
|
132
133
|
|
|
133
134
|
This keeps agent prompts stable while the retrieval internals evolve.
|
|
134
135
|
|
|
136
|
+
The local context-pack implementation is available through
|
|
137
|
+
`open-knowledge search --context` and MCP `knowledge_search`. It reranks merged
|
|
138
|
+
search rows using exact-term coverage, citation availability, source freshness,
|
|
139
|
+
and source/wiki authority, then emits excerpts and citation objects that preserve
|
|
140
|
+
source refs, artifact URIs, revision/hash metadata, offsets, and provenance.
|
|
141
|
+
|
|
135
142
|
## Reindexing
|
|
136
143
|
|
|
137
144
|
Reindexing is driven by source revisions:
|
package/package.json
CHANGED
package/src/cli.ts
CHANGED
|
@@ -50,6 +50,7 @@ interface Flags {
|
|
|
50
50
|
model?: string;
|
|
51
51
|
dimensions?: number;
|
|
52
52
|
semantic?: boolean;
|
|
53
|
+
context?: boolean;
|
|
53
54
|
noColor?: boolean;
|
|
54
55
|
scope?: string;
|
|
55
56
|
olderThan?: number;
|
|
@@ -103,6 +104,7 @@ function parseArgs(argv: string[]): ParseResult {
|
|
|
103
104
|
case '--model': flags.model = argv[i + 1]; i += 1; break;
|
|
104
105
|
case '--dimensions': flags.dimensions = Number(argv[i + 1]); i += 1; break;
|
|
105
106
|
case '--semantic': flags.semantic = true; break;
|
|
107
|
+
case '--context': flags.context = true; break;
|
|
106
108
|
case '--fake': flags.fake = true; break;
|
|
107
109
|
case '--no-color': flags.noColor = true; break;
|
|
108
110
|
case '--scope': flags.scope = argv[i + 1]; i += 1; break;
|
|
@@ -177,7 +179,7 @@ Commands:
|
|
|
177
179
|
ingest manifest <file|s3://> Ingest an open-files manifest into knowledge.db
|
|
178
180
|
ingest source <source-ref> Ingest a read-only source ref into knowledge.db
|
|
179
181
|
reindex outbox <file|s3://> Consume open-files change events and invalidate chunks
|
|
180
|
-
search <query> Hybrid search sources, wiki pages,
|
|
182
|
+
search <query> Hybrid search sources, wiki pages, indexes, or context
|
|
181
183
|
embeddings status|index|search Build/query local vector embeddings
|
|
182
184
|
providers status|models|check Inspect AI SDK provider config and credentials
|
|
183
185
|
safety status|check|approve|audit|redact
|
|
@@ -190,6 +192,7 @@ Global Options:
|
|
|
190
192
|
--model <provider:model> AI/embedding model ref
|
|
191
193
|
--dimensions <n> Embedding dimensions for local/fake providers
|
|
192
194
|
--semantic Include vector semantic results in search
|
|
195
|
+
--context Return a reranked citation context pack for search
|
|
193
196
|
--fake Use deterministic fake embeddings for local tests
|
|
194
197
|
--scope local|global|project Store scope (default: global ~/.hasna/apps/knowledge/)
|
|
195
198
|
--no-color Disable color output
|
|
@@ -251,7 +254,7 @@ function printCommandHelp(command: string): void {
|
|
|
251
254
|
if (command === 'source') { console.log('Usage: open-knowledge source resolve <source-ref> [--purpose knowledge_answer|knowledge_index] [--limit <n>] [--scope local|global|project] [--json]'); return; }
|
|
252
255
|
if (command === 'ingest') { console.log('Usage: open-knowledge ingest manifest <file|s3://bucket/key> | source <source-ref> [--purpose knowledge_index] [--scope local|global|project] [--json]'); return; }
|
|
253
256
|
if (command === 'reindex') { console.log('Usage: open-knowledge reindex outbox <file|s3://bucket/key> [--scope local|global|project] [--json]'); return; }
|
|
254
|
-
if (command === 'search') { console.log('Usage: open-knowledge search <query> [--semantic] [--model openai:text-embedding-3-small] [--limit <n>] [--dimensions <n>] [--fake] [--scope local|global|project] [--json]'); return; }
|
|
257
|
+
if (command === 'search') { console.log('Usage: open-knowledge search <query> [--context] [--semantic] [--model openai:text-embedding-3-small] [--limit <n>] [--dimensions <n>] [--fake] [--scope local|global|project] [--json]'); return; }
|
|
255
258
|
if (command === 'embeddings') { console.log('Usage: open-knowledge embeddings status|index|search [query] [--model openai:text-embedding-3-small] [--limit <n>] [--dimensions <n>] [--fake] [--scope local|global|project] [--json]'); return; }
|
|
256
259
|
if (command === 'providers') { console.log('Usage: open-knowledge providers status|models|check [provider|model-alias] [--scope local|global|project] [--json]'); return; }
|
|
257
260
|
if (command === 'safety') { console.log('Usage: open-knowledge safety status|check|approve|audit|redact [args] [--scope local|global|project] [--json]'); return; }
|
|
@@ -299,11 +302,11 @@ async function run(argv: string[]): Promise<void> {
|
|
|
299
302
|
if (flags.completions) {
|
|
300
303
|
const shell = flags.completions;
|
|
301
304
|
if (shell === 'bash') {
|
|
302
|
-
console.log(`_open_knowledge() { local cur; cur="${"$"}{COMP_WORDS[COMP_CWORD]}"; COMPREPLY=($(compgen -W "add list get update archive restore upsert untag delete export prune dedupe stats paths storage db wiki source ingest reindex search embeddings providers safety help ls rm edit unarchive --json --yes --help --version --desc --page --limit --search --sort --id --store --title --content --url --tag --format --completions --purpose --model --dimensions --semantic --fake --no-color --scope --archived --include-archived" -- "$cur")); }; complete -F _open_knowledge open-knowledge`);
|
|
305
|
+
console.log(`_open_knowledge() { local cur; cur="${"$"}{COMP_WORDS[COMP_CWORD]}"; COMPREPLY=($(compgen -W "add list get update archive restore upsert untag delete export prune dedupe stats paths storage db wiki source ingest reindex search embeddings providers safety help ls rm edit unarchive --json --yes --help --version --desc --page --limit --search --sort --id --store --title --content --url --tag --format --completions --purpose --model --dimensions --semantic --context --fake --no-color --scope --archived --include-archived" -- "$cur")); }; complete -F _open_knowledge open-knowledge`);
|
|
303
306
|
} else if (shell === 'zsh') {
|
|
304
|
-
console.log(`#compdef open-knowledge\n_open_knowledge() { _arguments -C "1: :(add list get update archive restore upsert untag delete export prune dedupe stats paths storage db wiki source ingest reindex search embeddings providers safety help ls rm edit unarchive)" "(--json)--json" "(--yes)-y" "(--help)--help" "(--version)--version" "(--desc)--desc" "(--archived)--archived" "(--include-archived)--include-archived" "(--semantic)--semantic" "(--fake)--fake" "(-p --page)"{-p,--page}"[page number]:number:" "(-l --limit)"{-l,--limit}"[items per page]:number:" "(-s --search)"{-s,--search}"[search text]:text:" "(--sort)--sort"\{created,title\}:" "(--id)--id[item id]:id:" "(--store)--store[store path]:path:" "(--title)--title[new title]:" "(--content)--content[new content]:" "(--url)--url[source url]:" "(-t --tag)"{-t,--tag}"[tag]:tag:" "(--format)--format[json|jsonl]:" "(--completions)--completions[output completions]:shell:(bash zsh fish):" "(--purpose)--purpose[purpose]:" "(--model)--model[model ref]:" "(--dimensions)--dimensions[embedding dimensions]:number:" "(--no-color)--no-color[disable color]" "(--scope)--scope"\{local,global,project\}:" }; _open_knowledge`);
|
|
307
|
+
console.log(`#compdef open-knowledge\n_open_knowledge() { _arguments -C "1: :(add list get update archive restore upsert untag delete export prune dedupe stats paths storage db wiki source ingest reindex search embeddings providers safety help ls rm edit unarchive)" "(--json)--json" "(--yes)-y" "(--help)--help" "(--version)--version" "(--desc)--desc" "(--archived)--archived" "(--include-archived)--include-archived" "(--semantic)--semantic" "(--context)--context" "(--fake)--fake" "(-p --page)"{-p,--page}"[page number]:number:" "(-l --limit)"{-l,--limit}"[items per page]:number:" "(-s --search)"{-s,--search}"[search text]:text:" "(--sort)--sort"\{created,title\}:" "(--id)--id[item id]:id:" "(--store)--store[store path]:path:" "(--title)--title[new title]:" "(--content)--content[new content]:" "(--url)--url[source url]:" "(-t --tag)"{-t,--tag}"[tag]:tag:" "(--format)--format[json|jsonl]:" "(--completions)--completions[output completions]:shell:(bash zsh fish):" "(--purpose)--purpose[purpose]:" "(--model)--model[model ref]:" "(--dimensions)--dimensions[embedding dimensions]:number:" "(--no-color)--no-color[disable color]" "(--scope)--scope"\{local,global,project\}:" }; _open_knowledge`);
|
|
305
308
|
} else if (shell === 'fish') {
|
|
306
|
-
console.log(`complete -c open-knowledge -f; complete -c open-knowledge -a "add list get update archive restore upsert untag delete export prune dedupe stats paths storage db wiki source ingest reindex search embeddings providers safety help ls rm edit unarchive"; complete -c open-knowledge -l json; complete -c open-knowledge -l yes -s y; complete -c open-knowledge -l help -s h; complete -c open-knowledge -l version -s v; complete -c open-knowledge -l desc; complete -c open-knowledge -l archived; complete -c open-knowledge -l include-archived; complete -c open-knowledge -l semantic; complete -c open-knowledge -l fake; complete -c open-knowledge -s p -l page; complete -c open-knowledge -s l -l limit; complete -c open-knowledge -s s -l search; complete -c open-knowledge -l sort; complete -c open-knowledge -l id; complete -c open-knowledge -l store; complete -c open-knowledge -l title; complete -c open-knowledge -l content; complete -c open-knowledge -l url; complete -c open-knowledge -s t -l tag; complete -c open-knowledge -l format; complete -c open-knowledge -l completions; complete -c open-knowledge -l purpose; complete -c open-knowledge -l model; complete -c open-knowledge -l dimensions; complete -c open-knowledge -l no-color; complete -c open-knowledge -l scope -a "local global project"`);
|
|
309
|
+
console.log(`complete -c open-knowledge -f; complete -c open-knowledge -a "add list get update archive restore upsert untag delete export prune dedupe stats paths storage db wiki source ingest reindex search embeddings providers safety help ls rm edit unarchive"; complete -c open-knowledge -l json; complete -c open-knowledge -l yes -s y; complete -c open-knowledge -l help -s h; complete -c open-knowledge -l version -s v; complete -c open-knowledge -l desc; complete -c open-knowledge -l archived; complete -c open-knowledge -l include-archived; complete -c open-knowledge -l semantic; complete -c open-knowledge -l context; complete -c open-knowledge -l fake; complete -c open-knowledge -s p -l page; complete -c open-knowledge -s l -l limit; complete -c open-knowledge -s s -l search; complete -c open-knowledge -l sort; complete -c open-knowledge -l id; complete -c open-knowledge -l store; complete -c open-knowledge -l title; complete -c open-knowledge -l content; complete -c open-knowledge -l url; complete -c open-knowledge -s t -l tag; complete -c open-knowledge -l format; complete -c open-knowledge -l completions; complete -c open-knowledge -l purpose; complete -c open-knowledge -l model; complete -c open-knowledge -l dimensions; complete -c open-knowledge -l no-color; complete -c open-knowledge -l scope -a "local global project"`);
|
|
307
310
|
} else {
|
|
308
311
|
throw new Error("Invalid --completions value. Use 'bash', 'zsh', or 'fish'.");
|
|
309
312
|
}
|
|
@@ -586,6 +589,18 @@ async function run(argv: string[]): Promise<void> {
|
|
|
586
589
|
if (command === 'search') {
|
|
587
590
|
const query = positional.slice(1).join(' ');
|
|
588
591
|
if (!query) throw new Error('Usage: open-knowledge search <query>');
|
|
592
|
+
if (flags.context) {
|
|
593
|
+
const context = await service.retrieveContext({
|
|
594
|
+
query,
|
|
595
|
+
limit: flags.limit,
|
|
596
|
+
semantic: flags.semantic,
|
|
597
|
+
modelRef: flags.model,
|
|
598
|
+
dimensions: flags.dimensions,
|
|
599
|
+
fake: flags.fake,
|
|
600
|
+
});
|
|
601
|
+
output({ ok: true, ...context, message: `${context.excerpts.length} context excerpt(s)` }, flags.json);
|
|
602
|
+
return;
|
|
603
|
+
}
|
|
589
604
|
const result = await service.search({
|
|
590
605
|
query,
|
|
591
606
|
limit: flags.limit,
|
package/src/mcp.js
CHANGED
|
@@ -186,6 +186,23 @@ export function buildServer() {
|
|
|
186
186
|
}
|
|
187
187
|
});
|
|
188
188
|
|
|
189
|
+
// MCP tool `knowledge_search`: runs context retrieval for the requested scope and
// returns the context pack as JSON text, or an error payload if retrieval throws.
registerTool(
  server,
  'knowledge_search',
  'Knowledge context search',
  'Return a reranked citation context pack for agent prompts',
  {
    scope: scopeField,
    query: z.string().describe('Search query or prompt'),
    limit: z.number().optional().describe('Maximum context results'),
    semantic: z.boolean().optional().describe('Include vector semantic results'),
    model: z.string().optional().describe('Embedding model ref, default openai:text-embedding-3-small'),
    dimensions: z.number().optional().describe('Embedding dimensions for deterministic fake mode'),
    fake: z.boolean().optional().describe('Use deterministic fake embeddings for local tests'),
  },
  async ({ scope, query, limit, semantic, model, dimensions, fake }) => {
    const service = createKnowledgeService({ scope });
    try {
      // The tool argument `model` is forwarded under the service's `modelRef` name.
      const pack = await service.retrieveContext({
        query,
        limit,
        semantic,
        modelRef: model,
        dimensions,
        fake,
      });
      return jsonText({ ok: true, ...pack });
    } catch (error) {
      const message = error instanceof Error ? error.message : String(error);
      return errorText(message);
    }
  },
);
|
|
205
|
+
|
|
189
206
|
registerTool(server, 'ok_add', 'Add a knowledge item', 'Add a new item to the knowledge store', {
|
|
190
207
|
title: z.string().describe('Item title'),
|
|
191
208
|
content: z.string().describe('Item content/body'),
|
package/src/retrieval.ts
ADDED
|
@@ -0,0 +1,326 @@
|
|
|
1
|
+
import { createHash } from 'node:crypto';
|
|
2
|
+
import { openKnowledgeDb } from './knowledge-db';
|
|
3
|
+
import { isStaleStatus } from './provenance';
|
|
4
|
+
import { hybridSearch, type HybridSearchEntry, type HybridSearchOptions, type HybridSearchResult, type SearchProvenance } from './search';
|
|
5
|
+
|
|
6
|
+
/**
 * Options for building a context pack: every hybrid search option plus an
 * excerpt size budget.
 */
export interface RetrievalOptions extends HybridSearchOptions {
  /**
   * Character budget for each excerpt. `retrieveKnowledgeContext` defaults it
   * to 1200 and clamps it to the 200–4000 range.
   */
  contextChars?: number;
}
|
|
9
|
+
|
|
10
|
+
/**
 * A hybrid search entry after reranking. The inherited `score` holds the final
 * reranked score; `rerank` keeps the individual components for inspection.
 */
export interface RerankedSearchEntry extends HybridSearchEntry {
  rerank: {
    /** Score the entry had coming out of hybrid search. */
    base_score: number;
    /** Weighted blend of the component scores, capped at 1 and rounded to six decimals. */
    final_score: number;
    /** Fraction of distinct query terms found in the entry's title/text. */
    exact_score: number;
    /** Heuristic for how citable the entry is (chunk id and source/artifact URI availability). */
    citation_score: number;
    /** Heuristic for source freshness; 0 when the provenance is stale. */
    freshness_score: number;
    /** Heuristic weight derived from the entry kind. */
    authority_score: number;
  };
}
|
|
20
|
+
|
|
21
|
+
/** Citation object emitted for one reranked result in a context pack. */
export interface RetrievalCitation {
  /** Deterministic id derived from the result kind, result id, source URI and artifact URI. */
  id: string;
  /** Id of the search result this citation describes. */
  result_id: string;
  /** Kind of the search result this citation describes. */
  kind: HybridSearchEntry['kind'];
  /** URI of the result's source, when it has one. */
  source_uri: string | null;
  /** Source ref of the result's source, when it has one. */
  source_ref: string | null;
  /** URI of the result's artifact, when it has one. */
  artifact_uri: string | null;
  /** Path of the result's artifact, when it has one. */
  artifact_path: string | null;
  /** Source revision, when known. */
  revision: string | null;
  /** Source hash, falling back to the artifact hash. */
  hash: string | null;
  /** Chunk id from the result's citation data, when present. */
  chunk_id: string | null;
  /** Start offset from the result's citation data, when present. */
  start_offset: number | null;
  /** End offset from the result's citation data, when present. */
  end_offset: number | null;
  /** Whitespace-collapsed quote taken from the result text (or title), if any. */
  quote: string | null;
  /** Provenance record copied from the result unchanged. */
  provenance: SearchProvenance | null;
}
|
|
37
|
+
|
|
38
|
+
/** Text excerpt selected from one reranked result for inclusion in a prompt. */
export interface RetrievalExcerpt {
  /** Deterministic id derived from the result kind and result id. */
  id: string;
  /** Id of the search result the excerpt was taken from. */
  result_id: string;
  /** Id of the citation that backs this excerpt, if any. */
  citation_id: string | null;
  /** Kind of the search result the excerpt was taken from. */
  kind: HybridSearchEntry['kind'];
  /** Whitespace-collapsed excerpt text, truncated to the context budget. */
  text: string;
  /** Reranked score of the originating result. */
  score: number;
}
|
|
46
|
+
|
|
47
|
+
/**
 * Supporting rows read from the knowledge database for the selected results:
 * stored citations and wiki backlinks.
 */
export interface RetrievalGraphEvidence {
  /** Rows from the `citations` table matched by result chunk id or wiki page id. */
  citations: {
    id: string;
    chunk_id: string | null;
    wiki_page_id: string | null;
    source_uri: string;
    quote: string | null;
    start_offset: number | null;
    end_offset: number | null;
  }[];
  /** Rows from the `wiki_backlinks` table whose either end is a wiki page result. */
  backlinks: {
    from_page_id: string;
    to_page_id: string;
    label: string | null;
  }[];
}
|
|
63
|
+
|
|
64
|
+
/** Result of context retrieval: reranked results plus citations, excerpts and notes. */
export interface KnowledgeContextPack {
  /** Query string as reported by the hybrid search result. */
  query: string;
  /** The query after NFKC normalisation, trimming, whitespace collapsing and lower-casing. */
  normalized_query: string;
  /** ISO-8601 timestamp taken when the pack was assembled. */
  created_at: string;
  /** Search mode reported by hybrid search. */
  mode: HybridSearchResult['mode'];
  /** Hybrid search warnings followed by `permission_filtered:` / `stale_filtered:` entries. */
  warnings: string[];
  /** Counts reported by hybrid search (taken before filtering and reranking). */
  search_counts: HybridSearchResult['counts'];
  /** Reranked results, best score first, truncated to the search limit. */
  results: RerankedSearchEntry[];
  /** One citation per entry in `results`, in the same order. */
  citations: RetrievalCitation[];
  /** Excerpts for the results that yielded any text. */
  excerpts: RetrievalExcerpt[];
  /** Stored citations and wiki backlinks related to the results. */
  graph: RetrievalGraphEvidence;
  /** Human-readable notes about filtering and source quality. */
  notes: {
    permissions: string[];
    freshness: string[];
  };
}
|
|
80
|
+
|
|
81
|
+
/** Shape of a row selected from the `citations` table. */
interface CitationRow {
  id: string;
  /** Wiki page the citation belongs to, if any. */
  wiki_page_id: string | null;
  /** Chunk the citation points at, if any. */
  chunk_id: string | null;
  source_uri: string;
  quote: string | null;
  start_offset: number | null;
  end_offset: number | null;
}
|
|
90
|
+
|
|
91
|
+
/** Shape of a row selected from the `wiki_backlinks` table. */
interface BacklinkRow {
  /** Page the link originates from. */
  from_page_id: string;
  /** Page the link points to. */
  to_page_id: string;
  /** Optional link label. */
  label: string | null;
}
|
|
96
|
+
|
|
97
|
+
/**
 * Builds a deterministic identifier: `prefix`, an underscore, and the first 20
 * hex characters of the SHA-256 digest of `value`.
 */
function stableId(prefix: string, value: string): string {
  const digest = createHash('sha256').update(value).digest('hex');
  return [prefix, digest.substring(0, 20)].join('_');
}
|
|
100
|
+
|
|
101
|
+
/**
 * Canonicalises a query: NFKC normalisation, trimming, collapsing every
 * whitespace run to a single space, then lower-casing.
 */
function normalizeQuery(query: string): string {
  const compatibilityForm = query.normalize('NFKC');
  const singleSpaced = compatibilityForm.trim().replace(/\s+/g, ' ');
  return singleSpaced.toLowerCase();
}
|
|
104
|
+
|
|
105
|
+
/**
 * Extracts the distinct terms (runs of letters, digits or underscores) from the
 * normalised query, in first-seen order, keeping at most 16 of them.
 */
function queryTerms(query: string): string[] {
  const tokens = normalizeQuery(query).match(/[\p{L}\p{N}_]+/gu);
  if (tokens === null) return [];
  return [...new Set(tokens)].slice(0, 16);
}
|
|
108
|
+
|
|
109
|
+
/**
 * Builds the haystack used for exact-term matching: the entry's title and text
 * (whichever are non-empty) joined by a space.
 *
 * The haystack is NFKC-normalised before lower-casing so that it is compared in
 * the same canonical form as the query terms, which are NFKC-normalised too.
 * Without this, compatibility forms such as full-width letters or ligatures in
 * the indexed text could never match an equivalent query term.
 *
 * @param result Search entry whose title/text should be matched against.
 * @returns Normalised, lower-cased haystack; empty string when the entry has
 *   neither title nor text.
 */
function textForResult(result: HybridSearchEntry): string {
  return [result.title, result.text]
    .filter(Boolean)
    .join(' ')
    .normalize('NFKC')
    .toLowerCase();
}
|
|
112
|
+
|
|
113
|
+
/**
 * Fraction of query terms that occur as substrings of the entry's title/text,
 * rounded to six decimals. Returns 0 when there are no terms.
 */
function exactScore(result: HybridSearchEntry, terms: string[]): number {
  const total = terms.length;
  if (total === 0) return 0;

  const haystack = textForResult(result);
  let hits = 0;
  for (const term of terms) {
    if (haystack.includes(term)) hits += 1;
  }
  return Number((hits / total).toFixed(6));
}
|
|
119
|
+
|
|
120
|
+
/**
 * Decides whether a result may be used as context based on its provenance.
 *
 * A provenance record that carries a `read_only` flag (checked first) or a
 * `read_only_sources` flag must have it set to exactly `true`. Missing
 * provenance, or provenance with neither flag, is treated as acceptable.
 */
function hasReadOnlyProvenance(provenance: SearchProvenance | null): boolean {
  if (provenance) {
    if ('read_only' in provenance) return provenance.read_only === true;
    if ('read_only_sources' in provenance) return provenance.read_only_sources === true;
  }
  return true;
}
|
|
126
|
+
|
|
127
|
+
/**
 * Reports whether a provenance record marks its source as stale: either a
 * truthy `stale` flag, or a `status` that `isStaleStatus` classifies as stale.
 * Missing provenance is never stale.
 */
function isStale(provenance: SearchProvenance | null): boolean {
  if (!provenance) return false;

  const flaggedStale = 'stale' in provenance && provenance.stale;
  if (flaggedStale) return true;

  return 'status' in provenance ? isStaleStatus(provenance.status) : false;
}
|
|
133
|
+
|
|
134
|
+
/**
 * Heuristic freshness score for a result:
 * 0 for stale provenance, 1 when the source has a hash or revision, 0.85 when
 * only the artifact has a hash, 0.75 when the provenance lists source refs,
 * and 0.55 otherwise.
 */
function freshnessScore(result: HybridSearchEntry): number {
  if (isStale(result.provenance)) return 0;

  const { source, artifact, provenance } = result;
  if (source?.hash || source?.revision) return 1;
  if (artifact?.hash) return 0.85;
  if (provenance && 'source_refs' in provenance && provenance.source_refs.length > 0) return 0.75;
  return 0.55;
}
|
|
141
|
+
|
|
142
|
+
/**
 * Heuristic citation score for a result:
 * 1 when it has a citation chunk id plus a source or artifact URI, 0.75 when
 * its provenance flags `citation_required`, 0.65 when only an artifact URI is
 * available, and 0.35 otherwise.
 */
function citationScore(result: HybridSearchEntry): number {
  const { citation, source, artifact, provenance } = result;

  const hasLocation = Boolean(source?.uri || artifact?.uri);
  if (citation?.chunk_id && hasLocation) return 1;
  if (provenance && 'citation_required' in provenance && provenance.citation_required) return 0.75;
  return artifact?.uri ? 0.65 : 0.35;
}
|
|
148
|
+
|
|
149
|
+
/**
 * Heuristic authority weight by result kind: wiki chunks 0.85, source chunks
 * 0.8, wiki pages 0.65, anything else 0.55.
 */
function authorityScore(result: HybridSearchEntry): number {
  switch (result.kind) {
    case 'wiki_chunk':
      return 0.85;
    case 'source_chunk':
      return 0.8;
    case 'wiki_page':
      return 0.65;
    default:
      return 0.55;
  }
}
|
|
155
|
+
|
|
156
|
+
/**
 * Rescores one hybrid search entry.
 *
 * The final score is a weighted sum, capped at 1 and rounded to six decimals:
 * base search score 65%, exact-term coverage 10%, citation availability 10%,
 * freshness 10%, authority 5%. Reason tags are appended for strong signals:
 * `exact_term` (coverage above 0.5), `cited_source` (citation score of at least
 * 0.75) and `fresh_source` (freshness of at least 0.85).
 *
 * @param result Entry produced by hybrid search.
 * @param terms Distinct normalised query terms.
 * @returns A copy of the entry with `score` and `reasons` replaced and the
 *   component scores attached under `rerank`.
 */
function rerank(result: HybridSearchEntry, terms: string[]): RerankedSearchEntry {
  const baseScore = result.score;
  const exact = exactScore(result, terms);
  const citation = citationScore(result);
  const freshness = freshnessScore(result);
  const authority = authorityScore(result);

  const weighted = baseScore * 0.65 + exact * 0.1 + citation * 0.1 + freshness * 0.1 + authority * 0.05;
  const finalScore = Number(Math.min(1, weighted).toFixed(6));

  // Keep the original reasons first, then append the new tags without duplicates.
  const extraReasons: string[] = [];
  if (exact > 0.5) extraReasons.push('exact_term');
  if (citation >= 0.75) extraReasons.push('cited_source');
  if (freshness >= 0.85) extraReasons.push('fresh_source');
  const reasons = new Set([...result.reasons, ...extraReasons]);

  return {
    ...result,
    score: finalScore,
    reasons: Array.from(reasons),
    rerank: {
      base_score: baseScore,
      exact_score: exact,
      citation_score: citation,
      freshness_score: freshness,
      authority_score: authority,
      final_score: finalScore,
    },
  };
}
|
|
185
|
+
|
|
186
|
+
/**
 * Produces a single-line quote from an entry's text (or its title when the
 * text is null/undefined), never longer than `maxChars` characters.
 *
 * Whitespace runs are collapsed to one space. When the text has to be cut, room
 * is reserved for the trailing `...` so the result still fits the budget, and a
 * cut that would land in the middle of a surrogate pair drops the dangling half
 * instead of emitting a broken character.
 *
 * @param result Search entry to quote.
 * @param maxChars Maximum length of the returned string, in UTF-16 code units.
 * @returns The quote, or `null` when the entry has no text or title.
 */
function quoteFor(result: HybridSearchEntry, maxChars: number): string | null {
  const source = result.text ?? result.title;
  if (!source) return null;

  const normalized = source.replace(/\s+/g, ' ').trim();
  if (normalized.length <= maxChars) return normalized;

  const ellipsis = '...';
  // Budgets too small to hold the ellipsis get a plain hard cut.
  if (maxChars <= ellipsis.length) return normalized.slice(0, Math.max(0, maxChars));

  let head = normalized.slice(0, maxChars - ellipsis.length);
  const lastUnit = head.charCodeAt(head.length - 1);
  // A trailing high surrogate means the cut split a character in two.
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) head = head.slice(0, -1);

  return `${head.trimEnd()}${ellipsis}`;
}
|
|
192
|
+
|
|
193
|
+
/**
 * Builds the citation object for a reranked result.
 *
 * The citation id is deterministic: it hashes the result kind, result id,
 * source URI and artifact URI (missing URIs count as empty strings), separated
 * by NUL characters. The quote is limited to 500 characters.
 */
function citationFor(result: RerankedSearchEntry): RetrievalCitation {
  const { source, artifact, citation } = result;
  const sourceUri = source?.uri ?? null;
  const artifactUri = artifact?.uri ?? null;
  const id = stableId('cite', `${result.kind}\u0000${result.id}\u0000${sourceUri ?? ''}\u0000${artifactUri ?? ''}`);

  return {
    id,
    result_id: result.id,
    kind: result.kind,
    source_uri: sourceUri,
    source_ref: source?.ref ?? null,
    artifact_uri: artifactUri,
    artifact_path: artifact?.path ?? null,
    revision: source?.revision ?? null,
    // Prefer the source hash; fall back to the artifact hash.
    hash: source?.hash ?? artifact?.hash ?? null,
    chunk_id: citation?.chunk_id ?? null,
    start_offset: citation?.start_offset ?? null,
    end_offset: citation?.end_offset ?? null,
    quote: quoteFor(result, 500),
    provenance: result.provenance,
  };
}
|
|
212
|
+
|
|
213
|
+
/**
 * Builds the prompt excerpt for a reranked result, linked to its citation.
 *
 * @param result Reranked search entry.
 * @param citation Citation already built for the same entry.
 * @param contextChars Character budget for the excerpt text.
 * @returns The excerpt, or `null` when the entry yields no text.
 */
function excerptFor(result: RerankedSearchEntry, citation: RetrievalCitation, contextChars: number): RetrievalExcerpt | null {
  const text = quoteFor(result, contextChars);
  if (!text) return null;

  const { id: resultId, kind, score } = result;
  return {
    id: stableId('excerpt', `${kind}\u0000${resultId}`),
    result_id: resultId,
    citation_id: citation.id,
    kind,
    text,
    score,
  };
}
|
|
225
|
+
|
|
226
|
+
/** Returns one `?` SQL bind placeholder per value, separated by `, `. */
function placeholders(values: unknown[]): string {
  return Array.from({ length: values.length }, () => '?').join(', ');
}
|
|
229
|
+
|
|
230
|
+
/**
 * Loads stored citations and wiki backlinks related to the selected results.
 *
 * Citations are looked up twice — by the chunk ids referenced in result
 * citations, and by the ids of `wiki_page` results — with each query limited to
 * 50 rows. A citation row that satisfies both lookups is reported once: rows
 * are deduplicated by citation id, keeping the first occurrence. Lookup ids are
 * deduplicated as well so the same id is not bound repeatedly.
 *
 * The database is only opened when there is something to look up, and is
 * always closed afterwards.
 *
 * @param dbPath Path of the knowledge database to open.
 * @param results Reranked results chosen for the context pack.
 * @returns Matching citation rows and backlink rows (both possibly empty).
 */
function loadGraphEvidence(dbPath: string, results: RerankedSearchEntry[]): RetrievalGraphEvidence {
  const chunkIds = Array.from(new Set(
    results.map((result) => result.citation?.chunk_id).filter((id): id is string => Boolean(id)),
  ));
  const wikiPageIds = Array.from(new Set(
    results.filter((result) => result.kind === 'wiki_page').map((result) => result.id),
  ));
  const citations: CitationRow[] = [];
  const backlinks: BacklinkRow[] = [];
  if (chunkIds.length === 0 && wikiPageIds.length === 0) return { citations, backlinks };

  // Tracks citation ids already collected so the two lookups cannot double-report a row.
  const seenCitationIds = new Set<string>();
  const collectCitations = (rows: CitationRow[]): void => {
    for (const row of rows) {
      if (seenCitationIds.has(row.id)) continue;
      seenCitationIds.add(row.id);
      citations.push(row);
    }
  };

  const db = openKnowledgeDb(dbPath);
  try {
    if (chunkIds.length > 0) {
      collectCitations(db.query<CitationRow, string[]>(
        `SELECT id, wiki_page_id, chunk_id, source_uri, quote, start_offset, end_offset
         FROM citations
         WHERE chunk_id IN (${placeholders(chunkIds)})
         ORDER BY created_at DESC
         LIMIT 50`,
      ).all(...chunkIds));
    }
    if (wikiPageIds.length > 0) {
      collectCitations(db.query<CitationRow, string[]>(
        `SELECT id, wiki_page_id, chunk_id, source_uri, quote, start_offset, end_offset
         FROM citations
         WHERE wiki_page_id IN (${placeholders(wikiPageIds)})
         ORDER BY created_at DESC
         LIMIT 50`,
      ).all(...wikiPageIds));
      // The id list is bound twice: once for each side of the link.
      backlinks.push(...db.query<BacklinkRow, string[]>(
        `SELECT from_page_id, to_page_id, label
         FROM wiki_backlinks
         WHERE from_page_id IN (${placeholders(wikiPageIds)}) OR to_page_id IN (${placeholders(wikiPageIds)})
         LIMIT 50`,
      ).all(...wikiPageIds, ...wikiPageIds));
    }
  } finally {
    db.close();
  }
  return { citations, backlinks };
}
|
|
268
|
+
|
|
269
|
+
/**
 * Runs hybrid search and packages the hits as a citation context pack.
 *
 * Hits whose provenance is not read-only, or is stale, are dropped; each drop
 * is recorded as a warning and a note. The remaining hits are reranked, sorted
 * by score (ties broken by id) and cut to the search limit. Every kept result
 * gets a citation, results with text get an excerpt, and related citation and
 * backlink rows are loaded from the knowledge database.
 *
 * @param options Hybrid search options plus an optional `contextChars` excerpt
 *   budget (default 1200, clamped to the 200–4000 range).
 * @returns The assembled context pack.
 */
export async function retrieveKnowledgeContext(options: RetrievalOptions): Promise<KnowledgeContextPack> {
  const requestedChars = options.contextChars ?? 1200;
  const contextChars = Math.max(200, Math.min(requestedChars, 4000));

  const search = await hybridSearch(options);
  const terms = queryTerms(search.query);
  const warnings = [...search.warnings];
  const permissionNotes = new Set<string>();
  const freshnessNotes = new Set<string>();

  // Permission check comes first, so a hit that fails both is reported as permission-filtered.
  const eligible: typeof search.results = [];
  for (const hit of search.results) {
    if (!hasReadOnlyProvenance(hit.provenance)) {
      warnings.push(`permission_filtered: ${hit.kind}:${hit.id}`);
      permissionNotes.add('Dropped a result because provenance was not read-only.');
      continue;
    }
    if (isStale(hit.provenance)) {
      warnings.push(`stale_filtered: ${hit.kind}:${hit.id}`);
      freshnessNotes.add('Dropped a stale result whose source status requires reindexing.');
      continue;
    }
    eligible.push(hit);
  }

  const results = eligible
    .map((hit) => rerank(hit, terms))
    .sort((left, right) => right.score - left.score || left.id.localeCompare(right.id))
    .slice(0, search.limit);

  const citations: RetrievalCitation[] = [];
  const excerpts: RetrievalExcerpt[] = [];
  for (const entry of results) {
    const citation = citationFor(entry);
    citations.push(citation);

    const excerpt = excerptFor(entry, citation, contextChars);
    if (excerpt) excerpts.push(excerpt);

    const { provenance } = entry;
    if (provenance && 'read_only' in provenance && provenance.read_only) {
      permissionNotes.add('All source-backed excerpts are read-only and citation-required.');
    }
    if (entry.rerank.freshness_score >= 0.85) {
      freshnessNotes.add('Fresh source revision/hash or artifact hash is present for top context.');
    }
  }

  const normalizedQuery = normalizeQuery(search.query);
  const createdAt = new Date().toISOString();
  const graph = loadGraphEvidence(options.dbPath, results);

  return {
    query: search.query,
    normalized_query: normalizedQuery,
    created_at: createdAt,
    mode: search.mode,
    warnings,
    search_counts: search.counts,
    results,
    citations,
    excerpts,
    graph,
    notes: {
      permissions: Array.from(permissionNotes),
      freshness: Array.from(freshnessNotes),
    },
  };
}
|
package/src/service.ts
CHANGED
|
@@ -12,6 +12,7 @@ import { ingestOpenFilesManifest } from './manifest-ingest';
|
|
|
12
12
|
import { ingestSourceRef } from './source-ingest';
|
|
13
13
|
import { resolveOpenFilesSource } from './source-resolver';
|
|
14
14
|
import { providerStatus, listModelRegistry, type ProviderStatusResult, type ModelRegistryEntry } from './providers';
|
|
15
|
+
import { retrieveKnowledgeContext, type RetrievalOptions } from './retrieval';
|
|
15
16
|
import { hybridSearch, type HybridSearchOptions } from './search';
|
|
16
17
|
import { resolveSafetyPolicy } from './safety';
|
|
17
18
|
import {
|
|
@@ -224,6 +225,15 @@ export class KnowledgeService {
|
|
|
224
225
|
config: this.config(),
|
|
225
226
|
});
|
|
226
227
|
}
|
|
228
|
+
|
|
229
|
+
/**
 * Builds a citation context pack for this service's workspace.
 *
 * Callers supply the retrieval options; the database path and configuration
 * are filled in from the service itself.
 */
async retrieveContext(options: Omit<RetrievalOptions, 'dbPath' | 'config'>) {
  const { knowledgeDbPath } = this.ensureWorkspace();
  return retrieveKnowledgeContext({
    ...options,
    dbPath: knowledgeDbPath,
    config: this.config(),
  });
}
|
|
227
237
|
}
|
|
228
238
|
|
|
229
239
|
export function createKnowledgeService(options: KnowledgeServiceOptions = {}): KnowledgeService {
|