@hasna/knowledge 0.2.15 → 0.2.17
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +28 -1
- package/bin/open-knowledge-mcp.js +2275 -1643
- package/bin/open-knowledge.js +105 -79
- package/docs/architecture/ai-native-knowledge-base.md +11 -2
- package/docs/architecture/hybrid-semantic-search.md +12 -2
- package/package.json +2 -1
- package/src/agent.ts +367 -0
- package/src/cli.ts +58 -7
- package/src/mcp.js +36 -0
- package/src/providers.ts +1 -1
- package/src/retrieval.ts +326 -0
- package/src/service.ts +20 -0
package/src/retrieval.ts
ADDED
|
@@ -0,0 +1,326 @@
|
|
|
1
|
+
import { createHash } from 'node:crypto';
|
|
2
|
+
import { openKnowledgeDb } from './knowledge-db';
|
|
3
|
+
import { isStaleStatus } from './provenance';
|
|
4
|
+
import { hybridSearch, type HybridSearchEntry, type HybridSearchOptions, type HybridSearchResult, type SearchProvenance } from './search';
|
|
5
|
+
|
|
6
|
+
/** Options for `retrieveKnowledgeContext`: every hybrid-search option plus an excerpt size hint. */
export interface RetrievalOptions extends HybridSearchOptions {
  /**
   * Maximum excerpt length in characters. `retrieveKnowledgeContext` uses 1200
   * when this is omitted and clamps the value to the range 200–4000.
   */
  contextChars?: number;
}
|
|
9
|
+
|
|
10
|
+
/** A hybrid-search hit annotated with the component scores that `rerank` combined into its final score. */
export interface RerankedSearchEntry extends HybridSearchEntry {
  /** Score breakdown recorded by `rerank`. */
  rerank: {
    /** Score reported by hybrid search before reranking. */
    base_score: number;
    /** Weighted blend of the component scores, capped at 1 and rounded to six decimals. */
    final_score: number;
    /** Fraction of query terms found in the entry's title/text (see `exactScore`). */
    exact_score: number;
    /** Citability rating assigned by `citationScore`. */
    citation_score: number;
    /** Freshness rating assigned by `freshnessScore`. */
    freshness_score: number;
    /** Per-kind authority rating assigned by `authorityScore`. */
    authority_score: number;
  };
}
|
|
20
|
+
|
|
21
|
+
/** Citation record that `citationFor` derives from a single reranked result. */
export interface RetrievalCitation {
  /** Deterministic `cite_…` identifier hashed from the result kind, id, source URI and artifact URI. */
  id: string;
  /** Id of the search result this citation describes. */
  result_id: string;
  /** Kind of the search result this citation describes. */
  kind: HybridSearchEntry['kind'];
  /** `source.uri` of the result, or null when absent. */
  source_uri: string | null;
  /** `source.ref` of the result, or null when absent. */
  source_ref: string | null;
  /** `artifact.uri` of the result, or null when absent. */
  artifact_uri: string | null;
  /** `artifact.path` of the result, or null when absent. */
  artifact_path: string | null;
  /** `source.revision` of the result, or null when absent. */
  revision: string | null;
  /** Source hash when present, otherwise the artifact hash, otherwise null. */
  hash: string | null;
  /** `citation.chunk_id` of the result, or null when absent. */
  chunk_id: string | null;
  /** `citation.start_offset` of the result, or null when absent. */
  start_offset: number | null;
  /** `citation.end_offset` of the result, or null when absent. */
  end_offset: number | null;
  /** Short quote built by `quoteFor` with a 500-character budget. */
  quote: string | null;
  /** Provenance copied unchanged from the result. */
  provenance: SearchProvenance | null;
}
|
|
37
|
+
|
|
38
|
+
/** Text excerpt that `excerptFor` builds from a reranked result. */
export interface RetrievalExcerpt {
  /** Deterministic `excerpt_…` identifier hashed from the result kind and id. */
  id: string;
  /** Id of the search result the excerpt was taken from. */
  result_id: string;
  /** Id of the matching `RetrievalCitation`; the type allows null. */
  citation_id: string | null;
  /** Kind of the search result the excerpt was taken from. */
  kind: HybridSearchEntry['kind'];
  /** Whitespace-collapsed excerpt text produced by `quoteFor`. */
  text: string;
  /** Score of the reranked result. */
  score: number;
}
|
|
46
|
+
|
|
47
|
+
/** Citation and backlink rows that `loadGraphEvidence` reads from the knowledge database. */
export interface RetrievalGraphEvidence {
  /** Rows selected from the `citations` table for the result chunk ids and wiki page ids. */
  citations: Array<{
    id: string;
    chunk_id: string | null;
    wiki_page_id: string | null;
    source_uri: string;
    quote: string | null;
    start_offset: number | null;
    end_offset: number | null;
  }>;
  /** Rows selected from the `wiki_backlinks` table that touch a wiki page present in the results. */
  backlinks: Array<{
    from_page_id: string;
    to_page_id: string;
    label: string | null;
  }>;
}
|
|
63
|
+
|
|
64
|
+
/** Everything `retrieveKnowledgeContext` returns for one query. */
export interface KnowledgeContextPack {
  /** Query string as reported by hybrid search. */
  query: string;
  /** The query after `normalizeQuery` (NFKC, trimmed, single-spaced, lowercased). */
  normalized_query: string;
  /** ISO-8601 timestamp taken when the pack was assembled. */
  created_at: string;
  /** Search mode reported by hybrid search. */
  mode: HybridSearchResult['mode'];
  /** Hybrid-search warnings plus one `permission_filtered:` / `stale_filtered:` entry per dropped result. */
  warnings: string[];
  /** Counts reported by hybrid search. */
  search_counts: HybridSearchResult['counts'];
  /** Reranked results, highest score first, truncated to the search limit. */
  results: RerankedSearchEntry[];
  /** One citation per entry in `results`, in the same order. */
  citations: RetrievalCitation[];
  /** Excerpts for the results that yielded any text. */
  excerpts: RetrievalExcerpt[];
  /** Citation and backlink rows loaded from the knowledge database for the results. */
  graph: RetrievalGraphEvidence;
  /** Human-readable notes collected while filtering and reranking. */
  notes: {
    /** Notes about permission (read-only provenance) handling. */
    permissions: string[];
    /** Notes about stale or fresh sources. */
    freshness: string[];
  };
}
|
|
80
|
+
|
|
81
|
+
/** Shape of a row selected from the `citations` table by `loadGraphEvidence`. */
interface CitationRow {
  id: string;
  wiki_page_id: string | null;
  chunk_id: string | null;
  source_uri: string;
  quote: string | null;
  start_offset: number | null;
  end_offset: number | null;
}
|
|
90
|
+
|
|
91
|
+
/** Shape of a row selected from the `wiki_backlinks` table by `loadGraphEvidence`. */
interface BacklinkRow {
  from_page_id: string;
  to_page_id: string;
  label: string | null;
}
|
|
96
|
+
|
|
97
|
+
/**
 * Builds a deterministic identifier of the form `<prefix>_<20 hex chars>`.
 *
 * The suffix is the first 20 hexadecimal characters of the SHA-256 digest of
 * `value`, so equal inputs always yield equal identifiers.
 */
function stableId(prefix: string, value: string): string {
  const digest = createHash('sha256').update(value).digest('hex');
  return [prefix, digest.substring(0, 20)].join('_');
}
|
|
100
|
+
|
|
101
|
+
/**
 * Canonicalizes a query: NFKC-normalizes it, trims the ends, collapses every
 * whitespace run to a single space and lowercases the result.
 */
function normalizeQuery(query: string): string {
  const folded = query.normalize('NFKC').trim();
  const singleSpaced = folded.replace(/\s+/g, ' ');
  return singleSpaced.toLowerCase();
}
|
|
104
|
+
|
|
105
|
+
/**
 * Splits a query into distinct search terms.
 *
 * The query is normalized first, then every run of letters, digits or
 * underscores becomes a term. Duplicates are dropped in first-seen order and
 * at most 16 terms are returned.
 */
function queryTerms(query: string): string[] {
  const tokens = normalizeQuery(query).match(/[\p{L}\p{N}_]+/gu);
  if (tokens === null) return [];
  return [...new Set(tokens)].slice(0, 16);
}
|
|
108
|
+
|
|
109
|
+
/**
 * Builds the lowercase haystack that query terms are matched against.
 *
 * Title and text are joined with a space; missing or empty parts are skipped.
 * The haystack is NFKC-normalized before lowercasing so it lives in the same
 * normalization form as the terms produced by `normalizeQuery`; without that,
 * text containing compatibility characters (full-width letters, ligatures)
 * could never match a normalized term.
 */
function textForResult(result: HybridSearchEntry): string {
  return [result.title, result.text]
    .filter(Boolean)
    .join(' ')
    .normalize('NFKC')
    .toLowerCase();
}
|
|
112
|
+
|
|
113
|
+
/**
 * Fraction of `terms` that occur as substrings of the result's title/text,
 * rounded to six decimals. Returns 0 when there are no terms.
 */
function exactScore(result: HybridSearchEntry, terms: string[]): number {
  if (terms.length === 0) return 0;
  const haystack = textForResult(result);
  let hits = 0;
  for (const term of terms) {
    if (haystack.includes(term)) hits += 1;
  }
  return Number((hits / terms.length).toFixed(6));
}
|
|
119
|
+
|
|
120
|
+
/**
 * Decides whether a result's provenance marks it as read-only.
 *
 * Missing provenance, or provenance carrying neither flag, counts as
 * read-only. When `read_only` is present it alone decides; otherwise
 * `read_only_sources` decides. A flag only passes when it is exactly `true`.
 */
function hasReadOnlyProvenance(provenance: SearchProvenance | null): boolean {
  if (!provenance) return true;
  const flag = 'read_only' in provenance
    ? provenance.read_only
    : 'read_only_sources' in provenance
      ? provenance.read_only_sources
      : true;
  return flag === true;
}
|
|
126
|
+
|
|
127
|
+
/**
 * Reports whether provenance marks its source as stale.
 *
 * A truthy `stale` flag wins outright; otherwise a `status` field, when
 * present, is classified by `isStaleStatus`. Missing provenance is not stale.
 */
function isStale(provenance: SearchProvenance | null): boolean {
  if (!provenance) return false;
  const flaggedStale = 'stale' in provenance && Boolean(provenance.stale);
  if (flaggedStale) return true;
  return 'status' in provenance ? isStaleStatus(provenance.status) : false;
}
|
|
133
|
+
|
|
134
|
+
/**
 * Rates how fresh a result's backing data appears to be.
 *
 * - 0    — provenance is stale
 * - 1    — the source carries a hash or a revision
 * - 0.85 — the artifact carries a hash
 * - 0.75 — provenance lists at least one source ref
 * - 0.55 — none of the above
 */
function freshnessScore(result: HybridSearchEntry): number {
  if (isStale(result.provenance)) return 0;
  const { source, artifact, provenance } = result;
  if (source?.hash || source?.revision) return 1;
  if (artifact?.hash) return 0.85;
  return provenance && 'source_refs' in provenance && provenance.source_refs.length > 0 ? 0.75 : 0.55;
}
|
|
141
|
+
|
|
142
|
+
/**
 * Rates how well a result can be cited.
 *
 * - 1    — it has a citation chunk id and a source or artifact URI
 * - 0.75 — its provenance sets `citation_required`
 * - 0.65 — it has an artifact URI
 * - 0.35 — none of the above
 */
function citationScore(result: HybridSearchEntry): number {
  const { citation, source, artifact, provenance } = result;
  const locatable = Boolean(source?.uri || artifact?.uri);
  if (citation?.chunk_id && locatable) return 1;
  const citationRequired = provenance && 'citation_required' in provenance && provenance.citation_required;
  if (citationRequired) return 0.75;
  return artifact?.uri ? 0.65 : 0.35;
}
|
|
148
|
+
|
|
149
|
+
/**
 * Fixed authority weight per result kind: wiki chunks 0.85, source chunks 0.8,
 * wiki pages 0.65, and 0.55 for every other kind.
 */
function authorityScore(result: HybridSearchEntry): number {
  switch (result.kind) {
    case 'wiki_chunk':
      return 0.85;
    case 'source_chunk':
      return 0.8;
    case 'wiki_page':
      return 0.65;
    default:
      return 0.55;
  }
}
|
|
155
|
+
|
|
156
|
+
/**
 * Re-scores a hybrid-search hit.
 *
 * The final score is a weighted sum — 65% base score, 10% each for exact-term,
 * citation and freshness scores, 5% authority — capped at 1 and rounded to six
 * decimals. The entry's `reasons` are extended with `exact_term`,
 * `cited_source` and `fresh_source` when the matching component clears its
 * threshold. The input entry is not mutated.
 */
function rerank(result: HybridSearchEntry, terms: string[]): RerankedSearchEntry {
  const base_score = result.score;
  const exact_score = exactScore(result, terms);
  const citation_score = citationScore(result);
  const freshness_score = freshnessScore(result);
  const authority_score = authorityScore(result);

  const weighted =
    base_score * 0.65 +
    exact_score * 0.1 +
    citation_score * 0.1 +
    freshness_score * 0.1 +
    authority_score * 0.05;
  const final_score = Number(Math.min(1, weighted).toFixed(6));

  // A Set keeps the incoming reasons unique while preserving their order.
  const reasons = new Set(result.reasons);
  if (exact_score > 0.5) reasons.add('exact_term');
  if (citation_score >= 0.75) reasons.add('cited_source');
  if (freshness_score >= 0.85) reasons.add('fresh_source');

  return {
    ...result,
    score: final_score,
    reasons: [...reasons],
    rerank: {
      base_score,
      exact_score,
      citation_score,
      freshness_score,
      authority_score,
      final_score,
    },
  };
}
|
|
185
|
+
|
|
186
|
+
/**
 * Produces a single-line quote for a result, at most `maxChars` long.
 *
 * The result text is used when it has any non-whitespace content, otherwise
 * the title. Whitespace runs are collapsed to single spaces. Quotes that do
 * not fit are cut and suffixed with `...`; room for the suffix is reserved so
 * the returned string never exceeds `maxChars` (for `maxChars` of 3 or more).
 *
 * @param result   Search hit to quote.
 * @param maxChars Maximum length of the returned string, in UTF-16 code units.
 * @returns The quote, or null when neither text nor title has content.
 */
function quoteFor(result: HybridSearchEntry, maxChars: number): string | null {
  const ellipsis = '...';
  const collapse = (value: string | null | undefined): string =>
    (value ?? '').replace(/\s+/g, ' ').trim();

  // `||` rather than `??`: an empty or whitespace-only text should still fall back to the title.
  const normalized = collapse(result.text) || collapse(result.title);
  if (!normalized) return null;
  if (normalized.length <= maxChars) return normalized;

  let head = normalized.slice(0, Math.max(0, maxChars - ellipsis.length));
  // Do not leave half of a surrogate pair dangling at the cut point.
  const lastUnit = head.charCodeAt(head.length - 1);
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) head = head.slice(0, -1);
  return `${head.trimEnd()}${ellipsis}`;
}
|
|
192
|
+
|
|
193
|
+
/**
 * Derives the citation record for a reranked result.
 *
 * The id is a stable hash of the result kind, id, source URI and artifact URI.
 * Every optional source, artifact or citation field that is absent is reported
 * as null, and the quote is capped at 500 characters.
 */
function citationFor(result: RerankedSearchEntry): RetrievalCitation {
  const { source, artifact, citation } = result;
  const sourceUri = source?.uri ?? null;
  const artifactUri = artifact?.uri ?? null;
  const identity = `${result.kind}\u0000${result.id}\u0000${sourceUri ?? ''}\u0000${artifactUri ?? ''}`;

  return {
    id: stableId('cite', identity),
    result_id: result.id,
    kind: result.kind,
    source_uri: sourceUri,
    source_ref: source?.ref ?? null,
    artifact_uri: artifactUri,
    artifact_path: artifact?.path ?? null,
    revision: source?.revision ?? null,
    hash: source?.hash ?? artifact?.hash ?? null,
    chunk_id: citation?.chunk_id ?? null,
    start_offset: citation?.start_offset ?? null,
    end_offset: citation?.end_offset ?? null,
    quote: quoteFor(result, 500),
    provenance: result.provenance,
  };
}
|
|
212
|
+
|
|
213
|
+
/**
 * Builds the excerpt for a reranked result, linked to its citation.
 *
 * @returns The excerpt, or null when the result yields no quotable text.
 */
function excerptFor(result: RerankedSearchEntry, citation: RetrievalCitation, contextChars: number): RetrievalExcerpt | null {
  const text = quoteFor(result, contextChars);
  return text
    ? {
        id: stableId('excerpt', `${result.kind}\u0000${result.id}`),
        result_id: result.id,
        citation_id: citation.id,
        kind: result.kind,
        text,
        score: result.score,
      }
    : null;
}
|
|
225
|
+
|
|
226
|
+
/** Returns one `?` SQL placeholder per value, comma-separated (an empty string for no values). */
function placeholders(values: unknown[]): string {
  return Array.from(values, () => '?').join(', ');
}
|
|
229
|
+
|
|
230
|
+
/**
 * Loads citation and backlink rows related to the given results.
 *
 * Citations are looked up by the results' citation chunk ids and by the ids of
 * `wiki_page` results; backlinks are looked up for `wiki_page` results only.
 * Each query is capped at 50 rows. The database is opened only when there is
 * something to look up and is always closed afterwards.
 *
 * @param dbPath  Path handed to `openKnowledgeDb`.
 * @param results Reranked results whose graph evidence is wanted.
 * @returns Citation rows (unique by id, chunk matches first) and backlink rows.
 */
function loadGraphEvidence(dbPath: string, results: RerankedSearchEntry[]): RetrievalGraphEvidence {
  // Deduplicate ids so repeated hits do not inflate the bound-parameter lists.
  const chunkIds = Array.from(new Set(
    results.map((result) => result.citation?.chunk_id).filter((id): id is string => Boolean(id)),
  ));
  const wikiPageIds = Array.from(new Set(
    results.filter((result) => result.kind === 'wiki_page').map((result) => result.id),
  ));
  const citationsById = new Map<string, CitationRow>();
  const backlinks: BacklinkRow[] = [];
  if (chunkIds.length === 0 && wikiPageIds.length === 0) return { citations: [], backlinks };

  // A citation can match both the chunk query and the wiki-page query; keep the first copy only.
  const collect = (rows: CitationRow[]): void => {
    for (const row of rows) {
      if (!citationsById.has(row.id)) citationsById.set(row.id, row);
    }
  };

  const db = openKnowledgeDb(dbPath);
  try {
    if (chunkIds.length > 0) {
      collect(db.query<CitationRow, string[]>(
        `SELECT id, wiki_page_id, chunk_id, source_uri, quote, start_offset, end_offset
         FROM citations
         WHERE chunk_id IN (${placeholders(chunkIds)})
         ORDER BY created_at DESC
         LIMIT 50`,
      ).all(...chunkIds));
    }
    if (wikiPageIds.length > 0) {
      collect(db.query<CitationRow, string[]>(
        `SELECT id, wiki_page_id, chunk_id, source_uri, quote, start_offset, end_offset
         FROM citations
         WHERE wiki_page_id IN (${placeholders(wikiPageIds)})
         ORDER BY created_at DESC
         LIMIT 50`,
      ).all(...wikiPageIds));
      // ORDER BY makes the LIMIT deterministic; without it the chosen rows are arbitrary.
      backlinks.push(...db.query<BacklinkRow, string[]>(
        `SELECT from_page_id, to_page_id, label
         FROM wiki_backlinks
         WHERE from_page_id IN (${placeholders(wikiPageIds)}) OR to_page_id IN (${placeholders(wikiPageIds)})
         ORDER BY from_page_id, to_page_id
         LIMIT 50`,
      ).all(...wikiPageIds, ...wikiPageIds));
    }
  } finally {
    db.close();
  }
  return { citations: Array.from(citationsById.values()), backlinks };
}
|
|
268
|
+
|
|
269
|
+
/**
 * Runs a hybrid search and packages the usable hits into a context pack.
 *
 * Steps:
 * 1. Run `hybridSearch` with the given options.
 * 2. Drop hits whose provenance is not read-only or is stale, recording a
 *    warning and a note for each.
 * 3. Rerank the remaining hits, sort by descending score (ties broken by id),
 *    and keep at most `search.limit`.
 * 4. Build one citation per kept hit and an excerpt for each hit with text.
 * 5. Load related citation/backlink rows from the knowledge database.
 *
 * @param options Hybrid-search options plus `contextChars`, the excerpt size
 *   (default 1200, clamped to 200–4000; an unusable `NaN` falls back to the default).
 * @returns The assembled context pack.
 */
export async function retrieveKnowledgeContext(options: RetrievalOptions): Promise<KnowledgeContextPack> {
  // NaN would propagate through Math.min/Math.max and make every excerpt collapse to "...".
  const requestedChars = options.contextChars;
  const baseChars = requestedChars == null || Number.isNaN(requestedChars) ? 1200 : requestedChars;
  const contextChars = Math.max(200, Math.min(baseChars, 4000));

  const search = await hybridSearch(options);
  const terms = queryTerms(search.query);
  const warnings = [...search.warnings];
  const permissionNotes = new Set<string>();
  const freshnessNotes = new Set<string>();

  const filtered = search.results.filter((result) => {
    if (!hasReadOnlyProvenance(result.provenance)) {
      warnings.push(`permission_filtered: ${result.kind}:${result.id}`);
      permissionNotes.add('Dropped a result because provenance was not read-only.');
      return false;
    }
    if (isStale(result.provenance)) {
      warnings.push(`stale_filtered: ${result.kind}:${result.id}`);
      freshnessNotes.add('Dropped a stale result whose source status requires reindexing.');
      return false;
    }
    return true;
  });

  const results = filtered
    .map((result) => rerank(result, terms))
    .sort((a, b) => b.score - a.score || a.id.localeCompare(b.id))
    .slice(0, search.limit);

  // citations[i] always belongs to results[i]; excerptFor relies on that pairing.
  const citations = results.map(citationFor);
  const excerpts = results
    .map((result, index) => excerptFor(result, citations[index], contextChars))
    .filter((entry): entry is RetrievalExcerpt => Boolean(entry));

  for (const result of results) {
    if (result.provenance && 'read_only' in result.provenance && result.provenance.read_only) {
      permissionNotes.add('All source-backed excerpts are read-only and citation-required.');
    }
    if (result.rerank.freshness_score >= 0.85) {
      freshnessNotes.add('Fresh source revision/hash or artifact hash is present for top context.');
    }
  }

  return {
    query: search.query,
    normalized_query: normalizeQuery(search.query),
    created_at: new Date().toISOString(),
    mode: search.mode,
    warnings,
    search_counts: search.counts,
    results,
    citations,
    excerpts,
    graph: loadGraphEvidence(options.dbPath, results),
    notes: {
      permissions: Array.from(permissionNotes),
      freshness: Array.from(freshnessNotes),
    },
  };
}
|
package/src/service.ts
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
import { createArtifactStore } from './artifact-store';
|
|
2
|
+
import { runKnowledgePrompt, type KnowledgePromptOptions } from './agent';
|
|
2
3
|
import {
|
|
3
4
|
embeddingIndexStatus,
|
|
4
5
|
indexKnowledgeEmbeddings,
|
|
@@ -12,6 +13,7 @@ import { ingestOpenFilesManifest } from './manifest-ingest';
|
|
|
12
13
|
import { ingestSourceRef } from './source-ingest';
|
|
13
14
|
import { resolveOpenFilesSource } from './source-resolver';
|
|
14
15
|
import { providerStatus, listModelRegistry, type ProviderStatusResult, type ModelRegistryEntry } from './providers';
|
|
16
|
+
import { retrieveKnowledgeContext, type RetrievalOptions } from './retrieval';
|
|
15
17
|
import { hybridSearch, type HybridSearchOptions } from './search';
|
|
16
18
|
import { resolveSafetyPolicy } from './safety';
|
|
17
19
|
import {
|
|
@@ -224,6 +226,24 @@ export class KnowledgeService {
|
|
|
224
226
|
config: this.config(),
|
|
225
227
|
});
|
|
226
228
|
}
|
|
229
|
+
|
|
230
|
+
/**
 * Retrieves a knowledge context pack for this service's workspace.
 * The workspace database path and the service config are supplied here, so
 * callers pass every other retrieval option.
 */
async retrieveContext(options: Omit<RetrievalOptions, 'dbPath' | 'config'>) {
  const { knowledgeDbPath } = this.ensureWorkspace();
  const config = this.config();
  return retrieveKnowledgeContext({ ...options, dbPath: knowledgeDbPath, config });
}
|
|
238
|
+
|
|
239
|
+
/**
 * Runs a knowledge prompt against this service's workspace.
 * The workspace database path and the service config are supplied here, so
 * callers pass every other prompt option.
 */
async runPrompt(options: Omit<KnowledgePromptOptions, 'dbPath' | 'config'>) {
  const { knowledgeDbPath } = this.ensureWorkspace();
  const config = this.config();
  return runKnowledgePrompt({ ...options, dbPath: knowledgeDbPath, config });
}
|
|
227
247
|
}
|
|
228
248
|
|
|
229
249
|
export function createKnowledgeService(options: KnowledgeServiceOptions = {}): KnowledgeService {
|