@hasna/knowledge 0.2.15 → 0.2.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,326 @@
1
+ import { createHash } from 'node:crypto';
2
+ import { openKnowledgeDb } from './knowledge-db';
3
+ import { isStaleStatus } from './provenance';
4
+ import { hybridSearch, type HybridSearchEntry, type HybridSearchOptions, type HybridSearchResult, type SearchProvenance } from './search';
5
+
6
/**
 * Options for `retrieveKnowledgeContext`: every hybrid-search option plus an
 * optional excerpt size.
 */
export interface RetrievalOptions extends HybridSearchOptions {
  /**
   * Requested excerpt length in characters. `retrieveKnowledgeContext`
   * defaults this to 1200 and clamps it to the range 200-4000.
   */
  contextChars?: number;
}
9
+
10
/**
 * A hybrid-search hit after reranking. The inherited `score` holds the final
 * blended score; `rerank` exposes the individual signals behind it.
 */
export interface RerankedSearchEntry extends HybridSearchEntry {
  rerank: {
    /** Score reported by hybrid search before reranking. */
    base_score: number;
    /** Weighted blend of all signals, capped at 1 and rounded to 6 decimals. */
    final_score: number;
    /** Fraction of query terms found in the result's title/text. */
    exact_score: number;
    /** Heuristic for how well the result can be cited. */
    citation_score: number;
    /** Heuristic for how fresh the underlying source appears. */
    freshness_score: number;
    /** Heuristic weight derived from the result kind. */
    authority_score: number;
  };
}
20
+
21
/** Citation metadata emitted for one reranked result. */
export interface RetrievalCitation {
  /** Stable `cite_…` identifier derived from kind, result id and URIs. */
  id: string;
  /** Id of the search result this citation belongs to. */
  result_id: string;
  kind: HybridSearchEntry['kind'];

  // Where the cited material lives.
  source_uri: string | null;
  source_ref: string | null;
  artifact_uri: string | null;
  artifact_path: string | null;

  // Version pinning.
  revision: string | null;
  /** Source hash when available, otherwise the artifact hash. */
  hash: string | null;

  // Position of the cited span, when the result carries one.
  chunk_id: string | null;
  start_offset: number | null;
  end_offset: number | null;

  /** Whitespace-collapsed snippet of the result's text or title. */
  quote: string | null;
  /** Provenance record copied from the search result. */
  provenance: SearchProvenance | null;
}
37
+
38
/** A context snippet taken from one reranked result. */
export interface RetrievalExcerpt {
  /** Stable `excerpt_…` identifier derived from the result kind and id. */
  id: string;
  /** Id of the search result the excerpt was taken from. */
  result_id: string;
  /** Id of the matching `RetrievalCitation`, if any. */
  citation_id: string | null;
  kind: HybridSearchEntry['kind'];
  /** Whitespace-collapsed excerpt text. */
  text: string;
  /** Reranked score of the originating result. */
  score: number;
}
46
+
47
/**
 * Supporting rows loaded from the knowledge database for the returned
 * results: stored citations and wiki backlinks.
 */
export interface RetrievalGraphEvidence {
  /** Rows from the `citations` table that reference a returned chunk or wiki page. */
  citations: {
    id: string;
    chunk_id: string | null;
    wiki_page_id: string | null;
    source_uri: string;
    quote: string | null;
    start_offset: number | null;
    end_offset: number | null;
  }[];
  /** Rows from `wiki_backlinks` that touch a returned wiki page in either direction. */
  backlinks: {
    from_page_id: string;
    to_page_id: string;
    label: string | null;
  }[];
}
63
+
64
/** Everything `retrieveKnowledgeContext` returns for a single query. */
export interface KnowledgeContextPack {
  /** Query string as reported by hybrid search. */
  query: string;
  /** NFKC-normalized, whitespace-collapsed, lower-cased form of `query`. */
  normalized_query: string;
  /** ISO-8601 timestamp taken when the pack was assembled. */
  created_at: string;
  mode: HybridSearchResult['mode'];
  /** Search warnings plus `permission_filtered:` / `stale_filtered:` entries. */
  warnings: string[];
  search_counts: HybridSearchResult['counts'];
  /** Reranked results, best first, truncated to the search limit. */
  results: RerankedSearchEntry[];
  /** One citation per entry in `results`, in the same order. */
  citations: RetrievalCitation[];
  /** Excerpts for the results that have any text or title. */
  excerpts: RetrievalExcerpt[];
  graph: RetrievalGraphEvidence;
  /** De-duplicated human-readable notes gathered while filtering and ranking. */
  notes: {
    permissions: string[];
    freshness: string[];
  };
}
80
+
81
/** Column shape selected from the `citations` table. */
interface CitationRow {
  id: string;
  /** Wiki page the citation is attached to, if any. */
  wiki_page_id: string | null;
  /** Chunk the citation is attached to, if any. */
  chunk_id: string | null;
  source_uri: string;
  quote: string | null;
  start_offset: number | null;
  end_offset: number | null;
}
90
+
91
/** Column shape selected from the `wiki_backlinks` table. */
interface BacklinkRow {
  /** Page the link originates from. */
  from_page_id: string;
  /** Page the link points to. */
  to_page_id: string;
  label: string | null;
}
96
+
97
/**
 * Builds a deterministic identifier of the form `<prefix>_<hex>`.
 *
 * @param prefix Label placed before the underscore.
 * @param value  Text that is hashed with SHA-256.
 * @returns The prefix joined to the first 20 hex characters of the digest.
 */
function stableId(prefix: string, value: string): string {
  const digest = createHash('sha256').update(value).digest('hex');
  const shortDigest = digest.slice(0, 20);
  return `${prefix}_${shortDigest}`;
}
100
+
101
/**
 * Canonicalizes a query for comparison: applies Unicode NFKC normalization,
 * trims the ends, collapses every whitespace run to one space, then
 * lower-cases the result.
 */
function normalizeQuery(query: string): string {
  const compatibilityForm = query.normalize('NFKC');
  const singleSpaced = compatibilityForm.trim().replace(/\s+/g, ' ');
  return singleSpaced.toLowerCase();
}
104
+
105
/**
 * Splits a query into distinct search terms.
 *
 * The query is normalized first; terms are maximal runs of Unicode letters,
 * digits or underscores. Duplicates are dropped (first occurrence wins) and
 * at most 16 terms are returned.
 */
function queryTerms(query: string): string[] {
  const tokens = normalizeQuery(query).match(/[\p{L}\p{N}_]+/gu);
  if (tokens === null) return [];
  const distinct = [...new Set(tokens)];
  return distinct.slice(0, 16);
}
108
+
109
/**
 * Returns the searchable text of a result: its non-empty title and text
 * joined by a single space.
 *
 * The text is NFKC-normalized before lower-casing so that it is folded the
 * same way as the query terms it is compared against. Without that step,
 * compatibility forms such as full-width letters or ligatures in the result
 * would never match a normalized term.
 */
function textForResult(result: HybridSearchEntry): string {
  return [result.title, result.text]
    .filter(Boolean)
    .join(' ')
    .normalize('NFKC')
    .toLowerCase();
}
112
+
113
/**
 * Measures how many query terms appear in a result.
 *
 * @param result Search hit whose title/text is inspected.
 * @param terms  Query terms to look for (substring match).
 * @returns Matched terms divided by total terms, rounded to 6 decimals;
 *          0 when there are no terms.
 */
function exactScore(result: HybridSearchEntry, terms: string[]): number {
  const total = terms.length;
  if (total === 0) return 0;

  const haystack = textForResult(result);
  let hits = 0;
  for (const term of terms) {
    if (haystack.includes(term)) hits += 1;
  }
  return Number((hits / total).toFixed(6));
}
119
+
120
/**
 * Decides whether a result may be used as read-only context.
 *
 * A missing provenance record, or one that carries neither flag, is accepted.
 * When `read_only` is present it alone decides; otherwise `read_only_sources`
 * decides. A present flag must be exactly `true` to pass.
 */
function hasReadOnlyProvenance(provenance: SearchProvenance | null): boolean {
  if (!provenance) return true;
  if ('read_only' in provenance) return provenance.read_only === true;
  return 'read_only_sources' in provenance ? provenance.read_only_sources === true : true;
}
126
+
127
/**
 * Reports whether a provenance record marks its result as stale.
 *
 * A truthy `stale` flag wins outright; otherwise a `status` field, when
 * present, is judged by `isStaleStatus`. Missing provenance is never stale.
 */
function isStale(provenance: SearchProvenance | null): boolean {
  if (!provenance) return false;

  const flaggedStale = 'stale' in provenance && Boolean(provenance.stale);
  if (flaggedStale) return true;

  return 'status' in provenance ? isStaleStatus(provenance.status) : false;
}
133
+
134
/**
 * Scores how trustworthy a result's freshness is, from 0 to 1.
 *
 * - 0    the provenance is stale
 * - 1    the source carries a hash or revision
 * - 0.85 the artifact carries a hash
 * - 0.75 the provenance lists at least one source ref
 * - 0.55 none of the above
 */
function freshnessScore(result: HybridSearchEntry): number {
  const { source, artifact, provenance } = result;
  if (isStale(provenance)) return 0;

  const sourcePinned = Boolean(source?.hash || source?.revision);
  if (sourcePinned) return 1;
  if (artifact?.hash) return 0.85;
  if (provenance && 'source_refs' in provenance && provenance.source_refs.length > 0) return 0.75;
  return 0.55;
}
141
+
142
/**
 * Scores how well a result can be cited, from 0.35 to 1.
 *
 * - 1    a chunk id plus a source or artifact URI
 * - 0.75 the provenance is flagged `citation_required`
 * - 0.65 only an artifact URI
 * - 0.35 nothing to cite
 */
function citationScore(result: HybridSearchEntry): number {
  const { citation, source, artifact, provenance } = result;

  const hasLocation = Boolean(source?.uri || artifact?.uri);
  if (citation?.chunk_id && hasLocation) return 1;
  if (provenance && 'citation_required' in provenance && provenance.citation_required) return 0.75;
  return artifact?.uri ? 0.65 : 0.35;
}
148
+
149
/**
 * Maps a result kind to a fixed authority weight: wiki chunks 0.85, source
 * chunks 0.8, wiki pages 0.65, and every other kind 0.55.
 */
function authorityScore(result: HybridSearchEntry): number {
  switch (result.kind) {
    case 'wiki_chunk':
      return 0.85;
    case 'source_chunk':
      return 0.8;
    case 'wiki_page':
      return 0.65;
    default:
      return 0.55;
  }
}
155
+
156
/**
 * Blends the search score of a result with exact-match, citation, freshness
 * and authority signals.
 *
 * Weights are 0.65 base, 0.1 each for exact, citation and freshness, and 0.05
 * for authority. The blend is capped at 1 and rounded to 6 decimals. Reasons
 * `exact_term`, `cited_source` and `fresh_source` are appended when the
 * matching signal clears its threshold.
 *
 * @param result Hit returned by hybrid search.
 * @param terms  Normalized query terms used for the exact-match signal.
 * @returns A copy of `result` with the blended `score`, the merged `reasons`
 *          and a `rerank` breakdown.
 */
function rerank(result: HybridSearchEntry, terms: string[]): RerankedSearchEntry {
  const base = result.score;
  const exact = exactScore(result, terms);
  const citation = citationScore(result);
  const freshness = freshnessScore(result);
  const authority = authorityScore(result);

  // Same left-to-right summation as before so rounding stays identical.
  const weighted = base * 0.65 + exact * 0.1 + citation * 0.1 + freshness * 0.1 + authority * 0.05;
  const finalScore = Number(Math.min(1, weighted).toFixed(6));

  const reasons = new Set(result.reasons);
  if (exact > 0.5) reasons.add('exact_term');
  if (citation >= 0.75) reasons.add('cited_source');
  if (freshness >= 0.85) reasons.add('fresh_source');

  return {
    ...result,
    score: finalScore,
    reasons: Array.from(reasons),
    rerank: {
      base_score: base,
      exact_score: exact,
      citation_score: citation,
      freshness_score: freshness,
      authority_score: authority,
      final_score: finalScore,
    },
  };
}
185
+
186
/**
 * Produces a single-line snippet for a result, at most `maxChars` long.
 *
 * The result's text is used when it has visible content, otherwise its
 * title. Whitespace runs are collapsed to one space and the ends trimmed.
 * When the snippet is too long it is cut and terminated with `...`, and the
 * ellipsis counts toward the limit.
 *
 * @param result   Search hit providing `text` and `title`.
 * @param maxChars Maximum length of the returned string in UTF-16 code units.
 * @returns The snippet, or `null` when neither text nor title has content.
 */
function quoteFor(result: HybridSearchEntry, maxChars: number): string | null {
  const collapse = (value: string | null | undefined): string =>
    (value ?? '').replace(/\s+/g, ' ').trim();

  // Blank text falls back to the title, not only null/undefined text.
  const normalized = collapse(result.text) || collapse(result.title);
  if (!normalized) return null;
  if (normalized.length <= maxChars) return normalized;

  const ellipsis = '...';
  // No room for an ellipsis: hard-cut so the limit is still honored.
  if (maxChars <= ellipsis.length) return normalized.slice(0, Math.max(0, maxChars));

  // Reserve space for the ellipsis so the total never exceeds maxChars.
  let head = normalized.slice(0, Math.floor(maxChars) - ellipsis.length);
  // Do not leave a dangling high surrogate at the cut point.
  const lastUnit = head.charCodeAt(head.length - 1);
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) head = head.slice(0, -1);

  return `${head.trim()}${ellipsis}`;
}
192
+
193
/**
 * Builds the citation record for a reranked result.
 *
 * The id is a stable hash of the result kind, result id, source URI and
 * artifact URI (missing URIs hash as empty strings). Absent source, artifact
 * and citation fields become `null`. The quote is limited via `quoteFor`
 * with a 500-character budget.
 */
function citationFor(result: RerankedSearchEntry): RetrievalCitation {
  const { source, artifact, citation } = result;
  const sourceUri = source?.uri ?? null;
  const artifactUri = artifact?.uri ?? null;
  const identity = [result.kind, result.id, sourceUri ?? '', artifactUri ?? ''].join('\u0000');

  return {
    id: stableId('cite', identity),
    result_id: result.id,
    kind: result.kind,
    source_uri: sourceUri,
    source_ref: source?.ref ?? null,
    artifact_uri: artifactUri,
    artifact_path: artifact?.path ?? null,
    revision: source?.revision ?? null,
    hash: source?.hash ?? artifact?.hash ?? null,
    chunk_id: citation?.chunk_id ?? null,
    start_offset: citation?.start_offset ?? null,
    end_offset: citation?.end_offset ?? null,
    quote: quoteFor(result, 500),
    provenance: result.provenance,
  };
}
212
+
213
/**
 * Builds the context excerpt for a reranked result.
 *
 * @param result       Reranked hit to excerpt.
 * @param citation     Citation already built for the same result.
 * @param contextChars Character budget handed to `quoteFor`.
 * @returns The excerpt, or `null` when the result yields no snippet text.
 */
function excerptFor(result: RerankedSearchEntry, citation: RetrievalCitation, contextChars: number): RetrievalExcerpt | null {
  const text = quoteFor(result, contextChars);
  if (!text) return null;

  const { kind, id: resultId, score } = result;
  return {
    id: stableId('excerpt', `${kind}\u0000${resultId}`),
    result_id: resultId,
    citation_id: citation.id,
    kind,
    text,
    score,
  };
}
225
+
226
/**
 * Returns one `?` per value, joined by `, `, for use inside a SQL `IN (...)`
 * list. An empty array yields an empty string.
 */
function placeholders(values: unknown[]): string {
  const marks = Array.from(values, () => '?');
  return marks.join(', ');
}
229
+
230
/**
 * Loads stored citations and wiki backlinks related to the given results.
 *
 * Citations are looked up by the results' chunk ids and by the ids of
 * `wiki_page` results; backlinks are looked up for those wiki pages in either
 * direction. Each query is capped at 50 rows. The database is only opened
 * when there is at least one id to look up, and it is always closed again.
 *
 * Ids are de-duplicated before binding so repeated results do not inflate the
 * parameter list. Citation rows are de-duplicated by id because a row can
 * match both the chunk query and the wiki-page query.
 *
 * @param dbPath  Path handed to `openKnowledgeDb`.
 * @param results Reranked results whose evidence should be loaded.
 * @returns Citation and backlink rows; both arrays are empty when no result
 *          carries a chunk id or is a wiki page.
 */
function loadGraphEvidence(dbPath: string, results: RerankedSearchEntry[]): RetrievalGraphEvidence {
  const chunkIds = Array.from(new Set(
    results.map((result) => result.citation?.chunk_id).filter((id): id is string => Boolean(id)),
  ));
  const wikiPageIds = Array.from(new Set(
    results.filter((result) => result.kind === 'wiki_page').map((result) => result.id),
  ));
  const citations: CitationRow[] = [];
  const backlinks: BacklinkRow[] = [];
  if (chunkIds.length === 0 && wikiPageIds.length === 0) return { citations, backlinks };

  const seenCitationIds = new Set<string>();
  const addCitations = (rows: CitationRow[]): void => {
    for (const row of rows) {
      if (seenCitationIds.has(row.id)) continue;
      seenCitationIds.add(row.id);
      citations.push(row);
    }
  };

  const db = openKnowledgeDb(dbPath);
  try {
    if (chunkIds.length > 0) {
      addCitations(db.query<CitationRow, string[]>(
        `SELECT id, wiki_page_id, chunk_id, source_uri, quote, start_offset, end_offset
         FROM citations
         WHERE chunk_id IN (${placeholders(chunkIds)})
         ORDER BY created_at DESC
         LIMIT 50`,
      ).all(...chunkIds));
    }
    if (wikiPageIds.length > 0) {
      addCitations(db.query<CitationRow, string[]>(
        `SELECT id, wiki_page_id, chunk_id, source_uri, quote, start_offset, end_offset
         FROM citations
         WHERE wiki_page_id IN (${placeholders(wikiPageIds)})
         ORDER BY created_at DESC
         LIMIT 50`,
      ).all(...wikiPageIds));
      // Without ORDER BY, which 50 rows survive the LIMIT is unspecified.
      backlinks.push(...db.query<BacklinkRow, string[]>(
        `SELECT from_page_id, to_page_id, label
         FROM wiki_backlinks
         WHERE from_page_id IN (${placeholders(wikiPageIds)}) OR to_page_id IN (${placeholders(wikiPageIds)})
         ORDER BY from_page_id, to_page_id
         LIMIT 50`,
      ).all(...wikiPageIds, ...wikiPageIds));
    }
  } finally {
    db.close();
  }
  return { citations, backlinks };
}
268
+
269
/**
 * Runs a hybrid search and packages the outcome as a context pack.
 *
 * Steps:
 * 1. Clamp `contextChars` to 200-4000 (default 1200) and run `hybridSearch`.
 * 2. Drop results whose provenance is not read-only or is stale, recording a
 *    warning and a note for each reason.
 * 3. Rerank the survivors, sort by score (ties broken by id) and keep the
 *    first `search.limit` entries.
 * 4. Build one citation per result and an excerpt wherever text is available.
 * 5. Load graph evidence for the kept results from the knowledge database.
 *
 * @param options Search options plus the optional excerpt size.
 * @returns The assembled context pack.
 */
export async function retrieveKnowledgeContext(options: RetrievalOptions): Promise<KnowledgeContextPack> {
  const requestedChars = options.contextChars ?? 1200;
  const contextChars = Math.max(200, Math.min(requestedChars, 4000));
  const search = await hybridSearch(options);
  const terms = queryTerms(search.query);
  const warnings = [...search.warnings];
  const permissionNotes = new Set<string>();
  const freshnessNotes = new Set<string>();

  const candidates: RerankedSearchEntry[] = [];
  for (const hit of search.results) {
    if (!hasReadOnlyProvenance(hit.provenance)) {
      warnings.push(`permission_filtered: ${hit.kind}:${hit.id}`);
      permissionNotes.add('Dropped a result because provenance was not read-only.');
      continue;
    }
    if (isStale(hit.provenance)) {
      warnings.push(`stale_filtered: ${hit.kind}:${hit.id}`);
      freshnessNotes.add('Dropped a stale result whose source status requires reindexing.');
      continue;
    }
    candidates.push(rerank(hit, terms));
  }
  candidates.sort((a, b) => b.score - a.score || a.id.localeCompare(b.id));
  const results = candidates.slice(0, search.limit);

  const citations: RetrievalCitation[] = [];
  const excerpts: RetrievalExcerpt[] = [];
  for (const result of results) {
    const citation = citationFor(result);
    citations.push(citation);

    const excerpt = excerptFor(result, citation, contextChars);
    if (excerpt) excerpts.push(excerpt);

    const { provenance } = result;
    if (provenance && 'read_only' in provenance && provenance.read_only) {
      permissionNotes.add('All source-backed excerpts are read-only and citation-required.');
    }
    if (result.rerank.freshness_score >= 0.85) {
      freshnessNotes.add('Fresh source revision/hash or artifact hash is present for top context.');
    }
  }

  return {
    query: search.query,
    normalized_query: normalizeQuery(search.query),
    created_at: new Date().toISOString(),
    mode: search.mode,
    warnings,
    search_counts: search.counts,
    results,
    citations,
    excerpts,
    graph: loadGraphEvidence(options.dbPath, results),
    notes: {
      permissions: Array.from(permissionNotes),
      freshness: Array.from(freshnessNotes),
    },
  };
}
package/src/service.ts CHANGED
@@ -1,4 +1,5 @@
1
1
  import { createArtifactStore } from './artifact-store';
2
+ import { runKnowledgePrompt, type KnowledgePromptOptions } from './agent';
2
3
  import {
3
4
  embeddingIndexStatus,
4
5
  indexKnowledgeEmbeddings,
@@ -12,6 +13,7 @@ import { ingestOpenFilesManifest } from './manifest-ingest';
12
13
  import { ingestSourceRef } from './source-ingest';
13
14
  import { resolveOpenFilesSource } from './source-resolver';
14
15
  import { providerStatus, listModelRegistry, type ProviderStatusResult, type ModelRegistryEntry } from './providers';
16
+ import { retrieveKnowledgeContext, type RetrievalOptions } from './retrieval';
15
17
  import { hybridSearch, type HybridSearchOptions } from './search';
16
18
  import { resolveSafetyPolicy } from './safety';
17
19
  import {
@@ -224,6 +226,24 @@ export class KnowledgeService {
224
226
  config: this.config(),
225
227
  });
226
228
  }
229
+
230
/**
 * Retrieves a knowledge context pack for this service's workspace.
 *
 * The caller's options are forwarded to `retrieveKnowledgeContext`, with
 * `dbPath` taken from the workspace returned by `ensureWorkspace()` and
 * `config` taken from `this.config()`.
 */
async retrieveContext(options: Omit<RetrievalOptions, 'dbPath' | 'config'>) {
  const { knowledgeDbPath } = this.ensureWorkspace();
  const config = this.config();
  return retrieveKnowledgeContext({ ...options, dbPath: knowledgeDbPath, config });
}
238
+
239
/**
 * Runs a knowledge prompt against this service's workspace.
 *
 * The caller's options are forwarded to `runKnowledgePrompt`, with `dbPath`
 * taken from the workspace returned by `ensureWorkspace()` and `config`
 * taken from `this.config()`.
 */
async runPrompt(options: Omit<KnowledgePromptOptions, 'dbPath' | 'config'>) {
  const { knowledgeDbPath } = this.ensureWorkspace();
  const config = this.config();
  return runKnowledgePrompt({ ...options, dbPath: knowledgeDbPath, config });
}
227
247
  }
228
248
 
229
249
  export function createKnowledgeService(options: KnowledgeServiceOptions = {}): KnowledgeService {