@hasna/knowledge 0.2.17 → 0.2.19

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/src/service.ts CHANGED
@@ -13,9 +13,11 @@ import { ingestOpenFilesManifest } from './manifest-ingest';
13
13
  import { ingestSourceRef } from './source-ingest';
14
14
  import { resolveOpenFilesSource } from './source-resolver';
15
15
  import { providerStatus, listModelRegistry, type ProviderStatusResult, type ModelRegistryEntry } from './providers';
16
+ import { enqueueMissingEmbeddings, refreshEmbeddingIndex, reindexHealth, type ReindexRuntimeOptions } from './reindex';
16
17
  import { retrieveKnowledgeContext, type RetrievalOptions } from './retrieval';
17
18
  import { hybridSearch, type HybridSearchOptions } from './search';
18
19
  import { resolveSafetyPolicy } from './safety';
20
+ import { runProviderWebSearch, type WebSearchOptions } from './web-search';
19
21
  import {
20
22
  recordStorageObjects,
21
23
  resolveStorageContract,
@@ -187,6 +189,33 @@ export class KnowledgeService {
187
189
  });
188
190
  }
189
191
 
192
/**
 * Delegates to the module-level `reindexHealth` helper for this service's workspace.
 *
 * Caller options are forwarded as-is; `dbPath` and `config` are always supplied
 * by the service (workspace knowledge DB path and the service configuration).
 */
reindexHealth(options: Omit<ReindexRuntimeOptions, 'dbPath' | 'config'> = {}) {
  const { knowledgeDbPath } = this.ensureWorkspace();
  const runtime = { ...options, dbPath: knowledgeDbPath, config: this.config() };
  return reindexHealth(runtime);
}
200
+
201
/**
 * Delegates to `enqueueMissingEmbeddings` for this service's workspace.
 *
 * Caller options are forwarded as-is; `dbPath` and `config` are always supplied
 * by the service and cannot be overridden through `options`.
 */
enqueueReindex(options: Omit<ReindexRuntimeOptions, 'dbPath' | 'config'> = {}) {
  const { knowledgeDbPath } = this.ensureWorkspace();
  const runtime = { ...options, dbPath: knowledgeDbPath, config: this.config() };
  return enqueueMissingEmbeddings(runtime);
}
209
+
210
/**
 * Delegates to `refreshEmbeddingIndex` for this service's workspace.
 *
 * Besides the shared runtime options, callers may pass `full` and `limit`;
 * `dbPath` and `config` are always supplied by the service.
 */
async refreshEmbeddings(options: Omit<ReindexRuntimeOptions & { full?: boolean; limit?: number }, 'dbPath' | 'config'> = {}) {
  const { knowledgeDbPath } = this.ensureWorkspace();
  const runtime = { ...options, dbPath: knowledgeDbPath, config: this.config() };
  return refreshEmbeddingIndex(runtime);
}
218
+
190
219
  providerStatus(env: Record<string, string | undefined> = process.env): ProviderStatusResult {
191
220
  return providerStatus(this.config(), env);
192
221
  }
@@ -244,6 +273,16 @@ export class KnowledgeService {
244
273
  config: this.config(),
245
274
  });
246
275
  }
276
+
277
/**
 * Runs a provider web search against this service's workspace.
 *
 * Caller options are forwarded to `runProviderWebSearch`; `dbPath`, `config`
 * and `safetyPolicy` are always supplied by the service itself.
 */
async webSearch(options: Omit<WebSearchOptions, 'dbPath' | 'config' | 'safetyPolicy'>) {
  const { knowledgeDbPath } = this.ensureWorkspace();
  const request = {
    ...options,
    dbPath: knowledgeDbPath,
    config: this.config(),
    safetyPolicy: this.safetyPolicy(),
  };
  return runProviderWebSearch(request);
}
247
286
  }
248
287
 
249
288
  export function createKnowledgeService(options: KnowledgeServiceOptions = {}): KnowledgeService {
@@ -0,0 +1,330 @@
1
+ import { createHash, randomUUID } from 'node:crypto';
2
+ import { migrateKnowledgeDb, openKnowledgeDb } from './knowledge-db';
3
+ import { ingestOpenFilesManifestItems } from './manifest-ingest';
4
+ import {
5
+ assertProviderCredentials,
6
+ normalizeAiSdkUsage,
7
+ parseModelRef,
8
+ providerSettings,
9
+ recordProviderUsage,
10
+ resolveModelRef,
11
+ type AiProviderId,
12
+ } from './providers';
13
+ import { assertWebSearchAllowed, recordAuditEvent, type SafetyPolicy } from './safety';
14
+ import type { KnowledgeConfig } from './workspace';
15
+
16
/** Input accepted by `runProviderWebSearch`. */
export interface WebSearchOptions {
  /** Path of the knowledge database that run, audit and usage rows are written to. */
  dbPath: string;
  /** Search prompt; it is trimmed and must not be empty. */
  query: string;
  /** Configuration used to resolve the model reference, provider settings and credentials. */
  config?: KnowledgeConfig;
  /** When set and the run is not fake, checked with `assertWebSearchAllowed` before searching. */
  safetyPolicy?: SafetyPolicy;
  /** Explicit model reference; when omitted, the provider default or `'default'` is resolved. */
  modelRef?: string;
  /** Provider override; takes precedence over the provider parsed from the model reference. */
  provider?: AiProviderId;
  /** Maximum number of sources returned; clamped to 1–20, default 5. */
  limit?: number;
  /** Value passed as the search tool's `maxUses` setting; clamped to 1–10, default 3. */
  maxUses?: number;
  /** Domain allow-list handed to the provider search tool; default is no restriction. */
  domains?: string[];
  /** When true, no provider is called and deterministic fixture sources are returned. */
  fake?: boolean;
  /** When true, returned sources are ingested through `ingestOpenFilesManifestItems`. */
  fileResults?: boolean;
  /** Environment map used to look up the provider API key; defaults to `process.env`. */
  env?: Record<string, string | undefined>;
  /** Timestamp applied to every row written for the run; defaults to the current time. */
  now?: Date;
}
31
+
32
/** One web source extracted from a provider response (or a fake fixture). */
export interface WebSearchSource {
  /** Source address; also used as the de-duplication key when collecting sources. */
  url: string;
  /** Human-readable title, or `null` when the provider record carried none. */
  title: string | null;
  /** Short excerpt or description, or `null` when the provider record carried none. */
  snippet: string | null;
  /** The raw provider record this source was built from (fixture metadata for fake runs). */
  provider_metadata: Record<string, unknown>;
}
38
+
39
/** Outcome returned by `runProviderWebSearch`. */
export interface WebSearchResult {
  /** Identifier of the `runs` row created for this search (`run_<uuid>`). */
  run_id: string;
  /** The trimmed query that was searched. */
  query: string;
  /** Provider that handled (or, for fake runs, would have handled) the search. */
  provider: string;
  /** Model name used with that provider. */
  model: string;
  /** Text answer produced by the provider, or the fixture answer for fake runs. */
  answer: string;
  /** Sources extracted from the response, capped at the requested limit. */
  sources: WebSearchSource[];
  /** Number of sources reported as upserted by ingestion; 0 when filing was not requested. */
  filed_sources: number;
  /** Token and cost accounting recorded for the run. */
  usage: {
    input_tokens: number;
    output_tokens: number;
    cost_usd: number;
  };
  /** Non-fatal notices, e.g. `no_web_sources_returned` when no source was found. */
  warnings: string[];
}
54
+
55
/** Returns the SHA-256 digest of `value` as hex, prefixed with `sha256:`. */
function stableHash(value: string): string {
  const hasher = createHash('sha256');
  hasher.update(value);
  return 'sha256:' + hasher.digest('hex');
}
58
+
59
/**
 * Rough token estimate: 1.25 tokens per whitespace-separated word, rounded up,
 * and never less than 1 (even for empty or blank text).
 */
function estimateTokens(text: string): number {
  const wordRuns = text.match(/\S+/g);
  const wordCount = wordRuns === null ? 0 : wordRuns.length;
  return Math.max(1, Math.ceil(wordCount * 1.25));
}
63
+
64
/**
 * Narrows an unknown value to a plain key/value record.
 * Anything that is not a non-null, non-array object yields a fresh empty object.
 */
function asRecord(value: unknown): Record<string, unknown> {
  if (typeof value !== 'object' || value === null || Array.isArray(value)) {
    return {};
  }
  return value as Record<string, unknown>;
}
67
+
68
/** Returns `value` when it is a non-empty string; otherwise `null`. */
function asString(value: unknown): string | null {
  if (typeof value !== 'string') return null;
  return value === '' ? null : value;
}
71
+
72
/**
 * Builds a `WebSearchSource` from a loosely shaped provider record.
 *
 * The URL is taken from `url`, `uri` or `sourceUrl` (first non-empty string wins);
 * without one the record is not a source and `null` is returned. Title falls back
 * from `title` to `name`, snippet from `snippet` to `text` to `description`.
 */
function sourceFromRecord(value: unknown): WebSearchSource | null {
  const fields = asRecord(value);

  // First non-empty string among the given keys, checked in order.
  const firstString = (...keys: string[]): string | null => {
    for (const key of keys) {
      const found = asString(fields[key]);
      if (found !== null) return found;
    }
    return null;
  };

  const url = firstString('url', 'uri', 'sourceUrl');
  if (url === null) return null;

  return {
    url,
    title: firstString('title', 'name'),
    snippet: firstString('snippet', 'text', 'description'),
    provider_metadata: fields,
  };
}
83
+
84
/**
 * Walks a provider response fragment and gathers every source it contains.
 *
 * Arrays are traversed element by element. A record is added when it parses as
 * a source, then its `sources`, `results`, `citations`, `annotations` and
 * `output` properties are searched recursively. Sources are keyed by URL in
 * `output`, so a later record with the same URL replaces the earlier one.
 */
function collectSources(value: unknown, output: Map<string, WebSearchSource>): void {
  if (Array.isArray(value)) {
    for (let index = 0; index < value.length; index += 1) {
      collectSources(value[index], output);
    }
    return;
  }

  const candidate = sourceFromRecord(value);
  if (candidate !== null) {
    output.set(candidate.url, candidate);
  }

  const container = asRecord(value);
  const nestedKeys = ['sources', 'results', 'citations', 'annotations', 'output'];
  for (const nestedKey of nestedKeys) {
    const nested = container[nestedKey];
    if (nested) {
      collectSources(nested, output);
    }
  }
}
96
+
97
/**
 * Produces deterministic fixture sources for fake runs.
 * At most three are returned, fewer when `limit` is smaller.
 */
function fakeSources(query: string, limit: number): WebSearchSource[] {
  const count = Math.min(limit, 3);
  const fixtures: WebSearchSource[] = [];
  for (let rank = 1; rank <= count; rank += 1) {
    fixtures.push({
      url: `https://example.com/knowledge-web-${rank}`,
      title: `Fake web source ${rank}`,
      snippet: `Deterministic web-search fixture for "${query}"`,
      provider_metadata: { fake: true, rank },
    });
  }
  return fixtures;
}
105
+
106
/**
 * Runs a single OpenAI generation with the provider's web search tool forced on.
 *
 * The API key is read from `input.env` under the variable name given by the
 * OpenAI provider settings; the base URL also comes from those settings.
 *
 * `input.maxUses` is accepted for signature parity with `anthropicWebSearch`
 * but is not forwarded to the OpenAI tool by this function.
 *
 * @throws Error when the installed OpenAI provider has no `tools.webSearch`.
 */
async function openAiWebSearch(input: {
  query: string;
  model: string;
  config?: KnowledgeConfig;
  env: Record<string, string | undefined>;
  maxUses: number;
  domains: string[];
}) {
  const { generateText } = await import('ai');
  const { createOpenAI } = await import('@ai-sdk/openai');
  const settings = providerSettings(input.config, 'openai');
  // Cast kept from the original: the tool factory is looked up dynamically.
  const openai = createOpenAI({
    apiKey: input.env[settings.api_key_env],
    baseURL: settings.base_url,
  }) as any;
  const webSearch = openai.tools?.webSearch;
  if (!webSearch) throw new Error('OpenAI provider does not expose tools.webSearch.');
  return generateText({
    model: openai(input.model),
    prompt: input.query,
    tools: {
      web_search: webSearch({
        externalWebAccess: true,
        searchContextSize: 'medium',
        // The OpenAI web search tool takes its domain allow-list under
        // `filters.allowedDomains`; a top-level `allowedDomains` key is not part
        // of its arguments, so the restriction would be silently dropped.
        ...(input.domains.length > 0 ? { filters: { allowedDomains: input.domains } } : {}),
      }),
    },
    toolChoice: { type: 'tool', toolName: 'web_search' },
  });
}
136
+
137
/**
 * Runs a single Anthropic generation with the provider's web search tool attached.
 *
 * The tool factory is `tools.webSearch_20250305` when present, otherwise
 * `tools.webSearch`. `maxUses` is always forwarded; `allowedDomains` only when
 * at least one domain was given.
 *
 * @throws Error when the installed Anthropic provider exposes neither factory.
 */
async function anthropicWebSearch(input: {
  query: string;
  model: string;
  config?: KnowledgeConfig;
  env: Record<string, string | undefined>;
  maxUses: number;
  domains: string[];
}) {
  const { query, model, config, env, maxUses, domains } = input;
  const { generateText } = await import('ai');
  const { createAnthropic } = await import('@ai-sdk/anthropic');
  const settings = providerSettings(config, 'anthropic');
  const anthropic = createAnthropic({
    apiKey: env[settings.api_key_env],
    baseURL: settings.base_url,
  }) as any;

  const factory = anthropic.tools?.webSearch_20250305 ?? anthropic.tools?.webSearch;
  if (!factory) {
    throw new Error('Anthropic provider does not expose a web search tool.');
  }

  const toolArgs: Record<string, unknown> = { maxUses };
  if (domains.length > 0) {
    toolArgs.allowedDomains = domains;
  }

  return generateText({
    model: anthropic(model),
    prompt: query,
    tools: { web_search: factory(toolArgs) },
  });
}
165
+
166
/**
 * Ingests web-search sources into the knowledge database when requested.
 *
 * Nothing happens unless `options.fileResults` is set and there is at least one
 * source. Each source becomes a read-only plain-text manifest item whose text is
 * its title, snippet and URL joined by newlines; the hash of that text serves as
 * both `hash` and `revision`.
 *
 * @returns The `sources_upserted` count reported by ingestion, or 0 when skipped.
 */
async function fileWebSources(options: WebSearchOptions, sources: WebSearchSource[], now: string): Promise<number> {
  if (!options.fileResults) return 0;
  if (sources.length === 0) return 0;

  const toManifestItem = (entry: WebSearchSource) => {
    const body = [entry.title, entry.snippet, entry.url].filter(Boolean).join('\n');
    const digest = stableHash(body);
    return {
      source_ref: entry.url,
      name: entry.title ?? entry.url,
      url: entry.url,
      mime: 'text/plain',
      hash: digest,
      revision: digest,
      status: 'active',
      updated_at: now,
      permissions: { mode: 'read_only', allowed_purposes: ['knowledge_answer', 'knowledge_index'] },
      metadata: {
        source_ref: entry.url,
        content_source: 'provider_web_search',
        provider_metadata: entry.provider_metadata,
      },
      extracted_text: body,
    };
  };

  const items = sources.map((entry) => toManifestItem(entry));
  const ingested = await ingestOpenFilesManifestItems({
    dbPath: options.dbPath,
    items,
    sourceLabel: `web-search:${options.query}`,
    readAction: 'provider_web_search_file_results',
    safetyPolicy: options.safetyPolicy,
    now: new Date(now),
  });
  return ingested.sources_upserted;
}
199
+
200
/**
 * Best-effort bookkeeping for a web-search run that threw after its `runs` row
 * was created: flips the row out of `running` and logs a failure event.
 *
 * Any error raised here is swallowed so it can never mask the original failure.
 * NOTE(review): assumes `'failed'` is an accepted `runs.status` value and
 * `'error'` an accepted `run_events.level` — confirm against the schema.
 */
function recordWebSearchFailure(dbPath: string, runId: string, error: unknown, now: string): void {
  try {
    const db = openKnowledgeDb(dbPath);
    try {
      db.run(
        `UPDATE runs SET status = ?, updated_at = ? WHERE id = ?`,
        ['failed', now, runId],
      );
      db.run(
        `INSERT INTO run_events (id, run_id, level, event, metadata_json, created_at)
         VALUES (?, ?, ?, ?, ?, ?)`,
        [
          `evt_${randomUUID()}`,
          runId,
          'error',
          'provider_web_search_failed',
          JSON.stringify({ message: error instanceof Error ? error.message : String(error) }),
          now,
        ],
      );
    } finally {
      db.close();
    }
  } catch {
    // Deliberately ignored: the caller rethrows the original error.
  }
}

/**
 * Runs a web search through a provider's native search tool and records it.
 *
 * Steps: validate and normalise the options, insert a `runs` row (status
 * `running`) plus an audit event, obtain the answer and sources (from the
 * provider, or fixtures when `options.fake` is set), optionally file the
 * sources, then mark the run `completed` and record an event and provider usage.
 *
 * If anything fails after the run row exists, the run is marked failed on a
 * best-effort basis and the original error is rethrown, so rows do not stay in
 * `running` forever.
 *
 * @param options See `WebSearchOptions`; `limit` is clamped to 1–20 (default 5)
 *   and `maxUses` to 1–10 (default 3).
 * @returns The run id, answer, sources (at most `limit`), usage and warnings.
 * @throws Error when the query is blank, or when a non-fake run targets a
 *   provider other than `openai` or `anthropic`. The safety and credential
 *   checks made for non-fake runs are expected to throw on failure as well.
 */
export async function runProviderWebSearch(options: WebSearchOptions): Promise<WebSearchResult> {
  const query = options.query.trim();
  if (!query) throw new Error('Web search query is required.');
  const env = options.env ?? process.env;
  const now = (options.now ?? new Date()).toISOString();
  const limit = Math.max(1, Math.min(options.limit ?? 5, 20));
  const maxUses = Math.max(1, Math.min(options.maxUses ?? 3, 10));
  const domains = options.domains ?? [];
  const modelRef = resolveModelRef(options.modelRef ?? (options.provider ? `${options.provider}:${providerSettings(options.config, options.provider).default_model}` : 'default'), options.config);
  const parsed = parseModelRef(modelRef);
  // An explicit provider wins; if it differs from the parsed one, use its default model.
  const provider = options.provider ?? parsed.provider;
  const model = parsed.provider === provider ? parsed.model : providerSettings(options.config, provider).default_model;
  const runId = `run_${randomUUID()}`;

  // Fake runs skip the safety, capability and credential gates entirely.
  if (!options.fake && options.safetyPolicy) assertWebSearchAllowed(options.safetyPolicy);
  if (!options.fake && provider !== 'openai' && provider !== 'anthropic') {
    throw new Error(`Provider ${provider} does not expose native web search yet.`);
  }
  if (!options.fake) assertProviderCredentials(provider, options.config, env);

  migrateKnowledgeDb(options.dbPath);
  const db = openKnowledgeDb(options.dbPath);
  try {
    db.run(
      `INSERT INTO runs (id, type, prompt, status, provider, model, metadata_json, created_at, updated_at)
       VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`,
      [
        runId,
        'provider-web-search',
        query,
        'running',
        provider,
        model,
        JSON.stringify({ domains, max_uses: maxUses, fake: options.fake === true }),
        now,
        now,
      ],
    );
    recordAuditEvent(db, {
      event_type: 'source_read',
      action: options.fake ? 'fake_provider_web_search' : 'provider_web_search',
      target_uri: query,
      decision: 'allow',
      metadata: { provider, model, domains, max_uses: maxUses },
      created_at: now,
    });
  } finally {
    // Closed before the provider call so no handle is held across the network wait.
    db.close();
  }

  let answer = '';
  let sources: WebSearchSource[] = [];
  let usage = { input_tokens: estimateTokens(query), output_tokens: 0, cost_usd: 0 };
  let filedSources = 0;
  const warnings: string[] = [];

  try {
    if (options.fake) {
      sources = fakeSources(query, limit);
      answer = `Fake web search answer for: ${query}`;
      usage.output_tokens = estimateTokens(answer);
    } else {
      const result = provider === 'openai'
        ? await openAiWebSearch({ query, model, config: options.config, env, maxUses, domains })
        : await anthropicWebSearch({ query, model, config: options.config, env, maxUses, domains });
      answer = result.text;
      // Sources may appear both at the top level and inside tool results; merge by URL.
      const collected = new Map<string, WebSearchSource>();
      collectSources((result as any).sources, collected);
      collectSources((result as any).toolResults, collected);
      sources = Array.from(collected.values()).slice(0, limit);
      const normalized = normalizeAiSdkUsage({
        provider,
        model,
        usage: (result as any).usage,
        providerMetadata: (result as any).providerMetadata,
      });
      usage = {
        input_tokens: normalized.input_tokens,
        output_tokens: normalized.output_tokens,
        cost_usd: normalized.cost_usd,
      };
    }

    filedSources = await fileWebSources(options, sources, now);
    const writeDb = openKnowledgeDb(options.dbPath);
    try {
      writeDb.run(
        `UPDATE runs SET status = ?, metadata_json = ?, updated_at = ? WHERE id = ?`,
        [
          'completed',
          JSON.stringify({ domains, max_uses: maxUses, sources: sources.length, filed_sources: filedSources, fake: options.fake === true }),
          now,
          runId,
        ],
      );
      writeDb.run(
        `INSERT INTO run_events (id, run_id, level, event, metadata_json, created_at)
         VALUES (?, ?, ?, ?, ?, ?)`,
        [
          `evt_${randomUUID()}`,
          runId,
          'info',
          'provider_web_search_completed',
          JSON.stringify({ sources: sources.length, filed_sources: filedSources }),
          now,
        ],
      );
      recordProviderUsage(writeDb, {
        run_id: runId,
        provider,
        model,
        input_tokens: usage.input_tokens,
        output_tokens: usage.output_tokens,
        cost_usd: usage.cost_usd,
        metadata: { web_search: true, sources: sources.length, filed_sources: filedSources },
        created_at: now,
      });
    } finally {
      writeDb.close();
    }
  } catch (error: unknown) {
    // The run row already exists; do not leave it stuck in 'running'.
    recordWebSearchFailure(options.dbPath, runId, error, now);
    throw error;
  }

  if (sources.length === 0) warnings.push('no_web_sources_returned');
  return {
    run_id: runId,
    query,
    provider,
    model,
    answer,
    sources,
    filed_sources: filedSources,
    usage,
    warnings,
  };
}