@hasna/knowledge 0.2.17 → 0.2.18
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +17 -1
- package/bin/open-knowledge-mcp.js +283 -6
- package/bin/open-knowledge.js +64 -57
- package/docs/architecture/ai-native-knowledge-base.md +7 -0
- package/docs/architecture/hybrid-semantic-search.md +5 -0
- package/package.json +1 -1
- package/src/cli.ts +33 -4
- package/src/mcp.js +18 -0
- package/src/service.ts +11 -0
- package/src/web-search.ts +330 -0
|
@@ -255,6 +255,13 @@ returns a local citation draft by default, optionally calls AI SDK generation vi
|
|
|
255
255
|
`--generate`, records `runs`, `run_events`, and `provider_usage`, and only
|
|
256
256
|
proposes durable wiki updates until the wiki compile/write task owns writes.
|
|
257
257
|
|
|
258
|
+
Provider-native web search is exposed separately as
|
|
259
|
+
`open-knowledge web search <query>` and MCP `ok_web_search`. Real network access
|
|
260
|
+
is safety-gated; OpenAI and Anthropic use provider web-search tools through AI
|
|
261
|
+
SDK, while DeepSeek remains a future fallback/external-search path. Returned web
|
|
262
|
+
snippets can optionally be filed as read-only `web` source refs for later local
|
|
263
|
+
search and citation.
|
|
264
|
+
|
|
258
265
|
## Provider Registry
|
|
259
266
|
|
|
260
267
|
AI provider setup is BYOK and AI SDK v6 based. The local provider layer tracks:
|
|
@@ -142,6 +142,11 @@ source refs, artifact URIs, revision/hash metadata, offsets, and provenance.
|
|
|
142
142
|
and MCP `knowledge_ask` wrap this context pack in a run ledger and return a
|
|
143
143
|
citation draft or explicit AI SDK generated answer.
|
|
144
144
|
|
|
145
|
+
Provider-native web search lives beside local retrieval. `open-knowledge web
|
|
146
|
+
search` and MCP `ok_web_search` are safety-gated, capture provider sources, and
|
|
147
|
+
can file snippets as read-only `web` source refs so later local retrieval treats
|
|
148
|
+
them like other cited sources.
|
|
149
|
+
|
|
145
150
|
## Reindexing
|
|
146
151
|
|
|
147
152
|
Reindexing is driven by source revisions:
|
package/package.json
CHANGED
package/src/cli.ts
CHANGED
|
@@ -54,6 +54,9 @@ interface Flags {
|
|
|
54
54
|
context?: boolean;
|
|
55
55
|
generate?: boolean;
|
|
56
56
|
approveWrite?: boolean;
|
|
57
|
+
provider?: string;
|
|
58
|
+
domain?: string[];
|
|
59
|
+
fileResults?: boolean;
|
|
57
60
|
noColor?: boolean;
|
|
58
61
|
scope?: string;
|
|
59
62
|
olderThan?: number;
|
|
@@ -68,7 +71,7 @@ interface ParseResult {
|
|
|
68
71
|
flags: Flags;
|
|
69
72
|
}
|
|
70
73
|
|
|
71
|
-
const COMMANDS = ['add', 'list', 'get', 'delete', 'update', 'archive', 'restore', 'upsert', 'untag', 'export', 'prune', 'dedupe', 'stats', 'paths', 'storage', 'db', 'wiki', 'source', 'ingest', 'reindex', 'search', 'ask', 'build', 'embeddings', 'providers', 'safety', 'help'];
|
|
74
|
+
const COMMANDS = ['add', 'list', 'get', 'delete', 'update', 'archive', 'restore', 'upsert', 'untag', 'export', 'prune', 'dedupe', 'stats', 'paths', 'storage', 'db', 'wiki', 'source', 'ingest', 'reindex', 'search', 'web', 'ask', 'build', 'embeddings', 'providers', 'safety', 'help'];
|
|
72
75
|
const COMMAND_ALIASES: Record<string, string> = {
|
|
73
76
|
ls: 'list',
|
|
74
77
|
rm: 'delete',
|
|
@@ -111,6 +114,9 @@ function parseArgs(argv: string[]): ParseResult {
|
|
|
111
114
|
case '--context': flags.context = true; break;
|
|
112
115
|
case '--generate': flags.generate = true; break;
|
|
113
116
|
case '--approve-write': flags.approveWrite = true; break;
|
|
117
|
+
case '--provider': flags.provider = argv[i + 1]; i += 1; break;
|
|
118
|
+
case '--domain': flags.domain = [...(flags.domain ?? []), argv[i + 1]]; i += 1; break;
|
|
119
|
+
case '--file-results': flags.fileResults = true; break;
|
|
114
120
|
case '--fake': flags.fake = true; break;
|
|
115
121
|
case '--no-color': flags.noColor = true; break;
|
|
116
122
|
case '--scope': flags.scope = argv[i + 1]; i += 1; break;
|
|
@@ -190,6 +196,7 @@ Commands:
|
|
|
190
196
|
ingest source <source-ref> Ingest a read-only source ref into knowledge.db
|
|
191
197
|
reindex outbox <file|s3://> Consume open-files change events and invalidate chunks
|
|
192
198
|
search <query> Hybrid search sources, wiki pages, indexes, or context
|
|
199
|
+
web search <query> Provider-native web search with citations
|
|
193
200
|
ask|build <prompt> Build a read-only citation answer/context pack
|
|
194
201
|
embeddings status|index|search Build/query local vector embeddings
|
|
195
202
|
providers status|models|check Inspect AI SDK provider config and credentials
|
|
@@ -206,6 +213,9 @@ Global Options:
|
|
|
206
213
|
--context Return a reranked citation context pack for search
|
|
207
214
|
--generate Call AI SDK text generation for ask/build
|
|
208
215
|
--approve-write Record approval intent for future durable wiki writes
|
|
216
|
+
--provider <name> Provider override for web search
|
|
217
|
+
--domain <domain> Restrict provider web search to a domain
|
|
218
|
+
--file-results File web snippets as web source refs
|
|
209
219
|
--fake Use deterministic fake embeddings for local tests
|
|
210
220
|
--scope local|global|project Store scope (default: global ~/.hasna/apps/knowledge/)
|
|
211
221
|
--no-color Disable color output
|
|
@@ -268,6 +278,7 @@ function printCommandHelp(command: string): void {
|
|
|
268
278
|
if (command === 'ingest') { console.log('Usage: open-knowledge ingest manifest <file|s3://bucket/key> | source <source-ref> [--purpose knowledge_index] [--scope local|global|project] [--json]'); return; }
|
|
269
279
|
if (command === 'reindex') { console.log('Usage: open-knowledge reindex outbox <file|s3://bucket/key> [--scope local|global|project] [--json]'); return; }
|
|
270
280
|
if (command === 'search') { console.log('Usage: open-knowledge search <query> [--context] [--semantic] [--model openai:text-embedding-3-small] [--limit <n>] [--dimensions <n>] [--fake] [--scope local|global|project] [--json]'); return; }
|
|
281
|
+
if (command === 'web') { console.log('Usage: open-knowledge web search <query> [--provider openai|anthropic] [--model provider:model] [--domain <domain>] [--file-results] [--fake] [--scope local|global|project] [--json]'); return; }
|
|
271
282
|
if (command === 'ask' || command === 'build' || command === 'knowledge') { console.log('Usage: open-knowledge ask|build <prompt> [--generate] [--semantic] [--model default|provider:model] [--approve-write] [--scope local|global|project] [--json]'); return; }
|
|
272
283
|
if (command === 'embeddings') { console.log('Usage: open-knowledge embeddings status|index|search [query] [--model openai:text-embedding-3-small] [--limit <n>] [--dimensions <n>] [--fake] [--scope local|global|project] [--json]'); return; }
|
|
273
284
|
if (command === 'providers') { console.log('Usage: open-knowledge providers status|models|check [provider|model-alias] [--scope local|global|project] [--json]'); return; }
|
|
@@ -316,11 +327,11 @@ async function run(argv: string[]): Promise<void> {
|
|
|
316
327
|
if (flags.completions) {
|
|
317
328
|
const shell = flags.completions;
|
|
318
329
|
if (shell === 'bash') {
|
|
319
|
-
console.log(`_open_knowledge() { local cur; cur="${"$"}{COMP_WORDS[COMP_CWORD]}"; COMPREPLY=($(compgen -W "add list get update archive restore upsert untag delete export prune dedupe stats paths storage db wiki source ingest reindex search ask build embeddings providers safety help ls rm edit unarchive knowledge --json --yes --help --version --desc --page --limit --search --sort --id --store --title --content --url --tag --format --completions --purpose --model --dimensions --semantic --context --generate --approve-write --fake --no-color --scope --archived --include-archived" -- "$cur")); }; complete -F _open_knowledge open-knowledge`);
|
|
330
|
+
console.log(`_open_knowledge() { local cur; cur="${"$"}{COMP_WORDS[COMP_CWORD]}"; COMPREPLY=($(compgen -W "add list get update archive restore upsert untag delete export prune dedupe stats paths storage db wiki source ingest reindex search web ask build embeddings providers safety help ls rm edit unarchive knowledge --json --yes --help --version --desc --page --limit --search --sort --id --store --title --content --url --tag --format --completions --purpose --model --dimensions --semantic --context --generate --approve-write --provider --domain --file-results --fake --no-color --scope --archived --include-archived" -- "$cur")); }; complete -F _open_knowledge open-knowledge`);
|
|
320
331
|
} else if (shell === 'zsh') {
|
|
321
|
-
console.log(`#compdef open-knowledge\n_open_knowledge() { _arguments -C "1: :(add list get update archive restore upsert untag delete export prune dedupe stats paths storage db wiki source ingest reindex search ask build embeddings providers safety help ls rm edit unarchive knowledge)" "(--json)--json" "(--yes)-y" "(--help)--help" "(--version)--version" "(--desc)--desc" "(--archived)--archived" "(--include-archived)--include-archived" "(--semantic)--semantic" "(--context)--context" "(--generate)--generate" "(--approve-write)--approve-write" "(--fake)--fake" "(-p --page)"{-p,--page}"[page number]:number:" "(-l --limit)"{-l,--limit}"[items per page]:number:" "(-s --search)"{-s,--search}"[search text]:text:" "(--sort)--sort"\{created,title\}:" "(--id)--id[item id]:id:" "(--store)--store[store path]:path:" "(--title)--title[new title]:" "(--content)--content[new content]:" "(--url)--url[source url]:" "(-t --tag)"{-t,--tag}"[tag]:tag:" "(--format)--format[json|jsonl]:" "(--completions)--completions[output completions]:shell:(bash zsh fish):" "(--purpose)--purpose[purpose]:" "(--model)--model[model ref]:" "(--dimensions)--dimensions[embedding dimensions]:number:" "(--no-color)--no-color[disable color]" "(--scope)--scope"\{local,global,project\}:" }; _open_knowledge`);
|
|
332
|
+
console.log(`#compdef open-knowledge\n_open_knowledge() { _arguments -C "1: :(add list get update archive restore upsert untag delete export prune dedupe stats paths storage db wiki source ingest reindex search web ask build embeddings providers safety help ls rm edit unarchive knowledge)" "(--json)--json" "(--yes)-y" "(--help)--help" "(--version)--version" "(--desc)--desc" "(--archived)--archived" "(--include-archived)--include-archived" "(--semantic)--semantic" "(--context)--context" "(--generate)--generate" "(--approve-write)--approve-write" "(--file-results)--file-results" "(--fake)--fake" "(-p --page)"{-p,--page}"[page number]:number:" "(-l --limit)"{-l,--limit}"[items per page]:number:" "(-s --search)"{-s,--search}"[search text]:text:" "(--sort)--sort"\{created,title\}:" "(--id)--id[item id]:id:" "(--store)--store[store path]:path:" "(--title)--title[new title]:" "(--content)--content[new content]:" "(--url)--url[source url]:" "(-t --tag)"{-t,--tag}"[tag]:tag:" "(--format)--format[json|jsonl]:" "(--completions)--completions[output completions]:shell:(bash zsh fish):" "(--purpose)--purpose[purpose]:" "(--model)--model[model ref]:" "(--dimensions)--dimensions[embedding dimensions]:number:" "(--provider)--provider[provider]:" "(--domain)--domain[domain]:" "(--no-color)--no-color[disable color]" "(--scope)--scope"\{local,global,project\}:" }; _open_knowledge`);
|
|
322
333
|
} else if (shell === 'fish') {
|
|
323
|
-
console.log(`complete -c open-knowledge -f; complete -c open-knowledge -a "add list get update archive restore upsert untag delete export prune dedupe stats paths storage db wiki source ingest reindex search ask build embeddings providers safety help ls rm edit unarchive knowledge"; complete -c open-knowledge -l json; complete -c open-knowledge -l yes -s y; complete -c open-knowledge -l help -s h; complete -c open-knowledge -l version -s v; complete -c open-knowledge -l desc; complete -c open-knowledge -l archived; complete -c open-knowledge -l include-archived; complete -c open-knowledge -l semantic; complete -c open-knowledge -l context; complete -c open-knowledge -l generate; complete -c open-knowledge -l approve-write; complete -c open-knowledge -l fake; complete -c open-knowledge -s p -l page; complete -c open-knowledge -s l -l limit; complete -c open-knowledge -s s -l search; complete -c open-knowledge -l sort; complete -c open-knowledge -l id; complete -c open-knowledge -l store; complete -c open-knowledge -l title; complete -c open-knowledge -l content; complete -c open-knowledge -l url; complete -c open-knowledge -s t -l tag; complete -c open-knowledge -l format; complete -c open-knowledge -l completions; complete -c open-knowledge -l purpose; complete -c open-knowledge -l model; complete -c open-knowledge -l dimensions; complete -c open-knowledge -l no-color; complete -c open-knowledge -l scope -a "local global project"`);
|
|
334
|
+
console.log(`complete -c open-knowledge -f; complete -c open-knowledge -a "add list get update archive restore upsert untag delete export prune dedupe stats paths storage db wiki source ingest reindex search web ask build embeddings providers safety help ls rm edit unarchive knowledge"; complete -c open-knowledge -l json; complete -c open-knowledge -l yes -s y; complete -c open-knowledge -l help -s h; complete -c open-knowledge -l version -s v; complete -c open-knowledge -l desc; complete -c open-knowledge -l archived; complete -c open-knowledge -l include-archived; complete -c open-knowledge -l semantic; complete -c open-knowledge -l context; complete -c open-knowledge -l generate; complete -c open-knowledge -l approve-write; complete -c open-knowledge -l provider; complete -c open-knowledge -l domain; complete -c open-knowledge -l file-results; complete -c open-knowledge -l fake; complete -c open-knowledge -s p -l page; complete -c open-knowledge -s l -l limit; complete -c open-knowledge -s s -l search; complete -c open-knowledge -l sort; complete -c open-knowledge -l id; complete -c open-knowledge -l store; complete -c open-knowledge -l title; complete -c open-knowledge -l content; complete -c open-knowledge -l url; complete -c open-knowledge -s t -l tag; complete -c open-knowledge -l format; complete -c open-knowledge -l completions; complete -c open-knowledge -l purpose; complete -c open-knowledge -l model; complete -c open-knowledge -l dimensions; complete -c open-knowledge -l no-color; complete -c open-knowledge -l scope -a "local global project"`);
|
|
324
335
|
} else {
|
|
325
336
|
throw new Error("Invalid --completions value. Use 'bash', 'zsh', or 'fish'.");
|
|
326
337
|
}
|
|
@@ -632,6 +643,24 @@ async function run(argv: string[]): Promise<void> {
|
|
|
632
643
|
return;
|
|
633
644
|
}
|
|
634
645
|
|
|
646
|
+
if (command === 'web') {
|
|
647
|
+
const action = positional[1] ?? 'search';
|
|
648
|
+
if (action !== 'search') throw new Error("Invalid web action. Use 'search'.");
|
|
649
|
+
const query = positional.slice(2).join(' ');
|
|
650
|
+
if (!query) throw new Error('Usage: open-knowledge web search <query>');
|
|
651
|
+
const result = await service.webSearch({
|
|
652
|
+
query,
|
|
653
|
+
limit: flags.limit,
|
|
654
|
+
modelRef: flags.model,
|
|
655
|
+
provider: flags.provider as AiProviderId | undefined,
|
|
656
|
+
domains: flags.domain,
|
|
657
|
+
fake: flags.fake,
|
|
658
|
+
fileResults: flags.fileResults,
|
|
659
|
+
});
|
|
660
|
+
output({ ok: true, ...result, message: `${result.sources.length} web source(s)` }, flags.json);
|
|
661
|
+
return;
|
|
662
|
+
}
|
|
663
|
+
|
|
635
664
|
if (command === 'ask' || command === 'build') {
|
|
636
665
|
const prompt = positional.slice(commandArgOffset).join(' ');
|
|
637
666
|
if (!prompt) throw new Error('Usage: open-knowledge ask <prompt>');
|
package/src/mcp.js
CHANGED
|
@@ -222,6 +222,24 @@ export function buildServer() {
|
|
|
222
222
|
}
|
|
223
223
|
});
|
|
224
224
|
|
|
225
|
+
registerTool(server, 'ok_web_search', 'Provider web search', 'Run safety-gated provider-native web search and return citations/sources', {
|
|
226
|
+
scope: scopeField,
|
|
227
|
+
query: z.string().describe('Web search query'),
|
|
228
|
+
limit: z.number().optional().describe('Maximum sources'),
|
|
229
|
+
provider: z.enum(['openai', 'anthropic', 'deepseek']).optional().describe('Provider override'),
|
|
230
|
+
model: z.string().optional().describe('Model alias/ref'),
|
|
231
|
+
domains: z.array(z.string()).optional().describe('Allowed domains'),
|
|
232
|
+
fake: z.boolean().optional().describe('Use deterministic fake web results'),
|
|
233
|
+
file_results: z.boolean().optional().describe('File web snippets as web source refs'),
|
|
234
|
+
}, async ({ scope, query, limit, provider, model, domains, fake, file_results }) => {
|
|
235
|
+
const service = createKnowledgeService({ scope });
|
|
236
|
+
try {
|
|
237
|
+
return jsonText({ ok: true, ...await service.webSearch({ query, limit, provider, modelRef: model, domains, fake, fileResults: file_results }) });
|
|
238
|
+
} catch (error) {
|
|
239
|
+
return errorText(error instanceof Error ? error.message : String(error));
|
|
240
|
+
}
|
|
241
|
+
});
|
|
242
|
+
|
|
225
243
|
registerTool(server, 'ok_add', 'Add a knowledge item', 'Add a new item to the knowledge store', {
|
|
226
244
|
title: z.string().describe('Item title'),
|
|
227
245
|
content: z.string().describe('Item content/body'),
|
package/src/service.ts
CHANGED
|
@@ -16,6 +16,7 @@ import { providerStatus, listModelRegistry, type ProviderStatusResult, type Mode
|
|
|
16
16
|
import { retrieveKnowledgeContext, type RetrievalOptions } from './retrieval';
|
|
17
17
|
import { hybridSearch, type HybridSearchOptions } from './search';
|
|
18
18
|
import { resolveSafetyPolicy } from './safety';
|
|
19
|
+
import { runProviderWebSearch, type WebSearchOptions } from './web-search';
|
|
19
20
|
import {
|
|
20
21
|
recordStorageObjects,
|
|
21
22
|
resolveStorageContract,
|
|
@@ -244,6 +245,16 @@ export class KnowledgeService {
|
|
|
244
245
|
config: this.config(),
|
|
245
246
|
});
|
|
246
247
|
}
|
|
248
|
+
|
|
249
|
+
/**
 * Runs a provider web search for this service's workspace.
 *
 * The caller supplies the search-specific options; the database path,
 * configuration and safety policy are filled in from the service itself.
 */
async webSearch(options: Omit<WebSearchOptions, 'dbPath' | 'config' | 'safetyPolicy'>) {
  const workspace = this.ensureWorkspace();
  const searchOptions: WebSearchOptions = {
    ...options,
    dbPath: workspace.knowledgeDbPath,
    config: this.config(),
    safetyPolicy: this.safetyPolicy(),
  };
  return runProviderWebSearch(searchOptions);
}
|
|
247
258
|
}
|
|
248
259
|
|
|
249
260
|
export function createKnowledgeService(options: KnowledgeServiceOptions = {}): KnowledgeService {
|
|
@@ -0,0 +1,330 @@
|
|
|
1
|
+
import { createHash, randomUUID } from 'node:crypto';
|
|
2
|
+
import { migrateKnowledgeDb, openKnowledgeDb } from './knowledge-db';
|
|
3
|
+
import { ingestOpenFilesManifestItems } from './manifest-ingest';
|
|
4
|
+
import {
|
|
5
|
+
assertProviderCredentials,
|
|
6
|
+
normalizeAiSdkUsage,
|
|
7
|
+
parseModelRef,
|
|
8
|
+
providerSettings,
|
|
9
|
+
recordProviderUsage,
|
|
10
|
+
resolveModelRef,
|
|
11
|
+
type AiProviderId,
|
|
12
|
+
} from './providers';
|
|
13
|
+
import { assertWebSearchAllowed, recordAuditEvent, type SafetyPolicy } from './safety';
|
|
14
|
+
import type { KnowledgeConfig } from './workspace';
|
|
15
|
+
|
|
16
|
+
/** Inputs accepted by `runProviderWebSearch`. */
export interface WebSearchOptions {
  /** Path of the knowledge database that is migrated, opened and written to. */
  dbPath: string;
  /** Search query; it is trimmed and must not be empty. */
  query: string;
  /** Configuration handed to the provider-settings and model-ref helpers. */
  config?: KnowledgeConfig;
  /** When present and `fake` is not set, checked with `assertWebSearchAllowed`. */
  safetyPolicy?: SafetyPolicy;
  /** Model alias/ref; when omitted, the provider default or `'default'` is resolved. */
  modelRef?: string;
  /** Provider override; takes precedence over the provider parsed from the model ref. */
  provider?: AiProviderId;
  /** Maximum number of sources returned (defaults to 5, clamped to 1..20). */
  limit?: number;
  /** Maximum provider tool uses (defaults to 3, clamped to 1..10). */
  maxUses?: number;
  /** Domains passed to the provider web-search tool as allowed domains. */
  domains?: string[];
  /** Return deterministic fixtures instead of calling a provider. */
  fake?: boolean;
  /** File the returned sources through the manifest ingest. */
  fileResults?: boolean;
  /** Environment used for credential lookup; defaults to `process.env`. */
  env?: Record<string, string | undefined>;
  /** Timestamp used for every row written by the run; defaults to the current time. */
  now?: Date;
}
|
|
31
|
+
|
|
32
|
+
/** One web source extracted from a provider response (or a fake fixture). */
export interface WebSearchSource {
  /** Source URL; also used as the de-duplication key while collecting. */
  url: string;
  /** Title reported for the source, or `null` when none was found. */
  title: string | null;
  /** Snippet/description text for the source, or `null` when none was found. */
  snippet: string | null;
  /** The raw record the source was extracted from. */
  provider_metadata: Record<string, unknown>;
}
|
|
38
|
+
|
|
39
|
+
/** Outcome returned by `runProviderWebSearch`. */
export interface WebSearchResult {
  /** Identifier of the `runs` row created for this search (`run_<uuid>`). */
  run_id: string;
  /** The trimmed query that was searched. */
  query: string;
  /** Provider the search was attributed to. */
  provider: string;
  /** Model the search was attributed to. */
  model: string;
  /** Answer text produced by the provider (or the fake answer). */
  answer: string;
  /** Sources collected from the response, capped at the requested limit. */
  sources: WebSearchSource[];
  /** Number of sources upserted by the filing step (0 when filing is off). */
  filed_sources: number;
  /** Token and cost figures recorded for the run. */
  usage: {
    input_tokens: number;
    output_tokens: number;
    cost_usd: number;
  };
  /** Non-fatal notices, e.g. `no_web_sources_returned`. */
  warnings: string[];
}
|
|
54
|
+
|
|
55
|
+
/** Returns the SHA-256 digest of `value` as a `sha256:`-prefixed hex string. */
function stableHash(value: string): string {
  const hasher = createHash('sha256');
  hasher.update(value);
  const hex = hasher.digest('hex');
  return `sha256:${hex}`;
}
|
|
58
|
+
|
|
59
|
+
/**
 * Rough token estimate: 1.25 tokens per whitespace-separated word, rounded up,
 * and never less than 1 (so empty text still counts as one token).
 */
function estimateTokens(text: string): number {
  let wordCount = 0;
  for (const piece of text.trim().split(/\s+/)) {
    if (piece) wordCount += 1;
  }
  const estimate = Math.ceil(wordCount * 1.25);
  return estimate < 1 ? 1 : estimate;
}
|
|
63
|
+
|
|
64
|
+
/** Returns `value` when it is a non-array object, otherwise an empty object. */
function asRecord(value: unknown): Record<string, unknown> {
  return value && typeof value === 'object' && !Array.isArray(value) ? value as Record<string, unknown> : {};
}

/** Returns `value` when it is a non-empty string, otherwise `null`. */
function asString(value: unknown): string | null {
  return typeof value === 'string' && value.length > 0 ? value : null;
}

/**
 * Builds a source from a loosely shaped record.
 *
 * The URL is read from `url`, `uri` or `sourceUrl`; the title from `title` or
 * `name`; the snippet from `snippet`, `text` or `description`.
 *
 * @returns The source, or `null` when no URL field is present.
 */
function sourceFromRecord(value: unknown): WebSearchSource | null {
  const record = asRecord(value);
  const url = asString(record.url) ?? asString(record.uri) ?? asString(record.sourceUrl);
  if (!url) return null;
  return {
    url,
    title: asString(record.title) ?? asString(record.name),
    snippet: asString(record.snippet) ?? asString(record.text) ?? asString(record.description),
    provider_metadata: record,
  };
}

/**
 * Walks `value` recursively and adds every record that carries a URL to
 * `output`, keyed by URL.
 *
 * Arrays are walked element by element; objects are additionally searched under
 * the `sources`, `results`, `citations`, `annotations` and `output` keys.
 *
 * When the same URL is seen more than once, the first entry is kept and only
 * its missing `title`/`snippet` are filled in from later entries. Overwriting
 * would let a bare `{ url }` record found later discard a title or snippet
 * that was already captured.
 *
 * @param value  Arbitrary provider payload (array, object, or anything else).
 * @param output Map that receives the collected sources; mutated in place.
 */
function collectSources(value: unknown, output: Map<string, WebSearchSource>): void {
  if (Array.isArray(value)) {
    for (const entry of value) collectSources(entry, output);
    return;
  }
  const source = sourceFromRecord(value);
  if (source) {
    const existing = output.get(source.url);
    output.set(
      source.url,
      existing
        ? {
            ...existing,
            title: existing.title ?? source.title,
            snippet: existing.snippet ?? source.snippet,
          }
        : source,
    );
  }
  const record = asRecord(value);
  for (const key of ['sources', 'results', 'citations', 'annotations', 'output']) {
    if (record[key]) collectSources(record[key], output);
  }
}
|
|
96
|
+
|
|
97
|
+
/**
 * Produces deterministic fixture sources for fake (offline) searches.
 *
 * @param query Query echoed into each fixture snippet.
 * @param limit Requested number of sources; at most three fixtures are returned.
 */
function fakeSources(query: string, limit: number): WebSearchSource[] {
  const count = Math.min(limit, 3);
  const fixtures: WebSearchSource[] = [];
  for (let rank = 1; rank <= count; rank += 1) {
    fixtures.push({
      url: `https://example.com/knowledge-web-${rank}`,
      title: `Fake web source ${rank}`,
      snippet: `Deterministic web-search fixture for "${query}"`,
      provider_metadata: { fake: true, rank },
    });
  }
  return fixtures;
}
|
|
105
|
+
|
|
106
|
+
/**
 * Runs a single `generateText` call against OpenAI with the provider's
 * web-search tool attached and forced via `toolChoice`.
 *
 * @param input.query   Prompt sent to the model.
 * @param input.model   OpenAI model id.
 * @param input.config  Configuration used to look up the OpenAI provider settings.
 * @param input.env     Environment the API key is read from.
 * @param input.maxUses Accepted for parity with the Anthropic variant; not
 *                      forwarded to the OpenAI tool by this function.
 * @param input.domains Domains the search is restricted to; empty means unrestricted.
 * @returns The `generateText` result.
 * @throws Error when the created provider does not expose `tools.webSearch`.
 */
async function openAiWebSearch(input: {
  query: string;
  model: string;
  config?: KnowledgeConfig;
  env: Record<string, string | undefined>;
  maxUses: number;
  domains: string[];
}) {
  const { generateText } = await import('ai');
  const { createOpenAI } = await import('@ai-sdk/openai');
  const settings = providerSettings(input.config, 'openai');
  const openai = createOpenAI({
    apiKey: input.env[settings.api_key_env],
    baseURL: settings.base_url,
  }) as any;
  const webSearch = openai.tools?.webSearch;
  if (!webSearch) throw new Error('OpenAI provider does not expose tools.webSearch.');
  return generateText({
    model: openai(input.model),
    prompt: input.query,
    tools: {
      web_search: webSearch({
        externalWebAccess: true,
        searchContextSize: 'medium',
        // The OpenAI web-search tool takes domain restrictions under
        // `filters.allowedDomains`; a top-level `allowedDomains` (the Anthropic
        // tool's shape) is not part of its arguments.
        ...(input.domains.length > 0 ? { filters: { allowedDomains: input.domains } } : {}),
      }),
    },
    toolChoice: { type: 'tool', toolName: 'web_search' },
  });
}
|
|
136
|
+
|
|
137
|
+
/**
 * Runs a single `generateText` call against Anthropic with the provider's
 * web-search tool attached.
 *
 * The dated `webSearch_20250305` tool factory is preferred, falling back to
 * `webSearch` when the dated one is not exposed.
 *
 * @returns The `generateText` result.
 * @throws Error when the created provider exposes neither tool factory.
 */
async function anthropicWebSearch(input: {
  query: string;
  model: string;
  config?: KnowledgeConfig;
  env: Record<string, string | undefined>;
  maxUses: number;
  domains: string[];
}) {
  const { generateText } = await import('ai');
  const { createAnthropic } = await import('@ai-sdk/anthropic');
  const { api_key_env: apiKeyEnv, base_url: baseURL } = providerSettings(input.config, 'anthropic');
  const anthropic = createAnthropic({ apiKey: input.env[apiKeyEnv], baseURL }) as any;

  const createTool = anthropic.tools?.webSearch_20250305 ?? anthropic.tools?.webSearch;
  if (!createTool) throw new Error('Anthropic provider does not expose a web search tool.');

  // Only send a domain allow-list when at least one domain was requested.
  const toolArgs: Record<string, unknown> = { maxUses: input.maxUses };
  if (input.domains.length > 0) toolArgs.allowedDomains = input.domains;

  return generateText({
    model: anthropic(input.model),
    prompt: input.query,
    tools: { web_search: createTool(toolArgs) },
  });
}
|
|
165
|
+
|
|
166
|
+
/**
 * Files web-search sources through the manifest ingest.
 *
 * Each source becomes one manifest item whose text is the title, snippet and
 * URL joined by newlines (missing parts skipped); the SHA-256 of that text is
 * used as both hash and revision.
 *
 * @param options Search options; `fileResults`, `dbPath`, `query` and `safetyPolicy` are read.
 * @param sources Sources to file.
 * @param now     ISO timestamp stamped on every item and passed to the ingest.
 * @returns The ingest's `sources_upserted` count, or 0 when filing is disabled
 *          or there is nothing to file.
 */
async function fileWebSources(options: WebSearchOptions, sources: WebSearchSource[], now: string): Promise<number> {
  if (!options.fileResults) return 0;
  if (sources.length === 0) return 0;

  const toManifestItem = (source: WebSearchSource) => {
    const extractedText = [source.title, source.snippet, source.url].filter(Boolean).join('\n');
    const contentHash = stableHash(extractedText);
    return {
      source_ref: source.url,
      name: source.title ?? source.url,
      url: source.url,
      mime: 'text/plain',
      hash: contentHash,
      revision: contentHash,
      status: 'active',
      updated_at: now,
      permissions: { mode: 'read_only', allowed_purposes: ['knowledge_answer', 'knowledge_index'] },
      metadata: {
        source_ref: source.url,
        content_source: 'provider_web_search',
        provider_metadata: source.provider_metadata,
      },
      extracted_text: extractedText,
    };
  };

  const ingest = await ingestOpenFilesManifestItems({
    dbPath: options.dbPath,
    items: sources.map((source) => toManifestItem(source)),
    sourceLabel: `web-search:${options.query}`,
    readAction: 'provider_web_search_file_results',
    safetyPolicy: options.safetyPolicy,
    now: new Date(now),
  });
  return ingest.sources_upserted;
}
|
|
199
|
+
|
|
200
|
+
/**
 * Truncates a requested count to an integer and clamps it to `1..max`;
 * `undefined` or `NaN` fall back to `fallback`.
 */
function clampWebSearchCount(value: number | undefined, fallback: number, max: number): number {
  const requested = typeof value === 'number' && !Number.isNaN(value) ? Math.trunc(value) : fallback;
  return Math.max(1, Math.min(requested, max));
}

/**
 * Best-effort bookkeeping for a search that threw after its run row was created:
 * marks the run as failed and appends an error event.
 *
 * Errors raised here are deliberately swallowed so they never mask the original
 * failure. NOTE(review): assumes the `runs.status` and `run_events.level`
 * columns accept `'failed'` / `'error'` — confirm against the schema.
 */
function recordWebSearchFailure(dbPath: string, runId: string, error: unknown, now: string): void {
  const message = error instanceof Error ? error.message : String(error);
  try {
    const db = openKnowledgeDb(dbPath);
    try {
      db.run(`UPDATE runs SET status = ?, updated_at = ? WHERE id = ?`, ['failed', now, runId]);
      db.run(
        `INSERT INTO run_events (id, run_id, level, event, metadata_json, created_at)
         VALUES (?, ?, ?, ?, ?, ?)`,
        [`evt_${randomUUID()}`, runId, 'error', 'provider_web_search_failed', JSON.stringify({ error: message }), now],
      );
    } finally {
      db.close();
    }
  } catch {
    // Intentionally ignored: the caller rethrows the original error.
  }
}

/**
 * Runs a provider-native web search and records it in the run ledger.
 *
 * Steps:
 * 1. Validates the query and resolves provider, model, limit, max uses and domains.
 * 2. Unless `fake` is set: checks the safety policy (when one is supplied),
 *    rejects providers other than `openai`/`anthropic`, and asserts credentials.
 * 3. Inserts a `runs` row with status `running` and an audit event.
 * 4. Produces the answer and sources (fixtures when `fake`, otherwise the provider call).
 * 5. Optionally files the sources, then marks the run `completed` and records
 *    a run event and provider usage.
 *
 * If step 4 or the filing in step 5 throws, the run is marked `failed`
 * (best effort) and the original error is rethrown.
 *
 * NOTE(review): the safety gate only runs when `options.safetyPolicy` is
 * provided; callers that omit it are not gated by this function.
 *
 * @throws Error when the query is blank, the provider has no native web search,
 *         or any of the underlying helpers/provider calls throw.
 */
export async function runProviderWebSearch(options: WebSearchOptions): Promise<WebSearchResult> {
  const query = options.query.trim();
  if (!query) throw new Error('Web search query is required.');
  const env = options.env ?? process.env;
  const now = (options.now ?? new Date()).toISOString();
  const limit = clampWebSearchCount(options.limit, 5, 20);
  const maxUses = clampWebSearchCount(options.maxUses, 3, 10);
  // Callers may hand over blank or missing entries (e.g. a trailing `--domain`
  // flag without a value); drop them so they never reach the provider tool.
  const domains = (options.domains ?? [])
    .filter((domain): domain is string => typeof domain === 'string')
    .map((domain) => domain.trim())
    .filter((domain) => domain.length > 0);
  const modelRef = resolveModelRef(options.modelRef ?? (options.provider ? `${options.provider}:${providerSettings(options.config, options.provider).default_model}` : 'default'), options.config);
  const parsed = parseModelRef(modelRef);
  const provider = options.provider ?? parsed.provider;
  // A model ref that belongs to a different provider than the override is
  // replaced by the overriding provider's default model.
  const model = parsed.provider === provider ? parsed.model : providerSettings(options.config, provider).default_model;
  const runId = `run_${randomUUID()}`;

  if (!options.fake && options.safetyPolicy) assertWebSearchAllowed(options.safetyPolicy);
  if (!options.fake && provider !== 'openai' && provider !== 'anthropic') {
    throw new Error(`Provider ${provider} does not expose native web search yet.`);
  }
  if (!options.fake) assertProviderCredentials(provider, options.config, env);

  migrateKnowledgeDb(options.dbPath);
  const db = openKnowledgeDb(options.dbPath);
  try {
    db.run(
      `INSERT INTO runs (id, type, prompt, status, provider, model, metadata_json, created_at, updated_at)
       VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)`,
      [
        runId,
        'provider-web-search',
        query,
        'running',
        provider,
        model,
        JSON.stringify({ domains, max_uses: maxUses, fake: options.fake === true }),
        now,
        now,
      ],
    );
    recordAuditEvent(db, {
      event_type: 'source_read',
      action: options.fake ? 'fake_provider_web_search' : 'provider_web_search',
      target_uri: query,
      decision: 'allow',
      metadata: { provider, model, domains, max_uses: maxUses },
      created_at: now,
    });
  } finally {
    // The connection is not held open across the (potentially slow) provider call.
    db.close();
  }

  let answer = '';
  let sources: WebSearchSource[] = [];
  let usage = { input_tokens: estimateTokens(query), output_tokens: 0, cost_usd: 0 };
  let filedSources = 0;
  const warnings: string[] = [];
  try {
    if (options.fake) {
      sources = fakeSources(query, limit);
      answer = `Fake web search answer for: ${query}`;
      usage.output_tokens = estimateTokens(answer);
    } else {
      const result = provider === 'openai'
        ? await openAiWebSearch({ query, model, config: options.config, env, maxUses, domains })
        : await anthropicWebSearch({ query, model, config: options.config, env, maxUses, domains });
      answer = result.text;
      const collected = new Map<string, WebSearchSource>();
      collectSources((result as any).sources, collected);
      collectSources((result as any).toolResults, collected);
      sources = Array.from(collected.values()).slice(0, limit);
      const normalized = normalizeAiSdkUsage({
        provider,
        model,
        usage: (result as any).usage,
        providerMetadata: (result as any).providerMetadata,
      });
      usage = {
        input_tokens: normalized.input_tokens,
        output_tokens: normalized.output_tokens,
        cost_usd: normalized.cost_usd,
      };
    }

    // File with the normalized query/domains so the ingest label matches the run row.
    filedSources = await fileWebSources({ ...options, query, domains }, sources, now);
  } catch (error) {
    // Without this the run row would stay in `running` forever.
    recordWebSearchFailure(options.dbPath, runId, error, now);
    throw error;
  }

  const writeDb = openKnowledgeDb(options.dbPath);
  try {
    writeDb.run(
      `UPDATE runs SET status = ?, metadata_json = ?, updated_at = ? WHERE id = ?`,
      [
        'completed',
        JSON.stringify({ domains, max_uses: maxUses, sources: sources.length, filed_sources: filedSources, fake: options.fake === true }),
        now,
        runId,
      ],
    );
    writeDb.run(
      `INSERT INTO run_events (id, run_id, level, event, metadata_json, created_at)
       VALUES (?, ?, ?, ?, ?, ?)`,
      [
        `evt_${randomUUID()}`,
        runId,
        'info',
        'provider_web_search_completed',
        JSON.stringify({ sources: sources.length, filed_sources: filedSources }),
        now,
      ],
    );
    recordProviderUsage(writeDb, {
      run_id: runId,
      provider,
      model,
      input_tokens: usage.input_tokens,
      output_tokens: usage.output_tokens,
      cost_usd: usage.cost_usd,
      metadata: { web_search: true, sources: sources.length, filed_sources: filedSources },
      created_at: now,
    });
  } finally {
    writeDb.close();
  }

  if (sources.length === 0) warnings.push('no_web_sources_returned');
  return {
    run_id: runId,
    query,
    provider,
    model,
    answer,
    sources,
    filed_sources: filedSources,
    usage,
    warnings,
  };
}
|