@agwab/pi-workflow 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +14 -3
- package/agents/researcher.md +17 -7
- package/dist/artifact-graph-runtime.js +1 -0
- package/dist/compiler.js +2 -2
- package/dist/dynamic-generated-task-runtime.js +4 -3
- package/dist/dynamic-runtime-bundle.js +3 -2
- package/dist/extension.js +40 -1
- package/dist/subagent-backend.js +82 -27
- package/dist/tool-metadata.d.ts +1 -0
- package/dist/tool-metadata.js +13 -1
- package/dist/workflow-artifact-extension.js +3 -2
- package/dist/workflow-artifact-tool.js +84 -4
- package/dist/workflow-web-source-extension.d.ts +43 -0
- package/dist/workflow-web-source-extension.js +1194 -0
- package/dist/workflow-web-source.d.ts +171 -0
- package/dist/workflow-web-source.js +897 -0
- package/docs/usage.md +32 -45
- package/node_modules/@agwab/pi-subagent/package.json +1 -1
- package/node_modules/@agwab/pi-subagent/src/api.ts +245 -132
- package/node_modules/@agwab/pi-subagent/src/artifacts/result.ts +243 -163
- package/node_modules/@agwab/pi-subagent/src/core/constants.ts +117 -90
- package/node_modules/@agwab/pi-subagent/src/core/validation.ts +728 -475
- package/node_modules/@agwab/pi-subagent/src/orchestrate/run.ts +305 -209
- package/node_modules/@agwab/pi-subagent/src/runners/headless-model.ts +750 -439
- package/node_modules/@agwab/pi-subagent/src/runners/tmux.ts +422 -268
- package/package.json +3 -4
- package/skills/workflow-guide/scaffolds/object-tool-fallback/schemas/fetch-control.schema.json +1 -1
- package/skills/workflow-guide/scaffolds/object-tool-fallback/spec.json +4 -3
- package/src/artifact-graph-runtime.ts +1 -0
- package/src/compiler.ts +2 -1
- package/src/dynamic-generated-task-runtime.ts +4 -2
- package/src/dynamic-runtime-bundle.ts +3 -2
- package/src/extension.ts +46 -1
- package/src/subagent-backend.ts +121 -37
- package/src/tool-metadata.ts +22 -1
- package/src/workflow-artifact-extension.ts +3 -2
- package/src/workflow-artifact-tool.ts +96 -4
- package/src/workflow-web-source-extension.ts +1411 -0
- package/src/workflow-web-source.ts +1171 -0
- package/workflows/README.md +1 -1
- package/workflows/deep-research/helpers/claim-evidence-gate.mjs +474 -40
- package/workflows/deep-research/helpers/final-audit-packet.mjs +219 -0
- package/workflows/deep-research/helpers/normalize-input-packet.mjs +436 -0
- package/workflows/deep-research/helpers/render-executive.mjs +571 -198
- package/workflows/deep-research/schemas/deep-research-executive-render-control.schema.json +35 -8
- package/workflows/deep-research/schemas/deep-research-normalize-claims-control.schema.json +45 -4
- package/workflows/deep-research/schemas/deep-research-verify-claims-control.schema.json +0 -2
- package/workflows/deep-research/spec.json +36 -21
- package/workflows/deep-review/helpers/render-review-report.mjs +502 -0
- package/workflows/deep-review/schemas/deep-review-render-control.schema.json +50 -0
- package/workflows/deep-review/spec.json +22 -1
- package/docs/release.md +0 -89
- package/node_modules/@pondwader/socks5-server/.DS_Store +0 -0
- package/node_modules/commander/.DS_Store +0 -0
- package/node_modules/jiti/.DS_Store +0 -0
- package/node_modules/node-forge/.DS_Store +0 -0
- package/node_modules/shell-quote/.DS_Store +0 -0
- package/node_modules/zod/.DS_Store +0 -0
|
@@ -0,0 +1,1171 @@
|
|
|
1
|
+
import { createHash } from "node:crypto";
|
|
2
|
+
import { appendFile, mkdir, readFile, readdir, rename, writeFile } from "node:fs/promises";
|
|
3
|
+
import { isIP } from "node:net";
|
|
4
|
+
import { dirname, resolve } from "node:path";
|
|
5
|
+
|
|
6
|
+
/** Schema tag carried by every cached source object (`WorkflowWebSource.schema`). */
export const WORKFLOW_WEB_SOURCE_CACHE_SCHEMA = "workflow-web-source-cache-v1" as const;

/** Schema tag carried by the materialized source index (`WorkflowWebSourceIndex.schema`). */
export const WORKFLOW_WEB_SOURCE_INDEX_SCHEMA = "workflow-web-source-index-v1" as const;

/**
 * Schema tag for index events.
 * NOTE(review): presumably stamped on index ledger lines; the writer is not in this part of the file — confirm.
 */
export const WORKFLOW_WEB_SOURCE_INDEX_EVENT_SCHEMA = "workflow-web-source-index-event-v1" as const;

/** Schema tag written on every line that `recordWorkflowWebSourceEvent` appends to `events.jsonl`. */
export const WORKFLOW_WEB_SOURCE_EVENT_SCHEMA = "workflow-web-source-event-v1" as const;
|
|
14
|
+
|
|
15
|
+
/** Names of the web-source tools, frozen as a literal tuple so the union type below can be derived from it. */
export const WORKFLOW_WEB_SOURCE_TOOLS = ["workflow_web_search", "workflow_web_fetch_source", "workflow_web_source_read"] as const;

/** Union of the tool names listed in `WORKFLOW_WEB_SOURCE_TOOLS`. */
export type WorkflowWebSourceTool = (typeof WORKFLOW_WEB_SOURCE_TOOLS)[number];
|
|
22
|
+
|
|
23
|
+
/** Character limits that bound how much fetched web text is shown back to the model. */
export interface WorkflowWebSourcePolicy {
  /** Preview cap used by `buildWorkflowWebSourceCard` for a source that is not a duplicate. */
  previewChars: number;
  /** Preview cap used by `buildWorkflowWebSourceCard` when the card is flagged as a duplicate. */
  duplicatePreviewChars: number;
  /** NOTE(review): presumably the default snippet cap for source reads; not consumed in this part of the file — confirm. */
  sourceReadMaxChars: number;
  /** Snippet cap applied to each candidate produced by `extractSearchCandidates`. */
  searchSnippetChars: number;
  /** NOTE(review): presumably the limit handed to `createWorkflowWebVisibleBudget` for each task — confirm against the caller. */
  perTaskVisibleCharBudget: number;
}
|
|
30
|
+
|
|
31
|
+
/** Security switches for the web-source tools. */
export interface WorkflowWebSecurityPolicy {
  /** When false, `validateWorkflowWebUrl` rejects hosts that `isPrivateHostname` reports as private. */
  allowPrivateHosts: boolean;
  /** NOTE(review): not read in this part of the file; presumably controls persisting raw provider responses — confirm. */
  cacheRawProviderPayloads: boolean;
}
|
|
35
|
+
|
|
36
|
+
/** Identifies where cached sources live and which run/task produced them. */
export interface WorkflowWebSourceCacheConfig {
  /** Run identifier copied onto cached sources and event records. */
  runId: string;
  /** Task identifier copied onto cached sources and event records. */
  taskId: string;
  /** Cache root; source objects go under `sources/` and events are appended to `events.jsonl` inside it. */
  cacheDir: string;
}
|
|
41
|
+
|
|
42
|
+
/** A fetched web page as persisted in the cache, including its full extracted text. */
export interface WorkflowWebSource {
  /** Always `WORKFLOW_WEB_SOURCE_CACHE_SCHEMA`; checked by `readWorkflowWebSource`. */
  schema: typeof WORKFLOW_WEB_SOURCE_CACHE_SCHEMA;
  /** Identifier derived from the URL cache key and the text (see `sourceRefFor`). */
  sourceRef: string;
  /** ISO-8601 timestamp taken when the record was created. */
  createdAt: string;
  runId: string;
  taskId: string;
  /** `createWorkflowWebSource` stores the redacted URL here as well, so no raw secrets are persisted. */
  url: string;
  /** URL after `sanitizeUrlForModel` redaction. */
  redactedUrl: string;
  /** Hash of the canonicalized, unredacted URL (see `sourceUrlCacheKey`); optional on records that lack it. */
  urlKey?: string;
  /** Hostname of the URL, or "unknown" when the URL failed validation. */
  domain: string;
  title?: string;
  provider?: string;
  /** Hash of `text`. */
  contentHash: string;
  /** Full extracted text of the source. */
  text: string;
  /** `text.length` at creation time. */
  textChars: number;
  /** NOTE(review): presumably set when text extraction dropped content; producer not visible here — confirm. */
  extractionLossy?: boolean;
  /** Free-form scalar metadata supplied by the caller of `createWorkflowWebSource`. */
  metadata?: Record<string, string | number | boolean | null>;
}
|
|
60
|
+
|
|
61
|
+
/** Index projection of a `WorkflowWebSource` (built by `sourceToIndexEntry`); it omits the text body. */
export interface WorkflowWebSourceIndexEntry {
  sourceRef: string;
  createdAt: string;
  url: string;
  redactedUrl: string;
  /** When present, URL lookups compare this key only and ignore the URL strings. */
  urlKey?: string;
  domain: string;
  title?: string;
  contentHash: string;
  textChars: number;
  provider?: string;
}
|
|
73
|
+
|
|
74
|
+
/** Index of the cached sources, as returned by `readWorkflowWebSourceIndex`. */
export interface WorkflowWebSourceIndex {
  /** Always `WORKFLOW_WEB_SOURCE_INDEX_SCHEMA`. */
  schema: typeof WORKFLOW_WEB_SOURCE_INDEX_SCHEMA;
  /** ISO-8601 timestamp refreshed whenever the index is rewritten or merged with ledger entries. */
  updatedAt: string;
  runId: string;
  /** Entries in index order; URL lookups scan this list from the end. */
  sources: WorkflowWebSourceIndexEntry[];
}
|
|
80
|
+
|
|
81
|
+
/** Mutable counter of how many characters have been made visible; updated by `consumeWorkflowWebVisibleBudget`. */
export interface WorkflowWebVisibleBudget {
  /** Maximum number of characters that may be shown. */
  limit: number;
  /** Characters handed out so far. */
  used: number;
}
|
|
85
|
+
|
|
86
|
+
/** One lookup inside a cached source. */
export interface WorkflowWebSourceReadRequest {
  /** Text to locate; tried as an exact substring first, then against the normalized text. */
  query?: string;
  /** Claim passed with `terms` to `prepareTermNeedles` when the query is absent or not found. */
  claim?: string;
  /** Terms used for term-based matching when the query is absent or not found. */
  terms?: string[];
  /** Per-request snippet cap; falls back to the batch-level `maxChars` when omitted. */
  maxChars?: number;
}
|
|
92
|
+
|
|
93
|
+
/** Outcome of a single source read. */
export interface WorkflowWebSourceReadResult {
  status: "matched" | "not_found";
  /** How the match was found: exact substring, normalized text, or term search. */
  matchType?: "exact" | "normalized" | "terms";
  quote?: string;
  // NOTE(review): offsets presumably index into the source text; the producer (snippetForMatch) is not visible here — confirm.
  startOffset?: number;
  endOffset?: number;
  /** Characters shown for this result; 0 on the `not_found` results built in this file. */
  visibleChars: number;
  matchedTerms?: string[];
  missingTerms?: string[];
  coverageRatio?: number;
  candidateOnly?: boolean;
}
|
|
105
|
+
|
|
106
|
+
/** Compact, model-facing summary of a cached source (built by `buildWorkflowWebSourceCard`). */
export interface WorkflowWebSourceCard {
  sourceRef: string;
  /** Redacted URL of the source. */
  url: string;
  domain: string;
  title?: string;
  /** Redacted leading text, cut to the preview cap and the remaining visible budget. */
  preview: string;
  /** Length of the full cached text. */
  textChars: number;
  fullContentCached: boolean;
  duplicate: boolean;
  /** Budget state after the preview was charged against it. */
  budget: {
    limit: number;
    used: number;
    remaining: number;
    /** True when the preview is shorter than the redacted text. */
    truncated: boolean;
  };
  /** Instruction telling the model how to read more from this source. */
  next: string;
}
|
|
123
|
+
|
|
124
|
+
/** One search hit extracted from a search tool result by `extractSearchCandidates`. */
export interface WorkflowWebSearchCandidate {
  /** Redacted URL; absent when the result text contained no URL. */
  url?: string;
  title?: string;
  /** Redacted text near the URL, or the head of the result text when no URL was found. */
  snippet: string;
  /** Hostname of the URL when it validated. */
  domain?: string;
}
|
|
130
|
+
|
|
131
|
+
/** Baseline character limits; `normalizeWorkflowWebSourcePolicy` layers caller overrides on top of these. */
export const DEFAULT_WORKFLOW_WEB_SOURCE_POLICY: WorkflowWebSourcePolicy = {
  previewChars: 800,
  duplicatePreviewChars: 160,
  sourceReadMaxChars: 1200,
  searchSnippetChars: 240,
  perTaskVisibleCharBudget: 12000,
};
|
|
138
|
+
|
|
139
|
+
/** Locked-down security defaults: private hosts are rejected and raw provider payloads are not cached. */
export const DEFAULT_WORKFLOW_WEB_SECURITY_POLICY: WorkflowWebSecurityPolicy = { allowPrivateHosts: false, cacheRawProviderPayloads: false };
|
|
143
|
+
|
|
144
|
+
/**
 * Matches parameter names that look like credentials. The keyword must stand as a whole
 * `-`/`_`-delimited segment of the name, so `api_key` matches and `keyboard` does not.
 */
const SENSITIVE_QUERY_PARAM_PATTERN = /(^|[-_])(access[-_]?token|auth|code|credential|key|password|secret|session|signature|sig|token)([-_]|$)/i;
|
|
146
|
+
/**
 * Hostname patterns treated as private or internal. Each pattern is anchored at the
 * start of the (lower-cased) hostname.
 *
 * Includes the RFC 1918 range 172.16.0.0/12 alongside 10/8 and 192.168/16, so hosts
 * such as 172.16.0.1 are not missed by this list.
 * NOTE(review): the consumer (`isPrivateHostname`) is defined outside this part of the
 * file; if it already covers 172.16/12 separately, the added entry is redundant but harmless.
 */
const PRIVATE_HOST_PATTERNS = [
  /^localhost$/i,
  /^127\./,
  /^0\./,
  /^10\./,
  /^172\.(?:1[6-9]|2\d|3[01])\./,
  /^192\.168\./,
  /^169\.254\./,
  /^metadata\.google\.internal$/i,
];
|
|
155
|
+
|
|
156
|
+
/**
 * Overlays caller-supplied limits on `DEFAULT_WORKFLOW_WEB_SOURCE_POLICY`.
 *
 * A field explicitly set to `undefined`, `NaN`, or a non-number (which a plain spread
 * would copy over the default) falls back to the default instead of leaking through.
 *
 * @param policy Partial overrides, or `undefined` to take every default.
 * @returns A policy in which every known field holds a number.
 */
export function normalizeWorkflowWebSourcePolicy(
  policy: Partial<WorkflowWebSourcePolicy> | undefined,
): WorkflowWebSourcePolicy {
  const merged: WorkflowWebSourcePolicy = {
    ...DEFAULT_WORKFLOW_WEB_SOURCE_POLICY,
    ...(policy ?? {}),
  };
  const knownKeys = Object.keys(DEFAULT_WORKFLOW_WEB_SOURCE_POLICY) as Array<keyof WorkflowWebSourcePolicy>;
  for (const key of knownKeys) {
    const candidate: unknown = merged[key];
    // Spreading `{ previewChars: undefined }` overwrites the default; restore it here.
    if (typeof candidate !== "number" || Number.isNaN(candidate)) {
      merged[key] = DEFAULT_WORKFLOW_WEB_SOURCE_POLICY[key];
    }
  }
  return merged;
}
|
|
164
|
+
|
|
165
|
+
/**
 * Overlays caller-supplied switches on `DEFAULT_WORKFLOW_WEB_SECURITY_POLICY`.
 *
 * Only real booleans are accepted as overrides. A value such as `undefined` or the
 * string `"false"` (truthy) falls back to the default, so a malformed config cannot
 * relax the policy by accident.
 *
 * @param policy Partial overrides, or `undefined` to take every default.
 * @returns A policy in which every known field holds a boolean.
 */
export function normalizeWorkflowWebSecurityPolicy(
  policy: Partial<WorkflowWebSecurityPolicy> | undefined,
): WorkflowWebSecurityPolicy {
  const merged: WorkflowWebSecurityPolicy = {
    ...DEFAULT_WORKFLOW_WEB_SECURITY_POLICY,
    ...(policy ?? {}),
  };
  const knownKeys = Object.keys(DEFAULT_WORKFLOW_WEB_SECURITY_POLICY) as Array<keyof WorkflowWebSecurityPolicy>;
  for (const key of knownKeys) {
    const candidate: unknown = merged[key];
    if (typeof candidate !== "boolean") {
      merged[key] = DEFAULT_WORKFLOW_WEB_SECURITY_POLICY[key];
    }
  }
  return merged;
}
|
|
173
|
+
|
|
174
|
+
/** Type guard: true when `tool` is one of the names in `WORKFLOW_WEB_SOURCE_TOOLS`. */
export function isWorkflowWebSourceTool(tool: string): tool is WorkflowWebSourceTool {
  const knownTools: readonly string[] = WORKFLOW_WEB_SOURCE_TOOLS;
  return knownTools.some((known) => known === tool);
}
|
|
177
|
+
|
|
178
|
+
/**
 * Creates an unused visible-character budget.
 *
 * @param limit Desired cap; floored to an integer and clamped at zero. `NaN` is treated
 *   as zero because `Math.max(0, NaN)` would otherwise keep the budget at `NaN`.
 * @returns A budget with `used` set to 0.
 */
export function createWorkflowWebVisibleBudget(
  limit: number,
): WorkflowWebVisibleBudget {
  const floored = Math.floor(limit);
  return { limit: Number.isNaN(floored) ? 0 : Math.max(0, floored), used: 0 };
}
|
|
183
|
+
|
|
184
|
+
/**
 * Cuts `text` to what the budget still allows and charges the budget for it.
 *
 * The cut never lands between the two halves of a surrogate pair: if it would, the
 * visible text ends one code unit earlier so no lone surrogate is emitted. A `NaN`
 * cap or budget is treated as "nothing allowed", so a non-empty text is reported as
 * truncated rather than silently returned empty.
 *
 * @param budget Mutated in place: `used` grows by the number of characters returned.
 * @param text Candidate text to show.
 * @param maxChars Per-call cap, applied together with the remaining budget.
 * @returns The visible text, whether anything was cut, and the budget state afterwards.
 */
export function consumeWorkflowWebVisibleBudget(
  budget: WorkflowWebVisibleBudget,
  text: string,
  maxChars: number,
): { text: string; truncated: boolean; remaining: number; used: number } {
  const remainingBefore = Math.max(0, budget.limit - budget.used);
  let allowed = Math.max(0, Math.floor(Math.min(maxChars, remainingBefore)));
  if (Number.isNaN(allowed)) allowed = 0;
  if (allowed > 0 && allowed < text.length) {
    const lastKept = text.charCodeAt(allowed - 1);
    const firstDropped = text.charCodeAt(allowed);
    const splitsPair =
      lastKept >= 0xd800 && lastKept <= 0xdbff && firstDropped >= 0xdc00 && firstDropped <= 0xdfff;
    // Back off one code unit rather than keep a dangling high surrogate.
    if (splitsPair) allowed -= 1;
  }
  const truncated = text.length > allowed;
  const visible = text.slice(0, allowed);
  budget.used += visible.length;
  return {
    text: visible,
    truncated,
    remaining: Math.max(0, budget.limit - budget.used),
    used: budget.used,
  };
}
|
|
201
|
+
|
|
202
|
+
/**
 * Parses a URL and decides whether the workflow may use it.
 *
 * Rejection reasons:
 * - `"invalid_url"`: the string cannot be parsed by `URL`.
 * - `"unsafe_scheme"`: the scheme is neither `http:` nor `https:`.
 * - `"private_host_blocked"`: private hosts are not allowed and the host is private.
 *
 * The private-host check is also run on the hostname with trailing dots removed, so a
 * fully-qualified spelling such as `localhost.` cannot slip past patterns that are
 * anchored on the exact name.
 *
 * @param url Candidate URL.
 * @param security Policy to apply; defaults to `DEFAULT_WORKFLOW_WEB_SECURITY_POLICY`.
 * @returns On success, the parsed `href` and the lower-cased hostname (IPv6 brackets removed).
 */
export function validateWorkflowWebUrl(
  url: string,
  security: WorkflowWebSecurityPolicy = DEFAULT_WORKFLOW_WEB_SECURITY_POLICY,
): { ok: true; normalizedUrl: string; domain: string } | { ok: false; reason: string } {
  let parsed: URL;
  try {
    parsed = new URL(url);
  } catch {
    return { ok: false, reason: "invalid_url" };
  }
  if (parsed.protocol !== "http:" && parsed.protocol !== "https:") {
    return { ok: false, reason: "unsafe_scheme" };
  }
  const host = parsed.hostname.toLowerCase().replace(/^\[|\]$/g, "");
  // "localhost." names the same host as "localhost"; check both spellings.
  const hostWithoutRootDot = host.replace(/\.+$/, "");
  if (
    !security.allowPrivateHosts &&
    (isPrivateHostname(host) || isPrivateHostname(hostWithoutRootDot))
  ) {
    return { ok: false, reason: "private_host_blocked" };
  }
  return { ok: true, normalizedUrl: parsed.href, domain: host };
}
|
|
221
|
+
|
|
222
|
+
/**
 * Returns a copy of `url` that is safe to show to the model.
 *
 * Parseable URLs go through `sanitizeParsedUrlForModel`; anything `URL` rejects is
 * passed through `redactInlineSecrets` as plain text.
 */
export function sanitizeUrlForModel(url: string): string {
  let target: URL | undefined;
  try {
    target = new URL(url);
  } catch {
    target = undefined;
  }
  return target === undefined ? redactInlineSecrets(url) : sanitizeParsedUrlForModel(target);
}
|
|
231
|
+
|
|
232
|
+
/**
 * Redacts a parsed URL in place and returns its string form.
 *
 * Clears userinfo, replaces the value of every query parameter whose name matches
 * `SENSITIVE_QUERY_PARAM_PATTERN` with `REDACTED`, redacts the fragment, and finally
 * runs the resulting href through `redactInlineSecretsNoUrls`.
 *
 * @param parsed URL object to sanitize; it is mutated.
 */
function sanitizeParsedUrlForModel(parsed: URL): string {
  parsed.username = "";
  parsed.password = "";
  // Snapshot the names first: `set` rewrites the parameter list while we walk it.
  const sensitiveNames = Array.from(parsed.searchParams.keys()).filter((name) =>
    SENSITIVE_QUERY_PARAM_PATTERN.test(name),
  );
  sensitiveNames.forEach((name) => {
    parsed.searchParams.set(name, "REDACTED");
  });
  parsed.hash = redactUrlFragment(parsed.hash);
  const href = parsed.href;
  return redactInlineSecretsNoUrls(href);
}
|
|
243
|
+
|
|
244
|
+
/**
 * Derives the source reference for a (URL, text) pair: `wsrc_` followed by the first
 * 32 characters of the hash of the URL cache key, a NUL separator, and the text.
 */
export function sourceRefFor(url: string, text: string): string {
  const urlKey = sourceUrlCacheKey(url);
  const digest = hashString(`${urlKey}\0${text}`);
  return `wsrc_${digest.slice(0, 32)}`;
}
|
|
247
|
+
|
|
248
|
+
/** Cache key for a URL: `urlkey_` followed by the first 32 characters of the hash of its canonical form. */
export function sourceUrlCacheKey(url: string): string {
  const canonical = canonicalUrlForCache(url);
  const digest = hashString(canonical);
  return `urlkey_${digest.slice(0, 32)}`;
}
|
|
251
|
+
|
|
252
|
+
/**
 * Canonical form of the *sanitized* URL, used to compare entries that carry no `urlKey`.
 *
 * Applies the same canonicalization as `canonicalUrlForCache` (fragment filtering,
 * lower-cased host, trailing slash removed, parameters sorted by name) after the URL
 * has been passed through `sanitizeUrlForModel`. If the sanitized string does not
 * parse, it is returned trimmed.
 */
function sourceUrlDisplayCacheKey(url: string): string {
  const sanitized = sanitizeUrlForModel(url);
  let target: URL;
  try {
    target = new URL(sanitized);
  } catch {
    return sanitized.trim();
  }
  if (!shouldKeepFragmentForCache(target.hash)) {
    target.hash = "";
  }
  target.hostname = target.hostname.toLowerCase();
  const path = target.pathname;
  if (path.length > 1 && path.endsWith("/")) {
    target.pathname = path.slice(0, -1);
  }
  // NOTE(review): localeCompare ordering can depend on the runtime's locale data.
  const ordered = Array.from(target.searchParams.entries()).sort((a, b) => a[0].localeCompare(b[0]));
  target.search = "";
  ordered.forEach(([name, value]) => {
    target.searchParams.append(name, value);
  });
  return target.href;
}
|
|
273
|
+
|
|
274
|
+
/**
 * Canonicalizes a URL for cache-key hashing.
 *
 * Lower-cases the host, drops the fragment unless `shouldKeepFragmentForCache` keeps
 * it, removes a single trailing slash from non-root paths, and re-appends the query
 * parameters sorted by name. Unparseable input is returned trimmed.
 */
function canonicalUrlForCache(url: string): string {
  let target: URL;
  try {
    target = new URL(url);
  } catch {
    return url.trim();
  }
  target.hostname = target.hostname.toLowerCase();
  if (!shouldKeepFragmentForCache(target.hash)) {
    target.hash = "";
  }
  const path = target.pathname;
  if (path.length > 1 && path.endsWith("/")) {
    target.pathname = path.slice(0, -1);
  }
  // NOTE(review): localeCompare ordering can depend on the runtime's locale data, which
  // matters for a hashed key; left unchanged because switching the comparator would
  // change keys that are already persisted.
  const ordered = Array.from(target.searchParams.entries()).sort((a, b) => a[0].localeCompare(b[0]));
  target.search = "";
  ordered.forEach(([name, value]) => {
    target.searchParams.append(name, value);
  });
  return target.href;
}
|
|
295
|
+
|
|
296
|
+
/**
 * Builds the cache record for a fetched page. Nothing is written to disk here.
 *
 * The URL is validated with private hosts allowed, purely to obtain the hostname; a
 * URL that fails validation gets the domain "unknown". Both `url` and `redactedUrl`
 * receive the sanitized URL, while `urlKey` and `sourceRef` are derived from the
 * original URL.
 *
 * @param options Run/task config, the fetched URL and text, plus optional descriptive fields.
 * @returns A record stamped with `WORKFLOW_WEB_SOURCE_CACHE_SCHEMA` and the current time.
 */
export function createWorkflowWebSource(options: {
  config: WorkflowWebSourceCacheConfig;
  url: string;
  text: string;
  title?: string;
  provider?: string;
  extractionLossy?: boolean;
  metadata?: WorkflowWebSource["metadata"];
}): WorkflowWebSource {
  const { config, url, text, title, provider, extractionLossy, metadata } = options;
  const validation = validateWorkflowWebUrl(url, {
    ...DEFAULT_WORKFLOW_WEB_SECURITY_POLICY,
    allowPrivateHosts: true,
  });
  const domain = validation.ok ? validation.domain : "unknown";
  const redactedUrl = sanitizeUrlForModel(url);
  const contentHash = hashString(text);
  // Optional fields are spread in place so the serialized key order stays stable.
  return {
    schema: WORKFLOW_WEB_SOURCE_CACHE_SCHEMA,
    sourceRef: sourceRefFor(url, text),
    createdAt: new Date().toISOString(),
    runId: config.runId,
    taskId: config.taskId,
    url: redactedUrl,
    redactedUrl,
    urlKey: sourceUrlCacheKey(url),
    domain,
    ...(title ? { title } : {}),
    ...(provider ? { provider } : {}),
    contentHash,
    text,
    textChars: text.length,
    ...(extractionLossy !== undefined ? { extractionLossy } : {}),
    ...(metadata ? { metadata } : {}),
  };
}
|
|
333
|
+
|
|
334
|
+
/**
 * Persists a source and registers it in the index.
 *
 * Steps, in order: ensure `<cacheDir>/sources` exists, write the source object, append
 * an index event for it, re-read the index, and rewrite the index file with this
 * source's entry replacing any earlier entry that has the same `sourceRef`.
 */
export async function writeWorkflowWebSource(
  config: WorkflowWebSourceCacheConfig,
  source: WorkflowWebSource,
): Promise<void> {
  const sourcesDir = resolve(config.cacheDir, "sources");
  await mkdir(sourcesDir, { recursive: true });
  await writeJsonAtomic(sourceObjectPath(config, source.sourceRef), source);

  const indexEntry = sourceToIndexEntry(source);
  await appendWorkflowWebSourceIndexEvent(config, indexEntry);

  const currentIndex = await readWorkflowWebSourceIndex(config);
  const otherEntries = currentIndex.sources.filter(
    (existing) => existing.sourceRef !== source.sourceRef,
  );
  const nextIndex = {
    ...currentIndex,
    updatedAt: new Date().toISOString(),
    sources: mergeSourceIndexEntries([...otherEntries, indexEntry]),
  };
  await writeJsonAtomic(indexPath(config), nextIndex);
}
|
|
353
|
+
|
|
354
|
+
/**
 * Loads a cached source by reference.
 *
 * @returns The record, or `undefined` when the reference is malformed, the file cannot
 *   be read or parsed, or the payload fails the basic checks (schema tag, matching
 *   `sourceRef`, string `text`).
 */
export async function readWorkflowWebSource(
  config: WorkflowWebSourceCacheConfig,
  sourceRef: string,
): Promise<WorkflowWebSource | undefined> {
  if (!isWorkflowWebSourceRef(sourceRef)) {
    return undefined;
  }
  try {
    const raw = await readFile(sourceObjectPath(config, sourceRef), "utf8");
    const payload: unknown = JSON.parse(raw);
    const looksValid =
      isRecord(payload) &&
      payload.schema === WORKFLOW_WEB_SOURCE_CACHE_SCHEMA &&
      payload.sourceRef === sourceRef &&
      typeof payload.text === "string";
    return looksValid ? (payload as unknown as WorkflowWebSource) : undefined;
  } catch {
    // A missing or corrupt cache file is treated as a cache miss.
    return undefined;
  }
}
|
|
372
|
+
|
|
373
|
+
/**
 * Reads the index file and merges in the entries recorded in the index ledger.
 *
 * When the ledger yields no entries the index file content is returned unchanged;
 * otherwise `sources` is the merge of both and `updatedAt` is set to the current time.
 */
export async function readWorkflowWebSourceIndex(
  config: WorkflowWebSourceCacheConfig,
): Promise<WorkflowWebSourceIndex> {
  const fromFile = await readWorkflowWebSourceIndexFile(config);
  const fromLedger = await readWorkflowWebSourceIndexLedger(config);
  if (fromLedger.length > 0) {
    const combined = mergeSourceIndexEntries([...fromFile.sources, ...fromLedger]);
    return { ...fromFile, updatedAt: new Date().toISOString(), sources: combined };
  }
  return fromFile;
}
|
|
385
|
+
|
|
386
|
+
/**
 * Looks up a cached source for a URL.
 *
 * The index is scanned from its last entry backwards; the first matching entry whose
 * source object loads is returned. If nothing matches, or the matching entry's object
 * cannot be loaded, the lookup falls back to scanning the files under `sources/`.
 */
export async function findWorkflowWebSourceByUrl(
  config: WorkflowWebSourceCacheConfig,
  url: string,
): Promise<WorkflowWebSource | undefined> {
  const redactedUrl = sanitizeUrlForModel(url);
  const targetKey = sourceUrlCacheKey(url);
  const targetDisplayKey = sourceUrlDisplayCacheKey(redactedUrl);
  const { sources } = await readWorkflowWebSourceIndex(config);

  let latestMatch: WorkflowWebSourceIndexEntry | undefined;
  for (let position = sources.length - 1; position >= 0; position -= 1) {
    const candidate = sources[position];
    if (
      candidate !== undefined &&
      sourceIndexEntryMatchesUrl(candidate, url, redactedUrl, targetKey, targetDisplayKey)
    ) {
      latestMatch = candidate;
      break;
    }
  }

  if (latestMatch) {
    const cached = await readWorkflowWebSource(config, latestMatch.sourceRef);
    if (cached) {
      return cached;
    }
  }
  return findWorkflowWebSourceByUrlFromSources(config, url, redactedUrl, targetKey, targetDisplayKey);
}
|
|
403
|
+
|
|
404
|
+
/**
 * Decides whether an index entry refers to the requested URL.
 *
 * An entry with a `urlKey` is compared on that key alone. Entries without one are
 * compared on their URL strings, but only when none of the URLs involved contains
 * redacted or credential-like parameters (those cannot identify a page reliably).
 */
function sourceIndexEntryMatchesUrl(
  entry: WorkflowWebSourceIndexEntry,
  url: string,
  redactedUrl: string,
  targetKey: string,
  targetDisplayKey: string,
): boolean {
  if (entry.urlKey) {
    return entry.urlKey === targetKey;
  }
  const identityUnsafe = [redactedUrl, entry.redactedUrl, entry.url].some((candidate) =>
    redactedUrlIdentityUnsafe(candidate),
  );
  if (identityUnsafe) {
    return false;
  }
  if (entry.redactedUrl === redactedUrl || entry.url === url) {
    return true;
  }
  return [entry.redactedUrl, entry.url].some(
    (candidate) => sourceUrlDisplayCacheKey(candidate) === targetDisplayKey,
  );
}
|
|
422
|
+
|
|
423
|
+
/**
 * True when a URL should not be used to identify a page by string comparison: it
 * either contains the `REDACTED` marker or has a query/fragment parameter whose name
 * contains a credential-like keyword.
 */
function redactedUrlIdentityUnsafe(url: string): boolean {
  if (/REDACTED/.test(url)) {
    return true;
  }
  const credentialLikeParam = /[?&#][^=]*(?:token|secret|password|signature|sig|key|auth|session|credential)[^=]*=/i;
  return credentialLikeParam.test(url);
}
|
|
426
|
+
|
|
427
|
+
/**
 * Fallback lookup that scans the `.json` files under `<cacheDir>/sources` directly.
 *
 * Files are visited in the reverse of the order `readdir` returned them, one at a
 * time, and the first matching source wins. The matching rules mirror the index
 * lookup: `urlKey` alone when present, otherwise URL-string comparison unless any URL
 * involved looks redacted or credential-bearing.
 *
 * @returns The first matching source, or `undefined` (also when the directory cannot be listed).
 */
async function findWorkflowWebSourceByUrlFromSources(
  config: WorkflowWebSourceCacheConfig,
  url: string,
  redactedUrl: string,
  targetKey: string,
  targetDisplayKey: string,
): Promise<WorkflowWebSource | undefined> {
  let fileNames: string[];
  try {
    fileNames = await readdir(resolve(config.cacheDir, "sources"));
  } catch {
    return undefined;
  }
  const suffix = ".json";
  for (let position = fileNames.length - 1; position >= 0; position -= 1) {
    const fileName = fileNames[position];
    if (fileName === undefined || !fileName.endsWith(suffix)) {
      continue;
    }
    const cached = await readWorkflowWebSource(config, fileName.slice(0, -suffix.length));
    if (!cached) {
      continue;
    }
    if (cached.urlKey) {
      if (cached.urlKey === targetKey) {
        return cached;
      }
      continue;
    }
    const identityUnsafe =
      redactedUrlIdentityUnsafe(redactedUrl) ||
      redactedUrlIdentityUnsafe(cached.redactedUrl) ||
      redactedUrlIdentityUnsafe(cached.url);
    if (identityUnsafe) {
      continue;
    }
    const sameUrl =
      cached.redactedUrl === redactedUrl ||
      cached.url === url ||
      sourceUrlDisplayCacheKey(cached.redactedUrl) === targetDisplayKey ||
      sourceUrlDisplayCacheKey(cached.url) === targetDisplayKey;
    if (sameUrl) {
      return cached;
    }
  }
  return undefined;
}
|
|
463
|
+
|
|
464
|
+
/**
 * Appends one JSON line describing an event to `<cacheDir>/events.jsonl`.
 *
 * The line carries a fixed envelope (`schema`, `at`, `runId`, `taskId`, `event`)
 * followed by the redacted payload. Envelope fields always win: a payload key with the
 * same name is dropped instead of overwriting the envelope, so a stray `event` or
 * `schema` key in `data` cannot corrupt the record.
 *
 * @param config Supplies the cache directory and the run/task identifiers.
 * @param event Event name.
 * @param data Extra fields; passed through `redactRecordForModel` before being written.
 */
export async function recordWorkflowWebSourceEvent(
  config: WorkflowWebSourceCacheConfig,
  event: string,
  data: Record<string, unknown> = {},
): Promise<void> {
  await mkdir(resolve(config.cacheDir), { recursive: true });
  const envelope = {
    schema: WORKFLOW_WEB_SOURCE_EVENT_SCHEMA,
    at: new Date().toISOString(),
    runId: config.runId,
    taskId: config.taskId,
    event,
  };
  // The first spread fixes the key order, the last one restores any envelope value the
  // payload tried to replace.
  const record = { ...envelope, ...redactRecordForModel(data), ...envelope };
  await appendFile(
    resolve(config.cacheDir, "events.jsonl"),
    `${JSON.stringify(record)}\n`,
    "utf8",
  );
}
|
|
483
|
+
|
|
484
|
+
/**
 * Builds the model-facing card for a cached source.
 *
 * The preview is the redacted source text cut to the duplicate or regular preview cap
 * and charged against `options.budget` (which is mutated). The card reports the budget
 * state after that charge.
 */
export function buildWorkflowWebSourceCard(options: {
  source: WorkflowWebSource;
  policy: WorkflowWebSourcePolicy;
  budget: WorkflowWebVisibleBudget;
  duplicate?: boolean;
}): WorkflowWebSourceCard {
  const { source, policy, budget, duplicate } = options;
  let previewLimit = policy.previewChars;
  if (duplicate) {
    previewLimit = policy.duplicatePreviewChars;
  }
  const redactedText = redactInlineSecrets(source.text);
  const preview = consumeWorkflowWebVisibleBudget(budget, redactedText, previewLimit);
  return {
    sourceRef: source.sourceRef,
    url: source.redactedUrl,
    domain: source.domain,
    ...(source.title ? { title: source.title } : {}),
    preview: preview.text,
    textChars: source.textChars,
    fullContentCached: true,
    duplicate: Boolean(duplicate),
    budget: {
      limit: budget.limit,
      used: preview.used,
      remaining: preview.remaining,
      truncated: preview.truncated,
    },
    next: `Use workflow_web_source_read with sourceRef=${source.sourceRef} and an exact query for one quote, queries:[...] or reads:[...] to batch several quotes, or claim+terms when the exact quote is unknown. Do not read workflow cache files directly.`,
  };
}
|
|
516
|
+
|
|
517
|
+
/**
 * Single-request convenience wrapper around `readWorkflowWebSourceSnippets`.
 *
 * @returns The result for the one request, or a `not_found` result with zero visible
 *   characters if the batch call returned nothing.
 */
export function readWorkflowWebSourceSnippet(options: {
  source: WorkflowWebSource;
  query?: string;
  claim?: string;
  terms?: string[];
  maxChars: number;
  budget: WorkflowWebVisibleBudget;
}): WorkflowWebSourceReadResult {
  const { source, query, claim, terms, maxChars, budget } = options;
  const [onlyResult] = readWorkflowWebSourceSnippets({
    source,
    requests: [{ query, claim, terms, maxChars }],
    maxChars,
    budget,
  });
  return onlyResult ?? { status: "not_found", visibleChars: 0 };
}
|
|
539
|
+
|
|
540
|
+
/**
 * Resolves several read requests against one source, in request order.
 *
 * The normalized form of the source text is computed at most once, on first use, and
 * shared by all requests. Every request draws from the same `budget`.
 *
 * @param options.maxChars Default snippet cap for requests that do not set their own.
 */
export function readWorkflowWebSourceSnippets(options: {
  source: WorkflowWebSource;
  requests: WorkflowWebSourceReadRequest[];
  maxChars: number;
  budget: WorkflowWebVisibleBudget;
}): WorkflowWebSourceReadResult[] {
  let memoized: NormalizedSearchText | undefined;
  const getNormalizedSource = (): NormalizedSearchText => {
    if (memoized === undefined) {
      memoized = normalizeForSearch(options.source.text);
    }
    return memoized;
  };

  const results: WorkflowWebSourceReadResult[] = [];
  for (const request of options.requests) {
    results.push(
      readWorkflowWebSourceSnippetWithCache({
        source: options.source,
        request,
        maxChars: request.maxChars ?? options.maxChars,
        budget: options.budget,
        getNormalizedSource,
      }),
    );
  }
  return results;
}
|
|
561
|
+
|
|
562
|
+
/**
 * Collects the text parts of a tool result.
 *
 * Reads `result.content` when it is an array, keeps the non-empty string `text` of
 * each record entry, and joins them with a blank line. Anything else yields "".
 */
export function extractTextFromToolResult(result: unknown): string {
  if (!isRecord(result) || !Array.isArray(result.content)) {
    return "";
  }
  const pieces: string[] = [];
  for (const entry of result.content as unknown[]) {
    if (!isRecord(entry)) continue;
    const text = entry.text;
    if (typeof text === "string" && text.length > 0) {
      pieces.push(text);
    }
  }
  return pieces.join("\n\n");
}
|
|
575
|
+
|
|
576
|
+
/**
 * Picks a title for a tool result.
 *
 * Prefers `result.details.title` when it is a string; otherwise uses the first
 * Markdown level-one heading (`# ...`) in the result text, trimmed and cut to 200
 * characters.
 *
 * @returns The title, or `undefined` when neither source provides one.
 */
export function extractTitleFromToolResult(result: unknown): string | undefined {
  if (!isRecord(result)) {
    return undefined;
  }
  const details = result.details;
  if (isRecord(details) && typeof details.title === "string") {
    return details.title;
  }
  const headingMatch = extractTextFromToolResult(result).match(/^#\s+(.+)$/m);
  const heading = headingMatch?.[1]?.trim();
  if (!heading) {
    return undefined;
  }
  return heading.slice(0, 200);
}
|
|
584
|
+
|
|
585
|
+
/**
 * Turns the text of a search tool result into model-safe candidates.
 *
 * With no URL in the text, a single URL-less candidate holding the redacted head of
 * the text is returned. Otherwise up to 10 distinct URLs are reported, each with its
 * redacted form, its hostname (when it validates), and a redacted nearby snippet.
 *
 * Trailing sentence punctuation (`. , ; : ! ?`) is stripped from each matched URL, so
 * "see https://example.com/a." yields `https://example.com/a` and de-duplicates against
 * the same URL written without the period.
 *
 * @param result Raw tool result; only its text content is used.
 * @param policy Supplies `searchSnippetChars`; defaults to `DEFAULT_WORKFLOW_WEB_SOURCE_POLICY`.
 */
export function extractSearchCandidates(
  result: unknown,
  policy: WorkflowWebSourcePolicy = DEFAULT_WORKFLOW_WEB_SOURCE_POLICY,
): WorkflowWebSearchCandidate[] {
  const text = extractTextFromToolResult(result);
  if (!text.trim()) return [];
  const urls = [...text.matchAll(/https?:\/\/[^\s)\]>"']+/g)].map((match) =>
    // Prose usually ends a sentence right after a URL; that punctuation is not part of it.
    match[0].replace(/[.,;:!?]+$/, ""),
  );
  if (urls.length === 0) {
    return [
      {
        snippet: redactInlineSecrets(
          text.trim().slice(0, policy.searchSnippetChars),
        ),
      },
    ];
  }
  return [...new Set(urls)].slice(0, 10).map((url) => {
    // Private hosts are allowed here: validation is used only to obtain the hostname.
    const checked = validateWorkflowWebUrl(url, {
      ...DEFAULT_WORKFLOW_WEB_SECURITY_POLICY,
      allowPrivateHosts: true,
    });
    return {
      url: sanitizeUrlForModel(url),
      domain: checked.ok ? checked.domain : undefined,
      snippet: redactInlineSecrets(
        nearbySnippet(text, url, policy.searchSnippetChars),
      ),
    };
  });
}
|
|
617
|
+
|
|
618
|
+
/**
 * Wraps a value as a tool result: one text part holding the JSON serialization followed
 * by a newline, plus a `details` marker identifying it as a workflow web-source result.
 */
export function toolResultFromJson(value: unknown): {
  content: Array<{ type: "text"; text: string }>;
  details: Record<string, unknown>;
} {
  const serialized = `${JSON.stringify(value)}\n`;
  const textPart: { type: "text"; text: string } = { type: "text", text: serialized };
  return { content: [textPart], details: { workflowWebSource: true } };
}
|
|
627
|
+
|
|
628
|
+
/**
 * Builds a tool result that reports a blocked operation.
 *
 * The payload is `{ status: "blocked", code, message }` with `extra` spread on top, so
 * a key in `extra` replaces the field of the same name.
 */
export function errorToolResult(
  code: string,
  message: string,
  extra: Record<string, unknown> = {},
): ReturnType<typeof toolResultFromJson> {
  const payload = { status: "blocked", code, message, ...extra };
  return toolResultFromJson(payload);
}
|
|
635
|
+
|
|
636
|
+
/**
 * Redacts secrets inside a URL fragment.
 *
 * The fragment is first read as a parameter list; if any parameter name matches
 * `SENSITIVE_QUERY_PARAM_PATTERN`, those values become `REDACTED` and the re-serialized
 * list is returned. Otherwise the raw fragment goes through `redactInlineSecrets`.
 *
 * @param hash Fragment with or without the leading `#`.
 * @returns The redacted fragment including `#`, or "" when nothing remains.
 */
function redactUrlFragment(hash: string): string {
  if (!hash) {
    return "";
  }
  const body = hash.startsWith("#") ? hash.slice(1) : hash;
  if (!body) {
    return "";
  }
  try {
    const params = new URLSearchParams(body);
    const sensitiveNames = Array.from(params.keys()).filter((name) =>
      SENSITIVE_QUERY_PARAM_PATTERN.test(name),
    );
    if (sensitiveNames.length > 0) {
      sensitiveNames.forEach((name) => {
        params.set(name, "REDACTED");
      });
      return `#${params.toString()}`;
    }
  } catch {
    // Fall through to inline redaction.
  }
  const scrubbed = redactInlineSecrets(body);
  return scrubbed ? `#${scrubbed}` : "";
}
|
|
656
|
+
|
|
657
|
+
/**
 * True when a fragment should stay part of the cache identity: after the optional
 * leading `#`, it starts with `/` or `!`, or contains `?`.
 */
function shouldKeepFragmentForCache(hash: string): boolean {
  if (hash.length === 0) {
    return false;
  }
  const body = hash.charAt(0) === "#" ? hash.substring(1) : hash;
  if (body.charAt(0) === "/" || body.charAt(0) === "!") {
    return true;
  }
  return body.indexOf("?") !== -1;
}
|
|
662
|
+
|
|
663
|
+
/**
 * Projects a cached source onto its index entry.
 *
 * Copies the identifying fields and leaves out the text body; optional fields
 * (`urlKey`, `title`, `provider`) are included only when truthy.
 */
function sourceToIndexEntry(source: WorkflowWebSource): WorkflowWebSourceIndexEntry {
  const { sourceRef, createdAt, url, redactedUrl, urlKey, domain, title, contentHash, textChars, provider } = source;
  // Optional fields are spread in place so the serialized key order stays stable.
  return {
    sourceRef,
    createdAt,
    url,
    redactedUrl,
    ...(urlKey ? { urlKey } : {}),
    domain,
    ...(title ? { title } : {}),
    contentHash,
    textChars,
    ...(provider ? { provider } : {}),
  };
}
|
|
677
|
+
|
|
678
|
+
type NormalizedSearchText = ReturnType<typeof normalizeForSearch>;
|
|
679
|
+
|
|
680
|
+
/**
 * Resolves one read request against a source, trying progressively looser matches:
 *
 * 1. the trimmed query as an exact substring of the source text;
 * 2. the normalized query inside the normalized source text, mapped back to original
 *    offsets through the normalization map;
 * 3. term-based matching using `terms` and `claim`.
 *
 * A query whose normalized form is empty skips step 2: `indexOf("")` always returns 0
 * and would otherwise report a "normalized" match at the start of the text for a query
 * with no searchable content.
 *
 * @param options.getNormalizedSource Lazy accessor for the normalized source text.
 * @param options.budget Shared visible-character budget passed on to the snippet builders.
 * @returns The match, or a `not_found` result when no step applies or succeeds.
 */
function readWorkflowWebSourceSnippetWithCache(options: {
  source: WorkflowWebSource;
  request: WorkflowWebSourceReadRequest;
  maxChars: number;
  budget: WorkflowWebVisibleBudget;
  getNormalizedSource: () => NormalizedSearchText;
}): WorkflowWebSourceReadResult {
  const query = options.request.query?.trim() ?? "";
  if (query) {
    const exactIndex = options.source.text.indexOf(query);
    if (exactIndex >= 0) {
      return snippetForMatch({
        text: options.source.text,
        start: exactIndex,
        end: exactIndex + query.length,
        matchType: "exact",
        maxChars: options.maxChars,
        budget: options.budget,
      });
    }
    const queryNorm = normalizeForSearch(query);
    // NOTE(review): assumes normalization can yield "" (the term matcher filters empty
    // normalized terms for the same reason) — confirm against normalizeForSearch.
    if (queryNorm.normalized.length > 0) {
      const sourceNorm = options.getNormalizedSource();
      const normalizedIndex = sourceNorm.normalized.indexOf(queryNorm.normalized);
      if (normalizedIndex >= 0) {
        const start = sourceNorm.map[normalizedIndex] ?? 0;
        // Map the last matched normalized character back, then make the end exclusive.
        const endMapIndex = Math.min(
          sourceNorm.map.length - 1,
          normalizedIndex + Math.max(0, queryNorm.normalized.length - 1),
        );
        const end = (sourceNorm.map[endMapIndex] ?? start) + 1;
        return snippetForMatch({
          text: options.source.text,
          start,
          end,
          matchType: "normalized",
          maxChars: options.maxChars,
          budget: options.budget,
        });
      }
    }
  }
  const termNeedles = prepareTermNeedles(options.request.terms, options.request.claim);
  if (termNeedles.length === 0) return { status: "not_found", visibleChars: 0 };
  return snippetForTerms({
    text: options.source.text,
    normalizedSource: options.getNormalizedSource(),
    terms: termNeedles,
    maxChars: options.maxChars,
    budget: options.budget,
  });
}
|
|
730
|
+
|
|
731
|
+
/**
 * Picks the best-scoring window of `text` for a set of search terms.
 *
 * Each term is folded with `normalizeForSearch`; terms that fold to an empty
 * string are ignored. Up to 20 occurrences per term are scored with
 * `scoreTermWindow`. The winner has the highest score, then the most matched
 * terms; on a full tie the earliest candidate wins.
 *
 * Returns `not_found` when no term is usable or nothing occurs in the source;
 * otherwise a `matched` result of type `"terms"` flagged `candidateOnly`.
 */
function snippetForTerms(options: {
  text: string;
  normalizedSource: NormalizedSearchText;
  terms: string[];
  maxChars: number;
  budget: WorkflowWebVisibleBudget;
}): WorkflowWebSourceReadResult {
  const { text, normalizedSource, maxChars, budget } = options;
  const haystack = normalizedSource.normalized;
  const offsetMap = normalizedSource.map;

  const needles: Array<{ raw: string; normalized: string }> = [];
  for (const term of options.terms) {
    const normalized = normalizeForSearch(term).normalized;
    if (normalized.length > 0) needles.push({ raw: term, normalized });
  }
  if (needles.length === 0) return { status: "not_found", visibleChars: 0 };

  let best: ReturnType<typeof scoreTermWindow> | undefined;
  for (const needle of needles) {
    const advance = Math.max(1, needle.normalized.length);
    const span = Math.max(0, needle.normalized.length - 1);
    for (let seen = 0, cursor = 0; seen < 20; seen += 1) {
      const hit = haystack.indexOf(needle.normalized, cursor);
      if (hit < 0) break;
      const rawStart = offsetMap[hit] ?? 0;
      const lastSlot = Math.min(offsetMap.length - 1, hit + span);
      const rawEnd = (offsetMap[lastSlot] ?? rawStart) + 1;
      const candidate = scoreTermWindow(text, rawStart, rawEnd, maxChars, needles);
      // Strictly-better replaces; ties keep the earlier candidate.
      const wins =
        best === undefined ||
        candidate.score > best.score ||
        (candidate.score === best.score &&
          candidate.matchedTerms.length > best.matchedTerms.length);
      if (wins) best = candidate;
      cursor = hit + advance;
    }
  }
  if (best === undefined) return { status: "not_found", visibleChars: 0 };

  const redacted = redactInlineSecrets(text.slice(best.start, best.end));
  const quote = consumeWorkflowWebVisibleBudget(budget, redacted, maxChars).text;
  return {
    status: "matched",
    matchType: "terms",
    quote,
    startOffset: best.start,
    endOffset: best.end,
    visibleChars: quote.length,
    matchedTerms: best.matchedTerms,
    missingTerms: best.missingTerms,
    coverageRatio: best.matchedTerms.length / Math.max(1, needles.length),
    candidateOnly: true,
  };
}
|
|
783
|
+
|
|
784
|
+
/**
 * Scores a window of at most `maxChars` characters around a match.
 *
 * The window starts `floor(maxChars / 2)` before the match midpoint (clamped
 * to 0) and ends at most `maxChars` later (clamped to the text length).
 * A term counts as matched when its normalized form occurs in the normalized
 * window text.
 *
 * @returns window bounds, matched/missing raw terms, and a score of
 *          1000 per matched term plus the summed length of matched terms.
 */
function scoreTermWindow(
  text: string,
  matchStart: number,
  matchEnd: number,
  maxChars: number,
  terms: Array<{ raw: string; normalized: string }>,
): { start: number; end: number; matchedTerms: string[]; missingTerms: string[]; score: number } {
  const midpoint = Math.floor((matchStart + matchEnd) / 2);
  const start = Math.max(0, midpoint - Math.floor(maxChars / 2));
  const end = Math.min(text.length, start + maxChars);
  const foldedWindow = normalizeForSearch(text.slice(start, end)).normalized;

  const matchedTerms: string[] = [];
  const missingTerms: string[] = [];
  let matchedLength = 0;
  for (const term of terms) {
    if (foldedWindow.includes(term.normalized)) {
      matchedTerms.push(term.raw);
      matchedLength += term.normalized.length;
    } else {
      missingTerms.push(term.raw);
    }
  }

  const score = matchedTerms.length * 1_000 + matchedLength;
  return { start, end, matchedTerms, missingTerms, score };
}
|
|
812
|
+
|
|
813
|
+
/**
 * Chooses the search terms for a term-based lookup, capped at 16.
 *
 * Explicit `terms` (trimmed, blanks dropped, de-duplicated) take priority.
 * When none remain, terms are extracted from `claim`; a missing or blank
 * claim yields an empty list.
 */
function prepareTermNeedles(terms: string[] | undefined, claim: string | undefined): string[] {
  const cleaned: string[] = [];
  for (const term of terms ?? []) {
    const trimmed = term.trim();
    if (trimmed) cleaned.push(trimmed);
  }
  const explicitTerms = dedupeStrings(cleaned);
  if (explicitTerms.length > 0) {
    return explicitTerms.slice(0, 16);
  }
  if (!claim || !claim.trim()) {
    return [];
  }
  return extractClaimTerms(claim).slice(0, 16);
}
|
|
819
|
+
|
|
820
|
+
/**
 * Extracts candidate search terms from free-form claim text.
 *
 * Tokens are runs of at least three characters that start with a letter or
 * digit and continue with letters, digits, `.`, `_`, `/` or `-`. They are
 * lower-cased, filtered against `SOURCE_READ_STOPWORDS`, de-duplicated and
 * returned longest first.
 */
function extractClaimTerms(claim: string): string[] {
  const rawTokens = claim.match(/[\p{L}\p{N}][\p{L}\p{N}._/-]{2,}/gu) ?? [];
  const kept: string[] = [];
  for (const rawToken of rawTokens) {
    const lowered = rawToken.toLowerCase();
    if (!SOURCE_READ_STOPWORDS.has(lowered)) kept.push(lowered);
  }
  const unique = dedupeStrings(kept);
  unique.sort((a, b) => b.length - a.length);
  return unique;
}
|
|
827
|
+
|
|
828
|
+
/**
 * Removes duplicates from `values`, comparing by their `normalizeForSearch`
 * form. The first spelling of each key is kept, in input order; values whose
 * normalized form is empty are dropped.
 */
function dedupeStrings(values: string[]): string[] {
  const seenKeys = new Set<string>();
  return values.filter((value) => {
    const key = normalizeForSearch(value).normalized;
    if (key === "" || seenKeys.has(key)) return false;
    seenKeys.add(key);
    return true;
  });
}
|
|
839
|
+
|
|
840
|
+
/** Lower-case words that `extractClaimTerms` discards when deriving search terms from a claim. */
const SOURCE_READ_STOPWORDS = new Set([
  "about", "across", "after", "against", "also",
  "because", "before", "between",
  "claim", "claims", "could",
  "does", "from", "have", "into",
  "more", "must",
  "only", "other", "over",
  "should", "source", "sources",
  "than", "that", "their", "there", "these", "this", "through",
  "under", "using",
  "when", "where", "which", "with", "without",
]);
|
|
879
|
+
|
|
880
|
+
/**
 * Builds a `matched` result for a located span `[start, end)` of `text`.
 *
 * The snippet window is at most `maxChars` long. Half of whatever room is
 * left after the match itself is placed before the match, and the window is
 * clamped to the text bounds. The window text is passed through
 * `redactInlineSecrets` and then `consumeWorkflowWebVisibleBudget`; the
 * returned `quote` is whatever text that call yields.
 *
 * `startOffset`/`endOffset` report the match span, not the window.
 */
function snippetForMatch(options: {
  text: string;
  start: number;
  end: number;
  matchType: "exact" | "normalized";
  maxChars: number;
  budget: WorkflowWebVisibleBudget;
}): WorkflowWebSourceReadResult {
  const { text, start, end, matchType, maxChars, budget } = options;
  const matchedLength = Math.max(0, end - start);
  const leadingContext = Math.floor(Math.max(0, maxChars - matchedLength) / 2);
  const windowStart = Math.max(0, start - leadingContext);
  const windowEnd = Math.min(text.length, windowStart + maxChars);
  const redacted = redactInlineSecrets(text.slice(windowStart, windowEnd));
  const quote = consumeWorkflowWebVisibleBudget(budget, redacted, maxChars).text;
  return {
    status: "matched",
    matchType,
    quote,
    startOffset: start,
    endOffset: end,
    visibleChars: quote.length,
  };
}
|
|
908
|
+
|
|
909
|
+
/**
 * Folds text into a comparison-friendly form and records where each folded
 * character came from.
 *
 * Per UTF-16 code unit of `text`: NFKC-normalize, lower-case, map curly
 * quotes to straight quotes and dash variants to `-`. Runs of whitespace
 * collapse to a single space; leading and trailing whitespace is dropped.
 *
 * @returns `normalized` — the folded string; `map` — for every UTF-16 code
 *          unit of `normalized`, the index in `text` it was produced from.
 *          `map.length === normalized.length` always holds.
 *
 * Earlier revisions trimmed `normalized` without trimming `map`, so any text
 * that began with whitespace had every offset shifted by one. Leading
 * whitespace is now never emitted and a trailing space is removed together
 * with its map entry. The map is also filled per code unit, so folding that
 * yields a character outside the BMP keeps the two arrays aligned.
 */
function normalizeForSearch(text: string): { normalized: string; map: number[] } {
  let normalized = "";
  const map: number[] = [];
  let previousWhitespace = false;
  for (let index = 0; index < text.length; index += 1) {
    const raw = text[index]!;
    const folded = raw
      .normalize("NFKC")
      .toLowerCase()
      .replace(/[\u2018\u2019\u201A\u201B]/g, "'")
      .replace(/[\u201C\u201D\u201E\u201F]/g, '"')
      .replace(/[\u2010-\u2015\u2212]/g, "-");
    if (/\s/.test(folded)) {
      // Skip whitespace entirely while nothing has been emitted yet, so the
      // first map entry always belongs to the first visible character.
      if (!previousWhitespace && normalized.length > 0) {
        normalized += " ";
        map.push(index);
      }
      previousWhitespace = true;
      continue;
    }
    previousWhitespace = false;
    normalized += folded;
    // One map entry per UTF-16 code unit appended.
    for (let unit = 0; unit < folded.length; unit += 1) {
      map.push(index);
    }
  }
  // Drop the single collapsed trailing space together with its map entry.
  if (normalized.endsWith(" ")) {
    normalized = normalized.slice(0, -1);
    map.pop();
  }
  return { normalized, map };
}
|
|
936
|
+
|
|
937
|
+
/**
 * Returns up to `maxChars` characters of `text` around the first occurrence
 * of `needle`, trimmed of surrounding whitespace.
 *
 * The window starts `floor(maxChars / 2)` characters before the occurrence
 * (clamped to 0). When `needle` does not occur, the first `maxChars`
 * characters of the trimmed text are returned instead.
 */
function nearbySnippet(text: string, needle: string, maxChars: number): string {
  const hit = text.indexOf(needle);
  if (hit >= 0) {
    const from = Math.max(0, hit - Math.floor(maxChars / 2));
    return text.slice(from, from + maxChars).trim();
  }
  return text.trim().slice(0, maxChars);
}
|
|
943
|
+
|
|
944
|
+
/**
 * Loads the index file for `config`.
 *
 * Any failure — unreadable file, invalid JSON, wrong `schema` — deliberately
 * falls back to an empty index rather than propagating. Entries that do not
 * validate are skipped; the rest are merged by `sourceRef`. A missing
 * `updatedAt` defaults to the current time and a missing `runId` to
 * `config.runId`.
 */
async function readWorkflowWebSourceIndexFile(
  config: WorkflowWebSourceCacheConfig,
): Promise<WorkflowWebSourceIndex> {
  try {
    const fileText = await readFile(indexPath(config), "utf8");
    const parsed: unknown = JSON.parse(fileText);
    if (!isRecord(parsed) || parsed.schema !== WORKFLOW_WEB_SOURCE_INDEX_SCHEMA) {
      throw new Error("invalid index");
    }

    const validEntries: WorkflowWebSourceIndexEntry[] = [];
    if (Array.isArray(parsed.sources)) {
      for (const candidate of parsed.sources) {
        const entry = sourceIndexEntryFromUnknown(candidate);
        if (entry) validEntries.push(entry);
      }
    }

    const { updatedAt, runId } = parsed;
    return {
      schema: WORKFLOW_WEB_SOURCE_INDEX_SCHEMA,
      updatedAt: typeof updatedAt === "string" ? updatedAt : new Date().toISOString(),
      runId: typeof runId === "string" ? runId : config.runId,
      sources: mergeSourceIndexEntries(validEntries),
    };
  } catch {
    // Best effort: a missing or corrupt index is treated as empty.
    return emptyWorkflowWebSourceIndex(config);
  }
}
|
|
968
|
+
|
|
969
|
+
/**
 * Appends one index event for `entry` to the events ledger as a single JSON
 * line, creating the cache directory first if needed. The event records the
 * event schema, the current timestamp, and the run and task ids from `config`.
 */
async function appendWorkflowWebSourceIndexEvent(
  config: WorkflowWebSourceCacheConfig,
  entry: WorkflowWebSourceIndexEntry,
): Promise<void> {
  await mkdir(resolve(config.cacheDir), { recursive: true });
  const ledgerPath = indexEventsPath(config);
  const event = {
    schema: WORKFLOW_WEB_SOURCE_INDEX_EVENT_SCHEMA,
    at: new Date().toISOString(),
    runId: config.runId,
    taskId: config.taskId,
    entry,
  };
  await appendFile(ledgerPath, `${JSON.stringify(event)}\n`, "utf8");
}
|
|
986
|
+
|
|
987
|
+
/**
 * Reads every valid entry from the index events ledger, in file order.
 *
 * An unreadable ledger yields an empty list. Blank lines, lines that are not
 * valid JSON, events with a different schema, and events whose `entry` does
 * not validate are skipped.
 */
async function readWorkflowWebSourceIndexLedger(
  config: WorkflowWebSourceCacheConfig,
): Promise<WorkflowWebSourceIndexEntry[]> {
  let ledgerText: string;
  try {
    ledgerText = await readFile(indexEventsPath(config), "utf8");
  } catch {
    return [];
  }

  const collected: WorkflowWebSourceIndexEntry[] = [];
  const candidateLines = ledgerText.split(/\r?\n/).filter((line) => line.trim() !== "");
  for (const line of candidateLines) {
    try {
      const event: unknown = JSON.parse(line);
      if (isRecord(event) && event.schema === WORKFLOW_WEB_SOURCE_INDEX_EVENT_SCHEMA) {
        const entry = sourceIndexEntryFromUnknown(event.entry);
        if (entry) collected.push(entry);
      }
    } catch {
      // Ignore torn or corrupt ledger lines; source file scan still provides a final fallback.
    }
  }
  return collected;
}
|
|
1010
|
+
|
|
1011
|
+
/**
 * Validates an untyped value as an index entry.
 *
 * Required: a `sourceRef` accepted by `isWorkflowWebSourceRef`; string
 * `createdAt`, `url`, `redactedUrl`, `domain`, `contentHash`; and a
 * `textChars` that converts to a finite number via `Number()`.
 * `urlKey`, `title` and `provider` are kept only when they are strings.
 *
 * @returns the normalized entry, or `undefined` when validation fails.
 */
function sourceIndexEntryFromUnknown(value: unknown): WorkflowWebSourceIndexEntry | undefined {
  if (!isRecord(value)) return undefined;
  const { sourceRef, createdAt, url, redactedUrl, urlKey, domain, title, contentHash, provider } = value;

  if (typeof sourceRef !== "string" || !isWorkflowWebSourceRef(sourceRef)) return undefined;
  if (
    typeof createdAt !== "string" ||
    typeof url !== "string" ||
    typeof redactedUrl !== "string" ||
    typeof domain !== "string" ||
    typeof contentHash !== "string"
  ) {
    return undefined;
  }
  // Converted only after the string checks pass, as numeric strings are tolerated here.
  const textChars = Number(value.textChars);
  if (!Number.isFinite(textChars)) return undefined;

  const urlKeyPart = typeof urlKey === "string" ? { urlKey } : {};
  const titlePart = typeof title === "string" ? { title } : {};
  const providerPart = typeof provider === "string" ? { provider } : {};
  return {
    sourceRef,
    createdAt,
    url,
    redactedUrl,
    ...urlKeyPart,
    domain,
    ...titlePart,
    contentHash,
    textChars,
    ...providerPart,
  };
}
|
|
1033
|
+
|
|
1034
|
+
/**
 * Collapses entries that share a `sourceRef` (the last one in input order
 * wins) and returns the survivors ordered by `createdAt` using
 * `localeCompare`.
 */
function mergeSourceIndexEntries(
  entries: WorkflowWebSourceIndexEntry[],
): WorkflowWebSourceIndexEntry[] {
  const latestByRef = new Map<string, WorkflowWebSourceIndexEntry>();
  entries.forEach((entry) => {
    latestByRef.set(entry.sourceRef, entry);
  });
  const merged = Array.from(latestByRef.values());
  merged.sort((a, b) => a.createdAt.localeCompare(b.createdAt));
  return merged;
}
|
|
1041
|
+
|
|
1042
|
+
/** Creates an index with no sources for `config.runId`, stamped with the current time. */
function emptyWorkflowWebSourceIndex(
  config: WorkflowWebSourceCacheConfig,
): WorkflowWebSourceIndex {
  const createdNow = new Date().toISOString();
  const emptyIndex: WorkflowWebSourceIndex = {
    schema: WORKFLOW_WEB_SOURCE_INDEX_SCHEMA,
    updatedAt: createdNow,
    runId: config.runId,
    sources: [],
  };
  return emptyIndex;
}
|
|
1052
|
+
|
|
1053
|
+
/** Absolute path of `index.json` inside the cache directory. */
function indexPath(config: WorkflowWebSourceCacheConfig): string {
  const { cacheDir } = config;
  return resolve(cacheDir, "index.json");
}
|
|
1056
|
+
|
|
1057
|
+
/** Absolute path of the `index-events.jsonl` ledger inside the cache directory. */
function indexEventsPath(config: WorkflowWebSourceCacheConfig): string {
  const { cacheDir } = config;
  return resolve(cacheDir, "index-events.jsonl");
}
|
|
1060
|
+
|
|
1061
|
+
/**
 * Resolves the on-disk JSON path for a cached source.
 *
 * @param config    cache configuration; only `cacheDir` is read.
 * @param sourceRef reference that must satisfy `isWorkflowWebSourceRef`.
 * @returns absolute path `<cacheDir>/sources/<sourceRef>.json`.
 * @throws Error when `sourceRef` is malformed or the resolved path is not a
 *         direct child of the `sources` directory.
 *
 * The containment check compares `dirname(path)` with the sources directory
 * instead of testing for a literal `"/"` separator. `resolve` returns
 * platform-native separators, so the previous `startsWith(`${dir}/`)` test
 * rejected every path on Windows.
 */
function sourceObjectPath(
  config: WorkflowWebSourceCacheConfig,
  sourceRef: string,
): string {
  if (!isWorkflowWebSourceRef(sourceRef)) {
    throw new Error("invalid workflow web sourceRef");
  }
  const sourcesDir = resolve(config.cacheDir, "sources");
  const path = resolve(sourcesDir, `${sourceRef}.json`);
  // Defence in depth: the file must sit directly inside the sources directory.
  if (dirname(path) !== sourcesDir) {
    throw new Error("workflow web sourceRef escaped source cache");
  }
  return path;
}
|
|
1075
|
+
|
|
1076
|
+
/** True when `sourceRef` is exactly `wsrc_` followed by 32 lower-case hexadecimal characters. */
function isWorkflowWebSourceRef(sourceRef: string): boolean {
  const sourceRefPattern = /^wsrc_[a-f0-9]{32}$/;
  return sourceRefPattern.test(sourceRef);
}
|
|
1079
|
+
|
|
1080
|
+
/**
 * Writes `value` as pretty-printed JSON (2-space indent, trailing newline)
 * to `path`. The parent directory is created if needed, and the content is
 * written to a sibling temporary file named from the process id and current
 * time, which is then renamed over `path`.
 */
async function writeJsonAtomic(path: string, value: unknown): Promise<void> {
  const parentDir = dirname(path);
  await mkdir(parentDir, { recursive: true });
  const tmp = [path, process.pid, Date.now(), "tmp"].join(".");
  const serialized = `${JSON.stringify(value, null, 2)}\n`;
  await writeFile(tmp, serialized, "utf8");
  await rename(tmp, path);
}
|
|
1086
|
+
|
|
1087
|
+
/** Returns the SHA-256 digest of `value` as a lower-case hexadecimal string. */
function hashString(value: string): string {
  const hasher = createHash("sha256");
  hasher.update(value);
  return hasher.digest("hex");
}
|
|
1090
|
+
|
|
1091
|
+
/**
 * True when `host` matches any of `PRIVATE_HOST_PATTERNS`, or when
 * `nonPublicIpReason` reports a reason for it.
 */
function isPrivateHostname(host: string): boolean {
  for (const pattern of PRIVATE_HOST_PATTERNS) {
    if (pattern.test(host)) return true;
  }
  const ipReason = nonPublicIpReason(host);
  return ipReason !== undefined;
}
|
|
1095
|
+
|
|
1096
|
+
/**
 * Classifies an IP literal as non-public.
 *
 * @param address IPv4 or IPv6 literal; surrounding `[` `]` are stripped and
 *                case is ignored. Anything `isIP` does not recognise (for
 *                example a hostname) is not classified.
 * @returns `"non_public_ip_blocked"` for loopback, unspecified, private,
 *          link-local, multicast, documentation and reserved ranges,
 *          otherwise `undefined`.
 *
 * IPv6 addresses are evaluated on their expanded 16-bit groups rather than
 * on text prefixes, which closes several gaps in the textual checks:
 *  - non-canonical spellings such as `0:0:0:0:0:0:0:1` or
 *    `0:0:0:0:0:ffff:7f00:1` are recognised as loopback / IPv4-mapped;
 *  - link-local covers all of `fe80::/10` (`fe80`–`febf`), not only `fe80`;
 *  - a zone suffix (`%eth0`) is ignored when parsing.
 * The original textual prefixes are still honoured so that nothing that was
 * rejected before becomes allowed. An IPv6 literal that cannot be expanded
 * is rejected (fail closed).
 */
function nonPublicIpReason(address: string): string | undefined {
  const blocked = "non_public_ip_blocked";
  const lower = address.toLowerCase().replace(/^\[|\]$/g, "");
  const family = isIP(lower);

  if (family === 4) {
    const octets = lower.split(".").map((part) => Number(part));
    return isNonPublicIpv4Octets(octets) ? blocked : undefined;
  }
  if (family !== 6) return undefined;

  // Legacy textual prefixes, kept so this check never becomes more permissive.
  if (/^(?:f[cd]|fe80|ff|2001:db8)/.test(lower)) return blocked;

  const hextets = expandIpv6ToHextets(lower);
  if (hextets === undefined) return blocked;
  const [h0, h1, h2, h3, h4, h5, h6, h7] = hextets as [
    number, number, number, number, number, number, number, number,
  ];

  if (h0 === 0 && h1 === 0 && h2 === 0 && h3 === 0 && h4 === 0) {
    if (h5 === 0xffff) {
      // IPv4-mapped (::ffff:a.b.c.d): judge the embedded IPv4 address.
      const embedded = [h6 >> 8, h6 & 255, h7 >> 8, h7 & 255];
      return isNonPublicIpv4Octets(embedded) ? blocked : undefined;
    }
    // Unspecified (::) and loopback (::1) in any spelling.
    if (h5 === 0 && h6 === 0 && h7 <= 1) return blocked;
  }
  if ((h0 & 0xfe00) === 0xfc00) return blocked; // fc00::/7 unique local
  if ((h0 & 0xffc0) === 0xfe80) return blocked; // fe80::/10 link-local
  if ((h0 & 0xff00) === 0xff00) return blocked; // ff00::/8 multicast
  if (h0 === 0x2001 && h1 === 0x0db8) return blocked; // 2001:db8::/32 documentation
  return undefined;
}

/** True when four IPv4 octets are malformed or fall in a non-public range. */
function isNonPublicIpv4Octets(octets: readonly number[]): boolean {
  if (
    octets.length !== 4 ||
    octets.some((octet) => !Number.isInteger(octet) || octet < 0 || octet > 255)
  ) {
    return true;
  }
  const [a, b, c] = octets as readonly [number, number, number, number];
  // a >= 224 covers multicast, reserved space and the 255.255.255.255 broadcast address.
  if (a === 0 || a === 10 || a === 127 || a >= 224) return true;
  if (a === 100 && b >= 64 && b <= 127) return true;
  if (a === 169 && b === 254) return true;
  if (a === 172 && b >= 16 && b <= 31) return true;
  if (a === 192 && b === 168) return true;
  if (a === 192 && b === 0 && (c === 0 || c === 2)) return true;
  if (a === 198 && (b === 18 || b === 19)) return true;
  if (a === 198 && b === 51 && c === 100) return true;
  if (a === 203 && b === 0 && c === 113) return true;
  return false;
}

/** Expands an IPv6 literal (optionally with a zone or dotted IPv4 tail) into eight 16-bit groups. */
function expandIpv6ToHextets(address: string): number[] | undefined {
  let literal = address.split("%", 1)[0] ?? "";

  // Rewrite a dotted IPv4 tail (e.g. ::ffff:10.0.0.1) as two hexadecimal groups.
  const lastColon = literal.lastIndexOf(":");
  const tail = literal.slice(lastColon + 1);
  if (tail.includes(".")) {
    const octets = tail.split(".").map((part) => Number(part));
    if (
      octets.length !== 4 ||
      octets.some((octet) => !Number.isInteger(octet) || octet < 0 || octet > 255)
    ) {
      return undefined;
    }
    const [a, b, c, d] = octets as [number, number, number, number];
    const high = ((a << 8) | b).toString(16);
    const low = ((c << 8) | d).toString(16);
    literal = `${literal.slice(0, lastColon + 1)}${high}:${low}`;
  }

  const halves = literal.split("::");
  if (halves.length > 2) return undefined;
  const toGroups = (part: string): number[] =>
    part === "" ? [] : part.split(":").map((group) => Number.parseInt(group, 16));
  const head = toGroups(halves[0] ?? "");
  const rest = halves.length === 2 ? toGroups(halves[1] ?? "") : [];
  const fill = 8 - head.length - rest.length;
  if (fill < 0 || (halves.length === 1 && fill !== 0)) return undefined;

  const groups = [...head, ...new Array<number>(fill).fill(0), ...rest];
  if (groups.some((group) => !Number.isInteger(group) || group < 0 || group > 0xffff)) {
    return undefined;
  }
  return groups;
}
|
|
1129
|
+
|
|
1130
|
+
/**
 * Returns a new record with the same keys as `value`, where every property
 * value has been passed through `redactValueForModel`.
 */
function redactRecordForModel(
  value: Record<string, unknown>,
): Record<string, unknown> {
  const redactedPairs: Array<[string, unknown]> = [];
  for (const [key, item] of Object.entries(value)) {
    redactedPairs.push([key, redactValueForModel(item)]);
  }
  return Object.fromEntries(redactedPairs);
}
|
|
1137
|
+
|
|
1138
|
+
/**
 * Recursively redacts a value of unknown shape.
 *
 * Strings go through `sanitizeUrlMaybe` and then `redactInlineSecrets`;
 * arrays are mapped element by element; records are handled by
 * `redactRecordForModel`. Every other value is returned unchanged.
 */
function redactValueForModel(value: unknown): unknown {
  if (typeof value === "string") {
    const urlSanitized = sanitizeUrlMaybe(value);
    return redactInlineSecrets(urlSanitized);
  }
  if (Array.isArray(value)) {
    return value.map((element) => redactValueForModel(element));
  }
  return isRecord(value) ? redactRecordForModel(value) : value;
}
|
|
1144
|
+
|
|
1145
|
+
/**
 * Passes `value` through `sanitizeUrlForModel` when it starts with `http://`
 * or `https://` (case-insensitive); any other string is returned unchanged.
 */
function sanitizeUrlMaybe(value: string): string {
  const looksLikeHttpUrl = /^https?:\/\//i.test(value);
  if (!looksLikeHttpUrl) {
    return value;
  }
  return sanitizeUrlForModel(value);
}
|
|
1148
|
+
|
|
1149
|
+
/**
 * Redacts free-form text in two passes.
 *
 * First, every `http(s)://` URL found in the text is parsed and replaced with
 * the output of `sanitizeParsedUrlForModel`. Trailing sentence punctuation
 * (`.,;:!?`) is set aside before parsing and re-attached afterwards; a URL
 * that fails to parse is left as it was. The result is then passed through
 * `redactInlineSecretsNoUrls`.
 */
function redactInlineSecrets(value: string): string {
  const sanitizeMatchedUrl = (match: string): string => {
    const trailing = /[.,;:!?]+$/.exec(match)?.[0] ?? "";
    const core = trailing === "" ? match : match.slice(0, match.length - trailing.length);
    try {
      return `${sanitizeParsedUrlForModel(new URL(core))}${trailing}`;
    } catch {
      return match;
    }
  };
  const withSanitizedUrls = value.replace(/https?:\/\/[^\s)\]}>"']+/gi, sanitizeMatchedUrl);
  return redactInlineSecretsNoUrls(withSanitizedUrls);
}
|
|
1161
|
+
|
|
1162
|
+
/**
 * Applies three textual redactions, in order:
 *  1. the remainder of any `authorization:`, `cookie:` or `set-cookie:` line
 *     becomes `REDACTED`;
 *  2. the value of `token=`, `secret=`, `password=` and `api key=` style
 *     assignments (up to whitespace or `&`) becomes `REDACTED`;
 *  3. anything following `/Users/` up to whitespace, `:`, a quote or `)`
 *     becomes `REDACTED`.
 */
function redactInlineSecretsNoUrls(value: string): string {
  const headersRedacted = value.replace(
    /(authorization|cookie|set-cookie):\s*[^\n\r]+/gi,
    "$1: REDACTED",
  );
  const assignmentsRedacted = headersRedacted.replace(
    /(token|secret|password|api[-_]?key)=([^\s&]+)/gi,
    "$1=REDACTED",
  );
  const homePathsRedacted = assignmentsRedacted.replace(
    /\/Users\/[^\s:'")]+/g,
    "/Users/REDACTED",
  );
  return homePathsRedacted;
}
|
|
1168
|
+
|
|
1169
|
+
/** Type guard: true for any non-null object that is not an array. */
function isRecord(value: unknown): value is Record<string, unknown> {
  if (value === null || typeof value !== "object") {
    return false;
  }
  return !Array.isArray(value);
}
|