ultimate-pi 0.19.0 → 0.20.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. package/.agents/skills/web-retrieval/SKILL.md +163 -0
  2. package/.agents/skills/wiki-autoresearch/SKILL.md +6 -6
  3. package/.pi/SYSTEM.md +30 -12
  4. package/.pi/agents/harness/planning/implementation-researcher.md +1 -1
  5. package/.pi/agents/harness/planning/stack-researcher.md +5 -1
  6. package/.pi/agents/harness/running/executor.md +42 -1
  7. package/.pi/agents/harness/web-retrieval/web-answerer.md +35 -0
  8. package/.pi/agents/harness/web-retrieval/web-criteria-verifier.md +28 -0
  9. package/.pi/agents/harness/web-retrieval/web-gap-analyzer.md +31 -0
  10. package/.pi/agents/harness/web-retrieval/web-query-expander-fast.md +34 -0
  11. package/.pi/agents/harness/web-retrieval/web-query-expander.md +60 -0
  12. package/.pi/agents/harness/web-retrieval/web-summarizer.md +18 -0
  13. package/.pi/extensions/harness-anchored-edit.ts +141 -0
  14. package/.pi/extensions/harness-web-guard.ts +2 -1
  15. package/.pi/extensions/harness-web-tools.ts +689 -51
  16. package/.pi/harness/agents.manifest.json +30 -6
  17. package/.pi/harness/agents.policy.yaml +37 -4
  18. package/.pi/harness/docs/adrs/0050-agentic-web-retrieval-stack.md +46 -0
  19. package/.pi/harness/docs/adrs/0051-hash-anchored-executor-edits.md +41 -0
  20. package/.pi/harness/docs/adrs/README.md +2 -0
  21. package/.pi/harness/docs/harness-web-search.md +97 -0
  22. package/.pi/harness/docs/practice-map.md +11 -0
  23. package/.pi/harness/env.harness.template +9 -1
  24. package/.pi/harness/examples/web-heuristic-angles.project.yaml +22 -0
  25. package/.pi/harness/web-heuristic-angles.json +278 -0
  26. package/.pi/harness/web-heuristic-angles.yaml +182 -0
  27. package/.pi/lib/agents-policy.d.mts +4 -0
  28. package/.pi/lib/agents-policy.mjs +49 -1
  29. package/.pi/lib/agents-policy.ts +1 -0
  30. package/.pi/lib/harness-anchored-edit/.hash_anchors +1721 -0
  31. package/.pi/lib/harness-anchored-edit/anchor-state.ts +320 -0
  32. package/.pi/lib/harness-anchored-edit/apply-anchored-edits.ts +161 -0
  33. package/.pi/lib/harness-anchored-edit/edit-executor.ts +146 -0
  34. package/.pi/lib/harness-anchored-edit/index.ts +9 -0
  35. package/.pi/lib/harness-anchored-edit/line-protocol.ts +38 -0
  36. package/.pi/lib/harness-anchored-edit/settings.ts +1 -0
  37. package/.pi/lib/harness-anchored-edit/task-id.ts +8 -0
  38. package/.pi/lib/harness-anchored-edit/types.ts +19 -0
  39. package/.pi/lib/harness-lens/clients/anchored-edit-autopatch.ts +158 -0
  40. package/.pi/lib/harness-lens/index.ts +24 -7
  41. package/.pi/lib/harness-subagent-auth.ts +39 -9
  42. package/.pi/lib/harness-subagents-bridge.ts +24 -1
  43. package/.pi/lib/harness-web/artifacts.ts +200 -0
  44. package/.pi/lib/harness-web/cache.ts +369 -0
  45. package/.pi/lib/harness-web/run-cli.ts +42 -2
  46. package/.pi/prompts/harness-plan.md +1 -0
  47. package/.pi/prompts/harness-setup.md +3 -1
  48. package/.pi/prompts/harness-steer.md +1 -1
  49. package/.pi/scripts/gen-web-heuristic-angles-json.mjs +24 -0
  50. package/.pi/scripts/harness-anchored-edit-smoke.mjs +45 -0
  51. package/.pi/scripts/harness-cli-verify.sh +5 -0
  52. package/.pi/scripts/harness-verify.mjs +145 -0
  53. package/.pi/scripts/harness-web-policy-guard.mjs +1 -1
  54. package/.pi/scripts/harness-web.py +218 -15
  55. package/.pi/scripts/harness_web/deep_search.py +55 -0
  56. package/.pi/scripts/harness_web/evidence_bundle.py +47 -0
  57. package/.pi/scripts/harness_web/find_similar.py +88 -0
  58. package/.pi/scripts/harness_web/heuristic_angles_shipped.py +85 -0
  59. package/.pi/scripts/harness_web/heuristic_config.py +251 -0
  60. package/.pi/scripts/harness_web/highlights.py +47 -0
  61. package/.pi/scripts/harness_web/multi_search.py +59 -0
  62. package/.pi/scripts/harness_web/output.py +24 -0
  63. package/.pi/scripts/harness_web/query_angles.py +116 -0
  64. package/.pi/scripts/harness_web/rank.py +163 -0
  65. package/.pi/scripts/harness_web/scrape.py +30 -0
  66. package/.pi/scripts/run-tests.mjs +64 -0
  67. package/.pi/scripts/tests/test_harness_web_heuristic_config.py +132 -0
  68. package/.pi/scripts/tests/test_harness_web_query_angles.py +45 -0
  69. package/.pi/scripts/tests/test_harness_web_rank.py +56 -0
  70. package/AGENTS.md +2 -2
  71. package/CHANGELOG.md +12 -0
  72. package/THIRD_PARTY_NOTICES.md +7 -0
  73. package/package.json +7 -4
  74. package/vendor/pi-subagents/src/agents.ts +5 -0
  75. package/vendor/pi-subagents/src/subagents.ts +22 -3
  76. package/.agents/skills/scrapling-web/SKILL.md +0 -98
  77. package/.pi/extensions/00-posthog-network-bootstrap.ts +0 -11
  78. package/.pi/scripts/harness_web/__pycache__/__init__.cpython-314.pyc +0 -0
  79. package/.pi/scripts/harness_web/__pycache__/config.cpython-314.pyc +0 -0
  80. package/.pi/scripts/harness_web/__pycache__/output.cpython-314.pyc +0 -0
  81. package/.pi/scripts/harness_web/__pycache__/scrape.cpython-314.pyc +0 -0
  82. package/.pi/scripts/harness_web/__pycache__/search.cpython-314.pyc +0 -0
  83. package/.pi/scripts/harness_web/__pycache__/search_ddg.cpython-314.pyc +0 -0
  84. package/.pi/scripts/harness_web/__pycache__/search_searxng.cpython-314.pyc +0 -0
  85. package/.pi/scripts/release.sh +0 -338
@@ -1,14 +1,34 @@
1
1
  /**
2
- * harness-web-tools — web_search + web_fetch pi tools wrapping harness-web.py.
2
+ * harness-web-tools — WRS web_search, web_fetch, web_find_similar, web_contents.
3
3
  */
4
4
 
5
+ import { mkdirSync, writeFileSync } from "node:fs";
6
+ import { dirname, resolve } from "node:path";
5
7
  import type { ExtensionAPI } from "@earendil-works/pi-coding-agent";
6
8
  import { Type } from "@sinclair/typebox";
7
9
  import { claimHarnessGovernanceLoad } from "../lib/extension-load-guard.js";
10
+ import {
11
+ rememberSessionWebArtifactDir,
12
+ resolveWebOutputPath,
13
+ webArtifactScopeHint,
14
+ type WebArtifactScope,
15
+ } from "../lib/harness-web/artifacts.js";
16
+ import {
17
+ fingerprintFile,
18
+ formatCacheAge,
19
+ lookupFetchCache,
20
+ lookupSearchCache,
21
+ publishWorkspaceAlias,
22
+ writeFetchCacheEntry,
23
+ writeSearchCacheEntry,
24
+ type FetchCacheContext,
25
+ type SearchCacheContext,
26
+ } from "../lib/harness-web/cache.js";
8
27
  import {
9
28
  harnessWebContextLine,
10
29
  readTextExcerpt,
11
30
  runHarnessWeb,
31
+ summarizeDeepSearchJson,
12
32
  summarizeSearchJson,
13
33
  } from "../lib/harness-web/run-cli.js";
14
34
 
@@ -16,24 +36,106 @@ import {
16
36
  const MODULE_URL = import.meta.url;
17
37
 
18
38
  const WEB_SEARCH_GUIDELINES = [
19
- "Use web_search for open-web SERP never preflight UP_PKG, ls harness-web.py, or python3 -c import scrapling.",
20
- "Never use Firecrawl, curl/wget for search, or scrapling CLI for SERP.",
21
- "After search, use web_fetch on URLs or read the output JSON under .web/.",
22
- "Use bulk:true only when you need search plus multi-page scrape in one step.",
39
+ "DEFAULT tier=deep for landscape, prior art, comparisons, planning research, or any multi-source question.",
40
+ "Before deep (research): spawn harness/web-retrieval/web-query-expander <artifactDir>/angles.yaml anglesFile on web_search.",
41
+ "Latency: tier=instant|standard with NO expander; or web-query-expander-fast (2–3 angles); or expandHeuristic:true (no LLM).",
42
+ "tier=standard ONLY for one narrow fact or after search-deep.json exists.",
43
+ "tier=instant ONLY when latency-critical and the question is closed-form.",
44
+ "Set HARNESS_WEB_FAST_MODEL / HARNESS_WEB_EXPANDER_MODEL / HARNESS_WEB_QUALITY_MODEL env (provider/model-id) for web subagents (web-retrieval skill).",
45
+ "Never run 3+ web_search calls with different queries; use one deep search instead.",
46
+ "After deep: read <artifactDir>/search-deep.json; web_fetch with highlights:true before full scrape.",
47
+ "bulk:true only when you need immediate markdown for top N URLs.",
48
+ "Library docs: context7 only, not web_search.",
49
+ "Never preflight UP_PKG, ls harness-web.py, or python3 -c import scrapling before searching.",
23
50
  ];
24
51
 
25
52
  const WEB_FETCH_GUIDELINES = [
53
+ "Prefer highlights:true + highlightQuery after deep search before full page markdown.",
26
54
  "Use web_fetch for page markdown or same-host link maps — never curl/wget the URL.",
27
55
  "Never use raw scrapling CLI for fetch; harness-web handles Scrapling bootstrap.",
28
56
  "Library API documentation → context7 only, not web_fetch.",
29
57
  "Set fast:true for static docs (example.com, raw HTML docs, localhost).",
30
58
  ];
31
59
 
60
+ const WEB_FIND_SIMILAR_GUIDELINES = [
61
+ "Use when you have a good seed URL and want more pages like it (Exa findSimilar analog).",
62
+ "Prefer over manually re-phrasing the same intent in multiple web_search calls.",
63
+ "Output is search-deep.json shape; follow with web_fetch highlights on top hits.",
64
+ ];
65
+
66
+ const WEB_CONTENTS_GUIDELINES = [
67
+ "Batch-fetch URLs after deep search — pass fromSearch pointing at search-deep.json.",
68
+ "Use after web_search(tier=deep), not instead of deep search.",
69
+ "Set highlights:true when building an evidence bundle for web-answerer.",
70
+ ];
71
+
72
+ const WebScopeSchema = Type.Optional(
73
+ Type.String({
74
+ description:
75
+ "WRS workspace directory (default .web/; set HARNESS_WEB_ISOLATE=1 for per-run/session dirs)",
76
+ }),
77
+ );
78
+
79
+ const WebCacheControlSchema = {
80
+ refreshCache: Type.Optional(
81
+ Type.Boolean({
82
+ description: "Bypass pooled .web/cache and refetch from the network",
83
+ default: false,
84
+ }),
85
+ ),
86
+ cacheMaxAge: Type.Optional(
87
+ Type.Number({
88
+ description: "Reuse cache entry only if younger than this many seconds",
89
+ minimum: 60,
90
+ }),
91
+ ),
92
+ };
93
+
32
94
  const WebSearchSchema = Type.Object({
33
- query: Type.String({ description: "Search query" }),
95
+ query: Type.String({ description: "Search query or research intent" }),
96
+ webScope: WebScopeSchema,
97
+ tier: Type.Optional(
98
+ Type.Union(
99
+ [
100
+ Type.Literal("instant"),
101
+ Type.Literal("standard"),
102
+ Type.Literal("deep"),
103
+ Type.Literal("research"),
104
+ ],
105
+ {
106
+ description:
107
+ "WRS tier: deep (default for research), standard (narrow follow-up), instant (fast fact)",
108
+ default: "deep",
109
+ },
110
+ ),
111
+ ),
112
+ anglesFile: Type.Optional(
113
+ Type.String({
114
+ description:
115
+ "Path to angles YAML from web-query-expander (required for tier=deep unless angles provided)",
116
+ }),
117
+ ),
118
+ angles: Type.Optional(
119
+ Type.Array(Type.String(), {
120
+ description: "Inline search queries (one per angle); skips anglesFile",
121
+ minItems: 2,
122
+ maxItems: 8,
123
+ }),
124
+ ),
125
+ category: Type.Optional(
126
+ Type.String({
127
+ description: "Expander hint: code|company|people|paper|news",
128
+ }),
129
+ ),
130
+ expandHeuristic: Type.Optional(
131
+ Type.Boolean({
132
+ description: "Emergency angle templates without expander (fallback only)",
133
+ default: false,
134
+ }),
135
+ ),
34
136
  limit: Type.Optional(
35
137
  Type.Number({
36
- description: "Max results (default 5)",
138
+ description: "Max results (tier defaults: instant 5, standard 10, deep 10)",
37
139
  minimum: 1,
38
140
  maximum: 20,
39
141
  }),
@@ -41,7 +143,7 @@ const WebSearchSchema = Type.Object({
41
143
  output: Type.Optional(
42
144
  Type.String({
43
145
  description:
44
- "Output path (default .web/search.json or .web/bulk for bulk)",
146
+ "Output path (default .web/search-deep.json for deep, .web/search.json otherwise)",
45
147
  }),
46
148
  ),
47
149
  bulk: Type.Optional(
@@ -51,10 +153,12 @@ const WebSearchSchema = Type.Object({
51
153
  default: false,
52
154
  }),
53
155
  ),
156
+ ...WebCacheControlSchema,
54
157
  });
55
158
 
56
159
  const WebFetchSchema = Type.Object({
57
160
  url: Type.String({ description: "URL to fetch" }),
161
+ webScope: WebScopeSchema,
58
162
  mode: Type.Optional(
59
163
  Type.Union([Type.Literal("scrape"), Type.Literal("map")], {
60
164
  description: "scrape (markdown) or map (same-host links JSON)",
@@ -70,6 +174,18 @@ const WebFetchSchema = Type.Object({
70
174
  default: false,
71
175
  }),
72
176
  ),
177
+ highlights: Type.Optional(
178
+ Type.Boolean({
179
+ description: "Extract query-aligned excerpts to highlights JSON",
180
+ default: false,
181
+ }),
182
+ ),
183
+ highlightQuery: Type.Optional(
184
+ Type.String({ description: "Query for highlight scoring (required if highlights)" }),
185
+ ),
186
+ highlightsOutput: Type.Optional(
187
+ Type.String({ description: "Highlights JSON path (default .web/highlights.json)" }),
188
+ ),
73
189
  limit: Type.Optional(
74
190
  Type.Number({
75
191
  description: "For map mode: max links (default 100)",
@@ -77,6 +193,50 @@ const WebFetchSchema = Type.Object({
77
193
  maximum: 500,
78
194
  }),
79
195
  ),
196
+ ...WebCacheControlSchema,
197
+ });
198
+
199
+ const WebFindSimilarSchema = Type.Object({
200
+ url: Type.String({ description: "Seed URL to find similar pages for" }),
201
+ webScope: WebScopeSchema,
202
+ limit: Type.Optional(
203
+ Type.Number({ description: "Max fused results", minimum: 1, maximum: 20 }),
204
+ ),
205
+ output: Type.Optional(
206
+ Type.String({ description: "Output JSON (default .web/search-deep.json)" }),
207
+ ),
208
+ fast: Type.Optional(
209
+ Type.Boolean({
210
+ description: "Fast HTTP for seed page fetch",
211
+ default: true,
212
+ }),
213
+ ),
214
+ });
215
+
216
+ const WebContentsSchema = Type.Object({
217
+ webScope: WebScopeSchema,
218
+ urls: Type.Optional(
219
+ Type.Array(Type.String(), { description: "URLs to fetch (or use fromSearch)" }),
220
+ ),
221
+ fromSearch: Type.Optional(
222
+ Type.String({
223
+ description: "search.json or search-deep.json to read URLs from",
224
+ }),
225
+ ),
226
+ outputDir: Type.Optional(
227
+ Type.String({ description: "Output directory (default .web/contents)" }),
228
+ ),
229
+ limit: Type.Optional(
230
+ Type.Number({ description: "Max URLs to fetch", minimum: 1, maximum: 10 }),
231
+ ),
232
+ highlights: Type.Optional(Type.Boolean({ default: false })),
233
+ highlightQuery: Type.Optional(Type.String()),
234
+ evidenceBundle: Type.Optional(
235
+ Type.String({
236
+ description: "Write evidence-bundle.json (requires fromSearch)",
237
+ }),
238
+ ),
239
+ fast: Type.Optional(Type.Boolean({ default: false })),
80
240
  });
81
241
 
82
242
  function failResult(text: string) {
@@ -93,15 +253,81 @@ function okResult(text: string, details: Record<string, unknown> = {}) {
93
253
  };
94
254
  }
95
255
 
96
- function sessionCwd(ctx: { cwd?: string }): string {
256
+ type WebToolCtx = {
257
+ cwd?: string;
258
+ sessionManager?: { getSessionId(): string };
259
+ };
260
+
261
+ function sessionCwd(ctx: WebToolCtx): string {
97
262
  return ctx.cwd ?? process.cwd();
98
263
  }
99
264
 
265
+ function piSessionId(ctx: WebToolCtx): string {
266
+ return ctx.sessionManager?.getSessionId?.() ?? "default";
267
+ }
268
+
269
+ function resolveScopedOutput(
270
+ ctx: WebToolCtx,
271
+ basename: string,
272
+ explicitOutput?: string,
273
+ webScope?: string,
274
+ ): { output: string; artifactDir: string; scope: WebArtifactScope } {
275
+ const cwd = sessionCwd(ctx);
276
+ const sessionId = piSessionId(ctx);
277
+ const resolved = resolveWebOutputPath({
278
+ projectRoot: cwd,
279
+ piSessionId: sessionId,
280
+ basename,
281
+ explicitOutput,
282
+ webScope,
283
+ });
284
+ rememberSessionWebArtifactDir(sessionId, resolved.artifactDir);
285
+ return {
286
+ output: resolved.path,
287
+ artifactDir: resolved.artifactDir,
288
+ scope: resolved.scope,
289
+ };
290
+ }
291
+
292
+ function ensureParentDir(cwd: string, filePath: string): void {
293
+ mkdirSync(dirname(resolve(cwd, filePath)), { recursive: true });
294
+ }
295
+
296
+ function searchEngineId(): string {
297
+ return process.env.HARNESS_WEB_SEARCH_ENGINE?.trim() || "ddg_html";
298
+ }
299
+
300
+ function cacheControlFromParams(params: {
301
+ refreshCache?: boolean;
302
+ cacheMaxAge?: number;
303
+ }): { refresh: boolean; maxAgeSec?: number } {
304
+ return {
305
+ refresh: params.refreshCache === true,
306
+ maxAgeSec:
307
+ typeof params.cacheMaxAge === "number" ? params.cacheMaxAge : undefined,
308
+ };
309
+ }
310
+
311
+ function resolveTier(params: { tier?: string; bulk?: boolean }): string {
312
+ if (params.bulk) return "standard";
313
+ const t = String(params.tier ?? "deep").trim();
314
+ if (["instant", "standard", "deep", "research"].includes(t)) return t;
315
+ return "deep";
316
+ }
317
+
100
318
  export default function harnessWebTools(pi: ExtensionAPI) {
101
319
  if (!claimHarnessGovernanceLoad("harness-web-tools", MODULE_URL)) return;
102
- pi.on("before_agent_start", async (event) => {
320
+ pi.on("before_agent_start", async (event, ctx) => {
321
+ const cwd = sessionCwd(ctx);
322
+ const sessionId = piSessionId(ctx);
323
+ const scope = resolveWebOutputPath({
324
+ projectRoot: cwd,
325
+ piSessionId: sessionId,
326
+ basename: "angles.yaml",
327
+ }).scope;
328
+ rememberSessionWebArtifactDir(sessionId, scope.artifactDir);
103
329
  return {
104
- systemPrompt: `${event.systemPrompt}\n\n${harnessWebContextLine()}`,
330
+ systemPrompt: `${event.systemPrompt}\n\n${harnessWebContextLine()}\n${webArtifactScopeHint(scope)}`,
105
331
  };
106
332
  });
107
333
 
@@ -109,51 +335,216 @@ export default function harnessWebTools(pi: ExtensionAPI) {
109
335
  name: "web_search",
110
336
  label: "Web Search",
111
337
  description:
112
- "Search the web via harness-web (DuckDuckGo HTML or self-hosted SearXNG from .env). Returns result summaries and output path.",
113
- promptSnippet: "SERP via configured engine (ddg_html or searxng from .env)",
338
+ "Multi-tier web retrieval (WRS). Default tier=deep for research: parallel angle queries, RRF fusion. " +
339
+ "Use tier=standard only for narrow follow-ups. Requires anglesFile from web-query-expander for deep.",
340
+ promptSnippet: "tier=deep + anglesFile; not bare SERP",
114
341
  promptGuidelines: WEB_SEARCH_GUIDELINES,
115
342
  parameters: WebSearchSchema,
116
343
 
117
344
  async execute(_id, params, _signal, _onUpdate, ctx) {
118
345
  const cwd = sessionCwd(ctx);
346
+ const webScope = String(params.webScope ?? "").trim() || undefined;
119
347
  const query = String(params.query ?? "").trim();
120
348
  if (!query) return failResult("web_search: query is required.");
121
349
 
122
- const limit = typeof params.limit === "number" ? params.limit : 5;
350
+ const tier = resolveTier(params);
123
351
  const bulk = params.bulk === true;
124
- const output = String(
125
- params.output ?? (bulk ? ".web/bulk" : ".web/search.json"),
352
+ const limit = typeof params.limit === "number" ? params.limit : undefined;
353
+
354
+ if (bulk) {
355
+ const bulkScoped = resolveScopedOutput(
356
+ ctx,
357
+ "bulk",
358
+ params.output ? `${params.output}` : undefined,
359
+ webScope,
360
+ );
361
+ const output = bulkScoped.output.endsWith("/bulk")
362
+ ? bulkScoped.output
363
+ : `${bulkScoped.artifactDir}/bulk`;
364
+ ensureParentDir(cwd, output);
365
+ const lim = limit ?? 3;
366
+ const argv = ["bulk-scrape", query, "-o", output, "--limit", String(lim)];
367
+ const run = runHarnessWeb(MODULE_URL, argv, cwd);
368
+ if (!run.ok) {
369
+ return failResult(
370
+ `web_search bulk failed (exit ${run.exitCode}).\n${run.stderr || run.stdout}`,
371
+ );
372
+ }
373
+ return okResult(
374
+ `${run.stdout}\n\noutput: ${output}\nartifactDir: ${bulkScoped.artifactDir}`,
375
+ { output, artifactDir: bulkScoped.artifactDir, query, bulk: true },
376
+ );
377
+ }
378
+
379
+ const basename =
380
+ tier === "deep" || tier === "research" ? "search-deep.json" : "search.json";
381
+ const scoped = resolveScopedOutput(
382
+ ctx,
383
+ basename,
384
+ params.output ? String(params.output) : undefined,
385
+ webScope,
126
386
  );
387
+ const output = scoped.output;
388
+ ensureParentDir(cwd, output);
389
+ const { refresh: refreshCache, maxAgeSec } = cacheControlFromParams(params);
390
+ const engine = searchEngineId();
391
+ const resultLimit = limit ?? 10;
392
+ const category = params.category ? String(params.category) : undefined;
127
393
 
128
- const argv = bulk
129
- ? ["bulk-scrape", query, "-o", output, "--limit", String(limit)]
130
- : ["search", query, "-o", output, "--limit", String(limit)];
394
+ let anglesFile = String(params.anglesFile ?? "").trim();
395
+ if (anglesFile && !anglesFile.startsWith("/") && !anglesFile.includes("..")) {
396
+ anglesFile = resolveScopedOutput(ctx, "angles.yaml", anglesFile, webScope).output;
397
+ }
398
+ if (params.angles?.length && !anglesFile) {
399
+ const inline = resolveScopedOutput(ctx, "angles-inline.yaml", undefined, webScope);
400
+ const tmp = resolve(cwd, inline.output);
401
+ ensureParentDir(cwd, inline.output);
402
+ const yaml =
403
+ `intent: ${JSON.stringify(query)}\nangles:\n` +
404
+ params.angles
405
+ .map(
406
+ (q, i) =>
407
+ ` - id: angle_${i + 1}\n query: ${JSON.stringify(q)}`,
408
+ )
409
+ .join("\n") +
410
+ "\n";
411
+ writeFileSync(tmp, yaml, "utf-8");
412
+ anglesFile = inline.output;
413
+ }
414
+
415
+ if (
416
+ (tier === "deep" || tier === "research") &&
417
+ !anglesFile &&
418
+ params.expandHeuristic !== true &&
419
+ !params.angles?.length
420
+ ) {
421
+ return failResult(
422
+ "web_search tier=deep requires anglesFile (.web/angles.yaml from harness/web-retrieval/web-query-expander) " +
423
+ "or expandHeuristic:true. Invoke web-retrieval skill first.",
424
+ );
425
+ }
426
+
427
+ const anglesFingerprint = anglesFile
428
+ ? fingerprintFile(cwd, anglesFile)
429
+ : undefined;
430
+
431
+ const searchCtx: SearchCacheContext = {
432
+ query,
433
+ tier,
434
+ engine,
435
+ limit: resultLimit,
436
+ category,
437
+ expandHeuristic: params.expandHeuristic === true,
438
+ anglesFingerprint,
439
+ };
440
+
441
+ if (!refreshCache) {
442
+ const cached = lookupSearchCache(cwd, searchCtx, { maxAgeSec });
443
+ if (cached.hit && !cached.stale) {
444
+ const workspaceOutput = publishWorkspaceAlias(
445
+ cwd,
446
+ cached.artifactPath,
447
+ basename,
448
+ );
449
+ const parts = [
450
+ `[cache hit] age ${formatCacheAge(cached.ageMs)} · key ${cached.cacheKey}`,
451
+ `cache: ${cached.entryDir}`,
452
+ ];
453
+ const summary =
454
+ tier === "deep" || tier === "research"
455
+ ? summarizeDeepSearchJson(workspaceOutput, cwd)
456
+ : summarizeSearchJson(workspaceOutput, cwd);
457
+ if (summary) parts.push("", summary);
458
+ parts.push(
459
+ "",
460
+ `output: ${workspaceOutput}`,
461
+ `artifactDir: ${scoped.artifactDir}`,
462
+ `tier: ${tier}`,
463
+ );
464
+ parts.push("Read output JSON; web_fetch top URLs with highlights:true.");
465
+ return okResult(parts.join("\n"), {
466
+ output: workspaceOutput,
467
+ artifactDir: scoped.artifactDir,
468
+ query,
469
+ tier,
470
+ engine,
471
+ cacheHit: true,
472
+ cacheKey: cached.cacheKey,
473
+ cachePath: cached.artifactPath,
474
+ cacheAgeMs: cached.ageMs,
475
+ });
476
+ }
477
+ }
478
+
479
+ let argv: string[];
480
+ if (tier === "deep" || tier === "research") {
481
+ argv = [
482
+ "search-deep",
483
+ query,
484
+ "-o",
485
+ output,
486
+ "--limit",
487
+ String(resultLimit),
488
+ ];
489
+ if (anglesFile) {
490
+ argv.push("--angles-file", anglesFile);
491
+ } else if (params.expandHeuristic === true) {
492
+ argv.push("--expand-heuristic");
493
+ }
494
+ if (category) {
495
+ argv.push("--category", category);
496
+ }
497
+ } else {
498
+ argv = [
499
+ "search",
500
+ query,
501
+ "-o",
502
+ output,
503
+ "--tier",
504
+ tier,
505
+ ...(limit != null ? ["--limit", String(limit)] : []),
506
+ ];
507
+ }
131
508
 
132
509
  const run = runHarnessWeb(MODULE_URL, argv, cwd);
133
510
  if (!run.ok) {
134
511
  const hint =
135
512
  "\n\nHints: run /harness-setup; for searxng set HARNESS_WEB_SEARXNG_URL; " +
136
- "enable json in SearXNG search.formats.";
513
+ "enable json in SearXNG search.formats; for deep spawn web-query-expander first.";
137
514
  return failResult(
138
515
  `web_search failed (exit ${run.exitCode}).\n${run.stderr || run.stdout}${hint}`,
139
516
  );
140
517
  }
141
518
 
519
+ const cacheWrite = writeSearchCacheEntry(cwd, searchCtx, output, {
520
+ anglesPath: anglesFile,
521
+ });
522
+ publishWorkspaceAlias(cwd, `${cacheWrite.entryDir}/${basename}`, basename);
523
+
142
524
  const parts = [run.stdout];
143
- if (!bulk) {
144
- const summary = summarizeSearchJson(output, cwd);
145
- if (summary) {
146
- parts.push("", summary);
147
- }
148
- }
149
- parts.push("", `output: ${output}`);
150
- parts.push("Use read tool for full JSON, or web_fetch on result URLs.");
525
+ const summary =
526
+ tier === "deep" || tier === "research"
527
+ ? summarizeDeepSearchJson(output, cwd)
528
+ : summarizeSearchJson(output, cwd);
529
+ if (summary) parts.push("", summary);
530
+ parts.push(
531
+ "",
532
+ `output: ${output}`,
533
+ `artifactDir: ${scoped.artifactDir}`,
534
+ `tier: ${tier}`,
535
+ `cache: ${cacheWrite.entryDir}`,
536
+ );
537
+ parts.push("Read output JSON; web_fetch top URLs with highlights:true.");
151
538
 
152
539
  return okResult(parts.join("\n"), {
153
540
  output,
541
+ artifactDir: scoped.artifactDir,
154
542
  query,
155
- bulk,
156
- engine: process.env.HARNESS_WEB_SEARCH_ENGINE,
543
+ tier,
544
+ engine,
545
+ cacheHit: false,
546
+ cacheKey: cacheWrite.cacheKey,
547
+ cachePath: `${cacheWrite.entryDir}/${basename}`,
157
548
  });
158
549
  },
159
550
  });
@@ -162,34 +553,110 @@ export default function harnessWebTools(pi: ExtensionAPI) {
162
553
  name: "web_fetch",
163
554
  label: "Web Fetch",
164
555
  description:
165
- "Fetch a URL via harness-web/Scrapling (scrape to markdown or map same-host links).",
166
- promptSnippet: "Scrape/map URL via Scrapling (harness-web)",
556
+ "Fetch URL content via Scrapling. Prefer highlights:true after deep search before full markdown.",
557
+ promptSnippet: "Scrape/map; highlights first after deep",
167
558
  promptGuidelines: WEB_FETCH_GUIDELINES,
168
559
  parameters: WebFetchSchema,
169
560
 
170
561
  async execute(_id, params, _signal, _onUpdate, ctx) {
171
562
  const cwd = sessionCwd(ctx);
563
+ const webScope = String(params.webScope ?? "").trim() || undefined;
172
564
  const url = String(params.url ?? "").trim();
173
565
  if (!url) return failResult("web_fetch: url is required.");
174
566
 
175
567
  const mode = params.mode === "map" ? "map" : "scrape";
176
568
  const fast = params.fast === true;
177
569
  const limit = typeof params.limit === "number" ? params.limit : 100;
178
- const defaultOut = mode === "map" ? ".web/map.json" : ".web/page.md";
179
- const output = String(params.output ?? defaultOut);
180
-
181
- const argv =
182
- mode === "map"
183
- ? [
184
- "map",
185
- url,
186
- "-o",
187
- output,
188
- "--limit",
189
- String(limit),
190
- ...(fast ? ["--fast"] : []),
191
- ]
192
- : ["scrape", url, "-o", output, ...(fast ? ["--fast"] : [])];
570
+ const basename = mode === "map" ? "map.json" : "page.md";
571
+ const scoped = resolveScopedOutput(
572
+ ctx,
573
+ basename,
574
+ params.output ? String(params.output) : undefined,
575
+ webScope,
576
+ );
577
+ const output = scoped.output;
578
+ ensureParentDir(cwd, output);
579
+ const highlights = params.highlights === true;
580
+ const hlQuery = String(params.highlightQuery ?? "").trim();
581
+ const { refresh: refreshCache, maxAgeSec } = cacheControlFromParams(params);
582
+
583
+ const hlScoped =
584
+ highlights && !params.highlightsOutput
585
+ ? resolveScopedOutput(ctx, "highlights.json", undefined, webScope)
586
+ : highlights
587
+ ? resolveScopedOutput(
588
+ ctx,
589
+ "highlights.json",
590
+ String(params.highlightsOutput),
591
+ webScope,
592
+ )
593
+ : undefined;
594
+ if (hlScoped) ensureParentDir(cwd, hlScoped.output);
595
+
596
+ const fetchCtx: FetchCacheContext = {
597
+ url,
598
+ mode,
599
+ fast,
600
+ highlightQuery: hlQuery || undefined,
601
+ highlights,
602
+ };
603
+
604
+ if (!refreshCache) {
605
+ const cached = lookupFetchCache(cwd, fetchCtx, { maxAgeSec });
606
+ if (cached.hit && !cached.stale) {
607
+ const workspaceBasename = highlights
608
+ ? "highlights.json"
609
+ : mode === "map"
610
+ ? "map.json"
611
+ : "page.md";
612
+ const workspaceOutput = publishWorkspaceAlias(
613
+ cwd,
614
+ cached.artifactPath,
615
+ workspaceBasename,
616
+ );
617
+ const parts = [
618
+ `[cache hit] age ${formatCacheAge(cached.ageMs)} · key ${cached.cacheKey}`,
619
+ `cache: ${cached.entryDir}`,
620
+ "",
621
+ `output: ${workspaceOutput}`,
622
+ `artifactDir: ${scoped.artifactDir}`,
623
+ ];
624
+ const excerpt = readTextExcerpt(workspaceOutput, cwd);
625
+ if (excerpt) parts.push("", "--- excerpt ---", excerpt);
626
+ return okResult(parts.join("\n"), {
627
+ output: workspaceOutput,
628
+ artifactDir: scoped.artifactDir,
629
+ url,
630
+ mode,
631
+ highlights,
632
+ cacheHit: true,
633
+ cacheKey: cached.cacheKey,
634
+ cachePath: cached.artifactPath,
635
+ });
636
+ }
637
+ }
638
+
639
+ let argv: string[];
640
+ if (mode === "map") {
641
+ argv = [
642
+ "map",
643
+ url,
644
+ "-o",
645
+ output,
646
+ "--limit",
647
+ String(limit),
648
+ ...(fast ? ["--fast"] : []),
649
+ ];
650
+ } else {
651
+ argv = ["scrape", url, "-o", output, ...(fast ? ["--fast"] : [])];
652
+ if (highlights) {
653
+ if (!hlQuery) {
654
+ return failResult("web_fetch: highlightQuery required when highlights=true");
655
+ }
656
+ argv.push("--highlights", "--highlight-query", hlQuery);
657
+ if (hlScoped) argv.push("--highlights-output", hlScoped.output);
658
+ }
659
+ }
193
660
 
194
661
  const run = runHarnessWeb(MODULE_URL, argv, cwd);
195
662
  if (!run.ok) {
@@ -199,13 +666,184 @@ export default function harnessWebTools(pi: ExtensionAPI) {
199
666
  );
200
667
  }
201
668
 
202
- const parts = [run.stdout, "", `output: ${output}`];
669
+ const cacheArtifact = highlights && hlScoped ? hlScoped.output : output;
670
+ const cacheWrite = writeFetchCacheEntry(cwd, fetchCtx, cacheArtifact, {
671
+ highlightsPath:
672
+ highlights && hlScoped && hlScoped.output !== cacheArtifact
673
+ ? hlScoped.output
674
+ : undefined,
675
+ });
676
+ const workspaceBasename = highlights
677
+ ? "highlights.json"
678
+ : mode === "map"
679
+ ? "map.json"
680
+ : "page.md";
681
+ publishWorkspaceAlias(cwd, `${cacheWrite.entryDir}/${workspaceBasename}`, workspaceBasename);
682
+
683
+ const parts = [
684
+ run.stdout,
685
+ "",
686
+ `output: ${output}`,
687
+ `artifactDir: ${scoped.artifactDir}`,
688
+ `cache: ${cacheWrite.entryDir}`,
689
+ ];
203
690
  const excerpt = readTextExcerpt(output, cwd);
204
- if (excerpt) {
205
- parts.push("", "--- excerpt ---", excerpt);
691
+ if (excerpt) parts.push("", "--- excerpt ---", excerpt);
692
+
693
+ return okResult(parts.join("\n"), {
694
+ output,
695
+ artifactDir: scoped.artifactDir,
696
+ url,
697
+ mode,
698
+ highlights,
699
+ cacheHit: false,
700
+ cacheKey: cacheWrite.cacheKey,
701
+ cachePath: `${cacheWrite.entryDir}/${workspaceBasename}`,
702
+ });
703
+ },
704
+ });
705
+
706
+ pi.registerTool({
707
+ name: "web_find_similar",
708
+ label: "Web Find Similar",
709
+ description:
710
+ "Find pages similar to a seed URL (Exa findSimilar analog). Outputs fused search-deep.json.",
711
+ promptSnippet: "Similar pages from seed URL",
712
+ promptGuidelines: WEB_FIND_SIMILAR_GUIDELINES,
713
+ parameters: WebFindSimilarSchema,
714
+
715
+ async execute(_id, params, _signal, _onUpdate, ctx) {
716
+ const cwd = sessionCwd(ctx);
717
+ const webScope = String(params.webScope ?? "").trim() || undefined;
718
+ const url = String(params.url ?? "").trim();
719
+ if (!url) return failResult("web_find_similar: url is required.");
720
+
721
+ const scoped = resolveScopedOutput(
722
+ ctx,
723
+ "search-deep.json",
724
+ params.output ? String(params.output) : undefined,
725
+ webScope,
726
+ );
727
+ const output = scoped.output;
728
+ ensureParentDir(cwd, output);
729
+ const limit = typeof params.limit === "number" ? params.limit : 10;
730
+ const argv = [
731
+ "find-similar",
732
+ url,
733
+ "-o",
734
+ output,
735
+ "--limit",
736
+ String(limit),
737
+ ...(params.fast !== false ? ["--fast"] : []),
738
+ ];
739
+
740
+ const run = runHarnessWeb(MODULE_URL, argv, cwd);
741
+ if (!run.ok) {
742
+ return failResult(
743
+ `web_find_similar failed (exit ${run.exitCode}).\n${run.stderr || run.stdout}`,
744
+ );
206
745
  }
207
746
 
208
- return okResult(parts.join("\n"), { output, url, mode });
747
+ const parts = [run.stdout];
748
+ const summary = summarizeDeepSearchJson(output, cwd);
749
+ if (summary) parts.push("", summary);
750
+ parts.push("", `output: ${output}`, `artifactDir: ${scoped.artifactDir}`);
751
+
752
+ return okResult(parts.join("\n"), {
753
+ output,
754
+ artifactDir: scoped.artifactDir,
755
+ url,
756
+ });
757
+ },
758
+ });
759
+
760
+ pi.registerTool({
761
+ name: "web_contents",
762
+ label: "Web Contents Batch",
763
+ description:
764
+ "Batch-fetch URLs from search-deep.json into markdown (+ optional highlights). Builds evidence bundle.",
765
+ promptSnippet: "Batch fetch after deep search",
766
+ promptGuidelines: WEB_CONTENTS_GUIDELINES,
767
+ parameters: WebContentsSchema,
768
+
769
+ async execute(_id, params, _signal, _onUpdate, ctx) {
770
+ const cwd = sessionCwd(ctx);
771
+ const webScope = String(params.webScope ?? "").trim() || undefined;
772
+ const dirScoped = resolveScopedOutput(
773
+ ctx,
774
+ "contents",
775
+ params.outputDir ? String(params.outputDir) : undefined,
776
+ webScope,
777
+ );
778
+ const outputDir = dirScoped.output.endsWith("/contents")
779
+ ? dirScoped.output
780
+ : `${dirScoped.artifactDir}/contents`;
781
+ mkdirSync(resolve(cwd, outputDir), { recursive: true });
782
+ let fromSearch = String(params.fromSearch ?? "").trim();
783
+ if (fromSearch && !fromSearch.startsWith("/") && !fromSearch.includes("..")) {
784
+ fromSearch = resolveScopedOutput(
785
+ ctx,
786
+ "search-deep.json",
787
+ fromSearch,
788
+ webScope,
789
+ ).output;
790
+ }
791
+ const urls = (params.urls ?? []).map((u) => String(u).trim()).filter(Boolean);
792
+ const limit = typeof params.limit === "number" ? params.limit : 5;
793
+ const hlQuery = String(params.highlightQuery ?? "").trim();
794
+
795
+ const argv = [
796
+ "contents-batch",
797
+ "-o",
798
+ outputDir,
799
+ "--limit",
800
+ String(limit),
801
+ ...(params.fast ? ["--fast"] : []),
802
+ ...(params.highlights && hlQuery
803
+ ? ["--highlights", "--highlight-query", hlQuery]
804
+ : []),
805
+ ...urls,
806
+ ];
807
+ if (fromSearch) {
808
+ argv.splice(1, 0, "--from-search", fromSearch);
809
+ }
810
+ let evidencePath: string | undefined;
811
+ if (params.evidenceBundle && fromSearch) {
812
+ const bundleArg = String(params.evidenceBundle);
813
+ evidencePath =
814
+ bundleArg.startsWith("/") || bundleArg.includes("..")
815
+ ? bundleArg
816
+ : resolveScopedOutput(
817
+ ctx,
818
+ "evidence-bundle.json",
819
+ bundleArg,
820
+ webScope,
821
+ ).output;
822
+ ensureParentDir(cwd, evidencePath);
823
+ argv.push("--evidence-bundle", evidencePath);
824
+ }
825
+
826
+ if (!fromSearch && !urls.length) {
827
+ return failResult("web_contents: provide urls or fromSearch");
828
+ }
829
+
830
+ const run = runHarnessWeb(MODULE_URL, argv, cwd);
831
+ if (!run.ok) {
832
+ return failResult(
833
+ `web_contents failed (exit ${run.exitCode}).\n${run.stderr || run.stdout}`,
834
+ );
835
+ }
836
+
837
+ return okResult(
838
+ `${run.stdout}\n\noutputDir: ${outputDir}\nartifactDir: ${dirScoped.artifactDir}` +
839
+ (evidencePath ? `\nevidence: ${evidencePath}` : ""),
840
+ {
841
+ outputDir,
842
+ artifactDir: dirScoped.artifactDir,
843
+ fromSearch,
844
+ evidenceBundle: evidencePath,
845
+ },
846
+ );
209
847
  },
210
848
  });
211
849
  }