ultimate-pi 0.19.0 → 0.20.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.agents/skills/web-retrieval/SKILL.md +163 -0
- package/.agents/skills/wiki-autoresearch/SKILL.md +6 -6
- package/.pi/SYSTEM.md +30 -12
- package/.pi/agents/harness/planning/implementation-researcher.md +1 -1
- package/.pi/agents/harness/planning/stack-researcher.md +5 -1
- package/.pi/agents/harness/running/executor.md +42 -1
- package/.pi/agents/harness/web-retrieval/web-answerer.md +35 -0
- package/.pi/agents/harness/web-retrieval/web-criteria-verifier.md +28 -0
- package/.pi/agents/harness/web-retrieval/web-gap-analyzer.md +31 -0
- package/.pi/agents/harness/web-retrieval/web-query-expander-fast.md +34 -0
- package/.pi/agents/harness/web-retrieval/web-query-expander.md +60 -0
- package/.pi/agents/harness/web-retrieval/web-summarizer.md +18 -0
- package/.pi/extensions/harness-anchored-edit.ts +141 -0
- package/.pi/extensions/harness-web-guard.ts +2 -1
- package/.pi/extensions/harness-web-tools.ts +689 -51
- package/.pi/harness/agents.manifest.json +30 -6
- package/.pi/harness/agents.policy.yaml +37 -4
- package/.pi/harness/docs/adrs/0050-agentic-web-retrieval-stack.md +46 -0
- package/.pi/harness/docs/adrs/0051-hash-anchored-executor-edits.md +41 -0
- package/.pi/harness/docs/adrs/README.md +2 -0
- package/.pi/harness/docs/harness-web-search.md +97 -0
- package/.pi/harness/docs/practice-map.md +11 -0
- package/.pi/harness/env.harness.template +9 -1
- package/.pi/harness/examples/web-heuristic-angles.project.yaml +22 -0
- package/.pi/harness/web-heuristic-angles.json +278 -0
- package/.pi/harness/web-heuristic-angles.yaml +182 -0
- package/.pi/lib/agents-policy.d.mts +4 -0
- package/.pi/lib/agents-policy.mjs +49 -1
- package/.pi/lib/agents-policy.ts +1 -0
- package/.pi/lib/harness-anchored-edit/.hash_anchors +1721 -0
- package/.pi/lib/harness-anchored-edit/anchor-state.ts +320 -0
- package/.pi/lib/harness-anchored-edit/apply-anchored-edits.ts +161 -0
- package/.pi/lib/harness-anchored-edit/edit-executor.ts +146 -0
- package/.pi/lib/harness-anchored-edit/index.ts +9 -0
- package/.pi/lib/harness-anchored-edit/line-protocol.ts +38 -0
- package/.pi/lib/harness-anchored-edit/settings.ts +1 -0
- package/.pi/lib/harness-anchored-edit/task-id.ts +8 -0
- package/.pi/lib/harness-anchored-edit/types.ts +19 -0
- package/.pi/lib/harness-lens/clients/anchored-edit-autopatch.ts +158 -0
- package/.pi/lib/harness-lens/index.ts +24 -7
- package/.pi/lib/harness-subagent-auth.ts +39 -9
- package/.pi/lib/harness-subagents-bridge.ts +24 -1
- package/.pi/lib/harness-web/artifacts.ts +200 -0
- package/.pi/lib/harness-web/cache.ts +369 -0
- package/.pi/lib/harness-web/run-cli.ts +42 -2
- package/.pi/prompts/harness-plan.md +1 -0
- package/.pi/prompts/harness-setup.md +3 -1
- package/.pi/prompts/harness-steer.md +1 -1
- package/.pi/scripts/gen-web-heuristic-angles-json.mjs +24 -0
- package/.pi/scripts/harness-anchored-edit-smoke.mjs +45 -0
- package/.pi/scripts/harness-cli-verify.sh +5 -0
- package/.pi/scripts/harness-verify.mjs +145 -0
- package/.pi/scripts/harness-web-policy-guard.mjs +1 -1
- package/.pi/scripts/harness-web.py +218 -15
- package/.pi/scripts/harness_web/deep_search.py +55 -0
- package/.pi/scripts/harness_web/evidence_bundle.py +47 -0
- package/.pi/scripts/harness_web/find_similar.py +88 -0
- package/.pi/scripts/harness_web/heuristic_angles_shipped.py +85 -0
- package/.pi/scripts/harness_web/heuristic_config.py +251 -0
- package/.pi/scripts/harness_web/highlights.py +47 -0
- package/.pi/scripts/harness_web/multi_search.py +59 -0
- package/.pi/scripts/harness_web/output.py +24 -0
- package/.pi/scripts/harness_web/query_angles.py +116 -0
- package/.pi/scripts/harness_web/rank.py +163 -0
- package/.pi/scripts/harness_web/scrape.py +30 -0
- package/.pi/scripts/run-tests.mjs +64 -0
- package/.pi/scripts/tests/test_harness_web_heuristic_config.py +132 -0
- package/.pi/scripts/tests/test_harness_web_query_angles.py +45 -0
- package/.pi/scripts/tests/test_harness_web_rank.py +56 -0
- package/AGENTS.md +2 -2
- package/CHANGELOG.md +12 -0
- package/THIRD_PARTY_NOTICES.md +7 -0
- package/package.json +7 -4
- package/vendor/pi-subagents/src/agents.ts +5 -0
- package/vendor/pi-subagents/src/subagents.ts +22 -3
- package/.agents/skills/scrapling-web/SKILL.md +0 -98
- package/.pi/extensions/00-posthog-network-bootstrap.ts +0 -11
- package/.pi/scripts/harness_web/__pycache__/__init__.cpython-314.pyc +0 -0
- package/.pi/scripts/harness_web/__pycache__/config.cpython-314.pyc +0 -0
- package/.pi/scripts/harness_web/__pycache__/output.cpython-314.pyc +0 -0
- package/.pi/scripts/harness_web/__pycache__/scrape.cpython-314.pyc +0 -0
- package/.pi/scripts/harness_web/__pycache__/search.cpython-314.pyc +0 -0
- package/.pi/scripts/harness_web/__pycache__/search_ddg.cpython-314.pyc +0 -0
- package/.pi/scripts/harness_web/__pycache__/search_searxng.cpython-314.pyc +0 -0
- package/.pi/scripts/release.sh +0 -338
|
@@ -207,6 +207,59 @@ async function checkHarnessLens(pkgJson) {
|
|
|
207
207
|
ok("no harness-lens UPSTREAM_PIN.md");
|
|
208
208
|
}
|
|
209
209
|
|
|
210
|
+
/**
 * Verify the harness-anchored-edit packaging and extension contract.
 *
 * Checks, in order: package.json "files" lists the library directory, the
 * resolve-to-pi-edit.ts module is absent, apply-anchored-edits.ts and the
 * extension entry point exist, and the extension source contains none of the
 * forbidden markers while still referencing applyAnchoredEditsToFile.
 *
 * @param {object} pkgJson Parsed package.json; only its optional `files`
 *   array is read here.
 */
async function checkHarnessAnchoredEdit(pkgJson) {
	const shipsLibrary = pkgJson.files?.includes(".pi/lib/harness-anchored-edit");
	if (!shipsLibrary) {
		fail('package.json "files" must include .pi/lib/harness-anchored-edit');
	}
	ok("package.json files includes .pi/lib/harness-anchored-edit");

	const libDir = join(ROOT, ".pi", "lib", "harness-anchored-edit");

	// The translate-to-pi-edit shim must be gone: edits are applied natively.
	const legacyResolver = join(libDir, "resolve-to-pi-edit.ts");
	if (await fileExists(legacyResolver)) {
		fail("resolve-to-pi-edit.ts must not exist (native anchored apply)");
	}

	const nativeApply = join(libDir, "apply-anchored-edits.ts");
	const hasNativeApply = await fileExists(nativeApply);
	if (!hasNativeApply) {
		fail("missing .pi/lib/harness-anchored-edit/apply-anchored-edits.ts");
	}

	const extensionFile = join(
		ROOT,
		".pi",
		"extensions",
		"harness-anchored-edit.ts",
	);
	const hasExtension = await fileExists(extensionFile);
	if (!hasExtension) {
		fail("missing .pi/extensions/harness-anchored-edit.ts");
	}

	const extensionSource = await readFile(extensionFile, "utf8");

	// [substring that must NOT appear, failure message], checked in this order.
	const forbiddenMarkers = [
		[
			"HARNESS_ANCHORED_EDIT",
			"harness-anchored-edit must not gate on HARNESS_ANCHORED_EDIT",
		],
		[
			"createEditTool",
			"harness-anchored-edit must not delegate to createEditTool",
		],
		[
			"resolveAnchoredInputToPiEdit",
			"harness-anchored-edit must not use resolveAnchoredInputToPiEdit",
		],
		[
			'pi.on("tool_call"',
			"harness-anchored-edit must not mutate edit input on tool_call",
		],
	];
	for (const [marker, message] of forbiddenMarkers) {
		if (extensionSource.includes(marker)) {
			fail(message);
		}
	}

	const callsNativeApply = extensionSource.includes("applyAnchoredEditsToFile");
	if (!callsNativeApply) {
		fail("harness-anchored-edit must call applyAnchoredEditsToFile");
	}
	ok("harness-anchored-edit first-class contract");
}
|
|
262
|
+
|
|
210
263
|
async function checkSentruxGate() {
|
|
211
264
|
await checkSentruxRules();
|
|
212
265
|
|
|
@@ -278,6 +331,7 @@ async function main() {
|
|
|
278
331
|
await readFile(join(ROOT, "package.json"), "utf-8"),
|
|
279
332
|
);
|
|
280
333
|
await checkHarnessLens(pkgJson);
|
|
334
|
+
await checkHarnessAnchoredEdit(pkgJson);
|
|
281
335
|
|
|
282
336
|
if (!pkgJson.files?.includes("vendor/pi-subagents")) {
|
|
283
337
|
fail(
|
|
@@ -408,6 +462,19 @@ async function main() {
|
|
|
408
462
|
}
|
|
409
463
|
ok("agents.policy.yaml present");
|
|
410
464
|
|
|
465
|
+
const policyYaml = await readFile(AGENTS_POLICY, "utf8");
|
|
466
|
+
if (!/^\s+extension_bundle:\s+executor/m.test(policyYaml)) {
|
|
467
|
+
fail("agents.policy.yaml kinds.executor must set extension_bundle: executor");
|
|
468
|
+
}
|
|
469
|
+
if (
|
|
470
|
+
/harness\/running\/executor:[\s\S]*?extensions:\s+true/m.test(policyYaml)
|
|
471
|
+
) {
|
|
472
|
+
fail(
|
|
473
|
+
"harness/running/executor must not set extensions: true (use kind extension_bundle)",
|
|
474
|
+
);
|
|
475
|
+
}
|
|
476
|
+
ok("executor extension_bundle policy");
|
|
477
|
+
|
|
411
478
|
if (!(await fileExists(AGENTS_MANIFEST))) {
|
|
412
479
|
fail(
|
|
413
480
|
"missing .pi/harness/agents.manifest.json — run node \"$UP_PKG/.pi/scripts/harness-agents-manifest.mjs\" --write",
|
|
@@ -424,9 +491,87 @@ async function main() {
|
|
|
424
491
|
}
|
|
425
492
|
ok("agents.manifest.json in sync");
|
|
426
493
|
|
|
494
|
+
await checkWrsContracts();
|
|
495
|
+
|
|
427
496
|
console.log("\nharness:verify PASS");
|
|
428
497
|
}
|
|
429
498
|
|
|
499
|
+
/**
 * Verify the web-retrieval (WRS) contract files and the marker strings they
 * must contain.
 *
 * Checks run in a fixed order: required documents/sources exist, SYSTEM.md
 * mentions the deep tier, the web-retrieval skill and the pooled cache, the
 * tools extension exposes the deep tier, anglesFile and cache helpers, the
 * CLI context line mentions tier=deep, and the supporting TypeScript, YAML,
 * Python and agent files are present.
 */
async function checkWrsContracts() {
	const inRoot = (...segments) => join(ROOT, ...segments);
	const requireFile = async (target, message) => {
		if (!(await fileExists(target))) fail(message);
	};

	const systemMd = inRoot(".pi", "SYSTEM.md");
	const toolsTs = inRoot(".pi", "extensions", "harness-web-tools.ts");
	const runCli = inRoot(".pi", "lib", "harness-web", "run-cli.ts");
	const skillMd = inRoot(".agents", "skills", "web-retrieval", "SKILL.md");
	const adrMd = inRoot(
		".pi",
		"harness",
		"docs",
		"adrs",
		"0050-agentic-web-retrieval-stack.md",
	);

	for (const required of [systemMd, toolsTs, runCli, skillMd, adrMd]) {
		await requireFile(required, `WRS contract missing file: ${required}`);
	}

	// SYSTEM.md: each requirement accepts either of two spellings.
	const systemText = await readFile(systemMd, "utf-8");
	const documentsDeepTier =
		systemText.includes("tier=deep") || systemText.includes('tier: "deep"');
	if (!documentsDeepTier) {
		fail("SYSTEM.md must document deep tier default for WRS");
	}
	if (!systemText.includes("web-retrieval")) {
		fail("SYSTEM.md must reference web-retrieval skill");
	}
	const documentsCache =
		systemText.includes(".web/cache") ||
		systemText.includes("HARNESS_WEB_CACHE");
	if (!documentsCache) {
		fail("SYSTEM.md must document pooled WRS cache under .web/cache/");
	}

	const toolsText = await readFile(toolsTs, "utf-8");
	if (!toolsText.includes('Literal("deep")')) {
		fail("harness-web-tools.ts must define tier enum including deep");
	}
	if (!toolsText.includes("anglesFile")) {
		fail("harness-web-tools.ts must expose anglesFile on web_search");
	}

	const cliText = await readFile(runCli, "utf-8");
	if (!cliText.includes("tier=deep")) {
		fail("run-cli.ts harnessWebContextLine must mention tier=deep");
	}

	await requireFile(
		inRoot(".pi", "lib", "harness-web", "artifacts.ts"),
		"missing harness-web/artifacts.ts for scoped .web paths",
	);
	await requireFile(
		inRoot(".pi", "lib", "harness-web", "cache.ts"),
		"missing harness-web/cache.ts for pooled .web/cache/",
	);
	const implementsCache =
		toolsText.includes("refreshCache") &&
		toolsText.includes("lookupSearchCache");
	if (!implementsCache) {
		fail("harness-web-tools.ts must implement pooled cache (refreshCache, lookupSearchCache)");
	}

	await requireFile(
		inRoot(".pi", "harness", "web-heuristic-angles.yaml"),
		"missing .pi/harness/web-heuristic-angles.yaml",
	);
	await requireFile(
		inRoot(".pi", "scripts", "harness_web", "heuristic_config.py"),
		"missing harness_web/heuristic_config.py",
	);

	const rankPy = inRoot(".pi", "scripts", "harness_web", "rank.py");
	const deepSearchPy = inRoot(".pi", "scripts", "harness_web", "deep_search.py");
	for (const modulePath of [rankPy, deepSearchPy]) {
		await requireFile(modulePath, `WRS python module missing: ${modulePath}`);
	}

	await requireFile(
		inRoot(".pi", "agents", "harness", "web-retrieval", "web-query-expander.md"),
		"missing web-query-expander agent",
	);

	ok("WRS contracts (SYSTEM.md, tools, modules, web-retrieval skill, ADR)");
}
|
|
574
|
+
|
|
430
575
|
main().catch((err) => {
|
|
431
576
|
console.error(err);
|
|
432
577
|
process.exit(1);
|
|
@@ -18,7 +18,7 @@ const ALLOWED_FILES = new Set([
|
|
|
18
18
|
".pi/scripts/harness-web.py",
|
|
19
19
|
".pi/scripts/harness-web-search.md",
|
|
20
20
|
".pi/scripts/harness-web-policy-guard.mjs",
|
|
21
|
-
".agents/skills/
|
|
21
|
+
".agents/skills/web-retrieval/SKILL.md",
|
|
22
22
|
".pi/scripts/harness-cli-verify.sh",
|
|
23
23
|
".pi/scripts/harness_web/output.py",
|
|
24
24
|
"AGENTS.md",
|
|
@@ -9,6 +9,7 @@ import shutil
|
|
|
9
9
|
import sys
|
|
10
10
|
import time
|
|
11
11
|
from pathlib import Path
|
|
12
|
+
from urllib.parse import urlparse
|
|
12
13
|
|
|
13
14
|
# Re-exec with scrapling's uv-tool Python when the library is not on default python3.
|
|
14
15
|
def _bootstrap_scrapling() -> None:
|
|
@@ -34,10 +35,28 @@ if str(SCRIPT_DIR) not in sys.path:
|
|
|
34
35
|
sys.path.insert(0, str(SCRIPT_DIR))
|
|
35
36
|
|
|
36
37
|
from harness_web.config import HarnessWebConfig, load_config # noqa: E402
|
|
37
|
-
from harness_web.
|
|
38
|
-
from harness_web.
|
|
38
|
+
from harness_web.deep_search import run_deep_search # noqa: E402
|
|
39
|
+
from harness_web.evidence_bundle import build_evidence_bundle, write_evidence_bundle # noqa: E402
|
|
40
|
+
from harness_web.find_similar import run_find_similar # noqa: E402
|
|
41
|
+
from harness_web.output import ( # noqa: E402
|
|
42
|
+
write_deep_search_results,
|
|
43
|
+
write_search_results,
|
|
44
|
+
)
|
|
45
|
+
from harness_web.scrape import ( # noqa: E402
|
|
46
|
+
bulk_scrape,
|
|
47
|
+
map_url,
|
|
48
|
+
scrape_url,
|
|
49
|
+
scrape_url_with_highlights,
|
|
50
|
+
)
|
|
39
51
|
from harness_web.search import search # noqa: E402
|
|
40
52
|
|
|
53
|
+
# Default result count per WRS tier; used when --limit is not given.
TIER_LIMITS: dict[str, int] = {"instant": 5, "standard": 10, "deep": 10, "research": 15}
|
|
59
|
+
|
|
41
60
|
DEFAULT_WEB_DIR = ".web"
|
|
42
61
|
|
|
43
62
|
|
|
@@ -45,26 +64,153 @@ def _default_out(sub: str) -> Path:
|
|
|
45
64
|
return Path(DEFAULT_WEB_DIR) / sub
|
|
46
65
|
|
|
47
66
|
|
|
67
|
+
def _tier_limit(tier: str, cli_limit: int | None) -> int:
    """Return the result limit for a search.

    An explicit ``cli_limit`` (anything other than ``None``) always wins;
    otherwise the tier's entry in ``TIER_LIMITS`` is used, falling back to 10
    for an unknown tier.
    """
    return TIER_LIMITS.get(tier, 10) if cli_limit is None else cli_limit
|
|
71
|
+
|
|
72
|
+
|
|
48
73
|
def cmd_search(args: argparse.Namespace, config: HarnessWebConfig) -> int:
    """Run one search query and write the results JSON.

    The result count is ``args.limit`` when given, otherwise the default for
    the selected tier; a missing or empty ``args.tier`` is treated as
    ``"standard"``. Output goes to ``args.output`` or the default
    ``search.json`` path. Always returns 0.
    """
    tier = getattr(args, "tier", None)
    if not tier:
        tier = "standard"
    max_results = _tier_limit(tier, args.limit)
    destination = Path(args.output or _default_out("search.json"))
    hits = search(args.query, limit=max_results, config=config)
    write_search_results(
        destination,
        hits,
        args.query,
        engine=config.search_engine,
        tier=tier,
    )
    print(f"wrote {destination} ({len(hits)} results, tier={tier})")
    return 0
|
|
54
81
|
|
|
55
82
|
|
|
56
|
-
def
|
|
57
|
-
out = Path(args.output or _default_out("
|
|
58
|
-
|
|
59
|
-
|
|
83
|
+
def cmd_search_deep(args: argparse.Namespace, config: HarnessWebConfig) -> int:
    """Run a multi-angle deep search and write the fused results JSON.

    Angles come from ``args.angles_file`` when set; the heuristic-expansion
    flag and category hint are forwarded unchanged to ``run_deep_search``.
    The output records the plan's angles alongside the ranked hits with
    ``tier="deep"``. Always returns 0.
    """
    destination = Path(args.output or _default_out("search-deep.json"))
    angles_source = None
    if args.angles_file:
        angles_source = Path(args.angles_file)

    plan, fused = run_deep_search(
        args.query,
        config=config,
        angles_file=angles_source,
        expand_heuristic=args.expand_heuristic,
        category=args.category,
        per_angle_limit=args.per_angle_limit,
        final_limit=args.limit,
    )

    # Flatten the plan's angle objects into plain dicts for JSON output.
    serialised_angles = []
    for angle in plan.angles:
        serialised_angles.append(
            {"id": angle.id, "query": angle.query, "rationale": angle.rationale}
        )

    write_deep_search_results(
        destination,
        query=args.query,
        engine=config.search_engine,
        tier="deep",
        plan_angles=serialised_angles,
        ranked_web=fused,
    )
    print(f"wrote {destination} ({len(fused)} fused results, {len(plan.angles)} angles)")
    return 0
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def cmd_find_similar(args: argparse.Namespace, config: HarnessWebConfig) -> int:
    """Find pages similar to ``args.url`` and write them as deep-search JSON.

    The written ``query`` field is the plan intent returned by
    ``run_find_similar`` rather than a user query. The default output path is
    the same ``search-deep.json`` used by the deep search command. Always
    returns 0.
    """
    destination = Path(args.output or _default_out("search-deep.json"))
    plan, similar = run_find_similar(
        args.url,
        config=config,
        final_limit=args.limit,
        per_angle_limit=args.per_angle_limit,
        fast_fetch=args.fast,
    )

    # Flatten the plan's angle objects into plain dicts for JSON output.
    serialised_angles = []
    for angle in plan.angles:
        serialised_angles.append(
            {"id": angle.id, "query": angle.query, "rationale": angle.rationale}
        )

    write_deep_search_results(
        destination,
        query=plan.intent,
        engine=config.search_engine,
        tier="deep",
        plan_angles=serialised_angles,
        ranked_web=similar,
    )
    print(f"wrote {destination} ({len(similar)} similar results)")
    return 0
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def cmd_scrape(args: argparse.Namespace, config: HarnessWebConfig) -> int:
    """Scrape one URL to markdown, optionally extracting highlights.

    Highlights are extracted only when ``--highlights`` is set AND a
    non-blank ``--highlight-query`` is supplied; otherwise a plain scrape is
    performed. Fast versus stealth fetching is decided by
    ``config.use_fast_for_url``. Always returns 0.
    """
    destination = Path(args.output or _default_out("page.md"))
    use_fast = config.use_fast_for_url(args.url, args.fast)
    highlights_target = args.highlights_output
    highlight_query = (args.highlight_query or "").strip()

    if not (args.highlights and highlight_query):
        scrape_url(
            args.url,
            str(destination),
            config=config,
            fast=use_fast,
            wait_ms=args.wait_for,
        )
        mode = "stealth"
        if use_fast:
            mode = "fast"
        print(f"wrote {destination} ({mode})")
        return 0

    scrape_url_with_highlights(
        args.url,
        str(destination),
        highlights_target or str(_default_out("highlights.json")),
        config=config,
        fast=use_fast,
        wait_ms=args.wait_for,
        highlight_query=highlight_query,
    )
    print(f"wrote {destination} (highlights)")
    return 0
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def cmd_contents_batch(args: argparse.Namespace, config: HarnessWebConfig) -> int:
    """Scrape several URLs to markdown and write a ``manifest.json``.

    URLs come from the positional arguments plus, when ``--from-search`` is
    given, the ``data.web[*].url`` entries of that JSON file. Duplicates are
    dropped (first occurrence wins) and at most ``args.limit`` URLs are
    fetched, pausing ``config.rate_limit_ms`` between fetches.

    Each page is written to ``<host>.md`` (dots replaced by underscores).
    When several URLs share a host, later ones get a numeric suffix
    (``<host>_2.md``, ...) so they no longer overwrite each other. Highlight
    files follow the same stem and are recorded in the manifest and passed to
    the evidence bundle.

    Returns 1 when there is nothing to fetch, otherwise 0. Per-URL failures
    are recorded in the manifest and do not abort the batch.
    """
    import json
    import re

    out_dir = Path(args.output or _default_out("contents"))
    out_dir.mkdir(parents=True, exist_ok=True)
    urls: list[str] = list(args.urls or [])
    if args.from_search:
        data = json.loads(Path(args.from_search).read_text(encoding="utf-8"))
        for item in data.get("data", {}).get("web", []):
            u = (item.get("url") or "").strip()
            if u:
                urls.append(u)
    # Order-preserving de-duplication: the same URL given twice (or present
    # both on the command line and in the search file) is fetched once.
    urls = list(dict.fromkeys(urls))
    if not urls:
        print("contents-batch: no URLs", file=sys.stderr)
        return 1

    hl_query = (args.highlight_query or "").strip()
    want_highlights = bool(args.highlights and hl_query)
    manifest: list[dict] = []
    highlight_files: dict[str, Path] = {}
    used_stems: set[str] = set()
    sleep_sec = config.rate_limit_ms / 1000.0
    for i, url in enumerate(urls[: args.limit]):
        if i and sleep_sec > 0:
            time.sleep(sleep_sec)

        # Host-derived file stem. Ordinary hosts map exactly as before
        # ("a.b.com" -> "a_b_com"); other characters such as a port's ":"
        # also become "_", and a URL without a host falls back to "page".
        base = re.sub(r"[^A-Za-z0-9_-]", "_", urlparse(url).netloc) or "page"
        safe = base
        suffix = 1
        while safe in used_stems:
            suffix += 1
            safe = f"{base}_{suffix}"
        used_stems.add(safe)

        md_path = out_dir / f"{safe}.md"
        hl_path = out_dir / f"{safe}.highlights.json" if want_highlights else None
        fast = config.use_fast_for_url(url, args.fast)
        try:
            if hl_path:
                scrape_url_with_highlights(
                    url,
                    str(md_path),
                    str(hl_path),
                    config=config,
                    fast=fast,
                    wait_ms=None,
                    highlight_query=hl_query,
                )
            else:
                scrape_url(url, str(md_path), config=config, fast=fast, wait_ms=None)
        except Exception as err:  # noqa: BLE001 - one bad URL must not abort the batch
            manifest.append({"url": url, "ok": False, "error": str(err)})
        else:
            entry: dict = {"url": url, "markdown": str(md_path), "ok": True}
            if hl_path:
                entry["highlights"] = str(hl_path)
                highlight_files[url] = hl_path
            manifest.append(entry)

    manifest_path = out_dir / "manifest.json"
    manifest_path.write_text(json.dumps({"urls": manifest}, indent=2) + "\n", encoding="utf-8")
    if args.evidence_bundle and args.from_search:
        eb_path = Path(args.evidence_bundle)
        bundle = build_evidence_bundle(
            Path(args.from_search),
            highlight_files=highlight_files or None,
            query=hl_query,
        )
        write_evidence_bundle(eb_path, bundle)
        print(f"wrote {eb_path}")
    print(f"wrote {len(manifest)} entries to {out_dir}")
    return 0
|
|
69
215
|
|
|
70
216
|
|
|
@@ -132,9 +278,41 @@ def build_parser() -> argparse.ArgumentParser:
|
|
|
132
278
|
ps = sub.add_parser("search", help="Search via configured SERP (HARNESS_WEB_SEARCH_ENGINE)")
|
|
133
279
|
ps.add_argument("query", help="Search query")
|
|
134
280
|
ps.add_argument("-o", "--output", help="JSON output path (default: .web/search.json)")
|
|
135
|
-
ps.add_argument("--limit", type=int, default=
|
|
281
|
+
ps.add_argument("--limit", type=int, default=None)
|
|
282
|
+
ps.add_argument(
|
|
283
|
+
"--tier",
|
|
284
|
+
choices=("instant", "standard", "deep", "research"),
|
|
285
|
+
default="standard",
|
|
286
|
+
help="WRS tier (instant=5, standard=10 results)",
|
|
287
|
+
)
|
|
136
288
|
ps.set_defaults(func=cmd_search)
|
|
137
289
|
|
|
290
|
+
pd = sub.add_parser("search-deep", help="Multi-angle SERP fusion (WRS deep)")
|
|
291
|
+
pd.add_argument("query", help="Original research intent")
|
|
292
|
+
pd.add_argument("-o", "--output", help="JSON output (default: .web/search-deep.json)")
|
|
293
|
+
pd.add_argument("--limit", type=int, default=10, help="Final fused result count")
|
|
294
|
+
pd.add_argument("--per-angle-limit", type=int, default=8, help="SERP hits per angle")
|
|
295
|
+
pd.add_argument(
|
|
296
|
+
"--angles-file",
|
|
297
|
+
metavar="YAML",
|
|
298
|
+
help="Angles from web-query-expander (.web/angles.yaml)",
|
|
299
|
+
)
|
|
300
|
+
pd.add_argument(
|
|
301
|
+
"--expand-heuristic",
|
|
302
|
+
action="store_true",
|
|
303
|
+
help="Emergency angle templates without expander subagent",
|
|
304
|
+
)
|
|
305
|
+
pd.add_argument("--category", help="Hint: code|company|people|paper|news")
|
|
306
|
+
pd.set_defaults(func=cmd_search_deep)
|
|
307
|
+
|
|
308
|
+
pf = sub.add_parser("find-similar", help="Pages similar to a seed URL")
|
|
309
|
+
pf.add_argument("url", help="Seed URL")
|
|
310
|
+
pf.add_argument("-o", "--output", help="JSON output (default: .web/search-deep.json)")
|
|
311
|
+
pf.add_argument("--limit", type=int, default=10)
|
|
312
|
+
pf.add_argument("--per-angle-limit", type=int, default=6)
|
|
313
|
+
pf.add_argument("--fast", action="store_true", help="Fast HTTP for seed fetch")
|
|
314
|
+
pf.set_defaults(func=cmd_find_similar)
|
|
315
|
+
|
|
138
316
|
pc = sub.add_parser("scrape", help="Scrape a URL to markdown")
|
|
139
317
|
pc.add_argument("url")
|
|
140
318
|
pc.add_argument("-o", "--output", help="Markdown output (default: .web/page.md)")
|
|
@@ -150,8 +328,33 @@ def build_parser() -> argparse.ArgumentParser:
|
|
|
150
328
|
metavar="MS",
|
|
151
329
|
help="Extra wait after load (stealth mode, milliseconds)",
|
|
152
330
|
)
|
|
331
|
+
pc.add_argument("--highlights", action="store_true", help="Extract query-aligned excerpts")
|
|
332
|
+
pc.add_argument("--highlight-query", help="Query for highlight scoring")
|
|
333
|
+
pc.add_argument(
|
|
334
|
+
"--highlights-output",
|
|
335
|
+
help="Highlights JSON path (default: .web/highlights.json)",
|
|
336
|
+
)
|
|
153
337
|
pc.set_defaults(func=cmd_scrape)
|
|
154
338
|
|
|
339
|
+
pbatch = sub.add_parser("contents-batch", help="Batch scrape URLs to markdown manifest")
|
|
340
|
+
pbatch.add_argument("urls", nargs="*", help="URLs to fetch")
|
|
341
|
+
pbatch.add_argument("-o", "--output", help="Output directory (default: .web/contents)")
|
|
342
|
+
pbatch.add_argument("--limit", type=int, default=5)
|
|
343
|
+
pbatch.add_argument(
|
|
344
|
+
"--from-search",
|
|
345
|
+
metavar="JSON",
|
|
346
|
+
help="URLs from search.json or search-deep.json",
|
|
347
|
+
)
|
|
348
|
+
pbatch.add_argument("--fast", action="store_true")
|
|
349
|
+
pbatch.add_argument("--highlights", action="store_true")
|
|
350
|
+
pbatch.add_argument("--highlight-query", default="")
|
|
351
|
+
pbatch.add_argument(
|
|
352
|
+
"--evidence-bundle",
|
|
353
|
+
metavar="JSON",
|
|
354
|
+
help="Write evidence-bundle.json from --from-search",
|
|
355
|
+
)
|
|
356
|
+
pbatch.set_defaults(func=cmd_contents_batch)
|
|
357
|
+
|
|
155
358
|
pb = sub.add_parser("bulk-scrape", help="Search then scrape multiple URLs")
|
|
156
359
|
pb.add_argument("query", nargs="?", help="Search query when not using --from-search")
|
|
157
360
|
pb.add_argument("-o", "--output", help="Output directory (default: .web/bulk)")
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
"""WRS deep search orchestration."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
from .config import HarnessWebConfig
|
|
9
|
+
from .multi_search import multi_search
|
|
10
|
+
from .query_angles import AnglesPlan, resolve_angles
|
|
11
|
+
from .rank import fuse_angle_results
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _rerank_mode() -> str:
    """Read the rerank mode from the ``HARNESS_WEB_RERANK`` environment variable.

    The value is trimmed and lower-cased; anything other than ``"off"``,
    ``"lexical"`` or ``"embed"`` (including an unset variable) yields
    ``"off"``.
    """
    requested = os.environ.get("HARNESS_WEB_RERANK", "off")
    normalized = requested.strip().lower()
    return normalized if normalized in {"off", "lexical", "embed"} else "off"
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def run_deep_search(
    query: str,
    *,
    config: HarnessWebConfig,
    angles_file: Path | None = None,
    expand_heuristic: bool = False,
    category: str | None = None,
    per_angle_limit: int = 8,
    final_limit: int = 10,
) -> tuple[AnglesPlan, list[dict]]:
    """Resolve search angles, query each one, and fuse the results.

    Args:
        query: Original research intent.
        config: Harness web configuration passed to the per-angle search.
        angles_file: Optional angles file forwarded to ``resolve_angles``.
        expand_heuristic: Forwarded to ``resolve_angles``.
        category: Optional category hint forwarded to ``resolve_angles``.
        per_angle_limit: Result count requested per angle.
        final_limit: Result count requested from the fusion step.

    Returns:
        The resolved plan and the fused hits as plain dicts
        (``to_web_dict()`` of each ranked hit).
    """
    plan = resolve_angles(
        query,
        angles_file=angles_file,
        expand_heuristic=expand_heuristic,
        category=category,
    )
    raw_by_angle = multi_search(plan, per_angle_limit=per_angle_limit, config=config)

    # Keep only the three public fields so internal tags never reach fusion.
    public_fields = ("url", "title", "description")
    cleaned: dict[str, list[dict[str, str]]] = {
        angle_id: [{field: row.get(field, "") for field in public_fields} for row in rows]
        for angle_id, rows in raw_by_angle.items()
    }

    fused = fuse_angle_results(
        cleaned,
        final_limit=final_limit,
        intent=plan.intent,
        rerank_mode=_rerank_mode(),
    )
    return plan, [hit.to_web_dict() for hit in fused]
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
"""Build evidence-bundle.json from search-deep + optional highlight fetches."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def build_evidence_bundle(
    search_deep_path: Path,
    *,
    highlight_files: dict[str, Path] | None = None,
    query: str = "",
) -> dict[str, Any]:
    """Build an evidence bundle from a search results JSON file.

    Args:
        search_deep_path: JSON file whose hits live under ``data.web``.
        highlight_files: Optional map of hit URL to a highlights JSON file;
            when a hit's URL is present and the file can be read and parsed,
            its content is attached as ``"highlights"``.
        query: Fallback intent used when the file has no ``query`` value.

    Returns:
        A dict with ``intent``, ``mode`` (default ``"deep"``), ``engine``
        (default ``""``) and one ``sources`` entry per hit.

    Raises:
        OSError, json.JSONDecodeError: If the search file itself cannot be
            read or parsed. Highlight files are best-effort and never raise.
    """
    data = json.loads(search_deep_path.read_text(encoding="utf-8"))
    intent = data.get("query") or query
    # Tolerate "data": null / "web": null as well as missing keys.
    hits = (data.get("data") or {}).get("web") or []
    sources: list[dict[str, Any]] = []
    for hit in hits:
        url = hit.get("url", "")
        entry: dict[str, Any] = {
            "url": url,
            "title": hit.get("title", ""),
            "description": hit.get("description", ""),
            "score": hit.get("score"),
            "angle_ids": hit.get("angle_ids", []),
        }
        hp = highlight_files.get(url) if highlight_files else None
        if hp is not None:
            try:
                entry["highlights"] = json.loads(hp.read_text(encoding="utf-8"))
            except (OSError, ValueError):
                # Highlights are optional: a missing, unreadable, undecodable
                # or malformed file just leaves the entry without them.
                pass
        sources.append(entry)
    return {
        "intent": intent,
        "mode": data.get("mode", "deep"),
        "engine": data.get("engine", ""),
        "sources": sources,
    }
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def write_evidence_bundle(path: Path, payload: dict[str, Any]) -> None:
    """Write ``payload`` to ``path`` as indented UTF-8 JSON with a trailing newline.

    Missing parent directories are created; non-ASCII characters are written
    verbatim rather than escaped.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", encoding="utf-8") as handle:
        json.dump(payload, handle, indent=2, ensure_ascii=False)
        handle.write("\n")
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
"""Seed-URL discovery (Exa findSimilar analog)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
from .config import HarnessWebConfig
|
|
9
|
+
from .deep_search import run_deep_search
|
|
10
|
+
from .query_angles import AnglesPlan, SearchAngle
|
|
11
|
+
from .rank import RankedHit, fuse_angle_results, normalize_url, tokenize
|
|
12
|
+
from .scrape import fetch_page
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _extract_seed_phrases(url: str, *, config: HarnessWebConfig, fast: bool) -> list[str]:
    """Fetch the seed page and derive up to three search phrases from it.

    The page title is taken from the first non-empty ``<title>`` or ``<h1>``
    node, falling back to the first 200 characters of the page text. The
    phrases are, in order: the title (max 120 chars), its five longest
    tokens, and a ``"similar to <title>"`` phrase (or ``"related pages
    <url>"`` when no title was found).

    Args:
        url: Seed URL to fetch.
        config: Harness web configuration passed to ``fetch_page``.
        fast: Forwarded to ``fetch_page`` as its ``fast`` flag.

    Returns:
        Between one and three non-blank phrases.
    """
    page = fetch_page(url, config=config, fast=fast, wait_ms=None)
    title = ""
    if hasattr(page, "css"):
        for sel in ("title", "h1"):
            nodes = page.css(sel)
            if nodes:
                title = (nodes[0].get_all_text(strip=True) or "").strip()
            if title:
                break
    if not title and hasattr(page, "get_all_text"):
        title = (page.get_all_text(strip=True) or "")[:200].strip()
    title = re.sub(r"\s+", " ", title).strip()

    phrases: list[str] = []
    if title:
        phrases.append(title[:120])
    # Longest tokens first, ties broken alphabetically. Sorting by length
    # alone leaves equal-length tokens in the container's iteration order,
    # which is not stable across runs if tokenize() returns a set
    # (presumably it does; the caller intersects its results with `&`).
    tokens = sorted(tokenize(title), key=lambda tok: (-len(tok), tok))[:5]
    if tokens:
        phrases.append(" ".join(tokens))
    phrases.append(f"similar to {title[:80]}" if title else f"related pages {url}")
    return [p for p in phrases if p.strip()][:3]
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def run_find_similar(
    seed_url: str,
    *,
    config: HarnessWebConfig,
    final_limit: int = 10,
    per_angle_limit: int = 6,
    fast_fetch: bool = True,
) -> tuple[AnglesPlan, list[dict]]:
    """Search for pages similar to ``seed_url``.

    Phrases derived from the seed page become one search angle each. The
    per-angle results are fused (requesting twice ``final_limit`` so that
    dropping the seed still leaves enough candidates), the seed URL itself is
    removed, and each remaining hit's score is raised by up to 0.2 in
    proportion to its token overlap with the seed phrases.

    Returns:
        The generated plan and at most ``final_limit`` hits as plain dicts,
        highest score first.
    """
    seed_phrases = _extract_seed_phrases(seed_url, config=config, fast=fast_fetch)
    rationale = f"Derived from seed {seed_url}"
    plan = AnglesPlan(
        intent=f"pages similar to {seed_url}",
        angles=tuple(
            SearchAngle(f"similar_{position}", phrase, rationale)
            for position, phrase in enumerate(seed_phrases, start=1)
        ),
    )
    from .multi_search import multi_search

    raw_by_angle = multi_search(plan, per_angle_limit=per_angle_limit, config=config)
    public_fields = ("url", "title", "description")
    cleaned: dict[str, list[dict[str, str]]] = {
        angle_id: [{field: row.get(field, "") for field in public_fields} for row in rows]
        for angle_id, rows in raw_by_angle.items()
    }
    fused = fuse_angle_results(cleaned, final_limit=final_limit * 2, intent=plan.intent)

    # Boost hits whose title/description share tokens with the seed phrases.
    seed_key = normalize_url(seed_url)
    seed_vocab = tokenize(" ".join(seed_phrases))
    vocab_size = max(len(seed_vocab), 1)  # guard against division by zero
    boosted: list[RankedHit] = []
    for hit in fused:
        if normalize_url(hit.url) == seed_key:
            # The seed page is not "similar to" itself.
            continue
        text = f"{hit.title} {hit.description}".lower()
        overlap = len(seed_vocab & tokenize(text)) / vocab_size
        boosted.append(
            RankedHit(
                url=hit.url,
                title=hit.title,
                description=hit.description,
                score=hit.score + 0.2 * overlap,
                angle_ids=hit.angle_ids,
                ranks=hit.ranks,
            )
        )
    best_first = sorted(boosted, key=lambda candidate: -candidate.score)
    return plan, [hit.to_web_dict() for hit in best_first[:final_limit]]
|