ultimate-pi 0.19.0 → 0.19.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. package/.agents/skills/web-retrieval/SKILL.md +163 -0
  2. package/.agents/skills/wiki-autoresearch/SKILL.md +6 -6
  3. package/.pi/SYSTEM.md +30 -12
  4. package/.pi/agents/harness/planning/implementation-researcher.md +1 -1
  5. package/.pi/agents/harness/planning/stack-researcher.md +5 -1
  6. package/.pi/agents/harness/web-retrieval/web-answerer.md +35 -0
  7. package/.pi/agents/harness/web-retrieval/web-criteria-verifier.md +28 -0
  8. package/.pi/agents/harness/web-retrieval/web-gap-analyzer.md +31 -0
  9. package/.pi/agents/harness/web-retrieval/web-query-expander-fast.md +34 -0
  10. package/.pi/agents/harness/web-retrieval/web-query-expander.md +60 -0
  11. package/.pi/agents/harness/web-retrieval/web-summarizer.md +18 -0
  12. package/.pi/extensions/harness-web-guard.ts +2 -1
  13. package/.pi/extensions/harness-web-tools.ts +689 -51
  14. package/.pi/harness/agents.manifest.json +29 -5
  15. package/.pi/harness/agents.policy.yaml +34 -0
  16. package/.pi/harness/docs/adrs/0050-agentic-web-retrieval-stack.md +46 -0
  17. package/.pi/harness/docs/harness-web-search.md +97 -0
  18. package/.pi/harness/env.harness.template +9 -1
  19. package/.pi/harness/examples/web-heuristic-angles.project.yaml +22 -0
  20. package/.pi/harness/web-heuristic-angles.json +278 -0
  21. package/.pi/harness/web-heuristic-angles.yaml +182 -0
  22. package/.pi/lib/agents-policy.mjs +6 -0
  23. package/.pi/lib/harness-subagent-auth.ts +39 -9
  24. package/.pi/lib/harness-subagents-bridge.ts +21 -0
  25. package/.pi/lib/harness-web/artifacts.ts +200 -0
  26. package/.pi/lib/harness-web/cache.ts +369 -0
  27. package/.pi/lib/harness-web/run-cli.ts +42 -2
  28. package/.pi/prompts/harness-plan.md +1 -0
  29. package/.pi/prompts/harness-setup.md +3 -1
  30. package/.pi/scripts/gen-web-heuristic-angles-json.mjs +24 -0
  31. package/.pi/scripts/harness-cli-verify.sh +5 -0
  32. package/.pi/scripts/harness-verify.mjs +78 -0
  33. package/.pi/scripts/harness-web-policy-guard.mjs +1 -1
  34. package/.pi/scripts/harness-web.py +218 -15
  35. package/.pi/scripts/harness_web/deep_search.py +55 -0
  36. package/.pi/scripts/harness_web/evidence_bundle.py +47 -0
  37. package/.pi/scripts/harness_web/find_similar.py +88 -0
  38. package/.pi/scripts/harness_web/heuristic_angles_shipped.py +85 -0
  39. package/.pi/scripts/harness_web/heuristic_config.py +251 -0
  40. package/.pi/scripts/harness_web/highlights.py +47 -0
  41. package/.pi/scripts/harness_web/multi_search.py +59 -0
  42. package/.pi/scripts/harness_web/output.py +24 -0
  43. package/.pi/scripts/harness_web/query_angles.py +116 -0
  44. package/.pi/scripts/harness_web/rank.py +163 -0
  45. package/.pi/scripts/harness_web/scrape.py +30 -0
  46. package/.pi/scripts/tests/test_harness_web_heuristic_config.py +132 -0
  47. package/.pi/scripts/tests/test_harness_web_query_angles.py +45 -0
  48. package/.pi/scripts/tests/test_harness_web_rank.py +56 -0
  49. package/AGENTS.md +2 -2
  50. package/CHANGELOG.md +6 -0
  51. package/package.json +5 -3
  52. package/.agents/skills/scrapling-web/SKILL.md +0 -98
  53. package/.pi/extensions/00-posthog-network-bootstrap.ts +0 -11
  54. package/.pi/scripts/harness_web/__pycache__/__init__.cpython-314.pyc +0 -0
  55. package/.pi/scripts/harness_web/__pycache__/config.cpython-314.pyc +0 -0
  56. package/.pi/scripts/harness_web/__pycache__/output.cpython-314.pyc +0 -0
  57. package/.pi/scripts/harness_web/__pycache__/scrape.cpython-314.pyc +0 -0
  58. package/.pi/scripts/harness_web/__pycache__/search.cpython-314.pyc +0 -0
  59. package/.pi/scripts/harness_web/__pycache__/search_ddg.cpython-314.pyc +0 -0
  60. package/.pi/scripts/harness_web/__pycache__/search_searxng.cpython-314.pyc +0 -0
@@ -9,6 +9,7 @@ import shutil
9
9
  import sys
10
10
  import time
11
11
  from pathlib import Path
12
+ from urllib.parse import urlparse
12
13
 
13
14
  # Re-exec with scrapling's uv-tool Python when the library is not on default python3.
14
15
  def _bootstrap_scrapling() -> None:
@@ -34,10 +35,28 @@ if str(SCRIPT_DIR) not in sys.path:
34
35
  sys.path.insert(0, str(SCRIPT_DIR))
35
36
 
36
37
  from harness_web.config import HarnessWebConfig, load_config # noqa: E402
37
- from harness_web.output import write_search_results # noqa: E402
38
- from harness_web.scrape import bulk_scrape, map_url, scrape_url # noqa: E402
38
+ from harness_web.deep_search import run_deep_search # noqa: E402
39
+ from harness_web.evidence_bundle import build_evidence_bundle, write_evidence_bundle # noqa: E402
40
+ from harness_web.find_similar import run_find_similar # noqa: E402
41
+ from harness_web.output import ( # noqa: E402
42
+ write_deep_search_results,
43
+ write_search_results,
44
+ )
45
+ from harness_web.scrape import ( # noqa: E402
46
+ bulk_scrape,
47
+ map_url,
48
+ scrape_url,
49
+ scrape_url_with_highlights,
50
+ )
39
51
  from harness_web.search import search # noqa: E402
40
52
 
53
+ TIER_LIMITS = {
54
+ "instant": 5,
55
+ "standard": 10,
56
+ "deep": 10,
57
+ "research": 15,
58
+ }
59
+
41
60
  DEFAULT_WEB_DIR = ".web"
42
61
 
43
62
 
@@ -45,26 +64,153 @@ def _default_out(sub: str) -> Path:
45
64
  return Path(DEFAULT_WEB_DIR) / sub
46
65
 
47
66
 
67
def _tier_limit(tier: str, cli_limit: int | None) -> int:
    """Resolve how many results a search should return.

    An explicit ``--limit`` value (anything other than ``None``) always wins.
    Otherwise the per-tier default from ``TIER_LIMITS`` is used, and a tier
    name missing from that table falls back to 10.
    """
    return TIER_LIMITS.get(tier, 10) if cli_limit is None else cli_limit
71
+
72
+
48
73
def cmd_search(args: argparse.Namespace, config: HarnessWebConfig) -> int:
    """Run a single SERP query and write the hits to a JSON file.

    The tier comes from ``args.tier`` (``"standard"`` when absent or empty)
    and, together with ``args.limit``, decides the result count. Output goes
    to ``args.output`` or ``.web/search.json``. Always returns 0.
    """
    tier = getattr(args, "tier", None) or "standard"
    max_results = _tier_limit(tier, args.limit)
    destination = Path(args.output or _default_out("search.json"))

    hits = search(args.query, limit=max_results, config=config)
    write_search_results(
        destination,
        hits,
        args.query,
        engine=config.search_engine,
        tier=tier,
    )
    print(f"wrote {destination} ({len(hits)} results, tier={tier})")
    return 0
54
81
 
55
82
 
56
- def cmd_scrape(args: argparse.Namespace, config: HarnessWebConfig) -> int:
57
- out = Path(args.output or _default_out("page.md"))
58
- fast = config.use_fast_for_url(args.url, args.fast)
59
- scrape_url(
83
def cmd_search_deep(args: argparse.Namespace, config: HarnessWebConfig) -> int:
    """Run the multi-angle deep search and persist the fused ranking.

    Angles come from ``args.angles_file`` when given; ``args.expand_heuristic``
    and ``args.category`` are forwarded to ``run_deep_search`` unchanged.
    Results are written to ``args.output`` or ``.web/search-deep.json`` with
    the tier recorded as ``"deep"``. Always returns 0.
    """
    destination = Path(args.output or _default_out("search-deep.json"))
    angles_source = None
    if args.angles_file:
        angles_source = Path(args.angles_file)

    plan, fused = run_deep_search(
        args.query,
        config=config,
        angles_file=angles_source,
        expand_heuristic=args.expand_heuristic,
        category=args.category,
        per_angle_limit=args.per_angle_limit,
        final_limit=args.limit,
    )

    # Flatten the plan's angle objects into plain dicts for JSON output.
    serialised_angles = []
    for angle in plan.angles:
        serialised_angles.append(
            {"id": angle.id, "query": angle.query, "rationale": angle.rationale}
        )

    write_deep_search_results(
        destination,
        query=args.query,
        engine=config.search_engine,
        tier="deep",
        plan_angles=serialised_angles,
        ranked_web=fused,
    )
    print(f"wrote {destination} ({len(fused)} fused results, {len(plan.angles)} angles)")
    return 0
108
+
109
+
110
def cmd_find_similar(args: argparse.Namespace, config: HarnessWebConfig) -> int:
    """Find pages similar to ``args.url`` and write them in deep-search format.

    The output shares the deep-search JSON layout and default path
    (``.web/search-deep.json``); the recorded query is the plan's intent
    string rather than the seed URL itself. Always returns 0.
    """
    destination = Path(args.output or _default_out("search-deep.json"))
    plan, similar = run_find_similar(
        args.url,
        config=config,
        final_limit=args.limit,
        per_angle_limit=args.per_angle_limit,
        fast_fetch=args.fast,
    )

    # Flatten the plan's angle objects into plain dicts for JSON output.
    serialised_angles = []
    for angle in plan.angles:
        serialised_angles.append(
            {"id": angle.id, "query": angle.query, "rationale": angle.rationale}
        )

    write_deep_search_results(
        destination,
        query=plan.intent,
        engine=config.search_engine,
        tier="deep",
        plan_angles=serialised_angles,
        ranked_web=similar,
    )
    print(f"wrote {destination} ({len(similar)} similar results)")
    return 0
132
+
133
+
134
def cmd_scrape(args: argparse.Namespace, config: HarnessWebConfig) -> int:
    """Scrape one URL to markdown, optionally extracting query-aligned highlights.

    Highlights are produced only when ``--highlights`` is set AND a non-blank
    ``--highlight-query`` is supplied; they go to ``--highlights-output`` or
    ``.web/highlights.json``. In every other case a plain scrape is done.
    Markdown is written to ``args.output`` or ``.web/page.md``.

    Returns:
        Always 0; scrape failures propagate as exceptions from the helpers.
    """
    out = Path(args.output or _default_out("page.md"))
    fast = config.use_fast_for_url(args.url, args.fast)
    hl_query = (args.highlight_query or "").strip()

    if args.highlights and hl_query:
        hl_out = args.highlights_output or str(_default_out("highlights.json"))
        scrape_url_with_highlights(
            args.url,
            str(out),
            hl_out,
            config=config,
            fast=fast,
            wait_ms=args.wait_for,
            highlight_query=hl_query,
        )
        print(f"wrote {out} (highlights)")
        return 0

    if args.highlights:
        # The flag used to be dropped without a trace when no query was given.
        # Say so on stderr so stdout stays exactly as callers expect.
        print("scrape: --highlights ignored (no --highlight-query given)", file=sys.stderr)

    scrape_url(
        args.url,
        str(out),
        config=config,
        fast=fast,
        wait_ms=args.wait_for,
    )
    mode = "fast" if fast else "stealth"
    print(f"wrote {out} ({mode})")
    return 0
161
+
162
+
163
def _contents_stem(url: str, used: set[str]) -> str:
    """Return a file-name-safe stem for ``url`` that is not yet in ``used``.

    The stem is the URL's host with dots turned into underscores, as before.
    Other characters that are not portable in file names (``:`` from a port,
    ``@``, brackets) also become underscores, and an empty host yields
    ``"page"``. A repeated host gets ``_2``, ``_3``, ... appended so a later
    scrape never overwrites an earlier one. The chosen stem is added to ``used``.
    """
    import re

    host = urlparse(url).netloc.replace(".", "_")
    base = re.sub(r"[^\w-]+", "_", host).strip("_") or "page"
    stem = base
    serial = 1
    while stem in used:
        serial += 1
        stem = f"{base}_{serial}"
    used.add(stem)
    return stem


def cmd_contents_batch(args: argparse.Namespace, config: HarnessWebConfig) -> int:
    """Scrape a batch of URLs to markdown files and write ``manifest.json``.

    URLs come from the positional ``args.urls`` followed by the ``data.web[*].url``
    entries of ``--from-search``; only the first ``args.limit`` are fetched,
    pausing ``config.rate_limit_ms`` between requests. Each URL gets its own
    ``<stem>.md`` (plus ``<stem>.highlights.json`` when both ``--highlights`` and
    a non-blank ``--highlight-query`` are given) inside the output directory.
    A failed scrape is recorded in the manifest and does not stop the batch.

    Returns:
        1 when no URL was supplied at all, otherwise 0 (even if individual
        scrapes failed; inspect the manifest's ``ok`` flags).
    """
    import json

    out_dir = Path(args.output or _default_out("contents"))
    out_dir.mkdir(parents=True, exist_ok=True)

    urls: list[str] = list(args.urls or [])
    if args.from_search:
        data = json.loads(Path(args.from_search).read_text(encoding="utf-8"))
        # Tolerate explicit nulls ("data": null / "web": null) instead of crashing.
        section = data.get("data") if isinstance(data, dict) else None
        web_hits = section.get("web") if isinstance(section, dict) else None
        for item in web_hits or []:
            if not isinstance(item, dict):
                continue
            u = (item.get("url") or "").strip()
            if u:
                urls.append(u)
    if not urls:
        print("contents-batch: no URLs", file=sys.stderr)
        return 1

    hl_query = (args.highlight_query or "").strip()
    want_highlights = bool(args.highlights and hl_query)
    if args.highlights and not hl_query:
        print(
            "contents-batch: --highlights ignored (no --highlight-query given)",
            file=sys.stderr,
        )

    manifest: list[dict] = []
    used_stems: set[str] = set()
    sleep_sec = config.rate_limit_ms / 1000.0
    for i, url in enumerate(urls[: args.limit]):
        if i and sleep_sec > 0:
            time.sleep(sleep_sec)
        # Several results often share a host; give every URL a distinct file.
        stem = _contents_stem(url, used_stems)
        md_path = out_dir / f"{stem}.md"
        hl_path = out_dir / f"{stem}.highlights.json" if want_highlights else None
        fast = config.use_fast_for_url(url, args.fast)
        try:
            if hl_path is not None:
                scrape_url_with_highlights(
                    url,
                    str(md_path),
                    str(hl_path),
                    config=config,
                    fast=fast,
                    wait_ms=None,
                    highlight_query=hl_query,
                )
            else:
                scrape_url(url, str(md_path), config=config, fast=fast, wait_ms=None)
        except Exception as err:  # noqa: BLE001 - one bad page must not abort the batch
            manifest.append({"url": url, "ok": False, "error": str(err)})
        else:
            record = {"url": url, "markdown": str(md_path), "ok": True}
            if hl_path is not None:
                record["highlights"] = str(hl_path)
            manifest.append(record)

    manifest_path = out_dir / "manifest.json"
    manifest_path.write_text(json.dumps({"urls": manifest}, indent=2) + "\n", encoding="utf-8")
    if args.evidence_bundle and args.from_search:
        eb_path = Path(args.evidence_bundle)
        bundle = build_evidence_bundle(Path(args.from_search), query=hl_query)
        write_evidence_bundle(eb_path, bundle)
        print(f"wrote {eb_path}")
    elif args.evidence_bundle:
        print("contents-batch: --evidence-bundle ignored (requires --from-search)", file=sys.stderr)
    print(f"wrote {len(manifest)} entries to {out_dir}")
    return 0
69
215
 
70
216
 
@@ -132,9 +278,41 @@ def build_parser() -> argparse.ArgumentParser:
132
278
  ps = sub.add_parser("search", help="Search via configured SERP (HARNESS_WEB_SEARCH_ENGINE)")
133
279
  ps.add_argument("query", help="Search query")
134
280
  ps.add_argument("-o", "--output", help="JSON output path (default: .web/search.json)")
135
- ps.add_argument("--limit", type=int, default=5)
281
+ ps.add_argument("--limit", type=int, default=None)
282
+ ps.add_argument(
283
+ "--tier",
284
+ choices=("instant", "standard", "deep", "research"),
285
+ default="standard",
286
+ help="WRS tier (instant=5, standard=10 results)",
287
+ )
136
288
  ps.set_defaults(func=cmd_search)
137
289
 
290
+ pd = sub.add_parser("search-deep", help="Multi-angle SERP fusion (WRS deep)")
291
+ pd.add_argument("query", help="Original research intent")
292
+ pd.add_argument("-o", "--output", help="JSON output (default: .web/search-deep.json)")
293
+ pd.add_argument("--limit", type=int, default=10, help="Final fused result count")
294
+ pd.add_argument("--per-angle-limit", type=int, default=8, help="SERP hits per angle")
295
+ pd.add_argument(
296
+ "--angles-file",
297
+ metavar="YAML",
298
+ help="Angles from web-query-expander (.web/angles.yaml)",
299
+ )
300
+ pd.add_argument(
301
+ "--expand-heuristic",
302
+ action="store_true",
303
+ help="Emergency angle templates without expander subagent",
304
+ )
305
+ pd.add_argument("--category", help="Hint: code|company|people|paper|news")
306
+ pd.set_defaults(func=cmd_search_deep)
307
+
308
+ pf = sub.add_parser("find-similar", help="Pages similar to a seed URL")
309
+ pf.add_argument("url", help="Seed URL")
310
+ pf.add_argument("-o", "--output", help="JSON output (default: .web/search-deep.json)")
311
+ pf.add_argument("--limit", type=int, default=10)
312
+ pf.add_argument("--per-angle-limit", type=int, default=6)
313
+ pf.add_argument("--fast", action="store_true", help="Fast HTTP for seed fetch")
314
+ pf.set_defaults(func=cmd_find_similar)
315
+
138
316
  pc = sub.add_parser("scrape", help="Scrape a URL to markdown")
139
317
  pc.add_argument("url")
140
318
  pc.add_argument("-o", "--output", help="Markdown output (default: .web/page.md)")
@@ -150,8 +328,33 @@ def build_parser() -> argparse.ArgumentParser:
150
328
  metavar="MS",
151
329
  help="Extra wait after load (stealth mode, milliseconds)",
152
330
  )
331
+ pc.add_argument("--highlights", action="store_true", help="Extract query-aligned excerpts")
332
+ pc.add_argument("--highlight-query", help="Query for highlight scoring")
333
+ pc.add_argument(
334
+ "--highlights-output",
335
+ help="Highlights JSON path (default: .web/highlights.json)",
336
+ )
153
337
  pc.set_defaults(func=cmd_scrape)
154
338
 
339
+ pbatch = sub.add_parser("contents-batch", help="Batch scrape URLs to markdown manifest")
340
+ pbatch.add_argument("urls", nargs="*", help="URLs to fetch")
341
+ pbatch.add_argument("-o", "--output", help="Output directory (default: .web/contents)")
342
+ pbatch.add_argument("--limit", type=int, default=5)
343
+ pbatch.add_argument(
344
+ "--from-search",
345
+ metavar="JSON",
346
+ help="URLs from search.json or search-deep.json",
347
+ )
348
+ pbatch.add_argument("--fast", action="store_true")
349
+ pbatch.add_argument("--highlights", action="store_true")
350
+ pbatch.add_argument("--highlight-query", default="")
351
+ pbatch.add_argument(
352
+ "--evidence-bundle",
353
+ metavar="JSON",
354
+ help="Write evidence-bundle.json from --from-search",
355
+ )
356
+ pbatch.set_defaults(func=cmd_contents_batch)
357
+
155
358
  pb = sub.add_parser("bulk-scrape", help="Search then scrape multiple URLs")
156
359
  pb.add_argument("query", nargs="?", help="Search query when not using --from-search")
157
360
  pb.add_argument("-o", "--output", help="Output directory (default: .web/bulk)")
@@ -0,0 +1,55 @@
1
+ """WRS deep search orchestration."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ from pathlib import Path
7
+
8
+ from .config import HarnessWebConfig
9
+ from .multi_search import multi_search
10
+ from .query_angles import AnglesPlan, resolve_angles
11
+ from .rank import fuse_angle_results
12
+
13
+
14
def _rerank_mode() -> str:
    """Read the rerank mode from the ``HARNESS_WEB_RERANK`` environment variable.

    The value is trimmed and lower-cased. Only ``off``, ``lexical`` and
    ``embed`` are recognised; an unset variable or any other value yields
    ``"off"``.
    """
    requested = os.environ.get("HARNESS_WEB_RERANK", "off").strip().lower()
    return requested if requested in {"off", "lexical", "embed"} else "off"
19
+
20
+
21
def run_deep_search(
    query: str,
    *,
    config: HarnessWebConfig,
    angles_file: Path | None = None,
    expand_heuristic: bool = False,
    category: str | None = None,
    per_angle_limit: int = 8,
    final_limit: int = 10,
) -> tuple[AnglesPlan, list[dict]]:
    """Resolve search angles, query each one, and fuse the hits into one ranking.

    Args:
        query: The original research intent.
        config: Harness web configuration passed on to ``multi_search``.
        angles_file: Optional angles file forwarded to ``resolve_angles``.
        expand_heuristic: Forwarded to ``resolve_angles``.
        category: Optional category hint forwarded to ``resolve_angles``.
        per_angle_limit: Number of hits requested per angle.
        final_limit: Number of fused hits to keep.

    Returns:
        The resolved plan and the fused hits converted with ``to_web_dict()``.
    """
    plan = resolve_angles(
        query,
        angles_file=angles_file,
        expand_heuristic=expand_heuristic,
        category=category,
    )
    raw_by_angle = multi_search(plan, per_angle_limit=per_angle_limit, config=config)

    # Keep only url/title/description so internal tags never reach fusion.
    wanted_fields = ("url", "title", "description")
    slim_by_angle: dict[str, list[dict[str, str]]] = {
        angle_id: [{field: row.get(field, "") for field in wanted_fields} for row in rows]
        for angle_id, rows in raw_by_angle.items()
    }

    fused = fuse_angle_results(
        slim_by_angle,
        final_limit=final_limit,
        intent=plan.intent,
        rerank_mode=_rerank_mode(),
    )
    return plan, [hit.to_web_dict() for hit in fused]
@@ -0,0 +1,47 @@
1
+ """Build evidence-bundle.json from search-deep + optional highlight fetches."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ from pathlib import Path
7
+ from typing import Any
8
+
9
+
10
def build_evidence_bundle(
    search_deep_path: Path,
    *,
    highlight_files: dict[str, Path] | None = None,
    query: str = "",
) -> dict[str, Any]:
    """Assemble an evidence bundle from a search results JSON file.

    Args:
        search_deep_path: JSON file whose ``data.web`` list holds the hits.
        highlight_files: Optional mapping of hit URL to a highlights JSON
            file. A parsed file is attached as that source's ``highlights``.
        query: Fallback intent when the search file has no ``query``.

    Returns:
        A dict with ``intent``, ``mode`` (default ``"deep"``), ``engine``
        (default ``""``) and ``sources``, one entry per hit.

    Raises:
        OSError, json.JSONDecodeError: If the search file itself cannot be
            read or parsed. Highlight files are best-effort and never raise.
    """
    data = json.loads(search_deep_path.read_text(encoding="utf-8"))
    intent = data.get("query") or query
    # `or` guards against explicit nulls ("data": null / "web": null), which
    # the plain .get(key, default) form would pass through and crash on.
    hits = (data.get("data") or {}).get("web") or []
    highlight_lookup = highlight_files or {}

    sources: list[dict[str, Any]] = []
    for hit in hits:
        if not isinstance(hit, dict):
            continue
        url = hit.get("url", "")
        entry: dict[str, Any] = {
            "url": url,
            "title": hit.get("title", ""),
            "description": hit.get("description", ""),
            "score": hit.get("score"),
            "angle_ids": hit.get("angle_ids", []),
        }
        highlight_path = highlight_lookup.get(url)
        if highlight_path is not None:
            try:
                entry["highlights"] = json.loads(highlight_path.read_text(encoding="utf-8"))
            except (OSError, ValueError):
                # Highlights are optional. A missing, unreadable, non-UTF-8 or
                # malformed file leaves the source without them rather than
                # aborting the bundle (JSON and Unicode decode errors are
                # ValueError subclasses).
                pass
        sources.append(entry)

    return {
        "intent": intent,
        "mode": data.get("mode", "deep"),
        "engine": data.get("engine", ""),
        "sources": sources,
    }
43
+
44
+
45
def write_evidence_bundle(path: Path, payload: dict[str, Any]) -> None:
    """Write ``payload`` to ``path`` as indented UTF-8 JSON with a trailing newline.

    Missing parent directories are created first. Non-ASCII characters are
    written as-is rather than escaped.
    """
    target_dir = path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    serialised = json.dumps(payload, indent=2, ensure_ascii=False)
    path.write_text(f"{serialised}\n", encoding="utf-8")
@@ -0,0 +1,88 @@
1
+ """Seed-URL discovery (Exa findSimilar analog)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+ from pathlib import Path
7
+
8
+ from .config import HarnessWebConfig
9
+ from .deep_search import run_deep_search
10
+ from .query_angles import AnglesPlan, SearchAngle
11
+ from .rank import RankedHit, fuse_angle_results, normalize_url, tokenize
12
+ from .scrape import fetch_page
13
+
14
+
15
def _extract_seed_phrases(url: str, *, config: HarnessWebConfig, fast: bool) -> list[str]:
    """Derive up to three search phrases from the seed page.

    The page title is taken from the first ``<title>`` or ``<h1>`` node with
    text, falling back to the first 200 characters of the page text.
    Whitespace runs are collapsed. The phrases are, in order: the title
    (max 120 chars), its five longest tokens joined by spaces, and
    ``"similar to <title>"`` (or ``"related pages <url>"`` when no title
    was found).
    """
    page = fetch_page(url, config=config, fast=fast, wait_ms=None)

    title = ""
    if hasattr(page, "css"):
        for sel in ("title", "h1"):
            nodes = page.css(sel)
            if nodes:
                title = (nodes[0].get_all_text(strip=True) or "").strip()
            if title:
                break
    if not title and hasattr(page, "get_all_text"):
        title = (page.get_all_text(strip=True) or "")[:200].strip()
    title = re.sub(r"\s+", " ", title).strip()

    phrases: list[str] = []
    if title:
        phrases.append(title[:120])
        # tokenize() is combined with `&` elsewhere in this module, so it is
        # set-like and has no stable iteration order. Break length ties
        # alphabetically so the same title always yields the same query.
        tokens = sorted(tokenize(title), key=lambda tok: (-len(tok), tok))[:5]
        if tokens:
            phrases.append(" ".join(tokens))
    phrases.append(f"similar to {title[:80]}" if title else f"related pages {url}")
    return [p for p in phrases if p.strip()][:3]
37
+
38
+
39
def run_find_similar(
    seed_url: str,
    *,
    config: HarnessWebConfig,
    final_limit: int = 10,
    per_angle_limit: int = 6,
    fast_fetch: bool = True,
) -> tuple[AnglesPlan, list[dict]]:
    """Find pages similar to ``seed_url`` via phrase-derived search angles.

    Phrases extracted from the seed page become one angle each. Their hits
    are fused (asking for ``final_limit * 2`` so dropping the seed still
    leaves enough), the seed URL itself is removed, and each remaining score
    is raised by ``0.2 *`` the fraction of seed tokens found in the hit's
    title and description.

    Returns:
        The synthetic plan and the top ``final_limit`` hits as web dicts,
        highest score first.
    """
    phrases = _extract_seed_phrases(seed_url, config=config, fast=fast_fetch)
    rationale = f"Derived from seed {seed_url}"
    plan = AnglesPlan(
        intent=f"pages similar to {seed_url}",
        angles=tuple(
            SearchAngle(f"similar_{index + 1}", phrase, rationale)
            for index, phrase in enumerate(phrases)
        ),
    )
    from .multi_search import multi_search

    raw_by_angle = multi_search(plan, per_angle_limit=per_angle_limit, config=config)
    wanted_fields = ("url", "title", "description")
    slim_by_angle: dict[str, list[dict[str, str]]] = {
        angle_id: [{field: row.get(field, "") for field in wanted_fields} for row in rows]
        for angle_id, rows in raw_by_angle.items()
    }
    fused = fuse_angle_results(slim_by_angle, final_limit=final_limit * 2, intent=plan.intent)

    # Boost overlap with seed text
    seed_key = normalize_url(seed_url)
    seed_tokens = tokenize(" ".join(phrases))
    denominator = max(len(seed_tokens), 1)

    boosted: list[RankedHit] = []
    for hit in fused:
        if normalize_url(hit.url) == seed_key:
            # The seed page is never "similar to itself".
            continue
        hit_text = f"{hit.title} {hit.description}".lower()
        shared_fraction = len(seed_tokens & tokenize(hit_text)) / denominator
        boosted.append(
            RankedHit(
                url=hit.url,
                title=hit.title,
                description=hit.description,
                score=hit.score + 0.2 * shared_fraction,
                angle_ids=hit.angle_ids,
                ranks=hit.ranks,
            )
        )

    best_first = sorted(boosted, key=lambda hit: -hit.score)
    return plan, [hit.to_web_dict() for hit in best_first[:final_limit]]
@@ -0,0 +1,85 @@
1
+ """Shipped WRS heuristic angles (stdlib-only). Keep in sync with web-heuristic-angles.yaml."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+ # fmt: off
8
+ SHIPPED_HEURISTIC_ANGLES: dict[str, Any] = {
9
+ "version": 1,
10
+ "max_angles": 8,
11
+ "base": [
12
+ {"id": "definitional", "query": "{query}", "rationale": "Core intent phrasing"},
13
+ {
14
+ "id": "authoritative",
15
+ "query": "{query} official documentation OR specification OR RFC",
16
+ "rationale": "Primary specs and vendor docs",
17
+ },
18
+ ],
19
+ "categories": {
20
+ "code": [
21
+ {"id": "github", "query": "{query} site:github.com", "rationale": "Source, issues, discussions"},
22
+ {"id": "stackoverflow", "query": "{query} site:stackoverflow.com", "rationale": "Debugging and API usage Q&A"},
23
+ {"id": "stackexchange", "query": "{query} site:stackexchange.com", "rationale": "Broader SE network (Super User, Server Fault, etc.)"},
24
+ {"id": "readthedocs", "query": "{query} site:readthedocs.io", "rationale": "OSS library documentation"},
25
+ {"id": "mdn", "query": "{query} site:developer.mozilla.org", "rationale": "Web platform and browser APIs"},
26
+ {"id": "package_registries", "query": "{query} site:npmjs.com OR site:pypi.org OR site:pkg.go.dev OR site:crates.io", "rationale": "Package metadata across major ecosystems"},
27
+ {"id": "microsoft_learn", "query": "{query} site:learn.microsoft.com", "rationale": ".NET, Azure, Windows, and enterprise stacks"},
28
+ {"id": "hacker_news", "query": "{query} site:news.ycombinator.com", "rationale": "High-signal practitioner discussion"},
29
+ {"id": "gitlab", "query": "{query} site:gitlab.com", "rationale": "Alternate host and CI-visible code"},
30
+ {"id": "devto", "query": "{query} site:dev.to OR site:medium.com", "rationale": "Tutorials and implementation writeups"},
31
+ ],
32
+ "paper": [
33
+ {"id": "arxiv", "query": "{query} site:arxiv.org", "rationale": "Preprints and latest ML/CS uploads"},
34
+ {"id": "semantic_scholar", "query": "{query} site:semanticscholar.org", "rationale": "Citations, influences, and PDF links"},
35
+ {"id": "google_scholar", "query": "{query} site:scholar.google.com", "rationale": "Broad academic discovery"},
36
+ {"id": "papers_with_code", "query": "{query} site:paperswithcode.com", "rationale": "Benchmarks tied to implementations"},
37
+ {"id": "openreview", "query": "{query} site:openreview.net", "rationale": "Peer reviews and ML conference submissions"},
38
+ {"id": "acl_anthology", "query": "{query} site:aclanthology.org", "rationale": "NLP and computational linguistics"},
39
+ {"id": "acm_dl", "query": "{query} site:dl.acm.org", "rationale": "ACM proceedings and journals"},
40
+ {"id": "pubmed", "query": "{query} site:pubmed.ncbi.nlm.nih.gov", "rationale": "Biomedical and life-sciences literature"},
41
+ ],
42
+ "news": [
43
+ {"id": "recent", "query": "{query} news 2025 2026", "rationale": "Recency-biased open web"},
44
+ {"id": "wire_reuters", "query": "{query} site:reuters.com", "rationale": "Wire-service reporting"},
45
+ {"id": "wire_ap", "query": "{query} site:apnews.com", "rationale": "Associated Press coverage"},
46
+ {"id": "tech_press", "query": "{query} site:techcrunch.com OR site:theverge.com OR site:arstechnica.com", "rationale": "Technology industry news"},
47
+ {"id": "business_press", "query": "{query} site:bloomberg.com OR site:ft.com OR site:wsj.com", "rationale": "Markets and business context"},
48
+ {"id": "analysis", "query": "{query} in-depth analysis explainer", "rationale": "Long-form journalism and explainers"},
49
+ {"id": "bbc", "query": "{query} site:bbc.com/news", "rationale": "International general news desk"},
50
+ ],
51
+ "company": [
52
+ {"id": "official_site", "query": "{query} official website", "rationale": "Company-controlled messaging"},
53
+ {"id": "crunchbase", "query": "{query} site:crunchbase.com", "rationale": "Funding, investors, and competitors"},
54
+ {"id": "linkedin_company", "query": "{query} site:linkedin.com/company", "rationale": "Headcount, hiring, and positioning"},
55
+ {"id": "sec_filings", "query": "{query} site:sec.gov 10-K OR 10-Q OR S-1", "rationale": "US public-company disclosures"},
56
+ {"id": "g2_reviews", "query": "{query} site:g2.com OR site:capterra.com", "rationale": "B2B software reviews and comparisons"},
57
+ {"id": "company_news", "query": "{query} company announcement press release", "rationale": "Launches, partnerships, and earnings"},
58
+ {"id": "glassdoor", "query": "{query} site:glassdoor.com", "rationale": "Employee sentiment and culture signals"},
59
+ ],
60
+ "people": [
61
+ {"id": "linkedin", "query": "{query} site:linkedin.com/in", "rationale": "Professional profiles"},
62
+ {"id": "github_person", "query": "{query} site:github.com", "rationale": "Open-source footprint for builders"},
63
+ {"id": "wikipedia", "query": "{query} site:en.wikipedia.org", "rationale": "Neutral biographical baseline"},
64
+ {"id": "scholar_person", "query": "{query} site:scholar.google.com", "rationale": "Publication record for researchers"},
65
+ {"id": "interviews", "query": "{query} interview podcast keynote", "rationale": "First-person statements and talks"},
66
+ {"id": "twitter_x", "query": "{query} site:x.com OR site:twitter.com", "rationale": "Public statements and discourse"},
67
+ ],
68
+ "security": [
69
+ {"id": "cve_nvd", "query": "{query} CVE site:nvd.nist.gov", "rationale": "National Vulnerability Database"},
70
+ {"id": "owasp", "query": "{query} site:owasp.org", "rationale": "AppSec standards and cheat sheets"},
71
+ {"id": "cwe", "query": "{query} site:cwe.mitre.org", "rationale": "Weakness taxonomy"},
72
+ {"id": "github_advisories", "query": "{query} site:github.com/advisories OR dependabot", "rationale": "Ecosystem security advisories"},
73
+ {"id": "snyk_blog", "query": "{query} site:snyk.io/blog OR vulnerability", "rationale": "Practitioner security writeups"},
74
+ ],
75
+ "default": [
76
+ {"id": "technical", "query": "{query} how it works architecture internals", "rationale": "Mechanism and design"},
77
+ {"id": "criticism", "query": "{query} limitations criticism drawbacks", "rationale": "Counterpoints and failure modes"},
78
+ {"id": "wikipedia", "query": "{query} site:en.wikipedia.org", "rationale": "Structured overview"},
79
+ {"id": "comparison", "query": "{query} vs alternatives comparison benchmark", "rationale": "Competitive landscape"},
80
+ {"id": "reddit", "query": "{query} site:reddit.com", "rationale": "Community experience reports"},
81
+ {"id": "hn_default", "query": "{query} site:news.ycombinator.com", "rationale": "Practitioner threads when category unknown"},
82
+ ],
83
+ },
84
+ }
85
+ # fmt: on