ultimate-pi 0.19.0 → 0.19.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60) hide show
  1. package/.agents/skills/web-retrieval/SKILL.md +163 -0
  2. package/.agents/skills/wiki-autoresearch/SKILL.md +6 -6
  3. package/.pi/SYSTEM.md +30 -12
  4. package/.pi/agents/harness/planning/implementation-researcher.md +1 -1
  5. package/.pi/agents/harness/planning/stack-researcher.md +5 -1
  6. package/.pi/agents/harness/web-retrieval/web-answerer.md +35 -0
  7. package/.pi/agents/harness/web-retrieval/web-criteria-verifier.md +28 -0
  8. package/.pi/agents/harness/web-retrieval/web-gap-analyzer.md +31 -0
  9. package/.pi/agents/harness/web-retrieval/web-query-expander-fast.md +34 -0
  10. package/.pi/agents/harness/web-retrieval/web-query-expander.md +60 -0
  11. package/.pi/agents/harness/web-retrieval/web-summarizer.md +18 -0
  12. package/.pi/extensions/harness-web-guard.ts +2 -1
  13. package/.pi/extensions/harness-web-tools.ts +689 -51
  14. package/.pi/harness/agents.manifest.json +29 -5
  15. package/.pi/harness/agents.policy.yaml +34 -0
  16. package/.pi/harness/docs/adrs/0050-agentic-web-retrieval-stack.md +46 -0
  17. package/.pi/harness/docs/harness-web-search.md +97 -0
  18. package/.pi/harness/env.harness.template +9 -1
  19. package/.pi/harness/examples/web-heuristic-angles.project.yaml +22 -0
  20. package/.pi/harness/web-heuristic-angles.json +278 -0
  21. package/.pi/harness/web-heuristic-angles.yaml +182 -0
  22. package/.pi/lib/agents-policy.mjs +6 -0
  23. package/.pi/lib/harness-subagent-auth.ts +39 -9
  24. package/.pi/lib/harness-subagents-bridge.ts +21 -0
  25. package/.pi/lib/harness-web/artifacts.ts +200 -0
  26. package/.pi/lib/harness-web/cache.ts +369 -0
  27. package/.pi/lib/harness-web/run-cli.ts +42 -2
  28. package/.pi/prompts/harness-plan.md +1 -0
  29. package/.pi/prompts/harness-setup.md +3 -1
  30. package/.pi/scripts/gen-web-heuristic-angles-json.mjs +24 -0
  31. package/.pi/scripts/harness-cli-verify.sh +5 -0
  32. package/.pi/scripts/harness-verify.mjs +78 -0
  33. package/.pi/scripts/harness-web-policy-guard.mjs +1 -1
  34. package/.pi/scripts/harness-web.py +218 -15
  35. package/.pi/scripts/harness_web/deep_search.py +55 -0
  36. package/.pi/scripts/harness_web/evidence_bundle.py +47 -0
  37. package/.pi/scripts/harness_web/find_similar.py +88 -0
  38. package/.pi/scripts/harness_web/heuristic_angles_shipped.py +85 -0
  39. package/.pi/scripts/harness_web/heuristic_config.py +251 -0
  40. package/.pi/scripts/harness_web/highlights.py +47 -0
  41. package/.pi/scripts/harness_web/multi_search.py +59 -0
  42. package/.pi/scripts/harness_web/output.py +24 -0
  43. package/.pi/scripts/harness_web/query_angles.py +116 -0
  44. package/.pi/scripts/harness_web/rank.py +163 -0
  45. package/.pi/scripts/harness_web/scrape.py +30 -0
  46. package/.pi/scripts/tests/test_harness_web_heuristic_config.py +132 -0
  47. package/.pi/scripts/tests/test_harness_web_query_angles.py +45 -0
  48. package/.pi/scripts/tests/test_harness_web_rank.py +56 -0
  49. package/AGENTS.md +2 -2
  50. package/CHANGELOG.md +6 -0
  51. package/package.json +5 -3
  52. package/.agents/skills/scrapling-web/SKILL.md +0 -98
  53. package/.pi/extensions/00-posthog-network-bootstrap.ts +0 -11
  54. package/.pi/scripts/harness_web/__pycache__/__init__.cpython-314.pyc +0 -0
  55. package/.pi/scripts/harness_web/__pycache__/config.cpython-314.pyc +0 -0
  56. package/.pi/scripts/harness_web/__pycache__/output.cpython-314.pyc +0 -0
  57. package/.pi/scripts/harness_web/__pycache__/scrape.cpython-314.pyc +0 -0
  58. package/.pi/scripts/harness_web/__pycache__/search.cpython-314.pyc +0 -0
  59. package/.pi/scripts/harness_web/__pycache__/search_ddg.cpython-314.pyc +0 -0
  60. package/.pi/scripts/harness_web/__pycache__/search_searxng.cpython-314.pyc +0 -0
@@ -0,0 +1,251 @@
1
+ """Load and merge WRS heuristic angle templates from YAML."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import os
7
+ from dataclasses import dataclass
8
+ from functools import lru_cache
9
+ from pathlib import Path
10
+ from typing import Any
11
+
12
+ from .query_angles import SearchAngle
13
+
14
+ try:
15
+ import yaml # type: ignore[import-untyped]
16
+ except ImportError:
17
+ yaml = None # type: ignore[assignment]
18
+
19
+ CONFIG_BASENAME = "web-heuristic-angles.yaml"
20
+ CONFIG_JSON_BASENAME = "web-heuristic-angles.json"
21
+ ENV_CONFIG_FILE = "HARNESS_WEB_HEURISTIC_ANGLES_FILE"
22
+
23
+
24
@dataclass(frozen=True)
class HeuristicAnglesConfig:
    """Parsed heuristic-angle settings consumed by ``build_heuristic_angles``."""

    # Upper bound on the number of angles build_heuristic_angles returns.
    max_angles: int
    # Templates applied to every query; "{query}" is substituted at build time.
    base: tuple[SearchAngle, ...]
    # Category name -> templates; the "default" entry is the fallback bucket.
    categories: dict[str, tuple[SearchAngle, ...]]
29
+
30
+
31
def _project_root() -> Path:
    """Return the resolved project root.

    Uses ``HARNESS_PROJECT_ROOT`` when it is set to a non-blank value,
    otherwise the current working directory.
    """
    override = os.environ.get("HARNESS_PROJECT_ROOT", "").strip()
    chosen = Path(override) if override else Path.cwd()
    return chosen.resolve()
36
+
37
+
38
def _package_root() -> Path | None:
    """Locate the package root that ships the default angle config.

    ``HARNESS_PKG_ROOT`` takes precedence when set. Otherwise the nearest
    ancestor directory of this module that contains
    ``.pi/harness/<CONFIG_BASENAME>`` is returned, or ``None`` if there is none.
    """
    override = os.environ.get("HARNESS_PKG_ROOT", "").strip()
    if override:
        return Path(override).resolve()
    module_path = Path(__file__).resolve()
    return next(
        (
            ancestor
            for ancestor in module_path.parents
            if (ancestor / ".pi" / "harness" / CONFIG_BASENAME).is_file()
        ),
        None,
    )
47
+
48
+
49
def discover_heuristic_config_paths() -> list[Path]:
    """List the existing config files in merge order.

    Order: packaged YAML, packaged JSON, the project's ``.pi/harness`` YAML,
    then the file named by the ``ENV_CONFIG_FILE`` environment variable.
    Missing files and paths that are already listed are skipped.
    """
    found: list[Path] = []

    def add(candidate: Path) -> None:
        if candidate.is_file() and candidate not in found:
            found.append(candidate)

    pkg_root = _package_root()
    if pkg_root is not None:
        harness_dir = pkg_root / ".pi" / "harness"
        add(harness_dir / CONFIG_BASENAME)
        add(harness_dir / CONFIG_JSON_BASENAME)

    add(_project_root() / ".pi" / "harness" / CONFIG_BASENAME)

    env_value = os.environ.get(ENV_CONFIG_FILE, "").strip()
    if env_value:
        add(Path(env_value).expanduser().resolve())
    return found
69
+
70
+
71
def _format_query(template: str, query: str) -> str:
    """Substitute every ``{query}`` placeholder in *template* with the trimmed *query*."""
    needle = query.strip()
    return needle.join(template.split("{query}"))
73
+
74
+
75
def _parse_angle_list(raw: Any, *, source: str) -> list[SearchAngle]:
    """Convert a raw list of angle mappings into ``SearchAngle`` templates.

    ``None`` yields an empty list. Each item takes its id from ``id``, then
    ``name``, then a positional ``angle_<n>`` default, and must carry a
    non-empty ``query``.

    Raises:
        ValueError: *raw* is not a list, an item is not a mapping, or an item
            ends up with an empty id or query. *source* prefixes the message.
    """
    if raw is None:
        return []
    if not isinstance(raw, list):
        raise ValueError(f"{source}: expected list of angle objects")

    parsed: list[SearchAngle] = []
    for pos, entry in enumerate(raw):
        if not isinstance(entry, dict):
            raise ValueError(f"{source}[{pos}]: expected object")
        ident = entry.get("id") or entry.get("name") or f"angle_{pos + 1}"
        angle_id = str(ident).strip()
        template = str(entry.get("query") or "").strip()
        if not (angle_id and template):
            raise ValueError(f"{source}[{pos}]: id and query required")
        why = str(entry.get("rationale") or entry.get("reason") or "").strip()
        parsed.append(SearchAngle(id=angle_id, query=template, rationale=why))
    return parsed
97
+
98
+
99
def _merge_angle_items(existing: list[Any], incoming: list[Any]) -> list[Any]:
    """Overlay *incoming* raw angle items on *existing*, matching on id.

    An incoming mapping whose ``id`` (or ``name``) equals that of an earlier
    item replaces it in place, so later layers override earlier ones while the
    established ordering is kept. Items without a usable id are appended as-is.
    """

    def key_of(item: Any) -> str:
        if not isinstance(item, dict):
            return ""
        return str(item.get("id") or item.get("name") or "").strip()

    merged = list(existing)
    slots: dict[str, int] = {}
    for pos, item in enumerate(merged):
        key = key_of(item)
        if key:
            slots.setdefault(key, pos)
    for item in incoming:
        key = key_of(item)
        if key in slots:  # "" is never registered, so id-less items fall through
            merged[slots[key]] = item
            continue
        if key:
            slots[key] = len(merged)
        merged.append(item)
    return merged


def _merge_config_dict(accum: dict[str, Any], layer: dict[str, Any]) -> dict[str, Any]:
    """Return *accum* with *layer* applied on top; neither input is mutated.

    ``max_angles`` and ``version`` are overwritten when the layer sets them.
    ``base`` and each category list are merged by angle id: a later layer's
    angle replaces an earlier one with the same id, new ids are appended.
    Category names are trimmed and lower-cased; blank names and non-list
    category values are ignored.

    Raises:
        ValueError: the layer's ``base`` is present but not a list.
    """
    out = dict(accum)
    for scalar in ("max_angles", "version"):
        if scalar in layer:
            out[scalar] = layer[scalar]

    layer_base = layer.get("base") or []
    if not isinstance(layer_base, list):
        # Fail with a clear message instead of extending element-wise.
        raise ValueError("base: expected list of angle objects")
    out["base"] = _merge_angle_items(list(out.get("base") or []), layer_base)

    cats: dict[str, list[Any]] = dict(out.get("categories") or {})
    layer_cats = layer.get("categories")
    if isinstance(layer_cats, dict):
        for key, angles in layer_cats.items():
            cat = str(key).strip().lower()
            if not cat:
                continue
            existing = list(cats.get(cat) or [])
            if isinstance(angles, list):
                existing = _merge_angle_items(existing, angles)
            cats[cat] = existing
    out["categories"] = cats
    return out
121
+
122
+
123
def _load_config_file(path: Path) -> dict[str, Any]:
    """Read one heuristic-angles config file and return its root mapping.

    Files with a ``.json`` suffix (case-insensitive) are parsed with the
    stdlib; anything else is parsed as YAML when PyYAML is importable.

    Raises:
        ValueError: PyYAML is needed but missing, the document is malformed
            (JSON or YAML), or its root is not a mapping.
        OSError: the file cannot be read.
    """
    text = path.read_text(encoding="utf-8")
    if path.suffix.lower() == ".json":
        data = json.loads(text)
    elif yaml is not None:
        try:
            data = yaml.safe_load(text)
        except yaml.YAMLError as exc:
            # Callers skip unreadable layers by catching ValueError; PyYAML's
            # own error type is not a ValueError, so translate it here.
            raise ValueError(f"{path}: invalid YAML: {exc}") from exc
    else:
        raise ValueError(f"PyYAML required to load {path} (or use .json)")
    if not isinstance(data, dict):
        raise ValueError(f"{path}: root must be a mapping")
    return data
134
+
135
+
136
def _embedded_builtin_dict() -> dict[str, Any]:
    """Return shipped defaults for use when no discovered config file loaded.

    Tries the packaged JSON file, then the packaged YAML file. If neither is
    present and loadable, falls back to a copy of ``SHIPPED_HEURISTIC_ANGLES``.
    """
    root = _package_root()
    candidates: list[Path] = []
    if root is not None:
        harness_dir = root / ".pi" / "harness"
        candidates = [harness_dir / fname for fname in (CONFIG_JSON_BASENAME, CONFIG_BASENAME)]

    for candidate in candidates:
        if candidate.is_file():
            try:
                return _load_config_file(candidate)
            except (ValueError, json.JSONDecodeError, OSError):
                pass  # fall through to the next candidate

    from .heuristic_angles_shipped import SHIPPED_HEURISTIC_ANGLES

    return dict(SHIPPED_HEURISTIC_ANGLES)
151
+
152
+
153
def heuristic_config_from_merged(merged: dict[str, Any]) -> HeuristicAnglesConfig:
    """Build a ``HeuristicAnglesConfig`` from a merged raw config mapping.

    ``max_angles`` defaults to 5 and is clamped to the range 2..8. Category
    names are trimmed and lower-cased; blank names are dropped. A two-angle
    ``default`` category is injected when the config does not define one.

    Raises:
        ValueError: propagated from ``_parse_angle_list`` for malformed angle
            lists, or from ``int()`` for a non-numeric ``max_angles`` string.
    """
    requested = int(merged.get("max_angles") or 5)
    limit = min(8, max(2, requested))

    base = tuple(_parse_angle_list(merged.get("base"), source="base"))

    by_category: dict[str, tuple[SearchAngle, ...]] = {}
    raw_categories = merged.get("categories")
    if isinstance(raw_categories, dict):
        for raw_name, raw_angles in raw_categories.items():
            name = str(raw_name).strip().lower()
            if name:
                by_category[name] = tuple(
                    _parse_angle_list(raw_angles, source=f"categories.{name}")
                )

    if "default" not in by_category:
        technical = SearchAngle("technical", "{query} how it works architecture", "Technical")
        criticism = SearchAngle("criticism", "{query} limitations criticism", "Counterpoints")
        by_category["default"] = (technical, criticism)

    return HeuristicAnglesConfig(max_angles=limit, base=base, categories=by_category)
180
+
181
+
182
@lru_cache(maxsize=8)
def load_heuristic_angles_config_cached(paths_key: tuple[str, ...]) -> HeuristicAnglesConfig:
    """Load, merge and parse config layers; results are memoised per *paths_key*.

    An empty *paths_key* triggers path discovery. Files that fail to load with
    ``ValueError``/``OSError`` are skipped; if none load, the embedded defaults
    are used instead.
    """
    if paths_key:
        candidates = [Path(raw) for raw in paths_key]
    else:
        candidates = discover_heuristic_config_paths()

    loaded: list[dict[str, Any]] = []
    for candidate in candidates:
        try:
            layer = _load_config_file(candidate)
        except (ValueError, json.JSONDecodeError, OSError):
            continue
        loaded.append(layer)

    if not loaded:
        return heuristic_config_from_merged(_embedded_builtin_dict())

    combined: dict[str, Any] = {}
    for layer in loaded:
        combined = _merge_config_dict(combined, layer)
    return heuristic_config_from_merged(combined)
198
+
199
+
200
def load_heuristic_angles_config() -> HeuristicAnglesConfig:
    """Discover config files and return the parsed configuration (cached per path set)."""
    cache_key = tuple(map(str, discover_heuristic_config_paths()))
    return load_heuristic_angles_config_cached(cache_key)
203
+
204
+
205
def clear_heuristic_config_cache() -> None:
    """Drop all memoised results of ``load_heuristic_angles_config_cached``."""
    load_heuristic_angles_config_cached.cache_clear()
207
+
208
+
209
def build_heuristic_angles(
    query: str,
    *,
    category: str | None = None,
    config: HeuristicAnglesConfig | None = None,
) -> tuple[SearchAngle, ...]:
    """Expand *query* into concrete search angles from the configured templates.

    Base templates come first, followed by the templates of *category*
    (case-insensitive), or of ``default`` when the category is missing or
    empty. Duplicate ids keep their first occurrence, and the result is
    truncated to ``max_angles``.
    """
    settings = config or load_heuristic_angles_config()
    needle = query.strip()
    wanted = (category or "").strip().lower()

    bucket = settings.categories.get(wanted) if wanted else None
    if not bucket:
        bucket = settings.categories.get("default", ())

    # Insertion-ordered dict gives a stable first-wins dedupe by id.
    picked: dict[str, SearchAngle] = {}
    for template in (*settings.base, *bucket):
        if template.id in picked:
            continue
        picked[template.id] = SearchAngle(
            id=template.id,
            query=_format_query(template.query, needle),
            rationale=template.rationale,
        )

    return tuple(list(picked.values())[: settings.max_angles])
@@ -0,0 +1,47 @@
1
+ """Query-aligned excerpt extraction from page markdown."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+ from typing import Any
7
+
8
+
9
def _tokenize(text: str) -> set[str]:
    """Return the distinct lower-cased ``[a-z0-9]`` runs of three or more characters."""
    lowered = text.lower()
    # The pattern already enforces the 3-character minimum.
    return set(re.findall(r"[a-z0-9]{3,}", lowered))
11
+
12
+
13
def extract_highlights(
    markdown: str,
    query: str,
    *,
    max_spans: int = 5,
    max_chars_per_span: int = 400,
) -> list[dict[str, Any]]:
    """Pick the paragraphs of *markdown* that share the most tokens with *query*.

    Paragraphs are split on blank lines (falling back to single lines), and
    those shorter than 40 characters are ignored. Each candidate is scored by
    the fraction of query tokens it contains. The top *max_spans* candidates
    with a positive score are returned, best first with ties in document
    order, as dicts with ``score``, ``paragraph_index`` and ``text``. Text
    longer than *max_chars_per_span* is cut and suffixed with an ellipsis.
    """
    wanted = _tokenize(query)
    if not wanted:
        return []

    blocks = [chunk.strip() for chunk in re.split(r"\n\s*\n", markdown) if chunk.strip()]
    if not blocks:
        blocks = [ln.strip() for ln in markdown.splitlines() if ln.strip()]

    denom = max(len(wanted), 1)
    candidates: list[tuple[float, int, str]] = []
    for position, block_text in enumerate(blocks):
        if len(block_text) < 40:
            continue
        words = _tokenize(block_text)
        if words:
            candidates.append((len(wanted & words) / denom, position, block_text))

    candidates.sort(key=lambda cand: (-cand[0], cand[1]))

    return [
        {
            "score": round(ratio, 4),
            "paragraph_index": position,
            "text": block_text[:max_chars_per_span]
            + ("…" if len(block_text) > max_chars_per_span else ""),
        }
        for ratio, position, block_text in candidates[:max_spans]
        if ratio > 0
    ]
@@ -0,0 +1,59 @@
1
+ """Parallel SERP queries per search angle."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ import time
7
+ from concurrent.futures import ThreadPoolExecutor, as_completed
8
+
9
+ from .config import HarnessWebConfig
10
+ from .query_angles import AnglesPlan, SearchAngle
11
+ from .search import search
12
+
13
+
14
def _concurrency() -> int:
    """Worker count from ``HARNESS_WEB_DEEP_CONCURRENCY``, clamped to 1..8.

    Returns 4 when the variable is unset or not an integer.
    """
    configured = os.environ.get("HARNESS_WEB_DEEP_CONCURRENCY", "4").strip()
    try:
        requested = int(configured)
    except ValueError:
        return 4
    return min(8, max(1, requested))
20
+
21
+
22
def multi_search(
    plan: AnglesPlan,
    *,
    per_angle_limit: int,
    config: HarnessWebConfig,
    rate_limit_ms: int | None = None,
) -> dict[str, list[dict[str, str]]]:
    """Run ``search()`` once per angle of *plan* and return ``angle_id -> hits``.

    Every hit is copied and tagged with ``_angle_id`` and a 1-based
    ``_angle_rank`` (as a string). A plan without angles yields ``{}``; a
    single angle runs inline; otherwise angles run on a thread pool sized by
    ``_concurrency()``. *rate_limit_ms* (default: ``config.rate_limit_ms``) is
    the pause inserted between successive submissions.

    Any exception raised by ``search()`` propagates to the caller.
    """
    delay_ms = config.rate_limit_ms if rate_limit_ms is None else rate_limit_ms
    sleep_sec = delay_ms / 1000.0
    results: dict[str, list[dict[str, str]]] = {}
    angles = list(plan.angles)
    if not angles:
        # ThreadPoolExecutor rejects max_workers=0, so bail out early.
        return results

    def run_one(angle: SearchAngle) -> tuple[str, list[dict[str, str]]]:
        hits = search(angle.query, limit=per_angle_limit, config=config)
        tagged = []
        for position, hit in enumerate(hits, start=1):
            row = dict(hit)
            row["_angle_id"] = angle.id
            row["_angle_rank"] = str(position)
            tagged.append(row)
        return angle.id, tagged

    if len(angles) == 1:
        aid, hits = run_one(angles[0])
        results[aid] = hits
        return results

    with ThreadPoolExecutor(max_workers=min(_concurrency(), len(angles))) as pool:
        futures = []
        for index, angle in enumerate(angles):
            # Space out request starts; sleeping after completion would not
            # throttle anything because all work is already submitted by then.
            if index and sleep_sec > 0:
                time.sleep(sleep_sec)
            futures.append(pool.submit(run_one, angle))
        for fut in as_completed(futures):
            aid, hits = fut.result()
            results[aid] = hits

    return results
@@ -24,6 +24,7 @@ def write_search_results(
24
24
  query: str,
25
25
  *,
26
26
  engine: str,
27
+ tier: str = "standard",
27
28
  ) -> None:
28
29
  """Firecrawl-compatible envelope: data.web[].url|title|description."""
29
30
  write_json(
@@ -31,6 +32,7 @@ def write_search_results(
31
32
  {
32
33
  "query": query,
33
34
  "engine": engine,
35
+ "tier": tier,
34
36
  "data": {
35
37
  "web": [
36
38
  {
@@ -45,6 +47,28 @@ def write_search_results(
45
47
  )
46
48
 
47
49
 
50
def write_deep_search_results(
    path: Path,
    *,
    query: str,
    engine: str,
    tier: str,
    plan_angles: list[dict],
    ranked_web: list[dict],
) -> None:
    """Write the deep-search result envelope to *path* via ``write_json``.

    The envelope carries the query, engine, the angles used and the ranked
    hits under ``data.web``; *tier* is emitted under both ``mode`` and ``tier``.
    """
    envelope = {
        "query": query,
        "engine": engine,
        "mode": tier,
        "tier": tier,
        "angles": plan_angles,
        "data": {"web": ranked_web},
    }
    write_json(path, envelope)
70
+
71
+
48
72
  def write_page_markdown(path: Path, page: Any, *, main_content_only: bool = True) -> None:
49
73
  ensure_parent(path)
50
74
  try:
@@ -0,0 +1,116 @@
1
+ """Parse and validate WRS search angles (YAML/JSON)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import re
7
+ from dataclasses import dataclass
8
+ from pathlib import Path
9
+ from typing import Any
10
+
11
+ try:
12
+ import yaml # type: ignore[import-untyped]
13
+ except ImportError:
14
+ yaml = None # type: ignore[assignment]
15
+
16
+
17
@dataclass(frozen=True)
class SearchAngle:
    """One search angle: an identifier, the query text, and an optional rationale."""

    # Identifier for the angle (parsers default it to "angle_<n>").
    id: str
    # Query string, or a template containing "{query}" in heuristic configs.
    query: str
    # Free-text explanation of why this angle is searched; may be empty.
    rationale: str = ""
22
+
23
+
24
@dataclass(frozen=True)
class AnglesPlan:
    """A set of search angles together with the intent they serve."""

    # The overall intent / original query text.
    intent: str
    # Angles to search, in order.
    angles: tuple[SearchAngle, ...]
    # Optional category label; None when not provided.
    category: str | None = None
29
+
30
+
31
def _heuristic_angles(query: str, *, category: str | None = None) -> AnglesPlan:
    """Build a fallback plan from the configured heuristic templates.

    If the configuration yields fewer than two angles, a hard-coded
    definitional + official-documentation pair is used instead.
    """
    from .heuristic_config import build_heuristic_angles, load_heuristic_angles_config

    needle = query.strip()
    settings = load_heuristic_angles_config()
    angles = build_heuristic_angles(needle, category=category, config=settings)
    if len(angles) < 2:
        definitional = SearchAngle("definitional", needle, "Core intent phrasing")
        official = SearchAngle(
            "official", f"{needle} official documentation", "Authoritative sources"
        )
        angles = (definitional, official)
    return AnglesPlan(intent=needle, angles=angles, category=category or None)
44
+
45
+
46
def _parse_angle_item(raw: Any, idx: int) -> SearchAngle:
    """Parse one entry of an ``angles`` list into a ``SearchAngle``.

    A bare string becomes the query of an angle named ``angle_<idx + 1>``. A
    mapping supplies ``query`` plus optional ``id``/``name`` and
    ``rationale``/``reason``.

    Raises:
        ValueError: the entry is an empty string, is neither a string nor a
            mapping, or has no query.
    """
    fallback_id = f"angle_{idx + 1}"

    if isinstance(raw, str):
        text = raw.strip()
        if text:
            return SearchAngle(id=fallback_id, query=text)
        raise ValueError(f"angles[{idx}]: empty query string")

    if not isinstance(raw, dict):
        raise ValueError(f"angles[{idx}]: expected object or string")

    angle_id = str(raw.get("id") or raw.get("name") or fallback_id).strip()
    query_text = str(raw.get("query") or "").strip()
    if not query_text:
        raise ValueError(f"angles[{idx}]: missing query")
    why = str(raw.get("rationale") or raw.get("reason") or "").strip()
    return SearchAngle(id=angle_id or fallback_id, query=query_text, rationale=why)
60
+
61
+
62
def _load_structured(data: dict[str, Any]) -> AnglesPlan:
    """Validate a decoded angles document and turn it into an ``AnglesPlan``.

    The intent comes from ``intent`` or ``query``, falling back to the first
    angle's query. At most the first eight angles are kept.

    Raises:
        ValueError: ``angles`` is missing, empty, not a list, has fewer than
            two entries, or contains an invalid entry.
    """
    intent = str(data.get("intent") or data.get("query") or "").strip()

    raw_angles = data.get("angles")
    if not (isinstance(raw_angles, list) and raw_angles):
        raise ValueError("angles: expected non-empty list")

    parsed = tuple(_parse_angle_item(entry, pos) for pos, entry in enumerate(raw_angles))
    if len(parsed) < 2:
        raise ValueError("angles: need at least 2 entries for deep search")
    parsed = parsed[:8]

    raw_category = data.get("category")
    category = str(raw_category).strip() if raw_category else None
    return AnglesPlan(intent=intent or parsed[0].query, angles=parsed, category=category)
75
+
76
+
77
def load_angles_file(path: Path) -> AnglesPlan:
    """Read an angles plan from a JSON or YAML file.

    If the file contains a fenced code block (optionally tagged yaml/yml/json),
    only the first fence's contents are parsed. Text starting with ``{`` is
    treated as JSON; everything else as YAML.

    Raises:
        ValueError: the file is empty, its root is not a mapping, or the plan
            is invalid.
        SystemExit: the content is YAML but PyYAML is not installed.
    """
    raw = path.read_text(encoding="utf-8")
    fence = re.search(r"```(?:ya?ml|json)?\s*\n([\s\S]*?)```", raw)
    body = (fence.group(1) if fence else raw).strip()
    if not body:
        raise ValueError(f"empty angles file: {path}")

    data: Any
    if body.startswith("{"):
        data = json.loads(body)
    elif yaml is None:
        raise SystemExit(
            "angles file is YAML but PyYAML is not installed. "
            "Use JSON angles or: pip install pyyaml"
        )
    else:
        data = yaml.safe_load(body)

    if not isinstance(data, dict):
        raise ValueError("angles file root must be an object")
    return _load_structured(data)
100
+
101
+
102
def resolve_angles(
    query: str,
    *,
    angles_file: Path | None = None,
    expand_heuristic: bool = False,
    category: str | None = None,
) -> AnglesPlan:
    """Pick the angles plan for a deep search.

    An explicit *angles_file* wins; otherwise heuristic expansion is used when
    *expand_heuristic* is set.

    Raises:
        SystemExit: neither an angles file nor heuristic expansion was requested.
    """
    if angles_file is not None:
        return load_angles_file(angles_file)
    if not expand_heuristic:
        raise SystemExit(
            "deep search requires --angles-file (.web/angles.yaml from web-query-expander) "
            "or --expand-heuristic for emergency fallback"
        )
    return _heuristic_angles(query, category=category)
@@ -0,0 +1,163 @@
1
+ """URL normalization and RRF fusion for multi-angle SERP results."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import re
6
+ from dataclasses import dataclass, field
7
+ from typing import Any
8
+ from urllib.parse import parse_qs, urlparse, urlunparse
9
+
10
+ RRF_K = 60
11
+
12
+ _TRACKING_PARAMS = frozenset(
13
+ {
14
+ "utm_source",
15
+ "utm_medium",
16
+ "utm_campaign",
17
+ "utm_term",
18
+ "utm_content",
19
+ "fbclid",
20
+ "gclid",
21
+ "mc_cid",
22
+ "mc_eid",
23
+ }
24
+ )
25
+
26
+
27
@dataclass
class RankedHit:
    """A fused search hit with its score and per-angle provenance."""

    url: str
    title: str
    description: str
    score: float
    angle_ids: list[str] = field(default_factory=list)
    ranks: dict[str, int] = field(default_factory=dict)

    def to_web_dict(self) -> dict[str, Any]:
        """Return a plain-dict view; the score is rounded to 6 places and containers are copied."""
        payload: dict[str, Any] = {
            "url": self.url,
            "title": self.title,
            "description": self.description,
        }
        payload["score"] = round(self.score, 6)
        payload["angle_ids"] = [*self.angle_ids]
        payload["ranks"] = {**self.ranks}
        return payload
45
+
46
+
47
def normalize_url(url: str) -> str:
    """Return a canonical form of *url* for de-duplicating search hits.

    Lower-cases scheme and host, drops default ports, userinfo, fragments,
    trailing slashes and known tracking parameters, and sorts the remaining
    query parameters (re-encoded so distinct values cannot collide). Input
    without a host, or that cannot be parsed, is returned stripped but
    otherwise unchanged; blank input yields ``""``.
    """
    from urllib.parse import urlencode

    u = url.strip()
    if not u:
        return ""
    try:
        parsed = urlparse(u)
        port = parsed.port  # raises ValueError for a malformed port
    except ValueError:
        return u
    scheme = (parsed.scheme or "https").lower()
    host = (parsed.hostname or "").lower()
    if not host:
        return u
    if ":" in host:
        # ``hostname`` strips the brackets of IPv6 literals; restore them.
        host = f"[{host}]"
    netloc = host
    is_default_port = (scheme == "http" and port == 80) or (scheme == "https" and port == 443)
    if port and not is_default_port:
        netloc = f"{host}:{port}"
    path = parsed.path or "/"
    if path != "/" and path.endswith("/"):
        # A path made only of slashes must still normalise to the root.
        path = path.rstrip("/") or "/"
    qs = parse_qs(parsed.query, keep_blank_values=False)
    pairs: list[tuple[str, str]] = []
    for key in sorted(qs):
        if key.lower() in _TRACKING_PARAMS:
            continue
        pairs.extend((key, val) for val in qs[key])
    # parse_qs percent-decodes, so re-encode rather than joining raw values.
    return urlunparse((scheme, netloc, path, "", urlencode(pairs), ""))
72
+
73
+
74
def tokenize(text: str) -> set[str]:
    """Return the distinct lower-cased ``[a-z0-9]`` runs of three or more characters."""
    lowered = text.lower()
    # The pattern already enforces the 3-character minimum.
    return set(re.findall(r"[a-z0-9]{3,}", lowered))
76
+
77
+
78
def lexical_rerank(hits: list[RankedHit], intent: str) -> list[RankedHit]:
    """Re-order *hits* after boosting each score by lexical overlap with *intent*.

    The boost is ``0.15 *`` the fraction of intent tokens found in the hit's
    title and description. If *intent* has no tokens, *hits* is returned
    unchanged. Otherwise new ``RankedHit`` objects carrying the boosted score
    are returned, highest first, with ties keeping their input order.
    """
    wanted = tokenize(intent)
    if not wanted:
        return hits

    denom = max(len(wanted), 1)

    def boosted(hit: RankedHit) -> float:
        words = tokenize(f"{hit.title} {hit.description}".lower())
        overlap = len(wanted & words) / denom if words else 0.0
        return hit.score + 0.15 * overlap

    rescored = sorted(
        ((hit, boosted(hit)) for hit in hits),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return [
        RankedHit(
            url=hit.url,
            title=hit.title,
            description=hit.description,
            score=new_score,
            angle_ids=hit.angle_ids,
            ranks=hit.ranks,
        )
        for hit, new_score in rescored
    ]
107
+
108
+
109
def fuse_angle_results(
    per_angle: dict[str, list[dict[str, str]]],
    *,
    final_limit: int = 10,
    intent: str = "",
    rerank_mode: str = "off",
) -> list[RankedHit]:
    """Fuse per-angle result lists with Reciprocal Rank Fusion.

    Hits are grouped by normalised URL; each angle contributes
    ``1 / (RRF_K + rank)`` once per URL, using the best (first) rank at which
    that angle listed it. The first non-empty title and the longest description
    seen are kept, along with the first raw URL. Results are ordered by score,
    then by number of contributing angles, then by best rank. When
    *rerank_mode* is ``"lexical"`` and *intent* is non-empty they are re-ranked
    with ``lexical_rerank``. At most *final_limit* hits are returned.
    """
    accum: dict[str, dict[str, Any]] = {}

    for angle_id, results in per_angle.items():
        for rank_1based, item in enumerate(results, start=1):
            raw_url = (item.get("url") or "").strip()
            norm = normalize_url(raw_url)
            if not norm or not norm.startswith("http"):
                continue
            entry = accum.setdefault(
                norm,
                {
                    "url": raw_url,
                    "title": "",
                    "description": "",
                    "score": 0.0,
                    "angle_ids": [],
                    "ranks": {},
                },
            )
            title = (item.get("title") or "").strip()
            desc = (item.get("description") or "").strip()
            if title and not entry["title"]:
                entry["title"] = title
            if desc and len(desc) > len(entry["description"]):
                entry["description"] = desc
            if angle_id in entry["ranks"]:
                # Same page listed again by this angle (e.g. a tracking-param
                # variant): RRF counts one rank per list, and the first
                # occurrence is the best one, so do not add credit twice.
                continue
            entry["score"] += 1.0 / (RRF_K + rank_1based)
            entry["angle_ids"].append(angle_id)
            entry["ranks"][angle_id] = rank_1based

    hits = [
        RankedHit(
            url=e["url"],
            title=e["title"],
            description=e["description"],
            score=e["score"],
            angle_ids=e["angle_ids"],
            ranks=e["ranks"],
        )
        for e in accum.values()
    ]
    hits.sort(
        key=lambda h: (-h.score, -len(h.angle_ids), min(h.ranks.values()) if h.ranks else 999)
    )

    if rerank_mode == "lexical" and intent:
        hits = lexical_rerank(hits, intent)

    return hits[:final_limit]