ultimate-pi 0.19.0 → 0.19.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.agents/skills/web-retrieval/SKILL.md +163 -0
- package/.agents/skills/wiki-autoresearch/SKILL.md +6 -6
- package/.pi/SYSTEM.md +30 -12
- package/.pi/agents/harness/planning/implementation-researcher.md +1 -1
- package/.pi/agents/harness/planning/stack-researcher.md +5 -1
- package/.pi/agents/harness/web-retrieval/web-answerer.md +35 -0
- package/.pi/agents/harness/web-retrieval/web-criteria-verifier.md +28 -0
- package/.pi/agents/harness/web-retrieval/web-gap-analyzer.md +31 -0
- package/.pi/agents/harness/web-retrieval/web-query-expander-fast.md +34 -0
- package/.pi/agents/harness/web-retrieval/web-query-expander.md +60 -0
- package/.pi/agents/harness/web-retrieval/web-summarizer.md +18 -0
- package/.pi/extensions/harness-web-guard.ts +2 -1
- package/.pi/extensions/harness-web-tools.ts +689 -51
- package/.pi/harness/agents.manifest.json +29 -5
- package/.pi/harness/agents.policy.yaml +34 -0
- package/.pi/harness/docs/adrs/0050-agentic-web-retrieval-stack.md +46 -0
- package/.pi/harness/docs/harness-web-search.md +97 -0
- package/.pi/harness/env.harness.template +9 -1
- package/.pi/harness/examples/web-heuristic-angles.project.yaml +22 -0
- package/.pi/harness/web-heuristic-angles.json +278 -0
- package/.pi/harness/web-heuristic-angles.yaml +182 -0
- package/.pi/lib/agents-policy.mjs +6 -0
- package/.pi/lib/harness-subagent-auth.ts +39 -9
- package/.pi/lib/harness-subagents-bridge.ts +21 -0
- package/.pi/lib/harness-web/artifacts.ts +200 -0
- package/.pi/lib/harness-web/cache.ts +369 -0
- package/.pi/lib/harness-web/run-cli.ts +42 -2
- package/.pi/prompts/harness-plan.md +1 -0
- package/.pi/prompts/harness-setup.md +3 -1
- package/.pi/scripts/gen-web-heuristic-angles-json.mjs +24 -0
- package/.pi/scripts/harness-cli-verify.sh +5 -0
- package/.pi/scripts/harness-verify.mjs +78 -0
- package/.pi/scripts/harness-web-policy-guard.mjs +1 -1
- package/.pi/scripts/harness-web.py +218 -15
- package/.pi/scripts/harness_web/deep_search.py +55 -0
- package/.pi/scripts/harness_web/evidence_bundle.py +47 -0
- package/.pi/scripts/harness_web/find_similar.py +88 -0
- package/.pi/scripts/harness_web/heuristic_angles_shipped.py +85 -0
- package/.pi/scripts/harness_web/heuristic_config.py +251 -0
- package/.pi/scripts/harness_web/highlights.py +47 -0
- package/.pi/scripts/harness_web/multi_search.py +59 -0
- package/.pi/scripts/harness_web/output.py +24 -0
- package/.pi/scripts/harness_web/query_angles.py +116 -0
- package/.pi/scripts/harness_web/rank.py +163 -0
- package/.pi/scripts/harness_web/scrape.py +30 -0
- package/.pi/scripts/tests/test_harness_web_heuristic_config.py +132 -0
- package/.pi/scripts/tests/test_harness_web_query_angles.py +45 -0
- package/.pi/scripts/tests/test_harness_web_rank.py +56 -0
- package/AGENTS.md +2 -2
- package/CHANGELOG.md +6 -0
- package/package.json +5 -3
- package/.agents/skills/scrapling-web/SKILL.md +0 -98
- package/.pi/extensions/00-posthog-network-bootstrap.ts +0 -11
- package/.pi/scripts/harness_web/__pycache__/__init__.cpython-314.pyc +0 -0
- package/.pi/scripts/harness_web/__pycache__/config.cpython-314.pyc +0 -0
- package/.pi/scripts/harness_web/__pycache__/output.cpython-314.pyc +0 -0
- package/.pi/scripts/harness_web/__pycache__/scrape.cpython-314.pyc +0 -0
- package/.pi/scripts/harness_web/__pycache__/search.cpython-314.pyc +0 -0
- package/.pi/scripts/harness_web/__pycache__/search_ddg.cpython-314.pyc +0 -0
- package/.pi/scripts/harness_web/__pycache__/search_searxng.cpython-314.pyc +0 -0
|
@@ -0,0 +1,251 @@
|
|
|
1
|
+
"""Load and merge WRS heuristic angle templates from YAML."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import os
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
from functools import lru_cache
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
from typing import Any
|
|
11
|
+
|
|
12
|
+
from .query_angles import SearchAngle
|
|
13
|
+
|
|
14
|
+
try:
|
|
15
|
+
import yaml # type: ignore[import-untyped]
|
|
16
|
+
except ImportError:
|
|
17
|
+
yaml = None # type: ignore[assignment]
|
|
18
|
+
|
|
19
|
+
# File name of the YAML angle config looked up under `<root>/.pi/harness/`.
CONFIG_BASENAME = "web-heuristic-angles.yaml"
# JSON counterpart of the packaged config; parsed with `json`, so no PyYAML needed.
CONFIG_JSON_BASENAME = "web-heuristic-angles.json"
# Environment variable that may name one extra config file to layer last.
ENV_CONFIG_FILE = "HARNESS_WEB_HEURISTIC_ANGLES_FILE"
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@dataclass(frozen=True)
class HeuristicAnglesConfig:
    """Immutable, validated set of heuristic angle templates."""

    # Upper bound on the number of angles `build_heuristic_angles` returns.
    max_angles: int
    # Templates applied to every query, independent of category.
    base: tuple[SearchAngle, ...]
    # Lower-cased category name -> templates added for that category.
    categories: dict[str, tuple[SearchAngle, ...]]
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _project_root() -> Path:
    """Return the resolved project root.

    ``HARNESS_PROJECT_ROOT`` is used when it is set to a non-blank value;
    otherwise the current working directory is used.
    """
    override = os.environ.get("HARNESS_PROJECT_ROOT", "").strip()
    chosen = Path(override) if override else Path.cwd()
    return chosen.resolve()
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _package_root() -> Path | None:
    """Locate the directory that ships the packaged angle config.

    A non-blank ``HARNESS_PKG_ROOT`` is returned (resolved) without further
    checks. Otherwise the ancestors of this file are walked from nearest to
    farthest and the first one containing ``.pi/harness/<CONFIG_BASENAME>``
    is returned. Returns ``None`` when no ancestor qualifies.
    """
    override = os.environ.get("HARNESS_PKG_ROOT", "").strip()
    if override:
        return Path(override).resolve()

    this_file = Path(__file__).resolve()
    matches = (
        parent
        for parent in this_file.parents
        if (parent / ".pi" / "harness" / CONFIG_BASENAME).is_file()
    )
    return next(matches, None)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
def discover_heuristic_config_paths() -> list[Path]:
    """Return the existing config files in layering order, without duplicates.

    Order: packaged YAML, packaged JSON, the project's
    ``.pi/harness/<CONFIG_BASENAME>``, then the file named by the
    ``ENV_CONFIG_FILE`` environment variable. Candidates that are not
    regular files are left out.
    """
    found: list[Path] = []

    def _add(candidate: Path) -> None:
        # Keep only real files, and each path at most once.
        if candidate.is_file() and candidate not in found:
            found.append(candidate)

    pkg_root = _package_root()
    if pkg_root is not None:
        harness_dir = pkg_root / ".pi" / "harness"
        _add(harness_dir / CONFIG_BASENAME)
        _add(harness_dir / CONFIG_JSON_BASENAME)

    _add(_project_root() / ".pi" / "harness" / CONFIG_BASENAME)

    env_value = os.environ.get(ENV_CONFIG_FILE, "").strip()
    if env_value:
        _add(Path(env_value).expanduser().resolve())

    return found
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def _format_query(template: str, query: str) -> str:
    """Fill every ``{query}`` placeholder in *template* with the trimmed *query*."""
    trimmed = query.strip()
    return template.replace("{query}", trimmed)
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def _parse_angle_list(raw: Any, *, source: str) -> list[SearchAngle]:
    """Validate raw angle mappings and convert them to ``SearchAngle`` templates.

    ``None`` yields an empty list. Each entry takes its id from ``id`` or
    ``name`` (falling back to a positional ``angle_<n>``), its template from
    ``query`` and its rationale from ``rationale`` or ``reason``.

    Raises:
        ValueError: if *raw* is not a list, an entry is not a mapping, or an
            entry ends up without an id or a query. *source* prefixes the
            message so the offending config section can be identified.
    """
    if raw is None:
        return []
    if not isinstance(raw, list):
        raise ValueError(f"{source}: expected list of angle objects")

    parsed: list[SearchAngle] = []
    for pos, entry in enumerate(raw):
        if not isinstance(entry, dict):
            raise ValueError(f"{source}[{pos}]: expected object")
        fallback_id = f"angle_{pos + 1}"
        angle_id = str(entry.get("id") or entry.get("name") or fallback_id).strip()
        template = str(entry.get("query") or "").strip()
        if not (angle_id and template):
            raise ValueError(f"{source}[{pos}]: id and query required")
        why = str(entry.get("rationale") or entry.get("reason") or "").strip()
        parsed.append(SearchAngle(id=angle_id, query=template, rationale=why))
    return parsed
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def _angle_key(item: Any) -> str | None:
    """Return the explicit id (``id`` or ``name``) of a raw angle entry, if any."""
    if not isinstance(item, dict):
        return None
    key = str(item.get("id") or item.get("name") or "").strip()
    return key or None


def _merge_angle_items(existing: list[Any], incoming: Any) -> list[Any]:
    """Overlay *incoming* raw angle entries on *existing*; later entries win on id.

    An incoming entry whose id matches an existing one replaces it in place,
    keeping the original position. Entries with a new id, or with no explicit
    id, are appended. A non-list *incoming* is ignored. *existing* is not
    mutated.
    """
    merged = list(existing)
    if not isinstance(incoming, list):
        return merged
    position_by_id: dict[str, int] = {}
    for pos, item in enumerate(merged):
        key = _angle_key(item)
        if key is not None:
            # First occurrence is the one a first-wins dedupe would keep.
            position_by_id.setdefault(key, pos)
    for item in incoming:
        key = _angle_key(item)
        if key is not None and key in position_by_id:
            merged[position_by_id[key]] = item
            continue
        if key is not None:
            position_by_id[key] = len(merged)
        merged.append(item)
    return merged


def _merge_config_dict(accum: dict[str, Any], layer: dict[str, Any]) -> dict[str, Any]:
    """Return *accum* overlaid with *layer*, without mutating either.

    ``max_angles`` and ``version`` are replaced when *layer* defines them.
    ``base`` and each ``categories`` list (category names are trimmed and
    lower-cased) are merged per angle id so that a later layer overrides an
    earlier one. Plain concatenation would let the earlier layer win, because
    angles are later deduplicated first-wins by id.
    """
    out = dict(accum)
    for scalar in ("max_angles", "version"):
        if scalar in layer:
            out[scalar] = layer[scalar]

    out["base"] = _merge_angle_items(list(out.get("base") or []), layer.get("base"))

    cats: dict[str, list[Any]] = dict(out.get("categories") or {})
    layer_cats = layer.get("categories")
    if isinstance(layer_cats, dict):
        for key, angles in layer_cats.items():
            cat = str(key).strip().lower()
            if not cat:
                continue
            cats[cat] = _merge_angle_items(list(cats.get(cat) or []), angles)
    out["categories"] = cats
    return out
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def _load_config_file(path: Path) -> dict[str, Any]:
    """Read one config layer from *path*.

    Files with a ``.json`` suffix (case-insensitive) are parsed as JSON;
    anything else is parsed as YAML via ``yaml.safe_load``.

    Raises:
        ValueError: if PyYAML is unavailable for a non-JSON file, the content
            is malformed, or the document root is not a mapping.
            ``json.JSONDecodeError`` is a ``ValueError`` subclass; YAML parse
            errors are re-raised as ``ValueError``.
        OSError: if the file cannot be read.
    """
    text = path.read_text(encoding="utf-8")
    if path.suffix.lower() == ".json":
        data = json.loads(text)
    elif yaml is not None:
        try:
            data = yaml.safe_load(text)
        except yaml.YAMLError as err:
            # Callers skip a broken layer by catching ValueError/OSError.
            # yaml.YAMLError is neither, so translate it; otherwise one
            # malformed override file aborts config loading entirely.
            raise ValueError(f"{path}: invalid YAML: {err}") from err
    else:
        raise ValueError(f"PyYAML required to load {path} (or use .json)")
    if not isinstance(data, dict):
        raise ValueError(f"{path}: root must be a mapping")
    return data
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def _embedded_builtin_dict() -> dict[str, Any]:
    """Return default config data for when no discovered layer could be loaded.

    Tries the packaged JSON file, then the packaged YAML file. If the package
    root is unknown, or neither file exists and loads, a shallow copy of the
    defaults embedded in ``heuristic_angles_shipped`` is returned.
    """
    root = _package_root()
    basenames = (CONFIG_JSON_BASENAME, CONFIG_BASENAME) if root is not None else ()
    for basename in basenames:
        candidate = root / ".pi" / "harness" / basename
        if candidate.is_file():
            try:
                return _load_config_file(candidate)
            except (ValueError, json.JSONDecodeError, OSError):
                # Unreadable or malformed: try the next candidate.
                continue

    from .heuristic_angles_shipped import SHIPPED_HEURISTIC_ANGLES

    return dict(SHIPPED_HEURISTIC_ANGLES)
|
|
151
|
+
|
|
152
|
+
|
|
153
|
+
def heuristic_config_from_merged(merged: dict[str, Any]) -> HeuristicAnglesConfig:
    """Validate merged raw config data and freeze it into a config object.

    ``max_angles`` defaults to 5 when missing or falsy and is clamped to the
    range 2..8. Category names are trimmed and lower-cased; blank names are
    dropped. A two-angle ``default`` category is supplied when the data does
    not define one.

    Raises:
        ValueError: if ``max_angles`` is not integer-like or an angle list
            fails validation.
    """
    requested = int(merged.get("max_angles") or 5)
    limit = min(max(requested, 2), 8)

    base = tuple(_parse_angle_list(merged.get("base"), source="base"))

    by_category: dict[str, tuple[SearchAngle, ...]] = {}
    raw_categories = merged.get("categories")
    category_items = raw_categories.items() if isinstance(raw_categories, dict) else ()
    for name, raw_angles in category_items:
        key = str(name).strip().lower()
        if key:
            by_category[key] = tuple(
                _parse_angle_list(raw_angles, source=f"categories.{key}")
            )

    by_category.setdefault(
        "default",
        (
            SearchAngle("technical", "{query} how it works architecture", "Technical"),
            SearchAngle("criticism", "{query} limitations criticism", "Counterpoints"),
        ),
    )

    return HeuristicAnglesConfig(max_angles=limit, base=base, categories=by_category)
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
@lru_cache(maxsize=8)
def load_heuristic_angles_config_cached(paths_key: tuple[str, ...]) -> HeuristicAnglesConfig:
    """Load, merge and validate the config layers named by *paths_key*.

    An empty key triggers path discovery. Layers that cannot be read or
    parsed are skipped; if none load, the built-in defaults are used.
    Results are memoised on *paths_key* only, so edits to a file are not
    seen until the cache is cleared.
    """
    if paths_key:
        candidates = [Path(item) for item in paths_key]
    else:
        candidates = discover_heuristic_config_paths()

    loaded: list[dict[str, Any]] = []
    for candidate in candidates:
        try:
            loaded.append(_load_config_file(candidate))
        except (ValueError, json.JSONDecodeError, OSError):
            continue

    if not loaded:
        return heuristic_config_from_merged(_embedded_builtin_dict())

    combined: dict[str, Any] = {}
    for layer in loaded:
        combined = _merge_config_dict(combined, layer)
    return heuristic_config_from_merged(combined)
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
def load_heuristic_angles_config() -> HeuristicAnglesConfig:
    """Return the config for the currently discoverable files (cached per path set)."""
    cache_key = tuple(map(str, discover_heuristic_config_paths()))
    return load_heuristic_angles_config_cached(cache_key)
|
|
203
|
+
|
|
204
|
+
|
|
205
|
+
def clear_heuristic_config_cache() -> None:
    """Drop every memoised config so the next load re-reads the files."""
    load_heuristic_angles_config_cached.cache_clear()
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
def build_heuristic_angles(
    query: str,
    *,
    category: str | None = None,
    config: HeuristicAnglesConfig | None = None,
) -> tuple[SearchAngle, ...]:
    """Expand *query* into concrete search angles from the configured templates.

    Base templates come first, followed by the templates of *category*
    (trimmed, lower-cased). The ``default`` category is used when *category*
    is blank, unknown or has no templates. Angles are deduplicated by id with
    the first occurrence winning, then truncated to ``max_angles``.

    Args:
        query: User query substituted into each template's ``{query}`` slot.
        category: Optional category name selecting extra templates.
        config: Pre-loaded config; loaded from disk when omitted.
    """
    settings = config or load_heuristic_angles_config()
    needle = query.strip()
    wanted = (category or "").strip().lower()

    category_templates = settings.categories.get(wanted) if wanted else None
    if not category_templates:
        category_templates = settings.categories.get("default", ())

    # Dict insertion order gives a stable first-wins dedupe by id.
    picked: dict[str, SearchAngle] = {}
    for template in (*settings.base, *category_templates):
        if template.id in picked:
            continue
        picked[template.id] = SearchAngle(
            id=template.id,
            query=_format_query(template.query, needle),
            rationale=template.rationale,
        )

    return tuple(list(picked.values())[: settings.max_angles])
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
"""Query-aligned excerpt extraction from page markdown."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def _tokenize(text: str) -> set[str]:
    """Return the distinct lower-cased ``[a-z0-9]`` runs of 3+ characters in *text*."""
    # The `{3,}` quantifier already enforces the minimum token length.
    return set(re.findall(r"[a-z0-9]{3,}", text.lower()))
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def extract_highlights(
    markdown: str,
    query: str,
    *,
    max_spans: int = 5,
    max_chars_per_span: int = 400,
) -> list[dict[str, Any]]:
    """Pick the paragraphs of *markdown* that best overlap the tokens of *query*.

    Paragraphs are separated by blank lines; those shorter than 40 characters
    are ignored. A paragraph's score is the fraction of query tokens it
    contains. Results are ordered by descending score, then document order.

    Args:
        markdown: Page text to excerpt.
        query: Text whose tokens drive the scoring.
        max_spans: Maximum number of excerpts; values <= 0 yield no excerpts.
        max_chars_per_span: Excerpt length cap; longer paragraphs are cut and
            suffixed with an ellipsis. Negative values are treated as 0.

    Returns:
        Dicts with ``score`` (rounded to 4 decimals), ``paragraph_index``
        (index among the non-blank paragraphs) and ``text``.
    """
    q_tokens = _tokenize(query)
    # A negative max_spans would otherwise slice from the wrong end below.
    if not q_tokens or max_spans <= 0:
        return []
    span_chars = max(max_chars_per_span, 0)

    # Any non-whitespace input yields at least one paragraph here, so no
    # line-based fallback is needed.
    paragraphs = [p.strip() for p in re.split(r"\n\s*\n", markdown) if p.strip()]

    scored: list[tuple[float, int, str]] = []
    for idx, para in enumerate(paragraphs):
        if len(para) < 40:
            continue
        overlap = len(q_tokens & _tokenize(para)) / len(q_tokens)
        if overlap > 0:
            scored.append((overlap, idx, para))

    scored.sort(key=lambda x: (-x[0], x[1]))

    out: list[dict[str, Any]] = []
    for score, idx, para in scored[:max_spans]:
        text = para[:span_chars]
        if len(para) > span_chars:
            text += "…"
        out.append({"score": round(score, 4), "paragraph_index": idx, "text": text})
    return out
|
|
@@ -0,0 +1,59 @@
|
|
|
1
|
+
"""Parallel SERP queries per search angle."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
import time
|
|
7
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
8
|
+
|
|
9
|
+
from .config import HarnessWebConfig
|
|
10
|
+
from .query_angles import AnglesPlan, SearchAngle
|
|
11
|
+
from .search import search
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _concurrency() -> int:
    """Return the worker count from ``HARNESS_WEB_DEEP_CONCURRENCY``.

    The value is clamped to 1..8; 4 is used when the variable is unset or not
    an integer.
    """
    configured = os.environ.get("HARNESS_WEB_DEEP_CONCURRENCY", "4").strip()
    try:
        workers = int(configured)
    except ValueError:
        return 4
    return min(max(workers, 1), 8)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def multi_search(
    plan: AnglesPlan,
    *,
    per_angle_limit: int,
    config: HarnessWebConfig,
    rate_limit_ms: int | None = None,
) -> dict[str, list[dict[str, str]]]:
    """Run ``search()`` once per angle of *plan* and return ``angle_id -> hits``.

    Every hit is copied and tagged with ``_angle_id`` and a 1-based
    ``_angle_rank`` (stored as a string). A single angle runs inline; several
    angles run on a thread pool sized by ``_concurrency()``.

    Args:
        plan: Angles to query.
        per_angle_limit: ``limit`` passed to each ``search()`` call.
        config: Search configuration; also supplies the default rate limit.
        rate_limit_ms: Overrides ``config.rate_limit_ms`` when not ``None``.

    Returns:
        Mapping keyed in plan order (independent of thread completion order);
        empty when the plan has no angles.
    """
    sleep_sec = (rate_limit_ms if rate_limit_ms is not None else config.rate_limit_ms) / 1000.0
    angles = list(plan.angles)
    if not angles:
        # ThreadPoolExecutor raises ValueError for max_workers=0.
        return {}

    def run_one(angle: SearchAngle) -> tuple[str, list[dict[str, str]]]:
        hits = search(angle.query, limit=per_angle_limit, config=config)
        tagged = []
        for i, h in enumerate(hits):
            row = dict(h)
            row["_angle_id"] = angle.id
            row["_angle_rank"] = str(i + 1)
            tagged.append(row)
        return angle.id, tagged

    if len(angles) == 1:
        aid, hits = run_one(angles[0])
        return {aid: hits}

    collected: dict[str, list[dict[str, str]]] = {}
    with ThreadPoolExecutor(max_workers=min(_concurrency(), len(angles))) as pool:
        futures = [pool.submit(run_one, a) for a in angles]
        done = 0
        for fut in as_completed(futures):
            aid, hits = fut.result()
            collected[aid] = hits
            done += 1
            # NOTE(review): every query is already submitted at this point, so
            # this pause delays result collection rather than throttling
            # outgoing requests. Confirm the intended rate-limit semantics.
            if done < len(angles) and sleep_sec > 0:
                time.sleep(sleep_sec)

    # Completion order varies between runs; re-key in plan order so that
    # downstream consumers see a deterministic mapping.
    return {a.id: collected[a.id] for a in angles}
|
|
@@ -24,6 +24,7 @@ def write_search_results(
|
|
|
24
24
|
query: str,
|
|
25
25
|
*,
|
|
26
26
|
engine: str,
|
|
27
|
+
tier: str = "standard",
|
|
27
28
|
) -> None:
|
|
28
29
|
"""Firecrawl-compatible envelope: data.web[].url|title|description."""
|
|
29
30
|
write_json(
|
|
@@ -31,6 +32,7 @@ def write_search_results(
|
|
|
31
32
|
{
|
|
32
33
|
"query": query,
|
|
33
34
|
"engine": engine,
|
|
35
|
+
"tier": tier,
|
|
34
36
|
"data": {
|
|
35
37
|
"web": [
|
|
36
38
|
{
|
|
@@ -45,6 +47,28 @@ def write_search_results(
|
|
|
45
47
|
)
|
|
46
48
|
|
|
47
49
|
|
|
50
|
+
def write_deep_search_results(
    path: Path,
    *,
    query: str,
    engine: str,
    tier: str,
    plan_angles: list[dict],
    ranked_web: list[dict],
) -> None:
    """Write the deep-search result envelope to *path* via ``write_json``.

    *tier* is recorded under both ``mode`` and ``tier``; the ranked hits go
    under ``data.web`` and the angle plan under ``angles``.
    """
    envelope = {
        "query": query,
        "engine": engine,
        "mode": tier,
        "tier": tier,
        "angles": plan_angles,
        "data": {"web": ranked_web},
    }
    write_json(path, envelope)
|
|
70
|
+
|
|
71
|
+
|
|
48
72
|
def write_page_markdown(path: Path, page: Any, *, main_content_only: bool = True) -> None:
|
|
49
73
|
ensure_parent(path)
|
|
50
74
|
try:
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
"""Parse and validate WRS search angles (YAML/JSON)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import re
|
|
7
|
+
from dataclasses import dataclass
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
try:
|
|
12
|
+
import yaml # type: ignore[import-untyped]
|
|
13
|
+
except ImportError:
|
|
14
|
+
yaml = None # type: ignore[assignment]
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@dataclass(frozen=True)
class SearchAngle:
    """One search angle: an identifier, the query text and an optional rationale."""

    # Identifier of the angle within a plan.
    id: str
    # Query text, or a template containing a `{query}` placeholder.
    query: str
    # Free-text reason for including this angle; may be empty.
    rationale: str = ""
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
@dataclass(frozen=True)
class AnglesPlan:
    """A set of search angles for one search intent."""

    # The search intent the angles were derived from.
    intent: str
    # Angles to query.
    angles: tuple[SearchAngle, ...]
    # Optional category label attached to the plan.
    category: str | None = None
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _heuristic_angles(query: str, *, category: str | None = None) -> AnglesPlan:
    """Build a fallback plan from the configured heuristic templates.

    Used when no expander output is available. If the templates produce fewer
    than two angles, a fixed definitional + official-documentation pair is
    used instead.
    """
    from .heuristic_config import build_heuristic_angles, load_heuristic_angles_config

    needle = query.strip()
    angles = build_heuristic_angles(
        needle,
        category=category,
        config=load_heuristic_angles_config(),
    )
    if len(angles) < 2:
        angles = (
            SearchAngle("definitional", needle, "Core intent phrasing"),
            SearchAngle("official", f"{needle} official documentation", "Authoritative sources"),
        )
    return AnglesPlan(intent=needle, angles=angles, category=category or None)
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def _parse_angle_item(raw: Any, idx: int) -> SearchAngle:
    """Convert one raw ``angles`` entry (string or mapping) into a ``SearchAngle``.

    A bare string becomes the query with a positional id ``angle_<idx+1>``.
    A mapping takes its id from ``id`` or ``name`` (positional fallback), its
    query from ``query`` and its rationale from ``rationale`` or ``reason``.

    Raises:
        ValueError: for an empty string, a non-string/non-mapping entry, or a
            mapping without a query.
    """
    default_id = f"angle_{idx + 1}"

    if isinstance(raw, str):
        text = raw.strip()
        if text:
            return SearchAngle(id=default_id, query=text)
        raise ValueError(f"angles[{idx}]: empty query string")

    if not isinstance(raw, dict):
        raise ValueError(f"angles[{idx}]: expected object or string")

    angle_id = str(raw.get("id") or raw.get("name") or default_id).strip()
    query_text = str(raw.get("query") or "").strip()
    if not query_text:
        raise ValueError(f"angles[{idx}]: missing query")
    why = str(raw.get("rationale") or raw.get("reason") or "").strip()
    return SearchAngle(id=angle_id or default_id, query=query_text, rationale=why)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def _load_structured(data: dict[str, Any]) -> AnglesPlan:
    """Build an ``AnglesPlan`` from a parsed angles document.

    The intent comes from ``intent`` or ``query`` and falls back to the first
    angle's query. Every entry of ``angles`` is validated; at most the first
    eight are kept. A blank or whitespace-only ``category`` becomes ``None``.

    Raises:
        ValueError: if ``angles`` is missing, not a list, empty, has fewer
            than two entries, or contains an invalid entry.
    """
    intent = str(data.get("intent") or data.get("query") or "").strip()
    raw_angles = data.get("angles")
    if not isinstance(raw_angles, list) or not raw_angles:
        raise ValueError("angles: expected non-empty list")
    angles = tuple(_parse_angle_item(item, i) for i, item in enumerate(raw_angles))
    if len(angles) < 2:
        raise ValueError("angles: need at least 2 entries for deep search")
    # Validate everything first, then cap the plan at eight angles.
    angles = angles[:8]
    category = data.get("category")
    # `or None` keeps a whitespace-only category from becoming "".
    cat_str = (str(category).strip() or None) if category else None
    return AnglesPlan(intent=intent or angles[0].query, angles=angles, category=cat_str)
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def load_angles_file(path: Path) -> AnglesPlan:
    """Read an angles plan from *path* (JSON or YAML, optionally code-fenced).

    If the file contains a Markdown code fence, only the first fenced body is
    parsed. Content starting with ``{`` is parsed as JSON, anything else as
    YAML.

    Raises:
        ValueError: if the file is empty, the root is not a mapping, or the
            plan fails validation.
        SystemExit: if the content is YAML and PyYAML is not installed.
    """
    raw_text = path.read_text(encoding="utf-8")
    fence = re.search(r"```(?:ya?ml|json)?\s*\n([\s\S]*?)```", raw_text)
    payload = (fence.group(1) if fence else raw_text).strip()
    if not payload:
        raise ValueError(f"empty angles file: {path}")

    if payload.startswith("{"):
        parsed = json.loads(payload)
    elif yaml is None:
        raise SystemExit(
            "angles file is YAML but PyYAML is not installed. "
            "Use JSON angles or: pip install pyyaml"
        )
    else:
        parsed = yaml.safe_load(payload)

    if not isinstance(parsed, dict):
        raise ValueError("angles file root must be an object")
    return _load_structured(parsed)
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def resolve_angles(
    query: str,
    *,
    angles_file: Path | None = None,
    expand_heuristic: bool = False,
    category: str | None = None,
) -> AnglesPlan:
    """Choose the source of the angles plan for a deep search.

    An explicit *angles_file* takes precedence; otherwise the heuristic
    fallback is used when *expand_heuristic* is set.

    Raises:
        SystemExit: if neither source was requested.
    """
    if angles_file is not None:
        return load_angles_file(angles_file)
    if not expand_heuristic:
        raise SystemExit(
            "deep search requires --angles-file (.web/angles.yaml from web-query-expander) "
            "or --expand-heuristic for emergency fallback"
        )
    return _heuristic_angles(query, category=category)
|
|
@@ -0,0 +1,163 @@
|
|
|
1
|
+
"""URL normalization and RRF fusion for multi-angle SERP results."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
from dataclasses import dataclass, field
|
|
7
|
+
from typing import Any
|
|
8
|
+
from urllib.parse import parse_qs, urlparse, urlunparse
|
|
9
|
+
|
|
10
|
+
# Smoothing constant k in the reciprocal-rank term 1 / (k + rank).
RRF_K = 60

# Query parameters (compared lower-cased) that `normalize_url` drops.
_TRACKING_PARAMS = frozenset(
    {
        "utm_source",
        "utm_medium",
        "utm_campaign",
        "utm_term",
        "utm_content",
        "fbclid",
        "gclid",
        "mc_cid",
        "mc_eid",
    }
)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@dataclass
class RankedHit:
    """A fused search hit with its score and per-angle provenance."""

    url: str
    title: str
    description: str
    score: float
    # Ids of the angles whose result lists contained this hit.
    angle_ids: list[str] = field(default_factory=list)
    # Angle id -> 1-based rank of this hit in that angle's list.
    ranks: dict[str, int] = field(default_factory=dict)

    def to_web_dict(self) -> dict[str, Any]:
        """Return a plain dict of the hit; score rounded to 6 decimals, containers copied."""
        payload: dict[str, Any] = {
            "url": self.url,
            "title": self.title,
            "description": self.description,
        }
        payload["score"] = round(self.score, 6)
        payload["angle_ids"] = [*self.angle_ids]
        payload["ranks"] = {**self.ranks}
        return payload
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def normalize_url(url: str) -> str:
    """Return a canonical form of *url* for deduplicating search hits.

    Lower-cases scheme and host, drops default ports, userinfo, the fragment,
    trailing slashes, blank-valued and tracking query parameters, and sorts
    the remaining parameters by key. Query values are percent-decoded by
    ``parse_qs`` and not re-encoded, so the result is a comparison key rather
    than a guaranteed-fetchable URL.

    Returns:
        ``""`` for blank input; the stripped input unchanged when it has no
        host or cannot be parsed (for example a non-numeric port); otherwise
        the normalized URL.
    """
    u = url.strip()
    if not u:
        return ""
    try:
        parsed = urlparse(u)
        host = (parsed.hostname or "").lower()
        # `.port` raises ValueError for non-numeric or out-of-range ports.
        port = parsed.port
    except ValueError:
        return u
    if not host:
        return u
    scheme = (parsed.scheme or "https").lower()
    if ":" in host:
        # `hostname` strips the brackets from IPv6 literals; restore them so
        # the netloc stays unambiguous once a port is appended.
        host = f"[{host}]"
    netloc = host
    is_default_port = (scheme == "http" and port == 80) or (scheme == "https" and port == 443)
    if port and not is_default_port:
        netloc = f"{host}:{port}"
    path = parsed.path or "/"
    if path != "/" and path.endswith("/"):
        # An all-slash path ("//") must collapse to "/", not to "".
        path = path.rstrip("/") or "/"
    qs = parse_qs(parsed.query, keep_blank_values=False)
    filtered = [
        f"{key}={val}"
        for key in sorted(qs)
        if key.lower() not in _TRACKING_PARAMS
        for val in qs[key]
    ]
    return urlunparse((scheme, netloc, path, "", "&".join(filtered), ""))
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def tokenize(text: str) -> set[str]:
    """Return the distinct lower-cased ``[a-z0-9]`` runs of 3+ characters in *text*."""
    # The `{3,}` quantifier already enforces the minimum token length.
    return set(re.findall(r"[a-z0-9]{3,}", text.lower()))
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def lexical_rerank(hits: list[RankedHit], intent: str) -> list[RankedHit]:
    """Re-order *hits* after adding a lexical-overlap bonus to each score.

    The bonus is 0.15 times the fraction of *intent* tokens found in the
    hit's title and description. Returns *hits* unchanged when *intent* has
    no tokens; otherwise returns new ``RankedHit`` objects sorted by boosted
    score, with ties keeping their input order.
    """
    wanted = tokenize(intent)
    if not wanted:
        return hits
    denominator = max(len(wanted), 1)

    def boosted(hit: RankedHit) -> float:
        found = tokenize(f"{hit.title} {hit.description}".lower())
        bonus = len(wanted & found) / denominator if found else 0.0
        return hit.score + 0.15 * bonus

    rescored = sorted(
        ((hit, boosted(hit)) for hit in hits),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return [
        RankedHit(
            url=hit.url,
            title=hit.title,
            description=hit.description,
            score=new_score,
            angle_ids=hit.angle_ids,
            ranks=hit.ranks,
        )
        for hit, new_score in rescored
    ]
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def fuse_angle_results(
    per_angle: dict[str, list[dict[str, str]]],
    *,
    final_limit: int = 10,
    intent: str = "",
    rerank_mode: str = "off",
) -> list[RankedHit]:
    """Fuse per-angle result lists into one ranking with Reciprocal Rank Fusion.

    Hits are grouped by normalized URL. Each angle contributes
    ``1 / (RRF_K + rank)`` once per URL, at the best rank the URL holds in
    that angle's list. The first non-empty title and the longest description
    seen for a URL are kept. Ties are broken by the number of contributing
    angles, then by best rank.

    Args:
        per_angle: Angle id -> ordered hits with ``url``/``title``/``description``.
        final_limit: Maximum number of hits returned; negative counts as 0.
        intent: Text used for the optional lexical rerank.
        rerank_mode: ``"lexical"`` enables ``lexical_rerank`` when *intent* is set.
    """
    accum: dict[str, dict[str, Any]] = {}

    for angle_id, results in per_angle.items():
        for rank_1based, item in enumerate(results, start=1):
            raw_url = (item.get("url") or "").strip()
            norm = normalize_url(raw_url)
            if not norm or not norm.startswith("http"):
                continue
            entry = accum.setdefault(
                norm,
                {
                    "url": raw_url,
                    "title": "",
                    "description": "",
                    "score": 0.0,
                    "angle_ids": [],
                    "ranks": {},
                },
            )
            if angle_id not in entry["ranks"]:
                # Credit each URL once per angle, at its first (best) rank.
                # A URL repeated in one list, e.g. differing only in tracking
                # parameters, must not accumulate extra score.
                entry["score"] += 1.0 / (RRF_K + rank_1based)
                entry["angle_ids"].append(angle_id)
                entry["ranks"][angle_id] = rank_1based
            title = (item.get("title") or "").strip()
            desc = (item.get("description") or "").strip()
            if title and not entry["title"]:
                entry["title"] = title
            if desc and len(desc) > len(entry["description"]):
                entry["description"] = desc

    hits = [
        RankedHit(
            url=e["url"],
            title=e["title"],
            description=e["description"],
            score=e["score"],
            angle_ids=e["angle_ids"],
            ranks=e["ranks"],
        )
        for e in accum.values()
    ]
    hits.sort(key=lambda h: (-h.score, -len(h.angle_ids), min(h.ranks.values()) if h.ranks else 999))

    if rerank_mode == "lexical" and intent:
        hits = lexical_rerank(hits, intent)

    # Clamp so a negative limit cannot slice from the wrong end.
    return hits[: max(final_limit, 0)]
|