ultimate-pi 0.19.0 → 0.19.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.agents/skills/web-retrieval/SKILL.md +163 -0
- package/.agents/skills/wiki-autoresearch/SKILL.md +6 -6
- package/.pi/SYSTEM.md +30 -12
- package/.pi/agents/harness/planning/implementation-researcher.md +1 -1
- package/.pi/agents/harness/planning/stack-researcher.md +5 -1
- package/.pi/agents/harness/web-retrieval/web-answerer.md +35 -0
- package/.pi/agents/harness/web-retrieval/web-criteria-verifier.md +28 -0
- package/.pi/agents/harness/web-retrieval/web-gap-analyzer.md +31 -0
- package/.pi/agents/harness/web-retrieval/web-query-expander-fast.md +34 -0
- package/.pi/agents/harness/web-retrieval/web-query-expander.md +60 -0
- package/.pi/agents/harness/web-retrieval/web-summarizer.md +18 -0
- package/.pi/extensions/harness-web-guard.ts +2 -1
- package/.pi/extensions/harness-web-tools.ts +689 -51
- package/.pi/harness/agents.manifest.json +29 -5
- package/.pi/harness/agents.policy.yaml +34 -0
- package/.pi/harness/docs/adrs/0050-agentic-web-retrieval-stack.md +46 -0
- package/.pi/harness/docs/harness-web-search.md +97 -0
- package/.pi/harness/env.harness.template +9 -1
- package/.pi/harness/examples/web-heuristic-angles.project.yaml +22 -0
- package/.pi/harness/web-heuristic-angles.json +278 -0
- package/.pi/harness/web-heuristic-angles.yaml +182 -0
- package/.pi/lib/agents-policy.mjs +6 -0
- package/.pi/lib/harness-subagent-auth.ts +39 -9
- package/.pi/lib/harness-subagents-bridge.ts +21 -0
- package/.pi/lib/harness-web/artifacts.ts +200 -0
- package/.pi/lib/harness-web/cache.ts +369 -0
- package/.pi/lib/harness-web/run-cli.ts +42 -2
- package/.pi/prompts/harness-plan.md +1 -0
- package/.pi/prompts/harness-setup.md +3 -1
- package/.pi/scripts/gen-web-heuristic-angles-json.mjs +24 -0
- package/.pi/scripts/harness-cli-verify.sh +5 -0
- package/.pi/scripts/harness-verify.mjs +78 -0
- package/.pi/scripts/harness-web-policy-guard.mjs +1 -1
- package/.pi/scripts/harness-web.py +218 -15
- package/.pi/scripts/harness_web/deep_search.py +55 -0
- package/.pi/scripts/harness_web/evidence_bundle.py +47 -0
- package/.pi/scripts/harness_web/find_similar.py +88 -0
- package/.pi/scripts/harness_web/heuristic_angles_shipped.py +85 -0
- package/.pi/scripts/harness_web/heuristic_config.py +251 -0
- package/.pi/scripts/harness_web/highlights.py +47 -0
- package/.pi/scripts/harness_web/multi_search.py +59 -0
- package/.pi/scripts/harness_web/output.py +24 -0
- package/.pi/scripts/harness_web/query_angles.py +116 -0
- package/.pi/scripts/harness_web/rank.py +163 -0
- package/.pi/scripts/harness_web/scrape.py +30 -0
- package/.pi/scripts/tests/test_harness_web_heuristic_config.py +132 -0
- package/.pi/scripts/tests/test_harness_web_query_angles.py +45 -0
- package/.pi/scripts/tests/test_harness_web_rank.py +56 -0
- package/AGENTS.md +2 -2
- package/CHANGELOG.md +6 -0
- package/package.json +5 -3
- package/.agents/skills/scrapling-web/SKILL.md +0 -98
- package/.pi/extensions/00-posthog-network-bootstrap.ts +0 -11
- package/.pi/scripts/harness_web/__pycache__/__init__.cpython-314.pyc +0 -0
- package/.pi/scripts/harness_web/__pycache__/config.cpython-314.pyc +0 -0
- package/.pi/scripts/harness_web/__pycache__/output.cpython-314.pyc +0 -0
- package/.pi/scripts/harness_web/__pycache__/scrape.cpython-314.pyc +0 -0
- package/.pi/scripts/harness_web/__pycache__/search.cpython-314.pyc +0 -0
- package/.pi/scripts/harness_web/__pycache__/search_ddg.cpython-314.pyc +0 -0
- package/.pi/scripts/harness_web/__pycache__/search_searxng.cpython-314.pyc +0 -0
|
@@ -9,6 +9,7 @@ import shutil
|
|
|
9
9
|
import sys
|
|
10
10
|
import time
|
|
11
11
|
from pathlib import Path
|
|
12
|
+
from urllib.parse import urlparse
|
|
12
13
|
|
|
13
14
|
# Re-exec with scrapling's uv-tool Python when the library is not on default python3.
|
|
14
15
|
def _bootstrap_scrapling() -> None:
|
|
@@ -34,10 +35,28 @@ if str(SCRIPT_DIR) not in sys.path:
|
|
|
34
35
|
sys.path.insert(0, str(SCRIPT_DIR))
|
|
35
36
|
|
|
36
37
|
from harness_web.config import HarnessWebConfig, load_config # noqa: E402
|
|
37
|
-
from harness_web.
|
|
38
|
-
from harness_web.
|
|
38
|
+
from harness_web.deep_search import run_deep_search # noqa: E402
|
|
39
|
+
from harness_web.evidence_bundle import build_evidence_bundle, write_evidence_bundle # noqa: E402
|
|
40
|
+
from harness_web.find_similar import run_find_similar # noqa: E402
|
|
41
|
+
from harness_web.output import ( # noqa: E402
|
|
42
|
+
write_deep_search_results,
|
|
43
|
+
write_search_results,
|
|
44
|
+
)
|
|
45
|
+
from harness_web.scrape import ( # noqa: E402
|
|
46
|
+
bulk_scrape,
|
|
47
|
+
map_url,
|
|
48
|
+
scrape_url,
|
|
49
|
+
scrape_url_with_highlights,
|
|
50
|
+
)
|
|
39
51
|
from harness_web.search import search # noqa: E402
|
|
40
52
|
|
|
53
|
+
TIER_LIMITS = {
|
|
54
|
+
"instant": 5,
|
|
55
|
+
"standard": 10,
|
|
56
|
+
"deep": 10,
|
|
57
|
+
"research": 15,
|
|
58
|
+
}
|
|
59
|
+
|
|
41
60
|
DEFAULT_WEB_DIR = ".web"
|
|
42
61
|
|
|
43
62
|
|
|
@@ -45,26 +64,153 @@ def _default_out(sub: str) -> Path:
|
|
|
45
64
|
return Path(DEFAULT_WEB_DIR) / sub
|
|
46
65
|
|
|
47
66
|
|
|
67
|
+
def _tier_limit(tier: str, cli_limit: int | None) -> int:
|
|
68
|
+
if cli_limit is not None:
|
|
69
|
+
return cli_limit
|
|
70
|
+
return TIER_LIMITS.get(tier, 10)
|
|
71
|
+
|
|
72
|
+
|
|
48
73
|
def cmd_search(args: argparse.Namespace, config: HarnessWebConfig) -> int:
    """Handle the ``search`` subcommand: query the SERP and write JSON.

    The tier falls back to ``"standard"`` when ``args`` has no usable ``tier``;
    the result cap is resolved through ``_tier_limit`` so an explicit
    ``--limit`` overrides the tier default.

    Returns:
        Always ``0``.
    """
    tier = getattr(args, "tier", None) or "standard"
    max_results = _tier_limit(tier, args.limit)
    destination = Path(args.output or _default_out("search.json"))
    hits = search(args.query, limit=max_results, config=config)
    write_search_results(
        destination,
        hits,
        args.query,
        engine=config.search_engine,
        tier=tier,
    )
    print(f"wrote {destination} ({len(hits)} results, tier={tier})")
    return 0
|
|
54
81
|
|
|
55
82
|
|
|
56
|
-
def
|
|
57
|
-
out = Path(args.output or _default_out("
|
|
58
|
-
|
|
59
|
-
|
|
83
|
+
def cmd_search_deep(args: argparse.Namespace, config: HarnessWebConfig) -> int:
    """Handle the ``search-deep`` subcommand.

    Delegates to ``run_deep_search`` with the CLI options, then writes the
    plan's angles and the fused hits via ``write_deep_search_results`` with
    tier ``"deep"``.

    Returns:
        Always ``0``.
    """
    destination = Path(args.output or _default_out("search-deep.json"))

    # An absent/empty --angles-file means "no file", not Path("").
    angles_source = None
    if args.angles_file:
        angles_source = Path(args.angles_file)

    plan, fused = run_deep_search(
        args.query,
        config=config,
        angles_file=angles_source,
        expand_heuristic=args.expand_heuristic,
        category=args.category,
        per_angle_limit=args.per_angle_limit,
        final_limit=args.limit,
    )

    # Flatten angle objects into plain dicts for JSON output.
    angle_rows = []
    for angle in plan.angles:
        angle_rows.append(
            {"id": angle.id, "query": angle.query, "rationale": angle.rationale}
        )

    write_deep_search_results(
        destination,
        query=args.query,
        engine=config.search_engine,
        tier="deep",
        plan_angles=angle_rows,
        ranked_web=fused,
    )
    print(f"wrote {destination} ({len(fused)} fused results, {len(plan.angles)} angles)")
    return 0
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def cmd_find_similar(args: argparse.Namespace, config: HarnessWebConfig) -> int:
    """Handle the ``find-similar`` subcommand.

    Runs ``run_find_similar`` for the seed URL and writes the result in the
    same shape as a deep search (tier ``"deep"``), using the plan's own
    ``intent`` as the recorded query. The default output path is shared with
    ``search-deep`` (``search-deep.json`` under the web dir).

    Returns:
        Always ``0``.
    """
    destination = Path(args.output or _default_out("search-deep.json"))
    plan, similar = run_find_similar(
        args.url,
        config=config,
        final_limit=args.limit,
        per_angle_limit=args.per_angle_limit,
        fast_fetch=args.fast,
    )

    # Flatten angle objects into plain dicts for JSON output.
    angle_rows = []
    for angle in plan.angles:
        angle_rows.append(
            {"id": angle.id, "query": angle.query, "rationale": angle.rationale}
        )

    write_deep_search_results(
        destination,
        query=plan.intent,
        engine=config.search_engine,
        tier="deep",
        plan_angles=angle_rows,
        ranked_web=similar,
    )
    print(f"wrote {destination} ({len(similar)} similar results)")
    return 0
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def cmd_scrape(args: argparse.Namespace, config: HarnessWebConfig) -> int:
    """Handle the ``scrape`` subcommand: fetch one URL to markdown.

    Highlight extraction only happens when ``--highlights`` is set AND a
    non-blank ``--highlight-query`` is given; in every other case a plain
    scrape runs and the fetch mode (fast/stealth) is reported.

    Returns:
        Always ``0``.
    """
    target = Path(args.output or _default_out("page.md"))
    use_fast = config.use_fast_for_url(args.url, args.fast)
    highlights_arg = args.highlights_output
    highlight_query = (args.highlight_query or "").strip()

    want_highlights = bool(args.highlights and highlight_query)
    if not want_highlights:
        scrape_url(
            args.url,
            str(target),
            config=config,
            fast=use_fast,
            wait_ms=args.wait_for,
        )
        label = "fast" if use_fast else "stealth"
        print(f"wrote {target} ({label})")
        return 0

    scrape_url_with_highlights(
        args.url,
        str(target),
        highlights_arg or str(_default_out("highlights.json")),
        config=config,
        fast=use_fast,
        wait_ms=args.wait_for,
        highlight_query=highlight_query,
    )
    print(f"wrote {target} (highlights)")
    return 0
|
|
161
|
+
|
|
162
|
+
|
|
163
|
+
def cmd_contents_batch(args: argparse.Namespace, config: HarnessWebConfig) -> int:
|
|
164
|
+
import json
|
|
165
|
+
|
|
166
|
+
out_dir = Path(args.output or _default_out("contents"))
|
|
167
|
+
out_dir.mkdir(parents=True, exist_ok=True)
|
|
168
|
+
urls: list[str] = list(args.urls or [])
|
|
169
|
+
if args.from_search:
|
|
170
|
+
data = json.loads(Path(args.from_search).read_text(encoding="utf-8"))
|
|
171
|
+
for item in data.get("data", {}).get("web", []):
|
|
172
|
+
u = (item.get("url") or "").strip()
|
|
173
|
+
if u:
|
|
174
|
+
urls.append(u)
|
|
175
|
+
if not urls:
|
|
176
|
+
print("contents-batch: no URLs", file=sys.stderr)
|
|
177
|
+
return 1
|
|
178
|
+
|
|
179
|
+
hl_query = (args.highlight_query or "").strip()
|
|
180
|
+
manifest: list[dict] = []
|
|
181
|
+
sleep_sec = config.rate_limit_ms / 1000.0
|
|
182
|
+
for i, url in enumerate(urls[: args.limit]):
|
|
183
|
+
if i and sleep_sec > 0:
|
|
184
|
+
time.sleep(sleep_sec)
|
|
185
|
+
safe = urlparse(url).netloc.replace(".", "_")
|
|
186
|
+
md_path = out_dir / f"{safe}.md"
|
|
187
|
+
hl_path = out_dir / f"{safe}.highlights.json" if args.highlights and hl_query else None
|
|
188
|
+
fast = config.use_fast_for_url(url, args.fast)
|
|
189
|
+
try:
|
|
190
|
+
if hl_path:
|
|
191
|
+
scrape_url_with_highlights(
|
|
192
|
+
url,
|
|
193
|
+
str(md_path),
|
|
194
|
+
str(hl_path),
|
|
195
|
+
config=config,
|
|
196
|
+
fast=fast,
|
|
197
|
+
wait_ms=None,
|
|
198
|
+
highlight_query=hl_query,
|
|
199
|
+
)
|
|
200
|
+
else:
|
|
201
|
+
scrape_url(url, str(md_path), config=config, fast=fast, wait_ms=None)
|
|
202
|
+
manifest.append({"url": url, "markdown": str(md_path), "ok": True})
|
|
203
|
+
except Exception as err: # noqa: BLE001
|
|
204
|
+
manifest.append({"url": url, "ok": False, "error": str(err)})
|
|
205
|
+
|
|
206
|
+
manifest_path = out_dir / "manifest.json"
|
|
207
|
+
manifest_path.write_text(json.dumps({"urls": manifest}, indent=2) + "\n", encoding="utf-8")
|
|
208
|
+
if args.evidence_bundle and args.from_search:
|
|
209
|
+
eb_path = Path(args.evidence_bundle)
|
|
210
|
+
bundle = build_evidence_bundle(Path(args.from_search), query=hl_query)
|
|
211
|
+
write_evidence_bundle(eb_path, bundle)
|
|
212
|
+
print(f"wrote {eb_path}")
|
|
213
|
+
print(f"wrote {len(manifest)} entries to {out_dir}")
|
|
68
214
|
return 0
|
|
69
215
|
|
|
70
216
|
|
|
@@ -132,9 +278,41 @@ def build_parser() -> argparse.ArgumentParser:
|
|
|
132
278
|
ps = sub.add_parser("search", help="Search via configured SERP (HARNESS_WEB_SEARCH_ENGINE)")
|
|
133
279
|
ps.add_argument("query", help="Search query")
|
|
134
280
|
ps.add_argument("-o", "--output", help="JSON output path (default: .web/search.json)")
|
|
135
|
-
ps.add_argument("--limit", type=int, default=
|
|
281
|
+
ps.add_argument("--limit", type=int, default=None)
|
|
282
|
+
ps.add_argument(
|
|
283
|
+
"--tier",
|
|
284
|
+
choices=("instant", "standard", "deep", "research"),
|
|
285
|
+
default="standard",
|
|
286
|
+
help="WRS tier (instant=5, standard=10 results)",
|
|
287
|
+
)
|
|
136
288
|
ps.set_defaults(func=cmd_search)
|
|
137
289
|
|
|
290
|
+
pd = sub.add_parser("search-deep", help="Multi-angle SERP fusion (WRS deep)")
|
|
291
|
+
pd.add_argument("query", help="Original research intent")
|
|
292
|
+
pd.add_argument("-o", "--output", help="JSON output (default: .web/search-deep.json)")
|
|
293
|
+
pd.add_argument("--limit", type=int, default=10, help="Final fused result count")
|
|
294
|
+
pd.add_argument("--per-angle-limit", type=int, default=8, help="SERP hits per angle")
|
|
295
|
+
pd.add_argument(
|
|
296
|
+
"--angles-file",
|
|
297
|
+
metavar="YAML",
|
|
298
|
+
help="Angles from web-query-expander (.web/angles.yaml)",
|
|
299
|
+
)
|
|
300
|
+
pd.add_argument(
|
|
301
|
+
"--expand-heuristic",
|
|
302
|
+
action="store_true",
|
|
303
|
+
help="Emergency angle templates without expander subagent",
|
|
304
|
+
)
|
|
305
|
+
pd.add_argument("--category", help="Hint: code|company|people|paper|news")
|
|
306
|
+
pd.set_defaults(func=cmd_search_deep)
|
|
307
|
+
|
|
308
|
+
pf = sub.add_parser("find-similar", help="Pages similar to a seed URL")
|
|
309
|
+
pf.add_argument("url", help="Seed URL")
|
|
310
|
+
pf.add_argument("-o", "--output", help="JSON output (default: .web/search-deep.json)")
|
|
311
|
+
pf.add_argument("--limit", type=int, default=10)
|
|
312
|
+
pf.add_argument("--per-angle-limit", type=int, default=6)
|
|
313
|
+
pf.add_argument("--fast", action="store_true", help="Fast HTTP for seed fetch")
|
|
314
|
+
pf.set_defaults(func=cmd_find_similar)
|
|
315
|
+
|
|
138
316
|
pc = sub.add_parser("scrape", help="Scrape a URL to markdown")
|
|
139
317
|
pc.add_argument("url")
|
|
140
318
|
pc.add_argument("-o", "--output", help="Markdown output (default: .web/page.md)")
|
|
@@ -150,8 +328,33 @@ def build_parser() -> argparse.ArgumentParser:
|
|
|
150
328
|
metavar="MS",
|
|
151
329
|
help="Extra wait after load (stealth mode, milliseconds)",
|
|
152
330
|
)
|
|
331
|
+
pc.add_argument("--highlights", action="store_true", help="Extract query-aligned excerpts")
|
|
332
|
+
pc.add_argument("--highlight-query", help="Query for highlight scoring")
|
|
333
|
+
pc.add_argument(
|
|
334
|
+
"--highlights-output",
|
|
335
|
+
help="Highlights JSON path (default: .web/highlights.json)",
|
|
336
|
+
)
|
|
153
337
|
pc.set_defaults(func=cmd_scrape)
|
|
154
338
|
|
|
339
|
+
pbatch = sub.add_parser("contents-batch", help="Batch scrape URLs to markdown manifest")
|
|
340
|
+
pbatch.add_argument("urls", nargs="*", help="URLs to fetch")
|
|
341
|
+
pbatch.add_argument("-o", "--output", help="Output directory (default: .web/contents)")
|
|
342
|
+
pbatch.add_argument("--limit", type=int, default=5)
|
|
343
|
+
pbatch.add_argument(
|
|
344
|
+
"--from-search",
|
|
345
|
+
metavar="JSON",
|
|
346
|
+
help="URLs from search.json or search-deep.json",
|
|
347
|
+
)
|
|
348
|
+
pbatch.add_argument("--fast", action="store_true")
|
|
349
|
+
pbatch.add_argument("--highlights", action="store_true")
|
|
350
|
+
pbatch.add_argument("--highlight-query", default="")
|
|
351
|
+
pbatch.add_argument(
|
|
352
|
+
"--evidence-bundle",
|
|
353
|
+
metavar="JSON",
|
|
354
|
+
help="Write evidence-bundle.json from --from-search",
|
|
355
|
+
)
|
|
356
|
+
pbatch.set_defaults(func=cmd_contents_batch)
|
|
357
|
+
|
|
155
358
|
pb = sub.add_parser("bulk-scrape", help="Search then scrape multiple URLs")
|
|
156
359
|
pb.add_argument("query", nargs="?", help="Search query when not using --from-search")
|
|
157
360
|
pb.add_argument("-o", "--output", help="Output directory (default: .web/bulk)")
|
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
"""WRS deep search orchestration."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
from .config import HarnessWebConfig
|
|
9
|
+
from .multi_search import multi_search
|
|
10
|
+
from .query_angles import AnglesPlan, resolve_angles
|
|
11
|
+
from .rank import fuse_angle_results
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def _rerank_mode() -> str:
|
|
15
|
+
mode = os.environ.get("HARNESS_WEB_RERANK", "off").strip().lower()
|
|
16
|
+
if mode in ("off", "lexical", "embed"):
|
|
17
|
+
return mode
|
|
18
|
+
return "off"
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def run_deep_search(
    query: str,
    *,
    config: HarnessWebConfig,
    angles_file: Path | None = None,
    expand_heuristic: bool = False,
    category: str | None = None,
    per_angle_limit: int = 8,
    final_limit: int = 10,
) -> tuple[AnglesPlan, list[dict]]:
    """Resolve search angles, search each one, and fuse the hits.

    Steps: ``resolve_angles`` builds the plan, ``multi_search`` fetches up to
    ``per_angle_limit`` rows per angle, each row is reduced to its
    ``url``/``title``/``description`` fields, and ``fuse_angle_results`` ranks
    the union down to ``final_limit`` using the mode from ``_rerank_mode()``.

    Returns:
        The angles plan and the ranked hits converted with ``to_web_dict()``.
    """
    plan = resolve_angles(
        query,
        angles_file=angles_file,
        expand_heuristic=expand_heuristic,
        category=category,
    )
    rows_by_angle = multi_search(plan, per_angle_limit=per_angle_limit, config=config)

    # Keep only the three public fields; anything else on a row is dropped
    # before fusion.
    stripped: dict[str, list[dict[str, str]]] = {
        angle_id: [
            {
                "url": row.get("url", ""),
                "title": row.get("title", ""),
                "description": row.get("description", ""),
            }
            for row in rows
        ]
        for angle_id, rows in rows_by_angle.items()
    }

    fused = fuse_angle_results(
        stripped,
        final_limit=final_limit,
        intent=plan.intent,
        rerank_mode=_rerank_mode(),
    )
    return plan, [hit.to_web_dict() for hit in fused]
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
"""Build evidence-bundle.json from search-deep + optional highlight fetches."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
from typing import Any
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def build_evidence_bundle(
    search_deep_path: Path,
    *,
    highlight_files: dict[str, Path] | None = None,
    query: str = "",
) -> dict[str, Any]:
    """Build the evidence-bundle payload from a search results JSON file.

    Args:
        search_deep_path: JSON file whose ``data.web`` list holds the hits.
        highlight_files: Optional mapping of hit URL to a highlights JSON
            file. A file that is missing, unreadable, or not valid JSON is
            skipped; highlights are optional.
        query: Fallback intent used when the file has no truthy ``query``.

    Returns:
        A dict with ``intent``, ``mode`` (default ``"deep"``), ``engine``
        (default ``""``) and ``sources`` - one entry per hit carrying ``url``,
        ``title``, ``description``, ``score``, ``angle_ids`` and, when
        available, ``highlights``.

    Raises:
        ValueError: If the file's top-level JSON value is not an object (this
            also covers malformed JSON, since ``json.JSONDecodeError`` is a
            ``ValueError``).
        OSError: If ``search_deep_path`` cannot be read.
    """
    data = json.loads(search_deep_path.read_text(encoding="utf-8"))
    if not isinstance(data, dict):
        raise ValueError(
            f"{search_deep_path}: expected a JSON object, got {type(data).__name__}"
        )

    # "data": null or a non-list "web" means "no hits", not a crash.
    payload = data.get("data")
    hits = payload.get("web") if isinstance(payload, dict) else None
    if not isinstance(hits, list):
        hits = []

    lookup = highlight_files or {}
    sources: list[dict[str, Any]] = []
    for hit in hits:
        if not isinstance(hit, dict):
            continue
        url = hit.get("url", "")
        entry: dict[str, Any] = {
            "url": url,
            "title": hit.get("title", ""),
            "description": hit.get("description", ""),
            "score": hit.get("score"),
            "angle_ids": hit.get("angle_ids", []),
        }
        highlight_path = lookup.get(url) if isinstance(url, str) else None
        if highlight_path is not None:
            try:
                entry["highlights"] = json.loads(
                    highlight_path.read_text(encoding="utf-8")
                )
            except (OSError, ValueError):
                # Best effort: missing, unreadable, wrongly encoded or
                # malformed highlight files leave the entry without highlights.
                pass
        sources.append(entry)

    return {
        "intent": data.get("query") or query,
        "mode": data.get("mode", "deep"),
        "engine": data.get("engine", ""),
        "sources": sources,
    }
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def write_evidence_bundle(path: Path, payload: dict[str, Any]) -> None:
    """Serialize *payload* to *path* as indented UTF-8 JSON.

    Missing parent directories are created first. Non-ASCII characters are
    written as-is (not ``\\u`` escaped) and the file ends with a newline.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    rendered = json.dumps(payload, indent=2, ensure_ascii=False)
    path.write_text(f"{rendered}\n", encoding="utf-8")
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
"""Seed-URL discovery (Exa findSimilar analog)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
from .config import HarnessWebConfig
|
|
9
|
+
from .deep_search import run_deep_search
|
|
10
|
+
from .query_angles import AnglesPlan, SearchAngle
|
|
11
|
+
from .rank import RankedHit, fuse_angle_results, normalize_url, tokenize
|
|
12
|
+
from .scrape import fetch_page
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def _extract_seed_phrases(url: str, *, config: HarnessWebConfig, fast: bool) -> list[str]:
    """Derive up to three search phrases from the seed page at *url*.

    The page is fetched with ``fetch_page``. Its title is the first non-empty
    text of a ``title`` or ``h1`` node; failing that, the first 200 characters
    of the page text. Whitespace runs are collapsed to single spaces.

    Returns:
        With a title: the title (max 120 chars), the five longest title
        tokens joined by spaces (when any), and ``"similar to <title>"``.
        Without a title: only ``"related pages <url>"``.
    """
    page = fetch_page(url, config=config, fast=fast, wait_ms=None)
    title = ""
    if hasattr(page, "css"):
        for sel in ("title", "h1"):
            nodes = page.css(sel)
            if nodes:
                title = (nodes[0].get_all_text(strip=True) or "").strip()
                if title:
                    break
    if not title and hasattr(page, "get_all_text"):
        title = (page.get_all_text(strip=True) or "")[:200].strip()
    title = re.sub(r"\s+", " ", title).strip()

    phrases: list[str] = []
    if title:
        phrases.append(title[:120])
        # Longest tokens first, ties broken alphabetically. tokenize() looks
        # set-like (its result is intersected with `&` elsewhere in this
        # module - confirm), and sorting a set by length alone leaves
        # equal-length tokens in hash order, which can change between runs.
        tokens = sorted(tokenize(title), key=lambda tok: (-len(tok), tok))[:5]
        if tokens:
            phrases.append(" ".join(tokens))
    phrases.append(f"similar to {title[:80]}" if title else f"related pages {url}")
    return [p for p in phrases if p.strip()][:3]
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def run_find_similar(
    seed_url: str,
    *,
    config: HarnessWebConfig,
    final_limit: int = 10,
    per_angle_limit: int = 6,
    fast_fetch: bool = True,
) -> tuple[AnglesPlan, list[dict]]:
    """Find pages similar to *seed_url* by searching phrases taken from it.

    Each seed phrase becomes one angle (``similar_1``, ``similar_2``, ...).
    The per-angle hits are reduced to ``url``/``title``/``description``, fused
    to ``2 * final_limit`` candidates, and then rescored: the seed page itself
    is dropped and every other hit gains ``0.2 * overlap``, where overlap is
    the share of seed tokens found in the hit's title and description.

    Returns:
        The generated plan and the top ``final_limit`` rescored hits as
        ``to_web_dict()`` dicts, best score first.
    """
    seed_phrases = _extract_seed_phrases(seed_url, config=config, fast=fast_fetch)
    plan = AnglesPlan(
        intent=f"pages similar to {seed_url}",
        angles=tuple(
            SearchAngle(f"similar_{position}", phrase, f"Derived from seed {seed_url}")
            for position, phrase in enumerate(seed_phrases, start=1)
        ),
    )
    from .multi_search import multi_search

    rows_by_angle = multi_search(plan, per_angle_limit=per_angle_limit, config=config)
    stripped: dict[str, list[dict[str, str]]] = {
        angle_id: [
            {
                "url": row.get("url", ""),
                "title": row.get("title", ""),
                "description": row.get("description", ""),
            }
            for row in rows
        ]
        for angle_id, rows in rows_by_angle.items()
    }
    # Over-fetch so that dropping the seed page still leaves enough hits.
    candidates = fuse_angle_results(stripped, final_limit=final_limit * 2, intent=plan.intent)

    # Boost overlap with seed text
    seed_key = normalize_url(seed_url)
    seed_tokens = tokenize(" ".join(seed_phrases))
    denominator = max(len(seed_tokens), 1)  # avoid dividing by zero
    boosted: list[RankedHit] = []
    for hit in candidates:
        if normalize_url(hit.url) == seed_key:
            continue
        text = f"{hit.title} {hit.description}".lower()
        shared = len(seed_tokens & tokenize(text))
        boosted.append(
            RankedHit(
                url=hit.url,
                title=hit.title,
                description=hit.description,
                score=hit.score + 0.2 * (shared / denominator),
                angle_ids=hit.angle_ids,
                ranks=hit.ranks,
            )
        )
    best_first = sorted(boosted, key=lambda hit: hit.score, reverse=True)
    return plan, [hit.to_web_dict() for hit in best_first[:final_limit]]
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
"""Shipped WRS heuristic angles (stdlib-only). Keep in sync with web-heuristic-angles.yaml."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
# fmt: off
|
|
8
|
+
SHIPPED_HEURISTIC_ANGLES: dict[str, Any] = {
|
|
9
|
+
"version": 1,
|
|
10
|
+
"max_angles": 8,
|
|
11
|
+
"base": [
|
|
12
|
+
{"id": "definitional", "query": "{query}", "rationale": "Core intent phrasing"},
|
|
13
|
+
{
|
|
14
|
+
"id": "authoritative",
|
|
15
|
+
"query": "{query} official documentation OR specification OR RFC",
|
|
16
|
+
"rationale": "Primary specs and vendor docs",
|
|
17
|
+
},
|
|
18
|
+
],
|
|
19
|
+
"categories": {
|
|
20
|
+
"code": [
|
|
21
|
+
{"id": "github", "query": "{query} site:github.com", "rationale": "Source, issues, discussions"},
|
|
22
|
+
{"id": "stackoverflow", "query": "{query} site:stackoverflow.com", "rationale": "Debugging and API usage Q&A"},
|
|
23
|
+
{"id": "stackexchange", "query": "{query} site:stackexchange.com", "rationale": "Broader SE network (Super User, Server Fault, etc.)"},
|
|
24
|
+
{"id": "readthedocs", "query": "{query} site:readthedocs.io", "rationale": "OSS library documentation"},
|
|
25
|
+
{"id": "mdn", "query": "{query} site:developer.mozilla.org", "rationale": "Web platform and browser APIs"},
|
|
26
|
+
{"id": "package_registries", "query": "{query} site:npmjs.com OR site:pypi.org OR site:pkg.go.dev OR site:crates.io", "rationale": "Package metadata across major ecosystems"},
|
|
27
|
+
{"id": "microsoft_learn", "query": "{query} site:learn.microsoft.com", "rationale": ".NET, Azure, Windows, and enterprise stacks"},
|
|
28
|
+
{"id": "hacker_news", "query": "{query} site:news.ycombinator.com", "rationale": "High-signal practitioner discussion"},
|
|
29
|
+
{"id": "gitlab", "query": "{query} site:gitlab.com", "rationale": "Alternate host and CI-visible code"},
|
|
30
|
+
{"id": "devto", "query": "{query} site:dev.to OR site:medium.com", "rationale": "Tutorials and implementation writeups"},
|
|
31
|
+
],
|
|
32
|
+
"paper": [
|
|
33
|
+
{"id": "arxiv", "query": "{query} site:arxiv.org", "rationale": "Preprints and latest ML/CS uploads"},
|
|
34
|
+
{"id": "semantic_scholar", "query": "{query} site:semanticscholar.org", "rationale": "Citations, influences, and PDF links"},
|
|
35
|
+
{"id": "google_scholar", "query": "{query} site:scholar.google.com", "rationale": "Broad academic discovery"},
|
|
36
|
+
{"id": "papers_with_code", "query": "{query} site:paperswithcode.com", "rationale": "Benchmarks tied to implementations"},
|
|
37
|
+
{"id": "openreview", "query": "{query} site:openreview.net", "rationale": "Peer reviews and ML conference submissions"},
|
|
38
|
+
{"id": "acl_anthology", "query": "{query} site:aclanthology.org", "rationale": "NLP and computational linguistics"},
|
|
39
|
+
{"id": "acm_dl", "query": "{query} site:dl.acm.org", "rationale": "ACM proceedings and journals"},
|
|
40
|
+
{"id": "pubmed", "query": "{query} site:pubmed.ncbi.nlm.nih.gov", "rationale": "Biomedical and life-sciences literature"},
|
|
41
|
+
],
|
|
42
|
+
"news": [
|
|
43
|
+
{"id": "recent", "query": "{query} news 2025 2026", "rationale": "Recency-biased open web"},
|
|
44
|
+
{"id": "wire_reuters", "query": "{query} site:reuters.com", "rationale": "Wire-service reporting"},
|
|
45
|
+
{"id": "wire_ap", "query": "{query} site:apnews.com", "rationale": "Associated Press coverage"},
|
|
46
|
+
{"id": "tech_press", "query": "{query} site:techcrunch.com OR site:theverge.com OR site:arstechnica.com", "rationale": "Technology industry news"},
|
|
47
|
+
{"id": "business_press", "query": "{query} site:bloomberg.com OR site:ft.com OR site:wsj.com", "rationale": "Markets and business context"},
|
|
48
|
+
{"id": "analysis", "query": "{query} in-depth analysis explainer", "rationale": "Long-form journalism and explainers"},
|
|
49
|
+
{"id": "bbc", "query": "{query} site:bbc.com/news", "rationale": "International general news desk"},
|
|
50
|
+
],
|
|
51
|
+
"company": [
|
|
52
|
+
{"id": "official_site", "query": "{query} official website", "rationale": "Company-controlled messaging"},
|
|
53
|
+
{"id": "crunchbase", "query": "{query} site:crunchbase.com", "rationale": "Funding, investors, and competitors"},
|
|
54
|
+
{"id": "linkedin_company", "query": "{query} site:linkedin.com/company", "rationale": "Headcount, hiring, and positioning"},
|
|
55
|
+
{"id": "sec_filings", "query": "{query} site:sec.gov 10-K OR 10-Q OR S-1", "rationale": "US public-company disclosures"},
|
|
56
|
+
{"id": "g2_reviews", "query": "{query} site:g2.com OR site:capterra.com", "rationale": "B2B software reviews and comparisons"},
|
|
57
|
+
{"id": "company_news", "query": "{query} company announcement press release", "rationale": "Launches, partnerships, and earnings"},
|
|
58
|
+
{"id": "glassdoor", "query": "{query} site:glassdoor.com", "rationale": "Employee sentiment and culture signals"},
|
|
59
|
+
],
|
|
60
|
+
"people": [
|
|
61
|
+
{"id": "linkedin", "query": "{query} site:linkedin.com/in", "rationale": "Professional profiles"},
|
|
62
|
+
{"id": "github_person", "query": "{query} site:github.com", "rationale": "Open-source footprint for builders"},
|
|
63
|
+
{"id": "wikipedia", "query": "{query} site:en.wikipedia.org", "rationale": "Neutral biographical baseline"},
|
|
64
|
+
{"id": "scholar_person", "query": "{query} site:scholar.google.com", "rationale": "Publication record for researchers"},
|
|
65
|
+
{"id": "interviews", "query": "{query} interview podcast keynote", "rationale": "First-person statements and talks"},
|
|
66
|
+
{"id": "twitter_x", "query": "{query} site:x.com OR site:twitter.com", "rationale": "Public statements and discourse"},
|
|
67
|
+
],
|
|
68
|
+
"security": [
|
|
69
|
+
{"id": "cve_nvd", "query": "{query} CVE site:nvd.nist.gov", "rationale": "National Vulnerability Database"},
|
|
70
|
+
{"id": "owasp", "query": "{query} site:owasp.org", "rationale": "AppSec standards and cheat sheets"},
|
|
71
|
+
{"id": "cwe", "query": "{query} site:cwe.mitre.org", "rationale": "Weakness taxonomy"},
|
|
72
|
+
{"id": "github_advisories", "query": "{query} site:github.com/advisories OR dependabot", "rationale": "Ecosystem security advisories"},
|
|
73
|
+
{"id": "snyk_blog", "query": "{query} site:snyk.io/blog OR vulnerability", "rationale": "Practitioner security writeups"},
|
|
74
|
+
],
|
|
75
|
+
"default": [
|
|
76
|
+
{"id": "technical", "query": "{query} how it works architecture internals", "rationale": "Mechanism and design"},
|
|
77
|
+
{"id": "criticism", "query": "{query} limitations criticism drawbacks", "rationale": "Counterpoints and failure modes"},
|
|
78
|
+
{"id": "wikipedia", "query": "{query} site:en.wikipedia.org", "rationale": "Structured overview"},
|
|
79
|
+
{"id": "comparison", "query": "{query} vs alternatives comparison benchmark", "rationale": "Competitive landscape"},
|
|
80
|
+
{"id": "reddit", "query": "{query} site:reddit.com", "rationale": "Community experience reports"},
|
|
81
|
+
{"id": "hn_default", "query": "{query} site:news.ycombinator.com", "rationale": "Practitioner threads when category unknown"},
|
|
82
|
+
],
|
|
83
|
+
},
|
|
84
|
+
}
|
|
85
|
+
# fmt: on
|