ultimate-pi 0.4.1 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -134,10 +134,13 @@ export PATH="$HOME/.local/bin:$PATH"
134
134
  uv tool install "scrapling[fetchers]"
135
135
  scrapling install # Chromium for default stealth scrape; may need sudo for OS libs on Linux
136
136
  mkdir -p .web
137
+ python3 "$UP_PKG/.pi/scripts/harness-web.py" status # JSON config (setup/diagnostics only)
137
138
  python3 "$UP_PKG/.pi/scripts/harness-web.py" search "ultimate-pi harness" -o .web/smoke-search.json --limit 3
138
139
  python3 "$UP_PKG/.pi/scripts/harness-web.py" scrape "https://example.com" -o .web/smoke-page.md --fast
139
140
  ```
140
141
 
142
+ After pi loads extensions, agents should smoke-test **`web_search`** once (instead of running a `UP_PKG` / `import scrapling` preflight). Example intent: query `ultimate-pi harness` with `limit` 2.
143
+
141
144
  - **`--skip-tools`:** skip Step 2 (includes Scrapling verify).
142
145
  - On Linux/WSL, if stealth scrape fails, install browser libs from `harness-cli-verify.sh` output or use `--fast` for static targets.
143
146
 
@@ -421,6 +424,47 @@ If **no** `.env` at project root:
421
424
  - On **skip** or `--non-interactive`: warn in report (non-interactive skips creation)
422
425
  - If `ask_user` cancelled: stop with `needs_clarification`
423
426
 
427
+ ### 4.0b — harness-web search engine (non-destructive)
428
+
429
+ Unless `--non-interactive`, **call `ask_user`** after Step 4.0 (harness-decisions skill):
430
+
431
+ ```json
432
+ {
433
+ "question": "Which harness-web search backend should this project use?",
434
+ "context": "Scrapling still handles scrape/map/bulk. Search only: DuckDuckGo HTML needs no extra services. SearXNG must be self-hosted for agents — public instances often block JSON (403) and default to ~4 API requests/hour per IP.",
435
+ "options": [
436
+ {
437
+ "title": "DuckDuckGo HTML (default)",
438
+ "description": "HARNESS_WEB_SEARCH_ENGINE=ddg_html — no Docker"
439
+ },
440
+ {
441
+ "title": "Self-host SearXNG here (Docker)",
442
+ "description": "Bootstrap .searxng/ with official compose, enable JSON API, set harness env"
443
+ },
444
+ {
445
+ "title": "Use existing SearXNG instance",
446
+ "description": "You provide base URL; harness writes HARNESS_WEB_SEARXNG_URL"
447
+ }
448
+ ],
449
+ "allowFreeform": true
450
+ }
451
+ ```
452
+
453
+ | User choice | Actions |
454
+ |-------------|---------|
455
+ | **DDG** | Ensure `.env` has `HARNESS_WEB_SEARCH_ENGINE=ddg_html` via `harness-sync-env.mjs` (append only if missing; do not overwrite user values) |
456
+ | **Self-host** | `node "$UP_PKG/.pi/scripts/harness-searxng-bootstrap.mjs"` (requires Docker). Script sets `HARNESS_WEB_SEARCH_ENGINE=searxng` and `HARNESS_WEB_SEARXNG_URL` |
457
+ | **Existing instance** | Parse base URL from freeform answer. Run `node "$UP_PKG/.pi/scripts/harness-searxng-bootstrap.mjs" --set-url {url}` (health check + upsert `.env`) |
458
+ | **Cancelled** | Stop with `needs_clarification` |
459
+ | **`--non-interactive`** | Skip the prompt; keep the existing engine setting, or default to `ddg_html` if none is set; do not run the Docker bootstrap |
460
+
461
+ Post-choice smoke (report pass/fail):
462
+
463
+ ```bash
464
+ mkdir -p .web
465
+ python3 "$UP_PKG/.pi/scripts/harness-web.py" search "ultimate-pi harness" -o .web/setup-search.json --limit 2
466
+ ```
467
+
424
468
  Rules:
425
469
 
426
470
  - **Do not** `cp` over an existing `.env`.
@@ -436,6 +480,7 @@ Ensure `.gitignore` contains:
436
480
  ```
437
481
  .env
438
482
  .web/
483
+ .searxng/
439
484
  .raw/
440
485
  .vault-meta/
441
486
  .pi/harness/critics/
@@ -646,6 +691,7 @@ Output summary table:
646
691
  | .gitignore | ✓/✗ | entries added (incl. `.env`) |
647
692
  | ./raw directory | ✓/✗ | Created for graphify source ingestion |
648
693
  | harness-web (Scrapling) | ✓/✗ | search + scrape smoke |
694
+ | harness-web search engine | ddg / searxng / — | Step 4.0b choice; SearXNG URL if applicable |
649
695
 
650
696
  Next steps:
651
697
  1. If tools missing: re-run with `--force` or install individually
@@ -200,10 +200,19 @@ verify_scrapling() {
200
200
  return
201
201
  fi
202
202
  mkdir -p .web
203
- if python3 "$_hw" search "ultimate-pi harness" -o .web/verify-search.json --limit 2 2>/dev/null | grep -q wrote; then
204
- pass "harness-web search smoke"
203
+ _search_engine="${HARNESS_WEB_SEARCH_ENGINE:-ddg_html}"
204
+ if [ "$_search_engine" = "searxng" ]; then
205
+ if [ -z "${HARNESS_WEB_SEARXNG_URL:-}" ]; then
206
+ fail "HARNESS_WEB_SEARCH_ENGINE=searxng but HARNESS_WEB_SEARXNG_URL is unset"
207
+ elif python3 "$_hw" search "ultimate-pi harness" -o .web/verify-search.json --limit 2 2>/dev/null | grep -q wrote; then
208
+ pass "harness-web search smoke (searxng)"
209
+ else
210
+ fail "harness-web search smoke failed (searxng at ${HARNESS_WEB_SEARXNG_URL})"
211
+ fi
212
+ elif python3 "$_hw" search "ultimate-pi harness" -o .web/verify-search.json --limit 2 2>/dev/null | grep -q wrote; then
213
+ pass "harness-web search smoke (ddg_html)"
205
214
  else
206
- fail "harness-web search smoke failed"
215
+ fail "harness-web search smoke failed (ddg_html)"
207
216
  fi
208
217
  if python3 "$_hw" scrape "https://example.com" -o .web/verify-page.md --fast 2>/dev/null | grep -q wrote; then
209
218
  pass "harness-web scrape --fast smoke"
@@ -0,0 +1,270 @@
1
+ #!/usr/bin/env node
2
+ /**
3
+ * Bootstrap a project-local SearXNG instance for harness-web (Docker Compose).
4
+ *
5
+ * - Creates .searxng/ with official upstream compose template
6
+ * - Writes core-config/settings.yml with json format + limiter off (local dev)
7
+ * - Starts containers and waits for JSON search health
8
+ * - Upserts HARNESS_WEB_SEARCH_ENGINE / HARNESS_WEB_SEARXNG_URL in project .env
9
+ *
10
+ * Usage:
11
+ * node "$UP_PKG/.pi/scripts/harness-searxng-bootstrap.mjs" [PROJECT_ROOT] [--url-only]
12
+ * node "$UP_PKG/.pi/scripts/harness-searxng-bootstrap.mjs" --set-url http://127.0.0.1:8080
13
+ *
14
+ * Requires: docker, docker compose, curl
15
+ */
16
+
17
+ import {
18
+ access,
19
+ copyFile,
20
+ mkdir,
21
+ readFile,
22
+ writeFile,
23
+ } from "node:fs/promises";
24
+ import { constants } from "node:fs";
25
+ import { join, dirname } from "node:path";
26
+ import { fileURLToPath } from "node:url";
27
+ import { spawn } from "node:child_process";
28
+
29
+ const SCRIPT_DIR = dirname(fileURLToPath(import.meta.url));
30
+ const UP_PKG = join(SCRIPT_DIR, "..", "..");
31
+
32
+ const SEARXNG_BASE =
33
+ "https://raw.githubusercontent.com/searxng/searxng/master/container";
34
+ const DEFAULT_PORT = "8080";
35
+ const HEALTH_PATH = "/search?q=harness&format=json";
36
+
37
+ const MANAGED_START = "# --- harness:env:start ---";
38
+ const MANAGED_END = "# --- harness:env:end ---";
39
+
40
// CLI parsing. The token that follows --set-url is that option's value, not a
// positional argument: if it were left among the positionals,
// `--set-url http://host` would make the URL args[0] and therefore PROJECT_ROOT.
const rawArgs = process.argv.slice(2);
const setUrlIdx = process.argv.indexOf("--set-url");
const setUrl = setUrlIdx !== -1 ? process.argv[setUrlIdx + 1] : null;
// rawArgs is process.argv shifted by two, so the --set-url value sits at setUrlIdx - 1.
const setUrlValueIdx = setUrlIdx !== -1 ? setUrlIdx - 1 : -1;
const args = rawArgs.filter(
  (a, i) => i !== setUrlValueIdx && !a.startsWith("-"),
);
const flags = new Set(rawArgs.filter((a) => a.startsWith("-")));
const urlOnly = flags.has("--url-only");

// First remaining positional is the project root; default to the working directory.
const PROJECT_ROOT = args[0] || process.cwd();
47
+ const SEARXNG_DIR = join(PROJECT_ROOT, ".searxng");
48
+ const CORE_CONFIG = join(SEARXNG_DIR, "core-config");
49
+ const SETTINGS_PATH = join(CORE_CONFIG, "settings.yml");
50
+ const COMPOSE_PATH = join(SEARXNG_DIR, "docker-compose.yml");
51
+ const ENV_COMPOSE = join(SEARXNG_DIR, ".env");
52
+
53
+ const HARNESS_SETTINGS = `use_default_settings: true
54
+
55
+ search:
56
+ formats:
57
+ - html
58
+ - json
59
+
60
+ server:
61
+ limiter: false
62
+ public_instance: false
63
+ `;
64
+
65
/**
 * Report whether `path` is visible on disk.
 *
 * Resolves to true when `access(path, F_OK)` succeeds and to false when it
 * rejects for any reason; it never rejects itself.
 */
async function exists(path) {
  return access(path, constants.F_OK).then(
    () => true,
    () => false,
  );
}
73
+
74
/**
 * Spawn `cmd` with `cmdArgs` and settle once the child process closes.
 *
 * Resolves with `{ stdout, stderr }` when the exit code is 0. Rejects with an
 * Error naming the command line and exit code, followed by stderr (or stdout
 * when stderr is empty), for any other exit code, and rejects with the spawn
 * error if the process cannot be started.
 *
 * Options: `opts.inherit` shares this process's stdio with the child (nothing
 * is captured), `opts.cwd` sets the working directory, and `opts.env` is merged
 * over `process.env`.
 */
function run(cmd, cmdArgs, opts = {}) {
  return new Promise((resolve, reject) => {
    const proc = spawn(cmd, cmdArgs, {
      stdio: opts.inherit ? "inherit" : "pipe",
      cwd: opts.cwd,
      env: { ...process.env, ...opts.env },
    });

    const captured = { stdout: "", stderr: "" };
    if (!opts.inherit) {
      for (const stream of ["stdout", "stderr"]) {
        proc[stream]?.on("data", (chunk) => {
          captured[stream] += chunk;
        });
      }
    }

    proc.on("error", reject);
    proc.on("close", (code) => {
      if (code !== 0) {
        reject(
          new Error(
            `${cmd} ${cmdArgs.join(" ")} exited ${code}\n${captured.stderr || captured.stdout}`,
          ),
        );
        return;
      }
      resolve(captured);
    });
  });
}
103
+
104
/**
 * Verify that the Docker CLI and the Compose v2 plugin can be invoked.
 *
 * Runs `docker --version` and then `docker compose version`. On the first
 * failing probe it prints the problem and an install link to stderr and exits
 * the process with status 1.
 */
async function requireDocker() {
  const probes = [
    {
      cmdArgs: ["--version"],
      problem: "docker not found",
      hint: "Install Docker: https://docs.searxng.org/admin/installation-docker.html",
    },
    {
      cmdArgs: ["compose", "version"],
      problem: "docker compose not available",
      hint: "Install Docker Compose v2: https://docs.docker.com/compose/install/",
    },
  ];
  for (const { cmdArgs, problem, hint } of probes) {
    try {
      await run("docker", cmdArgs);
    } catch {
      console.error(`✗ ${problem}`);
      console.error(hint);
      process.exit(1);
    }
  }
}
126
+
127
/**
 * Download `url` to the file `dest` by running `curl -fsSL -o dest url`.
 * Rejects when the curl invocation fails.
 */
async function curlToFile(url, dest) {
  const curlArgs = ["-fsSL", "-o", dest, url];
  await run("curl", curlArgs);
}
130
+
131
/**
 * Read the SearXNG port from the compose `.env` file.
 *
 * Returns the value of the first `SEARXNG_PORT=` line, trimmed and with one
 * surrounding quote character stripped from each end. Falls back to
 * DEFAULT_PORT when the file is missing, has no such line, or the value is
 * empty.
 */
async function readComposePort() {
  if (!(await exists(ENV_COMPOSE))) return DEFAULT_PORT;
  const text = await readFile(ENV_COMPOSE, "utf8");
  // Split on CRLF as well as LF: a trailing "\r" is a line terminator that "."
  // cannot match, so a CRLF-encoded file would otherwise never match and the
  // configured port would be silently ignored.
  for (const line of text.split(/\r?\n/)) {
    const m = line.match(/^SEARXNG_PORT=(.+)$/);
    if (m) return m[1].trim().replace(/^["']|["']$/g, "") || DEFAULT_PORT;
  }
  return DEFAULT_PORT;
}
140
+
141
/**
 * Make sure the project-local `.searxng/` directory has everything compose needs.
 *
 * Steps, in order:
 *  1. create the core-config directory (recursively);
 *  2. fetch the upstream docker-compose.yml if it is absent;
 *  3. create the compose `.env` from `.env.example` (fetching the example first
 *     if needed) when `.env` is absent;
 *  4. write the harness settings.yml when the file is absent or its text does
 *     not contain "json" — an existing file without "json" is overwritten.
 */
async function ensureSearxngLayout() {
  await mkdir(CORE_CONFIG, { recursive: true });

  const composeMissing = !(await exists(COMPOSE_PATH));
  if (composeMissing) {
    console.log("Fetching SearXNG docker-compose.yml …");
    await curlToFile(`${SEARXNG_BASE}/docker-compose.yml`, COMPOSE_PATH);
  }

  const envMissing = !(await exists(ENV_COMPOSE));
  if (envMissing) {
    const example = join(SEARXNG_DIR, ".env.example");
    const exampleMissing = !(await exists(example));
    if (exampleMissing) {
      console.log("Fetching SearXNG .env.example …");
      await curlToFile(`${SEARXNG_BASE}/.env.example`, example);
    }
    await copyFile(example, ENV_COMPOSE);
  }

  let settingsReady = false;
  if (await exists(SETTINGS_PATH)) {
    const current = await readFile(SETTINGS_PATH, "utf8");
    settingsReady = current.includes("json");
  }
  if (settingsReady) return;

  await writeFile(SETTINGS_PATH, HARNESS_SETTINGS, "utf8");
  console.log(`✓ Wrote ${SETTINGS_PATH} (json format, limiter off)`);
}
163
+
164
/**
 * Start the SearXNG stack with `docker compose up -d`, run from the `.searxng`
 * directory with the child's output attached to this process's stdio.
 */
async function composeUp() {
  console.log("Starting SearXNG (docker compose up -d) …");
  const upArgs = ["compose", "up", "-d"];
  await run("docker", upArgs, { cwd: SEARXNG_DIR, inherit: true });
}
168
+
169
/**
 * Poll the SearXNG JSON search endpoint until it answers successfully.
 *
 * Requests `${baseUrl}${HEALTH_PATH}` every 3 seconds (10 s timeout per
 * request) for up to 90 seconds. Returns once a 2xx response parses to a JSON
 * object.
 *
 * Throws immediately on HTTP 403, because that signals the JSON format is
 * disabled on the instance and retrying cannot fix it. Throws a timeout error
 * carrying the last observed failure when the deadline passes.
 */
async function waitForHealth(baseUrl) {
  const url = `${baseUrl}${HEALTH_PATH}`;
  const deadline = Date.now() + 90_000;
  let lastErr = "";
  while (Date.now() < deadline) {
    // Recorded inside the try but acted on outside it, so the catch below
    // (meant for transient network/parse failures) cannot swallow the error.
    let jsonForbidden = false;
    try {
      const res = await fetch(url, {
        headers: { Accept: "application/json" },
        signal: AbortSignal.timeout(10_000),
      });
      if (res.status === 403) {
        jsonForbidden = true;
      } else if (res.ok) {
        const data = await res.json();
        if (data && typeof data === "object") {
          console.log(`✓ SearXNG healthy at ${baseUrl}`);
          return;
        }
      }
      lastErr = `HTTP ${res.status}`;
    } catch (err) {
      lastErr = err instanceof Error ? err.message : String(err);
    }
    if (jsonForbidden) {
      throw new Error(
        "SearXNG returned 403 for format=json — ensure search.formats includes json in .searxng/core-config/settings.yml",
      );
    }
    await new Promise((r) => setTimeout(r, 3000));
  }
  throw new Error(`SearXNG health check timed out (${url}): ${lastErr}`);
}
199
+
200
/**
 * Return `content` with `key=value` set, following dotenv-style text.
 *
 * - If a line starting with `key=` exists, the first such line is replaced.
 * - Otherwise, if both managed-block markers are present, the line is inserted
 *   just before the end marker.
 * - Otherwise a new managed block containing the line is appended.
 *
 * The input string is not modified; the updated text is returned.
 */
function upsertEnvKey(content, key, value) {
  const line = `${key}=${value}`;
  // Escape the key so regex metacharacters in it are matched literally.
  const escapedKey = key.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const re = new RegExp(`^${escapedKey}=.*$`, "m");
  if (re.test(content)) {
    // A replacer function keeps "$&", "$1", "$$" etc. in the value literal;
    // a plain replacement string would expand them.
    return content.replace(re, () => line);
  }
  if (content.includes(MANAGED_START) && content.includes(MANAGED_END)) {
    const end = content.indexOf(MANAGED_END);
    return `${content.slice(0, end)}${line}\n${content.slice(end)}`;
  }
  const sep = content.endsWith("\n") || content.length === 0 ? "" : "\n";
  return `${content}${sep}${MANAGED_START}\n# harness-web (SearXNG)\n${line}\n${MANAGED_END}\n`;
}
213
+
214
/**
 * Point the project's `.env` at SearXNG.
 *
 * Starts from the existing `.env` text, or from the packaged
 * env.harness.template when there is no `.env` (or from empty text when neither
 * exists). It then upserts HARNESS_WEB_SEARCH_ENGINE=searxng and
 * HARNESS_WEB_SEARXNG_URL=<baseUrl>, and writes the result back with a trailing
 * newline.
 */
async function upsertHarnessEnv(baseUrl) {
  const envPath = join(PROJECT_ROOT, ".env");

  let seed = "";
  if (await exists(envPath)) {
    seed = await readFile(envPath, "utf8");
  } else {
    const template = join(UP_PKG, ".pi", "harness", "env.harness.template");
    if (await exists(template)) {
      seed = await readFile(template, "utf8");
    }
  }

  const updates = [
    ["HARNESS_WEB_SEARCH_ENGINE", "searxng"],
    ["HARNESS_WEB_SEARXNG_URL", baseUrl],
  ];
  const updated = updates.reduce(
    (text, [key, value]) => upsertEnvKey(text, key, value),
    seed,
  );
  const finalText = updated.endsWith("\n") ? updated : `${updated}\n`;

  await writeFile(envPath, finalText, "utf8");
  console.log(`✓ Updated .env: HARNESS_WEB_SEARCH_ENGINE=searxng, HARNESS_WEB_SEARXNG_URL=${baseUrl}`);
}
230
+
231
/**
 * Canonicalize a user-supplied SearXNG base URL.
 *
 * Trims surrounding whitespace and removes trailing slashes. Throws an Error
 * quoting the raw input when the result does not start with `http://` or
 * `https://` (case-insensitive).
 */
function normalizeBaseUrl(raw) {
  const trimmed = raw.trim();
  const candidate = trimmed.replace(/\/+$/, "");
  const hasHttpScheme = /^https?:\/\//i.test(candidate);
  if (hasHttpScheme) {
    return candidate;
  }
  throw new Error(`Invalid SearXNG URL: ${raw}`);
}
238
+
239
/**
 * Entry point. Three modes, selected by the CLI flags:
 *
 *  - `--set-url <url>`: health-check an existing instance, record it in the
 *    project `.env`, and exit 0.
 *  - `--url-only`: print the local base URL derived from the compose port and
 *    exit 0 without touching Docker.
 *  - default: require Docker, lay out `.searxng/`, start the containers, wait
 *    for the JSON endpoint, then record the URL in the project `.env`.
 *
 * Throws when `--set-url` is given without a value, rather than silently
 * falling through to the Docker bootstrap.
 */
async function main() {
  if (setUrlIdx !== -1 && !setUrl) {
    throw new Error(
      "--set-url requires a base URL, e.g. --set-url http://127.0.0.1:8080",
    );
  }

  if (setUrl) {
    const baseUrl = normalizeBaseUrl(setUrl);
    await waitForHealth(baseUrl);
    await upsertHarnessEnv(baseUrl);
    process.exit(0);
  }

  if (urlOnly) {
    // readComposePort already falls back to DEFAULT_PORT when the compose
    // .env file is absent, so no separate existence check is needed.
    const port = await readComposePort();
    console.log(`http://127.0.0.1:${port}`);
    process.exit(0);
  }

  await requireDocker();
  await ensureSearxngLayout();
  const port = await readComposePort();
  const baseUrl = `http://127.0.0.1:${port}`;
  await composeUp();
  await waitForHealth(baseUrl);
  await upsertHarnessEnv(baseUrl);

  console.log("");
  console.log("SearXNG is ready for harness-web:");
  console.log(`  HARNESS_WEB_SEARXNG_URL=${baseUrl}`);
  console.log(`  Test: python3 "${join(UP_PKG, ".pi/scripts/harness-web.py")}" search "test" -o .web/search.json --limit 2`);
}
266
+
267
// Run the bootstrap; report any failure on stderr and exit non-zero.
main().catch((err) => {
  const reason = err.message || err;
  console.error(`✗ ${reason}`);
  process.exit(1);
});
@@ -1,12 +1,21 @@
1
1
  # harness-web search (internal)
2
2
 
3
- ## Engine
3
+ Routing: `harness_web/search.py` dispatches by `HARNESS_WEB_SEARCH_ENGINE`.
4
4
 
5
- Default: DuckDuckGo static HTML — `GET https://html.duckduckgo.com/html/?q=…`
5
+ ## Engines
6
6
 
7
- Implemented in `harness_web/search_ddg.py` via `Fetcher.get` (HTTP, not a browser per query).
7
+ | Value | Module | Notes |
8
+ |-------|--------|-------|
9
+ | `ddg_html` (default) | `search_ddg.py` | DuckDuckGo HTML SERP via Scrapling HTTP (+ one stealth retry on challenge) |
10
+ | `searxng` | `search_searxng.py` | Self-hosted JSON API — requires `HARNESS_WEB_SEARXNG_URL` |
8
11
 
9
- ## Selectors
12
+ Bootstrap local SearXNG: `node "$UP_PKG/.pi/scripts/harness-searxng-bootstrap.mjs"`
13
+
14
+ ## DuckDuckGo HTML (`ddg_html`)
15
+
16
+ `GET https://html.duckduckgo.com/html/?q=…`
17
+
18
+ ### Selectors
10
19
 
11
20
  | Field | CSS |
12
21
  |-------|-----|
@@ -16,10 +25,18 @@ Implemented in `harness_web/search_ddg.py` via `Fetcher.get` (HTTP, not a browse
16
25
 
17
26
  DDG redirect URLs (`//duckduckgo.com/l/?uddg=…`) are unwrapped to the target `uddg` parameter.
18
27
 
19
- ## Challenge detection
28
+ ### Challenge detection
20
29
 
21
30
  If status 403 or HTML contains challenge markers (`anomaly-modal`, etc.), retry **once** with `StealthyFetcher`, then exit with a clear “search engine blocked” message.
22
31
 
32
+ ## SearXNG (`searxng`)
33
+
34
+ `GET {HARNESS_WEB_SEARXNG_URL}/search?q=…&format=json&pageno=1`
35
+
36
+ - No client API token (SearXNG has no standard search API key).
37
+ - `search.formats` in instance `settings.yml` must include `json` or the API returns **403**.
38
+ - Public instances are unsuitable (~4 JSON req/hr when limiter on; JSON often disabled). Use self-hosted bootstrap.
39
+
23
40
  ## Output
24
41
 
25
42
  `.web/search.json` — envelope compatible with legacy Firecrawl skills:
@@ -31,3 +48,5 @@ If status 403 or HTML contains challenge markers (`anomaly-modal`, etc.), retry
31
48
  "data": { "web": [{ "url", "title", "description" }] }
32
49
  }
33
50
  ```
51
+
52
+ `engine` reflects the active backend (`ddg_html` or `searxng`).
@@ -36,7 +36,7 @@ if str(SCRIPT_DIR) not in sys.path:
36
36
  from harness_web.config import HarnessWebConfig, load_config # noqa: E402
37
37
  from harness_web.output import write_search_results # noqa: E402
38
38
  from harness_web.scrape import bulk_scrape, map_url, scrape_url # noqa: E402
39
- from harness_web.search_ddg import search_ddg # noqa: E402
39
+ from harness_web.search import search # noqa: E402
40
40
 
41
41
  DEFAULT_WEB_DIR = ".web"
42
42
 
@@ -47,8 +47,8 @@ def _default_out(sub: str) -> Path:
47
47
 
48
48
  def cmd_search(args: argparse.Namespace, config: HarnessWebConfig) -> int:
49
49
  out = Path(args.output or _default_out("search.json"))
50
- results = search_ddg(args.query, limit=args.limit, config=config)
51
- write_search_results(out, results, args.query)
50
+ results = search(args.query, limit=args.limit, config=config)
51
+ write_search_results(out, results, args.query, engine=config.search_engine)
52
52
  print(f"wrote {out} ({len(results)} results)")
53
53
  return 0
54
54
 
@@ -76,6 +76,20 @@ def cmd_map(args: argparse.Namespace, config: HarnessWebConfig) -> int:
76
76
  return 0
77
77
 
78
78
 
79
def cmd_status(_args: argparse.Namespace, config: HarnessWebConfig) -> int:
    """Print the active harness-web configuration as indented JSON on stdout.

    The report holds the search engine, SearXNG URL and fetch mode read from
    ``config``, the resolved path of this script, and a constant
    ``"bootstrap": "ok"`` marker. ``_args`` is unused. Always returns 0.
    """
    import json

    script_path = Path(__file__).resolve()
    report = dict(
        search_engine=config.search_engine,
        searxng_url=config.searxng_url,
        fetch_mode=config.fetch_mode,
        script=str(script_path),
        bootstrap="ok",
    )
    print(json.dumps(report, indent=2))
    return 0
91
+
92
+
79
93
  def cmd_bulk_scrape(args: argparse.Namespace, config: HarnessWebConfig) -> int:
80
94
  sleep_sec = args.sleep if args.sleep is not None else config.rate_limit_ms / 1000.0
81
95
  if args.urls:
@@ -86,8 +100,8 @@ def cmd_bulk_scrape(args: argparse.Namespace, config: HarnessWebConfig) -> int:
86
100
  data = json.loads(Path(args.from_search).read_text(encoding="utf-8"))
87
101
  urls = [item["url"] for item in data.get("data", {}).get("web", []) if item.get("url")]
88
102
  else:
89
- urls = search_ddg(args.query, limit=args.limit, config=config)
90
- urls = [r["url"] for r in urls]
103
+ serp = search(args.query, limit=args.limit, config=config)
104
+ urls = [r["url"] for r in serp]
91
105
 
92
106
  if not urls:
93
107
  print("bulk-scrape: no URLs to fetch", file=sys.stderr)
@@ -111,11 +125,11 @@ def cmd_bulk_scrape(args: argparse.Namespace, config: HarnessWebConfig) -> int:
111
125
  def build_parser() -> argparse.ArgumentParser:
112
126
  p = argparse.ArgumentParser(
113
127
  prog="harness-web",
114
- description="Harness web layer: search (DDG HTML) and scrape (Scrapling).",
128
+ description="Harness web layer: search (DDG HTML or SearXNG) and scrape (Scrapling).",
115
129
  )
116
130
  sub = p.add_subparsers(dest="command", required=True)
117
131
 
118
- ps = sub.add_parser("search", help="Search via DuckDuckGo HTML SERP")
132
+ ps = sub.add_parser("search", help="Search via configured SERP (HARNESS_WEB_SEARCH_ENGINE)")
119
133
  ps.add_argument("query", help="Search query")
120
134
  ps.add_argument("-o", "--output", help="JSON output path (default: .web/search.json)")
121
135
  ps.add_argument("--limit", type=int, default=5)
@@ -160,6 +174,9 @@ def build_parser() -> argparse.ArgumentParser:
160
174
  pm.add_argument("--fast", action="store_true")
161
175
  pm.set_defaults(func=cmd_map)
162
176
 
177
+ pst = sub.add_parser("status", help="Print harness-web config as JSON (setup/diagnostics)")
178
+ pst.set_defaults(func=cmd_status)
179
+
163
180
  return p
164
181
 
165
182
 
@@ -6,6 +6,8 @@ import os
6
6
  from dataclasses import dataclass
7
7
  from urllib.parse import urlparse
8
8
 
9
+ SUPPORTED_SEARCH_ENGINES = frozenset({"ddg_html", "searxng"})
10
+
9
11
 
10
12
  def _int_env(name: str, default: int) -> int:
11
13
  raw = os.environ.get(name, "").strip()
@@ -24,6 +26,18 @@ def _fetch_mode() -> str:
24
26
  return "stealth"
25
27
 
26
28
 
29
def _normalize_searxng_url(raw: str) -> str:
    """Return ``raw`` stripped of surrounding whitespace and trailing slashes.

    An input that is empty after stripping yields ``""``. Anything else must
    parse with an ``http`` or ``https`` scheme and a non-empty network location.

    Raises:
        SystemExit: if the scheme or host check fails; the message quotes the
            raw value.
    """
    candidate = raw.strip().rstrip("/")
    if candidate == "":
        return ""
    parts = urlparse(candidate)
    is_http = parts.scheme in ("http", "https")
    if is_http and parts.netloc:
        return candidate
    raise SystemExit(
        f"Invalid HARNESS_WEB_SEARXNG_URL={raw!r} — expected http(s)://host[:port]"
    )
39
+
40
+
27
41
  _STATIC_HOSTS = frozenset(
28
42
  {
29
43
  "example.com",
@@ -50,6 +64,7 @@ def host_is_static(url: str) -> bool:
50
64
  class HarnessWebConfig:
51
65
  fetch_mode: str
52
66
  search_engine: str
67
+ searxng_url: str | None
53
68
  proxy: str | None
54
69
  rate_limit_ms: int
55
70
  timeout_ms: int
@@ -68,13 +83,32 @@ class HarnessWebConfig:
68
83
  return False
69
84
 
70
85
 
86
def validate_search_config(config: HarnessWebConfig) -> None:
    """Check that the search settings in ``config`` are usable.

    Raises:
        SystemExit: if ``config.search_engine`` is not one of
            ``SUPPORTED_SEARCH_ENGINES``, or if the engine is ``"searxng"`` and
            ``config.searxng_url`` is empty or ``None``.
    """
    selected = config.search_engine
    if selected not in SUPPORTED_SEARCH_ENGINES:
        names = ", ".join(sorted(SUPPORTED_SEARCH_ENGINES))
        raise SystemExit(
            f"Unsupported HARNESS_WEB_SEARCH_ENGINE={selected!r} (supported: {names})"
        )

    needs_url = selected == "searxng"
    if needs_url and not config.searxng_url:
        raise SystemExit(
            "HARNESS_WEB_SEARCH_ENGINE=searxng requires HARNESS_WEB_SEARXNG_URL "
            "(e.g. http://127.0.0.1:8080). Run /harness-setup and choose SearXNG, or set both in .env."
        )
98
+
99
+
71
100
  def load_config() -> HarnessWebConfig:
72
101
  proxy = os.environ.get("HARNESS_WEB_PROXY", "").strip() or None
73
- return HarnessWebConfig(
102
+ engine = os.environ.get("HARNESS_WEB_SEARCH_ENGINE", "ddg_html").strip() or "ddg_html"
103
+ searx_raw = os.environ.get("HARNESS_WEB_SEARXNG_URL", "").strip()
104
+ searxng_url = _normalize_searxng_url(searx_raw) if searx_raw else None
105
+ config = HarnessWebConfig(
74
106
  fetch_mode=_fetch_mode(),
75
- search_engine=os.environ.get("HARNESS_WEB_SEARCH_ENGINE", "ddg_html").strip()
76
- or "ddg_html",
107
+ search_engine=engine,
108
+ searxng_url=searxng_url,
77
109
  proxy=proxy,
78
110
  rate_limit_ms=_int_env("HARNESS_WEB_RATE_LIMIT_MS", 2000),
79
111
  timeout_ms=_int_env("HARNESS_WEB_TIMEOUT_MS", 30000),
80
112
  )
113
+ validate_search_config(config)
114
+ return config
@@ -18,13 +18,19 @@ def write_json(path: Path, payload: Any) -> None:
18
18
  path.write_text(json.dumps(payload, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")
19
19
 
20
20
 
21
- def write_search_results(path: Path, results: list[dict[str, str]], query: str) -> None:
21
+ def write_search_results(
22
+ path: Path,
23
+ results: list[dict[str, str]],
24
+ query: str,
25
+ *,
26
+ engine: str,
27
+ ) -> None:
22
28
  """Firecrawl-compatible envelope: data.web[].url|title|description."""
23
29
  write_json(
24
30
  path,
25
31
  {
26
32
  "query": query,
27
- "engine": "ddg_html",
33
+ "engine": engine,
28
34
  "data": {
29
35
  "web": [
30
36
  {
@@ -0,0 +1,22 @@
1
+ """Route harness-web search to the configured SERP backend."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from .config import HarnessWebConfig, validate_search_config
6
+ from .search_ddg import search_ddg
7
+ from .search_searxng import search_searxng
8
+
9
+
10
def search(
    query: str,
    *,
    limit: int,
    config: HarnessWebConfig,
) -> list[dict[str, str]]:
    """Run ``query`` against the backend named by ``config.search_engine``.

    The configuration is validated first, then the call is forwarded with the
    same ``limit`` and ``config`` to ``search_searxng`` or ``search_ddg``.

    Raises:
        SystemExit: if validation fails or no backend is registered for the
            configured engine.
    """
    validate_search_config(config)
    backends = {
        "searxng": search_searxng,
        "ddg_html": search_ddg,
    }
    selected = config.search_engine
    backend = backends.get(selected)
    if backend is None:
        raise SystemExit(f"Unsupported HARNESS_WEB_SEARCH_ENGINE={selected!r}")
    return backend(query, limit=limit, config=config)
@@ -2,6 +2,7 @@
2
2
 
3
3
  from __future__ import annotations
4
4
 
5
+ from typing import Any
5
6
  from urllib.parse import parse_qs, unquote, urlparse
6
7
 
7
8
  from scrapling.fetchers import Fetcher, StealthyFetcher
@@ -63,11 +64,6 @@ def search_ddg(
63
64
  config: HarnessWebConfig,
64
65
  impersonate: bool = True,
65
66
  ) -> list[dict[str, str]]:
66
- if config.search_engine != "ddg_html":
67
- raise SystemExit(
68
- f"Unsupported HARNESS_WEB_SEARCH_ENGINE={config.search_engine!r} (only ddg_html)"
69
- )
70
-
71
67
  kwargs: dict = {
72
68
  "params": {"q": query},
73
69
  "timeout": config.timeout_sec,