ultimate-pi 0.4.1 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.agents/skills/harness-decisions/SKILL.md +15 -0
- package/.agents/skills/scrapling-web/SKILL.md +45 -40
- package/.agents/skills/wiki-autoresearch/SKILL.md +3 -3
- package/.pi/SYSTEM.md +12 -13
- package/.pi/agents/pi-pi/agent-expert.md +3 -3
- package/.pi/extensions/harness-web-guard.ts +95 -0
- package/.pi/extensions/harness-web-tools.ts +209 -0
- package/.pi/extensions/lib/harness-web/run-cli.ts +92 -0
- package/.pi/harness/env.harness.template +3 -1
- package/.pi/prompts/harness-setup.md +46 -0
- package/.pi/scripts/harness-cli-verify.sh +12 -3
- package/.pi/scripts/harness-searxng-bootstrap.mjs +270 -0
- package/.pi/scripts/harness-web-search.md +24 -5
- package/.pi/scripts/harness-web.py +24 -7
- package/.pi/scripts/harness_web/config.py +37 -3
- package/.pi/scripts/harness_web/output.py +8 -2
- package/.pi/scripts/harness_web/search.py +22 -0
- package/.pi/scripts/harness_web/search_ddg.py +1 -5
- package/.pi/scripts/harness_web/search_searxng.py +100 -0
- package/CHANGELOG.md +12 -0
- package/package.json +2 -2
|
@@ -134,10 +134,13 @@ export PATH="$HOME/.local/bin:$PATH"
|
|
|
134
134
|
uv tool install "scrapling[fetchers]"
|
|
135
135
|
scrapling install # Chromium for default stealth scrape; may need sudo for OS libs on Linux
|
|
136
136
|
mkdir -p .web
|
|
137
|
+
python3 "$UP_PKG/.pi/scripts/harness-web.py" status # JSON config (setup/diagnostics only)
|
|
137
138
|
python3 "$UP_PKG/.pi/scripts/harness-web.py" search "ultimate-pi harness" -o .web/smoke-search.json --limit 3
|
|
138
139
|
python3 "$UP_PKG/.pi/scripts/harness-web.py" scrape "https://example.com" -o .web/smoke-page.md --fast
|
|
139
140
|
```
|
|
140
141
|
|
|
142
|
+
After pi loads extensions, agents should smoke-test **`web_search`** once (instead of running a `UP_PKG` / `import scrapling` preflight check). Example intent: query `ultimate-pi harness` with `limit` 2.
|
|
143
|
+
|
|
141
144
|
- **`--skip-tools`:** skip Step 2 (includes Scrapling verify).
|
|
142
145
|
- On Linux/WSL, if stealth scrape fails, install browser libs from `harness-cli-verify.sh` output or use `--fast` for static targets.
|
|
143
146
|
|
|
@@ -421,6 +424,47 @@ If **no** `.env` at project root:
|
|
|
421
424
|
- On **skip** or `--non-interactive`: warn in report (non-interactive skips creation)
|
|
422
425
|
- If `ask_user` cancelled: stop with `needs_clarification`
|
|
423
426
|
|
|
427
|
+
### 4.0b — harness-web search engine (non-destructive)
|
|
428
|
+
|
|
429
|
+
Unless `--non-interactive`, **call `ask_user`** after Step 4.0 (harness-decisions skill):
|
|
430
|
+
|
|
431
|
+
```json
|
|
432
|
+
{
|
|
433
|
+
"question": "Which harness-web search backend should this project use?",
|
|
434
|
+
"context": "Scrapling still handles scrape/map/bulk. Search only: DuckDuckGo HTML needs no extra services. SearXNG must be self-hosted for agents — public instances often block JSON (403) and default to ~4 API requests/hour per IP.",
|
|
435
|
+
"options": [
|
|
436
|
+
{
|
|
437
|
+
"title": "DuckDuckGo HTML (default)",
|
|
438
|
+
"description": "HARNESS_WEB_SEARCH_ENGINE=ddg_html — no Docker"
|
|
439
|
+
},
|
|
440
|
+
{
|
|
441
|
+
"title": "Self-host SearXNG here (Docker)",
|
|
442
|
+
"description": "Bootstrap .searxng/ with official compose, enable JSON API, set harness env"
|
|
443
|
+
},
|
|
444
|
+
{
|
|
445
|
+
"title": "Use existing SearXNG instance",
|
|
446
|
+
"description": "You provide base URL; harness writes HARNESS_WEB_SEARXNG_URL"
|
|
447
|
+
}
|
|
448
|
+
],
|
|
449
|
+
"allowFreeform": true
|
|
450
|
+
}
|
|
451
|
+
```
|
|
452
|
+
|
|
453
|
+
| User choice | Actions |
|
|
454
|
+
|-------------|---------|
|
|
455
|
+
| **DDG** | Ensure `.env` has `HARNESS_WEB_SEARCH_ENGINE=ddg_html` via `harness-sync-env.mjs` (append only if missing; do not overwrite user values) |
|
|
456
|
+
| **Self-host** | `node "$UP_PKG/.pi/scripts/harness-searxng-bootstrap.mjs"` (requires Docker). Script sets `HARNESS_WEB_SEARCH_ENGINE=searxng` and `HARNESS_WEB_SEARXNG_URL` |
|
|
457
|
+
| **Existing instance** | Parse base URL from freeform answer. Run `node "$UP_PKG/.pi/scripts/harness-searxng-bootstrap.mjs" --set-url {url}` (health check + upsert `.env`) |
|
|
458
|
+
| **Cancelled** | Stop with `needs_clarification` |
|
|
459
|
+
| **`--non-interactive`** | Skip prompt; leave/default `ddg_html`; do not run Docker bootstrap |
|
|
460
|
+
|
|
461
|
+
Post-choice smoke (report pass/fail):
|
|
462
|
+
|
|
463
|
+
```bash
|
|
464
|
+
mkdir -p .web
|
|
465
|
+
python3 "$UP_PKG/.pi/scripts/harness-web.py" search "ultimate-pi harness" -o .web/setup-search.json --limit 2
|
|
466
|
+
```
|
|
467
|
+
|
|
424
468
|
Rules:
|
|
425
469
|
|
|
426
470
|
- **Do not** `cp` over an existing `.env`.
|
|
@@ -436,6 +480,7 @@ Ensure `.gitignore` contains:
|
|
|
436
480
|
```
|
|
437
481
|
.env
|
|
438
482
|
.web/
|
|
483
|
+
.searxng/
|
|
439
484
|
.raw/
|
|
440
485
|
.vault-meta/
|
|
441
486
|
.pi/harness/critics/
|
|
@@ -646,6 +691,7 @@ Output summary table:
|
|
|
646
691
|
| .gitignore | ✓/✗ | entries added (incl. `.env`) |
|
|
647
692
|
| ./raw directory | ✓/✗ | Created for graphify source ingestion |
|
|
648
693
|
| harness-web (Scrapling) | ✓/✗ | search + scrape smoke |
|
|
694
|
+
| harness-web search engine | ddg / searxng / — | Step 4.0b choice; SearXNG URL if applicable |
|
|
649
695
|
|
|
650
696
|
Next steps:
|
|
651
697
|
1. If tools missing: re-run with `--force` or install individually
|
|
@@ -200,10 +200,19 @@ verify_scrapling() {
|
|
|
200
200
|
return
|
|
201
201
|
fi
|
|
202
202
|
mkdir -p .web
|
|
203
|
-
|
|
204
|
-
|
|
203
|
+
_search_engine="${HARNESS_WEB_SEARCH_ENGINE:-ddg_html}"
|
|
204
|
+
if [ "$_search_engine" = "searxng" ]; then
|
|
205
|
+
if [ -z "${HARNESS_WEB_SEARXNG_URL:-}" ]; then
|
|
206
|
+
fail "HARNESS_WEB_SEARCH_ENGINE=searxng but HARNESS_WEB_SEARXNG_URL is unset"
|
|
207
|
+
elif python3 "$_hw" search "ultimate-pi harness" -o .web/verify-search.json --limit 2 2>/dev/null | grep -q wrote; then
|
|
208
|
+
pass "harness-web search smoke (searxng)"
|
|
209
|
+
else
|
|
210
|
+
fail "harness-web search smoke failed (searxng at ${HARNESS_WEB_SEARXNG_URL})"
|
|
211
|
+
fi
|
|
212
|
+
elif python3 "$_hw" search "ultimate-pi harness" -o .web/verify-search.json --limit 2 2>/dev/null | grep -q wrote; then
|
|
213
|
+
pass "harness-web search smoke (ddg_html)"
|
|
205
214
|
else
|
|
206
|
-
fail "harness-web search smoke failed"
|
|
215
|
+
fail "harness-web search smoke failed (ddg_html)"
|
|
207
216
|
fi
|
|
208
217
|
if python3 "$_hw" scrape "https://example.com" -o .web/verify-page.md --fast 2>/dev/null | grep -q wrote; then
|
|
209
218
|
pass "harness-web scrape --fast smoke"
|
|
@@ -0,0 +1,270 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
/**
|
|
3
|
+
* Bootstrap a project-local SearXNG instance for harness-web (Docker Compose).
|
|
4
|
+
*
|
|
5
|
+
* - Creates .searxng/ with official upstream compose template
|
|
6
|
+
* - Writes core-config/settings.yml with json format + limiter off (local dev)
|
|
7
|
+
* - Starts containers and waits for JSON search health
|
|
8
|
+
* - Upserts HARNESS_WEB_SEARCH_ENGINE / HARNESS_WEB_SEARXNG_URL in project .env
|
|
9
|
+
*
|
|
10
|
+
* Usage:
|
|
11
|
+
* node "$UP_PKG/.pi/scripts/harness-searxng-bootstrap.mjs" [PROJECT_ROOT] [--url-only]
|
|
12
|
+
* node "$UP_PKG/.pi/scripts/harness-searxng-bootstrap.mjs" --set-url http://127.0.0.1:8080
|
|
13
|
+
*
|
|
14
|
+
* Requires: docker, docker compose, curl
|
|
15
|
+
*/
|
|
16
|
+
|
|
17
|
+
import {
|
|
18
|
+
access,
|
|
19
|
+
copyFile,
|
|
20
|
+
mkdir,
|
|
21
|
+
readFile,
|
|
22
|
+
writeFile,
|
|
23
|
+
} from "node:fs/promises";
|
|
24
|
+
import { constants } from "node:fs";
|
|
25
|
+
import { join, dirname } from "node:path";
|
|
26
|
+
import { fileURLToPath } from "node:url";
|
|
27
|
+
import { spawn } from "node:child_process";
|
|
28
|
+
|
|
29
|
+
const SCRIPT_DIR = dirname(fileURLToPath(import.meta.url));
|
|
30
|
+
const UP_PKG = join(SCRIPT_DIR, "..", "..");
|
|
31
|
+
|
|
32
|
+
const SEARXNG_BASE =
|
|
33
|
+
"https://raw.githubusercontent.com/searxng/searxng/master/container";
|
|
34
|
+
const DEFAULT_PORT = "8080";
|
|
35
|
+
const HEALTH_PATH = "/search?q=harness&format=json";
|
|
36
|
+
|
|
37
|
+
const MANAGED_START = "# --- harness:env:start ---";
|
|
38
|
+
const MANAGED_END = "# --- harness:env:end ---";
|
|
39
|
+
|
|
40
|
+
/**
 * Split raw CLI arguments into positionals, flags and the `--set-url` value.
 *
 * The token following `--set-url` is consumed as that flag's value. Without
 * this, the URL (which does not start with "-") would be treated as the first
 * positional argument and end up being used as PROJECT_ROOT.
 *
 * @param {string[]} argv Arguments after the node binary and script path.
 * @returns {{positionals: string[], flags: Set<string>, setUrl: string | null}}
 * @throws {Error} When `--set-url` is not followed by a value.
 */
function parseCliArgs(argv) {
  const positionals = [];
  const flagSet = new Set();
  let setUrlValue = null;
  for (let i = 0; i < argv.length; i += 1) {
    const arg = argv[i];
    if (arg === "--set-url") {
      const value = argv[i + 1];
      if (value === undefined || value.startsWith("-")) {
        throw new Error("--set-url requires a URL argument");
      }
      flagSet.add(arg);
      setUrlValue = value;
      i += 1; // skip the value so it is not mistaken for PROJECT_ROOT
    } else if (arg.startsWith("-")) {
      flagSet.add(arg);
    } else {
      positionals.push(arg);
    }
  }
  return { positionals, flags: flagSet, setUrl: setUrlValue };
}

let cli;
try {
  cli = parseCliArgs(process.argv.slice(2));
} catch (err) {
  // Fail loudly instead of silently falling through to the Docker bootstrap.
  console.error(`✗ ${err.message}`);
  process.exit(1);
}

// Positional arguments (only PROJECT_ROOT is read, as args[0]).
const args = cli.positionals;
// Every token that starts with "-".
const flags = cli.flags;
const urlOnly = flags.has("--url-only");
// Base URL passed via `--set-url <url>`, or null when the flag is absent.
const setUrl = cli.setUrl;

const PROJECT_ROOT = args[0] || process.cwd();
|
|
47
|
+
const SEARXNG_DIR = join(PROJECT_ROOT, ".searxng");
|
|
48
|
+
const CORE_CONFIG = join(SEARXNG_DIR, "core-config");
|
|
49
|
+
const SETTINGS_PATH = join(CORE_CONFIG, "settings.yml");
|
|
50
|
+
const COMPOSE_PATH = join(SEARXNG_DIR, "docker-compose.yml");
|
|
51
|
+
const ENV_COMPOSE = join(SEARXNG_DIR, ".env");
|
|
52
|
+
|
|
53
|
+
const HARNESS_SETTINGS = `use_default_settings: true
|
|
54
|
+
|
|
55
|
+
search:
|
|
56
|
+
formats:
|
|
57
|
+
- html
|
|
58
|
+
- json
|
|
59
|
+
|
|
60
|
+
server:
|
|
61
|
+
limiter: false
|
|
62
|
+
public_instance: false
|
|
63
|
+
`;
|
|
64
|
+
|
|
65
|
+
async function exists(path) {
|
|
66
|
+
try {
|
|
67
|
+
await access(path, constants.F_OK);
|
|
68
|
+
return true;
|
|
69
|
+
} catch {
|
|
70
|
+
return false;
|
|
71
|
+
}
|
|
72
|
+
}
|
|
73
|
+
|
|
74
|
+
function run(cmd, cmdArgs, opts = {}) {
|
|
75
|
+
return new Promise((resolve, reject) => {
|
|
76
|
+
const child = spawn(cmd, cmdArgs, {
|
|
77
|
+
stdio: opts.inherit ? "inherit" : "pipe",
|
|
78
|
+
cwd: opts.cwd,
|
|
79
|
+
env: { ...process.env, ...opts.env },
|
|
80
|
+
});
|
|
81
|
+
let stdout = "";
|
|
82
|
+
let stderr = "";
|
|
83
|
+
if (!opts.inherit) {
|
|
84
|
+
child.stdout?.on("data", (d) => {
|
|
85
|
+
stdout += d;
|
|
86
|
+
});
|
|
87
|
+
child.stderr?.on("data", (d) => {
|
|
88
|
+
stderr += d;
|
|
89
|
+
});
|
|
90
|
+
}
|
|
91
|
+
child.on("error", reject);
|
|
92
|
+
child.on("close", (code) => {
|
|
93
|
+
if (code === 0) resolve({ stdout, stderr });
|
|
94
|
+
else
|
|
95
|
+
reject(
|
|
96
|
+
new Error(
|
|
97
|
+
`${cmd} ${cmdArgs.join(" ")} exited ${code}\n${stderr || stdout}`,
|
|
98
|
+
),
|
|
99
|
+
);
|
|
100
|
+
});
|
|
101
|
+
});
|
|
102
|
+
}
|
|
103
|
+
|
|
104
|
+
async function requireDocker() {
|
|
105
|
+
for (const bin of ["docker"]) {
|
|
106
|
+
try {
|
|
107
|
+
await run(bin, ["--version"]);
|
|
108
|
+
} catch {
|
|
109
|
+
console.error(`✗ ${bin} not found`);
|
|
110
|
+
console.error(
|
|
111
|
+
"Install Docker: https://docs.searxng.org/admin/installation-docker.html",
|
|
112
|
+
);
|
|
113
|
+
process.exit(1);
|
|
114
|
+
}
|
|
115
|
+
}
|
|
116
|
+
try {
|
|
117
|
+
await run("docker", ["compose", "version"]);
|
|
118
|
+
} catch {
|
|
119
|
+
console.error("✗ docker compose not available");
|
|
120
|
+
console.error(
|
|
121
|
+
"Install Docker Compose v2: https://docs.docker.com/compose/install/",
|
|
122
|
+
);
|
|
123
|
+
process.exit(1);
|
|
124
|
+
}
|
|
125
|
+
}
|
|
126
|
+
|
|
127
|
+
async function curlToFile(url, dest) {
|
|
128
|
+
await run("curl", ["-fsSL", "-o", dest, url]);
|
|
129
|
+
}
|
|
130
|
+
|
|
131
|
+
/**
 * Read the SearXNG host port from the compose `.env` file.
 *
 * Returns DEFAULT_PORT when the file does not exist, has no `SEARXNG_PORT=`
 * line, or the value is empty. One pair of surrounding quotes is stripped.
 *
 * @returns {Promise<string>} The port as a string.
 */
async function readComposePort() {
  if (!(await exists(ENV_COMPOSE))) return DEFAULT_PORT;
  const text = await readFile(ENV_COMPOSE, "utf8");
  // Split on LF or CRLF: a trailing "\r" would defeat the anchored regex
  // below, because "." never matches a carriage return.
  for (const line of text.split(/\r?\n/)) {
    const m = line.match(/^SEARXNG_PORT=(.+)$/);
    if (m) return m[1].trim().replace(/^["']|["']$/g, "") || DEFAULT_PORT;
  }
  return DEFAULT_PORT;
}
|
|
140
|
+
|
|
141
|
+
async function ensureSearxngLayout() {
|
|
142
|
+
await mkdir(CORE_CONFIG, { recursive: true });
|
|
143
|
+
if (!(await exists(COMPOSE_PATH))) {
|
|
144
|
+
console.log("Fetching SearXNG docker-compose.yml …");
|
|
145
|
+
await curlToFile(`${SEARXNG_BASE}/docker-compose.yml`, COMPOSE_PATH);
|
|
146
|
+
}
|
|
147
|
+
if (!(await exists(ENV_COMPOSE))) {
|
|
148
|
+
const example = join(SEARXNG_DIR, ".env.example");
|
|
149
|
+
if (!(await exists(example))) {
|
|
150
|
+
console.log("Fetching SearXNG .env.example …");
|
|
151
|
+
await curlToFile(`${SEARXNG_BASE}/.env.example`, example);
|
|
152
|
+
}
|
|
153
|
+
await copyFile(example, ENV_COMPOSE);
|
|
154
|
+
}
|
|
155
|
+
const needsSettings =
|
|
156
|
+
!(await exists(SETTINGS_PATH)) ||
|
|
157
|
+
!(await readFile(SETTINGS_PATH, "utf8")).includes("json");
|
|
158
|
+
if (needsSettings) {
|
|
159
|
+
await writeFile(SETTINGS_PATH, HARNESS_SETTINGS, "utf8");
|
|
160
|
+
console.log(`✓ Wrote ${SETTINGS_PATH} (json format, limiter off)`);
|
|
161
|
+
}
|
|
162
|
+
}
|
|
163
|
+
|
|
164
|
+
async function composeUp() {
|
|
165
|
+
console.log("Starting SearXNG (docker compose up -d) …");
|
|
166
|
+
await run("docker", ["compose", "up", "-d"], { cwd: SEARXNG_DIR, inherit: true });
|
|
167
|
+
}
|
|
168
|
+
|
|
169
|
+
/**
 * Poll the SearXNG JSON search endpoint until it answers with a JSON object.
 *
 * Retries every 3 s for up to 90 s. Network errors, non-OK statuses and
 * unparsable bodies are retried. HTTP 403 aborts immediately: it is reported
 * as a settings problem (json missing from search.formats), which waiting
 * will not fix.
 *
 * @param {string} baseUrl Instance base URL without a trailing slash.
 * @returns {Promise<void>} Resolves once the instance is healthy.
 * @throws {Error} On HTTP 403, or when the deadline passes without success.
 */
async function waitForHealth(baseUrl) {
  const url = `${baseUrl}${HEALTH_PATH}`;
  const deadline = Date.now() + 90_000;
  let lastErr = "";
  while (Date.now() < deadline) {
    let res = null;
    try {
      res = await fetch(url, {
        headers: { Accept: "application/json" },
        signal: AbortSignal.timeout(10_000),
      });
    } catch (err) {
      // Connection refused / timeout while the instance is still starting.
      lastErr = err instanceof Error ? err.message : String(err);
    }
    if (res) {
      if (res.status === 403) {
        // Thrown outside the retry try/catch so it is not swallowed.
        throw new Error(
          "SearXNG returned 403 for format=json — ensure search.formats includes json in .searxng/core-config/settings.yml",
        );
      }
      lastErr = `HTTP ${res.status}`;
      if (res.ok) {
        try {
          const data = await res.json();
          if (data && typeof data === "object") {
            console.log(`✓ SearXNG healthy at ${baseUrl}`);
            return;
          }
        } catch (err) {
          // Body was not valid JSON; keep polling.
          lastErr = err instanceof Error ? err.message : String(err);
        }
      }
    }
    await new Promise((r) => setTimeout(r, 3000));
  }
  throw new Error(`SearXNG health check timed out (${url}): ${lastErr}`);
}
|
|
199
|
+
|
|
200
|
+
/**
 * Insert or replace a `KEY=value` line in dotenv-style text.
 *
 * Order of preference:
 *  1. Replace the first existing `KEY=...` line.
 *  2. Otherwise insert the line just before the managed-block end marker,
 *     when both markers are present.
 *  3. Otherwise append a new managed block containing the line.
 *
 * @param {string} content Current file text (may be empty).
 * @param {string} key Variable name.
 * @param {string} value Value written verbatim after `=`.
 * @returns {string} The updated text.
 */
function upsertEnvKey(content, key, value) {
  const line = `${key}=${value}`;
  // Escape regex metacharacters so the key is matched literally.
  const escapedKey = key.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const re = new RegExp(`^${escapedKey}=.*$`, "m");
  if (re.test(content)) {
    // Function replacer: a string replacement would expand `$&`, `$$`, …
    // sequences that may legitimately occur in the value (e.g. in a URL).
    return content.replace(re, () => line);
  }
  if (content.includes(MANAGED_START) && content.includes(MANAGED_END)) {
    const end = content.indexOf(MANAGED_END);
    return `${content.slice(0, end)}${line}\n${content.slice(end)}`;
  }
  const sep = content.endsWith("\n") || content.length === 0 ? "" : "\n";
  return `${content}${sep}${MANAGED_START}\n# harness-web (SearXNG)\n${line}\n${MANAGED_END}\n`;
}
|
|
213
|
+
|
|
214
|
+
async function upsertHarnessEnv(baseUrl) {
|
|
215
|
+
const envPath = join(PROJECT_ROOT, ".env");
|
|
216
|
+
let content = "";
|
|
217
|
+
if (await exists(envPath)) {
|
|
218
|
+
content = await readFile(envPath, "utf8");
|
|
219
|
+
} else {
|
|
220
|
+
const template = join(UP_PKG, ".pi", "harness", "env.harness.template");
|
|
221
|
+
if (await exists(template)) {
|
|
222
|
+
content = await readFile(template, "utf8");
|
|
223
|
+
}
|
|
224
|
+
}
|
|
225
|
+
content = upsertEnvKey(content, "HARNESS_WEB_SEARCH_ENGINE", "searxng");
|
|
226
|
+
content = upsertEnvKey(content, "HARNESS_WEB_SEARXNG_URL", baseUrl);
|
|
227
|
+
await writeFile(envPath, content.endsWith("\n") ? content : `${content}\n`, "utf8");
|
|
228
|
+
console.log(`✓ Updated .env: HARNESS_WEB_SEARCH_ENGINE=searxng, HARNESS_WEB_SEARXNG_URL=${baseUrl}`);
|
|
229
|
+
}
|
|
230
|
+
|
|
231
|
+
function normalizeBaseUrl(raw) {
|
|
232
|
+
const url = raw.trim().replace(/\/+$/, "");
|
|
233
|
+
if (!/^https?:\/\//i.test(url)) {
|
|
234
|
+
throw new Error(`Invalid SearXNG URL: ${raw}`);
|
|
235
|
+
}
|
|
236
|
+
return url;
|
|
237
|
+
}
|
|
238
|
+
|
|
239
|
+
async function main() {
|
|
240
|
+
if (setUrl) {
|
|
241
|
+
const baseUrl = normalizeBaseUrl(setUrl);
|
|
242
|
+
await waitForHealth(baseUrl);
|
|
243
|
+
await upsertHarnessEnv(baseUrl);
|
|
244
|
+
process.exit(0);
|
|
245
|
+
}
|
|
246
|
+
|
|
247
|
+
if (urlOnly) {
|
|
248
|
+
const port = (await exists(ENV_COMPOSE)) ? await readComposePort() : DEFAULT_PORT;
|
|
249
|
+
console.log(`http://127.0.0.1:${port}`);
|
|
250
|
+
process.exit(0);
|
|
251
|
+
}
|
|
252
|
+
|
|
253
|
+
await requireDocker();
|
|
254
|
+
await ensureSearxngLayout();
|
|
255
|
+
const port = await readComposePort();
|
|
256
|
+
const baseUrl = `http://127.0.0.1:${port}`;
|
|
257
|
+
await composeUp();
|
|
258
|
+
await waitForHealth(baseUrl);
|
|
259
|
+
await upsertHarnessEnv(baseUrl);
|
|
260
|
+
|
|
261
|
+
console.log("");
|
|
262
|
+
console.log("SearXNG is ready for harness-web:");
|
|
263
|
+
console.log(` HARNESS_WEB_SEARXNG_URL=${baseUrl}`);
|
|
264
|
+
console.log(` Test: python3 "${join(UP_PKG, ".pi/scripts/harness-web.py")}" search "test" -o .web/search.json --limit 2`);
|
|
265
|
+
}
|
|
266
|
+
|
|
267
|
+
main().catch((err) => {
|
|
268
|
+
console.error(`✗ ${err.message || err}`);
|
|
269
|
+
process.exit(1);
|
|
270
|
+
});
|
|
@@ -1,12 +1,21 @@
|
|
|
1
1
|
# harness-web search (internal)
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
Routing: `harness_web/search.py` dispatches by `HARNESS_WEB_SEARCH_ENGINE`.
|
|
4
4
|
|
|
5
|
-
|
|
5
|
+
## Engines
|
|
6
6
|
|
|
7
|
-
|
|
7
|
+
| Value | Module | Notes |
|
|
8
|
+
|-------|--------|-------|
|
|
9
|
+
| `ddg_html` (default) | `search_ddg.py` | DuckDuckGo HTML SERP via Scrapling HTTP (+ one stealth retry on challenge) |
|
|
10
|
+
| `searxng` | `search_searxng.py` | Self-hosted JSON API — requires `HARNESS_WEB_SEARXNG_URL` |
|
|
8
11
|
|
|
9
|
-
|
|
12
|
+
Bootstrap local SearXNG: `node "$UP_PKG/.pi/scripts/harness-searxng-bootstrap.mjs"`
|
|
13
|
+
|
|
14
|
+
## DuckDuckGo HTML (`ddg_html`)
|
|
15
|
+
|
|
16
|
+
`GET https://html.duckduckgo.com/html/?q=…`
|
|
17
|
+
|
|
18
|
+
### Selectors
|
|
10
19
|
|
|
11
20
|
| Field | CSS |
|
|
12
21
|
|-------|-----|
|
|
@@ -16,10 +25,18 @@ Implemented in `harness_web/search_ddg.py` via `Fetcher.get` (HTTP, not a browse
|
|
|
16
25
|
|
|
17
26
|
DDG redirect URLs (`//duckduckgo.com/l/?uddg=…`) are unwrapped to the target `uddg` parameter.
|
|
18
27
|
|
|
19
|
-
|
|
28
|
+
### Challenge detection
|
|
20
29
|
|
|
21
30
|
If status 403 or HTML contains challenge markers (`anomaly-modal`, etc.), retry **once** with `StealthyFetcher`, then exit with a clear “search engine blocked” message.
|
|
22
31
|
|
|
32
|
+
## SearXNG (`searxng`)
|
|
33
|
+
|
|
34
|
+
`GET {HARNESS_WEB_SEARXNG_URL}/search?q=…&format=json&pageno=1`
|
|
35
|
+
|
|
36
|
+
- No client API token (SearXNG has no standard search API key).
|
|
37
|
+
- `search.formats` in instance `settings.yml` must include `json` or the API returns **403**.
|
|
38
|
+
- Public instances are unsuitable (~4 JSON req/hr when limiter on; JSON often disabled). Use self-hosted bootstrap.
|
|
39
|
+
|
|
23
40
|
## Output
|
|
24
41
|
|
|
25
42
|
`.web/search.json` — envelope compatible with legacy Firecrawl skills:
|
|
@@ -31,3 +48,5 @@ If status 403 or HTML contains challenge markers (`anomaly-modal`, etc.), retry
|
|
|
31
48
|
"data": { "web": [{ "url", "title", "description" }] }
|
|
32
49
|
}
|
|
33
50
|
```
|
|
51
|
+
|
|
52
|
+
`engine` reflects the active backend (`ddg_html` or `searxng`).
|
|
@@ -36,7 +36,7 @@ if str(SCRIPT_DIR) not in sys.path:
|
|
|
36
36
|
from harness_web.config import HarnessWebConfig, load_config # noqa: E402
|
|
37
37
|
from harness_web.output import write_search_results # noqa: E402
|
|
38
38
|
from harness_web.scrape import bulk_scrape, map_url, scrape_url # noqa: E402
|
|
39
|
-
from harness_web.
|
|
39
|
+
from harness_web.search import search # noqa: E402
|
|
40
40
|
|
|
41
41
|
DEFAULT_WEB_DIR = ".web"
|
|
42
42
|
|
|
@@ -47,8 +47,8 @@ def _default_out(sub: str) -> Path:
|
|
|
47
47
|
|
|
48
48
|
def cmd_search(args: argparse.Namespace, config: HarnessWebConfig) -> int:
|
|
49
49
|
out = Path(args.output or _default_out("search.json"))
|
|
50
|
-
results =
|
|
51
|
-
write_search_results(out, results, args.query)
|
|
50
|
+
results = search(args.query, limit=args.limit, config=config)
|
|
51
|
+
write_search_results(out, results, args.query, engine=config.search_engine)
|
|
52
52
|
print(f"wrote {out} ({len(results)} results)")
|
|
53
53
|
return 0
|
|
54
54
|
|
|
@@ -76,6 +76,20 @@ def cmd_map(args: argparse.Namespace, config: HarnessWebConfig) -> int:
|
|
|
76
76
|
return 0
|
|
77
77
|
|
|
78
78
|
|
|
79
|
+
def cmd_status(_args: argparse.Namespace, config: HarnessWebConfig) -> int:
|
|
80
|
+
import json
|
|
81
|
+
|
|
82
|
+
payload = {
|
|
83
|
+
"search_engine": config.search_engine,
|
|
84
|
+
"searxng_url": config.searxng_url,
|
|
85
|
+
"fetch_mode": config.fetch_mode,
|
|
86
|
+
"script": str(Path(__file__).resolve()),
|
|
87
|
+
"bootstrap": "ok",
|
|
88
|
+
}
|
|
89
|
+
print(json.dumps(payload, indent=2))
|
|
90
|
+
return 0
|
|
91
|
+
|
|
92
|
+
|
|
79
93
|
def cmd_bulk_scrape(args: argparse.Namespace, config: HarnessWebConfig) -> int:
|
|
80
94
|
sleep_sec = args.sleep if args.sleep is not None else config.rate_limit_ms / 1000.0
|
|
81
95
|
if args.urls:
|
|
@@ -86,8 +100,8 @@ def cmd_bulk_scrape(args: argparse.Namespace, config: HarnessWebConfig) -> int:
|
|
|
86
100
|
data = json.loads(Path(args.from_search).read_text(encoding="utf-8"))
|
|
87
101
|
urls = [item["url"] for item in data.get("data", {}).get("web", []) if item.get("url")]
|
|
88
102
|
else:
|
|
89
|
-
|
|
90
|
-
urls = [r["url"] for r in
|
|
103
|
+
serp = search(args.query, limit=args.limit, config=config)
|
|
104
|
+
urls = [r["url"] for r in serp]
|
|
91
105
|
|
|
92
106
|
if not urls:
|
|
93
107
|
print("bulk-scrape: no URLs to fetch", file=sys.stderr)
|
|
@@ -111,11 +125,11 @@ def cmd_bulk_scrape(args: argparse.Namespace, config: HarnessWebConfig) -> int:
|
|
|
111
125
|
def build_parser() -> argparse.ArgumentParser:
|
|
112
126
|
p = argparse.ArgumentParser(
|
|
113
127
|
prog="harness-web",
|
|
114
|
-
description="Harness web layer: search (DDG HTML) and scrape (Scrapling).",
|
|
128
|
+
description="Harness web layer: search (DDG HTML or SearXNG) and scrape (Scrapling).",
|
|
115
129
|
)
|
|
116
130
|
sub = p.add_subparsers(dest="command", required=True)
|
|
117
131
|
|
|
118
|
-
ps = sub.add_parser("search", help="Search via
|
|
132
|
+
ps = sub.add_parser("search", help="Search via configured SERP (HARNESS_WEB_SEARCH_ENGINE)")
|
|
119
133
|
ps.add_argument("query", help="Search query")
|
|
120
134
|
ps.add_argument("-o", "--output", help="JSON output path (default: .web/search.json)")
|
|
121
135
|
ps.add_argument("--limit", type=int, default=5)
|
|
@@ -160,6 +174,9 @@ def build_parser() -> argparse.ArgumentParser:
|
|
|
160
174
|
pm.add_argument("--fast", action="store_true")
|
|
161
175
|
pm.set_defaults(func=cmd_map)
|
|
162
176
|
|
|
177
|
+
pst = sub.add_parser("status", help="Print harness-web config as JSON (setup/diagnostics)")
|
|
178
|
+
pst.set_defaults(func=cmd_status)
|
|
179
|
+
|
|
163
180
|
return p
|
|
164
181
|
|
|
165
182
|
|
|
@@ -6,6 +6,8 @@ import os
|
|
|
6
6
|
from dataclasses import dataclass
|
|
7
7
|
from urllib.parse import urlparse
|
|
8
8
|
|
|
9
|
+
SUPPORTED_SEARCH_ENGINES = frozenset({"ddg_html", "searxng"})
|
|
10
|
+
|
|
9
11
|
|
|
10
12
|
def _int_env(name: str, default: int) -> int:
|
|
11
13
|
raw = os.environ.get(name, "").strip()
|
|
@@ -24,6 +26,18 @@ def _fetch_mode() -> str:
|
|
|
24
26
|
return "stealth"
|
|
25
27
|
|
|
26
28
|
|
|
29
|
+
def _normalize_searxng_url(raw: str) -> str:
|
|
30
|
+
url = raw.strip().rstrip("/")
|
|
31
|
+
if not url:
|
|
32
|
+
return ""
|
|
33
|
+
parsed = urlparse(url)
|
|
34
|
+
if parsed.scheme not in ("http", "https") or not parsed.netloc:
|
|
35
|
+
raise SystemExit(
|
|
36
|
+
f"Invalid HARNESS_WEB_SEARXNG_URL={raw!r} — expected http(s)://host[:port]"
|
|
37
|
+
)
|
|
38
|
+
return url
|
|
39
|
+
|
|
40
|
+
|
|
27
41
|
_STATIC_HOSTS = frozenset(
|
|
28
42
|
{
|
|
29
43
|
"example.com",
|
|
@@ -50,6 +64,7 @@ def host_is_static(url: str) -> bool:
|
|
|
50
64
|
class HarnessWebConfig:
|
|
51
65
|
fetch_mode: str
|
|
52
66
|
search_engine: str
|
|
67
|
+
searxng_url: str | None
|
|
53
68
|
proxy: str | None
|
|
54
69
|
rate_limit_ms: int
|
|
55
70
|
timeout_ms: int
|
|
@@ -68,13 +83,32 @@ class HarnessWebConfig:
|
|
|
68
83
|
return False
|
|
69
84
|
|
|
70
85
|
|
|
86
|
+
def validate_search_config(config: HarnessWebConfig) -> None:
|
|
87
|
+
engine = config.search_engine
|
|
88
|
+
if engine not in SUPPORTED_SEARCH_ENGINES:
|
|
89
|
+
supported = ", ".join(sorted(SUPPORTED_SEARCH_ENGINES))
|
|
90
|
+
raise SystemExit(
|
|
91
|
+
f"Unsupported HARNESS_WEB_SEARCH_ENGINE={engine!r} (supported: {supported})"
|
|
92
|
+
)
|
|
93
|
+
if engine == "searxng" and not config.searxng_url:
|
|
94
|
+
raise SystemExit(
|
|
95
|
+
"HARNESS_WEB_SEARCH_ENGINE=searxng requires HARNESS_WEB_SEARXNG_URL "
|
|
96
|
+
"(e.g. http://127.0.0.1:8080). Run /harness-setup and choose SearXNG, or set both in .env."
|
|
97
|
+
)
|
|
98
|
+
|
|
99
|
+
|
|
71
100
|
def load_config() -> HarnessWebConfig:
|
|
72
101
|
proxy = os.environ.get("HARNESS_WEB_PROXY", "").strip() or None
|
|
73
|
-
|
|
102
|
+
engine = os.environ.get("HARNESS_WEB_SEARCH_ENGINE", "ddg_html").strip() or "ddg_html"
|
|
103
|
+
searx_raw = os.environ.get("HARNESS_WEB_SEARXNG_URL", "").strip()
|
|
104
|
+
searxng_url = _normalize_searxng_url(searx_raw) if searx_raw else None
|
|
105
|
+
config = HarnessWebConfig(
|
|
74
106
|
fetch_mode=_fetch_mode(),
|
|
75
|
-
search_engine=
|
|
76
|
-
|
|
107
|
+
search_engine=engine,
|
|
108
|
+
searxng_url=searxng_url,
|
|
77
109
|
proxy=proxy,
|
|
78
110
|
rate_limit_ms=_int_env("HARNESS_WEB_RATE_LIMIT_MS", 2000),
|
|
79
111
|
timeout_ms=_int_env("HARNESS_WEB_TIMEOUT_MS", 30000),
|
|
80
112
|
)
|
|
113
|
+
validate_search_config(config)
|
|
114
|
+
return config
|
|
@@ -18,13 +18,19 @@ def write_json(path: Path, payload: Any) -> None:
|
|
|
18
18
|
path.write_text(json.dumps(payload, indent=2, ensure_ascii=False) + "\n", encoding="utf-8")
|
|
19
19
|
|
|
20
20
|
|
|
21
|
-
def write_search_results(
|
|
21
|
+
def write_search_results(
|
|
22
|
+
path: Path,
|
|
23
|
+
results: list[dict[str, str]],
|
|
24
|
+
query: str,
|
|
25
|
+
*,
|
|
26
|
+
engine: str,
|
|
27
|
+
) -> None:
|
|
22
28
|
"""Firecrawl-compatible envelope: data.web[].url|title|description."""
|
|
23
29
|
write_json(
|
|
24
30
|
path,
|
|
25
31
|
{
|
|
26
32
|
"query": query,
|
|
27
|
-
"engine":
|
|
33
|
+
"engine": engine,
|
|
28
34
|
"data": {
|
|
29
35
|
"web": [
|
|
30
36
|
{
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
"""Route harness-web search to the configured SERP backend."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from .config import HarnessWebConfig, validate_search_config
|
|
6
|
+
from .search_ddg import search_ddg
|
|
7
|
+
from .search_searxng import search_searxng
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
def search(
|
|
11
|
+
query: str,
|
|
12
|
+
*,
|
|
13
|
+
limit: int,
|
|
14
|
+
config: HarnessWebConfig,
|
|
15
|
+
) -> list[dict[str, str]]:
|
|
16
|
+
validate_search_config(config)
|
|
17
|
+
engine = config.search_engine
|
|
18
|
+
if engine == "searxng":
|
|
19
|
+
return search_searxng(query, limit=limit, config=config)
|
|
20
|
+
if engine == "ddg_html":
|
|
21
|
+
return search_ddg(query, limit=limit, config=config)
|
|
22
|
+
raise SystemExit(f"Unsupported HARNESS_WEB_SEARCH_ENGINE={engine!r}")
|
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
|
+
from typing import Any
|
|
5
6
|
from urllib.parse import parse_qs, unquote, urlparse
|
|
6
7
|
|
|
7
8
|
from scrapling.fetchers import Fetcher, StealthyFetcher
|
|
@@ -63,11 +64,6 @@ def search_ddg(
|
|
|
63
64
|
config: HarnessWebConfig,
|
|
64
65
|
impersonate: bool = True,
|
|
65
66
|
) -> list[dict[str, str]]:
|
|
66
|
-
if config.search_engine != "ddg_html":
|
|
67
|
-
raise SystemExit(
|
|
68
|
-
f"Unsupported HARNESS_WEB_SEARCH_ENGINE={config.search_engine!r} (only ddg_html)"
|
|
69
|
-
)
|
|
70
|
-
|
|
71
67
|
kwargs: dict = {
|
|
72
68
|
"params": {"q": query},
|
|
73
69
|
"timeout": config.timeout_sec,
|