ultimate-pi 0.4.1 → 0.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.agents/skills/harness-decisions/SKILL.md +15 -0
- package/.agents/skills/harness-sentrux-setup/SKILL.md +1 -1
- package/.agents/skills/scrapling-web/SKILL.md +45 -40
- package/.agents/skills/sentrux/SKILL.md +99 -0
- package/.agents/skills/wiki-autoresearch/SKILL.md +3 -3
- package/.pi/SYSTEM.md +12 -13
- package/.pi/agents/pi-pi/agent-expert.md +3 -3
- package/.pi/extensions/harness-web-guard.ts +95 -0
- package/.pi/extensions/harness-web-tools.ts +209 -0
- package/.pi/extensions/lib/harness-web/run-cli.ts +92 -0
- package/.pi/harness/env.harness.template +3 -1
- package/.pi/prompts/harness-setup.md +66 -21
- package/.pi/scripts/harness-cli-verify.sh +12 -3
- package/.pi/scripts/harness-searxng-bootstrap.mjs +270 -0
- package/.pi/scripts/harness-web-search.md +24 -5
- package/.pi/scripts/harness-web.py +24 -7
- package/.pi/scripts/harness_web/config.py +37 -3
- package/.pi/scripts/harness_web/output.py +8 -2
- package/.pi/scripts/harness_web/search.py +22 -0
- package/.pi/scripts/harness_web/search_ddg.py +1 -5
- package/.pi/scripts/harness_web/search_searxng.py +100 -0
- package/CHANGELOG.md +26 -0
- package/package.json +2 -3
- package/.pi/mcp.json +0 -11
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
import { spawnSync } from "node:child_process";
|
|
2
|
+
import { existsSync, readFileSync } from "node:fs";
|
|
3
|
+
import { resolve } from "node:path";
|
|
4
|
+
import { resolveHarnessScript } from "../harness-paths.js";
|
|
5
|
+
|
|
6
|
+
/** Outcome of a single `harness-web.py` invocation. */
export interface RunHarnessWebResult {
  /** True only when the child process exited with status 0. */
  ok: boolean;
  /** Child exit status; 1 when no status is available (launch failure or signal). */
  exitCode: number;
  /** Captured standard output, trimmed. */
  stdout: string;
  /** Captured standard error, trimmed, plus a launch/signal diagnostic when one applies. */
  stderr: string;
}

/**
 * Run `harness-web.py` synchronously with `python3` and capture its output.
 *
 * @param moduleUrl - Passed to `resolveHarnessScript` to locate `harness-web.py`.
 * @param args - Arguments appended after the script path.
 * @param cwd - Working directory for the child process.
 * @returns Exit status and trimmed stdout/stderr. Launch failures and signal
 *   terminations are reported through `stderr` instead of being dropped.
 */
export function runHarnessWeb(
  moduleUrl: string,
  args: string[],
  cwd: string,
): RunHarnessWebResult {
  const script = resolveHarnessScript(moduleUrl, "harness-web.py");
  const result = spawnSync("python3", [script, ...args], {
    cwd,
    env: process.env,
    encoding: "utf-8",
    maxBuffer: 16 * 1024 * 1024,
  });

  let stderr = (result.stderr ?? "").trim();
  // spawnSync does not throw when the child cannot be started or is killed:
  // it sets `error` (and/or `signal`) and leaves `status` null. Without this,
  // callers would see exit code 1 with an empty stderr and no explanation.
  let diagnostic = "";
  if (result.error) {
    diagnostic = `harness-web: failed to run python3: ${result.error.message}`;
  } else if (result.status === null && result.signal) {
    diagnostic = `harness-web: python3 terminated by signal ${result.signal}`;
  }
  if (diagnostic) {
    stderr = stderr ? `${stderr}\n${diagnostic}` : diagnostic;
  }

  return {
    ok: result.status === 0,
    exitCode: result.status ?? 1,
    stdout: (result.stdout ?? "").trim(),
    stderr,
  };
}
|
|
32
|
+
|
|
33
|
+
/**
 * Read a text file and return at most `maxChars` characters of it.
 *
 * @param filePath - Path to the file, resolved against `cwd`.
 * @param cwd - Base directory for resolving `filePath`.
 * @param maxChars - Maximum number of characters returned before truncation.
 * @returns The file text, or its first `maxChars` characters followed by a
 *   truncation notice. Returns `""` when the path cannot be read as a file
 *   (missing, a directory, unreadable, or removed mid-call).
 */
export function readTextExcerpt(
  filePath: string,
  cwd: string,
  maxChars = 2000,
): string {
  const full = resolve(cwd, filePath);
  let text: string;
  try {
    // Read directly instead of checking existence first: an exists-then-read
    // pair still throws for directories and for files removed in between.
    text = readFileSync(full, "utf-8");
  } catch {
    return "";
  }
  if (text.length <= maxChars) return text;
  return `${text.slice(0, maxChars)}\n… (truncated; use read tool for full file)`;
}
|
|
44
|
+
|
|
45
|
+
/** One web result entry as stored in a harness-web search JSON file. */
export interface SearchHit {
  url: string;
  title: string;
  description: string;
}

/**
 * Render a harness-web search JSON file as a short plain-text listing.
 *
 * The listing starts with `engine`, `query` and `results` header lines, then
 * gives a numbered title line and a URL line per hit, plus a description line
 * (cut to 120 characters) when a description is present.
 *
 * @param filePath - Path to the JSON file, resolved against `cwd`.
 * @param cwd - Base directory for resolving `filePath`.
 * @returns The listing, or `""` when the file is missing or cannot be
 *   read, parsed or walked.
 */
export function summarizeSearchJson(filePath: string, cwd: string): string {
  const jsonPath = resolve(cwd, filePath);
  if (!existsSync(jsonPath)) {
    return "";
  }
  try {
    const raw = readFileSync(jsonPath, "utf-8");
    const parsed = JSON.parse(raw) as {
      query?: string;
      engine?: string;
      data?: { web?: SearchHit[] };
    };
    const results = parsed.data?.web ?? [];
    const header = [
      `engine: ${parsed.engine ?? "unknown"}`,
      `query: ${parsed.query ?? ""}`,
      `results: ${results.length}`,
      "",
    ];
    const body = results.flatMap((hit, index) => {
      const entry = [`${index + 1}. ${hit.title || "(no title)"}`, ` ${hit.url}`];
      if (hit.description) {
        let snippet = hit.description;
        if (hit.description.length > 120) {
          snippet = `${hit.description.slice(0, 120)}…`;
        }
        entry.push(` ${snippet}`);
      }
      return entry;
    });
    return [...header, ...body].join("\n");
  } catch {
    // Any read, parse or shape problem yields an empty summary.
    return "";
  }
}
|
|
83
|
+
|
|
84
|
+
/**
 * Build the one-line `[HarnessWeb]` context hint for agents.
 *
 * Reads `HARNESS_WEB_SEARCH_ENGINE` (falling back to `ddg_html` when unset or
 * blank) and, when non-blank, `HARNESS_WEB_SEARXNG_URL` from the environment.
 *
 * @returns The hint text naming the active engine and the tools to use.
 */
export function harnessWebContextLine(): string {
  const {
    HARNESS_WEB_SEARCH_ENGINE: rawEngine,
    HARNESS_WEB_SEARXNG_URL: rawUrl,
  } = process.env;

  const pieces = [`[HarnessWeb] search_engine=${rawEngine?.trim() || "ddg_html"}`];

  const instanceUrl = rawUrl?.trim();
  if (instanceUrl) {
    pieces.push(` searxng_url=${instanceUrl}`);
  }

  pieces.push(
    " — use web_search / web_fetch tools; ",
    "never resolve UP_PKG, ls harness-web.py, or python3 -c import scrapling before searching.",
  );
  return pieces.join("");
}
|
|
@@ -4,9 +4,11 @@
|
|
|
4
4
|
# Telemetry (set false to disable harness PostHog events)
|
|
5
5
|
HARNESS_TELEMETRY_ENABLED=true
|
|
6
6
|
|
|
7
|
-
# harness-web (Scrapling
|
|
7
|
+
# harness-web (Scrapling scrape + pluggable search)
|
|
8
8
|
HARNESS_WEB_FETCH_MODE=stealth
|
|
9
9
|
HARNESS_WEB_SEARCH_ENGINE=ddg_html
|
|
10
|
+
# SearXNG (when HARNESS_WEB_SEARCH_ENGINE=searxng):
|
|
11
|
+
# HARNESS_WEB_SEARXNG_URL=http://127.0.0.1:8080
|
|
10
12
|
# HARNESS_WEB_PROXY=
|
|
11
13
|
# HARNESS_WEB_RATE_LIMIT_MS=2000
|
|
12
14
|
# HARNESS_WEB_TIMEOUT_MS=30000
|
|
@@ -134,10 +134,13 @@ export PATH="$HOME/.local/bin:$PATH"
|
|
|
134
134
|
uv tool install "scrapling[fetchers]"
|
|
135
135
|
scrapling install # Chromium for default stealth scrape; may need sudo for OS libs on Linux
|
|
136
136
|
mkdir -p .web
|
|
137
|
+
python3 "$UP_PKG/.pi/scripts/harness-web.py" status # JSON config (setup/diagnostics only)
|
|
137
138
|
python3 "$UP_PKG/.pi/scripts/harness-web.py" search "ultimate-pi harness" -o .web/smoke-search.json --limit 3
|
|
138
139
|
python3 "$UP_PKG/.pi/scripts/harness-web.py" scrape "https://example.com" -o .web/smoke-page.md --fast
|
|
139
140
|
```
|
|
140
141
|
|
|
142
|
+
After pi loads extensions, agents should smoke-test the **`web_search`** tool once (do not run a `UP_PKG` / `import scrapling` preflight first). Example intent: query `ultimate-pi harness`, `limit` 2.
|
|
143
|
+
|
|
141
144
|
- **`--skip-tools`:** skip Step 2 (includes Scrapling verify).
|
|
142
145
|
- On Linux/WSL, if stealth scrape fails, install browser libs from `harness-cli-verify.sh` output or use `--fast` for static targets.
|
|
143
146
|
|
|
@@ -303,7 +306,7 @@ if gh auth status &>/dev/null; then
|
|
|
303
306
|
fi
|
|
304
307
|
```
|
|
305
308
|
|
|
306
|
-
### 2.8 — sentrux (Architectural Quality Gate
|
|
309
|
+
### 2.8 — sentrux (Architectural Quality Gate)
|
|
307
310
|
|
|
308
311
|
```bash
|
|
309
312
|
if ! command -v sentrux &>/dev/null || [ "$FORCE" = "true" ]; then
|
|
@@ -316,7 +319,7 @@ Install all 52 language plugins:
|
|
|
316
319
|
sentrux plugin add-standard 2>/dev/null || echo "Plugins already installed or failed"
|
|
317
320
|
```
|
|
318
321
|
|
|
319
|
-
|
|
322
|
+
Ensure the **sentrux** Pi skill is linked (see Step 4.2). **Rules.toml bootstrap runs in Step 4.3** (idempotent, merge-safe).
|
|
320
323
|
|
|
321
324
|
## Step 3 — Pi Extension Packages
|
|
322
325
|
|
|
@@ -421,6 +424,47 @@ If **no** `.env` at project root:
|
|
|
421
424
|
- On **skip** or `--non-interactive`: warn in report (non-interactive skips creation)
|
|
422
425
|
- If `ask_user` cancelled: stop with `needs_clarification`
|
|
423
426
|
|
|
427
|
+
### 4.0b — harness-web search engine (non-destructive)
|
|
428
|
+
|
|
429
|
+
Unless `--non-interactive`, **call `ask_user`** after Step 4.0 (harness-decisions skill):
|
|
430
|
+
|
|
431
|
+
```json
|
|
432
|
+
{
|
|
433
|
+
"question": "Which harness-web search backend should this project use?",
|
|
434
|
+
"context": "Scrapling still handles scrape/map/bulk. Search only: DuckDuckGo HTML needs no extra services. SearXNG must be self-hosted for agents — public instances often block JSON (403) and default to ~4 API requests/hour per IP.",
|
|
435
|
+
"options": [
|
|
436
|
+
{
|
|
437
|
+
"title": "DuckDuckGo HTML (default)",
|
|
438
|
+
"description": "HARNESS_WEB_SEARCH_ENGINE=ddg_html — no Docker"
|
|
439
|
+
},
|
|
440
|
+
{
|
|
441
|
+
"title": "Self-host SearXNG here (Docker)",
|
|
442
|
+
"description": "Bootstrap .searxng/ with official compose, enable JSON API, set harness env"
|
|
443
|
+
},
|
|
444
|
+
{
|
|
445
|
+
"title": "Use existing SearXNG instance",
|
|
446
|
+
"description": "You provide base URL; harness writes HARNESS_WEB_SEARXNG_URL"
|
|
447
|
+
}
|
|
448
|
+
],
|
|
449
|
+
"allowFreeform": true
|
|
450
|
+
}
|
|
451
|
+
```
|
|
452
|
+
|
|
453
|
+
| User choice | Actions |
|
|
454
|
+
|-------------|---------|
|
|
455
|
+
| **DDG** | Ensure `.env` has `HARNESS_WEB_SEARCH_ENGINE=ddg_html` via `harness-sync-env.mjs` (append only if missing; do not overwrite user values) |
|
|
456
|
+
| **Self-host** | `node "$UP_PKG/.pi/scripts/harness-searxng-bootstrap.mjs"` (requires Docker). Script sets `HARNESS_WEB_SEARCH_ENGINE=searxng` and `HARNESS_WEB_SEARXNG_URL` |
|
|
457
|
+
| **Existing instance** | Parse base URL from freeform answer. Run `node "$UP_PKG/.pi/scripts/harness-searxng-bootstrap.mjs" --set-url {url}` (health check + upsert `.env`) |
|
|
458
|
+
| **Cancelled** | Stop with `needs_clarification` |
|
|
459
|
+
| **`--non-interactive`** | Skip the prompt; keep the existing engine setting (default `ddg_html`); do not run the Docker bootstrap |
|
|
460
|
+
|
|
461
|
+
Post-choice smoke (report pass/fail):
|
|
462
|
+
|
|
463
|
+
```bash
|
|
464
|
+
mkdir -p .web
|
|
465
|
+
python3 "$UP_PKG/.pi/scripts/harness-web.py" search "ultimate-pi harness" -o .web/setup-search.json --limit 2
|
|
466
|
+
```
|
|
467
|
+
|
|
424
468
|
Rules:
|
|
425
469
|
|
|
426
470
|
- **Do not** `cp` over an existing `.env`.
|
|
@@ -436,6 +480,7 @@ Ensure `.gitignore` contains:
|
|
|
436
480
|
```
|
|
437
481
|
.env
|
|
438
482
|
.web/
|
|
483
|
+
.searxng/
|
|
439
484
|
.raw/
|
|
440
485
|
.vault-meta/
|
|
441
486
|
.pi/harness/critics/
|
|
@@ -451,28 +496,27 @@ Ensure `.gitignore` contains:
|
|
|
451
496
|
!.sentrux/rules.toml
|
|
452
497
|
```
|
|
453
498
|
|
|
454
|
-
### 4.2 —
|
|
499
|
+
### 4.2 — Sentrux Pi skill
|
|
455
500
|
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
501
|
+
Pi does **not** load `.pi/mcp.json`. Agents use Sentrux via the **CLI** and the **`sentrux`** skill.
|
|
502
|
+
|
|
503
|
+
From **project root**, ensure the skill is discoverable (idempotent):
|
|
504
|
+
|
|
505
|
+
```bash
|
|
506
|
+
UP_PKG="$(node -p "require('path').dirname(require.resolve('ultimate-pi/package.json'))")"
|
|
507
|
+
SKILL_SRC="$UP_PKG/.agents/skills/sentrux"
|
|
508
|
+
SKILL_DST=".pi/skills/sentrux"
|
|
509
|
+
if [ -d "$SKILL_SRC" ] && [ ! -e "$SKILL_DST" ]; then
|
|
510
|
+
ln -s "../../.agents/skills/sentrux" "$SKILL_DST"
|
|
511
|
+
echo "✓ linked $SKILL_DST → sentrux skill"
|
|
512
|
+
elif [ -e "$SKILL_DST" ]; then
|
|
513
|
+
echo "✓ sentrux skill already present at $SKILL_DST"
|
|
514
|
+
else
|
|
515
|
+
echo "✗ missing $SKILL_SRC — reinstall ultimate-pi"
|
|
516
|
+
fi
|
|
469
517
|
```
|
|
470
518
|
|
|
471
|
-
|
|
472
|
-
- `scan` — quality signal, file count, bottleneck detection
|
|
473
|
-
- `session_start` / `session_end` — baseline comparison, degradation detection
|
|
474
|
-
- `check_rules` — architectural constraint enforcement
|
|
475
|
-
- `health`, `rescan`, `evolution`, `dsm`, `test_gaps`
|
|
519
|
+
After `/reload`, agents can invoke **`/skill:sentrux`** for install paths, `sentrux check`, `sentrux gate --save` / `sentrux gate`, and harness integration. **context-mode** remains a separate `npm:context-mode` package in `.pi/settings.json` (its own MCP bridge inside that extension).
|
|
476
520
|
|
|
477
521
|
### 4.3 — Sentrux rules bootstrap (required)
|
|
478
522
|
|
|
@@ -646,6 +690,7 @@ Output summary table:
|
|
|
646
690
|
| .gitignore | ✓/✗ | entries added (incl. `.env`) |
|
|
647
691
|
| ./raw directory | ✓/✗ | Created for graphify source ingestion |
|
|
648
692
|
| harness-web (Scrapling) | ✓/✗ | search + scrape smoke |
|
|
693
|
+
| harness-web search engine | ddg / searxng / — | Step 4.0b choice; SearXNG URL if applicable |
|
|
649
694
|
|
|
650
695
|
Next steps:
|
|
651
696
|
1. If tools missing: re-run with `--force` or install individually
|
|
@@ -200,10 +200,19 @@ verify_scrapling() {
|
|
|
200
200
|
return
|
|
201
201
|
fi
|
|
202
202
|
mkdir -p .web
|
|
203
|
-
|
|
204
|
-
|
|
203
|
+
_search_engine="${HARNESS_WEB_SEARCH_ENGINE:-ddg_html}"
|
|
204
|
+
if [ "$_search_engine" = "searxng" ]; then
|
|
205
|
+
if [ -z "${HARNESS_WEB_SEARXNG_URL:-}" ]; then
|
|
206
|
+
fail "HARNESS_WEB_SEARCH_ENGINE=searxng but HARNESS_WEB_SEARXNG_URL is unset"
|
|
207
|
+
elif python3 "$_hw" search "ultimate-pi harness" -o .web/verify-search.json --limit 2 2>/dev/null | grep -q wrote; then
|
|
208
|
+
pass "harness-web search smoke (searxng)"
|
|
209
|
+
else
|
|
210
|
+
fail "harness-web search smoke failed (searxng at ${HARNESS_WEB_SEARXNG_URL})"
|
|
211
|
+
fi
|
|
212
|
+
elif python3 "$_hw" search "ultimate-pi harness" -o .web/verify-search.json --limit 2 2>/dev/null | grep -q wrote; then
|
|
213
|
+
pass "harness-web search smoke (ddg_html)"
|
|
205
214
|
else
|
|
206
|
-
fail "harness-web search smoke failed"
|
|
215
|
+
fail "harness-web search smoke failed (ddg_html)"
|
|
207
216
|
fi
|
|
208
217
|
if python3 "$_hw" scrape "https://example.com" -o .web/verify-page.md --fast 2>/dev/null | grep -q wrote; then
|
|
209
218
|
pass "harness-web scrape --fast smoke"
|
|
@@ -0,0 +1,270 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
/**
|
|
3
|
+
* Bootstrap a project-local SearXNG instance for harness-web (Docker Compose).
|
|
4
|
+
*
|
|
5
|
+
* - Creates .searxng/ with official upstream compose template
|
|
6
|
+
* - Writes core-config/settings.yml with json format + limiter off (local dev)
|
|
7
|
+
* - Starts containers and waits for JSON search health
|
|
8
|
+
* - Upserts HARNESS_WEB_SEARCH_ENGINE / HARNESS_WEB_SEARXNG_URL in project .env
|
|
9
|
+
*
|
|
10
|
+
* Usage:
|
|
11
|
+
* node "$UP_PKG/.pi/scripts/harness-searxng-bootstrap.mjs" [PROJECT_ROOT] [--url-only]
|
|
12
|
+
* node "$UP_PKG/.pi/scripts/harness-searxng-bootstrap.mjs" --set-url http://127.0.0.1:8080
|
|
13
|
+
*
|
|
14
|
+
* Requires: docker, docker compose, curl
|
|
15
|
+
*/
|
|
16
|
+
|
|
17
|
+
import {
|
|
18
|
+
access,
|
|
19
|
+
copyFile,
|
|
20
|
+
mkdir,
|
|
21
|
+
readFile,
|
|
22
|
+
writeFile,
|
|
23
|
+
} from "node:fs/promises";
|
|
24
|
+
import { constants } from "node:fs";
|
|
25
|
+
import { join, dirname } from "node:path";
|
|
26
|
+
import { fileURLToPath } from "node:url";
|
|
27
|
+
import { spawn } from "node:child_process";
|
|
28
|
+
|
|
29
|
+
const SCRIPT_DIR = dirname(fileURLToPath(import.meta.url));
|
|
30
|
+
const UP_PKG = join(SCRIPT_DIR, "..", "..");
|
|
31
|
+
|
|
32
|
+
const SEARXNG_BASE =
|
|
33
|
+
"https://raw.githubusercontent.com/searxng/searxng/master/container";
|
|
34
|
+
const DEFAULT_PORT = "8080";
|
|
35
|
+
const HEALTH_PATH = "/search?q=harness&format=json";
|
|
36
|
+
|
|
37
|
+
const MANAGED_START = "# --- harness:env:start ---";
|
|
38
|
+
const MANAGED_END = "# --- harness:env:end ---";
|
|
39
|
+
|
|
40
|
+
/**
 * Split raw CLI tokens into positionals, flags and the `--set-url` value.
 *
 * The token that follows `--set-url` is that option's value. It must not be
 * counted as a positional, otherwise the URL would be taken for PROJECT_ROOT.
 *
 * @param {string[]} argv Tokens after the script path.
 * @returns {{positionals: string[], flags: Set<string>, setUrl: string|null, setUrlMissing: boolean}}
 */
function parseCliArgs(argv) {
  const positionals = [];
  const flagSet = new Set();
  let setUrlValue = null;
  let setUrlSeen = false;
  for (let i = 0; i < argv.length; i += 1) {
    const token = argv[i];
    if (token === "--set-url") {
      setUrlSeen = true;
      flagSet.add(token);
      const next = argv[i + 1];
      if (next !== undefined && !next.startsWith("-")) {
        // First occurrence wins, matching the previous indexOf-based lookup.
        if (setUrlValue === null) setUrlValue = next;
        i += 1; // consume the value so it is not treated as a positional
      }
    } else if (token.startsWith("-")) {
      flagSet.add(token);
    } else {
      positionals.push(token);
    }
  }
  return {
    positionals,
    flags: flagSet,
    setUrl: setUrlValue,
    setUrlMissing: setUrlSeen && setUrlValue === null,
  };
}

const cli = parseCliArgs(process.argv.slice(2));
if (cli.setUrlMissing) {
  // Without this guard a bare `--set-url` would fall through to the full
  // Docker bootstrap, which is not what the caller asked for.
  console.error("✗ --set-url requires a URL, e.g. --set-url http://127.0.0.1:8080");
  process.exit(1);
}
const args = cli.positionals;
const flags = cli.flags;
const urlOnly = flags.has("--url-only");
const setUrl = cli.setUrl;

const PROJECT_ROOT = args[0] || process.cwd();
|
|
47
|
+
const SEARXNG_DIR = join(PROJECT_ROOT, ".searxng");
|
|
48
|
+
const CORE_CONFIG = join(SEARXNG_DIR, "core-config");
|
|
49
|
+
const SETTINGS_PATH = join(CORE_CONFIG, "settings.yml");
|
|
50
|
+
const COMPOSE_PATH = join(SEARXNG_DIR, "docker-compose.yml");
|
|
51
|
+
const ENV_COMPOSE = join(SEARXNG_DIR, ".env");
|
|
52
|
+
|
|
53
|
+
const HARNESS_SETTINGS = `use_default_settings: true
|
|
54
|
+
|
|
55
|
+
search:
|
|
56
|
+
formats:
|
|
57
|
+
- html
|
|
58
|
+
- json
|
|
59
|
+
|
|
60
|
+
server:
|
|
61
|
+
limiter: false
|
|
62
|
+
public_instance: false
|
|
63
|
+
`;
|
|
64
|
+
|
|
65
|
+
/**
 * Report whether `path` is visible on disk.
 *
 * @param {string} path Path to probe.
 * @returns {Promise<boolean>} `true` when `access(path, F_OK)` succeeds,
 *   `false` when it rejects for any reason.
 */
function exists(path) {
  return access(path, constants.F_OK).then(
    () => true,
    () => false,
  );
}
|
|
73
|
+
|
|
74
|
+
/**
 * Spawn a command and settle once it closes.
 *
 * @param {string} cmd Executable to run.
 * @param {string[]} cmdArgs Arguments for the executable.
 * @param {{inherit?: boolean, cwd?: string, env?: Record<string, string>}} [opts]
 *   `inherit` attaches the child to this process's stdio (nothing is captured);
 *   `cwd` sets the working directory; `env` is merged over `process.env`.
 * @returns {Promise<{stdout: string, stderr: string}>} Captured output on exit
 *   code 0. Rejects with an Error on spawn failure, a non-zero exit, or
 *   termination by a signal.
 */
function run(cmd, cmdArgs, opts = {}) {
  return new Promise((resolve, reject) => {
    const child = spawn(cmd, cmdArgs, {
      stdio: opts.inherit ? "inherit" : "pipe",
      cwd: opts.cwd,
      env: { ...process.env, ...opts.env },
    });
    let stdout = "";
    let stderr = "";
    if (!opts.inherit) {
      // Decode as a stream so a multi-byte character split across two chunks
      // is not corrupted by per-chunk Buffer-to-string conversion.
      child.stdout?.setEncoding("utf8");
      child.stderr?.setEncoding("utf8");
      child.stdout?.on("data", (d) => {
        stdout += d;
      });
      child.stderr?.on("data", (d) => {
        stderr += d;
      });
    }
    child.on("error", reject);
    child.on("close", (code, signal) => {
      if (code === 0) {
        resolve({ stdout, stderr });
        return;
      }
      // `code` is null when the child was killed by a signal; name the signal
      // instead of printing "exited null".
      const how = code === null ? `was terminated by ${signal}` : `exited ${code}`;
      reject(new Error(`${cmd} ${cmdArgs.join(" ")} ${how}\n${stderr || stdout}`));
    });
  });
}
|
|
103
|
+
|
|
104
|
+
/**
 * Check that `docker` and the `docker compose` subcommand can be run.
 *
 * Each probe runs in order; on the first failure the matching message and
 * install hint are printed to stderr and the process exits with status 1.
 */
async function requireDocker() {
  const probes = [
    {
      argv: ["--version"],
      problem: "✗ docker not found",
      hint: "Install Docker: https://docs.searxng.org/admin/installation-docker.html",
    },
    {
      argv: ["compose", "version"],
      problem: "✗ docker compose not available",
      hint: "Install Docker Compose v2: https://docs.docker.com/compose/install/",
    },
  ];
  for (const { argv, problem, hint } of probes) {
    const available = await run("docker", argv).then(
      () => true,
      () => false,
    );
    if (!available) {
      console.error(problem);
      console.error(hint);
      process.exit(1);
    }
  }
}
|
|
126
|
+
|
|
127
|
+
/**
 * Download `url` into the file `dest` by running `curl -fsSL -o dest url`.
 * Rejects when `run` rejects.
 */
async function curlToFile(url, dest) {
  const curlArgs = ["-fsSL", "-o", dest, url];
  await run("curl", curlArgs);
}
|
|
130
|
+
|
|
131
|
+
/**
 * Read `SEARXNG_PORT` from the compose `.env` file.
 *
 * @returns {Promise<string>} The configured port with surrounding quotes
 *   removed, or `DEFAULT_PORT` when the file or the key is absent or the
 *   value is empty.
 */
async function readComposePort() {
  if (!(await exists(ENV_COMPOSE))) return DEFAULT_PORT;
  const text = await readFile(ENV_COMPOSE, "utf8");
  // Split on LF or CRLF: a leftover "\r" would stop the `$`-anchored pattern
  // from matching and the configured port would be ignored.
  for (const line of text.split(/\r?\n/)) {
    const m = line.match(/^SEARXNG_PORT=(.+)$/);
    if (m) return m[1].trim().replace(/^["']|["']$/g, "") || DEFAULT_PORT;
  }
  return DEFAULT_PORT;
}
|
|
140
|
+
|
|
141
|
+
/**
 * Make sure the `.searxng/` working directory has everything compose needs.
 *
 * Creates the config directory, downloads `docker-compose.yml` when it is
 * absent, creates the compose `.env` from `.env.example` (downloading the
 * example first if needed), and writes the harness `settings.yml` when the
 * file is absent or its text does not contain "json".
 */
async function ensureSearxngLayout() {
  await mkdir(CORE_CONFIG, { recursive: true });

  const haveCompose = await exists(COMPOSE_PATH);
  if (!haveCompose) {
    console.log("Fetching SearXNG docker-compose.yml …");
    await curlToFile(`${SEARXNG_BASE}/docker-compose.yml`, COMPOSE_PATH);
  }

  const haveComposeEnv = await exists(ENV_COMPOSE);
  if (!haveComposeEnv) {
    const examplePath = join(SEARXNG_DIR, ".env.example");
    const haveExample = await exists(examplePath);
    if (!haveExample) {
      console.log("Fetching SearXNG .env.example …");
      await curlToFile(`${SEARXNG_BASE}/.env.example`, examplePath);
    }
    await copyFile(examplePath, ENV_COMPOSE);
  }

  // An existing settings file is kept only if it already mentions "json".
  let settingsUsable = false;
  if (await exists(SETTINGS_PATH)) {
    const current = await readFile(SETTINGS_PATH, "utf8");
    settingsUsable = current.includes("json");
  }
  if (settingsUsable) {
    return;
  }
  await writeFile(SETTINGS_PATH, HARNESS_SETTINGS, "utf8");
  console.log(`✓ Wrote ${SETTINGS_PATH} (json format, limiter off)`);
}
|
|
163
|
+
|
|
164
|
+
/**
 * Run `docker compose up -d` in the `.searxng/` directory, with the child's
 * output attached to this process's stdio.
 */
async function composeUp() {
  console.log("Starting SearXNG (docker compose up -d) …");
  const composeArgs = ["compose", "up", "-d"];
  const spawnOpts = { inherit: true, cwd: SEARXNG_DIR };
  await run("docker", composeArgs, spawnOpts);
}
|
|
168
|
+
|
|
169
|
+
/**
 * Poll the SearXNG JSON search endpoint until it answers or 90 seconds pass.
 *
 * @param {string} baseUrl Instance base URL; `HEALTH_PATH` is appended to it.
 * @returns {Promise<void>} Resolves after the first 2xx response whose body
 *   parses to a JSON object.
 * @throws {Error} Immediately on HTTP 403, and after the deadline otherwise
 *   (the message carries the last failure seen).
 */
async function waitForHealth(baseUrl) {
  const url = `${baseUrl}${HEALTH_PATH}`;
  const deadline = Date.now() + 90_000;
  let lastErr = "";
  while (Date.now() < deadline) {
    let res = null;
    try {
      res = await fetch(url, {
        headers: { Accept: "application/json" },
        signal: AbortSignal.timeout(10_000),
      });
    } catch (err) {
      // Connection refused or timed out: the server may still be starting.
      lastErr = err instanceof Error ? err.message : String(err);
    }
    if (res) {
      if (res.status === 403) {
        // Thrown outside the try block on purpose: a 403 for format=json is a
        // configuration problem, so retrying for 90 seconds cannot fix it.
        throw new Error(
          "SearXNG returned 403 for format=json — ensure search.formats includes json in .searxng/core-config/settings.yml",
        );
      }
      lastErr = `HTTP ${res.status}`;
      if (res.ok) {
        try {
          const data = await res.json();
          if (data && typeof data === "object") {
            console.log(`✓ SearXNG healthy at ${baseUrl}`);
            return;
          }
        } catch (err) {
          // Body was not valid JSON; keep polling.
          lastErr = err instanceof Error ? err.message : String(err);
        }
      }
    }
    await new Promise((r) => setTimeout(r, 3000));
  }
  throw new Error(`SearXNG health check timed out (${url}): ${lastErr}`);
}
|
|
199
|
+
|
|
200
|
+
/**
 * Set `key=value` in dotenv-style text and return the updated text.
 *
 * - If an uncommented `key=` line exists, that line is replaced.
 * - Otherwise, if both managed-block markers are present, the line is inserted
 *   just before the end marker.
 * - Otherwise a new managed block containing the line is appended.
 *
 * @param {string} content Current file text.
 * @param {string} key Variable name.
 * @param {string} value Variable value, written verbatim.
 * @returns {string} Updated file text.
 */
function upsertEnvKey(content, key, value) {
  const line = `${key}=${value}`;
  // Escape the key so regex metacharacters in it are matched literally.
  const escapedKey = key.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const re = new RegExp(`^${escapedKey}=.*$`, "m");
  if (re.test(content)) {
    // Use a replacer function: with a replacement *string*, sequences such as
    // "$&" or "$1" inside the value would be expanded and corrupt the output.
    return content.replace(re, () => line);
  }
  if (content.includes(MANAGED_START) && content.includes(MANAGED_END)) {
    const end = content.indexOf(MANAGED_END);
    return `${content.slice(0, end)}${line}\n${content.slice(end)}`;
  }
  const sep = content.endsWith("\n") || content.length === 0 ? "" : "\n";
  return `${content}${sep}${MANAGED_START}\n# harness-web (SearXNG)\n${line}\n${MANAGED_END}\n`;
}
|
|
213
|
+
|
|
214
|
+
/**
 * Write the SearXNG search settings into the project's `.env`.
 *
 * Starts from the existing `.env` text, or from the packaged
 * `env.harness.template` when there is no `.env`, or from empty text when
 * neither exists. It then sets `HARNESS_WEB_SEARCH_ENGINE=searxng` and
 * `HARNESS_WEB_SEARXNG_URL=<baseUrl>` and writes the file back with a
 * trailing newline.
 *
 * @param {string} baseUrl SearXNG base URL to record.
 */
async function upsertHarnessEnv(baseUrl) {
  const envPath = join(PROJECT_ROOT, ".env");
  const templatePath = join(UP_PKG, ".pi", "harness", "env.harness.template");

  let seedPath = null;
  if (await exists(envPath)) {
    seedPath = envPath;
  } else if (await exists(templatePath)) {
    seedPath = templatePath;
  }
  const seed = seedPath === null ? "" : await readFile(seedPath, "utf8");

  const updates = [
    ["HARNESS_WEB_SEARCH_ENGINE", "searxng"],
    ["HARNESS_WEB_SEARXNG_URL", baseUrl],
  ];
  let next = updates.reduce((text, [key, value]) => upsertEnvKey(text, key, value), seed);
  if (!next.endsWith("\n")) {
    next = `${next}\n`;
  }

  await writeFile(envPath, next, "utf8");
  console.log(`✓ Updated .env: HARNESS_WEB_SEARCH_ENGINE=searxng, HARNESS_WEB_SEARXNG_URL=${baseUrl}`);
}
|
|
230
|
+
|
|
231
|
+
/**
 * Trim a user-supplied SearXNG base URL and drop trailing slashes.
 *
 * @param {string} raw URL as typed by the user.
 * @returns {string} The cleaned URL.
 * @throws {Error} When the cleaned value does not start with `http://` or
 *   `https://` (case-insensitive).
 */
function normalizeBaseUrl(raw) {
  const trimmed = raw.trim();
  const url = trimmed.replace(/\/+$/, "");
  const hasHttpScheme = /^https?:\/\//i.test(url);
  if (hasHttpScheme) {
    return url;
  }
  throw new Error(`Invalid SearXNG URL: ${raw}`);
}
|
|
238
|
+
|
|
239
|
+
/**
 * Entry point. Three modes, checked in this order:
 *
 * 1. `--set-url <url>`: health-check an existing instance, record it in
 *    `.env`, exit 0.
 * 2. `--url-only`: print the local base URL (port from the compose `.env`,
 *    or the default) and exit 0.
 * 3. Default: verify Docker, prepare `.searxng/`, start the containers, wait
 *    for health, record the URL in `.env`, and print a summary.
 */
async function main() {
  if (setUrl) {
    const remoteBase = normalizeBaseUrl(setUrl);
    await waitForHealth(remoteBase);
    await upsertHarnessEnv(remoteBase);
    process.exit(0);
  }

  if (urlOnly) {
    // readComposePort already returns the default when the compose .env is absent.
    const knownPort = await readComposePort();
    console.log(`http://127.0.0.1:${knownPort}`);
    process.exit(0);
  }

  await requireDocker();
  await ensureSearxngLayout();
  const localBase = `http://127.0.0.1:${await readComposePort()}`;
  await composeUp();
  await waitForHealth(localBase);
  await upsertHarnessEnv(localBase);

  const harnessScript = join(UP_PKG, ".pi/scripts/harness-web.py");
  const summary = [
    "",
    "SearXNG is ready for harness-web:",
    ` HARNESS_WEB_SEARXNG_URL=${localBase}`,
    ` Test: python3 "${harnessScript}" search "test" -o .web/search.json --limit 2`,
  ];
  for (const message of summary) {
    console.log(message);
  }
}

// Any failure is reported on one line and exits with status 1.
main().catch((failure) => {
  const reason = failure.message || failure;
  console.error(`✗ ${reason}`);
  process.exit(1);
});
|
|
@@ -1,12 +1,21 @@
|
|
|
1
1
|
# harness-web search (internal)
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
Routing: `harness_web/search.py` dispatches by `HARNESS_WEB_SEARCH_ENGINE`.
|
|
4
4
|
|
|
5
|
-
|
|
5
|
+
## Engines
|
|
6
6
|
|
|
7
|
-
|
|
7
|
+
| Value | Module | Notes |
|
|
8
|
+
|-------|--------|-------|
|
|
9
|
+
| `ddg_html` (default) | `search_ddg.py` | DuckDuckGo HTML SERP via Scrapling HTTP (+ one stealth retry on challenge) |
|
|
10
|
+
| `searxng` | `search_searxng.py` | Self-hosted JSON API — requires `HARNESS_WEB_SEARXNG_URL` |
|
|
8
11
|
|
|
9
|
-
|
|
12
|
+
Bootstrap local SearXNG: `node "$UP_PKG/.pi/scripts/harness-searxng-bootstrap.mjs"`
|
|
13
|
+
|
|
14
|
+
## DuckDuckGo HTML (`ddg_html`)
|
|
15
|
+
|
|
16
|
+
`GET https://html.duckduckgo.com/html/?q=…`
|
|
17
|
+
|
|
18
|
+
### Selectors
|
|
10
19
|
|
|
11
20
|
| Field | CSS |
|
|
12
21
|
|-------|-----|
|
|
@@ -16,10 +25,18 @@ Implemented in `harness_web/search_ddg.py` via `Fetcher.get` (HTTP, not a browse
|
|
|
16
25
|
|
|
17
26
|
DDG redirect URLs (`//duckduckgo.com/l/?uddg=…`) are unwrapped to the target `uddg` parameter.
|
|
18
27
|
|
|
19
|
-
|
|
28
|
+
### Challenge detection
|
|
20
29
|
|
|
21
30
|
If status 403 or HTML contains challenge markers (`anomaly-modal`, etc.), retry **once** with `StealthyFetcher`, then exit with a clear “search engine blocked” message.
|
|
22
31
|
|
|
32
|
+
## SearXNG (`searxng`)
|
|
33
|
+
|
|
34
|
+
`GET {HARNESS_WEB_SEARXNG_URL}/search?q=…&format=json&pageno=1`
|
|
35
|
+
|
|
36
|
+
- No client API token (SearXNG has no standard search API key).
|
|
37
|
+
- `search.formats` in instance `settings.yml` must include `json` or the API returns **403**.
|
|
38
|
+
- Public instances are unsuitable (~4 JSON req/hr when limiter on; JSON often disabled). Use self-hosted bootstrap.
|
|
39
|
+
|
|
23
40
|
## Output
|
|
24
41
|
|
|
25
42
|
`.web/search.json` — envelope compatible with legacy Firecrawl skills:
|
|
@@ -31,3 +48,5 @@ If status 403 or HTML contains challenge markers (`anomaly-modal`, etc.), retry
|
|
|
31
48
|
"data": { "web": [{ "url", "title", "description" }] }
|
|
32
49
|
}
|
|
33
50
|
```
|
|
51
|
+
|
|
52
|
+
`engine` reflects the active backend (`ddg_html` or `searxng`).
|
|
@@ -36,7 +36,7 @@ if str(SCRIPT_DIR) not in sys.path:
|
|
|
36
36
|
from harness_web.config import HarnessWebConfig, load_config # noqa: E402
|
|
37
37
|
from harness_web.output import write_search_results # noqa: E402
|
|
38
38
|
from harness_web.scrape import bulk_scrape, map_url, scrape_url # noqa: E402
|
|
39
|
-
from harness_web.
|
|
39
|
+
from harness_web.search import search # noqa: E402
|
|
40
40
|
|
|
41
41
|
DEFAULT_WEB_DIR = ".web"
|
|
42
42
|
|
|
@@ -47,8 +47,8 @@ def _default_out(sub: str) -> Path:
|
|
|
47
47
|
|
|
48
48
|
def cmd_search(args: argparse.Namespace, config: HarnessWebConfig) -> int:
|
|
49
49
|
out = Path(args.output or _default_out("search.json"))
|
|
50
|
-
results =
|
|
51
|
-
write_search_results(out, results, args.query)
|
|
50
|
+
results = search(args.query, limit=args.limit, config=config)
|
|
51
|
+
write_search_results(out, results, args.query, engine=config.search_engine)
|
|
52
52
|
print(f"wrote {out} ({len(results)} results)")
|
|
53
53
|
return 0
|
|
54
54
|
|
|
@@ -76,6 +76,20 @@ def cmd_map(args: argparse.Namespace, config: HarnessWebConfig) -> int:
|
|
|
76
76
|
return 0
|
|
77
77
|
|
|
78
78
|
|
|
79
|
+
def cmd_status(_args: argparse.Namespace, config: HarnessWebConfig) -> int:
    """Print the active harness-web settings as indented JSON.

    The report holds the search engine, the SearXNG URL, the fetch mode, the
    resolved path of this script, and a fixed ``"bootstrap": "ok"`` marker.
    ``_args`` is not used.

    Returns:
        Always 0.
    """
    import json

    script_path = Path(__file__).resolve()
    report = dict(
        search_engine=config.search_engine,
        searxng_url=config.searxng_url,
        fetch_mode=config.fetch_mode,
        script=str(script_path),
        bootstrap="ok",
    )
    print(json.dumps(report, indent=2))
    return 0
|
|
91
|
+
|
|
92
|
+
|
|
79
93
|
def cmd_bulk_scrape(args: argparse.Namespace, config: HarnessWebConfig) -> int:
|
|
80
94
|
sleep_sec = args.sleep if args.sleep is not None else config.rate_limit_ms / 1000.0
|
|
81
95
|
if args.urls:
|
|
@@ -86,8 +100,8 @@ def cmd_bulk_scrape(args: argparse.Namespace, config: HarnessWebConfig) -> int:
|
|
|
86
100
|
data = json.loads(Path(args.from_search).read_text(encoding="utf-8"))
|
|
87
101
|
urls = [item["url"] for item in data.get("data", {}).get("web", []) if item.get("url")]
|
|
88
102
|
else:
|
|
89
|
-
|
|
90
|
-
urls = [r["url"] for r in
|
|
103
|
+
serp = search(args.query, limit=args.limit, config=config)
|
|
104
|
+
urls = [r["url"] for r in serp]
|
|
91
105
|
|
|
92
106
|
if not urls:
|
|
93
107
|
print("bulk-scrape: no URLs to fetch", file=sys.stderr)
|
|
@@ -111,11 +125,11 @@ def cmd_bulk_scrape(args: argparse.Namespace, config: HarnessWebConfig) -> int:
|
|
|
111
125
|
def build_parser() -> argparse.ArgumentParser:
|
|
112
126
|
p = argparse.ArgumentParser(
|
|
113
127
|
prog="harness-web",
|
|
114
|
-
description="Harness web layer: search (DDG HTML) and scrape (Scrapling).",
|
|
128
|
+
description="Harness web layer: search (DDG HTML or SearXNG) and scrape (Scrapling).",
|
|
115
129
|
)
|
|
116
130
|
sub = p.add_subparsers(dest="command", required=True)
|
|
117
131
|
|
|
118
|
-
ps = sub.add_parser("search", help="Search via
|
|
132
|
+
ps = sub.add_parser("search", help="Search via configured SERP (HARNESS_WEB_SEARCH_ENGINE)")
|
|
119
133
|
ps.add_argument("query", help="Search query")
|
|
120
134
|
ps.add_argument("-o", "--output", help="JSON output path (default: .web/search.json)")
|
|
121
135
|
ps.add_argument("--limit", type=int, default=5)
|
|
@@ -160,6 +174,9 @@ def build_parser() -> argparse.ArgumentParser:
|
|
|
160
174
|
pm.add_argument("--fast", action="store_true")
|
|
161
175
|
pm.set_defaults(func=cmd_map)
|
|
162
176
|
|
|
177
|
+
pst = sub.add_parser("status", help="Print harness-web config as JSON (setup/diagnostics)")
|
|
178
|
+
pst.set_defaults(func=cmd_status)
|
|
179
|
+
|
|
163
180
|
return p
|
|
164
181
|
|
|
165
182
|
|