ultimate-pi 0.4.0 → 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (94) hide show
  1. package/.agents/skills/harness-decisions/SKILL.md +15 -0
  2. package/.agents/skills/scrapling-web/SKILL.md +45 -40
  3. package/.agents/skills/wiki-autoresearch/SKILL.md +3 -3
  4. package/.pi/PACKAGING.md +3 -2
  5. package/.pi/SYSTEM.md +12 -13
  6. package/.pi/agents/pi-pi/agent-expert.md +3 -3
  7. package/.pi/extensions/harness-web-guard.ts +95 -0
  8. package/.pi/extensions/harness-web-tools.ts +209 -0
  9. package/.pi/extensions/lib/harness-vcc-settings.ts +50 -0
  10. package/.pi/extensions/lib/harness-web/run-cli.ts +92 -0
  11. package/.pi/extensions/ultimate-pi-vcc.ts +17 -0
  12. package/.pi/harness/docs/adrs/0030-inhouse-vcc-compaction.md +40 -0
  13. package/.pi/harness/docs/adrs/README.md +1 -0
  14. package/.pi/harness/env.harness.template +3 -1
  15. package/.pi/prompts/harness-setup.md +48 -2
  16. package/.pi/scripts/harness-cli-verify.sh +12 -3
  17. package/.pi/scripts/harness-searxng-bootstrap.mjs +270 -0
  18. package/.pi/scripts/harness-web-search.md +24 -5
  19. package/.pi/scripts/harness-web.py +24 -7
  20. package/.pi/scripts/harness_web/config.py +37 -3
  21. package/.pi/scripts/harness_web/output.py +8 -2
  22. package/.pi/scripts/harness_web/search.py +22 -0
  23. package/.pi/scripts/harness_web/search_ddg.py +1 -5
  24. package/.pi/scripts/harness_web/search_searxng.py +100 -0
  25. package/.pi/scripts/vendor-pi-vcc-settings.stub.ts +8 -0
  26. package/.pi/scripts/vendor-sync-pi-vcc.sh +40 -0
  27. package/.pi/settings.example.json +1 -6
  28. package/CHANGELOG.md +20 -6
  29. package/THIRD_PARTY_NOTICES.md +8 -22
  30. package/package.json +7 -6
  31. package/vendor/pi-vcc/README.md +215 -0
  32. package/vendor/pi-vcc/UPSTREAM_PIN.md +12 -0
  33. package/vendor/pi-vcc/demo.gif +0 -0
  34. package/vendor/pi-vcc/index.ts +12 -0
  35. package/vendor/pi-vcc/package.json +26 -0
  36. package/vendor/pi-vcc/scripts/audit-sessions.ts +88 -0
  37. package/vendor/pi-vcc/scripts/benchmark-real-sessions.ts +25 -0
  38. package/vendor/pi-vcc/scripts/compare-before-after.ts +36 -0
  39. package/vendor/pi-vcc/scripts/dump-branch-output.ts +20 -0
  40. package/vendor/pi-vcc/src/commands/pi-vcc.ts +36 -0
  41. package/vendor/pi-vcc/src/commands/vcc-recall.ts +65 -0
  42. package/vendor/pi-vcc/src/core/brief.ts +381 -0
  43. package/vendor/pi-vcc/src/core/build-sections.ts +79 -0
  44. package/vendor/pi-vcc/src/core/content.ts +60 -0
  45. package/vendor/pi-vcc/src/core/filter-noise.ts +42 -0
  46. package/vendor/pi-vcc/src/core/format-recall.ts +27 -0
  47. package/vendor/pi-vcc/src/core/format.ts +49 -0
  48. package/vendor/pi-vcc/src/core/lineage.ts +26 -0
  49. package/vendor/pi-vcc/src/core/load-messages.ts +41 -0
  50. package/vendor/pi-vcc/src/core/normalize.ts +66 -0
  51. package/vendor/pi-vcc/src/core/recall-scope.ts +14 -0
  52. package/vendor/pi-vcc/src/core/render-entries.ts +55 -0
  53. package/vendor/pi-vcc/src/core/report.ts +237 -0
  54. package/vendor/pi-vcc/src/core/sanitize.ts +5 -0
  55. package/vendor/pi-vcc/src/core/search-entries.ts +221 -0
  56. package/vendor/pi-vcc/src/core/settings.ts +8 -0
  57. package/vendor/pi-vcc/src/core/skill-collapse.ts +35 -0
  58. package/vendor/pi-vcc/src/core/summarize.ts +157 -0
  59. package/vendor/pi-vcc/src/core/tool-args.ts +14 -0
  60. package/vendor/pi-vcc/src/details.ts +7 -0
  61. package/vendor/pi-vcc/src/extract/commits.ts +69 -0
  62. package/vendor/pi-vcc/src/extract/files.ts +80 -0
  63. package/vendor/pi-vcc/src/extract/goals.ts +79 -0
  64. package/vendor/pi-vcc/src/extract/preferences.ts +55 -0
  65. package/vendor/pi-vcc/src/hooks/before-compact.ts +314 -0
  66. package/vendor/pi-vcc/src/sections.ts +12 -0
  67. package/vendor/pi-vcc/src/tools/recall.ts +109 -0
  68. package/vendor/pi-vcc/src/types.ts +14 -0
  69. package/vendor/pi-vcc/tests/before-compact-hook.test.ts +204 -0
  70. package/vendor/pi-vcc/tests/before-compact.test.ts +145 -0
  71. package/vendor/pi-vcc/tests/brief.test.ts +206 -0
  72. package/vendor/pi-vcc/tests/build-sections.test.ts +59 -0
  73. package/vendor/pi-vcc/tests/compile.test.ts +80 -0
  74. package/vendor/pi-vcc/tests/content.test.ts +31 -0
  75. package/vendor/pi-vcc/tests/extract-goals.test.ts +86 -0
  76. package/vendor/pi-vcc/tests/extract-preferences.test.ts +30 -0
  77. package/vendor/pi-vcc/tests/filter-noise.test.ts +61 -0
  78. package/vendor/pi-vcc/tests/fixtures.ts +61 -0
  79. package/vendor/pi-vcc/tests/format-recall.test.ts +30 -0
  80. package/vendor/pi-vcc/tests/format.test.ts +62 -0
  81. package/vendor/pi-vcc/tests/lineage.test.ts +33 -0
  82. package/vendor/pi-vcc/tests/load-messages.test.ts +51 -0
  83. package/vendor/pi-vcc/tests/normalize.test.ts +97 -0
  84. package/vendor/pi-vcc/tests/real-sessions.test.ts +38 -0
  85. package/vendor/pi-vcc/tests/recall-expand.test.ts +15 -0
  86. package/vendor/pi-vcc/tests/recall-scope.test.ts +32 -0
  87. package/vendor/pi-vcc/tests/recall-tool-scope.test.ts +67 -0
  88. package/vendor/pi-vcc/tests/render-entries.test.ts +62 -0
  89. package/vendor/pi-vcc/tests/report.test.ts +44 -0
  90. package/vendor/pi-vcc/tests/sanitize.test.ts +24 -0
  91. package/vendor/pi-vcc/tests/search-entries.test.ts +144 -0
  92. package/vendor/pi-vcc/tests/support/load-session.ts +23 -0
  93. package/vendor/pi-vcc/tests/support/real-sessions.ts +51 -0
  94. package/.pi/pi-vcc-config.json +0 -4
@@ -0,0 +1,92 @@
1
+ import { spawnSync } from "node:child_process";
2
+ import { existsSync, readFileSync } from "node:fs";
3
+ import { resolve } from "node:path";
4
+ import { resolveHarnessScript } from "../harness-paths.js";
5
+
6
+ export interface RunHarnessWebResult {
7
+ ok: boolean;
8
+ exitCode: number;
9
+ stdout: string;
10
+ stderr: string;
11
+ }
12
+
13
/**
 * Run the bundled `harness-web.py` CLI synchronously with `python3`.
 *
 * @param moduleUrl - forwarded to `resolveHarnessScript` to locate `harness-web.py`
 * @param args - CLI arguments appended after the script path
 * @param cwd - working directory for the child process
 * @returns trimmed stdout/stderr plus the exit status; `ok` is true only when
 *   the process exited with code 0. When the process could not be started or
 *   was terminated, `exitCode` is 1 and `stderr` carries the reason.
 */
export function runHarnessWeb(
  moduleUrl: string,
  args: string[],
  cwd: string,
): RunHarnessWebResult {
  const script = resolveHarnessScript(moduleUrl, "harness-web.py");
  const result = spawnSync("python3", [script, ...args], {
    cwd,
    env: process.env,
    encoding: "utf-8",
    maxBuffer: 16 * 1024 * 1024,
  });

  // spawnSync reports spawn failures (e.g. python3 not on PATH) and forced
  // termination (e.g. maxBuffer exceeded) through `error` / `signal`, leaving
  // `status` null and stderr typically empty. Surface that reason so callers
  // do not see a bare exit code 1 with no explanation.
  let failure = "";
  if (result.error) {
    failure = `harness-web failed to run: ${result.error.message}`;
  } else if (result.status === null && result.signal) {
    failure = `harness-web terminated by signal ${result.signal}`;
  }

  const stderr = (result.stderr ?? "").trim();
  return {
    ok: result.status === 0,
    exitCode: result.status ?? 1,
    stdout: (result.stdout ?? "").trim(),
    stderr: [stderr, failure].filter(Boolean).join("\n"),
  };
}
32
+
33
/**
 * Read at most `maxChars` characters of a UTF-8 text file for display.
 *
 * @param filePath - path to the file, resolved against `cwd`
 * @param cwd - base directory used to resolve `filePath`
 * @param maxChars - maximum number of characters returned before truncation
 * @returns the file text (with a truncation notice appended when cut), or an
 *   empty string when the path is missing or cannot be read as a file
 */
export function readTextExcerpt(
  filePath: string,
  cwd: string,
  maxChars = 2000,
): string {
  const full = resolve(cwd, filePath);
  if (!existsSync(full)) return "";
  let text: string;
  try {
    text = readFileSync(full, "utf-8");
  } catch {
    // The path exists but is not a readable file (directory, permissions, or
    // removed since the check). Return "" like summarizeSearchJson does
    // instead of throwing into the tool handler.
    return "";
  }
  if (text.length <= maxChars) return text;
  return `${text.slice(0, maxChars)}\n… (truncated; use read tool for full file)`;
}
44
+
45
+ export interface SearchHit {
46
+ url: string;
47
+ title: string;
48
+ description: string;
49
+ }
50
+
51
+ export function summarizeSearchJson(filePath: string, cwd: string): string {
52
+ const full = resolve(cwd, filePath);
53
+ if (!existsSync(full)) return "";
54
+ try {
55
+ const data = JSON.parse(readFileSync(full, "utf-8")) as {
56
+ query?: string;
57
+ engine?: string;
58
+ data?: { web?: SearchHit[] };
59
+ };
60
+ const hits = data.data?.web ?? [];
61
+ const lines = [
62
+ `engine: ${data.engine ?? "unknown"}`,
63
+ `query: ${data.query ?? ""}`,
64
+ `results: ${hits.length}`,
65
+ "",
66
+ ];
67
+ for (const [i, hit] of hits.entries()) {
68
+ lines.push(`${i + 1}. ${hit.title || "(no title)"}`);
69
+ lines.push(` ${hit.url}`);
70
+ if (hit.description) {
71
+ const snip =
72
+ hit.description.length > 120
73
+ ? `${hit.description.slice(0, 120)}…`
74
+ : hit.description;
75
+ lines.push(` ${snip}`);
76
+ }
77
+ }
78
+ return lines.join("\n");
79
+ } catch {
80
+ return "";
81
+ }
82
+ }
83
+
84
+ export function harnessWebContextLine(): string {
85
+ const engine = process.env.HARNESS_WEB_SEARCH_ENGINE?.trim() || "ddg_html";
86
+ const searx = process.env.HARNESS_WEB_SEARXNG_URL?.trim();
87
+ const searxPart = searx ? ` searxng_url=${searx}` : "";
88
+ return (
89
+ `[HarnessWeb] search_engine=${engine}${searxPart} — use web_search / web_fetch tools; ` +
90
+ "never resolve UP_PKG, ls harness-web.py, or python3 -c import scrapling before searching."
91
+ );
92
+ }
@@ -0,0 +1,17 @@
1
+ /**
2
+ * In-house VCC compaction for ultimate-pi.
3
+ *
4
+ * Vendored compaction core from [pi-vcc](https://github.com/sting8k/pi-vcc),
5
+ * inspired by [VCC](https://github.com/lllyasviel/VCC).
6
+ *
7
+ * Configuration is **env-only** (no JSON config files):
8
+ * - `HARNESS_VCC_COMPACTION` — default on; set `false` for Pi LLM compaction on /compact + auto-compact
9
+ * - `HARNESS_VCC_DEBUG` — set `true` to write `/tmp/pi-vcc-debug.json` on compaction
10
+ */
11
+
12
+ import type { ExtensionAPI } from "@mariozechner/pi-coding-agent";
13
+ import registerVcc from "../../vendor/pi-vcc/index.js";
14
+
15
+ export default function ultimatePiVcc(pi: ExtensionAPI): void {
16
+ registerVcc(pi);
17
+ }
@@ -0,0 +1,40 @@
1
+ # ADR 0030: In-house VCC compaction (vendored pi-vcc)
2
+
3
+ - **Status:** Accepted
4
+ - **Date:** 2026-05-17
5
+ - **Deciders:** ultimate-pi harness team
6
+
7
+ ## Context
8
+
9
+ ultimate-pi depended on the npm package `@sting8k/pi-vcc` for deterministic, view-oriented session compaction (no LLM summarization call). We need that behavior by default for harness sessions, without an external package coupling, while preserving attribution to upstream [pi-vcc](https://github.com/sting8k/pi-vcc) and the conceptual [VCC](https://github.com/lllyasviel/VCC) work.
10
+
11
+ ## Decision
12
+
13
+ 1. Vendor [sting8k/pi-vcc](https://github.com/sting8k/pi-vcc) under `vendor/pi-vcc/` (refresh via `npm run vendor:sync-vcc`), following the same pattern as `vendor/pi-model-router`.
14
+ 2. Load compaction through [`.pi/extensions/ultimate-pi-vcc.ts`](../../../extensions/ultimate-pi-vcc.ts).
15
+ 3. Remove `@sting8k/pi-vcc` from `package.json` dependencies and from `.pi/settings*.json` `packages` arrays.
16
+ 4. **Configuration is env-only** — no JSON config files (`PI_VCC_CONFIG_PATH` and `.pi/pi-vcc-config.json` are not used).
17
+ 5. **Default:** `HARNESS_VCC_COMPACTION` unset → VCC overrides Pi built-in LLM compaction for `/compact`, auto-threshold, and overflow.
18
+ 6. **Opt-out:** `HARNESS_VCC_COMPACTION=false` (also `0` / `off`) uses Pi’s LLM compaction for those paths; explicit `/pi-vcc` still uses VCC.
19
+ 7. **Debug:** `HARNESS_VCC_DEBUG=true` writes `/tmp/pi-vcc-debug.json` on compaction (default off).
20
+ 8. Settings implementation: [`.pi/extensions/lib/harness-vcc-settings.ts`](../../../extensions/lib/harness-vcc-settings.ts).
21
+ 9. Compaction telemetry `details.compactor` is `ultimate-pi-vcc`.
22
+
23
+ ## Consequences
24
+
25
+ ### Positive
26
+
27
+ - No runtime dependency on `@sting8k/pi-vcc` npm; vendored tree is pinned and patchable.
28
+ - Harness-default compaction matches ADR intent (deterministic, recall-friendly summaries).
29
+ - Operators can revert to LLM compaction per project via one env var.
30
+
31
+ ### Negative / trade-offs
32
+
33
+ - Maintainer must run `vendor:sync-vcc` to pick up upstream pi-vcc fixes.
34
+ - Vendored `loadSettings` re-exports harness env settings from `.pi/extensions/lib/` (couples vendor tree to ultimate-pi layout).
35
+
36
+ ## References
37
+
38
+ - [THIRD_PARTY_NOTICES.md](../../../../THIRD_PARTY_NOTICES.md)
39
+ - [vendor/pi-vcc/UPSTREAM_PIN.md](../../../../vendor/pi-vcc/UPSTREAM_PIN.md)
40
+ - [`.env.example`](../../../../.env.example) — `HARNESS_VCC_COMPACTION`, `HARNESS_VCC_DEBUG`
@@ -15,6 +15,7 @@ Team-shared ADRs for the ultimate-pi harness live under `.pi/harness/docs/adrs/`
15
15
  | [0007](0007-interactive-drift-monitor.md) | Interactive drift monitor | Accepted |
16
16
  | [0008](0008-harness-posthog-telemetry.md) | Harness PostHog telemetry | Accepted |
17
17
  | [0009](0009-sentrux-rules-lifecycle.md) | Sentrux rules.toml lifecycle | Accepted |
18
+ | [0030](0030-inhouse-vcc-compaction.md) | In-house VCC compaction (vendored pi-vcc) | Accepted |
18
19
 
19
20
  ## Template
20
21
 
@@ -4,9 +4,11 @@
4
4
  # Telemetry (set false to disable harness PostHog events)
5
5
  HARNESS_TELEMETRY_ENABLED=true
6
6
 
7
- # harness-web (Scrapling) local fetch/search; no API key
7
+ # harness-web (Scrapling scrape + pluggable search)
8
8
  HARNESS_WEB_FETCH_MODE=stealth
9
9
  HARNESS_WEB_SEARCH_ENGINE=ddg_html
10
+ # SearXNG (when HARNESS_WEB_SEARCH_ENGINE=searxng):
11
+ # HARNESS_WEB_SEARXNG_URL=http://127.0.0.1:8080
10
12
  # HARNESS_WEB_PROXY=
11
13
  # HARNESS_WEB_RATE_LIMIT_MS=2000
12
14
  # HARNESS_WEB_TIMEOUT_MS=30000
@@ -134,10 +134,13 @@ export PATH="$HOME/.local/bin:$PATH"
134
134
  uv tool install "scrapling[fetchers]"
135
135
  scrapling install # Chromium for default stealth scrape; may need sudo for OS libs on Linux
136
136
  mkdir -p .web
137
+ python3 "$UP_PKG/.pi/scripts/harness-web.py" status # JSON config (setup/diagnostics only)
137
138
  python3 "$UP_PKG/.pi/scripts/harness-web.py" search "ultimate-pi harness" -o .web/smoke-search.json --limit 3
138
139
  python3 "$UP_PKG/.pi/scripts/harness-web.py" scrape "https://example.com" -o .web/smoke-page.md --fast
139
140
  ```
140
141
 
142
+ After pi loads extensions, agents should smoke-test **`web_search`** once (instead of running `UP_PKG` / `import scrapling` preflight checks). Example intent: query `ultimate-pi harness`, `limit` 2.
143
+
141
144
  - **`--skip-tools`:** skip Step 2 (includes Scrapling verify).
142
145
  - On Linux/WSL, if stealth scrape fails, install browser libs from `harness-cli-verify.sh` output or use `--fast` for static targets.
143
146
 
@@ -343,7 +346,7 @@ Verify each package:
343
346
  | `@posthog/pi` | Analytics event capture | F0 |
344
347
  | `pi-lean-ctx` | Context runtime (read/bash/find/grep/MCP bridge) | F0 |
345
348
  | `harness-subagents` (bundled extension) | L4 sub-agent spawn, blackboard, package agents | P16 |
346
- | `@sting8k/pi-vcc` | VCC compaction / conversation memory | Shipped |
349
+ | Vendored `pi-vcc` (`vendor/pi-vcc`, `.pi/extensions/ultimate-pi-vcc.ts`) | VCC compaction / `vcc_recall`; configured via env only: `HARNESS_VCC_COMPACTION` (default on), `HARNESS_VCC_DEBUG` | Shipped |
347
350
  | `pi-model-router` | Vendored (`vendor/`); activates after `.pi/model-router.json` exists | F0 |
348
351
 
349
352
  ## Step 3.5 — Model Router Configuration (Dynamic)
@@ -421,6 +424,47 @@ If **no** `.env` at project root:
421
424
  - On **skip** or `--non-interactive`: warn in report (non-interactive skips creation)
422
425
  - If `ask_user` cancelled: stop with `needs_clarification`
423
426
 
427
+ ### 4.0b — harness-web search engine (non-destructive)
428
+
429
+ Unless `--non-interactive`, **call `ask_user`** after Step 4.0 (harness-decisions skill):
430
+
431
+ ```json
432
+ {
433
+ "question": "Which harness-web search backend should this project use?",
434
+ "context": "Scrapling still handles scrape/map/bulk. Search only: DuckDuckGo HTML needs no extra services. SearXNG must be self-hosted for agents — public instances often block JSON (403) and default to ~4 API requests/hour per IP.",
435
+ "options": [
436
+ {
437
+ "title": "DuckDuckGo HTML (default)",
438
+ "description": "HARNESS_WEB_SEARCH_ENGINE=ddg_html — no Docker"
439
+ },
440
+ {
441
+ "title": "Self-host SearXNG here (Docker)",
442
+ "description": "Bootstrap .searxng/ with official compose, enable JSON API, set harness env"
443
+ },
444
+ {
445
+ "title": "Use existing SearXNG instance",
446
+ "description": "You provide base URL; harness writes HARNESS_WEB_SEARXNG_URL"
447
+ }
448
+ ],
449
+ "allowFreeform": true
450
+ }
451
+ ```
452
+
453
+ | User choice | Actions |
454
+ |-------------|---------|
455
+ | **DDG** | Ensure `.env` has `HARNESS_WEB_SEARCH_ENGINE=ddg_html` via `harness-sync-env.mjs` (append only if missing; do not overwrite user values) |
456
+ | **Self-host** | `node "$UP_PKG/.pi/scripts/harness-searxng-bootstrap.mjs"` (requires Docker). Script sets `HARNESS_WEB_SEARCH_ENGINE=searxng` and `HARNESS_WEB_SEARXNG_URL` |
457
+ | **Existing instance** | Parse base URL from freeform answer. Run `node "$UP_PKG/.pi/scripts/harness-searxng-bootstrap.mjs" --set-url {url}` (health check + upsert `.env`) |
458
+ | **Cancelled** | Stop with `needs_clarification` |
459
+ | **`--non-interactive`** | Skip prompt; keep the existing engine or default to `ddg_html`; do not run Docker bootstrap |
460
+
461
+ Post-choice smoke (report pass/fail):
462
+
463
+ ```bash
464
+ mkdir -p .web
465
+ python3 "$UP_PKG/.pi/scripts/harness-web.py" search "ultimate-pi harness" -o .web/setup-search.json --limit 2
466
+ ```
467
+
424
468
  Rules:
425
469
 
426
470
  - **Do not** `cp` over an existing `.env`.
@@ -428,7 +472,7 @@ Rules:
428
472
  - Re-runs only add keys from `$UP_PKG/.pi/harness/env.harness.template` that are absent (managed block at EOF).
429
473
  - Ensure `.env` is gitignored (Step 4.1).
430
474
 
431
- Template keys (placeholders — user fills secrets): `HARNESS_TELEMETRY_ENABLED`, `HARNESS_WEB_*`, `PI_VCC_CONFIG_PATH`, plus commented optional PostHog / Graphify vars.
475
+ Template keys (placeholders — user fills secrets): `HARNESS_TELEMETRY_ENABLED`, `HARNESS_WEB_*`, `HARNESS_VCC_COMPACTION`, `HARNESS_VCC_DEBUG`, plus commented optional PostHog / Graphify vars.
432
476
 
433
477
  ### 4.1 — .gitignore Entries
434
478
 
@@ -436,6 +480,7 @@ Ensure `.gitignore` contains:
436
480
  ```
437
481
  .env
438
482
  .web/
483
+ .searxng/
439
484
  .raw/
440
485
  .vault-meta/
441
486
  .pi/harness/critics/
@@ -646,6 +691,7 @@ Output summary table:
646
691
  | .gitignore | ✓/✗ | entries added (incl. `.env`) |
647
692
  | ./raw directory | ✓/✗ | Created for graphify source ingestion |
648
693
  | harness-web (Scrapling) | ✓/✗ | search + scrape smoke |
694
+ | harness-web search engine | ddg / searxng / — | Step 4.0b choice; SearXNG URL if applicable |
649
695
 
650
696
  Next steps:
651
697
  1. If tools missing: re-run with `--force` or install individually
@@ -200,10 +200,19 @@ verify_scrapling() {
200
200
  return
201
201
  fi
202
202
  mkdir -p .web
203
- if python3 "$_hw" search "ultimate-pi harness" -o .web/verify-search.json --limit 2 2>/dev/null | grep -q wrote; then
204
- pass "harness-web search smoke"
203
+ _search_engine="${HARNESS_WEB_SEARCH_ENGINE:-ddg_html}"
204
+ if [ "$_search_engine" = "searxng" ]; then
205
+ if [ -z "${HARNESS_WEB_SEARXNG_URL:-}" ]; then
206
+ fail "HARNESS_WEB_SEARCH_ENGINE=searxng but HARNESS_WEB_SEARXNG_URL is unset"
207
+ elif python3 "$_hw" search "ultimate-pi harness" -o .web/verify-search.json --limit 2 2>/dev/null | grep -q wrote; then
208
+ pass "harness-web search smoke (searxng)"
209
+ else
210
+ fail "harness-web search smoke failed (searxng at ${HARNESS_WEB_SEARXNG_URL})"
211
+ fi
212
+ elif python3 "$_hw" search "ultimate-pi harness" -o .web/verify-search.json --limit 2 2>/dev/null | grep -q wrote; then
213
+ pass "harness-web search smoke (ddg_html)"
205
214
  else
206
- fail "harness-web search smoke failed"
215
+ fail "harness-web search smoke failed (ddg_html)"
207
216
  fi
208
217
  if python3 "$_hw" scrape "https://example.com" -o .web/verify-page.md --fast 2>/dev/null | grep -q wrote; then
209
218
  pass "harness-web scrape --fast smoke"
@@ -0,0 +1,270 @@
1
+ #!/usr/bin/env node
2
+ /**
3
+ * Bootstrap a project-local SearXNG instance for harness-web (Docker Compose).
4
+ *
5
+ * - Creates .searxng/ with official upstream compose template
6
+ * - Writes core-config/settings.yml with json format + limiter off (local dev)
7
+ * - Starts containers and waits for JSON search health
8
+ * - Upserts HARNESS_WEB_SEARCH_ENGINE / HARNESS_WEB_SEARXNG_URL in project .env
9
+ *
10
+ * Usage:
11
+ * node "$UP_PKG/.pi/scripts/harness-searxng-bootstrap.mjs" [PROJECT_ROOT] [--url-only]
12
+ * node "$UP_PKG/.pi/scripts/harness-searxng-bootstrap.mjs" --set-url http://127.0.0.1:8080
13
+ *
14
+ * Requires: docker, docker compose, curl
15
+ */
16
+
17
+ import {
18
+ access,
19
+ copyFile,
20
+ mkdir,
21
+ readFile,
22
+ writeFile,
23
+ } from "node:fs/promises";
24
+ import { constants } from "node:fs";
25
+ import { join, dirname } from "node:path";
26
+ import { fileURLToPath } from "node:url";
27
+ import { spawn } from "node:child_process";
28
+
29
+ const SCRIPT_DIR = dirname(fileURLToPath(import.meta.url));
30
+ const UP_PKG = join(SCRIPT_DIR, "..", "..");
31
+
32
+ const SEARXNG_BASE =
33
+ "https://raw.githubusercontent.com/searxng/searxng/master/container";
34
+ const DEFAULT_PORT = "8080";
35
+ const HEALTH_PATH = "/search?q=harness&format=json";
36
+
37
+ const MANAGED_START = "# --- harness:env:start ---";
38
+ const MANAGED_END = "# --- harness:env:end ---";
39
+
40
/**
 * Split a raw `process.argv`-style array into positionals, flags and the
 * `--set-url` value.
 *
 * The token following `--set-url` is that option's value. It does not start
 * with "-", so it must be excluded from the positionals explicitly; otherwise
 * `--set-url http://host` would be mistaken for PROJECT_ROOT and `.env` would
 * be written under a path derived from the URL.
 *
 * @param {string[]} argv - full argv (the first two entries are skipped)
 * @returns {{ positionals: string[], flags: Set<string>, setUrl: string | null }}
 */
function parseCliArgs(argv) {
  const tokens = argv.slice(2);
  const setUrlAt = tokens.indexOf("--set-url");
  const hasSetUrl = setUrlAt !== -1;
  return {
    positionals: tokens.filter(
      (tok, i) => !tok.startsWith("-") && !(hasSetUrl && i === setUrlAt + 1),
    ),
    flags: new Set(tokens.filter((tok) => tok.startsWith("-"))),
    setUrl: hasSetUrl ? (tokens[setUrlAt + 1] ?? null) : null,
  };
}

const cli = parseCliArgs(process.argv);
const args = cli.positionals;
const flags = cli.flags;
const urlOnly = flags.has("--url-only");
const setUrlIdx = process.argv.indexOf("--set-url");
const setUrl = cli.setUrl;

// First positional (if any) is the project root; defaults to the current directory.
const PROJECT_ROOT = args[0] || process.cwd();
47
+ const SEARXNG_DIR = join(PROJECT_ROOT, ".searxng");
48
+ const CORE_CONFIG = join(SEARXNG_DIR, "core-config");
49
+ const SETTINGS_PATH = join(CORE_CONFIG, "settings.yml");
50
+ const COMPOSE_PATH = join(SEARXNG_DIR, "docker-compose.yml");
51
+ const ENV_COMPOSE = join(SEARXNG_DIR, ".env");
52
+
53
+ const HARNESS_SETTINGS = `use_default_settings: true
54
+
55
+ search:
56
+ formats:
57
+ - html
58
+ - json
59
+
60
+ server:
61
+ limiter: false
62
+ public_instance: false
63
+ `;
64
+
65
+ async function exists(path) {
66
+ try {
67
+ await access(path, constants.F_OK);
68
+ return true;
69
+ } catch {
70
+ return false;
71
+ }
72
+ }
73
+
74
+ function run(cmd, cmdArgs, opts = {}) {
75
+ return new Promise((resolve, reject) => {
76
+ const child = spawn(cmd, cmdArgs, {
77
+ stdio: opts.inherit ? "inherit" : "pipe",
78
+ cwd: opts.cwd,
79
+ env: { ...process.env, ...opts.env },
80
+ });
81
+ let stdout = "";
82
+ let stderr = "";
83
+ if (!opts.inherit) {
84
+ child.stdout?.on("data", (d) => {
85
+ stdout += d;
86
+ });
87
+ child.stderr?.on("data", (d) => {
88
+ stderr += d;
89
+ });
90
+ }
91
+ child.on("error", reject);
92
+ child.on("close", (code) => {
93
+ if (code === 0) resolve({ stdout, stderr });
94
+ else
95
+ reject(
96
+ new Error(
97
+ `${cmd} ${cmdArgs.join(" ")} exited ${code}\n${stderr || stdout}`,
98
+ ),
99
+ );
100
+ });
101
+ });
102
+ }
103
+
104
+ async function requireDocker() {
105
+ for (const bin of ["docker"]) {
106
+ try {
107
+ await run(bin, ["--version"]);
108
+ } catch {
109
+ console.error(`✗ ${bin} not found`);
110
+ console.error(
111
+ "Install Docker: https://docs.searxng.org/admin/installation-docker.html",
112
+ );
113
+ process.exit(1);
114
+ }
115
+ }
116
+ try {
117
+ await run("docker", ["compose", "version"]);
118
+ } catch {
119
+ console.error("✗ docker compose not available");
120
+ console.error(
121
+ "Install Docker Compose v2: https://docs.docker.com/compose/install/",
122
+ );
123
+ process.exit(1);
124
+ }
125
+ }
126
+
127
+ async function curlToFile(url, dest) {
128
+ await run("curl", ["-fsSL", "-o", dest, url]);
129
+ }
130
+
131
+ async function readComposePort() {
132
+ if (!(await exists(ENV_COMPOSE))) return DEFAULT_PORT;
133
+ const text = await readFile(ENV_COMPOSE, "utf8");
134
+ for (const line of text.split("\n")) {
135
+ const m = line.match(/^SEARXNG_PORT=(.+)$/);
136
+ if (m) return m[1].trim().replace(/^["']|["']$/g, "") || DEFAULT_PORT;
137
+ }
138
+ return DEFAULT_PORT;
139
+ }
140
+
141
+ async function ensureSearxngLayout() {
142
+ await mkdir(CORE_CONFIG, { recursive: true });
143
+ if (!(await exists(COMPOSE_PATH))) {
144
+ console.log("Fetching SearXNG docker-compose.yml …");
145
+ await curlToFile(`${SEARXNG_BASE}/docker-compose.yml`, COMPOSE_PATH);
146
+ }
147
+ if (!(await exists(ENV_COMPOSE))) {
148
+ const example = join(SEARXNG_DIR, ".env.example");
149
+ if (!(await exists(example))) {
150
+ console.log("Fetching SearXNG .env.example …");
151
+ await curlToFile(`${SEARXNG_BASE}/.env.example`, example);
152
+ }
153
+ await copyFile(example, ENV_COMPOSE);
154
+ }
155
+ const needsSettings =
156
+ !(await exists(SETTINGS_PATH)) ||
157
+ !(await readFile(SETTINGS_PATH, "utf8")).includes("json");
158
+ if (needsSettings) {
159
+ await writeFile(SETTINGS_PATH, HARNESS_SETTINGS, "utf8");
160
+ console.log(`✓ Wrote ${SETTINGS_PATH} (json format, limiter off)`);
161
+ }
162
+ }
163
+
164
+ async function composeUp() {
165
+ console.log("Starting SearXNG (docker compose up -d) …");
166
+ await run("docker", ["compose", "up", "-d"], { cwd: SEARXNG_DIR, inherit: true });
167
+ }
168
+
169
/**
 * Poll the SearXNG JSON search endpoint until it answers with a JSON object.
 *
 * Connection errors, non-OK statuses and unparsable bodies are treated as
 * "not ready yet" and retried until the deadline. HTTP 403 is different: it
 * means the instance is up but rejects `format=json`, which waiting cannot
 * fix, so it aborts immediately with an actionable message instead of being
 * retried until the timeout.
 *
 * @param {string} baseUrl - instance base URL without a trailing slash
 * @param {{ timeoutMs?: number, intervalMs?: number }} [options] - overall
 *   deadline and delay between attempts (defaults: 90 s and 3 s)
 * @returns {Promise<void>} resolves once the instance is healthy
 * @throws {Error} on HTTP 403, or when the deadline passes without success
 */
async function waitForHealth(baseUrl, { timeoutMs = 90_000, intervalMs = 3000 } = {}) {
  const url = `${baseUrl}${HEALTH_PATH}`;
  const deadline = Date.now() + timeoutMs;
  let lastErr = "";
  while (Date.now() < deadline) {
    let res = null;
    try {
      res = await fetch(url, {
        headers: { Accept: "application/json" },
        signal: AbortSignal.timeout(10_000),
      });
    } catch (err) {
      // Network-level failure (container still starting, timeout): retry.
      lastErr = err instanceof Error ? err.message : String(err);
    }

    if (res) {
      if (res.status === 403) {
        // Thrown outside the retry try/catch on purpose: this is fatal.
        throw new Error(
          "SearXNG returned 403 for format=json — ensure search.formats includes json in .searxng/core-config/settings.yml",
        );
      }
      lastErr = `HTTP ${res.status}`;
      if (res.ok) {
        try {
          const data = await res.json();
          if (data && typeof data === "object") {
            console.log(`✓ SearXNG healthy at ${baseUrl}`);
            return;
          }
        } catch (err) {
          // Body was not valid JSON yet: remember why and retry.
          lastErr = err instanceof Error ? err.message : String(err);
        }
      }
    }

    await new Promise((r) => setTimeout(r, intervalMs));
  }
  throw new Error(`SearXNG health check timed out (${url}): ${lastErr}`);
}
199
+
200
/**
 * Set `key=value` in dotenv-style text and return the updated text.
 *
 * - If an uncommented `key=` line exists, that first line is replaced in place.
 * - Otherwise, if the harness-managed block markers are present, the line is
 *   inserted just before the end marker.
 * - Otherwise a new managed block containing the line is appended.
 *
 * @param {string} content - current file text (may be empty)
 * @param {string} key - environment variable name
 * @param {string} value - value written verbatim after `=`
 * @returns {string} updated file text
 */
function upsertEnvKey(content, key, value) {
  const line = `${key}=${value}`;
  // Escape the key so regex metacharacters in it are matched literally.
  const escapedKey = key.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const re = new RegExp(`^${escapedKey}=.*$`, "m");
  if (re.test(content)) {
    // Function replacer: a plain string would have "$&", "$1", "$$" in the
    // value (e.g. a URL) expanded as replacement patterns.
    return content.replace(re, () => line);
  }
  if (content.includes(MANAGED_START) && content.includes(MANAGED_END)) {
    const end = content.indexOf(MANAGED_END);
    return `${content.slice(0, end)}${line}\n${content.slice(end)}`;
  }
  const sep = content.endsWith("\n") || content.length === 0 ? "" : "\n";
  return `${content}${sep}${MANAGED_START}\n# harness-web (SearXNG)\n${line}\n${MANAGED_END}\n`;
}
213
+
214
+ async function upsertHarnessEnv(baseUrl) {
215
+ const envPath = join(PROJECT_ROOT, ".env");
216
+ let content = "";
217
+ if (await exists(envPath)) {
218
+ content = await readFile(envPath, "utf8");
219
+ } else {
220
+ const template = join(UP_PKG, ".pi", "harness", "env.harness.template");
221
+ if (await exists(template)) {
222
+ content = await readFile(template, "utf8");
223
+ }
224
+ }
225
+ content = upsertEnvKey(content, "HARNESS_WEB_SEARCH_ENGINE", "searxng");
226
+ content = upsertEnvKey(content, "HARNESS_WEB_SEARXNG_URL", baseUrl);
227
+ await writeFile(envPath, content.endsWith("\n") ? content : `${content}\n`, "utf8");
228
+ console.log(`✓ Updated .env: HARNESS_WEB_SEARCH_ENGINE=searxng, HARNESS_WEB_SEARXNG_URL=${baseUrl}`);
229
+ }
230
+
231
+ function normalizeBaseUrl(raw) {
232
+ const url = raw.trim().replace(/\/+$/, "");
233
+ if (!/^https?:\/\//i.test(url)) {
234
+ throw new Error(`Invalid SearXNG URL: ${raw}`);
235
+ }
236
+ return url;
237
+ }
238
+
239
+ async function main() {
240
+ if (setUrl) {
241
+ const baseUrl = normalizeBaseUrl(setUrl);
242
+ await waitForHealth(baseUrl);
243
+ await upsertHarnessEnv(baseUrl);
244
+ process.exit(0);
245
+ }
246
+
247
+ if (urlOnly) {
248
+ const port = (await exists(ENV_COMPOSE)) ? await readComposePort() : DEFAULT_PORT;
249
+ console.log(`http://127.0.0.1:${port}`);
250
+ process.exit(0);
251
+ }
252
+
253
+ await requireDocker();
254
+ await ensureSearxngLayout();
255
+ const port = await readComposePort();
256
+ const baseUrl = `http://127.0.0.1:${port}`;
257
+ await composeUp();
258
+ await waitForHealth(baseUrl);
259
+ await upsertHarnessEnv(baseUrl);
260
+
261
+ console.log("");
262
+ console.log("SearXNG is ready for harness-web:");
263
+ console.log(` HARNESS_WEB_SEARXNG_URL=${baseUrl}`);
264
+ console.log(` Test: python3 "${join(UP_PKG, ".pi/scripts/harness-web.py")}" search "test" -o .web/search.json --limit 2`);
265
+ }
266
+
267
+ main().catch((err) => {
268
+ console.error(`✗ ${err.message || err}`);
269
+ process.exit(1);
270
+ });
@@ -1,12 +1,21 @@
1
1
  # harness-web search (internal)
2
2
 
3
- ## Engine
3
+ Routing: `harness_web/search.py` dispatches by `HARNESS_WEB_SEARCH_ENGINE`.
4
4
 
5
- Default: DuckDuckGo static HTML — `GET https://html.duckduckgo.com/html/?q=…`
5
+ ## Engines
6
6
 
7
- Implemented in `harness_web/search_ddg.py` via `Fetcher.get` (HTTP, not a browser per query).
7
+ | Value | Module | Notes |
8
+ |-------|--------|-------|
9
+ | `ddg_html` (default) | `search_ddg.py` | DuckDuckGo HTML SERP via Scrapling HTTP (+ one stealth retry on challenge) |
10
+ | `searxng` | `search_searxng.py` | Self-hosted JSON API — requires `HARNESS_WEB_SEARXNG_URL` |
8
11
 
9
- ## Selectors
12
+ Bootstrap local SearXNG: `node "$UP_PKG/.pi/scripts/harness-searxng-bootstrap.mjs"`
13
+
14
+ ## DuckDuckGo HTML (`ddg_html`)
15
+
16
+ `GET https://html.duckduckgo.com/html/?q=…`
17
+
18
+ ### Selectors
10
19
 
11
20
  | Field | CSS |
12
21
  |-------|-----|
@@ -16,10 +25,18 @@ Implemented in `harness_web/search_ddg.py` via `Fetcher.get` (HTTP, not a browse
16
25
 
17
26
  DDG redirect URLs (`//duckduckgo.com/l/?uddg=…`) are unwrapped to the target `uddg` parameter.
18
27
 
19
- ## Challenge detection
28
+ ### Challenge detection
20
29
 
21
30
  If status 403 or HTML contains challenge markers (`anomaly-modal`, etc.), retry **once** with `StealthyFetcher`, then exit with a clear “search engine blocked” message.
22
31
 
32
+ ## SearXNG (`searxng`)
33
+
34
+ `GET {HARNESS_WEB_SEARXNG_URL}/search?q=…&format=json&pageno=1`
35
+
36
+ - No client API token (SearXNG has no standard search API key).
37
+ - `search.formats` in instance `settings.yml` must include `json` or the API returns **403**.
38
+ - Public instances are unsuitable (~4 JSON req/hr when limiter on; JSON often disabled). Use self-hosted bootstrap.
39
+
23
40
  ## Output
24
41
 
25
42
  `.web/search.json` — envelope compatible with legacy Firecrawl skills:
@@ -31,3 +48,5 @@ If status 403 or HTML contains challenge markers (`anomaly-modal`, etc.), retry
31
48
  "data": { "web": [{ "url", "title", "description" }] }
32
49
  }
33
50
  ```
51
+
52
+ `engine` reflects the active backend (`ddg_html` or `searxng`).