ultimate-pi 0.4.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.agents/skills/harness-decisions/SKILL.md +15 -0
- package/.agents/skills/scrapling-web/SKILL.md +45 -40
- package/.agents/skills/wiki-autoresearch/SKILL.md +3 -3
- package/.pi/PACKAGING.md +3 -2
- package/.pi/SYSTEM.md +12 -13
- package/.pi/agents/pi-pi/agent-expert.md +3 -3
- package/.pi/extensions/harness-web-guard.ts +95 -0
- package/.pi/extensions/harness-web-tools.ts +209 -0
- package/.pi/extensions/lib/harness-vcc-settings.ts +50 -0
- package/.pi/extensions/lib/harness-web/run-cli.ts +92 -0
- package/.pi/extensions/ultimate-pi-vcc.ts +17 -0
- package/.pi/harness/docs/adrs/0030-inhouse-vcc-compaction.md +40 -0
- package/.pi/harness/docs/adrs/README.md +1 -0
- package/.pi/harness/env.harness.template +3 -1
- package/.pi/prompts/harness-setup.md +48 -2
- package/.pi/scripts/harness-cli-verify.sh +12 -3
- package/.pi/scripts/harness-searxng-bootstrap.mjs +270 -0
- package/.pi/scripts/harness-web-search.md +24 -5
- package/.pi/scripts/harness-web.py +24 -7
- package/.pi/scripts/harness_web/config.py +37 -3
- package/.pi/scripts/harness_web/output.py +8 -2
- package/.pi/scripts/harness_web/search.py +22 -0
- package/.pi/scripts/harness_web/search_ddg.py +1 -5
- package/.pi/scripts/harness_web/search_searxng.py +100 -0
- package/.pi/scripts/vendor-pi-vcc-settings.stub.ts +8 -0
- package/.pi/scripts/vendor-sync-pi-vcc.sh +40 -0
- package/.pi/settings.example.json +1 -6
- package/CHANGELOG.md +20 -6
- package/THIRD_PARTY_NOTICES.md +8 -22
- package/package.json +7 -6
- package/vendor/pi-vcc/README.md +215 -0
- package/vendor/pi-vcc/UPSTREAM_PIN.md +12 -0
- package/vendor/pi-vcc/demo.gif +0 -0
- package/vendor/pi-vcc/index.ts +12 -0
- package/vendor/pi-vcc/package.json +26 -0
- package/vendor/pi-vcc/scripts/audit-sessions.ts +88 -0
- package/vendor/pi-vcc/scripts/benchmark-real-sessions.ts +25 -0
- package/vendor/pi-vcc/scripts/compare-before-after.ts +36 -0
- package/vendor/pi-vcc/scripts/dump-branch-output.ts +20 -0
- package/vendor/pi-vcc/src/commands/pi-vcc.ts +36 -0
- package/vendor/pi-vcc/src/commands/vcc-recall.ts +65 -0
- package/vendor/pi-vcc/src/core/brief.ts +381 -0
- package/vendor/pi-vcc/src/core/build-sections.ts +79 -0
- package/vendor/pi-vcc/src/core/content.ts +60 -0
- package/vendor/pi-vcc/src/core/filter-noise.ts +42 -0
- package/vendor/pi-vcc/src/core/format-recall.ts +27 -0
- package/vendor/pi-vcc/src/core/format.ts +49 -0
- package/vendor/pi-vcc/src/core/lineage.ts +26 -0
- package/vendor/pi-vcc/src/core/load-messages.ts +41 -0
- package/vendor/pi-vcc/src/core/normalize.ts +66 -0
- package/vendor/pi-vcc/src/core/recall-scope.ts +14 -0
- package/vendor/pi-vcc/src/core/render-entries.ts +55 -0
- package/vendor/pi-vcc/src/core/report.ts +237 -0
- package/vendor/pi-vcc/src/core/sanitize.ts +5 -0
- package/vendor/pi-vcc/src/core/search-entries.ts +221 -0
- package/vendor/pi-vcc/src/core/settings.ts +8 -0
- package/vendor/pi-vcc/src/core/skill-collapse.ts +35 -0
- package/vendor/pi-vcc/src/core/summarize.ts +157 -0
- package/vendor/pi-vcc/src/core/tool-args.ts +14 -0
- package/vendor/pi-vcc/src/details.ts +7 -0
- package/vendor/pi-vcc/src/extract/commits.ts +69 -0
- package/vendor/pi-vcc/src/extract/files.ts +80 -0
- package/vendor/pi-vcc/src/extract/goals.ts +79 -0
- package/vendor/pi-vcc/src/extract/preferences.ts +55 -0
- package/vendor/pi-vcc/src/hooks/before-compact.ts +314 -0
- package/vendor/pi-vcc/src/sections.ts +12 -0
- package/vendor/pi-vcc/src/tools/recall.ts +109 -0
- package/vendor/pi-vcc/src/types.ts +14 -0
- package/vendor/pi-vcc/tests/before-compact-hook.test.ts +204 -0
- package/vendor/pi-vcc/tests/before-compact.test.ts +145 -0
- package/vendor/pi-vcc/tests/brief.test.ts +206 -0
- package/vendor/pi-vcc/tests/build-sections.test.ts +59 -0
- package/vendor/pi-vcc/tests/compile.test.ts +80 -0
- package/vendor/pi-vcc/tests/content.test.ts +31 -0
- package/vendor/pi-vcc/tests/extract-goals.test.ts +86 -0
- package/vendor/pi-vcc/tests/extract-preferences.test.ts +30 -0
- package/vendor/pi-vcc/tests/filter-noise.test.ts +61 -0
- package/vendor/pi-vcc/tests/fixtures.ts +61 -0
- package/vendor/pi-vcc/tests/format-recall.test.ts +30 -0
- package/vendor/pi-vcc/tests/format.test.ts +62 -0
- package/vendor/pi-vcc/tests/lineage.test.ts +33 -0
- package/vendor/pi-vcc/tests/load-messages.test.ts +51 -0
- package/vendor/pi-vcc/tests/normalize.test.ts +97 -0
- package/vendor/pi-vcc/tests/real-sessions.test.ts +38 -0
- package/vendor/pi-vcc/tests/recall-expand.test.ts +15 -0
- package/vendor/pi-vcc/tests/recall-scope.test.ts +32 -0
- package/vendor/pi-vcc/tests/recall-tool-scope.test.ts +67 -0
- package/vendor/pi-vcc/tests/render-entries.test.ts +62 -0
- package/vendor/pi-vcc/tests/report.test.ts +44 -0
- package/vendor/pi-vcc/tests/sanitize.test.ts +24 -0
- package/vendor/pi-vcc/tests/search-entries.test.ts +144 -0
- package/vendor/pi-vcc/tests/support/load-session.ts +23 -0
- package/vendor/pi-vcc/tests/support/real-sessions.ts +51 -0
- package/.pi/pi-vcc-config.json +0 -4
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
import { spawnSync } from "node:child_process";
|
|
2
|
+
import { existsSync, readFileSync } from "node:fs";
|
|
3
|
+
import { resolve } from "node:path";
|
|
4
|
+
import { resolveHarnessScript } from "../harness-paths.js";
|
|
5
|
+
|
|
6
|
+
/** Outcome of a single `harness-web.py` invocation. */
export interface RunHarnessWebResult {
  /** True only when the process exited with status 0. */
  ok: boolean;
  /** Exit status of the process; 1 when no status is available (launch failure or signal). */
  exitCode: number;
  /** Trimmed standard output. */
  stdout: string;
  /** Trimmed standard error, followed by a launch/signal diagnostic when there is one. */
  stderr: string;
}

/**
 * Run `harness-web.py` synchronously with `python3` and collect its output.
 *
 * @param moduleUrl - Passed to `resolveHarnessScript` to locate the script.
 * @param args - Arguments appended after the script path.
 * @param cwd - Working directory for the child process.
 * @returns Exit status plus trimmed stdout/stderr. When the process could not
 *   be launched or was terminated by a signal, the reason is appended to
 *   `stderr` so callers do not receive a failure with an empty message.
 */
export function runHarnessWeb(
  moduleUrl: string,
  args: string[],
  cwd: string,
): RunHarnessWebResult {
  const script = resolveHarnessScript(moduleUrl, "harness-web.py");
  const result = spawnSync("python3", [script, ...args], {
    cwd,
    env: process.env,
    encoding: "utf-8",
    maxBuffer: 16 * 1024 * 1024,
  });

  // spawnSync does not throw on launch problems (missing interpreter,
  // maxBuffer overflow, ...); it reports them through `error` / `signal`
  // while stdout/stderr may be null or empty.
  let diagnostic = "";
  if (result.error) {
    diagnostic = `python3 could not be run: ${result.error.message}`;
  } else if (result.signal) {
    diagnostic = `python3 terminated by signal ${result.signal}`;
  }

  const stderr = [(result.stderr ?? "").trim(), diagnostic]
    .filter((part) => part.length > 0)
    .join("\n");

  return {
    ok: result.status === 0,
    exitCode: result.status ?? 1,
    stdout: (result.stdout ?? "").trim(),
    stderr,
  };
}
|
|
32
|
+
|
|
33
|
+
/**
 * Read a text file and return at most `maxChars` characters of it.
 *
 * @param filePath - Path to the file, resolved against `cwd`.
 * @param cwd - Base directory used to resolve `filePath`.
 * @param maxChars - Maximum number of characters returned before truncation.
 * @returns The file content, or its first `maxChars` characters followed by a
 *   truncation notice. Returns `""` when the path does not exist or cannot be
 *   read as a file (directory, permission error, removed between check and read).
 */
export function readTextExcerpt(
  filePath: string,
  cwd: string,
  maxChars = 2000,
): string {
  const full = resolve(cwd, filePath);
  if (!existsSync(full)) return "";

  let text: string;
  try {
    text = readFileSync(full, "utf-8");
  } catch {
    // Best-effort excerpt: an unreadable path is treated like a missing one,
    // matching how summarizeSearchJson handles read failures.
    return "";
  }

  if (text.length <= maxChars) return text;
  return `${text.slice(0, maxChars)}\n… (truncated; use read tool for full file)`;
}
|
|
44
|
+
|
|
45
|
+
/** One web result inside a harness-web search envelope. */
export interface SearchHit {
  url: string;
  title: string;
  description: string;
}

/**
 * Build a short plain-text summary of a harness-web search result file.
 *
 * The file is expected to hold JSON with optional `engine` and `query`
 * strings and a `data.web` array of hits. The content is validated instead of
 * trusted: entries of `data.web` that are not objects are skipped, and hit
 * fields that are not strings are treated as empty.
 *
 * @param filePath - Path to the JSON file, resolved against `cwd`.
 * @param cwd - Base directory used to resolve `filePath`.
 * @returns A header (engine, query, result count) followed by one numbered
 *   entry per hit with its URL and a description clipped to 120 characters.
 *   Returns `""` when the file is missing, unreadable, not valid JSON, or not
 *   a JSON object.
 */
export function summarizeSearchJson(filePath: string, cwd: string): string {
  const full = resolve(cwd, filePath);
  if (!existsSync(full)) return "";

  let parsed: unknown;
  try {
    parsed = JSON.parse(readFileSync(full, "utf-8"));
  } catch {
    return "";
  }

  const isRecord = (value: unknown): value is Record<string, unknown> =>
    typeof value === "object" && value !== null;
  const text = (value: unknown): string =>
    typeof value === "string" ? value : "";

  if (!isRecord(parsed)) return "";

  const payload = parsed.data;
  const rawHits: unknown[] =
    isRecord(payload) && Array.isArray(payload.web) ? payload.web : [];
  // Skip malformed entries instead of letting one bad hit discard the summary.
  const hits: SearchHit[] = rawHits.filter(isRecord).map((hit) => ({
    url: text(hit.url),
    title: text(hit.title),
    description: text(hit.description),
  }));

  const lines = [
    `engine: ${typeof parsed.engine === "string" ? parsed.engine : "unknown"}`,
    `query: ${text(parsed.query)}`,
    `results: ${hits.length}`,
    "",
  ];
  for (const [i, hit] of hits.entries()) {
    lines.push(`${i + 1}. ${hit.title || "(no title)"}`);
    lines.push(` ${hit.url}`);
    if (hit.description) {
      const snip =
        hit.description.length > 120
          ? `${hit.description.slice(0, 120)}…`
          : hit.description;
      lines.push(` ${snip}`);
    }
  }
  return lines.join("\n");
}
|
|
83
|
+
|
|
84
|
+
/**
 * Compose the one-line `[HarnessWeb]` context hint from the environment.
 *
 * Reads `HARNESS_WEB_SEARCH_ENGINE` (falls back to `ddg_html` when unset or
 * blank) and `HARNESS_WEB_SEARXNG_URL` (mentioned only when non-blank).
 *
 * @returns The hint naming the active search engine and the tools to use.
 */
export function harnessWebContextLine(): string {
  const { HARNESS_WEB_SEARCH_ENGINE, HARNESS_WEB_SEARXNG_URL } = process.env;
  const engineName = HARNESS_WEB_SEARCH_ENGINE?.trim() || "ddg_html";
  const searxUrl = HARNESS_WEB_SEARXNG_URL?.trim();

  const pieces = [`[HarnessWeb] search_engine=${engineName}`];
  if (searxUrl) {
    pieces.push(` searxng_url=${searxUrl}`);
  }
  pieces.push(
    " — use web_search / web_fetch tools; ",
    "never resolve UP_PKG, ls harness-web.py, or python3 -c import scrapling before searching.",
  );
  return pieces.join("");
}
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* In-house VCC compaction for ultimate-pi.
|
|
3
|
+
*
|
|
4
|
+
* Vendored compaction core from [pi-vcc](https://github.com/sting8k/pi-vcc),
|
|
5
|
+
* inspired by [VCC](https://github.com/lllyasviel/VCC).
|
|
6
|
+
*
|
|
7
|
+
* Configuration is **env-only** (no JSON config files):
|
|
8
|
+
* - `HARNESS_VCC_COMPACTION` — default on; set `false` for Pi LLM compaction on /compact + auto-compact
|
|
9
|
+
* - `HARNESS_VCC_DEBUG` — set `true` to write `/tmp/pi-vcc-debug.json` on compaction
|
|
10
|
+
*/
|
|
11
|
+
|
|
12
|
+
import type { ExtensionAPI } from "@mariozechner/pi-coding-agent";
|
|
13
|
+
import registerVcc from "../../vendor/pi-vcc/index.js";
|
|
14
|
+
|
|
15
|
+
/**
 * Extension entry point: hands the extension API to the vendored pi-vcc
 * registration function.
 *
 * @param pi - Extension API instance supplied by the host.
 */
export default function ultimatePiVcc(pi: ExtensionAPI): void {
  registerVcc(pi);
}
|
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
# ADR 0030: In-house VCC compaction (vendored pi-vcc)
|
|
2
|
+
|
|
3
|
+
- **Status:** Accepted
|
|
4
|
+
- **Date:** 2026-05-17
|
|
5
|
+
- **Deciders:** ultimate-pi harness team
|
|
6
|
+
|
|
7
|
+
## Context
|
|
8
|
+
|
|
9
|
+
ultimate-pi depended on the npm package `@sting8k/pi-vcc` for deterministic, view-oriented session compaction (no LLM summarization call). We need that behavior by default for harness sessions, without an external package coupling, while preserving attribution to upstream [pi-vcc](https://github.com/sting8k/pi-vcc) and the conceptual [VCC](https://github.com/lllyasviel/VCC) work.
|
|
10
|
+
|
|
11
|
+
## Decision
|
|
12
|
+
|
|
13
|
+
1. Vendor [sting8k/pi-vcc](https://github.com/sting8k/pi-vcc) under `vendor/pi-vcc/` (refresh via `npm run vendor:sync-vcc`), following the same pattern as `vendor/pi-model-router`.
|
|
14
|
+
2. Load compaction through [`.pi/extensions/ultimate-pi-vcc.ts`](../../../extensions/ultimate-pi-vcc.ts).
|
|
15
|
+
3. Remove `@sting8k/pi-vcc` from `package.json` dependencies and from `.pi/settings*.json` `packages` arrays.
|
|
16
|
+
4. **Configuration is env-only** — no JSON config files (`PI_VCC_CONFIG_PATH` and `.pi/pi-vcc-config.json` are not used).
|
|
17
|
+
5. **Default:** `HARNESS_VCC_COMPACTION` unset → VCC overrides Pi built-in LLM compaction for `/compact`, auto-threshold, and overflow.
|
|
18
|
+
6. **Opt-out:** `HARNESS_VCC_COMPACTION=false` (also `0` / `off`) uses Pi’s LLM compaction for those paths; explicit `/pi-vcc` still uses VCC.
|
|
19
|
+
7. **Debug:** `HARNESS_VCC_DEBUG=true` writes `/tmp/pi-vcc-debug.json` on compaction (default off).
|
|
20
|
+
8. Settings implementation: [`.pi/extensions/lib/harness-vcc-settings.ts`](../../../extensions/lib/harness-vcc-settings.ts).
|
|
21
|
+
9. Compaction telemetry `details.compactor` is `ultimate-pi-vcc`.
|
|
22
|
+
|
|
23
|
+
## Consequences
|
|
24
|
+
|
|
25
|
+
### Positive
|
|
26
|
+
|
|
27
|
+
- No runtime dependency on `@sting8k/pi-vcc` npm; vendored tree is pinned and patchable.
|
|
28
|
+
- Harness-default compaction matches ADR intent (deterministic, recall-friendly summaries).
|
|
29
|
+
- Operators can revert to LLM compaction per project via one env var.
|
|
30
|
+
|
|
31
|
+
### Negative / trade-offs
|
|
32
|
+
|
|
33
|
+
- Maintainer must run `vendor:sync-vcc` to pick up upstream pi-vcc fixes.
|
|
34
|
+
- Vendored `loadSettings` re-exports harness env settings from `.pi/extensions/lib/` (couples vendor tree to ultimate-pi layout).
|
|
35
|
+
|
|
36
|
+
## References
|
|
37
|
+
|
|
38
|
+
- [THIRD_PARTY_NOTICES.md](../../../../THIRD_PARTY_NOTICES.md)
|
|
39
|
+
- [vendor/pi-vcc/UPSTREAM_PIN.md](../../../../vendor/pi-vcc/UPSTREAM_PIN.md)
|
|
40
|
+
- [`.env.example`](../../../../.env.example) — `HARNESS_VCC_COMPACTION`, `HARNESS_VCC_DEBUG`
|
|
@@ -15,6 +15,7 @@ Team-shared ADRs for the ultimate-pi harness live under `.pi/harness/docs/adrs/`
|
|
|
15
15
|
| [0007](0007-interactive-drift-monitor.md) | Interactive drift monitor | Accepted |
|
|
16
16
|
| [0008](0008-harness-posthog-telemetry.md) | Harness PostHog telemetry | Accepted |
|
|
17
17
|
| [0009](0009-sentrux-rules-lifecycle.md) | Sentrux rules.toml lifecycle | Accepted |
|
|
18
|
+
| [0030](0030-inhouse-vcc-compaction.md) | In-house VCC compaction (vendored pi-vcc) | Accepted |
|
|
18
19
|
|
|
19
20
|
## Template
|
|
20
21
|
|
|
@@ -4,9 +4,11 @@
|
|
|
4
4
|
# Telemetry (set false to disable harness PostHog events)
|
|
5
5
|
HARNESS_TELEMETRY_ENABLED=true
|
|
6
6
|
|
|
7
|
-
# harness-web (Scrapling
|
|
7
|
+
# harness-web (Scrapling scrape + pluggable search)
|
|
8
8
|
HARNESS_WEB_FETCH_MODE=stealth
|
|
9
9
|
HARNESS_WEB_SEARCH_ENGINE=ddg_html
|
|
10
|
+
# SearXNG (when HARNESS_WEB_SEARCH_ENGINE=searxng):
|
|
11
|
+
# HARNESS_WEB_SEARXNG_URL=http://127.0.0.1:8080
|
|
10
12
|
# HARNESS_WEB_PROXY=
|
|
11
13
|
# HARNESS_WEB_RATE_LIMIT_MS=2000
|
|
12
14
|
# HARNESS_WEB_TIMEOUT_MS=30000
|
|
@@ -134,10 +134,13 @@ export PATH="$HOME/.local/bin:$PATH"
|
|
|
134
134
|
uv tool install "scrapling[fetchers]"
|
|
135
135
|
scrapling install # Chromium for default stealth scrape; may need sudo for OS libs on Linux
|
|
136
136
|
mkdir -p .web
|
|
137
|
+
python3 "$UP_PKG/.pi/scripts/harness-web.py" status # JSON config (setup/diagnostics only)
|
|
137
138
|
python3 "$UP_PKG/.pi/scripts/harness-web.py" search "ultimate-pi harness" -o .web/smoke-search.json --limit 3
|
|
138
139
|
python3 "$UP_PKG/.pi/scripts/harness-web.py" scrape "https://example.com" -o .web/smoke-page.md --fast
|
|
139
140
|
```
|
|
140
141
|
|
|
142
|
+
After pi loads extensions, agents should smoke-test **`web_search`** once (not run a `UP_PKG` / `import scrapling` preflight). Example intent: query `ultimate-pi harness`, `limit` 2.
|
|
143
|
+
|
|
141
144
|
- **`--skip-tools`:** skip Step 2 (includes Scrapling verify).
|
|
142
145
|
- On Linux/WSL, if stealth scrape fails, install browser libs from `harness-cli-verify.sh` output or use `--fast` for static targets.
|
|
143
146
|
|
|
@@ -343,7 +346,7 @@ Verify each package:
|
|
|
343
346
|
| `@posthog/pi` | Analytics event capture | F0 |
|
|
344
347
|
| `pi-lean-ctx` | Context runtime (read/bash/find/grep/MCP bridge) | F0 |
|
|
345
348
|
| `harness-subagents` (bundled extension) | L4 sub-agent spawn, blackboard, package agents | P16 |
|
|
346
|
-
|
|
|
349
|
+
| Vendored `pi-vcc` (`vendor/pi-vcc`, `.pi/extensions/ultimate-pi-vcc.ts`) | VCC compaction / `vcc_recall` — env-only: `HARNESS_VCC_COMPACTION` (default on), `HARNESS_VCC_DEBUG` | Shipped |
|
|
347
350
|
| `pi-model-router` | Vendored (`vendor/`); activates after `.pi/model-router.json` exists | F0 |
|
|
348
351
|
|
|
349
352
|
## Step 3.5 — Model Router Configuration (Dynamic)
|
|
@@ -421,6 +424,47 @@ If **no** `.env` at project root:
|
|
|
421
424
|
- On **skip** or `--non-interactive`: warn in report (non-interactive skips creation)
|
|
422
425
|
- If `ask_user` cancelled: stop with `needs_clarification`
|
|
423
426
|
|
|
427
|
+
### 4.0b — harness-web search engine (non-destructive)
|
|
428
|
+
|
|
429
|
+
Unless `--non-interactive`, **call `ask_user`** after Step 4.0 (harness-decisions skill):
|
|
430
|
+
|
|
431
|
+
```json
|
|
432
|
+
{
|
|
433
|
+
"question": "Which harness-web search backend should this project use?",
|
|
434
|
+
"context": "Scrapling still handles scrape/map/bulk. Search only: DuckDuckGo HTML needs no extra services. SearXNG must be self-hosted for agents — public instances often block JSON (403) and default to ~4 API requests/hour per IP.",
|
|
435
|
+
"options": [
|
|
436
|
+
{
|
|
437
|
+
"title": "DuckDuckGo HTML (default)",
|
|
438
|
+
"description": "HARNESS_WEB_SEARCH_ENGINE=ddg_html — no Docker"
|
|
439
|
+
},
|
|
440
|
+
{
|
|
441
|
+
"title": "Self-host SearXNG here (Docker)",
|
|
442
|
+
"description": "Bootstrap .searxng/ with official compose, enable JSON API, set harness env"
|
|
443
|
+
},
|
|
444
|
+
{
|
|
445
|
+
"title": "Use existing SearXNG instance",
|
|
446
|
+
"description": "You provide base URL; harness writes HARNESS_WEB_SEARXNG_URL"
|
|
447
|
+
}
|
|
448
|
+
],
|
|
449
|
+
"allowFreeform": true
|
|
450
|
+
}
|
|
451
|
+
```
|
|
452
|
+
|
|
453
|
+
| User choice | Actions |
|
|
454
|
+
|-------------|---------|
|
|
455
|
+
| **DDG** | Ensure `.env` has `HARNESS_WEB_SEARCH_ENGINE=ddg_html` via `harness-sync-env.mjs` (append only if missing; do not overwrite user values) |
|
|
456
|
+
| **Self-host** | `node "$UP_PKG/.pi/scripts/harness-searxng-bootstrap.mjs"` (requires Docker). Script sets `HARNESS_WEB_SEARCH_ENGINE=searxng` and `HARNESS_WEB_SEARXNG_URL` |
|
|
457
|
+
| **Existing instance** | Parse base URL from freeform answer. Run `node "$UP_PKG/.pi/scripts/harness-searxng-bootstrap.mjs" --set-url {url}` (health check + upsert `.env`) |
|
|
458
|
+
| **Cancelled** | Stop with `needs_clarification` |
|
|
459
|
+
| **`--non-interactive`** | Skip prompt; leave/default `ddg_html`; do not run Docker bootstrap |
|
|
460
|
+
|
|
461
|
+
Post-choice smoke test (report pass/fail):
|
|
462
|
+
|
|
463
|
+
```bash
|
|
464
|
+
mkdir -p .web
|
|
465
|
+
python3 "$UP_PKG/.pi/scripts/harness-web.py" search "ultimate-pi harness" -o .web/setup-search.json --limit 2
|
|
466
|
+
```
|
|
467
|
+
|
|
424
468
|
Rules:
|
|
425
469
|
|
|
426
470
|
- **Do not** `cp` over an existing `.env`.
|
|
@@ -428,7 +472,7 @@ Rules:
|
|
|
428
472
|
- Re-runs only add keys from `$UP_PKG/.pi/harness/env.harness.template` that are absent (managed block at EOF).
|
|
429
473
|
- Ensure `.env` is gitignored (Step 4.1).
|
|
430
474
|
|
|
431
|
-
Template keys (placeholders — user fills secrets): `HARNESS_TELEMETRY_ENABLED`, `HARNESS_WEB_*`, `
|
|
475
|
+
Template keys (placeholders — user fills secrets): `HARNESS_TELEMETRY_ENABLED`, `HARNESS_WEB_*`, `HARNESS_VCC_COMPACTION`, `HARNESS_VCC_DEBUG`, plus commented optional PostHog / Graphify vars.
|
|
432
476
|
|
|
433
477
|
### 4.1 — .gitignore Entries
|
|
434
478
|
|
|
@@ -436,6 +480,7 @@ Ensure `.gitignore` contains:
|
|
|
436
480
|
```
|
|
437
481
|
.env
|
|
438
482
|
.web/
|
|
483
|
+
.searxng/
|
|
439
484
|
.raw/
|
|
440
485
|
.vault-meta/
|
|
441
486
|
.pi/harness/critics/
|
|
@@ -646,6 +691,7 @@ Output summary table:
|
|
|
646
691
|
| .gitignore | ✓/✗ | entries added (incl. `.env`) |
|
|
647
692
|
| ./raw directory | ✓/✗ | Created for graphify source ingestion |
|
|
648
693
|
| harness-web (Scrapling) | ✓/✗ | search + scrape smoke |
|
|
694
|
+
| harness-web search engine | ddg / searxng / — | Step 4.0b choice; SearXNG URL if applicable |
|
|
649
695
|
|
|
650
696
|
Next steps:
|
|
651
697
|
1. If tools missing: re-run with `--force` or install individually
|
|
@@ -200,10 +200,19 @@ verify_scrapling() {
|
|
|
200
200
|
return
|
|
201
201
|
fi
|
|
202
202
|
mkdir -p .web
|
|
203
|
-
|
|
204
|
-
|
|
203
|
+
_search_engine="${HARNESS_WEB_SEARCH_ENGINE:-ddg_html}"
|
|
204
|
+
if [ "$_search_engine" = "searxng" ]; then
|
|
205
|
+
if [ -z "${HARNESS_WEB_SEARXNG_URL:-}" ]; then
|
|
206
|
+
fail "HARNESS_WEB_SEARCH_ENGINE=searxng but HARNESS_WEB_SEARXNG_URL is unset"
|
|
207
|
+
elif python3 "$_hw" search "ultimate-pi harness" -o .web/verify-search.json --limit 2 2>/dev/null | grep -q wrote; then
|
|
208
|
+
pass "harness-web search smoke (searxng)"
|
|
209
|
+
else
|
|
210
|
+
fail "harness-web search smoke failed (searxng at ${HARNESS_WEB_SEARXNG_URL})"
|
|
211
|
+
fi
|
|
212
|
+
elif python3 "$_hw" search "ultimate-pi harness" -o .web/verify-search.json --limit 2 2>/dev/null | grep -q wrote; then
|
|
213
|
+
pass "harness-web search smoke (ddg_html)"
|
|
205
214
|
else
|
|
206
|
-
fail "harness-web search smoke failed"
|
|
215
|
+
fail "harness-web search smoke failed (ddg_html)"
|
|
207
216
|
fi
|
|
208
217
|
if python3 "$_hw" scrape "https://example.com" -o .web/verify-page.md --fast 2>/dev/null | grep -q wrote; then
|
|
209
218
|
pass "harness-web scrape --fast smoke"
|
|
@@ -0,0 +1,270 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
/**
|
|
3
|
+
* Bootstrap a project-local SearXNG instance for harness-web (Docker Compose).
|
|
4
|
+
*
|
|
5
|
+
* - Creates .searxng/ with official upstream compose template
|
|
6
|
+
* - Writes core-config/settings.yml with json format + limiter off (local dev)
|
|
7
|
+
* - Starts containers and waits for JSON search health
|
|
8
|
+
* - Upserts HARNESS_WEB_SEARCH_ENGINE / HARNESS_WEB_SEARXNG_URL in project .env
|
|
9
|
+
*
|
|
10
|
+
* Usage:
|
|
11
|
+
* node "$UP_PKG/.pi/scripts/harness-searxng-bootstrap.mjs" [PROJECT_ROOT] [--url-only]
|
|
12
|
+
* node "$UP_PKG/.pi/scripts/harness-searxng-bootstrap.mjs" --set-url http://127.0.0.1:8080
|
|
13
|
+
*
|
|
14
|
+
* Requires: docker, docker compose, curl
|
|
15
|
+
*/
|
|
16
|
+
|
|
17
|
+
import {
|
|
18
|
+
access,
|
|
19
|
+
copyFile,
|
|
20
|
+
mkdir,
|
|
21
|
+
readFile,
|
|
22
|
+
writeFile,
|
|
23
|
+
} from "node:fs/promises";
|
|
24
|
+
import { constants } from "node:fs";
|
|
25
|
+
import { join, dirname } from "node:path";
|
|
26
|
+
import { fileURLToPath } from "node:url";
|
|
27
|
+
import { spawn } from "node:child_process";
|
|
28
|
+
|
|
29
|
+
const SCRIPT_DIR = dirname(fileURLToPath(import.meta.url));
|
|
30
|
+
const UP_PKG = join(SCRIPT_DIR, "..", "..");
|
|
31
|
+
|
|
32
|
+
const SEARXNG_BASE =
|
|
33
|
+
"https://raw.githubusercontent.com/searxng/searxng/master/container";
|
|
34
|
+
const DEFAULT_PORT = "8080";
|
|
35
|
+
const HEALTH_PATH = "/search?q=harness&format=json";
|
|
36
|
+
|
|
37
|
+
const MANAGED_START = "# --- harness:env:start ---";
|
|
38
|
+
const MANAGED_END = "# --- harness:env:end ---";
|
|
39
|
+
|
|
40
|
+
/**
 * Split command-line tokens into positionals, flags, and the `--set-url` value.
 *
 * The token following `--set-url` is consumed as its value, so it is never
 * mistaken for the positional PROJECT_ROOT. `--set-url=URL` is accepted too.
 *
 * @param {string[]} argv - Tokens after the script path.
 * @returns {{args: string[], flags: Set<string>, setUrl: string|null, setUrlMissing: boolean}}
 *   `setUrlMissing` is true when `--set-url` was given without a usable value.
 */
function parseCliArgs(argv) {
  const positionals = [];
  const flagSet = new Set();
  let setUrlValue = null;
  let setUrlMissing = false;

  for (let i = 0; i < argv.length; i += 1) {
    const token = argv[i];
    if (token === "--set-url") {
      flagSet.add(token);
      const next = argv[i + 1];
      if (next !== undefined && !next.startsWith("-")) {
        setUrlValue = next;
        i += 1; // consume the value so it does not become PROJECT_ROOT
      } else {
        setUrlMissing = true;
      }
    } else if (token.startsWith("--set-url=")) {
      flagSet.add("--set-url");
      const inline = token.slice("--set-url=".length);
      if (inline) setUrlValue = inline;
      else setUrlMissing = true;
    } else if (token.startsWith("-")) {
      flagSet.add(token);
    } else {
      positionals.push(token);
    }
  }

  return { args: positionals, flags: flagSet, setUrl: setUrlValue, setUrlMissing };
}

const { args, flags, setUrl, setUrlMissing } = parseCliArgs(process.argv.slice(2));
const urlOnly = flags.has("--url-only");

if (setUrlMissing) {
  // Without this guard a bare `--set-url` would fall through to the Docker bootstrap.
  console.error("✗ --set-url requires a URL argument");
  process.exit(1);
}

const PROJECT_ROOT = args[0] || process.cwd();
|
|
47
|
+
const SEARXNG_DIR = join(PROJECT_ROOT, ".searxng");
|
|
48
|
+
const CORE_CONFIG = join(SEARXNG_DIR, "core-config");
|
|
49
|
+
const SETTINGS_PATH = join(CORE_CONFIG, "settings.yml");
|
|
50
|
+
const COMPOSE_PATH = join(SEARXNG_DIR, "docker-compose.yml");
|
|
51
|
+
const ENV_COMPOSE = join(SEARXNG_DIR, ".env");
|
|
52
|
+
|
|
53
|
+
const HARNESS_SETTINGS = `use_default_settings: true
|
|
54
|
+
|
|
55
|
+
search:
|
|
56
|
+
formats:
|
|
57
|
+
- html
|
|
58
|
+
- json
|
|
59
|
+
|
|
60
|
+
server:
|
|
61
|
+
limiter: false
|
|
62
|
+
public_instance: false
|
|
63
|
+
`;
|
|
64
|
+
|
|
65
|
+
/**
 * Report whether `path` is visible to this process.
 *
 * @param {string} path - Filesystem path to probe.
 * @returns {Promise<boolean>} true when `access` succeeds, false on any error.
 */
async function exists(path) {
  return access(path, constants.F_OK).then(
    () => true,
    () => false,
  );
}
|
|
73
|
+
|
|
74
|
+
/**
 * Spawn a command and settle once it has closed.
 *
 * @param {string} cmd - Executable to launch.
 * @param {string[]} cmdArgs - Arguments for the executable.
 * @param {{inherit?: boolean, cwd?: string, env?: object}} [opts] - `inherit`
 *   wires the child's stdio to this process (nothing is captured); `cwd` sets
 *   the working directory; `env` entries are layered over `process.env`.
 * @returns {Promise<{stdout: string, stderr: string}>} Captured output on exit
 *   code 0. Rejects with the spawn error, or with an Error carrying the
 *   command line, exit code and captured output on any other exit code.
 */
function run(cmd, cmdArgs, opts = {}) {
  const passThrough = Boolean(opts.inherit);

  return new Promise((done, fail) => {
    const proc = spawn(cmd, cmdArgs, {
      stdio: passThrough ? "inherit" : "pipe",
      cwd: opts.cwd,
      env: { ...process.env, ...opts.env },
    });

    const captured = { stdout: "", stderr: "" };
    if (!passThrough) {
      for (const stream of ["stdout", "stderr"]) {
        proc[stream]?.on("data", (chunk) => {
          captured[stream] += chunk;
        });
      }
    }

    proc.on("error", fail);
    proc.on("close", (code) => {
      if (code !== 0) {
        fail(
          new Error(
            `${cmd} ${cmdArgs.join(" ")} exited ${code}\n${captured.stderr || captured.stdout}`,
          ),
        );
        return;
      }
      done({ stdout: captured.stdout, stderr: captured.stderr });
    });
  });
}
|
|
103
|
+
|
|
104
|
+
/**
 * Verify that `docker` and the `docker compose` subcommand can be run.
 *
 * Each probe is run in order; the first one that fails prints its problem and
 * an install hint to stderr and exits the process with status 1.
 */
async function requireDocker() {
  const bin = "docker";
  const probes = [
    {
      argv: ["--version"],
      problem: `✗ ${bin} not found`,
      hint: "Install Docker: https://docs.searxng.org/admin/installation-docker.html",
    },
    {
      argv: ["compose", "version"],
      problem: "✗ docker compose not available",
      hint: "Install Docker Compose v2: https://docs.docker.com/compose/install/",
    },
  ];

  for (const probe of probes) {
    try {
      await run(bin, probe.argv);
    } catch {
      console.error(probe.problem);
      console.error(probe.hint);
      process.exit(1);
    }
  }
}
|
|
126
|
+
|
|
127
|
+
/**
 * Download `url` into the file `dest` by invoking `curl -fsSL -o`.
 *
 * @param {string} url - Source URL.
 * @param {string} dest - Destination file path.
 * @returns {Promise<void>} Rejects when `run` rejects (curl missing or non-zero exit).
 */
async function curlToFile(url, dest) {
  const curlArgs = ["-fsSL", "-o", dest, url];
  await run("curl", curlArgs);
}
|
|
130
|
+
|
|
131
|
+
/**
 * Read the SearXNG port from the compose `.env` file.
 *
 * @returns {Promise<string>} The value of the first `SEARXNG_PORT=` line,
 *   trimmed and stripped of one leading/trailing quote, or DEFAULT_PORT when
 *   the file is absent, has no such line, or the value is empty.
 */
async function readComposePort() {
  const envPresent = await exists(ENV_COMPOSE);
  if (!envPresent) return DEFAULT_PORT;

  const envLines = (await readFile(ENV_COMPOSE, "utf8")).split("\n");
  const firstMatch = envLines
    .map((entry) => entry.match(/^SEARXNG_PORT=(.+)$/))
    .find((candidate) => candidate !== null);
  if (!firstMatch) return DEFAULT_PORT;

  const unquoted = firstMatch[1].trim().replace(/^["']|["']$/g, "");
  return unquoted || DEFAULT_PORT;
}
|
|
140
|
+
|
|
141
|
+
/**
 * Make sure the `.searxng/` working directory holds everything compose needs.
 *
 * Steps, each skipped when its target already exists:
 *  1. create the core-config directory;
 *  2. download `docker-compose.yml`;
 *  3. create the compose `.env` from `.env.example` (downloading the example first if absent);
 *  4. write the harness `settings.yml` when it is missing or does not contain the text "json".
 */
async function ensureSearxngLayout() {
  await mkdir(CORE_CONFIG, { recursive: true });

  const haveCompose = await exists(COMPOSE_PATH);
  if (!haveCompose) {
    console.log("Fetching SearXNG docker-compose.yml …");
    await curlToFile(`${SEARXNG_BASE}/docker-compose.yml`, COMPOSE_PATH);
  }

  const haveComposeEnv = await exists(ENV_COMPOSE);
  if (!haveComposeEnv) {
    const example = join(SEARXNG_DIR, ".env.example");
    const haveExample = await exists(example);
    if (!haveExample) {
      console.log("Fetching SearXNG .env.example …");
      await curlToFile(`${SEARXNG_BASE}/.env.example`, example);
    }
    await copyFile(example, ENV_COMPOSE);
  }

  // Existing settings are kept only when they already mention "json".
  let settingsUsable = false;
  if (await exists(SETTINGS_PATH)) {
    const current = await readFile(SETTINGS_PATH, "utf8");
    settingsUsable = current.includes("json");
  }
  if (settingsUsable) return;

  await writeFile(SETTINGS_PATH, HARNESS_SETTINGS, "utf8");
  console.log(`✓ Wrote ${SETTINGS_PATH} (json format, limiter off)`);
}
|
|
163
|
+
|
|
164
|
+
/**
 * Start the SearXNG containers with `docker compose up -d`, run from the
 * `.searxng/` directory with the child's stdio attached to this process.
 */
async function composeUp() {
  console.log("Starting SearXNG (docker compose up -d) …");
  const spawnOpts = { cwd: SEARXNG_DIR, inherit: true };
  await run("docker", ["compose", "up", "-d"], spawnOpts);
}
|
|
168
|
+
|
|
169
|
+
/**
 * Poll the SearXNG JSON search endpoint until it answers with a JSON object.
 *
 * Polls `${baseUrl}${HEALTH_PATH}` with a 10 s timeout per request, waits 3 s
 * after every unsuccessful attempt, and gives up once 90 s have elapsed.
 *
 * @param {string} baseUrl - Base URL of the instance, without trailing slash.
 * @returns {Promise<void>} Resolves once a healthy response is seen.
 * @throws {Error} When the deadline passes; the message carries the last
 *   failure reason (HTTP status, the 403/json-format hint, or a fetch/parse error).
 */
async function waitForHealth(baseUrl) {
  const url = `${baseUrl}${HEALTH_PATH}`;
  const giveUpAt = Date.now() + 90_000;
  const pause = (ms) => new Promise((wake) => setTimeout(wake, ms));
  let lastErr = "";

  while (Date.now() < giveUpAt) {
    try {
      const res = await fetch(url, {
        headers: { Accept: "application/json" },
        signal: AbortSignal.timeout(10_000),
      });
      if (res.status === 403) {
        // Caught below: recorded as the last failure and polling continues.
        throw new Error(
          "SearXNG returned 403 for format=json — ensure search.formats includes json in .searxng/core-config/settings.yml",
        );
      }
      const body = res.ok ? await res.json() : null;
      if (body && typeof body === "object") {
        console.log(`✓ SearXNG healthy at ${baseUrl}`);
        return;
      }
      lastErr = `HTTP ${res.status}`;
    } catch (err) {
      lastErr = err instanceof Error ? err.message : String(err);
    }
    await pause(3000);
  }

  throw new Error(`SearXNG health check timed out (${url}): ${lastErr}`);
}
|
|
199
|
+
|
|
200
|
+
/**
 * Set `key=value` in dotenv-style text and return the updated text.
 *
 * - If a line starting with `key=` exists, the first such line is replaced.
 * - Otherwise, if the managed block markers are present, the line is inserted
 *   just before the end marker.
 * - Otherwise a new managed block containing the line is appended.
 *
 * @param {string} content - Current file text (may be empty).
 * @param {string} key - Variable name; matched literally.
 * @param {string} value - Value written verbatim after `=`.
 * @returns {string} The updated text.
 */
function upsertEnvKey(content, key, value) {
  const line = `${key}=${value}`;
  // Escape the key so regex metacharacters in it are matched literally.
  const escapedKey = key.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  const re = new RegExp(`^${escapedKey}=.*$`, "m");
  if (re.test(content)) {
    // Replacer function: a plain replacement string would expand `$&`, `$1`,
    // `$$` etc. occurring in the value (e.g. inside a URL).
    return content.replace(re, () => line);
  }
  if (content.includes(MANAGED_START) && content.includes(MANAGED_END)) {
    const end = content.indexOf(MANAGED_END);
    return `${content.slice(0, end)}${line}\n${content.slice(end)}`;
  }
  const sep = content.endsWith("\n") || content.length === 0 ? "" : "\n";
  return `${content}${sep}${MANAGED_START}\n# harness-web (SearXNG)\n${line}\n${MANAGED_END}\n`;
}
|
|
213
|
+
|
|
214
|
+
/**
 * Point the project's `.env` at SearXNG.
 *
 * Starts from the existing `.env`; when there is none, from the packaged
 * `env.harness.template` if present, else from empty text. Then sets
 * `HARNESS_WEB_SEARCH_ENGINE=searxng` and `HARNESS_WEB_SEARXNG_URL=<baseUrl>`
 * and writes the result (newline-terminated) to `<PROJECT_ROOT>/.env`.
 *
 * @param {string} baseUrl - SearXNG base URL to record.
 * @returns {Promise<void>}
 */
async function upsertHarnessEnv(baseUrl) {
  const envPath = join(PROJECT_ROOT, ".env");

  let seedPath = null;
  if (await exists(envPath)) {
    seedPath = envPath;
  } else {
    const template = join(UP_PKG, ".pi", "harness", "env.harness.template");
    if (await exists(template)) seedPath = template;
  }

  let content = seedPath === null ? "" : await readFile(seedPath, "utf8");
  const updates = [
    ["HARNESS_WEB_SEARCH_ENGINE", "searxng"],
    ["HARNESS_WEB_SEARXNG_URL", baseUrl],
  ];
  for (const [name, val] of updates) {
    content = upsertEnvKey(content, name, val);
  }
  if (!content.endsWith("\n")) {
    content = `${content}\n`;
  }

  await writeFile(envPath, content, "utf8");
  console.log(`✓ Updated .env: HARNESS_WEB_SEARCH_ENGINE=searxng, HARNESS_WEB_SEARXNG_URL=${baseUrl}`);
}
|
|
230
|
+
|
|
231
|
+
/**
 * Normalize a user-supplied SearXNG base URL.
 *
 * @param {string} raw - URL as typed by the user.
 * @returns {string} The URL with surrounding whitespace and trailing slashes removed.
 * @throws {Error} When the cleaned value does not begin with `http://` or
 *   `https://` (case-insensitive).
 */
function normalizeBaseUrl(raw) {
  const trimmed = raw.trim();
  const url = trimmed.replace(/\/+$/, "");
  const hasHttpScheme = /^https?:\/\//i.test(url);
  if (hasHttpScheme) {
    return url;
  }
  throw new Error(`Invalid SearXNG URL: ${raw}`);
}
|
|
238
|
+
|
|
239
|
+
/**
 * Script entry point. Runs one of three modes:
 *
 *  - `--set-url <url>`: normalize the URL, wait for it to be healthy, record
 *    it in `.env`, exit 0.
 *  - `--url-only`: print the local base URL (port taken from the compose
 *    `.env` when present) and exit 0.
 *  - default: check Docker, prepare `.searxng/`, start the containers, wait
 *    for health, record the URL in `.env`, and print a usage hint.
 */
async function main() {
  const localUrl = (port) => `http://127.0.0.1:${port}`;

  if (setUrl) {
    const target = normalizeBaseUrl(setUrl);
    await waitForHealth(target);
    await upsertHarnessEnv(target);
    process.exit(0);
  }

  if (urlOnly) {
    let port = DEFAULT_PORT;
    if (await exists(ENV_COMPOSE)) {
      port = await readComposePort();
    }
    console.log(localUrl(port));
    process.exit(0);
  }

  await requireDocker();
  await ensureSearxngLayout();
  const baseUrl = localUrl(await readComposePort());
  await composeUp();
  await waitForHealth(baseUrl);
  await upsertHarnessEnv(baseUrl);

  const summary = [
    "",
    "SearXNG is ready for harness-web:",
    ` HARNESS_WEB_SEARXNG_URL=${baseUrl}`,
    ` Test: python3 "${join(UP_PKG, ".pi/scripts/harness-web.py")}" search "test" -o .web/search.json --limit 2`,
  ];
  for (const entry of summary) {
    console.log(entry);
  }
}
|
|
266
|
+
|
|
267
|
+
main().catch((err) => {
|
|
268
|
+
console.error(`✗ ${err.message || err}`);
|
|
269
|
+
process.exit(1);
|
|
270
|
+
});
|
|
@@ -1,12 +1,21 @@
|
|
|
1
1
|
# harness-web search (internal)
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
Routing: `harness_web/search.py` dispatches by `HARNESS_WEB_SEARCH_ENGINE`.
|
|
4
4
|
|
|
5
|
-
|
|
5
|
+
## Engines
|
|
6
6
|
|
|
7
|
-
|
|
7
|
+
| Value | Module | Notes |
|
|
8
|
+
|-------|--------|-------|
|
|
9
|
+
| `ddg_html` (default) | `search_ddg.py` | DuckDuckGo HTML SERP via Scrapling HTTP (+ one stealth retry on challenge) |
|
|
10
|
+
| `searxng` | `search_searxng.py` | Self-hosted JSON API — requires `HARNESS_WEB_SEARXNG_URL` |
|
|
8
11
|
|
|
9
|
-
|
|
12
|
+
Bootstrap local SearXNG: `node "$UP_PKG/.pi/scripts/harness-searxng-bootstrap.mjs"`
|
|
13
|
+
|
|
14
|
+
## DuckDuckGo HTML (`ddg_html`)
|
|
15
|
+
|
|
16
|
+
`GET https://html.duckduckgo.com/html/?q=…`
|
|
17
|
+
|
|
18
|
+
### Selectors
|
|
10
19
|
|
|
11
20
|
| Field | CSS |
|
|
12
21
|
|-------|-----|
|
|
@@ -16,10 +25,18 @@ Implemented in `harness_web/search_ddg.py` via `Fetcher.get` (HTTP, not a browse
|
|
|
16
25
|
|
|
17
26
|
DDG redirect URLs (`//duckduckgo.com/l/?uddg=…`) are unwrapped to the target `uddg` parameter.
|
|
18
27
|
|
|
19
|
-
|
|
28
|
+
### Challenge detection
|
|
20
29
|
|
|
21
30
|
If status 403 or HTML contains challenge markers (`anomaly-modal`, etc.), retry **once** with `StealthyFetcher`, then exit with a clear “search engine blocked” message.
|
|
22
31
|
|
|
32
|
+
## SearXNG (`searxng`)
|
|
33
|
+
|
|
34
|
+
`GET {HARNESS_WEB_SEARXNG_URL}/search?q=…&format=json&pageno=1`
|
|
35
|
+
|
|
36
|
+
- No client API token (SearXNG has no standard search API key).
|
|
37
|
+
- `search.formats` in instance `settings.yml` must include `json` or the API returns **403**.
|
|
38
|
+
- Public instances are unsuitable (~4 JSON req/hr when limiter on; JSON often disabled). Use self-hosted bootstrap.
|
|
39
|
+
|
|
23
40
|
## Output
|
|
24
41
|
|
|
25
42
|
`.web/search.json` — envelope compatible with legacy Firecrawl skills:
|
|
@@ -31,3 +48,5 @@ If status 403 or HTML contains challenge markers (`anomaly-modal`, etc.), retry
|
|
|
31
48
|
"data": { "web": [{ "url", "title", "description" }] }
|
|
32
49
|
}
|
|
33
50
|
```
|
|
51
|
+
|
|
52
|
+
`engine` reflects the active backend (`ddg_html` or `searxng`).
|