ultimate-pi 0.4.1 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.agents/skills/harness-decisions/SKILL.md +15 -0
- package/.agents/skills/scrapling-web/SKILL.md +45 -40
- package/.agents/skills/wiki-autoresearch/SKILL.md +3 -3
- package/.pi/SYSTEM.md +12 -13
- package/.pi/agents/pi-pi/agent-expert.md +3 -3
- package/.pi/extensions/harness-web-guard.ts +95 -0
- package/.pi/extensions/harness-web-tools.ts +209 -0
- package/.pi/extensions/lib/harness-web/run-cli.ts +92 -0
- package/.pi/harness/env.harness.template +3 -1
- package/.pi/prompts/harness-setup.md +46 -0
- package/.pi/scripts/harness-cli-verify.sh +12 -3
- package/.pi/scripts/harness-searxng-bootstrap.mjs +270 -0
- package/.pi/scripts/harness-web-search.md +24 -5
- package/.pi/scripts/harness-web.py +24 -7
- package/.pi/scripts/harness_web/config.py +37 -3
- package/.pi/scripts/harness_web/output.py +8 -2
- package/.pi/scripts/harness_web/search.py +22 -0
- package/.pi/scripts/harness_web/search_ddg.py +1 -5
- package/.pi/scripts/harness_web/search_searxng.py +100 -0
- package/CHANGELOG.md +12 -0
- package/package.json +2 -2
|
@@ -19,6 +19,21 @@ description: Structured user decisions via ask_user for harness setup, planning,
|
|
|
19
19
|
3. If the user **cancels** (Esc), stop with `needs_clarification` / `human_required` — do not assume defaults.
|
|
20
20
|
4. **CI / automation only:** pass `--non-interactive` to `/harness-setup` to skip prompts and use documented defaults.
|
|
21
21
|
|
|
22
|
+
## Example (harness-setup — search engine)
|
|
23
|
+
|
|
24
|
+
```json
|
|
25
|
+
{
|
|
26
|
+
"question": "Which harness-web search backend should this project use?",
|
|
27
|
+
"context": "Scrapling handles scrape/map/bulk. Search: DDG HTML needs no Docker. SearXNG must be self-hosted — public instances often block JSON and rate-limit API to ~4/hour per IP.",
|
|
28
|
+
"options": [
|
|
29
|
+
{ "title": "DuckDuckGo HTML (default)", "description": "HARNESS_WEB_SEARCH_ENGINE=ddg_html" },
|
|
30
|
+
{ "title": "Self-host SearXNG here (Docker)", "description": "node harness-searxng-bootstrap.mjs" },
|
|
31
|
+
{ "title": "Use existing SearXNG instance", "description": "Freeform base URL → HARNESS_WEB_SEARXNG_URL" }
|
|
32
|
+
],
|
|
33
|
+
"allowFreeform": true
|
|
34
|
+
}
|
|
35
|
+
```
|
|
36
|
+
|
|
22
37
|
## Example (plan — scope)
|
|
23
38
|
|
|
24
39
|
```json
|
|
@@ -1,22 +1,33 @@
|
|
|
1
1
|
---
|
|
2
2
|
name: scrapling-web
|
|
3
3
|
description: |
|
|
4
|
-
Harness web search and scrape via
|
|
5
|
-
non-API web task: search, scrape URLs, map site links, bulk research fetches.
|
|
4
|
+
Harness web search and scrape via pi tools web_search and web_fetch (harness-web.py).
|
|
5
|
+
Use for any non-API web task: search, scrape URLs, map site links, bulk research fetches.
|
|
6
6
|
Replaces Firecrawl in ultimate-pi harness agents. Triggers on: search the web,
|
|
7
|
-
scrape URL, fetch page, research online,
|
|
8
|
-
allowed-tools:
|
|
9
|
-
- Bash(python3 *harness-web.py *)
|
|
10
|
-
- Bash(python3 .pi/scripts/harness-web.py *)
|
|
11
|
-
- Bash(scrapling *)
|
|
7
|
+
scrape URL, fetch page, research online, web_search, web_fetch, .web/ artifacts.
|
|
12
8
|
---
|
|
13
9
|
|
|
14
10
|
# scrapling-web (harness-web)
|
|
15
11
|
|
|
16
|
-
Local web layer for harness agents — **no API keys
|
|
17
|
-
|
|
12
|
+
Local web layer for harness agents — **no API keys** for default search/scrape.
|
|
13
|
+
Pi registers **`web_search`** and **`web_fetch`** (wrap `harness-web.py` with Scrapling bootstrap).
|
|
14
|
+
Optional **self-hosted SearXNG** — see `/harness-setup` Step 4.0b.
|
|
18
15
|
|
|
19
|
-
##
|
|
16
|
+
## Agent tools (preferred)
|
|
17
|
+
|
|
18
|
+
| Task | Tool |
|
|
19
|
+
|------|------|
|
|
20
|
+
| Search (SERP) | `web_search` with `query` |
|
|
21
|
+
| Search + multi-scrape | `web_search` with `bulk: true` |
|
|
22
|
+
| Scrape URL | `web_fetch` with `url` (default mode `scrape`) |
|
|
23
|
+
| Map same-host links | `web_fetch` with `mode: map` |
|
|
24
|
+
| Static / simple page | `web_fetch` with `fast: true` |
|
|
25
|
+
|
|
26
|
+
**Never before search/fetch:** resolve `UP_PKG`, `ls harness-web.py`, `python3 -c "import scrapling"`, or Firecrawl/curl/wget/scrapling CLI for SERP or page fetch.
|
|
27
|
+
|
|
28
|
+
Full JSON/markdown lives under **`.web/`** (gitignored). Use `read` on `output` paths after tool calls.
|
|
29
|
+
|
|
30
|
+
## Install (once per machine — setup/humans only)
|
|
20
31
|
|
|
21
32
|
```bash
|
|
22
33
|
command -v uv &>/dev/null || curl -LsSf https://astral.sh/uv/install.sh | sh
|
|
@@ -24,25 +35,23 @@ uv tool install "scrapling[fetchers]"
|
|
|
24
35
|
scrapling install # browser binaries for default stealth scrape
|
|
25
36
|
```
|
|
26
37
|
|
|
27
|
-
Verify: `bash "$UP_PKG/.pi/scripts/harness-cli-verify.sh"`
|
|
28
|
-
|
|
29
|
-
## Output directory
|
|
38
|
+
Verify: `bash "$UP_PKG/.pi/scripts/harness-cli-verify.sh"`
|
|
39
|
+
Config diagnostics: `python3 "$UP_PKG/.pi/scripts/harness-web.py" status` (JSON; setup only)
|
|
30
40
|
|
|
31
|
-
|
|
41
|
+
## Bash fallback (if pi tools unavailable)
|
|
32
42
|
|
|
33
43
|
| Task | Command |
|
|
34
44
|
|------|---------|
|
|
35
45
|
| Search | `python3 "$UP_PKG/.pi/scripts/harness-web.py" search "query" -o .web/search.json --limit 5` |
|
|
36
|
-
| Scrape
|
|
37
|
-
| Fast/static
|
|
38
|
-
| Map
|
|
39
|
-
| Bulk | `python3 "$UP_PKG/.pi/scripts/harness-web.py" bulk-scrape "query" -o .web/bulk
|
|
46
|
+
| Scrape | `python3 "$UP_PKG/.pi/scripts/harness-web.py" scrape "<url>" -o .web/page.md` |
|
|
47
|
+
| Fast/static | add `--fast` |
|
|
48
|
+
| Map | `python3 "$UP_PKG/.pi/scripts/harness-web.py" map "<url>" -o .web/map.json` |
|
|
49
|
+
| Bulk | `python3 "$UP_PKG/.pi/scripts/harness-web.py" bulk-scrape "query" -o .web/bulk/` |
|
|
40
50
|
|
|
41
51
|
## Search JSON shape (Firecrawl-compatible)
|
|
42
52
|
|
|
43
53
|
```bash
|
|
44
54
|
jq -r '.data.web[].url' .web/search.json
|
|
45
|
-
jq -r '.data.web[] | "\(.title): \(.url)"' .web/search.json
|
|
46
55
|
```
|
|
47
56
|
|
|
48
57
|
Each entry: `url`, `title`, `description`.
|
|
@@ -51,43 +60,39 @@ Each entry: `url`, `title`, `description`.
|
|
|
51
60
|
|
|
52
61
|
| Mode | When |
|
|
53
62
|
|------|------|
|
|
54
|
-
| **stealth** (default
|
|
55
|
-
| **fast** (
|
|
63
|
+
| **stealth** (default) | Arbitrary URLs, JS-heavy sites |
|
|
64
|
+
| **fast** (`fast: true` / `--fast`) | Static docs, example.com, localhost |
|
|
56
65
|
| **auto** (`HARNESS_WEB_FETCH_MODE=auto`) | fast for known-static hosts, else stealth |
|
|
57
66
|
|
|
58
|
-
Search
|
|
67
|
+
| Search backend | Behavior |
|
|
68
|
+
|--------------|----------|
|
|
69
|
+
| `ddg_html` (default) | DuckDuckGo HTML SERP |
|
|
70
|
+
| `searxng` | JSON at `HARNESS_WEB_SEARXNG_URL` — bootstrap via `harness-searxng-bootstrap.mjs` |
|
|
59
71
|
|
|
60
72
|
## Environment
|
|
61
73
|
|
|
62
74
|
| Variable | Default | Purpose |
|
|
63
75
|
|----------|---------|---------|
|
|
64
76
|
| `HARNESS_WEB_FETCH_MODE` | `stealth` | `stealth` \| `fast` \| `auto` |
|
|
65
|
-
| `HARNESS_WEB_SEARCH_ENGINE` | `ddg_html` |
|
|
66
|
-
| `
|
|
67
|
-
| `HARNESS_WEB_RATE_LIMIT_MS` | `2000` | Delay between bulk scrapes |
|
|
68
|
-
| `HARNESS_WEB_TIMEOUT_MS` | `30000` | Per-request timeout |
|
|
77
|
+
| `HARNESS_WEB_SEARCH_ENGINE` | `ddg_html` | `ddg_html` \| `searxng` |
|
|
78
|
+
| `HARNESS_WEB_SEARXNG_URL` | (unset) | Required when `SEARCH_ENGINE=searxng` |
|
|
69
79
|
|
|
70
80
|
## Escalation
|
|
71
81
|
|
|
72
|
-
1. `
|
|
73
|
-
2. `
|
|
74
|
-
3. `
|
|
75
|
-
4.
|
|
82
|
+
1. `web_search` / `web_fetch`
|
|
83
|
+
2. `web_fetch` with `fast: true` for static hosts
|
|
84
|
+
3. `web_fetch` with `mode: map` then targeted fetches
|
|
85
|
+
4. Site-specific Scrapling only when tools are insufficient (not for routine SERP/fetch)
|
|
76
86
|
|
|
77
|
-
## Gaps vs
|
|
87
|
+
## Gaps vs Firecrawl
|
|
78
88
|
|
|
79
89
|
| Firecrawl | Harness path |
|
|
80
90
|
|-----------|----------------|
|
|
81
|
-
| `interact` |
|
|
82
|
-
| `agent`
|
|
83
|
-
| `parse` (
|
|
84
|
-
| `crawl`
|
|
91
|
+
| `interact` | gstack browse or manual browser |
|
|
92
|
+
| `agent` | Agent reasoning + graphify |
|
|
93
|
+
| `parse` (PDF) | pypdf, markitdown |
|
|
94
|
+
| `crawl` | `web_search` bulk or map + multiple `web_fetch` |
|
|
85
95
|
|
|
86
96
|
## Ethics
|
|
87
97
|
|
|
88
98
|
Respect site terms and rate limits. SERP scraping is for dev research, not high-volume harvesting.
|
|
89
|
-
See [Scrapling ethical considerations](https://scrapling.readthedocs.io/en/latest/cli/extract-commands.html#legal-and-ethical-considerations).
|
|
90
|
-
|
|
91
|
-
## Drawbacks of default stealth scrape
|
|
92
|
-
|
|
93
|
-
Higher latency and RAM (Chromium per session). Use `--fast` for static docs; reuse one `bulk-scrape` run (single `StealthySession`) instead of many cold starts.
|
|
@@ -8,7 +8,7 @@ description: >
|
|
|
8
8
|
Triggers on: "/wiki-autoresearch", "/autoresearch", "wiki-autoresearch", "autoresearch",
|
|
9
9
|
"research [topic]", "deep dive into [topic]", "investigate [topic]",
|
|
10
10
|
"find everything about [topic]", "research and file", "go research", "build a wiki on".
|
|
11
|
-
allowed-tools: Read Write Edit Glob Grep
|
|
11
|
+
allowed-tools: Read Write Edit Glob Grep web_search web_fetch Bash
|
|
12
12
|
---
|
|
13
13
|
|
|
14
14
|
# wiki-autoresearch: Autonomous Research Loop with Graphify
|
|
@@ -129,8 +129,8 @@ Input: topic (from Topic Selection, above)
|
|
|
129
129
|
|
|
130
130
|
Round 1. Broad search
|
|
131
131
|
1. Decompose topic into 3-5 distinct search angles
|
|
132
|
-
2. For each angle: run 2-3
|
|
133
|
-
3. For top 2-3 results per angle:
|
|
132
|
+
2. For each angle: run 2-3 `web_search` queries
|
|
133
|
+
3. For top 2-3 results per angle: `web_fetch` each URL (or `read` `.web/` artifacts)
|
|
134
134
|
4. Save each fetched page to ./raw/ as a markdown file
|
|
135
135
|
5. Extract from each: key claims, entities, concepts, open questions
|
|
136
136
|
|
package/.pi/SYSTEM.md
CHANGED
|
@@ -23,26 +23,25 @@ You are an enterprise coding agent. Optimize for correctness, minimal diffs, and
|
|
|
23
23
|
## Web Policy (Mandatory)
|
|
24
24
|
|
|
25
25
|
> [!warning] No raw HTTP
|
|
26
|
-
> Route **all** web
|
|
26
|
+
> Route **all** web through [[context7]] (API/library docs) or **`web_search` / `web_fetch`** ([[scrapling-web]]). No `curl`, `wget`, Firecrawl, or scrapling CLI preflight.
|
|
27
27
|
|
|
28
28
|
### API / Library Docs — context7 ONLY
|
|
29
29
|
- `ctx7 library <name> <query>` then `ctx7 docs <id> <query>`
|
|
30
30
|
- context7 owns: function signatures, class APIs, config options, stdlib, framework specs.
|
|
31
|
-
- **Never** use quality-sites for API docs.
|
|
31
|
+
- **Never** use quality-sites or web_fetch for API docs.
|
|
32
32
|
|
|
33
|
-
### All Non-API Web
|
|
34
|
-
See `.agents/skills/scrapling-web/SKILL.md`
|
|
33
|
+
### All Non-API Web — web_search + web_fetch
|
|
34
|
+
See `.agents/skills/scrapling-web/SKILL.md`. **No preflight:** never resolve `UP_PKG`, `ls harness-web.py`, or `python3 -c "import scrapling"` before searching.
|
|
35
35
|
|
|
36
|
-
| Task |
|
|
37
|
-
|
|
38
|
-
| Search (
|
|
39
|
-
| Scrape
|
|
40
|
-
|
|
|
41
|
-
| Map same-host links | `python3 "$UP_PKG/.pi/scripts/harness-web.py" map "<url>" -o .web/map.json` |
|
|
42
|
-
| Bulk search + scrape | `python3 "$UP_PKG/.pi/scripts/harness-web.py" bulk-scrape "query" -o .web/bulk/` |
|
|
36
|
+
| Task | Tool |
|
|
37
|
+
|------|------|
|
|
38
|
+
| Search (SERP) | `web_search` (`query`, optional `limit`, `bulk`) |
|
|
39
|
+
| Scrape page | `web_fetch` (`url`, optional `fast: true`) |
|
|
40
|
+
| Map links | `web_fetch` (`url`, `mode: map`) |
|
|
43
41
|
|
|
44
|
-
- **Artifacts:**
|
|
45
|
-
- **
|
|
42
|
+
- **Artifacts:** default under `.web/`; use `read` for full JSON/markdown.
|
|
43
|
+
- **Fallback** (tools unavailable): `python3 "$UP_PKG/.pi/scripts/harness-web.py" …` per scrapling-web skill.
|
|
44
|
+
- **Setup diagnostics only:** `harness-web.py status` (JSON config).
|
|
46
45
|
- **Quality sites:** check `.agents/skills/wiki-autoresearch/references/quality-sites.md` before citing non-API sources. Prefer Tier 1 (StackOverflow, GitHub issues, engineering blogs, arxiv). Exclude AI content farms, mirrors, stale packages.
|
|
47
46
|
- **Research:** use `/wiki-autoresearch <topic>` for deep research. Results are graphified into `graphify-out/`.
|
|
48
47
|
|
|
@@ -187,9 +187,9 @@ Before answering ANY question, search the local codebase for existing agent defi
|
|
|
187
187
|
find .pi/agents -name "*.md" -type f 2>/dev/null
|
|
188
188
|
```
|
|
189
189
|
|
|
190
|
-
Fetch the latest pi-subagents docs:
|
|
191
|
-
```
|
|
192
|
-
|
|
190
|
+
Fetch the latest pi-subagents docs (use `web_fetch` with `fast: true` for raw GitHub):
|
|
191
|
+
```
|
|
192
|
+
web_fetch url="https://raw.githubusercontent.com/tintinweb/pi-subagents/refs/heads/master/README.md" fast=true output=.web/pi-subagents-readme.md
|
|
193
193
|
```
|
|
194
194
|
|
|
195
195
|
## How to Respond
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* harness-web-guard — block bash that bypasses web_search / web_fetch tools.
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
import type { ExtensionAPI } from "@mariozechner/pi-coding-agent";
|
|
6
|
+
|
|
7
|
+
/** Message returned to the agent whenever this guard blocks a bash command. */
const BLOCK_REASON =
  "harness-web-guard: use web_search (SERP) or web_fetch (page content) instead of raw curl/wget/firecrawl/scrapling fetch. " +
  "Setup may use harness-web.py status directly.";

/**
 * A command matching ANY of these patterns is exempt from the guard:
 * harness scripts, graphify, context7 and git are allowed to touch the network.
 */
const ALLOW_PATTERNS = [
  /harness-web\.py\b/i,
  /harness-cli-verify\.sh\b/i,
  /\bgraphify\b/i,
  /\bctx7\b/i,
  /\bcontext7\b/i,
  /\bgit\b/i,
  /harness-searxng-bootstrap/i,
];

/**
 * Patterns that identify a bash command bypassing web_search / web_fetch.
 * Each entry is `{ re, note }`; `note` is echoed back in the block reason.
 */
const BLOCK_PATTERNS = [
  { re: /\bfirecrawl\b/i, note: "firecrawl" },
  {
    // curl/wget followed, within the same shell segment (no `|`, `;`, `&`,
    // or newline in between), by an http(s) URL. The URL may be preceded by
    // whitespace, an opening quote, or `=` (e.g. `--url=https://…`), so that
    // quoted URLs such as `curl "https://…"` are caught as well.
    re: /\b(?:curl|wget)\b[^\n|;&]*[\s"'=]https?:\/\//i,
    note: "curl/wget http(s)",
  },
  {
    re: /\bscrapling\s+(?:fetch|extract)\b/i,
    note: "scrapling fetch/extract",
  },
];
|
|
32
|
+
|
|
33
|
+
/**
 * Reports whether the user's prompt is a harness setup/bootstrap request.
 *
 * The match is case-insensitive. Checking for "harness-setup" also covers the
 * slash-command form "/harness-setup", so no separate check is needed for it.
 *
 * @param prompt Text of the user prompt to inspect.
 * @returns true when the prompt mentions harness-setup or "full harness bootstrap".
 */
function isBootstrapPrompt(prompt: string): boolean {
  const text = prompt.toLowerCase();
  return (
    text.includes("harness-setup") || text.includes("full harness bootstrap")
  );
}
|
|
41
|
+
|
|
42
|
+
/**
 * Returns the text of the most recent user message in the session.
 *
 * Entries are scanned newest-first. A string content is returned as-is; an
 * array content is flattened by joining each part's `text` (parts without a
 * `text` field contribute an empty string) with newlines. A user entry whose
 * content is neither a string nor an array is skipped and the scan continues
 * with older entries.
 *
 * @param ctx Object exposing `sessionManager.getEntries()`.
 * @returns The prompt text, or "" when no usable user message exists.
 */
function latestUserPrompt(ctx: {
  sessionManager: { getEntries(): unknown[] };
}): string {
  const history = ctx.sessionManager.getEntries() as Array<{
    type?: string;
    message?: { role?: string; content?: unknown };
  }>;

  for (const item of [...history].reverse()) {
    const message = item?.message;
    if (message?.role !== "user") continue;

    const body = message.content;
    if (typeof body === "string") return body;
    if (!Array.isArray(body)) continue;

    const pieces: string[] = [];
    for (const part of body) {
      pieces.push(
        typeof part === "object" && part && "text" in part
          ? String((part as { text?: string }).text ?? "")
          : "",
      );
    }
    return pieces.join("\n");
  }
  return "";
}
|
|
66
|
+
|
|
67
|
+
/**
 * Reports whether a bash command matches at least one ALLOW_PATTERNS entry,
 * which exempts the whole command from the web guard.
 *
 * @param command Full bash command string.
 */
function isAllowedBash(command: string): boolean {
  for (const pattern of ALLOW_PATTERNS) {
    if (pattern.test(command)) return true;
  }
  return false;
}
|
|
70
|
+
|
|
71
|
+
/**
 * Classifies a bash command against the guard's block list.
 *
 * Allow-listed commands are never blocked. Otherwise the note of the first
 * matching BLOCK_PATTERNS entry is returned.
 *
 * @param command Full bash command string.
 * @returns The matched pattern's note, or null when the command may run.
 */
function blockedWebBash(command: string): string | null {
  if (isAllowedBash(command)) return null;
  const match = BLOCK_PATTERNS.find(({ re }) => re.test(command));
  return match ? match.note : null;
}
|
|
78
|
+
|
|
79
|
+
/**
 * Extension entry point: registers a `tool_call` hook that blocks bash
 * commands which bypass the web_search / web_fetch tools.
 *
 * Only `bash` tool calls are inspected, and the guard stands down entirely
 * when the latest user prompt is a harness setup/bootstrap request.
 */
export default function harnessWebGuard(pi: ExtensionAPI) {
  pi.on("tool_call", async (event, ctx) => {
    if (event.toolName !== "bash") return undefined;

    // Setup flows are allowed to use raw network commands.
    if (isBootstrapPrompt(latestUserPrompt(ctx))) return undefined;

    const { command } = event.input as { command?: string };
    const hit = blockedWebBash(String(command ?? ""));

    return hit
      ? { block: true, reason: `${BLOCK_REASON} (matched: ${hit})` }
      : undefined;
  });
}
|
|
@@ -0,0 +1,209 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* harness-web-tools — web_search + web_fetch pi tools wrapping harness-web.py.
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
import type { ExtensionAPI } from "@mariozechner/pi-coding-agent";
|
|
6
|
+
import { Type } from "@sinclair/typebox";
|
|
7
|
+
import {
|
|
8
|
+
harnessWebContextLine,
|
|
9
|
+
readTextExcerpt,
|
|
10
|
+
runHarnessWeb,
|
|
11
|
+
summarizeSearchJson,
|
|
12
|
+
} from "./lib/harness-web/run-cli.js";
|
|
13
|
+
|
|
14
|
+
// URL of this module; passed to runHarnessWeb as its `moduleUrl` argument.
// @ts-expect-error pi extensions run as ESM
const MODULE_URL = import.meta.url;

/** Prompt guidelines attached to the web_search tool registration. */
const WEB_SEARCH_GUIDELINES = [
  "Use web_search for open-web SERP — never preflight UP_PKG, ls harness-web.py, or python3 -c import scrapling.",
  "Never use Firecrawl, curl/wget for search, or scrapling CLI for SERP.",
  "After search, use web_fetch on URLs or read the output JSON under .web/.",
  "Use bulk:true only when you need search plus multi-page scrape in one step.",
];

/** Prompt guidelines attached to the web_fetch tool registration. */
const WEB_FETCH_GUIDELINES = [
  "Use web_fetch for page markdown or same-host link maps — never curl/wget the URL.",
  "Never use raw scrapling CLI for fetch; harness-web handles Scrapling bootstrap.",
  "Library API documentation → context7 only, not web_fetch.",
  "Set fast:true for static docs (example.com, raw HTML docs, localhost).",
];
|
|
30
|
+
|
|
31
|
+
/**
 * Parameters accepted by the web_search tool.
 *
 * `limit` is a result count forwarded to the CLI's `--limit` flag, so it is
 * declared as an integer rather than an arbitrary number.
 */
const WebSearchSchema = Type.Object({
  query: Type.String({ description: "Search query" }),
  limit: Type.Optional(
    Type.Integer({
      description: "Max results (default 5)",
      minimum: 1,
      maximum: 20,
    }),
  ),
  output: Type.Optional(
    Type.String({
      description:
        "Output path (default .web/search.json or .web/bulk for bulk)",
    }),
  ),
  bulk: Type.Optional(
    Type.Boolean({
      description:
        "If true, run bulk-scrape (search then scrape top URLs to output directory)",
      default: false,
    }),
  ),
});

/**
 * Parameters accepted by the web_fetch tool.
 *
 * `limit` only applies to map mode and is a link count, hence an integer.
 */
const WebFetchSchema = Type.Object({
  url: Type.String({ description: "URL to fetch" }),
  mode: Type.Optional(
    Type.Union([Type.Literal("scrape"), Type.Literal("map")], {
      description: "scrape (markdown) or map (same-host links JSON)",
      default: "scrape",
    }),
  ),
  output: Type.Optional(
    Type.String({ description: "Output file path under .web/" }),
  ),
  fast: Type.Optional(
    Type.Boolean({
      description: "Use fast HTTP scrape (static/simple pages)",
      default: false,
    }),
  ),
  limit: Type.Optional(
    Type.Integer({
      description: "For map mode: max links (default 100)",
      minimum: 1,
      maximum: 500,
    }),
  ),
});
|
|
80
|
+
|
|
81
|
+
/**
 * Builds a tool result describing a failure.
 *
 * @param text Message shown to the agent.
 * @returns A result with a single text content part and `details.ok === false`.
 */
function failResult(text: string) {
  const content = [{ type: "text" as const, text }];
  return { content, details: { ok: false } };
}
|
|
87
|
+
|
|
88
|
+
/**
 * Builds a tool result describing a success.
 *
 * @param text Message shown to the agent.
 * @param details Extra fields merged into `details` after `ok: true`
 *   (so a caller-supplied `ok` key would override it).
 * @returns A result with a single text content part and the merged details.
 */
function okResult(text: string, details: Record<string, unknown> = {}) {
  const merged = { ok: true, ...details };
  return {
    content: [{ type: "text" as const, text }],
    details: merged,
  };
}
|
|
94
|
+
|
|
95
|
+
/**
 * Resolves the working directory for a tool invocation.
 *
 * @param ctx Tool context; its `cwd` is used when present.
 * @returns `ctx.cwd`, or the process working directory when it is null/undefined.
 */
function sessionCwd(ctx: { cwd?: string }): string {
  const { cwd } = ctx;
  if (cwd == null) {
    return process.cwd();
  }
  return cwd;
}
|
|
98
|
+
|
|
99
|
+
/**
 * Extension entry point: appends the harness-web context line to the system
 * prompt and registers the `web_search` and `web_fetch` tools, both of which
 * delegate to harness-web.py through runHarnessWeb.
 */
export default function harnessWebTools(pi: ExtensionAPI) {
  pi.on("before_agent_start", async (event) => ({
    systemPrompt: `${event.systemPrompt}\n\n${harnessWebContextLine()}`,
  }));

  pi.registerTool({
    name: "web_search",
    label: "Web Search",
    description:
      "Search the web via harness-web (DuckDuckGo HTML or self-hosted SearXNG from .env). Returns result summaries and output path.",
    promptSnippet: "SERP via configured engine (ddg_html or searxng from .env)",
    promptGuidelines: WEB_SEARCH_GUIDELINES,
    parameters: WebSearchSchema,

    async execute(_id, params, _signal, _onUpdate, ctx) {
      const cwd = sessionCwd(ctx);
      const query = String(params.query ?? "").trim();
      if (!query) return failResult("web_search: query is required.");

      const bulk = params.bulk === true;
      const limit = typeof params.limit === "number" ? params.limit : 5;
      const fallbackOutput = bulk ? ".web/bulk" : ".web/search.json";
      const output = String(params.output ?? fallbackOutput);

      // Both subcommands take the same arguments; only the verb differs.
      const subcommand = bulk ? "bulk-scrape" : "search";
      const run = runHarnessWeb(
        MODULE_URL,
        [subcommand, query, "-o", output, "--limit", String(limit)],
        cwd,
      );

      if (!run.ok) {
        const hint =
          "\n\nHints: run /harness-setup; for searxng set HARNESS_WEB_SEARXNG_URL; " +
          "enable json in SearXNG search.formats.";
        return failResult(
          `web_search failed (exit ${run.exitCode}).\n${run.stderr || run.stdout}${hint}`,
        );
      }

      // A SERP summary is only produced for plain searches, not bulk runs.
      const summary = bulk ? "" : summarizeSearchJson(output, cwd);
      const parts = [
        run.stdout,
        ...(summary ? ["", summary] : []),
        "",
        `output: ${output}`,
        "Use read tool for full JSON, or web_fetch on result URLs.",
      ];

      return okResult(parts.join("\n"), {
        output,
        query,
        bulk,
        engine: process.env.HARNESS_WEB_SEARCH_ENGINE,
      });
    },
  });

  pi.registerTool({
    name: "web_fetch",
    label: "Web Fetch",
    description:
      "Fetch a URL via harness-web/Scrapling (scrape to markdown or map same-host links).",
    promptSnippet: "Scrape/map URL via Scrapling (harness-web)",
    promptGuidelines: WEB_FETCH_GUIDELINES,
    parameters: WebFetchSchema,

    async execute(_id, params, _signal, _onUpdate, ctx) {
      const cwd = sessionCwd(ctx);
      const url = String(params.url ?? "").trim();
      if (!url) return failResult("web_fetch: url is required.");

      const isMap = params.mode === "map";
      const mode = isMap ? "map" : "scrape";
      const limit = typeof params.limit === "number" ? params.limit : 100;
      const output = String(
        params.output ?? (isMap ? ".web/map.json" : ".web/page.md"),
      );

      // `--limit` is only passed in map mode; `--fast` applies to both modes.
      const argv = [mode, url, "-o", output];
      if (isMap) argv.push("--limit", String(limit));
      if (params.fast === true) argv.push("--fast");

      const run = runHarnessWeb(MODULE_URL, argv, cwd);
      if (!run.ok) {
        return failResult(
          `web_fetch failed (exit ${run.exitCode}).\n${run.stderr || run.stdout}\n` +
            "Try fast:true for static pages, or run harness-cli-verify for Scrapling install.",
        );
      }

      const parts = [run.stdout, "", `output: ${output}`];
      const excerpt = readTextExcerpt(output, cwd);
      if (excerpt) {
        parts.push("", "--- excerpt ---", excerpt);
      }

      return okResult(parts.join("\n"), { output, url, mode });
    },
  });
}
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
import { spawnSync } from "node:child_process";
|
|
2
|
+
import { existsSync, readFileSync } from "node:fs";
|
|
3
|
+
import { resolve } from "node:path";
|
|
4
|
+
import { resolveHarnessScript } from "../harness-paths.js";
|
|
5
|
+
|
|
6
|
+
/** Outcome of one harness-web.py invocation. */
export interface RunHarnessWebResult {
  /** True only when the process exited with status 0. */
  ok: boolean;
  /** Exit status; 1 when none is available (launch failure or signal kill). */
  exitCode: number;
  /** Trimmed standard output. */
  stdout: string;
  /** Trimmed standard error, plus a launch/signal diagnostic when applicable. */
  stderr: string;
}

/**
 * Runs harness-web.py synchronously with python3 and captures its output.
 *
 * @param moduleUrl URL of the calling module, used to locate the script.
 * @param args Arguments passed to harness-web.py after the script path.
 * @param cwd Working directory for the child process.
 * @returns Exit status and trimmed stdout/stderr of the run.
 */
export function runHarnessWeb(
  moduleUrl: string,
  args: string[],
  cwd: string,
): RunHarnessWebResult {
  const script = resolveHarnessScript(moduleUrl, "harness-web.py");
  const result = spawnSync("python3", [script, ...args], {
    cwd,
    env: process.env,
    encoding: "utf-8",
    maxBuffer: 16 * 1024 * 1024,
  });

  // spawnSync does not throw when the child cannot be launched (e.g. python3
  // is not installed), when output exceeds maxBuffer, or when the child is
  // killed by a signal: it reports these through `error` / `signal` with a
  // null status and typically empty stderr. Surface them so callers that
  // print stderr show an actionable message instead of a blank failure.
  let diagnostic = "";
  if (result.error) {
    diagnostic = `harness-web: failed to run python3: ${result.error.message}`;
  } else if (result.signal) {
    diagnostic = `harness-web: python3 terminated by signal ${result.signal}`;
  }
  const stderr = (result.stderr ?? "").trim();

  return {
    ok: result.status === 0,
    exitCode: result.status ?? 1,
    stdout: (result.stdout ?? "").trim(),
    stderr: [stderr, diagnostic].filter(Boolean).join("\n"),
  };
}
|
|
32
|
+
|
|
33
|
+
/**
 * Returns the beginning of a text file for inclusion in a tool result.
 *
 * Best-effort: a missing path, a directory, or an unreadable file yields ""
 * rather than throwing, so a successful fetch is never turned into a tool
 * crash by the excerpt step.
 *
 * @param filePath File path, resolved against `cwd`.
 * @param cwd Base directory for resolving `filePath`.
 * @param maxChars Maximum number of characters returned before truncation.
 * @returns The full text when it fits in `maxChars`, otherwise the first
 *   `maxChars` characters followed by a truncation notice; "" on read failure.
 */
export function readTextExcerpt(
  filePath: string,
  cwd: string,
  maxChars = 2000,
): string {
  const full = resolve(cwd, filePath);
  let text: string;
  try {
    text = readFileSync(full, "utf-8");
  } catch {
    // Covers ENOENT as well as EISDIR / EACCES.
    return "";
  }
  if (text.length <= maxChars) return text;
  return `${text.slice(0, maxChars)}\n… (truncated; use read tool for full file)`;
}
|
|
44
|
+
|
|
45
|
+
/** One search result entry as stored under `data.web` in the search JSON. */
export interface SearchHit {
  url: string;
  title: string;
  description: string;
}

/**
 * Produces a short plain-text summary of a search JSON file.
 *
 * The summary lists the engine, query and result count, then one numbered
 * entry per hit (title, URL, and a description clipped to 120 characters).
 *
 * @param filePath Path of the search JSON, resolved against `cwd`.
 * @param cwd Base directory for resolving `filePath`.
 * @returns The summary text, or "" when the file is missing or cannot be
 *   read, parsed, or iterated.
 */
export function summarizeSearchJson(filePath: string, cwd: string): string {
  const full = resolve(cwd, filePath);
  if (!existsSync(full)) return "";

  try {
    const parsed = JSON.parse(readFileSync(full, "utf-8")) as {
      query?: string;
      engine?: string;
      data?: { web?: SearchHit[] };
    };
    const hits = parsed.data?.web ?? [];

    const out = [
      `engine: ${parsed.engine ?? "unknown"}`,
      `query: ${parsed.query ?? ""}`,
      `results: ${hits.length}`,
      "",
    ];

    hits.forEach((hit, index) => {
      out.push(`${index + 1}. ${hit.title || "(no title)"}`);
      out.push(`   ${hit.url}`);
      if (!hit.description) return;

      let snippet = hit.description;
      if (snippet.length > 120) {
        snippet = `${snippet.slice(0, 120)}…`;
      }
      out.push(`   ${snippet}`);
    });

    return out.join("\n");
  } catch {
    return "";
  }
}
|
|
83
|
+
|
|
84
|
+
/**
 * Builds the one-line harness-web notice appended to the system prompt.
 *
 * Reads HARNESS_WEB_SEARCH_ENGINE (falling back to "ddg_html" when unset or
 * blank) and HARNESS_WEB_SEARXNG_URL (included only when non-blank) from the
 * process environment.
 *
 * @returns The context line naming the engine and the tool-usage reminder.
 */
export function harnessWebContextLine(): string {
  const {
    HARNESS_WEB_SEARCH_ENGINE: rawEngine,
    HARNESS_WEB_SEARXNG_URL: rawSearxUrl,
  } = process.env;

  const engine = rawEngine?.trim() || "ddg_html";
  const searx = rawSearxUrl?.trim();

  const segments = [`[HarnessWeb] search_engine=${engine}`];
  if (searx) {
    segments.push(` searxng_url=${searx}`);
  }
  segments.push(
    " — use web_search / web_fetch tools; ",
    "never resolve UP_PKG, ls harness-web.py, or python3 -c import scrapling before searching.",
  );
  return segments.join("");
}
|
|
@@ -4,9 +4,11 @@
|
|
|
4
4
|
# Telemetry (set false to disable harness PostHog events)
|
|
5
5
|
HARNESS_TELEMETRY_ENABLED=true
|
|
6
6
|
|
|
7
|
-
# harness-web (Scrapling
|
|
7
|
+
# harness-web (Scrapling scrape + pluggable search)
|
|
8
8
|
HARNESS_WEB_FETCH_MODE=stealth
|
|
9
9
|
HARNESS_WEB_SEARCH_ENGINE=ddg_html
|
|
10
|
+
# SearXNG (when HARNESS_WEB_SEARCH_ENGINE=searxng):
|
|
11
|
+
# HARNESS_WEB_SEARXNG_URL=http://127.0.0.1:8080
|
|
10
12
|
# HARNESS_WEB_PROXY=
|
|
11
13
|
# HARNESS_WEB_RATE_LIMIT_MS=2000
|
|
12
14
|
# HARNESS_WEB_TIMEOUT_MS=30000
|