@xberg-io/opencode-crawlberg 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,237 @@
1
import { spawn } from "node:child_process";
import { existsSync } from "node:fs";

import { tool } from "@opencode-ai/plugin";
3
+
4
// Schema builder exposed by the plugin SDK's `tool` helper; used for every argument below.
const { schema } = tool;

// Shared `--format` argument: one of "json" or "markdown", declared with a default of "json".
const outputFormat = schema.enum(["json", "markdown"]).default("json").describe("CLI output format.");

// Shared `--browser-mode` argument: "auto", "always" or "never", declared with a default of "auto".
const browserMode = schema
  .enum(["auto", "always", "never"])
  .default("auto")
  .describe("When to use headless browser fallback.");
14
+
15
/**
 * Reports whether an option value was actually supplied.
 *
 * @param {unknown} value - Candidate option value.
 * @returns {boolean} `false` for `undefined`, `null` and the empty string;
 *   `true` for everything else (including `0` and `false`).
 */
function hasValue(value) {
  if (value === undefined || value === null) {
    return false;
  }
  return value !== "";
}
18
+
19
/**
 * Appends a `name value` pair to a CLI argument list when a value was supplied.
 *
 * @param {string[]} args - Argument list, mutated in place.
 * @param {string} name - Option name, e.g. `--timeout`.
 * @param {unknown} value - Option value; skipped when `hasValue` rejects it,
 *   otherwise stringified with `String()`.
 * @returns {void}
 */
function pushOption(args, name, value) {
  if (!hasValue(value)) {
    return;
  }
  const rendered = String(value);
  args.push(name, rendered);
}
24
+
25
/**
 * Appends a bare boolean flag to a CLI argument list when it is enabled.
 *
 * @param {string[]} args - Argument list, mutated in place.
 * @param {string} name - Flag name, e.g. `--stay-on-domain`.
 * @param {unknown} enabled - The flag is added only when this is truthy.
 * @returns {void}
 */
function pushFlag(args, name, enabled) {
  if (!enabled) {
    return;
  }
  args.push(name);
}
30
+
31
/**
 * Checks that an optional string argument parses as JSON before it is handed to the CLI.
 *
 * @param {unknown} value - Raw argument value; `undefined`, `null` and `""` are
 *   treated as "not supplied" and accepted without parsing.
 * @param {string} name - Argument name used in the error message.
 * @returns {void}
 * @throws {Error} When `value` is supplied but `JSON.parse` rejects it. The original
 *   parse error is preserved on `error.cause`.
 */
function validateJson(value, name) {
  if (!hasValue(value)) {
    return;
  }

  try {
    JSON.parse(value);
  } catch (error) {
    // Keep the underlying SyntaxError reachable instead of flattening it to a string.
    throw new Error(`${name} must be valid JSON: ${error.message}`, { cause: error });
  }
}
42
+
43
/**
 * Runs the `crawlberg` executable with the given arguments and collects its output.
 *
 * The child runs in `context.directory`, falling back to `context.worktree` and then
 * `process.cwd()`. stdin is ignored; stdout and stderr are buffered and decoded as UTF-8.
 *
 * @param {string[]} args - CLI arguments; `args[0]` is reported as the subcommand.
 * @param {{ directory?: string, worktree?: string, abort?: AbortSignal }} [context] -
 *   Tool context; `abort` is passed to `spawn` as its `signal` option.
 * @returns {Promise<{ title: string, output: string, metadata: object }>} Resolves once the
 *   process closes (any exit code), or immediately with an explanatory result when the
 *   executable or the working directory does not exist. Rejects for any other spawn error.
 */
function runCli(args, context) {
  const directory = context?.directory ?? context?.worktree ?? process.cwd();
  const [subcommand] = args;

  return new Promise((resolve, reject) => {
    const child = spawn("crawlberg", args, {
      cwd: directory,
      env: process.env,
      signal: context?.abort,
      stdio: ["ignore", "pipe", "pipe"],
    });

    const stdout = [];
    const stderr = [];

    child.stdout.on("data", (chunk) => stdout.push(chunk));
    child.stderr.on("data", (chunk) => stderr.push(chunk));

    child.on("error", (error) => {
      if (error.code !== "ENOENT") {
        reject(error);
        return;
      }

      // spawn() reports ENOENT both when the executable is missing and when `cwd`
      // does not exist; check the directory so the user gets the right diagnosis.
      if (!existsSync(directory)) {
        resolve({
          title: "crawlberg working directory not found",
          output: `Cannot run crawlberg: working directory \`${directory}\` does not exist.`,
          // 126 rather than 127 so this stays distinguishable from "CLI not found" below.
          metadata: { exitCode: 126, command: "crawlberg", subcommand },
        });
        return;
      }

      resolve({
        title: "crawlberg CLI not found",
        output:
          "Install the crawlberg CLI with `brew install xberg-io/tap/crawlberg`, or run it via `npx @xberg-io/crawlberg-cli` / `uvx --from crawlberg-cli crawlberg`.",
        metadata: { exitCode: 127, command: "crawlberg", subcommand },
      });
    });

    child.on("close", (exitCode, signal) => {
      const stdoutText = Buffer.concat(stdout).toString("utf8").trim();
      const stderrText = Buffer.concat(stderr).toString("utf8").trim();
      // stderr is appended under its own heading; empty streams are dropped entirely.
      const output = [stdoutText, stderrText && `stderr:\n${stderrText}`]
        .filter(Boolean)
        .join("\n\n");

      // If an "error" handler already settled the promise, this resolve is a no-op.
      resolve({
        title: exitCode === 0 ? `crawlberg ${subcommand}` : `crawlberg ${subcommand} failed`,
        output: output || "(no output)",
        metadata: {
          exitCode,
          signal,
          command: "crawlberg",
          subcommand,
        },
      });
    });
  });
}
91
+
92
/**
 * Appends the options used by both the scrape and crawl tools to a CLI argument list.
 *
 * Emission order is fixed: format, timeout, browser mode, browser endpoint, user agent,
 * proxy, the robots.txt flag, then config. Unset values are skipped by the helpers.
 *
 * @param {string[]} cliArgs - Argument list, mutated in place.
 * @param {object} args - Tool arguments (snake_case keys).
 * @returns {void}
 */
function pushSharedCrawlOptions(cliArgs, args) {
  const valuedOptions = [
    ["--format", args.format],
    ["--timeout", args.timeout],
    ["--browser-mode", args.browser_mode],
    ["--browser-endpoint", args.browser_endpoint],
    ["--user-agent", args.user_agent],
    ["--proxy", args.proxy],
  ];

  for (const [flag, value] of valuedOptions) {
    pushOption(cliArgs, flag, value);
  }

  pushFlag(cliArgs, "--respect-robots-txt", args.respect_robots_txt);
  pushOption(cliArgs, "--config", args.config);
}
102
+
103
/**
 * Plugin factory that registers three tools backed by the `crawlberg` CLI:
 * `crawlberg_scrape`, `crawlberg_crawl` and `crawlberg_map`.
 *
 * Each tool validates the optional `config` JSON, translates its arguments into CLI
 * flags and returns the result of `runCli`.
 *
 * @returns {Promise<{ tool: object }>} Object whose `tool` map holds the three tool definitions.
 */
export const CrawlbergPlugin = async () => {
  // Argument schemas shared between tools, grouped so each tool can spread them
  // in the same key order it would have if they were written out inline.
  const fetchArgs = {
    format: outputFormat,
    timeout: schema
      .number()
      .int()
      .positive()
      .max(600000)
      .default(30000)
      .describe("Request timeout in ms."),
    browser_mode: browserMode,
    browser_endpoint: schema
      .string()
      .url()
      .optional()
      .describe("Optional CDP WebSocket endpoint."),
  };

  // Only scrape and crawl accept these; map does not declare them.
  const identityArgs = {
    user_agent: schema.string().min(1).optional().describe("Optional HTTP user agent."),
    proxy: schema.string().url().optional().describe("Optional proxy URL."),
  };

  const policyArgs = {
    respect_robots_txt: schema.boolean().default(false).describe("Respect robots.txt."),
    config: schema.string().min(2).optional().describe("Optional CrawlConfig JSON."),
  };

  const crawlberg_scrape = tool({
    description: "Scrape one URL to JSON or Markdown with the crawlberg CLI.",
    args: {
      url: schema.string().url().describe("URL to scrape."),
      ...fetchArgs,
      ...identityArgs,
      ...policyArgs,
    },
    async execute(args, context) {
      validateJson(args.config, "config");

      const cliArgs = ["scrape", args.url];
      pushSharedCrawlOptions(cliArgs, args);
      return runCli(cliArgs, context);
    },
  });

  const crawlberg_crawl = tool({
    description: "Crawl one or more seed URLs to JSON or Markdown with the crawlberg CLI.",
    args: {
      urls: schema.array(schema.string().url()).min(1).describe("Seed URLs to crawl."),
      depth: schema.number().int().min(0).max(20).default(2).describe("Maximum crawl depth."),
      max_pages: schema.number().int().positive().optional().describe("Maximum pages to crawl."),
      concurrent: schema
        .number()
        .int()
        .positive()
        .max(256)
        .default(10)
        .describe("Maximum concurrent requests."),
      rate_limit: schema
        .number()
        .int()
        .min(0)
        .default(200)
        .describe("Delay between requests in ms."),
      stay_on_domain: schema
        .boolean()
        .default(false)
        .describe("Restrict crawling to the seed domain."),
      ...fetchArgs,
      ...identityArgs,
      ...policyArgs,
    },
    async execute(args, context) {
      validateJson(args.config, "config");

      // Depth and concurrency are always emitted; the remaining options only when set.
      const cliArgs = ["crawl", ...args.urls];
      cliArgs.push("--depth", String(args.depth), "--concurrent", String(args.concurrent));
      pushOption(cliArgs, "--max-pages", args.max_pages);
      pushOption(cliArgs, "--rate-limit", args.rate_limit);
      pushFlag(cliArgs, "--stay-on-domain", args.stay_on_domain);
      pushSharedCrawlOptions(cliArgs, args);
      return runCli(cliArgs, context);
    },
  });

  const crawlberg_map = tool({
    description: "Enumerate URLs from sitemaps and link extraction with the crawlberg CLI.",
    args: {
      url: schema.string().url().describe("URL to map."),
      limit: schema.number().int().positive().optional().describe("Maximum URLs to return."),
      search: schema.string().min(1).optional().describe("Filter URLs by substring."),
      ...fetchArgs,
      ...policyArgs,
    },
    async execute(args, context) {
      validateJson(args.config, "config");

      // Flags are listed explicitly here: map never forwards --user-agent or --proxy.
      const cliArgs = ["map", args.url];
      pushOption(cliArgs, "--limit", args.limit);
      pushOption(cliArgs, "--search", args.search);
      pushOption(cliArgs, "--format", args.format);
      pushOption(cliArgs, "--timeout", args.timeout);
      pushOption(cliArgs, "--browser-mode", args.browser_mode);
      pushOption(cliArgs, "--browser-endpoint", args.browser_endpoint);
      pushFlag(cliArgs, "--respect-robots-txt", args.respect_robots_txt);
      pushOption(cliArgs, "--config", args.config);
      return runCli(cliArgs, context);
    },
  });

  return {
    tool: {
      crawlberg_scrape,
      crawlberg_crawl,
      crawlberg_map,
    },
  };
};
236
+
237
+ export default CrawlbergPlugin;
package/README.md ADDED
@@ -0,0 +1,121 @@
1
+ # crawlberg
2
+
3
+ Crawl, scrape, and convert websites to Markdown using the local `crawlberg` CLI in your agent.
4
+
5
+ <!-- TODO: screenshot -->
6
+
7
+ ## Install
8
+
9
+ ### From the marketplace (recommended)
10
+
11
+ Listing in the official Claude marketplace is pending review.
12
+
13
+ In the meantime, install from the self-hosted marketplace:
14
+
15
+ ```text
16
+ /plugin marketplace add xberg-io/plugins
17
+ /plugin install crawlberg@xberg
18
+ ```
19
+
20
+ ### Binary requirement
21
+
22
+ The MCP server runs through an auto-installing launcher (`scripts/mcp-launch.sh`): on first use it reuses any working `crawlberg` binary already on `PATH`, then tries `npx`/`uvx`, then Homebrew, then a prebuilt download. No manual install is required for MCP.
23
+
24
+ To install the CLI yourself (recommended for direct CLI use, and reused by the launcher):
25
+
26
+ ```bash
27
+ brew install xberg-io/tap/crawlberg
28
+ # or run it without a persistent install (the CLI proxy package self-installs the binary):
29
+ npx @xberg-io/crawlberg-cli --help
30
+ uvx --from crawlberg-cli crawlberg --help
31
+ # or build from source (the mcp/api subcommands are non-default features):
32
+ cargo install crawlberg-cli --features all
33
+ ```
34
+
35
+ The published npm (`@xberg-io/crawlberg`) and PyPI (`crawlberg`) packages are language *library* bindings, not the CLI. To get the `crawlberg` binary, use Homebrew, the `crawlberg-cli` proxy packages (`npx`/`uvx`), or the from-source build shown above.
36
+
37
+ Headless fallback requires Chrome/Chromium on your system, which the CLI launches on demand. You can skip installing Chrome/Chromium if you only plan to use `--browser-mode never`.
38
+
39
+ ## Skills shipped
40
+
41
+ | Skill | Trigger |
42
+ |-------|---------|
43
+ | **crawlberg** | Crawl, scrape, and convert websites to Markdown using the local crawlberg CLI and its MCP server. Use when the user wants to fetch a page, follow links across a domain, enumerate URLs, or drive a real browser. Covers installation, the subcommands (scrape, crawl, map, interact, batch-scrape, batch-crawl, download, citations, version, mcp, serve), output formats (JSON + Markdown), browser fallback, and when to prefer the MCP server over shelling out. |
44
+ | **crawling-a-site** | Use when the user wants to follow links across a domain and capture every reachable page as Markdown. Covers `crawlberg crawl` with depth, page caps, concurrency, rate limiting, domain scoping, robots, and output selection. |
45
+ | **scraping-html-to-markdown** | Use when the user wants a single page rendered as clean Markdown plus structured metadata. Covers `crawlberg scrape <url>`, JSON vs Markdown output, what metadata is returned, and how to handle JS-heavy pages. |
46
+ | **mapping-urls** | Use when the user wants the list of URLs on a site rather than the page content — sitemap analysis, link planning, or seeding another tool. Covers `crawlberg map <url>` with `--limit`, `--search`, robots, output, and how it differs from a full crawl. |
47
+ | **automating-the-browser** | Use when extracting a page needs scripted interaction first — click, type, press a key, scroll, wait, screenshot, or run JS before capturing the DOM. Covers `crawlberg interact <url> --actions` with the real action schema, result shape, limits, and external-CDP options. |
48
+ | **serving-the-api** | Use when the user wants a long-running HTTP service for scrape/crawl/map instead of one-shot CLI calls or the MCP server. Covers `crawlberg serve`, the Firecrawl-v1-compatible endpoints, `--host`/`--port`, and when to prefer it. |
49
+ | **headless-fallback** | Use when a static fetch returns nothing useful and the page needs a real browser. Covers `--browser-mode auto\|always\|never`, external CDP via `--browser-endpoint`, symptoms of JS-only pages and WAF blocks, and the performance cost. |
50
+
51
+ ## MCP / CLI
52
+
53
+ The plugin wires up the `crawlberg` MCP server via `scripts/mcp-launch.sh`, which resolves or installs a version-matched binary, then runs `crawlberg mcp` over stdio. Override binary resolution with `CRAWLBERG_LAUNCHER=auto|download`.
54
+
55
+ The MCP server exposes nine tools:
56
+
57
+ - `scrape` — fetch and convert a single URL to Markdown or JSON.
58
+ - `crawl` — follow links across a domain, bounded by depth and page count.
59
+ - `map` — enumerate URLs from sitemaps and link extraction.
60
+ - `batch_scrape` — scrape multiple URLs concurrently.
61
+ - `batch_crawl` — crawl multiple seed URLs concurrently.
62
+ - `download` — download a document from a URL and report its file metadata.
63
+ - `interact` — drive a headless browser with click, type, scroll actions.
64
+ - `generate_citations` — rewrite markdown links as numbered citations with a reference list.
65
+ - `get_version` — report the crawlberg library version.
66
+
67
+ The CLI offers the same operations as subcommands — `scrape`, `crawl`, `map`, `interact`, `batch-scrape`, `batch-crawl`, `download`, `citations`, `version` — plus `serve` (a Firecrawl-v1-compatible REST API server) and `mcp` (the stdio MCP server). See the `crawlberg` skill for the full per-subcommand flag surface, and the `serving-the-api` skill for when to run the server instead of the CLI or MCP.
68
+
69
+ ## Configuration
70
+
71
+ Pass flags or use inline JSON via `--config`:
72
+
73
+ ```bash
74
+ crawlberg scrape https://example.com \
75
+ --format markdown \
76
+ --browser-mode auto \
77
+ --timeout 30000
78
+ ```
79
+
80
+ For complex configs, use JSON:
81
+
82
+ ```bash
83
+ crawlberg crawl https://example.com \
84
+ --config '{"max_depth":3,"max_pages":200,"max_concurrent":8,"respect_robots_txt":true}'
85
+ ```
86
+
87
+ See the `crawlberg` and `crawling-a-site` skills for the full flag surface.
88
+
89
+ ## Examples
90
+
91
+ Fetch a single page and print Markdown:
92
+
93
+ ```text
94
+ crawlberg scrape https://example.com/article --format markdown
95
+ ```
96
+
97
+ Crawl a site to depth 3, capped at 200 pages with up to 8 concurrent requests, staying on the seed domain:
98
+
99
+ ```text
100
+ crawlberg crawl https://example.com --depth 3 --max-pages 200 --concurrent 8 --stay-on-domain --format markdown
101
+ ```
102
+
103
+ Enumerate URLs from a sitemap:
104
+
105
+ ```text
106
+ crawlberg map https://example.com --limit 500
107
+ ```
108
+
109
+ ## Versioning
110
+
111
+ The plugin version tracks the marketplace `VERSION` file. See [CHANGELOG.md](../../CHANGELOG.md) for release notes.
112
+
113
+ ## License
114
+
115
+ MIT.
116
+
117
+ ## See also
118
+
119
+ - **Marketplace**: [xberg-io/plugins](https://github.com/xberg-io/plugins)
120
+ - **Upstream**: [xberg-io/crawlberg](https://github.com/xberg-io/crawlberg)
121
+ - **Sibling plugins**: [xberg](../xberg/README.md)
@@ -0,0 +1,14 @@
1
+ <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 256" width="256" height="256">
2
+ <rect width="256" height="256" rx="48" fill="#7C3AED"/>
3
+ <g stroke="white" stroke-width="12" stroke-linecap="round" fill="none">
4
+ <line x1="128" y1="64" x2="64" y2="128"/>
5
+ <line x1="128" y1="64" x2="192" y2="128"/>
6
+ <line x1="64" y1="128" x2="128" y2="192"/>
7
+ <line x1="192" y1="128" x2="128" y2="192"/>
8
+ <line x1="128" y1="64" x2="128" y2="192"/>
9
+ </g>
10
+ <circle cx="128" cy="64" r="20" fill="white"/>
11
+ <circle cx="64" cy="128" r="20" fill="white"/>
12
+ <circle cx="192" cy="128" r="20" fill="white"/>
13
+ <circle cx="128" cy="192" r="20" fill="white"/>
14
+ </svg>
Binary file
package/package.json ADDED
@@ -0,0 +1,41 @@
1
+ {
2
+ "name": "@xberg-io/opencode-crawlberg",
3
+ "version": "0.1.0",
4
+ "description": "OpenCode tools for crawling and scraping with the crawlberg CLI.",
5
+ "keywords": [
6
+ "crawling",
7
+ "html-to-markdown",
8
+ "opencode",
9
+ "web-scraping"
10
+ ],
11
+ "homepage": "https://github.com/xberg-io/plugins/tree/main/plugins/crawlberg",
12
+ "bugs": {
13
+ "url": "https://github.com/xberg-io/plugins/issues"
14
+ },
15
+ "license": "MIT",
16
+ "repository": {
17
+ "type": "git",
18
+ "url": "git+https://github.com/xberg-io/plugins.git",
19
+ "directory": "plugins/crawlberg"
20
+ },
21
+ "files": [
22
+ ".opencode/",
23
+ "assets/",
24
+ "README.md"
25
+ ],
26
+ "type": "module",
27
+ "main": ".opencode/plugins/crawlberg.js",
28
+ "exports": {
29
+ ".": "./.opencode/plugins/crawlberg.js"
30
+ },
31
+ "publishConfig": {
32
+ "access": "public",
33
+ "provenance": true
34
+ },
35
+ "dependencies": {
36
+ "@opencode-ai/plugin": "^1.17.8"
37
+ },
38
+ "engines": {
39
+ "node": ">=22.14.0"
40
+ }
41
+ }