npm - @xberg-io/opencode-crawlberg - Versions diffs - 0.1.0 - Mend

@xberg-io/opencode-crawlberg 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5) hide show

package/.opencode/plugins/crawlberg.js +237 -0
package/README.md +121 -0
package/assets/icon.svg +14 -0
package/assets/logo.png +0 -0
package/package.json +41 -0

package/.opencode/plugins/crawlberg.js ADDED Viewed

@@ -0,0 +1,237 @@
+import { spawn } from "node:child_process";
+import { tool } from "@opencode-ai/plugin";
+const schema = tool.schema;
+const outputFormat = schema
+  .enum(["json", "markdown"])
+  .default("json")
+  .describe("CLI output format.");
+const browserMode = schema
+  .enum(["auto", "always", "never"])
+  .default("auto")
+  .describe("When to use headless browser fallback.");
+function hasValue(value) {
+  return value !== undefined && value !== null && value !== "";
+}
+function pushOption(args, name, value) {
+  if (hasValue(value)) {
+    args.push(name, String(value));
+  }
+}
+function pushFlag(args, name, enabled) {
+  if (enabled) {
+    args.push(name);
+  }
+}
+function validateJson(value, name) {
+  if (!hasValue(value)) {
+    return;
+  }
+  try {
+    JSON.parse(value);
+  } catch (error) {
+    throw new Error(`${name} must be valid JSON: ${error.message}`);
+  }
+}
+function runCli(args, context) {
+  const directory = context?.directory ?? context?.worktree ?? process.cwd();
+  return new Promise((resolve, reject) => {
+    const child = spawn("crawlberg", args, {
+      cwd: directory,
+      env: process.env,
+      signal: context?.abort,
+      stdio: ["ignore", "pipe", "pipe"],
+    });
+    const stdout = [];
+    const stderr = [];
+    child.stdout.on("data", (chunk) => stdout.push(chunk));
+    child.stderr.on("data", (chunk) => stderr.push(chunk));
+    child.on("error", (error) => {
+      if (error.code === "ENOENT") {
+        resolve({
+          title: "crawlberg CLI not found",
+          output:
+            "Install the crawlberg CLI with `brew install xberg-io/tap/crawlberg`, or run it via `npx @xberg-io/crawlberg-cli` / `uvx --from crawlberg-cli crawlberg`.",
+          metadata: { exitCode: 127, command: "crawlberg", subcommand: args[0] },
+        });
+        return;
+      }
+      reject(error);
+    });
+    child.on("close", (exitCode, signal) => {
+      const stdoutText = Buffer.concat(stdout).toString("utf8").trim();
+      const stderrText = Buffer.concat(stderr).toString("utf8").trim();
+      const output = [stdoutText, stderrText && `stderr:\n${stderrText}`]
+        .filter(Boolean)
+        .join("\n\n");
+      resolve({
+        title: exitCode === 0 ? `crawlberg ${args[0]}` : `crawlberg ${args[0]} failed`,
+        output: output || "(no output)",
+        metadata: {
+          exitCode,
+          signal,
+          command: "crawlberg",
+          subcommand: args[0],
+        },
+      });
+    });
+  });
+}
+function pushSharedCrawlOptions(cliArgs, args) {
+  pushOption(cliArgs, "--format", args.format);
+  pushOption(cliArgs, "--timeout", args.timeout);
+  pushOption(cliArgs, "--browser-mode", args.browser_mode);
+  pushOption(cliArgs, "--browser-endpoint", args.browser_endpoint);
+  pushOption(cliArgs, "--user-agent", args.user_agent);
+  pushOption(cliArgs, "--proxy", args.proxy);
+  pushFlag(cliArgs, "--respect-robots-txt", args.respect_robots_txt);
+  pushOption(cliArgs, "--config", args.config);
+}
+export const CrawlbergPlugin = async () => ({
+  tool: {
+    crawlberg_scrape: tool({
+      description: "Scrape one URL to JSON or Markdown with the crawlberg CLI.",
+      args: {
+        url: schema.string().url().describe("URL to scrape."),
+        format: outputFormat,
+        timeout: schema
+          .number()
+          .int()
+          .positive()
+          .max(600000)
+          .default(30000)
+          .describe("Request timeout in ms."),
+        browser_mode: browserMode,
+        browser_endpoint: schema
+          .string()
+          .url()
+          .optional()
+          .describe("Optional CDP WebSocket endpoint."),
+        user_agent: schema.string().min(1).optional().describe("Optional HTTP user agent."),
+        proxy: schema.string().url().optional().describe("Optional proxy URL."),
+        respect_robots_txt: schema.boolean().default(false).describe("Respect robots.txt."),
+        config: schema.string().min(2).optional().describe("Optional CrawlConfig JSON."),
+      },
+      async execute(args, context) {
+        validateJson(args.config, "config");
+        const cliArgs = ["scrape", args.url];
+        pushSharedCrawlOptions(cliArgs, args);
+        return runCli(cliArgs, context);
+      },
+    }),
+    crawlberg_crawl: tool({
+      description: "Crawl one or more seed URLs to JSON or Markdown with the crawlberg CLI.",
+      args: {
+        urls: schema.array(schema.string().url()).min(1).describe("Seed URLs to crawl."),
+        depth: schema.number().int().min(0).max(20).default(2).describe("Maximum crawl depth."),
+        max_pages: schema.number().int().positive().optional().describe("Maximum pages to crawl."),
+        concurrent: schema
+          .number()
+          .int()
+          .positive()
+          .max(256)
+          .default(10)
+          .describe("Maximum concurrent requests."),
+        rate_limit: schema
+          .number()
+          .int()
+          .min(0)
+          .default(200)
+          .describe("Delay between requests in ms."),
+        stay_on_domain: schema
+          .boolean()
+          .default(false)
+          .describe("Restrict crawling to the seed domain."),
+        format: outputFormat,
+        timeout: schema
+          .number()
+          .int()
+          .positive()
+          .max(600000)
+          .default(30000)
+          .describe("Request timeout in ms."),
+        browser_mode: browserMode,
+        browser_endpoint: schema
+          .string()
+          .url()
+          .optional()
+          .describe("Optional CDP WebSocket endpoint."),
+        user_agent: schema.string().min(1).optional().describe("Optional HTTP user agent."),
+        proxy: schema.string().url().optional().describe("Optional proxy URL."),
+        respect_robots_txt: schema.boolean().default(false).describe("Respect robots.txt."),
+        config: schema.string().min(2).optional().describe("Optional CrawlConfig JSON."),
+      },
+      async execute(args, context) {
+        validateJson(args.config, "config");
+        const cliArgs = [
+          "crawl",
+          ...args.urls,
+          "--depth",
+          String(args.depth),
+          "--concurrent",
+          String(args.concurrent),
+        ];
+        pushOption(cliArgs, "--max-pages", args.max_pages);
+        pushOption(cliArgs, "--rate-limit", args.rate_limit);
+        pushFlag(cliArgs, "--stay-on-domain", args.stay_on_domain);
+        pushSharedCrawlOptions(cliArgs, args);
+        return runCli(cliArgs, context);
+      },
+    }),
+    crawlberg_map: tool({
+      description: "Enumerate URLs from sitemaps and link extraction with the crawlberg CLI.",
+      args: {
+        url: schema.string().url().describe("URL to map."),
+        limit: schema.number().int().positive().optional().describe("Maximum URLs to return."),
+        search: schema.string().min(1).optional().describe("Filter URLs by substring."),
+        format: outputFormat,
+        timeout: schema
+          .number()
+          .int()
+          .positive()
+          .max(600000)
+          .default(30000)
+          .describe("Request timeout in ms."),
+        browser_mode: browserMode,
+        browser_endpoint: schema
+          .string()
+          .url()
+          .optional()
+          .describe("Optional CDP WebSocket endpoint."),
+        respect_robots_txt: schema.boolean().default(false).describe("Respect robots.txt."),
+        config: schema.string().min(2).optional().describe("Optional CrawlConfig JSON."),
+      },
+      async execute(args, context) {
+        validateJson(args.config, "config");
+        const cliArgs = ["map", args.url];
+        pushOption(cliArgs, "--limit", args.limit);
+        pushOption(cliArgs, "--search", args.search);
+        pushOption(cliArgs, "--format", args.format);
+        pushOption(cliArgs, "--timeout", args.timeout);
+        pushOption(cliArgs, "--browser-mode", args.browser_mode);
+        pushOption(cliArgs, "--browser-endpoint", args.browser_endpoint);
+        pushFlag(cliArgs, "--respect-robots-txt", args.respect_robots_txt);
+        pushOption(cliArgs, "--config", args.config);
+        return runCli(cliArgs, context);
+      },
+    }),
+  },
+});
+export default CrawlbergPlugin;

package/README.md ADDED Viewed

@@ -0,0 +1,121 @@
+# crawlberg
+Crawl, scrape, and convert websites to Markdown using the local `crawlberg` CLI in your agent.
+<!-- TODO: screenshot -->
+## Install
+### From the marketplace (recommended)
+Pending review for the official Claude marketplace.
+Until then, add the self-hosted marketplace and install from it:
+```text
+/plugin marketplace add xberg-io/plugins
+/plugin install crawlberg@xberg
+```
+### Binary requirement
+The MCP server runs through an auto-installing launcher (`scripts/mcp-launch.sh`): on first use it reuses any working `crawlberg` binary already on `PATH`, then tries `npx`/`uvx`, then Homebrew, then a prebuilt download. No manual install is required for MCP.
+To install the CLI yourself (recommended for direct CLI use, and reused by the launcher):
+```bash
+brew install xberg-io/tap/crawlberg
+# or run it without a persistent install (the CLI proxy package self-installs the binary):
+npx @xberg-io/crawlberg-cli --help
+uvx --from crawlberg-cli crawlberg --help
+# or build from source (the mcp/api subcommands are non-default features):
+cargo install crawlberg-cli --features all
+```
+The published npm (`@xberg-io/crawlberg`) and PyPI (`crawlberg`) packages are language *library* bindings, not the CLI — install via Homebrew or the from-source build above for the `crawlberg` binary.
+Headless fallback requires Chrome/Chromium on your system; the CLI launches it on demand. You can skip installing a browser if you only plan to use `--browser-mode never`.
+## Skills shipped
+| Skill | Trigger |
+|-------|---------|
+| **crawlberg** | Crawl, scrape, and convert websites to Markdown using the local crawlberg CLI and its MCP server. Use when the user wants to fetch a page, follow links across a domain, enumerate URLs, or drive a real browser. Covers installation, the subcommands (scrape, crawl, map, interact, batch-scrape, batch-crawl, download, citations, version, mcp, serve), output formats (JSON + Markdown), browser fallback, and when to prefer the MCP server over shelling out. |
+| **crawling-a-site** | Use when the user wants to follow links across a domain and capture every reachable page as Markdown. Covers `crawlberg crawl` with depth, page caps, concurrency, rate limiting, domain scoping, robots, and output selection. |
+| **scraping-html-to-markdown** | Use when the user wants a single page rendered as clean Markdown plus structured metadata. Covers `crawlberg scrape <url>`, JSON vs Markdown output, what metadata is returned, and how to handle JS-heavy pages. |
+| **mapping-urls** | Use when the user wants the list of URLs on a site rather than the page content — sitemap analysis, link planning, or seeding another tool. Covers `crawlberg map <url>` with `--limit`, `--search`, robots, output, and how it differs from a full crawl. |
+| **automating-the-browser** | Use when extracting a page needs scripted interaction first — click, type, press a key, scroll, wait, screenshot, or run JS before capturing the DOM. Covers `crawlberg interact <url> --actions` with the real action schema, result shape, limits, and external-CDP options. |
+| **serving-the-api** | Use when the user wants a long-running HTTP service for scrape/crawl/map instead of one-shot CLI calls or the MCP server. Covers `crawlberg serve`, the Firecrawl-v1-compatible endpoints, `--host`/`--port`, and when to prefer it. |
+| **headless-fallback** | Use when a static fetch returns nothing useful and the page needs a real browser. Covers `--browser-mode auto\|always\|never`, external CDP via `--browser-endpoint`, symptoms of JS-only pages and WAF blocks, and the performance cost. |
+## MCP / CLI
+The plugin wires up the `crawlberg` MCP server via `scripts/mcp-launch.sh`, which resolves or installs a version-matched binary, then runs `crawlberg mcp` over stdio. Override binary resolution with `CRAWLBERG_LAUNCHER=auto|download`.
+The MCP server exposes nine tools:
+- `scrape` — fetch and convert a single URL to Markdown or JSON.
+- `crawl` — follow links across a domain, bounded by depth and page count.
+- `map` — enumerate URLs from sitemaps and link extraction.
+- `batch_scrape` — scrape multiple URLs concurrently.
+- `batch_crawl` — crawl multiple seed URLs concurrently.
+- `download` — download a document from a URL and report its file metadata.
+- `interact` — drive a headless browser with click, type, scroll actions.
+- `generate_citations` — rewrite markdown links as numbered citations with a reference list.
+- `get_version` — report the crawlberg library version.
+The CLI offers the same operations as subcommands — `scrape`, `crawl`, `map`, `interact`, `batch-scrape`, `batch-crawl`, `download`, `citations`, `version` — plus `serve` (a Firecrawl-v1-compatible REST API server) and `mcp` (the stdio MCP server). See the `crawlberg` skill for the full per-subcommand flag surface, and the `serving-the-api` skill for when to run the server instead of the CLI or MCP.
+## Configuration
+Pass flags or use inline JSON via `--config`:
+```bash
+crawlberg scrape https://example.com \
+  --format markdown \
+  --browser-mode auto \
+  --timeout 30000
+```
+For complex configs, use JSON:
+```bash
+crawlberg crawl https://example.com \
+  --config '{"max_depth":3,"max_pages":200,"max_concurrent":8,"respect_robots_txt":true}'
+```
+See the `crawlberg` and `crawling-a-site` skills for the full flag surface.
+## Examples
+Fetch a single page and print Markdown:
+```text
+crawlberg scrape https://example.com/article --format markdown
+```
+Crawl a site to depth 3, capped at 200 pages with 8 concurrent requests, staying on the seed domain:
+```text
+crawlberg crawl https://example.com --depth 3 --max-pages 200 --concurrent 8 --stay-on-domain --format markdown
+```
+Enumerate URLs from a sitemap:
+```text
+crawlberg map https://example.com --limit 500
+```
+## Versioning
+The plugin version tracks the marketplace `VERSION` file. See [CHANGELOG.md](../../CHANGELOG.md) for release notes.
+## License
+MIT.
+## See also
+- **Marketplace**: [xberg-io/plugins](https://github.com/xberg-io/plugins)
+- **Upstream**: [xberg-io/crawlberg](https://github.com/xberg-io/crawlberg)
+- **Sibling plugins**: [xberg](../xberg/README.md)

package/assets/icon.svg ADDED Viewed

@@ -0,0 +1,14 @@
+<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 256" width="256" height="256">
+  <rect width="256" height="256" rx="48" fill="#7C3AED"/>
+  <g stroke="white" stroke-width="12" stroke-linecap="round" fill="none">
+    <line x1="128" y1="64"  x2="64"  y2="128"/>
+    <line x1="128" y1="64"  x2="192" y2="128"/>
+    <line x1="64"  y1="128" x2="128" y2="192"/>
+    <line x1="192" y1="128" x2="128" y2="192"/>
+    <line x1="128" y1="64"  x2="128" y2="192"/>
+  </g>
+  <circle cx="128" cy="64"  r="20" fill="white"/>
+  <circle cx="64"  cy="128" r="20" fill="white"/>
+  <circle cx="192" cy="128" r="20" fill="white"/>
+  <circle cx="128" cy="192" r="20" fill="white"/>
+</svg>

package/assets/logo.png ADDED Viewed

Binary file

package/package.json ADDED Viewed

@@ -0,0 +1,41 @@
+{
+  "name": "@xberg-io/opencode-crawlberg",
+  "version": "0.1.0",
+  "description": "OpenCode tools for crawling and scraping with the crawlberg CLI.",
+  "keywords": [
+    "crawling",
+    "html-to-markdown",
+    "opencode",
+    "web-scraping"
+  ],
+  "homepage": "https://github.com/xberg-io/plugins/tree/main/plugins/crawlberg",
+  "bugs": {
+    "url": "https://github.com/xberg-io/plugins/issues"
+  },
+  "license": "MIT",
+  "repository": {
+    "type": "git",
+    "url": "git+https://github.com/xberg-io/plugins.git",
+    "directory": "plugins/crawlberg"
+  },
+  "files": [
+    ".opencode/",
+    "assets/",
+    "README.md"
+  ],
+  "type": "module",
+  "main": ".opencode/plugins/crawlberg.js",
+  "exports": {
+    ".": "./.opencode/plugins/crawlberg.js"
+  },
+  "publishConfig": {
+    "access": "public",
+    "provenance": true
+  },
+  "dependencies": {
+    "@opencode-ai/plugin": "^1.17.8"
+  },
+  "engines": {
+    "node": ">=22.14.0"
+  }
+}