crw-0.2.1.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
crw-0.2.1/.gitignore ADDED
@@ -0,0 +1,7 @@
1
+ /target
2
+ .claude
3
+ PROGRESS.md
4
+ .gstack/
5
+ python/__pycache__/
6
+ python/.venv/
7
+ python/uv.lock
crw-0.2.1/PKG-INFO ADDED
@@ -0,0 +1,50 @@
1
+ Metadata-Version: 2.4
2
+ Name: crw
3
+ Version: 0.2.1
4
+ Summary: Python SDK for CRW web scraper — scrape, crawl, and map any website from Python
5
+ Project-URL: Homepage, https://github.com/us/crw
6
+ Project-URL: Documentation, https://us.github.io/crw
7
+ Project-URL: Repository, https://github.com/us/crw
8
+ License-Expression: MIT
9
+ Keywords: ai-agent,crawler,firecrawl,mcp,scraper,web-scraping
10
+ Classifier: Development Status :: 4 - Beta
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: License :: OSI Approved :: MIT License
13
+ Classifier: Programming Language :: Python :: 3
14
+ Classifier: Topic :: Internet :: WWW/HTTP
15
+ Classifier: Topic :: Software Development :: Libraries
16
+ Requires-Python: >=3.9
17
+ Requires-Dist: platformdirs>=3.0
18
+ Description-Content-Type: text/markdown
19
+
20
+ # crw
21
+
22
+ Python SDK for [CRW](https://github.com/us/crw) — the open-source web scraper built for AI agents.
23
+
24
+ ## Install
25
+
26
+ ```bash
27
+ pip install crw
28
+ ```
29
+
30
+ ## Usage
31
+
32
+ ```python
33
+ from crw import CrwClient
34
+
35
+ # Zero-config (downloads crw-mcp binary automatically):
36
+ client = CrwClient()
37
+ result = client.scrape("https://example.com")
38
+ print(result["markdown"])
39
+
40
+ # Or connect to a remote server:
41
+ client = CrwClient(api_url="https://fastcrw.com/api", api_key="fc-...")
42
+ ```
43
+
44
+ ## MCP Server
45
+
46
+ After installing, you can also use `crw-mcp` as an MCP server:
47
+
48
+ ```bash
49
+ crw-mcp # starts stdio MCP server
50
+ ```
crw-0.2.1/README.md ADDED
@@ -0,0 +1,31 @@
1
+ # crw
2
+
3
+ Python SDK for [CRW](https://github.com/us/crw) — the open-source web scraper built for AI agents.
4
+
5
+ ## Install
6
+
7
+ ```bash
8
+ pip install crw
9
+ ```
10
+
11
+ ## Usage
12
+
13
+ ```python
14
+ from crw import CrwClient
15
+
16
+ # Zero-config (downloads crw-mcp binary automatically):
17
+ client = CrwClient()
18
+ result = client.scrape("https://example.com")
19
+ print(result["markdown"])
20
+
21
+ # Or connect to a remote server:
22
+ client = CrwClient(api_url="https://fastcrw.com/api", api_key="fc-...")
23
+ ```
24
+
25
+ ## MCP Server
26
+
27
+ After installing, you can also use `crw-mcp` as an MCP server:
28
+
29
+ ```bash
30
+ crw-mcp # starts stdio MCP server
31
+ ```
@@ -0,0 +1,32 @@
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "crw"
7
+ version = "0.2.1"
8
+ description = "Python SDK for CRW web scraper — scrape, crawl, and map any website from Python"
9
+ readme = "README.md"
10
+ requires-python = ">=3.9"
11
+ license = "MIT"
12
+ keywords = ["web-scraping", "mcp", "ai-agent", "crawler", "firecrawl", "scraper"]
13
+ classifiers = [
14
+ "Development Status :: 4 - Beta",
15
+ "Intended Audience :: Developers",
16
+ "License :: OSI Approved :: MIT License",
17
+ "Programming Language :: Python :: 3",
18
+ "Topic :: Internet :: WWW/HTTP",
19
+ "Topic :: Software Development :: Libraries",
20
+ ]
21
+ dependencies = ["platformdirs>=3.0"]
22
+
23
+ [project.urls]
24
+ Homepage = "https://github.com/us/crw"
25
+ Documentation = "https://us.github.io/crw"
26
+ Repository = "https://github.com/us/crw"
27
+
28
+ [project.scripts]
29
+ crw-mcp = "crw.__main__:main"
30
+
31
+ [tool.hatch.build.targets.wheel]
32
+ packages = ["src/crw"]
@@ -0,0 +1,6 @@
"""CRW Python SDK — scrape, crawl, and map any website."""

from crw.client import CrwClient
from crw.exceptions import CrwBinaryNotFoundError, CrwError, CrwTimeoutError

# Names re-exported as the package's public API.
__all__ = ["CrwClient", "CrwError", "CrwBinaryNotFoundError", "CrwTimeoutError"]
@@ -0,0 +1,15 @@
1
+ """CLI entry point — exec crw-mcp binary."""
2
+
3
+ import os
4
+ import sys
5
+
6
+ from crw._binary import ensure_binary
7
+
8
+
def main() -> None:
    """Resolve the crw-mcp binary and hand this process over to it."""
    binary = ensure_binary()
    # execvp replaces the current process image, so this call never returns;
    # all CLI arguments are forwarded untouched to the native binary.
    os.execvp(str(binary), [str(binary)] + sys.argv[1:])


if __name__ == "__main__":
    main()
@@ -0,0 +1,115 @@
1
+ """Download, cache, and locate the crw-mcp binary."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import os
6
+ import stat
7
+ import tarfile
8
+ import zipfile
9
+ from io import BytesIO
10
+ from pathlib import Path
11
+ from urllib.request import Request, urlopen
12
+
13
+ from platformdirs import user_cache_dir
14
+
15
+ from crw._platform import BINARY_NAME, get_asset_name
16
+ from crw.exceptions import CrwBinaryNotFoundError
17
+
# Binaries are published under the installed package's version; fall back to
# a pinned version string for development checkouts without metadata.
try:
    from importlib.metadata import version as _pkg_version

    BINARY_VERSION = _pkg_version("crw")
except Exception:
    BINARY_VERSION = "0.2.1"  # fallback for development

GITHUB_REPO = "us/crw"
DOWNLOAD_URL = f"https://github.com/{GITHUB_REPO}/releases/download/v{BINARY_VERSION}"
25
+
26
+
def _cache_dir() -> Path:
    """Per-version cache directory where the downloaded binary is stored."""
    base = Path(user_cache_dir("crw"))
    return base / f"v{BINARY_VERSION}"
29
+
30
+
def _cached_binary() -> Path | None:
    """Return path to cached binary if it exists and is executable."""
    candidate = _cache_dir() / BINARY_NAME
    usable = candidate.is_file() and os.access(candidate, os.X_OK)
    return candidate if usable else None
37
+
38
+
def _download_binary() -> Path:
    """Download the crw-mcp binary from GitHub Releases and cache it.

    Returns:
        Path to the executable binary in the per-version cache directory.

    Raises:
        CrwBinaryNotFoundError: no prebuilt asset exists for this platform,
            the archive format is unrecognized, or the archive does not
            contain the crw-mcp binary.
    """
    asset = get_asset_name()
    if asset is None:
        raise CrwBinaryNotFoundError(
            "No prebuilt binary for this platform. "
            "Install from source: cargo install crw-mcp"
        )

    url = f"{DOWNLOAD_URL}/{asset}"
    cache = _cache_dir()
    cache.mkdir(parents=True, exist_ok=True)

    req = Request(url, headers={"User-Agent": f"crw-python/{BINARY_VERSION}"})
    with urlopen(req, timeout=120) as resp:
        data = resp.read()

    # Extract the binary from the downloaded archive.
    if asset.endswith(".tar.gz"):
        with tarfile.open(fileobj=BytesIO(data), mode="r:gz") as tar:
            for member in tar.getmembers():
                # Require a regular file so a directory named crw-mcp
                # cannot match.
                if member.isfile() and member.name.endswith("crw-mcp"):
                    # Renaming to the bare binary name also strips any path
                    # components, preventing archive path traversal.
                    member.name = BINARY_NAME
                    tar.extract(member, path=cache)
                    break
            else:
                raise CrwBinaryNotFoundError(f"crw-mcp not found in {asset}")
    elif asset.endswith(".zip"):
        with zipfile.ZipFile(BytesIO(data)) as zf:
            for name in zf.namelist():
                if name.endswith(("crw-mcp.exe", "crw-mcp")):
                    target = cache / BINARY_NAME
                    target.write_bytes(zf.read(name))
                    break
            else:
                raise CrwBinaryNotFoundError(f"crw-mcp not found in {asset}")
    else:
        # Previously an unknown suffix fell through and chmod() below raised
        # an opaque FileNotFoundError; fail with a clear message instead.
        raise CrwBinaryNotFoundError(f"Unrecognized archive format: {asset}")

    binary = cache / BINARY_NAME
    # Archives don't always preserve the exec bit; grant it explicitly.
    binary.chmod(binary.stat().st_mode | stat.S_IEXEC | stat.S_IXGRP | stat.S_IXOTH)
    return binary
79
+
80
+
def _is_native_binary(path: Path) -> bool:
    """Check if a file is a native executable (not a Python/shell script).

    Used to tell a real crw-mcp binary on PATH (e.g. from ``cargo install``)
    apart from this package's own console-script shim.
    """
    try:
        with open(path, "rb") as f:
            header = f.read(4)
    except OSError:
        return False
    # PE/COFF (Windows): only the 2-byte "MZ" magic is fixed; the bytes that
    # follow it vary between linkers, so matching "MZ\x90\x00" exactly (as
    # before) rejected perfectly valid Windows executables.
    if header[:2] == b"MZ":
        return True
    # ELF, Mach-O thin (32/64-bit little-endian) or Mach-O fat/universal.
    return header in (
        b"\x7fELF",
        b"\xcf\xfa\xed\xfe",
        b"\xce\xfa\xed\xfe",
        b"\xca\xfe\xba\xbe",
    )
90
+
91
+
def ensure_binary() -> Path:
    """Return path to the crw-mcp binary, downloading if necessary.

    Resolution order:
      1. ``CRW_BINARY`` environment override (must point at an existing file).
      2. A native crw-mcp anywhere on PATH (e.g. from ``cargo install``),
         skipping script shims such as this package's own entry point.
      3. The per-version download cache.
      4. A fresh download from GitHub Releases.

    Raises:
        CrwBinaryNotFoundError: the override points at a missing file, or no
            binary could be located or downloaded.
    """
    override = os.environ.get("CRW_BINARY")
    if override:
        path = Path(override)
        if not path.is_file():
            raise CrwBinaryNotFoundError(f"CRW_BINARY={override} does not exist")
        return path

    # Scan every PATH entry; _is_native_binary filters out wrapper scripts.
    for directory in os.environ.get("PATH", "").split(os.pathsep):
        candidate = Path(directory) / BINARY_NAME
        if candidate.is_file() and _is_native_binary(candidate):
            return candidate

    cached = _cached_binary()
    if cached is not None:
        return cached

    return _download_binary()
@@ -0,0 +1,22 @@
"""Platform detection and GitHub Release asset mapping."""

# Required so the `str | None` / builtin-generic annotations below stay lazy
# strings: without it this module raises TypeError at import time on
# Python 3.9, which pyproject's requires-python = ">=3.9" promises to support.
from __future__ import annotations

import platform
import sys

# Maps (platform.system(), platform.machine()) to the GitHub Release asset
# filename published for that target.
PLATFORM_MAP: dict[tuple[str, str], str] = {
    ("Darwin", "arm64"): "crw-mcp-darwin-arm64.tar.gz",
    ("Darwin", "x86_64"): "crw-mcp-darwin-x64.tar.gz",
    ("Linux", "x86_64"): "crw-mcp-linux-x64.tar.gz",
    ("Linux", "aarch64"): "crw-mcp-linux-arm64.tar.gz",
    ("Windows", "AMD64"): "crw-mcp-win32-x64.zip",
    ("Windows", "ARM64"): "crw-mcp-win32-arm64.zip",
}

# Windows executables carry the .exe suffix.
BINARY_NAME = "crw-mcp.exe" if sys.platform == "win32" else "crw-mcp"


def get_asset_name() -> str | None:
    """Return the GitHub Release asset filename for the current platform.

    Returns:
        The asset filename, or None when no prebuilt binary is published
        for this (system, machine) combination.
    """
    key = (platform.system(), platform.machine())
    return PLATFORM_MAP.get(key)
@@ -0,0 +1,247 @@
1
+ """CRW client — subprocess (embedded) or HTTP mode."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import json
6
+ import subprocess
7
+ import time
8
+ from typing import Any
9
+
10
+ from crw._binary import ensure_binary
11
+ from crw.exceptions import CrwApiError, CrwError, CrwTimeoutError
12
+
# Monotonically increasing JSON-RPC request id, shared process-wide by every
# CrwClient instance.
_REQUEST_ID = 0


def _next_id() -> int:
    """Return the next JSON-RPC request id from the process-wide counter."""
    global _REQUEST_ID
    _REQUEST_ID += 1
    return _REQUEST_ID
20
+
21
+
class CrwClient:
    """CRW web scraper client.

    Operates in one of two modes:

    * Subprocess mode (default): spawns the crw-mcp binary and speaks
      JSON-RPC over its stdin/stdout. No server required.
    * HTTP mode: sends requests to a remote CRW server when ``api_url``
      is given.

    Args:
        api_url: CRW server URL for HTTP mode. If None, uses subprocess mode
            (spawns crw-mcp binary, no server required).
        api_key: API key for authentication (HTTP mode or fastcrw.com).

    Examples:
        # Subprocess mode (zero config, no server):
        client = CrwClient()
        result = client.scrape("https://example.com")

        # HTTP mode (remote server):
        client = CrwClient(api_url="https://fastcrw.com/api", api_key="fc-...")
        result = client.scrape("https://example.com")
    """

    def __init__(self, api_url: str | None = None, api_key: str | None = None):
        self._api_url = api_url
        self._api_key = api_key
        # Lazily started crw-mcp subprocess (subprocess mode only).
        self._process: subprocess.Popen | None = None

    # --- Public API ---

    def scrape(
        self,
        url: str,
        formats: list[str] | None = None,
        only_main_content: bool = True,
        include_tags: list[str] | None = None,
        exclude_tags: list[str] | None = None,
        **kwargs: Any,
    ) -> dict:
        """Scrape a single URL and return its content.

        Args:
            url: Page to scrape.
            formats: Output formats to request (forwarded as ``formats``).
            only_main_content: Ask the backend to strip boilerplate.
            include_tags: HTML tags to keep (forwarded as ``includeTags``).
            exclude_tags: HTML tags to drop (forwarded as ``excludeTags``).
            **kwargs: Extra options forwarded verbatim to the backend.

        Returns:
            Dict with keys like 'markdown', 'html', 'metadata', etc.
        """
        args: dict[str, Any] = {"url": url, "onlyMainContent": only_main_content}
        if formats:
            args["formats"] = formats
        if include_tags:
            args["includeTags"] = include_tags
        if exclude_tags:
            args["excludeTags"] = exclude_tags
        args.update(kwargs)

        if self._api_url:
            return self._http_post("/v1/scrape", args)
        return self._tool_call("crw_scrape", args)

    def crawl(
        self,
        url: str,
        max_depth: int = 2,
        max_pages: int = 10,
        poll_interval: float = 2.0,
        timeout: float = 300.0,
        **kwargs: Any,
    ) -> list[dict]:
        """Crawl a website and return all page results.

        Starts an async crawl, polls for completion, and returns all results.

        Args:
            url: Root URL to crawl.
            max_depth: Maximum link depth (forwarded as ``maxDepth``).
            max_pages: Maximum page count (forwarded as ``maxPages``).
            poll_interval: Seconds between status checks.
            timeout: Overall deadline in seconds.
            **kwargs: Extra options forwarded verbatim to the backend.

        Raises:
            CrwTimeoutError: the crawl did not finish within ``timeout``.
            CrwError: the backend reported a failure or returned no job id.
        """
        args: dict[str, Any] = {"url": url, "maxDepth": max_depth, "maxPages": max_pages}
        args.update(kwargs)

        if self._api_url:
            return self._http_crawl(args, poll_interval, timeout)

        # Subprocess mode: start the crawl, then poll its status.
        result = self._tool_call("crw_crawl", args)
        job_id = result.get("id")
        if not job_id:
            raise CrwError(f"Crawl did not return job ID: {result}")

        return self._poll_crawl(job_id, poll_interval, timeout)

    def map(
        self,
        url: str,
        max_depth: int = 2,
        use_sitemap: bool = True,
        **kwargs: Any,
    ) -> list[str]:
        """Discover URLs on a website.

        Args:
            url: Root URL to map.
            max_depth: Maximum link depth (forwarded as ``maxDepth``).
            use_sitemap: Also consult sitemap.xml (forwarded as ``useSitemap``).
            **kwargs: Extra options forwarded verbatim to the backend.

        Returns:
            List of discovered URLs.
        """
        args: dict[str, Any] = {"url": url, "maxDepth": max_depth, "useSitemap": use_sitemap}
        args.update(kwargs)

        if self._api_url:
            data = self._http_post("/v1/map", args)
            return data.get("links", [])

        result = self._tool_call("crw_map", args)
        return result.get("links", [])

    def close(self) -> None:
        """Shut down the subprocess if running. Safe to call repeatedly."""
        proc = self._process
        if proc is not None and proc.poll() is None:
            if proc.stdin:
                proc.stdin.close()  # EOF on stdin lets the server exit cleanly
            try:
                proc.wait(timeout=5)
            except subprocess.TimeoutExpired:
                proc.terminate()
                try:
                    proc.wait(timeout=5)
                except subprocess.TimeoutExpired:
                    # terminate() was ignored — previously this re-raised and
                    # could leave a zombie; force-kill as a last resort.
                    proc.kill()
                    proc.wait()
        self._process = None

    def __enter__(self) -> CrwClient:
        return self

    def __exit__(self, *_: Any) -> None:
        self.close()

    def __del__(self) -> None:
        # Best-effort cleanup only: never raise from a finalizer. Also guards
        # the case where __init__ never ran (self._process missing) and the
        # teardown races of interpreter shutdown.
        try:
            self.close()
        except Exception:
            pass

    # --- Subprocess (embedded) mode ---

    def _ensure_process(self) -> subprocess.Popen:
        """Start (or restart after exit) the crw-mcp subprocess."""
        if self._process is None or self._process.poll() is not None:
            binary = ensure_binary()
            self._process = subprocess.Popen(
                [str(binary)],
                stdin=subprocess.PIPE,
                stdout=subprocess.PIPE,
                stderr=subprocess.DEVNULL,
                text=True,
            )
        return self._process

    def _jsonrpc(self, method: str, params: dict | None = None) -> Any:
        """Send one JSON-RPC request over stdio and return its ``result``.

        NOTE(review): assumes crw-mcp accepts tool calls without a prior MCP
        initialize handshake — confirm against the server implementation.

        Raises:
            CrwError: the subprocess closed its stdout before answering.
            CrwApiError: the server returned a JSON-RPC error object.
        """
        proc = self._ensure_process()
        req = {
            "jsonrpc": "2.0",
            "id": _next_id(),
            "method": method,
            "params": params or {},
        }
        proc.stdin.write(json.dumps(req) + "\n")
        proc.stdin.flush()

        line = proc.stdout.readline()
        if not line:
            raise CrwError("crw-mcp process closed unexpectedly")

        resp = json.loads(line)
        if "error" in resp:
            raise CrwApiError(resp["error"].get("message", str(resp["error"])))
        return resp.get("result")

    def _tool_call(self, tool_name: str, arguments: dict) -> dict:
        """Invoke an MCP tool and decode its JSON text payload.

        Raises:
            CrwError: empty response.
            CrwApiError: the tool reported an error (``isError``).
        """
        result = self._jsonrpc("tools/call", {"name": tool_name, "arguments": arguments})
        if not result or not result.get("content"):
            raise CrwError(f"Empty response from {tool_name}")

        content = result["content"][0]
        if result.get("isError"):
            raise CrwApiError(content.get("text", "Unknown error"))

        return json.loads(content["text"])

    def _poll_crawl(self, job_id: str, poll_interval: float, timeout: float) -> list[dict]:
        """Poll a subprocess-mode crawl until completion; return its pages."""
        start = time.monotonic()
        while True:
            if time.monotonic() - start > timeout:
                raise CrwTimeoutError(f"Crawl {job_id} timed out after {timeout}s")

            result = self._tool_call("crw_check_crawl_status", {"id": job_id})
            status = result.get("status")

            if status == "completed":
                return result.get("data", [])
            if status == "failed":
                raise CrwError(f"Crawl failed: {result.get('error', 'unknown')}")

            time.sleep(poll_interval)

    # --- HTTP mode ---

    def _http_request(self, method: str, path: str, body: dict | None = None) -> dict:
        """Issue one HTTP request against the configured server.

        Raises:
            CrwApiError: HTTP error status (with ``status_code`` set) or an
                API-level failure response.
        """
        import urllib.error
        import urllib.request

        url = f"{self._api_url.rstrip('/')}{path}"
        headers = {"Content-Type": "application/json"}
        if self._api_key:
            headers["Authorization"] = f"Bearer {self._api_key}"

        data = json.dumps(body).encode() if body else None
        req = urllib.request.Request(url, data=data, headers=headers, method=method)

        try:
            with urllib.request.urlopen(req, timeout=120) as resp:
                result = json.loads(resp.read())
        except urllib.error.HTTPError as exc:
            # Previously a 4xx/5xx leaked the raw HTTPError past callers that
            # only catch CrwError; surface it as CrwApiError (which already
            # carries status_code for exactly this purpose).
            raise CrwApiError(f"HTTP {exc.code}: {exc.reason}", status_code=exc.code) from exc

        if not result.get("success"):
            raise CrwApiError(result.get("error", "API error"))
        return result.get("data", result)

    def _http_post(self, path: str, body: dict) -> dict:
        return self._http_request("POST", path, body)

    def _http_get(self, path: str) -> dict:
        return self._http_request("GET", path)

    def _http_crawl(self, args: dict, poll_interval: float, timeout: float) -> list[dict]:
        """Start an HTTP-mode crawl and poll it until completion."""
        result = self._http_post("/v1/crawl", args)
        job_id = result.get("id")
        if not job_id:
            raise CrwError(f"Crawl did not return job ID: {result}")

        start = time.monotonic()
        while True:
            if time.monotonic() - start > timeout:
                raise CrwTimeoutError(f"Crawl {job_id} timed out after {timeout}s")

            status_result = self._http_get(f"/v1/crawl/{job_id}")
            status = status_result.get("status")

            if status == "completed":
                return status_result.get("data", [])
            if status == "failed":
                raise CrwError(f"Crawl failed: {status_result.get('error', 'unknown')}")

            time.sleep(poll_interval)
@@ -0,0 +1,21 @@
"""CRW SDK exceptions."""

# Required so the `int | None` annotation below stays a lazy string: without
# it this module raises TypeError at import time on Python 3.9, which
# pyproject's requires-python = ">=3.9" promises to support.
from __future__ import annotations


class CrwError(Exception):
    """Base exception for CRW SDK."""


class CrwBinaryNotFoundError(CrwError):
    """Binary could not be found or downloaded."""


class CrwTimeoutError(CrwError):
    """Operation timed out."""


class CrwApiError(CrwError):
    """API returned an error.

    Args:
        message: Human-readable error description.
        status_code: HTTP status code when available, else None.
    """

    def __init__(self, message: str, status_code: int | None = None):
        super().__init__(message)
        # Kept as an attribute so callers can branch on HTTP status.
        self.status_code = status_code