crw 0.2.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- crw-0.2.1/.gitignore +7 -0
- crw-0.2.1/PKG-INFO +50 -0
- crw-0.2.1/README.md +31 -0
- crw-0.2.1/pyproject.toml +32 -0
- crw-0.2.1/src/crw/__init__.py +6 -0
- crw-0.2.1/src/crw/__main__.py +15 -0
- crw-0.2.1/src/crw/_binary.py +115 -0
- crw-0.2.1/src/crw/_platform.py +22 -0
- crw-0.2.1/src/crw/client.py +247 -0
- crw-0.2.1/src/crw/exceptions.py +21 -0
crw-0.2.1/.gitignore
ADDED
crw-0.2.1/PKG-INFO
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: crw
|
|
3
|
+
Version: 0.2.1
|
|
4
|
+
Summary: Python SDK for CRW web scraper — scrape, crawl, and map any website from Python
|
|
5
|
+
Project-URL: Homepage, https://github.com/us/crw
|
|
6
|
+
Project-URL: Documentation, https://us.github.io/crw
|
|
7
|
+
Project-URL: Repository, https://github.com/us/crw
|
|
8
|
+
License-Expression: MIT
|
|
9
|
+
Keywords: ai-agent,crawler,firecrawl,mcp,scraper,web-scraping
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
13
|
+
Classifier: Programming Language :: Python :: 3
|
|
14
|
+
Classifier: Topic :: Internet :: WWW/HTTP
|
|
15
|
+
Classifier: Topic :: Software Development :: Libraries
|
|
16
|
+
Requires-Python: >=3.9
|
|
17
|
+
Requires-Dist: platformdirs>=3.0
|
|
18
|
+
Description-Content-Type: text/markdown
|
|
19
|
+
|
|
20
|
+
# crw
|
|
21
|
+
|
|
22
|
+
Python SDK for [CRW](https://github.com/us/crw) — the open-source web scraper built for AI agents.
|
|
23
|
+
|
|
24
|
+
## Install
|
|
25
|
+
|
|
26
|
+
```bash
|
|
27
|
+
pip install crw
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
## Usage
|
|
31
|
+
|
|
32
|
+
```python
|
|
33
|
+
from crw import CrwClient
|
|
34
|
+
|
|
35
|
+
# Zero-config (downloads crw-mcp binary automatically):
|
|
36
|
+
client = CrwClient()
|
|
37
|
+
result = client.scrape("https://example.com")
|
|
38
|
+
print(result["markdown"])
|
|
39
|
+
|
|
40
|
+
# Or connect to a remote server:
|
|
41
|
+
client = CrwClient(api_url="https://fastcrw.com/api", api_key="fc-...")
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
## MCP Server
|
|
45
|
+
|
|
46
|
+
After installing, you can also use `crw-mcp` as an MCP server:
|
|
47
|
+
|
|
48
|
+
```bash
|
|
49
|
+
crw-mcp # starts stdio MCP server
|
|
50
|
+
```
|
crw-0.2.1/README.md
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
# crw
|
|
2
|
+
|
|
3
|
+
Python SDK for [CRW](https://github.com/us/crw) — the open-source web scraper built for AI agents.
|
|
4
|
+
|
|
5
|
+
## Install
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
pip install crw
|
|
9
|
+
```
|
|
10
|
+
|
|
11
|
+
## Usage
|
|
12
|
+
|
|
13
|
+
```python
|
|
14
|
+
from crw import CrwClient
|
|
15
|
+
|
|
16
|
+
# Zero-config (downloads crw-mcp binary automatically):
|
|
17
|
+
client = CrwClient()
|
|
18
|
+
result = client.scrape("https://example.com")
|
|
19
|
+
print(result["markdown"])
|
|
20
|
+
|
|
21
|
+
# Or connect to a remote server:
|
|
22
|
+
client = CrwClient(api_url="https://fastcrw.com/api", api_key="fc-...")
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
## MCP Server
|
|
26
|
+
|
|
27
|
+
After installing, you can also use `crw-mcp` as an MCP server:
|
|
28
|
+
|
|
29
|
+
```bash
|
|
30
|
+
crw-mcp # starts stdio MCP server
|
|
31
|
+
```
|
crw-0.2.1/pyproject.toml
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["hatchling"]
|
|
3
|
+
build-backend = "hatchling.build"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "crw"
|
|
7
|
+
version = "0.2.1"
|
|
8
|
+
description = "Python SDK for CRW web scraper — scrape, crawl, and map any website from Python"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.9"
|
|
11
|
+
license = "MIT"
|
|
12
|
+
keywords = ["web-scraping", "mcp", "ai-agent", "crawler", "firecrawl", "scraper"]
|
|
13
|
+
classifiers = [
|
|
14
|
+
"Development Status :: 4 - Beta",
|
|
15
|
+
"Intended Audience :: Developers",
|
|
16
|
+
"License :: OSI Approved :: MIT License",
|
|
17
|
+
"Programming Language :: Python :: 3",
|
|
18
|
+
"Topic :: Internet :: WWW/HTTP",
|
|
19
|
+
"Topic :: Software Development :: Libraries",
|
|
20
|
+
]
|
|
21
|
+
dependencies = ["platformdirs>=3.0"]
|
|
22
|
+
|
|
23
|
+
[project.urls]
|
|
24
|
+
Homepage = "https://github.com/us/crw"
|
|
25
|
+
Documentation = "https://us.github.io/crw"
|
|
26
|
+
Repository = "https://github.com/us/crw"
|
|
27
|
+
|
|
28
|
+
[project.scripts]
|
|
29
|
+
crw-mcp = "crw.__main__:main"
|
|
30
|
+
|
|
31
|
+
[tool.hatch.build.targets.wheel]
|
|
32
|
+
packages = ["src/crw"]
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""CLI entry point — exec crw-mcp binary."""
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
import sys
|
|
5
|
+
|
|
6
|
+
from crw._binary import ensure_binary
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def main() -> None:
    """Locate the crw-mcp binary (downloading it if needed) and run it,
    forwarding all command-line arguments.
    """
    binary = ensure_binary()
    argv = [str(binary)] + sys.argv[1:]
    if os.name == "nt":
        # os.exec* on Windows does not truly replace the process: the
        # Python parent exits immediately, which breaks stdio inheritance
        # and loses the binary's exit code. Run it as a child instead.
        import subprocess  # local import: only needed on Windows

        raise SystemExit(subprocess.call(argv))
    # POSIX: replace this Python process with the binary (no extra
    # process lingers; signals and exit code pass through natively).
    os.execvp(str(binary), argv)


if __name__ == "__main__":
    main()
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
"""Download, cache, and locate the crw-mcp binary."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
import stat
|
|
7
|
+
import tarfile
|
|
8
|
+
import zipfile
|
|
9
|
+
from io import BytesIO
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from urllib.request import Request, urlopen
|
|
12
|
+
|
|
13
|
+
from platformdirs import user_cache_dir
|
|
14
|
+
|
|
15
|
+
from crw._platform import BINARY_NAME, get_asset_name
|
|
16
|
+
from crw.exceptions import CrwBinaryNotFoundError
|
|
17
|
+
|
|
18
|
+
# Pin the binary version to the installed package version so the SDK and
# the downloaded crw-mcp binary stay in lockstep.
try:
    from importlib.metadata import version as _pkg_version
    BINARY_VERSION = _pkg_version("crw")
except Exception:  # broad by design: any metadata failure falls back
    BINARY_VERSION = "0.2.1"  # fallback for development
# GitHub repository that hosts the prebuilt binary releases.
GITHUB_REPO = "us/crw"
# Base URL for this version's release assets.
DOWNLOAD_URL = f"https://github.com/{GITHUB_REPO}/releases/download/v{BINARY_VERSION}"
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _cache_dir() -> Path:
    """Per-version cache directory for the downloaded crw-mcp binary."""
    root = Path(user_cache_dir("crw"))
    return root / f"v{BINARY_VERSION}"
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _cached_binary() -> Path | None:
    """Return the cached binary's path, or None if absent or non-executable."""
    candidate = _cache_dir() / BINARY_NAME
    usable = candidate.is_file() and os.access(candidate, os.X_OK)
    return candidate if usable else None
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _download_binary() -> Path:
    """Download the crw-mcp binary from GitHub Releases and cache it.

    Returns:
        Path to the cached binary with execute bits set.

    Raises:
        CrwBinaryNotFoundError: if the platform has no prebuilt asset, or
            the expected binary is missing from the downloaded archive.
    """
    asset = get_asset_name()
    if asset is None:
        raise CrwBinaryNotFoundError(
            f"No prebuilt binary for this platform. "
            f"Install from source: cargo install crw-mcp"
        )

    url = f"{DOWNLOAD_URL}/{asset}"
    cache = _cache_dir()
    cache.mkdir(parents=True, exist_ok=True)

    # The whole asset is buffered in memory — assumes release archives
    # stay small enough for that to be acceptable.
    req = Request(url, headers={"User-Agent": f"crw-python/{BINARY_VERSION}"})
    with urlopen(req, timeout=120) as resp:
        data = resp.read()

    # Extract binary from archive
    if asset.endswith(".tar.gz"):
        # NOTE(review): tarfile.extract without filter= warns on 3.12+;
        # consider filter="data" once Python 3.9–3.11 support is dropped.
        with tarfile.open(fileobj=BytesIO(data), mode="r:gz") as tar:
            for member in tar.getmembers():
                if member.name.endswith("crw-mcp"):
                    # Rename in-archive so it extracts flat as
                    # cache/<BINARY_NAME>, regardless of nesting.
                    member.name = BINARY_NAME
                    tar.extract(member, path=cache)
                    break
            else:  # loop completed without a match
                raise CrwBinaryNotFoundError(f"crw-mcp not found in {asset}")
    elif asset.endswith(".zip"):
        with zipfile.ZipFile(BytesIO(data)) as zf:
            for name in zf.namelist():
                if name.endswith("crw-mcp.exe") or name.endswith("crw-mcp"):
                    target = cache / BINARY_NAME
                    target.write_bytes(zf.read(name))
                    break
            else:  # loop completed without a match
                raise CrwBinaryNotFoundError(f"crw-mcp not found in {asset}")

    binary = cache / BINARY_NAME
    # Zip extraction does not preserve POSIX permissions — make sure the
    # execute bits are set for owner, group, and others.
    binary.chmod(binary.stat().st_mode | stat.S_IEXEC | stat.S_IXGRP | stat.S_IXOTH)
    return binary
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
def _is_native_binary(path: Path) -> bool:
|
|
82
|
+
"""Check if a file is a native binary (not a Python/shell script)."""
|
|
83
|
+
try:
|
|
84
|
+
with open(path, "rb") as f:
|
|
85
|
+
header = f.read(4)
|
|
86
|
+
# ELF, Mach-O, or PE magic bytes
|
|
87
|
+
return header[:4] in (b"\x7fELF", b"\xcf\xfa\xed\xfe", b"\xce\xfa\xed\xfe", b"MZ\x90\x00", b"MZ\x00\x00")
|
|
88
|
+
except OSError:
|
|
89
|
+
return False
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def ensure_binary() -> Path:
    """Return path to crw-mcp binary, downloading if necessary.

    Resolution order:
      1. ``CRW_BINARY`` environment variable (explicit override).
      2. A native ``crw-mcp`` on PATH (e.g. installed via cargo).
      3. The per-version download cache.
      4. Fresh download from GitHub Releases.

    Raises:
        CrwBinaryNotFoundError: if the override path does not exist, the
            platform has no prebuilt asset, or download/extraction fails.
    """
    # 1. Explicit override always wins; a broken override is an error
    # rather than a silent fallback, so misconfiguration is visible.
    env_path = os.environ.get("CRW_BINARY")
    if env_path:
        p = Path(env_path)
        if p.is_file():
            return p
        raise CrwBinaryNotFoundError(f"CRW_BINARY={env_path} does not exist")

    # 2. Check if crw-mcp native binary is on PATH (e.g. cargo install).
    # Search all PATH entries, skipping our own Python wrapper script.
    for directory in os.environ.get("PATH", "").split(os.pathsep):
        candidate = Path(directory) / BINARY_NAME
        # Also require execute permission: a readable but non-executable
        # file would otherwise only fail later, at spawn time.
        if (
            candidate.is_file()
            and os.access(candidate, os.X_OK)
            and _is_native_binary(candidate)
        ):
            return candidate

    # 3. Previously downloaded copy.
    cached = _cached_binary()
    if cached:
        return cached

    # 4. Download and cache.
    return _download_binary()
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
"""Platform detection and GitHub Release asset mapping."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations

import platform
import sys
|
|
5
|
+
|
|
6
|
+
# Maps (system, machine) to GitHub Release asset filename
PLATFORM_MAP: dict[tuple[str, str], str] = {
    ("Darwin", "arm64"): "crw-mcp-darwin-arm64.tar.gz",
    ("Darwin", "x86_64"): "crw-mcp-darwin-x64.tar.gz",
    ("Linux", "x86_64"): "crw-mcp-linux-x64.tar.gz",
    ("Linux", "aarch64"): "crw-mcp-linux-arm64.tar.gz",
    ("Windows", "AMD64"): "crw-mcp-win32-x64.zip",
    ("Windows", "ARM64"): "crw-mcp-win32-arm64.zip",
}

# Windows executables carry the .exe suffix; everything else is bare.
BINARY_NAME = "crw-mcp.exe" if sys.platform == "win32" else "crw-mcp"


def get_asset_name() -> str | None:
    """Return the GitHub Release asset filename for the current platform.

    Returns None when no prebuilt binary exists for this
    (system, machine) combination.
    """
    system = platform.system()
    machine = platform.machine()
    return PLATFORM_MAP.get((system, machine))
|
|
@@ -0,0 +1,247 @@
|
|
|
1
|
+
"""CRW client — subprocess (embedded) or HTTP mode."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import subprocess
|
|
7
|
+
import time
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
from crw._binary import ensure_binary
|
|
11
|
+
from crw.exceptions import CrwApiError, CrwError, CrwTimeoutError
|
|
12
|
+
|
|
13
|
+
_REQUEST_ID = 0
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def _next_id() -> int:
|
|
17
|
+
global _REQUEST_ID
|
|
18
|
+
_REQUEST_ID += 1
|
|
19
|
+
return _REQUEST_ID
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class CrwClient:
    """CRW web scraper client.

    Operates in one of two modes:

    * Subprocess mode (``api_url=None``): spawns the bundled crw-mcp
      binary and speaks JSON-RPC over its stdio — zero config, no server.
    * HTTP mode (``api_url`` given): calls a remote CRW server's REST API.

    Args:
        api_url: CRW server URL for HTTP mode. If None, uses subprocess mode
            (spawns crw-mcp binary, no server required).
        api_key: API key for authentication (HTTP mode or fastcrw.com).

    Examples:
        # Subprocess mode (zero config, no server):
        client = CrwClient()
        result = client.scrape("https://example.com")

        # HTTP mode (remote server):
        client = CrwClient(api_url="https://fastcrw.com/api", api_key="fc-...")
        result = client.scrape("https://example.com")
    """

    def __init__(self, api_url: str | None = None, api_key: str | None = None):
        self._api_url = api_url
        self._api_key = api_key
        # Lazily spawned crw-mcp subprocess; only used in subprocess mode.
        self._process: subprocess.Popen | None = None

    def scrape(
        self,
        url: str,
        formats: list[str] | None = None,
        only_main_content: bool = True,
        include_tags: list[str] | None = None,
        exclude_tags: list[str] | None = None,
        **kwargs: Any,
    ) -> dict:
        """Scrape a single URL and return its content.

        Args:
            url: Page URL to scrape.
            formats: Output formats to request (server-defined names).
            only_main_content: Strip boilerplate when True (default).
            include_tags: HTML tags to keep, if given.
            exclude_tags: HTML tags to drop, if given.
            **kwargs: Extra options forwarded verbatim to the server/tool.

        Returns:
            Dict with keys like 'markdown', 'html', 'metadata', etc.
        """
        args: dict[str, Any] = {"url": url, "onlyMainContent": only_main_content}
        if formats:
            args["formats"] = formats
        if include_tags:
            args["includeTags"] = include_tags
        if exclude_tags:
            args["excludeTags"] = exclude_tags
        args.update(kwargs)

        if self._api_url:
            return self._http_post("/v1/scrape", args)
        return self._tool_call("crw_scrape", args)

    def crawl(
        self,
        url: str,
        max_depth: int = 2,
        max_pages: int = 10,
        poll_interval: float = 2.0,
        timeout: float = 300.0,
        **kwargs: Any,
    ) -> list[dict]:
        """Crawl a website and return all page results.

        Starts an async crawl, polls for completion, and returns all results.

        Args:
            url: Root URL to crawl.
            max_depth: Maximum link depth from the root.
            max_pages: Maximum number of pages to fetch.
            poll_interval: Seconds between status polls.
            timeout: Overall deadline in seconds.
            **kwargs: Extra options forwarded verbatim to the server/tool.

        Raises:
            CrwError: if no job id is returned or the crawl fails.
            CrwTimeoutError: if the crawl exceeds ``timeout``.
        """
        args: dict[str, Any] = {"url": url, "maxDepth": max_depth, "maxPages": max_pages}
        args.update(kwargs)

        if self._api_url:
            return self._http_crawl(args, poll_interval, timeout)

        # Subprocess mode: start crawl, then poll its status.
        result = self._tool_call("crw_crawl", args)
        job_id = result.get("id")
        if not job_id:
            raise CrwError(f"Crawl did not return job ID: {result}")

        return self._poll_crawl(job_id, poll_interval, timeout)

    def map(
        self,
        url: str,
        max_depth: int = 2,
        use_sitemap: bool = True,
        **kwargs: Any,
    ) -> list[str]:
        """Discover URLs on a website.

        Args:
            url: Root URL to map.
            max_depth: Maximum link depth from the root.
            use_sitemap: Also consult the site's sitemap when True.
            **kwargs: Extra options forwarded verbatim to the server/tool.

        Returns:
            List of discovered URLs.
        """
        args: dict[str, Any] = {"url": url, "maxDepth": max_depth, "useSitemap": use_sitemap}
        args.update(kwargs)

        if self._api_url:
            data = self._http_post("/v1/map", args)
            return data.get("links", [])

        result = self._tool_call("crw_map", args)
        return result.get("links", [])

    def close(self) -> None:
        """Shut down the subprocess if running. Safe to call repeatedly."""
        # getattr guard: __del__ may invoke close() even when __init__
        # never completed (e.g. raised before _process was assigned).
        proc = getattr(self, "_process", None)
        if proc and proc.poll() is None:
            # Closing stdin signals EOF; a well-behaved server exits.
            proc.stdin.close()
            try:
                proc.wait(timeout=5)
            except subprocess.TimeoutExpired:
                proc.terminate()
                try:
                    proc.wait(timeout=5)
                except subprocess.TimeoutExpired:
                    # Last resort — never hang the caller (or interpreter
                    # shutdown, via __del__) on a stuck child.
                    proc.kill()
                    proc.wait()
        self._process = None

    def __enter__(self) -> CrwClient:
        return self

    def __exit__(self, *_: Any) -> None:
        self.close()

    def __del__(self) -> None:
        try:
            self.close()
        except Exception:
            # Finalizers must never raise; during interpreter shutdown
            # modules/attributes may already be torn down.
            pass

    # --- Subprocess (embedded) mode ---

    def _ensure_process(self) -> subprocess.Popen:
        """Return a live crw-mcp subprocess, (re)spawning it if needed."""
        # Respawn when never started or when the previous process died.
        if self._process is None or self._process.poll() is not None:
            binary = ensure_binary()
            self._process = subprocess.Popen(
                [str(binary)],
                stdin=subprocess.PIPE,
                stdout=subprocess.PIPE,
                stderr=subprocess.DEVNULL,
                text=True,
            )
        return self._process

    def _jsonrpc(self, method: str, params: dict | None = None) -> Any:
        """Send one JSON-RPC request over stdio and return its result.

        Raises:
            CrwError: if the process closes its stdout mid-conversation.
            CrwApiError: if the response carries an ``error`` member.
        """
        proc = self._ensure_process()
        req = {
            "jsonrpc": "2.0",
            "id": _next_id(),
            "method": method,
            "params": params or {},
        }
        # Newline-delimited JSON framing over the child's stdin/stdout.
        proc.stdin.write(json.dumps(req) + "\n")
        proc.stdin.flush()

        # NOTE(review): assumes the server answers strictly in request
        # order with no interleaved notifications (response id is not
        # matched) — confirm against crw-mcp's stdio protocol.
        line = proc.stdout.readline()
        if not line:
            raise CrwError("crw-mcp process closed unexpectedly")

        resp = json.loads(line)
        if "error" in resp:
            raise CrwApiError(resp["error"].get("message", str(resp["error"])))
        return resp.get("result")

    def _tool_call(self, tool_name: str, arguments: dict) -> dict:
        """Invoke an MCP tool and decode its JSON text payload."""
        result = self._jsonrpc("tools/call", {"name": tool_name, "arguments": arguments})
        if not result or not result.get("content"):
            raise CrwError(f"Empty response from {tool_name}")

        # MCP tool results wrap the payload in a content list; the first
        # entry's text is the JSON document we care about.
        content = result["content"][0]
        if result.get("isError"):
            raise CrwApiError(content.get("text", "Unknown error"))

        return json.loads(content["text"])

    def _poll_crawl(self, job_id: str, poll_interval: float, timeout: float) -> list[dict]:
        """Poll crawl status via the tool API until completion or deadline."""
        start = time.monotonic()
        while True:
            if time.monotonic() - start > timeout:
                raise CrwTimeoutError(f"Crawl {job_id} timed out after {timeout}s")

            result = self._tool_call("crw_check_crawl_status", {"id": job_id})
            status = result.get("status")

            if status == "completed":
                return result.get("data", [])
            if status == "failed":
                raise CrwError(f"Crawl failed: {result.get('error', 'unknown')}")

            time.sleep(poll_interval)

    # --- HTTP mode ---

    def _http_request(self, method: str, path: str, body: dict | None = None) -> dict:
        """Issue one HTTP request against the configured API and unwrap it.

        Raises:
            CrwApiError: on HTTP error status or ``success: false`` payloads.
        """
        import urllib.error
        import urllib.request

        url = f"{self._api_url.rstrip('/')}{path}"
        headers = {"Content-Type": "application/json"}
        if self._api_key:
            headers["Authorization"] = f"Bearer {self._api_key}"

        data = json.dumps(body).encode() if body else None
        req = urllib.request.Request(url, data=data, headers=headers, method=method)

        try:
            with urllib.request.urlopen(req, timeout=120) as resp:
                result = json.loads(resp.read())
        except urllib.error.HTTPError as e:
            # Surface HTTP-level failures (401, 429, 500, ...) as API
            # errors with the status code attached, instead of leaking a
            # raw urllib exception to callers.
            raise CrwApiError(str(e.reason), status_code=e.code) from e

        if not result.get("success"):
            raise CrwApiError(result.get("error", "API error"))
        return result.get("data", result)

    def _http_post(self, path: str, body: dict) -> dict:
        return self._http_request("POST", path, body)

    def _http_get(self, path: str) -> dict:
        return self._http_request("GET", path)

    def _http_crawl(self, args: dict, poll_interval: float, timeout: float) -> list[dict]:
        """Start a crawl over HTTP and poll its status until done."""
        result = self._http_post("/v1/crawl", args)
        job_id = result.get("id")
        if not job_id:
            raise CrwError(f"Crawl did not return job ID: {result}")

        start = time.monotonic()
        while True:
            if time.monotonic() - start > timeout:
                raise CrwTimeoutError(f"Crawl {job_id} timed out after {timeout}s")

            status_result = self._http_get(f"/v1/crawl/{job_id}")
            status = status_result.get("status")

            if status == "completed":
                return status_result.get("data", [])
            if status == "failed":
                raise CrwError(f"Crawl failed: {status_result.get('error', 'unknown')}")

            time.sleep(poll_interval)
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
"""CRW SDK exceptions."""
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class CrwError(Exception):
    """Base exception for CRW SDK; all SDK errors derive from this."""


class CrwBinaryNotFoundError(CrwError):
    """Binary could not be found or downloaded."""


class CrwTimeoutError(CrwError):
    """Operation timed out."""


class CrwApiError(CrwError):
    """API returned an error.

    Attributes:
        status_code: HTTP status code when available, else None
            (e.g. for JSON-RPC / tool errors).
    """

    # The annotation is quoted: this module has no
    # `from __future__ import annotations`, so a bare `int | None` is
    # evaluated at class creation and raises TypeError on Python 3.9
    # (the package declares requires-python >= 3.9).
    def __init__(self, message: str, status_code: "int | None" = None):
        super().__init__(message)
        self.status_code = status_code
|