opencode-crawl4ai 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +144 -0
- package/bin/cli.js +189 -0
- package/dist/plugin.js +12634 -0
- package/package.json +56 -0
- package/python/bridge.py +422 -0
package/package.json
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "opencode-crawl4ai",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"description": "OpenCode plugin for unrestricted web access via crawl4ai — fetch, search, extract, screenshot, crawl, map",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"main": "./dist/plugin.js",
|
|
7
|
+
"exports": {
|
|
8
|
+
".": "./dist/plugin.js"
|
|
9
|
+
},
|
|
10
|
+
"bin": {
|
|
11
|
+
"opencode-crawl4ai": "bin/cli.js"
|
|
12
|
+
},
|
|
13
|
+
"files": [
|
|
14
|
+
"dist",
|
|
15
|
+
"bin",
|
|
16
|
+
"python",
|
|
17
|
+
"README.md"
|
|
18
|
+
],
|
|
19
|
+
"scripts": {
|
|
20
|
+
"build": "bun build src/plugin.ts --target=bun --outfile=dist/plugin.js",
|
|
21
|
+
"dev": "bun build src/plugin.ts --target=bun --outfile=dist/plugin.js --watch",
|
|
22
|
+
"typecheck": "tsc --noEmit",
|
|
23
|
+
"prepublishOnly": "bun run build"
|
|
24
|
+
},
|
|
25
|
+
"keywords": [
|
|
26
|
+
"opencode",
|
|
27
|
+
"opencode-plugin",
|
|
28
|
+
"crawl4ai",
|
|
29
|
+
"web-crawling",
|
|
30
|
+
"web-scraping",
|
|
31
|
+
"search",
|
|
32
|
+
"ai",
|
|
33
|
+
"llm"
|
|
34
|
+
],
|
|
35
|
+
"author": "bewinxed",
|
|
36
|
+
"license": "MIT",
|
|
37
|
+
"repository": {
|
|
38
|
+
"type": "git",
|
|
39
|
+
"url": "git+https://github.com/bewinxed/opencode-crawl4ai.git"
|
|
40
|
+
},
|
|
41
|
+
"homepage": "https://github.com/bewinxed/opencode-crawl4ai#readme",
|
|
42
|
+
"bugs": {
|
|
43
|
+
"url": "https://github.com/bewinxed/opencode-crawl4ai/issues"
|
|
44
|
+
},
|
|
45
|
+
"peerDependencies": {
|
|
46
|
+
"@opencode-ai/plugin": ">=1.0.0"
|
|
47
|
+
},
|
|
48
|
+
"devDependencies": {
|
|
49
|
+
"@opencode-ai/plugin": "latest",
|
|
50
|
+
"@types/node": "^22.0.0",
|
|
51
|
+
"typescript": "^5.0.0"
|
|
52
|
+
},
|
|
53
|
+
"dependencies": {
|
|
54
|
+
"zod": "^4.3.5"
|
|
55
|
+
}
|
|
56
|
+
}
|
package/python/bridge.py
ADDED
|
@@ -0,0 +1,422 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Bridge between OpenCode plugin (TypeScript) and crawl4ai (Python).
|
|
4
|
+
|
|
5
|
+
Reads JSON request from stdin, executes action, writes JSON response to stdout.
|
|
6
|
+
Uses crawl4ai for web crawling with stealth mode by default.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import asyncio
|
|
10
|
+
import json
|
|
11
|
+
import sys
|
|
12
|
+
import os
|
|
13
|
+
from typing import Any, Optional
|
|
14
|
+
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
|
15
|
+
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy, DFSDeepCrawlStrategy
|
|
16
|
+
|
|
17
|
+
# SearXNG URL from environment or default
|
|
18
|
+
SEARXNG_URL = os.environ.get("SEARXNG_URL", "")
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def log_error(msg: str) -> None:
    """Write a diagnostic message to stderr.

    Stdout is reserved for the single JSON response consumed by the
    TypeScript plugin, so every log line goes to stderr instead.
    """
    sys.stderr.write(f"[bridge error] {msg}\n")
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def respond(success: bool, data: Any = None, error: Optional[str] = None) -> None:
    """Write a single JSON response object to stdout.

    Args:
        success: Whether the requested action completed successfully.
        data: Optional payload, emitted under the "data" key when not None.
        error: Optional human-readable message, emitted under the "error"
            key only when truthy (an empty string is treated like None).
    """
    result: dict[str, Any] = {"success": success}
    if data is not None:
        result["data"] = data
    if error:
        result["error"] = error
    # When the TypeScript plugin spawns this bridge, stdout is a pipe and is
    # therefore block-buffered; flush explicitly so the response reaches the
    # reader even if the interpreter does not exit right away.
    print(json.dumps(result), flush=True)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
async def debug_action() -> dict:
    """Collect diagnostic details about the bridge environment.

    Returns:
        A dict with the crawl4ai version, the configured SearXNG URL (or a
        placeholder when unset), and the Python interpreter version.
    """
    import crawl4ai

    def _resolve_version() -> str:
        # crawl4ai has shipped its version both as a `__version__` submodule
        # and as a plain module attribute; probe both layouts defensively.
        try:
            from crawl4ai.__version__ import __version__
            return __version__
        except Exception:
            pass
        try:
            v = getattr(crawl4ai, "__version__", "unknown")
            return v.__version__ if hasattr(v, "__version__") else str(v)
        except Exception:
            return "unknown"

    return {
        "crawl4ai_version": _resolve_version(),
        "searxng_url": SEARXNG_URL or "not configured",
        "python_version": sys.version,
    }
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
async def fetch_action(
    url: str,
    format: str = "markdown",
    wait_for: Optional[str] = None,
    js_code: Optional[str] = None,
    timeout: int = 30,
) -> str:
    """Fetch *url* with a stealthy headless browser.

    Args:
        url: Page to load.
        format: "html" for rendered HTML, "raw" for the unprocessed page
            source, anything else for clean markdown.
        wait_for: Optional condition (CSS selector / JS) to wait for.
        js_code: Optional JavaScript snippet to execute after load.
        timeout: Page timeout in seconds.

    Returns:
        The page content in the requested format.

    Raises:
        Exception: If crawl4ai reports a failed fetch.
    """
    browser_cfg = BrowserConfig(
        headless=True,
        verbose=False,
        browser_type="chromium",
        enable_stealth=True,  # built-in anti-bot-detection measures
        extra_args=["--disable-blink-features=AutomationControlled"],
    )

    crawl_cfg = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        wait_for=wait_for,
        js_code=[js_code] if js_code else None,
        page_timeout=timeout * 1000,  # crawl4ai expects milliseconds
        verbose=False,
    )

    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        result = await crawler.arun(url=url, config=crawl_cfg)

    if not result.success:
        raise Exception(result.error_message or "Failed to fetch URL")

    if format == "html":
        return result.html
    if format == "raw":
        return result.raw_html
    # Default: markdown. Newer crawl4ai wraps it in an object exposing
    # raw_markdown; older versions return a plain string.
    md = result.markdown
    return md.raw_markdown if hasattr(md, "raw_markdown") else str(md)
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
async def search_action(query: str, limit: int = 10) -> list:
    """Search the web via SearXNG when configured, else DuckDuckGo (ddgs).

    Args:
        query: Search query string.
        limit: Maximum number of results to return.

    Returns:
        A list of {"url", "title", "snippet"} dicts.

    Raises:
        Exception: When every available backend fails.
    """

    def _entry(link: str, title: str, snippet: str) -> dict:
        return {"url": link, "title": title, "snippet": snippet}

    results = []

    # Preferred backend: a self-hosted SearXNG instance, if configured.
    if SEARXNG_URL:
        try:
            import httpx

            endpoint = f"{SEARXNG_URL.rstrip('/')}/search"
            async with httpx.AsyncClient(timeout=10) as client:
                response = await client.get(
                    endpoint,
                    params={"q": query, "format": "json"},
                )
                if response.status_code == 200:
                    payload = response.json()
                    results.extend(
                        _entry(
                            item.get("url", ""),
                            item.get("title", ""),
                            item.get("content", ""),
                        )
                        for item in payload.get("results", [])[:limit]
                    )
                # Only fall through to DuckDuckGo when SearXNG gave nothing.
                if results:
                    return results
        except Exception as e:
            log_error(f"SearXNG search failed: {e}")

    # Fallback backend: DuckDuckGo through the ddgs library.
    try:
        try:
            from ddgs import DDGS  # current package name
        except ImportError:
            from duckduckgo_search import DDGS  # legacy package name

        with DDGS() as ddgs:
            results.extend(
                _entry(
                    item.get("href", ""),
                    item.get("title", ""),
                    item.get("body", ""),
                )
                for item in ddgs.text(query, max_results=limit)
            )
        return results
    except Exception as e:
        log_error(f"DDG search failed: {e}")

    raise Exception(
        "All search backends failed. Configure SEARXNG_URL or ensure ddgs is available."
    )
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
async def extract_action(url: str, schema: dict) -> dict:
    """Extract structured text from *url* using CSS selectors.

    Args:
        url: Page to crawl.
        schema: Mapping of field name -> CSS selector; every field is
            extracted as text.

    Returns:
        The parsed extraction result, or {} when nothing was extracted.

    Raises:
        Exception: If crawl4ai reports a failed crawl.
    """
    from crawl4ai import JsonCssExtractionStrategy

    # crawl4ai expects a schema object with named text fields rooted at <html>.
    strategy = JsonCssExtractionStrategy(
        {
            "name": "extraction",
            "baseSelector": "html",
            "fields": [
                {"name": name, "selector": selector, "type": "text"}
                for name, selector in schema.items()
            ],
        }
    )

    browser_cfg = BrowserConfig(
        headless=True,
        verbose=False,
        browser_type="chromium",
        enable_stealth=True,
        extra_args=["--disable-blink-features=AutomationControlled"],
    )
    crawl_cfg = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        extraction_strategy=strategy,
        verbose=False,
    )

    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        result = await crawler.arun(url=url, config=crawl_cfg)

    if not result.success:
        raise Exception(result.error_message or "Failed to extract")

    if result.extracted_content:
        return json.loads(result.extracted_content)
    return {}
|
|
207
|
+
|
|
208
|
+
|
|
209
|
+
async def screenshot_action(url: str, width: int = 1280, height: int = 720) -> str:
    """Capture a PNG screenshot of *url* as a base64 data URL.

    Args:
        url: Page to render.
        width: Viewport width in pixels.
        height: Viewport height in pixels.

    Returns:
        A "data:image/png;base64,..." string.

    Raises:
        Exception: If the crawl fails or no screenshot was produced.
    """
    # Viewport size lives on BrowserConfig, not CrawlerRunConfig.
    browser_cfg = BrowserConfig(
        headless=True,
        verbose=False,
        browser_type="chromium",
        viewport_width=width,
        viewport_height=height,
    )
    crawl_cfg = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        screenshot=True,
        verbose=False,
    )

    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        result = await crawler.arun(url=url, config=crawl_cfg)

    if not result.success:
        raise Exception(result.error_message or "Failed to take screenshot")
    if not result.screenshot:
        raise Exception("No screenshot captured")
    return f"data:image/png;base64,{result.screenshot}"
|
|
236
|
+
|
|
237
|
+
|
|
238
|
+
async def crawl_action(
    url: str,
    max_pages: int = 10,
    max_depth: int = 2,
    strategy: str = "bfs",
    url_pattern: Optional[str] = None,
) -> list:
    """Deep-crawl a site starting at *url* and return page markdown.

    Args:
        url: Start page.
        max_pages: Hard cap on pages collected.
        max_depth: Maximum link depth from the start page.
        strategy: "dfs" for depth-first; anything else means breadth-first.
        url_pattern: Optional pattern restricting which URLs are followed.

    Returns:
        A list of {"url", "markdown"} dicts, at most *max_pages* long.
    """
    from crawl4ai.deep_crawling.filters import FilterChain, URLPatternFilter

    # Restrict followed links only when a pattern was supplied.
    filter_chain = (
        FilterChain(filters=[URLPatternFilter(patterns=[url_pattern])])
        if url_pattern
        else None
    )

    strategy_cls = DFSDeepCrawlStrategy if strategy == "dfs" else BFSDeepCrawlStrategy
    crawl_strategy = strategy_cls(
        max_depth=max_depth,
        max_pages=max_pages,
        filter_chain=filter_chain,
    )

    browser_cfg = BrowserConfig(
        headless=True,
        verbose=False,
        browser_type="chromium",
        enable_stealth=True,
        extra_args=["--disable-blink-features=AutomationControlled"],
    )
    crawl_cfg = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        deep_crawl_strategy=crawl_strategy,
        stream=True,  # consume results as they arrive instead of all at once
        verbose=False,
    )

    pages = []
    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        async for result in await crawler.arun(url=url, config=crawl_cfg):
            if not result.success:
                continue
            md = result.markdown
            text = md.raw_markdown if hasattr(md, "raw_markdown") else str(md)
            pages.append({"url": result.url, "markdown": text})
            if len(pages) >= max_pages:
                break

    return pages
|
|
302
|
+
|
|
303
|
+
|
|
304
|
+
async def map_action(url: str, search: Optional[str] = None, limit: int = 100) -> list:
    """Discover internal URLs reachable from *url*.

    Args:
        url: Start page.
        search: Optional case-insensitive substring filter applied to both
            the URL and the link title.
        limit: Maximum number of links returned.

    Returns:
        A deduplicated list of {"url", "title"} dicts.
    """
    browser_cfg = BrowserConfig(
        headless=True,
        verbose=False,
        browser_type="chromium",
    )

    # A shallow (depth-1) streamed crawl is enough to harvest links and is
    # much faster than a full deep crawl.
    crawl_cfg = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        verbose=False,
        deep_crawl_strategy=BFSDeepCrawlStrategy(max_depth=1, max_pages=limit),
        stream=True,
    )

    links: list[dict] = []

    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        async for result in await crawler.arun(url=url, config=crawl_cfg):
            if not (result.success and hasattr(result, "links") and result.links):
                continue
            for link in result.links.get("internal", []):
                # Links may be plain strings or {"href", "text"} dicts
                # depending on the crawl4ai version; handle both shapes.
                if isinstance(link, dict):
                    entry = {"url": link.get("href", link), "title": link.get("text", "")}
                else:
                    entry = {"url": link, "title": ""}
                if entry["url"] and entry not in links:
                    links.append(entry)

    # Apply the optional substring filter last, over everything collected.
    if search and links:
        needle = search.lower()
        links = [
            item
            for item in links
            if needle in item["url"].lower() or needle in item["title"].lower()
        ]

    return links[:limit]
|
|
347
|
+
|
|
348
|
+
|
|
349
|
+
async def main():
    """Read one JSON request from stdin, execute it, write one JSON response.

    Request shape: {"action": str, "params": dict}. The response is always a
    single JSON object on stdout; failures are reported in-band through the
    "error" field rather than via a non-zero exit status.
    """
    try:
        request_json = sys.stdin.read()
        request = json.loads(request_json)

        action = request.get("action")
        params = request.get("params", {})

        if action == "debug":
            data = await debug_action()
            respond(True, data)

        elif action == "fetch":
            data = await fetch_action(
                url=params["url"],
                format=params.get("format", "markdown"),
                wait_for=params.get("wait_for"),
                js_code=params.get("js_code"),
                timeout=params.get("timeout", 30),
            )
            respond(True, data)

        elif action == "search":
            data = await search_action(
                query=params["query"],
                limit=params.get("limit", 10),
            )
            respond(True, data)

        elif action == "extract":
            data = await extract_action(
                url=params["url"],
                schema=params["schema"],
            )
            respond(True, data)

        elif action == "screenshot":
            data = await screenshot_action(
                url=params["url"],
                width=params.get("width", 1280),
                height=params.get("height", 720),
            )
            respond(True, data)

        elif action == "crawl":
            data = await crawl_action(
                url=params["url"],
                max_pages=params.get("max_pages", 10),
                max_depth=params.get("max_depth", 2),
                strategy=params.get("strategy", "bfs"),
                url_pattern=params.get("url_pattern"),
            )
            respond(True, data)

        elif action == "map":
            data = await map_action(
                url=params["url"],
                search=params.get("search"),
                limit=params.get("limit", 100),
            )
            respond(True, data)

        else:
            respond(False, error=f"Unknown action: {action}")

    except json.JSONDecodeError as e:
        respond(False, error=f"Invalid JSON request: {e}")
    except KeyError as e:
        # Without this handler a missing required parameter would surface as
        # the bare KeyError text (e.g. "'url'"), which is useless to callers.
        respond(False, error=f"Missing required parameter: {e}")
    except Exception as e:
        respond(False, error=str(e))
|
|
419
|
+
|
|
420
|
+
|
|
421
|
+
# Script entry point: the bridge handles exactly one request per invocation.
if __name__ == "__main__":
    asyncio.run(main())
|