opencode-crawl4ai 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/package.json ADDED
@@ -0,0 +1,56 @@
1
+ {
2
+ "name": "opencode-crawl4ai",
3
+ "version": "0.1.0",
4
+ "description": "OpenCode plugin for unrestricted web access via crawl4ai — fetch, search, extract, screenshot, crawl, map",
5
+ "type": "module",
6
+ "main": "./dist/plugin.js",
7
+ "exports": {
8
+ ".": "./dist/plugin.js"
9
+ },
10
+ "bin": {
11
+ "opencode-crawl4ai": "bin/cli.js"
12
+ },
13
+ "files": [
14
+ "dist",
15
+ "bin",
16
+ "python",
17
+ "README.md"
18
+ ],
19
+ "scripts": {
20
+ "build": "bun build src/plugin.ts --target=bun --outfile=dist/plugin.js",
21
+ "dev": "bun build src/plugin.ts --target=bun --outfile=dist/plugin.js --watch",
22
+ "typecheck": "tsc --noEmit",
23
+ "prepublishOnly": "bun run build"
24
+ },
25
+ "keywords": [
26
+ "opencode",
27
+ "opencode-plugin",
28
+ "crawl4ai",
29
+ "web-crawling",
30
+ "web-scraping",
31
+ "search",
32
+ "ai",
33
+ "llm"
34
+ ],
35
+ "author": "bewinxed",
36
+ "license": "MIT",
37
+ "repository": {
38
+ "type": "git",
39
+ "url": "git+https://github.com/bewinxed/opencode-crawl4ai.git"
40
+ },
41
+ "homepage": "https://github.com/bewinxed/opencode-crawl4ai#readme",
42
+ "bugs": {
43
+ "url": "https://github.com/bewinxed/opencode-crawl4ai/issues"
44
+ },
45
+ "peerDependencies": {
46
+ "@opencode-ai/plugin": ">=1.0.0"
47
+ },
48
+ "devDependencies": {
49
+ "@opencode-ai/plugin": "latest",
50
+ "@types/node": "^22.0.0",
51
+ "typescript": "^5.0.0"
52
+ },
53
+ "dependencies": {
54
+ "zod": "^4.3.5"
55
+ }
56
+ }
@@ -0,0 +1,422 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Bridge between OpenCode plugin (TypeScript) and crawl4ai (Python).
4
+
5
+ Reads JSON request from stdin, executes action, writes JSON response to stdout.
6
+ Uses crawl4ai for web crawling with stealth mode by default.
7
+ """
8
+
9
+ import asyncio
10
+ import json
11
+ import sys
12
+ import os
13
+ from typing import Any, Optional
14
+ from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
15
+ from crawl4ai.deep_crawling import BFSDeepCrawlStrategy, DFSDeepCrawlStrategy
16
+
17
+ # SearXNG URL from environment or default
18
+ SEARXNG_URL = os.environ.get("SEARXNG_URL", "")
19
+
20
+
21
def log_error(msg: str) -> None:
    """Write a diagnostic message to stderr.

    stdout is reserved for the JSON response protocol, so all logging
    must go to stderr to avoid corrupting the response stream.
    """
    sys.stderr.write(f"[bridge error] {msg}\n")
24
+
25
+
26
def respond(success: bool, data: Any = None, error: Optional[str] = None) -> None:
    """Emit exactly one JSON response object on stdout.

    The payload always carries ``success``; ``data`` is attached only
    when it is not None, and ``error`` only when it is truthy.
    """
    payload: dict[str, Any] = {"success": success}
    if data is not None:
        payload["data"] = data
    if error:
        payload["error"] = error
    sys.stdout.write(json.dumps(payload) + "\n")
34
+
35
+
36
async def debug_action() -> dict:
    """Collect bridge diagnostics: crawl4ai version, SearXNG URL, Python version."""
    import crawl4ai

    # crawl4ai has moved its version around between releases; probe the
    # canonical location first, then fall back to the package attribute.
    try:
        from crawl4ai.__version__ import __version__ as crawl4ai_version
    except Exception:
        try:
            candidate = getattr(crawl4ai, "__version__", "unknown")
            if hasattr(candidate, "__version__"):
                crawl4ai_version = candidate.__version__
            else:
                crawl4ai_version = str(candidate)
        except Exception:
            crawl4ai_version = "unknown"

    return {
        "crawl4ai_version": crawl4ai_version,
        "searxng_url": SEARXNG_URL or "not configured",
        "python_version": sys.version,
    }
54
+
55
+
56
async def fetch_action(
    url: str,
    format: str = "markdown",
    wait_for: Optional[str] = None,
    js_code: Optional[str] = None,
    timeout: int = 30,
) -> str:
    """Fetch *url* with a stealth chromium session and return its content.

    format: "html" returns the rendered HTML, "raw" the raw HTML, and any
    other value (default "markdown") the cleaned markdown. *timeout* is
    in seconds and converted to milliseconds for crawl4ai.
    Raises on a failed fetch.
    """
    browser_cfg = BrowserConfig(
        headless=True,
        verbose=False,
        browser_type="chromium",
        enable_stealth=True,  # built-in stealth mode by default
        extra_args=["--disable-blink-features=AutomationControlled"],
    )

    run_cfg = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        wait_for=wait_for,
        js_code=[js_code] if js_code else None,
        page_timeout=timeout * 1000,
        verbose=False,
    )

    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        outcome = await crawler.arun(url=url, config=run_cfg)

        if not outcome.success:
            raise Exception(outcome.error_message or "Failed to fetch URL")

        if format == "html":
            return outcome.html
        if format == "raw":
            return outcome.raw_html
        # Default: clean markdown (newer crawl4ai wraps it in an object).
        md = outcome.markdown
        return md.raw_markdown if hasattr(md, "raw_markdown") else str(md)
99
+
100
+
101
async def search_action(query: str, limit: int = 10) -> list:
    """Search the web via SearXNG (if configured) with a DuckDuckGo fallback.

    Returns a list of {"url", "title", "snippet"} dicts, at most *limit*
    entries. Raises when every backend fails.

    Fix: the ddgs/duckduckgo_search client performs blocking network I/O;
    calling it directly inside this coroutine stalled the whole event loop,
    so the fallback now runs in a worker thread via asyncio.to_thread.
    """
    results: list = []

    # Preferred backend: a self-hosted SearXNG instance.
    if SEARXNG_URL:
        try:
            import httpx

            async with httpx.AsyncClient(timeout=10) as client:
                response = await client.get(
                    f"{SEARXNG_URL.rstrip('/')}/search",
                    params={
                        "q": query,
                        "format": "json",
                    },
                )
                if response.status_code == 200:
                    data = response.json()
                    for item in data.get("results", [])[:limit]:
                        results.append(
                            {
                                "url": item.get("url", ""),
                                "title": item.get("title", ""),
                                "snippet": item.get("content", ""),
                            }
                        )
                if results:
                    return results
        except Exception as e:
            log_error(f"SearXNG search failed: {e}")

    # Fallback: ddgs (DuckDuckGo). Its API is synchronous, so execute it
    # off the event loop.
    try:
        def _ddg_search() -> list:
            # Try the new package name first, fall back to the old name.
            try:
                from ddgs import DDGS
            except ImportError:
                from duckduckgo_search import DDGS

            hits = []
            with DDGS() as ddgs:
                for item in ddgs.text(query, max_results=limit):
                    hits.append(
                        {
                            "url": item.get("href", ""),
                            "title": item.get("title", ""),
                            "snippet": item.get("body", ""),
                        }
                    )
            return hits

        results.extend(await asyncio.to_thread(_ddg_search))
        return results
    except Exception as e:
        log_error(f"DDG search failed: {e}")

    raise Exception(
        "All search backends failed. Configure SEARXNG_URL or ensure ddgs is available."
    )
158
+
159
+
160
async def extract_action(url: str, schema: dict) -> dict:
    """Extract structured data from *url* using CSS selectors.

    *schema* maps field names to CSS selectors; every field is pulled out
    as text. Returns the parsed extraction result, or {} when nothing was
    extracted. Raises on a failed crawl.
    """
    from crawl4ai import JsonCssExtractionStrategy

    # Translate the simple {name: selector} mapping into crawl4ai's
    # extraction-schema shape.
    extraction_schema = {
        "name": "extraction",
        "baseSelector": "html",
        "fields": [
            {"name": name, "selector": selector, "type": "text"}
            for name, selector in schema.items()
        ],
    }
    strategy = JsonCssExtractionStrategy(extraction_schema)

    browser_cfg = BrowserConfig(
        headless=True,
        verbose=False,
        browser_type="chromium",
        enable_stealth=True,
        extra_args=["--disable-blink-features=AutomationControlled"],
    )
    run_cfg = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        extraction_strategy=strategy,
        verbose=False,
    )

    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        outcome = await crawler.arun(url=url, config=run_cfg)

        if not outcome.success:
            raise Exception(outcome.error_message or "Failed to extract")

        if not outcome.extracted_content:
            return {}
        return json.loads(outcome.extracted_content)
207
+
208
+
209
async def screenshot_action(url: str, width: int = 1280, height: int = 720) -> str:
    """Capture a PNG screenshot of *url* and return it as a base64 data URL.

    Raises when the page fails to load or no screenshot is produced.
    """
    # viewport_width/height belong on BrowserConfig, not CrawlerRunConfig.
    browser_cfg = BrowserConfig(
        headless=True,
        verbose=False,
        browser_type="chromium",
        viewport_width=width,
        viewport_height=height,
    )
    run_cfg = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        screenshot=True,
        verbose=False,
    )

    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        outcome = await crawler.arun(url=url, config=run_cfg)

        if not outcome.success:
            raise Exception(outcome.error_message or "Failed to take screenshot")
        if not outcome.screenshot:
            raise Exception("No screenshot captured")
        return f"data:image/png;base64,{outcome.screenshot}"
236
+
237
+
238
async def crawl_action(
    url: str,
    max_pages: int = 10,
    max_depth: int = 2,
    strategy: str = "bfs",
    url_pattern: Optional[str] = None,
) -> list:
    """Deep-crawl a site starting at *url*.

    strategy: "dfs" for depth-first, anything else (default "bfs") for
    breadth-first. *url_pattern* optionally restricts which links are
    followed. Streams results and returns [{"url", "markdown"}, ...]
    capped at *max_pages* pages.
    """
    from crawl4ai.deep_crawling.filters import FilterChain, URLPatternFilter

    # Restrict followed links when a pattern was supplied.
    filter_chain = (
        FilterChain(filters=[URLPatternFilter(patterns=[url_pattern])])
        if url_pattern
        else None
    )

    strategy_cls = DFSDeepCrawlStrategy if strategy == "dfs" else BFSDeepCrawlStrategy
    crawl_strategy = strategy_cls(
        max_depth=max_depth,
        max_pages=max_pages,
        filter_chain=filter_chain,
    )

    browser_cfg = BrowserConfig(
        headless=True,
        verbose=False,
        browser_type="chromium",
        enable_stealth=True,
        extra_args=["--disable-blink-features=AutomationControlled"],
    )
    run_cfg = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        deep_crawl_strategy=crawl_strategy,
        stream=True,  # consume results as they arrive
        verbose=False,
    )

    pages: list = []
    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        async for outcome in await crawler.arun(url=url, config=run_cfg):
            if not outcome.success:
                continue
            md = outcome.markdown
            text = md.raw_markdown if hasattr(md, "raw_markdown") else str(md)
            pages.append({"url": outcome.url, "markdown": text})
            if len(pages) >= max_pages:
                break

    return pages
302
+
303
+
304
async def map_action(url: str, search: Optional[str] = None, limit: int = 100) -> list:
    """Discover internal URLs reachable from *url*.

    Runs a shallow (depth-1) streamed crawl and collects internal links
    from each fetched page. When *search* is given, only links whose URL
    or title contains it (case-insensitive) are kept. Returns at most
    *limit* entries of {"url": ..., "title": ...}.

    Fixes: deduplication previously used `entry not in links` — an O(n)
    list scan per link (O(n²) overall) — and `link.get("href", link)`
    could emit the link dict itself as the "url" when "href" was missing.
    Now dedup is a set lookup and href-less links are skipped.
    """
    browser_config = BrowserConfig(
        headless=True,
        verbose=False,
        browser_type="chromium",
    )

    # Fetch a shallow crawl and collect links (faster than deep crawling
    # for mapping).
    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        verbose=False,
        deep_crawl_strategy=BFSDeepCrawlStrategy(max_depth=1, max_pages=limit),
        stream=True,
    )

    links: list[dict] = []
    # (url, title) pairs already collected — O(1) membership per link.
    seen: set[tuple[str, str]] = set()

    async with AsyncWebCrawler(config=browser_config) as crawler:
        async for result in await crawler.arun(url=url, config=run_config):
            if not (result.success and hasattr(result, "links") and result.links):
                continue
            for link in result.links.get("internal", []):
                if isinstance(link, dict):
                    entry = {
                        "url": link.get("href", ""),
                        "title": link.get("text", ""),
                    }
                else:
                    entry = {"url": link, "title": ""}
                key = (entry["url"], entry["title"])
                if entry["url"] and key not in seen:
                    seen.add(key)
                    links.append(entry)

    # Apply the optional case-insensitive substring filter.
    if search and links:
        needle = search.lower()
        links = [
            lnk
            for lnk in links
            if needle in lnk["url"].lower() or needle in lnk["title"].lower()
        ]

    return links[:limit]
347
+
348
+
349
async def main():
    """Entry point: read one JSON request from stdin, execute it, reply on stdout.

    Request shape: {"action": str, "params": dict}. Every outcome —
    including failures — goes through respond(), so stdout always carries
    exactly one JSON object.
    """
    try:
        request = json.loads(sys.stdin.read())

        action = request.get("action")
        params = request.get("params", {})

        # Map each action name to a callable that pulls its arguments out
        # of the params dict (defaults mirror each action's signature).
        dispatch = {
            "debug": lambda p: debug_action(),
            "fetch": lambda p: fetch_action(
                url=p["url"],
                format=p.get("format", "markdown"),
                wait_for=p.get("wait_for"),
                js_code=p.get("js_code"),
                timeout=p.get("timeout", 30),
            ),
            "search": lambda p: search_action(
                query=p["query"],
                limit=p.get("limit", 10),
            ),
            "extract": lambda p: extract_action(
                url=p["url"],
                schema=p["schema"],
            ),
            "screenshot": lambda p: screenshot_action(
                url=p["url"],
                width=p.get("width", 1280),
                height=p.get("height", 720),
            ),
            "crawl": lambda p: crawl_action(
                url=p["url"],
                max_pages=p.get("max_pages", 10),
                max_depth=p.get("max_depth", 2),
                strategy=p.get("strategy", "bfs"),
                url_pattern=p.get("url_pattern"),
            ),
            "map": lambda p: map_action(
                url=p["url"],
                search=p.get("search"),
                limit=p.get("limit", 100),
            ),
        }

        handler = dispatch.get(action)
        if handler is None:
            respond(False, error=f"Unknown action: {action}")
        else:
            respond(True, await handler(params))

    except json.JSONDecodeError as e:
        respond(False, error=f"Invalid JSON request: {e}")
    except Exception as e:
        respond(False, error=str(e))


if __name__ == "__main__":
    asyncio.run(main())