massgen 0.1.2__py3-none-any.whl → 0.1.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of massgen might be problematic.

Files changed (82)
  1. massgen/__init__.py +1 -1
  2. massgen/agent_config.py +33 -7
  3. massgen/api_params_handler/_api_params_handler_base.py +3 -0
  4. massgen/api_params_handler/_chat_completions_api_params_handler.py +4 -0
  5. massgen/api_params_handler/_claude_api_params_handler.py +4 -0
  6. massgen/api_params_handler/_gemini_api_params_handler.py +4 -0
  7. massgen/api_params_handler/_response_api_params_handler.py +4 -0
  8. massgen/backend/azure_openai.py +9 -1
  9. massgen/backend/base.py +4 -0
  10. massgen/backend/base_with_custom_tool_and_mcp.py +25 -5
  11. massgen/backend/claude_code.py +9 -1
  12. massgen/backend/docs/permissions_and_context_files.md +2 -2
  13. massgen/backend/gemini.py +35 -6
  14. massgen/backend/gemini_utils.py +30 -0
  15. massgen/backend/response.py +2 -0
  16. massgen/chat_agent.py +9 -3
  17. massgen/cli.py +291 -43
  18. massgen/config_builder.py +163 -18
  19. massgen/configs/README.md +69 -14
  20. massgen/configs/debug/restart_test_controlled.yaml +60 -0
  21. massgen/configs/debug/restart_test_controlled_filesystem.yaml +73 -0
  22. massgen/configs/tools/code-execution/docker_with_sudo.yaml +35 -0
  23. massgen/configs/tools/custom_tools/computer_use_browser_example.yaml +56 -0
  24. massgen/configs/tools/custom_tools/computer_use_docker_example.yaml +65 -0
  25. massgen/configs/tools/custom_tools/computer_use_example.yaml +50 -0
  26. massgen/configs/tools/custom_tools/crawl4ai_example.yaml +55 -0
  27. massgen/configs/tools/custom_tools/multimodal_tools/text_to_file_generation_multi.yaml +61 -0
  28. massgen/configs/tools/custom_tools/multimodal_tools/text_to_file_generation_single.yaml +29 -0
  29. massgen/configs/tools/custom_tools/multimodal_tools/text_to_image_generation_multi.yaml +51 -0
  30. massgen/configs/tools/custom_tools/multimodal_tools/text_to_image_generation_single.yaml +33 -0
  31. massgen/configs/tools/custom_tools/multimodal_tools/text_to_speech_generation_multi.yaml +55 -0
  32. massgen/configs/tools/custom_tools/multimodal_tools/text_to_speech_generation_single.yaml +33 -0
  33. massgen/configs/tools/custom_tools/multimodal_tools/text_to_video_generation_multi.yaml +47 -0
  34. massgen/configs/tools/custom_tools/multimodal_tools/text_to_video_generation_single.yaml +29 -0
  35. massgen/configs/tools/custom_tools/multimodal_tools/understand_audio.yaml +33 -0
  36. massgen/configs/tools/custom_tools/multimodal_tools/understand_file.yaml +34 -0
  37. massgen/configs/tools/custom_tools/multimodal_tools/understand_image.yaml +33 -0
  38. massgen/configs/tools/custom_tools/multimodal_tools/understand_video.yaml +34 -0
  39. massgen/configs/tools/custom_tools/multimodal_tools/youtube_video_analysis.yaml +59 -0
  40. massgen/docker/README.md +83 -0
  41. massgen/filesystem_manager/_code_execution_server.py +22 -7
  42. massgen/filesystem_manager/_docker_manager.py +21 -1
  43. massgen/filesystem_manager/_filesystem_manager.py +9 -0
  44. massgen/filesystem_manager/_path_permission_manager.py +148 -0
  45. massgen/filesystem_manager/_workspace_tools_server.py +0 -997
  46. massgen/formatter/_gemini_formatter.py +73 -0
  47. massgen/frontend/coordination_ui.py +175 -257
  48. massgen/frontend/displays/base_display.py +29 -0
  49. massgen/frontend/displays/rich_terminal_display.py +155 -9
  50. massgen/frontend/displays/simple_display.py +21 -0
  51. massgen/frontend/displays/terminal_display.py +22 -2
  52. massgen/logger_config.py +50 -6
  53. massgen/message_templates.py +283 -15
  54. massgen/orchestrator.py +335 -38
  55. massgen/tests/test_binary_file_blocking.py +274 -0
  56. massgen/tests/test_case_studies.md +12 -12
  57. massgen/tests/test_code_execution.py +178 -0
  58. massgen/tests/test_multimodal_size_limits.py +407 -0
  59. massgen/tests/test_orchestration_restart.py +204 -0
  60. massgen/tool/__init__.py +4 -0
  61. massgen/tool/_manager.py +7 -2
  62. massgen/tool/_multimodal_tools/image_to_image_generation.py +293 -0
  63. massgen/tool/_multimodal_tools/text_to_file_generation.py +455 -0
  64. massgen/tool/_multimodal_tools/text_to_image_generation.py +222 -0
  65. massgen/tool/_multimodal_tools/text_to_speech_continue_generation.py +226 -0
  66. massgen/tool/_multimodal_tools/text_to_speech_transcription_generation.py +217 -0
  67. massgen/tool/_multimodal_tools/text_to_video_generation.py +223 -0
  68. massgen/tool/_multimodal_tools/understand_audio.py +211 -0
  69. massgen/tool/_multimodal_tools/understand_file.py +555 -0
  70. massgen/tool/_multimodal_tools/understand_image.py +316 -0
  71. massgen/tool/_multimodal_tools/understand_video.py +340 -0
  72. massgen/tool/_web_tools/crawl4ai_tool.py +718 -0
  73. massgen/tool/docs/multimodal_tools.md +1368 -0
  74. massgen/tool/workflow_toolkits/__init__.py +26 -0
  75. massgen/tool/workflow_toolkits/post_evaluation.py +216 -0
  76. massgen/utils.py +1 -0
  77. {massgen-0.1.2.dist-info → massgen-0.1.4.dist-info}/METADATA +101 -69
  78. {massgen-0.1.2.dist-info → massgen-0.1.4.dist-info}/RECORD +82 -46
  79. {massgen-0.1.2.dist-info → massgen-0.1.4.dist-info}/WHEEL +0 -0
  80. {massgen-0.1.2.dist-info → massgen-0.1.4.dist-info}/entry_points.txt +0 -0
  81. {massgen-0.1.2.dist-info → massgen-0.1.4.dist-info}/licenses/LICENSE +0 -0
  82. {massgen-0.1.2.dist-info → massgen-0.1.4.dist-info}/top_level.txt +0 -0
massgen/tool/_web_tools/crawl4ai_tool.py (new file)
@@ -0,0 +1,718 @@
+ # -*- coding: utf-8 -*-
+ """
+ Crawl4AI Web Scraping Tools - Custom tool wrapper for crawl4ai REST API.
+
+ This module provides MassGen custom tools that wrap the crawl4ai Docker container's
+ REST API, providing powerful web scraping capabilities without MCP protocol overhead.
+
+ Available Tools:
+ - crawl4ai_md: Extract clean markdown from webpages
+ - crawl4ai_html: Get preprocessed HTML
+ - crawl4ai_screenshot: Capture webpage screenshots
+ - crawl4ai_pdf: Generate PDFs from webpages
+ - crawl4ai_execute_js: Run JavaScript on pages
+ - crawl4ai_crawl: Crawl multiple URLs
+ - crawl4ai_ask: Query crawl4ai library documentation
+
+ Prerequisites:
+ - Crawl4ai Docker container running at http://localhost:11235
+   Start with: docker run -d -p 11235:11235 --name crawl4ai --shm-size=1g unclecode/crawl4ai:latest
+ """
+
+ import json
+ from typing import List, Optional
+ from urllib.parse import urlparse
+
+ import httpx
+
+ from massgen.tool._result import ExecutionResult, TextContent
+
+ # Base URL for crawl4ai container
+ CRAWL4AI_BASE_URL = "http://localhost:11235"
+ DEFAULT_TIMEOUT = 60.0
+
+
+ def _validate_url(url: str) -> tuple[bool, str]:
+     """Validate that a URL is properly formatted.
+
+     Args:
+         url: URL to validate
+
+     Returns:
+         Tuple of (is_valid, error_message)
+     """
+     try:
+         parsed = urlparse(url)
+         if not parsed.scheme or parsed.scheme not in ("http", "https"):
+             return False, f"URL must use http or https protocol, got: {parsed.scheme or 'none'}"
+         if not parsed.netloc:
+             return False, "URL must have a valid domain"
+         return True, ""
+     except Exception as e:
+         return False, f"Invalid URL format: {str(e)}"
+
+
+ async def _check_url_accessible(url: str) -> tuple[bool, str, int]:
+     """Check if a URL is accessible via HEAD request.
+
+     Args:
+         url: URL to check
+
+     Returns:
+         Tuple of (is_accessible, error_message, status_code)
+     """
+     try:
+         async with httpx.AsyncClient(timeout=10.0, follow_redirects=True) as client:
+             response = await client.head(url)
+             if response.status_code >= 400:
+                 return False, f"URL returned error status {response.status_code}", response.status_code
+             return True, "", response.status_code
+     except httpx.ConnectError:
+         return False, "Could not connect to URL (connection refused or DNS error)", 0
+     except httpx.TimeoutException:
+         return False, "URL request timed out", 0
+     except Exception as e:
+         return False, f"Error checking URL: {str(e)}", 0
+
+
+ async def _check_docker_running() -> tuple[bool, str]:
+     """Check if the crawl4ai Docker container is running and accessible.
+
+     Returns:
+         Tuple of (is_running, error_message)
+     """
+     try:
+         async with httpx.AsyncClient(timeout=5.0) as client:
+             response = await client.get(f"{CRAWL4AI_BASE_URL}/health")
+             if response.status_code == 200:
+                 return True, ""
+             return False, f"crawl4ai container health check failed with status {response.status_code}"
+     except httpx.ConnectError:
+         return False, (
+             "crawl4ai Docker container is not running or not accessible at http://localhost:11235\n\n"
+             "To start the container, run:\n"
+             "  docker pull unclecode/crawl4ai:latest\n"
+             "  docker run -d -p 11235:11235 --name crawl4ai --shm-size=1g unclecode/crawl4ai:latest\n\n"
+             "To verify it's running:\n"
+             "  docker ps | grep crawl4ai"
+         )
+     except httpx.TimeoutException:
+         return False, "crawl4ai container is not responding (timeout). Check if the container is healthy."
+     except Exception as e:
+         return False, f"Error checking crawl4ai container: {str(e)}"
+
+
+ def require_docker(func):
+     """Decorator that checks that the Docker container is running before executing the function."""
+     from functools import wraps
+
+     @wraps(func)
+     async def wrapper(*args, **kwargs):
+         is_docker_running, docker_error = await _check_docker_running()
+         if not is_docker_running:
+             return ExecutionResult(
+                 output_blocks=[
+                     TextContent(
+                         data=json.dumps(
+                             {
+                                 "success": False,
+                                 "error": "Docker container not running",
+                                 "details": docker_error,
+                             },
+                             indent=2,
+                         ),
+                     ),
+                 ],
+             )
+         return await func(*args, **kwargs)
+
+     return wrapper
+
+
+ @require_docker
+ async def crawl4ai_md(
+     url: str,
+     filter_type: str = "fit",
+     query: Optional[str] = None,
+     agent_cwd: Optional[str] = None,
+ ) -> ExecutionResult:
+     """Extract clean markdown text content from a webpage.
+
+     PRIMARY TOOL for reading and understanding website content. Use this when you need to:
+     - Read articles, documentation, blog posts, or any text content
+     - Understand what a webpage says
+     - Extract information from a website
+     - Summarize web content
+
+     DO NOT use screenshot tools for reading content - use this tool instead.
+
+     Fetches the webpage and converts it to clean markdown ideal for LLM consumption.
+     Uses intelligent content filtering to extract only relevant text.
+
+     Args:
+         url: The webpage URL to scrape (must be an absolute http/https URL)
+         filter_type: Content filter strategy - "fit" (smart filtering, default),
+             "raw" (no filtering), "bm25" (keyword-based), "llm" (AI-powered)
+         query: Query string for BM25/LLM filters (optional)
+         agent_cwd: Agent's workspace directory (auto-injected)
+
+     Returns:
+         ExecutionResult containing:
+         - success: Whether the operation succeeded
+         - url: The scraped URL
+         - markdown: Clean markdown content
+         - filter: Filter strategy used
+
+     Examples:
+         >>> result = await crawl4ai_md("https://example.com")
+         >>> # Returns markdown of the page
+
+         >>> result = await crawl4ai_md("https://news.ycombinator.com", filter_type="bm25", query="AI safety")
+         >>> # Returns filtered content matching "AI safety"
+     """
+     # Validate URL format
+     is_valid, error_msg = _validate_url(url)
+     if not is_valid:
+         return ExecutionResult(
+             output_blocks=[
+                 TextContent(
+                     data=json.dumps(
+                         {
+                             "success": False,
+                             "error": f"Invalid URL: {error_msg}",
+                             "url": url,
+                         },
+                         indent=2,
+                     ),
+                 ),
+             ],
+         )
+
+     # Check if URL is accessible
+     is_accessible, access_error, status_code = await _check_url_accessible(url)
+     if not is_accessible:
+         return ExecutionResult(
+             output_blocks=[
+                 TextContent(
+                     data=json.dumps(
+                         {
+                             "success": False,
+                             "error": f"URL not accessible: {access_error}",
+                             "url": url,
+                             "status_code": status_code,
+                         },
+                         indent=2,
+                     ),
+                 ),
+             ],
+         )
+
+     try:
+         async with httpx.AsyncClient(timeout=DEFAULT_TIMEOUT) as client:
+             response = await client.post(
+                 f"{CRAWL4AI_BASE_URL}/md",
+                 json={
+                     "url": url,
+                     "f": filter_type,
+                     "q": query,
+                 },
+             )
+             response.raise_for_status()
+             data = response.json()
+
+         if data.get("success"):
+             result_data = {
+                 "success": True,
+                 "url": data.get("url"),
+                 "markdown": data.get("markdown"),
+                 "filter": data.get("filter"),
+             }
+         else:
+             result_data = {
+                 "success": False,
+                 "error": "Crawl failed",
+                 "url": url,
+             }
+
+         return ExecutionResult(
+             output_blocks=[TextContent(data=json.dumps(result_data, indent=2))],
+         )
+
+     except httpx.HTTPStatusError as e:
+         return ExecutionResult(
+             output_blocks=[
+                 TextContent(
+                     data=json.dumps(
+                         {
+                             "success": False,
+                             "error": f"HTTP error {e.response.status_code}: {e.response.reason_phrase}",
+                             "url": url,
+                             "status_code": e.response.status_code,
+                         },
+                         indent=2,
+                     ),
+                 ),
+             ],
+         )
+     except Exception as e:
+         return ExecutionResult(
+             output_blocks=[
+                 TextContent(
+                     data=json.dumps(
+                         {
+                             "success": False,
+                             "error": f"Failed to scrape URL: {str(e)}",
+                             "url": url,
+                         },
+                         indent=2,
+                     ),
+                 ),
+             ],
+         )
+
+
+ @require_docker
+ async def crawl4ai_html(
+     url: str,
+     agent_cwd: Optional[str] = None,
+ ) -> ExecutionResult:
+     """Extract preprocessed HTML from a webpage.
+
+     Fetches and preprocesses HTML, removing scripts/styles for cleaner
+     structure extraction. Useful for building schemas or parsing structured data.
+
+     Args:
+         url: The webpage URL to scrape
+         agent_cwd: Agent's workspace directory (auto-injected)
+
+     Returns:
+         ExecutionResult containing preprocessed HTML
+
+     Examples:
+         >>> result = await crawl4ai_html("https://example.com")
+         >>> # Returns cleaned HTML
+     """
+     try:
+         async with httpx.AsyncClient(timeout=DEFAULT_TIMEOUT) as client:
+             response = await client.post(
+                 f"{CRAWL4AI_BASE_URL}/html",
+                 json={"url": url},
+             )
+             response.raise_for_status()
+             data = response.json()
+
+         result_data = {
+             "success": True,
+             "url": url,
+             "html": data.get("html", ""),
+         }
+
+         return ExecutionResult(
+             output_blocks=[TextContent(data=json.dumps(result_data, indent=2))],
+         )
+
+     except Exception as e:
+         return ExecutionResult(
+             output_blocks=[
+                 TextContent(
+                     data=json.dumps(
+                         {"success": False, "error": str(e), "url": url},
+                         indent=2,
+                     ),
+                 ),
+             ],
+         )
+
+
+ @require_docker
+ async def crawl4ai_screenshot(
+     url: str,
+     wait_seconds: float = 2.0,
+     output_filename: Optional[str] = None,
+     agent_cwd: Optional[str] = None,
+ ) -> ExecutionResult:
+     """Capture a screenshot of a webpage.
+
+     Takes a full-page PNG screenshot after waiting for the page to load.
+     Saves to the agent's workspace if a filename is provided.
+     You should verify the webpage content either visually or via the HTML/markdown tools.
+
+     Args:
+         url: The webpage URL to screenshot
+         wait_seconds: Seconds to wait before capturing (default: 2.0)
+         output_filename: Optional filename to save in workspace (e.g., "screenshot.png")
+         agent_cwd: Agent's workspace directory (auto-injected)
+
+     Returns:
+         ExecutionResult with base64 screenshot or saved file path
+
+     Examples:
+         >>> result = await crawl4ai_screenshot("https://example.com")
+         >>> # Returns base64-encoded screenshot
+
+         >>> result = await crawl4ai_screenshot("https://example.com", output_filename="example.png")
+         >>> # Saves example.png to agent's workspace
+     """
+     import base64
+     from pathlib import Path
+
+     try:
+         # Always get base64 response (don't use output_path - that saves in container)
+         async with httpx.AsyncClient(timeout=DEFAULT_TIMEOUT) as client:
+             response = await client.post(
+                 f"{CRAWL4AI_BASE_URL}/screenshot",
+                 json={
+                     "url": url,
+                     "screenshot_wait_for": wait_seconds,
+                 },
+             )
+             response.raise_for_status()
+             data = response.json()
+
+         screenshot_b64 = data.get("screenshot")
+
+         if not screenshot_b64:
+             return ExecutionResult(
+                 output_blocks=[
+                     TextContent(
+                         data=json.dumps(
+                             {"success": False, "error": "No screenshot returned", "url": url},
+                             indent=2,
+                         ),
+                     ),
+                 ],
+             )
+
+         # If a filename is provided, save to the agent's workspace
+         if output_filename:
+             screenshot_data = base64.b64decode(screenshot_b64)
+
+             # Use agent_cwd if provided (auto-injected by MassGen)
+             if agent_cwd:
+                 workspace_dir = Path(agent_cwd)
+             else:
+                 # Fallback: use current directory if agent_cwd not provided
+                 workspace_dir = Path.cwd()
+
+             output_path = workspace_dir / output_filename
+             output_path.write_bytes(screenshot_data)
+
+             result_data = {
+                 "success": True,
+                 "url": url,
+                 "saved_to": str(output_path),
+                 "filename": output_filename,
+             }
+         else:
+             result_data = {
+                 "success": True,
+                 "url": url,
+                 "screenshot_base64": screenshot_b64[:100] + "...",  # Preview
+                 "note": "Provide output_filename parameter to save to workspace",
+             }
+
+         return ExecutionResult(
+             output_blocks=[TextContent(data=json.dumps(result_data, indent=2))],
+         )
+
+     except Exception as e:
+         return ExecutionResult(
+             output_blocks=[
+                 TextContent(
+                     data=json.dumps(
+                         {"success": False, "error": str(e), "url": url},
+                         indent=2,
+                     ),
+                 ),
+             ],
+         )
+
+
428
+
429
+ @require_docker
430
+ async def crawl4ai_pdf(
431
+ url: str,
432
+ output_filename: Optional[str] = None,
433
+ agent_cwd: Optional[str] = None,
434
+ ) -> ExecutionResult:
435
+ """Generate a PDF from a webpage.
436
+
437
+ Creates a PDF document of the rendered page. Useful for archival
438
+ or generating printable versions. Saves to agent's workspace if filename provided.
439
+
440
+ Args:
441
+ url: The webpage URL to convert to PDF
442
+ output_filename: Optional filename to save in workspace (e.g., "page.pdf")
443
+ agent_cwd: Agent's workspace directory (auto-injected by MassGen)
444
+
445
+ Returns:
446
+ ExecutionResult with saved file path
447
+
448
+ Examples:
449
+ >>> result = await crawl4ai_pdf("https://example.com", output_filename="example.pdf")
450
+ >>> # Saves example.pdf to agent's workspace
451
+ """
452
+ import base64
453
+ from pathlib import Path
454
+
455
+ try:
456
+ # Always get base64 response
457
+ async with httpx.AsyncClient(timeout=DEFAULT_TIMEOUT) as client:
458
+ response = await client.post(
459
+ f"{CRAWL4AI_BASE_URL}/pdf",
460
+ json={"url": url},
461
+ )
462
+ response.raise_for_status()
463
+ data = response.json()
464
+
465
+ pdf_b64 = data.get("pdf")
466
+
467
+ if not pdf_b64:
468
+ return ExecutionResult(
469
+ output_blocks=[
470
+ TextContent(
471
+ data=json.dumps(
472
+ {"success": False, "error": "No PDF returned", "url": url},
473
+ indent=2,
474
+ ),
475
+ ),
476
+ ],
477
+ )
478
+
479
+ # If filename provided, save to agent's workspace
480
+ if output_filename:
481
+ pdf_data = base64.b64decode(pdf_b64)
482
+
483
+ # Use agent_cwd if provided (auto-injected by MassGen)
484
+ if agent_cwd:
485
+ workspace_dir = Path(agent_cwd)
486
+ else:
487
+ # Fallback: use current directory if agent_cwd not provided
488
+ workspace_dir = Path.cwd()
489
+
490
+ output_path = workspace_dir / output_filename
491
+ output_path.write_bytes(pdf_data)
492
+
493
+ result_data = {
494
+ "success": True,
495
+ "url": url,
496
+ "saved_to": str(output_path),
497
+ "filename": output_filename,
498
+ }
499
+ else:
500
+ result_data = {
501
+ "success": True,
502
+ "url": url,
503
+ "pdf_size_bytes": len(base64.b64decode(pdf_b64)),
504
+ "note": "Provide output_filename parameter to save to workspace",
505
+ }
506
+
507
+ return ExecutionResult(
508
+ output_blocks=[TextContent(data=json.dumps(result_data, indent=2))],
509
+ )
510
+
511
+ except Exception as e:
512
+ return ExecutionResult(
513
+ output_blocks=[
514
+ TextContent(
515
+ data=json.dumps(
516
+ {"success": False, "error": str(e), "url": url},
517
+ indent=2,
518
+ ),
519
+ ),
520
+ ],
521
+ )
522
+
523
+
+ @require_docker
+ async def crawl4ai_execute_js(
+     url: str,
+     scripts: List[str],
+     agent_cwd: Optional[str] = None,
+ ) -> ExecutionResult:
+     """Execute JavaScript on a webpage and return results.
+
+     Runs custom JavaScript in the page context. Each script should be
+     an expression that returns a value (it can be an IIFE or an async function).
+     Returns the full CrawlResult including markdown, links, and script outputs.
+
+     Args:
+         url: The webpage URL to execute scripts on
+         scripts: List of JavaScript code snippets to execute in order
+         agent_cwd: Agent's workspace directory (auto-injected)
+
+     Returns:
+         ExecutionResult with script execution results and page content
+
+     Examples:
+         >>> result = await crawl4ai_execute_js(
+         ...     "https://example.com",
+         ...     ["document.title", "document.links.length"]
+         ... )
+         >>> # Returns page title and number of links
+
+         >>> result = await crawl4ai_execute_js(
+         ...     "https://example.com",
+         ...     ["(async () => { await someAsyncOperation(); return result; })()"]
+         ... )
+         >>> # Executes async JavaScript
+     """
+     try:
+         async with httpx.AsyncClient(timeout=DEFAULT_TIMEOUT) as client:
+             response = await client.post(
+                 f"{CRAWL4AI_BASE_URL}/execute_js",
+                 json={
+                     "url": url,
+                     "scripts": scripts,
+                 },
+             )
+             response.raise_for_status()
+             data = response.json()
+
+         # Extract key information from the CrawlResult
+         result_data = {
+             "success": data.get("success", True),
+             "url": data.get("url"),
+             "markdown": data.get("markdown"),
+             "js_execution_result": data.get("js_execution_result"),
+             "links": data.get("links"),
+         }
+
+         return ExecutionResult(
+             output_blocks=[TextContent(data=json.dumps(result_data, indent=2))],
+         )
+
+     except Exception as e:
+         return ExecutionResult(
+             output_blocks=[
+                 TextContent(
+                     data=json.dumps(
+                         {"success": False, "error": str(e), "url": url},
+                         indent=2,
+                     ),
+                 ),
+             ],
+         )
+
+
593
+
594
+ @require_docker
595
+ async def crawl4ai_crawl(
596
+ urls: List[str],
597
+ max_urls: int = 100,
598
+ agent_cwd: Optional[str] = None,
599
+ ) -> ExecutionResult:
600
+ """Crawl multiple URLs in parallel.
601
+
602
+ Efficiently scrapes multiple pages concurrently. Returns results
603
+ for all URLs. Limited to 100 URLs per request.
604
+
605
+ Args:
606
+ urls: List of URLs to crawl (max 100)
607
+ max_urls: Maximum number of URLs to process (default: 100)
608
+
609
+ Returns:
610
+ ExecutionResult with results for all crawled URLs
611
+
612
+ Examples:
613
+ >>> result = await crawl4ai_crawl([
614
+ ... "https://example.com",
615
+ ... "https://example.org",
616
+ ... "https://example.net",
617
+ ... ])
618
+ >>> # Returns markdown and metadata for all pages
619
+ """
620
+ try:
621
+ # Limit URLs to prevent overload
622
+ urls_to_crawl = urls[: min(len(urls), max_urls, 100)]
623
+
624
+ async with httpx.AsyncClient(timeout=DEFAULT_TIMEOUT * 3) as client:
625
+ response = await client.post(
626
+ f"{CRAWL4AI_BASE_URL}/crawl",
627
+ json={"urls": urls_to_crawl},
628
+ )
629
+ response.raise_for_status()
630
+ data = response.json()
631
+
632
+ result_data = {
633
+ "success": True,
634
+ "total_urls": len(urls_to_crawl),
635
+ "results": data.get("results", []),
636
+ }
637
+
638
+ return ExecutionResult(
639
+ output_blocks=[TextContent(data=json.dumps(result_data, indent=2))],
640
+ )
641
+
642
+ except Exception as e:
643
+ return ExecutionResult(
644
+ output_blocks=[
645
+ TextContent(
646
+ data=json.dumps(
647
+ {
648
+ "success": False,
649
+ "error": str(e),
650
+ "urls": urls[:5], # Show first 5 for debugging
651
+ },
652
+ indent=2,
653
+ ),
654
+ ),
655
+ ],
656
+ )
657
+
658
+
659
+ # async def crawl4ai_ask(
660
+ # query: str,
661
+ # context_type: str = "all",
662
+ # max_results: int = 20,
663
+ # ) -> ExecutionResult:
664
+ # """Query the Crawl4AI library documentation and code context.
665
+
666
+ # Searches crawl4ai documentation using BM25 search. Useful for
667
+ # learning about library features or getting code examples.
668
+
669
+ # Args:
670
+ # query: Search query (recommended, leave empty for all context)
671
+ # context_type: Type of context - "code", "doc", or "all" (default: "all")
672
+ # max_results: Maximum number of results (default: 20)
673
+
674
+ # Returns:
675
+ # ExecutionResult with relevant documentation snippets
676
+
677
+ # Examples:
678
+ # >>> result = await crawl4ai_ask("How do I extract structured data?")
679
+ # >>> # Returns documentation about data extraction
680
+
681
+ # >>> result = await crawl4ai_ask("JavaScript execution", context_type="code")
682
+ # >>> # Returns code examples for JS execution
683
+ # """
684
+ # try:
685
+ # async with httpx.AsyncClient(timeout=DEFAULT_TIMEOUT) as client:
686
+ # response = await client.get(
687
+ # f"{CRAWL4AI_BASE_URL}/ask",
688
+ # params={
689
+ # "query": query,
690
+ # "context_type": context_type,
691
+ # "max_results": max_results,
692
+ # }
693
+ # )
694
+ # response.raise_for_status()
695
+ # data = response.json()
696
+
697
+ # result_data = {
698
+ # "success": True,
699
+ # "query": query,
700
+ # "context_type": context_type,
701
+ # "results": data.get("results", data), # Flexible result format
702
+ # }
703
+
704
+ # return ExecutionResult(
705
+ # output_blocks=[TextContent(data=json.dumps(result_data, indent=2))],
706
+ # )
707
+
708
+ # except Exception as e:
709
+ # return ExecutionResult(
710
+ # output_blocks=[
711
+ # TextContent(
712
+ # data=json.dumps(
713
+ # {"success": False, "error": str(e), "query": query},
714
+ # indent=2,
715
+ # )
716
+ # )
717
+ # ],
718
+ # )
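
For orientation, here is a minimal usage sketch for the new module, assuming the crawl4ai container is already running at http://localhost:11235 (the module docstring above gives the docker run command), that massgen 0.1.4 is installed locally, and that ExecutionResult exposes its output_blocks as an attribute (its constructor above takes them as a keyword argument):

    # Minimal sketch: call the new crawl4ai tools directly, outside the MassGen orchestrator.
    import asyncio

    from massgen.tool._web_tools.crawl4ai_tool import crawl4ai_md, crawl4ai_screenshot

    async def main() -> None:
        # Markdown extraction; the @require_docker wrapper returns a structured
        # JSON error block instead of raising if the container is unreachable.
        result = await crawl4ai_md("https://example.com", filter_type="fit")
        for block in result.output_blocks:
            print(block.data)

        # Screenshot saved relative to the current directory, since no agent_cwd
        # is passed here and the Path.cwd() fallback applies.
        await crawl4ai_screenshot("https://example.com", output_filename="example.png")

    asyncio.run(main())

In normal use these functions are registered as MassGen custom tools via YAML (see massgen/configs/tools/custom_tools/crawl4ai_example.yaml in the file list above) rather than imported directly.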