aiecs-1.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of aiecs might be problematic.
Files changed (90)
  1. aiecs/__init__.py +75 -0
  2. aiecs/__main__.py +41 -0
  3. aiecs/aiecs_client.py +295 -0
  4. aiecs/application/__init__.py +10 -0
  5. aiecs/application/executors/__init__.py +10 -0
  6. aiecs/application/executors/operation_executor.py +341 -0
  7. aiecs/config/__init__.py +15 -0
  8. aiecs/config/config.py +117 -0
  9. aiecs/config/registry.py +19 -0
  10. aiecs/core/__init__.py +46 -0
  11. aiecs/core/interface/__init__.py +34 -0
  12. aiecs/core/interface/execution_interface.py +150 -0
  13. aiecs/core/interface/storage_interface.py +214 -0
  14. aiecs/domain/__init__.py +20 -0
  15. aiecs/domain/context/__init__.py +28 -0
  16. aiecs/domain/context/content_engine.py +982 -0
  17. aiecs/domain/context/conversation_models.py +306 -0
  18. aiecs/domain/execution/__init__.py +12 -0
  19. aiecs/domain/execution/model.py +49 -0
  20. aiecs/domain/task/__init__.py +13 -0
  21. aiecs/domain/task/dsl_processor.py +460 -0
  22. aiecs/domain/task/model.py +50 -0
  23. aiecs/domain/task/task_context.py +257 -0
  24. aiecs/infrastructure/__init__.py +26 -0
  25. aiecs/infrastructure/messaging/__init__.py +13 -0
  26. aiecs/infrastructure/messaging/celery_task_manager.py +341 -0
  27. aiecs/infrastructure/messaging/websocket_manager.py +289 -0
  28. aiecs/infrastructure/monitoring/__init__.py +12 -0
  29. aiecs/infrastructure/monitoring/executor_metrics.py +138 -0
  30. aiecs/infrastructure/monitoring/structured_logger.py +50 -0
  31. aiecs/infrastructure/monitoring/tracing_manager.py +376 -0
  32. aiecs/infrastructure/persistence/__init__.py +12 -0
  33. aiecs/infrastructure/persistence/database_manager.py +286 -0
  34. aiecs/infrastructure/persistence/file_storage.py +671 -0
  35. aiecs/infrastructure/persistence/redis_client.py +162 -0
  36. aiecs/llm/__init__.py +54 -0
  37. aiecs/llm/base_client.py +99 -0
  38. aiecs/llm/client_factory.py +339 -0
  39. aiecs/llm/custom_callbacks.py +228 -0
  40. aiecs/llm/openai_client.py +125 -0
  41. aiecs/llm/vertex_client.py +186 -0
  42. aiecs/llm/xai_client.py +184 -0
  43. aiecs/main.py +351 -0
  44. aiecs/scripts/DEPENDENCY_SYSTEM_SUMMARY.md +241 -0
  45. aiecs/scripts/README_DEPENDENCY_CHECKER.md +309 -0
  46. aiecs/scripts/README_WEASEL_PATCH.md +126 -0
  47. aiecs/scripts/__init__.py +3 -0
  48. aiecs/scripts/dependency_checker.py +825 -0
  49. aiecs/scripts/dependency_fixer.py +348 -0
  50. aiecs/scripts/download_nlp_data.py +348 -0
  51. aiecs/scripts/fix_weasel_validator.py +121 -0
  52. aiecs/scripts/fix_weasel_validator.sh +82 -0
  53. aiecs/scripts/patch_weasel_library.sh +188 -0
  54. aiecs/scripts/quick_dependency_check.py +269 -0
  55. aiecs/scripts/run_weasel_patch.sh +41 -0
  56. aiecs/scripts/setup_nlp_data.sh +217 -0
  57. aiecs/tasks/__init__.py +2 -0
  58. aiecs/tasks/worker.py +111 -0
  59. aiecs/tools/__init__.py +196 -0
  60. aiecs/tools/base_tool.py +202 -0
  61. aiecs/tools/langchain_adapter.py +361 -0
  62. aiecs/tools/task_tools/__init__.py +82 -0
  63. aiecs/tools/task_tools/chart_tool.py +704 -0
  64. aiecs/tools/task_tools/classfire_tool.py +901 -0
  65. aiecs/tools/task_tools/image_tool.py +397 -0
  66. aiecs/tools/task_tools/office_tool.py +600 -0
  67. aiecs/tools/task_tools/pandas_tool.py +565 -0
  68. aiecs/tools/task_tools/report_tool.py +499 -0
  69. aiecs/tools/task_tools/research_tool.py +363 -0
  70. aiecs/tools/task_tools/scraper_tool.py +548 -0
  71. aiecs/tools/task_tools/search_api.py +7 -0
  72. aiecs/tools/task_tools/stats_tool.py +513 -0
  73. aiecs/tools/temp_file_manager.py +126 -0
  74. aiecs/tools/tool_executor/__init__.py +35 -0
  75. aiecs/tools/tool_executor/tool_executor.py +518 -0
  76. aiecs/utils/LLM_output_structor.py +409 -0
  77. aiecs/utils/__init__.py +23 -0
  78. aiecs/utils/base_callback.py +50 -0
  79. aiecs/utils/execution_utils.py +158 -0
  80. aiecs/utils/logging.py +1 -0
  81. aiecs/utils/prompt_loader.py +13 -0
  82. aiecs/utils/token_usage_repository.py +279 -0
  83. aiecs/ws/__init__.py +0 -0
  84. aiecs/ws/socket_server.py +41 -0
  85. aiecs-1.0.0.dist-info/METADATA +610 -0
  86. aiecs-1.0.0.dist-info/RECORD +90 -0
  87. aiecs-1.0.0.dist-info/WHEEL +5 -0
  88. aiecs-1.0.0.dist-info/entry_points.txt +7 -0
  89. aiecs-1.0.0.dist-info/licenses/LICENSE +225 -0
  90. aiecs-1.0.0.dist-info/top_level.txt +1 -0
aiecs/tools/task_tools/scraper_tool.py
@@ -0,0 +1,548 @@
+ import os
+ import json
+ import time
+ import logging
+ import asyncio
+ import tempfile
+ import subprocess
+ from typing import Dict, Any, List, Optional, Union, Tuple
+ from enum import Enum
+ from urllib.parse import urlparse, urljoin
+
+ import httpx
+ from bs4 import BeautifulSoup
+ from urllib import request as urllib_request
+ from pydantic import BaseModel, ValidationError, ConfigDict
+ from pydantic_settings import BaseSettings
+
+ from aiecs.tools.base_tool import BaseTool
+ from aiecs.tools import register_tool
+
+ # Enums for configuration options
+ class HttpMethod(str, Enum):
+     GET = "get"
+     POST = "post"
+     PUT = "put"
+     DELETE = "delete"
+     HEAD = "head"
+     OPTIONS = "options"
+     PATCH = "patch"
+
+ class ContentType(str, Enum):
+     HTML = "html"
+     JSON = "json"
+     TEXT = "text"
+     BINARY = "binary"
+
+ class OutputFormat(str, Enum):
+     TEXT = "text"
+     JSON = "json"
+     HTML = "html"
+     MARKDOWN = "markdown"
+     CSV = "csv"
+
+ class RenderEngine(str, Enum):
+     NONE = "none"
+     PLAYWRIGHT = "playwright"
+
+ # Global settings
+ class ScraperSettings(BaseSettings):
+     """
+     Configuration for ScraperTool.
+
+     Attributes:
+         user_agent (str): User agent for HTTP requests.
+         max_content_length (int): Maximum content length in bytes.
+         output_dir (str): Directory for output files.
+         scrapy_command (str): Command to run Scrapy.
+         allowed_domains (List[str]): Allowed domains for scraping.
+         blocked_domains (List[str]): Blocked domains for scraping.
+         playwright_available (bool): Whether Playwright is available.
+         env_prefix (str): Environment variable prefix.
+     """
+     user_agent: str = "PythonMiddlewareScraper/2.0"
+     max_content_length: int = 10 * 1024 * 1024  # 10MB
+     output_dir: str = os.path.join(tempfile.gettempdir(), 'scraper_outputs')
+     scrapy_command: str = "scrapy"
+     allowed_domains: List[str] = []
+     blocked_domains: List[str] = []
+     playwright_available: bool = False
+     env_prefix: str = "SCRAPER_TOOL_"
+
+     model_config = ConfigDict(env_prefix="SCRAPER_TOOL_")
+
+ # Exceptions
+ class ScraperToolError(Exception):
+     """Base exception for ScraperTool errors."""
+     pass
+
+ class HttpError(ScraperToolError):
+     """Raised when HTTP requests fail."""
+     pass
+
+ class TimeoutError(ScraperToolError):
+     """Raised when operations time out."""
+     pass
+
+ class RateLimitError(ScraperToolError):
+     """Raised when rate limits are exceeded."""
+     pass
+
+ class ParsingError(ScraperToolError):
+     """Raised when HTML parsing fails."""
+     pass
+
+ class RenderingError(ScraperToolError):
+     """Raised when rendering fails."""
+     pass
+
+ class ExternalToolError(ScraperToolError):
+     """Raised when external tools fail."""
+     pass
+
+ class FileOperationError(ScraperToolError):
+     """Raised when file operations fail."""
+     pass
+
+ @register_tool("scraper")
+ class ScraperTool(BaseTool):
+     """
+     Enhanced web scraping tool with multiple HTTP clients, JavaScript rendering,
+     HTML parsing, and security features.
+
+     Features:
+     - Multiple HTTP clients: httpx, urllib
+     - JavaScript rendering with Playwright or Selenium
+     - HTML parsing with BeautifulSoup
+     - Scrapy integration for advanced crawling
+     - Output in various formats: text, JSON, HTML, Markdown, CSV
+     """
+     def __init__(self, config: Optional[Dict] = None):
+         """
+         Initialize ScraperTool with settings and resources.
+
+         Args:
+             config (Dict, optional): Configuration overrides for ScraperSettings.
+
+         Raises:
+             ValueError: If config contains invalid settings.
+         """
+         super().__init__(config)
+         self.settings = ScraperSettings()
+         if config:
+             try:
+                 self.settings = self.settings.model_validate({**self.settings.model_dump(), **config})
+             except ValidationError as e:
+                 raise ValueError(f"Invalid settings: {e}")
+         self.logger = logging.getLogger(__name__)
+         if not self.logger.handlers:
+             handler = logging.StreamHandler()
+             handler.setFormatter(logging.Formatter('%(asctime)s %(levelname)s %(message)s'))
+             self.logger.addHandler(handler)
+         self.logger.setLevel(logging.INFO)
+         os.makedirs(self.settings.output_dir, exist_ok=True)
+         self._check_external_tools()
+
+     def _check_external_tools(self):
+         """Check if external tools are available."""
+         try:
+             import playwright
+             self.settings.playwright_available = True
+         except ImportError:
+             self.settings.playwright_available = False
+
+
+     async def _save_output(self, content: Any, path: str, format: OutputFormat) -> None:
+         """Save content to file in the specified format."""
+         try:
+             os.makedirs(os.path.dirname(os.path.abspath(path)), exist_ok=True)
+             if format == OutputFormat.TEXT:
+                 with open(path, 'w', encoding='utf-8') as f:
+                     if isinstance(content, dict):
+                         f.write(json.dumps(content, indent=2))
+                     else:
+                         f.write(str(content))
+             elif format == OutputFormat.JSON:
+                 with open(path, 'w', encoding='utf-8') as f:
+                     if isinstance(content, dict):
+                         json.dump(content, f, indent=2)
+                     else:
+                         json.dump({"content": content}, f, indent=2)
+             elif format == OutputFormat.HTML:
+                 with open(path, 'w', encoding='utf-8') as f:
+                     if isinstance(content, dict) and 'html' in content:
+                         f.write(content['html'])
+                     else:
+                         f.write(str(content))
+             elif format == OutputFormat.MARKDOWN:
+                 with open(path, 'w', encoding='utf-8') as f:
+                     if isinstance(content, dict):
+                         f.write("# Scraper Results\n\n")
+                         for key, value in content.items():
+                             f.write(f"## {key}\n\n")
+                             f.write(f"{value}\n\n")
+                     else:
+                         f.write("# Scraper Results\n\n")
+                         f.write(str(content))
+             elif format == OutputFormat.CSV:
+                 import csv
+                 with open(path, 'w', newline='', encoding='utf-8') as f:
+                     if isinstance(content, dict):
+                         writer = csv.writer(f)
+                         writer.writerow(content.keys())
+                         writer.writerow(content.values())
+                     elif isinstance(content, list) and all(isinstance(item, dict) for item in content):
+                         if content:
+                             writer = csv.DictWriter(f, fieldnames=content[0].keys())
+                             writer.writeheader()
+                             writer.writerows(content)
+                     else:
+                         writer = csv.writer(f)
+                         writer.writerow(["content"])
+                         writer.writerow([str(content)])
+         except Exception as e:
+             raise FileOperationError(f"Error saving output: {str(e)}")
+
+     async def get_httpx(self, url: str, method: HttpMethod = HttpMethod.GET, params: Optional[Dict[str, str]] = None, data: Optional[Dict[str, Any]] = None, json_data: Optional[Dict[str, Any]] = None, cookies: Optional[Dict[str, str]] = None, auth: Optional[Tuple[str, str]] = None, verify_ssl: Optional[bool] = None, allow_redirects: bool = True, content_type: ContentType = ContentType.TEXT, headers: Optional[Dict[str, str]] = None, output_format: Optional[OutputFormat] = None, output_path: Optional[str] = None, async_mode: bool = True) -> Any:
+         """
+         Execute HTTP request using httpx library (supports both sync and async).
+
+         Args:
+             url (str): URL to scrape.
+             method (HttpMethod): HTTP method to use.
+             params (Optional[Dict[str, str]]): Query parameters.
+             data (Optional[Dict[str, Any]]): Form data.
+             json_data (Optional[Dict[str, Any]]): JSON data.
+             cookies (Optional[Dict[str, str]]): Cookies.
+             auth (Optional[Tuple[str, str]]): Authentication credentials.
+             verify_ssl (Optional[bool]): Verify SSL certificates.
+             allow_redirects (bool): Allow redirects.
+             content_type (ContentType): Expected content type.
+             headers (Optional[Dict[str, str]]): Custom headers.
+             output_format (Optional[OutputFormat]): Output format.
+             output_path (Optional[str]): Path to save output.
+             async_mode (bool): Whether to use async client.
+
+         Returns:
+             Any: Scraped content (dict, str, or bytes).
+
+         Raises:
+             HttpError: If the request fails.
+         """
+         try:
+             headers = headers or {}
+             if 'User-Agent' not in headers:
+                 headers['User-Agent'] = self.settings.user_agent
+             kwargs = {
+                 'params': params,
+                 'headers': headers,
+                 'follow_redirects': allow_redirects,
+             }
+             if auth:
+                 kwargs['auth'] = auth
+             if cookies:
+                 kwargs['cookies'] = cookies
+             if json_data:
+                 kwargs['json'] = json_data
+             elif data:
+                 kwargs['data'] = data
+
+             if async_mode:
+                 async with httpx.AsyncClient(verify=verify_ssl if verify_ssl is not None else True) as client:
+                     method_fn = getattr(client, method.value)
+                     resp = await method_fn(str(url), **kwargs)
+             else:
+                 with httpx.Client(verify=verify_ssl if verify_ssl is not None else True) as client:
+                     method_fn = getattr(client, method.value)
+                     resp = method_fn(str(url), **kwargs)
+
+             try:
+                 resp.raise_for_status()
+             except httpx.HTTPStatusError as e:
+                 raise HttpError(f"HTTP {e.response.status_code}: {e.response.reason_phrase} for {url}")
+
+             if len(resp.content) > self.settings.max_content_length:
+                 raise HttpError(f"Response content too large: {len(resp.content)} bytes")
+
+             if content_type == ContentType.JSON:
+                 result = resp.json()
+             elif content_type == ContentType.HTML:
+                 result = {'html': resp.text, 'url': str(resp.url), 'status': resp.status_code}
+             elif content_type == ContentType.BINARY:
+                 result = {'content': resp.content, 'url': str(resp.url), 'status': resp.status_code}
+             else:
+                 result = resp.text
+
+             if output_format and output_path:
+                 await self._save_output(result, output_path, output_format)
+                 if isinstance(result, dict):
+                     result['saved_to'] = output_path
+                 else:
+                     result = {'content': result, 'saved_to': output_path}
+             return result
+         except httpx.RequestError as e:
+             raise HttpError(f"Request failed: {str(e)}")
+
+     async def get_urllib(self, url: str, method: HttpMethod = HttpMethod.GET, data: Optional[Dict[str, Any]] = None, content_type: ContentType = ContentType.TEXT, headers: Optional[Dict[str, str]] = None, output_format: Optional[OutputFormat] = None, output_path: Optional[str] = None) -> Any:
+         """
+         Execute HTTP request using urllib.
+
+         Args:
+             url (str): URL to scrape.
+             method (HttpMethod): HTTP method to use.
+             data (Optional[Dict[str, Any]]): Form data.
+             content_type (ContentType): Expected content type.
+             headers (Optional[Dict[str, str]]): Custom headers.
+             output_format (Optional[OutputFormat]): Output format.
+             output_path (Optional[str]): Path to save output.
+
+         Returns:
+             Any: Scraped content (dict, str, or bytes).
+
+         Raises:
+             HttpError: If the request fails.
+         """
+         try:
+             import urllib.parse
+             import urllib.error
+
+             headers = headers or {}
+             if 'User-Agent' not in headers:
+                 headers['User-Agent'] = self.settings.user_agent
+             data_bytes = None
+             if data:
+                 data_bytes = urllib.parse.urlencode(data).encode()
+             req = urllib_request.Request(
+                 str(url),
+                 data=data_bytes,
+                 headers=headers,
+                 method=method.value.upper()
+             )
+             with urllib_request.urlopen(req) as resp:
+                 content_length = resp.getheader('Content-Length')
+                 if content_length and int(content_length) > self.settings.max_content_length:
+                     raise HttpError(f"Response content too large: {content_length} bytes")
+                 content = resp.read()
+                 charset = resp.headers.get_content_charset() or 'utf-8'
+                 if content_type == ContentType.JSON:
+                     result = json.loads(content.decode(charset, errors='ignore'))
+                 elif content_type == ContentType.HTML:
+                     result = {'html': content.decode(charset, errors='ignore'), 'url': resp.url, 'status': resp.status}
+                 elif content_type == ContentType.BINARY:
+                     result = {'content': content, 'url': resp.url, 'status': resp.status}
+                 else:
+                     result = content.decode(charset, errors='ignore')
+             if output_format and output_path:
+                 await self._save_output(result, output_path, output_format)
+                 if isinstance(result, dict):
+                     result['saved_to'] = output_path
+                 else:
+                     result = {'content': result, 'saved_to': output_path}
+             return result
+         except urllib.error.URLError as e:
+             raise HttpError(f"Request failed: {str(e)}")
+
+     # Legacy method names for backward compatibility
+     async def get_requests(self, url: str, method: HttpMethod = HttpMethod.GET, params: Optional[Dict[str, str]] = None, data: Optional[Dict[str, Any]] = None, json_data: Optional[Dict[str, Any]] = None, cookies: Optional[Dict[str, str]] = None, auth: Optional[Tuple[str, str]] = None, verify_ssl: Optional[bool] = None, allow_redirects: bool = True, content_type: ContentType = ContentType.TEXT, headers: Optional[Dict[str, str]] = None, output_format: Optional[OutputFormat] = None, output_path: Optional[str] = None) -> Any:
+         """Legacy method - now uses httpx in sync mode."""
+         return await self.get_httpx(url, method, params, data, json_data, cookies, auth, verify_ssl, allow_redirects, content_type, headers, output_format, output_path, async_mode=False)
+
+     async def get_aiohttp(self, url: str, method: HttpMethod = HttpMethod.GET, params: Optional[Dict[str, str]] = None, data: Optional[Dict[str, Any]] = None, json_data: Optional[Dict[str, Any]] = None, cookies: Optional[Dict[str, str]] = None, auth: Optional[Tuple[str, str]] = None, verify_ssl: Optional[bool] = None, allow_redirects: bool = True, content_type: ContentType = ContentType.TEXT, headers: Optional[Dict[str, str]] = None, output_format: Optional[OutputFormat] = None, output_path: Optional[str] = None) -> Any:
+         """Legacy method - now uses httpx in async mode."""
+         return await self.get_httpx(url, method, params, data, json_data, cookies, auth, verify_ssl, allow_redirects, content_type, headers, output_format, output_path, async_mode=True)
+
+     async def render(self, url: str, engine: RenderEngine = RenderEngine.PLAYWRIGHT, wait_time: int = 5, wait_selector: Optional[str] = None, scroll_to_bottom: bool = False, screenshot: bool = False, screenshot_path: Optional[str] = None, headers: Optional[Dict[str, str]] = None, output_format: Optional[OutputFormat] = None, output_path: Optional[str] = None) -> Dict[str, Any]:
+         """
+         Render a web page using a headless browser (Playwright).
+
+         Args:
+             url (str): URL to render.
+             engine (RenderEngine): Rendering engine to use (only PLAYWRIGHT supported).
+             wait_time (int): Time to wait for JS execution.
+             wait_selector (Optional[str]): CSS selector to wait for.
+             scroll_to_bottom (bool): Whether to scroll to the bottom of the page.
+             screenshot (bool): Whether to take a screenshot.
+             screenshot_path (Optional[str]): Path to save the screenshot.
+             headers (Optional[Dict[str, str]]): Custom headers.
+             output_format (Optional[OutputFormat]): Output format.
+             output_path (Optional[str]): Path to save output.
+
+         Returns:
+             Dict[str, Any]: Rendered page content {'html': str, 'title': str, 'url': str, 'screenshot': Optional[str]}.
+
+         Raises:
+             RenderingError: If rendering fails.
+         """
+         try:
+             if engine == RenderEngine.PLAYWRIGHT:
+                 if not self.settings.playwright_available:
+                     raise RenderingError("Playwright is not available. Install with 'pip install playwright'")
+                 result = await self._render_with_playwright(url, wait_time, wait_selector, scroll_to_bottom, screenshot, screenshot_path)
+             else:
+                 raise RenderingError(f"Unsupported rendering engine: {engine}. Only PLAYWRIGHT is supported.")
+             if output_format and output_path:
+                 await self._save_output(result, output_path, output_format)
+                 result['saved_to'] = output_path
+             return result
+         except Exception as e:
+             raise RenderingError(f"Failed to render page: {str(e)}")
+
+     async def _render_with_playwright(self, url: str, wait_time: int, wait_selector: Optional[str], scroll_to_bottom: bool, screenshot: bool, screenshot_path: Optional[str]) -> Dict[str, Any]:
+         """Render a web page using Playwright with async API."""
+         from playwright.async_api import async_playwright
+         async with async_playwright() as p:
+             browser = await p.chromium.launch()
+             page = await browser.new_page(
+                 user_agent=self.settings.user_agent,
+                 viewport={'width': 1280, 'height': 800}
+             )
+             try:
+                 await page.goto(url)
+                 if wait_selector:
+                     await page.wait_for_selector(wait_selector)
+                 else:
+                     await page.wait_for_load_state('networkidle')
+                 if scroll_to_bottom:
+                     await page.evaluate('window.scrollTo(0, document.body.scrollHeight)')
+                     await page.wait_for_timeout(1000)
+                 screenshot_result = None
+                 if screenshot:
+                     screenshot_path = screenshot_path or os.path.join(self.settings.output_dir, f"screenshot_{int(time.time())}.png")
+                     os.makedirs(os.path.dirname(os.path.abspath(screenshot_path)), exist_ok=True)
+                     await page.screenshot(path=screenshot_path)
+                     screenshot_result = screenshot_path
+                 html = await page.content()
+                 title = await page.title()
+                 result = {
+                     'html': html,
+                     'title': title,
+                     'url': page.url,
+                     'screenshot': screenshot_result
+                 }
+                 return result
+             finally:
+                 await browser.close()
+
+
+     def crawl_scrapy(self, project_path: str, spider_name: str, output_path: str, spider_args: Optional[Dict[str, str]] = None, headers: Optional[Dict[str, str]] = None, output_format: Optional[OutputFormat] = None) -> Dict[str, Any]:
+         """
+         Execute a Scrapy spider in an existing project and output results to a file.
+
+         Args:
+             project_path (str): Path to the Scrapy project.
+             spider_name (str): Name of the spider to run.
+             output_path (str): Path to save the output.
+             spider_args (Optional[Dict[str, str]]): Arguments to pass to the spider.
+             headers (Optional[Dict[str, str]]): Custom headers.
+             output_format (Optional[OutputFormat]): Output format.
+
+         Returns:
+             Dict[str, Any]: Crawl results {'output_path': str, 'execution_time': float, 'file_size': int, 'stdout': str, 'stderr': str}.
+
+         Raises:
+             ExternalToolError: If Scrapy fails.
+             TimeoutError: If the operation times out.
+         """
+         try:
+             start_time = time.time()
+             os.makedirs(os.path.dirname(os.path.abspath(output_path)), exist_ok=True)
+             cmd = [
+                 self.settings.scrapy_command,
+                 'crawl', spider_name,
+                 '-o', output_path,
+                 '-s', f'USER_AGENT={self.settings.user_agent}',
+                 '-s', 'LOG_LEVEL=INFO'
+             ]
+             if spider_args:
+                 for k, v in spider_args.items():
+                     cmd += ['-a', f"{k}={v}"]
+             process = subprocess.run(
+                 cmd,
+                 cwd=project_path,
+                 stdout=subprocess.PIPE,
+                 stderr=subprocess.PIPE,
+                 text=True
+             )
+             if process.returncode != 0:
+                 error_msg = process.stderr.strip()
+                 raise ExternalToolError(f"Scrapy crawl failed: {error_msg}")
+             if not os.path.exists(output_path):
+                 raise ExternalToolError(f"Scrapy crawl did not create output file: {output_path}")
+             file_size = os.path.getsize(output_path)
+             result = {
+                 'output_path': output_path,
+                 'execution_time': time.time() - start_time,
+                 'file_size': file_size,
+                 'stdout': process.stdout.strip(),
+                 'stderr': process.stderr.strip()
+             }
+             return result
+         except subprocess.TimeoutExpired:
+             raise TimeoutError(f"Scrapy crawl timed out")
+         except Exception as e:
+             raise ExternalToolError(f"Error running Scrapy: {str(e)}")
+
+     def parse_html(self, html: str, selector: str, selector_type: str = "css", extract_attr: Optional[str] = None, extract_text: bool = True) -> Dict[str, Any]:
+         """
+         Parse HTML content using BeautifulSoup.
+
+         Args:
+             html (str): HTML content to parse.
+             selector (str): CSS or XPath selector.
+             selector_type (str): Selector type ('css' or 'xpath').
+             extract_attr (Optional[str]): Attribute to extract.
+             extract_text (bool): Whether to extract text content.
+
+         Returns:
+             Dict[str, Any]: Parsed results {'selector': str, 'selector_type': str, 'count': int, 'results': List[str]}.
+
+         Raises:
+             ParsingError: If parsing fails.
+         """
+         try:
+             soup = BeautifulSoup(html, 'html.parser')
+             if selector_type == 'css':
+                 elements = soup.select(selector)
+             else:
+                 from lxml.html import fromstring
+                 from lxml.etree import XPath
+                 root = fromstring(html)
+                 xpath = XPath(selector)
+                 elements = xpath(root)
+             results = []
+             for element in elements:
+                 if extract_attr:
+                     value = element.get(extract_attr) if hasattr(element, 'get') else element.get(extract_attr)
+                     if value is not None:
+                         results.append(value)
+                 elif extract_text:
+                     if hasattr(element, 'text_content') and callable(getattr(element, 'text_content')):
+                         # lxml element
+                         text = element.text_content()
+                     else:
+                         # BeautifulSoup element
+                         text = element.get_text()
+
+                     if text and text.strip():
+                         results.append(text.strip())
+             return {
+                 'selector': selector,
+                 'selector_type': selector_type,
+                 'count': len(results),
+                 'results': results
+             }
+         except Exception as e:
+             raise ParsingError(f"Error parsing HTML: {str(e)}")
+
+     # HTTP method shortcuts
+     get = get_httpx
+     post = get_httpx
+     put = get_httpx
+     delete = get_httpx
+     head = get_httpx
+     options = get_httpx
+     patch = get_httpx
+
+     # Legacy method aliases
+     get_requests = get_httpx
+     get_aiohttp = get_httpx
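
For reviewers who want to sanity-check the API surface added above, here is a minimal usage sketch based only on the signatures and return shapes visible in this hunk. The URL, the config override, and the output path are placeholders, and the import path assumes the module location shown under "Files changed"; none of it is taken from the package's own documentation.

import asyncio

from aiecs.tools.task_tools.scraper_tool import ScraperTool, ContentType, OutputFormat


async def main():
    # Instantiate with a config override; __init__ merges it into ScraperSettings.
    scraper = ScraperTool(config={"user_agent": "ReviewBot/0.1"})  # hypothetical override

    # Fetch a page as HTML via the async httpx path; per the code above, the result is a
    # dict with 'html', 'url', and 'status' keys ('saved_to' is added when output_path is set).
    page = await scraper.get_httpx(
        "https://example.com",  # placeholder URL
        content_type=ContentType.HTML,
        output_format=OutputFormat.JSON,
        output_path="/tmp/scraper_outputs/example.json",  # placeholder path
    )

    # Extract link targets from the fetched HTML using the CSS-selector branch of
    # parse_html; it returns {'selector', 'selector_type', 'count', 'results'}.
    links = scraper.parse_html(page["html"], selector="a", extract_attr="href")
    print(f"{links['count']} links found")


asyncio.run(main())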
aiecs/tools/task_tools/search_api.py
@@ -0,0 +1,7 @@
+ from aiecs.tools import register_tool
+ from aiecs.tools.base_tool import BaseTool
+
+ @register_tool("search_api")
+ class SearchAPITool(BaseTool):
+     def run(self, query):
+         return f"[Search results for '{query}']"