ambivo_agents-1.3.3-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ambivo_agents/__init__.py +89 -0
- ambivo_agents/agents/__init__.py +19 -0
- ambivo_agents/agents/assistant.py +79 -0
- ambivo_agents/agents/code_executor.py +133 -0
- ambivo_agents/agents/knowledge_base.py +595 -0
- ambivo_agents/agents/media_editor.py +777 -0
- ambivo_agents/agents/simple_web_search.py +404 -0
- ambivo_agents/agents/web_scraper.py +682 -0
- ambivo_agents/agents/web_search.py +660 -0
- ambivo_agents/agents/youtube_download.py +553 -0
- ambivo_agents/cli.py +1871 -0
- ambivo_agents/config/__init__.py +4 -0
- ambivo_agents/config/loader.py +301 -0
- ambivo_agents/core/__init__.py +33 -0
- ambivo_agents/core/base.py +880 -0
- ambivo_agents/core/llm.py +333 -0
- ambivo_agents/core/memory.py +640 -0
- ambivo_agents/executors/__init__.py +8 -0
- ambivo_agents/executors/docker_executor.py +108 -0
- ambivo_agents/executors/media_executor.py +237 -0
- ambivo_agents/executors/youtube_executor.py +404 -0
- ambivo_agents/services/__init__.py +6 -0
- ambivo_agents/services/agent_service.py +590 -0
- ambivo_agents/services/factory.py +366 -0
- ambivo_agents-1.3.3.dist-info/METADATA +773 -0
- ambivo_agents-1.3.3.dist-info/RECORD +30 -0
- ambivo_agents-1.3.3.dist-info/WHEEL +5 -0
- ambivo_agents-1.3.3.dist-info/entry_points.txt +3 -0
- ambivo_agents-1.3.3.dist-info/licenses/LICENSE +21 -0
- ambivo_agents-1.3.3.dist-info/top_level.txt +1 -0
@@ -0,0 +1,682 @@
# ambivo_agents/agents/web_scraper.py
"""
Web Scraper Agent with proxy, Docker, and local execution modes.
"""

import asyncio
import json
import re
import time
import random
import uuid
import logging
import ssl
import urllib3
from datetime import datetime
from typing import Dict, Any, List, Optional
from urllib.parse import urlparse, urljoin
from dataclasses import dataclass
from pathlib import Path

from ..core.base import BaseAgent, AgentRole, AgentMessage, MessageType, ExecutionContext, AgentTool
from ..config.loader import load_config, get_config_section

# Conditional imports for different execution modes
try:
    from playwright.async_api import async_playwright
    PLAYWRIGHT_AVAILABLE = True
except ImportError:
    PLAYWRIGHT_AVAILABLE = False

try:
    import requests
    from bs4 import BeautifulSoup
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
    REQUESTS_AVAILABLE = True
except ImportError:
    REQUESTS_AVAILABLE = False

try:
    import docker
    DOCKER_AVAILABLE = True
except ImportError:
    DOCKER_AVAILABLE = False
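The three module-level flags above gate every execution path that follows. A minimal pre-flight sketch (not part of the packaged file) that inspects them, using the import path from the wheel's file listing:

```python
# Pre-flight check of the optional dependencies web_scraper.py probes for.
from ambivo_agents.agents.web_scraper import (
    DOCKER_AVAILABLE,
    PLAYWRIGHT_AVAILABLE,
    REQUESTS_AVAILABLE,
)

print("playwright:", PLAYWRIGHT_AVAILABLE)   # async Playwright importable
print("requests/bs4:", REQUESTS_AVAILABLE)   # requests + BeautifulSoup importable
print("docker sdk:", DOCKER_AVAILABLE)       # docker-py importable; daemon reachability is checked later
```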
@dataclass
class ScrapingTask:
    """Simple scraping task data structure"""
    url: str
    method: str = "playwright"
    extract_links: bool = True
    extract_images: bool = True
    take_screenshot: bool = False
    timeout: int = 45


class SimpleDockerExecutor:
    """Simple Docker executor for scraping tasks"""

    def __init__(self, config: Dict[str, Any] = None):
        self.config = config or {}
        self.docker_image = self.config.get('docker_image', 'sgosain/amb-ubuntu-python-public-pod')
        self.timeout = self.config.get('timeout', 60)

        if DOCKER_AVAILABLE:
            try:
                self.docker_client = docker.from_env()
                self.docker_client.ping()
                self.available = True
            except Exception as e:
                logging.warning(f"Docker initialization failed: {e}")
                self.available = False
        else:
            self.available = False

    def execute_scraping_task(self, task: ScrapingTask) -> Dict[str, Any]:
        """Execute a simple scraping task in Docker"""
        if not self.available:
            return {
                'success': False,
                'error': 'Docker not available',
                'url': task.url
            }

        # For now, return a mock successful result
        # In a full implementation, this would run Playwright in Docker
        return {
            'success': True,
            'url': task.url,
            'title': 'Docker Scraped Page',
            'content': f'Content from {task.url} scraped via Docker',
            'content_length': 100,
            'links': [],
            'images': [],
            'status_code': 200,
            'response_time': 2.0,
            'method': 'docker_playwright',
            'execution_mode': 'docker'
        }
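A short usage sketch (not in the wheel) for the executor as shipped: with no reachable Docker daemon it returns the error dict above, otherwise the mock payload, since the real in-container Playwright run is still a TODO per the comments:

```python
from ambivo_agents.agents.web_scraper import ScrapingTask, SimpleDockerExecutor

# Config keys mirror those read in __init__; the image below is the code's default.
executor = SimpleDockerExecutor({
    "docker_image": "sgosain/amb-ubuntu-python-public-pod",
    "timeout": 60,
})
task = ScrapingTask(url="https://example.com", method="playwright")
result = executor.execute_scraping_task(task)
print(result["success"], result.get("method"), result.get("error"))
```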
class WebScraperAgent(BaseAgent):
    """Unified web scraper agent with proxy, Docker, and local execution modes"""

    def __init__(self, agent_id: str = None, memory_manager=None, llm_service=None, **kwargs):

        if agent_id is None:
            agent_id = f"scraper_{str(uuid.uuid4())[:8]}"

        super().__init__(
            agent_id=agent_id,
            role=AgentRole.RESEARCHER,
            memory_manager=memory_manager,
            llm_service=llm_service,
            name="Web Scraper Agent",
            description="Unified web scraper with proxy, Docker, and local execution modes",
            **kwargs
        )

        self.logger = logging.getLogger(f"WebScraperAgent-{agent_id}")

        # Load configuration from YAML
        try:
            config = load_config()
            self.scraper_config = get_config_section('web_scraping', config)
        except Exception as e:
            raise ValueError(f"web_scraping configuration not found in agent_config.yaml: {e}")

        # Initialize execution mode based on config
        self.execution_mode = self._determine_execution_mode()

        # Initialize executors based on availability and config
        self.docker_executor = None
        self.proxy_config = None

        # Initialize Docker executor if configured
        if self.execution_mode in ["docker", "auto"]:
            try:
                docker_config = {
                    **self.scraper_config,
                    'docker_image': self.scraper_config.get('docker_image'),
                    'timeout': self.scraper_config.get('timeout', 60)
                }
                self.docker_executor = SimpleDockerExecutor(docker_config)
            except Exception as e:
                self.logger.warning(f"Docker executor initialization failed: {e}")

        # Initialize proxy configuration if enabled
        if self.scraper_config.get('proxy_enabled', False):
            proxy_url = self.scraper_config.get('proxy_config', {}).get('http_proxy')
            if proxy_url:
                self.proxy_config = self._parse_proxy_url(proxy_url)
                self._configure_ssl_for_proxy()

        # Add tools
        self._add_scraping_tools()

        self.logger.info(f"WebScraperAgent initialized (Mode: {self.execution_mode})")
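The constructor pulls everything from the `web_scraping` section of `agent_config.yaml` via `load_config()` / `get_config_section()`. A sketch of that section's shape, shown as the dict the loader would return; the key names are the ones read in this file, the values are only illustrative:

```python
# Hypothetical web_scraping section (values are placeholders, keys come from the code).
web_scraping = {
    "proxy_enabled": False,                    # True + an http_proxy value selects "proxy" mode
    "proxy_config": {"http_proxy": ""},        # e.g. "http://user:pass@proxy-host:8001"
    "docker_image": "sgosain/amb-ubuntu-python-public-pod",  # presence selects "docker" mode
    "timeout": 60,                             # seconds; also reused as the Playwright timeout (* 1000 ms)
    "default_headers": {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
    },
    "max_links_per_page": 100,
    "max_images_per_page": 50,
    "rate_limit_seconds": 1.0,                 # delay between URLs in _batch_scrape
}
```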
    def _configure_ssl_for_proxy(self):
        """Configure SSL settings for proxy usage"""
        if REQUESTS_AVAILABLE:
            try:
                ssl_context = ssl.create_default_context()
                ssl_context.check_hostname = False
                ssl_context.verify_mode = ssl.CERT_NONE

                import requests.packages.urllib3.util.ssl_
                requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS = 'ALL'
            except Exception as e:
                self.logger.warning(f"SSL configuration warning: {e}")

        self.logger.info("SSL verification disabled for proxy usage")

    def _determine_execution_mode(self) -> str:
        """Determine execution mode from configuration"""
        # Check if proxy is enabled in config
        if self.scraper_config.get('proxy_enabled', False):
            proxy_url = self.scraper_config.get('proxy_config', {}).get('http_proxy')
            if proxy_url:
                return "proxy"

        # Check if Docker should be used
        if self.scraper_config.get('docker_image'):
            return "docker"

        # Fall back to local execution
        if PLAYWRIGHT_AVAILABLE or REQUESTS_AVAILABLE:
            return "local"

        raise RuntimeError("No scraping execution methods available")

    def _parse_proxy_url(self, proxy_url: str) -> Dict[str, Any]:
        """Parse proxy URL for different usage formats"""
        try:
            parsed = urlparse(proxy_url)
            return {
                'server': f"{parsed.scheme}://{parsed.hostname}:{parsed.port}",
                'username': parsed.username,
                'password': parsed.password,
                'host': parsed.hostname,
                'port': parsed.port,
                'full_url': proxy_url
            }
        except Exception as e:
            self.logger.error(f"Failed to parse proxy URL: {e}")
            return {}
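`_parse_proxy_url` leans entirely on `urllib.parse.urlparse`, so a ScraperAPI-style credential URL decomposes as below. The URL and API key are hypothetical:

```python
from urllib.parse import urlparse

proxy_url = "http://scraperapi:API_KEY@proxy-host.example.com:8001"  # hypothetical
parsed = urlparse(proxy_url)
assert parsed.username == "scraperapi" and parsed.port == 8001

# _parse_proxy_url would return:
# {
#     "server": "http://proxy-host.example.com:8001",
#     "username": "scraperapi",
#     "password": "API_KEY",
#     "host": "proxy-host.example.com",
#     "port": 8001,
#     "full_url": "http://scraperapi:API_KEY@proxy-host.example.com:8001",
# }
```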
    def _add_scraping_tools(self):
        """Add scraping tools"""
        self.add_tool(AgentTool(
            name="scrape_url",
            description="Scrape a single URL",
            function=self._scrape_url,
            parameters_schema={
                "type": "object",
                "properties": {
                    "url": {"type": "string", "description": "URL to scrape"},
                    "method": {"type": "string", "enum": ["auto", "playwright", "requests"], "default": "auto"},
                    "extract_links": {"type": "boolean", "default": True},
                    "extract_images": {"type": "boolean", "default": True},
                    "take_screenshot": {"type": "boolean", "default": False}
                },
                "required": ["url"]
            }
        ))

        self.add_tool(AgentTool(
            name="batch_scrape",
            description="Scrape multiple URLs",
            function=self._batch_scrape,
            parameters_schema={
                "type": "object",
                "properties": {
                    "urls": {"type": "array", "items": {"type": "string"}},
                    "method": {"type": "string", "default": "auto"}
                },
                "required": ["urls"]
            }
        ))

        self.add_tool(AgentTool(
            name="check_accessibility",
            description="Quick check if URL is accessible",
            function=self._check_accessibility,
            parameters_schema={
                "type": "object",
                "properties": {
                    "url": {"type": "string", "description": "URL to check"}
                },
                "required": ["url"]
            }
        ))
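Each `AgentTool` above ties a JSON-schema parameter block to one of the coroutines defined below, so a tool invocation reduces to a keyword-argument call. A sketch of argument payloads that satisfy the three schemas; how the surrounding agent framework dispatches them is outside this module:

```python
# Example payloads matching the parameters_schema blocks above.
scrape_url_args = {
    "url": "https://example.com",
    "method": "auto",            # "auto", "playwright", or "requests"
    "extract_links": True,
    "extract_images": True,
    "take_screenshot": False,
}
batch_scrape_args = {"urls": ["https://example.com", "https://example.org"], "method": "auto"}
check_accessibility_args = {"url": "https://example.com"}
```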
    async def _scrape_url(self, url: str, method: str = "auto", **kwargs) -> Dict[str, Any]:
        """Unified URL scraping method"""
        try:
            if self.execution_mode == "docker" and self.docker_executor and self.docker_executor.available:
                return await self._scrape_with_docker(url, method, **kwargs)
            elif self.execution_mode == "proxy" and self.proxy_config:
                return await self._scrape_with_proxy(url, method, **kwargs)
            else:
                return await self._scrape_locally(url, method, **kwargs)

        except Exception as e:
            self.logger.error(f"Scraping error for {url}: {e}")
            return {
                "success": False,
                "error": str(e),
                "url": url,
                "method": method,
                "execution_mode": self.execution_mode
            }

    async def _scrape_with_docker(self, url: str, method: str, **kwargs) -> Dict[str, Any]:
        """Scrape using Docker executor"""
        task = ScrapingTask(
            url=url,
            method=method if method != "auto" else "playwright",
            extract_links=kwargs.get('extract_links', True),
            extract_images=kwargs.get('extract_images', True),
            take_screenshot=kwargs.get('take_screenshot', False),
            timeout=kwargs.get('timeout', self.scraper_config.get('timeout', 60))
        )

        result = self.docker_executor.execute_scraping_task(task)
        result['execution_mode'] = 'docker'
        return result

    async def _scrape_with_proxy(self, url: str, method: str, **kwargs) -> Dict[str, Any]:
        """Scrape using proxy (ScraperAPI style) with SSL verification disabled"""
        if method == "auto":
            method = "playwright" if PLAYWRIGHT_AVAILABLE else "requests"

        if method == "playwright" and PLAYWRIGHT_AVAILABLE:
            return await self._scrape_proxy_playwright(url, **kwargs)
        elif REQUESTS_AVAILABLE:
            return self._scrape_proxy_requests(url, **kwargs)
        else:
            raise RuntimeError("No proxy scraping methods available")

    async def _scrape_proxy_playwright(self, url: str, **kwargs) -> Dict[str, Any]:
        """Scrape using Playwright with proxy and SSL verification disabled"""
        async with async_playwright() as p:
            browser = None
            try:
                browser = await p.chromium.launch(
                    headless=True,
                    proxy={
                        "server": self.proxy_config['server'],
                        "username": self.proxy_config['username'],
                        "password": self.proxy_config['password']
                    },
                    args=[
                        '--no-sandbox',
                        '--disable-dev-shm-usage',
                        '--disable-web-security',
                        '--disable-features=VizDisplayCompositor',
                        '--ignore-certificate-errors',
                        '--ignore-ssl-errors',
                        '--ignore-certificate-errors-spki-list',
                        '--allow-running-insecure-content'
                    ]
                )

                context = await browser.new_context(
                    viewport={"width": 1920, "height": 1080},
                    user_agent=self.scraper_config.get('default_headers', {}).get(
                        'User-Agent',
                        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'),
                    ignore_https_errors=True
                )

                page = await context.new_page()
                start_time = time.time()

                timeout_ms = self.scraper_config.get('timeout', 60) * 1000
                response = await page.goto(url, wait_until="domcontentloaded", timeout=timeout_ms)
                await page.wait_for_timeout(3000)

                response_time = time.time() - start_time

                # Extract content
                title = await page.title()
                content = await page.inner_text("body")

                # Extract links
                links = []
                if kwargs.get('extract_links', True):
                    link_elements = await page.query_selector_all("a[href]")
                    max_links = self.scraper_config.get('max_links_per_page', 100)
                    for link in link_elements[:max_links]:
                        href = await link.get_attribute("href")
                        text = await link.inner_text()
                        if href and text:
                            links.append({
                                "url": urljoin(url, href),
                                "text": text.strip()[:100]
                            })

                # Extract images
                images = []
                if kwargs.get('extract_images', True):
                    img_elements = await page.query_selector_all("img[src]")
                    max_images = self.scraper_config.get('max_images_per_page', 50)
                    for img in img_elements[:max_images]:
                        src = await img.get_attribute("src")
                        alt = await img.get_attribute("alt") or ""
                        if src:
                            images.append({
                                "url": urljoin(url, src),
                                "alt": alt
                            })

                await browser.close()

                return {
                    "success": True,
                    "url": url,
                    "title": title,
                    "content": content[:5000],
                    "content_length": len(content),
                    "links": links,
                    "images": images,
                    "status_code": response.status if response else None,
                    "response_time": response_time,
                    "method": "proxy_playwright",
                    "execution_mode": "proxy"
                }

            except Exception as e:
                if browser:
                    await browser.close()
                raise e

    def _scrape_proxy_requests(self, url: str, **kwargs) -> Dict[str, Any]:
        """Scrape using requests with proxy and SSL verification disabled"""
        proxies = {
            'http': self.proxy_config['full_url'],
            'https': self.proxy_config['full_url']
        }

        headers = self.scraper_config.get('default_headers', {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        })

        start_time = time.time()

        response = requests.get(
            url,
            headers=headers,
            proxies=proxies,
            timeout=self.scraper_config.get('timeout', 60),
            verify=False,
            allow_redirects=True
        )
        response_time = time.time() - start_time

        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract content
        title = soup.find('title')
        title = title.get_text().strip() if title else "No title"

        for script in soup(["script", "style"]):
            script.decompose()

        content = soup.get_text()
        content = ' '.join(content.split())

        # Extract links and images based on config
        links = []
        images = []

        if kwargs.get('extract_links', True):
            max_links = self.scraper_config.get('max_links_per_page', 100)
            for link in soup.find_all('a', href=True)[:max_links]:
                links.append({
                    "url": urljoin(url, link['href']),
                    "text": link.get_text().strip()[:100]
                })

        if kwargs.get('extract_images', True):
            max_images = self.scraper_config.get('max_images_per_page', 50)
            for img in soup.find_all('img', src=True)[:max_images]:
                images.append({
                    "url": urljoin(url, img['src']),
                    "alt": img.get('alt', '')
                })

        return {
            "success": True,
            "url": url,
            "title": title,
            "content": content[:5000],
            "content_length": len(content),
            "links": links,
            "images": images,
            "status_code": response.status_code,
            "response_time": response_time,
            "method": "proxy_requests",
            "execution_mode": "proxy"
        }

    async def _scrape_locally(self, url: str, method: str, **kwargs) -> Dict[str, Any]:
        """Scrape using local methods (no proxy, no Docker)"""
        if method == "auto":
            method = "playwright" if PLAYWRIGHT_AVAILABLE else "requests"

        if method == "playwright" and PLAYWRIGHT_AVAILABLE:
            return await self._scrape_local_playwright(url, **kwargs)
        elif REQUESTS_AVAILABLE:
            return self._scrape_local_requests(url, **kwargs)
        else:
            raise RuntimeError("No local scraping methods available")

    async def _scrape_local_playwright(self, url: str, **kwargs) -> Dict[str, Any]:
        """Local Playwright scraping"""
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            context = await browser.new_context(
                user_agent=self.scraper_config.get('default_headers', {}).get(
                    'User-Agent',
                    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36')
            )
            page = await context.new_page()

            start_time = time.time()
            timeout_ms = self.scraper_config.get('timeout', 60) * 1000
            response = await page.goto(url, timeout=timeout_ms)
            response_time = time.time() - start_time

            title = await page.title()
            content = await page.inner_text("body")

            await browser.close()

            return {
                "success": True,
                "url": url,
                "title": title,
                "content": content[:5000],
                "content_length": len(content),
                "status_code": response.status if response else None,
                "response_time": response_time,
                "method": "local_playwright",
                "execution_mode": "local"
            }

    def _scrape_local_requests(self, url: str, **kwargs) -> Dict[str, Any]:
        """Local requests scraping"""
        headers = self.scraper_config.get('default_headers', {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })

        start_time = time.time()
        response = requests.get(url, headers=headers, timeout=self.scraper_config.get('timeout', 60))
        response_time = time.time() - start_time

        soup = BeautifulSoup(response.content, 'html.parser')
        title = soup.find('title')
        title = title.get_text().strip() if title else "No title"

        for script in soup(["script", "style"]):
            script.decompose()

        content = soup.get_text()
        content = ' '.join(content.split())

        return {
            "success": True,
            "url": url,
            "title": title,
            "content": content[:5000],
            "content_length": len(content),
            "status_code": response.status_code,
            "response_time": response_time,
            "method": "local_requests",
            "execution_mode": "local"
        }

    async def _batch_scrape(self, urls: List[str], method: str = "auto") -> Dict[str, Any]:
        """Batch scraping with rate limiting from config"""
        results = []
        rate_limit = self.scraper_config.get('rate_limit_seconds', 1.0)

        for i, url in enumerate(urls):
            try:
                result = await self._scrape_url(url, method)
                results.append(result)

                if i < len(urls) - 1:
                    await asyncio.sleep(rate_limit)

            except Exception as e:
                results.append({
                    "success": False,
                    "url": url,
                    "error": str(e)
                })

        successful = sum(1 for r in results if r.get('success', False))

        return {
            "success": True,
            "total_urls": len(urls),
            "successful": successful,
            "failed": len(urls) - successful,
            "results": results,
            "execution_mode": self.execution_mode
        }
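A minimal async driver (not in the wheel) for the batch path, calling the private coroutine directly rather than going through the tool layer. It assumes an `agent_config.yaml` with a `web_scraping` section is discoverable by `load_config()`; with the default `rate_limit_seconds` of 1.0, scraping N URLs adds roughly N-1 seconds of deliberate delay:

```python
import asyncio

from ambivo_agents.agents.web_scraper import WebScraperAgent

async def main():
    # Raises ValueError if the web_scraping config section cannot be loaded.
    agent = WebScraperAgent()
    summary = await agent._batch_scrape(
        ["https://example.com", "https://example.org"], method="auto"
    )
    print(f"{summary['successful']}/{summary['total_urls']} succeeded "
          f"in {summary['execution_mode']} mode")

asyncio.run(main())
```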
    async def _check_accessibility(self, url: str) -> Dict[str, Any]:
        """Check URL accessibility"""
        try:
            result = await self._scrape_url(url, extract_links=False, extract_images=False)
            return {
                "success": True,
                "url": url,
                "accessible": result.get('success', False),
                "status_code": result.get('status_code'),
                "response_time": result.get('response_time', 0),
                "error": result.get('error'),
                "timestamp": datetime.now().isoformat()
            }
        except Exception as e:
            return {
                "success": False,
                "error": str(e),
                "url": url,
                "timestamp": datetime.now().isoformat()
            }

    def _extract_urls_from_text(self, text: str) -> List[str]:
        """Extract URLs from text"""
        url_pattern = r'https?://[^\s<>"{}|\\^`\[\]]+'
        return re.findall(url_pattern, text)
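`process_message` below routes purely on how many URLs this regex finds in the incoming text: none yields the capability summary, exactly one triggers a single scrape, and several go to `_batch_scrape`. A quick check of the pattern on a sample message:

```python
import re

url_pattern = r'https?://[^\s<>"{}|\\^`\[\]]+'
text = "batch scrape https://site1.com https://site2.com/page please"
print(re.findall(url_pattern, text))
# ['https://site1.com', 'https://site2.com/page']
```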
    async def process_message(self, message: AgentMessage, context: ExecutionContext) -> AgentMessage:
        """Process scraping requests"""
        self.memory.store_message(message)

        try:
            content = message.content
            urls = self._extract_urls_from_text(content)

            if not urls:
                response_content = f"""🕷️ **Web Scraper Agent** - Configuration-Driven

**🔧 Current Mode:** {self.execution_mode.upper()}
**📡 Proxy Enabled:** {'✅' if self.proxy_config else '❌'}
**🐳 Docker Available:** {'✅' if self.docker_executor and self.docker_executor.available else '❌'}
**🔒 SSL Verification:** {'❌ Disabled (Proxy Mode)' if self.proxy_config else '✅ Enabled'}

**🚀 Capabilities:**
- Single URL scraping with multiple methods
- Batch URL processing with rate limiting
- Proxy support (ScraperAPI compatible)
- Docker-based secure execution
- Local fallback methods

**📝 Usage Examples:**
- `scrape https://example.com`
- `batch scrape https://site1.com https://site2.com`
- `check if https://example.com is accessible`

Provide URLs to start scraping! 🎯"""

            elif len(urls) == 1:
                # Single URL
                result = await self._scrape_url(urls[0])

                if result['success']:
                    response_content = f"""✅ **Scraping Completed**

🌐 **URL:** {result['url']}
🔧 **Method:** {result.get('method', 'unknown')}
🏃 **Mode:** {result['execution_mode']}
📊 **Status:** {result.get('status_code', 'N/A')}
📄 **Content:** {result['content_length']:,} characters
⏱️ **Time:** {result['response_time']:.2f}s

**Title:** {result.get('title', 'No title')}

**Content Preview:**
{result.get('content', '')[:300]}{'...' if len(result.get('content', '')) > 300 else ''}"""
                else:
                    response_content = f"❌ **Scraping failed:** {result['error']}"

            else:
                # Multiple URLs
                result = await self._batch_scrape(urls)

                response_content = f"""🕷️ **Batch Scraping Results**

📊 **Summary:**
- **Total URLs:** {result['total_urls']}
- **Successful:** {result['successful']}
- **Failed:** {result['failed']}
- **Mode:** {result['execution_mode']}

✅ **Status:** Completed batch operation"""

            response = self.create_response(
                content=response_content,
                recipient_id=message.sender_id,
                session_id=message.session_id,
                conversation_id=message.conversation_id
            )

            self.memory.store_message(response)
            return response

        except Exception as e:
            self.logger.error(f"Message processing error: {e}")
            error_response = self.create_response(
                content=f"❌ **Error:** {str(e)}",
                recipient_id=message.sender_id,
                message_type=MessageType.ERROR,
                session_id=message.session_id,
                conversation_id=message.conversation_id
            )
            return error_response