ambivo-agents 1.0.1 (ambivo_agents-1.0.1-py3-none-any.whl)
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ambivo_agents/__init__.py +91 -0
- ambivo_agents/agents/__init__.py +21 -0
- ambivo_agents/agents/assistant.py +203 -0
- ambivo_agents/agents/code_executor.py +133 -0
- ambivo_agents/agents/code_executor2.py +222 -0
- ambivo_agents/agents/knowledge_base.py +935 -0
- ambivo_agents/agents/media_editor.py +992 -0
- ambivo_agents/agents/moderator.py +617 -0
- ambivo_agents/agents/simple_web_search.py +404 -0
- ambivo_agents/agents/web_scraper.py +1027 -0
- ambivo_agents/agents/web_search.py +933 -0
- ambivo_agents/agents/youtube_download.py +784 -0
- ambivo_agents/cli.py +699 -0
- ambivo_agents/config/__init__.py +4 -0
- ambivo_agents/config/loader.py +301 -0
- ambivo_agents/core/__init__.py +33 -0
- ambivo_agents/core/base.py +1024 -0
- ambivo_agents/core/history.py +606 -0
- ambivo_agents/core/llm.py +333 -0
- ambivo_agents/core/memory.py +640 -0
- ambivo_agents/executors/__init__.py +8 -0
- ambivo_agents/executors/docker_executor.py +108 -0
- ambivo_agents/executors/media_executor.py +237 -0
- ambivo_agents/executors/youtube_executor.py +404 -0
- ambivo_agents/services/__init__.py +6 -0
- ambivo_agents/services/agent_service.py +605 -0
- ambivo_agents/services/factory.py +370 -0
- ambivo_agents-1.0.1.dist-info/METADATA +1090 -0
- ambivo_agents-1.0.1.dist-info/RECORD +33 -0
- ambivo_agents-1.0.1.dist-info/WHEEL +5 -0
- ambivo_agents-1.0.1.dist-info/entry_points.txt +3 -0
- ambivo_agents-1.0.1.dist-info/licenses/LICENSE +21 -0
- ambivo_agents-1.0.1.dist-info/top_level.txt +1 -0
@@ -0,0 +1,1027 @@
# ambivo_agents/agents/web_scraper.py
"""
Web Scraper Agent with proxy, Docker, and local execution modes.
Updated with LLM-aware intent detection and conversation history integration.
"""

import asyncio
import json
import re
import time
import random
import uuid
import logging
import ssl
import urllib3
from datetime import datetime
from typing import Dict, Any, List, Optional
from urllib.parse import urlparse, urljoin
from dataclasses import dataclass
from pathlib import Path

from ..core.base import BaseAgent, AgentRole, AgentMessage, MessageType, ExecutionContext, AgentTool
from ..config.loader import load_config, get_config_section
from ..core.history import WebAgentHistoryMixin, ContextType

# Conditional imports for different execution modes
try:
    from playwright.async_api import async_playwright

    PLAYWRIGHT_AVAILABLE = True
except ImportError:
    PLAYWRIGHT_AVAILABLE = False

try:
    import requests
    from bs4 import BeautifulSoup

    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
    REQUESTS_AVAILABLE = True
except ImportError:
    REQUESTS_AVAILABLE = False

try:
    import docker

    DOCKER_AVAILABLE = True
except ImportError:
    DOCKER_AVAILABLE = False


@dataclass
class ScrapingTask:
    """Simple scraping task data structure"""
    url: str
    method: str = "playwright"
    extract_links: bool = True
    extract_images: bool = True
    take_screenshot: bool = False
    timeout: int = 45


class SimpleDockerExecutor:
    """Simple Docker executor for scraping tasks"""

    def __init__(self, config: Dict[str, Any] = None):
        self.config = config or {}
        self.docker_image = self.config.get('docker_image', 'sgosain/amb-ubuntu-python-public-pod')
        self.timeout = self.config.get('timeout', 60)

        if DOCKER_AVAILABLE:
            try:
                self.docker_client = docker.from_env()
                self.docker_client.ping()
                self.available = True
            except Exception as e:
                logging.warning(f"Docker initialization failed: {e}")
                self.available = False
        else:
            self.available = False

    def execute_scraping_task(self, task: ScrapingTask) -> Dict[str, Any]:
        """Execute a scraping task in Docker"""
        if not self.available:
            return {
                'success': False,
                'error': 'Docker not available',
                'url': task.url
            }

        try:
            # Create scraping script for Docker
            script_content = f"""
import asyncio
from playwright.async_api import async_playwright
import json

async def scrape_url():
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()

        try:
            response = await page.goto('{task.url}', timeout={task.timeout * 1000})
            title = await page.title()
            content = await page.inner_text('body')

            # Extract links if requested
            links = []
            if {task.extract_links}:
                link_elements = await page.query_selector_all('a[href]')
                for link in link_elements[:50]:  # Limit to 50 links
                    href = await link.get_attribute('href')
                    text = await link.inner_text()
                    if href and text:
                        links.append({{'url': href, 'text': text[:100]}})

            # Extract images if requested
            images = []
            if {task.extract_images}:
                img_elements = await page.query_selector_all('img[src]')
                for img in img_elements[:25]:  # Limit to 25 images
                    src = await img.get_attribute('src')
                    alt = await img.get_attribute('alt') or ''
                    if src:
                        images.append({{'url': src, 'alt': alt}})

            result = {{
                'success': True,
                'url': '{task.url}',
                'title': title,
                'content': content[:5000],  # Limit content
                'content_length': len(content),
                'links': links,
                'images': images,
                'status_code': response.status if response else None,
                'method': 'docker_playwright',
                'execution_mode': 'docker'
            }}

            print(json.dumps(result))

        except Exception as e:
            error_result = {{
                'success': False,
                'error': str(e),
                'url': '{task.url}',
                'execution_mode': 'docker'
            }}
            print(json.dumps(error_result))

        finally:
            await browser.close()

asyncio.run(scrape_url())
"""

            # Execute in Docker container
            container = self.docker_client.containers.run(
                image=self.docker_image,
                command=['python', '-c', script_content],
                remove=True,
                mem_limit='512m',
                network_disabled=False,  # Need network for scraping
                stdout=True,
                stderr=True,
                timeout=self.timeout
            )

            # Parse result
            output = container.decode('utf-8') if isinstance(container, bytes) else str(container)
            return json.loads(output.strip().split('\n')[-1])

        except Exception as e:
            return {
                'success': False,
                'error': str(e),
                'url': task.url,
                'execution_mode': 'docker'
            }


class WebScraperAgent(BaseAgent, WebAgentHistoryMixin):
    """Unified web scraper agent with proxy, Docker, and local execution modes"""

    def __init__(self, agent_id: str = None, memory_manager=None, llm_service=None, **kwargs):

        if agent_id is None:
            agent_id = f"scraper_{str(uuid.uuid4())[:8]}"

        super().__init__(
            agent_id=agent_id,
            role=AgentRole.RESEARCHER,
            memory_manager=memory_manager,
            llm_service=llm_service,
            name="Web Scraper Agent",
            description="Unified web scraper with proxy, Docker, and local execution modes",
            **kwargs
        )

        # Initialize history mixin
        self.setup_history_mixin()

        self.logger = logging.getLogger(f"WebScraperAgent-{agent_id}")

        # Load configuration from YAML
        try:
            config = load_config()
            self.scraper_config = get_config_section('web_scraping', config)
        except Exception as e:
            raise ValueError(f"web_scraping configuration not found in agent_config.yaml: {e}")

        # Initialize execution mode based on config
        self.execution_mode = self._determine_execution_mode()

        # Initialize executors based on availability and config
        self.docker_executor = None
        self.proxy_config = None

        # Initialize Docker executor if configured
        if self.execution_mode in ["docker", "auto"]:
            try:
                docker_config = {
                    **self.scraper_config,
                    'docker_image': self.scraper_config.get('docker_image'),
                    'timeout': self.scraper_config.get('timeout', 60)
                }
                self.docker_executor = SimpleDockerExecutor(docker_config)
            except Exception as e:
                self.logger.warning(f"Docker executor initialization failed: {e}")

        # Initialize proxy configuration if enabled
        if self.scraper_config.get('proxy_enabled', False):
            proxy_url = self.scraper_config.get('proxy_config', {}).get('http_proxy')
            if proxy_url:
                self.proxy_config = self._parse_proxy_url(proxy_url)
                self._configure_ssl_for_proxy()

        # Add tools
        self._add_scraping_tools()

        self.logger.info(f"WebScraperAgent initialized (Mode: {self.execution_mode})")

    async def _llm_analyze_scraping_intent(self, user_message: str, conversation_context: str = "") -> Dict[str, Any]:
        """Use LLM to analyze web scraping intent"""
        if not self.llm_service:
            return self._keyword_based_scraping_analysis(user_message)

        prompt = f"""
Analyze this user message in the context of web scraping and extract:
1. Primary intent (scrape_single, scrape_batch, check_accessibility, help_request)
2. URLs to scrape
3. Extraction preferences (links, images, content)
4. Context references (referring to previous scraping operations)
5. Technical specifications (method, timeout, etc.)

Conversation Context:
{conversation_context}

Current User Message: {user_message}

Respond in JSON format:
{{
    "primary_intent": "scrape_single|scrape_batch|check_accessibility|help_request",
    "urls": ["http://example.com"],
    "extraction_preferences": {{
        "extract_links": true,
        "extract_images": true,
        "take_screenshot": false
    }},
    "uses_context_reference": true/false,
    "context_type": "previous_url|previous_operation",
    "technical_specs": {{
        "method": "playwright|requests|auto",
        "timeout": 60
    }},
    "confidence": 0.0-1.0
}}
"""

        try:
            response = await self.llm_service.generate_response(prompt)
            import re
            json_match = re.search(r'\{.*\}', response, re.DOTALL)
            if json_match:
                return json.loads(json_match.group())
            else:
                return self._extract_scraping_intent_from_llm_response(response, user_message)
        except Exception as e:
            return self._keyword_based_scraping_analysis(user_message)

    def _keyword_based_scraping_analysis(self, user_message: str) -> Dict[str, Any]:
        """Fallback keyword-based scraping intent analysis"""
        content_lower = user_message.lower()

        # Determine intent
        if any(word in content_lower for word in ['batch', 'multiple', 'several']):
            intent = 'scrape_batch'
        elif any(word in content_lower for word in ['check', 'test', 'accessible']):
            intent = 'check_accessibility'
        elif any(word in content_lower for word in ['scrape', 'extract', 'crawl']):
            intent = 'scrape_single'
        else:
            intent = 'help_request'

        # Extract URLs
        urls = self.extract_context_from_text(user_message, ContextType.URL)

        return {
            "primary_intent": intent,
            "urls": urls,
            "extraction_preferences": {
                "extract_links": True,
                "extract_images": True,
                "take_screenshot": False
            },
            "uses_context_reference": any(word in content_lower for word in ['this', 'that', 'it']),
            "context_type": "previous_url",
            "technical_specs": {
                "method": "auto",
                "timeout": 60
            },
            "confidence": 0.7
        }

    async def process_message(self, message: AgentMessage, context: ExecutionContext = None) -> AgentMessage:
        """Process message with LLM-based scraping intent detection and history context"""
        self.memory.store_message(message)

        try:
            user_message = message.content

            # Update conversation state
            self.update_conversation_state(user_message)

            # Get conversation context for LLM analysis
            conversation_context = self._get_scraping_conversation_context_summary()

            # Use LLM to analyze intent
            intent_analysis = await self._llm_analyze_scraping_intent(user_message, conversation_context)

            # Route request based on LLM analysis
            response_content = await self._route_scraping_with_llm_analysis(intent_analysis, user_message, context)

            response = self.create_response(
                content=response_content,
                recipient_id=message.sender_id,
                session_id=message.session_id,
                conversation_id=message.conversation_id
            )

            self.memory.store_message(response)
            return response

        except Exception as e:
            error_response = self.create_response(
                content=f"Web Scraper Agent error: {str(e)}",
                recipient_id=message.sender_id,
                message_type=MessageType.ERROR,
                session_id=message.session_id,
                conversation_id=message.conversation_id
            )
            return error_response

    def _get_scraping_conversation_context_summary(self) -> str:
        """Get scraping conversation context summary"""
        try:
            recent_history = self.get_conversation_history_with_context(
                limit=3,
                context_types=[ContextType.URL]
            )

            context_summary = []
            for msg in recent_history:
                if msg.get('message_type') == 'user_input':
                    extracted_context = msg.get('extracted_context', {})
                    urls = extracted_context.get('url', [])

                    if urls:
                        context_summary.append(f"Previous URL: {urls[0]}")

            return "\n".join(context_summary) if context_summary else "No previous scraping context"
        except:
            return "No previous scraping context"

    async def _route_scraping_with_llm_analysis(self, intent_analysis: Dict[str, Any], user_message: str,
                                                context: ExecutionContext) -> str:
        """Route scraping request based on LLM intent analysis"""

        primary_intent = intent_analysis.get("primary_intent", "help_request")
        urls = intent_analysis.get("urls", [])
        extraction_prefs = intent_analysis.get("extraction_preferences", {})
        uses_context = intent_analysis.get("uses_context_reference", False)

        # Resolve context references if needed
        if uses_context and not urls:
            recent_url = self.get_recent_url()
            if recent_url:
                urls = [recent_url]

        # Route based on intent
        if primary_intent == "help_request":
            return await self._handle_scraping_help_request(user_message)
        elif primary_intent == "scrape_single":
            return await self._handle_single_scrape(urls, extraction_prefs, user_message)
        elif primary_intent == "scrape_batch":
            return await self._handle_batch_scrape(urls, extraction_prefs, user_message)
        elif primary_intent == "check_accessibility":
            return await self._handle_accessibility_check(urls, user_message)
        else:
            return await self._handle_scraping_help_request(user_message)

    async def _handle_single_scrape(self, urls: List[str], extraction_prefs: Dict[str, Any], user_message: str) -> str:
        """Handle single URL scraping"""
        if not urls:
            recent_url = self.get_recent_url()
            if recent_url:
                return f"I can scrape web pages. Did you mean to scrape **{recent_url}**? Please confirm."
            else:
                return "I can scrape web pages. Please provide a URL to scrape.\n\n" \
                       "Example: 'scrape https://example.com'"

        url = urls[0]

        try:
            result = await self._scrape_url(
                url=url,
                extract_links=extraction_prefs.get("extract_links", True),
                extract_images=extraction_prefs.get("extract_images", True),
                take_screenshot=extraction_prefs.get("take_screenshot", False)
            )

            if result['success']:
                return f"""✅ **Web Scraping Completed**

🌐 **URL:** {result['url']}
🔧 **Method:** {result.get('method', 'unknown')}
🏃 **Mode:** {result['execution_mode']}
📊 **Status:** {result.get('status_code', 'N/A')}
📄 **Content:** {result['content_length']:,} characters
⏱️ **Time:** {result['response_time']:.2f}s

**Title:** {result.get('title', 'No title')}

**Content Preview:**
{result.get('content', '')[:300]}{'...' if len(result.get('content', '')) > 300 else ''}

**Links Found:** {len(result.get('links', []))}
**Images Found:** {len(result.get('images', []))}"""
            else:
                return f"❌ **Scraping failed:** {result['error']}"

        except Exception as e:
            return f"❌ **Error during scraping:** {str(e)}"

    async def _handle_batch_scrape(self, urls: List[str], extraction_prefs: Dict[str, Any], user_message: str) -> str:
        """Handle batch URL scraping"""
        if not urls:
            return "I can scrape multiple web pages. Please provide URLs to scrape.\n\n" \
                   "Example: 'scrape https://example1.com and https://example2.com'"

        try:
            result = await self._batch_scrape(
                urls=urls,
                method="auto"
            )

            if result['success']:
                successful = result['successful']
                failed = result['failed']
                total = result['total_urls']

                response = f"""📦 **Batch Web Scraping Completed**

📊 **Summary:**
- **Total URLs:** {total}
- **Successful:** {successful}
- **Failed:** {failed}
- **Mode:** {result['execution_mode']}

"""

                if successful > 0:
                    response += "✅ **Successfully Scraped:**\n"
                    for i, scrape_result in enumerate(result['results'], 1):
                        if scrape_result.get('success', False):
                            response += f"{i}. {scrape_result.get('url', 'Unknown')}\n"

                if failed > 0:
                    response += f"\n❌ **Failed Scrapes:** {failed}\n"
                    for i, scrape_result in enumerate(result['results'], 1):
                        if not scrape_result.get('success', False):
                            response += f"{i}. {scrape_result.get('url', 'Unknown')}: {scrape_result.get('error', 'Unknown error')}\n"

                response += f"\n🎉 Batch scraping completed with {successful}/{total} successful scrapes!"
                return response
            else:
                return f"❌ **Batch scraping failed:** {result['error']}"

        except Exception as e:
            return f"❌ **Error during batch scraping:** {str(e)}"

    async def _handle_accessibility_check(self, urls: List[str], user_message: str) -> str:
        """Handle accessibility check"""
        if not urls:
            recent_url = self.get_recent_url()
            if recent_url:
                return f"I can check if websites are accessible. Did you mean to check **{recent_url}**?"
            else:
                return "I can check if websites are accessible. Please provide a URL to check."

        url = urls[0]

        try:
            result = await self._check_accessibility(url)

            if result['success']:
                status = "✅ Accessible" if result.get('accessible', False) else "❌ Not Accessible"
                return f"""🔍 **Accessibility Check Results**

🌐 **URL:** {result['url']}
🚦 **Status:** {status}
📊 **HTTP Status:** {result.get('status_code', 'Unknown')}
⏱️ **Response Time:** {result.get('response_time', 0):.2f}s
📅 **Checked:** {result.get('timestamp', 'Unknown')}

{'The website is accessible and responding normally.' if result.get('accessible', False) else 'The website is not accessible or not responding.'}"""
            else:
                return f"❌ **Accessibility check failed:** {result['error']}"

        except Exception as e:
            return f"❌ **Error during accessibility check:** {str(e)}"

    async def _handle_scraping_help_request(self, user_message: str) -> str:
        """Handle scraping help requests with conversation context"""
        state = self.get_conversation_state()

        response = ("I'm your Web Scraper Agent! I can help you with:\n\n"
                    "🕷️ **Web Scraping**\n"
                    "- Extract content from web pages\n"
                    "- Scrape multiple URLs at once\n"
                    "- Extract links and images\n"
                    "- Take screenshots\n\n"
                    "🔧 **Multiple Execution Modes**\n"
                    "- Proxy support (ScraperAPI compatible)\n"
                    "- Docker-based secure execution\n"
                    "- Local fallback methods\n\n"
                    "🧠 **Smart Context Features**\n"
                    "- Remembers URLs from previous messages\n"
                    "- Understands 'that website' and 'this page'\n"
                    "- Maintains conversation state\n\n")

        # Add current context information
        if state.current_resource:
            response += f"🎯 **Current URL:** {state.current_resource}\n"

        response += f"\n🔧 **Current Mode:** {self.execution_mode.upper()}\n"
        response += f"📡 **Proxy Enabled:** {'✅' if self.proxy_config else '❌'}\n"
        response += f"🐳 **Docker Available:** {'✅' if self.docker_executor and self.docker_executor.available else '❌'}\n"

        response += "\n💡 **Examples:**\n"
        response += "• 'scrape https://example.com'\n"
        response += "• 'batch scrape https://site1.com https://site2.com'\n"
        response += "• 'check if https://example.com is accessible'\n"
        response += "\nI understand context from our conversation! 🚀"

        return response

    def _extract_scraping_intent_from_llm_response(self, llm_response: str, user_message: str) -> Dict[str, Any]:
        """Extract scraping intent from non-JSON LLM response"""
        content_lower = llm_response.lower()

        if 'batch' in content_lower or 'multiple' in content_lower:
            intent = 'scrape_batch'
        elif 'scrape' in content_lower:
            intent = 'scrape_single'
        elif 'check' in content_lower or 'accessible' in content_lower:
            intent = 'check_accessibility'
        else:
            intent = 'help_request'

        return {
            "primary_intent": intent,
            "urls": [],
            "extraction_preferences": {"extract_links": True, "extract_images": True},
            "uses_context_reference": False,
            "context_type": "none",
            "technical_specs": {"method": "auto"},
            "confidence": 0.6
        }

    def _configure_ssl_for_proxy(self):
        """Configure SSL settings for proxy usage"""
        if REQUESTS_AVAILABLE:
            try:
                ssl_context = ssl.create_default_context()
                ssl_context.check_hostname = False
                ssl_context.verify_mode = ssl.CERT_NONE

                import requests.packages.urllib3.util.ssl_
                requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS = 'ALL'
            except Exception as e:
                self.logger.warning(f"SSL configuration warning: {e}")

        self.logger.info("SSL verification disabled for proxy usage")

    def _determine_execution_mode(self) -> str:
        """Determine execution mode from configuration"""
        # Check if proxy is enabled in config
        if self.scraper_config.get('proxy_enabled', False):
            proxy_url = self.scraper_config.get('proxy_config', {}).get('http_proxy')
            if proxy_url:
                return "proxy"

        # Check if Docker should be used
        if self.scraper_config.get('docker_image'):
            return "docker"

        # Fall back to local execution
        if PLAYWRIGHT_AVAILABLE or REQUESTS_AVAILABLE:
            return "local"

        raise RuntimeError("No scraping execution methods available")

    def _parse_proxy_url(self, proxy_url: str) -> Dict[str, Any]:
        """Parse proxy URL for different usage formats"""
        try:
            parsed = urlparse(proxy_url)
            return {
                'server': f"{parsed.scheme}://{parsed.hostname}:{parsed.port}",
                'username': parsed.username,
                'password': parsed.password,
                'host': parsed.hostname,
                'port': parsed.port,
                'full_url': proxy_url
            }
        except Exception as e:
            self.logger.error(f"Failed to parse proxy URL: {e}")
            return {}

    def _add_scraping_tools(self):
        """Add scraping tools"""
        self.add_tool(AgentTool(
            name="scrape_url",
            description="Scrape a single URL",
            function=self._scrape_url,
            parameters_schema={
                "type": "object",
                "properties": {
                    "url": {"type": "string", "description": "URL to scrape"},
                    "method": {"type": "string", "enum": ["auto", "playwright", "requests"], "default": "auto"},
                    "extract_links": {"type": "boolean", "default": True},
                    "extract_images": {"type": "boolean", "default": True},
                    "take_screenshot": {"type": "boolean", "default": False}
                },
                "required": ["url"]
            }
        ))

        self.add_tool(AgentTool(
            name="batch_scrape",
            description="Scrape multiple URLs",
            function=self._batch_scrape,
            parameters_schema={
                "type": "object",
                "properties": {
                    "urls": {"type": "array", "items": {"type": "string"}},
                    "method": {"type": "string", "default": "auto"}
                },
                "required": ["urls"]
            }
        ))

        self.add_tool(AgentTool(
            name="check_accessibility",
            description="Quick check if URL is accessible",
            function=self._check_accessibility,
            parameters_schema={
                "type": "object",
                "properties": {
                    "url": {"type": "string", "description": "URL to check"}
                },
                "required": ["url"]
            }
        ))

    async def _scrape_url(self, url: str, method: str = "auto", **kwargs) -> Dict[str, Any]:
        """Unified URL scraping method"""
        try:
            if self.execution_mode == "docker" and self.docker_executor and self.docker_executor.available:
                return await self._scrape_with_docker(url, method, **kwargs)
            elif self.execution_mode == "proxy" and self.proxy_config:
                return await self._scrape_with_proxy(url, method, **kwargs)
            else:
                return await self._scrape_locally(url, method, **kwargs)

        except Exception as e:
            self.logger.error(f"Scraping error for {url}: {e}")
            return {
                "success": False,
                "error": str(e),
                "url": url,
                "method": method,
                "execution_mode": self.execution_mode
            }

    async def _scrape_with_docker(self, url: str, method: str, **kwargs) -> Dict[str, Any]:
        """Scrape using Docker executor"""
        task = ScrapingTask(
            url=url,
            method=method if method != "auto" else "playwright",
            extract_links=kwargs.get('extract_links', True),
            extract_images=kwargs.get('extract_images', True),
            take_screenshot=kwargs.get('take_screenshot', False),
            timeout=kwargs.get('timeout', self.scraper_config.get('timeout', 60))
        )

        result = self.docker_executor.execute_scraping_task(task)
        result['execution_mode'] = 'docker'
        return result

    async def _scrape_with_proxy(self, url: str, method: str, **kwargs) -> Dict[str, Any]:
        """Scrape using proxy (ScraperAPI style) with SSL verification disabled"""
        if method == "auto":
            method = "playwright" if PLAYWRIGHT_AVAILABLE else "requests"

        if method == "playwright" and PLAYWRIGHT_AVAILABLE:
            return await self._scrape_proxy_playwright(url, **kwargs)
        elif REQUESTS_AVAILABLE:
            return self._scrape_proxy_requests(url, **kwargs)
        else:
            raise RuntimeError("No proxy scraping methods available")

    async def _scrape_proxy_playwright(self, url: str, **kwargs) -> Dict[str, Any]:
        """Scrape using Playwright with proxy and SSL verification disabled"""
        async with async_playwright() as p:
            browser = None
            try:
                browser = await p.chromium.launch(
                    headless=True,
                    proxy={
                        "server": self.proxy_config['server'],
                        "username": self.proxy_config['username'],
                        "password": self.proxy_config['password']
                    },
                    args=[
                        '--no-sandbox',
                        '--disable-dev-shm-usage',
                        '--disable-web-security',
                        '--disable-features=VizDisplayCompositor',
                        '--ignore-certificate-errors',
                        '--ignore-ssl-errors',
                        '--ignore-certificate-errors-spki-list',
                        '--allow-running-insecure-content'
                    ]
                )

                context = await browser.new_context(
                    viewport={"width": 1920, "height": 1080},
                    user_agent=self.scraper_config.get('default_headers', {}).get('User-Agent',
                                                                                  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'),
                    ignore_https_errors=True
                )

                page = await context.new_page()
                start_time = time.time()

                timeout_ms = self.scraper_config.get('timeout', 60) * 1000
                response = await page.goto(url, wait_until="domcontentloaded", timeout=timeout_ms)
                await page.wait_for_timeout(3000)

                response_time = time.time() - start_time

                # Extract content
                title = await page.title()
                content = await page.inner_text("body")

                # Extract links
                links = []
                if kwargs.get('extract_links', True):
                    link_elements = await page.query_selector_all("a[href]")
                    max_links = self.scraper_config.get('max_links_per_page', 100)
                    for link in link_elements[:max_links]:
                        href = await link.get_attribute("href")
                        text = await link.inner_text()
                        if href and text:
                            links.append({
                                "url": urljoin(url, href),
                                "text": text.strip()[:100]
                            })

                # Extract images
                images = []
                if kwargs.get('extract_images', True):
                    img_elements = await page.query_selector_all("img[src]")
                    max_images = self.scraper_config.get('max_images_per_page', 50)
                    for img in img_elements[:max_images]:
                        src = await img.get_attribute("src")
                        alt = await img.get_attribute("alt") or ""
                        if src:
                            images.append({
                                "url": urljoin(url, src),
                                "alt": alt
                            })

                await browser.close()

                return {
                    "success": True,
                    "url": url,
                    "title": title,
                    "content": content[:5000],
                    "content_length": len(content),
                    "links": links,
                    "images": images,
                    "status_code": response.status if response else None,
                    "response_time": response_time,
                    "method": "proxy_playwright",
                    "execution_mode": "proxy"
                }

            except Exception as e:
                if browser:
                    await browser.close()
                raise e

    def _scrape_proxy_requests(self, url: str, **kwargs) -> Dict[str, Any]:
        """Scrape using requests with proxy and SSL verification disabled"""
        proxies = {
            'http': self.proxy_config['full_url'],
            'https': self.proxy_config['full_url']
        }

        headers = self.scraper_config.get('default_headers', {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        })

        start_time = time.time()

        response = requests.get(
            url,
            headers=headers,
            proxies=proxies,
            timeout=self.scraper_config.get('timeout', 60),
            verify=False,
            allow_redirects=True
        )
        response_time = time.time() - start_time

        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract content
        title = soup.find('title')
        title = title.get_text().strip() if title else "No title"

        for script in soup(["script", "style"]):
            script.decompose()

        content = soup.get_text()
        content = ' '.join(content.split())

        # Extract links and images based on config
        links = []
        images = []

        if kwargs.get('extract_links', True):
            max_links = self.scraper_config.get('max_links_per_page', 100)
            for link in soup.find_all('a', href=True)[:max_links]:
                links.append({
                    "url": urljoin(url, link['href']),
                    "text": link.get_text().strip()[:100]
                })

        if kwargs.get('extract_images', True):
            max_images = self.scraper_config.get('max_images_per_page', 50)
            for img in soup.find_all('img', src=True)[:max_images]:
                images.append({
                    "url": urljoin(url, img['src']),
                    "alt": img.get('alt', '')
                })

        return {
            "success": True,
            "url": url,
            "title": title,
            "content": content[:5000],
            "content_length": len(content),
            "links": links,
            "images": images,
            "status_code": response.status_code,
            "response_time": response_time,
            "method": "proxy_requests",
            "execution_mode": "proxy"
        }

    async def _scrape_locally(self, url: str, method: str, **kwargs) -> Dict[str, Any]:
        """Scrape using local methods (no proxy, no Docker)"""
        if method == "auto":
            method = "playwright" if PLAYWRIGHT_AVAILABLE else "requests"

        if method == "playwright" and PLAYWRIGHT_AVAILABLE:
            return await self._scrape_local_playwright(url, **kwargs)
        elif REQUESTS_AVAILABLE:
            return self._scrape_local_requests(url, **kwargs)
        else:
            raise RuntimeError("No local scraping methods available")

    async def _scrape_local_playwright(self, url: str, **kwargs) -> Dict[str, Any]:
        """Local Playwright scraping"""
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            context = await browser.new_context(
                user_agent=self.scraper_config.get('default_headers', {}).get('User-Agent',
                                                                              'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36')
            )
            page = await context.new_page()

            start_time = time.time()
            timeout_ms = self.scraper_config.get('timeout', 60) * 1000
            response = await page.goto(url, timeout=timeout_ms)
            response_time = time.time() - start_time

            title = await page.title()
            content = await page.inner_text("body")

            await browser.close()

            return {
                "success": True,
                "url": url,
                "title": title,
                "content": content[:5000],
                "content_length": len(content),
                "status_code": response.status if response else None,
                "response_time": response_time,
                "method": "local_playwright",
                "execution_mode": "local"
            }

    def _scrape_local_requests(self, url: str, **kwargs) -> Dict[str, Any]:
        """Local requests scraping"""
        headers = self.scraper_config.get('default_headers', {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        })

        start_time = time.time()
        response = requests.get(url, headers=headers, timeout=self.scraper_config.get('timeout', 60))
        response_time = time.time() - start_time

        soup = BeautifulSoup(response.content, 'html.parser')
        title = soup.find('title')
        title = title.get_text().strip() if title else "No title"

        for script in soup(["script", "style"]):
            script.decompose()

        content = soup.get_text()
        content = ' '.join(content.split())

        return {
            "success": True,
            "url": url,
            "title": title,
            "content": content[:5000],
            "content_length": len(content),
            "status_code": response.status_code,
            "response_time": response_time,
            "method": "local_requests",
            "execution_mode": "local"
        }

    async def _batch_scrape(self, urls: List[str], method: str = "auto") -> Dict[str, Any]:
        """Batch scraping with rate limiting from config"""
        results = []
        rate_limit = self.scraper_config.get('rate_limit_seconds', 1.0)

        for i, url in enumerate(urls):
            try:
                result = await self._scrape_url(url, method)
                results.append(result)

                if i < len(urls) - 1:
                    await asyncio.sleep(rate_limit)

            except Exception as e:
                results.append({
                    "success": False,
                    "url": url,
                    "error": str(e)
                })

        successful = sum(1 for r in results if r.get('success', False))

        return {
            "success": True,
            "total_urls": len(urls),
            "successful": successful,
            "failed": len(urls) - successful,
            "results": results,
            "execution_mode": self.execution_mode
        }

    async def _check_accessibility(self, url: str) -> Dict[str, Any]:
        """Check URL accessibility"""
        try:
            result = await self._scrape_url(url, extract_links=False, extract_images=False)
            return {
                "success": True,
                "url": url,
                "accessible": result.get('success', False),
                "status_code": result.get('status_code'),
                "response_time": result.get('response_time', 0),
                "error": result.get('error'),
                "timestamp": datetime.now().isoformat()
            }
        except Exception as e:
            return {
                "success": False,
                "error": str(e),
                "url": url,
                "timestamp": datetime.now().isoformat()
            }

    def _extract_urls_from_text(self, text: str) -> List[str]:
        """Extract URLs from text"""
        url_pattern = r'https?://[^\s<>"{}|\\^`\[\]]+'
        return re.findall(url_pattern, text)
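
For orientation, the snippet below is a minimal usage sketch of the `ScrapingTask` and `SimpleDockerExecutor` classes added in this file. It is not part of the released package; it assumes the wheel is installed, Docker is running locally with the `docker` Python SDK available, and the configured image (the executor's default `sgosain/amb-ubuntu-python-public-pod`) bundles Playwright with Chromium.

```python
# Hypothetical driver script; the config keys mirror what SimpleDockerExecutor reads.
from ambivo_agents.agents.web_scraper import ScrapingTask, SimpleDockerExecutor

executor = SimpleDockerExecutor({
    'docker_image': 'sgosain/amb-ubuntu-python-public-pod',  # same default the executor falls back to
    'timeout': 60,                                           # seconds, passed to containers.run()
})

task = ScrapingTask(
    url='https://example.com',
    extract_links=True,
    extract_images=False,
    timeout=45,
)

result = executor.execute_scraping_task(task)
if result.get('success'):
    print(result['title'], result['content_length'], len(result.get('links', [])))
else:
    print('Scraping failed:', result.get('error'))
```

When Docker is not available, `execute_scraping_task` returns an error dictionary instead of raising, so checking `result.get('success')` covers that case as well.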