ambivo-agents 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1027 @@
+ # ambivo_agents/agents/web_scraper.py
+ """
+ Web Scraper Agent with proxy, Docker, and local execution modes.
+ Updated with LLM-aware intent detection and conversation history integration.
+ """
+
+ import asyncio
+ import json
+ import re
+ import time
+ import random
+ import uuid
+ import logging
+ import ssl
+ import urllib3
+ from datetime import datetime
+ from typing import Dict, Any, List, Optional
+ from urllib.parse import urlparse, urljoin
+ from dataclasses import dataclass
+ from pathlib import Path
+
+ from ..core.base import BaseAgent, AgentRole, AgentMessage, MessageType, ExecutionContext, AgentTool
+ from ..config.loader import load_config, get_config_section
+ from ..core.history import WebAgentHistoryMixin, ContextType
+
+ # Conditional imports for different execution modes
+ try:
+     from playwright.async_api import async_playwright
+
+     PLAYWRIGHT_AVAILABLE = True
+ except ImportError:
+     PLAYWRIGHT_AVAILABLE = False
+
+ try:
+     import requests
+     from bs4 import BeautifulSoup
+
+     urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
+     REQUESTS_AVAILABLE = True
+ except ImportError:
+     REQUESTS_AVAILABLE = False
+
+ try:
+     import docker
+
+     DOCKER_AVAILABLE = True
+ except ImportError:
+     DOCKER_AVAILABLE = False
+
+
+ @dataclass
+ class ScrapingTask:
+     """Simple scraping task data structure"""
+     url: str
+     method: str = "playwright"
+     extract_links: bool = True
+     extract_images: bool = True
+     take_screenshot: bool = False
+     timeout: int = 45
+
+
+ class SimpleDockerExecutor:
+     """Simple Docker executor for scraping tasks"""
+
+     def __init__(self, config: Dict[str, Any] = None):
+         self.config = config or {}
+         self.docker_image = self.config.get('docker_image', 'sgosain/amb-ubuntu-python-public-pod')
+         self.timeout = self.config.get('timeout', 60)
+
+         if DOCKER_AVAILABLE:
+             try:
+                 self.docker_client = docker.from_env()
+                 self.docker_client.ping()
+                 self.available = True
+             except Exception as e:
+                 logging.warning(f"Docker initialization failed: {e}")
+                 self.available = False
+         else:
+             self.available = False
+
+     def execute_scraping_task(self, task: ScrapingTask) -> Dict[str, Any]:
+         """Execute a scraping task in Docker"""
+         if not self.available:
+             return {
+                 'success': False,
+                 'error': 'Docker not available',
+                 'url': task.url
+             }
+
+         try:
+             # Create scraping script for Docker
+             script_content = f"""
+ import asyncio
+ from playwright.async_api import async_playwright
+ import json
+
+ async def scrape_url():
+     async with async_playwright() as p:
+         browser = await p.chromium.launch(headless=True)
+         page = await browser.new_page()
+
+         try:
+             response = await page.goto('{task.url}', timeout={task.timeout * 1000})
+             title = await page.title()
+             content = await page.inner_text('body')
+
+             # Extract links if requested
+             links = []
+             if {task.extract_links}:
+                 link_elements = await page.query_selector_all('a[href]')
+                 for link in link_elements[:50]:  # Limit to 50 links
+                     href = await link.get_attribute('href')
+                     text = await link.inner_text()
+                     if href and text:
+                         links.append({{'url': href, 'text': text[:100]}})
+
+             # Extract images if requested
+             images = []
+             if {task.extract_images}:
+                 img_elements = await page.query_selector_all('img[src]')
+                 for img in img_elements[:25]:  # Limit to 25 images
+                     src = await img.get_attribute('src')
+                     alt = await img.get_attribute('alt') or ''
+                     if src:
+                         images.append({{'url': src, 'alt': alt}})
+
+             result = {{
+                 'success': True,
+                 'url': '{task.url}',
+                 'title': title,
+                 'content': content[:5000],  # Limit content
+                 'content_length': len(content),
+                 'links': links,
+                 'images': images,
+                 'status_code': response.status if response else None,
+                 'method': 'docker_playwright',
+                 'execution_mode': 'docker'
+             }}
+
+             print(json.dumps(result))
+
+         except Exception as e:
+             error_result = {{
+                 'success': False,
+                 'error': str(e),
+                 'url': '{task.url}',
+                 'execution_mode': 'docker'
+             }}
+             print(json.dumps(error_result))
+
+         finally:
+             await browser.close()
+
+ asyncio.run(scrape_url())
+ """
+
+             # Execute in Docker container
+             container = self.docker_client.containers.run(
+                 image=self.docker_image,
+                 command=['python', '-c', script_content],
+                 remove=True,
+                 mem_limit='512m',
+                 network_disabled=False,  # Need network for scraping
+                 stdout=True,
+                 stderr=True,
+                 timeout=self.timeout
+             )
+
+             # Parse result
+             output = container.decode('utf-8') if isinstance(container, bytes) else str(container)
+             return json.loads(output.strip().split('\n')[-1])
+
+         except Exception as e:
+             return {
+                 'success': False,
+                 'error': str(e),
+                 'url': task.url,
+                 'execution_mode': 'docker'
+             }
+
+
+ class WebScraperAgent(BaseAgent, WebAgentHistoryMixin):
+     """Unified web scraper agent with proxy, Docker, and local execution modes"""
+
+     def __init__(self, agent_id: str = None, memory_manager=None, llm_service=None, **kwargs):
+
+         if agent_id is None:
+             agent_id = f"scraper_{str(uuid.uuid4())[:8]}"
+
+         super().__init__(
+             agent_id=agent_id,
+             role=AgentRole.RESEARCHER,
+             memory_manager=memory_manager,
+             llm_service=llm_service,
+             name="Web Scraper Agent",
+             description="Unified web scraper with proxy, Docker, and local execution modes",
+             **kwargs
+         )
+
+         # Initialize history mixin
+         self.setup_history_mixin()
+
+         self.logger = logging.getLogger(f"WebScraperAgent-{agent_id}")
+
+         # Load configuration from YAML
+         try:
+             config = load_config()
+             self.scraper_config = get_config_section('web_scraping', config)
+         except Exception as e:
+             raise ValueError(f"web_scraping configuration not found in agent_config.yaml: {e}")
+
+         # Initialize execution mode based on config
+         self.execution_mode = self._determine_execution_mode()
+
+         # Initialize executors based on availability and config
+         self.docker_executor = None
+         self.proxy_config = None
+
+         # Initialize Docker executor if configured
+         if self.execution_mode in ["docker", "auto"]:
+             try:
+                 docker_config = {
+                     **self.scraper_config,
+                     'docker_image': self.scraper_config.get('docker_image'),
+                     'timeout': self.scraper_config.get('timeout', 60)
+                 }
+                 self.docker_executor = SimpleDockerExecutor(docker_config)
+             except Exception as e:
+                 self.logger.warning(f"Docker executor initialization failed: {e}")
+
+         # Initialize proxy configuration if enabled
+         if self.scraper_config.get('proxy_enabled', False):
+             proxy_url = self.scraper_config.get('proxy_config', {}).get('http_proxy')
+             if proxy_url:
+                 self.proxy_config = self._parse_proxy_url(proxy_url)
+                 self._configure_ssl_for_proxy()
+
+         # Add tools
+         self._add_scraping_tools()
+
+         self.logger.info(f"WebScraperAgent initialized (Mode: {self.execution_mode})")
+
+     async def _llm_analyze_scraping_intent(self, user_message: str, conversation_context: str = "") -> Dict[str, Any]:
+         """Use LLM to analyze web scraping intent"""
+         if not self.llm_service:
+             return self._keyword_based_scraping_analysis(user_message)
+
+         prompt = f"""
+ Analyze this user message in the context of web scraping and extract:
+ 1. Primary intent (scrape_single, scrape_batch, check_accessibility, help_request)
+ 2. URLs to scrape
+ 3. Extraction preferences (links, images, content)
+ 4. Context references (referring to previous scraping operations)
+ 5. Technical specifications (method, timeout, etc.)
+
+ Conversation Context:
+ {conversation_context}
+
+ Current User Message: {user_message}
+
+ Respond in JSON format:
+ {{
+     "primary_intent": "scrape_single|scrape_batch|check_accessibility|help_request",
+     "urls": ["http://example.com"],
+     "extraction_preferences": {{
+         "extract_links": true,
+         "extract_images": true,
+         "take_screenshot": false
+     }},
+     "uses_context_reference": true/false,
+     "context_type": "previous_url|previous_operation",
+     "technical_specs": {{
+         "method": "playwright|requests|auto",
+         "timeout": 60
+     }},
+     "confidence": 0.0-1.0
+ }}
+ """
+
+         try:
+             response = await self.llm_service.generate_response(prompt)
+             import re
+             json_match = re.search(r'\{.*\}', response, re.DOTALL)
+             if json_match:
+                 return json.loads(json_match.group())
+             else:
+                 return self._extract_scraping_intent_from_llm_response(response, user_message)
+         except Exception as e:
+             return self._keyword_based_scraping_analysis(user_message)
+
+     def _keyword_based_scraping_analysis(self, user_message: str) -> Dict[str, Any]:
+         """Fallback keyword-based scraping intent analysis"""
+         content_lower = user_message.lower()
+
+         # Determine intent
+         if any(word in content_lower for word in ['batch', 'multiple', 'several']):
+             intent = 'scrape_batch'
+         elif any(word in content_lower for word in ['check', 'test', 'accessible']):
+             intent = 'check_accessibility'
+         elif any(word in content_lower for word in ['scrape', 'extract', 'crawl']):
+             intent = 'scrape_single'
+         else:
+             intent = 'help_request'
+
+         # Extract URLs
+         urls = self.extract_context_from_text(user_message, ContextType.URL)
+
+         return {
+             "primary_intent": intent,
+             "urls": urls,
+             "extraction_preferences": {
+                 "extract_links": True,
+                 "extract_images": True,
+                 "take_screenshot": False
+             },
+             "uses_context_reference": any(word in content_lower for word in ['this', 'that', 'it']),
+             "context_type": "previous_url",
+             "technical_specs": {
+                 "method": "auto",
+                 "timeout": 60
+             },
+             "confidence": 0.7
+         }
+
+     async def process_message(self, message: AgentMessage, context: ExecutionContext = None) -> AgentMessage:
+         """Process message with LLM-based scraping intent detection and history context"""
+         self.memory.store_message(message)
+
+         try:
+             user_message = message.content
+
+             # Update conversation state
+             self.update_conversation_state(user_message)
+
+             # Get conversation context for LLM analysis
+             conversation_context = self._get_scraping_conversation_context_summary()
+
+             # Use LLM to analyze intent
+             intent_analysis = await self._llm_analyze_scraping_intent(user_message, conversation_context)
+
+             # Route request based on LLM analysis
+             response_content = await self._route_scraping_with_llm_analysis(intent_analysis, user_message, context)
+
+             response = self.create_response(
+                 content=response_content,
+                 recipient_id=message.sender_id,
+                 session_id=message.session_id,
+                 conversation_id=message.conversation_id
+             )
+
+             self.memory.store_message(response)
+             return response
+
+         except Exception as e:
+             error_response = self.create_response(
+                 content=f"Web Scraper Agent error: {str(e)}",
+                 recipient_id=message.sender_id,
+                 message_type=MessageType.ERROR,
+                 session_id=message.session_id,
+                 conversation_id=message.conversation_id
+             )
+             return error_response
+
+     def _get_scraping_conversation_context_summary(self) -> str:
+         """Get scraping conversation context summary"""
+         try:
+             recent_history = self.get_conversation_history_with_context(
+                 limit=3,
+                 context_types=[ContextType.URL]
+             )
+
+             context_summary = []
+             for msg in recent_history:
+                 if msg.get('message_type') == 'user_input':
+                     extracted_context = msg.get('extracted_context', {})
+                     urls = extracted_context.get('url', [])
+
+                     if urls:
+                         context_summary.append(f"Previous URL: {urls[0]}")
+
+             return "\n".join(context_summary) if context_summary else "No previous scraping context"
+         except Exception:
+             return "No previous scraping context"
+
+     async def _route_scraping_with_llm_analysis(self, intent_analysis: Dict[str, Any], user_message: str,
+                                                 context: ExecutionContext) -> str:
+         """Route scraping request based on LLM intent analysis"""
+
+         primary_intent = intent_analysis.get("primary_intent", "help_request")
+         urls = intent_analysis.get("urls", [])
+         extraction_prefs = intent_analysis.get("extraction_preferences", {})
+         uses_context = intent_analysis.get("uses_context_reference", False)
+
+         # Resolve context references if needed
+         if uses_context and not urls:
+             recent_url = self.get_recent_url()
+             if recent_url:
+                 urls = [recent_url]
+
+         # Route based on intent
+         if primary_intent == "help_request":
+             return await self._handle_scraping_help_request(user_message)
+         elif primary_intent == "scrape_single":
+             return await self._handle_single_scrape(urls, extraction_prefs, user_message)
+         elif primary_intent == "scrape_batch":
+             return await self._handle_batch_scrape(urls, extraction_prefs, user_message)
+         elif primary_intent == "check_accessibility":
+             return await self._handle_accessibility_check(urls, user_message)
+         else:
+             return await self._handle_scraping_help_request(user_message)
+
+     async def _handle_single_scrape(self, urls: List[str], extraction_prefs: Dict[str, Any], user_message: str) -> str:
+         """Handle single URL scraping"""
+         if not urls:
+             recent_url = self.get_recent_url()
+             if recent_url:
+                 return f"I can scrape web pages. Did you mean to scrape **{recent_url}**? Please confirm."
+             else:
+                 return "I can scrape web pages. Please provide a URL to scrape.\n\n" \
+                        "Example: 'scrape https://example.com'"
+
+         url = urls[0]
+
+         try:
+             result = await self._scrape_url(
+                 url=url,
+                 extract_links=extraction_prefs.get("extract_links", True),
+                 extract_images=extraction_prefs.get("extract_images", True),
+                 take_screenshot=extraction_prefs.get("take_screenshot", False)
+             )
+
+             if result['success']:
+                 return f"""✅ **Web Scraping Completed**
+
+ 🌐 **URL:** {result['url']}
+ 🔧 **Method:** {result.get('method', 'unknown')}
+ 🏃 **Mode:** {result['execution_mode']}
+ 📊 **Status:** {result.get('status_code', 'N/A')}
+ 📄 **Content:** {result['content_length']:,} characters
+ ⏱️ **Time:** {result.get('response_time', 0):.2f}s
+
+ **Title:** {result.get('title', 'No title')}
+
+ **Content Preview:**
+ {result.get('content', '')[:300]}{'...' if len(result.get('content', '')) > 300 else ''}
+
+ **Links Found:** {len(result.get('links', []))}
+ **Images Found:** {len(result.get('images', []))}"""
+             else:
+                 return f"❌ **Scraping failed:** {result['error']}"
+
+         except Exception as e:
+             return f"❌ **Error during scraping:** {str(e)}"
+
+     async def _handle_batch_scrape(self, urls: List[str], extraction_prefs: Dict[str, Any], user_message: str) -> str:
+         """Handle batch URL scraping"""
+         if not urls:
+             return "I can scrape multiple web pages. Please provide URLs to scrape.\n\n" \
+                    "Example: 'scrape https://example1.com and https://example2.com'"
+
+         try:
+             result = await self._batch_scrape(
+                 urls=urls,
+                 method="auto"
+             )
+
+             if result['success']:
+                 successful = result['successful']
+                 failed = result['failed']
+                 total = result['total_urls']
+
+                 response = f"""📦 **Batch Web Scraping Completed**
+
+ 📊 **Summary:**
+ - **Total URLs:** {total}
+ - **Successful:** {successful}
+ - **Failed:** {failed}
+ - **Mode:** {result['execution_mode']}
+
+ """
+
+                 if successful > 0:
+                     response += "✅ **Successfully Scraped:**\n"
+                     for i, scrape_result in enumerate(result['results'], 1):
+                         if scrape_result.get('success', False):
+                             response += f"{i}. {scrape_result.get('url', 'Unknown')}\n"
+
+                 if failed > 0:
+                     response += f"\n❌ **Failed Scrapes:** {failed}\n"
+                     for i, scrape_result in enumerate(result['results'], 1):
+                         if not scrape_result.get('success', False):
+                             response += f"{i}. {scrape_result.get('url', 'Unknown')}: {scrape_result.get('error', 'Unknown error')}\n"
+
+                 response += f"\n🎉 Batch scraping completed with {successful}/{total} successful scrapes!"
+                 return response
+             else:
+                 return f"❌ **Batch scraping failed:** {result['error']}"
+
+         except Exception as e:
+             return f"❌ **Error during batch scraping:** {str(e)}"
+
+     async def _handle_accessibility_check(self, urls: List[str], user_message: str) -> str:
+         """Handle accessibility check"""
+         if not urls:
+             recent_url = self.get_recent_url()
+             if recent_url:
+                 return f"I can check if websites are accessible. Did you mean to check **{recent_url}**?"
+             else:
+                 return "I can check if websites are accessible. Please provide a URL to check."
+
+         url = urls[0]
+
+         try:
+             result = await self._check_accessibility(url)
+
+             if result['success']:
+                 status = "✅ Accessible" if result.get('accessible', False) else "❌ Not Accessible"
+                 return f"""🔍 **Accessibility Check Results**
+
+ 🌐 **URL:** {result['url']}
+ 🚦 **Status:** {status}
+ 📊 **HTTP Status:** {result.get('status_code', 'Unknown')}
+ ⏱️ **Response Time:** {result.get('response_time', 0):.2f}s
+ 📅 **Checked:** {result.get('timestamp', 'Unknown')}
+
+ {'The website is accessible and responding normally.' if result.get('accessible', False) else 'The website is not accessible or not responding.'}"""
+             else:
+                 return f"❌ **Accessibility check failed:** {result['error']}"
+
+         except Exception as e:
+             return f"❌ **Error during accessibility check:** {str(e)}"
+
+     async def _handle_scraping_help_request(self, user_message: str) -> str:
+         """Handle scraping help requests with conversation context"""
+         state = self.get_conversation_state()
+
+         response = ("I'm your Web Scraper Agent! I can help you with:\n\n"
+                     "🕷️ **Web Scraping**\n"
+                     "- Extract content from web pages\n"
+                     "- Scrape multiple URLs at once\n"
+                     "- Extract links and images\n"
+                     "- Take screenshots\n\n"
+                     "🔧 **Multiple Execution Modes**\n"
+                     "- Proxy support (ScraperAPI compatible)\n"
+                     "- Docker-based secure execution\n"
+                     "- Local fallback methods\n\n"
+                     "🧠 **Smart Context Features**\n"
+                     "- Remembers URLs from previous messages\n"
+                     "- Understands 'that website' and 'this page'\n"
+                     "- Maintains conversation state\n\n")
+
+         # Add current context information
+         if state.current_resource:
+             response += f"🎯 **Current URL:** {state.current_resource}\n"
+
+         response += f"\n🔧 **Current Mode:** {self.execution_mode.upper()}\n"
+         response += f"📡 **Proxy Enabled:** {'✅' if self.proxy_config else '❌'}\n"
+         response += f"🐳 **Docker Available:** {'✅' if self.docker_executor and self.docker_executor.available else '❌'}\n"
+
+         response += "\n💡 **Examples:**\n"
+         response += "• 'scrape https://example.com'\n"
+         response += "• 'batch scrape https://site1.com https://site2.com'\n"
+         response += "• 'check if https://example.com is accessible'\n"
+         response += "\nI understand context from our conversation! 🚀"
+
+         return response
+
+     def _extract_scraping_intent_from_llm_response(self, llm_response: str, user_message: str) -> Dict[str, Any]:
+         """Extract scraping intent from non-JSON LLM response"""
+         content_lower = llm_response.lower()
+
+         if 'batch' in content_lower or 'multiple' in content_lower:
+             intent = 'scrape_batch'
+         elif 'scrape' in content_lower:
+             intent = 'scrape_single'
+         elif 'check' in content_lower or 'accessible' in content_lower:
+             intent = 'check_accessibility'
+         else:
+             intent = 'help_request'
+
+         return {
+             "primary_intent": intent,
+             "urls": [],
+             "extraction_preferences": {"extract_links": True, "extract_images": True},
+             "uses_context_reference": False,
+             "context_type": "none",
+             "technical_specs": {"method": "auto"},
+             "confidence": 0.6
+         }
+
+     def _configure_ssl_for_proxy(self):
+         """Configure SSL settings for proxy usage"""
+         if REQUESTS_AVAILABLE:
+             try:
+                 ssl_context = ssl.create_default_context()
+                 ssl_context.check_hostname = False
+                 ssl_context.verify_mode = ssl.CERT_NONE
+
+                 import requests.packages.urllib3.util.ssl_
+                 requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS = 'ALL'
+             except Exception as e:
+                 self.logger.warning(f"SSL configuration warning: {e}")
+
+         self.logger.info("SSL verification disabled for proxy usage")
+
+     def _determine_execution_mode(self) -> str:
+         """Determine execution mode from configuration"""
+         # Check if proxy is enabled in config
+         if self.scraper_config.get('proxy_enabled', False):
+             proxy_url = self.scraper_config.get('proxy_config', {}).get('http_proxy')
+             if proxy_url:
+                 return "proxy"
+
+         # Check if Docker should be used
+         if self.scraper_config.get('docker_image'):
+             return "docker"
+
+         # Fall back to local execution
+         if PLAYWRIGHT_AVAILABLE or REQUESTS_AVAILABLE:
+             return "local"
+
+         raise RuntimeError("No scraping execution methods available")
+
+     def _parse_proxy_url(self, proxy_url: str) -> Dict[str, Any]:
+         """Parse proxy URL for different usage formats"""
+         try:
+             parsed = urlparse(proxy_url)
+             return {
+                 'server': f"{parsed.scheme}://{parsed.hostname}:{parsed.port}",
+                 'username': parsed.username,
+                 'password': parsed.password,
+                 'host': parsed.hostname,
+                 'port': parsed.port,
+                 'full_url': proxy_url
+             }
+         except Exception as e:
+             self.logger.error(f"Failed to parse proxy URL: {e}")
+             return {}
+
+     def _add_scraping_tools(self):
+         """Add scraping tools"""
+         self.add_tool(AgentTool(
+             name="scrape_url",
+             description="Scrape a single URL",
+             function=self._scrape_url,
+             parameters_schema={
+                 "type": "object",
+                 "properties": {
+                     "url": {"type": "string", "description": "URL to scrape"},
+                     "method": {"type": "string", "enum": ["auto", "playwright", "requests"], "default": "auto"},
+                     "extract_links": {"type": "boolean", "default": True},
+                     "extract_images": {"type": "boolean", "default": True},
+                     "take_screenshot": {"type": "boolean", "default": False}
+                 },
+                 "required": ["url"]
+             }
+         ))
+
+         self.add_tool(AgentTool(
+             name="batch_scrape",
+             description="Scrape multiple URLs",
+             function=self._batch_scrape,
+             parameters_schema={
+                 "type": "object",
+                 "properties": {
+                     "urls": {"type": "array", "items": {"type": "string"}},
+                     "method": {"type": "string", "default": "auto"}
+                 },
+                 "required": ["urls"]
+             }
+         ))
+
+         self.add_tool(AgentTool(
+             name="check_accessibility",
+             description="Quick check if URL is accessible",
+             function=self._check_accessibility,
+             parameters_schema={
+                 "type": "object",
+                 "properties": {
+                     "url": {"type": "string", "description": "URL to check"}
+                 },
+                 "required": ["url"]
+             }
+         ))
+
+     async def _scrape_url(self, url: str, method: str = "auto", **kwargs) -> Dict[str, Any]:
+         """Unified URL scraping method"""
+         try:
+             if self.execution_mode == "docker" and self.docker_executor and self.docker_executor.available:
+                 return await self._scrape_with_docker(url, method, **kwargs)
+             elif self.execution_mode == "proxy" and self.proxy_config:
+                 return await self._scrape_with_proxy(url, method, **kwargs)
+             else:
+                 return await self._scrape_locally(url, method, **kwargs)
+
+         except Exception as e:
+             self.logger.error(f"Scraping error for {url}: {e}")
+             return {
+                 "success": False,
+                 "error": str(e),
+                 "url": url,
+                 "method": method,
+                 "execution_mode": self.execution_mode
+             }
+
+     async def _scrape_with_docker(self, url: str, method: str, **kwargs) -> Dict[str, Any]:
+         """Scrape using Docker executor"""
+         task = ScrapingTask(
+             url=url,
+             method=method if method != "auto" else "playwright",
+             extract_links=kwargs.get('extract_links', True),
+             extract_images=kwargs.get('extract_images', True),
+             take_screenshot=kwargs.get('take_screenshot', False),
+             timeout=kwargs.get('timeout', self.scraper_config.get('timeout', 60))
+         )
+
+         result = self.docker_executor.execute_scraping_task(task)
+         result['execution_mode'] = 'docker'
+         return result
+
+     async def _scrape_with_proxy(self, url: str, method: str, **kwargs) -> Dict[str, Any]:
+         """Scrape using proxy (ScraperAPI style) with SSL verification disabled"""
+         if method == "auto":
+             method = "playwright" if PLAYWRIGHT_AVAILABLE else "requests"
+
+         if method == "playwright" and PLAYWRIGHT_AVAILABLE:
+             return await self._scrape_proxy_playwright(url, **kwargs)
+         elif REQUESTS_AVAILABLE:
+             return self._scrape_proxy_requests(url, **kwargs)
+         else:
+             raise RuntimeError("No proxy scraping methods available")
+
+     async def _scrape_proxy_playwright(self, url: str, **kwargs) -> Dict[str, Any]:
+         """Scrape using Playwright with proxy and SSL verification disabled"""
+         async with async_playwright() as p:
+             browser = None
+             try:
+                 browser = await p.chromium.launch(
+                     headless=True,
+                     proxy={
+                         "server": self.proxy_config['server'],
+                         "username": self.proxy_config['username'],
+                         "password": self.proxy_config['password']
+                     },
+                     args=[
+                         '--no-sandbox',
+                         '--disable-dev-shm-usage',
+                         '--disable-web-security',
+                         '--disable-features=VizDisplayCompositor',
+                         '--ignore-certificate-errors',
+                         '--ignore-ssl-errors',
+                         '--ignore-certificate-errors-spki-list',
+                         '--allow-running-insecure-content'
+                     ]
+                 )
+
+                 context = await browser.new_context(
+                     viewport={"width": 1920, "height": 1080},
+                     user_agent=self.scraper_config.get('default_headers', {}).get('User-Agent',
+                                                                                   'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'),
+                     ignore_https_errors=True
+                 )
+
+                 page = await context.new_page()
+                 start_time = time.time()
+
+                 timeout_ms = self.scraper_config.get('timeout', 60) * 1000
+                 response = await page.goto(url, wait_until="domcontentloaded", timeout=timeout_ms)
+                 await page.wait_for_timeout(3000)
+
+                 response_time = time.time() - start_time
+
+                 # Extract content
+                 title = await page.title()
+                 content = await page.inner_text("body")
+
+                 # Extract links
+                 links = []
+                 if kwargs.get('extract_links', True):
+                     link_elements = await page.query_selector_all("a[href]")
+                     max_links = self.scraper_config.get('max_links_per_page', 100)
+                     for link in link_elements[:max_links]:
+                         href = await link.get_attribute("href")
+                         text = await link.inner_text()
+                         if href and text:
+                             links.append({
+                                 "url": urljoin(url, href),
+                                 "text": text.strip()[:100]
+                             })
+
+                 # Extract images
+                 images = []
+                 if kwargs.get('extract_images', True):
+                     img_elements = await page.query_selector_all("img[src]")
+                     max_images = self.scraper_config.get('max_images_per_page', 50)
+                     for img in img_elements[:max_images]:
+                         src = await img.get_attribute("src")
+                         alt = await img.get_attribute("alt") or ""
+                         if src:
+                             images.append({
+                                 "url": urljoin(url, src),
+                                 "alt": alt
+                             })
+
+                 await browser.close()
+
+                 return {
+                     "success": True,
+                     "url": url,
+                     "title": title,
+                     "content": content[:5000],
+                     "content_length": len(content),
+                     "links": links,
+                     "images": images,
+                     "status_code": response.status if response else None,
+                     "response_time": response_time,
+                     "method": "proxy_playwright",
+                     "execution_mode": "proxy"
+                 }
+
+             except Exception as e:
+                 if browser:
+                     await browser.close()
+                 raise e
+
+     def _scrape_proxy_requests(self, url: str, **kwargs) -> Dict[str, Any]:
+         """Scrape using requests with proxy and SSL verification disabled"""
+         proxies = {
+             'http': self.proxy_config['full_url'],
+             'https': self.proxy_config['full_url']
+         }
+
+         headers = self.scraper_config.get('default_headers', {
+             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
+             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+         })
+
+         start_time = time.time()
+
+         response = requests.get(
+             url,
+             headers=headers,
+             proxies=proxies,
+             timeout=self.scraper_config.get('timeout', 60),
+             verify=False,
+             allow_redirects=True
+         )
+         response_time = time.time() - start_time
+
+         soup = BeautifulSoup(response.content, 'html.parser')
+
+         # Extract content
+         title = soup.find('title')
+         title = title.get_text().strip() if title else "No title"
+
+         for script in soup(["script", "style"]):
+             script.decompose()
+
+         content = soup.get_text()
+         content = ' '.join(content.split())
+
+         # Extract links and images based on config
+         links = []
+         images = []
+
+         if kwargs.get('extract_links', True):
+             max_links = self.scraper_config.get('max_links_per_page', 100)
+             for link in soup.find_all('a', href=True)[:max_links]:
+                 links.append({
+                     "url": urljoin(url, link['href']),
+                     "text": link.get_text().strip()[:100]
+                 })
+
+         if kwargs.get('extract_images', True):
+             max_images = self.scraper_config.get('max_images_per_page', 50)
+             for img in soup.find_all('img', src=True)[:max_images]:
+                 images.append({
+                     "url": urljoin(url, img['src']),
+                     "alt": img.get('alt', '')
+                 })
+
+         return {
+             "success": True,
+             "url": url,
+             "title": title,
+             "content": content[:5000],
+             "content_length": len(content),
+             "links": links,
+             "images": images,
+             "status_code": response.status_code,
+             "response_time": response_time,
+             "method": "proxy_requests",
+             "execution_mode": "proxy"
+         }
+
+     async def _scrape_locally(self, url: str, method: str, **kwargs) -> Dict[str, Any]:
+         """Scrape using local methods (no proxy, no Docker)"""
+         if method == "auto":
+             method = "playwright" if PLAYWRIGHT_AVAILABLE else "requests"
+
+         if method == "playwright" and PLAYWRIGHT_AVAILABLE:
+             return await self._scrape_local_playwright(url, **kwargs)
+         elif REQUESTS_AVAILABLE:
+             return self._scrape_local_requests(url, **kwargs)
+         else:
+             raise RuntimeError("No local scraping methods available")
+
+     async def _scrape_local_playwright(self, url: str, **kwargs) -> Dict[str, Any]:
+         """Local Playwright scraping"""
+         async with async_playwright() as p:
+             browser = await p.chromium.launch(headless=True)
+             context = await browser.new_context(
+                 user_agent=self.scraper_config.get('default_headers', {}).get('User-Agent',
+                                                                               'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36')
+             )
+             page = await context.new_page()
+
+             start_time = time.time()
+             timeout_ms = self.scraper_config.get('timeout', 60) * 1000
+             response = await page.goto(url, timeout=timeout_ms)
+             response_time = time.time() - start_time
+
+             title = await page.title()
+             content = await page.inner_text("body")
+
+             await browser.close()
+
+             return {
+                 "success": True,
+                 "url": url,
+                 "title": title,
+                 "content": content[:5000],
+                 "content_length": len(content),
+                 "status_code": response.status if response else None,
+                 "response_time": response_time,
+                 "method": "local_playwright",
+                 "execution_mode": "local"
+             }
+
+     def _scrape_local_requests(self, url: str, **kwargs) -> Dict[str, Any]:
+         """Local requests scraping"""
+         headers = self.scraper_config.get('default_headers', {
+             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
+         })
+
+         start_time = time.time()
+         response = requests.get(url, headers=headers, timeout=self.scraper_config.get('timeout', 60))
+         response_time = time.time() - start_time
+
+         soup = BeautifulSoup(response.content, 'html.parser')
+         title = soup.find('title')
+         title = title.get_text().strip() if title else "No title"
+
+         for script in soup(["script", "style"]):
+             script.decompose()
+
+         content = soup.get_text()
+         content = ' '.join(content.split())
+
+         return {
+             "success": True,
+             "url": url,
+             "title": title,
+             "content": content[:5000],
+             "content_length": len(content),
+             "status_code": response.status_code,
+             "response_time": response_time,
+             "method": "local_requests",
+             "execution_mode": "local"
+         }
+
+     async def _batch_scrape(self, urls: List[str], method: str = "auto") -> Dict[str, Any]:
+         """Batch scraping with rate limiting from config"""
+         results = []
+         rate_limit = self.scraper_config.get('rate_limit_seconds', 1.0)
+
+         for i, url in enumerate(urls):
+             try:
+                 result = await self._scrape_url(url, method)
+                 results.append(result)
+
+                 if i < len(urls) - 1:
+                     await asyncio.sleep(rate_limit)
+
+             except Exception as e:
+                 results.append({
+                     "success": False,
+                     "url": url,
+                     "error": str(e)
+                 })
+
+         successful = sum(1 for r in results if r.get('success', False))
+
+         return {
+             "success": True,
+             "total_urls": len(urls),
+             "successful": successful,
+             "failed": len(urls) - successful,
+             "results": results,
+             "execution_mode": self.execution_mode
+         }
+
+     async def _check_accessibility(self, url: str) -> Dict[str, Any]:
+         """Check URL accessibility"""
+         try:
+             result = await self._scrape_url(url, extract_links=False, extract_images=False)
+             return {
+                 "success": True,
+                 "url": url,
+                 "accessible": result.get('success', False),
+                 "status_code": result.get('status_code'),
+                 "response_time": result.get('response_time', 0),
+                 "error": result.get('error'),
+                 "timestamp": datetime.now().isoformat()
+             }
+         except Exception as e:
+             return {
+                 "success": False,
+                 "error": str(e),
+                 "url": url,
+                 "timestamp": datetime.now().isoformat()
+             }
+
+     def _extract_urls_from_text(self, text: str) -> List[str]:
+         """Extract URLs from text"""
+         url_pattern = r'https?://[^\s<>"{}|\\^`\[\]]+'
+         return re.findall(url_pattern, text)
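
For orientation, a minimal usage sketch of the agent added in this diff. It assumes the package exposes WebScraperAgent from ambivo_agents.agents.web_scraper and AgentMessage from ambivo_agents.core.base as imported above; the exact AgentMessage constructor fields (here mirrored from the attributes read in process_message) and the required web_scraping section of agent_config.yaml should be verified against the installed package, so treat this as illustrative only.

import asyncio

from ambivo_agents.agents.web_scraper import WebScraperAgent
from ambivo_agents.core.base import AgentMessage  # assumed import path


async def main():
    # __init__ raises ValueError unless agent_config.yaml has a web_scraping section.
    agent = WebScraperAgent()

    # Hypothetical message construction; field names mirror those used in process_message().
    request = AgentMessage(
        content="scrape https://example.com",
        sender_id="user-1",
        recipient_id=agent.agent_id,
        session_id="session-1",
        conversation_id="conv-1",
    )

    reply = await agent.process_message(request)
    print(reply.content)


asyncio.run(main())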