praisonaiagents 0.0.23__py3-none-any.whl → 0.0.24__py3-none-any.whl

@@ -0,0 +1,431 @@
+ """Tools for web scraping and crawling.
+
+ Usage:
+     from praisonaiagents.tools import spider_tools
+     content = spider_tools.scrape_page("https://example.com")
+     links = spider_tools.extract_links("https://example.com")
+
+     or
+     from praisonaiagents.tools import scrape_page, extract_links
+     content = scrape_page("https://example.com")
+ """
+
+ import logging
+ from typing import List, Dict, Union, Optional, Any
+ from importlib import util
+ import json
+ from urllib.parse import urljoin, urlparse
+ import re
+ import os
+ import hashlib
+ import time
+
+ class SpiderTools:
+     """Tools for web scraping and crawling."""
+
+     def __init__(self):
+         """Initialize SpiderTools and check for required packages."""
+         self._session = None
+
+     def _get_session(self):
+         """Get or create requests session with common headers."""
+         if util.find_spec('requests') is None:
+             error_msg = "requests package is not available. Please install it using: pip install requests"
+             logging.error(error_msg)
+             return None
+
+         if self._session is None:
+             import requests
+             self._session = requests.Session()
+             self._session.headers.update({
+                 'User-Agent': 'Mozilla/5.0 (compatible; PraisonAI/1.0; +http://praisonai.com/bot)',
+                 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+                 'Accept-Language': 'en-US,en;q=0.5',
+                 'Accept-Encoding': 'gzip, deflate, br',
+                 'Connection': 'keep-alive',
+             })
+         return self._session
+
+     def scrape_page(
+         self,
+         url: str,
+         selector: Optional[str] = None,
+         extract_images: bool = False,
+         extract_links: bool = False,
+         timeout: int = 30,
+         verify_ssl: bool = True
+     ) -> Union[Dict[str, Any], Dict[str, str]]:
+         """
+         Scrape content from a webpage.
+
+         Args:
+             url: URL to scrape
+             selector: Optional CSS selector to extract specific content
+             extract_images: Whether to extract image URLs
+             extract_links: Whether to extract links
+             timeout: Request timeout in seconds
+             verify_ssl: Whether to verify SSL certificates
+
+         Returns:
+             Dict: Scraped content or error dict
+         """
+         try:
+             session = self._get_session()
+             if session is None:
+                 return {"error": "requests package not available"}
+
+             # Import BeautifulSoup only when needed
+             if util.find_spec('bs4') is None:
+                 error_msg = "bs4 package is not available. Please install it using: pip install beautifulsoup4"
+                 logging.error(error_msg)
+                 return {"error": error_msg}
+             from bs4 import BeautifulSoup
+
+             # Make request
+             response = session.get(
+                 url,
+                 timeout=timeout,
+                 verify=verify_ssl
+             )
+             response.raise_for_status()
+
+             # Parse HTML
+             soup = BeautifulSoup(response.text, 'lxml')
+
+             # Remove unwanted elements
+             for element in soup(['script', 'style']):
+                 element.decompose()
+
+             # Initialize result
+             result = {
+                 'url': url,
+                 'status_code': response.status_code,
+                 'encoding': response.encoding,
+                 'headers': dict(response.headers),
+             }
+
+             # Extract content based on selector
+             if selector:
+                 elements = soup.select(selector)
+                 result['content'] = [elem.get_text(strip=True) for elem in elements]
+                 result['html'] = [str(elem) for elem in elements]
+             else:
+                 result['title'] = soup.title.string if soup.title else None
+                 result['content'] = soup.get_text(separator=' ', strip=True)
+                 result['html'] = str(soup)
+
+             # Extract metadata
+             meta_tags = {}
+             for meta in soup.find_all('meta'):
+                 name = meta.get('name') or meta.get('property')
+                 if name:
+                     meta_tags[name] = meta.get('content')
+             result['meta_tags'] = meta_tags
+
+             # Extract images if requested
+             if extract_images:
+                 images = []
+                 for img in soup.find_all('img'):
+                     src = img.get('src')
+                     if src:
+                         images.append({
+                             'src': urljoin(url, src),
+                             'alt': img.get('alt', ''),
+                             'title': img.get('title', '')
+                         })
+                 result['images'] = images
+
+             # Extract links if requested
+             if extract_links:
+                 links = []
+                 for link in soup.find_all('a'):
+                     href = link.get('href')
+                     if href:
+                         links.append({
+                             'url': urljoin(url, href),
+                             'text': link.get_text(strip=True),
+                             'title': link.get('title', '')
+                         })
+                 result['links'] = links
+
+             return result
+         except Exception as e:
+             error_msg = f"Error scraping {url}: {str(e)}"
+             logging.error(error_msg)
+             return {"error": error_msg}
+
+     def extract_links(
+         self,
+         url: str,
+         same_domain: bool = True,
+         exclude_patterns: Optional[List[str]] = None,
+         timeout: int = 30,
+         verify_ssl: bool = True
+     ) -> Union[List[Dict[str, str]], Dict[str, str]]:
+         """
+         Extract all links from a webpage.
+
+         Args:
+             url: URL to extract links from
+             same_domain: Only return links from the same domain
+             exclude_patterns: List of regex patterns to exclude
+             timeout: Request timeout in seconds
+             verify_ssl: Whether to verify SSL certificates
+
+         Returns:
+             List[Dict] or Dict: List of links or error dict
+         """
+         try:
+             # Compile exclude patterns
+             if exclude_patterns:
+                 exclude_patterns = [re.compile(p) for p in exclude_patterns]
+
+             # Get base domain
+             base_domain = urlparse(url).netloc
+
+             # Scrape page
+             result = self.scrape_page(
+                 url,
+                 extract_links=True,
+                 timeout=timeout,
+                 verify_ssl=verify_ssl
+             )
+
+             if "error" in result:
+                 return result
+
+             # Filter and clean links
+             links = []
+             seen_urls = set()
+
+             for link in result.get('links', []):
+                 link_url = link['url']
+
+                 # Skip if already seen
+                 if link_url in seen_urls:
+                     continue
+
+                 # Parse URL
+                 parsed = urlparse(link_url)
+
+                 # Skip if not same domain and same_domain is True
+                 if same_domain and parsed.netloc != base_domain:
+                     continue
+
+                 # Skip if matches exclude patterns
+                 if exclude_patterns and any(p.search(link_url) for p in exclude_patterns):
+                     continue
+
+                 # Add to results
+                 links.append(link)
+                 seen_urls.add(link_url)
+
+             return links
+         except Exception as e:
+             error_msg = f"Error extracting links from {url}: {str(e)}"
+             logging.error(error_msg)
+             return {"error": error_msg}
+
+     def crawl(
+         self,
+         start_url: str,
+         max_pages: int = 10,
+         same_domain: bool = True,
+         exclude_patterns: Optional[List[str]] = None,
+         delay: float = 1.0,
+         timeout: int = 30,
+         verify_ssl: bool = True,
+         output_dir: Optional[str] = None
+     ) -> Union[List[Dict[str, Any]], Dict[str, str]]:
+         """
+         Crawl multiple pages starting from a URL.
+
+         Args:
+             start_url: Starting URL
+             max_pages: Maximum number of pages to crawl
+             same_domain: Only crawl pages from the same domain
+             exclude_patterns: List of regex patterns to exclude
+             delay: Delay between requests in seconds
+             timeout: Request timeout in seconds
+             verify_ssl: Whether to verify SSL certificates
+             output_dir: Directory to save crawled pages
+
+         Returns:
+             List[Dict] or Dict: Crawled pages or error dict
+         """
+         try:
+             # Create output directory if needed
+             if output_dir:
+                 os.makedirs(output_dir, exist_ok=True)
+
+             # Initialize crawl state
+             to_visit = {start_url}
+             visited = set()
+             results = []
+
+             while to_visit and len(visited) < max_pages:
+                 # Get next URL
+                 url = to_visit.pop()
+
+                 # Skip if already visited
+                 if url in visited:
+                     continue
+
+                 # Add to visited
+                 visited.add(url)
+
+                 # Delay if not first request
+                 if len(visited) > 1:
+                     time.sleep(delay)
+
+                 # Scrape page
+                 result = self.scrape_page(
+                     url,
+                     extract_links=True,
+                     timeout=timeout,
+                     verify_ssl=verify_ssl
+                 )
+
+                 if "error" in result:
+                     logging.warning(f"Error crawling {url}: {result['error']}")
+                     continue
+
+                 # Save result
+                 results.append(result)
+
+                 # Save to file if requested
+                 if output_dir:
+                     filename = hashlib.md5(url.encode()).hexdigest() + '.json'
+                     filepath = os.path.join(output_dir, filename)
+                     with open(filepath, 'w', encoding='utf-8') as f:
+                         json.dump(result, f, indent=2, ensure_ascii=False)
+
+                 # Add new links to visit
+                 for link in result.get('links', []):
+                     link_url = link['url']
+                     parsed = urlparse(link_url)
+
+                     # Skip if not same domain and same_domain is True
+                     if same_domain and parsed.netloc != urlparse(start_url).netloc:
+                         continue
+
+                     # Skip if matches exclude patterns
+                     if exclude_patterns and any(
+                         re.compile(p).search(link_url) for p in exclude_patterns
+                     ):
+                         continue
+
+                     # Add to visit if not visited
+                     if link_url not in visited:
+                         to_visit.add(link_url)
+
+             return results
+         except Exception as e:
+             error_msg = f"Error crawling from {start_url}: {str(e)}"
+             logging.error(error_msg)
+             return {"error": error_msg}
+
+     def extract_text(
+         self,
+         url: str,
+         selector: Optional[str] = None,
+         timeout: int = 30,
+         verify_ssl: bool = True
+     ) -> Union[str, Dict[str, str]]:
+         """
+         Extract clean text content from a webpage.
+
+         Args:
+             url: URL to extract text from
+             selector: Optional CSS selector to extract specific content
+             timeout: Request timeout in seconds
+             verify_ssl: Whether to verify SSL certificates
+
+         Returns:
+             str or Dict: Extracted text or error dict
+         """
+         try:
+             result = self.scrape_page(
+                 url,
+                 selector=selector,
+                 timeout=timeout,
+                 verify_ssl=verify_ssl
+             )
+
+             if "error" in result:
+                 return result
+
+             if selector:
+                 return '\n'.join(result['content'])
+             return result['content']
+         except Exception as e:
+             error_msg = f"Error extracting text from {url}: {str(e)}"
+             logging.error(error_msg)
+             return {"error": error_msg}
+
+ # Create instance for direct function access
+ _spider_tools = SpiderTools()
+ scrape_page = _spider_tools.scrape_page
+ extract_links = _spider_tools.extract_links
+ crawl = _spider_tools.crawl
+ extract_text = _spider_tools.extract_text
+
+ if __name__ == "__main__":
+     # Example usage
+     print("\n==================================================")
+     print("SpiderTools Demonstration")
+     print("==================================================\n")
+
+     # 1. Scrape a webpage
+     print("1. Scraping Webpage")
+     print("------------------------------")
+     url = "https://example.com"
+     result = scrape_page(url, extract_images=True, extract_links=True)
+     print(f"Content from {url}:")
+     if "error" not in result:
+         print(f"Title: {result['title']}")
+         print(f"Content length: {len(result['content'])} characters")
+         print(f"Number of images: {len(result.get('images', []))}")
+         print(f"Number of links: {len(result.get('links', []))}")
+     else:
+         print(result) # Show error
+     print()
+
+     # 2. Extract links
+     print("2. Extracting Links")
+     print("------------------------------")
+     links = extract_links(url)
+     print(f"Links from {url}:")
+     if isinstance(links, list):
+         for link in links:
+             print(f"- {link['url']} ({link['text']})")
+     else:
+         print(links) # Show error
+     print()
+
+     # 3. Extract text
+     print("3. Extracting Text")
+     print("------------------------------")
+     text = extract_text(url)
+     print(f"Text from {url}:")
+     if isinstance(text, str):
+         print(text[:500] + "..." if len(text) > 500 else text)
+     else:
+         print(text) # Show error
+     print()
+
+     # 4. Crawl multiple pages
+     print("4. Crawling Multiple Pages")
+     print("------------------------------")
+     results = crawl(url, max_pages=2, delay=1.0)
+     print(f"Crawl results from {url}:")
+     if isinstance(results, list):
+         print(f"Crawled {len(results)} pages")
+         for result in results:
+             print(f"- {result['url']}: {result['title']}")
+     else:
+         print(results) # Show error
+
+     print("\n==================================================")
+     print("Demonstration Complete")
+     print("==================================================")
@@ -0,0 +1,56 @@
+ #!/usr/bin/env python3
+ """Test runner for all tools."""
+
+ import os
+ import glob
+ import logging
+ import subprocess
+
+ # Set up logging
+ logging.basicConfig(
+     level=logging.INFO,
+     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+ )
+ logger = logging.getLogger(__name__)
+
+ def main():
+     """Run all tool files."""
+     logger.info("Starting tests...")
+
+     # Get all *_tools.py files
+     tools_dir = os.path.dirname(os.path.abspath(__file__))
+     tool_files = glob.glob(os.path.join(tools_dir, "*_tools.py"))
+
+     # Run each tool file
+     for tool_file in sorted(tool_files):
+         module_name = os.path.basename(tool_file)
+         logger.info(f"\nRunning {module_name}...")
+
+         try:
+             # Run the tool file directly
+             result = subprocess.run(
+                 ["python3", tool_file],
+                 capture_output=True,
+                 text=True,
+                 cwd=tools_dir
+             )
+
+             # Log output
+             if result.stdout:
+                 logger.info(f"Output:\n{result.stdout}")
+             if result.stderr:
+                 logger.error(f"Errors:\n{result.stderr}")
+
+             if result.returncode == 0:
+                 logger.info(f" {module_name} completed successfully")
+             else:
+                 logger.error(f" {module_name} failed with return code {result.returncode}")
+
+         except Exception as e:
+             logger.error(f"Error running {module_name}: {str(e)}")
+             continue
+
+     logger.info("\nAll tests completed!")
+
+ if __name__ == "__main__":
+     main()
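
This runner discovers every *_tools.py file next to it and executes each one in a subprocess, relying on the if __name__ == "__main__": demo block that the tool modules (such as spider_tools above) define. An individual module can therefore be smoke-tested the same way, e.g. running python3 spider_tools.py from inside the tools directory.
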
@@ -1,40 +1,9 @@
  """Tools module for PraisonAI Agents"""
- from typing import List, Dict
- import logging
- import importlib
+ from .internet_search import internet_search

  class Tools:
-     @staticmethod
-     def internet_search(query: str) -> List[Dict]:
-         """
-         Perform a search using DuckDuckGo.
+     """Tools class for backward compatibility"""
+     internet_search = staticmethod(internet_search)

-         Args:
-             query (str): The search query.
-
-         Returns:
-             list: A list of search result titles, URLs, and snippets.
-         """
-         # Check if duckduckgo_search is installed
-         if importlib.util.find_spec("duckduckgo_search") is None:
-             error_msg = "DuckDuckGo search is not available. Please install duckduckgo_search package using: pip install duckduckgo_search"
-             logging.error(error_msg)
-             return [{"error": error_msg}]
-
-         try:
-             # Import only when needed
-             from duckduckgo_search import DDGS
-             results = []
-             ddgs = DDGS()
-             for result in ddgs.text(keywords=query, max_results=5):
-                 results.append({
-                     "title": result.get("title", ""),
-                     "url": result.get("href", ""),
-                     "snippet": result.get("body", "")
-                 })
-             return results
-
-         except Exception as e:
-             error_msg = f"Error during DuckDuckGo search: {e}"
-             logging.error(error_msg)
-             return [{"error": error_msg}]
+ # Re-export the function
+ __all__ = ['Tools', 'internet_search']
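
The rewritten __init__ keeps both calling styles working: internet_search now lives in .internet_search and is re-exposed on the Tools class as a staticmethod. A minimal sketch, assuming the relocated implementation preserves the old return contract (a list of dicts with "title", "url" and "snippet", or a single-element error list):

    from praisonaiagents.tools import Tools, internet_search

    # New functional style
    results = internet_search("PraisonAI agents")

    # Old class-based style, still available for backward compatibility
    legacy_results = Tools.internet_search("PraisonAI agents")

    for r in results:
        print(r.get("title"), r.get("url"))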