praisonaiagents 0.0.23__py3-none-any.whl → 0.0.24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,431 @@
+ """Tools for web scraping and crawling.
+
+ Usage:
+     from praisonaiagents.tools import spider_tools
+     content = spider_tools.scrape_page("https://example.com")
+     links = spider_tools.extract_links("https://example.com")
+
+ or
+     from praisonaiagents.tools import scrape_page, extract_links
+     content = scrape_page("https://example.com")
+ """
+
+ import logging
+ from typing import List, Dict, Union, Optional, Any
+ from importlib import util
+ import json
+ from urllib.parse import urljoin, urlparse
+ import re
+ import os
+ import hashlib
+ import time
+
+ class SpiderTools:
+     """Tools for web scraping and crawling."""
+
+     def __init__(self):
+         """Initialize SpiderTools and check for required packages."""
+         self._session = None
+
+     def _get_session(self):
+         """Get or create requests session with common headers."""
+         if util.find_spec('requests') is None:
+             error_msg = "requests package is not available. Please install it using: pip install requests"
+             logging.error(error_msg)
+             return None
+
+         if self._session is None:
+             import requests
+             self._session = requests.Session()
+             self._session.headers.update({
+                 'User-Agent': 'Mozilla/5.0 (compatible; PraisonAI/1.0; +http://praisonai.com/bot)',
+                 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+                 'Accept-Language': 'en-US,en;q=0.5',
+                 'Accept-Encoding': 'gzip, deflate, br',
+                 'Connection': 'keep-alive',
+             })
+         return self._session
+
+     def scrape_page(
+         self,
+         url: str,
+         selector: Optional[str] = None,
+         extract_images: bool = False,
+         extract_links: bool = False,
+         timeout: int = 30,
+         verify_ssl: bool = True
+     ) -> Union[Dict[str, Any], Dict[str, str]]:
+         """
+         Scrape content from a webpage.
+
+         Args:
+             url: URL to scrape
+             selector: Optional CSS selector to extract specific content
+             extract_images: Whether to extract image URLs
+             extract_links: Whether to extract links
+             timeout: Request timeout in seconds
+             verify_ssl: Whether to verify SSL certificates
+
+         Returns:
+             Dict: Scraped content or error dict
+         """
+         try:
+             session = self._get_session()
+             if session is None:
+                 return {"error": "requests package not available"}
+
+             # Import BeautifulSoup only when needed
+             if util.find_spec('bs4') is None:
+                 error_msg = "bs4 package is not available. Please install it using: pip install beautifulsoup4"
+                 logging.error(error_msg)
+                 return {"error": error_msg}
+             from bs4 import BeautifulSoup
+
+             # Make request
+             response = session.get(
+                 url,
+                 timeout=timeout,
+                 verify=verify_ssl
+             )
+             response.raise_for_status()
+
+             # Parse HTML
+             soup = BeautifulSoup(response.text, 'lxml')
+
+             # Remove unwanted elements
+             for element in soup(['script', 'style']):
+                 element.decompose()
+
+             # Initialize result
+             result = {
+                 'url': url,
+                 'status_code': response.status_code,
+                 'encoding': response.encoding,
+                 'headers': dict(response.headers),
+             }
+
+             # Extract content based on selector
+             if selector:
+                 elements = soup.select(selector)
+                 result['content'] = [elem.get_text(strip=True) for elem in elements]
+                 result['html'] = [str(elem) for elem in elements]
+             else:
+                 result['title'] = soup.title.string if soup.title else None
+                 result['content'] = soup.get_text(separator=' ', strip=True)
+                 result['html'] = str(soup)
+
+             # Extract metadata
+             meta_tags = {}
+             for meta in soup.find_all('meta'):
+                 name = meta.get('name') or meta.get('property')
+                 if name:
+                     meta_tags[name] = meta.get('content')
+             result['meta_tags'] = meta_tags
+
+             # Extract images if requested
+             if extract_images:
+                 images = []
+                 for img in soup.find_all('img'):
+                     src = img.get('src')
+                     if src:
+                         images.append({
+                             'src': urljoin(url, src),
+                             'alt': img.get('alt', ''),
+                             'title': img.get('title', '')
+                         })
+                 result['images'] = images
+
+             # Extract links if requested
+             if extract_links:
+                 links = []
+                 for link in soup.find_all('a'):
+                     href = link.get('href')
+                     if href:
+                         links.append({
+                             'url': urljoin(url, href),
+                             'text': link.get_text(strip=True),
+                             'title': link.get('title', '')
+                         })
+                 result['links'] = links
+
+             return result
+         except Exception as e:
+             error_msg = f"Error scraping {url}: {str(e)}"
+             logging.error(error_msg)
+             return {"error": error_msg}
+
+     def extract_links(
+         self,
+         url: str,
+         same_domain: bool = True,
+         exclude_patterns: Optional[List[str]] = None,
+         timeout: int = 30,
+         verify_ssl: bool = True
+     ) -> Union[List[Dict[str, str]], Dict[str, str]]:
+         """
+         Extract all links from a webpage.
+
+         Args:
+             url: URL to extract links from
+             same_domain: Only return links from the same domain
+             exclude_patterns: List of regex patterns to exclude
+             timeout: Request timeout in seconds
+             verify_ssl: Whether to verify SSL certificates
+
+         Returns:
+             List[Dict] or Dict: List of links or error dict
+         """
+         try:
+             # Compile exclude patterns
+             if exclude_patterns:
+                 exclude_patterns = [re.compile(p) for p in exclude_patterns]
+
+             # Get base domain
+             base_domain = urlparse(url).netloc
+
+             # Scrape page
+             result = self.scrape_page(
+                 url,
+                 extract_links=True,
+                 timeout=timeout,
+                 verify_ssl=verify_ssl
+             )
+
+             if "error" in result:
+                 return result
+
+             # Filter and clean links
+             links = []
+             seen_urls = set()
+
+             for link in result.get('links', []):
+                 link_url = link['url']
+
+                 # Skip if already seen
+                 if link_url in seen_urls:
+                     continue
+
+                 # Parse URL
+                 parsed = urlparse(link_url)
+
+                 # Skip if not same domain and same_domain is True
+                 if same_domain and parsed.netloc != base_domain:
+                     continue
+
+                 # Skip if matches exclude patterns
+                 if exclude_patterns and any(p.search(link_url) for p in exclude_patterns):
+                     continue
+
+                 # Add to results
+                 links.append(link)
+                 seen_urls.add(link_url)
+
+             return links
+         except Exception as e:
+             error_msg = f"Error extracting links from {url}: {str(e)}"
+             logging.error(error_msg)
+             return {"error": error_msg}
+
+     def crawl(
+         self,
+         start_url: str,
+         max_pages: int = 10,
+         same_domain: bool = True,
+         exclude_patterns: Optional[List[str]] = None,
+         delay: float = 1.0,
+         timeout: int = 30,
+         verify_ssl: bool = True,
+         output_dir: Optional[str] = None
+     ) -> Union[List[Dict[str, Any]], Dict[str, str]]:
+         """
+         Crawl multiple pages starting from a URL.
+
+         Args:
+             start_url: Starting URL
+             max_pages: Maximum number of pages to crawl
+             same_domain: Only crawl pages from the same domain
+             exclude_patterns: List of regex patterns to exclude
+             delay: Delay between requests in seconds
+             timeout: Request timeout in seconds
+             verify_ssl: Whether to verify SSL certificates
+             output_dir: Directory to save crawled pages
+
+         Returns:
+             List[Dict] or Dict: Crawled pages or error dict
+         """
+         try:
+             # Create output directory if needed
+             if output_dir:
+                 os.makedirs(output_dir, exist_ok=True)
+
+             # Initialize crawl state
+             to_visit = {start_url}
+             visited = set()
+             results = []
+
+             while to_visit and len(visited) < max_pages:
+                 # Get next URL
+                 url = to_visit.pop()
+
+                 # Skip if already visited
+                 if url in visited:
+                     continue
+
+                 # Add to visited
+                 visited.add(url)
+
+                 # Delay if not first request
+                 if len(visited) > 1:
+                     time.sleep(delay)
+
+                 # Scrape page
+                 result = self.scrape_page(
+                     url,
+                     extract_links=True,
+                     timeout=timeout,
+                     verify_ssl=verify_ssl
+                 )
+
+                 if "error" in result:
+                     logging.warning(f"Error crawling {url}: {result['error']}")
+                     continue
+
+                 # Save result
+                 results.append(result)
+
+                 # Save to file if requested
+                 if output_dir:
+                     filename = hashlib.md5(url.encode()).hexdigest() + '.json'
+                     filepath = os.path.join(output_dir, filename)
+                     with open(filepath, 'w', encoding='utf-8') as f:
+                         json.dump(result, f, indent=2, ensure_ascii=False)
+
+                 # Add new links to visit
+                 for link in result.get('links', []):
+                     link_url = link['url']
+                     parsed = urlparse(link_url)
+
+                     # Skip if not same domain and same_domain is True
+                     if same_domain and parsed.netloc != urlparse(start_url).netloc:
+                         continue
+
+                     # Skip if matches exclude patterns
+                     if exclude_patterns and any(
+                         re.compile(p).search(link_url) for p in exclude_patterns
+                     ):
+                         continue
+
+                     # Add to visit if not visited
+                     if link_url not in visited:
+                         to_visit.add(link_url)
+
+             return results
+         except Exception as e:
+             error_msg = f"Error crawling from {start_url}: {str(e)}"
+             logging.error(error_msg)
+             return {"error": error_msg}
+
+     def extract_text(
+         self,
+         url: str,
+         selector: Optional[str] = None,
+         timeout: int = 30,
+         verify_ssl: bool = True
+     ) -> Union[str, Dict[str, str]]:
+         """
+         Extract clean text content from a webpage.
+
+         Args:
+             url: URL to extract text from
+             selector: Optional CSS selector to extract specific content
+             timeout: Request timeout in seconds
+             verify_ssl: Whether to verify SSL certificates
+
+         Returns:
+             str or Dict: Extracted text or error dict
+         """
+         try:
+             result = self.scrape_page(
+                 url,
+                 selector=selector,
+                 timeout=timeout,
+                 verify_ssl=verify_ssl
+             )
+
+             if "error" in result:
+                 return result
+
+             if selector:
+                 return '\n'.join(result['content'])
+             return result['content']
+         except Exception as e:
+             error_msg = f"Error extracting text from {url}: {str(e)}"
+             logging.error(error_msg)
+             return {"error": error_msg}
+
+ # Create instance for direct function access
+ _spider_tools = SpiderTools()
+ scrape_page = _spider_tools.scrape_page
+ extract_links = _spider_tools.extract_links
+ crawl = _spider_tools.crawl
+ extract_text = _spider_tools.extract_text
+
+ if __name__ == "__main__":
+     # Example usage
+     print("\n==================================================")
+     print("SpiderTools Demonstration")
+     print("==================================================\n")
+
+     # 1. Scrape a webpage
+     print("1. Scraping Webpage")
+     print("------------------------------")
+     url = "https://example.com"
+     result = scrape_page(url, extract_images=True, extract_links=True)
+     print(f"Content from {url}:")
+     if "error" not in result:
+         print(f"Title: {result['title']}")
+         print(f"Content length: {len(result['content'])} characters")
+         print(f"Number of images: {len(result.get('images', []))}")
+         print(f"Number of links: {len(result.get('links', []))}")
+     else:
+         print(result)  # Show error
+     print()
+
+     # 2. Extract links
+     print("2. Extracting Links")
+     print("------------------------------")
+     links = extract_links(url)
+     print(f"Links from {url}:")
+     if isinstance(links, list):
+         for link in links:
+             print(f"- {link['url']} ({link['text']})")
+     else:
+         print(links)  # Show error
+     print()
+
+     # 3. Extract text
+     print("3. Extracting Text")
+     print("------------------------------")
+     text = extract_text(url)
+     print(f"Text from {url}:")
+     if isinstance(text, str):
+         print(text[:500] + "..." if len(text) > 500 else text)
+     else:
+         print(text)  # Show error
+     print()
+
+     # 4. Crawl multiple pages
+     print("4. Crawling Multiple Pages")
+     print("------------------------------")
+     results = crawl(url, max_pages=2, delay=1.0)
+     print(f"Crawl results from {url}:")
+     if isinstance(results, list):
+         print(f"Crawled {len(results)} pages")
+         for result in results:
+             print(f"- {result['url']}: {result['title']}")
+     else:
+         print(results)  # Show error
+
+     print("\n==================================================")
+     print("Demonstration Complete")
+     print("==================================================")
@@ -0,0 +1,56 @@
+ #!/usr/bin/env python3
+ """Test runner for all tools."""
+
+ import os
+ import glob
+ import logging
+ import subprocess
+
+ # Set up logging
+ logging.basicConfig(
+     level=logging.INFO,
+     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+ )
+ logger = logging.getLogger(__name__)
+
+ def main():
+     """Run all tool files."""
+     logger.info("Starting tests...")
+
+     # Get all *_tools.py files
+     tools_dir = os.path.dirname(os.path.abspath(__file__))
+     tool_files = glob.glob(os.path.join(tools_dir, "*_tools.py"))
+
+     # Run each tool file
+     for tool_file in sorted(tool_files):
+         module_name = os.path.basename(tool_file)
+         logger.info(f"\nRunning {module_name}...")
+
+         try:
+             # Run the tool file directly
+             result = subprocess.run(
+                 ["python3", tool_file],
+                 capture_output=True,
+                 text=True,
+                 cwd=tools_dir
+             )
+
+             # Log output
+             if result.stdout:
+                 logger.info(f"Output:\n{result.stdout}")
+             if result.stderr:
+                 logger.error(f"Errors:\n{result.stderr}")
+
+             if result.returncode == 0:
+                 logger.info(f"✓ {module_name} completed successfully")
+             else:
+                 logger.error(f"✗ {module_name} failed with return code {result.returncode}")
+
+         except Exception as e:
+             logger.error(f"Error running {module_name}: {str(e)}")
+             continue
+
+     logger.info("\nAll tests completed!")
+
+ if __name__ == "__main__":
+     main()
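
The runner above finds every *_tools.py file alongside it and executes each one in its own subprocess, logging stdout, stderr, and the exit code. A rough single-file equivalent of that pattern is sketched below; the file name is illustrative, and sys.executable stands in for the runner's hard-coded "python3":

    import subprocess
    import sys

    # Run one tool module the way the runner does and inspect the outcome
    proc = subprocess.run(
        [sys.executable, "spider_tools.py"],  # illustrative file name
        capture_output=True,
        text=True,
    )
    print("exit code:", proc.returncode)
    if proc.stderr:
        print("stderr:", proc.stderr)
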
@@ -1,40 +1,9 @@
  """Tools module for PraisonAI Agents"""
- from typing import List, Dict
- import logging
- import importlib
+ from .internet_search import internet_search

  class Tools:
-     @staticmethod
-     def internet_search(query: str) -> List[Dict]:
-         """
-         Perform a search using DuckDuckGo.
+     """Tools class for backward compatibility"""
+     internet_search = staticmethod(internet_search)

-         Args:
-             query (str): The search query.
-
-         Returns:
-             list: A list of search result titles, URLs, and snippets.
-         """
-         # Check if duckduckgo_search is installed
-         if importlib.util.find_spec("duckduckgo_search") is None:
-             error_msg = "DuckDuckGo search is not available. Please install duckduckgo_search package using: pip install duckduckgo_search"
-             logging.error(error_msg)
-             return [{"error": error_msg}]
-
-         try:
-             # Import only when needed
-             from duckduckgo_search import DDGS
-             results = []
-             ddgs = DDGS()
-             for result in ddgs.text(keywords=query, max_results=5):
-                 results.append({
-                     "title": result.get("title", ""),
-                     "url": result.get("href", ""),
-                     "snippet": result.get("body", "")
-                 })
-             return results
-
-         except Exception as e:
-             error_msg = f"Error during DuckDuckGo search: {e}"
-             logging.error(error_msg)
-             return [{"error": error_msg}]
+ # Re-export the function
+ __all__ = ['Tools', 'internet_search']
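
With this change, internet_search moves into its own module and the Tools class simply re-exports it as a staticmethod, so the old class-based call style and the new function call should resolve to the same implementation. A small compatibility sketch, assuming duckduckgo_search is installed; the import path is inferred from the package layout shown in the diff and may differ:

    # Both call styles should return the same DuckDuckGo results after the refactor;
    # the module path below is an assumption, not confirmed by the diff.
    from praisonaiagents.tools.tools import Tools, internet_search

    old_style = Tools.internet_search("PraisonAI agents")  # backward-compatible call
    new_style = internet_search("PraisonAI agents")        # direct function call

    print(len(old_style), len(new_style))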