praisonaiagents 0.0.23__py3-none-any.whl → 0.0.24__py3-none-any.whl
- praisonaiagents/tools/__init__.py +165 -2
- praisonaiagents/tools/arxiv_tools.py +292 -0
- praisonaiagents/tools/calculator_tools.py +278 -0
- praisonaiagents/tools/csv_tools.py +266 -0
- praisonaiagents/tools/duckdb_tools.py +268 -0
- praisonaiagents/tools/duckduckgo_tools.py +52 -0
- praisonaiagents/tools/excel_tools.py +310 -0
- praisonaiagents/tools/file_tools.py +274 -0
- praisonaiagents/tools/json_tools.py +515 -0
- praisonaiagents/tools/newspaper_tools.py +354 -0
- praisonaiagents/tools/pandas_tools.py +326 -0
- praisonaiagents/tools/python_tools.py +423 -0
- praisonaiagents/tools/shell_tools.py +278 -0
- praisonaiagents/tools/spider_tools.py +431 -0
- praisonaiagents/tools/test.py +56 -0
- praisonaiagents/tools/tools.py +5 -36
- praisonaiagents/tools/wikipedia_tools.py +272 -0
- praisonaiagents/tools/xml_tools.py +498 -0
- praisonaiagents/tools/yaml_tools.py +417 -0
- praisonaiagents/tools/yfinance_tools.py +213 -0
- {praisonaiagents-0.0.23.dist-info → praisonaiagents-0.0.24.dist-info}/METADATA +1 -1
- praisonaiagents-0.0.24.dist-info/RECORD +42 -0
- praisonaiagents-0.0.23.dist-info/RECORD +0 -24
- {praisonaiagents-0.0.23.dist-info → praisonaiagents-0.0.24.dist-info}/WHEEL +0 -0
- {praisonaiagents-0.0.23.dist-info → praisonaiagents-0.0.24.dist-info}/top_level.txt +0 -0
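Most of this release is the set of new tool modules diffed below. As a quick orientation, here is a minimal usage sketch based on the usage notes inside those modules; it assumes the expanded `praisonaiagents/tools/__init__.py` re-exports both the tool modules and the flat helper functions, as the module docstrings suggest.

# Sketch only: assumes praisonaiagents 0.0.24 is installed and that
# tools/__init__.py re-exports these names as the module docstrings indicate.
from praisonaiagents.tools import spider_tools                  # module-style access
from praisonaiagents.tools import scrape_page, extract_links    # flat function access

page = spider_tools.scrape_page("https://example.com")
links = extract_links("https://example.com")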
praisonaiagents/tools/spider_tools.py
ADDED
@@ -0,0 +1,431 @@
+"""Tools for web scraping and crawling.
+
+Usage:
+    from praisonaiagents.tools import spider_tools
+    content = spider_tools.scrape_page("https://example.com")
+    links = spider_tools.extract_links("https://example.com")
+
+    or
+    from praisonaiagents.tools import scrape_page, extract_links
+    content = scrape_page("https://example.com")
+"""
+
+import logging
+from typing import List, Dict, Union, Optional, Any
+from importlib import util
+import json
+from urllib.parse import urljoin, urlparse
+import re
+import os
+import hashlib
+import time
+
+class SpiderTools:
+    """Tools for web scraping and crawling."""
+
+    def __init__(self):
+        """Initialize SpiderTools and check for required packages."""
+        self._session = None
+
+    def _get_session(self):
+        """Get or create requests session with common headers."""
+        if util.find_spec('requests') is None:
+            error_msg = "requests package is not available. Please install it using: pip install requests"
+            logging.error(error_msg)
+            return None
+
+        if self._session is None:
+            import requests
+            self._session = requests.Session()
+            self._session.headers.update({
+                'User-Agent': 'Mozilla/5.0 (compatible; PraisonAI/1.0; +http://praisonai.com/bot)',
+                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+                'Accept-Language': 'en-US,en;q=0.5',
+                'Accept-Encoding': 'gzip, deflate, br',
+                'Connection': 'keep-alive',
+            })
+        return self._session
+
+    def scrape_page(
+        self,
+        url: str,
+        selector: Optional[str] = None,
+        extract_images: bool = False,
+        extract_links: bool = False,
+        timeout: int = 30,
+        verify_ssl: bool = True
+    ) -> Union[Dict[str, Any], Dict[str, str]]:
+        """
+        Scrape content from a webpage.
+
+        Args:
+            url: URL to scrape
+            selector: Optional CSS selector to extract specific content
+            extract_images: Whether to extract image URLs
+            extract_links: Whether to extract links
+            timeout: Request timeout in seconds
+            verify_ssl: Whether to verify SSL certificates
+
+        Returns:
+            Dict: Scraped content or error dict
+        """
+        try:
+            session = self._get_session()
+            if session is None:
+                return {"error": "requests package not available"}
+
+            # Import BeautifulSoup only when needed
+            if util.find_spec('bs4') is None:
+                error_msg = "bs4 package is not available. Please install it using: pip install beautifulsoup4"
+                logging.error(error_msg)
+                return {"error": error_msg}
+            from bs4 import BeautifulSoup
+
+            # Make request
+            response = session.get(
+                url,
+                timeout=timeout,
+                verify=verify_ssl
+            )
+            response.raise_for_status()
+
+            # Parse HTML
+            soup = BeautifulSoup(response.text, 'lxml')
+
+            # Remove unwanted elements
+            for element in soup(['script', 'style']):
+                element.decompose()
+
+            # Initialize result
+            result = {
+                'url': url,
+                'status_code': response.status_code,
+                'encoding': response.encoding,
+                'headers': dict(response.headers),
+            }
+
+            # Extract content based on selector
+            if selector:
+                elements = soup.select(selector)
+                result['content'] = [elem.get_text(strip=True) for elem in elements]
+                result['html'] = [str(elem) for elem in elements]
+            else:
+                result['title'] = soup.title.string if soup.title else None
+                result['content'] = soup.get_text(separator=' ', strip=True)
+                result['html'] = str(soup)
+
+            # Extract metadata
+            meta_tags = {}
+            for meta in soup.find_all('meta'):
+                name = meta.get('name') or meta.get('property')
+                if name:
+                    meta_tags[name] = meta.get('content')
+            result['meta_tags'] = meta_tags
+
+            # Extract images if requested
+            if extract_images:
+                images = []
+                for img in soup.find_all('img'):
+                    src = img.get('src')
+                    if src:
+                        images.append({
+                            'src': urljoin(url, src),
+                            'alt': img.get('alt', ''),
+                            'title': img.get('title', '')
+                        })
+                result['images'] = images
+
+            # Extract links if requested
+            if extract_links:
+                links = []
+                for link in soup.find_all('a'):
+                    href = link.get('href')
+                    if href:
+                        links.append({
+                            'url': urljoin(url, href),
+                            'text': link.get_text(strip=True),
+                            'title': link.get('title', '')
+                        })
+                result['links'] = links
+
+            return result
+        except Exception as e:
+            error_msg = f"Error scraping {url}: {str(e)}"
+            logging.error(error_msg)
+            return {"error": error_msg}
+
+    def extract_links(
+        self,
+        url: str,
+        same_domain: bool = True,
+        exclude_patterns: Optional[List[str]] = None,
+        timeout: int = 30,
+        verify_ssl: bool = True
+    ) -> Union[List[Dict[str, str]], Dict[str, str]]:
+        """
+        Extract all links from a webpage.
+
+        Args:
+            url: URL to extract links from
+            same_domain: Only return links from the same domain
+            exclude_patterns: List of regex patterns to exclude
+            timeout: Request timeout in seconds
+            verify_ssl: Whether to verify SSL certificates
+
+        Returns:
+            List[Dict] or Dict: List of links or error dict
+        """
+        try:
+            # Compile exclude patterns
+            if exclude_patterns:
+                exclude_patterns = [re.compile(p) for p in exclude_patterns]
+
+            # Get base domain
+            base_domain = urlparse(url).netloc
+
+            # Scrape page
+            result = self.scrape_page(
+                url,
+                extract_links=True,
+                timeout=timeout,
+                verify_ssl=verify_ssl
+            )
+
+            if "error" in result:
+                return result
+
+            # Filter and clean links
+            links = []
+            seen_urls = set()
+
+            for link in result.get('links', []):
+                link_url = link['url']
+
+                # Skip if already seen
+                if link_url in seen_urls:
+                    continue
+
+                # Parse URL
+                parsed = urlparse(link_url)
+
+                # Skip if not same domain and same_domain is True
+                if same_domain and parsed.netloc != base_domain:
+                    continue
+
+                # Skip if matches exclude patterns
+                if exclude_patterns and any(p.search(link_url) for p in exclude_patterns):
+                    continue
+
+                # Add to results
+                links.append(link)
+                seen_urls.add(link_url)
+
+            return links
+        except Exception as e:
+            error_msg = f"Error extracting links from {url}: {str(e)}"
+            logging.error(error_msg)
+            return {"error": error_msg}
+
+    def crawl(
+        self,
+        start_url: str,
+        max_pages: int = 10,
+        same_domain: bool = True,
+        exclude_patterns: Optional[List[str]] = None,
+        delay: float = 1.0,
+        timeout: int = 30,
+        verify_ssl: bool = True,
+        output_dir: Optional[str] = None
+    ) -> Union[List[Dict[str, Any]], Dict[str, str]]:
+        """
+        Crawl multiple pages starting from a URL.
+
+        Args:
+            start_url: Starting URL
+            max_pages: Maximum number of pages to crawl
+            same_domain: Only crawl pages from the same domain
+            exclude_patterns: List of regex patterns to exclude
+            delay: Delay between requests in seconds
+            timeout: Request timeout in seconds
+            verify_ssl: Whether to verify SSL certificates
+            output_dir: Directory to save crawled pages
+
+        Returns:
+            List[Dict] or Dict: Crawled pages or error dict
+        """
+        try:
+            # Create output directory if needed
+            if output_dir:
+                os.makedirs(output_dir, exist_ok=True)
+
+            # Initialize crawl state
+            to_visit = {start_url}
+            visited = set()
+            results = []
+
+            while to_visit and len(visited) < max_pages:
+                # Get next URL
+                url = to_visit.pop()
+
+                # Skip if already visited
+                if url in visited:
+                    continue
+
+                # Add to visited
+                visited.add(url)
+
+                # Delay if not first request
+                if len(visited) > 1:
+                    time.sleep(delay)
+
+                # Scrape page
+                result = self.scrape_page(
+                    url,
+                    extract_links=True,
+                    timeout=timeout,
+                    verify_ssl=verify_ssl
+                )
+
+                if "error" in result:
+                    logging.warning(f"Error crawling {url}: {result['error']}")
+                    continue
+
+                # Save result
+                results.append(result)
+
+                # Save to file if requested
+                if output_dir:
+                    filename = hashlib.md5(url.encode()).hexdigest() + '.json'
+                    filepath = os.path.join(output_dir, filename)
+                    with open(filepath, 'w', encoding='utf-8') as f:
+                        json.dump(result, f, indent=2, ensure_ascii=False)
+
+                # Add new links to visit
+                for link in result.get('links', []):
+                    link_url = link['url']
+                    parsed = urlparse(link_url)
+
+                    # Skip if not same domain and same_domain is True
+                    if same_domain and parsed.netloc != urlparse(start_url).netloc:
+                        continue
+
+                    # Skip if matches exclude patterns
+                    if exclude_patterns and any(
+                        re.compile(p).search(link_url) for p in exclude_patterns
+                    ):
+                        continue
+
+                    # Add to visit if not visited
+                    if link_url not in visited:
+                        to_visit.add(link_url)
+
+            return results
+        except Exception as e:
+            error_msg = f"Error crawling from {start_url}: {str(e)}"
+            logging.error(error_msg)
+            return {"error": error_msg}
+
+    def extract_text(
+        self,
+        url: str,
+        selector: Optional[str] = None,
+        timeout: int = 30,
+        verify_ssl: bool = True
+    ) -> Union[str, Dict[str, str]]:
+        """
+        Extract clean text content from a webpage.
+
+        Args:
+            url: URL to extract text from
+            selector: Optional CSS selector to extract specific content
+            timeout: Request timeout in seconds
+            verify_ssl: Whether to verify SSL certificates
+
+        Returns:
+            str or Dict: Extracted text or error dict
+        """
+        try:
+            result = self.scrape_page(
+                url,
+                selector=selector,
+                timeout=timeout,
+                verify_ssl=verify_ssl
+            )
+
+            if "error" in result:
+                return result
+
+            if selector:
+                return '\n'.join(result['content'])
+            return result['content']
+        except Exception as e:
+            error_msg = f"Error extracting text from {url}: {str(e)}"
+            logging.error(error_msg)
+            return {"error": error_msg}
+
+# Create instance for direct function access
+_spider_tools = SpiderTools()
+scrape_page = _spider_tools.scrape_page
+extract_links = _spider_tools.extract_links
+crawl = _spider_tools.crawl
+extract_text = _spider_tools.extract_text
+
+if __name__ == "__main__":
+    # Example usage
+    print("\n==================================================")
+    print("SpiderTools Demonstration")
+    print("==================================================\n")
+
+    # 1. Scrape a webpage
+    print("1. Scraping Webpage")
+    print("------------------------------")
+    url = "https://example.com"
+    result = scrape_page(url, extract_images=True, extract_links=True)
+    print(f"Content from {url}:")
+    if "error" not in result:
+        print(f"Title: {result['title']}")
+        print(f"Content length: {len(result['content'])} characters")
+        print(f"Number of images: {len(result.get('images', []))}")
+        print(f"Number of links: {len(result.get('links', []))}")
+    else:
+        print(result) # Show error
+    print()
+
+    # 2. Extract links
+    print("2. Extracting Links")
+    print("------------------------------")
+    links = extract_links(url)
+    print(f"Links from {url}:")
+    if isinstance(links, list):
+        for link in links:
+            print(f"- {link['url']} ({link['text']})")
+    else:
+        print(links) # Show error
+    print()
+
+    # 3. Extract text
+    print("3. Extracting Text")
+    print("------------------------------")
+    text = extract_text(url)
+    print(f"Text from {url}:")
+    if isinstance(text, str):
+        print(text[:500] + "..." if len(text) > 500 else text)
+    else:
+        print(text) # Show error
+    print()
+
+    # 4. Crawl multiple pages
+    print("4. Crawling Multiple Pages")
+    print("------------------------------")
+    results = crawl(url, max_pages=2, delay=1.0)
+    print(f"Crawl results from {url}:")
+    if isinstance(results, list):
+        print(f"Crawled {len(results)} pages")
+        for result in results:
+            print(f"- {result['url']}: {result['title']}")
+    else:
+        print(results) # Show error
+
+    print("\n==================================================")
+    print("Demonstration Complete")
+    print("==================================================")
praisonaiagents/tools/test.py
ADDED
@@ -0,0 +1,56 @@
+#!/usr/bin/env python3
+"""Test runner for all tools."""
+
+import os
+import glob
+import logging
+import subprocess
+
+# Set up logging
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)
+
+def main():
+    """Run all tool files."""
+    logger.info("Starting tests...")
+
+    # Get all *_tools.py files
+    tools_dir = os.path.dirname(os.path.abspath(__file__))
+    tool_files = glob.glob(os.path.join(tools_dir, "*_tools.py"))
+
+    # Run each tool file
+    for tool_file in sorted(tool_files):
+        module_name = os.path.basename(tool_file)
+        logger.info(f"\nRunning {module_name}...")
+
+        try:
+            # Run the tool file directly
+            result = subprocess.run(
+                ["python3", tool_file],
+                capture_output=True,
+                text=True,
+                cwd=tools_dir
+            )
+
+            # Log output
+            if result.stdout:
+                logger.info(f"Output:\n{result.stdout}")
+            if result.stderr:
+                logger.error(f"Errors:\n{result.stderr}")
+
+            if result.returncode == 0:
+                logger.info(f" {module_name} completed successfully")
+            else:
+                logger.error(f" {module_name} failed with return code {result.returncode}")
+
+        except Exception as e:
+            logger.error(f"Error running {module_name}: {str(e)}")
+            continue
+
+    logger.info("\nAll tests completed!")
+
+if __name__ == "__main__":
+    main()
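test.py is a simple driver: it finds every *_tools.py file next to it and runs each one with python3, logging stdout, stderr, and the return code. A hedged sketch of invoking it once the package is installed; it assumes the test submodule is importable and not shadowed by tools/__init__.py:

# Sketch: run the bundled tool demos the same way the wheel's test.py does.
import praisonaiagents.tools.test as tools_test

tools_test.main()  # runs each *_tools.py demo in a subprocess and logs the results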
praisonaiagents/tools/tools.py
CHANGED
@@ -1,40 +1,9 @@
 """Tools module for PraisonAI Agents"""
-from
-import logging
-import importlib
+from .internet_search import internet_search
 
 class Tools:
-
-
-        """
-        Perform a search using DuckDuckGo.
+    """Tools class for backward compatibility"""
+    internet_search = staticmethod(internet_search)
 
-
-
-
-        Returns:
-            list: A list of search result titles, URLs, and snippets.
-        """
-        # Check if duckduckgo_search is installed
-        if importlib.util.find_spec("duckduckgo_search") is None:
-            error_msg = "DuckDuckGo search is not available. Please install duckduckgo_search package using: pip install duckduckgo_search"
-            logging.error(error_msg)
-            return [{"error": error_msg}]
-
-        try:
-            # Import only when needed
-            from duckduckgo_search import DDGS
-            results = []
-            ddgs = DDGS()
-            for result in ddgs.text(keywords=query, max_results=5):
-                results.append({
-                    "title": result.get("title", ""),
-                    "url": result.get("href", ""),
-                    "snippet": result.get("body", "")
-                })
-            return results
-
-        except Exception as e:
-            error_msg = f"Error during DuckDuckGo search: {e}"
-            logging.error(error_msg)
-            return [{"error": error_msg}]
+# Re-export the function
+__all__ = ['Tools', 'internet_search']
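After this refactor, Tools.internet_search and the module-level internet_search resolve to the same implementation in the new internet_search module. A hedged sketch of both call paths; it assumes the relocated function still returns the title/url/snippet dictionaries produced by the 0.0.23 code removed above:

# Sketch of the backward-compatible and the new function-style call paths.
from praisonaiagents.tools.tools import Tools, internet_search

results_via_class = Tools.internet_search("praisonai agents")  # 0.0.23-style staticmethod
results_direct = internet_search("praisonai agents")           # 0.0.24 function import

for item in results_direct[:3]:
    print(item.get("title"), item.get("url"))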