praisonaiagents 0.0.23__py3-none-any.whl → 0.0.25__py3-none-any.whl
Sign up to get free protection for your applications and to get access to all the features.
- praisonaiagents/tools/__init__.py +165 -2
- praisonaiagents/tools/arxiv_tools.py +292 -0
- praisonaiagents/tools/calculator_tools.py +278 -0
- praisonaiagents/tools/csv_tools.py +266 -0
- praisonaiagents/tools/duckdb_tools.py +268 -0
- praisonaiagents/tools/duckduckgo_tools.py +52 -0
- praisonaiagents/tools/excel_tools.py +310 -0
- praisonaiagents/tools/file_tools.py +274 -0
- praisonaiagents/tools/json_tools.py +515 -0
- praisonaiagents/tools/newspaper_tools.py +354 -0
- praisonaiagents/tools/pandas_tools.py +326 -0
- praisonaiagents/tools/python_tools.py +423 -0
- praisonaiagents/tools/shell_tools.py +278 -0
- praisonaiagents/tools/spider_tools.py +431 -0
- praisonaiagents/tools/test.py +56 -0
- praisonaiagents/tools/tools.py +5 -36
- praisonaiagents/tools/wikipedia_tools.py +272 -0
- praisonaiagents/tools/xml_tools.py +498 -0
- praisonaiagents/tools/yaml_tools.py +417 -0
- praisonaiagents/tools/yfinance_tools.py +213 -0
- {praisonaiagents-0.0.23.dist-info → praisonaiagents-0.0.25.dist-info}/METADATA +1 -1
- praisonaiagents-0.0.25.dist-info/RECORD +42 -0
- praisonaiagents-0.0.23.dist-info/RECORD +0 -24
- {praisonaiagents-0.0.23.dist-info → praisonaiagents-0.0.25.dist-info}/WHEEL +0 -0
- {praisonaiagents-0.0.23.dist-info → praisonaiagents-0.0.25.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,354 @@
|
|
1
|
+
"""Newspaper tools for scraping and parsing news articles.
|
2
|
+
|
3
|
+
Usage:
|
4
|
+
from praisonaiagents.tools import newspaper_tools
|
5
|
+
article = newspaper_tools.get_article("https://example.com/article")
|
6
|
+
sources = newspaper_tools.get_news_sources("technology")
|
7
|
+
articles = newspaper_tools.get_articles_from_source("https://techcrunch.com")
|
8
|
+
|
9
|
+
or
|
10
|
+
from praisonaiagents.tools import get_article, get_news_sources
|
11
|
+
article = get_article("https://example.com/article")
|
12
|
+
"""
|
13
|
+
|
14
|
+
import logging
|
15
|
+
from typing import List, Dict, Union, Optional, Any
|
16
|
+
from importlib import util
|
17
|
+
import json
|
18
|
+
from urllib.parse import urlparse
|
19
|
+
|
20
|
+
# Predefined list of popular news sources
# Static fallback catalog used by NewspaperTools.get_news_sources: keys are
# lowercase category names, values are homepage URLs of well-known outlets.
# No network discovery is performed — this dict is the whole source list.
POPULAR_NEWS_SOURCES = {
    'technology': [
        'https://techcrunch.com',
        'https://www.theverge.com',
        'https://www.wired.com',
        'https://www.engadget.com',
        'https://arstechnica.com'
    ],
    'business': [
        'https://www.bloomberg.com',
        'https://www.reuters.com',
        'https://www.wsj.com',
        'https://www.ft.com',
        'https://www.cnbc.com'
    ],
    'general': [
        'https://www.nytimes.com',
        'https://www.theguardian.com',
        'https://www.washingtonpost.com',
        'https://www.bbc.com',
        'https://www.cnn.com'
    ],
    'sports': [
        'https://www.espn.com',
        'https://sports.yahoo.com',
        'https://www.cbssports.com',
        'https://www.skysports.com',
        'https://www.bleacherreport.com'
    ],
    'entertainment': [
        'https://variety.com',
        'https://www.hollywoodreporter.com',
        'https://www.ew.com',
        'https://www.deadline.com',
        'https://www.imdb.com/news'
    ],
    'science': [
        'https://www.scientificamerican.com',
        'https://www.sciencedaily.com',
        'https://www.newscientist.com',
        'https://www.sciencemag.org',
        'https://www.nature.com/news'
    ]
}
|
65
|
+
|
66
|
+
class NewspaperTools:
    """Tools for scraping and parsing news articles via newspaper3k.

    All public methods return plain JSON-serializable values on success and a
    ``{"error": message}`` dict on failure (they never raise past their own
    ``try`` blocks), so they are safe to expose directly as agent tools.
    """

    def __init__(self):
        """Initialize NewspaperTools and check for newspaper package."""
        self._check_newspaper()

    def _check_newspaper(self):
        """Check if newspaper package is installed.

        Raises:
            ImportError: If the newspaper3k package is not installed.
        """
        if util.find_spec("newspaper") is None:
            raise ImportError("newspaper3k package is not available. Please install it using: pip install newspaper3k")
        # Bind the module at global scope so later method calls can use it
        # without re-importing.
        global newspaper
        import newspaper

    def get_article(
        self,
        url: str,
        language: str = 'en'
    ) -> Dict[str, Any]:
        """
        Extract and parse a news article from a URL.

        Args:
            url: URL of the article
            language: Language code (e.g., 'en' for English)

        Returns:
            Dict: Article information including title, text, authors, etc.,
                or {"error": message} on failure.
        """
        try:
            from newspaper import Article, Config

            # Configure article download
            config = Config()
            config.browser_user_agent = 'Mozilla/5.0'
            config.language = language

            # Download and parse article
            article = Article(url, config=config)
            article.download()
            article.parse()

            # Try to extract additional information (keywords/summary).
            # NLP needs extra NLTK data, so a failure here is non-fatal.
            try:
                article.nlp()
            except Exception as e:
                logging.warning(f"NLP processing failed: {str(e)}")

            # Build response
            response = {
                "url": url,
                "title": article.title,
                "text": article.text,
                "authors": article.authors,
                "publish_date": article.publish_date.isoformat() if article.publish_date else None,
                "top_image": article.top_image,
                "images": list(article.images),
                "movies": list(article.movies),
                "source_domain": urlparse(url).netloc,
            }

            # Add NLP results only when nlp() succeeded and produced output
            if hasattr(article, 'keywords') and article.keywords:
                response["keywords"] = article.keywords
            if hasattr(article, 'summary') and article.summary:
                response["summary"] = article.summary

            return response
        except Exception as e:
            error_msg = f"Error extracting article from {url}: {str(e)}"
            logging.error(error_msg)
            return {"error": error_msg}

    def get_news_sources(
        self,
        category: Optional[str] = None,
        language: str = 'en',
        country: Optional[str] = None
    ) -> Union[List[Dict[str, str]], Dict[str, str]]:
        """
        Get a list of news sources, optionally filtered by category.

        Args:
            category: Category to filter by (e.g., 'technology', 'sports');
                an unknown category falls back to all sources.
            language: Language code (currently unused; kept for API stability)
            country: Country code (currently unused; kept for API stability)

        Returns:
            List[Dict] or Dict: List of news sources or error dict
        """
        try:
            sources = []

            # Pick the category's URLs; unknown or missing category means
            # "all sources" (flattened across every category).
            if category:
                category = category.lower()
            if category and category in POPULAR_NEWS_SOURCES:
                urls = POPULAR_NEWS_SOURCES[category]
            else:
                urls = [u for cat_urls in POPULAR_NEWS_SOURCES.values() for u in cat_urls]

            # Create source objects
            for url in urls:
                domain = urlparse(url).netloc
                source = {
                    "url": url,
                    "domain": domain,
                    # e.g. "www.theverge.com" -> "Theverge"
                    "name": domain.replace("www.", "").split(".")[0].title(),
                    "category": category if category else "general"
                }
                sources.append(source)

            return sources
        except Exception as e:
            error_msg = f"Error getting news sources: {str(e)}"
            logging.error(error_msg)
            return {"error": error_msg}

    def get_articles_from_source(
        self,
        source_url: str,
        limit: int = 10,
        language: str = 'en'
    ) -> Union[List[Dict[str, Any]], Dict[str, str]]:
        """
        Get recent articles from a news source.

        Args:
            source_url: URL of the news source
            limit: Maximum number of articles to return
            language: Language code

        Returns:
            List[Dict] or Dict: List of successfully parsed articles (up to
                ``limit``) or error dict
        """
        try:
            from newspaper import Source, Config

            # Configure source scraping
            config = Config()
            config.browser_user_agent = 'Mozilla/5.0'
            config.language = language
            config.fetch_images = False  # Speed up processing

            # Build news source
            source = Source(source_url, config=config)
            source.build()

            # Collect up to `limit` successfully parsed articles. Iterate over
            # ALL discovered URLs (not a pre-sliced prefix) so that individual
            # download/parse failures don't shrink the result below the limit.
            articles = []
            for article_url in source.article_urls():
                if len(articles) >= limit:
                    break
                try:
                    article = self.get_article(article_url, language)
                    if "error" not in article:
                        articles.append(article)
                except Exception as e:
                    logging.warning(f"Error processing article {article_url}: {str(e)}")
                    continue

            return articles
        except Exception as e:
            error_msg = f"Error getting articles from {source_url}: {str(e)}"
            logging.error(error_msg)
            return {"error": error_msg}

    def get_trending_topics(
        self,
        sources: Optional[List[str]] = None,
        limit: int = 10,
        language: str = 'en'
    ) -> Union[List[str], Dict[str, str]]:
        """
        Get trending topics across news sources.

        Args:
            sources: List of source URLs to analyze; defaults to the first 5
                known sources when omitted.
            limit: Maximum number of trending topics to return
            language: Language code

        Returns:
            List[str] or Dict: List of trending topics (most common article
                keywords) or error dict
        """
        try:
            from collections import Counter

            # Use default sources if none provided
            if not sources:
                sources_data = self.get_news_sources(language=language)
                if isinstance(sources_data, dict) and "error" in sources_data:
                    return sources_data
                sources = [s["url"] for s in sources_data[:5]]  # Use top 5 sources

            # Collect keywords from articles
            all_keywords = []
            for source_url in sources:
                try:
                    articles = self.get_articles_from_source(source_url, limit=5, language=language)
                    if isinstance(articles, list):
                        for article in articles:
                            # "keywords" is only present when NLP succeeded
                            if "keywords" in article:
                                all_keywords.extend(article["keywords"])
                except Exception as e:
                    logging.warning(f"Error processing source {source_url}: {str(e)}")
                    continue

            # Get most common keywords
            trending = Counter(all_keywords).most_common(limit)
            return [topic for topic, count in trending]
        except Exception as e:
            error_msg = f"Error getting trending topics: {str(e)}"
            logging.error(error_msg)
            return {"error": error_msg}
|
287
|
+
|
288
|
+
# Create instance for direct function access
# Module-level singleton; the bound methods below are the module's public
# functional API. NOTE: constructing it raises ImportError at import time
# when newspaper3k is not installed (see NewspaperTools._check_newspaper).
_newspaper_tools = NewspaperTools()
get_article = _newspaper_tools.get_article
get_news_sources = _newspaper_tools.get_news_sources
get_articles_from_source = _newspaper_tools.get_articles_from_source
get_trending_topics = _newspaper_tools.get_trending_topics
|
294
|
+
|
295
|
+
if __name__ == "__main__":
    # Example usage — live demo that hits real news sites, so it requires
    # network access and the newspaper3k package.
    print("\n==================================================")
    print("NewspaperTools Demonstration")
    print("==================================================\n")

    # 1. Get news sources (pure lookup against POPULAR_NEWS_SOURCES, no I/O)
    print("1. Getting News Sources")
    print("------------------------------")
    tech_sources = get_news_sources("technology")
    print("Technology news sources:")
    if isinstance(tech_sources, list):
        print(json.dumps(tech_sources[:3], indent=2))  # Show first 3 sources
    else:
        print(tech_sources)  # Show error
    print()

    # Remaining steps depend on having at least one source to scrape
    if isinstance(tech_sources, list) and tech_sources:
        source_url = tech_sources[0]["url"]

        # 2. Get articles from a source (network-bound; may take a while)
        print("2. Getting Articles from Source")
        print("------------------------------")
        articles = get_articles_from_source(source_url, limit=2)
        print(f"Articles from {source_url}:")
        if isinstance(articles, list):
            for article in articles:
                print(f"- {article['title']}")
                # "summary" is only present when NLP processing succeeded
                if "summary" in article:
                    print(f"  Summary: {article['summary'][:200]}...")
        else:
            print(articles)  # Show error
        print()

        # 3. Get a single article (re-fetches the first article found above)
        print("3. Getting Single Article")
        print("------------------------------")
        if isinstance(articles, list) and articles:
            article_url = articles[0]["url"]
            article = get_article(article_url)
            if "error" not in article:
                print(f"Article: {article['title']}")
                if "summary" in article:
                    print(f"Summary: {article['summary'][:200]}...")
                print(f"Authors: {', '.join(article['authors'])}")
                print(f"Date: {article['publish_date']}")
            else:
                print(article)  # Show error
            print()

        # 4. Get trending topics (aggregates keywords across articles)
        print("4. Getting Trending Topics")
        print("------------------------------")
        topics = get_trending_topics([source_url], limit=5)
        print("Trending topics:")
        print(json.dumps(topics, indent=2))

    print("\n==================================================")
    print("Demonstration Complete")
    print("==================================================")
|
@@ -0,0 +1,326 @@
|
|
1
|
+
"""Pandas tools for data manipulation and analysis.
|
2
|
+
|
3
|
+
Usage:
|
4
|
+
from praisonaiagents.tools import pandas_tools
|
5
|
+
df = pandas_tools.read_csv("data.csv")
|
6
|
+
df = pandas_tools.filter_data(df, "column > 5")
|
7
|
+
summary = pandas_tools.get_summary(df)
|
8
|
+
|
9
|
+
or
|
10
|
+
from praisonaiagents.tools import read_csv, filter_data, get_summary
|
11
|
+
df = read_csv("data.csv")
|
12
|
+
"""
|
13
|
+
|
14
|
+
import logging
|
15
|
+
from typing import List, Dict, Union, Optional, Any
|
16
|
+
from importlib import util
|
17
|
+
import json
|
18
|
+
import os
|
19
|
+
|
20
|
+
# Import pandas for type hints, but don't use it until we check it's installed
if util.find_spec("pandas") is not None:
    import pandas as pd
    import numpy as np
else:
    # Create a placeholder for type hints
    # NOTE(review): the placeholder only defines DataFrame, yet
    # _convert_to_serializable also references pd.Series and the bare `np`
    # name — if pandas/numpy are truly absent, this module still fails at
    # runtime before PandasTools._check_pandas can raise its ImportError.
    # Confirm whether a fuller stub is intended.
    class pd:
        DataFrame = None
|
28
|
+
|
29
|
+
def _convert_to_serializable(obj: Any) -> Any:
|
30
|
+
"""Convert numpy/pandas types to JSON serializable Python types."""
|
31
|
+
if isinstance(obj, (np.integer, np.floating)):
|
32
|
+
return obj.item()
|
33
|
+
elif isinstance(obj, np.ndarray):
|
34
|
+
return obj.tolist()
|
35
|
+
elif isinstance(obj, pd.Series):
|
36
|
+
return obj.to_list()
|
37
|
+
elif isinstance(obj, pd.DataFrame):
|
38
|
+
return obj.to_dict(orient='records')
|
39
|
+
return obj
|
40
|
+
|
41
|
+
class PandasTools:
    """Tools for data manipulation and analysis using pandas.

    Read/filter/aggregate methods return a DataFrame on success and a
    ``{"error": message}`` dict on failure; write methods return a bool.
    Methods never raise past their own ``try`` blocks, so they are safe to
    expose directly as agent tools.
    """

    def __init__(self):
        """Initialize PandasTools and check for pandas installation."""
        self._check_pandas()

    def _check_pandas(self):
        """Check if pandas is installed.

        Raises:
            ImportError: If pandas is not installed.
        """
        if util.find_spec("pandas") is None:
            raise ImportError("pandas is not available. Please install it using: pip install pandas")
        # Rebind at module scope so every method shares the real modules
        # (replacing the type-hint placeholder defined at import time).
        global pd, np
        import pandas as pd
        import numpy as np

    def read_csv(self, filepath: str, **kwargs) -> Union[pd.DataFrame, Dict[str, str]]:
        """
        Read a CSV file into a pandas DataFrame.

        Args:
            filepath: Path to the CSV file
            **kwargs: Additional arguments to pass to pd.read_csv()

        Returns:
            pd.DataFrame or Dict: DataFrame if successful, error dict if failed
        """
        try:
            return pd.read_csv(filepath, **kwargs)
        except Exception as e:
            error_msg = f"Error reading CSV file {filepath}: {str(e)}"
            logging.error(error_msg)
            return {"error": error_msg}

    def read_excel(self, filepath: str, **kwargs) -> Union[pd.DataFrame, Dict[str, str]]:
        """
        Read an Excel file into a pandas DataFrame.

        Args:
            filepath: Path to the Excel file
            **kwargs: Additional arguments to pass to pd.read_excel()

        Returns:
            pd.DataFrame or Dict: DataFrame if successful, error dict if failed
        """
        try:
            return pd.read_excel(filepath, **kwargs)
        except Exception as e:
            error_msg = f"Error reading Excel file {filepath}: {str(e)}"
            logging.error(error_msg)
            return {"error": error_msg}

    def write_csv(self, df: pd.DataFrame, filepath: str, **kwargs) -> bool:
        """
        Write DataFrame to a CSV file, creating parent directories as needed.

        Args:
            df: DataFrame to write
            filepath: Output file path
            **kwargs: Additional arguments to pass to df.to_csv()

        Returns:
            bool: True if successful, False otherwise
        """
        try:
            # Only create parent dirs when the path actually has one:
            # os.makedirs("") raises FileNotFoundError for bare filenames,
            # which previously made writes to the current directory fail.
            parent = os.path.dirname(filepath)
            if parent:
                os.makedirs(parent, exist_ok=True)
            df.to_csv(filepath, **kwargs)
            return True
        except Exception as e:
            error_msg = f"Error writing CSV file {filepath}: {str(e)}"
            logging.error(error_msg)
            return False

    def write_excel(self, df: pd.DataFrame, filepath: str, **kwargs) -> bool:
        """
        Write DataFrame to an Excel file, creating parent directories as needed.

        Args:
            df: DataFrame to write
            filepath: Output file path
            **kwargs: Additional arguments to pass to df.to_excel()

        Returns:
            bool: True if successful, False otherwise
        """
        try:
            # Same bare-filename guard as write_csv (os.makedirs("") raises).
            parent = os.path.dirname(filepath)
            if parent:
                os.makedirs(parent, exist_ok=True)
            df.to_excel(filepath, **kwargs)
            return True
        except Exception as e:
            error_msg = f"Error writing Excel file {filepath}: {str(e)}"
            logging.error(error_msg)
            return False

    def filter_data(self, df: pd.DataFrame, query: str) -> Union[pd.DataFrame, Dict[str, str]]:
        """
        Filter DataFrame using a query string.

        Args:
            df: Input DataFrame
            query: Query string (e.g., "column > 5 and other_column == 'value'")

        Returns:
            pd.DataFrame or Dict: Filtered DataFrame if successful, error dict if failed
        """
        try:
            return df.query(query)
        except Exception as e:
            error_msg = f"Error filtering data with query '{query}': {str(e)}"
            logging.error(error_msg)
            return {"error": error_msg}

    def get_summary(self, df: pd.DataFrame) -> Dict[str, Any]:
        """
        Get a summary of the DataFrame including basic statistics and info.

        Args:
            df: Input DataFrame

        Returns:
            Dict: JSON-serializable summary (shape, columns, dtypes, null
                counts, numeric statistics, memory usage) or error dict.
        """
        try:
            def _native(value: Any) -> Any:
                # numpy scalars expose .item(); plain Python values pass through.
                return value.item() if hasattr(value, "item") else value

            # Convert describe() output to native Python types
            numeric_summary = {
                col: {stat: _native(val) for stat, val in stats.items()}
                for col, stats in df.describe().to_dict().items()
            }

            return {
                "shape": list(df.shape),
                "columns": list(df.columns),
                "dtypes": df.dtypes.astype(str).to_dict(),
                # Cast counts to int: isnull().sum() yields np.int64 values,
                # which json.dumps cannot serialize.
                "null_counts": {col: int(cnt) for col, cnt in df.isnull().sum().items()},
                "numeric_summary": numeric_summary,
                "memory_usage": int(df.memory_usage(deep=True).sum()),
            }
        except Exception as e:
            error_msg = f"Error getting data summary: {str(e)}"
            logging.error(error_msg)
            return {"error": error_msg}

    def group_by(
        self,
        df: pd.DataFrame,
        columns: Union[str, List[str]],
        agg_dict: Dict[str, Union[str, List[str]]]
    ) -> Union[pd.DataFrame, Dict[str, str]]:
        """
        Group DataFrame by columns and apply aggregation functions.

        Args:
            df: Input DataFrame
            columns: Column(s) to group by
            agg_dict: Dictionary of column:function pairs for aggregation
                (e.g., {"salary": ["mean", "count"]})

        Returns:
            pd.DataFrame or Dict: Grouped DataFrame if successful, error dict if failed
        """
        try:
            return df.groupby(columns).agg(agg_dict).reset_index()
        except Exception as e:
            error_msg = f"Error grouping data: {str(e)}"
            logging.error(error_msg)
            return {"error": error_msg}

    def pivot_table(
        self,
        df: pd.DataFrame,
        index: Union[str, List[str]],
        columns: Optional[Union[str, List[str]]] = None,
        values: Optional[Union[str, List[str]]] = None,
        aggfunc: str = "mean"
    ) -> Union[pd.DataFrame, Dict[str, str]]:
        """
        Create a pivot table from DataFrame.

        Args:
            df: Input DataFrame
            index: Column(s) to use as index
            columns: Column(s) to use as columns
            values: Column(s) to aggregate
            aggfunc: Aggregation function to use (e.g., "mean", "sum")

        Returns:
            pd.DataFrame or Dict: Pivot table if successful, error dict if failed
        """
        try:
            return pd.pivot_table(
                df,
                index=index,
                columns=columns,
                values=values,
                aggfunc=aggfunc
            ).reset_index()
        except Exception as e:
            error_msg = f"Error creating pivot table: {str(e)}"
            logging.error(error_msg)
            return {"error": error_msg}
|
240
|
+
|
241
|
+
# Create instance for direct function access
# Module-level singleton; the bound methods below are the module's public
# functional API. NOTE: constructing it raises ImportError at import time
# when pandas is not installed (see PandasTools._check_pandas).
_pandas_tools = PandasTools()
read_csv = _pandas_tools.read_csv
read_excel = _pandas_tools.read_excel
write_csv = _pandas_tools.write_csv
write_excel = _pandas_tools.write_excel
filter_data = _pandas_tools.filter_data
get_summary = _pandas_tools.get_summary
group_by = _pandas_tools.group_by
pivot_table = _pandas_tools.pivot_table
|
251
|
+
|
252
|
+
if __name__ == "__main__":
    # Example usage — writes temporary files under ./test_files and removes
    # the directory afterwards.
    print("\n==================================================")
    print("PandasTools Demonstration")
    print("==================================================\n")

    # Create a test directory
    test_dir = os.path.join(os.getcwd(), "test_files")
    os.makedirs(test_dir, exist_ok=True)

    # Create a sample DataFrame
    df = pd.DataFrame({
        'name': ['John', 'Jane', 'Bob', 'Alice', 'Charlie'],
        'age': [25, 30, 35, 28, 32],
        'city': ['New York', 'London', 'Paris', 'Tokyo', 'London'],
        'salary': [50000, 60000, 75000, 65000, 55000]
    })

    # 1. Write to CSV
    print("1. Writing to CSV")
    print("------------------------------")
    csv_file = os.path.join(test_dir, "sample.csv")
    success = write_csv(df, csv_file, index=False)
    print(f"Write successful: {success}\n")

    # 2. Read from CSV
    # NOTE(review): read_csv returns an error dict on failure; .head() below
    # assumes the read succeeded.
    print("2. Reading from CSV")
    print("------------------------------")
    df_read = read_csv(csv_file)
    print("First few rows:")
    print(df_read.head())
    print()

    # 3. Filter Data
    print("3. Filtering Data")
    print("------------------------------")
    filtered_df = filter_data(df, "age > 30 and salary > 60000")
    print("People over 30 with salary > 60000:")
    print(filtered_df)
    print()

    # 4. Get Summary
    print("4. Data Summary")
    print("------------------------------")
    summary = get_summary(df)
    print(json.dumps(summary, indent=2))
    print()

    # 5. Group By
    print("5. Group By")
    print("------------------------------")
    grouped = group_by(df, "city", {"salary": ["mean", "count"], "age": "mean"})
    print("Statistics by city:")
    print(grouped)
    print()

    # 6. Pivot Table
    print("6. Pivot Table")
    print("------------------------------")
    pivoted = pivot_table(df, index="city", values=["salary", "age"])
    print("Pivot table by city:")
    print(pivoted)
    print()

    # Clean up test directory
    try:
        import shutil
        shutil.rmtree(test_dir)
        print("Test directory cleaned up successfully")
    except Exception as e:
        print(f"Error cleaning up test directory: {str(e)}")

    print("\n==================================================")
    print("Demonstration Complete")
    print("==================================================")
|