praisonaiagents 0.0.23__py3-none-any.whl → 0.0.24__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- praisonaiagents/tools/__init__.py +165 -2
- praisonaiagents/tools/arxiv_tools.py +292 -0
- praisonaiagents/tools/calculator_tools.py +278 -0
- praisonaiagents/tools/csv_tools.py +266 -0
- praisonaiagents/tools/duckdb_tools.py +268 -0
- praisonaiagents/tools/duckduckgo_tools.py +52 -0
- praisonaiagents/tools/excel_tools.py +310 -0
- praisonaiagents/tools/file_tools.py +274 -0
- praisonaiagents/tools/json_tools.py +515 -0
- praisonaiagents/tools/newspaper_tools.py +354 -0
- praisonaiagents/tools/pandas_tools.py +326 -0
- praisonaiagents/tools/python_tools.py +423 -0
- praisonaiagents/tools/shell_tools.py +278 -0
- praisonaiagents/tools/spider_tools.py +431 -0
- praisonaiagents/tools/test.py +56 -0
- praisonaiagents/tools/tools.py +5 -36
- praisonaiagents/tools/wikipedia_tools.py +272 -0
- praisonaiagents/tools/xml_tools.py +498 -0
- praisonaiagents/tools/yaml_tools.py +417 -0
- praisonaiagents/tools/yfinance_tools.py +213 -0
- {praisonaiagents-0.0.23.dist-info → praisonaiagents-0.0.24.dist-info}/METADATA +1 -1
- praisonaiagents-0.0.24.dist-info/RECORD +42 -0
- praisonaiagents-0.0.23.dist-info/RECORD +0 -24
- {praisonaiagents-0.0.23.dist-info → praisonaiagents-0.0.24.dist-info}/WHEEL +0 -0
- {praisonaiagents-0.0.23.dist-info → praisonaiagents-0.0.24.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,354 @@
|
|
1
|
+
"""Newspaper tools for scraping and parsing news articles.
|
2
|
+
|
3
|
+
Usage:
|
4
|
+
from praisonaiagents.tools import newspaper_tools
|
5
|
+
article = newspaper_tools.get_article("https://example.com/article")
|
6
|
+
sources = newspaper_tools.get_news_sources("technology")
|
7
|
+
articles = newspaper_tools.get_articles_from_source("https://techcrunch.com")
|
8
|
+
|
9
|
+
or
|
10
|
+
from praisonaiagents.tools import get_article, get_news_sources
|
11
|
+
article = get_article("https://example.com/article")
|
12
|
+
"""
|
13
|
+
|
14
|
+
import logging
|
15
|
+
from typing import List, Dict, Union, Optional, Any
|
16
|
+
from importlib import util
|
17
|
+
import json
|
18
|
+
from urllib.parse import urlparse
|
19
|
+
|
20
|
+
# Predefined list of popular news sources.
# Maps a lower-case category name to a list of homepage URLs; this is a static
# catalogue consumed by NewspaperTools.get_news_sources() — no network lookup
# is performed to produce it.
POPULAR_NEWS_SOURCES = {
    'technology': [
        'https://techcrunch.com',
        'https://www.theverge.com',
        'https://www.wired.com',
        'https://www.engadget.com',
        'https://arstechnica.com'
    ],
    'business': [
        'https://www.bloomberg.com',
        'https://www.reuters.com',
        'https://www.wsj.com',
        'https://www.ft.com',
        'https://www.cnbc.com'
    ],
    'general': [
        'https://www.nytimes.com',
        'https://www.theguardian.com',
        'https://www.washingtonpost.com',
        'https://www.bbc.com',
        'https://www.cnn.com'
    ],
    'sports': [
        'https://www.espn.com',
        'https://sports.yahoo.com',
        'https://www.cbssports.com',
        'https://www.skysports.com',
        'https://www.bleacherreport.com'
    ],
    'entertainment': [
        'https://variety.com',
        'https://www.hollywoodreporter.com',
        'https://www.ew.com',
        'https://www.deadline.com',
        'https://www.imdb.com/news'
    ],
    'science': [
        'https://www.scientificamerican.com',
        'https://www.sciencedaily.com',
        'https://www.newscientist.com',
        'https://www.sciencemag.org',
        'https://www.nature.com/news'
    ]
}
|
65
|
+
|
66
|
+
class NewspaperTools:
    """Tools for scraping and parsing news articles via the newspaper3k package.

    All public methods return plain dict/list structures on success and an
    ``{"error": message}`` dict on failure, so callers never see exceptions.
    """

    def __init__(self):
        """Initialize NewspaperTools and check for newspaper package."""
        self._check_newspaper()

    def _check_newspaper(self):
        """Check if newspaper package is installed.

        Raises:
            ImportError: If the newspaper3k package is not installed.
        """
        if util.find_spec("newspaper") is None:
            raise ImportError("newspaper3k package is not available. Please install it using: pip install newspaper3k")
        # Import once and expose at module scope so subsequent method-level
        # imports resolve without re-checking availability.
        global newspaper
        import newspaper

    def get_article(
        self,
        url: str,
        language: str = 'en'
    ) -> Dict[str, Any]:
        """
        Extract and parse a news article from a URL.

        Args:
            url: URL of the article
            language: Language code (e.g., 'en' for English)

        Returns:
            Dict: Article information (title, text, authors, publish_date,
                images, movies, source_domain, plus keywords/summary when
                NLP succeeds), or an ``{"error": ...}`` dict on failure.
        """
        try:
            from newspaper import Article, Config

            # Configure article download
            config = Config()
            config.browser_user_agent = 'Mozilla/5.0'
            config.language = language

            # Download and parse article (network access happens here)
            article = Article(url, config=config)
            article.download()
            article.parse()

            # NLP extraction (keywords/summary) is best-effort: it requires
            # extra NLTK data and its failure should not invalidate the
            # already-parsed article.
            try:
                article.nlp()
            except Exception as e:
                logging.warning(f"NLP processing failed: {str(e)}")

            # Build response
            response = {
                "url": url,
                "title": article.title,
                "text": article.text,
                "authors": article.authors,
                "publish_date": article.publish_date.isoformat() if article.publish_date else None,
                "top_image": article.top_image,
                "images": list(article.images),
                "movies": list(article.movies),
                "source_domain": urlparse(url).netloc,
            }

            # Add NLP results only when nlp() succeeded and produced content
            if hasattr(article, 'keywords') and article.keywords:
                response["keywords"] = article.keywords
            if hasattr(article, 'summary') and article.summary:
                response["summary"] = article.summary

            return response
        except Exception as e:
            error_msg = f"Error extracting article from {url}: {str(e)}"
            logging.error(error_msg)
            return {"error": error_msg}

    def get_news_sources(
        self,
        category: Optional[str] = None,
        language: str = 'en',
        country: Optional[str] = None
    ) -> Union[List[Dict[str, str]], Dict[str, str]]:
        """
        Get a list of news sources, optionally filtered by category.

        Args:
            category: Category to filter by (e.g., 'technology', 'sports').
                An unknown or omitted category returns the full catalogue.
            language: Language code (currently unused; kept for API compatibility)
            country: Country code (currently unused; kept for API compatibility)

        Returns:
            List[Dict] or Dict: List of news sources or error dict
        """
        try:
            if category:
                category = category.lower()

            # Collect (url, category) pairs so each source is labelled with
            # the catalogue category it actually belongs to.  (Previously an
            # unknown or omitted category returned every source mislabelled
            # with the requested category or a blanket "general".)
            if category and category in POPULAR_NEWS_SOURCES:
                pairs = [(url, category) for url in POPULAR_NEWS_SOURCES[category]]
            else:
                pairs = [
                    (url, cat)
                    for cat, cat_urls in POPULAR_NEWS_SOURCES.items()
                    for url in cat_urls
                ]

            sources = []
            for url, cat in pairs:
                domain = urlparse(url).netloc
                sources.append({
                    "url": url,
                    "domain": domain,
                    # Derive a display name from the domain,
                    # e.g. "www.wired.com" -> "Wired"
                    "name": domain.replace("www.", "").split(".")[0].title(),
                    "category": cat,
                })

            return sources
        except Exception as e:
            error_msg = f"Error getting news sources: {str(e)}"
            logging.error(error_msg)
            return {"error": error_msg}

    def get_articles_from_source(
        self,
        source_url: str,
        limit: int = 10,
        language: str = 'en'
    ) -> Union[List[Dict[str, Any]], Dict[str, str]]:
        """
        Get recent articles from a news source.

        Args:
            source_url: URL of the news source
            limit: Maximum number of articles to return
            language: Language code

        Returns:
            List[Dict] or Dict: List of parsed articles (possibly fewer than
                ``limit`` if some fail to parse) or error dict
        """
        try:
            from newspaper import Source, Config

            # Configure source scraping
            config = Config()
            config.browser_user_agent = 'Mozilla/5.0'
            config.language = language
            config.fetch_images = False  # Speed up processing

            # Build news source (downloads and parses the site's feeds/pages)
            source = Source(source_url, config=config)
            source.build()

            # Process at most `limit` candidate URLs; articles that fail to
            # download/parse are skipped rather than aborting the whole scan.
            articles = []
            for article_url in source.article_urls()[:limit]:
                try:
                    article = self.get_article(article_url, language)
                    if "error" not in article:
                        articles.append(article)
                except Exception as e:
                    logging.warning(f"Error processing article {article_url}: {str(e)}")

            return articles
        except Exception as e:
            error_msg = f"Error getting articles from {source_url}: {str(e)}"
            logging.error(error_msg)
            return {"error": error_msg}

    def get_trending_topics(
        self,
        sources: Optional[List[str]] = None,
        limit: int = 10,
        language: str = 'en'
    ) -> Union[List[str], Dict[str, str]]:
        """
        Get trending topics across news sources.

        Args:
            sources: List of source URLs to analyze; defaults to the first 5
                catalogue sources when omitted.
            limit: Maximum number of trending topics to return
            language: Language code

        Returns:
            List[str] or Dict: List of trending topics or error dict
        """
        try:
            from collections import Counter

            # Use default sources if none provided
            if not sources:
                sources_data = self.get_news_sources(language=language)
                if isinstance(sources_data, dict) and "error" in sources_data:
                    return sources_data
                sources = [s["url"] for s in sources_data[:5]]  # Use top 5 sources

            # Collect NLP keywords from a few articles of each source;
            # failing sources are skipped rather than aborting the scan.
            all_keywords = []
            for source_url in sources:
                try:
                    articles = self.get_articles_from_source(source_url, limit=5, language=language)
                    if isinstance(articles, list):
                        for article in articles:
                            if "keywords" in article:
                                all_keywords.extend(article["keywords"])
                except Exception as e:
                    logging.warning(f"Error processing source {source_url}: {str(e)}")

            # Rank keywords by frequency and keep the `limit` most common
            trending = Counter(all_keywords).most_common(limit)
            return [topic for topic, count in trending]
        except Exception as e:
            error_msg = f"Error getting trending topics: {str(e)}"
            logging.error(error_msg)
            return {"error": error_msg}
|
287
|
+
|
288
|
+
# Create instance for direct function access.
# NOTE: instantiating NewspaperTools at import time runs _check_newspaper,
# so importing this module raises ImportError when newspaper3k is missing.
_newspaper_tools = NewspaperTools()
get_article = _newspaper_tools.get_article
get_news_sources = _newspaper_tools.get_news_sources
get_articles_from_source = _newspaper_tools.get_articles_from_source
get_trending_topics = _newspaper_tools.get_trending_topics
|
294
|
+
|
295
|
+
if __name__ == "__main__":
    # Example usage — a live demonstration that performs real network
    # requests against the catalogued news sites.
    print("\n==================================================")
    print("NewspaperTools Demonstration")
    print("==================================================\n")

    # 1. Get news sources (catalogue lookup only, no network access)
    print("1. Getting News Sources")
    print("------------------------------")
    tech_sources = get_news_sources("technology")
    print("Technology news sources:")
    if isinstance(tech_sources, list):
        print(json.dumps(tech_sources[:3], indent=2))  # Show first 3 sources
    else:
        print(tech_sources)  # Show error
    print()

    # Steps 2-4 only run when the catalogue lookup succeeded
    if isinstance(tech_sources, list) and tech_sources:
        source_url = tech_sources[0]["url"]

        # 2. Get articles from a source (downloads and parses live pages)
        print("2. Getting Articles from Source")
        print("------------------------------")
        articles = get_articles_from_source(source_url, limit=2)
        print(f"Articles from {source_url}:")
        if isinstance(articles, list):
            for article in articles:
                print(f"- {article['title']}")
                if "summary" in article:
                    print(f"  Summary: {article['summary'][:200]}...")
        else:
            print(articles)  # Show error
        print()

        # 3. Get a single article
        print("3. Getting Single Article")
        print("------------------------------")
        if isinstance(articles, list) and articles:
            article_url = articles[0]["url"]
            article = get_article(article_url)
            if "error" not in article:
                print(f"Article: {article['title']}")
                if "summary" in article:
                    print(f"Summary: {article['summary'][:200]}...")
                print(f"Authors: {', '.join(article['authors'])}")
                print(f"Date: {article['publish_date']}")
            else:
                print(article)  # Show error
            print()

        # 4. Get trending topics (keyword frequency over recent articles)
        print("4. Getting Trending Topics")
        print("------------------------------")
        topics = get_trending_topics([source_url], limit=5)
        print("Trending topics:")
        print(json.dumps(topics, indent=2))

    print("\n==================================================")
    print("Demonstration Complete")
    print("==================================================")
|
@@ -0,0 +1,326 @@
|
|
1
|
+
"""Pandas tools for data manipulation and analysis.
|
2
|
+
|
3
|
+
Usage:
|
4
|
+
from praisonaiagents.tools import pandas_tools
|
5
|
+
df = pandas_tools.read_csv("data.csv")
|
6
|
+
df = pandas_tools.filter_data(df, "column > 5")
|
7
|
+
summary = pandas_tools.get_summary(df)
|
8
|
+
|
9
|
+
or
|
10
|
+
from praisonaiagents.tools import read_csv, filter_data, get_summary
|
11
|
+
df = read_csv("data.csv")
|
12
|
+
"""
|
13
|
+
|
14
|
+
import logging
|
15
|
+
from typing import List, Dict, Union, Optional, Any
|
16
|
+
from importlib import util
|
17
|
+
import json
|
18
|
+
import os
|
19
|
+
|
20
|
+
# Import pandas for type hints, but don't use it until we check it's installed
if util.find_spec("pandas") is not None:
    import pandas as pd
    import numpy as np
else:
    # Create a placeholder for type hints so that annotations like
    # `pd.DataFrame` evaluate without pandas installed.
    # NOTE(review): only pd.DataFrame is stubbed; `np` and `pd.Series` stay
    # undefined here, so _convert_to_serializable would fail if called in
    # this state — in practice PandasTools._check_pandas raises first.
    class pd:
        DataFrame = None
|
29
|
+
def _convert_to_serializable(obj: Any) -> Any:
|
30
|
+
"""Convert numpy/pandas types to JSON serializable Python types."""
|
31
|
+
if isinstance(obj, (np.integer, np.floating)):
|
32
|
+
return obj.item()
|
33
|
+
elif isinstance(obj, np.ndarray):
|
34
|
+
return obj.tolist()
|
35
|
+
elif isinstance(obj, pd.Series):
|
36
|
+
return obj.to_list()
|
37
|
+
elif isinstance(obj, pd.DataFrame):
|
38
|
+
return obj.to_dict(orient='records')
|
39
|
+
return obj
|
40
|
+
|
41
|
+
class PandasTools:
    """Tools for data manipulation and analysis using pandas.

    Read/filter/aggregate methods return a DataFrame on success and an
    ``{"error": message}`` dict on failure; write methods return a bool.
    """

    def __init__(self):
        """Initialize PandasTools and check for pandas installation."""
        self._check_pandas()

    def _check_pandas(self):
        """Check if pandas is installed.

        Raises:
            ImportError: If pandas is not installed.
        """
        if util.find_spec("pandas") is None:
            raise ImportError("pandas is not available. Please install it using: pip install pandas")
        # Rebind the module-level placeholders to the real modules
        global pd, np
        import pandas as pd
        import numpy as np

    @staticmethod
    def _ensure_parent_dir(filepath: str) -> None:
        """Create the parent directory of filepath if it has one.

        os.makedirs("") raises FileNotFoundError, so a bare filename
        (no directory component) must skip the makedirs call.
        """
        parent = os.path.dirname(filepath)
        if parent:
            os.makedirs(parent, exist_ok=True)

    def read_csv(self, filepath: str, **kwargs) -> Union[pd.DataFrame, Dict[str, str]]:
        """
        Read a CSV file into a pandas DataFrame.

        Args:
            filepath: Path to the CSV file
            **kwargs: Additional arguments to pass to pd.read_csv()

        Returns:
            pd.DataFrame or Dict: DataFrame if successful, error dict if failed
        """
        try:
            return pd.read_csv(filepath, **kwargs)
        except Exception as e:
            error_msg = f"Error reading CSV file {filepath}: {str(e)}"
            logging.error(error_msg)
            return {"error": error_msg}

    def read_excel(self, filepath: str, **kwargs) -> Union[pd.DataFrame, Dict[str, str]]:
        """
        Read an Excel file into a pandas DataFrame.

        Args:
            filepath: Path to the Excel file
            **kwargs: Additional arguments to pass to pd.read_excel()

        Returns:
            pd.DataFrame or Dict: DataFrame if successful, error dict if failed
        """
        try:
            return pd.read_excel(filepath, **kwargs)
        except Exception as e:
            error_msg = f"Error reading Excel file {filepath}: {str(e)}"
            logging.error(error_msg)
            return {"error": error_msg}

    def write_csv(self, df: pd.DataFrame, filepath: str, **kwargs) -> bool:
        """
        Write DataFrame to a CSV file, creating parent directories as needed.

        Args:
            df: DataFrame to write
            filepath: Output file path
            **kwargs: Additional arguments to pass to df.to_csv()

        Returns:
            bool: True if successful, False otherwise
        """
        try:
            # BUGFIX: os.makedirs(os.path.dirname(filepath)) raised
            # FileNotFoundError for bare filenames (dirname == ""), making
            # every write into the current directory fail.
            self._ensure_parent_dir(filepath)
            df.to_csv(filepath, **kwargs)
            return True
        except Exception as e:
            error_msg = f"Error writing CSV file {filepath}: {str(e)}"
            logging.error(error_msg)
            return False

    def write_excel(self, df: pd.DataFrame, filepath: str, **kwargs) -> bool:
        """
        Write DataFrame to an Excel file, creating parent directories as needed.

        Args:
            df: DataFrame to write
            filepath: Output file path
            **kwargs: Additional arguments to pass to df.to_excel()

        Returns:
            bool: True if successful, False otherwise
        """
        try:
            # Same bare-filename fix as write_csv
            self._ensure_parent_dir(filepath)
            df.to_excel(filepath, **kwargs)
            return True
        except Exception as e:
            error_msg = f"Error writing Excel file {filepath}: {str(e)}"
            logging.error(error_msg)
            return False

    def filter_data(self, df: pd.DataFrame, query: str) -> Union[pd.DataFrame, Dict[str, str]]:
        """
        Filter DataFrame using a query string.

        Args:
            df: Input DataFrame
            query: Query string (e.g., "column > 5 and other_column == 'value'")

        Returns:
            pd.DataFrame or Dict: Filtered DataFrame if successful, error dict if failed
        """
        try:
            return df.query(query)
        except Exception as e:
            error_msg = f"Error filtering data with query '{query}': {str(e)}"
            logging.error(error_msg)
            return {"error": error_msg}

    def get_summary(self, df: pd.DataFrame) -> Dict[str, Any]:
        """
        Get a summary of the DataFrame including basic statistics and info.

        Args:
            df: Input DataFrame

        Returns:
            Dict: shape, columns, dtypes, null counts, numeric summary and
                memory usage — all converted to JSON-serializable types.
        """
        try:
            numeric_summary = df.describe().to_dict()
            # Convert numpy scalar values to native Python types
            for col in numeric_summary:
                numeric_summary[col] = {k: _convert_to_serializable(v)
                                        for k, v in numeric_summary[col].items()}

            summary = {
                "shape": list(df.shape),
                "columns": list(df.columns),
                "dtypes": df.dtypes.astype(str).to_dict(),
                "null_counts": df.isnull().sum().to_dict(),
                "numeric_summary": numeric_summary,
                "memory_usage": int(df.memory_usage(deep=True).sum()),
            }
            return summary
        except Exception as e:
            error_msg = f"Error getting data summary: {str(e)}"
            logging.error(error_msg)
            return {"error": error_msg}

    def group_by(
        self,
        df: pd.DataFrame,
        columns: Union[str, List[str]],
        agg_dict: Dict[str, Union[str, List[str]]]
    ) -> Union[pd.DataFrame, Dict[str, str]]:
        """
        Group DataFrame by columns and apply aggregation functions.

        Args:
            df: Input DataFrame
            columns: Column(s) to group by
            agg_dict: Dictionary of column:function pairs for aggregation

        Returns:
            pd.DataFrame or Dict: Grouped DataFrame if successful, error dict if failed
        """
        try:
            return df.groupby(columns).agg(agg_dict).reset_index()
        except Exception as e:
            error_msg = f"Error grouping data: {str(e)}"
            logging.error(error_msg)
            return {"error": error_msg}

    def pivot_table(
        self,
        df: pd.DataFrame,
        index: Union[str, List[str]],
        columns: Optional[Union[str, List[str]]] = None,
        values: Optional[Union[str, List[str]]] = None,
        aggfunc: str = "mean"
    ) -> Union[pd.DataFrame, Dict[str, str]]:
        """
        Create a pivot table from DataFrame.

        Args:
            df: Input DataFrame
            index: Column(s) to use as index
            columns: Column(s) to use as columns
            values: Column(s) to aggregate
            aggfunc: Aggregation function to use

        Returns:
            pd.DataFrame or Dict: Pivot table if successful, error dict if failed
        """
        try:
            return pd.pivot_table(
                df,
                index=index,
                columns=columns,
                values=values,
                aggfunc=aggfunc
            ).reset_index()
        except Exception as e:
            error_msg = f"Error creating pivot table: {str(e)}"
            logging.error(error_msg)
            return {"error": error_msg}
|
240
|
+
|
241
|
+
# Create instance for direct function access.
# NOTE: constructing PandasTools at import time runs _check_pandas, so
# importing this module raises ImportError when pandas is not installed.
_pandas_tools = PandasTools()
read_csv = _pandas_tools.read_csv
read_excel = _pandas_tools.read_excel
write_csv = _pandas_tools.write_csv
write_excel = _pandas_tools.write_excel
filter_data = _pandas_tools.filter_data
get_summary = _pandas_tools.get_summary
group_by = _pandas_tools.group_by
pivot_table = _pandas_tools.pivot_table
|
251
|
+
|
252
|
+
if __name__ == "__main__":
    # Example usage — exercises each tool against a small in-memory frame,
    # writing into ./test_files and removing it afterwards.
    print("\n==================================================")
    print("PandasTools Demonstration")
    print("==================================================\n")

    # Create a test directory
    test_dir = os.path.join(os.getcwd(), "test_files")
    os.makedirs(test_dir, exist_ok=True)

    # Create a sample DataFrame
    df = pd.DataFrame({
        'name': ['John', 'Jane', 'Bob', 'Alice', 'Charlie'],
        'age': [25, 30, 35, 28, 32],
        'city': ['New York', 'London', 'Paris', 'Tokyo', 'London'],
        'salary': [50000, 60000, 75000, 65000, 55000]
    })

    # 1. Write to CSV
    print("1. Writing to CSV")
    print("------------------------------")
    csv_file = os.path.join(test_dir, "sample.csv")
    success = write_csv(df, csv_file, index=False)
    print(f"Write successful: {success}\n")

    # 2. Read from CSV
    print("2. Reading from CSV")
    print("------------------------------")
    df_read = read_csv(csv_file)
    print("First few rows:")
    print(df_read.head())
    print()

    # 3. Filter Data
    print("3. Filtering Data")
    print("------------------------------")
    filtered_df = filter_data(df, "age > 30 and salary > 60000")
    print("People over 30 with salary > 60000:")
    print(filtered_df)
    print()

    # 4. Get Summary (JSON-serializable via _convert_to_serializable)
    print("4. Data Summary")
    print("------------------------------")
    summary = get_summary(df)
    print(json.dumps(summary, indent=2))
    print()

    # 5. Group By
    print("5. Group By")
    print("------------------------------")
    grouped = group_by(df, "city", {"salary": ["mean", "count"], "age": "mean"})
    print("Statistics by city:")
    print(grouped)
    print()

    # 6. Pivot Table
    print("6. Pivot Table")
    print("------------------------------")
    pivoted = pivot_table(df, index="city", values=["salary", "age"])
    print("Pivot table by city:")
    print(pivoted)
    print()

    # Clean up test directory
    try:
        import shutil
        shutil.rmtree(test_dir)
        print("Test directory cleaned up successfully")
    except Exception as e:
        print(f"Error cleaning up test directory: {str(e)}")

    print("\n==================================================")
    print("Demonstration Complete")
    print("==================================================")
|