noesium-0.1.0-py3-none-any.whl → noesium-0.2.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- noesium/agents/askura_agent/__init__.py +22 -0
- noesium/agents/askura_agent/askura_agent.py +480 -0
- noesium/agents/askura_agent/conversation.py +164 -0
- noesium/agents/askura_agent/extractor.py +175 -0
- noesium/agents/askura_agent/memory.py +14 -0
- noesium/agents/askura_agent/models.py +239 -0
- noesium/agents/askura_agent/prompts.py +202 -0
- noesium/agents/askura_agent/reflection.py +234 -0
- noesium/agents/askura_agent/summarizer.py +30 -0
- noesium/agents/askura_agent/utils.py +6 -0
- noesium/agents/deep_research/__init__.py +13 -0
- noesium/agents/deep_research/agent.py +398 -0
- noesium/agents/deep_research/prompts.py +84 -0
- noesium/agents/deep_research/schemas.py +42 -0
- noesium/agents/deep_research/state.py +54 -0
- noesium/agents/search/__init__.py +5 -0
- noesium/agents/search/agent.py +474 -0
- noesium/agents/search/state.py +28 -0
- noesium/core/__init__.py +1 -1
- noesium/core/agent/base.py +10 -2
- noesium/core/goalith/decomposer/llm_decomposer.py +1 -1
- noesium/core/llm/__init__.py +1 -1
- noesium/core/llm/base.py +2 -2
- noesium/core/llm/litellm.py +42 -21
- noesium/core/llm/llamacpp.py +25 -4
- noesium/core/llm/ollama.py +43 -22
- noesium/core/llm/openai.py +25 -5
- noesium/core/llm/openrouter.py +1 -1
- noesium/core/toolify/base.py +9 -2
- noesium/core/toolify/config.py +2 -2
- noesium/core/toolify/registry.py +21 -5
- noesium/core/tracing/opik_tracing.py +7 -7
- noesium/core/vector_store/__init__.py +2 -2
- noesium/core/vector_store/base.py +1 -1
- noesium/core/vector_store/pgvector.py +10 -13
- noesium/core/vector_store/weaviate.py +2 -1
- noesium/toolkits/__init__.py +1 -0
- noesium/toolkits/arxiv_toolkit.py +310 -0
- noesium/toolkits/audio_aliyun_toolkit.py +441 -0
- noesium/toolkits/audio_toolkit.py +370 -0
- noesium/toolkits/bash_toolkit.py +332 -0
- noesium/toolkits/document_toolkit.py +454 -0
- noesium/toolkits/file_edit_toolkit.py +552 -0
- noesium/toolkits/github_toolkit.py +395 -0
- noesium/toolkits/gmail_toolkit.py +575 -0
- noesium/toolkits/image_toolkit.py +425 -0
- noesium/toolkits/memory_toolkit.py +398 -0
- noesium/toolkits/python_executor_toolkit.py +334 -0
- noesium/toolkits/search_toolkit.py +451 -0
- noesium/toolkits/serper_toolkit.py +623 -0
- noesium/toolkits/tabular_data_toolkit.py +537 -0
- noesium/toolkits/user_interaction_toolkit.py +365 -0
- noesium/toolkits/video_toolkit.py +168 -0
- noesium/toolkits/wikipedia_toolkit.py +420 -0
- noesium-0.2.1.dist-info/METADATA +253 -0
- {noesium-0.1.0.dist-info → noesium-0.2.1.dist-info}/RECORD +59 -23
- {noesium-0.1.0.dist-info → noesium-0.2.1.dist-info}/licenses/LICENSE +1 -1
- noesium-0.1.0.dist-info/METADATA +0 -525
- {noesium-0.1.0.dist-info → noesium-0.2.1.dist-info}/WHEEL +0 -0
- {noesium-0.1.0.dist-info → noesium-0.2.1.dist-info}/top_level.txt +0 -0
noesium/toolkits/tabular_data_toolkit.py
@@ -0,0 +1,537 @@

"""
Tabular data toolkit for analyzing and processing structured data files.

Provides tools for reading, analyzing, and extracting insights from various
tabular data formats including CSV, Excel, JSON, and Parquet files.
"""

import json
import math
import os
from pathlib import Path
from typing import Callable, Dict, List, Optional

from noesium.core.toolify.base import AsyncBaseToolkit
from noesium.core.toolify.config import ToolkitConfig
from noesium.core.toolify.registry import register_toolkit
from noesium.core.utils.logging import get_logger

logger = get_logger(__name__)

try:
    import pandas as pd

    PANDAS_AVAILABLE = True
except ImportError:
    pd = None
    PANDAS_AVAILABLE = False

# Template for column analysis
COLUMN_ANALYSIS_TEMPLATE = """You are a data analysis agent that extracts and summarizes data structure information from tabular data files (CSV, Excel, etc.).

<column_info>
{column_info}
</column_info>

You should extract the file structure (e.g. the delimiter), and provide detailed column information (e.g. column_name, type, column explanation and sample values) for each column.

<output_format>
### File Structure
- Delimiter: <the delimiter used in the file, e.g. ',', '\\t', ' '>

### Columns
| Column Name | Type | Explanation | Sample Value |
|-------------|------|-------------|--------------|
| name_of_column1 | type_of_column1 | explanation_of_column1, i.e. what the column represents | sample_value_of_column1 |
| name_of_column2 | type_of_column2 | explanation_of_column2, i.e. what the column represents | sample_value_of_column2 |
| ... | ... | ... | ... |
</output_format>"""


@register_toolkit("tabular_data")
class TabularDataToolkit(AsyncBaseToolkit):
    """
    Toolkit for tabular data analysis and processing.

    This toolkit provides capabilities for:
    - Reading various tabular data formats (CSV, Excel, JSON, Parquet, TSV)
    - Analyzing data structure and column information
    - Extracting metadata and statistics
    - LLM-powered data interpretation
    - Data quality assessment

    Features:
    - Multi-format support with automatic encoding detection
    - Intelligent column analysis and interpretation
    - Statistical summaries and data profiling
    - LLM-powered column explanation generation
    - Data validation and quality checks
    - Memory-efficient processing for large files

    Supported Formats:
    - **CSV**: Comma-separated values with encoding detection
    - **Excel**: XLSX and XLS files with multiple sheets
    - **JSON**: JSON files with tabular structure
    - **Parquet**: Columnar storage format
    - **TSV**: Tab-separated values
    - **Generic**: Auto-detection for other delimited formats

    Required dependency: pandas
    Install with: pip install pandas
    """

    def __init__(self, config: ToolkitConfig = None):
        """
        Initialize the tabular data toolkit.

        Args:
            config: Toolkit configuration

        Raises:
            ImportError: If pandas is not installed
        """
        super().__init__(config)

        if not PANDAS_AVAILABLE:
            raise ImportError("pandas is required for TabularDataToolkit. Install with: pip install pandas")

        # Configuration
        self.max_file_size = self.config.config.get("max_file_size", 100 * 1024 * 1024)  # 100MB
        self.max_rows_preview = self.config.config.get("max_rows_preview", 1000)
        self.cache_dir = Path(self.config.config.get("cache_dir", "./tabular_cache"))

        # Create cache directory
        self.cache_dir.mkdir(parents=True, exist_ok=True)

        self.logger.info("Tabular data toolkit initialized")

    # Note: the pd.DataFrame annotations are quoted so this module can still be
    # imported when pandas is absent (pd is None until __init__ raises ImportError).
    def _load_tabular_data(self, file_path: str, **kwargs) -> "pd.DataFrame":
        """
        Load tabular data from a file with automatic format detection.

        Args:
            file_path: Path to the data file
            **kwargs: Additional parameters for pandas readers

        Returns:
            DataFrame containing the tabular data

        Raises:
            Exception: If the file cannot be loaded as tabular data
        """
        file_path = Path(file_path)

        # Check file size
        if file_path.stat().st_size > self.max_file_size:
            raise ValueError(f"File too large ({file_path.stat().st_size} bytes, max: {self.max_file_size})")

        file_ext = file_path.suffix.lower()

        try:
            if file_ext == ".csv":
                # Try different encodings for CSV files
                encodings = ["utf-8", "latin1", "cp1252", "iso-8859-1"]
                for encoding in encodings:
                    try:
                        df = pd.read_csv(file_path, encoding=encoding, **kwargs)
                        self.logger.info(f"Successfully loaded CSV with {encoding} encoding")
                        return df
                    except UnicodeDecodeError:
                        continue
                raise Exception("Could not read CSV file with any supported encoding")

            elif file_ext in [".xlsx", ".xls"]:
                df = pd.read_excel(file_path, **kwargs)
                self.logger.info("Successfully loaded Excel file")
                return df

            elif file_ext == ".json":
                df = pd.read_json(file_path, **kwargs)
                self.logger.info("Successfully loaded JSON file")
                return df

            elif file_ext == ".parquet":
                df = pd.read_parquet(file_path, **kwargs)
                self.logger.info("Successfully loaded Parquet file")
                return df

            elif file_ext == ".tsv":
                # Tab-separated values
                encodings = ["utf-8", "latin1", "cp1252", "iso-8859-1"]
                for encoding in encodings:
                    try:
                        df = pd.read_csv(file_path, sep="\t", encoding=encoding, **kwargs)
                        self.logger.info(f"Successfully loaded TSV with {encoding} encoding")
                        return df
                    except UnicodeDecodeError:
                        continue
                raise Exception("Could not read TSV file with any supported encoding")

            else:
                # Try to read as CSV by default
                try:
                    df = pd.read_csv(file_path, **kwargs)
                    self.logger.info("Successfully loaded file as CSV")
                    return df
                except Exception as e:
                    raise Exception(f"Unsupported file format: {file_ext}") from e

        except Exception as e:
            self.logger.error(f"Failed to load tabular data: {e}")
            raise

    def _extract_column_info(self, df: "pd.DataFrame", return_features: Optional[List[str]] = None) -> List[Dict]:
        """
        Extract detailed information about DataFrame columns.

        Args:
            df: DataFrame to analyze
            return_features: List of features to include in output
                (the filtering itself is applied later, in _format_column_info)

        Returns:
            List of dictionaries containing column information
        """
        column_info = []

        for col in df.columns:
            try:
                # Get data type
                dtype = str(df[col].dtype)

                # Get sample value (first non-null value)
                sample_value = None
                non_null_values = df[col].dropna()

                if len(non_null_values) > 0:
                    sample_value = non_null_values.iloc[0]

                    # Handle different data types for display
                    if pd.isna(sample_value):
                        sample_str = "NaN"
                    elif isinstance(sample_value, float):
                        if math.isnan(sample_value):
                            sample_str = "NaN"
                        else:
                            sample_str = str(sample_value)
                    else:
                        sample_str = str(sample_value)
                else:
                    sample_str = "No data"

                # Additional statistics
                null_count = df[col].isnull().sum()
                null_percentage = (null_count / len(df)) * 100
                unique_count = df[col].nunique()

                col_info = {
                    "column_name": str(col),
                    "type": dtype,
                    "sample": sample_str,
                    "null_count": int(null_count),
                    "null_percentage": round(null_percentage, 2),
                    "unique_count": int(unique_count),
                    "total_rows": len(df),
                }

                # Add numeric statistics for numeric columns
                if pd.api.types.is_numeric_dtype(df[col]):
                    col_info.update(
                        {
                            "mean": df[col].mean() if not df[col].empty else None,
                            "std": df[col].std() if not df[col].empty else None,
                            "min": df[col].min() if not df[col].empty else None,
                            "max": df[col].max() if not df[col].empty else None,
                        }
                    )

                column_info.append(col_info)

            except Exception as e:
                self.logger.warning(f"Error processing column '{col}': {e}")
                column_info.append(
                    {"column_name": str(col), "type": "unknown", "sample": "Error reading sample", "error": str(e)}
                )

        return column_info

    def _format_column_info(self, column_info: List[Dict], return_features: Optional[List[str]] = None) -> str:
        """
        Format column information as a readable string.

        Args:
            column_info: List of column information dictionaries
            return_features: Features to include in output

        Returns:
            Formatted string representation
        """
        if not column_info:
            return "No columns found"

        if "error" in column_info[0]:
            return column_info[0]["error"]

        lines = []
        default_features = ["column_name", "type", "sample", "null_percentage", "unique_count"]
        features_to_show = return_features if return_features else default_features

        for i, col in enumerate(column_info):
            # Filter features to show
            filtered_info = {k: col[k] for k in features_to_show if k in col}

            lines.append(f"- Column {i + 1}: {json.dumps(filtered_info, ensure_ascii=False, default=str)}")

        return "\n".join(lines)

    async def get_tabular_columns(self, file_path: str, return_features: Optional[List[str]] = None) -> str:
        """
        Extract raw column metadata from tabular data files.

        This tool directly reads tabular data files and returns basic column
        information including names, data types, sample values, and statistics.
        It's useful for understanding the structure of data files before analysis.

        Args:
            file_path: Path to the tabular data file
            return_features: List of features to include (column_name, type, sample,
                null_count, null_percentage, unique_count, mean, std, min, max)

        Returns:
            Formatted string with column information and statistics

        Supported formats: CSV, Excel (XLSX/XLS), JSON, Parquet, TSV

        Example:
            info = await get_tabular_columns("data.csv", ["column_name", "type", "sample"])
        """
        self.logger.info(f"Analyzing tabular columns for: {file_path}")

        if not os.path.exists(file_path):
            return f"Error: File '{file_path}' does not exist."

        try:
            # Load the data
            df = self._load_tabular_data(file_path)

            # Extract column information
            column_info = self._extract_column_info(df, return_features)

            # Format and return
            result = self._format_column_info(column_info, return_features)

            self.logger.info(f"Successfully analyzed {len(column_info)} columns")
            return result

        except Exception as e:
            error_msg = f"Error analyzing file '{file_path}': {str(e)}"
            self.logger.error(error_msg)
            return error_msg

    async def get_column_info(self, file_path: str) -> str:
        """
        Get intelligent analysis and interpretation of column information.

        This tool builds on get_tabular_columns() to provide LLM-powered analysis
        of the data structure, including file format detection and column meaning
        interpretation. It's useful for understanding what the data represents.

        Args:
            file_path: Path to the tabular data file

        Returns:
            Detailed analysis with file structure and column explanations

        Example:
            analysis = await get_column_info("sales_data.csv")
            # Returns structured analysis with column explanations
        """
        self.logger.info(f"Getting intelligent column analysis for: {file_path}")

        try:
            # Get raw column information
            column_info_str = await self.get_tabular_columns(file_path)

            if column_info_str.startswith("Error:"):
                return column_info_str

            # Use LLM for intelligent analysis
            prompt = COLUMN_ANALYSIS_TEMPLATE.format(column_info=column_info_str)

            response = self.llm_client.completion(
                messages=[
                    {
                        "role": "system",
                        "content": "You are a data analysis expert specializing in tabular data structure analysis.",
                    },
                    {"role": "user", "content": prompt},
                ],
                temperature=0.1,
                max_tokens=1500,
            )

            return response.strip()

        except Exception as e:
            error_msg = f"Error during intelligent analysis: {str(e)}"
            self.logger.error(error_msg)
            return error_msg

    async def get_data_summary(self, file_path: str, max_rows: Optional[int] = None) -> str:
        """
        Get a comprehensive summary of the tabular data.

        Args:
            file_path: Path to the tabular data file
            max_rows: Maximum number of rows to analyze (default: 1000)

        Returns:
            Comprehensive data summary including statistics and insights
        """
        self.logger.info(f"Generating data summary for: {file_path}")

        max_rows = max_rows or self.max_rows_preview

        try:
            # Load data (with row limit for large files)
            df = self._load_tabular_data(file_path, nrows=max_rows)

            summary_lines = [
                f"Data Summary for: {file_path}",
                "=" * 50,
                "",
                f"Shape: {df.shape[0]:,} rows × {df.shape[1]} columns",
                f"Memory usage: {df.memory_usage(deep=True).sum() / 1024 / 1024:.2f} MB",
                "",
            ]

            # Data types summary
            type_counts = df.dtypes.value_counts()
            summary_lines.append("Data Types:")
            for dtype, count in type_counts.items():
                summary_lines.append(f"  {dtype}: {count} columns")
            summary_lines.append("")

            # Missing data summary
            missing_data = df.isnull().sum()
            missing_cols = missing_data[missing_data > 0]
            if len(missing_cols) > 0:
                summary_lines.append("Missing Data:")
                for col, count in missing_cols.items():
                    percentage = (count / len(df)) * 100
                    summary_lines.append(f"  {col}: {count} ({percentage:.1f}%)")
            else:
                summary_lines.append("Missing Data: None")
            summary_lines.append("")

            # Numeric columns summary
            numeric_cols = df.select_dtypes(include=["number"]).columns
            if len(numeric_cols) > 0:
                summary_lines.append("Numeric Columns Summary:")
                desc = df[numeric_cols].describe()
                summary_lines.append(desc.to_string())
                summary_lines.append("")

            # Categorical columns summary
            categorical_cols = df.select_dtypes(include=["object", "category"]).columns
            if len(categorical_cols) > 0:
                summary_lines.append("Categorical Columns (Top Values):")
                for col in categorical_cols[:5]:  # Limit to first 5 categorical columns
                    top_values = df[col].value_counts().head(3)
                    summary_lines.append(f"  {col}:")
                    for value, count in top_values.items():
                        summary_lines.append(f"    {value}: {count}")
                summary_lines.append("")

            # Data quality indicators
            summary_lines.append("Data Quality Indicators:")
            duplicate_rows = df.duplicated().sum()
            summary_lines.append(f"  Duplicate rows: {duplicate_rows}")

            # Unique values per column
            unique_ratios = df.nunique() / len(df)
            high_cardinality = unique_ratios[unique_ratios > 0.9].index.tolist()
            if high_cardinality:
                summary_lines.append(f"  High cardinality columns: {', '.join(high_cardinality)}")

            return "\n".join(summary_lines)

        except Exception as e:
            error_msg = f"Error generating data summary: {str(e)}"
            self.logger.error(error_msg)
            return error_msg

    async def validate_data_quality(self, file_path: str) -> str:
        """
        Perform data quality validation and return a report.

        Args:
            file_path: Path to the tabular data file

        Returns:
            Data quality validation report
        """
        self.logger.info(f"Validating data quality for: {file_path}")

        try:
            df = self._load_tabular_data(file_path, nrows=self.max_rows_preview)

            issues = []

            # Check for missing data
            missing_data = df.isnull().sum()
            high_missing = missing_data[missing_data > len(df) * 0.5]
            if len(high_missing) > 0:
                issues.append(f"High missing data (>50%): {list(high_missing.index)}")

            # Check for duplicate rows
            duplicates = df.duplicated().sum()
            if duplicates > 0:
                issues.append(f"Duplicate rows found: {duplicates}")

            # Check for columns with a single value
            single_value_cols = [col for col in df.columns if df[col].nunique() <= 1]
            if single_value_cols:
                issues.append(f"Columns with single/no values: {single_value_cols}")

            # Check for potential ID columns (every value unique)
            potential_ids = [col for col in df.columns if df[col].nunique() == len(df)]
            if potential_ids:
                issues.append(f"Potential ID columns (unique values): {potential_ids}")

            # Check for mixed data types in object columns
            mixed_type_issues = []
            for col in df.select_dtypes(include=["object"]).columns:
                sample_types = df[col].dropna().apply(type).unique()
                if len(sample_types) > 1:
                    mixed_type_issues.append(col)
            if mixed_type_issues:
                issues.append(f"Columns with mixed data types: {mixed_type_issues}")

            # Generate report
            if issues:
                report = f"Data Quality Issues Found ({len(issues)}):\n\n"
                for i, issue in enumerate(issues, 1):
                    report += f"{i}. {issue}\n"
            else:
                report = "✅ No major data quality issues detected."

            return report

        except Exception as e:
            error_msg = f"Error during data quality validation: {str(e)}"
            self.logger.error(error_msg)
            return error_msg

    async def get_tools_map(self) -> Dict[str, Callable]:
        """
        Get the mapping of tool names to their implementation functions.

        Returns:
            Dictionary mapping tool names to callable functions
        """
        return {
            "get_tabular_columns": self.get_tabular_columns,
            "get_column_info": self.get_column_info,
            "get_data_summary": self.get_data_summary,
            "validate_data_quality": self.validate_data_quality,
        }
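
Usage sketch for the new toolkit (illustrative only; it assumes ToolkitConfig accepts its
settings as a plain dict and that the registered async tool methods are awaited directly —
neither detail is shown in this diff beyond the config reads in __init__ above):

    import asyncio

    from noesium.core.toolify.config import ToolkitConfig
    from noesium.toolkits.tabular_data_toolkit import TabularDataToolkit

    async def main():
        # Hypothetical construction; only the "max_rows_preview" key is grounded in the diff
        toolkit = TabularDataToolkit(ToolkitConfig(config={"max_rows_preview": 500}))
        # Raw column metadata (no LLM call involved)
        print(await toolkit.get_tabular_columns("data.csv", ["column_name", "type", "sample"]))
        # Statistics plus a rule-based quality report
        print(await toolkit.get_data_summary("data.csv"))
        print(await toolkit.validate_data_quality("data.csv"))

    asyncio.run(main())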