contextagent-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- agentz/agent/base.py +262 -0
- agentz/artifacts/__init__.py +5 -0
- agentz/artifacts/artifact_writer.py +538 -0
- agentz/artifacts/reporter.py +235 -0
- agentz/artifacts/terminal_writer.py +100 -0
- agentz/context/__init__.py +6 -0
- agentz/context/context.py +91 -0
- agentz/context/conversation.py +205 -0
- agentz/context/data_store.py +208 -0
- agentz/llm/llm_setup.py +156 -0
- agentz/mcp/manager.py +142 -0
- agentz/mcp/patches.py +88 -0
- agentz/mcp/servers/chrome_devtools/server.py +14 -0
- agentz/profiles/base.py +108 -0
- agentz/profiles/data/data_analysis.py +38 -0
- agentz/profiles/data/data_loader.py +35 -0
- agentz/profiles/data/evaluation.py +43 -0
- agentz/profiles/data/model_training.py +47 -0
- agentz/profiles/data/preprocessing.py +47 -0
- agentz/profiles/data/visualization.py +47 -0
- agentz/profiles/manager/evaluate.py +51 -0
- agentz/profiles/manager/memory.py +62 -0
- agentz/profiles/manager/observe.py +48 -0
- agentz/profiles/manager/routing.py +66 -0
- agentz/profiles/manager/writer.py +51 -0
- agentz/profiles/mcp/browser.py +21 -0
- agentz/profiles/mcp/chrome.py +21 -0
- agentz/profiles/mcp/notion.py +21 -0
- agentz/runner/__init__.py +74 -0
- agentz/runner/base.py +28 -0
- agentz/runner/executor.py +320 -0
- agentz/runner/hooks.py +110 -0
- agentz/runner/iteration.py +142 -0
- agentz/runner/patterns.py +215 -0
- agentz/runner/tracker.py +188 -0
- agentz/runner/utils.py +45 -0
- agentz/runner/workflow.py +250 -0
- agentz/tools/__init__.py +20 -0
- agentz/tools/data_tools/__init__.py +17 -0
- agentz/tools/data_tools/data_analysis.py +152 -0
- agentz/tools/data_tools/data_loading.py +92 -0
- agentz/tools/data_tools/evaluation.py +175 -0
- agentz/tools/data_tools/helpers.py +120 -0
- agentz/tools/data_tools/model_training.py +192 -0
- agentz/tools/data_tools/preprocessing.py +229 -0
- agentz/tools/data_tools/visualization.py +281 -0
- agentz/utils/__init__.py +69 -0
- agentz/utils/config.py +708 -0
- agentz/utils/helpers.py +10 -0
- agentz/utils/parsers.py +142 -0
- agentz/utils/printer.py +539 -0
- contextagent-0.1.0.dist-info/METADATA +269 -0
- contextagent-0.1.0.dist-info/RECORD +66 -0
- contextagent-0.1.0.dist-info/WHEEL +5 -0
- contextagent-0.1.0.dist-info/licenses/LICENSE +21 -0
- contextagent-0.1.0.dist-info/top_level.txt +2 -0
- pipelines/base.py +972 -0
- pipelines/data_scientist.py +97 -0
- pipelines/data_scientist_memory.py +151 -0
- pipelines/experience_learner.py +0 -0
- pipelines/prompt_generator.py +0 -0
- pipelines/simple.py +78 -0
- pipelines/simple_browser.py +145 -0
- pipelines/simple_chrome.py +75 -0
- pipelines/simple_notion.py +103 -0
- pipelines/tool_builder.py +0 -0
agentz/tools/data_tools/preprocessing.py
ADDED
@@ -0,0 +1,229 @@
"""Data preprocessing tool for cleaning and transforming datasets."""

from typing import Union, Dict, Any, List, Optional
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from agents import function_tool
from agents.run_context import RunContextWrapper
from agentz.context.data_store import DataStore
from .helpers import load_or_get_dataframe
from loguru import logger


@function_tool
async def preprocess_data(
    ctx: RunContextWrapper[DataStore],
    operations: List[str],
    file_path: Optional[str] = None,
    target_column: Optional[str] = None,
    output_path: Optional[str] = None
) -> Union[Dict[str, Any], str]:
    """Performs data preprocessing operations on a dataset.

    This tool automatically uses the current dataset from the pipeline context.
    A file_path can optionally be provided to preprocess a different dataset.

    Args:
        ctx: Pipeline context wrapper for accessing the data store
        operations: List of preprocessing operations to perform. Options:
            - "handle_missing": Handle missing values (mean/median/mode imputation)
            - "remove_duplicates": Remove duplicate rows
            - "encode_categorical": Encode categorical variables
            - "scale_standard": Standardize numerical features (z-score)
            - "scale_minmax": Min-max scaling for numerical features
            - "remove_outliers": Remove outliers using IQR method
            - "feature_engineering": Create interaction features
        file_path: Optional path to dataset file. If not provided, uses current dataset.
        target_column: Optional target column to preserve
        output_path: Optional path to save preprocessed dataset

    Returns:
        Dictionary containing:
        - operations_applied: List of operations performed
        - original_shape: Original dataset shape
        - preprocessed_shape: Preprocessed dataset shape
        - changes_summary: Summary of changes made
        - output_path: Path where preprocessed data was saved (if output_path provided)
        Or error message string if preprocessing fails
    """
    try:
        # Get DataFrame - either from file_path or current dataset
        data_store = ctx.context
        if file_path is None:
            if data_store and data_store.has("current_dataset"):
                df = data_store.get("current_dataset")
                logger.info("Preprocessing current dataset from pipeline context")
            else:
                return "Error: No dataset loaded. Please load a dataset first using the load_dataset tool."
        else:
            df = load_or_get_dataframe(file_path, prefer_preprocessed=False, data_store=data_store)
            logger.info(f"Preprocessing dataset from: {file_path}")

        original_shape = df.shape
        changes_summary = []
        operations_applied = []

        # Handle missing values
        if "handle_missing" in operations:
            missing_before = df.isnull().sum().sum()

            for col in df.columns:
                if df[col].isnull().sum() > 0:
                    if pd.api.types.is_numeric_dtype(df[col]):
                        # Use median for numerical columns
                        df[col].fillna(df[col].median(), inplace=True)
                    else:
                        # Use mode for categorical columns
                        df[col].fillna(df[col].mode()[0] if not df[col].mode().empty else 'Unknown', inplace=True)

            missing_after = df.isnull().sum().sum()
            changes_summary.append(f"Filled {missing_before - missing_after} missing values")
            operations_applied.append("handle_missing")

        # Remove duplicates
        if "remove_duplicates" in operations:
            duplicates_before = df.duplicated().sum()
            df.drop_duplicates(inplace=True)
            changes_summary.append(f"Removed {duplicates_before} duplicate rows")
            operations_applied.append("remove_duplicates")

        # Encode categorical variables
        if "encode_categorical" in operations:
            categorical_cols = df.select_dtypes(include=['object']).columns.tolist()

            # Exclude target column from encoding if specified
            if target_column and target_column in categorical_cols:
                categorical_cols.remove(target_column)

            encoded_cols = []
            for col in categorical_cols:
                if df[col].nunique() <= 10:  # Only encode if reasonable number of categories
                    le = LabelEncoder()
                    df[col] = le.fit_transform(df[col].astype(str))
                    encoded_cols.append(col)

            changes_summary.append(f"Encoded {len(encoded_cols)} categorical columns: {', '.join(encoded_cols)}")
            operations_applied.append("encode_categorical")

        # Standardize numerical features
        if "scale_standard" in operations:
            numerical_cols = df.select_dtypes(include=['number']).columns.tolist()

            # Exclude target column from scaling if specified
            if target_column and target_column in numerical_cols:
                numerical_cols.remove(target_column)

            if numerical_cols:
                scaler = StandardScaler()
                df[numerical_cols] = scaler.fit_transform(df[numerical_cols])
                changes_summary.append(f"Standardized {len(numerical_cols)} numerical columns")
                operations_applied.append("scale_standard")

        # Min-max scaling
        if "scale_minmax" in operations:
            numerical_cols = df.select_dtypes(include=['number']).columns.tolist()

            # Exclude target column from scaling if specified
            if target_column and target_column in numerical_cols:
                numerical_cols.remove(target_column)

            if numerical_cols:
                scaler = MinMaxScaler()
                df[numerical_cols] = scaler.fit_transform(df[numerical_cols])
                changes_summary.append(f"Min-max scaled {len(numerical_cols)} numerical columns")
                operations_applied.append("scale_minmax")

        # Remove outliers
        if "remove_outliers" in operations:
            rows_before = len(df)
            numerical_cols = df.select_dtypes(include=['number']).columns

            for col in numerical_cols:
                Q1 = df[col].quantile(0.25)
                Q3 = df[col].quantile(0.75)
                IQR = Q3 - Q1
                lower_bound = Q1 - 1.5 * IQR
                upper_bound = Q3 + 1.5 * IQR
                df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]

            rows_removed = rows_before - len(df)
            changes_summary.append(f"Removed {rows_removed} outlier rows")
            operations_applied.append("remove_outliers")

        # Feature engineering
        if "feature_engineering" in operations:
            numerical_cols = df.select_dtypes(include=['number']).columns.tolist()

            # Exclude target column
            if target_column and target_column in numerical_cols:
                numerical_cols.remove(target_column)

            # Create polynomial features for first 3 numerical columns (to avoid explosion)
            if len(numerical_cols) >= 2:
                for i, col1 in enumerate(numerical_cols[:3]):
                    for col2 in numerical_cols[i+1:3]:
                        new_col_name = f"{col1}_x_{col2}"
                        df[new_col_name] = df[col1] * df[col2]

                changes_summary.append("Created interaction features")
                operations_applied.append("feature_engineering")

        # Save preprocessed dataset if output path provided
        if output_path:
            output_path = Path(output_path)
            output_path.parent.mkdir(parents=True, exist_ok=True)

            if output_path.suffix.lower() == '.csv':
                df.to_csv(output_path, index=False)
            elif output_path.suffix.lower() in ['.xlsx', '.xls']:
                df.to_excel(output_path, index=False)
            elif output_path.suffix.lower() == '.json':
                df.to_json(output_path, orient='records')
            else:
                return f"Unsupported output format: {output_path.suffix}"

        # Update the current dataset with preprocessed version
        if data_store:
            # Always update current_dataset with the preprocessed version
            data_store.set(
                "current_dataset",
                df,
                data_type="dataframe",
                metadata={
                    "shape": df.shape,
                    "operations": operations_applied,
                    "source": "preprocessed"
                }
            )
            logger.info(f"Updated current dataset with preprocessed version")

            # Also cache with file path key if provided
            if file_path:
                file_path_obj = Path(file_path)
                preprocessed_key = f"preprocessed:{file_path_obj.resolve()}"
                data_store.set(
                    preprocessed_key,
                    df,
                    data_type="dataframe",
                    metadata={
                        "file_path": str(file_path),
                        "shape": df.shape,
                        "operations": operations_applied
                    }
                )
                logger.info(f"Cached preprocessed DataFrame with key: {preprocessed_key}")

        result = {
            "operations_applied": operations_applied,
            "original_shape": original_shape,
            "preprocessed_shape": df.shape,
            "changes_summary": changes_summary,
            "output_path": str(output_path) if output_path else None,
        }

        return result

    except Exception as e:
        return f"Error preprocessing dataset: {str(e)}"
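Usage note (not part of the published diff): the sketch below shows one way preprocess_data might be invoked through an agent run, assuming the `agents` imports above come from the OpenAI Agents SDK (`Agent`/`Runner`) and that `DataStore()` can be constructed without arguments. The agent name, instructions, and toy dataset are illustrative, not taken from the package.

import asyncio

import pandas as pd
from agents import Agent, Runner

from agentz.context.data_store import DataStore
from agentz.tools.data_tools.preprocessing import preprocess_data


async def main() -> None:
    # Assumption: DataStore() needs no constructor arguments.
    store = DataStore()
    # Seed the pipeline context under the key the tool looks for.
    store.set(
        "current_dataset",
        pd.DataFrame({"age": [25, None, 40], "city": ["NY", "SF", "NY"]}),
        data_type="dataframe",
        metadata={"source": "example"},
    )

    agent = Agent(
        name="preprocessing_agent",  # illustrative name
        instructions="Clean the current dataset before modelling.",
        tools=[preprocess_data],
    )
    # The DataStore passed as context is what the tool receives via ctx.context.
    result = await Runner.run(
        agent,
        input="Handle missing values and encode categorical columns.",
        context=store,
    )
    print(result.final_output)


if __name__ == "__main__":
    asyncio.run(main())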
agentz/tools/data_tools/visualization.py
ADDED
@@ -0,0 +1,281 @@
"""Data visualization tool for creating charts and plots."""

from typing import Union, Dict, Any, Optional, List
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import base64
from io import BytesIO
from agents import function_tool
from agents.run_context import RunContextWrapper
from agentz.context.data_store import DataStore
from .helpers import load_or_get_dataframe
from loguru import logger


@function_tool
async def create_visualization(
    ctx: RunContextWrapper[DataStore],
    plot_type: str,
    file_path: Optional[str] = None,
    columns: Optional[List[str]] = None,
    target_column: Optional[str] = None,
    output_path: Optional[str] = None
) -> Union[Dict[str, Any], str]:
    """Creates data visualizations from a dataset.

    This tool automatically uses the current dataset from the pipeline context.
    A file_path can optionally be provided to visualize a different dataset.

    Args:
        ctx: Pipeline context wrapper for accessing the data store
        plot_type: Type of visualization to create. Options:
            - "distribution": Histogram/distribution plots for numerical columns
            - "correlation": Correlation heatmap
            - "scatter": Scatter plot (requires 2 columns)
            - "box": Box plot for outlier detection
            - "bar": Bar chart for categorical data
            - "pairplot": Pairwise relationships plot
        file_path: Optional path to dataset file. If not provided, uses current dataset.
        columns: List of columns to visualize (optional, uses all if not specified)
        target_column: Target column for colored scatter/pair plots
        output_path: Path to save the visualization (PNG format)

    Returns:
        Dictionary containing:
        - plot_type: Type of plot created
        - columns_plotted: Columns included in the plot
        - output_path: Path where plot was saved
        - plot_base64: Base64-encoded PNG image (if no output_path)
        - insights: Visual insights extracted from the plot
        Or error message string if visualization fails
    """
    try:
        # Get DataFrame - either from file_path or current dataset
        data_store = ctx.context
        if file_path is None:
            if data_store and data_store.has("current_dataset"):
                df = data_store.get("current_dataset")
                logger.info("Creating visualization from current dataset in pipeline context")
            else:
                return "Error: No dataset loaded. Please load a dataset first using the load_dataset tool."
        else:
            df = load_or_get_dataframe(file_path, prefer_preprocessed=False, data_store=data_store)
            logger.info(f"Creating visualization from dataset: {file_path}")

        # Set style
        sns.set_style("whitegrid")
        plt.rcParams['figure.figsize'] = (12, 8)

        insights = []
        columns_plotted = []

        if plot_type == "distribution":
            # Distribution plots for numerical columns
            numerical_cols = df.select_dtypes(include=['number']).columns.tolist()

            if columns:
                numerical_cols = [col for col in columns if col in numerical_cols]

            if not numerical_cols:
                return "No numerical columns found for distribution plot"

            n_cols = min(len(numerical_cols), 6)  # Limit to 6 subplots
            n_rows = (n_cols + 2) // 3
            fig, axes = plt.subplots(n_rows, 3, figsize=(15, 5*n_rows))
            axes = axes.flatten() if n_cols > 1 else [axes]

            for i, col in enumerate(numerical_cols[:n_cols]):
                df[col].hist(bins=30, ax=axes[i], edgecolor='black')
                axes[i].set_title(f'Distribution of {col}')
                axes[i].set_xlabel(col)
                axes[i].set_ylabel('Frequency')

                # Generate insight
                skewness = df[col].skew()
                if abs(skewness) > 1:
                    insights.append(f"{col} shows {'right' if skewness > 0 else 'left'} skewness ({skewness:.2f})")

            # Hide empty subplots
            for i in range(n_cols, len(axes)):
                axes[i].set_visible(False)

            plt.tight_layout()
            columns_plotted = numerical_cols[:n_cols]

        elif plot_type == "correlation":
            # Correlation heatmap
            numerical_cols = df.select_dtypes(include=['number']).columns

            if columns:
                numerical_cols = [col for col in columns if col in numerical_cols]

            if len(numerical_cols) < 2:
                return "Need at least 2 numerical columns for correlation plot"

            corr_matrix = df[numerical_cols].corr()

            plt.figure(figsize=(12, 10))
            sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm',
                        center=0, square=True, linewidths=1)
            plt.title('Correlation Heatmap')
            plt.tight_layout()

            # Generate insights
            high_corr = []
            for i in range(len(corr_matrix.columns)):
                for j in range(i+1, len(corr_matrix.columns)):
                    corr_val = corr_matrix.iloc[i, j]
                    if abs(corr_val) > 0.7:
                        high_corr.append(f"{corr_matrix.columns[i]} & {corr_matrix.columns[j]}: {corr_val:.2f}")

            if high_corr:
                insights.append(f"Strong correlations found: {', '.join(high_corr[:3])}")

            columns_plotted = list(numerical_cols)

        elif plot_type == "scatter":
            # Scatter plot
            if not columns or len(columns) < 2:
                numerical_cols = df.select_dtypes(include=['number']).columns.tolist()
                if len(numerical_cols) < 2:
                    return "Need at least 2 numerical columns for scatter plot"
                columns = numerical_cols[:2]

            x_col, y_col = columns[0], columns[1]

            plt.figure(figsize=(10, 8))
            if target_column and target_column in df.columns:
                scatter = plt.scatter(df[x_col], df[y_col], c=df[target_column],
                                      cmap='viridis', alpha=0.6)
                plt.colorbar(scatter, label=target_column)
            else:
                plt.scatter(df[x_col], df[y_col], alpha=0.6)

            plt.xlabel(x_col)
            plt.ylabel(y_col)
            plt.title(f'Scatter Plot: {x_col} vs {y_col}')
            plt.tight_layout()

            # Generate insight
            correlation = df[x_col].corr(df[y_col])
            insights.append(f"Correlation between {x_col} and {y_col}: {correlation:.2f}")

            columns_plotted = [x_col, y_col]

        elif plot_type == "box":
            # Box plot for outlier detection
            numerical_cols = df.select_dtypes(include=['number']).columns.tolist()

            if columns:
                numerical_cols = [col for col in columns if col in numerical_cols]

            if not numerical_cols:
                return "No numerical columns found for box plot"

            n_cols = min(len(numerical_cols), 6)
            plt.figure(figsize=(15, 8))
            df[numerical_cols[:n_cols]].boxplot()
            plt.xticks(rotation=45)
            plt.title('Box Plot - Outlier Detection')
            plt.ylabel('Value')
            plt.tight_layout()

            # Generate insights
            for col in numerical_cols[:n_cols]:
                Q1 = df[col].quantile(0.25)
                Q3 = df[col].quantile(0.75)
                IQR = Q3 - Q1
                outliers = ((df[col] < Q1 - 1.5*IQR) | (df[col] > Q3 + 1.5*IQR)).sum()
                if outliers > 0:
                    insights.append(f"{col}: {outliers} outliers detected ({outliers/len(df)*100:.1f}%)")

            columns_plotted = numerical_cols[:n_cols]

        elif plot_type == "bar":
            # Bar chart for categorical data
            categorical_cols = df.select_dtypes(include=['object']).columns.tolist()

            if columns:
                categorical_cols = [col for col in columns if col in categorical_cols]

            if not categorical_cols:
                return "No categorical columns found for bar chart"

            col = categorical_cols[0]
            value_counts = df[col].value_counts().head(10)  # Top 10

            plt.figure(figsize=(12, 8))
            value_counts.plot(kind='bar')
            plt.title(f'Bar Chart: {col}')
            plt.xlabel(col)
            plt.ylabel('Count')
            plt.xticks(rotation=45, ha='right')
            plt.tight_layout()

            # Generate insight
            total = value_counts.sum()
            top_percent = value_counts.iloc[0] / total * 100
            insights.append(f"Most common value in {col}: '{value_counts.index[0]}' ({top_percent:.1f}%)")

            columns_plotted = [col]

        elif plot_type == "pairplot":
            # Pairwise relationships
            numerical_cols = df.select_dtypes(include=['number']).columns.tolist()

            if columns:
                numerical_cols = [col for col in columns if col in numerical_cols]

            # Limit to 5 columns to avoid too large plots
            numerical_cols = numerical_cols[:5]

            if len(numerical_cols) < 2:
                return "Need at least 2 numerical columns for pair plot"

            if target_column and target_column in df.columns:
                pairplot_df = df[numerical_cols + [target_column]]
                sns.pairplot(pairplot_df, hue=target_column)
            else:
                pairplot_df = df[numerical_cols]
                sns.pairplot(pairplot_df)

            plt.suptitle('Pairwise Relationships', y=1.02)

            insights.append(f"Pairplot created for {len(numerical_cols)} numerical columns")
            columns_plotted = numerical_cols

        else:
            return f"Unknown plot type: {plot_type}"

        # Save or encode plot
        if output_path:
            output_path = Path(output_path)
            output_path.parent.mkdir(parents=True, exist_ok=True)
            plt.savefig(output_path, dpi=300, bbox_inches='tight')
            plot_base64 = None
        else:
            # Encode to base64
            buffer = BytesIO()
            plt.savefig(buffer, format='png', dpi=150, bbox_inches='tight')
            buffer.seek(0)
            plot_base64 = base64.b64encode(buffer.read()).decode()
            buffer.close()

        plt.close('all')

        result = {
            "plot_type": plot_type,
            "columns_plotted": columns_plotted,
            "output_path": str(output_path) if output_path else None,
            "plot_base64": plot_base64,
            "insights": insights,
        }

        return result

    except Exception as e:
        plt.close('all')
        return f"Error creating visualization: {str(e)}"
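Companion note (not part of the published diff): when create_visualization is called without output_path, the docstring above says the plot comes back base64-encoded in the result dictionary. The helper below is a small sketch, under that assumption, for writing the payload to disk; the function and file names are illustrative.

import base64
from pathlib import Path


def save_plot_from_result(result: dict, destination: str = "plots/figure.png") -> Path:
    """Decode the tool's `plot_base64` field back into a PNG file."""
    encoded = result.get("plot_base64")
    if not encoded:
        raise ValueError(
            "Result carries no embedded plot; it was likely saved to output_path instead."
        )
    path = Path(destination)
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_bytes(base64.b64decode(encoded))
    return path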
agentz/utils/__init__.py
ADDED
@@ -0,0 +1,69 @@
"""
Utility helpers for the multi-agent data science system.

This package provides utilities for:
- Configuration management (config.py)
- Logging setup (logging.py)
- Rich terminal output (printer.py)
- JSON/output parsing (parsers.py)
- Miscellaneous helpers (helpers.py)
"""

# Configuration utilities
from agentz.utils.config import (
    load_json_config,
    save_json_config,
    merge_configs,
    get_env_with_prefix,
    load_config,
    get_agent_instructions,
    get_pipeline_settings,
    BaseConfig,
    load_mapping_from_path,
    get_api_key_from_env,
    resolve_config,
    load_pipeline_config,
    normalize_agents,
    get_agent_spec,
)

# Printer utilities
from agentz.utils.printer import Printer

# Parser utilities
from agentz.utils.parsers import (
    OutputParserError,
    parse_json_output,
    find_json_in_string,
    create_type_parser,
)

# Helper utilities
from agentz.utils.helpers import get_experiment_timestamp

__all__ = [
    # Config
    "load_json_config",
    "save_json_config",
    "merge_configs",
    "get_env_with_prefix",
    "load_config",
    "get_agent_instructions",
    "get_pipeline_settings",
    "BaseConfig",
    "load_mapping_from_path",
    "get_api_key_from_env",
    "resolve_config",
    "load_pipeline_config",
    "normalize_agents",
    "get_agent_spec",
    # Printer
    "Printer",
    # Parsers
    "OutputParserError",
    "parse_json_output",
    "find_json_in_string",
    "create_type_parser",
    # Helpers
    "get_experiment_timestamp",
]