mcp_automl-0.1.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
mcp_automl/__init__.py
ADDED
File without changes
mcp_automl/__main__.py
ADDED
mcp_automl/server.py
ADDED
@@ -0,0 +1,946 @@

import numpy as np
import pandas as pd
import uuid
import os
import json
import asyncio
import duckdb
import logging
import argparse
from pathlib import Path
from mcp.server.fastmcp import FastMCP, Context
from mcp.types import PromptMessage, TextContent
from pycaret.classification import setup as setup_clf, compare_models as compare_models_clf, pull as pull_clf, save_model as save_model_clf, load_model as load_model_clf, predict_model as predict_model_clf, get_config as get_config_clf
from pycaret.regression import setup as setup_reg, compare_models as compare_models_reg, pull as pull_reg, save_model as save_model_reg, load_model as load_model_reg, predict_model as predict_model_reg, get_config as get_config_reg

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Module-level configuration (set via argparse in main())
EXPERIMENT_DIR = "~/.mcp-automl/experiments"
DEFAULT_SESSION_ID = 42
QUERY_RESULT_LIMIT = 100
SUPPORTED_FILE_FORMATS = ('.csv', '.parquet', '.json')

mcp = FastMCP("mcp-automl")

class PandasJSONEncoder(json.JSONEncoder):
    """Custom JSON encoder that handles pandas NA types and numpy types."""

    def default(self, obj):
        # Handle pandas NA types (pd.NA, pd.NaT)
        if pd.isna(obj):
            return None
        # Handle numpy integer types
        if isinstance(obj, (np.integer, np.int64, np.int32)):
            return int(obj)
        # Handle numpy floating types
        if isinstance(obj, (np.floating, np.float64, np.float32)):
            if np.isnan(obj):
                return None
            return float(obj)
        # Handle numpy boolean
        if isinstance(obj, np.bool_):
            return bool(obj)
        # Handle numpy arrays
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        # Let the base class raise the TypeError
        return super().default(obj)
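
# Example: the encoder is supplied to json.dumps via `cls` (as in
# _inspect_data_sync below), so numpy scalars and pandas NA values serialize
# cleanly, e.g. json.dumps({"n": np.int64(3), "x": pd.NA}, cls=PandasJSONEncoder)
# yields '{"n": 3, "x": null}'.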

def _load_dataframe_fast(data_path: str, sample_size: int = None,
                         sample_method: str = 'reservoir') -> pd.DataFrame:
    """
    Unified data loader using DuckDB with optional sampling.

    This function provides:
    - Fast I/O using DuckDB for all file formats (CSV, Parquet, JSON)
    - Smart sampling for large files using reservoir sampling
    - Consistent dtype inference by always returning a pandas DataFrame
    - Flexible loading: full data for training, sampled data for inspection

    Args:
        data_path: Absolute path to the data file (CSV, Parquet, or JSON).
        sample_size: If provided, returns a random sample of this size.
                     If None, loads the entire dataset.
        sample_method: Sampling method to use (default: 'reservoir').
                       Currently only 'reservoir' is supported.

    Returns:
        pandas DataFrame with data loaded from the file.

    Raises:
        ValueError: If the file format is not supported or sample_method is invalid.
    """
    # Validate file format
    if not any(data_path.endswith(ext) for ext in SUPPORTED_FILE_FORMATS):
        supported = ', '.join(SUPPORTED_FILE_FORMATS)
        raise ValueError(f"Unsupported file format: {data_path}. Supported formats: {supported}")

    # Connect to DuckDB (in-memory)
    con = duckdb.connect(database=':memory:')

    # Full load for training (no sampling)
    if sample_size is None:
        logger.debug(f"Loading full dataset from {data_path}")
        return con.execute(f"SELECT * FROM '{data_path}'").df()

    # Check total row count to determine if sampling is needed
    total_rows = con.execute(f"SELECT COUNT(*) FROM '{data_path}'").fetchone()[0]
    logger.debug(f"File has {total_rows} total rows, sample_size={sample_size}")

    if total_rows <= sample_size:
        # File is small enough, just load everything
        logger.debug(f"File is small ({total_rows} <= {sample_size}), loading all rows")
        return con.execute(f"SELECT * FROM '{data_path}'").df()

    # Apply sampling for large files
    if sample_method == 'reservoir':
        logger.info(f"Applying reservoir sampling: {sample_size} rows from {total_rows} total")
        # Reservoir sampling gives a truly random sample
        return con.execute(f"""
            SELECT * FROM '{data_path}'
            USING SAMPLE reservoir({sample_size} ROWS)
        """).df()
    else:
        raise ValueError(f"Unknown sample_method: {sample_method}. Only 'reservoir' is supported.")
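
# Usage within this module: the training and prediction helpers call
# _load_dataframe_fast(path) for a full load, while _inspect_data_sync passes
# sample_size=10000 to cap the rows used for statistics on large files.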

def _get_feature_info(get_config_func, target_column: str) -> dict:
    """Extracts feature information from PyCaret config."""
    try:
        X_train = get_config_func('X_train')
        dataset = get_config_func('dataset')

        used_features = list(X_train.columns)
        all_cols = list(dataset.columns)

        # Deduce ignored features: in dataset but not in X_train and not the target
        ignored_features = [c for c in all_cols if c != target_column and c not in used_features]

        numeric_features = list(X_train.select_dtypes(include=np.number).columns)
        # Objects and categories are categorical
        categorical_features = list(X_train.select_dtypes(include=['object', 'category']).columns)

        return {
            "used_features": used_features,
            "ignored_features": ignored_features,
            "actual_numeric_features": numeric_features,
            "actual_categorical_features": categorical_features
        }
    except Exception as e:
        logger.error(f"Error extracting feature info: {e}", exc_info=True)
        return {}

def _get_feature_importances(model, get_config_func) -> dict:
    """Extracts feature importances from the model if available.

    Supports tree-based models (feature_importances_) and linear models (coef_).
    Returns a dict of {feature_name: importance} sorted by importance descending.
    """
    try:
        X_train = get_config_func('X_train')
        feature_names = list(X_train.columns)

        # Try tree-based models first (RF, XGBoost, LightGBM, etc.)
        if hasattr(model, 'feature_importances_'):
            importances = model.feature_importances_
            importance_dict = dict(zip(feature_names, [float(x) for x in importances]))
            # Sort by importance descending
            return dict(sorted(importance_dict.items(), key=lambda x: abs(x[1]), reverse=True))

        # Try linear models (LogisticRegression, Ridge, Lasso, etc.)
        if hasattr(model, 'coef_'):
            coef = model.coef_
            # For multi-class, coef_ has shape (n_classes, n_features); take the mean absolute value
            if len(coef.shape) > 1:
                importances = np.abs(coef).mean(axis=0)
            else:
                importances = np.abs(coef)
            importance_dict = dict(zip(feature_names, [float(x) for x in importances]))
            return dict(sorted(importance_dict.items(), key=lambda x: abs(x[1]), reverse=True))

        return {}
    except Exception as e:
        logger.warning(f"Could not extract feature importances: {e}")
        return {}

def _save_results(run_id: str, model, results: pd.DataFrame, save_model_func, metadata: dict, test_results: pd.DataFrame = None, feature_importances: dict = None) -> str:
    """Helper to save model and metrics.

    Args:
        run_id: Unique run identifier.
        model: The trained model object.
        results: DataFrame containing CV results (from pull()).
        save_model_func: Function to save the model.
        metadata: Dictionary of run configuration.
        test_results: DataFrame containing test/holdout results (from predict_model()).
        feature_importances: Optional dict of {feature_name: importance}.
    """
    # Create directory
    run_dir = os.path.join(EXPERIMENT_DIR, run_id)
    os.makedirs(run_dir, exist_ok=True)

    # Save model
    model_path = os.path.join(run_dir, "model")
    save_model_func(model, model_path)

    # Save CV metrics (best model)
    if not results.empty:
        metrics = results.iloc[0].to_dict()
    else:
        metrics = {}

    # Save test/holdout metrics. test_results is expected to be the output of
    # pull() after predict_model(), whose first row holds the holdout metrics.
    test_metrics = {}
    if test_results is not None and not test_results.empty:
        try:
            test_metrics = test_results.to_dict(orient='records')[0]
        except (KeyError, IndexError) as e:
            logger.warning(f"Could not extract test metrics: {e}")
            test_metrics = {}

    # Merge metadata into metrics for saving
    full_metadata = {**metadata, "cv_metrics": metrics, "test_metrics": test_metrics}

    metadata_path = os.path.join(run_dir, "metadata.json")
    with open(metadata_path, "w") as f:
        json.dump(full_metadata, f, indent=2)

    # Generate HTML report

    # Format lists for HTML
    def fmt_list(l):
        return ", ".join(l) if l else "None"

    # Separate metrics from configuration
    config_metadata = {k: v for k, v in metadata.items() if k not in metrics}

    # Generate config rows
    config_rows = ""
    for k, v in config_metadata.items():
        # Clean up keys for display
        display_key = k.replace("_", " ").title()

        # Format values
        if isinstance(v, list):
            display_val = fmt_list(v)
        else:
            display_val = str(v)

        config_rows += f'<div class="metadata-item"><strong>{display_key}:</strong> {display_val}</div>\n'

    html_content = f"""
    <html>
    <head>
        <title>Training Result - {run_id}</title>
        <style>
            body {{ font-family: monospace, sans-serif; margin: 20px; }}
            h1, h2 {{ color: #333; }}
            table {{ border-collapse: collapse; width: 100%; margin-bottom: 20px; }}
            th, td {{ border: 1px solid #ddd; padding: 8px; text-align: left; }}
            th {{ background-color: #f2f2f2; }}
            .metadata-item {{ margin-bottom: 5px; }}
        </style>
    </head>
    <body>
        <h1>Training Run: {run_id}</h1>

        <h2>Configuration</h2>
        <div class="metadata">
            {config_rows}
        </div>

        <h2>Model Metrics (CV)</h2>
        {results.to_html(classes='table', index=False) if not results.empty else "<p>No results available</p>"}

        <h2>Test/Holdout Metrics</h2>
        {test_results.to_html(classes='table', index=False) if test_results is not None and not test_results.empty else "<p>No test results available</p>"}

    </body>
    </html>
    """

    html_path = os.path.join(run_dir, "result.html")
    with open(html_path, "w") as f:
        f.write(html_content)

    return json.dumps({
        "run_id": run_id,
        "model_path": model_path + ".pkl",  # PyCaret appends .pkl
        "data_path": metadata.get("data_path"),
        "test_data_path": metadata.get("test_data_path"),
        "metadata": metrics,
        "test_metrics": test_metrics,
        "feature_importances": feature_importances if feature_importances else {},
        "report_path": html_path
    }, indent=2)
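
# Each run directory under EXPERIMENT_DIR therefore contains:
#   model.pkl       - the saved PyCaret pipeline
#   metadata.json   - run configuration plus cv_metrics/test_metrics
#   result.html     - the generated HTML report
#   predictions/    - created later by _predict_sync to hold prediction files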

def _train_classifier_sync(run_id: str, data_path: str, target_column: str, ignore_features: list[str], numeric_features: list[str], categorical_features: list[str],
                           ordinal_features: dict[str, list[str]], date_features: list[str], text_features: list[str], keep_features: list[str],
                           imputation_type: str, numeric_imputation: str, categorical_imputation: str,
                           fix_imbalance: bool, remove_outliers: bool, normalize: bool, normalize_method: str,
                           transformation: bool, transformation_method: str,
                           polynomial_features: bool, interaction_features: list[str], bin_numeric_features: list[str],
                           feature_selection: bool, feature_selection_method: str, n_features_to_select: float,
                           fold_strategy: str, fold: int, n_jobs: int, test_data_path: str = None, optimize: str = None,
                           include_models: list[str] = None, exclude_models: list[str] = None) -> str:
    """Synchronous helper for classifier training."""
    # Use unified loader for consistent dtypes
    data = _load_dataframe_fast(data_path)

    session_id = DEFAULT_SESSION_ID

    # Handle Test Data
    test_data = None
    if test_data_path:
        test_data = _load_dataframe_fast(test_data_path)

        # Ensure unique indices across train and test
        data.reset_index(drop=True, inplace=True)
        test_data.reset_index(drop=True, inplace=True)
        test_data.index = test_data.index + len(data)

    # Filter out None values to let PyCaret defaults take over where appropriate
    setup_params = {
        "data": data,
        "test_data": test_data,
        "target": target_column,
        "session_id": session_id,
        "verbose": False,
        "html": False,
        "ignore_features": ignore_features,
        "numeric_features": numeric_features,
        "categorical_features": categorical_features,
        "ordinal_features": ordinal_features,
        "date_features": date_features,
        "text_features": text_features,
        "keep_features": keep_features,
        "imputation_type": imputation_type,
        "numeric_imputation": numeric_imputation,
        "categorical_imputation": categorical_imputation,
        "fix_imbalance": fix_imbalance,
        "remove_outliers": remove_outliers,
        "normalize": normalize,
        "normalize_method": normalize_method,
        "transformation": transformation,
        "transformation_method": transformation_method,
        "polynomial_features": polynomial_features,
        "interaction_features": interaction_features,
        "bin_numeric_features": bin_numeric_features,
        "feature_selection": feature_selection,
        "feature_selection_method": feature_selection_method,
        "n_features_to_select": n_features_to_select,
        "fold_strategy": fold_strategy,
        "fold": fold,
        "n_jobs": n_jobs
    }
    # Remove None values
    setup_params = {k: v for k, v in setup_params.items() if v is not None}

    s = setup_clf(**setup_params)

    feature_info = _get_feature_info(get_config_clf, target_column)

    # Only pass sort if optimize is specified
    compare_kwargs = {"n_select": 1, "verbose": False}
    if optimize is not None:
        compare_kwargs["sort"] = optimize
    if include_models is not None:
        compare_kwargs["include"] = include_models
    if exclude_models is not None:
        compare_kwargs["exclude"] = exclude_models

    best_model = compare_models_clf(**compare_kwargs)
    if isinstance(best_model, list):
        if not best_model:
            raise ValueError("compare_models returned an empty list. Try relaxing constraints or collecting more data.")
        best_model = best_model[0]
    results = pull_clf()

    # Extract feature importances
    feature_importances = _get_feature_importances(best_model, get_config_clf)

    # Evaluate on holdout (test_data or split)
    predict_model_clf(best_model)
    test_results = pull_clf()

    metadata = {
        "data_path": data_path,
        "test_data_path": test_data_path,
        "target_column": target_column,
        "session_id": session_id,
        "task": "classification",
        "include_models": include_models,
        "exclude_models": exclude_models,
        **setup_params,  # Include all setup params in metadata
        **feature_info
    }
    # Remove DataFrames from metadata if they slipped in (they are not JSON serializable)
    if "data" in metadata: del metadata["data"]
    if "test_data" in metadata: del metadata["test_data"]

    return _save_results(run_id, best_model, results, save_model_clf, metadata, test_results, feature_importances)

@mcp.tool()
async def train_classifier(data_path: str, target_column: str, ctx: Context,
                           ignore_features: list[str] = None, numeric_features: list[str] = None, categorical_features: list[str] = None,
                           ordinal_features: dict[str, list[str]] = None, date_features: list[str] = None, text_features: list[str] = None, keep_features: list[str] = None,
                           imputation_type: str = "simple", numeric_imputation: str = "mean", categorical_imputation: str = "mode",
                           fix_imbalance: bool = False, remove_outliers: bool = False, normalize: bool = False, normalize_method: str = "zscore",
                           transformation: bool = False, transformation_method: str = "yeo-johnson",
                           polynomial_features: bool = False, interaction_features: list[str] = None, bin_numeric_features: list[str] = None,
                           feature_selection: bool = False, feature_selection_method: str = "classic", n_features_to_select: float = 0.2,
                           fold_strategy: str = "kfold", fold: int = 10, n_jobs: int = -1, test_data_path: str = None, optimize: str = None,
                           include_models: list[str] = None, exclude_models: list[str] = None) -> str:
    """
    Train a classification model using PyCaret with advanced configuration.

    - NOTE: Please use absolute paths for data_path and test_data_path to avoid path resolution errors.

    Args:
        data_path: Path to dataset (csv/parquet/json).
        target_column: Name of target column.
        test_data_path: Optional path to a specific test dataset. If provided, it is used for evaluation/holdout.
        optimize: Metric to optimize for (e.g., 'Accuracy', 'AUC', 'Recall', 'Precision', 'F1', 'Kappa', 'MCC'). Default is 'Accuracy'.
        include_models: List of model IDs to include in comparison (e.g., ['lr', 'dt', 'rf']). If None, all models are compared.
        exclude_models: List of model IDs to exclude from comparison (e.g., ['catboost']). If None, no models are excluded.
        ignore_features: Features to ignore.
        numeric_features: Features to treat as numeric.
        categorical_features: Features to treat as categorical.
        ordinal_features: Dictionary of ordinal features and their order (e.g., {'grade': ['low', 'medium', 'high']}).
        date_features: Features to treat as dates.
        text_features: Features to treat as text (for TF-IDF, etc.).
        keep_features: Features to ensure are kept.
        imputation_type: 'simple' or 'iterative' (default: 'simple').
        numeric_imputation: 'mean', 'median', 'mode' or an int/float (default: 'mean').
        categorical_imputation: 'mode' or a constant string (default: 'mode').
        fix_imbalance: If True, fix class imbalance in training data (default: False).
        remove_outliers: If True, remove outliers from training data (default: False).
        normalize: If True, scale features (default: False). Recommended for linear models.
        normalize_method: 'zscore', 'minmax', 'maxabs', 'robust' (default: 'zscore').
        transformation: If True, apply a Gaussian transformation to make data more normal (default: False).
        transformation_method: 'yeo-johnson' or 'quantile' (default: 'yeo-johnson').
        polynomial_features: If True, create polynomial features (default: False).
        interaction_features: List of features to create interactions for.
        bin_numeric_features: List of numeric features to bin into categories.
        feature_selection: If True, select best features (default: False).
        feature_selection_method: 'classic', 'univariate', 'sequential' (default: 'classic').
        n_features_to_select: Fraction (0.0-1.0) or number of features to select (default: 0.2).
        fold_strategy: 'kfold', 'stratifiedkfold', 'groupkfold', 'timeseries' (default: 'kfold').
        fold: Number of folds (default: 10).
        n_jobs: Number of jobs to run in parallel (-1 for all cores).

    Returns:
        JSON string with run_id, model_path, metrics, feature_importances, and report_path.
    """
    try:
        run_id = str(uuid.uuid4())
        await ctx.report_progress(0, 100)
        await ctx.info(f"Starting advanced classification training run {run_id}")

        result = await asyncio.to_thread(
            _train_classifier_sync,
            run_id, data_path, target_column, ignore_features, numeric_features, categorical_features,
            ordinal_features, date_features, text_features, keep_features,
            imputation_type, numeric_imputation, categorical_imputation,
            fix_imbalance, remove_outliers, normalize, normalize_method,
            transformation, transformation_method,
            polynomial_features, interaction_features, bin_numeric_features,
            feature_selection, feature_selection_method, n_features_to_select,
            fold_strategy, fold, n_jobs, test_data_path, optimize,
            include_models, exclude_models
        )

        await ctx.report_progress(100, 100)
        await ctx.info(f"Finished classification training run {run_id}")
        return result
    except Exception as e:
        return f"Error training classifier: {str(e)}"

def _train_regressor_sync(run_id: str, data_path: str, target_column: str, ignore_features: list[str], numeric_features: list[str], categorical_features: list[str],
                          ordinal_features: dict[str, list[str]], date_features: list[str], text_features: list[str], keep_features: list[str],
                          imputation_type: str, numeric_imputation: str, categorical_imputation: str,
                          remove_outliers: bool, normalize: bool, normalize_method: str,
                          transformation: bool, transformation_method: str,
                          polynomial_features: bool, interaction_features: list[str], bin_numeric_features: list[str],
                          feature_selection: bool, feature_selection_method: str, n_features_to_select: float,
                          fold_strategy: str, fold: int, n_jobs: int, test_data_path: str = None, optimize: str = "R2",
                          include_models: list[str] = None, exclude_models: list[str] = None) -> str:
    """Synchronous helper for regressor training."""
    # Use unified loader for consistent dtypes
    data = _load_dataframe_fast(data_path)

    session_id = DEFAULT_SESSION_ID

    # Handle Test Data
    test_data = None
    if test_data_path:
        test_data = _load_dataframe_fast(test_data_path)

        # Ensure unique indices across train and test
        data.reset_index(drop=True, inplace=True)
        test_data.reset_index(drop=True, inplace=True)
        test_data.index = test_data.index + len(data)

    # Filter out None values to let PyCaret defaults take over where appropriate
    setup_params = {
        "data": data,
        "test_data": test_data,
        "target": target_column,
        "session_id": session_id,
        "verbose": False,
        "html": False,
        "ignore_features": ignore_features,
        "numeric_features": numeric_features,
        "categorical_features": categorical_features,
        "ordinal_features": ordinal_features,
        "date_features": date_features,
        "text_features": text_features,
        "keep_features": keep_features,
        "imputation_type": imputation_type,
        "numeric_imputation": numeric_imputation,
        "categorical_imputation": categorical_imputation,
        "remove_outliers": remove_outliers,
        "normalize": normalize,
        "normalize_method": normalize_method,
        "transformation": transformation,
        "transformation_method": transformation_method,
        "polynomial_features": polynomial_features,
        "interaction_features": interaction_features,
        "bin_numeric_features": bin_numeric_features,
        "feature_selection": feature_selection,
        "feature_selection_method": feature_selection_method,
        "n_features_to_select": n_features_to_select,
        "fold_strategy": fold_strategy,
        "fold": fold,
        "n_jobs": n_jobs
    }
    # Remove None values
    setup_params = {k: v for k, v in setup_params.items() if v is not None}

    s = setup_reg(**setup_params)

    feature_info = _get_feature_info(get_config_reg, target_column)

    # Only pass sort if optimize is specified
    compare_kwargs = {"n_select": 1, "verbose": False}
    if optimize is not None:
        compare_kwargs["sort"] = optimize
    if include_models is not None:
        compare_kwargs["include"] = include_models
    if exclude_models is not None:
        compare_kwargs["exclude"] = exclude_models

    best_model = compare_models_reg(**compare_kwargs)
    if isinstance(best_model, list):
        if not best_model:
            raise ValueError("compare_models returned an empty list. Try relaxing constraints or collecting more data.")
        best_model = best_model[0]
    results = pull_reg()

    # Extract feature importances
    feature_importances = _get_feature_importances(best_model, get_config_reg)

    # Evaluate on holdout
    predict_model_reg(best_model)
    test_results = pull_reg()

    metadata = {
        "data_path": data_path,
        "test_data_path": test_data_path,
        "target_column": target_column,
        "session_id": session_id,
        "task": "regression",
        "include_models": include_models,
        "exclude_models": exclude_models,
        **setup_params,  # Include all setup params in metadata
        **feature_info
    }
    # Remove DataFrames from metadata if they slipped in (they are not JSON serializable)
    if "data" in metadata: del metadata["data"]
    if "test_data" in metadata: del metadata["test_data"]

    return _save_results(run_id, best_model, results, save_model_reg, metadata, test_results, feature_importances)

@mcp.tool()
async def train_regressor(data_path: str, target_column: str, ctx: Context,
                          ignore_features: list[str] = None, numeric_features: list[str] = None, categorical_features: list[str] = None,
                          ordinal_features: dict[str, list[str]] = None, date_features: list[str] = None, text_features: list[str] = None, keep_features: list[str] = None,
                          imputation_type: str = "simple", numeric_imputation: str = "mean", categorical_imputation: str = "mode",
                          remove_outliers: bool = False, normalize: bool = False, normalize_method: str = "zscore",
                          transformation: bool = False, transformation_method: str = "yeo-johnson",
                          polynomial_features: bool = False, interaction_features: list[str] = None, bin_numeric_features: list[str] = None,
                          feature_selection: bool = False, feature_selection_method: str = "classic", n_features_to_select: float = 0.2,
                          fold_strategy: str = "kfold", fold: int = 10, n_jobs: int = -1, test_data_path: str = None, optimize: str = "R2",
                          include_models: list[str] = None, exclude_models: list[str] = None) -> str:
    """
    Train a regression model using PyCaret with advanced configuration.

    - NOTE: Please use absolute paths for data_path and test_data_path to avoid path resolution errors.

    Args:
        data_path: Path to dataset (csv/parquet/json).
        target_column: Name of target column.
        test_data_path: Optional path to a specific test dataset. If provided, it is used for evaluation/holdout.
        optimize: Metric to optimize for (e.g., 'R2', 'RMSE', 'MAE', 'MSE', 'RMSLE', 'MAPE'). Default is 'R2'.
        include_models: List of model IDs to include in comparison (e.g., ['lr', 'dt', 'rf']). If None, all models are compared.
        exclude_models: List of model IDs to exclude from comparison (e.g., ['catboost']). If None, no models are excluded.
        ignore_features: Features to ignore.
        numeric_features: Features to treat as numeric.
        categorical_features: Features to treat as categorical.
        ordinal_features: Dictionary of ordinal features and their order.
        date_features: Features to treat as dates.
        text_features: Features to treat as text (for TF-IDF, etc.).
        keep_features: Features to ensure are kept.
        imputation_type: 'simple' or 'iterative' (default: 'simple').
        numeric_imputation: 'mean', 'median', 'mode' or an int/float (default: 'mean').
        categorical_imputation: 'mode' or a constant string (default: 'mode').
        remove_outliers: If True, remove outliers from training data (default: False).
        normalize: If True, scale features (default: False). Recommended for linear models.
        normalize_method: 'zscore', 'minmax', 'maxabs', 'robust' (default: 'zscore').
        transformation: If True, apply a Gaussian transformation to make data more normal (default: False).
        transformation_method: 'yeo-johnson' or 'quantile' (default: 'yeo-johnson').
        polynomial_features: If True, create polynomial features (default: False).
        interaction_features: List of features to create interactions for.
        bin_numeric_features: List of numeric features to bin into categories.
        feature_selection: If True, select best features (default: False).
        feature_selection_method: 'classic', 'univariate', 'sequential' (default: 'classic').
        n_features_to_select: Fraction (0.0-1.0) or number of features to select (default: 0.2).
        fold_strategy: 'kfold', 'stratifiedkfold', 'groupkfold', 'timeseries' (default: 'kfold').
        fold: Number of folds (default: 10).
        n_jobs: Number of jobs to run in parallel (-1 for all cores).

    Returns:
        JSON string with run_id, model_path, metrics, feature_importances, and report_path.
    """
    try:
        run_id = str(uuid.uuid4())
        await ctx.report_progress(0, 100)
        await ctx.info(f"Starting advanced regression training run {run_id}")

        result = await asyncio.to_thread(
            _train_regressor_sync,
            run_id, data_path, target_column, ignore_features, numeric_features, categorical_features,
            ordinal_features, date_features, text_features, keep_features,
            imputation_type, numeric_imputation, categorical_imputation,
            remove_outliers, normalize, normalize_method,
            transformation, transformation_method,
            polynomial_features, interaction_features, bin_numeric_features,
            feature_selection, feature_selection_method, n_features_to_select,
            fold_strategy, fold, n_jobs, test_data_path, optimize,
            include_models, exclude_models
        )

        await ctx.report_progress(100, 100)
        await ctx.info(f"Finished regression training run {run_id}")
        return result
    except Exception as e:
        return f"Error training regressor: {str(e)}"

def _predict_sync(run_id: str, data_path: str) -> str:
    """Synchronous helper for predictions."""
    run_dir = os.path.join(EXPERIMENT_DIR, run_id)
    metadata_path = os.path.join(run_dir, "metadata.json")

    if not os.path.exists(metadata_path):
        return f"Error: Run ID {run_id} not found."

    with open(metadata_path, "r") as f:
        metadata = json.load(f)

    task = metadata.get("task")
    model_path = os.path.join(run_dir, "model")

    # Load data using unified loader
    if not os.path.exists(data_path):
        return f"Error: Data file not found at {data_path}"

    try:
        input_data = _load_dataframe_fast(data_path)
    except Exception as e:
        return f"Error loading data file: {str(e)}"

    if task == "classification":
        model = load_model_clf(model_path)
        predictions = predict_model_clf(model, data=input_data)
    elif task == "regression":
        model = load_model_reg(model_path)
        predictions = predict_model_reg(model, data=input_data)
    else:
        return f"Error: Unknown task type '{task}' in metadata."

    # Save predictions
    predictions_dir = os.path.join(run_dir, "predictions")
    os.makedirs(predictions_dir, exist_ok=True)

    prediction_id = str(uuid.uuid4())
    prediction_file = f"prediction_{prediction_id}.json"
    prediction_path = os.path.join(predictions_dir, prediction_file)

    predictions.to_json(prediction_path, orient="records", indent=2)

    return prediction_path

@mcp.tool()
async def predict(run_id: str, data_path: str, ctx: Context = None) -> str:
    """
    Make predictions using a trained model.

    - NOTE: Please use absolute paths for data_path to avoid path resolution errors.

    Args:
        run_id: The ID of the training run (returned by train_classifier or train_regressor).
        data_path: The path to the CSV, Parquet, or JSON file containing the input data.

    Returns:
        The absolute path to the JSON file containing the predictions.
    """
    try:
        if ctx:
            await ctx.report_progress(0, 100)
            await ctx.info("Loading model and making predictions...")

        result = await asyncio.to_thread(_predict_sync, run_id, data_path)

        if ctx:
            await ctx.report_progress(100, 100)
            await ctx.info("Prediction complete")

        return result
    except Exception as e:
        logger.error(f"Error making predictions: {e}", exc_info=True)
        return f"Error making predictions: {str(e)}"

def _inspect_data_sync(data_path: str, n_rows: int = 5) -> str:
    """Synchronous helper for data inspection using the unified loader."""
    # Use unified loader with sampling for large files:
    # sample size of 10,000 rows for statistics computation.
    SAMPLE_SIZE = 10000

    # Get total row count first
    con = duckdb.connect(database=':memory:')
    total_rows = con.execute(f"SELECT COUNT(*) FROM '{data_path}'").fetchone()[0]

    # Load data using unified loader (with sampling if needed)
    data = _load_dataframe_fast(data_path, sample_size=SAMPLE_SIZE)

    # Structure
    structure = {
        "rows": total_rows,  # Report actual total
        "columns": len(data.columns),
        "column_names": list(data.columns),
        "dtypes": data.dtypes.astype(str).to_dict()
    }

    # Statistics (computed on sample if file is large)
    stats = {
        "missing_values": data.isnull().sum().to_dict(),
        "missing_ratio": (data.isnull().sum() / len(data)).to_dict(),
        "unique_values": data.nunique().to_dict()
    }

    # Add note if sampling was used
    if total_rows > SAMPLE_SIZE:
        stats["⚠️ note"] = f"Statistics computed on {SAMPLE_SIZE} row sample from {total_rows} total rows"

    # Previews (column-oriented)
    # For previews, always take the first/last rows from the original file
    preview_df = con.execute(f"SELECT * FROM '{data_path}' LIMIT {n_rows}").df()
    tail_df = con.execute(f"""
        SELECT * FROM '{data_path}'
        OFFSET {max(0, total_rows - n_rows)}
    """).df()

    seed = 42
    previews = {
        "head": preview_df.to_dict(orient="list"),
        "tail": tail_df.to_dict(orient="list"),
        "sample": data.sample(min(n_rows, len(data)), random_state=seed).to_dict(orient="list")
    }

    return json.dumps({
        "structure": structure,
        "statistics": stats,
        "previews": previews
    }, indent=2, cls=PandasJSONEncoder)

@mcp.tool()
async def inspect_data(data_path: str, n_rows: int = 5, ctx: Context = None) -> str:
    """
    Get comprehensive statistics and a preview of the dataset to understand its quality and structure.
    Use this to check for missing values, unique counts, and basic data types.

    - NOTE: Please use absolute paths for data_path to avoid path resolution errors.
    - NOTE: For files larger than 10,000 rows, statistics are computed on a random sample for performance.

    Args:
        data_path: Path to the CSV, Parquet, or JSON file.
        n_rows: Number of rows to show in head/tail/sample previews (default: 5).

    Returns:
        JSON string containing structure, stats, and previews.
    """
    try:
        if ctx:
            await ctx.report_progress(0, 100)
            await ctx.info(f"Inspecting data from {data_path}")

        result = await asyncio.to_thread(_inspect_data_sync, data_path, n_rows)

        if ctx:
            await ctx.report_progress(100, 100)
            await ctx.info("Data inspection complete")

        return result
    except Exception as e:
        logger.error(f"Error inspecting data: {e}", exc_info=True)
        return f"Error inspecting data: {str(e)}"

def _query_data_sync(query: str) -> str:
    """Synchronous helper for DuckDB queries."""
    con = duckdb.connect(database=':memory:')

    # Result size control: rather than parsing the SQL to enforce a LIMIT,
    # truncate the result DataFrame, which is safer and simpler.
    df = con.execute(query).df()

    if len(df) > QUERY_RESULT_LIMIT:
        df = df.head(QUERY_RESULT_LIMIT)

    return df.to_json(orient="records", date_format="iso")

@mcp.tool()
async def query_data(query: str, ctx: Context = None) -> str:
    """
    Execute a DuckDB SQL query on data files (CSV, Parquet, JSON) to gain deeper insights.

    CRITICAL: This is your PRIMARY tool for advanced data exploration.
    - Use this to aggregate data (GROUP BY), join multiple files, calculate derived metrics, or inspect specific subsets.
    - Prefer this over 'inspect_data' when you need to answer specific questions about the data distribution or relationships.
    - You can query files directly in the FROM clause, e.g., "SELECT category, AVG(price) FROM 'data.csv' GROUP BY category".

    - NOTE: Please use absolute paths for files in your FROM clause to avoid path resolution errors.

    Args:
        query: Standard DuckDB SQL query.

    Returns:
        JSON string containing the query results (limit 100 rows).
    """
    try:
        if ctx:
            await ctx.report_progress(0, 100)
            await ctx.info("Executing query...")

        result = await asyncio.to_thread(_query_data_sync, query)

        if ctx:
            await ctx.report_progress(100, 100)
            await ctx.info("Query complete")

        return result
    except Exception as e:
        logger.error(f"Error executing query: {e}", exc_info=True)
        return f"Error executing query: {str(e)}"

def _process_data_sync(query: str, output_path: str) -> str:
    """Synchronous helper for process_data."""
    try:
        con = duckdb.connect(database=':memory:')

        df = con.execute(query).df()

        if output_path.endswith(".csv"):
            df.to_csv(output_path, index=False)
        elif output_path.endswith(".parquet"):
            df.to_parquet(output_path, index=False)
        elif output_path.endswith(".json"):
            df.to_json(output_path, orient="records", indent=2)
        else:
            return "Error: Output path must end with .csv, .parquet, or .json"

        return f"Successfully processed data and saved to {output_path}. Rows: {len(df)}"
    except Exception as e:
        return f"Error processing data: {str(e)}"

@mcp.tool()
async def process_data(query: str, output_path: str, ctx: Context) -> str:
    """
    Execute a DuckDB SQL query to transform data and save it to a new file.

    CRITICAL: This is your PRIMARY tool for Feature Engineering and Data Cleaning.
    - Use this to create new features, clean dirty data, handle missing values (COALESCE), or join datasets.
    - You MUST use this tool to prepare the data before training if feature engineering is needed.
    - Example: "SELECT *, price/sqft as price_per_sqft, COALESCE(garage, 0) as garage_clean FROM 'train.csv'"

    IMPORTANT: It is strongly RECOMMENDED to use the '.parquet' extension for output_path (e.g. 'clean_data.parquet').
    - Parquet preserves data types (int, float, string, date) much better than CSV.
    - CSV often loses type information (everything becomes a string or is inferred incorrectly).

    - NOTE: Please use absolute paths for files in your query and for output_path to avoid path resolution errors.

    Args:
        query: Standard DuckDB SQL query.
        output_path: Absolute path to save the result (must be .csv, .parquet, or .json).

    Returns:
        Confirmation message with the output path.
    """
    try:
        await ctx.report_progress(0, 100)
        await ctx.info("Starting data processing task...")

        result = await asyncio.to_thread(_process_data_sync, query, output_path)

        await ctx.report_progress(100, 100)
        await ctx.info("Finished data processing task.")
        return result
    except Exception as e:
        return f"Error in process_data: {str(e)}"

def main():
    """Main entry point with argument parsing."""
    global EXPERIMENT_DIR, DEFAULT_SESSION_ID

    parser = argparse.ArgumentParser(
        description='MCP PyCaret Server - AutoML service using PyCaret',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument(
        '--experiment-dir',
        type=str,
        default=EXPERIMENT_DIR,
        help='Directory to store experiment results and trained models'
    )
    parser.add_argument(
        '--session-id',
        type=int,
        default=DEFAULT_SESSION_ID,
        help='Random seed for reproducibility'
    )

    args = parser.parse_args()

    # Update module-level configuration
    EXPERIMENT_DIR = os.path.expanduser(args.experiment_dir)
    DEFAULT_SESSION_ID = args.session_id

    # Ensure experiment directory exists
    Path(EXPERIMENT_DIR).mkdir(parents=True, exist_ok=True)

    logger.info("Starting MCP PyCaret Server")
    logger.info(f"Experiment directory: {EXPERIMENT_DIR}")
    logger.info(f"Session ID: {DEFAULT_SESSION_ID}")

    mcp.run()

if __name__ == "__main__":
    main()

mcp_automl-0.1.0.dist-info/METADATA
ADDED
@@ -0,0 +1,90 @@
Metadata-Version: 2.4
Name: mcp-automl
Version: 0.1.0
Summary: MCP server for end-to-end machine learning
Requires-Python: >=3.11
Requires-Dist: duckdb>=1.4.3
Requires-Dist: joblib<1.4
Requires-Dist: mcp>=1.21.2
Requires-Dist: pandas<2.2.0
Requires-Dist: pycaret>=3.0.0
Requires-Dist: scikit-learn<1.4
Requires-Dist: tabulate>=0.9.0
Description-Content-Type: text/markdown

# MCP AutoML

MCP AutoML is a server that enables AI Agents to perform end-to-end machine learning workflows, including data inspection, data processing, and model training. With MCP AutoML, AI Agents can do more than a typical AutoML framework: they can identify the target, establish a baseline, and create features on their own.

MCP AutoML separates tools from workflows, allowing you to create your own workflow.

## Features

- **Data Inspection**: Analyze datasets with comprehensive statistics, data types, and previews
- **SQL-based Data Processing**: Transform and engineer features using DuckDB SQL queries (see the sketch after this list)
- **AutoML Training**: Train classification and regression models with automatic model comparison using PyCaret
- **Prediction**: Make predictions using trained models
- **Multi-format Support**: Works with CSV, Parquet, and JSON files
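
As a sketch of the SQL-based processing, the snippet below shows the kind of query an agent could send through the `process_data` tool; the file path and column names are illustrative placeholders:

```python
# Hypothetical feature-engineering query for process_data;
# 'train.csv', price, sqft, and garage are placeholder names.
query = """
    SELECT *,
           price / NULLIF(sqft, 0) AS price_per_sqft,
           COALESCE(garage, 0)     AS garage_clean
    FROM '/abs/path/train.csv'
"""
# process_data(query=query, output_path='/abs/path/clean_train.parquet')
```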

## Usage

### Configure MCP Server

Add to your MCP client configuration (e.g., Claude Desktop, Gemini CLI, Cursor, Antigravity):

```json
{
  "mcpServers": {
    "mcp-automl": {
      "command": "uvx",
      "args": ["--from", "git+https://github.com/idea7766/mcp-automl", "mcp-automl"]
    }
  }
}
```

### Available Tools

| Tool | Description |
|------|-------------|
| `inspect_data` | Get comprehensive statistics and a preview of a dataset |
| `query_data` | Execute DuckDB SQL queries on data files |
| `process_data` | Transform data using SQL and save to a new file |
| `train_classifier` | Train a classification model with AutoML |
| `train_regressor` | Train a regression model with AutoML |
| `predict` | Make predictions using a trained model |
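
A typical session chains these tools together. The sketch below shows the shape of the calls; paths, column names, and the run ID are placeholders, and an MCP client issues the calls on the agent's behalf:

```python
# Illustrative tool-call sequence (arguments as exposed by the server):
# 1. inspect_data(data_path="/abs/path/train.parquet")
# 2. query_data(query="SELECT target, COUNT(*) FROM '/abs/path/train.parquet' GROUP BY target")
# 3. train_classifier(data_path="/abs/path/train.parquet", target_column="target", optimize="F1")
#    -> JSON with run_id, model_path, CV/test metrics, and report_path
# 4. predict(run_id="<run_id from step 3>", data_path="/abs/path/new.parquet")
#    -> path to a JSON file of predictions
```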

## Agent Skill

MCP AutoML includes a **data science workflow skill** that guides AI agents through best practices for machine learning projects. This skill teaches agents to:

- Identify targets and establish baselines
- Perform exploratory data analysis
- Engineer domain-specific features
- Train and evaluate models systematically

### Installing the Skill

Copy the skill directory to your agent's skill folder:

```bash
# For Gemini Code Assist
cp -r skill/data-science-workflow ~/.gemini/skills/

# For Claude Code
cp -r skill/data-science-workflow ~/.claude/skills/

# For other agents, copy to their respective skill directories
```

The skill file is located at `skill/data-science-workflow/SKILL.md`.

## Configuration

Models and experiments are saved to `~/.mcp-automl/experiments/` by default. The experiment directory and the random seed can be overridden with the server's `--experiment-dir` and `--session-id` command-line flags.
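
For example, the MCP configuration shown earlier can pass these flags; the directory and seed values are illustrative:

```json
{
  "mcpServers": {
    "mcp-automl": {
      "command": "uvx",
      "args": [
        "--from", "git+https://github.com/idea7766/mcp-automl", "mcp-automl",
        "--experiment-dir", "/data/automl-experiments",
        "--session-id", "123"
      ]
    }
  }
}
```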

## Dependencies

- [PyCaret](https://pycaret.org/) - AutoML library
- [DuckDB](https://duckdb.org/) - Fast SQL analytics
- [MCP](https://github.com/modelcontextprotocol/python-sdk) - Model Context Protocol SDK

mcp_automl-0.1.0.dist-info/RECORD
ADDED
@@ -0,0 +1,7 @@
mcp_automl/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
mcp_automl/__main__.py,sha256=ncTRY5zgcNOS7JcLnCVhPd9KsxjyREC245P2eo33BuI,74
mcp_automl/server.py,sha256=rk8mQFSm-Y-p5-6DqvPkdiUN6WQrC7jXGXTb4Byedgw,42435
mcp_automl-0.1.0.dist-info/METADATA,sha256=HEkgZj9ePTMKUGsnWYeQrldrE5KKptv4wz_fV43sBPA,3006
mcp_automl-0.1.0.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
mcp_automl-0.1.0.dist-info/entry_points.txt,sha256=7QuAE_HatGpFE7Ul7hqNHmpaMf0Ug86aFkaCXofjhLg,54
mcp_automl-0.1.0.dist-info/RECORD,,