mcp-automl 0.1.5__tar.gz → 0.1.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {mcp_automl-0.1.5 → mcp_automl-0.1.7}/PKG-INFO +3 -2
- {mcp_automl-0.1.5 → mcp_automl-0.1.7}/pyproject.toml +3 -3
- {mcp_automl-0.1.5 → mcp_automl-0.1.7}/src/mcp_automl/server.py +45 -8
- mcp_automl-0.1.7/tests/test_formats.py +451 -0
- {mcp_automl-0.1.5 → mcp_automl-0.1.7}/uv.lock +47 -8
- {mcp_automl-0.1.5 → mcp_automl-0.1.7}/.dockerignore +0 -0
- {mcp_automl-0.1.5 → mcp_automl-0.1.7}/.github/workflows/docker-publish.yml +0 -0
- {mcp_automl-0.1.5 → mcp_automl-0.1.7}/.github/workflows/publish.yml +0 -0
- {mcp_automl-0.1.5 → mcp_automl-0.1.7}/.github/workflows/test.yml +0 -0
- {mcp_automl-0.1.5 → mcp_automl-0.1.7}/.gitignore +0 -0
- {mcp_automl-0.1.5 → mcp_automl-0.1.7}/.python-version +0 -0
- {mcp_automl-0.1.5 → mcp_automl-0.1.7}/Dockerfile +0 -0
- {mcp_automl-0.1.5 → mcp_automl-0.1.7}/LICENSE +0 -0
- {mcp_automl-0.1.5 → mcp_automl-0.1.7}/README.md +0 -0
- {mcp_automl-0.1.5 → mcp_automl-0.1.7}/skill/data-science-workflow/SKILL.md +0 -0
- {mcp_automl-0.1.5 → mcp_automl-0.1.7}/src/mcp_automl/__init__.py +0 -0
- {mcp_automl-0.1.5 → mcp_automl-0.1.7}/src/mcp_automl/__main__.py +0 -0
- {mcp_automl-0.1.5 → mcp_automl-0.1.7}/tests/test_server.py +0 -0

{mcp_automl-0.1.5 → mcp_automl-0.1.7}/PKG-INFO

@@ -1,14 +1,15 @@
 Metadata-Version: 2.4
 Name: mcp-automl
-Version: 0.1.5
+Version: 0.1.7
 Summary: MCP server for end-to-end machine learning
 Author-email: ke <idea7766@gmail.com>
 License-File: LICENSE
 Requires-Python: <3.12,>=3.10
-Requires-Dist: duckdb>=1.4.3
+Requires-Dist: duckdb[all]>=1.4.3
 Requires-Dist: joblib<1.4
 Requires-Dist: mcp>=1.21.2
 Requires-Dist: pandas<2.2.0
+Requires-Dist: pyarrow>=23.0.0
 Requires-Dist: pycaret>=3.0.0
 Requires-Dist: scikit-learn<1.4
 Requires-Dist: tabulate>=0.9.0

{mcp_automl-0.1.5 → mcp_automl-0.1.7}/pyproject.toml

@@ -1,14 +1,15 @@
 [project]
 name = "mcp-automl"
-version = "0.1.5"
+version = "0.1.7"
 description = "MCP server for end-to-end machine learning"
 readme = "README.md"
 requires-python = ">=3.10,<3.12"
 dependencies = [
-    "duckdb>=1.4.3",
+    "duckdb[all]>=1.4.3",
     "joblib<1.4",
     "mcp>=1.21.2",
     "pandas<2.2.0",
+    "pyarrow>=23.0.0",
     "pycaret>=3.0.0",
     "scikit-learn<1.4",
     "tabulate>=0.9.0",

@@ -30,5 +31,4 @@ package = true
 [dependency-groups]
 dev = [
     "pytest-asyncio>=1.3.0",
-    "pyarrow>=14.0.0",
 ]
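
The dependency changes above promote pyarrow from the dev group to a runtime requirement and widen duckdb to its [all] extra. One plausible reading (an inference, not stated in the diff) is that Parquet handling in pandas needs an engine such as pyarrow at runtime, which the new format tests below exercise. A minimal sketch of that dependency, not code from the package:

import pandas as pd

# pandas delegates Parquet I/O to an engine (pyarrow here); without one,
# to_parquet()/read_parquet() raise ImportError at runtime.
df = pd.DataFrame({"int_col": [1, 2, 3], "target": [0, 1, 0]})
df.to_parquet("data.parquet", index=False)
print(pd.read_parquet("data.parquet").shape)  # (3, 2)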

{mcp_automl-0.1.5 → mcp_automl-0.1.7}/src/mcp_automl/server.py

@@ -10,8 +10,8 @@ import argparse
 from pathlib import Path
 from mcp.server.fastmcp import FastMCP, Context
 from mcp.types import PromptMessage, TextContent
-from pycaret.classification import setup as setup_clf, compare_models as compare_models_clf, pull as pull_clf, save_model as save_model_clf, load_model as load_model_clf, predict_model as predict_model_clf, get_config as get_config_clf
-from pycaret.regression import setup as setup_reg, compare_models as compare_models_reg, pull as pull_reg, save_model as save_model_reg, load_model as load_model_reg, predict_model as predict_model_reg, get_config as get_config_reg
+from pycaret.classification import setup as setup_clf, compare_models as compare_models_clf, pull as pull_clf, save_model as save_model_clf, load_model as load_model_clf, predict_model as predict_model_clf, get_config as get_config_clf, tune_model as tune_model_clf, finalize_model as finalize_model_clf
+from pycaret.regression import setup as setup_reg, compare_models as compare_models_reg, pull as pull_reg, save_model as save_model_reg, load_model as load_model_reg, predict_model as predict_model_reg, get_config as get_config_reg, tune_model as tune_model_reg, finalize_model as finalize_model_reg

 # Configure logging
 logging.basicConfig(

@@ -363,12 +363,31 @@ def _train_classifier_sync(run_id: str, data_path: str, target_column: str, igno
         best_model = best_model[0]
     results = pull_clf()

-    #
+    # Tune Model
+    logger.info("Tuning best model with Optuna...")
+    try:
+        best_model = tune_model_clf(best_model, optimize=optimize, search_library="optuna", n_trials=10)
+        results = pull_clf()
+    except Exception as e:
+        logger.warning(f"Tuning failed: {e}. Proceeding with untuned model.")
+
+
+    # Extract feature importances (from the potentially tuned model)
     feature_importances = _get_feature_importances(best_model, get_config_clf)

-    # Evaluate on holdout (
-
-
+    # Evaluate on holdout (test_data_path)
+    test_results = None
+    if test_data_path:
+        logger.info("Evaluating on provided test data...")
+        predict_model_clf(best_model)
+        test_results = pull_clf()
+
+    # Finalize Model
+    logger.info("Finalizing model on all data...")
+    try:
+        best_model = finalize_model_clf(best_model)
+    except Exception as e:
+        logger.warning(f"Finalization failed: {e}. Saving non-finalized model.")

     metadata = {
         "data_path": data_path,

@@ -543,12 +562,30 @@ def _train_regressor_sync(run_id: str, data_path: str, target_column: str, ignor
         best_model = best_model[0]
     results = pull_reg()

+    # Tune Model
+    logger.info("Tuning best model with Optuna...")
+    try:
+        best_model = tune_model_reg(best_model, optimize=optimize, search_library="optuna", n_trials=10)
+        results = pull_reg()
+    except Exception as e:
+        logger.warning(f"Tuning failed: {e}. Proceeding with untuned model.")
+
     # Extract feature importances
     feature_importances = _get_feature_importances(best_model, get_config_reg)

     # Evaluate on holdout
-
-
+    test_results = None
+    if test_data_path:
+        logger.info("Evaluating on provided test data...")
+        predict_model_reg(best_model)
+        test_results = pull_reg()
+
+    # Finalize Model
+    logger.info("Finalizing model on all data...")
+    try:
+        best_model = finalize_model_reg(best_model)
+    except Exception as e:
+        logger.warning(f"Finalization failed: {e}. Saving non-finalized model.")

     metadata = {
         "data_path": data_path,
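
Both training paths now share the same post-compare sequence: tune the selected model, evaluate on held-out data when a test set is supplied, then finalize before saving. For orientation, here is a minimal self-contained sketch of that PyCaret 3.x flow; the data file, target column, and save step are illustrative assumptions, not the server's exact code:

import pandas as pd
from pycaret.classification import (
    setup, compare_models, tune_model, finalize_model, save_model, pull,
)

df = pd.read_csv("train.csv")                     # hypothetical training data
setup(data=df, target="target", session_id=42)    # preprocessing + CV setup

best = compare_models(n_select=1)                 # cross-validated model selection
try:
    best = tune_model(best, optimize="Accuracy")  # hyperparameter search; can fail for some estimators
except Exception:
    pass                                          # keep the untuned model, mirroring the server's fallback
print(pull())                                     # metrics table from the last step

final = finalize_model(best)                      # refit on the full dataset
save_model(final, "best_pipeline")                # writes best_pipeline.pkl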

mcp_automl-0.1.7/tests/test_formats.py

@@ -0,0 +1,451 @@
+"""
+Tests for verifying supported document formats (CSV, Parquet, JSON).
+
+These tests ensure that all supported file formats work correctly with
+exposed MCP tools:
+- inspect_data() - data inspection tool
+- query_data() - SQL query tool
+"""
+
+import pytest
+import pandas as pd
+import numpy as np
+import json
+import asyncio
+from pathlib import Path
+
+from mcp_automl.server import (
+    inspect_data,
+    query_data,
+)
+
+
+# =============================================================================
+# Fixtures for creating test data in different formats
+# =============================================================================
+
+@pytest.fixture
+def sample_dataframe():
+    """Create a sample DataFrame for testing."""
+    return pd.DataFrame({
+        'int_col': [1, 2, 3, 4, 5],
+        'float_col': [1.1, 2.2, 3.3, 4.4, 5.5],
+        'str_col': ['a', 'b', 'c', 'd', 'e'],
+        'bool_col': [True, False, True, False, True],
+        'target': [0, 1, 0, 1, 0]
+    })
+
+
+@pytest.fixture
+def sample_csv_file(tmp_path, sample_dataframe):
+    """Create a sample CSV file."""
+    file_path = tmp_path / "data.csv"
+    sample_dataframe.to_csv(file_path, index=False)
+    return str(file_path)
+
+
+@pytest.fixture
+def sample_parquet_file(tmp_path, sample_dataframe):
+    """Create a sample Parquet file."""
+    file_path = tmp_path / "data.parquet"
+    sample_dataframe.to_parquet(file_path, index=False)
+    return str(file_path)
+
+
+@pytest.fixture
+def sample_json_file(tmp_path, sample_dataframe):
+    """Create a sample JSON file (records orient)."""
+    file_path = tmp_path / "data.json"
+    sample_dataframe.to_json(file_path, orient='records')
+    return str(file_path)
+
+
+@pytest.fixture
+def all_format_files(sample_csv_file, sample_parquet_file, sample_json_file):
+    """Return all format files as a dict."""
+    return {
+        'csv': sample_csv_file,
+        'parquet': sample_parquet_file,
+        'json': sample_json_file
+    }
+
+
+# =============================================================================
+# Test inspect_data() with different formats
+# =============================================================================
+
+class TestInspectDataFormats:
+    """Tests for inspect_data() with different file formats."""
+
+    def test_inspect_csv(self, sample_csv_file):
+        """Test inspect_data with CSV file."""
+        result = asyncio.run(inspect_data(sample_csv_file, n_rows=3))
+
+        data = json.loads(result)
+
+        assert "structure" in data
+        assert "statistics" in data
+        assert "previews" in data
+        assert data["structure"]["rows"] == 5
+        assert data["structure"]["columns"] == 5
+
+    def test_inspect_parquet(self, sample_parquet_file):
+        """Test inspect_data with Parquet file."""
+        result = asyncio.run(inspect_data(sample_parquet_file, n_rows=3))
+
+        data = json.loads(result)
+
+        assert "structure" in data
+        assert data["structure"]["rows"] == 5
+        assert data["structure"]["columns"] == 5
+        assert "int_col" in data["structure"]["column_names"]
+
+    def test_inspect_json(self, sample_json_file):
+        """Test inspect_data with JSON file."""
+        result = asyncio.run(inspect_data(sample_json_file, n_rows=3))
+
+        data = json.loads(result)
+
+        assert "structure" in data
+        assert data["structure"]["rows"] == 5
+        assert data["structure"]["columns"] == 5
+
+    def test_inspect_all_formats_consistent(self, all_format_files):
+        """Test that all formats return consistent structure info."""
+        results = {}
+        for fmt, path in all_format_files.items():
+            result = asyncio.run(inspect_data(path, n_rows=3))
+            results[fmt] = json.loads(result)
+
+        # All formats should report same row/column counts
+        for fmt in ['parquet', 'json']:
+            assert results[fmt]["structure"]["rows"] == results['csv']["structure"]["rows"]
+            assert results[fmt]["structure"]["columns"] == results['csv']["structure"]["columns"]
+            assert set(results[fmt]["structure"]["column_names"]) == set(results['csv']["structure"]["column_names"])
+
+    def test_inspect_unsupported_format_returns_error(self, tmp_path):
+        """Test that unsupported format returns error message."""
+        txt_file = tmp_path / "data.txt"
+        txt_file.write_text("some data")
+
+        result = asyncio.run(inspect_data(str(txt_file)))
+
+        assert "Error" in result
+
+
+# =============================================================================
+# Test query_data() with different formats
+# =============================================================================
+
+class TestQueryDataFormats:
+    """Tests for query_data() with different file formats."""
+
+    def test_query_csv(self, sample_csv_file):
+        """Test query_data with CSV file."""
+        query = f"SELECT COUNT(*) as cnt FROM '{sample_csv_file}'"
+        result = asyncio.run(query_data(query))
+
+        data = json.loads(result)
+        assert data[0]["cnt"] == 5
+
+    def test_query_parquet(self, sample_parquet_file):
+        """Test query_data with Parquet file."""
+        query = f"SELECT COUNT(*) as cnt FROM '{sample_parquet_file}'"
+        result = asyncio.run(query_data(query))
+
+        data = json.loads(result)
+        assert data[0]["cnt"] == 5
+
+    def test_query_json(self, sample_json_file):
+        """Test query_data with JSON file."""
+        query = f"SELECT COUNT(*) as cnt FROM '{sample_json_file}'"
+        result = asyncio.run(query_data(query))
+
+        data = json.loads(result)
+        assert data[0]["cnt"] == 5
+
+    def test_query_aggregation_all_formats(self, all_format_files):
+        """Test aggregation query works on all formats."""
+        for fmt, path in all_format_files.items():
+            query = f"SELECT SUM(int_col) as total FROM '{path}'"
+            result = asyncio.run(query_data(query))
+
+            data = json.loads(result)
+            assert data[0]["total"] == 15  # 1+2+3+4+5
+
+    def test_query_filter_all_formats(self, all_format_files):
+        """Test filter query works on all formats."""
+        for fmt, path in all_format_files.items():
+            query = f"SELECT * FROM '{path}' WHERE int_col > 3"
+            result = asyncio.run(query_data(query))
+
+            data = json.loads(result)
+            assert len(data) == 2  # int_col 4 and 5
+
+    def test_query_join_csv_parquet(self, sample_csv_file, sample_parquet_file):
+        """Test joining CSV and Parquet files in a single query."""
+        query = f"""
+            SELECT c.int_col, p.float_col
+            FROM '{sample_csv_file}' c
+            JOIN '{sample_parquet_file}' p ON c.int_col = p.int_col
+            WHERE c.int_col <= 3
+        """
+        result = asyncio.run(query_data(query))
+
+        data = json.loads(result)
+        assert len(data) == 3
+
+
+# =============================================================================
+# Test special cases and edge cases
+# =============================================================================
+
+class TestFormatEdgeCases:
+    """Tests for edge cases in format handling via exposed tools."""
+
+    def test_csv_with_special_characters(self, tmp_path):
+        """Test CSV with special characters in data."""
+        df = pd.DataFrame({
+            'text': ['hello, world', 'foo "bar"', "line1\nline2", 'tab\there'],
+            'target': [0, 1, 0, 1]
+        })
+        file_path = tmp_path / "special.csv"
+        df.to_csv(file_path, index=False)
+
+        result = asyncio.run(inspect_data(str(file_path)))
+        data = json.loads(result)
+        assert data["structure"]["rows"] == 4
+        assert "text" in data["structure"]["column_names"]
+
+    def test_json_nested_to_flat(self, tmp_path):
+        """Test JSON with records orientation (flat structure)."""
+        records = [
+            {"a": 1, "b": "x"},
+            {"a": 2, "b": "y"},
+            {"a": 3, "b": "z"}
+        ]
+        file_path = tmp_path / "flat.json"
+        with open(file_path, 'w') as f:
+            json.dump(records, f)
+
+        result = asyncio.run(inspect_data(str(file_path)))
+        data = json.loads(result)
+        assert data["structure"]["rows"] == 3
+        assert set(data["structure"]["column_names"]) == {'a', 'b'}
+
+    def test_parquet_with_nullable_types(self, tmp_path):
+        """Test Parquet with nullable/arrow types."""
+        df = pd.DataFrame({
+            'nullable_int': pd.array([1, 2, None, 4, 5], dtype="Int64"),
+            'nullable_float': pd.array([1.0, None, 3.0, 4.0, 5.0], dtype="Float64"),
+            'nullable_str': pd.array(['a', 'b', None, 'd', 'e'], dtype="string"),
+            'target': [0, 1, 0, 1, 0]
+        })
+        file_path = tmp_path / "nullable.parquet"
+        df.to_parquet(file_path, index=False)
+
+        result = asyncio.run(inspect_data(str(file_path)))
+        data = json.loads(result)
+        assert data["structure"]["rows"] == 5
+        # Check that nulls are counted correctly
+        assert data["statistics"]["missing_values"]["nullable_int"] == 1
+        assert data["statistics"]["missing_values"]["nullable_float"] == 1
+        assert data["statistics"]["missing_values"]["nullable_str"] == 1
+
+    def test_csv_empty_file(self, tmp_path):
+        """Test handling of empty CSV file with headers only."""
+        file_path = tmp_path / "empty.csv"
+        file_path.write_text("col1,col2,col3\n")
+
+        result = asyncio.run(inspect_data(str(file_path)))
+        data = json.loads(result)
+        assert data["structure"]["rows"] == 0
+        assert set(data["structure"]["column_names"]) == {'col1', 'col2', 'col3'}
+
+    def test_parquet_empty_dataframe(self, tmp_path):
+        """Test handling of empty Parquet file."""
+        df = pd.DataFrame({'col1': [], 'col2': [], 'col3': []})
+        file_path = tmp_path / "empty.parquet"
+        df.to_parquet(file_path, index=False)
+
+        result = asyncio.run(inspect_data(str(file_path)))
+        data = json.loads(result)
+        assert data["structure"]["rows"] == 0
+        assert set(data["structure"]["column_names"]) == {'col1', 'col2', 'col3'}
+
+    def test_json_empty_array(self, tmp_path):
+        """Test handling of empty JSON array."""
+        file_path = tmp_path / "empty.json"
+        file_path.write_text("[]")
+
+        result = asyncio.run(inspect_data(str(file_path)))
+        data = json.loads(result)
+        assert data["structure"]["rows"] == 0
+
+    def test_csv_with_unicode(self, tmp_path):
+        """Test CSV with Unicode characters."""
+        df = pd.DataFrame({
+            'text': ['日本語', 'العربية', 'emoji: 🎉🚀', 'ñoño'],
+            'target': [0, 1, 0, 1]
+        })
+        file_path = tmp_path / "unicode.csv"
+        df.to_csv(file_path, index=False)
+
+        result = asyncio.run(inspect_data(str(file_path)))
+        data = json.loads(result)
+        assert data["structure"]["rows"] == 4
+        # Verify via query
+        query = f"SELECT * FROM '{file_path}' WHERE text = '日本語'"
+        query_result = asyncio.run(query_data(query))
+        query_data_list = json.loads(query_result)
+        assert len(query_data_list) == 1
+
+    def test_large_csv_file(self, tmp_path):
+        """Test loading a larger CSV file."""
+        n_rows = 50000
+        df = pd.DataFrame({
+            'id': range(n_rows),
+            'value': np.random.randn(n_rows),
+            'category': [f'cat_{i % 10}' for i in range(n_rows)]
+        })
+        file_path = tmp_path / "large.csv"
+        df.to_csv(file_path, index=False)
+
+        result = asyncio.run(inspect_data(str(file_path)))
+        data = json.loads(result)
+        assert data["structure"]["rows"] == n_rows
+
+        # Verify aggregation query works
+        query = f"SELECT COUNT(*) as cnt FROM '{file_path}'"
+        query_result = asyncio.run(query_data(query))
+        query_data_list = json.loads(query_result)
+        assert query_data_list[0]["cnt"] == n_rows
+
+    def test_parquet_with_multiple_types(self, tmp_path):
+        """Test Parquet with diverse column types."""
+        df = pd.DataFrame({
+            'int8_col': np.array([1, 2, 3], dtype=np.int8),
+            'int16_col': np.array([100, 200, 300], dtype=np.int16),
+            'int32_col': np.array([1000, 2000, 3000], dtype=np.int32),
+            'int64_col': np.array([10000, 20000, 30000], dtype=np.int64),
+            'float32_col': np.array([1.1, 2.2, 3.3], dtype=np.float32),
+            'float64_col': np.array([1.11, 2.22, 3.33], dtype=np.float64),
+            'bool_col': [True, False, True],
+            'str_col': ['a', 'b', 'c'],
+            'target': [0, 1, 0]
+        })
+        file_path = tmp_path / "multitypes.parquet"
+        df.to_parquet(file_path, index=False)
+
+        result = asyncio.run(inspect_data(str(file_path)))
+        data = json.loads(result)
+        assert data["structure"]["rows"] == 3
+        assert data["structure"]["columns"] == 9
+
+
+# =============================================================================
+# Test file extension validation via exposed tools
+# =============================================================================
+
+class TestFileExtensionValidation:
+    """Tests for file extension handling via exposed tools."""
+
+    def test_unsupported_txt_extension(self, tmp_path):
+        """Test that .txt extension is not supported."""
+        txt_file = tmp_path / "data.txt"
+        txt_file.write_text("some data")
+
+        result = asyncio.run(inspect_data(str(txt_file)))
+        assert "Error" in result
+
+    def test_unsupported_xlsx_extension(self, tmp_path):
+        """Test that .xlsx extension is not supported."""
+        xlsx_file = tmp_path / "data.xlsx"
+        xlsx_file.write_bytes(b"fake xlsx content")
+
+        result = asyncio.run(inspect_data(str(xlsx_file)))
+        assert "Error" in result
+
+    def test_double_extension_csv(self, tmp_path, sample_dataframe):
+        """Test file with double extension like .tar.csv works."""
+        file_path = tmp_path / "data.tar.csv"
+        sample_dataframe.to_csv(file_path, index=False)
+
+        result = asyncio.run(inspect_data(str(file_path)))
+        data = json.loads(result)
+        assert data["structure"]["rows"] == 5
+
+
+# =============================================================================
+# Test format-specific inspect_data behavior
+# =============================================================================
+
+class TestInspectDataFormatDetails:
+    """Tests for format-specific behavior in inspect_data."""
+
+    def test_inspect_csv_dtypes(self, tmp_path):
+        """Test that CSV dtypes are correctly inferred."""
+        df = pd.DataFrame({
+            'int_col': [1, 2, 3],
+            'float_col': [1.5, 2.5, 3.5],
+            'str_col': ['a', 'b', 'c']
+        })
+        file_path = tmp_path / "typed.csv"
+        df.to_csv(file_path, index=False)
+
+        result = asyncio.run(inspect_data(str(file_path)))
+        data = json.loads(result)
+
+        dtypes = data["structure"]["dtypes"]
+        # DuckDB may infer these differently, just check keys exist
+        assert "int_col" in dtypes
+        assert "float_col" in dtypes
+        assert "str_col" in dtypes
+
+    def test_inspect_parquet_preserves_dtypes(self, tmp_path):
+        """Test that Parquet preserves exact dtypes."""
+        df = pd.DataFrame({
+            'int32_col': np.array([1, 2, 3], dtype=np.int32),
+            'float32_col': np.array([1.5, 2.5, 3.5], dtype=np.float32),
+        })
+        file_path = tmp_path / "typed.parquet"
+        df.to_parquet(file_path, index=False)
+
+        result = asyncio.run(inspect_data(str(file_path)))
+        data = json.loads(result)
+
+        dtypes = data["structure"]["dtypes"]
+        assert "int32_col" in dtypes
+        assert "float32_col" in dtypes
+
+    def test_inspect_counts_missing_values_csv(self, tmp_path):
+        """Test that missing values are correctly counted in CSV."""
+        # CSV doesn't preserve NA types well, use empty strings which become NaN
+        file_path = tmp_path / "missing.csv"
+        file_path.write_text("a,b,c\n1,2,3\n,5,\n7,,9\n")
+
+        result = asyncio.run(inspect_data(str(file_path)))
+        data = json.loads(result)
+
+        missing = data["statistics"]["missing_values"]
+        assert missing["a"] == 1
+        assert missing["b"] == 1
+        assert missing["c"] == 1
+
+    def test_inspect_counts_missing_values_parquet(self, tmp_path):
+        """Test that missing values are correctly counted in Parquet."""
+        df = pd.DataFrame({
+            'a': [1, None, 3],
+            'b': [None, 2, None],
+            'c': [1, 2, 3]
+        })
+        file_path = tmp_path / "missing.parquet"
+        df.to_parquet(file_path, index=False)
+
+        result = asyncio.run(inspect_data(str(file_path)))
+        data = json.loads(result)
+
+        missing = data["statistics"]["missing_values"]
+        assert missing["a"] == 1
+        assert missing["b"] == 2
+        assert missing["c"] == 0
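
The tests above lean on DuckDB's ability to query CSV, Parquet, and JSON files directly by path, which is presumably also why the lockfile below grows the duckdb[all] extras (an inference, not stated in the diff). A standalone sketch of that query pattern, assuming local data files rather than anything shipped with the package:

import duckdb

# The same SQL shape works for .csv, .parquet, and .json inputs; DuckDB
# picks the reader from the file extension.
for path in ("data.csv", "data.parquet", "data.json"):
    count = duckdb.sql(f"SELECT COUNT(*) AS cnt FROM '{path}'").fetchone()[0]
    print(path, count)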

{mcp_automl-0.1.5 → mcp_automl-0.1.7}/uv.lock

@@ -6,6 +6,27 @@ resolution-markers = [
     "python_full_version < '3.11'",
 ]

+[[package]]
+name = "adbc-driver-manager"
+version = "1.10.0"
+source = { registry = "https://pypi.org/simple" }
+dependencies = [
+    { name = "typing-extensions" },
+]
+sdist = { url = "https://files.pythonhosted.org/packages/9e/77/b6ffd112a67d133810d0027e9de4408a6e63e0e1c438f5866cc28eb3c213/adbc_driver_manager-1.10.0.tar.gz", hash = "sha256:f04407cf2f99bfde13dea0e136d87219c8a16678d43e322744dbd84cdd8eaac2", size = 208204, upload-time = "2026-01-09T07:13:45.803Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/8a/0e/95eae266a8d97f2f222e6db9047dc4c1fab6a3e1d5e6bd9c8efb29881ec4/adbc_driver_manager-1.10.0-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:b82d7ffab5ad4c892e2f3201cc3781db3f87ef0c5ce1938715fb39a5dc6671b0", size = 532926, upload-time = "2026-01-09T07:11:52.672Z" },
+    { url = "https://files.pythonhosted.org/packages/bc/7c/c7234fe0e25ccd0fe23d8fa1e3f2682d407f49916e845e15869d262fc648/adbc_driver_manager-1.10.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:e124ad209bc7112d0c0778fcc2e727c4fdf733188403129a82c10e563e89252b", size = 513090, upload-time = "2026-01-09T07:11:54.807Z" },
+    { url = "https://files.pythonhosted.org/packages/8d/81/6fb0075c67d1039e82960ab9d039da00ef3149b872a067d2e83ea9bb9956/adbc_driver_manager-1.10.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:0abafd6b7d8ef5ba9c33fa92a1c5c329bfb89a68fb12e88ca62a4e32d822f257", size = 3039894, upload-time = "2026-01-09T07:11:56.892Z" },
+    { url = "https://files.pythonhosted.org/packages/8a/43/10e2abe7c600545fcf5b684b04073b36c87ed879a4bbc8fcd4f6f329c302/adbc_driver_manager-1.10.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ceca0800974137d2373cfb3aa4862af4b9361a2e5b94808b52df63c3f34a14eb", size = 3053785, upload-time = "2026-01-09T07:11:59.051Z" },
+    { url = "https://files.pythonhosted.org/packages/ee/dd/8f0fe60d49fe0b7bd9eb0b76268d662f95b31a8c623fc7cef40ad9488d0f/adbc_driver_manager-1.10.0-cp310-cp310-win_amd64.whl", hash = "sha256:23504672daeafabe03d4e07038754910f55f6845ef260f2249d9d8942ab16866", size = 714987, upload-time = "2026-01-09T07:12:00.771Z" },
+    { url = "https://files.pythonhosted.org/packages/bd/23/eaea050e76a1f65749be243a68514d67e13ab896c47cbf9e652da0ba9c10/adbc_driver_manager-1.10.0-cp311-cp311-macosx_10_15_x86_64.whl", hash = "sha256:715a33d750af09e1c03fde1783490c816e08a786f151ac79269659da1d2cc4e0", size = 533268, upload-time = "2026-01-09T07:12:02.401Z" },
+    { url = "https://files.pythonhosted.org/packages/4b/37/b81d64da4b1a032df0798bbf8c2e3abf875f9dd319598308d2efebe06523/adbc_driver_manager-1.10.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:4bd40c9b20be54c55b3ce64cabd5f35f29a61886574d990a1d5b5bdd7f81a7b6", size = 513190, upload-time = "2026-01-09T07:12:04.025Z" },
+    { url = "https://files.pythonhosted.org/packages/2b/2a/a03cd7d4eb81c478566a38e6a657b83171e61e84f6aa0c0f9b49ae9d498c/adbc_driver_manager-1.10.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:595ab4a8ec2ddb338c70f3c31481a41830ad9e2d8c1a1884184023303098bc92", size = 3111408, upload-time = "2026-01-09T07:12:06.421Z" },
+    { url = "https://files.pythonhosted.org/packages/97/67/b9309e5351d4ff02720719c6ca01716ded33075fa486157db409bc5f47be/adbc_driver_manager-1.10.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:92fdf3247aef506583e79b3b583c1bf93f28c70e771281a41843aba63c61f732", size = 3124914, upload-time = "2026-01-09T07:12:08.274Z" },
+    { url = "https://files.pythonhosted.org/packages/41/1d/228041cc7ee30e51556d991d5f30981bfbf0c2d2a91c83f34ace2a2a9d2c/adbc_driver_manager-1.10.0-cp311-cp311-win_amd64.whl", hash = "sha256:7c5becb5a81fae563a10d82b570c4e1c7a8994c5b110ddaaae6afa9fd52a17b6", size = 716182, upload-time = "2026-01-09T07:12:09.766Z" },
+]
+
 [[package]]
 name = "annotated-types"
 version = "0.7.0"

@@ -450,6 +471,17 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/b0/83/9d8fc3413f854effa680dcad1781f68f3ada8679863c0c94ba3b36bae6ff/duckdb-1.4.3-cp311-cp311-win_arm64.whl", hash = "sha256:fbc63ffdd03835f660155b37a1b6db2005bcd46e5ad398b8cac141eb305d2a3d", size = 13070898, upload-time = "2025-12-09T10:58:14.301Z" },
 ]

+[package.optional-dependencies]
+all = [
+    { name = "adbc-driver-manager" },
+    { name = "fsspec" },
+    { name = "ipython", version = "8.38.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
+    { name = "ipython", version = "9.7.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
+    { name = "numpy" },
+    { name = "pandas" },
+    { name = "pyarrow" },
+]
+
 [[package]]
 name = "exceptiongroup"
 version = "1.3.1"

@@ -522,6 +554,15 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/c7/93/0dd45cd283c32dea1545151d8c3637b4b8c53cdb3a625aeb2885b184d74d/fonttools-4.60.1-py3-none-any.whl", hash = "sha256:906306ac7afe2156fcf0042173d6ebbb05416af70f6b370967b47f8f00103bbb", size = 1143175, upload-time = "2025-09-29T21:13:24.134Z" },
 ]

+[[package]]
+name = "fsspec"
+version = "2026.1.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/d5/7d/5df2650c57d47c57232af5ef4b4fdbff182070421e405e0d62c6cdbfaa87/fsspec-2026.1.0.tar.gz", hash = "sha256:e987cb0496a0d81bba3a9d1cee62922fb395e7d4c3b575e57f547953334fe07b", size = 310496, upload-time = "2026-01-09T15:21:35.562Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/01/c9/97cc5aae1648dcb851958a3ddf73ccd7dbe5650d95203ecb4d7720b4cdbf/fsspec-2026.1.0-py3-none-any.whl", hash = "sha256:cb76aa913c2285a3b49bdd5fc55b1d7c708d7208126b60f2eb8194fe1b4cbdcc", size = 201838, upload-time = "2026-01-09T15:21:34.041Z" },
+]
+
 [[package]]
 name = "h11"
 version = "0.16.0"

@@ -993,13 +1034,14 @@ wheels = [

 [[package]]
 name = "mcp-automl"
-version = "0.1.
+version = "0.1.3"
 source = { editable = "." }
 dependencies = [
-    { name = "duckdb" },
+    { name = "duckdb", extra = ["all"] },
     { name = "joblib" },
     { name = "mcp" },
     { name = "pandas" },
+    { name = "pyarrow" },
     { name = "pycaret" },
     { name = "scikit-learn" },
     { name = "tabulate" },

@@ -1007,26 +1049,23 @@ dependencies = [

 [package.dev-dependencies]
 dev = [
-    { name = "pyarrow" },
     { name = "pytest-asyncio" },
 ]

 [package.metadata]
 requires-dist = [
-    { name = "duckdb", specifier = ">=1.4.3" },
+    { name = "duckdb", extras = ["all"], specifier = ">=1.4.3" },
     { name = "joblib", specifier = "<1.4" },
     { name = "mcp", specifier = ">=1.21.2" },
     { name = "pandas", specifier = "<2.2.0" },
+    { name = "pyarrow", specifier = ">=23.0.0" },
     { name = "pycaret", specifier = ">=3.0.0" },
     { name = "scikit-learn", specifier = "<1.4" },
     { name = "tabulate", specifier = ">=0.9.0" },
 ]

 [package.metadata.requires-dev]
-dev = [
-    { name = "pyarrow", specifier = ">=14.0.0" },
-    { name = "pytest-asyncio", specifier = ">=1.3.0" },
-]
+dev = [{ name = "pytest-asyncio", specifier = ">=1.3.0" }]

 [[package]]
 name = "narwhals"