mcp-automl 0.1.5__tar.gz → 0.1.7__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,14 +1,15 @@
  Metadata-Version: 2.4
  Name: mcp-automl
- Version: 0.1.5
+ Version: 0.1.7
  Summary: MCP server for end-to-end machine learning
  Author-email: ke <idea7766@gmail.com>
  License-File: LICENSE
  Requires-Python: <3.12,>=3.10
- Requires-Dist: duckdb>=1.4.3
+ Requires-Dist: duckdb[all]>=1.4.3
  Requires-Dist: joblib<1.4
  Requires-Dist: mcp>=1.21.2
  Requires-Dist: pandas<2.2.0
+ Requires-Dist: pyarrow>=23.0.0
  Requires-Dist: pycaret>=3.0.0
  Requires-Dist: scikit-learn<1.4
  Requires-Dist: tabulate>=0.9.0
@@ -1,14 +1,15 @@
  [project]
  name = "mcp-automl"
- version = "0.1.5"
+ version = "0.1.7"
  description = "MCP server for end-to-end machine learning"
  readme = "README.md"
  requires-python = ">=3.10,<3.12"
  dependencies = [
-     "duckdb>=1.4.3",
+     "duckdb[all]>=1.4.3",
      "joblib<1.4",
      "mcp>=1.21.2",
      "pandas<2.2.0",
+     "pyarrow>=23.0.0",
      "pycaret>=3.0.0",
      "scikit-learn<1.4",
      "tabulate>=0.9.0",
@@ -30,5 +31,4 @@ package = true
  [dependency-groups]
  dev = [
      "pytest-asyncio>=1.3.0",
-     "pyarrow>=14.0.0",
  ]
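
The duckdb[all] extra plus the runtime pyarrow requirement are what let DuckDB query results be handed back as pandas or Arrow objects without any dev-only installs. A minimal sketch of that interplay, assuming an illustrative file name and column (neither is taken from this package):

    import duckdb        # the [all] extra also installs pyarrow, pandas, fsspec, ...
    import pandas as pd

    # Illustrative file; any CSV/Parquet/JSON path works the same way.
    pd.DataFrame({"int_col": [1, 2, 3, 4, 5]}).to_parquet("data.parquet", index=False)

    # DuckDB queries the file directly by path, no explicit load step.
    rel = duckdb.sql("SELECT * FROM 'data.parquet' WHERE int_col > 3")

    df = rel.df()        # materialize as a pandas DataFrame (needs pandas)
    table = rel.arrow()  # materialize as a pyarrow Table (needs pyarrow at runtime)
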
@@ -10,8 +10,8 @@ import argparse
  from pathlib import Path
  from mcp.server.fastmcp import FastMCP, Context
  from mcp.types import PromptMessage, TextContent
- from pycaret.classification import setup as setup_clf, compare_models as compare_models_clf, pull as pull_clf, save_model as save_model_clf, load_model as load_model_clf, predict_model as predict_model_clf, get_config as get_config_clf
- from pycaret.regression import setup as setup_reg, compare_models as compare_models_reg, pull as pull_reg, save_model as save_model_reg, load_model as load_model_reg, predict_model as predict_model_reg, get_config as get_config_reg
+ from pycaret.classification import setup as setup_clf, compare_models as compare_models_clf, pull as pull_clf, save_model as save_model_clf, load_model as load_model_clf, predict_model as predict_model_clf, get_config as get_config_clf, tune_model as tune_model_clf, finalize_model as finalize_model_clf
+ from pycaret.regression import setup as setup_reg, compare_models as compare_models_reg, pull as pull_reg, save_model as save_model_reg, load_model as load_model_reg, predict_model as predict_model_reg, get_config as get_config_reg, tune_model as tune_model_reg, finalize_model as finalize_model_reg

  # Configure logging
  logging.basicConfig(
@@ -363,12 +363,31 @@ def _train_classifier_sync(run_id: str, data_path: str, target_column: str, igno
      best_model = best_model[0]
      results = pull_clf()

-     # Extract feature importances
+     # Tune Model
+     logger.info("Tuning best model with Optuna...")
+     try:
+         best_model = tune_model_clf(best_model, optimize=optimize, search_library="optuna", n_trials=10)
+         results = pull_clf()
+     except Exception as e:
+         logger.warning(f"Tuning failed: {e}. Proceeding with untuned model.")
+
+
+     # Extract feature importances (from the potentially tuned model)
      feature_importances = _get_feature_importances(best_model, get_config_clf)

-     # Evaluate on holdout (test_data or split)
-     predict_model_clf(best_model)
-     test_results = pull_clf()
+     # Evaluate on holdout (test_data_path)
+     test_results = None
+     if test_data_path:
+         logger.info("Evaluating on provided test data...")
+         predict_model_clf(best_model)
+         test_results = pull_clf()
+
+     # Finalize Model
+     logger.info("Finalizing model on all data...")
+     try:
+         best_model = finalize_model_clf(best_model)
+     except Exception as e:
+         logger.warning(f"Finalization failed: {e}. Saving non-finalized model.")

      metadata = {
          "data_path": data_path,
@@ -543,12 +562,30 @@ def _train_regressor_sync(run_id: str, data_path: str, target_column: str, ignor
      best_model = best_model[0]
      results = pull_reg()

+     # Tune Model
+     logger.info("Tuning best model with Optuna...")
+     try:
+         best_model = tune_model_reg(best_model, optimize=optimize, search_library="optuna", n_trials=10)
+         results = pull_reg()
+     except Exception as e:
+         logger.warning(f"Tuning failed: {e}. Proceeding with untuned model.")
+
      # Extract feature importances
      feature_importances = _get_feature_importances(best_model, get_config_reg)

      # Evaluate on holdout
-     predict_model_reg(best_model)
-     test_results = pull_reg()
+     test_results = None
+     if test_data_path:
+         logger.info("Evaluating on provided test data...")
+         predict_model_reg(best_model)
+         test_results = pull_reg()
+
+     # Finalize Model
+     logger.info("Finalizing model on all data...")
+     try:
+         best_model = finalize_model_reg(best_model)
+     except Exception as e:
+         logger.warning(f"Finalization failed: {e}. Saving non-finalized model.")

      metadata = {
          "data_path": data_path,
@@ -0,0 +1,451 @@
+ """
+ Tests for verifying supported document formats (CSV, Parquet, JSON).
+
+ These tests ensure that all supported file formats work correctly with
+ exposed MCP tools:
+ - inspect_data() - data inspection tool
+ - query_data() - SQL query tool
+ """
+
+ import pytest
+ import pandas as pd
+ import numpy as np
+ import json
+ import asyncio
+ from pathlib import Path
+
+ from mcp_automl.server import (
+     inspect_data,
+     query_data,
+ )
+
+
+ # =============================================================================
+ # Fixtures for creating test data in different formats
+ # =============================================================================
+
+ @pytest.fixture
+ def sample_dataframe():
+     """Create a sample DataFrame for testing."""
+     return pd.DataFrame({
+         'int_col': [1, 2, 3, 4, 5],
+         'float_col': [1.1, 2.2, 3.3, 4.4, 5.5],
+         'str_col': ['a', 'b', 'c', 'd', 'e'],
+         'bool_col': [True, False, True, False, True],
+         'target': [0, 1, 0, 1, 0]
+     })
+
+
+ @pytest.fixture
+ def sample_csv_file(tmp_path, sample_dataframe):
+     """Create a sample CSV file."""
+     file_path = tmp_path / "data.csv"
+     sample_dataframe.to_csv(file_path, index=False)
+     return str(file_path)
+
+
+ @pytest.fixture
+ def sample_parquet_file(tmp_path, sample_dataframe):
+     """Create a sample Parquet file."""
+     file_path = tmp_path / "data.parquet"
+     sample_dataframe.to_parquet(file_path, index=False)
+     return str(file_path)
+
+
+ @pytest.fixture
+ def sample_json_file(tmp_path, sample_dataframe):
+     """Create a sample JSON file (records orient)."""
+     file_path = tmp_path / "data.json"
+     sample_dataframe.to_json(file_path, orient='records')
+     return str(file_path)
+
+
+ @pytest.fixture
+ def all_format_files(sample_csv_file, sample_parquet_file, sample_json_file):
+     """Return all format files as a dict."""
+     return {
+         'csv': sample_csv_file,
+         'parquet': sample_parquet_file,
+         'json': sample_json_file
+     }
+
+
+ # =============================================================================
+ # Test inspect_data() with different formats
+ # =============================================================================
+
+ class TestInspectDataFormats:
+     """Tests for inspect_data() with different file formats."""
+
+     def test_inspect_csv(self, sample_csv_file):
+         """Test inspect_data with CSV file."""
+         result = asyncio.run(inspect_data(sample_csv_file, n_rows=3))
+
+         data = json.loads(result)
+
+         assert "structure" in data
+         assert "statistics" in data
+         assert "previews" in data
+         assert data["structure"]["rows"] == 5
+         assert data["structure"]["columns"] == 5
+
+     def test_inspect_parquet(self, sample_parquet_file):
+         """Test inspect_data with Parquet file."""
+         result = asyncio.run(inspect_data(sample_parquet_file, n_rows=3))
+
+         data = json.loads(result)
+
+         assert "structure" in data
+         assert data["structure"]["rows"] == 5
+         assert data["structure"]["columns"] == 5
+         assert "int_col" in data["structure"]["column_names"]
+
+     def test_inspect_json(self, sample_json_file):
+         """Test inspect_data with JSON file."""
+         result = asyncio.run(inspect_data(sample_json_file, n_rows=3))
+
+         data = json.loads(result)
+
+         assert "structure" in data
+         assert data["structure"]["rows"] == 5
+         assert data["structure"]["columns"] == 5
+
+     def test_inspect_all_formats_consistent(self, all_format_files):
+         """Test that all formats return consistent structure info."""
+         results = {}
+         for fmt, path in all_format_files.items():
+             result = asyncio.run(inspect_data(path, n_rows=3))
+             results[fmt] = json.loads(result)
+
+         # All formats should report same row/column counts
+         for fmt in ['parquet', 'json']:
+             assert results[fmt]["structure"]["rows"] == results['csv']["structure"]["rows"]
+             assert results[fmt]["structure"]["columns"] == results['csv']["structure"]["columns"]
+             assert set(results[fmt]["structure"]["column_names"]) == set(results['csv']["structure"]["column_names"])
+
+     def test_inspect_unsupported_format_returns_error(self, tmp_path):
+         """Test that unsupported format returns error message."""
+         txt_file = tmp_path / "data.txt"
+         txt_file.write_text("some data")
+
+         result = asyncio.run(inspect_data(str(txt_file)))
+
+         assert "Error" in result
+
+
+ # =============================================================================
+ # Test query_data() with different formats
+ # =============================================================================
+
+ class TestQueryDataFormats:
+     """Tests for query_data() with different file formats."""
+
+     def test_query_csv(self, sample_csv_file):
+         """Test query_data with CSV file."""
+         query = f"SELECT COUNT(*) as cnt FROM '{sample_csv_file}'"
+         result = asyncio.run(query_data(query))
+
+         data = json.loads(result)
+         assert data[0]["cnt"] == 5
+
+     def test_query_parquet(self, sample_parquet_file):
+         """Test query_data with Parquet file."""
+         query = f"SELECT COUNT(*) as cnt FROM '{sample_parquet_file}'"
+         result = asyncio.run(query_data(query))
+
+         data = json.loads(result)
+         assert data[0]["cnt"] == 5
+
+     def test_query_json(self, sample_json_file):
+         """Test query_data with JSON file."""
+         query = f"SELECT COUNT(*) as cnt FROM '{sample_json_file}'"
+         result = asyncio.run(query_data(query))
+
+         data = json.loads(result)
+         assert data[0]["cnt"] == 5
+
+     def test_query_aggregation_all_formats(self, all_format_files):
+         """Test aggregation query works on all formats."""
+         for fmt, path in all_format_files.items():
+             query = f"SELECT SUM(int_col) as total FROM '{path}'"
+             result = asyncio.run(query_data(query))
+
+             data = json.loads(result)
+             assert data[0]["total"] == 15  # 1+2+3+4+5
+
+     def test_query_filter_all_formats(self, all_format_files):
+         """Test filter query works on all formats."""
+         for fmt, path in all_format_files.items():
+             query = f"SELECT * FROM '{path}' WHERE int_col > 3"
+             result = asyncio.run(query_data(query))
+
+             data = json.loads(result)
+             assert len(data) == 2  # int_col 4 and 5
+
+     def test_query_join_csv_parquet(self, sample_csv_file, sample_parquet_file):
+         """Test joining CSV and Parquet files in a single query."""
+         query = f"""
+             SELECT c.int_col, p.float_col
+             FROM '{sample_csv_file}' c
+             JOIN '{sample_parquet_file}' p ON c.int_col = p.int_col
+             WHERE c.int_col <= 3
+         """
+         result = asyncio.run(query_data(query))
+
+         data = json.loads(result)
+         assert len(data) == 3
+
+
+ # =============================================================================
+ # Test special cases and edge cases
+ # =============================================================================
+
+ class TestFormatEdgeCases:
+     """Tests for edge cases in format handling via exposed tools."""
+
+     def test_csv_with_special_characters(self, tmp_path):
+         """Test CSV with special characters in data."""
+         df = pd.DataFrame({
+             'text': ['hello, world', 'foo "bar"', "line1\nline2", 'tab\there'],
+             'target': [0, 1, 0, 1]
+         })
+         file_path = tmp_path / "special.csv"
+         df.to_csv(file_path, index=False)
+
+         result = asyncio.run(inspect_data(str(file_path)))
+         data = json.loads(result)
+         assert data["structure"]["rows"] == 4
+         assert "text" in data["structure"]["column_names"]
+
+     def test_json_nested_to_flat(self, tmp_path):
+         """Test JSON with records orientation (flat structure)."""
+         records = [
+             {"a": 1, "b": "x"},
+             {"a": 2, "b": "y"},
+             {"a": 3, "b": "z"}
+         ]
+         file_path = tmp_path / "flat.json"
+         with open(file_path, 'w') as f:
+             json.dump(records, f)
+
+         result = asyncio.run(inspect_data(str(file_path)))
+         data = json.loads(result)
+         assert data["structure"]["rows"] == 3
+         assert set(data["structure"]["column_names"]) == {'a', 'b'}
+
+     def test_parquet_with_nullable_types(self, tmp_path):
+         """Test Parquet with nullable/arrow types."""
+         df = pd.DataFrame({
+             'nullable_int': pd.array([1, 2, None, 4, 5], dtype="Int64"),
+             'nullable_float': pd.array([1.0, None, 3.0, 4.0, 5.0], dtype="Float64"),
+             'nullable_str': pd.array(['a', 'b', None, 'd', 'e'], dtype="string"),
+             'target': [0, 1, 0, 1, 0]
+         })
+         file_path = tmp_path / "nullable.parquet"
+         df.to_parquet(file_path, index=False)
+
+         result = asyncio.run(inspect_data(str(file_path)))
+         data = json.loads(result)
+         assert data["structure"]["rows"] == 5
+         # Check that nulls are counted correctly
+         assert data["statistics"]["missing_values"]["nullable_int"] == 1
+         assert data["statistics"]["missing_values"]["nullable_float"] == 1
+         assert data["statistics"]["missing_values"]["nullable_str"] == 1
+
+     def test_csv_empty_file(self, tmp_path):
+         """Test handling of empty CSV file with headers only."""
+         file_path = tmp_path / "empty.csv"
+         file_path.write_text("col1,col2,col3\n")
+
+         result = asyncio.run(inspect_data(str(file_path)))
+         data = json.loads(result)
+         assert data["structure"]["rows"] == 0
+         assert set(data["structure"]["column_names"]) == {'col1', 'col2', 'col3'}
+
+     def test_parquet_empty_dataframe(self, tmp_path):
+         """Test handling of empty Parquet file."""
+         df = pd.DataFrame({'col1': [], 'col2': [], 'col3': []})
+         file_path = tmp_path / "empty.parquet"
+         df.to_parquet(file_path, index=False)
+
+         result = asyncio.run(inspect_data(str(file_path)))
+         data = json.loads(result)
+         assert data["structure"]["rows"] == 0
+         assert set(data["structure"]["column_names"]) == {'col1', 'col2', 'col3'}
+
+     def test_json_empty_array(self, tmp_path):
+         """Test handling of empty JSON array."""
+         file_path = tmp_path / "empty.json"
+         file_path.write_text("[]")
+
+         result = asyncio.run(inspect_data(str(file_path)))
+         data = json.loads(result)
+         assert data["structure"]["rows"] == 0
+
+     def test_csv_with_unicode(self, tmp_path):
+         """Test CSV with Unicode characters."""
+         df = pd.DataFrame({
+             'text': ['日本語', 'العربية', 'emoji: 🎉🚀', 'ñoño'],
+             'target': [0, 1, 0, 1]
+         })
+         file_path = tmp_path / "unicode.csv"
+         df.to_csv(file_path, index=False)
+
+         result = asyncio.run(inspect_data(str(file_path)))
+         data = json.loads(result)
+         assert data["structure"]["rows"] == 4
+         # Verify via query
+         query = f"SELECT * FROM '{file_path}' WHERE text = '日本語'"
+         query_result = asyncio.run(query_data(query))
+         query_data_list = json.loads(query_result)
+         assert len(query_data_list) == 1
+
+     def test_large_csv_file(self, tmp_path):
+         """Test loading a larger CSV file."""
+         n_rows = 50000
+         df = pd.DataFrame({
+             'id': range(n_rows),
+             'value': np.random.randn(n_rows),
+             'category': [f'cat_{i % 10}' for i in range(n_rows)]
+         })
+         file_path = tmp_path / "large.csv"
+         df.to_csv(file_path, index=False)
+
+         result = asyncio.run(inspect_data(str(file_path)))
+         data = json.loads(result)
+         assert data["structure"]["rows"] == n_rows
+
+         # Verify aggregation query works
+         query = f"SELECT COUNT(*) as cnt FROM '{file_path}'"
+         query_result = asyncio.run(query_data(query))
+         query_data_list = json.loads(query_result)
+         assert query_data_list[0]["cnt"] == n_rows
+
+     def test_parquet_with_multiple_types(self, tmp_path):
+         """Test Parquet with diverse column types."""
+         df = pd.DataFrame({
+             'int8_col': np.array([1, 2, 3], dtype=np.int8),
+             'int16_col': np.array([100, 200, 300], dtype=np.int16),
+             'int32_col': np.array([1000, 2000, 3000], dtype=np.int32),
+             'int64_col': np.array([10000, 20000, 30000], dtype=np.int64),
+             'float32_col': np.array([1.1, 2.2, 3.3], dtype=np.float32),
+             'float64_col': np.array([1.11, 2.22, 3.33], dtype=np.float64),
+             'bool_col': [True, False, True],
+             'str_col': ['a', 'b', 'c'],
+             'target': [0, 1, 0]
+         })
+         file_path = tmp_path / "multitypes.parquet"
+         df.to_parquet(file_path, index=False)
+
+         result = asyncio.run(inspect_data(str(file_path)))
+         data = json.loads(result)
+         assert data["structure"]["rows"] == 3
+         assert data["structure"]["columns"] == 9
+
+
+ # =============================================================================
+ # Test file extension validation via exposed tools
+ # =============================================================================
+
+ class TestFileExtensionValidation:
+     """Tests for file extension handling via exposed tools."""
+
+     def test_unsupported_txt_extension(self, tmp_path):
+         """Test that .txt extension is not supported."""
+         txt_file = tmp_path / "data.txt"
+         txt_file.write_text("some data")
+
+         result = asyncio.run(inspect_data(str(txt_file)))
+         assert "Error" in result
+
+     def test_unsupported_xlsx_extension(self, tmp_path):
+         """Test that .xlsx extension is not supported."""
+         xlsx_file = tmp_path / "data.xlsx"
+         xlsx_file.write_bytes(b"fake xlsx content")
+
+         result = asyncio.run(inspect_data(str(xlsx_file)))
+         assert "Error" in result
+
+     def test_double_extension_csv(self, tmp_path, sample_dataframe):
+         """Test file with double extension like .tar.csv works."""
+         file_path = tmp_path / "data.tar.csv"
+         sample_dataframe.to_csv(file_path, index=False)
+
+         result = asyncio.run(inspect_data(str(file_path)))
+         data = json.loads(result)
+         assert data["structure"]["rows"] == 5
+
+
+ # =============================================================================
+ # Test format-specific inspect_data behavior
+ # =============================================================================
+
+ class TestInspectDataFormatDetails:
+     """Tests for format-specific behavior in inspect_data."""
+
+     def test_inspect_csv_dtypes(self, tmp_path):
+         """Test that CSV dtypes are correctly inferred."""
+         df = pd.DataFrame({
+             'int_col': [1, 2, 3],
+             'float_col': [1.5, 2.5, 3.5],
+             'str_col': ['a', 'b', 'c']
+         })
+         file_path = tmp_path / "typed.csv"
+         df.to_csv(file_path, index=False)
+
+         result = asyncio.run(inspect_data(str(file_path)))
+         data = json.loads(result)
+
+         dtypes = data["structure"]["dtypes"]
+         # DuckDB may infer these differently, just check keys exist
+         assert "int_col" in dtypes
+         assert "float_col" in dtypes
+         assert "str_col" in dtypes
+
+     def test_inspect_parquet_preserves_dtypes(self, tmp_path):
+         """Test that Parquet preserves exact dtypes."""
+         df = pd.DataFrame({
+             'int32_col': np.array([1, 2, 3], dtype=np.int32),
+             'float32_col': np.array([1.5, 2.5, 3.5], dtype=np.float32),
+         })
+         file_path = tmp_path / "typed.parquet"
+         df.to_parquet(file_path, index=False)
+
+         result = asyncio.run(inspect_data(str(file_path)))
+         data = json.loads(result)
+
+         dtypes = data["structure"]["dtypes"]
+         assert "int32_col" in dtypes
+         assert "float32_col" in dtypes
+
+     def test_inspect_counts_missing_values_csv(self, tmp_path):
+         """Test that missing values are correctly counted in CSV."""
+         # CSV doesn't preserve NA types well, use empty strings which become NaN
+         file_path = tmp_path / "missing.csv"
+         file_path.write_text("a,b,c\n1,2,3\n,5,\n7,,9\n")
+
+         result = asyncio.run(inspect_data(str(file_path)))
+         data = json.loads(result)
+
+         missing = data["statistics"]["missing_values"]
+         assert missing["a"] == 1
+         assert missing["b"] == 1
+         assert missing["c"] == 1
+
+     def test_inspect_counts_missing_values_parquet(self, tmp_path):
+         """Test that missing values are correctly counted in Parquet."""
+         df = pd.DataFrame({
+             'a': [1, None, 3],
+             'b': [None, 2, None],
+             'c': [1, 2, 3]
+         })
+         file_path = tmp_path / "missing.parquet"
+         df.to_parquet(file_path, index=False)
+
+         result = asyncio.run(inspect_data(str(file_path)))
+         data = json.loads(result)
+
+         missing = data["statistics"]["missing_values"]
+         assert missing["a"] == 1
+         assert missing["b"] == 2
+         assert missing["c"] == 0
@@ -6,6 +6,27 @@ resolution-markers = [
      "python_full_version < '3.11'",
  ]

+ [[package]]
+ name = "adbc-driver-manager"
+ version = "1.10.0"
+ source = { registry = "https://pypi.org/simple" }
+ dependencies = [
+     { name = "typing-extensions" },
+ ]
+ sdist = { url = "https://files.pythonhosted.org/packages/9e/77/b6ffd112a67d133810d0027e9de4408a6e63e0e1c438f5866cc28eb3c213/adbc_driver_manager-1.10.0.tar.gz", hash = "sha256:f04407cf2f99bfde13dea0e136d87219c8a16678d43e322744dbd84cdd8eaac2", size = 208204, upload-time = "2026-01-09T07:13:45.803Z" }
+ wheels = [
+     { url = "https://files.pythonhosted.org/packages/8a/0e/95eae266a8d97f2f222e6db9047dc4c1fab6a3e1d5e6bd9c8efb29881ec4/adbc_driver_manager-1.10.0-cp310-cp310-macosx_10_15_x86_64.whl", hash = "sha256:b82d7ffab5ad4c892e2f3201cc3781db3f87ef0c5ce1938715fb39a5dc6671b0", size = 532926, upload-time = "2026-01-09T07:11:52.672Z" },
+     { url = "https://files.pythonhosted.org/packages/bc/7c/c7234fe0e25ccd0fe23d8fa1e3f2682d407f49916e845e15869d262fc648/adbc_driver_manager-1.10.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:e124ad209bc7112d0c0778fcc2e727c4fdf733188403129a82c10e563e89252b", size = 513090, upload-time = "2026-01-09T07:11:54.807Z" },
+     { url = "https://files.pythonhosted.org/packages/8d/81/6fb0075c67d1039e82960ab9d039da00ef3149b872a067d2e83ea9bb9956/adbc_driver_manager-1.10.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:0abafd6b7d8ef5ba9c33fa92a1c5c329bfb89a68fb12e88ca62a4e32d822f257", size = 3039894, upload-time = "2026-01-09T07:11:56.892Z" },
+     { url = "https://files.pythonhosted.org/packages/8a/43/10e2abe7c600545fcf5b684b04073b36c87ed879a4bbc8fcd4f6f329c302/adbc_driver_manager-1.10.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ceca0800974137d2373cfb3aa4862af4b9361a2e5b94808b52df63c3f34a14eb", size = 3053785, upload-time = "2026-01-09T07:11:59.051Z" },
+     { url = "https://files.pythonhosted.org/packages/ee/dd/8f0fe60d49fe0b7bd9eb0b76268d662f95b31a8c623fc7cef40ad9488d0f/adbc_driver_manager-1.10.0-cp310-cp310-win_amd64.whl", hash = "sha256:23504672daeafabe03d4e07038754910f55f6845ef260f2249d9d8942ab16866", size = 714987, upload-time = "2026-01-09T07:12:00.771Z" },
+     { url = "https://files.pythonhosted.org/packages/bd/23/eaea050e76a1f65749be243a68514d67e13ab896c47cbf9e652da0ba9c10/adbc_driver_manager-1.10.0-cp311-cp311-macosx_10_15_x86_64.whl", hash = "sha256:715a33d750af09e1c03fde1783490c816e08a786f151ac79269659da1d2cc4e0", size = 533268, upload-time = "2026-01-09T07:12:02.401Z" },
+     { url = "https://files.pythonhosted.org/packages/4b/37/b81d64da4b1a032df0798bbf8c2e3abf875f9dd319598308d2efebe06523/adbc_driver_manager-1.10.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:4bd40c9b20be54c55b3ce64cabd5f35f29a61886574d990a1d5b5bdd7f81a7b6", size = 513190, upload-time = "2026-01-09T07:12:04.025Z" },
+     { url = "https://files.pythonhosted.org/packages/2b/2a/a03cd7d4eb81c478566a38e6a657b83171e61e84f6aa0c0f9b49ae9d498c/adbc_driver_manager-1.10.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:595ab4a8ec2ddb338c70f3c31481a41830ad9e2d8c1a1884184023303098bc92", size = 3111408, upload-time = "2026-01-09T07:12:06.421Z" },
+     { url = "https://files.pythonhosted.org/packages/97/67/b9309e5351d4ff02720719c6ca01716ded33075fa486157db409bc5f47be/adbc_driver_manager-1.10.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:92fdf3247aef506583e79b3b583c1bf93f28c70e771281a41843aba63c61f732", size = 3124914, upload-time = "2026-01-09T07:12:08.274Z" },
+     { url = "https://files.pythonhosted.org/packages/41/1d/228041cc7ee30e51556d991d5f30981bfbf0c2d2a91c83f34ace2a2a9d2c/adbc_driver_manager-1.10.0-cp311-cp311-win_amd64.whl", hash = "sha256:7c5becb5a81fae563a10d82b570c4e1c7a8994c5b110ddaaae6afa9fd52a17b6", size = 716182, upload-time = "2026-01-09T07:12:09.766Z" },
+ ]
+
  [[package]]
  name = "annotated-types"
  version = "0.7.0"
@@ -450,6 +471,17 @@ wheels = [
      { url = "https://files.pythonhosted.org/packages/b0/83/9d8fc3413f854effa680dcad1781f68f3ada8679863c0c94ba3b36bae6ff/duckdb-1.4.3-cp311-cp311-win_arm64.whl", hash = "sha256:fbc63ffdd03835f660155b37a1b6db2005bcd46e5ad398b8cac141eb305d2a3d", size = 13070898, upload-time = "2025-12-09T10:58:14.301Z" },
  ]

+ [package.optional-dependencies]
+ all = [
+     { name = "adbc-driver-manager" },
+     { name = "fsspec" },
+     { name = "ipython", version = "8.38.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" },
+     { name = "ipython", version = "9.7.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" },
+     { name = "numpy" },
+     { name = "pandas" },
+     { name = "pyarrow" },
+ ]
+
  [[package]]
  name = "exceptiongroup"
  version = "1.3.1"
@@ -522,6 +554,15 @@ wheels = [
      { url = "https://files.pythonhosted.org/packages/c7/93/0dd45cd283c32dea1545151d8c3637b4b8c53cdb3a625aeb2885b184d74d/fonttools-4.60.1-py3-none-any.whl", hash = "sha256:906306ac7afe2156fcf0042173d6ebbb05416af70f6b370967b47f8f00103bbb", size = 1143175, upload-time = "2025-09-29T21:13:24.134Z" },
  ]

+ [[package]]
+ name = "fsspec"
+ version = "2026.1.0"
+ source = { registry = "https://pypi.org/simple" }
+ sdist = { url = "https://files.pythonhosted.org/packages/d5/7d/5df2650c57d47c57232af5ef4b4fdbff182070421e405e0d62c6cdbfaa87/fsspec-2026.1.0.tar.gz", hash = "sha256:e987cb0496a0d81bba3a9d1cee62922fb395e7d4c3b575e57f547953334fe07b", size = 310496, upload-time = "2026-01-09T15:21:35.562Z" }
+ wheels = [
+     { url = "https://files.pythonhosted.org/packages/01/c9/97cc5aae1648dcb851958a3ddf73ccd7dbe5650d95203ecb4d7720b4cdbf/fsspec-2026.1.0-py3-none-any.whl", hash = "sha256:cb76aa913c2285a3b49bdd5fc55b1d7c708d7208126b60f2eb8194fe1b4cbdcc", size = 201838, upload-time = "2026-01-09T15:21:34.041Z" },
+ ]
+
  [[package]]
  name = "h11"
  version = "0.16.0"
@@ -993,13 +1034,14 @@ wheels = [

  [[package]]
  name = "mcp-automl"
- version = "0.1.1"
+ version = "0.1.3"
  source = { editable = "." }
  dependencies = [
-     { name = "duckdb" },
+     { name = "duckdb", extra = ["all"] },
      { name = "joblib" },
      { name = "mcp" },
      { name = "pandas" },
+     { name = "pyarrow" },
      { name = "pycaret" },
      { name = "scikit-learn" },
      { name = "tabulate" },
@@ -1007,26 +1049,23 @@ dependencies = [

  [package.dev-dependencies]
  dev = [
-     { name = "pyarrow" },
      { name = "pytest-asyncio" },
  ]

  [package.metadata]
  requires-dist = [
-     { name = "duckdb", specifier = ">=1.4.3" },
+     { name = "duckdb", extras = ["all"], specifier = ">=1.4.3" },
      { name = "joblib", specifier = "<1.4" },
      { name = "mcp", specifier = ">=1.21.2" },
      { name = "pandas", specifier = "<2.2.0" },
+     { name = "pyarrow", specifier = ">=23.0.0" },
      { name = "pycaret", specifier = ">=3.0.0" },
      { name = "scikit-learn", specifier = "<1.4" },
      { name = "tabulate", specifier = ">=0.9.0" },
  ]

  [package.metadata.requires-dev]
- dev = [
-     { name = "pyarrow", specifier = ">=14.0.0" },
-     { name = "pytest-asyncio", specifier = ">=1.3.0" },
- ]
+ dev = [{ name = "pytest-asyncio", specifier = ">=1.3.0" }]

  [[package]]
  name = "narwhals"
6 files without changes