ds-agent-cli 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. package/bin/ds-agent.js +451 -0
  2. package/ds_agent/__init__.py +8 -0
  3. package/package.json +28 -0
  4. package/requirements.txt +126 -0
  5. package/setup.py +35 -0
  6. package/src/__init__.py +7 -0
  7. package/src/_compress_tool_result.py +118 -0
  8. package/src/api/__init__.py +4 -0
  9. package/src/api/app.py +1626 -0
  10. package/src/cache/__init__.py +5 -0
  11. package/src/cache/cache_manager.py +561 -0
  12. package/src/cli.py +2886 -0
  13. package/src/dynamic_prompts.py +281 -0
  14. package/src/orchestrator.py +4799 -0
  15. package/src/progress_manager.py +139 -0
  16. package/src/reasoning/__init__.py +332 -0
  17. package/src/reasoning/business_summary.py +431 -0
  18. package/src/reasoning/data_understanding.py +356 -0
  19. package/src/reasoning/model_explanation.py +383 -0
  20. package/src/reasoning/reasoning_trace.py +239 -0
  21. package/src/registry/__init__.py +3 -0
  22. package/src/registry/tools_registry.py +3 -0
  23. package/src/session_memory.py +448 -0
  24. package/src/session_store.py +370 -0
  25. package/src/storage/__init__.py +19 -0
  26. package/src/storage/artifact_store.py +620 -0
  27. package/src/storage/helpers.py +116 -0
  28. package/src/storage/huggingface_storage.py +694 -0
  29. package/src/storage/r2_storage.py +0 -0
  30. package/src/storage/user_files_service.py +288 -0
  31. package/src/tools/__init__.py +335 -0
  32. package/src/tools/advanced_analysis.py +823 -0
  33. package/src/tools/advanced_feature_engineering.py +708 -0
  34. package/src/tools/advanced_insights.py +578 -0
  35. package/src/tools/advanced_preprocessing.py +549 -0
  36. package/src/tools/advanced_training.py +906 -0
  37. package/src/tools/agent_tool_mapping.py +326 -0
  38. package/src/tools/auto_pipeline.py +420 -0
  39. package/src/tools/autogluon_training.py +1480 -0
  40. package/src/tools/business_intelligence.py +860 -0
  41. package/src/tools/cloud_data_sources.py +581 -0
  42. package/src/tools/code_interpreter.py +390 -0
  43. package/src/tools/computer_vision.py +614 -0
  44. package/src/tools/data_cleaning.py +614 -0
  45. package/src/tools/data_profiling.py +593 -0
  46. package/src/tools/data_type_conversion.py +268 -0
  47. package/src/tools/data_wrangling.py +433 -0
  48. package/src/tools/eda_reports.py +284 -0
  49. package/src/tools/enhanced_feature_engineering.py +241 -0
  50. package/src/tools/feature_engineering.py +302 -0
  51. package/src/tools/matplotlib_visualizations.py +1327 -0
  52. package/src/tools/model_training.py +520 -0
  53. package/src/tools/nlp_text_analytics.py +761 -0
  54. package/src/tools/plotly_visualizations.py +497 -0
  55. package/src/tools/production_mlops.py +852 -0
  56. package/src/tools/time_series.py +507 -0
  57. package/src/tools/tools_registry.py +2133 -0
  58. package/src/tools/visualization_engine.py +559 -0
  59. package/src/utils/__init__.py +42 -0
  60. package/src/utils/error_recovery.py +313 -0
  61. package/src/utils/parallel_executor.py +402 -0
  62. package/src/utils/polars_helpers.py +248 -0
  63. package/src/utils/schema_extraction.py +132 -0
  64. package/src/utils/semantic_layer.py +392 -0
  65. package/src/utils/token_budget.py +411 -0
  66. package/src/utils/validation.py +377 -0
  67. package/src/workflow_state.py +154 -0
@@ -0,0 +1,614 @@
1
+ """
2
+ Data Cleaning Tools
3
+ Tools for handling missing values, outliers, and data type issues.
4
+ """
5
+
6
+ import polars as pl
7
+ import numpy as np
8
+ from typing import Dict, Any, List, Optional
9
+ from pathlib import Path
10
+ import sys
11
+ import os
12
+
13
+ # Add parent directory to path for imports
14
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
15
+
16
+ from ds_agent.utils.polars_helpers import (
17
+ load_dataframe,
18
+ save_dataframe,
19
+ get_numeric_columns,
20
+ get_categorical_columns,
21
+ get_datetime_columns,
22
+ detect_id_columns,
23
+ )
24
+ from ds_agent.utils.validation import (
25
+ validate_file_exists,
26
+ validate_file_format,
27
+ validate_dataframe,
28
+ validate_columns_exist,
29
+ )
30
+
31
+
32
def clean_missing_values(file_path: str, strategy,
                         output_path: str, threshold: float = 0.4) -> Dict[str, Any]:
    """
    Handle missing values using appropriate strategies with smart threshold-based column dropping.

    Args:
        file_path: Path to CSV or Parquet file
        strategy: Either "auto" (string) to automatically decide strategies for all columns,
                  a single strategy name ('median', 'mean', 'mode', 'forward_fill', 'drop')
                  applied to every column with missing values,
                  'iterative'/'mice' for multivariate imputation of numeric columns,
                  or a dictionary mapping column names to per-column strategies
        output_path: Path to save cleaned dataset
        threshold: For "auto" strategy, drop columns with missing % > threshold (default: 0.4 = 40%)

    Returns:
        Dictionary with cleaning report, or {"success": False, ...} on invalid strategy
        or a missing optional dependency.

    Auto Strategy Behavior:
        1. Drop columns with >threshold missing (default 40%)
        2. Impute numeric columns with median
        3. Impute categorical columns with mode
        4. Forward-fill for time series (datetime) columns
        5. Drop rows with missing ID values
    """
    # Validation
    validate_file_exists(file_path)
    validate_file_format(file_path)

    # Load data
    df = load_dataframe(file_path)
    validate_dataframe(df)

    # Column type information drives the per-column strategy choices below.
    numeric_cols = get_numeric_columns(df)
    categorical_cols = get_categorical_columns(df)
    datetime_cols = get_datetime_columns(df)
    id_cols = detect_id_columns(df)

    report = {
        "original_rows": len(df),
        "original_columns": len(df.columns),
        "columns_dropped": [],
        "columns_processed": {},
        "rows_dropped": 0,
        "threshold_used": threshold
    }

    # Handle string strategy modes by normalizing them into a per-column dict.
    if isinstance(strategy, str):
        if strategy == "auto":
            # Step 1: Identify and drop high-missing columns (>threshold)
            cols_to_drop = []
            for col in df.columns:
                null_count = df[col].null_count()
                null_pct = null_count / len(df) if len(df) > 0 else 0

                if null_pct > threshold:
                    cols_to_drop.append(col)
                    report["columns_dropped"].append({
                        "column": col,
                        "missing_percentage": round(null_pct * 100, 2),
                        "reason": f"Missing >{threshold*100}% of values"
                    })

            # Drop high-missing columns
            if cols_to_drop:
                df = df.drop(cols_to_drop)
                print(f"🗑️ Dropped {len(cols_to_drop)} columns with >{threshold*100}% missing:")
                for col_info in report["columns_dropped"]:
                    print(f" - {col_info['column']} ({col_info['missing_percentage']}% missing)")

            # Step 2: Build a per-column strategy dict for remaining columns
            strategy = {}
            for col in df.columns:
                if df[col].null_count() > 0:
                    if col in id_cols:
                        strategy[col] = "drop"  # Drop rows with missing IDs
                    elif col in datetime_cols:
                        strategy[col] = "forward_fill"  # Forward fill for time series
                    elif col in numeric_cols:
                        strategy[col] = "median"  # Median for numeric (robust to outliers)
                    elif col in categorical_cols:
                        strategy[col] = "mode"  # Mode for categorical
                    else:
                        strategy[col] = "mode"  # Default to mode

            print(f"🔧 Auto-detected strategies for {len(strategy)} remaining columns with missing values")

        elif strategy in ["median", "mean", "mode", "forward_fill", "drop"]:
            # Apply the same strategy to all columns with missing values.
            # (Fix: original printed list(strategy_dict.values())[0], which
            # showed '{}' for an empty dict; the strategy name is always the
            # same string, so print it directly.)
            strategy_dict = {}
            for col in df.columns:
                if df[col].null_count() > 0:
                    strategy_dict[col] = strategy
            print(f"🔧 Applying '{strategy}' strategy to {len(strategy_dict)} columns with missing values")
            strategy = strategy_dict

        elif strategy in ["iterative", "mice"]:
            # MICE / Iterative Imputation using sklearn IterativeImputer.
            # This handles ALL numeric columns at once (multivariate imputation).
            print(f"🔧 Applying Iterative (MICE) imputation to numeric columns...")
            try:
                from sklearn.experimental import enable_iterative_imputer  # noqa: F401
                from sklearn.impute import IterativeImputer
                from sklearn.linear_model import BayesianRidge

                # Record null counts BEFORE imputation (fix: the original read
                # them after imputation, so "nulls_before" was always 0).
                nulls_before_map = {
                    col: int(df[col].null_count())
                    for col in numeric_cols
                    if df[col].null_count() > 0
                }
                numeric_cols_with_nulls = list(nulls_before_map)

                if not numeric_cols_with_nulls:
                    print(" ℹ️ No numeric columns with missing values for MICE imputation")
                else:
                    # Convert numeric columns to pandas for IterativeImputer
                    df_pd = df.select(numeric_cols).to_pandas()

                    # Fit and transform; fixed random_state keeps runs reproducible
                    imputer = IterativeImputer(
                        estimator=BayesianRidge(),
                        max_iter=10,
                        random_state=42,
                        missing_values=float('nan')
                    )
                    imputed_data = imputer.fit_transform(df_pd)

                    # Replace columns back in the Polars DataFrame
                    for i, col_name in enumerate(numeric_cols):
                        df = df.with_columns(
                            pl.Series(col_name, imputed_data[:, i])
                        )

                    for col_name in numeric_cols_with_nulls:
                        report["columns_processed"][col_name] = {
                            "status": "success",
                            "strategy": "iterative_mice",
                            "nulls_before": nulls_before_map[col_name],
                            "nulls_after": int(df[col_name].null_count())
                        }

                    print(f" ✅ MICE imputed {len(numeric_cols_with_nulls)} numeric columns using {len(numeric_cols)} features")

                # Handle remaining non-numeric columns with mode imputation.
                # (Fix: capture nulls_before before fill_null and report the
                # actual post-fill count instead of a hard-coded 0.)
                for col in df.columns:
                    if df[col].null_count() > 0 and col not in numeric_cols:
                        nulls_before = int(df[col].null_count())
                        mode_val = df[col].drop_nulls().mode().first()
                        if mode_val is not None:
                            df = df.with_columns(
                                pl.col(col).fill_null(mode_val).alias(col)
                            )
                            report["columns_processed"][col] = {
                                "status": "success",
                                "strategy": "mode (non-numeric fallback)",
                                "nulls_before": nulls_before,
                                "nulls_after": int(df[col].null_count())
                            }

            except ImportError:
                return {
                    "success": False,
                    "error": "IterativeImputer requires scikit-learn >= 1.4. Install with: pip install scikit-learn>=1.4",
                    "error_type": "MissingDependency"
                }

            # Skip per-column processing for MICE (already handled above)
            strategy = {}

        else:
            return {
                "success": False,
                "error": f"Invalid strategy '{strategy}'. Use 'auto', 'median', 'mean', 'mode', 'forward_fill', 'drop', 'iterative', 'mice', or provide a dictionary.",
                "error_type": "ValueError"
            }

    # Process each column based on its assigned strategy
    for col, strat in strategy.items():
        if col not in df.columns:
            report["columns_processed"][col] = {
                "status": "error",
                "message": "Column not found (may have been dropped)"
            }
            continue

        null_count_before = df[col].null_count()

        if null_count_before == 0:
            report["columns_processed"][col] = {
                "status": "skipped",
                "message": "No missing values"
            }
            continue

        # Don't impute ID columns - drop rows instead
        if col in id_cols and strat != "drop":
            report["columns_processed"][col] = {
                "status": "skipped",
                "message": "ID column - not imputed (use 'drop' to remove rows)"
            }
            continue

        # Apply strategy
        try:
            rows_before = len(df)

            if strat == "median":
                if col in numeric_cols:
                    median_val = df[col].median()
                    df = df.with_columns(
                        pl.col(col).fill_null(median_val).alias(col)
                    )
                    report["columns_processed"][col] = {
                        "status": "success",
                        "strategy": "median",
                        "nulls_before": int(null_count_before),
                        "nulls_after": int(df[col].null_count()),
                        "fill_value": float(median_val)
                    }
                else:
                    report["columns_processed"][col] = {
                        "status": "error",
                        "message": "Cannot use median on non-numeric column"
                    }
                    continue

            elif strat == "mean":
                if col in numeric_cols:
                    mean_val = df[col].mean()
                    df = df.with_columns(
                        pl.col(col).fill_null(mean_val).alias(col)
                    )
                    report["columns_processed"][col] = {
                        "status": "success",
                        "strategy": "mean",
                        "nulls_before": int(null_count_before),
                        "nulls_after": int(df[col].null_count()),
                        "fill_value": float(mean_val)
                    }
                else:
                    report["columns_processed"][col] = {
                        "status": "error",
                        "message": "Cannot use mean on non-numeric column"
                    }
                    continue

            elif strat == "mode":
                mode_val = df[col].drop_nulls().mode().first()
                if mode_val is not None:
                    df = df.with_columns(
                        pl.col(col).fill_null(mode_val).alias(col)
                    )
                    report["columns_processed"][col] = {
                        "status": "success",
                        "strategy": "mode",
                        "nulls_before": int(null_count_before),
                        "nulls_after": int(df[col].null_count()),
                        "fill_value": str(mode_val)
                    }

            elif strat == "forward_fill":
                # NOTE(review): forward_fill leaves leading nulls in place if
                # the column starts with missing values.
                df = df.with_columns(
                    pl.col(col).forward_fill().alias(col)
                )
                report["columns_processed"][col] = {
                    "status": "success",
                    "strategy": "forward_fill",
                    "nulls_before": int(null_count_before),
                    "nulls_after": int(df[col].null_count())
                }

            elif strat == "drop":
                df = df.filter(pl.col(col).is_not_null())
                rows_after = len(df)
                report["columns_processed"][col] = {
                    "status": "success",
                    "strategy": "drop",
                    "nulls_before": int(null_count_before),
                    "rows_dropped": rows_before - rows_after
                }

            else:
                report["columns_processed"][col] = {
                    "status": "error",
                    "message": f"Unknown strategy: {strat}"
                }
                continue

        except Exception as e:
            # Record the failure per-column so the rest of the cleanup proceeds
            report["columns_processed"][col] = {
                "status": "error",
                "message": str(e)
            }

    report["final_rows"] = len(df)
    report["final_columns"] = len(df.columns)
    report["rows_dropped"] = report["original_rows"] - report["final_rows"]
    report["columns_dropped_count"] = len(report["columns_dropped"])

    # Save cleaned dataset
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    save_dataframe(df, output_path)
    report["output_path"] = output_path

    # Summary message
    report["message"] = f"Cleaned {report['original_rows']} rows → {report['final_rows']} rows. "
    report["message"] += f"Dropped {report['columns_dropped_count']} columns. "
    report["message"] += f"Processed {len([c for c in report['columns_processed'].values() if c['status'] == 'success'])} columns."

    return report
339
+
340
+
341
def handle_outliers(file_path: str, strategy: str, columns: List[str],
                    output_path: str) -> Dict[str, Any]:
    """
    Detect and handle outliers in numeric columns using the 1.5*IQR rule.

    Args:
        file_path: Path to CSV or Parquet file
        strategy: Method to handle outliers ('clip', 'cap', 'winsorize', 'remove')
        columns: List of columns to check, or ['all'] for all numeric columns
        output_path: Path to save cleaned dataset

    Returns:
        Dictionary with outlier handling report, or {"success": False, ...}
        when the strategy is unknown or no valid columns remain.
    """
    # Reject unknown strategies up front (fix: the original silently reported
    # per-column "success" while doing nothing for an unrecognized strategy).
    if strategy not in ("clip", "cap", "winsorize", "remove"):
        return {
            "success": False,
            "error": f"Invalid strategy '{strategy}'. Use 'clip', 'cap', 'winsorize', or 'remove'.",
            "error_type": "ValueError"
        }

    # Validation
    validate_file_exists(file_path)
    validate_file_format(file_path)

    # Load data
    df = load_dataframe(file_path)
    validate_dataframe(df)

    # Determine which columns to process
    numeric_cols = get_numeric_columns(df)

    if columns == ["all"]:
        target_cols = numeric_cols
    else:
        # Filter to only existing numeric columns (auto-skip dropped columns)
        target_cols = []
        for col in columns:
            if col not in df.columns:
                print(f"⚠️ Skipping '{col}' - column was dropped in previous step")
                continue
            if col not in numeric_cols:
                print(f"⚠️ Skipping '{col}' - not numeric")
                continue
            target_cols.append(col)

    # If no valid columns remain, return early
    if not target_cols:
        return {
            "success": False,
            "error": f"None of the requested columns exist in the dataset. Available numeric columns: {', '.join(numeric_cols[:20])}",
            "error_type": "ValueError"
        }

    report = {
        "original_rows": len(df),
        "strategy": strategy,
        "columns_processed": {}
    }

    # Process each column independently; bounds are recomputed per column.
    for col in target_cols:
        col_data = df[col].drop_nulls()

        if len(col_data) == 0:
            report["columns_processed"][col] = {
                "status": "skipped",
                "message": "All values are null"
            }
            continue

        # IQR fences: anything outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR] is an outlier
        q1 = col_data.quantile(0.25)
        q3 = col_data.quantile(0.75)
        iqr = q3 - q1

        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr

        # Count outliers (null comparisons yield null and are ignored by sum)
        outliers_mask = (df[col] < lower_bound) | (df[col] > upper_bound)
        outlier_count = outliers_mask.sum()

        if outlier_count == 0:
            report["columns_processed"][col] = {
                "status": "skipped",
                "message": "No outliers detected"
            }
            continue

        # Apply strategy
        if strategy == "clip" or strategy == "cap":
            # Clip/cap values to the IQR bounds
            df = df.with_columns(
                pl.col(col).clip(lower_bound, upper_bound).alias(col)
            )

        elif strategy == "winsorize":
            # Winsorize: cap at 1st and 99th percentiles
            p1 = col_data.quantile(0.01)
            p99 = col_data.quantile(0.99)
            df = df.with_columns(
                pl.col(col).clip(p1, p99).alias(col)
            )

        elif strategy == "remove":
            # Remove rows with outliers. Null mask entries are filled with
            # False so rows with missing values are KEPT (fix: the original
            # negated a null-propagating mask, and filter treats null as
            # false, so null rows were silently dropped along with outliers).
            df = df.filter(~outliers_mask.fill_null(False))

        report["columns_processed"][col] = {
            "status": "success",
            "outliers_detected": int(outlier_count),
            "bounds": {
                "lower": float(lower_bound),
                "upper": float(upper_bound)
            }
        }

    report["final_rows"] = len(df)
    report["rows_dropped"] = report["original_rows"] - report["final_rows"]

    # Save cleaned dataset
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    save_dataframe(df, output_path)
    report["output_path"] = output_path

    return report
461
+
462
+
463
def fix_data_types(file_path: str, type_mapping: Optional[Dict[str, str]] = None,
                   output_path: Optional[str] = None) -> Dict[str, Any]:
    """
    Auto-detect and fix incorrect data types.

    Args:
        file_path: Path to CSV or Parquet file
        type_mapping: Optional dictionary mapping columns to target types
                      ('int', 'float', 'string', 'date', 'bool', 'category')
                      Use 'auto' or None for automatic detection
        output_path: Path to save dataset with fixed types (required)

    Returns:
        Dictionary with type fixing report, or {"success": False, ...} when
        output_path is missing.
    """
    # Fail fast with a clear error dict, consistent with the other cleaning
    # tools (fix: the original defaulted output_path to None and crashed with
    # a TypeError in Path(None) only after all conversion work was done).
    if not output_path:
        return {
            "success": False,
            "error": "output_path is required",
            "error_type": "ValueError"
        }

    # Validation
    validate_file_exists(file_path)
    validate_file_format(file_path)

    # Load data
    df = load_dataframe(file_path)
    validate_dataframe(df)

    # Treat the {"auto": "auto"} sentinel the same as "no explicit mapping"
    if type_mapping is None or type_mapping == {"auto": "auto"}:
        type_mapping = {}

    report = {
        "columns_processed": {}
    }

    for col in df.columns:
        original_dtype = str(df[col].dtype)

        # Use the explicit mapping when given, otherwise auto-detect
        if col in type_mapping and type_mapping[col] != "auto":
            target_type = type_mapping[col]
        else:
            target_type = _auto_detect_type(df[col])

        if target_type is None:
            report["columns_processed"][col] = {
                "status": "skipped",
                "original_dtype": original_dtype,
                "message": "Could not auto-detect type"
            }
            continue

        # Try to convert; strict=False turns unconvertible values into nulls
        # instead of raising, and any hard failure is recorded per column.
        try:
            if target_type == "int":
                df = df.with_columns(
                    pl.col(col).cast(pl.Int64, strict=False).alias(col)
                )
            elif target_type == "float":
                df = df.with_columns(
                    pl.col(col).cast(pl.Float64, strict=False).alias(col)
                )
            elif target_type == "string":
                df = df.with_columns(
                    pl.col(col).cast(pl.Utf8).alias(col)
                )
            elif target_type == "date":
                df = df.with_columns(
                    pl.col(col).str.strptime(pl.Date, "%Y-%m-%d", strict=False).alias(col)
                )
            elif target_type == "bool":
                df = df.with_columns(
                    pl.col(col).cast(pl.Boolean, strict=False).alias(col)
                )
            elif target_type == "category":
                df = df.with_columns(
                    pl.col(col).cast(pl.Categorical).alias(col)
                )

            new_dtype = str(df[col].dtype)

            report["columns_processed"][col] = {
                "status": "success",
                "original_dtype": original_dtype,
                "new_dtype": new_dtype,
                "target_type": target_type
            }

        except Exception as e:
            report["columns_processed"][col] = {
                "status": "error",
                "original_dtype": original_dtype,
                "target_type": target_type,
                "message": str(e)
            }

    # Save dataset
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    save_dataframe(df, output_path)
    report["output_path"] = output_path

    return report
561
+
562
+
563
def _auto_detect_type(series: pl.Series) -> Optional[str]:
    """
    Auto-detect an appropriate target type for a series.

    Args:
        series: Polars series

    Returns:
        One of 'bool', 'int', 'float', 'date', 'category', or None when the
        series already has a suitable type or no better type can be inferred.
    """
    # Already correct type: numeric, temporal, and boolean columns need no
    # conversion (fix: the original fell through for Boolean and could
    # pointlessly recommend 'category' for a low-cardinality bool column).
    if series.dtype in pl.NUMERIC_DTYPES:
        return None

    if series.dtype in [pl.Date, pl.Datetime, pl.Boolean]:
        return None

    # Try to detect from string values
    if series.dtype == pl.Utf8:
        # A 100-value sample keeps detection cheap on large columns
        sample = series.drop_nulls().head(100)

        if len(sample) == 0:
            return None

        # Check for boolean-like string values
        unique_vals = set(str(v).lower() for v in sample.to_list())
        if unique_vals.issubset({'true', 'false', '1', '0', 'yes', 'no', 't', 'f'}):
            return "bool"

        # Check for numeric: a strict Float64 cast raises on non-numeric text
        try:
            sample.cast(pl.Float64)
            # All-integer text (no decimal point) maps to int
            if all('.' not in str(v) for v in sample.to_list() if v is not None):
                return "int"
            return "float"
        except Exception:  # fix: narrowed from a bare except
            pass

        # Check for ISO-formatted date strings
        try:
            sample.str.strptime(pl.Date, "%Y-%m-%d", strict=False)
            return "date"
        except Exception:  # fix: narrowed from a bare except
            pass

        # Low-cardinality strings are better stored as categorical
        n_unique = series.n_unique()
        if n_unique < len(series) * 0.5 and n_unique < 100:
            return "category"

    return None