ds-agent-cli 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/ds-agent.js +451 -0
- package/ds_agent/__init__.py +8 -0
- package/package.json +28 -0
- package/requirements.txt +126 -0
- package/setup.py +35 -0
- package/src/__init__.py +7 -0
- package/src/_compress_tool_result.py +118 -0
- package/src/api/__init__.py +4 -0
- package/src/api/app.py +1626 -0
- package/src/cache/__init__.py +5 -0
- package/src/cache/cache_manager.py +561 -0
- package/src/cli.py +2886 -0
- package/src/dynamic_prompts.py +281 -0
- package/src/orchestrator.py +4799 -0
- package/src/progress_manager.py +139 -0
- package/src/reasoning/__init__.py +332 -0
- package/src/reasoning/business_summary.py +431 -0
- package/src/reasoning/data_understanding.py +356 -0
- package/src/reasoning/model_explanation.py +383 -0
- package/src/reasoning/reasoning_trace.py +239 -0
- package/src/registry/__init__.py +3 -0
- package/src/registry/tools_registry.py +3 -0
- package/src/session_memory.py +448 -0
- package/src/session_store.py +370 -0
- package/src/storage/__init__.py +19 -0
- package/src/storage/artifact_store.py +620 -0
- package/src/storage/helpers.py +116 -0
- package/src/storage/huggingface_storage.py +694 -0
- package/src/storage/r2_storage.py +0 -0
- package/src/storage/user_files_service.py +288 -0
- package/src/tools/__init__.py +335 -0
- package/src/tools/advanced_analysis.py +823 -0
- package/src/tools/advanced_feature_engineering.py +708 -0
- package/src/tools/advanced_insights.py +578 -0
- package/src/tools/advanced_preprocessing.py +549 -0
- package/src/tools/advanced_training.py +906 -0
- package/src/tools/agent_tool_mapping.py +326 -0
- package/src/tools/auto_pipeline.py +420 -0
- package/src/tools/autogluon_training.py +1480 -0
- package/src/tools/business_intelligence.py +860 -0
- package/src/tools/cloud_data_sources.py +581 -0
- package/src/tools/code_interpreter.py +390 -0
- package/src/tools/computer_vision.py +614 -0
- package/src/tools/data_cleaning.py +614 -0
- package/src/tools/data_profiling.py +593 -0
- package/src/tools/data_type_conversion.py +268 -0
- package/src/tools/data_wrangling.py +433 -0
- package/src/tools/eda_reports.py +284 -0
- package/src/tools/enhanced_feature_engineering.py +241 -0
- package/src/tools/feature_engineering.py +302 -0
- package/src/tools/matplotlib_visualizations.py +1327 -0
- package/src/tools/model_training.py +520 -0
- package/src/tools/nlp_text_analytics.py +761 -0
- package/src/tools/plotly_visualizations.py +497 -0
- package/src/tools/production_mlops.py +852 -0
- package/src/tools/time_series.py +507 -0
- package/src/tools/tools_registry.py +2133 -0
- package/src/tools/visualization_engine.py +559 -0
- package/src/utils/__init__.py +42 -0
- package/src/utils/error_recovery.py +313 -0
- package/src/utils/parallel_executor.py +402 -0
- package/src/utils/polars_helpers.py +248 -0
- package/src/utils/schema_extraction.py +132 -0
- package/src/utils/semantic_layer.py +392 -0
- package/src/utils/token_budget.py +411 -0
- package/src/utils/validation.py +377 -0
- package/src/workflow_state.py +154 -0
|
@@ -0,0 +1,614 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Data Cleaning Tools
|
|
3
|
+
Tools for handling missing values, outliers, and data type issues.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import polars as pl
|
|
7
|
+
import numpy as np
|
|
8
|
+
from typing import Dict, Any, List, Optional, Union
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
import sys
|
|
11
|
+
import os
|
|
12
|
+
|
|
13
|
+
# Add parent directory to path for imports
|
|
14
|
+
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
|
15
|
+
|
|
16
|
+
from ds_agent.utils.polars_helpers import (
|
|
17
|
+
load_dataframe,
|
|
18
|
+
save_dataframe,
|
|
19
|
+
get_numeric_columns,
|
|
20
|
+
get_categorical_columns,
|
|
21
|
+
get_datetime_columns,
|
|
22
|
+
detect_id_columns,
|
|
23
|
+
)
|
|
24
|
+
from ds_agent.utils.validation import (
|
|
25
|
+
validate_file_exists,
|
|
26
|
+
validate_file_format,
|
|
27
|
+
validate_dataframe,
|
|
28
|
+
validate_columns_exist,
|
|
29
|
+
)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def clean_missing_values(file_path: str, strategy: Union[str, Dict[str, str]],
                         output_path: str, threshold: float = 0.4) -> Dict[str, Any]:
    """
    Handle missing values using appropriate strategies with smart threshold-based column dropping.

    Args:
        file_path: Path to CSV or Parquet file
        strategy: Either "auto" (string) to automatically decide strategies for all columns,
                 a single strategy name applied to every column with missing values
                 ('median', 'mean', 'mode', 'forward_fill', 'drop'),
                 'iterative'/'mice' for multivariate imputation of numeric columns,
                 or a dictionary mapping column names to strategies
                 ('median', 'mean', 'mode', 'forward_fill', 'drop')
        output_path: Path to save cleaned dataset
        threshold: For "auto" strategy, drop columns with missing % > threshold (default: 0.4 = 40%)

    Returns:
        Dictionary with cleaning report

    Auto Strategy Behavior:
        1. Drop columns with >threshold missing (default 40%)
        2. Impute numeric columns with median
        3. Impute categorical columns with mode
        4. Forward-fill for time series columns
    """
    # Validation
    validate_file_exists(file_path)
    validate_file_format(file_path)

    # Load data
    df = load_dataframe(file_path)
    validate_dataframe(df)

    # Get column type information
    numeric_cols = get_numeric_columns(df)
    categorical_cols = get_categorical_columns(df)
    datetime_cols = get_datetime_columns(df)
    id_cols = detect_id_columns(df)

    report = {
        "original_rows": len(df),
        "original_columns": len(df.columns),
        "columns_dropped": [],
        "columns_processed": {},
        "rows_dropped": 0,
        "threshold_used": threshold
    }

    # Handle string strategy modes
    if isinstance(strategy, str):
        if strategy == "auto":
            # Step 1: Identify and drop high-missing columns (>threshold)
            cols_to_drop = []
            for col in df.columns:
                null_count = df[col].null_count()
                null_pct = null_count / len(df) if len(df) > 0 else 0

                if null_pct > threshold:
                    cols_to_drop.append(col)
                    report["columns_dropped"].append({
                        "column": col,
                        "missing_percentage": round(null_pct * 100, 2),
                        "reason": f"Missing >{threshold*100}% of values"
                    })

            # Drop high-missing columns
            if cols_to_drop:
                df = df.drop(cols_to_drop)
                print(f"🗑️ Dropped {len(cols_to_drop)} columns with >{threshold*100}% missing:")
                for col_info in report["columns_dropped"]:
                    print(f" - {col_info['column']} ({col_info['missing_percentage']}% missing)")

            # Step 2: Build strategy for remaining columns
            strategy = {}
            for col in df.columns:
                if df[col].null_count() > 0:
                    if col in id_cols:
                        strategy[col] = "drop"  # Drop rows with missing IDs
                    elif col in datetime_cols:
                        strategy[col] = "forward_fill"  # Forward fill for time series
                    elif col in numeric_cols:
                        strategy[col] = "median"  # Median for numeric (robust to outliers)
                    elif col in categorical_cols:
                        strategy[col] = "mode"  # Mode for categorical
                    else:
                        strategy[col] = "mode"  # Default to mode

            print(f"🔧 Auto-detected strategies for {len(strategy)} remaining columns with missing values")

        elif strategy in ["median", "mean", "mode", "forward_fill", "drop"]:
            # Apply same strategy to all columns with missing values
            strategy_dict = {}
            for col in df.columns:
                if df[col].null_count() > 0:
                    strategy_dict[col] = strategy
            strategy = strategy_dict
            print(f"🔧 Applying '{list(strategy_dict.values())[0] if strategy_dict else strategy}' strategy to {len(strategy_dict)} columns with missing values")

        elif strategy in ["iterative", "mice"]:
            # MICE / Iterative Imputation using sklearn IterativeImputer
            # This handles ALL numeric columns at once (multivariate imputation)
            print(f"🔧 Applying Iterative (MICE) imputation to numeric columns...")
            try:
                from sklearn.experimental import enable_iterative_imputer  # noqa: F401
                from sklearn.impute import IterativeImputer
                from sklearn.linear_model import BayesianRidge
                import pandas as pd

                # Identify numeric columns with missing values
                numeric_cols_with_nulls = [
                    col for col in numeric_cols if df[col].null_count() > 0
                ]

                if not numeric_cols_with_nulls:
                    print(" ℹ️ No numeric columns with missing values for MICE imputation")
                else:
                    # FIX: capture null counts BEFORE imputation. The original
                    # read df[col].null_count() after fit_transform, so
                    # "nulls_before" in the report was always 0.
                    nulls_before_by_col = {
                        col: int(df[col].null_count()) for col in numeric_cols_with_nulls
                    }

                    # Convert numeric columns to pandas for IterativeImputer
                    df_pd = df.select(numeric_cols).to_pandas()

                    # Fit and transform
                    imputer = IterativeImputer(
                        estimator=BayesianRidge(),
                        max_iter=10,
                        random_state=42,
                        missing_values=float('nan')
                    )
                    imputed_data = imputer.fit_transform(df_pd)

                    # Replace columns back in Polars DataFrame
                    for i, col_name in enumerate(numeric_cols):
                        df = df.with_columns(
                            pl.Series(col_name, imputed_data[:, i])
                        )

                    for col_name in numeric_cols_with_nulls:
                        report["columns_processed"][col_name] = {
                            "status": "success",
                            "strategy": "iterative_mice",
                            "nulls_before": nulls_before_by_col[col_name],
                            "nulls_after": 0
                        }

                    print(f" ✅ MICE imputed {len(numeric_cols_with_nulls)} numeric columns using {len(numeric_cols)} features")

                # Handle remaining non-numeric columns with mode
                for col in df.columns:
                    if df[col].null_count() > 0 and col not in numeric_cols:
                        # FIX: record the null count before filling, not after
                        # (the original computed it post-fill, always yielding 0).
                        nulls_before = int(df[col].null_count())
                        mode_val = df[col].drop_nulls().mode().first()
                        if mode_val is not None:
                            df = df.with_columns(
                                pl.col(col).fill_null(mode_val).alias(col)
                            )
                            report["columns_processed"][col] = {
                                "status": "success",
                                "strategy": "mode (non-numeric fallback)",
                                "nulls_before": nulls_before,
                                "nulls_after": 0
                            }

            except ImportError:
                return {
                    "success": False,
                    "error": "IterativeImputer requires scikit-learn >= 1.4. Install with: pip install scikit-learn>=1.4",
                    "error_type": "MissingDependency"
                }

            # Skip per-column processing for MICE (already handled above)
            strategy = {}

        else:
            return {
                "success": False,
                "error": f"Invalid strategy '{strategy}'. Use 'auto', 'median', 'mean', 'mode', 'forward_fill', 'drop', 'iterative', 'mice', or provide a dictionary.",
                "error_type": "ValueError"
            }

    # Process each column based on strategy
    for col, strat in strategy.items():
        if col not in df.columns:
            report["columns_processed"][col] = {
                "status": "error",
                "message": f"Column not found (may have been dropped)"
            }
            continue

        null_count_before = df[col].null_count()

        if null_count_before == 0:
            report["columns_processed"][col] = {
                "status": "skipped",
                "message": "No missing values"
            }
            continue

        # Don't impute ID columns - drop rows instead
        if col in id_cols and strat != "drop":
            report["columns_processed"][col] = {
                "status": "skipped",
                "message": "ID column - not imputed (use 'drop' to remove rows)"
            }
            continue

        # Apply strategy
        try:
            rows_before = len(df)

            if strat == "median":
                if col in numeric_cols:
                    median_val = df[col].median()
                    df = df.with_columns(
                        pl.col(col).fill_null(median_val).alias(col)
                    )
                    report["columns_processed"][col] = {
                        "status": "success",
                        "strategy": "median",
                        "nulls_before": int(null_count_before),
                        "nulls_after": int(df[col].null_count()),
                        "fill_value": float(median_val)
                    }
                else:
                    report["columns_processed"][col] = {
                        "status": "error",
                        "message": "Cannot use median on non-numeric column"
                    }
                    continue

            elif strat == "mean":
                if col in numeric_cols:
                    mean_val = df[col].mean()
                    df = df.with_columns(
                        pl.col(col).fill_null(mean_val).alias(col)
                    )
                    report["columns_processed"][col] = {
                        "status": "success",
                        "strategy": "mean",
                        "nulls_before": int(null_count_before),
                        "nulls_after": int(df[col].null_count()),
                        "fill_value": float(mean_val)
                    }
                else:
                    report["columns_processed"][col] = {
                        "status": "error",
                        "message": "Cannot use mean on non-numeric column"
                    }
                    continue

            elif strat == "mode":
                mode_val = df[col].drop_nulls().mode().first()
                if mode_val is not None:
                    df = df.with_columns(
                        pl.col(col).fill_null(mode_val).alias(col)
                    )
                    report["columns_processed"][col] = {
                        "status": "success",
                        "strategy": "mode",
                        "nulls_before": int(null_count_before),
                        "nulls_after": int(df[col].null_count()),
                        "fill_value": str(mode_val)
                    }
                else:
                    # FIX: the original silently recorded nothing when the
                    # column had no non-null values to take a mode from.
                    report["columns_processed"][col] = {
                        "status": "skipped",
                        "message": "All values are null - no mode available"
                    }

            elif strat == "forward_fill":
                # NOTE: leading nulls (before the first non-null value) remain,
                # which is why nulls_after is re-counted rather than assumed 0.
                df = df.with_columns(
                    pl.col(col).forward_fill().alias(col)
                )
                report["columns_processed"][col] = {
                    "status": "success",
                    "strategy": "forward_fill",
                    "nulls_before": int(null_count_before),
                    "nulls_after": int(df[col].null_count())
                }

            elif strat == "drop":
                df = df.filter(pl.col(col).is_not_null())
                rows_after = len(df)
                report["columns_processed"][col] = {
                    "status": "success",
                    "strategy": "drop",
                    "nulls_before": int(null_count_before),
                    "rows_dropped": rows_before - rows_after
                }

            else:
                report["columns_processed"][col] = {
                    "status": "error",
                    "message": f"Unknown strategy: {strat}"
                }
                continue

        except Exception as e:
            report["columns_processed"][col] = {
                "status": "error",
                "message": str(e)
            }

    report["final_rows"] = len(df)
    report["final_columns"] = len(df.columns)
    report["rows_dropped"] = report["original_rows"] - report["final_rows"]
    report["columns_dropped_count"] = len(report["columns_dropped"])

    # Save cleaned dataset
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    save_dataframe(df, output_path)
    report["output_path"] = output_path

    # Summary message
    report["message"] = f"Cleaned {report['original_rows']} rows → {report['final_rows']} rows. "
    report["message"] += f"Dropped {report['columns_dropped_count']} columns. "
    report["message"] += f"Processed {len([c for c in report['columns_processed'].values() if c['status'] == 'success'])} columns."

    return report
|
|
339
|
+
|
|
340
|
+
|
|
341
|
+
def handle_outliers(file_path: str, strategy: str, columns: List[str],
                    output_path: str) -> Dict[str, Any]:
    """
    Detect and handle outliers in numeric columns using the 1.5*IQR rule.

    Args:
        file_path: Path to CSV or Parquet file
        strategy: Method to handle outliers ('clip', 'cap', 'winsorize', 'remove')
        columns: List of columns to check, or ['all'] for all numeric columns
        output_path: Path to save cleaned dataset

    Returns:
        Dictionary with outlier handling report
    """
    # FIX: the original let an unknown strategy fall through the if/elif chain,
    # modifying nothing yet recording "status": "success" for every column.
    # Validate up front, matching the error-dict convention used elsewhere.
    if strategy not in ("clip", "cap", "winsorize", "remove"):
        return {
            "success": False,
            "error": f"Invalid strategy '{strategy}'. Use 'clip', 'cap', 'winsorize', or 'remove'.",
            "error_type": "ValueError"
        }

    # Validation
    validate_file_exists(file_path)
    validate_file_format(file_path)

    # Load data
    df = load_dataframe(file_path)
    validate_dataframe(df)

    # Determine which columns to process
    numeric_cols = get_numeric_columns(df)

    if columns == ["all"]:
        target_cols = numeric_cols
    else:
        # Filter to only existing numeric columns (auto-skip dropped columns)
        target_cols = []
        for col in columns:
            if col not in df.columns:
                print(f"⚠️ Skipping '{col}' - column was dropped in previous step")
                continue
            if col not in numeric_cols:
                print(f"⚠️ Skipping '{col}' - not numeric")
                continue
            target_cols.append(col)

    # If no valid columns remain, return early
    if not target_cols:
        return {
            "success": False,
            "error": f"None of the requested columns exist in the dataset. Available numeric columns: {', '.join(numeric_cols[:20])}",
            "error_type": "ValueError"
        }

    report = {
        "original_rows": len(df),
        "strategy": strategy,
        "columns_processed": {}
    }

    # Process each column
    for col in target_cols:
        col_data = df[col].drop_nulls()

        if len(col_data) == 0:
            report["columns_processed"][col] = {
                "status": "skipped",
                "message": "All values are null"
            }
            continue

        # Calculate IQR bounds
        q1 = col_data.quantile(0.25)
        q3 = col_data.quantile(0.75)
        iqr = q3 - q1

        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr

        # Count outliers (comparisons on nulls yield null, ignored by sum)
        outliers_mask = (df[col] < lower_bound) | (df[col] > upper_bound)
        outlier_count = outliers_mask.sum()

        if outlier_count == 0:
            report["columns_processed"][col] = {
                "status": "skipped",
                "message": "No outliers detected"
            }
            continue

        # Apply strategy
        if strategy == "clip" or strategy == "cap":
            # Clip/cap values to the IQR bounds
            df = df.with_columns(
                pl.col(col).clip(lower_bound, upper_bound).alias(col)
            )

        elif strategy == "winsorize":
            # Winsorize: cap at 1st and 99th percentiles
            p1 = col_data.quantile(0.01)
            p99 = col_data.quantile(0.99)
            df = df.with_columns(
                pl.col(col).clip(p1, p99).alias(col)
            )

        elif strategy == "remove":
            # Remove rows with outliers.
            # NOTE(review): rows where this column is null get a null predicate
            # and are also dropped by filter — confirm this is intended.
            df = df.filter(~outliers_mask)

        report["columns_processed"][col] = {
            "status": "success",
            "outliers_detected": int(outlier_count),
            "bounds": {
                "lower": float(lower_bound),
                "upper": float(upper_bound)
            }
        }

    report["final_rows"] = len(df)
    report["rows_dropped"] = report["original_rows"] - report["final_rows"]

    # Save cleaned dataset
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    save_dataframe(df, output_path)
    report["output_path"] = output_path

    return report
|
|
461
|
+
|
|
462
|
+
|
|
463
|
+
def fix_data_types(file_path: str, type_mapping: Optional[Dict[str, str]] = None,
                   output_path: Optional[str] = None) -> Dict[str, Any]:
    """
    Auto-detect and fix incorrect data types.

    Args:
        file_path: Path to CSV or Parquet file
        type_mapping: Optional dictionary mapping columns to target types
                     ('int', 'float', 'string', 'date', 'bool', 'category')
                     Use 'auto' or None for automatic detection
        output_path: Path to save dataset with fixed types (required at runtime)

    Returns:
        Dictionary with type fixing report
    """
    # FIX: output_path defaulted to None but was used unconditionally below
    # (Path(None) raises an opaque TypeError). Fail fast with a clear error,
    # matching the error-dict convention used elsewhere in this module.
    if output_path is None:
        return {
            "success": False,
            "error": "output_path is required",
            "error_type": "ValueError"
        }

    # Validation
    validate_file_exists(file_path)
    validate_file_format(file_path)

    # Load data
    df = load_dataframe(file_path)
    validate_dataframe(df)

    # Treat None / {"auto": "auto"} as "auto-detect every column"
    if type_mapping is None or type_mapping == {"auto": "auto"}:
        type_mapping = {}

    report = {
        "columns_processed": {}
    }

    for col in df.columns:
        original_dtype = str(df[col].dtype)

        # Get target type from mapping or auto-detect
        if col in type_mapping and type_mapping[col] != "auto":
            target_type = type_mapping[col]
        else:
            # Auto-detect target type
            target_type = _auto_detect_type(df[col])

        if target_type is None:
            report["columns_processed"][col] = {
                "status": "skipped",
                "original_dtype": original_dtype,
                "message": "Could not auto-detect type"
            }
            continue

        # Try to convert. strict=False turns unparseable values into nulls
        # instead of raising, so partial conversion still succeeds.
        try:
            if target_type == "int":
                df = df.with_columns(
                    pl.col(col).cast(pl.Int64, strict=False).alias(col)
                )
            elif target_type == "float":
                df = df.with_columns(
                    pl.col(col).cast(pl.Float64, strict=False).alias(col)
                )
            elif target_type == "string":
                df = df.with_columns(
                    pl.col(col).cast(pl.Utf8).alias(col)
                )
            elif target_type == "date":
                df = df.with_columns(
                    pl.col(col).str.strptime(pl.Date, "%Y-%m-%d", strict=False).alias(col)
                )
            elif target_type == "bool":
                df = df.with_columns(
                    pl.col(col).cast(pl.Boolean, strict=False).alias(col)
                )
            elif target_type == "category":
                df = df.with_columns(
                    pl.col(col).cast(pl.Categorical).alias(col)
                )

            new_dtype = str(df[col].dtype)

            report["columns_processed"][col] = {
                "status": "success",
                "original_dtype": original_dtype,
                "new_dtype": new_dtype,
                "target_type": target_type
            }

        except Exception as e:
            report["columns_processed"][col] = {
                "status": "error",
                "original_dtype": original_dtype,
                "target_type": target_type,
                "message": str(e)
            }

    # Save dataset
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    save_dataframe(df, output_path)
    report["output_path"] = output_path

    return report
|
|
561
|
+
|
|
562
|
+
|
|
563
|
+
def _auto_detect_type(series: pl.Series) -> Optional[str]:
    """
    Auto-detect an appropriate target type for a series.

    Checks, in order: already-typed (numeric/date -> None), boolean-like
    strings, numeric strings (int vs float), ISO dates, low-cardinality
    categoricals. Only the first 100 non-null values are sampled.

    Args:
        series: Polars series

    Returns:
        Detected type string ('bool', 'int', 'float', 'date', 'category')
        or None if no conversion is needed/possible
    """
    # Already correct type
    if series.dtype in pl.NUMERIC_DTYPES:
        return None

    if series.dtype in [pl.Date, pl.Datetime]:
        return None

    # Try to detect from string values
    if series.dtype == pl.Utf8:
        sample = series.drop_nulls().head(100)

        if len(sample) == 0:
            return None

        # Check for boolean
        unique_vals = set(str(v).lower() for v in sample.to_list())
        if unique_vals.issubset({'true', 'false', '1', '0', 'yes', 'no', 't', 'f'}):
            return "bool"

        # Check for numeric (cast raises on non-numeric strings by default)
        try:
            sample.cast(pl.Float64)
            # Check if all are integers
            if all('.' not in str(v) for v in sample.to_list() if v is not None):
                return "int"
            return "float"
        except Exception:
            pass

        # Check for date.
        # FIX: with strict=False, strptime never raises — it yields null for
        # unparseable values — so the original's try/except always "succeeded"
        # and classified EVERY remaining string column as "date". Require that
        # all sampled values actually parse before reporting a date.
        try:
            parsed = sample.str.strptime(pl.Date, "%Y-%m-%d", strict=False)
            if parsed.null_count() == 0:
                return "date"
        except Exception:
            pass

        # Check if should be categorical (low cardinality)
        n_unique = series.n_unique()
        if n_unique < len(series) * 0.5 and n_unique < 100:
            return "category"

    return None
|