ds-agent-cli 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. package/bin/ds-agent.js +451 -0
  2. package/ds_agent/__init__.py +8 -0
  3. package/package.json +28 -0
  4. package/requirements.txt +126 -0
  5. package/setup.py +35 -0
  6. package/src/__init__.py +7 -0
  7. package/src/_compress_tool_result.py +118 -0
  8. package/src/api/__init__.py +4 -0
  9. package/src/api/app.py +1626 -0
  10. package/src/cache/__init__.py +5 -0
  11. package/src/cache/cache_manager.py +561 -0
  12. package/src/cli.py +2886 -0
  13. package/src/dynamic_prompts.py +281 -0
  14. package/src/orchestrator.py +4799 -0
  15. package/src/progress_manager.py +139 -0
  16. package/src/reasoning/__init__.py +332 -0
  17. package/src/reasoning/business_summary.py +431 -0
  18. package/src/reasoning/data_understanding.py +356 -0
  19. package/src/reasoning/model_explanation.py +383 -0
  20. package/src/reasoning/reasoning_trace.py +239 -0
  21. package/src/registry/__init__.py +3 -0
  22. package/src/registry/tools_registry.py +3 -0
  23. package/src/session_memory.py +448 -0
  24. package/src/session_store.py +370 -0
  25. package/src/storage/__init__.py +19 -0
  26. package/src/storage/artifact_store.py +620 -0
  27. package/src/storage/helpers.py +116 -0
  28. package/src/storage/huggingface_storage.py +694 -0
  29. package/src/storage/r2_storage.py +0 -0
  30. package/src/storage/user_files_service.py +288 -0
  31. package/src/tools/__init__.py +335 -0
  32. package/src/tools/advanced_analysis.py +823 -0
  33. package/src/tools/advanced_feature_engineering.py +708 -0
  34. package/src/tools/advanced_insights.py +578 -0
  35. package/src/tools/advanced_preprocessing.py +549 -0
  36. package/src/tools/advanced_training.py +906 -0
  37. package/src/tools/agent_tool_mapping.py +326 -0
  38. package/src/tools/auto_pipeline.py +420 -0
  39. package/src/tools/autogluon_training.py +1480 -0
  40. package/src/tools/business_intelligence.py +860 -0
  41. package/src/tools/cloud_data_sources.py +581 -0
  42. package/src/tools/code_interpreter.py +390 -0
  43. package/src/tools/computer_vision.py +614 -0
  44. package/src/tools/data_cleaning.py +614 -0
  45. package/src/tools/data_profiling.py +593 -0
  46. package/src/tools/data_type_conversion.py +268 -0
  47. package/src/tools/data_wrangling.py +433 -0
  48. package/src/tools/eda_reports.py +284 -0
  49. package/src/tools/enhanced_feature_engineering.py +241 -0
  50. package/src/tools/feature_engineering.py +302 -0
  51. package/src/tools/matplotlib_visualizations.py +1327 -0
  52. package/src/tools/model_training.py +520 -0
  53. package/src/tools/nlp_text_analytics.py +761 -0
  54. package/src/tools/plotly_visualizations.py +497 -0
  55. package/src/tools/production_mlops.py +852 -0
  56. package/src/tools/time_series.py +507 -0
  57. package/src/tools/tools_registry.py +2133 -0
  58. package/src/tools/visualization_engine.py +559 -0
  59. package/src/utils/__init__.py +42 -0
  60. package/src/utils/error_recovery.py +313 -0
  61. package/src/utils/parallel_executor.py +402 -0
  62. package/src/utils/polars_helpers.py +248 -0
  63. package/src/utils/schema_extraction.py +132 -0
  64. package/src/utils/semantic_layer.py +392 -0
  65. package/src/utils/token_budget.py +411 -0
  66. package/src/utils/validation.py +377 -0
  67. package/src/workflow_state.py +154 -0
@@ -0,0 +1,823 @@
1
+ """
2
+ Advanced Analysis Tools
3
+ Tools for EDA, model diagnostics, anomaly detection, multicollinearity, and statistical tests.
4
+ """
5
+
6
+ import polars as pl
7
+ import numpy as np
8
+ from typing import Dict, Any, List, Optional, Tuple
9
+ from pathlib import Path
10
+ import sys
11
+ import os
12
+ import warnings
13
+ import json
14
+
15
+ warnings.filterwarnings('ignore')
16
+
17
+ # Add parent directory to path for imports
18
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
19
+
20
+ from sklearn.ensemble import IsolationForest
21
+ from sklearn.neighbors import LocalOutlierFactor
22
+ from sklearn.model_selection import learning_curve
23
+ from scipy import stats
24
+ from scipy.stats import chi2_contingency, f_oneway, ttest_ind, pearsonr
25
+ from statsmodels.stats.outliers_influence import variance_inflation_factor
26
+ import plotly.graph_objects as go
27
+ import plotly.express as px
28
+ from plotly.subplots import make_subplots
29
+ import pandas as pd
30
+
31
+ from ds_agent.utils.polars_helpers import (
32
+ load_dataframe, get_numeric_columns, get_categorical_columns
33
+ )
34
+ from ds_agent.utils.validation import (
35
+ validate_file_exists, validate_file_format, validate_dataframe,
36
+ validate_column_exists
37
+ )
38
+
39
+
40
def perform_eda_analysis(
    file_path: str,
    target_col: Optional[str] = None,
    output_html: Optional[str] = None
) -> Dict[str, Any]:
    """
    Perform comprehensive automated Exploratory Data Analysis with interactive visualizations.

    Args:
        file_path: Path to dataset
        target_col: Target column for supervised analysis
        output_html: Path to save HTML report (Plotly subplots)

    Returns:
        Dictionary with EDA insights: dataset shape, column type counts,
        missing-value stats, per-column numeric/categorical statistics,
        highly correlated pairs, and target insights.
    """
    # Validation
    validate_file_exists(file_path)
    validate_file_format(file_path)

    # Load data
    df = load_dataframe(file_path)
    validate_dataframe(df)

    if target_col:
        validate_column_exists(df, target_col)

    print("📊 Performing comprehensive EDA...")

    # Basic statistics
    n_rows, n_cols = df.shape
    numeric_cols = get_numeric_columns(df)
    categorical_cols = get_categorical_columns(df)

    # Missing values analysis (only columns that actually have nulls)
    missing_stats = {}
    for col in df.columns:
        null_count = df[col].null_count()
        if null_count > 0:
            missing_stats[col] = {
                'count': null_count,
                'percentage': float(null_count / n_rows * 100)
            }

    # Univariate analysis for numeric columns (capped at 20 to bound output size)
    numeric_stats = {}
    for col in numeric_cols[:20]:
        col_data = df[col].drop_nulls().to_numpy()
        if col_data.size == 0:
            # Robustness fix: an all-null column would make np.mean/np.min raise.
            continue
        numeric_stats[col] = {
            'mean': float(np.mean(col_data)),
            'median': float(np.median(col_data)),
            'std': float(np.std(col_data)),
            'min': float(np.min(col_data)),
            'max': float(np.max(col_data)),
            'q25': float(np.percentile(col_data, 25)),
            'q75': float(np.percentile(col_data, 75)),
            'skewness': float(stats.skew(col_data)),
            'kurtosis': float(stats.kurtosis(col_data))
        }

    # Categorical analysis (capped at 10 columns; top-10 values each)
    categorical_stats = {}
    for col in categorical_cols[:10]:
        value_counts = df[col].value_counts().head(10)
        categorical_stats[col] = {
            'unique_values': df[col].n_unique(),
            'mode': df[col].mode()[0] if len(df[col].mode()) > 0 else None,
            'top_10_values': {str(row[col]): row['count'] for row in value_counts.to_dicts()}
        }

    # Correlation analysis (numeric only)
    correlations = {}
    if len(numeric_cols) > 1:
        corr_matrix = df[numeric_cols[:20]].to_pandas().corr()

        # Find highly correlated pairs (|r| > 0.7)
        high_corr_pairs = []
        for i in range(len(corr_matrix.columns)):
            for j in range(i + 1, len(corr_matrix.columns)):
                corr_val = corr_matrix.iloc[i, j]
                if abs(corr_val) > 0.7:
                    high_corr_pairs.append({
                        'feature_1': corr_matrix.columns[i],
                        'feature_2': corr_matrix.columns[j],
                        'correlation': float(corr_val)
                    })

        correlations['high_correlations'] = high_corr_pairs
        correlations['correlation_matrix_shape'] = corr_matrix.shape

    # Target analysis
    target_insights = {}
    if target_col:
        if target_col in numeric_cols:
            # Numeric target - regression
            target_data = df[target_col].drop_nulls().to_numpy()
            target_insights = {
                'type': 'regression',
                'mean': float(np.mean(target_data)),
                'std': float(np.std(target_data)),
                'min': float(np.min(target_data)),
                'max': float(np.max(target_data))
            }

            # Feature-target correlations.
            # BUGFIX: the original dropped nulls from each column independently,
            # which misaligns rows (and raises when the lengths differ, silently
            # swallowed by a bare except). Drop rows where *either* value is
            # null so pearsonr sees paired observations from the same rows.
            target_corr = {}
            for col in numeric_cols:
                if col == target_col:
                    continue
                try:
                    pair = df.select([col, target_col]).drop_nulls()
                    if pair.height < 2:
                        continue  # pearsonr needs at least two paired points
                    corr, pval = pearsonr(
                        pair[col].to_numpy(),
                        pair[target_col].to_numpy()
                    )
                except Exception:
                    # Best-effort: a column that cannot be correlated is skipped.
                    continue
                if abs(corr) > 0.3:
                    target_corr[col] = {
                        'correlation': float(corr),
                        'p_value': float(pval)
                    }
            target_insights['correlated_features'] = target_corr

        else:
            # Categorical target - classification
            value_counts = df[target_col].value_counts()
            target_insights = {
                'type': 'classification',
                'classes': len(value_counts),
                'distribution': {str(row[target_col]): row['count'] for row in value_counts.to_dicts()},
                'imbalance_ratio': float(value_counts['count'].max() / value_counts['count'].min())
            }

    # Create visualizations if output_html requested
    if output_html:
        print("📈 Generating interactive visualizations...")

        fig = make_subplots(
            rows=3, cols=2,
            subplot_titles=('Distribution of Numeric Features', 'Missing Values',
                            'Correlation Heatmap', 'Target Distribution',
                            'Outliers Detection', 'Feature Importance')
        )

        # Distribution plot (first numeric column)
        if numeric_cols:
            col = numeric_cols[0]
            fig.add_trace(
                go.Histogram(x=df[col].to_list(), name=col),
                row=1, col=1
            )

        # Missing values plot (first 10 affected columns)
        if missing_stats:
            missing_cols = list(missing_stats.keys())[:10]
            missing_pcts = [missing_stats[col]['percentage'] for col in missing_cols]
            fig.add_trace(
                go.Bar(x=missing_cols, y=missing_pcts, name='Missing %'),
                row=1, col=2
            )

        # Correlation heatmap (corr_matrix exists only when >1 numeric column)
        if len(numeric_cols) > 1:
            fig.add_trace(
                go.Heatmap(
                    z=corr_matrix.values,
                    x=corr_matrix.columns.tolist(),
                    y=corr_matrix.columns.tolist(),
                    colorscale='RdBu'
                ),
                row=2, col=1
            )

        # Target distribution (categorical targets only)
        if target_col and target_col in categorical_cols:
            target_counts = df[target_col].value_counts()
            fig.add_trace(
                go.Bar(
                    x=[str(row[target_col]) for row in target_counts.to_dicts()],
                    y=[row['count'] for row in target_counts.to_dicts()],
                    name='Target'
                ),
                row=2, col=2
            )

        fig.update_layout(height=1200, showlegend=False, title_text="Automated EDA Report")

        # Save HTML, creating the parent directory if needed
        os.makedirs(os.path.dirname(output_html) or '.', exist_ok=True)
        fig.write_html(output_html)
        print(f"💾 EDA report saved to: {output_html}")

    return {
        'status': 'success',
        'dataset_shape': {'rows': n_rows, 'columns': n_cols},
        'column_types': {
            'numeric': len(numeric_cols),
            'categorical': len(categorical_cols)
        },
        'missing_values': missing_stats,
        'numeric_statistics': numeric_stats,
        'categorical_statistics': categorical_stats,
        'correlations': correlations,
        'target_insights': target_insights,
        'output_html': output_html
    }
246
+
247
+
248
def detect_model_issues(
    model_path: str,
    train_data_path: str,
    test_data_path: str,
    target_col: str
) -> Dict[str, Any]:
    """
    Detect overfitting, underfitting, and other model issues using learning curves and diagnostics.

    Args:
        model_path: Path to trained model (.pkl)
        train_data_path: Path to training dataset
        test_data_path: Path to test dataset
        target_col: Target column name

    Returns:
        Dictionary with model diagnostics: train/test scores, the train-test
        score gap, a bias/variance label, a list of diagnosed issues (each
        with severity and recommendations), and learning-curve data (or an
        'error' entry if the curve could not be computed).
    """
    # Local imports: these heavyweight dependencies are only needed by this tool.
    import joblib
    from sklearn.metrics import accuracy_score, mean_squared_error, r2_score

    # Validation
    validate_file_exists(model_path)
    validate_file_exists(train_data_path)
    validate_file_exists(test_data_path)

    # Load model.
    # NOTE(review): joblib.load unpickles arbitrary objects — model files
    # must come from a trusted source.
    model = joblib.load(model_path)

    # Load data
    train_df = load_dataframe(train_data_path)
    test_df = load_dataframe(test_data_path)

    validate_column_exists(train_df, target_col)
    validate_column_exists(test_df, target_col)

    # Prepare data: split each frame into features (X) and target (y)
    from ds_agent.utils.polars_helpers import split_features_target
    X_train, y_train = split_features_target(train_df, target_col)
    X_test, y_test = split_features_target(test_df, target_col)

    print("🔍 Analyzing model performance...")

    # Score on both splits so train/test performance can be compared
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Detect task type.
    # Heuristic: fewer than 20 distinct target values => classification.
    unique_values = len(np.unique(y_train))
    task_type = "classification" if unique_values < 20 else "regression"

    # Calculate metrics (accuracy for classification, R² for regression)
    if task_type == "classification":
        train_score = accuracy_score(y_train, y_train_pred)
        test_score = accuracy_score(y_test, y_test_pred)
        metric_name = "accuracy"
    else:
        train_score = r2_score(y_train, y_train_pred)
        test_score = r2_score(y_test, y_test_pred)
        metric_name = "r2"

    # Diagnose issues: a large positive gap means the model fits training
    # data much better than unseen data.
    score_gap = train_score - test_score

    diagnosis = []
    # Gap > 0.15 flags overfitting; > 0.25 is considered severe.
    if score_gap > 0.15:
        diagnosis.append({
            'issue': 'overfitting',
            'severity': 'high' if score_gap > 0.25 else 'medium',
            'description': f'Training {metric_name} ({train_score:.3f}) is much higher than test {metric_name} ({test_score:.3f})',
            'recommendations': [
                'Add regularization (L1/L2)',
                'Reduce model complexity',
                'Increase training data',
                'Use cross-validation',
                'Add dropout (for neural networks)'
            ]
        })

    # Low absolute test performance flags underfitting; the threshold
    # depends on the task type (accuracy < 0.6, R² < 0.3).
    if test_score < 0.6 and task_type == "classification":
        diagnosis.append({
            'issue': 'underfitting',
            'severity': 'high',
            'description': f'Test accuracy ({test_score:.3f}) is too low',
            'recommendations': [
                'Increase model complexity',
                'Engineer better features',
                'Try ensemble methods',
                'Tune hyperparameters',
                'Check for data quality issues'
            ]
        })

    if test_score < 0.3 and task_type == "regression":
        diagnosis.append({
            'issue': 'underfitting',
            'severity': 'high',
            'description': f'Test R² ({test_score:.3f}) is too low',
            'recommendations': [
                'Increase model complexity',
                'Engineer better features',
                'Try non-linear models',
                'Check for data scaling issues'
            ]
        })

    # Bias-variance analysis: |gap| < 0.05 => balanced; gap > 0.15 =>
    # high variance (overfitting); everything else => high bias.
    # NOTE(review): a positive gap in (0.05, 0.15] is labeled 'high_bias'
    # even though train > test — confirm this banding is intended.
    if abs(score_gap) < 0.05:
        bias_variance = 'balanced'
    elif score_gap > 0.15:
        bias_variance = 'high_variance'  # Overfitting
    else:
        bias_variance = 'high_bias'  # Underfitting

    # Generate learning curve data: 5-fold CV over 10 training-set sizes.
    # This refits the model repeatedly and may be slow; failures are
    # captured into the result instead of raised.
    print("📊 Generating learning curve...")
    try:
        train_sizes = np.linspace(0.1, 1.0, 10)
        train_sizes_abs, train_scores, val_scores = learning_curve(
            model, X_train, y_train,
            train_sizes=train_sizes,
            cv=5,
            scoring='accuracy' if task_type == "classification" else 'r2',
            n_jobs=-1
        )

        learning_curve_data = {
            'train_sizes': train_sizes_abs.tolist(),
            'train_scores_mean': train_scores.mean(axis=1).tolist(),
            'val_scores_mean': val_scores.mean(axis=1).tolist()
        }
    except Exception as e:
        learning_curve_data = {'error': str(e)}

    return {
        'status': 'success',
        'task_type': task_type,
        'train_score': float(train_score),
        'test_score': float(test_score),
        'score_gap': float(score_gap),
        'bias_variance_assessment': bias_variance,
        'diagnosis': diagnosis,
        'learning_curve': learning_curve_data,
        'summary': f"Model shows {bias_variance} with {len(diagnosis)} issues detected"
    }
393
+
394
+
395
def detect_anomalies(
    file_path: str,
    method: str = "isolation_forest",
    contamination: float = 0.1,
    columns: Optional[List[str]] = None,
    output_path: Optional[str] = None
) -> Dict[str, Any]:
    """
    Detect anomalies/outliers using various methods.

    Args:
        file_path: Path to dataset
        method: Anomaly detection method:
            - 'isolation_forest': Isolation Forest (good for high-dim data)
            - 'lof': Local Outlier Factor
            - 'zscore': Z-score method (univariate)
            - 'iqr': Interquartile Range method (univariate)
        contamination: Expected proportion of outliers (0.01 to 0.5)
        columns: Columns to analyze (None = all numeric)
        output_path: Path to save dataset with anomaly labels

    Returns:
        Dictionary with anomaly counts, flagged row indices (first 100),
        per-column normal-vs-anomaly statistics, and the output path.

    Raises:
        ValueError: if `method` is not one of the supported names.
    """
    # Input validation
    validate_file_exists(file_path)
    validate_file_format(file_path)

    df = load_dataframe(file_path)
    validate_dataframe(df)

    # Resolve the column set: default to every numeric column.
    if columns is None:
        columns = get_numeric_columns(df)
        print(f"🔢 Auto-detected {len(columns)} numeric columns")
    else:
        for name in columns:
            validate_column_exists(df, name)

    if not columns:
        return {
            'status': 'skipped',
            'message': 'No numeric columns found for anomaly detection'
        }

    # Nulls are replaced with 0 before fitting.
    matrix = df[columns].fill_null(0).to_numpy()

    print(f"🔍 Detecting anomalies using {method}...")

    if method == "isolation_forest":
        forest = IsolationForest(contamination=contamination, random_state=42, n_jobs=-1)
        labels = forest.fit_predict(matrix)
        scores = forest.score_samples(matrix)
        outlier_mask = labels == -1
    elif method == "lof":
        lof = LocalOutlierFactor(contamination=contamination, n_jobs=-1)
        labels = lof.fit_predict(matrix)
        scores = lof.negative_outlier_factor_
        outlier_mask = labels == -1
    elif method == "zscore":
        # A row is anomalous if |z| > 3 in any analyzed column.
        abs_z = np.abs(stats.zscore(matrix, axis=0))
        outlier_mask = (abs_z > 3).any(axis=1)
        scores = abs_z.max(axis=1)
    elif method == "iqr":
        # Tukey fences per column: [Q1 - 1.5*IQR, Q3 + 1.5*IQR].
        q1 = np.percentile(matrix, 25, axis=0)
        q3 = np.percentile(matrix, 75, axis=0)
        spread = q3 - q1
        lower = q1 - 1.5 * spread
        upper = q3 + 1.5 * spread
        outlier_mask = ((matrix < lower) | (matrix > upper)).any(axis=1)
        # Score: worst per-row distance beyond a fence, in IQR units.
        scores = np.maximum(
            (lower - matrix) / spread,
            (matrix - upper) / spread
        ).max(axis=1)
    else:
        raise ValueError(f"Unsupported method: {method}")

    flagged = int(outlier_mask.sum())
    flagged_pct = float(flagged / len(df) * 100)

    print(f"🚨 Found {flagged} anomalies ({flagged_pct:.2f}%)")

    # Attach label and score columns to a copy of the dataframe.
    labeled_df = df.with_columns([
        pl.Series('is_anomaly', outlier_mask.astype(int)),
        pl.Series('anomaly_score', scores)
    ])

    flagged_rows = np.where(outlier_mask)[0].tolist()

    # Compare each column's distribution between normal and anomalous rows.
    per_column = {}
    for name in columns:
        values = df[name].to_numpy()
        outlier_values = values[outlier_mask]
        if len(outlier_values) > 0:
            per_column[name] = {
                'mean_normal': float(np.mean(values[~outlier_mask])),
                'mean_anomaly': float(np.mean(outlier_values)),
                'std_normal': float(np.std(values[~outlier_mask])),
                'std_anomaly': float(np.std(outlier_values))
            }

    # Persist the labeled dataset when requested.
    if output_path:
        from ds_agent.utils.polars_helpers import save_dataframe
        save_dataframe(labeled_df, output_path)
        print(f"💾 Dataset with anomaly labels saved to: {output_path}")

    return {
        'status': 'success',
        'method': method,
        'n_anomalies': flagged,
        'anomaly_percentage': flagged_pct,
        'anomaly_indices': flagged_rows[:100],  # First 100
        'column_statistics': per_column,
        'contamination': contamination,
        'output_path': output_path
    }
528
+
529
+
530
def detect_and_handle_multicollinearity(
    file_path: str,
    threshold: float = 10.0,
    action: str = "report",
    output_path: Optional[str] = None
) -> Dict[str, Any]:
    """
    Detect and optionally handle multicollinearity using VIF (Variance Inflation Factor).

    Args:
        file_path: Path to dataset
        threshold: VIF threshold (10 = high multicollinearity, 5 = moderate)
        action: Action to take:
            - 'report': Only report VIF values
            - 'remove': Remove features with VIF > threshold
            - 'recommend': Provide regularization recommendations
        output_path: Path to save dataset with reduced features

    Returns:
        Dictionary with per-feature VIF values (sorted descending), the
        features exceeding the threshold, and handling recommendations.
        When action='remove' and offenders exist, also the removal counts.
    """
    # Validation
    validate_file_exists(file_path)
    validate_file_format(file_path)

    # Load data
    df = load_dataframe(file_path)
    validate_dataframe(df)

    # Get numeric columns — VIF is only defined over numeric features.
    numeric_cols = get_numeric_columns(df)

    if len(numeric_cols) < 2:
        return {
            'status': 'skipped',
            'message': 'Need at least 2 numeric columns for multicollinearity analysis'
        }

    print(f"🔍 Calculating VIF for {len(numeric_cols)} features...")

    # Prepare data.
    # NOTE(review): nulls are imputed with 0, which can distort VIF for
    # columns with many missing values — confirm acceptable.
    X = df[numeric_cols].fill_null(0).to_numpy()

    # Calculate VIF for each feature.
    # NOTE(review): statsmodels' variance_inflation_factor assumes the design
    # matrix includes a constant (intercept) column; none is added here, so
    # the reported VIFs may be biased — confirm intended.
    vif_data = {}
    problematic_features = []

    for i, col in enumerate(numeric_cols):
        try:
            vif = variance_inflation_factor(X, i)
            vif_data[col] = float(vif)

            if vif > threshold:
                problematic_features.append({
                    'feature': col,
                    'vif': float(vif),
                    'severity': 'high' if vif > 20 else 'moderate'
                })
        except Exception as e:
            # A failed computation is recorded as None rather than aborting.
            vif_data[col] = None
            print(f"⚠️ Could not calculate VIF for {col}: {str(e)}")

    # Sort by VIF, descending; failed (None) entries sort as 0.
    sorted_vif = dict(sorted(vif_data.items(), key=lambda x: x[1] if x[1] is not None else 0, reverse=True))

    # Generate recommendations
    recommendations = []

    if len(problematic_features) > 0:
        recommendations.append({
            'type': 'regularization',
            'description': 'Use Ridge (L2) or Elastic Net regularization to handle multicollinearity',
            'reason': f'{len(problematic_features)} features have VIF > {threshold}'
        })

        recommendations.append({
            'type': 'pca',
            'description': 'Apply PCA to reduce dimensionality and eliminate correlations',
            'reason': 'PCA creates orthogonal features'
        })

        if action == "remove":
            # Batch removal: every feature over the threshold is listed at
            # once; VIF is NOT recomputed after each drop (not iterative).
            features_to_remove = [f['feature'] for f in problematic_features]
            recommendations.append({
                'type': 'feature_removal',
                'description': f'Remove {len(features_to_remove)} features with high VIF',
                'features': features_to_remove
            })

    # Handle action
    if action == "remove" and len(problematic_features) > 0:
        # Drop all features with VIF > threshold; keep remaining numeric
        # columns plus every categorical column.
        features_to_keep = [col for col in numeric_cols if col not in [f['feature'] for f in problematic_features]]
        categorical_cols = get_categorical_columns(df)

        df_reduced = df.select(features_to_keep + categorical_cols)

        if output_path:
            from ds_agent.utils.polars_helpers import save_dataframe
            save_dataframe(df_reduced, output_path)
            print(f"💾 Dataset with reduced features saved to: {output_path}")

        return {
            'status': 'success',
            'action': 'removed',
            'vif_values': sorted_vif,
            'problematic_features': problematic_features,
            'features_removed': len(problematic_features),
            'features_remaining': len(features_to_keep),
            'recommendations': recommendations,
            'output_path': output_path
        }

    # Report/recommend path: no mutation of the dataset.
    return {
        'status': 'success',
        'action': action,
        'vif_values': sorted_vif,
        'problematic_features': problematic_features,
        'threshold': threshold,
        'recommendations': recommendations
    }
652
+
653
+
654
def perform_statistical_tests(
    file_path: str,
    target_col: str,
    test_type: str = "auto",
    features: Optional[List[str]] = None,
    alpha: float = 0.05
) -> Dict[str, Any]:
    """
    Perform statistical hypothesis tests to validate feature relationships.

    Args:
        file_path: Path to dataset
        target_col: Target column name
        test_type: Type of test:
            - 'auto': Automatically select based on data types
            - 'chi2': Chi-square test (categorical vs categorical)
            - 'ttest': T-test (binary categorical vs numeric)
            - 'anova': ANOVA (multi-class categorical vs numeric)
            - 'pearson': Pearson correlation test (numeric vs numeric)
        features: Features to test (None = all)
        alpha: Significance level (default 0.05)

    Returns:
        Dictionary with per-feature test results (statistic, p-value,
        significance flag) and a summary list of significant features.
    """
    # Validation
    validate_file_exists(file_path)
    validate_file_format(file_path)

    # Load data
    df = load_dataframe(file_path)
    validate_dataframe(df)
    validate_column_exists(df, target_col)

    # Get column types to drive automatic test selection
    numeric_cols = get_numeric_columns(df)
    categorical_cols = get_categorical_columns(df)

    # Determine target type
    target_is_numeric = target_col in numeric_cols
    target_is_categorical = target_col in categorical_cols

    # Get features to test (default: everything except the target)
    if features is None:
        features = [col for col in df.columns if col != target_col]

    print(f"📊 Performing statistical tests for {len(features)} features...")

    test_results = []

    for feature in features:
        # A user-supplied list may still include the target — skip it.
        if feature == target_col:
            continue

        feature_is_numeric = feature in numeric_cols
        feature_is_categorical = feature in categorical_cols

        # Select appropriate test based on (target type, feature type)
        if test_type == "auto":
            if target_is_numeric and feature_is_numeric:
                selected_test = "pearson"
            elif target_is_categorical and feature_is_numeric:
                target_unique = df[target_col].n_unique()
                selected_test = "ttest" if target_unique == 2 else "anova"
            elif target_is_categorical and feature_is_categorical:
                selected_test = "chi2"
            elif target_is_numeric and feature_is_categorical:
                selected_test = "anova"
            else:
                continue  # neither numeric nor categorical — nothing to test
        else:
            selected_test = test_type

        # Perform test; failures for one feature must not abort the rest.
        try:
            if selected_test == "pearson":
                # BUGFIX: the original dropped nulls from each column
                # independently and truncated to the shorter length, which
                # pairs values from *different rows* and yields wrong
                # correlations. Drop rows where either value is null so the
                # arrays stay row-aligned.
                pair = df.select([feature, target_col]).drop_nulls()
                if pair.height < 2:
                    continue  # pearsonr needs at least two paired points
                corr, pval = pearsonr(
                    pair[feature].to_numpy(),
                    pair[target_col].to_numpy()
                )

                test_results.append({
                    'feature': feature,
                    'test': 'pearson',
                    'statistic': float(corr),
                    'p_value': float(pval),
                    'significant': pval < alpha,
                    'interpretation': f"Correlation: {corr:.3f}"
                })

            elif selected_test == "chi2":
                # Chi-square test of independence on the contingency table
                contingency_table = pd.crosstab(
                    df[feature].to_pandas(),
                    df[target_col].to_pandas()
                )
                chi2, pval, dof, expected = chi2_contingency(contingency_table)

                test_results.append({
                    'feature': feature,
                    'test': 'chi2',
                    'statistic': float(chi2),
                    'p_value': float(pval),
                    'dof': int(dof),
                    'significant': pval < alpha
                })

            elif selected_test == "ttest":
                # Two-sample t-test: only valid for exactly two target classes
                target_values = df[target_col].unique().to_list()
                if len(target_values) != 2:
                    continue

                group1 = df.filter(pl.col(target_col) == target_values[0])[feature].drop_nulls().to_numpy()
                group2 = df.filter(pl.col(target_col) == target_values[1])[feature].drop_nulls().to_numpy()

                t_stat, pval = ttest_ind(group1, group2)

                test_results.append({
                    'feature': feature,
                    'test': 'ttest',
                    'statistic': float(t_stat),
                    'p_value': float(pval),
                    'significant': pval < alpha,
                    'mean_diff': float(np.mean(group1) - np.mean(group2))
                })

            elif selected_test == "anova":
                # One-way ANOVA across all non-empty target groups
                groups = []
                target_values = df[target_col].unique().to_list()

                for val in target_values:
                    group_data = df.filter(pl.col(target_col) == val)[feature].drop_nulls().to_numpy()
                    if len(group_data) > 0:
                        groups.append(group_data)

                if len(groups) > 1:
                    f_stat, pval = f_oneway(*groups)

                    test_results.append({
                        'feature': feature,
                        'test': 'anova',
                        'statistic': float(f_stat),
                        'p_value': float(pval),
                        'significant': pval < alpha,
                        'n_groups': len(groups)
                    })

        except Exception as e:
            # Best-effort: report and continue with the remaining features.
            print(f"⚠️ Test failed for {feature}: {str(e)}")

    # Summary
    significant_features = [r for r in test_results if r['significant']]

    print(f"✅ {len(significant_features)}/{len(test_results)} features are statistically significant (α={alpha})")

    return {
        'status': 'success',
        'target_column': target_col,
        'alpha': alpha,
        'total_tests': len(test_results),
        'significant_features': len(significant_features),
        'test_results': test_results,
        'significant_features_list': [r['feature'] for r in significant_features]
    }