ds-agent-cli 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67)
  1. package/bin/ds-agent.js +451 -0
  2. package/ds_agent/__init__.py +8 -0
  3. package/package.json +28 -0
  4. package/requirements.txt +126 -0
  5. package/setup.py +35 -0
  6. package/src/__init__.py +7 -0
  7. package/src/_compress_tool_result.py +118 -0
  8. package/src/api/__init__.py +4 -0
  9. package/src/api/app.py +1626 -0
  10. package/src/cache/__init__.py +5 -0
  11. package/src/cache/cache_manager.py +561 -0
  12. package/src/cli.py +2886 -0
  13. package/src/dynamic_prompts.py +281 -0
  14. package/src/orchestrator.py +4799 -0
  15. package/src/progress_manager.py +139 -0
  16. package/src/reasoning/__init__.py +332 -0
  17. package/src/reasoning/business_summary.py +431 -0
  18. package/src/reasoning/data_understanding.py +356 -0
  19. package/src/reasoning/model_explanation.py +383 -0
  20. package/src/reasoning/reasoning_trace.py +239 -0
  21. package/src/registry/__init__.py +3 -0
  22. package/src/registry/tools_registry.py +3 -0
  23. package/src/session_memory.py +448 -0
  24. package/src/session_store.py +370 -0
  25. package/src/storage/__init__.py +19 -0
  26. package/src/storage/artifact_store.py +620 -0
  27. package/src/storage/helpers.py +116 -0
  28. package/src/storage/huggingface_storage.py +694 -0
  29. package/src/storage/r2_storage.py +0 -0
  30. package/src/storage/user_files_service.py +288 -0
  31. package/src/tools/__init__.py +335 -0
  32. package/src/tools/advanced_analysis.py +823 -0
  33. package/src/tools/advanced_feature_engineering.py +708 -0
  34. package/src/tools/advanced_insights.py +578 -0
  35. package/src/tools/advanced_preprocessing.py +549 -0
  36. package/src/tools/advanced_training.py +906 -0
  37. package/src/tools/agent_tool_mapping.py +326 -0
  38. package/src/tools/auto_pipeline.py +420 -0
  39. package/src/tools/autogluon_training.py +1480 -0
  40. package/src/tools/business_intelligence.py +860 -0
  41. package/src/tools/cloud_data_sources.py +581 -0
  42. package/src/tools/code_interpreter.py +390 -0
  43. package/src/tools/computer_vision.py +614 -0
  44. package/src/tools/data_cleaning.py +614 -0
  45. package/src/tools/data_profiling.py +593 -0
  46. package/src/tools/data_type_conversion.py +268 -0
  47. package/src/tools/data_wrangling.py +433 -0
  48. package/src/tools/eda_reports.py +284 -0
  49. package/src/tools/enhanced_feature_engineering.py +241 -0
  50. package/src/tools/feature_engineering.py +302 -0
  51. package/src/tools/matplotlib_visualizations.py +1327 -0
  52. package/src/tools/model_training.py +520 -0
  53. package/src/tools/nlp_text_analytics.py +761 -0
  54. package/src/tools/plotly_visualizations.py +497 -0
  55. package/src/tools/production_mlops.py +852 -0
  56. package/src/tools/time_series.py +507 -0
  57. package/src/tools/tools_registry.py +2133 -0
  58. package/src/tools/visualization_engine.py +559 -0
  59. package/src/utils/__init__.py +42 -0
  60. package/src/utils/error_recovery.py +313 -0
  61. package/src/utils/parallel_executor.py +402 -0
  62. package/src/utils/polars_helpers.py +248 -0
  63. package/src/utils/schema_extraction.py +132 -0
  64. package/src/utils/semantic_layer.py +392 -0
  65. package/src/utils/token_budget.py +411 -0
  66. package/src/utils/validation.py +377 -0
  67. package/src/workflow_state.py +154 -0
@@ -0,0 +1,420 @@
1
+ """
2
+ Automated ML Pipeline
3
+ Zero-configuration automatic data processing: Clean → Encode → Engineer → Select
4
+ """
5
+
6
+ import polars as pl
7
+ import numpy as np
8
+ import pandas as pd
9
+ from typing import Dict, Any, List, Optional, Tuple
10
+ from pathlib import Path
11
+ import sys
12
+ import os
13
+ from sklearn.feature_selection import SelectKBest, f_classif, f_regression, mutual_info_classif
14
+ from sklearn.preprocessing import StandardScaler
15
+
16
+ # Add parent directory to path
17
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
18
+
19
+ from ds_agent.utils.polars_helpers import load_dataframe, get_numeric_columns
20
+ from ds_agent.utils.validation import validate_file_exists
21
+ from .data_cleaning import clean_missing_values, handle_outliers
22
+ from .data_type_conversion import force_numeric_conversion, smart_type_inference
23
+ from .feature_engineering import encode_categorical, create_time_features
24
+ from .advanced_feature_engineering import create_interaction_features
25
+
26
+
27
def auto_ml_pipeline(file_path: str,
                     target_col: str,
                     task_type: str = "auto",
                     output_path: Optional[str] = None,
                     feature_engineering_level: str = "basic") -> Dict[str, Any]:
    """
    Fully automated ML pipeline with zero manual intervention.

    Pipeline stages:
    1. Auto-detect column types
    2. Clean missing values intelligently
    3. Handle outliers
    4. Encode categorical variables
    5. Engineer time features (if datetime detected)
    6. Create interaction features (if requested)
    7. Select best features

    Args:
        file_path: Path to input dataset
        target_col: Target column name
        task_type: 'classification', 'regression', or 'auto'
        output_path: Where to save processed data (a '.csv' suffix is
            appended if missing; defaults to
            './outputs/data/auto_pipeline_output.csv')
        feature_engineering_level: 'basic', 'intermediate', 'advanced'

    Returns:
        Dictionary with pipeline results and explanations: stages completed,
        transformations applied, warnings, original/final shapes, the final
        feature list, and a human-readable summary.
    """
    validate_file_exists(file_path)

    if output_path is None:
        output_path = "./outputs/data/auto_pipeline_output.csv"

    # Ensure output_path has .csv extension
    if not output_path.endswith('.csv'):
        output_path = output_path.rstrip('/\\') + '.csv'

    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    # BUGFIX: intermediate stage files below are always written under
    # ./outputs/data/ regardless of output_path, so that directory must
    # exist as well (previously only output_path's parent was created).
    Path("./outputs/data").mkdir(parents=True, exist_ok=True)

    results = {
        "stages_completed": [],
        "transformations_applied": [],
        "warnings": [],
        "final_features": [],
        "output_path": output_path
    }

    # Load data
    df = load_dataframe(file_path)
    original_shape = df.shape
    results["original_shape"] = {"rows": original_shape[0], "columns": original_shape[1]}

    print("šŸš€ Starting Auto ML Pipeline")
    print(f"šŸ“Š Original shape: {original_shape[0]:,} rows Ɨ {original_shape[1]} columns")

    # STAGE 1: Auto-detect column types
    print("\nšŸ” Stage 1: Auto-detecting column types...")
    type_detection = smart_type_inference(file_path, output_path="./outputs/data/stage1_types.csv")
    results["stages_completed"].append("type_detection")
    results["transformations_applied"].append({
        "stage": "Type Detection",
        "description": f"Detected {len(type_detection.get('conversions_made', []))} type conversions"
    })
    current_file = "./outputs/data/stage1_types.csv"

    # STAGE 2: Clean missing values
    print("\n🧹 Stage 2: Cleaning missing values...")
    cleaning_result = clean_missing_values(
        current_file,
        strategy="auto",
        output_path="./outputs/data/stage2_cleaned.csv"
    )
    results["stages_completed"].append("missing_value_cleaning")
    results["transformations_applied"].append({
        "stage": "Missing Value Cleaning",
        "description": f"Cleaned {cleaning_result.get('total_nulls_before', 0)} missing values using auto-detected strategies"
    })
    current_file = "./outputs/data/stage2_cleaned.csv"

    # STAGE 3: Handle outliers
    print("\nšŸ“Š Stage 3: Handling outliers...")
    outlier_result = handle_outliers(
        current_file,
        columns=["all"],
        method="clip",
        output_path="./outputs/data/stage3_no_outliers.csv"
    )
    results["stages_completed"].append("outlier_handling")
    results["transformations_applied"].append({
        "stage": "Outlier Handling",
        "description": f"Clipped outliers in {outlier_result.get('columns_processed', 0)} columns"
    })
    current_file = "./outputs/data/stage3_no_outliers.csv"

    # STAGE 4: Force numeric conversion (for any remaining string numbers).
    # The result dict is not inspected; the converted file on disk is what
    # feeds the next stage.
    print("\nšŸ”¢ Stage 4: Converting to numeric...")
    force_numeric_conversion(
        current_file,
        columns=["all"],
        errors="coerce",
        output_path="./outputs/data/stage4_numeric.csv"
    )
    results["stages_completed"].append("numeric_conversion")
    current_file = "./outputs/data/stage4_numeric.csv"

    # STAGE 5: Encode categorical variables
    print("\nšŸ·ļø Stage 5: Encoding categorical variables...")
    encoding_result = encode_categorical(
        current_file,
        method="auto",
        output_path="./outputs/data/stage5_encoded.csv"
    )
    results["stages_completed"].append("categorical_encoding")
    results["transformations_applied"].append({
        "stage": "Categorical Encoding",
        "description": f"Encoded {len(encoding_result.get('encoded_columns', []))} categorical columns"
    })
    current_file = "./outputs/data/stage5_encoded.csv"

    # STAGE 6: Feature engineering (only for 'intermediate' / 'advanced')
    if feature_engineering_level in ["intermediate", "advanced"]:
        print("\nāš™ļø Stage 6: Engineering features...")

        # Check for datetime columns and create time features
        df_current = load_dataframe(current_file).to_pandas()
        datetime_cols = df_current.select_dtypes(include=['datetime64']).columns.tolist()

        if datetime_cols:
            print(f"  Creating time features from {len(datetime_cols)} datetime columns...")
            for dt_col in datetime_cols:
                try:
                    create_time_features(
                        current_file,
                        date_column=dt_col,
                        output_path=current_file  # Overwrite in place
                    )
                    results["transformations_applied"].append({
                        "stage": "Time Feature Engineering",
                        "description": f"Created time features from {dt_col}"
                    })
                except Exception as e:
                    # Best-effort: a single bad datetime column should not
                    # abort the whole pipeline.
                    results["warnings"].append(f"Could not create time features from {dt_col}: {str(e)}")

        # Create interaction features for advanced mode
        if feature_engineering_level == "advanced":
            print("  Creating interaction features...")
            try:
                interaction_result = create_interaction_features(
                    current_file,
                    method="polynomial",
                    degree=2,
                    max_features=10,
                    output_path="./outputs/data/stage6_engineered.csv"
                )
                results["stages_completed"].append("interaction_features")
                results["transformations_applied"].append({
                    "stage": "Interaction Features",
                    "description": f"Created {len(interaction_result.get('new_features', []))} interaction features"
                })
                current_file = "./outputs/data/stage6_engineered.csv"
            except Exception as e:
                results["warnings"].append(f"Could not create interaction features: {str(e)}")

    # STAGE 7: Feature selection
    print("\nšŸŽÆ Stage 7: Selecting best features...")
    try:
        selection_result = auto_feature_selection(
            current_file,
            target_col=target_col,
            task_type=task_type,
            max_features=50,
            output_path=output_path
        )
        results["stages_completed"].append("feature_selection")
        results["transformations_applied"].append({
            "stage": "Feature Selection",
            "description": f"Selected {selection_result['n_features_selected']} best features from {selection_result['n_features_original']}"
        })
        results["selected_features"] = selection_result["selected_features"]
        results["feature_importance"] = selection_result.get("feature_scores", {})
    except Exception as e:
        results["warnings"].append(f"Feature selection failed: {str(e)}")
        # Fall back to delivering the last intermediate file unchanged.
        import shutil
        shutil.copy(current_file, output_path)

    # Final shape
    df_final = load_dataframe(output_path)
    final_shape = df_final.shape
    results["final_shape"] = {"rows": final_shape[0], "columns": final_shape[1]}
    results["final_features"] = df_final.columns

    print("\nāœ… Pipeline completed!")
    print(f"šŸ“Š Final shape: {final_shape[0]:,} rows Ɨ {final_shape[1]} columns")
    print(f"šŸ’¾ Saved to: {output_path}")

    # Generate human-readable summary
    results["summary"] = _generate_pipeline_summary(results)

    return results
226
+
227
+
228
def auto_feature_selection(file_path: str,
                           target_col: str,
                           task_type: str = "auto",
                           max_features: int = 50,
                           method: str = "auto",
                           output_path: Optional[str] = None) -> Dict[str, Any]:
    """
    Automatically select the best features for modeling.

    Args:
        file_path: Path to dataset
        target_col: Target column
        task_type: 'classification', 'regression', or 'auto' (inferred from
            the target's dtype and cardinality)
        max_features: Maximum number of features to keep
        method: 'mutual_info', 'f_test', 'boruta', or 'auto'
        output_path: Where to save selected features (plus target) as CSV

    Returns:
        Dictionary with selection results, or a ``{"status": "error", ...}``
        dictionary when the target column is missing or no numeric features
        exist.
    """
    validate_file_exists(file_path)
    df = load_dataframe(file_path).to_pandas()

    if target_col not in df.columns:
        return {"status": "error", "message": f"Target column '{target_col}' not found"}

    # Separate features and target
    X = df.drop(columns=[target_col])
    y = df[target_col]

    # Only numeric features can be scored by the selectors below.
    numeric_features = X.select_dtypes(include=[np.number]).columns.tolist()
    X_numeric = X[numeric_features]

    if len(numeric_features) == 0:
        return {"status": "error", "message": "No numeric features found"}

    # Auto-detect task type: object dtype or low cardinality -> classification.
    if task_type == "auto":
        if y.dtype == 'object' or y.nunique() < 20:
            task_type = "classification"
        else:
            task_type = "regression"

    # Select scoring method
    if method == "auto":
        method = "mutual_info" if task_type == "classification" else "f_test"

    n_features_to_select = min(max_features, len(numeric_features))

    if method == "boruta":
        # BorutaPy - all-relevant feature selection (optional dependency)
        try:
            from boruta import BorutaPy
            from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

            print("šŸ” Running BorutaPy all-relevant feature selection...")

            if task_type == "classification":
                rf = RandomForestClassifier(n_jobs=-1, max_depth=5, random_state=42)
            else:
                rf = RandomForestRegressor(n_jobs=-1, max_depth=5, random_state=42)

            boruta_selector = BorutaPy(
                rf,
                n_estimators='auto',
                max_iter=100,
                random_state=42,
                verbose=0
            )

            # Boruta cannot handle NaNs; fill with 0 (consistent with the
            # SelectKBest path below).
            X_filled = X_numeric.fillna(0).values
            boruta_selector.fit(X_filled, y.values if hasattr(y, 'values') else y)

            # Confirmed features
            selected_mask = boruta_selector.support_
            selected_features = np.array(numeric_features)[selected_mask].tolist()

            # Rankings: 1 is best, larger is worse.
            feature_scores = dict(zip(numeric_features, boruta_selector.ranking_.tolist()))
            sorted_features = sorted(feature_scores.items(), key=lambda x: x[1])

            results = {
                "n_features_original": len(numeric_features),
                "n_features_selected": len(selected_features),
                "selected_features": selected_features,
                "feature_rankings": dict(sorted_features),
                "tentative_features": np.array(numeric_features)[boruta_selector.support_weak_].tolist(),
                "selection_method": "boruta",
                "task_type": task_type
            }

            # Save selected features + target
            if output_path:
                df_selected = df[selected_features + [target_col]]
                df_selected.to_csv(output_path, index=False)
                results["output_path"] = output_path

            return results

        except ImportError:
            print("āš ļø boruta not installed. Falling back to mutual_info. Install with: pip install boruta>=0.3")
            method = "mutual_info" if task_type == "classification" else "f_test"

    if method == "mutual_info":
        if task_type == "classification":
            selector = SelectKBest(mutual_info_classif, k=n_features_to_select)
        else:
            from sklearn.feature_selection import mutual_info_regression
            selector = SelectKBest(mutual_info_regression, k=n_features_to_select)
    else:  # f_test
        if task_type == "classification":
            selector = SelectKBest(f_classif, k=n_features_to_select)
        else:
            selector = SelectKBest(f_regression, k=n_features_to_select)

    # FIX: the transformed matrix returned by fit_transform() was never used
    # (the selected columns are re-read from the original DataFrame below),
    # so just fit() and skip the unused copy.
    selector.fit(X_numeric.fillna(0), y)

    # Get selected feature names
    selected_mask = selector.get_support()
    selected_features = np.array(numeric_features)[selected_mask].tolist()

    # Scores for all scored features, highest first.
    feature_scores = dict(zip(numeric_features, selector.scores_))
    sorted_features = sorted(feature_scores.items(), key=lambda x: x[1], reverse=True)

    results = {
        "n_features_original": len(numeric_features),
        "n_features_selected": len(selected_features),
        "selected_features": selected_features,
        "feature_scores": dict(sorted_features[:n_features_to_select]),
        "selection_method": method,
        "task_type": task_type
    }

    # Save selected features + target
    if output_path:
        df_selected = df[selected_features + [target_col]]
        df_selected.to_csv(output_path, index=False)
        results["output_path"] = output_path

    return results
372
+
373
+
374
+ def _generate_pipeline_summary(results: Dict[str, Any]) -> str:
375
+ """Generate human-readable summary of pipeline execution."""
376
+ summary = []
377
+
378
+ summary.append("šŸ”„ **Auto ML Pipeline Summary**\n")
379
+ summary.append(f"Original shape: {results['original_shape']['rows']:,} rows Ɨ {results['original_shape']['columns']} columns")
380
+ summary.append(f"Final shape: {results['final_shape']['rows']:,} rows Ɨ {results['final_shape']['columns']} columns\n")
381
+
382
+ summary.append("**Stages Completed:**")
383
+ for i, stage in enumerate(results['stages_completed'], 1):
384
+ summary.append(f"{i}. {stage.replace('_', ' ').title()}")
385
+
386
+ summary.append("\n**Transformations Applied:**")
387
+ for transform in results['transformations_applied']:
388
+ summary.append(f"• {transform['stage']}: {transform['description']}")
389
+
390
+ if results.get('warnings'):
391
+ summary.append("\nāš ļø **Warnings:**")
392
+ for warning in results['warnings']:
393
+ summary.append(f"• {warning}")
394
+
395
+ if results.get('selected_features'):
396
+ summary.append(f"\nšŸŽÆ **Selected {len(results['selected_features'])} best features**")
397
+
398
+ summary.append(f"\nšŸ’¾ Output saved to: {results['output_path']}")
399
+
400
+ return "\n".join(summary)
401
+
402
+
403
def explain_pipeline_decision(stage: str, decision: str, reason: str) -> Dict[str, str]:
    """
    Explain a pipeline decision in human-readable format.

    Args:
        stage: Pipeline stage name
        decision: What decision was made
        reason: Why this decision was made

    Returns:
        Dictionary echoing the three fields plus a composed 'explanation'
        sentence.
    """
    narrative = f"In the {stage} stage, I decided to {decision} because {reason}"
    return {
        "stage": stage,
        "decision": decision,
        "reason": reason,
        "explanation": narrative,
    }