ds-agent-cli 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. package/bin/ds-agent.js +451 -0
  2. package/ds_agent/__init__.py +8 -0
  3. package/package.json +28 -0
  4. package/requirements.txt +126 -0
  5. package/setup.py +35 -0
  6. package/src/__init__.py +7 -0
  7. package/src/_compress_tool_result.py +118 -0
  8. package/src/api/__init__.py +4 -0
  9. package/src/api/app.py +1626 -0
  10. package/src/cache/__init__.py +5 -0
  11. package/src/cache/cache_manager.py +561 -0
  12. package/src/cli.py +2886 -0
  13. package/src/dynamic_prompts.py +281 -0
  14. package/src/orchestrator.py +4799 -0
  15. package/src/progress_manager.py +139 -0
  16. package/src/reasoning/__init__.py +332 -0
  17. package/src/reasoning/business_summary.py +431 -0
  18. package/src/reasoning/data_understanding.py +356 -0
  19. package/src/reasoning/model_explanation.py +383 -0
  20. package/src/reasoning/reasoning_trace.py +239 -0
  21. package/src/registry/__init__.py +3 -0
  22. package/src/registry/tools_registry.py +3 -0
  23. package/src/session_memory.py +448 -0
  24. package/src/session_store.py +370 -0
  25. package/src/storage/__init__.py +19 -0
  26. package/src/storage/artifact_store.py +620 -0
  27. package/src/storage/helpers.py +116 -0
  28. package/src/storage/huggingface_storage.py +694 -0
  29. package/src/storage/r2_storage.py +0 -0
  30. package/src/storage/user_files_service.py +288 -0
  31. package/src/tools/__init__.py +335 -0
  32. package/src/tools/advanced_analysis.py +823 -0
  33. package/src/tools/advanced_feature_engineering.py +708 -0
  34. package/src/tools/advanced_insights.py +578 -0
  35. package/src/tools/advanced_preprocessing.py +549 -0
  36. package/src/tools/advanced_training.py +906 -0
  37. package/src/tools/agent_tool_mapping.py +326 -0
  38. package/src/tools/auto_pipeline.py +420 -0
  39. package/src/tools/autogluon_training.py +1480 -0
  40. package/src/tools/business_intelligence.py +860 -0
  41. package/src/tools/cloud_data_sources.py +581 -0
  42. package/src/tools/code_interpreter.py +390 -0
  43. package/src/tools/computer_vision.py +614 -0
  44. package/src/tools/data_cleaning.py +614 -0
  45. package/src/tools/data_profiling.py +593 -0
  46. package/src/tools/data_type_conversion.py +268 -0
  47. package/src/tools/data_wrangling.py +433 -0
  48. package/src/tools/eda_reports.py +284 -0
  49. package/src/tools/enhanced_feature_engineering.py +241 -0
  50. package/src/tools/feature_engineering.py +302 -0
  51. package/src/tools/matplotlib_visualizations.py +1327 -0
  52. package/src/tools/model_training.py +520 -0
  53. package/src/tools/nlp_text_analytics.py +761 -0
  54. package/src/tools/plotly_visualizations.py +497 -0
  55. package/src/tools/production_mlops.py +852 -0
  56. package/src/tools/time_series.py +507 -0
  57. package/src/tools/tools_registry.py +2133 -0
  58. package/src/tools/visualization_engine.py +559 -0
  59. package/src/utils/__init__.py +42 -0
  60. package/src/utils/error_recovery.py +313 -0
  61. package/src/utils/parallel_executor.py +402 -0
  62. package/src/utils/polars_helpers.py +248 -0
  63. package/src/utils/schema_extraction.py +132 -0
  64. package/src/utils/semantic_layer.py +392 -0
  65. package/src/utils/token_budget.py +411 -0
  66. package/src/utils/validation.py +377 -0
  67. package/src/workflow_state.py +154 -0
@@ -0,0 +1,284 @@
1
+ """
2
+ EDA Report Generation Tools
3
+ Generates comprehensive HTML reports using ydata-profiling.
4
+ """
5
+
6
+ import os
7
+ import warnings
8
+ import io
9
+ import contextlib
10
+ from pathlib import Path
11
+ from typing import Dict, Any, Optional
12
+
13
+ # Suppress multiprocessing warnings from ydata-profiling cleanup
14
+ warnings.filterwarnings("ignore", category=UserWarning, module="multiprocessing")
15
+ warnings.filterwarnings("ignore", message=".*resource_tracker.*")
16
+
17
+
18
def generate_ydata_profiling_report(
    file_path: str,
    output_path: str = "./outputs/reports/ydata_profile.html",
    minimal: bool = False,
    title: str = "Data Profiling Report",
    quiet: bool = True
) -> Dict[str, Any]:
    """
    Generate a comprehensive HTML report using ydata-profiling (formerly pandas-profiling).

    ydata-profiling provides extensive analysis including:
    - Overview: dataset statistics, warnings, reproduction
    - Variables: type inference, statistics, histograms, common values, missing values
    - Interactions: scatter plots, correlations (Pearson, Spearman, Kendall, Cramér's V)
    - Correlations: detailed correlation matrices and heatmaps
    - Missing values: matrix, heatmap, and dendrogram
    - Sample: first/last rows of the dataset
    - Duplicate rows: analysis and examples

    Args:
        file_path: Path to the dataset CSV (or Parquet) file
        output_path: Where to save the HTML report
        minimal: If True, generates faster minimal report (useful for large datasets)
        title: Title for the report
        quiet: If True, suppress tqdm/progress output emitted during profiling

    Returns:
        Dict with success status, report path, and statistics. On failure,
        a dict with ``success=False``, ``error`` and ``error_type``.
    """
    try:
        from ydata_profiling import ProfileReport
        import pandas as pd

        # Read dataset (ydata-profiling requires pandas)
        if file_path.endswith('.csv'):
            df = pd.read_csv(file_path)
        elif file_path.endswith('.parquet'):
            df = pd.read_parquet(file_path)
        else:
            raise ValueError(f"Unsupported file format: {file_path}")

        # Auto-optimize for large datasets to prevent memory crashes
        rows = len(df)
        file_size_mb = os.path.getsize(file_path) / (1024 * 1024)

        # Check environment: HuggingFace has 16GB, Render has 512MB
        # Allow larger datasets on high-memory environments
        max_rows_threshold = int(os.getenv("YDATA_MAX_ROWS", "100000"))  # Default: 100k (HF), or set to 50000 for low-mem
        max_size_threshold = float(os.getenv("YDATA_MAX_SIZE_MB", "50"))  # Default: 50MB

        # Automatic sampling only when dataset exceeds thresholds
        should_sample = file_size_mb > max_size_threshold or rows > max_rows_threshold
        if should_sample and not minimal:
            sample_size = int(os.getenv("YDATA_SAMPLE_SIZE", "100000"))
            df = df.sample(n=min(sample_size, rows), random_state=42)
            minimal = True  # Force minimal mode for large files

        # Force minimal mode for very large files even after sampling
        if file_size_mb > max_size_threshold * 2:
            minimal = True

        if quiet:
            # ydata-profiling can emit very verbose tqdm/progress output; keep CLI output clean by default.
            os.environ.setdefault("TQDM_DISABLE", "1")
            os.environ.setdefault("HF_HUB_DISABLE_PROGRESS_BARS", "1")

        # Create output directory if needed
        os.makedirs(os.path.dirname(output_path) or "./outputs/reports", exist_ok=True)

        # Configure stdout/stderr redirection based on the quiet flag
        output_capture = io.StringIO() if quiet else None
        error_capture = io.StringIO() if quiet else None
        if quiet:
            output_ctx = contextlib.redirect_stdout(output_capture)
            error_ctx = contextlib.redirect_stderr(error_capture)
        else:
            output_ctx = contextlib.nullcontext()
            error_ctx = contextlib.nullcontext()

        with output_ctx, error_ctx:
            if minimal:
                # Minimal mode: faster for large datasets, less memory
                profile = ProfileReport(
                    df,
                    title=title,
                    minimal=True,
                    explorative=False,
                    samples=None,  # Disable sample display to save memory
                    correlations=None,  # Skip correlations in minimal mode
                    missing_diagrams=None,  # Skip missing diagrams
                    duplicates=None,  # Skip duplicate analysis
                    interactions=None  # Skip interactions
                )
            else:
                # Full mode: comprehensive analysis
                profile = ProfileReport(
                    df,
                    title=title,
                    explorative=True,
                    correlations={
                        "pearson": {"calculate": True},
                        "spearman": {"calculate": True},
                        "kendall": {"calculate": False},  # Slow for large datasets
                        "phi_k": {"calculate": True},
                        "cramers": {"calculate": True},
                    }
                )

            # Generate HTML report. FIX: kept inside the redirect context —
            # rendering is where most of the progress output is emitted, so
            # running it outside the capture defeated the quiet flag.
            profile.to_file(output_path)

        # Extract key statistics
        num_features = len(df.columns)
        num_rows = len(df)
        num_numeric = df.select_dtypes(include=['number']).shape[1]
        num_categorical = df.select_dtypes(include=['object', 'category']).shape[1]
        num_boolean = df.select_dtypes(include=['bool']).shape[1]
        # FIX: cast to int — pandas returns numpy int64 here, which is not
        # JSON-serializable and was inconsistent with int(duplicate_rows) below.
        missing_cells = int(df.isnull().sum().sum())
        total_cells = num_rows * num_features
        missing_pct = (missing_cells / total_cells) * 100 if total_cells > 0 else 0
        duplicate_rows = df.duplicated().sum()

        return {
            "success": True,
            "report_path": output_path,
            "message": f"✅ ydata-profiling report generated successfully at: {output_path}",
            "statistics": {
                "dataset_size": {
                    "rows": num_rows,
                    "columns": num_features,
                    "cells": total_cells
                },
                "variable_types": {
                    "numeric": num_numeric,
                    "categorical": num_categorical,
                    "boolean": num_boolean
                },
                "data_quality": {
                    "missing_cells": missing_cells,
                    "missing_percentage": round(missing_pct, 2),
                    "duplicate_rows": int(duplicate_rows)
                },
                "report_config": {
                    "minimal_mode": minimal,
                    "title": title
                }
            }
        }

    except ImportError:
        return {
            "success": False,
            "error": "ydata-profiling not installed. Install with: pip install ydata-profiling",
            "error_type": "MissingDependency"
        }
    except Exception as e:
        return {
            "success": False,
            "error": f"Failed to generate ydata-profiling report: {str(e)}",
            "error_type": type(e).__name__
        }
178
+
179
+
180
def generate_sweetviz_report(
    file_path: str,
    target_col: Optional[str] = None,
    compare_file_path: Optional[str] = None,
    output_path: str = "./outputs/reports/sweetviz_report.html",
    title: str = "Sweetviz EDA Report",
    quiet: bool = True
) -> Dict[str, Any]:
    """
    Generate an interactive EDA report using Sweetviz.

    Sweetviz provides:
    - Feature-by-feature analysis with distributions
    - Target analysis (associations with target variable)
    - Dataset comparison (train vs test)
    - Correlations/associations for numeric and categorical features

    Args:
        file_path: Path to the dataset CSV (or Parquet) file
        target_col: Optional target column for supervised analysis
        compare_file_path: Optional second dataset for comparison (e.g., test set)
        output_path: Where to save the HTML report
        title: Title for the report
        quiet: If True, suppress progress output during report generation

    Returns:
        Dict with success status and report path, or a failure dict with
        ``error`` and ``error_type`` keys.
    """
    try:
        import sweetviz as sv
        import pandas as pd
    except ImportError:
        return {
            "success": False,
            "error": "sweetviz not installed. Install with: pip install sweetviz>=2.3",
            "error_type": "MissingDependency"
        }

    try:
        # Load the primary dataset; only CSV and Parquet are supported
        if file_path.endswith('.csv'):
            frame = pd.read_csv(file_path)
        elif file_path.endswith('.parquet'):
            frame = pd.read_parquet(file_path)
        else:
            raise ValueError(f"Unsupported file format: {file_path}")

        # Ensure the report directory exists before writing
        os.makedirs(os.path.dirname(output_path) or "./outputs/reports", exist_ok=True)

        if quiet:
            os.environ.setdefault("TQDM_DISABLE", "1")

        # Build the redirect contexts: capture stdout/stderr when quiet,
        # pass output through untouched otherwise.
        sink_out = io.StringIO() if quiet else None
        sink_err = io.StringIO() if quiet else None
        if quiet:
            stdout_ctx = contextlib.redirect_stdout(sink_out)
            stderr_ctx = contextlib.redirect_stderr(sink_err)
        else:
            stdout_ctx = contextlib.nullcontext()
            stderr_ctx = contextlib.nullcontext()

        with stdout_ctx, stderr_ctx:
            supervised = bool(target_col) and target_col in frame.columns
            if compare_file_path:
                # Comparison mode: load the second dataset (train vs test)
                if compare_file_path.endswith('.csv'):
                    other = pd.read_csv(compare_file_path)
                else:
                    other = pd.read_parquet(compare_file_path)

                left = [frame, "Dataset 1"]
                right = [other, "Dataset 2"]
                if supervised:
                    report = sv.compare(left, right, target_feat=target_col)
                else:
                    report = sv.compare(left, right)
            elif supervised:
                # Single dataset, with target associations
                report = sv.analyze(frame, target_feat=target_col)
            else:
                # Single dataset, unsupervised
                report = sv.analyze(frame)

            # Write the HTML report; open_browser=False keeps it headless
            report.show_html(output_path, open_browser=False)

        return {
            "success": True,
            "report_path": output_path,
            "message": f"✅ Sweetviz report generated at: {output_path}",
            "statistics": {
                "rows": len(frame),
                "columns": len(frame.columns),
                "target_column": target_col,
                "comparison_mode": compare_file_path is not None
            }
        }

    except Exception as e:
        return {
            "success": False,
            "error": f"Failed to generate Sweetviz report: {str(e)}",
            "error_type": type(e).__name__
        }
@@ -0,0 +1,241 @@
1
+ """
2
+ Enhanced Feature Engineering - Additional robust features
3
+ """
4
+
5
+ import polars as pl
6
+ import numpy as np
7
+ from typing import Dict, Any, List, Optional
8
+ from pathlib import Path
9
+ import sys
10
+ import os
11
+
12
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
13
+
14
+ from ds_agent.utils.polars_helpers import load_dataframe, save_dataframe, get_numeric_columns
15
+ from ds_agent.utils.validation import validate_file_exists, validate_dataframe
16
+
17
+
18
def create_ratio_features(file_path: str,
                          columns: Optional[List[str]] = None,
                          max_ratios: int = 20,
                          output_path: Optional[str] = None) -> Dict[str, Any]:
    """
    Create ratio features (a/b) for pairs of numeric columns.
    ROBUST: Handles division by zero, infinity, and NaN values.

    Only the first 16 numeric columns are considered as pair candidates,
    and at most ``max_ratios`` features are produced.

    Args:
        file_path: Path to dataset
        columns: Columns to use (None = all numeric)
        max_ratios: Maximum number of ratio features
        output_path: Output file path (None = don't save)

    Returns:
        Dictionary with results
    """
    from itertools import combinations, islice

    validate_file_exists(file_path)
    df = load_dataframe(file_path)
    validate_dataframe(df)

    if columns is None:
        columns = get_numeric_columns(df)

    print(f"🔢 Creating ratio features from {len(columns)} columns...")

    ratio_exprs = []
    feature_names = []

    # Same candidate pairs as before (first 16 columns, col1 before col2),
    # but FIX: iteration stops completely once max_ratios is reached —
    # the old nested loop's `break` only exited the inner loop, so the
    # outer loop kept spinning through the remaining columns.
    candidate_pairs = combinations(columns[:16], 2)
    for col1, col2 in islice(candidate_pairs, max(max_ratios, 0)):
        ratio_name = f"ratio_{col1}_div_{col2}"
        # Safe division: treat near-zero denominators as 0, clip extreme
        # results, and replace NaN/null with 0.
        ratio_expr = (
            pl.when(pl.col(col2).abs() < 1e-10)
            .then(0)
            .otherwise(pl.col(col1) / pl.col(col2))
            .clip(-1e6, 1e6)  # Clip extreme values
            .fill_nan(0)
            .fill_null(0)
            .alias(ratio_name)
        )
        ratio_exprs.append(ratio_expr)
        feature_names.append(ratio_name)

    df = df.with_columns(ratio_exprs)

    if output_path:
        save_dataframe(df, output_path)

    return {
        'success': True,
        'tool': 'create_ratio_features',
        'result': {
            'new_features': len(ratio_exprs),
            'feature_names': feature_names,
            'output_path': output_path
        }
    }
80
+
81
+
82
def create_statistical_features(file_path: str,
                                columns: Optional[List[str]] = None,
                                output_path: Optional[str] = None) -> Dict[str, Any]:
    """
    Create row-wise statistical features (mean, std, min, max, range, sum).
    ROBUST: Handles missing values and edge cases.

    Args:
        file_path: Path to dataset
        columns: Columns to use (None = all numeric)
        output_path: Output file path (None = don't save)

    Returns:
        Dictionary with results
    """
    validate_file_exists(file_path)
    df = load_dataframe(file_path)
    validate_dataframe(df)

    if columns is None:
        columns = get_numeric_columns(df)

    print(f"📊 Creating statistical features across {len(columns)} columns...")

    # Build the row-values list expression once and reuse it: polars
    # expressions are immutable, so sharing the object is safe and avoids
    # constructing the same concat_list seven times over.
    row_values = pl.concat_list([pl.col(c) for c in columns])

    # Row-wise statistics; null aggregate results are filled with 0
    stat_features = [
        row_values.list.mean().fill_null(0).alias('row_mean'),
        row_values.list.std().fill_null(0).alias('row_std'),
        row_values.list.min().fill_null(0).alias('row_min'),
        row_values.list.max().fill_null(0).alias('row_max'),
        (row_values.list.max() - row_values.list.min()).fill_null(0).alias('row_range'),
        row_values.list.sum().fill_null(0).alias('row_sum'),
    ]

    df = df.with_columns(stat_features)

    if output_path:
        save_dataframe(df, output_path)

    return {
        'success': True,
        'tool': 'create_statistical_features',
        'result': {
            'new_features': 6,
            'feature_names': ['row_mean', 'row_std', 'row_min', 'row_max', 'row_range', 'row_sum'],
            'output_path': output_path
        }
    }
131
+
132
+
133
def create_log_features(file_path: str,
                        columns: Optional[List[str]] = None,
                        output_path: Optional[str] = None) -> Dict[str, Any]:
    """
    Create log-transformed features for skewed distributions.
    ROBUST: Handles negative values and zeros.

    Strictly positive columns get ``log(x)`` (named ``log_<col>``);
    non-negative columns get ``log(x+1)`` (named ``log1p_<col>``);
    columns containing negatives or only nulls are skipped.

    Args:
        file_path: Path to dataset
        columns: Columns to use (None = all numeric with positive values)
        output_path: Output file path (None = don't save)

    Returns:
        Dictionary with results
    """
    validate_file_exists(file_path)
    df = load_dataframe(file_path)
    validate_dataframe(df)

    if columns is None:
        columns = get_numeric_columns(df)

    print(f"📈 Creating log-transformed features for {len(columns)} columns...")

    log_exprs = []
    feature_names = []

    for column in columns:
        lowest = df[column].min()
        if lowest is None:
            # All-null column: nothing to transform
            continue
        if lowest > 0:
            # Strictly positive: plain log(x)
            name = f"log_{column}"
            transformed = pl.col(column).log()
        elif lowest >= 0:
            # Non-negative (contains zeros): log(x+1)
            name = f"log1p_{column}"
            transformed = (pl.col(column) + 1).log()
        else:
            # Negative values present: skip this column
            continue
        log_exprs.append(transformed.fill_nan(0).alias(name))
        feature_names.append(name)

    if log_exprs:
        df = df.with_columns(log_exprs)

    if output_path:
        save_dataframe(df, output_path)

    return {
        'success': True,
        'tool': 'create_log_features',
        'result': {
            'new_features': len(log_exprs),
            'feature_names': feature_names,
            'output_path': output_path
        }
    }
187
+
188
+
189
def create_binned_features(file_path: str,
                           columns: Optional[List[str]] = None,
                           n_bins: int = 5,
                           output_path: Optional[str] = None) -> Dict[str, Any]:
    """
    Create binned (discretized) features from continuous variables.
    ROBUST: Uses quantile-based binning to handle outliers.

    Args:
        file_path: Path to dataset
        columns: Columns to use (None = first 10 numeric columns)
        n_bins: Number of bins
        output_path: Output file path (None = don't save)

    Returns:
        Dictionary with results
    """
    validate_file_exists(file_path)
    df = load_dataframe(file_path)
    validate_dataframe(df)

    if columns is None:
        columns = get_numeric_columns(df)[:10]  # Limit to 10 columns

    print(f"🗂️ Creating binned features for {len(columns)} columns with {n_bins} bins...")

    # Shared labels for every binned column: bin_0 .. bin_{n_bins-1}
    bin_labels = [f"bin_{i}" for i in range(n_bins)]

    binned_exprs = []
    feature_names = []

    for column in columns:
        alias = f"{column}_binned"
        # Quantile-based binning; allow_duplicates tolerates repeated
        # quantile edges, and nulls fall back to the first bin.
        expr = (
            pl.col(column)
            .qcut(n_bins, labels=bin_labels, allow_duplicates=True)
            .fill_null("bin_0")
            .alias(alias)
        )
        binned_exprs.append(expr)
        feature_names.append(alias)

    df = df.with_columns(binned_exprs)

    if output_path:
        save_dataframe(df, output_path)

    return {
        'success': True,
        'tool': 'create_binned_features',
        'result': {
            'new_features': len(binned_exprs),
            'feature_names': feature_names,
            'n_bins': n_bins,
            'output_path': output_path
        }
    }