ds-agent-cli 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/ds-agent.js +451 -0
- package/ds_agent/__init__.py +8 -0
- package/package.json +28 -0
- package/requirements.txt +126 -0
- package/setup.py +35 -0
- package/src/__init__.py +7 -0
- package/src/_compress_tool_result.py +118 -0
- package/src/api/__init__.py +4 -0
- package/src/api/app.py +1626 -0
- package/src/cache/__init__.py +5 -0
- package/src/cache/cache_manager.py +561 -0
- package/src/cli.py +2886 -0
- package/src/dynamic_prompts.py +281 -0
- package/src/orchestrator.py +4799 -0
- package/src/progress_manager.py +139 -0
- package/src/reasoning/__init__.py +332 -0
- package/src/reasoning/business_summary.py +431 -0
- package/src/reasoning/data_understanding.py +356 -0
- package/src/reasoning/model_explanation.py +383 -0
- package/src/reasoning/reasoning_trace.py +239 -0
- package/src/registry/__init__.py +3 -0
- package/src/registry/tools_registry.py +3 -0
- package/src/session_memory.py +448 -0
- package/src/session_store.py +370 -0
- package/src/storage/__init__.py +19 -0
- package/src/storage/artifact_store.py +620 -0
- package/src/storage/helpers.py +116 -0
- package/src/storage/huggingface_storage.py +694 -0
- package/src/storage/r2_storage.py +0 -0
- package/src/storage/user_files_service.py +288 -0
- package/src/tools/__init__.py +335 -0
- package/src/tools/advanced_analysis.py +823 -0
- package/src/tools/advanced_feature_engineering.py +708 -0
- package/src/tools/advanced_insights.py +578 -0
- package/src/tools/advanced_preprocessing.py +549 -0
- package/src/tools/advanced_training.py +906 -0
- package/src/tools/agent_tool_mapping.py +326 -0
- package/src/tools/auto_pipeline.py +420 -0
- package/src/tools/autogluon_training.py +1480 -0
- package/src/tools/business_intelligence.py +860 -0
- package/src/tools/cloud_data_sources.py +581 -0
- package/src/tools/code_interpreter.py +390 -0
- package/src/tools/computer_vision.py +614 -0
- package/src/tools/data_cleaning.py +614 -0
- package/src/tools/data_profiling.py +593 -0
- package/src/tools/data_type_conversion.py +268 -0
- package/src/tools/data_wrangling.py +433 -0
- package/src/tools/eda_reports.py +284 -0
- package/src/tools/enhanced_feature_engineering.py +241 -0
- package/src/tools/feature_engineering.py +302 -0
- package/src/tools/matplotlib_visualizations.py +1327 -0
- package/src/tools/model_training.py +520 -0
- package/src/tools/nlp_text_analytics.py +761 -0
- package/src/tools/plotly_visualizations.py +497 -0
- package/src/tools/production_mlops.py +852 -0
- package/src/tools/time_series.py +507 -0
- package/src/tools/tools_registry.py +2133 -0
- package/src/tools/visualization_engine.py +559 -0
- package/src/utils/__init__.py +42 -0
- package/src/utils/error_recovery.py +313 -0
- package/src/utils/parallel_executor.py +402 -0
- package/src/utils/polars_helpers.py +248 -0
- package/src/utils/schema_extraction.py +132 -0
- package/src/utils/semantic_layer.py +392 -0
- package/src/utils/token_budget.py +411 -0
- package/src/utils/validation.py +377 -0
- package/src/workflow_state.py +154 -0
|
@@ -0,0 +1,284 @@
|
|
|
1
|
+
"""
|
|
2
|
+
EDA Report Generation Tools
|
|
3
|
+
Generates comprehensive HTML reports using ydata-profiling.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import os
|
|
7
|
+
import warnings
|
|
8
|
+
import io
|
|
9
|
+
import contextlib
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
from typing import Dict, Any, Optional
|
|
12
|
+
|
|
13
|
+
# Suppress multiprocessing warnings from ydata-profiling cleanup.
# The second filter matches the noisy "resource_tracker" messages its
# worker-pool teardown can emit on interpreter exit.
warnings.filterwarnings("ignore", category=UserWarning, module="multiprocessing")
warnings.filterwarnings("ignore", message=".*resource_tracker.*")
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def generate_ydata_profiling_report(
    file_path: str,
    output_path: str = "./outputs/reports/ydata_profile.html",
    minimal: bool = False,
    title: str = "Data Profiling Report",
    quiet: bool = True
) -> Dict[str, Any]:
    """
    Generate a comprehensive HTML report using ydata-profiling (formerly pandas-profiling).

    ydata-profiling provides extensive analysis including:
    - Overview: dataset statistics, warnings, reproduction
    - Variables: type inference, statistics, histograms, common values, missing values
    - Interactions: scatter plots, correlations (Pearson, Spearman, Kendall, Cramér's V)
    - Correlations: detailed correlation matrices and heatmaps
    - Missing values: matrix, heatmap, and dendrogram
    - Sample: first/last rows of the dataset
    - Duplicate rows: analysis and examples

    Args:
        file_path: Path to the dataset (.csv or .parquet)
        output_path: Where to save the HTML report
        minimal: If True, generates faster minimal report (useful for large datasets)
        title: Title for the report
        quiet: If True, capture stdout/stderr during report generation and set
            TQDM_DISABLE / HF_HUB_DISABLE_PROGRESS_BARS process-wide (only if
            those variables are not already set)

    Returns:
        Dict with success status, report path, and statistics. The statistics
        describe the DataFrame that was actually profiled, which may be a
        random sample of the file when the size/row thresholds are exceeded.
        Never raises: failures are reported as
        {"success": False, "error": ..., "error_type": ...}.
    """
    try:
        from ydata_profiling import ProfileReport
        import pandas as pd

        # Read dataset (ydata-profiling requires pandas)
        if file_path.endswith('.csv'):
            df = pd.read_csv(file_path)
        elif file_path.endswith('.parquet'):
            df = pd.read_parquet(file_path)
        else:
            raise ValueError(f"Unsupported file format: {file_path}")

        # Auto-optimize for large datasets to prevent memory crashes
        rows = len(df)
        file_size_mb = os.path.getsize(file_path) / (1024 * 1024)

        # Thresholds are env-tunable because deployments have very different
        # memory budgets (e.g. HuggingFace has 16GB, Render has 512MB).
        max_rows_threshold = int(os.getenv("YDATA_MAX_ROWS", "100000"))
        max_size_threshold = float(os.getenv("YDATA_MAX_SIZE_MB", "50"))

        # Automatic sampling only when dataset exceeds thresholds
        should_sample = file_size_mb > max_size_threshold or rows > max_rows_threshold
        if should_sample and not minimal:
            sample_size = int(os.getenv("YDATA_SAMPLE_SIZE", "100000"))
            df = df.sample(n=min(sample_size, rows), random_state=42)
            minimal = True  # Force minimal mode for large files

        # Force minimal mode for very large files even after sampling
        if file_size_mb > max_size_threshold * 2:
            minimal = True

        if quiet:
            # ydata-profiling can emit very verbose tqdm/progress output; keep CLI output clean by default.
            os.environ.setdefault("TQDM_DISABLE", "1")
            os.environ.setdefault("HF_HUB_DISABLE_PROGRESS_BARS", "1")

        # Create output directory if needed
        os.makedirs(os.path.dirname(output_path) or "./outputs/reports", exist_ok=True)

        # Swallow library chatter while profiling; nullcontext keeps the
        # with-statement shape identical in the non-quiet case.
        if quiet:
            output_ctx = contextlib.redirect_stdout(io.StringIO())
            error_ctx = contextlib.redirect_stderr(io.StringIO())
        else:
            output_ctx = contextlib.nullcontext()
            error_ctx = contextlib.nullcontext()

        with output_ctx, error_ctx:
            if minimal:
                # Minimal mode: faster for large datasets, less memory
                profile = ProfileReport(
                    df,
                    title=title,
                    minimal=True,
                    explorative=False,
                    samples=None,           # Disable sample display to save memory
                    correlations=None,      # Skip correlations in minimal mode
                    missing_diagrams=None,  # Skip missing diagrams
                    duplicates=None,        # Skip duplicate analysis
                    interactions=None       # Skip interactions
                )
            else:
                # Full mode: comprehensive analysis
                profile = ProfileReport(
                    df,
                    title=title,
                    explorative=True,
                    correlations={
                        "pearson": {"calculate": True},
                        "spearman": {"calculate": True},
                        "kendall": {"calculate": False},  # Slow for large datasets
                        "phi_k": {"calculate": True},
                        "cramers": {"calculate": True},
                    }
                )

            # Generate HTML report
            profile.to_file(output_path)

        # Extract key statistics from the profiled (possibly sampled) frame.
        num_features = len(df.columns)
        num_rows = len(df)
        num_numeric = df.select_dtypes(include=['number']).shape[1]
        num_categorical = df.select_dtypes(include=['object', 'category']).shape[1]
        num_boolean = df.select_dtypes(include=['bool']).shape[1]
        # Cast numpy scalars to plain ints so the result dict stays
        # JSON-serializable (previously only duplicate_rows was cast).
        missing_cells = int(df.isnull().sum().sum())
        total_cells = num_rows * num_features
        missing_pct = (missing_cells / total_cells) * 100 if total_cells > 0 else 0
        duplicate_rows = int(df.duplicated().sum())

        return {
            "success": True,
            "report_path": output_path,
            "message": f"✅ ydata-profiling report generated successfully at: {output_path}",
            "statistics": {
                "dataset_size": {
                    "rows": num_rows,
                    "columns": num_features,
                    "cells": total_cells
                },
                "variable_types": {
                    "numeric": num_numeric,
                    "categorical": num_categorical,
                    "boolean": num_boolean
                },
                "data_quality": {
                    "missing_cells": missing_cells,
                    "missing_percentage": round(missing_pct, 2),
                    "duplicate_rows": duplicate_rows
                },
                "report_config": {
                    "minimal_mode": minimal,
                    "title": title
                }
            }
        }

    except ImportError:
        return {
            "success": False,
            "error": "ydata-profiling not installed. Install with: pip install ydata-profiling",
            "error_type": "MissingDependency"
        }
    except Exception as e:
        return {
            "success": False,
            "error": f"Failed to generate ydata-profiling report: {str(e)}",
            "error_type": type(e).__name__
        }
|
|
178
|
+
|
|
179
|
+
|
|
180
|
+
def generate_sweetviz_report(
    file_path: str,
    target_col: Optional[str] = None,
    compare_file_path: Optional[str] = None,
    output_path: str = "./outputs/reports/sweetviz_report.html",
    title: str = "Sweetviz EDA Report",
    quiet: bool = True
) -> Dict[str, Any]:
    """
    Generate an interactive EDA report using Sweetviz.

    Sweetviz provides:
    - Feature-by-feature analysis with distributions
    - Target analysis (associations with target variable)
    - Dataset comparison (train vs test)
    - Correlations/associations for numeric and categorical features

    Args:
        file_path: Path to the dataset (.csv or .parquet)
        target_col: Optional target column for supervised analysis; silently
            ignored when the column is not present in the dataset
        compare_file_path: Optional second dataset for comparison (e.g., test set)
        output_path: Where to save the HTML report
        title: Title for the report (informational only; not passed to Sweetviz)
        quiet: If True, capture stdout/stderr and disable tqdm progress bars

    Returns:
        Dict with success status and report path. Never raises: failures are
        reported as {"success": False, "error": ..., "error_type": ...}.
    """
    try:
        import sweetviz as sv
        import pandas as pd
    except ImportError:
        return {
            "success": False,
            "error": "sweetviz not installed. Install with: pip install sweetviz>=2.3",
            "error_type": "MissingDependency"
        }

    try:
        # Read dataset
        if file_path.endswith('.csv'):
            df = pd.read_csv(file_path)
        elif file_path.endswith('.parquet'):
            df = pd.read_parquet(file_path)
        else:
            raise ValueError(f"Unsupported file format: {file_path}")

        # Create output directory
        os.makedirs(os.path.dirname(output_path) or "./outputs/reports", exist_ok=True)

        if quiet:
            # Process-wide, but only if the variable is not already set.
            os.environ.setdefault("TQDM_DISABLE", "1")

        # Capture library chatter while the report is being built.
        if quiet:
            output_ctx = contextlib.redirect_stdout(io.StringIO())
            error_ctx = contextlib.redirect_stderr(io.StringIO())
        else:
            output_ctx = contextlib.nullcontext()
            error_ctx = contextlib.nullcontext()

        with output_ctx, error_ctx:
            # Generate report
            if compare_file_path:
                # Comparison report (train vs test). Validate the comparison
                # file format the same way as the primary file — the previous
                # code fell through to read_parquet for any non-CSV path.
                if compare_file_path.endswith('.csv'):
                    df_compare = pd.read_csv(compare_file_path)
                elif compare_file_path.endswith('.parquet'):
                    df_compare = pd.read_parquet(compare_file_path)
                else:
                    raise ValueError(f"Unsupported file format: {compare_file_path}")

                if target_col and target_col in df.columns:
                    report = sv.compare([df, "Dataset 1"], [df_compare, "Dataset 2"], target_feat=target_col)
                else:
                    report = sv.compare([df, "Dataset 1"], [df_compare, "Dataset 2"])
            else:
                # Single dataset analysis
                if target_col and target_col in df.columns:
                    report = sv.analyze(df, target_feat=target_col)
                else:
                    report = sv.analyze(df)

            # Save report (open_browser=False prevents auto-opening a browser)
            report.show_html(output_path, open_browser=False)

        return {
            "success": True,
            "report_path": output_path,
            "message": f"✅ Sweetviz report generated at: {output_path}",
            "statistics": {
                "rows": len(df),
                "columns": len(df.columns),
                "target_column": target_col,
                "comparison_mode": compare_file_path is not None
            }
        }

    except Exception as e:
        return {
            "success": False,
            "error": f"Failed to generate Sweetviz report: {str(e)}",
            "error_type": type(e).__name__
        }
|
|
@@ -0,0 +1,241 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Enhanced Feature Engineering - Additional robust features
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
import polars as pl
|
|
6
|
+
import numpy as np
|
|
7
|
+
from typing import Dict, Any, List, Optional
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
import sys
|
|
10
|
+
import os
|
|
11
|
+
|
|
12
|
+
# NOTE(review): appends the grandparent directory of this file to sys.path,
# presumably so the absolute `ds_agent.*` imports below resolve when this
# module is executed outside the installed package — confirm against the
# actual package layout.
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
|
13
|
+
|
|
14
|
+
from ds_agent.utils.polars_helpers import load_dataframe, save_dataframe, get_numeric_columns
|
|
15
|
+
from ds_agent.utils.validation import validate_file_exists, validate_dataframe
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def create_ratio_features(file_path: str,
                          columns: Optional[List[str]] = None,
                          max_ratios: int = 20,
                          output_path: Optional[str] = None) -> Dict[str, Any]:
    """
    Create ratio features (a/b) for pairs of numeric columns.

    ROBUST: near-zero denominators (|b| < 1e-10) map to 0, extreme ratios
    are clipped to [-1e6, 1e6], and NaN/null results are filled with 0.
    Only the first 16 numeric columns are paired, to bound the
    combinatorial blow-up.

    Args:
        file_path: Path to dataset
        columns: Columns to use (None = all numeric)
        max_ratios: Maximum number of ratio features to create
        output_path: Output file path (None = don't save)

    Returns:
        Dictionary with success flag, tool name, and created feature names
    """
    validate_file_exists(file_path)
    df = load_dataframe(file_path)
    validate_dataframe(df)

    if columns is None:
        columns = get_numeric_columns(df)

    print(f"🔢 Creating ratio features from {len(columns)} columns...")

    ratio_exprs = []
    feature_names = []

    for i, col1 in enumerate(columns[:15]):
        # Fixed: the original only broke the inner loop, so the outer loop
        # kept scanning pointlessly after max_ratios was reached.
        if len(ratio_exprs) >= max_ratios:
            break
        for col2 in columns[i + 1:16]:
            if len(ratio_exprs) >= max_ratios:
                break

            # Safe division (avoid div by zero, replace inf/nan)
            ratio_name = f"ratio_{col1}_div_{col2}"
            ratio_expr = (
                pl.when(pl.col(col2).abs() < 1e-10)
                .then(0)
                .otherwise(pl.col(col1) / pl.col(col2))
                .clip(-1e6, 1e6)  # Clip extreme values
                .fill_nan(0)
                .fill_null(0)
                .alias(ratio_name)
            )
            ratio_exprs.append(ratio_expr)
            feature_names.append(ratio_name)

    df = df.with_columns(ratio_exprs)

    if output_path:
        save_dataframe(df, output_path)

    return {
        'success': True,
        'tool': 'create_ratio_features',
        'result': {
            'new_features': len(ratio_exprs),
            'feature_names': feature_names,
            'output_path': output_path
        }
    }
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def create_statistical_features(file_path: str,
                                columns: Optional[List[str]] = None,
                                output_path: Optional[str] = None) -> Dict[str, Any]:
    """
    Create row-wise statistical features (mean, std, min, max, range, sum).
    ROBUST: null aggregate results are filled with 0.

    Args:
        file_path: Path to dataset
        columns: Columns to aggregate across (None = all numeric)
        output_path: Output file path (None = don't save)

    Returns:
        Dictionary with success flag, tool name, and created feature names
    """
    validate_file_exists(file_path)
    df = load_dataframe(file_path)
    validate_dataframe(df)

    if columns is None:
        columns = get_numeric_columns(df)

    print(f"📊 Creating statistical features across {len(columns)} columns...")

    # Build the per-row value list once; polars expressions are immutable and
    # reusable, so there is no need to rebuild the identical concat_list for
    # each of the six aggregates (as the original did).
    row_values = pl.concat_list([pl.col(c) for c in columns])

    # Row-wise statistics
    stat_features = [
        row_values.list.mean().fill_null(0).alias('row_mean'),
        row_values.list.std().fill_null(0).alias('row_std'),
        row_values.list.min().fill_null(0).alias('row_min'),
        row_values.list.max().fill_null(0).alias('row_max'),
        (row_values.list.max() - row_values.list.min()).fill_null(0).alias('row_range'),
        row_values.list.sum().fill_null(0).alias('row_sum'),
    ]

    df = df.with_columns(stat_features)

    if output_path:
        save_dataframe(df, output_path)

    return {
        'success': True,
        'tool': 'create_statistical_features',
        'result': {
            'new_features': 6,
            'feature_names': ['row_mean', 'row_std', 'row_min', 'row_max', 'row_range', 'row_sum'],
            'output_path': output_path
        }
    }
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def create_log_features(file_path: str,
                        columns: Optional[List[str]] = None,
                        output_path: Optional[str] = None) -> Dict[str, Any]:
    """
    Add log-transformed copies of numeric columns to tame skewed distributions.

    Strictly positive columns get log(x) as ``log_<col>``; non-negative
    columns get log(x + 1) as ``log1p_<col>``. Columns containing negative
    values, or only nulls, are skipped entirely.

    Args:
        file_path: Path to dataset
        columns: Columns to consider (None = all numeric)
        output_path: Output file path (None = don't save)

    Returns:
        Dictionary with success flag, tool name, and created feature names
    """
    validate_file_exists(file_path)
    df = load_dataframe(file_path)
    validate_dataframe(df)

    if columns is None:
        columns = get_numeric_columns(df)

    print(f"📈 Creating log-transformed features for {len(columns)} columns...")

    log_exprs = []
    feature_names = []

    for column in columns:
        lowest = df[column].min()
        if lowest is None:
            continue  # all-null column: nothing to transform
        if lowest > 0:
            new_name = f"log_{column}"
            transform = pl.col(column).log()
        elif lowest >= 0:
            new_name = f"log1p_{column}"
            transform = (pl.col(column) + 1).log()
        else:
            continue  # negative values present: skip this column
        log_exprs.append(transform.fill_nan(0).alias(new_name))
        feature_names.append(new_name)

    if log_exprs:
        df = df.with_columns(log_exprs)

    if output_path:
        save_dataframe(df, output_path)

    return {
        'success': True,
        'tool': 'create_log_features',
        'result': {
            'new_features': len(log_exprs),
            'feature_names': feature_names,
            'output_path': output_path
        }
    }
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
def create_binned_features(file_path: str,
                           columns: Optional[List[str]] = None,
                           n_bins: int = 5,
                           output_path: Optional[str] = None) -> Dict[str, Any]:
    """
    Discretize continuous columns into quantile-based bins.

    Quantile binning is used so that the bin edges follow the data
    distribution rather than being dominated by outliers.

    Args:
        file_path: Path to dataset
        columns: Columns to bin (None = first 10 numeric columns)
        n_bins: Number of bins per column
        output_path: Output file path (None = don't save)

    Returns:
        Dictionary with success flag, tool name, and created feature names
    """
    validate_file_exists(file_path)
    df = load_dataframe(file_path)
    validate_dataframe(df)

    if columns is None:
        columns = get_numeric_columns(df)[:10]  # Limit to 10 columns

    print(f"🗂️ Creating binned features for {len(columns)} columns with {n_bins} bins...")

    bin_labels = [f"bin_{i}" for i in range(n_bins)]
    feature_names = [f"{col}_binned" for col in columns]

    # Quantile-based binning; allow_duplicates tolerates repeated quantile
    # edges on low-cardinality columns, and nulls land in the first bin.
    binned_exprs = [
        pl.col(col)
        .qcut(n_bins, labels=bin_labels, allow_duplicates=True)
        .fill_null("bin_0")
        .alias(name)
        for col, name in zip(columns, feature_names)
    ]

    df = df.with_columns(binned_exprs)

    if output_path:
        save_dataframe(df, output_path)

    return {
        'success': True,
        'tool': 'create_binned_features',
        'result': {
            'new_features': len(binned_exprs),
            'feature_names': feature_names,
            'n_bins': n_bins,
            'output_path': output_path
        }
    }
|