ds-agent-cli 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. package/bin/ds-agent.js +451 -0
  2. package/ds_agent/__init__.py +8 -0
  3. package/package.json +28 -0
  4. package/requirements.txt +126 -0
  5. package/setup.py +35 -0
  6. package/src/__init__.py +7 -0
  7. package/src/_compress_tool_result.py +118 -0
  8. package/src/api/__init__.py +4 -0
  9. package/src/api/app.py +1626 -0
  10. package/src/cache/__init__.py +5 -0
  11. package/src/cache/cache_manager.py +561 -0
  12. package/src/cli.py +2886 -0
  13. package/src/dynamic_prompts.py +281 -0
  14. package/src/orchestrator.py +4799 -0
  15. package/src/progress_manager.py +139 -0
  16. package/src/reasoning/__init__.py +332 -0
  17. package/src/reasoning/business_summary.py +431 -0
  18. package/src/reasoning/data_understanding.py +356 -0
  19. package/src/reasoning/model_explanation.py +383 -0
  20. package/src/reasoning/reasoning_trace.py +239 -0
  21. package/src/registry/__init__.py +3 -0
  22. package/src/registry/tools_registry.py +3 -0
  23. package/src/session_memory.py +448 -0
  24. package/src/session_store.py +370 -0
  25. package/src/storage/__init__.py +19 -0
  26. package/src/storage/artifact_store.py +620 -0
  27. package/src/storage/helpers.py +116 -0
  28. package/src/storage/huggingface_storage.py +694 -0
  29. package/src/storage/r2_storage.py +0 -0
  30. package/src/storage/user_files_service.py +288 -0
  31. package/src/tools/__init__.py +335 -0
  32. package/src/tools/advanced_analysis.py +823 -0
  33. package/src/tools/advanced_feature_engineering.py +708 -0
  34. package/src/tools/advanced_insights.py +578 -0
  35. package/src/tools/advanced_preprocessing.py +549 -0
  36. package/src/tools/advanced_training.py +906 -0
  37. package/src/tools/agent_tool_mapping.py +326 -0
  38. package/src/tools/auto_pipeline.py +420 -0
  39. package/src/tools/autogluon_training.py +1480 -0
  40. package/src/tools/business_intelligence.py +860 -0
  41. package/src/tools/cloud_data_sources.py +581 -0
  42. package/src/tools/code_interpreter.py +390 -0
  43. package/src/tools/computer_vision.py +614 -0
  44. package/src/tools/data_cleaning.py +614 -0
  45. package/src/tools/data_profiling.py +593 -0
  46. package/src/tools/data_type_conversion.py +268 -0
  47. package/src/tools/data_wrangling.py +433 -0
  48. package/src/tools/eda_reports.py +284 -0
  49. package/src/tools/enhanced_feature_engineering.py +241 -0
  50. package/src/tools/feature_engineering.py +302 -0
  51. package/src/tools/matplotlib_visualizations.py +1327 -0
  52. package/src/tools/model_training.py +520 -0
  53. package/src/tools/nlp_text_analytics.py +761 -0
  54. package/src/tools/plotly_visualizations.py +497 -0
  55. package/src/tools/production_mlops.py +852 -0
  56. package/src/tools/time_series.py +507 -0
  57. package/src/tools/tools_registry.py +2133 -0
  58. package/src/tools/visualization_engine.py +559 -0
  59. package/src/utils/__init__.py +42 -0
  60. package/src/utils/error_recovery.py +313 -0
  61. package/src/utils/parallel_executor.py +402 -0
  62. package/src/utils/polars_helpers.py +248 -0
  63. package/src/utils/schema_extraction.py +132 -0
  64. package/src/utils/semantic_layer.py +392 -0
  65. package/src/utils/token_budget.py +411 -0
  66. package/src/utils/validation.py +377 -0
  67. package/src/workflow_state.py +154 -0
@@ -0,0 +1,302 @@
1
+ """
2
+ Feature Engineering Tools
3
+ Tools for creating new features from existing data.
4
+ """
5
+
6
+ import polars as pl
7
+ import numpy as np
8
+ from typing import Dict, Any, List, Optional
9
+ from pathlib import Path
10
+ import sys
11
+ import os
12
+
13
+ # Add parent directory to path for imports
14
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
15
+
16
+ from ds_agent.utils.polars_helpers import (
17
+ load_dataframe,
18
+ save_dataframe,
19
+ get_numeric_columns,
20
+ get_categorical_columns,
21
+ )
22
+ from ds_agent.utils.validation import (
23
+ validate_file_exists,
24
+ validate_file_format,
25
+ validate_dataframe,
26
+ validate_column_exists,
27
+ validate_datetime_column,
28
+ )
29
+
30
+
31
def create_time_features(file_path: str, date_col: str,
                         output_path: str) -> Dict[str, Any]:
    """
    Extract comprehensive time-based features from a datetime column.

    Creates calendar parts (year, month, day, day-of-week, quarter), an
    is_weekend flag, and cyclical sin/cos encodings for month (and hour,
    when the column carries a time component).

    Args:
        file_path: Path to CSV or Parquet file
        date_col: Name of datetime column
        output_path: Path to save dataset with new features

    Returns:
        Dictionary with feature engineering report: status, list of
        feature names created, their count, and the output path.
    """
    # Validation
    validate_file_exists(file_path)
    validate_file_format(file_path)

    # Load data
    df = load_dataframe(file_path)
    validate_dataframe(df)
    validate_column_exists(df, date_col)

    # Try to parse datetime if it's stored as a string
    if df[date_col].dtype == pl.Utf8:
        try:
            df = df.with_columns(
                pl.col(date_col).str.strptime(pl.Datetime, strict=False).alias(date_col)
            )
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # are not swallowed.
        except Exception:
            return {
                "status": "error",
                "message": f"Could not parse column '{date_col}' as datetime"
            }

    # Validate it's now a datetime
    if df[date_col].dtype not in [pl.Date, pl.Datetime]:
        return {
            "status": "error",
            "message": f"Column '{date_col}' is not a datetime type (dtype: {df[date_col].dtype})"
        }

    features_created = []

    # Extract basic calendar features
    df = df.with_columns([
        pl.col(date_col).dt.year().alias(f"{date_col}_year"),
        pl.col(date_col).dt.month().alias(f"{date_col}_month"),
        pl.col(date_col).dt.day().alias(f"{date_col}_day"),
        pl.col(date_col).dt.weekday().alias(f"{date_col}_dayofweek"),
        pl.col(date_col).dt.quarter().alias(f"{date_col}_quarter"),
    ])

    features_created.extend([
        f"{date_col}_year",
        f"{date_col}_month",
        f"{date_col}_day",
        f"{date_col}_dayofweek",
        f"{date_col}_quarter"
    ])

    # is_weekend: Polars dt.weekday() is ISO-numbered (Mon=1 .. Sun=7),
    # so the weekend is 6 (Saturday) and 7 (Sunday). The previous `>= 5`
    # check wrongly flagged Friday as weekend.
    df = df.with_columns(
        (pl.col(f"{date_col}_dayofweek") >= 6).cast(pl.Int8).alias(f"{date_col}_is_weekend")
    )
    features_created.append(f"{date_col}_is_weekend")

    # Cyclical encoding for month (sin/cos) so Dec/Jan are adjacent
    df = df.with_columns([
        (2 * np.pi * pl.col(f"{date_col}_month") / 12).sin().alias(f"{date_col}_month_sin"),
        (2 * np.pi * pl.col(f"{date_col}_month") / 12).cos().alias(f"{date_col}_month_cos"),
    ])
    features_created.extend([
        f"{date_col}_month_sin",
        f"{date_col}_month_cos"
    ])

    # If datetime has a time component, extract hour features too
    if df[date_col].dtype == pl.Datetime:
        try:
            df = df.with_columns([
                pl.col(date_col).dt.hour().alias(f"{date_col}_hour"),
            ])
            features_created.append(f"{date_col}_hour")

            # Cyclical encoding for hour so 23:00 and 00:00 are adjacent
            df = df.with_columns([
                (2 * np.pi * pl.col(f"{date_col}_hour") / 24).sin().alias(f"{date_col}_hour_sin"),
                (2 * np.pi * pl.col(f"{date_col}_hour") / 24).cos().alias(f"{date_col}_hour_cos"),
            ])
            features_created.extend([
                f"{date_col}_hour_sin",
                f"{date_col}_hour_cos"
            ])
        # Narrowed from a bare `except:`; hour extraction is best-effort.
        except Exception:
            pass  # Hour extraction failed, skip

    # Save dataset
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    save_dataframe(df, output_path)

    return {
        "status": "success",
        "features_created": features_created,
        "num_features": len(features_created),
        "output_path": output_path
    }
137
+
138
+
139
def encode_categorical(file_path: str, method: str = "auto", columns: Optional[List[str]] = None,
                       target_col: Optional[str] = None,
                       output_path: Optional[str] = None) -> Dict[str, Any]:
    """
    Encode categorical variables.

    Args:
        file_path: Path to CSV or Parquet file
        method: Encoding method ('one_hot', 'target', 'frequency', 'auto').
            'auto' currently resolves to 'frequency' as a safe default for
            any cardinality.
        columns: List of columns to encode, or ['all'] for all categorical.
            If None, defaults to all categorical columns
        target_col: Required for target encoding - name of target column
        output_path: Path to save dataset with encoded features (required)

    Returns:
        Dictionary with encoding report: method used, per-column status,
        names of features created, total count, and output path.

    Raises:
        ValueError: If a requested column does not exist in the dataset.
    """
    # Validation
    validate_file_exists(file_path)
    validate_file_format(file_path)

    # output_path was annotated `str` but defaulted to None, which crashed
    # later with an opaque TypeError at Path(None). Fail fast instead, in
    # the same error-dict style used by the other validation failures.
    if output_path is None:
        return {
            "status": "error",
            "message": "output_path is required"
        }

    # Load data
    df = load_dataframe(file_path)
    validate_dataframe(df)

    # Determine which columns to process
    categorical_cols = get_categorical_columns(df)

    # Default to all categorical columns if not specified
    if columns is None or columns == ["all"]:
        target_cols = categorical_cols
    else:
        # Validate columns exist
        for col in columns:
            if col not in df.columns:
                raise ValueError(f"Column '{col}' not found")
        target_cols = columns

    # Auto-detect method if 'auto'
    if method == "auto":
        # Frequency encoding is safe at any cardinality (one new column
        # per input column, no target leakage), so it is the default.
        method = "frequency"  # Default safe choice

    # For target encoding, validate target column
    if method == "target":
        if target_col is None:
            return {
                "status": "error",
                "message": "target_col is required for target encoding"
            }
        validate_column_exists(df, target_col)

    report = {
        "method": method,
        "columns_processed": {},
        "features_created": []
    }

    # Process each column; failures are recorded per column so one bad
    # column does not abort the rest.
    for col in target_cols:
        if col not in df.columns:
            report["columns_processed"][col] = {
                "status": "error",
                "message": "Column not found"
            }
            continue

        n_unique = df[col].n_unique()

        try:
            if method == "one_hot":
                # One-hot encoding
                # Guard against column explosion on high cardinality
                if n_unique > 50:
                    report["columns_processed"][col] = {
                        "status": "warning",
                        "message": f"Column has {n_unique} unique values. Consider using frequency or target encoding instead."
                    }
                    continue

                # Get dummies
                encoded = df.select(pl.col(col)).to_dummies(columns=[col])

                # Add encoded columns to dataframe
                for enc_col in encoded.columns:
                    df = df.with_columns(encoded[enc_col])
                    report["features_created"].append(enc_col)

                # Drop original column
                df = df.drop(col)

                report["columns_processed"][col] = {
                    "status": "success",
                    "num_features_created": len(encoded.columns)
                }

            elif method == "frequency":
                # Frequency encoding: replace each category by its
                # relative frequency in the dataset
                value_counts = df[col].value_counts()
                freq_map = {
                    row[0]: row[1] / len(df)
                    for row in value_counts.iter_rows()
                }

                # Create new column with frequencies; unseen values
                # (e.g. nulls) map to 0.0
                new_col_name = f"{col}_freq"
                df = df.with_columns(
                    pl.col(col).replace_strict(freq_map, default=0.0).alias(new_col_name)
                )

                # Drop original column
                df = df.drop(col)

                report["features_created"].append(new_col_name)
                report["columns_processed"][col] = {
                    "status": "success",
                    "num_features_created": 1
                }

            elif method == "target":
                # Target encoding (mean encoding)
                # Calculate mean target value for each category
                target_means = (
                    df.group_by(col)
                    .agg(pl.col(target_col).mean().alias("target_mean"))
                )

                # Create dictionary for mapping
                target_map = {
                    row[0]: row[1]
                    for row in target_means.iter_rows()
                }

                # Global mean serves as fallback for unseen categories
                global_mean = df[target_col].mean()

                # Create new column with target encoding
                new_col_name = f"{col}_target_enc"
                df = df.with_columns(
                    pl.col(col).replace_strict(target_map, default=global_mean).alias(new_col_name)
                )

                # Drop original column
                df = df.drop(col)

                report["features_created"].append(new_col_name)
                report["columns_processed"][col] = {
                    "status": "success",
                    "num_features_created": 1
                }

        except Exception as e:
            report["columns_processed"][col] = {
                "status": "error",
                "message": str(e)
            }

    report["total_features_created"] = len(report["features_created"])

    # Save dataset
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    save_dataframe(df, output_path)
    report["output_path"] = output_path

    return report