ds-agent-cli 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. package/bin/ds-agent.js +451 -0
  2. package/ds_agent/__init__.py +8 -0
  3. package/package.json +28 -0
  4. package/requirements.txt +126 -0
  5. package/setup.py +35 -0
  6. package/src/__init__.py +7 -0
  7. package/src/_compress_tool_result.py +118 -0
  8. package/src/api/__init__.py +4 -0
  9. package/src/api/app.py +1626 -0
  10. package/src/cache/__init__.py +5 -0
  11. package/src/cache/cache_manager.py +561 -0
  12. package/src/cli.py +2886 -0
  13. package/src/dynamic_prompts.py +281 -0
  14. package/src/orchestrator.py +4799 -0
  15. package/src/progress_manager.py +139 -0
  16. package/src/reasoning/__init__.py +332 -0
  17. package/src/reasoning/business_summary.py +431 -0
  18. package/src/reasoning/data_understanding.py +356 -0
  19. package/src/reasoning/model_explanation.py +383 -0
  20. package/src/reasoning/reasoning_trace.py +239 -0
  21. package/src/registry/__init__.py +3 -0
  22. package/src/registry/tools_registry.py +3 -0
  23. package/src/session_memory.py +448 -0
  24. package/src/session_store.py +370 -0
  25. package/src/storage/__init__.py +19 -0
  26. package/src/storage/artifact_store.py +620 -0
  27. package/src/storage/helpers.py +116 -0
  28. package/src/storage/huggingface_storage.py +694 -0
  29. package/src/storage/r2_storage.py +0 -0
  30. package/src/storage/user_files_service.py +288 -0
  31. package/src/tools/__init__.py +335 -0
  32. package/src/tools/advanced_analysis.py +823 -0
  33. package/src/tools/advanced_feature_engineering.py +708 -0
  34. package/src/tools/advanced_insights.py +578 -0
  35. package/src/tools/advanced_preprocessing.py +549 -0
  36. package/src/tools/advanced_training.py +906 -0
  37. package/src/tools/agent_tool_mapping.py +326 -0
  38. package/src/tools/auto_pipeline.py +420 -0
  39. package/src/tools/autogluon_training.py +1480 -0
  40. package/src/tools/business_intelligence.py +860 -0
  41. package/src/tools/cloud_data_sources.py +581 -0
  42. package/src/tools/code_interpreter.py +390 -0
  43. package/src/tools/computer_vision.py +614 -0
  44. package/src/tools/data_cleaning.py +614 -0
  45. package/src/tools/data_profiling.py +593 -0
  46. package/src/tools/data_type_conversion.py +268 -0
  47. package/src/tools/data_wrangling.py +433 -0
  48. package/src/tools/eda_reports.py +284 -0
  49. package/src/tools/enhanced_feature_engineering.py +241 -0
  50. package/src/tools/feature_engineering.py +302 -0
  51. package/src/tools/matplotlib_visualizations.py +1327 -0
  52. package/src/tools/model_training.py +520 -0
  53. package/src/tools/nlp_text_analytics.py +761 -0
  54. package/src/tools/plotly_visualizations.py +497 -0
  55. package/src/tools/production_mlops.py +852 -0
  56. package/src/tools/time_series.py +507 -0
  57. package/src/tools/tools_registry.py +2133 -0
  58. package/src/tools/visualization_engine.py +559 -0
  59. package/src/utils/__init__.py +42 -0
  60. package/src/utils/error_recovery.py +313 -0
  61. package/src/utils/parallel_executor.py +402 -0
  62. package/src/utils/polars_helpers.py +248 -0
  63. package/src/utils/schema_extraction.py +132 -0
  64. package/src/utils/semantic_layer.py +392 -0
  65. package/src/utils/token_budget.py +411 -0
  66. package/src/utils/validation.py +377 -0
  67. package/src/workflow_state.py +154 -0
@@ -0,0 +1,356 @@
1
+ """
2
+ Data Understanding Module
3
+
4
+ Provides reasoning about data characteristics, patterns, and quality.
5
+
6
+ KEY RULES:
7
+ - ✅ Accepts: Statistical summaries, metadata, sample rows
8
+ - ❌ NO: Raw DataFrames, full datasets
9
+ - ✅ Returns: Natural language insights + structured recommendations
10
+ - ❌ NO: Training decisions, model selection
11
+
12
+ Use Cases:
13
+ 1. Explain what data represents
14
+ 2. Identify data quality issues
15
+ 3. Suggest preprocessing steps
16
+ 4. Highlight interesting patterns
17
+
18
+ Example:
19
+ from reasoning.data_understanding import explain_dataset
20
+
21
+ summary = {
22
+ "rows": 10000,
23
+ "columns": 20,
24
+ "numeric": 15,
25
+ "categorical": 5,
26
+ "missing_values": {"age": 150, "income": 200},
27
+ "target_distribution": {"yes": 7000, "no": 3000}
28
+ }
29
+
30
+ explanation = explain_dataset(summary)
31
+ # Returns: {
32
+ # "overview": "This is an imbalanced classification dataset...",
33
+ # "quality_issues": ["Missing values in age and income"],
34
+ # "recommendations": ["Handle class imbalance", "Impute missing values"],
35
+ # "patterns": ["Target class imbalanced (70-30 split)"]
36
+ # }
37
+ """
38
+
39
+ from typing import Dict, Any, List, Optional
40
+ from . import get_reasoner
41
+
42
+
43
def explain_dataset(
    summary: Dict[str, Any],
    target_col: Optional[str] = None
) -> Dict[str, Any]:
    """
    Explain dataset characteristics based on summary statistics.

    Args:
        summary: Statistical summary (NO raw data!)
                 Must include: rows, columns, dtypes, missing_values
                 Optional: target_distribution, correlations, outliers
        target_col: Target column name (if known)

    Returns:
        {
            "overview": str,              # High-level description
            "quality_issues": List[str],  # Data quality problems
            "recommendations": List[str], # Suggested preprocessing steps
            "patterns": List[str],        # Interesting patterns found
            "target_insights": str        # Target variable insights (if applicable)
        }

    Raises:
        ValueError: If the summary appears to contain raw data
            (a DataFrame key or a DataFrame-like value) instead of
            summary statistics.
    """
    # Validate inputs FIRST (NO raw data allowed!)
    if "dataframe" in summary or "df" in summary:
        raise ValueError("Cannot pass raw DataFrames! Pass summary statistics only.")

    # The key check above only catches the two conventional names; also
    # duck-type the VALUES so a DataFrame smuggled under any other key
    # (e.g. "data") is rejected instead of being dumped into the prompt.
    for key, value in summary.items():
        if hasattr(value, "iloc") or hasattr(value, "to_numpy"):
            raise ValueError(
                f"Cannot pass raw DataFrames! Key '{key}' holds a DataFrame-like "
                "object; pass summary statistics only."
            )

    reasoner = get_reasoner()

    # Build reasoning prompt from summary
    prompt = f"""Analyze this dataset summary and provide insights:

**Dataset Summary:**
- Rows: {summary.get('rows', 'unknown')}
- Columns: {summary.get('columns', 'unknown')}
- Numeric columns: {summary.get('numeric_columns', [])}
- Categorical columns: {summary.get('categorical_columns', [])}
- Missing values: {summary.get('missing_values', {})}
- Target column: {target_col or 'Not specified'}

**Target Distribution (if available):**
{summary.get('target_distribution', 'Not provided')}

**Correlations (if available):**
{summary.get('top_correlations', 'Not provided')}

**Outliers (if available):**
{summary.get('outliers', 'Not provided')}

Provide:
1. Overview of what this data represents
2. Data quality issues identified
3. Preprocessing recommendations
4. Interesting patterns noticed
5. Target variable insights (if classification/regression)
"""

    system_prompt = """You are a data understanding expert. Your role is to:
- Explain what data means in plain English
- Identify data quality issues
- Suggest preprocessing steps
- Highlight patterns

You do NOT:
- Make training decisions
- Select models
- Access raw data
- Execute any code

You ONLY reason about summaries provided."""

    # Illustrative schema the reasoner fills with structured output.
    schema = {
        "overview": "string - High-level description of dataset",
        "quality_issues": ["array of strings - Data quality problems found"],
        "recommendations": ["array of strings - Preprocessing steps to take"],
        "patterns": ["array of strings - Interesting patterns noticed"],
        "target_insights": "string - Insights about target variable"
    }

    result = reasoner.reason_structured(prompt, schema, system_prompt)

    return result
124
+
125
+
126
def explain_data_profile(
    profile: Dict[str, Any]
) -> str:
    """
    Turn raw data-profiling output into a plain-language narrative.

    Args:
        profile: Profiling results produced by the tooling layer
            (per-column stats, missing-value summaries, cardinality, ...).

    Returns:
        A natural language explanation of the profiling results.
    """
    guidance = """You are a data quality expert explaining profiling results.
Be concise, actionable, and highlight the most important findings."""

    request = f"""Explain these data profiling results in clear, actionable terms:

{profile}

Focus on:
- What the data looks like
- Any concerning patterns
- Next steps for data cleaning
"""

    # Low temperature keeps the explanation factual and repeatable.
    return get_reasoner().reason(request, guidance, temperature=0.1)
159
+
160
+
161
def suggest_transformations(
    column_stats: Dict[str, Any],
    task_type: Optional[str] = None
) -> Dict[str, List[str]]:
    """
    Recommend per-column transformations from summary statistics.

    Args:
        column_stats: Per-column statistics, e.g.
            {"age": {"min": 0, "max": 150, "outliers": 5},
             "income": {"skewness": 3.5, "distribution": "highly_skewed"}}
        task_type: 'classification' or 'regression' when known.

    Returns:
        Mapping of column name -> list of suggested transformations, e.g.
        {"age": ["Remove outliers > 100", "Normalize to 0-1 range"]}
    """
    request = f"""Based on these column statistics, suggest transformations:

**Column Statistics:**
{column_stats}

**Task Type:** {task_type or 'Unknown'}

For each column, suggest:
- Outlier handling
- Scaling/normalization
- Distribution transformations
- Encoding strategies (for categorical)

Be specific and actionable."""

    # Illustrative shape: one entry per column, each an array of steps.
    expected_shape = {
        "column_name": ["array of transformation suggestions"]
    }

    return get_reasoner().reason_structured(request, expected_shape)
204
+
205
+
206
def identify_feature_engineering_opportunities(
    summary: Dict[str, Any],
    domain: Optional[str] = None
) -> List[Dict[str, str]]:
    """
    Surface feature-engineering ideas from a dataset summary.

    Args:
        summary: Dataset summary with column names and types
            (optionally sample values).
        domain: Optional domain context, e.g. "healthcare" or "finance".

    Returns:
        A list of {"opportunity", "reason", "suggested_code"} dicts;
        empty when the reasoner reports no opportunities.
    """
    extra = f"\nDomain: {domain}" if domain else ""

    request = f"""Identify feature engineering opportunities from this data:

**Available Columns:**
{summary.get('columns', [])}

**Column Types:**
{summary.get('dtypes', {})}

**Sample Values:**
{summary.get('sample_values', 'Not provided')}{extra}

Suggest:
1. Interaction features (e.g., BMI from height/weight)
2. Binning/discretization opportunities
3. Time-based features (if datetime columns exist)
4. Encoding strategies
5. Domain-specific features

For each opportunity, explain WHY it would help."""

    persona = """You are a feature engineering expert.
Suggest creative but practical feature transformations.
Focus on features that typically improve model performance."""

    expected_shape = {
        "opportunities": [
            {
                "opportunity": "string - What to create",
                "reason": "string - Why it would help",
                "suggested_code": "string - Pseudo-code or actual code"
            }
        ]
    }

    structured = get_reasoner().reason_structured(request, expected_shape, persona)
    # Defensive default: missing key degrades to "no opportunities".
    return structured.get("opportunities", [])
267
+
268
+
269
def explain_missing_values(
    missing_summary: Dict[str, Any]
) -> Dict[str, str]:
    """
    Assess missing-value patterns and recommend handling strategies.

    Args:
        missing_summary: Per-column missing-value summary, e.g.
            {"age": {"count": 150, "percentage": 1.5, "pattern": "random"}}

    Returns:
        Mapping of column name -> assessment and suggested strategy, e.g.
        {"age": "1.5% missing (random) - Safe to impute with median"}
    """
    request = f"""Analyze these missing value patterns and suggest handling strategies:

{missing_summary}

For each column with missing values:
1. Assess the missing pattern (random vs systematic)
2. Suggest imputation strategy
3. Warn about any concerns (bias, data leakage, etc.)
"""

    # Illustrative shape: one string assessment per affected column.
    expected_shape = {
        "column_name": "string - Assessment and strategy"
    }

    return get_reasoner().reason_structured(request, expected_shape)
305
+
306
+
307
def compare_datasets(
    dataset1_summary: Dict[str, Any],
    dataset2_summary: Dict[str, Any],
    comparison_purpose: str = "train_test_validation"
) -> Dict[str, Any]:
    """
    Contrast two dataset summaries and flag meaningful differences.

    Args:
        dataset1_summary: Summary of the first dataset.
        dataset2_summary: Summary of the second dataset.
        comparison_purpose: One of 'train_test_validation',
            'before_after', or 'a_b_test'.

    Returns:
        {
            "differences": List[str],  # Key differences found
            "concerns": List[str],     # Potential issues
            "data_drift": bool,        # Whether distribution shift detected
            "recommendation": str      # What to do about differences
        }
    """
    request = f"""Compare these two datasets:

**Dataset 1:**
{dataset1_summary}

**Dataset 2:**
{dataset2_summary}

**Comparison Purpose:** {comparison_purpose}

Identify:
1. Distribution differences
2. Schema differences
3. Data quality differences
4. Potential data drift or leakage
5. Whether differences are concerning

Be specific about what changed and why it matters."""

    expected_shape = {
        "differences": ["array of key differences"],
        "concerns": ["array of potential issues"],
        "data_drift": "boolean",
        "recommendation": "string - What to do"
    }

    return get_reasoner().reason_structured(request, expected_shape)