ds-agent-cli 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/ds-agent.js +451 -0
- package/ds_agent/__init__.py +8 -0
- package/package.json +28 -0
- package/requirements.txt +126 -0
- package/setup.py +35 -0
- package/src/__init__.py +7 -0
- package/src/_compress_tool_result.py +118 -0
- package/src/api/__init__.py +4 -0
- package/src/api/app.py +1626 -0
- package/src/cache/__init__.py +5 -0
- package/src/cache/cache_manager.py +561 -0
- package/src/cli.py +2886 -0
- package/src/dynamic_prompts.py +281 -0
- package/src/orchestrator.py +4799 -0
- package/src/progress_manager.py +139 -0
- package/src/reasoning/__init__.py +332 -0
- package/src/reasoning/business_summary.py +431 -0
- package/src/reasoning/data_understanding.py +356 -0
- package/src/reasoning/model_explanation.py +383 -0
- package/src/reasoning/reasoning_trace.py +239 -0
- package/src/registry/__init__.py +3 -0
- package/src/registry/tools_registry.py +3 -0
- package/src/session_memory.py +448 -0
- package/src/session_store.py +370 -0
- package/src/storage/__init__.py +19 -0
- package/src/storage/artifact_store.py +620 -0
- package/src/storage/helpers.py +116 -0
- package/src/storage/huggingface_storage.py +694 -0
- package/src/storage/r2_storage.py +0 -0
- package/src/storage/user_files_service.py +288 -0
- package/src/tools/__init__.py +335 -0
- package/src/tools/advanced_analysis.py +823 -0
- package/src/tools/advanced_feature_engineering.py +708 -0
- package/src/tools/advanced_insights.py +578 -0
- package/src/tools/advanced_preprocessing.py +549 -0
- package/src/tools/advanced_training.py +906 -0
- package/src/tools/agent_tool_mapping.py +326 -0
- package/src/tools/auto_pipeline.py +420 -0
- package/src/tools/autogluon_training.py +1480 -0
- package/src/tools/business_intelligence.py +860 -0
- package/src/tools/cloud_data_sources.py +581 -0
- package/src/tools/code_interpreter.py +390 -0
- package/src/tools/computer_vision.py +614 -0
- package/src/tools/data_cleaning.py +614 -0
- package/src/tools/data_profiling.py +593 -0
- package/src/tools/data_type_conversion.py +268 -0
- package/src/tools/data_wrangling.py +433 -0
- package/src/tools/eda_reports.py +284 -0
- package/src/tools/enhanced_feature_engineering.py +241 -0
- package/src/tools/feature_engineering.py +302 -0
- package/src/tools/matplotlib_visualizations.py +1327 -0
- package/src/tools/model_training.py +520 -0
- package/src/tools/nlp_text_analytics.py +761 -0
- package/src/tools/plotly_visualizations.py +497 -0
- package/src/tools/production_mlops.py +852 -0
- package/src/tools/time_series.py +507 -0
- package/src/tools/tools_registry.py +2133 -0
- package/src/tools/visualization_engine.py +559 -0
- package/src/utils/__init__.py +42 -0
- package/src/utils/error_recovery.py +313 -0
- package/src/utils/parallel_executor.py +402 -0
- package/src/utils/polars_helpers.py +248 -0
- package/src/utils/schema_extraction.py +132 -0
- package/src/utils/semantic_layer.py +392 -0
- package/src/utils/token_budget.py +411 -0
- package/src/utils/validation.py +377 -0
- package/src/workflow_state.py +154 -0
|
@@ -0,0 +1,356 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Data Understanding Module
|
|
3
|
+
|
|
4
|
+
Provides reasoning about data characteristics, patterns, and quality.
|
|
5
|
+
|
|
6
|
+
KEY RULES:
|
|
7
|
+
- ✅ Accepts: Statistical summaries, metadata, sample rows
|
|
8
|
+
- ❌ NO: Raw DataFrames, full datasets
|
|
9
|
+
- ✅ Returns: Natural language insights + structured recommendations
|
|
10
|
+
- ❌ NO: Training decisions, model selection
|
|
11
|
+
|
|
12
|
+
Use Cases:
|
|
13
|
+
1. Explain what data represents
|
|
14
|
+
2. Identify data quality issues
|
|
15
|
+
3. Suggest preprocessing steps
|
|
16
|
+
4. Highlight interesting patterns
|
|
17
|
+
|
|
18
|
+
Example:
|
|
19
|
+
from reasoning.data_understanding import explain_dataset
|
|
20
|
+
|
|
21
|
+
summary = {
|
|
22
|
+
"rows": 10000,
|
|
23
|
+
"columns": 20,
|
|
24
|
+
"numeric": 15,
|
|
25
|
+
"categorical": 5,
|
|
26
|
+
"missing_values": {"age": 150, "income": 200},
|
|
27
|
+
"target_distribution": {"yes": 7000, "no": 3000}
|
|
28
|
+
}
|
|
29
|
+
|
|
30
|
+
explanation = explain_dataset(summary)
|
|
31
|
+
# Returns: {
|
|
32
|
+
# "overview": "This is an imbalanced classification dataset...",
|
|
33
|
+
# "quality_issues": ["Missing values in age and income"],
|
|
34
|
+
# "recommendations": ["Handle class imbalance", "Impute missing values"],
|
|
35
|
+
# "patterns": ["Target class imbalanced (70-30 split)"]
|
|
36
|
+
# }
|
|
37
|
+
"""
|
|
38
|
+
|
|
39
|
+
from typing import Dict, Any, List, Optional
|
|
40
|
+
from . import get_reasoner
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def explain_dataset(
    summary: Dict[str, Any],
    target_col: Optional[str] = None
) -> Dict[str, Any]:
    """Explain dataset characteristics from summary statistics alone.

    Args:
        summary: Statistical summary of the dataset (never raw data).
            Expected keys include rows, columns, dtypes, missing_values;
            optionally target_distribution, top_correlations, outliers.
        target_col: Name of the target column, if known.

    Returns:
        Dict with keys:
            "overview": str - high-level description,
            "quality_issues": List[str] - data quality problems,
            "recommendations": List[str] - suggested preprocessing steps,
            "patterns": List[str] - interesting patterns found,
            "target_insights": str - target variable insights (if applicable).

    Raises:
        ValueError: If the summary appears to carry a raw DataFrame.
    """
    # Reject raw data up front — this module reasons over summaries only.
    if any(key in summary for key in ("dataframe", "df")):
        raise ValueError("Cannot pass raw DataFrames! Pass summary statistics only.")

    llm = get_reasoner()

    # Assemble the reasoning prompt from whichever summary fields are present.
    prompt = f"""Analyze this dataset summary and provide insights:

**Dataset Summary:**
- Rows: {summary.get('rows', 'unknown')}
- Columns: {summary.get('columns', 'unknown')}
- Numeric columns: {summary.get('numeric_columns', [])}
- Categorical columns: {summary.get('categorical_columns', [])}
- Missing values: {summary.get('missing_values', {})}
- Target column: {target_col or 'Not specified'}

**Target Distribution (if available):**
{summary.get('target_distribution', 'Not provided')}

**Correlations (if available):**
{summary.get('top_correlations', 'Not provided')}

**Outliers (if available):**
{summary.get('outliers', 'Not provided')}

Provide:
1. Overview of what this data represents
2. Data quality issues identified
3. Preprocessing recommendations
4. Interesting patterns noticed
5. Target variable insights (if classification/regression)
"""

    system_prompt = """You are a data understanding expert. Your role is to:
- Explain what data means in plain English
- Identify data quality issues
- Suggest preprocessing steps
- Highlight patterns

You do NOT:
- Make training decisions
- Select models
- Access raw data
- Execute any code

You ONLY reason about summaries provided."""

    # Shape the structured response the reasoner should return.
    response_schema = {
        "overview": "string - High-level description of dataset",
        "quality_issues": ["array of strings - Data quality problems found"],
        "recommendations": ["array of strings - Preprocessing steps to take"],
        "patterns": ["array of strings - Interesting patterns noticed"],
        "target_insights": "string - Insights about target variable"
    }

    return llm.reason_structured(prompt, response_schema, system_prompt)
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def explain_data_profile(
    profile: Dict[str, Any]
) -> str:
    """Turn data-profiling output into a natural-language explanation.

    Args:
        profile: Profiling output from tools (column stats, distributions,
            missing-value summary, cardinality, etc.).

    Returns:
        A plain-English, actionable explanation of the profiling results.
    """
    llm = get_reasoner()

    prompt = f"""Explain these data profiling results in clear, actionable terms:

{profile}

Focus on:
- What the data looks like
- Any concerning patterns
- Next steps for data cleaning
"""

    system_prompt = """You are a data quality expert explaining profiling results.
Be concise, actionable, and highlight the most important findings."""

    # Low temperature keeps the explanation factual and stable.
    return llm.reason(prompt, system_prompt, temperature=0.1)
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def suggest_transformations(
    column_stats: Dict[str, Any],
    task_type: Optional[str] = None
) -> Dict[str, List[str]]:
    """Suggest per-column transformations from column statistics.

    Args:
        column_stats: Per-column statistics, e.g.
            {"age": {"min": 0, "max": 150, "outliers": 5},
             "income": {"skewness": 3.5, "distribution": "highly_skewed"}}.
        task_type: 'classification' or 'regression', if known.

    Returns:
        Mapping of column name to a list of transformation suggestions,
        e.g. {"income": ["Apply log transform (skewed)", ...]}.
    """
    llm = get_reasoner()

    prompt = f"""Based on these column statistics, suggest transformations:

**Column Statistics:**
{column_stats}

**Task Type:** {task_type or 'Unknown'}

For each column, suggest:
- Outlier handling
- Scaling/normalization
- Distribution transformations
- Encoding strategies (for categorical)

Be specific and actionable."""

    # Schema is illustrative: the reply keys are the actual column names.
    response_schema = {
        "column_name": ["array of transformation suggestions"]
    }

    return llm.reason_structured(prompt, response_schema)
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
def identify_feature_engineering_opportunities(
    summary: Dict[str, Any],
    domain: Optional[str] = None
) -> List[Dict[str, str]]:
    """Identify feature engineering opportunities from a data summary.

    Args:
        summary: Dataset summary with column names, types, and
            optionally sample values.
        domain: Optional domain context (e.g., "healthcare", "finance").

    Returns:
        A list of opportunity dicts, each with keys "opportunity",
        "reason", and "suggested_code". Empty list when the reasoner
        returns none.
    """
    llm = get_reasoner()

    domain_context = f"\nDomain: {domain}" if domain else ""

    prompt = f"""Identify feature engineering opportunities from this data:

**Available Columns:**
{summary.get('columns', [])}

**Column Types:**
{summary.get('dtypes', {})}

**Sample Values:**
{summary.get('sample_values', 'Not provided')}{domain_context}

Suggest:
1. Interaction features (e.g., BMI from height/weight)
2. Binning/discretization opportunities
3. Time-based features (if datetime columns exist)
4. Encoding strategies
5. Domain-specific features

For each opportunity, explain WHY it would help."""

    system_prompt = """You are a feature engineering expert.
Suggest creative but practical feature transformations.
Focus on features that typically improve model performance."""

    response_schema = {
        "opportunities": [
            {
                "opportunity": "string - What to create",
                "reason": "string - Why it would help",
                "suggested_code": "string - Pseudo-code or actual code"
            }
        ]
    }

    structured = llm.reason_structured(prompt, response_schema, system_prompt)
    # A missing key means the model produced no usable suggestions.
    return structured.get("opportunities", [])
|
|
267
|
+
|
|
268
|
+
|
|
269
|
+
def explain_missing_values(
    missing_summary: Dict[str, Any]
) -> Dict[str, str]:
    """Explain missing-value patterns and suggest handling strategies.

    Args:
        missing_summary: Per-column missing-value info, e.g.
            {"age": {"count": 150, "percentage": 1.5, "pattern": "random"}}.

    Returns:
        Mapping of column name to an assessment-and-strategy string, e.g.
        {"age": "1.5% missing (random) - Safe to impute with median"}.
    """
    llm = get_reasoner()

    prompt = f"""Analyze these missing value patterns and suggest handling strategies:

{missing_summary}

For each column with missing values:
1. Assess the missing pattern (random vs systematic)
2. Suggest imputation strategy
3. Warn about any concerns (bias, data leakage, etc.)
"""

    # Schema is illustrative: the reply keys are the actual column names.
    response_schema = {
        "column_name": "string - Assessment and strategy"
    }

    return llm.reason_structured(prompt, response_schema)
|
|
305
|
+
|
|
306
|
+
|
|
307
|
+
def compare_datasets(
    dataset1_summary: Dict[str, Any],
    dataset2_summary: Dict[str, Any],
    comparison_purpose: str = "train_test_validation"
) -> Dict[str, Any]:
    """Compare two dataset summaries and surface their differences.

    Args:
        dataset1_summary: Summary of the first dataset.
        dataset2_summary: Summary of the second dataset.
        comparison_purpose: One of 'train_test_validation',
            'before_after', or 'a_b_test'.

    Returns:
        Dict with keys:
            "differences": List[str] - key differences found,
            "concerns": List[str] - potential issues,
            "data_drift": bool - whether distribution shift was detected,
            "recommendation": str - what to do about the differences.
    """
    llm = get_reasoner()

    prompt = f"""Compare these two datasets:

**Dataset 1:**
{dataset1_summary}

**Dataset 2:**
{dataset2_summary}

**Comparison Purpose:** {comparison_purpose}

Identify:
1. Distribution differences
2. Schema differences
3. Data quality differences
4. Potential data drift or leakage
5. Whether differences are concerning

Be specific about what changed and why it matters."""

    response_schema = {
        "differences": ["array of key differences"],
        "concerns": ["array of potential issues"],
        "data_drift": "boolean",
        "recommendation": "string - What to do"
    }

    return llm.reason_structured(prompt, response_schema)
|