ds-agent-cli 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. package/bin/ds-agent.js +451 -0
  2. package/ds_agent/__init__.py +8 -0
  3. package/package.json +28 -0
  4. package/requirements.txt +126 -0
  5. package/setup.py +35 -0
  6. package/src/__init__.py +7 -0
  7. package/src/_compress_tool_result.py +118 -0
  8. package/src/api/__init__.py +4 -0
  9. package/src/api/app.py +1626 -0
  10. package/src/cache/__init__.py +5 -0
  11. package/src/cache/cache_manager.py +561 -0
  12. package/src/cli.py +2886 -0
  13. package/src/dynamic_prompts.py +281 -0
  14. package/src/orchestrator.py +4799 -0
  15. package/src/progress_manager.py +139 -0
  16. package/src/reasoning/__init__.py +332 -0
  17. package/src/reasoning/business_summary.py +431 -0
  18. package/src/reasoning/data_understanding.py +356 -0
  19. package/src/reasoning/model_explanation.py +383 -0
  20. package/src/reasoning/reasoning_trace.py +239 -0
  21. package/src/registry/__init__.py +3 -0
  22. package/src/registry/tools_registry.py +3 -0
  23. package/src/session_memory.py +448 -0
  24. package/src/session_store.py +370 -0
  25. package/src/storage/__init__.py +19 -0
  26. package/src/storage/artifact_store.py +620 -0
  27. package/src/storage/helpers.py +116 -0
  28. package/src/storage/huggingface_storage.py +694 -0
  29. package/src/storage/r2_storage.py +0 -0
  30. package/src/storage/user_files_service.py +288 -0
  31. package/src/tools/__init__.py +335 -0
  32. package/src/tools/advanced_analysis.py +823 -0
  33. package/src/tools/advanced_feature_engineering.py +708 -0
  34. package/src/tools/advanced_insights.py +578 -0
  35. package/src/tools/advanced_preprocessing.py +549 -0
  36. package/src/tools/advanced_training.py +906 -0
  37. package/src/tools/agent_tool_mapping.py +326 -0
  38. package/src/tools/auto_pipeline.py +420 -0
  39. package/src/tools/autogluon_training.py +1480 -0
  40. package/src/tools/business_intelligence.py +860 -0
  41. package/src/tools/cloud_data_sources.py +581 -0
  42. package/src/tools/code_interpreter.py +390 -0
  43. package/src/tools/computer_vision.py +614 -0
  44. package/src/tools/data_cleaning.py +614 -0
  45. package/src/tools/data_profiling.py +593 -0
  46. package/src/tools/data_type_conversion.py +268 -0
  47. package/src/tools/data_wrangling.py +433 -0
  48. package/src/tools/eda_reports.py +284 -0
  49. package/src/tools/enhanced_feature_engineering.py +241 -0
  50. package/src/tools/feature_engineering.py +302 -0
  51. package/src/tools/matplotlib_visualizations.py +1327 -0
  52. package/src/tools/model_training.py +520 -0
  53. package/src/tools/nlp_text_analytics.py +761 -0
  54. package/src/tools/plotly_visualizations.py +497 -0
  55. package/src/tools/production_mlops.py +852 -0
  56. package/src/tools/time_series.py +507 -0
  57. package/src/tools/tools_registry.py +2133 -0
  58. package/src/tools/visualization_engine.py +559 -0
  59. package/src/utils/__init__.py +42 -0
  60. package/src/utils/error_recovery.py +313 -0
  61. package/src/utils/parallel_executor.py +402 -0
  62. package/src/utils/polars_helpers.py +248 -0
  63. package/src/utils/schema_extraction.py +132 -0
  64. package/src/utils/semantic_layer.py +392 -0
  65. package/src/utils/token_budget.py +411 -0
  66. package/src/utils/validation.py +377 -0
  67. package/src/workflow_state.py +154 -0
@@ -0,0 +1,281 @@
1
+ """
2
+ Dynamic prompt generation for small context window models.
3
+ Loads only relevant tools based on user intent to reduce token usage.
4
+ """
5
+
6
import re
from typing import Dict, List, Optional, Set
8
+
# Intent categories and their keywords.
# Maps an intent label to lowercase trigger keywords; detect_intent() flags an
# intent when one of its keywords matches the (lowercased) user query.
# NOTE: "forecast" deliberately appears under both model_training and
# time_series, so forecasting queries load both tool groups.
INTENT_KEYWORDS = {
    "data_quality": ["clean", "missing", "outlier", "quality", "duplicates", "null", "na", "impute"],
    "visualization": ["plot", "chart", "graph", "visualize", "dashboard", "scatter", "histogram", "heatmap"],
    "feature_engineering": ["feature", "encode", "transform", "scale", "normalize", "binning", "interaction"],
    "model_training": ["train", "model", "predict", "classify", "regression", "forecast", "xgboost", "accuracy"],
    "eda": ["profile", "describe", "summary", "statistics", "distribution", "correlation", "eda"],
    "time_series": ["time", "date", "datetime", "temporal", "trend", "seasonality", "forecast"],
    "optimization": ["tune", "optimize", "hyperparameter", "improve", "best parameters"],
    "code_execution": ["execute", "run code", "calculate", "custom", "python"],
}
20
+
# Tool categories mapping.
# Maps each intent label (same keys as INTENT_KEYWORDS) to the tool names that
# get_relevant_tools() includes in the compact prompt for that intent.
# A tool may appear in several categories (e.g. "create_time_features" under
# both feature_engineering and time_series); the final list is de-duplicated.
TOOL_CATEGORIES = {
    "data_quality": [
        "detect_data_quality_issues",
        "clean_missing_values",
        "handle_outliers",
        "detect_and_remove_duplicates",
        "force_numeric_conversion",
    ],
    "visualization": [
        "generate_interactive_scatter",
        "generate_interactive_histogram",
        "generate_interactive_correlation_heatmap",
        "generate_interactive_box_plots",
        "generate_interactive_time_series",
        "generate_plotly_dashboard",
        "generate_all_plots",
        "generate_data_quality_plots",
        "generate_eda_plots",
    ],
    "feature_engineering": [
        "encode_categorical",
        "perform_feature_scaling",
        "create_time_features",
        "create_ratio_features",
        "create_statistical_features",
        "create_log_features",
        "create_binned_features",
        "auto_feature_engineering",
    ],
    "model_training": [
        "train_baseline_models",
        "hyperparameter_tuning",
        "train_ensemble_models",
        "perform_cross_validation",
        "handle_imbalanced_data",
        "auto_ml_pipeline",
    ],
    "eda": [
        "profile_dataset",
        "generate_ydata_profiling_report",
        "analyze_distribution",
        "detect_trends_and_seasonality",
        "perform_hypothesis_testing",
    ],
    "time_series": [
        "create_time_features",
        "forecast_time_series",
        "detect_trends_and_seasonality",
        "generate_interactive_time_series",
    ],
    "optimization": [
        "hyperparameter_tuning",
        "auto_feature_selection",
        "detect_and_handle_multicollinearity",
    ],
    "code_execution": [
        "execute_python_code",
        "execute_code_from_file",
    ],
}
82
+
# Core tools always included (used in all workflows).
# get_relevant_tools() seeds its result with these regardless of detected
# intent, so every compact prompt can at least profile and clean a dataset.
CORE_TOOLS = [
    "profile_dataset",
    "detect_data_quality_issues",
    "clean_missing_values",
    "encode_categorical",
]
90
+
91
+
92
def detect_intent(query: str) -> Set[str]:
    """
    Detect user intent from query using keyword matching.

    Keywords are matched at word boundaries (prefix match), so "plot" still
    matches "plots"/"plotting" and "visualize" matches "visualization", but
    short keywords such as "na" no longer fire on unrelated words like
    "analyze" or "name" (which plain substring matching did).

    Args:
        query: User's natural language query

    Returns:
        Set of intent categories detected; defaults to {"eda"} when no
        keyword matches.
    """
    query_lower = query.lower()
    detected_intents = set()

    for intent, keywords in INTENT_KEYWORDS.items():
        # One keyword hit is enough to flag the intent; \b anchors the match
        # to a word start while still allowing suffixes (stemming-like).
        if any(
            re.search(r"\b" + re.escape(keyword), query_lower)
            for keyword in keywords
        ):
            detected_intents.add(intent)

    # Default to EDA if no specific intent detected
    if not detected_intents:
        detected_intents.add("eda")

    return detected_intents
116
+
117
+
118
def get_relevant_tools(intents: Set[str]) -> List[str]:
    """
    Get list of relevant tools based on detected intents.

    Args:
        intents: Set of detected intent categories

    Returns:
        Sorted list of tool names to include in the prompt; always contains
        CORE_TOOLS, plus every tool mapped to a recognized intent.
    """
    # Seed with the always-on core tools, then fold in each intent's tools.
    selected = set(CORE_TOOLS)
    selected.update(
        tool
        for intent in intents
        if intent in TOOL_CATEGORIES
        for tool in TOOL_CATEGORIES[intent]
    )
    return sorted(selected)
135
+
136
+
137
def build_compact_system_prompt(
    user_query: Optional[str] = None,
    detected_intents: Optional[Set[str]] = None,
) -> str:
    """
    Build a compact system prompt with only relevant tools.

    Intended for small-context models: instead of listing every registered
    tool, only the tools matching the detected intents (plus CORE_TOOLS)
    are embedded in the prompt.

    Args:
        user_query: Optional user query to detect intent from.
        detected_intents: Optional pre-detected intents; takes precedence
            over ``user_query`` when provided.

    Returns:
        Compact system prompt string.
    """
    # Resolve intents: explicit set wins, then keyword detection, then EDA.
    if detected_intents is None:
        detected_intents = detect_intent(user_query) if user_query else {"eda"}

    # Get relevant tools
    relevant_tools = get_relevant_tools(detected_intents)

    # Build tool list string
    tool_list = "\n".join([f"- {tool}" for tool in relevant_tools])

    prompt = f"""You are an autonomous Data Science Agent. You EXECUTE tasks, not advise.

**TOOL CALLING FORMAT:**
When you need to use a tool, respond with JSON:
```json
{{
"tool": "tool_name",
"arguments": {{"param1": "value1"}}
}}
```

**RELEVANT TOOLS FOR THIS TASK:**
{tool_list}

**WORKFLOW RULES:**
1. **Execute tools sequentially** - ONE tool per response
2. **Use tool outputs** as inputs to next tool
3. **Save outputs** to ./outputs/data/ or ./outputs/plots/
4. **Error recovery**: If tool fails, retry with corrected parameters OR skip to next step
5. **Never repeat** successful tools
6. **Stop when done** - Don't continue after fulfilling user request

**COMMON WORKFLOWS:**

**Visualization Only:**
- User wants plots/charts/dashboard
- generate_plotly_dashboard OR generate_interactive_scatter → STOP

**Data Profiling:**
- User wants "detailed report"
- generate_ydata_profiling_report → STOP

**Full ML Pipeline:**
- User wants model training
- profile_dataset → detect_data_quality_issues → clean_missing_values →
encode_categorical → train_baseline_models → generate_plotly_dashboard

**PARAMETER CORRECTIONS:**
- Use exact column names from error messages
- If "Did you mean X?" → retry with X
- output_path (not output or output_dir)
- file_path for data files

**ERROR RECOVERY:**
- Column not found? Use suggested column from error
- File not found? Use last successful file
- Missing param? Add the required parameter
- Tool failed? Skip to next step (don't get stuck)

Execute the user's task efficiently with relevant tools."""

    return prompt
212
+
213
+
214
def get_full_system_prompt() -> str:
    """
    Get the original full system prompt for models with large context windows.
    This is the complete version used with Gemini 2.5 Flash.
    """
    # Local import defers loading the (large) orchestrator module until needed.
    from src.orchestrator import DataScienceCopilot

    # Bypass __init__ via __new__ — we only need the prompt-builder method,
    # not a fully initialized copilot instance.
    uninitialized = DataScienceCopilot.__new__(DataScienceCopilot)
    return uninitialized._build_system_prompt()
223
+
224
+
225
# Quick stats
def get_prompt_stats(prompt: str) -> Dict[str, int]:
    """Return character count, rough token estimate, and line count for a prompt."""
    char_count = len(prompt)
    # Rough heuristic: one token per ~4 characters.
    token_estimate = char_count // 4
    # Number of lines = newline count + 1 (an empty string is one line).
    line_count = prompt.count("\n") + 1

    return {
        "characters": char_count,
        "estimated_tokens": token_estimate,
        "lines": line_count,
    }
238
+
239
+
240
if __name__ == "__main__":
    # Demo: Compare full vs compact prompts
    print("=" * 80)
    print("DYNAMIC PROMPT SYSTEM DEMO")
    print("=" * 80)

    # (icon, query) pairs exercising visualization, ML-pipeline, and
    # profiling intents respectively.
    demo_queries = [
        ("📊", "Generate interactive plots for magnitude and latitude"),
        ("🤖", "Train a model to predict earthquake magnitude"),
        ("📈", "Generate a detailed profiling report"),
    ]

    for example_no, (icon, demo_query) in enumerate(demo_queries, start=1):
        demo_intents = detect_intent(demo_query)
        demo_prompt = build_compact_system_prompt(user_query=demo_query)
        demo_stats = get_prompt_stats(demo_prompt)

        print(f"\n{icon} Example {example_no}: '{demo_query}'")
        print(f"Detected intents: {demo_intents}")
        print(f"Tools loaded: {len(get_relevant_tools(demo_intents))}")
        print(f"Prompt stats: {demo_stats['estimated_tokens']} tokens, {demo_stats['lines']} lines")

    print("\n" + "=" * 80)
    print("SUMMARY: Compact prompts reduce tokens by 80-90% for small context models!")
    print("=" * 80)