ds-agent-cli 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. package/bin/ds-agent.js +451 -0
  2. package/ds_agent/__init__.py +8 -0
  3. package/package.json +28 -0
  4. package/requirements.txt +126 -0
  5. package/setup.py +35 -0
  6. package/src/__init__.py +7 -0
  7. package/src/_compress_tool_result.py +118 -0
  8. package/src/api/__init__.py +4 -0
  9. package/src/api/app.py +1626 -0
  10. package/src/cache/__init__.py +5 -0
  11. package/src/cache/cache_manager.py +561 -0
  12. package/src/cli.py +2886 -0
  13. package/src/dynamic_prompts.py +281 -0
  14. package/src/orchestrator.py +4799 -0
  15. package/src/progress_manager.py +139 -0
  16. package/src/reasoning/__init__.py +332 -0
  17. package/src/reasoning/business_summary.py +431 -0
  18. package/src/reasoning/data_understanding.py +356 -0
  19. package/src/reasoning/model_explanation.py +383 -0
  20. package/src/reasoning/reasoning_trace.py +239 -0
  21. package/src/registry/__init__.py +3 -0
  22. package/src/registry/tools_registry.py +3 -0
  23. package/src/session_memory.py +448 -0
  24. package/src/session_store.py +370 -0
  25. package/src/storage/__init__.py +19 -0
  26. package/src/storage/artifact_store.py +620 -0
  27. package/src/storage/helpers.py +116 -0
  28. package/src/storage/huggingface_storage.py +694 -0
  29. package/src/storage/r2_storage.py +0 -0
  30. package/src/storage/user_files_service.py +288 -0
  31. package/src/tools/__init__.py +335 -0
  32. package/src/tools/advanced_analysis.py +823 -0
  33. package/src/tools/advanced_feature_engineering.py +708 -0
  34. package/src/tools/advanced_insights.py +578 -0
  35. package/src/tools/advanced_preprocessing.py +549 -0
  36. package/src/tools/advanced_training.py +906 -0
  37. package/src/tools/agent_tool_mapping.py +326 -0
  38. package/src/tools/auto_pipeline.py +420 -0
  39. package/src/tools/autogluon_training.py +1480 -0
  40. package/src/tools/business_intelligence.py +860 -0
  41. package/src/tools/cloud_data_sources.py +581 -0
  42. package/src/tools/code_interpreter.py +390 -0
  43. package/src/tools/computer_vision.py +614 -0
  44. package/src/tools/data_cleaning.py +614 -0
  45. package/src/tools/data_profiling.py +593 -0
  46. package/src/tools/data_type_conversion.py +268 -0
  47. package/src/tools/data_wrangling.py +433 -0
  48. package/src/tools/eda_reports.py +284 -0
  49. package/src/tools/enhanced_feature_engineering.py +241 -0
  50. package/src/tools/feature_engineering.py +302 -0
  51. package/src/tools/matplotlib_visualizations.py +1327 -0
  52. package/src/tools/model_training.py +520 -0
  53. package/src/tools/nlp_text_analytics.py +761 -0
  54. package/src/tools/plotly_visualizations.py +497 -0
  55. package/src/tools/production_mlops.py +852 -0
  56. package/src/tools/time_series.py +507 -0
  57. package/src/tools/tools_registry.py +2133 -0
  58. package/src/tools/visualization_engine.py +559 -0
  59. package/src/utils/__init__.py +42 -0
  60. package/src/utils/error_recovery.py +313 -0
  61. package/src/utils/parallel_executor.py +402 -0
  62. package/src/utils/polars_helpers.py +248 -0
  63. package/src/utils/schema_extraction.py +132 -0
  64. package/src/utils/semantic_layer.py +392 -0
  65. package/src/utils/token_budget.py +411 -0
  66. package/src/utils/validation.py +377 -0
  67. package/src/workflow_state.py +154 -0
@@ -0,0 +1,390 @@
1
+ """
2
+ Code Interpreter Tool
3
+ Allows the AI agent to write and execute custom Python code for tasks that don't have predefined tools.
4
+ This is what makes it a TRUE AI Agent, not just a function-calling bot.
5
+ """
6
+
7
+ import os
8
+ import sys
9
+ import subprocess
10
+ import tempfile
11
+ from pathlib import Path
12
+ from typing import Dict, Any, Optional
13
+ import polars as pl
14
+
15
+
16
# Directories scanned before and after a run to detect files produced by the
# executed code. /tmp/data_science_agent is included because LLM-written code
# often saves files there.
_SCAN_DIRS = ('./outputs/code', './outputs/data', './outputs/plots', '/tmp/data_science_agent')

# Substrings that cause submitted code to be rejected, mapped to the reason
# reported back to the caller.
# NOTE(security): this is a plain substring blacklist, not a sandbox. It is
# trivially bypassable (e.g. via importlib or getattr) and also rejects code
# that merely mentions one of these words. Do not rely on it for untrusted input.
_DANGEROUS_PATTERNS = {
    'subprocess': 'Use specialized tools instead of shell commands',
    '__import__': 'Dynamic imports not allowed for security',
    'eval(': 'eval() is dangerous - rewrite without it',
    'exec(': 'exec() is dangerous - rewrite without it',
    'compile(': 'compile() not needed - write code directly',
    'os.system': 'Shell commands not allowed - use Python libraries',
    'os.popen': 'Shell commands not allowed - use Python libraries'
}

# (marker searched for in stderr, hint shown to the agent)
_ERROR_HINTS = (
    ("PermissionError", "💡 File permission issue - check if file is open in another program"),
    ("FileNotFoundError", "💡 File not found - check if path is correct (use relative paths like './outputs/data/file.csv')"),
    ("KeyError", "💡 Column not found - check column names in the CSV"),
    ("ModuleNotFoundError", "💡 Missing library - may need to install additional packages"),
    ("ValueError", "💡 Data type mismatch - check data types and conversions"),
)

# Text placed before the (indented) user code in the generated script.
# The trailing `pass` keeps the `try:` body syntactically valid when the user
# code is empty or consists only of comments.
_SCRIPT_HEADER = """
# Auto-imported libraries for convenience
import pandas as pd
import polars as pl
import numpy as np
import matplotlib
matplotlib.use('Agg')  # Non-interactive backend
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from pathlib import Path
import json
import sys
import traceback

# Ensure output directory exists
import os
os.makedirs('./outputs/code', exist_ok=True)
os.makedirs('./outputs/data', exist_ok=True)

try:
    # User's code starts here
    pass
"""

# Text placed after the user code in the generated script.
_SCRIPT_FOOTER = """

except Exception as e:
    print(f"❌ Error in code execution: {str(e)}", file=sys.stderr)
    traceback.print_exc()
    sys.exit(1)
"""


def _wrap_user_code(code: str) -> str:
    """Return the full script: helper imports plus *code* inside a try/except wrapper."""
    # NOTE: every line is indented to sit inside the `try:` block, which also
    # shifts the content of multi-line string literals in the user code.
    indented = "\n".join("    " + line for line in code.split("\n"))
    return _SCRIPT_HEADER + indented + _SCRIPT_FOOTER


def _snapshot_files() -> Dict[Path, Any]:
    """Map every file currently under the scan directories to its (mtime_ns, size)."""
    snapshot = {}
    for output_dir in _SCAN_DIRS:
        if not os.path.exists(output_dir):
            continue
        for file_path in Path(output_dir).resolve().glob('**/*'):
            try:
                if file_path.is_file():
                    info = file_path.stat()
                    snapshot[file_path.resolve()] = (info.st_mtime_ns, info.st_size)
            except OSError:
                # File vanished or is unreadable; treat it as not present.
                continue
    return snapshot


def _find_generated_files(before: Dict[Path, Any], temp_file: Optional[str], limit: int = 10) -> list:
    """List files that are new or changed compared with the *before* snapshot.

    Returns at most *limit* display paths (forward slashes), newest first.
    The temporary script itself and empty files are excluded.
    """
    cwd = Path.cwd()
    abs_temp = Path(temp_file).resolve() if temp_file else None
    found = []  # (mtime_ns, display path)

    for output_dir in _SCAN_DIRS:
        if not os.path.exists(output_dir):
            continue
        abs_output_dir = Path(output_dir).resolve()
        for file_path in abs_output_dir.glob('**/*'):
            try:
                if not file_path.is_file():
                    continue
                abs_file = file_path.resolve()
                info = file_path.stat()
            except OSError:
                continue

            if abs_file == abs_temp or info.st_size == 0:
                continue
            if before.get(abs_file) == (info.st_mtime_ns, info.st_size):
                continue  # existed before the run and was not touched

            try:
                rel_path = file_path.relative_to(cwd)
            except ValueError:
                # Outside the current directory (e.g. /tmp/...): keep the scan
                # dir prefix and the sub-path so the reported path really exists.
                rel_path = Path(output_dir) / file_path.relative_to(abs_output_dir)

            found.append((info.st_mtime_ns, str(rel_path).replace('\\', '/')))

    # Sort on the mtime captured above rather than re-stat'ing display paths.
    found.sort(key=lambda item: item[0], reverse=True)
    return [name for _, name in found[:limit]]


def execute_python_code(
    code: str,
    working_directory: str = "./outputs/code",
    timeout: int = 60,
    allow_file_operations: bool = True,
    output_file: Optional[str] = None
) -> Dict[str, Any]:
    """
    Execute custom Python code written by the AI agent in a child interpreter.

    The code is syntax-checked, screened against a substring blacklist, wrapped
    in a script that pre-imports common data-science libraries, written to a
    temporary ``.py`` file inside ``working_directory`` and run with
    ``sys.executable``. The child process runs with the *current* working
    directory of this process (not ``working_directory``), so relative paths
    such as ``./outputs/data/file.csv`` resolve against it.

    Args:
        code: Python code to execute.
        working_directory: Directory that holds the temporary script; created
            if missing and probed for write access.
        timeout: Maximum execution time in seconds.
        allow_file_operations: When True, the output directories are scanned
            before and after the run to report generated files. It does not
            restrict what the executed code itself may do.
        output_file: Currently unused; accepted for interface compatibility.

    Returns:
        Dict with ``success`` plus, on success, ``stdout``, ``stderr``,
        ``message``, ``generated_files`` (up to 10, newest first),
        ``working_directory`` and ``execution_summary``; on failure ``error``,
        ``error_type`` and, depending on the failure, ``line``, ``reason``,
        ``suggestion``, ``stderr``, ``stdout``, ``exit_code`` and ``hints``.
        The function does not raise for ordinary errors; they are reported in
        the returned dict.

    Example:
        result = execute_python_code(
            "df = pd.DataFrame({'a': [1, 2]})\\n"
            "df.to_csv('./outputs/data/a.csv', index=False)\\n"
            "print('saved')"
        )
    """
    temp_file = None
    try:
        # Basic syntax validation BEFORE anything touches the file system.
        try:
            compile(code, '<string>', 'exec')
        except SyntaxError as e:
            return {
                "success": False,
                "error": f"Syntax error in generated code: {str(e)}",
                "error_type": "SyntaxError",
                "line": e.lineno,
                "suggestion": "Fix syntax errors in the code. Common issues: missing quotes, parentheses, indentation"
            }

        # Create the working directory and make sure it is writable.
        try:
            os.makedirs(working_directory, exist_ok=True)
            test_file = os.path.join(working_directory, '.write_test')
            with open(test_file, 'w') as f:
                f.write('test')
            os.remove(test_file)
        except PermissionError:
            return {
                "success": False,
                "error": f"No write permission for directory: {working_directory}",
                "error_type": "PermissionError",
                "suggestion": "Check folder permissions or use a different directory"
            }
        except Exception as e:
            return {
                "success": False,
                "error": f"Failed to create working directory: {str(e)}",
                "error_type": type(e).__name__
            }

        # Reject code containing blacklisted substrings (see _DANGEROUS_PATTERNS).
        for pattern, reason in _DANGEROUS_PATTERNS.items():
            if pattern in code:
                return {
                    "success": False,
                    "error": f"Code contains restricted operation: {pattern}",
                    "error_type": "SecurityError",
                    "reason": reason,
                    "suggestion": "Rewrite code using safe Python operations"
                }

        # Write the wrapped script. temp_file is set as soon as the file exists
        # so the outer `finally` removes it even if the write itself fails.
        try:
            with tempfile.NamedTemporaryFile(mode='w', suffix='.py', delete=False,
                                             dir=working_directory, encoding='utf-8') as f:
                temp_file = f.name
                f.write(_wrap_user_code(code))
        except Exception as e:
            return {
                "success": False,
                "error": f"Failed to write temporary file: {str(e)}",
                "error_type": type(e).__name__,
                "suggestion": "Check file write permissions"
            }

        # Record existing files BEFORE execution so new/changed ones can be detected.
        files_before = _snapshot_files() if allow_file_operations else {}

        # Force UTF-8 on the child's stdio: the wrapper prints non-ASCII text,
        # which fails under legacy locale encodings (e.g. cp1252 on Windows).
        child_env = dict(os.environ, PYTHONIOENCODING="utf-8")

        result = subprocess.run(
            [sys.executable, os.path.abspath(temp_file)],
            capture_output=True,
            text=True,
            encoding="utf-8",
            errors="replace",
            timeout=timeout,
            cwd=os.path.abspath(os.getcwd()),  # absolute path to avoid permission issues
            env=child_env
        )

        stdout = result.stdout.strip()
        stderr = result.stderr.strip()
        returncode = result.returncode

        if returncode != 0:
            # Attach hints for common failure types found in the traceback.
            error_hints = [hint for marker, hint in _ERROR_HINTS if marker in stderr]
            return {
                "success": False,
                "error": "Code execution failed",
                "stderr": stderr,
                "stdout": stdout if stdout else None,
                "error_type": "ExecutionError",
                "exit_code": returncode,
                "hints": error_hints if error_hints else ["Check the error message above for details"]
            }

        # Success: report files that are new or changed since the snapshot.
        generated_files = (
            _find_generated_files(files_before, temp_file) if allow_file_operations else []
        )

        return {
            "success": True,
            "stdout": stdout if stdout else "✅ Code executed successfully (no output)",
            "stderr": stderr if stderr else None,
            "message": "✅ Code executed successfully",
            "generated_files": generated_files,
            "working_directory": working_directory,
            "execution_summary": {
                "lines_of_code": len(code.split('\n')),
                "files_generated": len(generated_files)
            }
        }

    except subprocess.TimeoutExpired:
        return {
            "success": False,
            "error": f"Code execution timed out after {timeout} seconds",
            "error_type": "TimeoutError",
            "suggestion": "Code is taking too long. Optimize it or increase timeout. Avoid large loops or heavy computations."
        }
    except Exception as e:
        return {
            "success": False,
            "error": f"Unexpected error: {str(e)}",
            "error_type": type(e).__name__,
            "suggestion": "This is an unexpected error. Try simplifying the code."
        }
    finally:
        # Best-effort removal of the temporary script on every exit path.
        if temp_file and os.path.exists(temp_file):
            try:
                os.unlink(temp_file)
            except OSError:
                pass  # cleanup failure must not mask the real result
309
+
310
+
311
def execute_code_from_file(
    file_path: str,
    working_directory: str = "./outputs/code",
    timeout: int = 60
) -> Dict[str, Any]:
    """
    Run the Python source stored in a file through execute_python_code.

    Handy when the code is too long to pass around as a string, or when the
    agent wants to run a script that already exists on disk.

    Args:
        file_path: Path of the Python file to read (decoded as UTF-8).
        working_directory: Forwarded to execute_python_code.
        timeout: Maximum execution time in seconds, forwarded as well.

    Returns:
        The result dict of execute_python_code, or a failure dict with
        ``success``, ``error`` and ``error_type`` when the file cannot be read.
    """
    failure: Dict[str, Any] = {"success": False}
    try:
        with open(file_path, 'r', encoding='utf-8') as source_handle:
            source_text = source_handle.read()

        # Delegate validation, execution and reporting to the main entry point.
        outcome = execute_python_code(
            code=source_text,
            working_directory=working_directory,
            timeout=timeout
        )
        return outcome
    except FileNotFoundError:
        failure["error"] = f"File not found: {file_path}"
        failure["error_type"] = "FileNotFoundError"
    except Exception as exc:
        failure["error"] = f"Failed to read file: {str(exc)}"
        failure["error_type"] = type(exc).__name__
    return failure
352
+
353
+
354
def generate_custom_visualization(
    data_file: str,
    visualization_description: str,
    output_path: str = "./outputs/code/custom_plot.html",
    timeout: int = 60
) -> Dict[str, Any]:
    """
    Placeholder for generating a custom plot from a natural-language description.

    The intended design is for an LLM to turn the description into Plotly code;
    that is not implemented, so every call reports failure and points the
    caller at execute_python_code. None of the arguments are used.

    Args:
        data_file: Path to the dataset.
        visualization_description: Natural-language description of the plot.
        output_path: Where the visualization would be saved.
        timeout: Execution timeout in seconds.

    Returns:
        A failure dict with ``success``, ``error``, ``error_type`` and
        ``suggestion``.

    Example:
        result = generate_custom_visualization(
            data_file="./temp/sales.csv",
            visualization_description="Line plot of sales by month for each bike model",
            output_path="./outputs/code/sales_plot.html"
        )
    """
    # Not implemented yet: always return the same structured failure.
    not_implemented = dict(
        success=False,
        error="Not yet implemented - use execute_python_code with explicit code instead",
        error_type="NotImplementedError",
        suggestion="Write the Plotly code explicitly and use execute_python_code()",
    )
    return not_implemented