ds-agent-cli 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. package/bin/ds-agent.js +451 -0
  2. package/ds_agent/__init__.py +8 -0
  3. package/package.json +28 -0
  4. package/requirements.txt +126 -0
  5. package/setup.py +35 -0
  6. package/src/__init__.py +7 -0
  7. package/src/_compress_tool_result.py +118 -0
  8. package/src/api/__init__.py +4 -0
  9. package/src/api/app.py +1626 -0
  10. package/src/cache/__init__.py +5 -0
  11. package/src/cache/cache_manager.py +561 -0
  12. package/src/cli.py +2886 -0
  13. package/src/dynamic_prompts.py +281 -0
  14. package/src/orchestrator.py +4799 -0
  15. package/src/progress_manager.py +139 -0
  16. package/src/reasoning/__init__.py +332 -0
  17. package/src/reasoning/business_summary.py +431 -0
  18. package/src/reasoning/data_understanding.py +356 -0
  19. package/src/reasoning/model_explanation.py +383 -0
  20. package/src/reasoning/reasoning_trace.py +239 -0
  21. package/src/registry/__init__.py +3 -0
  22. package/src/registry/tools_registry.py +3 -0
  23. package/src/session_memory.py +448 -0
  24. package/src/session_store.py +370 -0
  25. package/src/storage/__init__.py +19 -0
  26. package/src/storage/artifact_store.py +620 -0
  27. package/src/storage/helpers.py +116 -0
  28. package/src/storage/huggingface_storage.py +694 -0
  29. package/src/storage/r2_storage.py +0 -0
  30. package/src/storage/user_files_service.py +288 -0
  31. package/src/tools/__init__.py +335 -0
  32. package/src/tools/advanced_analysis.py +823 -0
  33. package/src/tools/advanced_feature_engineering.py +708 -0
  34. package/src/tools/advanced_insights.py +578 -0
  35. package/src/tools/advanced_preprocessing.py +549 -0
  36. package/src/tools/advanced_training.py +906 -0
  37. package/src/tools/agent_tool_mapping.py +326 -0
  38. package/src/tools/auto_pipeline.py +420 -0
  39. package/src/tools/autogluon_training.py +1480 -0
  40. package/src/tools/business_intelligence.py +860 -0
  41. package/src/tools/cloud_data_sources.py +581 -0
  42. package/src/tools/code_interpreter.py +390 -0
  43. package/src/tools/computer_vision.py +614 -0
  44. package/src/tools/data_cleaning.py +614 -0
  45. package/src/tools/data_profiling.py +593 -0
  46. package/src/tools/data_type_conversion.py +268 -0
  47. package/src/tools/data_wrangling.py +433 -0
  48. package/src/tools/eda_reports.py +284 -0
  49. package/src/tools/enhanced_feature_engineering.py +241 -0
  50. package/src/tools/feature_engineering.py +302 -0
  51. package/src/tools/matplotlib_visualizations.py +1327 -0
  52. package/src/tools/model_training.py +520 -0
  53. package/src/tools/nlp_text_analytics.py +761 -0
  54. package/src/tools/plotly_visualizations.py +497 -0
  55. package/src/tools/production_mlops.py +852 -0
  56. package/src/tools/time_series.py +507 -0
  57. package/src/tools/tools_registry.py +2133 -0
  58. package/src/tools/visualization_engine.py +559 -0
  59. package/src/utils/__init__.py +42 -0
  60. package/src/utils/error_recovery.py +313 -0
  61. package/src/utils/parallel_executor.py +402 -0
  62. package/src/utils/polars_helpers.py +248 -0
  63. package/src/utils/schema_extraction.py +132 -0
  64. package/src/utils/semantic_layer.py +392 -0
  65. package/src/utils/token_budget.py +411 -0
  66. package/src/utils/validation.py +377 -0
  67. package/src/workflow_state.py +154 -0
@@ -0,0 +1,132 @@
1
+ """
2
+ Local Schema Extraction (No LLM)
3
+ Fast, cheap extraction of dataset metadata without sending to LLM.
4
+ """
5
+
6
+ import polars as pl
7
+ from pathlib import Path
8
+ from typing import Dict, Any, Optional
9
+
10
+
11
def extract_schema_local(file_path: str, sample_rows: int = 5) -> Dict[str, Any]:
    """
    Extract dataset schema and basic stats locally without LLM.

    Args:
        file_path: Path to a ``.csv`` or ``.parquet`` file. Any other
            extension is treated as CSV and read through pandas.
        sample_rows: Number of leading rows to include as a reference sample.

    Returns:
        Dict with:
        - column names and types
        - row/column counts
        - missing value counts
        - small sample for reference
        - memory usage (file size on disk, MB)
        On failure, a dict containing only 'error' and 'file_path'.
    """
    try:
        # Read with Polars (faster than pandas)
        if file_path.endswith('.csv'):
            # 🔥 FIX: Use infer_schema_length and ignore_errors to handle mixed-type columns
            # This prevents failures like: could not parse `835.159865` as dtype `i64`
            try:
                df = pl.read_csv(file_path, infer_schema_length=10000, ignore_errors=True)
            except Exception:
                # Final fallback: let pandas' more tolerant parser read the
                # file, then convert the frame to Polars.
                try:
                    import pandas as pd
                    pdf = pd.read_csv(file_path, low_memory=False)
                    df = pl.from_pandas(pdf)
                except Exception as e2:
                    return {
                        'error': f"Failed to read CSV: {str(e2)}",
                        'file_path': file_path
                    }
        elif file_path.endswith('.parquet'):
            df = pl.read_parquet(file_path)
        else:
            # Fallback to pandas
            import pandas as pd
            pdf = pd.read_csv(file_path, low_memory=False)
            df = pl.from_pandas(pdf)

        # Basic metadata
        schema_info = {
            'file_path': file_path,
            'file_size_mb': round(Path(file_path).stat().st_size / (1024 * 1024), 2),
            'num_rows': df.shape[0],
            'num_columns': df.shape[1],
            'columns': {}
        }

        # Per-column metadata
        for col in df.columns:
            col_series = df[col]
            dtype_str = str(col_series.dtype)
            n_rows = len(col_series)

            col_info = {
                'dtype': dtype_str,
                'missing_count': col_series.null_count(),
                # Guard n_rows == 0: a header-only file previously raised
                # ZeroDivisionError here and the whole extraction failed.
                'missing_pct': round(col_series.null_count() / n_rows * 100, 2) if n_rows else 0.0,
                'unique_count': col_series.n_unique() if n_rows < 100000 else None  # Skip for huge datasets
            }

            # Type-specific stats (lightweight)
            if dtype_str in ['Int64', 'Float64', 'Int32', 'Float32']:
                try:
                    col_info['min'] = float(col_series.min())
                    col_info['max'] = float(col_series.max())
                    col_info['mean'] = float(col_series.mean())
                except Exception:
                    # min/max/mean are None on all-null columns and float(None)
                    # raises TypeError; stats are optional, so skip them.
                    pass

            schema_info['columns'][col] = col_info

        # Small sample for LLM context (only first few rows)
        sample_data = df.head(sample_rows).to_dicts()
        schema_info['sample_rows'] = sample_data

        # Categorize columns
        schema_info['numeric_columns'] = [
            col for col, info in schema_info['columns'].items()
            if 'Int' in info['dtype'] or 'Float' in info['dtype']
        ]
        schema_info['categorical_columns'] = [
            col for col, info in schema_info['columns'].items()
            if info['dtype'] in ['Utf8', 'String'] or (
                info.get('unique_count') is not None and
                info.get('unique_count') < 50 and
                col not in schema_info['numeric_columns']
            )
        ]
        schema_info['datetime_columns'] = [
            col for col, info in schema_info['columns'].items()
            if 'Date' in info['dtype'] or 'Time' in info['dtype']
        ]

        return schema_info

    except Exception as e:
        # Catch-all boundary: callers get a structured error dict rather
        # than an exception, matching the CSV-read failure path above.
        return {
            'error': f"Failed to extract schema: {str(e)}",
            'file_path': file_path
        }
109
+
110
+
111
def infer_task_type(target_column: str, schema_info: Dict[str, Any]) -> Optional[str]:
    """
    Infer ML task type from target column without LLM.

    Heuristics:
        - numeric dtype with many unique values  -> 'regression'
        - numeric dtype with few unique values   -> 'classification'
        - string dtype or low cardinality        -> 'classification'

    Args:
        target_column: Name of the candidate target column.
        schema_info: Schema dict as produced by ``extract_schema_local``.

    Returns:
        'regression', 'classification', or None when undecidable.
    """
    if not target_column or target_column not in schema_info.get('columns', {}):
        return None

    target_info = schema_info['columns'][target_column]
    # unique_count is None when extract_schema_local skipped it (>=100k rows);
    # the key may also be absent in externally built schema dicts (-> 0, which
    # preserves the original low-cardinality default).
    unique_count = target_info.get('unique_count', 0)

    # Numeric with many unique values → regression
    if target_info['dtype'] in ['Int64', 'Float64', 'Int32', 'Float32']:
        if unique_count is None:
            # Uniqueness unknown on a large numeric column: a continuous
            # target is the most plausible reading. (Previously this case
            # crashed with `None <= 20` below.)
            return 'regression'
        if unique_count > 20:
            return 'regression'
        elif unique_count and unique_count <= 10:
            return 'classification'

    # Categorical or low cardinality → classification
    # 🔥 FIX: the None comparison used to raise TypeError for huge datasets.
    if target_info['dtype'] in ['Utf8', 'String'] or (unique_count is not None and unique_count <= 20):
        return 'classification'

    return None
@@ -0,0 +1,392 @@
1
+ """
2
+ Semantic Layer using SBERT for Column Understanding and Agent Routing
3
+
4
+ Provides semantic understanding of dataset columns and agent intent matching
5
+ using sentence-transformers embeddings.
6
+ """
7
+
8
+ import numpy as np
9
+ from typing import Dict, Any, List, Optional, Tuple
10
+ import polars as pl
11
+ from pathlib import Path
12
+ import json
13
+
14
+ # SBERT for semantic embeddings
15
+ try:
16
+ from sentence_transformers import SentenceTransformer
17
+ import torch
18
+ SBERT_AVAILABLE = True
19
+ except ImportError:
20
+ SBERT_AVAILABLE = False
21
+ print("⚠️ sentence-transformers not available. Install with: pip install sentence-transformers")
22
+
23
+ # Sklearn for similarity
24
+ try:
25
+ from sklearn.metrics.pairwise import cosine_similarity
26
+ SKLEARN_AVAILABLE = True
27
+ except ImportError:
28
+ SKLEARN_AVAILABLE = False
29
+
30
+
31
class SemanticLayer:
    """
    Semantic understanding layer using SBERT embeddings.

    Features:
    - Column semantic embedding (name + sample values + dtype)
    - Semantic column matching (find similar columns)
    - Agent intent routing (semantic task → agent mapping)
    - Target column inference (semantic similarity to "target")

    When sentence-transformers or sklearn is missing, ``self.enabled`` is
    False and every method degrades to a cheap fallback instead of raising.
    """

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """
        Initialize semantic layer with SBERT model.

        Args:
            model_name: Sentence-transformer model name
                - all-MiniLM-L6-v2: Fast, 384 dims (recommended)
                - all-mpnet-base-v2: Better quality, 768 dims, slower
                - paraphrase-MiniLM-L6-v2: Good for short texts
        """
        self.model_name = model_name
        self.model = None
        # Both embedding model and cosine_similarity are required.
        self.enabled = SBERT_AVAILABLE and SKLEARN_AVAILABLE

        if self.enabled:
            try:
                print(f"🧠 Loading SBERT model: {model_name}...")
                # Try loading with trust_remote_code for better compatibility
                self.model = SentenceTransformer(model_name, trust_remote_code=True)
                # Use GPU if available
                if torch.cuda.is_available():
                    self.model = self.model.to('cuda')
                    print("✅ SBERT loaded on GPU")
                else:
                    print("✅ SBERT loaded on CPU")
            except Exception as e:
                print(f"⚠️ Failed to load SBERT model: {e}")
                print(f" Falling back to keyword-based routing (semantic features disabled)")
                self.enabled = False
        else:
            print("⚠️ SBERT semantic layer disabled (missing dependencies)")

    def encode_column(self, column_name: str, dtype: str,
                      sample_values: Optional[List[Any]] = None,
                      stats: Optional[Dict[str, Any]] = None) -> np.ndarray:
        """
        Create semantic embedding for a column.

        Combines column name, data type, sample values, and stats into
        a text description that captures the column's semantic meaning.

        Args:
            column_name: Name of the column
            dtype: Data type (Int64, Float64, Utf8, etc.)
            sample_values: Sample values from the column
            stats: Optional statistics; recognised keys are 'mean',
                'unique_count', and 'null_percentage' / 'missing_pct'
                (both spellings of the missing-value percentage accepted).

        Returns:
            Embedding vector (numpy array)
        """
        if not self.enabled:
            return np.zeros(384)  # Dummy embedding (MiniLM default dim)

        # Build semantic description
        description_parts = [f"Column name: {column_name}"]

        # Add type information
        type_desc = self._interpret_dtype(dtype)
        description_parts.append(f"Type: {type_desc}")

        # Add sample values
        if sample_values:
            # Format samples nicely (truncate long values, drop nulls)
            samples_str = ", ".join([str(v)[:50] for v in sample_values[:5] if v is not None])
            description_parts.append(f"Example values: {samples_str}")

        # Add statistics
        if stats:
            if 'mean' in stats and stats['mean'] is not None:
                description_parts.append(f"Mean: {stats['mean']:.2f}")
            if 'unique_count' in stats and stats['unique_count'] is not None:
                description_parts.append(f"Unique values: {stats['unique_count']}")
            # 🔥 FIX: schema_extraction produces 'missing_pct' while this
            # method only checked 'null_percentage', silently dropping the
            # missing-value signal — accept both spellings.
            null_pct = stats.get('null_percentage')
            if null_pct is None:
                null_pct = stats.get('missing_pct')
            if null_pct is not None:
                description_parts.append(f"Missing: {null_pct:.1f}%")

        # Combine into single text
        text = ". ".join(description_parts)

        # Generate embedding
        try:
            embedding = self.model.encode(text, convert_to_numpy=True, show_progress_bar=False)
            return embedding
        except Exception as e:
            print(f"⚠️ Error encoding column {column_name}: {e}")
            return np.zeros(self.model.get_sentence_embedding_dimension())

    def _interpret_dtype(self, dtype: str) -> str:
        """Convert polars dtype to human-readable description."""
        dtype_lower = str(dtype).lower()

        if 'int' in dtype_lower or 'float' in dtype_lower:
            return "numeric continuous or count data"
        elif 'bool' in dtype_lower:
            return "boolean flag"
        elif 'utf8' in dtype_lower or 'str' in dtype_lower:
            return "text or categorical label"
        elif 'date' in dtype_lower or 'time' in dtype_lower:
            return "temporal timestamp"
        else:
            return "data values"

    def find_similar_columns(self, query_column: str, column_embeddings: Dict[str, np.ndarray],
                             top_k: int = 3, threshold: float = 0.6) -> List[Tuple[str, float]]:
        """
        Find columns semantically similar to query column.

        Use case: Detect duplicates or related columns
        Example: "Salary" → finds ["Annual_Income", "Compensation", "Pay"]

        Args:
            query_column: Column name to search for
            column_embeddings: Dict mapping column names to their embeddings
            top_k: Number of similar columns to return
            threshold: Minimum similarity score (0-1)

        Returns:
            List of (column_name, similarity_score) tuples, best first
        """
        if not self.enabled or query_column not in column_embeddings:
            return []

        query_emb = column_embeddings[query_column].reshape(1, -1)

        similarities = []
        for col_name, col_emb in column_embeddings.items():
            if col_name == query_column:
                continue

            sim = cosine_similarity(query_emb, col_emb.reshape(1, -1))[0][0]
            if sim >= threshold:
                similarities.append((col_name, float(sim)))

        # Sort by similarity descending
        similarities.sort(key=lambda x: x[1], reverse=True)
        return similarities[:top_k]

    def infer_target_column(self, column_embeddings: Dict[str, np.ndarray],
                            task_description: str) -> Optional[Tuple[str, float]]:
        """
        Infer which column is likely the target/label for prediction.

        Uses semantic similarity between column descriptions and task description.

        Args:
            column_embeddings: Dict mapping column names to embeddings
            task_description: User's task description

        Returns:
            (column_name, confidence_score) or None when disabled, no
            columns are given, or confidence is below threshold.
        """
        if not self.enabled or not column_embeddings:
            return None

        # Encode task description
        task_emb = self.model.encode(task_description, convert_to_numpy=True, show_progress_bar=False)
        task_emb = task_emb.reshape(1, -1)

        # Score all columns in a single similarity call
        col_names = list(column_embeddings.keys())
        col_matrix = np.vstack([column_embeddings[c].reshape(1, -1) for c in col_names])
        sims = cosine_similarity(task_emb, col_matrix)[0]

        best_idx = int(np.argmax(sims))
        best_score = float(sims[best_idx])

        # Only return if confidence is reasonable
        if best_score >= 0.4:  # Threshold for target inference
            return (col_names[best_idx], best_score)

        return None

    def route_to_agent(self, task_description: str,
                       agent_descriptions: Dict[str, str]) -> Tuple[str, float]:
        """
        Route task to appropriate specialist agent using semantic similarity.

        Replaces keyword-based routing with semantic understanding.

        Args:
            task_description: User's task description
            agent_descriptions: Dict mapping agent_key → agent description

        Returns:
            (agent_key, confidence_score); (None, 0.0) when no agents
            are registered.
        """
        # 🔥 FIX: an empty agent dict previously raised IndexError in the
        # fallback path and produced (None, 0.0) unpredictably otherwise.
        if not agent_descriptions:
            return None, 0.0

        if not self.enabled:
            # Fallback to first agent
            return list(agent_descriptions.keys())[0], 0.5

        # Encode task and all agent descriptions in one batched call
        # (SentenceTransformer.encode accepts a list of sentences).
        agent_keys = list(agent_descriptions.keys())
        task_emb = self.model.encode(task_description, convert_to_numpy=True, show_progress_bar=False)
        task_emb = task_emb.reshape(1, -1)
        agent_embs = self.model.encode([agent_descriptions[k] for k in agent_keys],
                                       convert_to_numpy=True, show_progress_bar=False)

        sims = cosine_similarity(task_emb, agent_embs)[0]
        # argmax always yields a concrete agent, even when all similarities
        # are negative (the old loop could return (None, 0.0) in that case).
        best_idx = int(np.argmax(sims))
        return agent_keys[best_idx], float(sims[best_idx])

    def semantic_column_match(self, target_name: str, available_columns: List[str],
                              threshold: float = 0.6) -> Optional[Tuple[str, float]]:
        """
        Find best matching column for a target name using fuzzy semantic matching.

        Better than string fuzzy matching because it understands synonyms:
        - "salary" matches "annual_income", "compensation", "pay"
        - "target" matches "label", "class", "outcome"

        Args:
            target_name: Column name to find (might not exist exactly)
            available_columns: List of actual column names in dataset
            threshold: Minimum similarity to consider a match

        Returns:
            (matched_column, confidence) or None
        """
        if not available_columns:
            return None

        if not self.enabled:
            # Fallback to exact match
            if target_name in available_columns:
                return (target_name, 1.0)
            return None

        # Encode target and all candidates in one batched call
        target_emb = self.model.encode(target_name, convert_to_numpy=True, show_progress_bar=False)
        target_emb = target_emb.reshape(1, -1)
        col_embs = self.model.encode(list(available_columns),
                                     convert_to_numpy=True, show_progress_bar=False)

        sims = cosine_similarity(target_emb, col_embs)[0]
        best_idx = int(np.argmax(sims))
        best_score = float(sims[best_idx])

        if best_score >= threshold:
            return (available_columns[best_idx], best_score)

        return None

    def enrich_dataset_info(self, dataset_info: Dict[str, Any],
                            file_path: str, sample_size: int = 100) -> Dict[str, Any]:
        """
        Enrich dataset_info with semantic column embeddings.

        Adds 'column_embeddings' and 'semantic_insights' to dataset_info.
        Best-effort: on any failure the input dict is returned unchanged.

        Args:
            dataset_info: Dataset info from schema_extraction
            file_path: Path to CSV file
            sample_size: Number of rows to sample for encoding

        Returns:
            Enhanced dataset_info with semantic layer
        """
        if not self.enabled:
            return dataset_info

        try:
            # Load only a sample of the dataset for cheap value extraction
            df = pl.read_csv(file_path, n_rows=sample_size)

            column_embeddings = {}

            for col_name, col_info in dataset_info['columns'].items():
                # Schema may have been built from a different snapshot of
                # the file; skip columns absent from the sampled frame.
                if col_name not in df.columns:
                    continue

                # Get sample values
                sample_values = df[col_name].head(5).to_list()

                # Create embedding
                embedding = self.encode_column(
                    column_name=col_name,
                    dtype=col_info['dtype'],
                    sample_values=sample_values,
                    stats={
                        'unique_count': col_info.get('unique_count'),
                        'missing_pct': col_info.get('missing_pct'),
                        'mean': col_info.get('mean')
                    }
                )

                column_embeddings[col_name] = embedding

            # Add to dataset_info
            dataset_info['column_embeddings'] = column_embeddings

            # Detect similar columns (potential duplicates)
            similar_pairs = []
            cols = list(column_embeddings.keys())
            for i, col1 in enumerate(cols):
                similar = self.find_similar_columns(col1, column_embeddings, top_k=1, threshold=0.75)
                if similar:
                    similar_pairs.append((col1, similar[0][0], similar[0][1]))

            dataset_info['semantic_insights'] = {
                'similar_columns': similar_pairs,
                'total_columns_embedded': len(column_embeddings)
            }

            print(f"🧠 Semantic layer: Embedded {len(column_embeddings)} columns")
            if similar_pairs:
                print(f"   Found {len(similar_pairs)} similar column pairs (potential duplicates)")

        except Exception as e:
            # Best-effort enrichment: never let semantic failures break the
            # main pipeline; the caller still gets the original schema.
            print(f"⚠️ Error enriching dataset with semantic layer: {e}")

        return dataset_info
382
+
383
+
384
# Module-level singleton; constructed lazily because loading the SBERT
# model is expensive and may not be needed at all.
_semantic_layer = None


def get_semantic_layer() -> SemanticLayer:
    """Return the process-wide SemanticLayer, building it on first use."""
    global _semantic_layer
    layer = _semantic_layer
    if layer is None:
        layer = SemanticLayer()
        _semantic_layer = layer
    return layer