ds-agent-cli 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/ds-agent.js +451 -0
- package/ds_agent/__init__.py +8 -0
- package/package.json +28 -0
- package/requirements.txt +126 -0
- package/setup.py +35 -0
- package/src/__init__.py +7 -0
- package/src/_compress_tool_result.py +118 -0
- package/src/api/__init__.py +4 -0
- package/src/api/app.py +1626 -0
- package/src/cache/__init__.py +5 -0
- package/src/cache/cache_manager.py +561 -0
- package/src/cli.py +2886 -0
- package/src/dynamic_prompts.py +281 -0
- package/src/orchestrator.py +4799 -0
- package/src/progress_manager.py +139 -0
- package/src/reasoning/__init__.py +332 -0
- package/src/reasoning/business_summary.py +431 -0
- package/src/reasoning/data_understanding.py +356 -0
- package/src/reasoning/model_explanation.py +383 -0
- package/src/reasoning/reasoning_trace.py +239 -0
- package/src/registry/__init__.py +3 -0
- package/src/registry/tools_registry.py +3 -0
- package/src/session_memory.py +448 -0
- package/src/session_store.py +370 -0
- package/src/storage/__init__.py +19 -0
- package/src/storage/artifact_store.py +620 -0
- package/src/storage/helpers.py +116 -0
- package/src/storage/huggingface_storage.py +694 -0
- package/src/storage/r2_storage.py +0 -0
- package/src/storage/user_files_service.py +288 -0
- package/src/tools/__init__.py +335 -0
- package/src/tools/advanced_analysis.py +823 -0
- package/src/tools/advanced_feature_engineering.py +708 -0
- package/src/tools/advanced_insights.py +578 -0
- package/src/tools/advanced_preprocessing.py +549 -0
- package/src/tools/advanced_training.py +906 -0
- package/src/tools/agent_tool_mapping.py +326 -0
- package/src/tools/auto_pipeline.py +420 -0
- package/src/tools/autogluon_training.py +1480 -0
- package/src/tools/business_intelligence.py +860 -0
- package/src/tools/cloud_data_sources.py +581 -0
- package/src/tools/code_interpreter.py +390 -0
- package/src/tools/computer_vision.py +614 -0
- package/src/tools/data_cleaning.py +614 -0
- package/src/tools/data_profiling.py +593 -0
- package/src/tools/data_type_conversion.py +268 -0
- package/src/tools/data_wrangling.py +433 -0
- package/src/tools/eda_reports.py +284 -0
- package/src/tools/enhanced_feature_engineering.py +241 -0
- package/src/tools/feature_engineering.py +302 -0
- package/src/tools/matplotlib_visualizations.py +1327 -0
- package/src/tools/model_training.py +520 -0
- package/src/tools/nlp_text_analytics.py +761 -0
- package/src/tools/plotly_visualizations.py +497 -0
- package/src/tools/production_mlops.py +852 -0
- package/src/tools/time_series.py +507 -0
- package/src/tools/tools_registry.py +2133 -0
- package/src/tools/visualization_engine.py +559 -0
- package/src/utils/__init__.py +42 -0
- package/src/utils/error_recovery.py +313 -0
- package/src/utils/parallel_executor.py +402 -0
- package/src/utils/polars_helpers.py +248 -0
- package/src/utils/schema_extraction.py +132 -0
- package/src/utils/semantic_layer.py +392 -0
- package/src/utils/token_budget.py +411 -0
- package/src/utils/validation.py +377 -0
- package/src/workflow_state.py +154 -0
|
@@ -0,0 +1,132 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Local Schema Extraction (No LLM)
|
|
3
|
+
Fast, cheap extraction of dataset metadata without sending to LLM.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import polars as pl
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Dict, Any, Optional
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def extract_schema_local(file_path: str, sample_rows: int = 5) -> Dict[str, Any]:
    """
    Extract dataset schema and basic stats locally without LLM.

    Args:
        file_path: Path to a .csv or .parquet file (any other extension is
            read as CSV text via pandas).
        sample_rows: Number of rows to include in the returned sample.

    Returns:
        Dict with file metadata ('file_path', 'file_size_mb', 'num_rows',
        'num_columns'), per-column info under 'columns' (dtype, missing
        count/percentage, unique count, basic numeric stats), a small
        'sample_rows' list, and derived 'numeric_columns',
        'categorical_columns' and 'datetime_columns' lists.
        Never raises: on failure returns {'error': ..., 'file_path': ...}.
    """
    try:
        # Read with Polars (faster than pandas). Extension check is
        # case-insensitive so '.CSV' / '.Parquet' are routed correctly
        # (previously they fell through to the generic CSV fallback).
        lower_path = file_path.lower()
        if lower_path.endswith('.csv'):
            # Use infer_schema_length and ignore_errors to handle mixed-type
            # columns. This prevents failures like:
            #   could not parse `835.159865` as dtype `i64`
            try:
                df = pl.read_csv(file_path, infer_schema_length=10000, ignore_errors=True)
            except Exception:
                # Final fallback: let pandas parse the file (low_memory=False
                # avoids chunked dtype guessing), then convert to Polars.
                try:
                    import pandas as pd
                    pdf = pd.read_csv(file_path, low_memory=False)
                    df = pl.from_pandas(pdf)
                except Exception as e2:
                    return {
                        'error': f"Failed to read CSV: {str(e2)}",
                        'file_path': file_path
                    }
        elif lower_path.endswith('.parquet'):
            df = pl.read_parquet(file_path)
        else:
            # Unknown extension: assume CSV-like text and fall back to pandas.
            import pandas as pd
            pdf = pd.read_csv(file_path, low_memory=False)
            df = pl.from_pandas(pdf)

        # Basic metadata
        schema_info = {
            'file_path': file_path,
            'file_size_mb': round(Path(file_path).stat().st_size / (1024 * 1024), 2),
            'num_rows': df.shape[0],
            'num_columns': df.shape[1],
            'columns': {}
        }

        # Per-column metadata
        num_rows = df.shape[0]
        for col in df.columns:
            col_series = df[col]
            dtype_str = str(col_series.dtype)

            col_info = {
                'dtype': dtype_str,
                'missing_count': col_series.null_count(),
                # Guard against ZeroDivisionError on empty datasets.
                'missing_pct': round(col_series.null_count() / num_rows * 100, 2) if num_rows else 0.0,
                # Skip n_unique() for huge datasets (it is expensive).
                'unique_count': col_series.n_unique() if num_rows < 100000 else None
            }

            # Type-specific stats (lightweight). min/max/mean can fail on
            # all-null columns; the keys are simply omitted in that case.
            if dtype_str in ['Int64', 'Float64', 'Int32', 'Float32']:
                try:
                    col_info['min'] = float(col_series.min())
                    col_info['max'] = float(col_series.max())
                    col_info['mean'] = float(col_series.mean())
                except Exception:  # was a bare `except:` — don't swallow SystemExit/KeyboardInterrupt
                    pass

            schema_info['columns'][col] = col_info

        # Small sample for LLM context (only first few rows)
        schema_info['sample_rows'] = df.head(sample_rows).to_dicts()

        # Categorize columns
        schema_info['numeric_columns'] = [
            col for col, info in schema_info['columns'].items()
            if 'Int' in info['dtype'] or 'Float' in info['dtype']
        ]
        schema_info['categorical_columns'] = [
            col for col, info in schema_info['columns'].items()
            if info['dtype'] in ['Utf8', 'String'] or (
                info.get('unique_count') is not None and
                info.get('unique_count') < 50 and
                col not in schema_info['numeric_columns']
            )
        ]
        schema_info['datetime_columns'] = [
            col for col, info in schema_info['columns'].items()
            if 'Date' in info['dtype'] or 'Time' in info['dtype']
        ]

        return schema_info

    except Exception as e:
        # Structured error payload instead of raising — callers rely on this.
        return {
            'error': f"Failed to extract schema: {str(e)}",
            'file_path': file_path
        }
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def infer_task_type(target_column: str, schema_info: Dict[str, Any]) -> Optional[str]:
    """
    Infer ML task type from target column without LLM.

    Args:
        target_column: Name of the candidate target column.
        schema_info: Schema dict produced by ``extract_schema_local``.

    Returns:
        'regression', 'classification', or None when the column is missing
        or its cardinality is unknown/ambiguous.
    """
    if not target_column or target_column not in schema_info.get('columns', {}):
        return None

    target_info = schema_info['columns'][target_column]
    # unique_count may legitimately be None (extract_schema_local skips it
    # for huge datasets), so every comparison below must be None-safe.
    unique_count = target_info.get('unique_count')

    # Numeric with many unique values → regression; very few → classification
    if target_info['dtype'] in ['Int64', 'Float64', 'Int32', 'Float32']:
        if unique_count and unique_count > 20:
            return 'regression'
        elif unique_count and unique_count <= 10:
            return 'classification'

    # Categorical or known-low cardinality → classification.
    # FIX: the old `target_info.get('unique_count', 0) <= 20` raised a
    # TypeError when unique_count was stored as None; an unknown cardinality
    # now yields None (unknown task) instead of crashing.
    if target_info['dtype'] in ['Utf8', 'String'] or (
        unique_count is not None and unique_count <= 20
    ):
        return 'classification'

    return None
|
|
@@ -0,0 +1,392 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Semantic Layer using SBERT for Column Understanding and Agent Routing
|
|
3
|
+
|
|
4
|
+
Provides semantic understanding of dataset columns and agent intent matching
|
|
5
|
+
using sentence-transformers embeddings.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import numpy as np
|
|
9
|
+
from typing import Dict, Any, List, Optional, Tuple
|
|
10
|
+
import polars as pl
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
import json
|
|
13
|
+
|
|
14
|
+
# SBERT for semantic embeddings
|
|
15
|
+
try:
|
|
16
|
+
from sentence_transformers import SentenceTransformer
|
|
17
|
+
import torch
|
|
18
|
+
SBERT_AVAILABLE = True
|
|
19
|
+
except ImportError:
|
|
20
|
+
SBERT_AVAILABLE = False
|
|
21
|
+
print("⚠️ sentence-transformers not available. Install with: pip install sentence-transformers")
|
|
22
|
+
|
|
23
|
+
# Sklearn for similarity
|
|
24
|
+
try:
|
|
25
|
+
from sklearn.metrics.pairwise import cosine_similarity
|
|
26
|
+
SKLEARN_AVAILABLE = True
|
|
27
|
+
except ImportError:
|
|
28
|
+
SKLEARN_AVAILABLE = False
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class SemanticLayer:
    """
    Semantic understanding layer using SBERT embeddings.

    Features:
    - Column semantic embedding (name + sample values + dtype)
    - Semantic column matching (find similar columns)
    - Agent intent routing (semantic task → agent mapping)
    - Target column inference (semantic similarity to task description)

    If sentence-transformers/scikit-learn are unavailable or the model fails
    to load, the layer disables itself (``self.enabled = False``) and every
    method degrades to a cheap deterministic fallback instead of raising.
    """

    def __init__(self, model_name: str = "all-MiniLM-L6-v2"):
        """
        Initialize semantic layer with SBERT model.

        Args:
            model_name: Sentence-transformer model name
                - all-MiniLM-L6-v2: Fast, 384 dims (recommended)
                - all-mpnet-base-v2: Better quality, 768 dims, slower
                - paraphrase-MiniLM-L6-v2: Good for short texts
        """
        self.model_name = model_name
        self.model = None
        self.enabled = SBERT_AVAILABLE and SKLEARN_AVAILABLE

        if self.enabled:
            try:
                print(f"🧠 Loading SBERT model: {model_name}...")
                # trust_remote_code improves compatibility with some hub models
                self.model = SentenceTransformer(model_name, trust_remote_code=True)
                # Use GPU if available — encoding is much faster there
                if torch.cuda.is_available():
                    self.model = self.model.to('cuda')
                    print("✅ SBERT loaded on GPU")
                else:
                    print("✅ SBERT loaded on CPU")
            except Exception as e:
                print(f"⚠️ Failed to load SBERT model: {e}")
                print(f"   Falling back to keyword-based routing (semantic features disabled)")
                self.enabled = False
        else:
            print("⚠️ SBERT semantic layer disabled (missing dependencies)")

    def encode_column(self, column_name: str, dtype: str,
                      sample_values: Optional[List[Any]] = None,
                      stats: Optional[Dict[str, Any]] = None) -> np.ndarray:
        """
        Create semantic embedding for a column.

        Combines column name, data type, sample values, and stats into a text
        description that captures the column's semantic meaning, then encodes
        it with SBERT.

        Args:
            column_name: Name of the column
            dtype: Data type (Int64, Float64, Utf8, etc.)
            sample_values: Sample values from the column
            stats: Optional statistics (mean, unique_count, missing percentage
                under either 'null_percentage' or 'missing_pct')

        Returns:
            Embedding vector (numpy array). An all-zero vector is returned
            when the layer is disabled or encoding fails.
        """
        if not self.enabled:
            return np.zeros(384)  # Dummy embedding (MiniLM dimension)

        # Build semantic description
        description_parts = [f"Column name: {column_name}"]

        # Add type information
        type_desc = self._interpret_dtype(dtype)
        description_parts.append(f"Type: {type_desc}")

        # Add sample values (truncated so one long value can't dominate)
        if sample_values:
            samples_str = ", ".join([str(v)[:50] for v in sample_values[:5] if v is not None])
            description_parts.append(f"Example values: {samples_str}")

        # Add statistics
        if stats:
            if 'mean' in stats and stats['mean'] is not None:
                description_parts.append(f"Mean: {stats['mean']:.2f}")
            if 'unique_count' in stats and stats['unique_count'] is not None:
                description_parts.append(f"Unique values: {stats['unique_count']}")
            # FIX: accept both 'null_percentage' and the 'missing_pct' key that
            # enrich_dataset_info actually passes (the old check only looked at
            # 'null_percentage', so missing-value info was never embedded).
            null_pct = stats.get('null_percentage')
            if null_pct is None:
                null_pct = stats.get('missing_pct')
            if null_pct is not None:
                description_parts.append(f"Missing: {null_pct:.1f}%")

        # Combine into single text
        text = ". ".join(description_parts)

        # Generate embedding
        try:
            embedding = self.model.encode(text, convert_to_numpy=True, show_progress_bar=False)
            return embedding
        except Exception as e:
            print(f"⚠️ Error encoding column {column_name}: {e}")
            return np.zeros(self.model.get_sentence_embedding_dimension())

    def _interpret_dtype(self, dtype: str) -> str:
        """Convert polars dtype to human-readable description."""
        dtype_lower = str(dtype).lower()

        if 'int' in dtype_lower or 'float' in dtype_lower:
            return "numeric continuous or count data"
        elif 'bool' in dtype_lower:
            return "boolean flag"
        elif 'utf8' in dtype_lower or 'str' in dtype_lower:
            return "text or categorical label"
        elif 'date' in dtype_lower or 'time' in dtype_lower:
            return "temporal timestamp"
        else:
            return "data values"

    def find_similar_columns(self, query_column: str, column_embeddings: Dict[str, np.ndarray],
                             top_k: int = 3, threshold: float = 0.6) -> List[Tuple[str, float]]:
        """
        Find columns semantically similar to query column.

        Use case: Detect duplicates or related columns
        Example: "Salary" → finds ["Annual_Income", "Compensation", "Pay"]

        Args:
            query_column: Column name to search for
            column_embeddings: Dict mapping column names to their embeddings
            top_k: Number of similar columns to return
            threshold: Minimum similarity score (0-1)

        Returns:
            List of (column_name, similarity_score) tuples, best first.
            Empty list when disabled or the query column has no embedding.
        """
        if not self.enabled or query_column not in column_embeddings:
            return []

        query_emb = column_embeddings[query_column].reshape(1, -1)

        similarities = []
        for col_name, col_emb in column_embeddings.items():
            if col_name == query_column:
                continue

            sim = cosine_similarity(query_emb, col_emb.reshape(1, -1))[0][0]
            if sim >= threshold:
                similarities.append((col_name, float(sim)))

        # Sort by similarity descending
        similarities.sort(key=lambda x: x[1], reverse=True)
        return similarities[:top_k]

    def infer_target_column(self, column_embeddings: Dict[str, np.ndarray],
                            task_description: str) -> Optional[Tuple[str, float]]:
        """
        Infer which column is likely the target/label for prediction.

        Uses semantic similarity between column descriptions and task description.

        Args:
            column_embeddings: Dict mapping column names to embeddings
            task_description: User's task description

        Returns:
            (column_name, confidence_score) or None when disabled or no
            column clears the confidence threshold.

        Example:
            >>> infer_target_column(embeddings, "predict house prices")
            >>> ("Price", 0.85)  # High confidence "Price" is target
        """
        if not self.enabled:
            return None

        # Encode task description
        task_emb = self.model.encode(task_description, convert_to_numpy=True, show_progress_bar=False)
        task_emb = task_emb.reshape(1, -1)

        # Find column with highest similarity to task
        best_col = None
        best_score = 0.0

        for col_name, col_emb in column_embeddings.items():
            sim = cosine_similarity(task_emb, col_emb.reshape(1, -1))[0][0]
            if sim > best_score:
                best_score = sim
                best_col = col_name

        # Only return if confidence is reasonable
        if best_score >= 0.4:  # Threshold for target inference
            return (best_col, float(best_score))

        return None

    def route_to_agent(self, task_description: str,
                       agent_descriptions: Dict[str, str]) -> Tuple[str, float]:
        """
        Route task to appropriate specialist agent using semantic similarity.

        Replaces keyword-based routing with semantic understanding.

        Args:
            task_description: User's task description
            agent_descriptions: Dict mapping agent_key → agent description

        Returns:
            (agent_key, confidence_score)

        Raises:
            ValueError: If agent_descriptions is empty. (Previously an empty
                dict crashed with IndexError in the disabled path and silently
                returned (None, 0.0) in the enabled path.)

        Example:
            >>> route_to_agent("build a predictive model", {
            ...     "modeling_agent": "Expert in ML training and models",
            ...     "viz_agent": "Expert in visualizations"
            ... })
            >>> ("modeling_agent", 0.92)
        """
        if not agent_descriptions:
            raise ValueError("agent_descriptions must not be empty")

        if not self.enabled:
            # Fallback to first agent
            return next(iter(agent_descriptions)), 0.5

        # Encode task
        task_emb = self.model.encode(task_description, convert_to_numpy=True, show_progress_bar=False)
        task_emb = task_emb.reshape(1, -1)

        # Encode agent descriptions and keep the best match
        best_agent = None
        best_score = 0.0

        for agent_key, agent_desc in agent_descriptions.items():
            agent_emb = self.model.encode(agent_desc, convert_to_numpy=True, show_progress_bar=False)
            agent_emb = agent_emb.reshape(1, -1)

            sim = cosine_similarity(task_emb, agent_emb)[0][0]
            if sim > best_score:
                best_score = sim
                best_agent = agent_key

        return best_agent, float(best_score)

    def semantic_column_match(self, target_name: str, available_columns: List[str],
                              threshold: float = 0.6) -> Optional[Tuple[str, float]]:
        """
        Find best matching column for a target name using fuzzy semantic matching.

        Better than string fuzzy matching because it understands synonyms:
        - "salary" matches "annual_income", "compensation", "pay"
        - "target" matches "label", "class", "outcome"

        Args:
            target_name: Column name to find (might not exist exactly)
            available_columns: List of actual column names in dataset
            threshold: Minimum similarity to consider a match

        Returns:
            (matched_column, confidence) or None. When disabled, only an
            exact name match is attempted.

        Example:
            >>> semantic_column_match("salary", ["Annual_Income", "Name", "Age"])
            >>> ("Annual_Income", 0.78)
        """
        if not self.enabled:
            # Fallback to exact match
            if target_name in available_columns:
                return (target_name, 1.0)
            return None

        # Encode target
        target_emb = self.model.encode(target_name, convert_to_numpy=True, show_progress_bar=False)
        target_emb = target_emb.reshape(1, -1)

        # Find best match
        best_col = None
        best_score = 0.0

        for col in available_columns:
            col_emb = self.model.encode(col, convert_to_numpy=True, show_progress_bar=False)
            col_emb = col_emb.reshape(1, -1)

            sim = cosine_similarity(target_emb, col_emb)[0][0]
            if sim > best_score:
                best_score = sim
                best_col = col

        if best_score >= threshold:
            return (best_col, float(best_score))

        return None

    def enrich_dataset_info(self, dataset_info: Dict[str, Any],
                            file_path: str, sample_size: int = 100) -> Dict[str, Any]:
        """
        Enrich dataset_info with semantic column embeddings.

        Adds 'column_embeddings' and 'semantic_insights' to dataset_info
        (the dict is mutated in place and also returned).

        Args:
            dataset_info: Dataset info from schema_extraction
            file_path: Path to CSV file
            sample_size: Number of rows to sample for encoding

        Returns:
            Enhanced dataset_info with semantic layer (unchanged when
            disabled or on any error).
        """
        if not self.enabled:
            return dataset_info

        try:
            # Load a small sample of the dataset for example values
            df = pl.read_csv(file_path, n_rows=sample_size)

            column_embeddings = {}

            for col_name, col_info in dataset_info['columns'].items():
                # FIX: a column missing from the sampled frame used to raise
                # KeyError and abort the whole enrichment; skip it instead.
                if col_name not in df.columns:
                    continue

                # Get sample values
                sample_values = df[col_name].head(5).to_list()

                # Create embedding
                embedding = self.encode_column(
                    column_name=col_name,
                    dtype=col_info['dtype'],
                    sample_values=sample_values,
                    stats={
                        'unique_count': col_info.get('unique_count'),
                        'missing_pct': col_info.get('missing_pct'),
                        'mean': col_info.get('mean')
                    }
                )

                column_embeddings[col_name] = embedding

            # Add to dataset_info
            dataset_info['column_embeddings'] = column_embeddings

            # Detect similar columns (potential duplicates)
            similar_pairs = []
            cols = list(column_embeddings.keys())
            for i, col1 in enumerate(cols):
                similar = self.find_similar_columns(col1, column_embeddings, top_k=1, threshold=0.75)
                if similar:
                    similar_pairs.append((col1, similar[0][0], similar[0][1]))

            dataset_info['semantic_insights'] = {
                'similar_columns': similar_pairs,
                'total_columns_embedded': len(column_embeddings)
            }

            print(f"🧠 Semantic layer: Embedded {len(column_embeddings)} columns")
            if similar_pairs:
                print(f"   Found {len(similar_pairs)} similar column pairs (potential duplicates)")

        except Exception as e:
            # Best-effort: enrichment failure must never break the pipeline.
            print(f"⚠️ Error enriching dataset with semantic layer: {e}")

        return dataset_info
|
|
382
|
+
|
|
383
|
+
|
|
384
|
+
# Module-level singleton, created on first request rather than at import time.
_semantic_layer = None


def get_semantic_layer() -> SemanticLayer:
    """Return the process-wide SemanticLayer, constructing it lazily on first use."""
    global _semantic_layer
    if _semantic_layer is not None:
        return _semantic_layer
    _semantic_layer = SemanticLayer()
    return _semantic_layer
|