ds-agent-cli 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/ds-agent.js +451 -0
- package/ds_agent/__init__.py +8 -0
- package/package.json +28 -0
- package/requirements.txt +126 -0
- package/setup.py +35 -0
- package/src/__init__.py +7 -0
- package/src/_compress_tool_result.py +118 -0
- package/src/api/__init__.py +4 -0
- package/src/api/app.py +1626 -0
- package/src/cache/__init__.py +5 -0
- package/src/cache/cache_manager.py +561 -0
- package/src/cli.py +2886 -0
- package/src/dynamic_prompts.py +281 -0
- package/src/orchestrator.py +4799 -0
- package/src/progress_manager.py +139 -0
- package/src/reasoning/__init__.py +332 -0
- package/src/reasoning/business_summary.py +431 -0
- package/src/reasoning/data_understanding.py +356 -0
- package/src/reasoning/model_explanation.py +383 -0
- package/src/reasoning/reasoning_trace.py +239 -0
- package/src/registry/__init__.py +3 -0
- package/src/registry/tools_registry.py +3 -0
- package/src/session_memory.py +448 -0
- package/src/session_store.py +370 -0
- package/src/storage/__init__.py +19 -0
- package/src/storage/artifact_store.py +620 -0
- package/src/storage/helpers.py +116 -0
- package/src/storage/huggingface_storage.py +694 -0
- package/src/storage/r2_storage.py +0 -0
- package/src/storage/user_files_service.py +288 -0
- package/src/tools/__init__.py +335 -0
- package/src/tools/advanced_analysis.py +823 -0
- package/src/tools/advanced_feature_engineering.py +708 -0
- package/src/tools/advanced_insights.py +578 -0
- package/src/tools/advanced_preprocessing.py +549 -0
- package/src/tools/advanced_training.py +906 -0
- package/src/tools/agent_tool_mapping.py +326 -0
- package/src/tools/auto_pipeline.py +420 -0
- package/src/tools/autogluon_training.py +1480 -0
- package/src/tools/business_intelligence.py +860 -0
- package/src/tools/cloud_data_sources.py +581 -0
- package/src/tools/code_interpreter.py +390 -0
- package/src/tools/computer_vision.py +614 -0
- package/src/tools/data_cleaning.py +614 -0
- package/src/tools/data_profiling.py +593 -0
- package/src/tools/data_type_conversion.py +268 -0
- package/src/tools/data_wrangling.py +433 -0
- package/src/tools/eda_reports.py +284 -0
- package/src/tools/enhanced_feature_engineering.py +241 -0
- package/src/tools/feature_engineering.py +302 -0
- package/src/tools/matplotlib_visualizations.py +1327 -0
- package/src/tools/model_training.py +520 -0
- package/src/tools/nlp_text_analytics.py +761 -0
- package/src/tools/plotly_visualizations.py +497 -0
- package/src/tools/production_mlops.py +852 -0
- package/src/tools/time_series.py +507 -0
- package/src/tools/tools_registry.py +2133 -0
- package/src/tools/visualization_engine.py +559 -0
- package/src/utils/__init__.py +42 -0
- package/src/utils/error_recovery.py +313 -0
- package/src/utils/parallel_executor.py +402 -0
- package/src/utils/polars_helpers.py +248 -0
- package/src/utils/schema_extraction.py +132 -0
- package/src/utils/semantic_layer.py +392 -0
- package/src/utils/token_budget.py +411 -0
- package/src/utils/validation.py +377 -0
- package/src/workflow_state.py +154 -0
|
@@ -0,0 +1,288 @@
|
|
|
1
|
+
"""
|
|
2
|
+
User Files Service - Manages file metadata in Supabase
|
|
3
|
+
|
|
4
|
+
This service:
|
|
5
|
+
1. Tracks all user files (plots, CSVs, reports, models) in Supabase
|
|
6
|
+
2. Provides file listing for the Assets panel
|
|
7
|
+
3. Handles file expiration and cleanup coordination
|
|
8
|
+
4. Works with R2StorageService for actual file storage
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
import os
from dataclasses import dataclass
from datetime import datetime, timedelta, timezone
from enum import Enum
from typing import Optional, Dict, Any, List
|
|
16
|
+
|
|
17
|
+
# Supabase client import
|
|
18
|
+
try:
|
|
19
|
+
from supabase import create_client, Client
|
|
20
|
+
except ImportError:
|
|
21
|
+
print("Warning: supabase package not installed. Run: pip install supabase")
|
|
22
|
+
Client = None
|
|
23
|
+
|
|
24
|
+
SUPABASE_URL = os.getenv("SUPABASE_URL", "")
|
|
25
|
+
SUPABASE_SERVICE_KEY = os.getenv("SUPABASE_SERVICE_KEY", "") # Use service key for backend
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class FileType(Enum):
|
|
29
|
+
PLOT = "plot"
|
|
30
|
+
CSV = "csv"
|
|
31
|
+
REPORT = "report"
|
|
32
|
+
MODEL = "model"
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@dataclass
|
|
36
|
+
class UserFile:
|
|
37
|
+
"""Represents a user file record."""
|
|
38
|
+
id: str
|
|
39
|
+
user_id: str
|
|
40
|
+
session_id: Optional[str]
|
|
41
|
+
file_type: FileType
|
|
42
|
+
file_name: str
|
|
43
|
+
r2_key: str
|
|
44
|
+
size_bytes: int
|
|
45
|
+
mime_type: str
|
|
46
|
+
metadata: Dict[str, Any]
|
|
47
|
+
created_at: datetime
|
|
48
|
+
expires_at: datetime
|
|
49
|
+
download_url: Optional[str] = None
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
class UserFilesService:
    """Service for managing user file metadata in Supabase.

    CRUD wrapper around the ``user_files`` table: creates records for
    generated artifacts (plots, CSVs, reports, models), lists/filters them
    for the Assets panel, coordinates expiration-based cleanup, and reports
    per-user storage statistics. The file bytes themselves live in R2;
    only metadata is stored here.
    """

    def __init__(self):
        """Initialize the Supabase client from environment settings.

        Raises:
            ValueError: If SUPABASE_URL or SUPABASE_SERVICE_KEY is unset.
        """
        if not SUPABASE_URL or not SUPABASE_SERVICE_KEY:
            raise ValueError("SUPABASE_URL and SUPABASE_SERVICE_KEY must be set")

        self.client: Client = create_client(SUPABASE_URL, SUPABASE_SERVICE_KEY)
        self.table = "user_files"

    # ==================== CREATE ====================

    def create_file_record(
        self,
        user_id: str,
        file_type: FileType,
        file_name: str,
        r2_key: str,
        size_bytes: int,
        session_id: Optional[str] = None,
        mime_type: str = "application/octet-stream",
        metadata: Optional[Dict[str, Any]] = None,
        expires_in_days: int = 7
    ) -> UserFile:
        """
        Create a file record in Supabase.

        Args:
            user_id: User ID
            file_type: Type of file
            file_name: Display name
            r2_key: R2 storage key
            size_bytes: File size
            session_id: Optional chat session ID
            mime_type: MIME type
            metadata: Additional metadata (plot type, metrics, etc.)
            expires_in_days: Days until file expires

        Returns:
            Created UserFile record

        Raises:
            RuntimeError: If the insert returned no row.
        """
        # Timezone-aware "now": datetime.utcnow() is deprecated (Python 3.12+)
        # and yields naive timestamps; this isoformat() carries a +00:00 offset,
        # which Postgres timestamptz columns accept.
        expires_at = datetime.now(timezone.utc) + timedelta(days=expires_in_days)

        data = {
            "user_id": user_id,
            "session_id": session_id,
            "file_type": file_type.value,
            "file_name": file_name,
            "r2_key": r2_key,
            "size_bytes": size_bytes,
            "mime_type": mime_type,
            "metadata": metadata or {},
            "expires_at": expires_at.isoformat()
        }

        result = self.client.table(self.table).insert(data).execute()

        if result.data:
            return self._to_user_file(result.data[0])
        # RuntimeError is still caught by existing `except Exception` callers,
        # but is more specific than a bare Exception.
        raise RuntimeError("Failed to create file record")

    # ==================== READ ====================

    def get_user_files(
        self,
        user_id: str,
        file_type: Optional[FileType] = None,
        session_id: Optional[str] = None,
        include_expired: bool = False
    ) -> List[UserFile]:
        """
        Get all files for a user.

        Args:
            user_id: User ID
            file_type: Optional filter by type
            session_id: Optional filter by session
            include_expired: Include expired files

        Returns:
            List of UserFile records, newest first
        """
        query = self.client.table(self.table)\
            .select("*")\
            .eq("user_id", user_id)\
            .eq("is_deleted", False)

        if file_type:
            query = query.eq("file_type", file_type.value)

        if session_id:
            query = query.eq("session_id", session_id)

        if not include_expired:
            query = query.gt("expires_at", datetime.now(timezone.utc).isoformat())

        query = query.order("created_at", desc=True)

        result = query.execute()

        return [self._to_user_file(row) for row in (result.data or [])]

    def get_file_by_id(self, file_id: str) -> Optional[UserFile]:
        """Get a specific file by ID, or None if no record matches."""
        # limit(1) instead of single(): PostgREST's single() raises an error
        # when zero rows match, which contradicts the Optional return contract.
        result = self.client.table(self.table)\
            .select("*")\
            .eq("id", file_id)\
            .limit(1)\
            .execute()

        if result.data:
            return self._to_user_file(result.data[0])
        return None

    def get_file_by_r2_key(self, r2_key: str) -> Optional[UserFile]:
        """Get a file by R2 key, or None if no record matches."""
        # Same rationale as get_file_by_id: avoid single()'s zero-row error.
        result = self.client.table(self.table)\
            .select("*")\
            .eq("r2_key", r2_key)\
            .limit(1)\
            .execute()

        if result.data:
            return self._to_user_file(result.data[0])
        return None

    def get_session_files(self, session_id: str) -> List[UserFile]:
        """Get all non-deleted files for a chat session, newest first."""
        result = self.client.table(self.table)\
            .select("*")\
            .eq("session_id", session_id)\
            .eq("is_deleted", False)\
            .order("created_at", desc=True)\
            .execute()

        return [self._to_user_file(row) for row in (result.data or [])]

    # ==================== UPDATE ====================

    def extend_expiration(self, file_id: str, additional_days: int = 7) -> bool:
        """Extend file expiration date; True if the record was updated.

        Note: the new expiry is computed as now + additional_days, not
        as the previous expiry + additional_days.
        """
        file = self.get_file_by_id(file_id)
        if not file:
            return False

        new_expires = datetime.now(timezone.utc) + timedelta(days=additional_days)

        result = self.client.table(self.table)\
            .update({"expires_at": new_expires.isoformat()})\
            .eq("id", file_id)\
            .execute()

        return bool(result.data)

    # ==================== DELETE ====================

    def soft_delete_file(self, file_id: str) -> bool:
        """Soft delete a file (mark as deleted); True on success."""
        result = self.client.table(self.table)\
            .update({"is_deleted": True})\
            .eq("id", file_id)\
            .execute()

        return bool(result.data)

    def hard_delete_file(self, file_id: str) -> bool:
        """Permanently delete a file record; True if a row was removed."""
        result = self.client.table(self.table)\
            .delete()\
            .eq("id", file_id)\
            .execute()

        return bool(result.data)

    def get_expired_files(self) -> List[UserFile]:
        """Get all expired, not-yet-deleted files for cleanup."""
        result = self.client.table(self.table)\
            .select("*")\
            .lt("expires_at", datetime.now(timezone.utc).isoformat())\
            .eq("is_deleted", False)\
            .execute()

        return [self._to_user_file(row) for row in (result.data or [])]

    # ==================== STATS ====================

    def get_user_storage_stats(self, user_id: str) -> Dict[str, Any]:
        """Get storage statistics for a user: counts and byte totals,
        overall and broken down per FileType (expired files excluded)."""
        files = self.get_user_files(user_id, include_expired=False)

        stats: Dict[str, Any] = {
            "total_files": len(files),
            "total_size_bytes": sum(f.size_bytes for f in files),
            "by_type": {}
        }

        for file_type in FileType:
            type_files = [f for f in files if f.file_type == file_type]
            stats["by_type"][file_type.value] = {
                "count": len(type_files),
                "size_bytes": sum(f.size_bytes for f in type_files)
            }

        stats["total_size_mb"] = round(stats["total_size_bytes"] / (1024 * 1024), 2)

        return stats

    # ==================== HELPERS ====================

    def _to_user_file(self, row: Dict[str, Any]) -> UserFile:
        """Convert a database row (dict) to a UserFile object."""
        return UserFile(
            id=row["id"],
            user_id=row["user_id"],
            session_id=row.get("session_id"),
            file_type=FileType(row["file_type"]),
            file_name=row["file_name"],
            r2_key=row["r2_key"],
            size_bytes=row.get("size_bytes", 0),
            mime_type=row.get("mime_type", "application/octet-stream"),
            # `or {}` also covers an explicit JSON null stored in the column,
            # which .get(..., {}) would pass through as None.
            metadata=row.get("metadata") or {},
            # Supabase may return a trailing 'Z'; normalize for fromisoformat.
            created_at=datetime.fromisoformat(row["created_at"].replace("Z", "+00:00")),
            expires_at=datetime.fromisoformat(row["expires_at"].replace("Z", "+00:00"))
        )
|
|
277
|
+
|
|
278
|
+
|
|
279
|
+
# ==================== SINGLETON ====================
|
|
280
|
+
|
|
281
|
+
_files_service: Optional[UserFilesService] = None
|
|
282
|
+
|
|
283
|
+
def get_files_service() -> UserFilesService:
|
|
284
|
+
"""Get or create UserFilesService singleton."""
|
|
285
|
+
global _files_service
|
|
286
|
+
if _files_service is None:
|
|
287
|
+
_files_service = UserFilesService()
|
|
288
|
+
return _files_service
|
|
@@ -0,0 +1,335 @@
|
|
|
1
|
+
"""Tools module initialization - All 44 tools."""
|
|
2
|
+
|
|
3
|
+
# Basic Tools (10)
|
|
4
|
+
from .data_profiling import (
|
|
5
|
+
profile_dataset,
|
|
6
|
+
detect_data_quality_issues,
|
|
7
|
+
analyze_correlations,
|
|
8
|
+
get_smart_summary, # NEW: Enhanced data summary
|
|
9
|
+
detect_label_errors # NEW: cleanlab label error detection
|
|
10
|
+
)
|
|
11
|
+
|
|
12
|
+
from .data_cleaning import (
|
|
13
|
+
clean_missing_values,
|
|
14
|
+
handle_outliers,
|
|
15
|
+
fix_data_types
|
|
16
|
+
)
|
|
17
|
+
|
|
18
|
+
from .data_type_conversion import (
|
|
19
|
+
force_numeric_conversion,
|
|
20
|
+
smart_type_inference
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
# Data Wrangling Tools (3) - NEW
|
|
24
|
+
from .data_wrangling import (
|
|
25
|
+
merge_datasets,
|
|
26
|
+
concat_datasets,
|
|
27
|
+
reshape_dataset
|
|
28
|
+
)
|
|
29
|
+
|
|
30
|
+
from .feature_engineering import (
|
|
31
|
+
create_time_features,
|
|
32
|
+
encode_categorical
|
|
33
|
+
)
|
|
34
|
+
|
|
35
|
+
from .model_training import (
|
|
36
|
+
train_baseline_models,
|
|
37
|
+
generate_model_report
|
|
38
|
+
)
|
|
39
|
+
|
|
40
|
+
# AutoGluon-Powered Training (9) - Classification, Regression, Time Series, Optimization
|
|
41
|
+
from .autogluon_training import (
|
|
42
|
+
train_with_autogluon,
|
|
43
|
+
predict_with_autogluon,
|
|
44
|
+
forecast_with_autogluon,
|
|
45
|
+
optimize_autogluon_model,
|
|
46
|
+
analyze_autogluon_model,
|
|
47
|
+
extend_autogluon_training,
|
|
48
|
+
train_multilabel_autogluon,
|
|
49
|
+
backtest_timeseries,
|
|
50
|
+
analyze_timeseries_model
|
|
51
|
+
)
|
|
52
|
+
|
|
53
|
+
# Advanced Analysis Tools (5)
|
|
54
|
+
from .advanced_analysis import (
|
|
55
|
+
perform_eda_analysis,
|
|
56
|
+
detect_model_issues,
|
|
57
|
+
detect_anomalies,
|
|
58
|
+
detect_and_handle_multicollinearity,
|
|
59
|
+
perform_statistical_tests
|
|
60
|
+
)
|
|
61
|
+
|
|
62
|
+
# Advanced Feature Engineering Tools (4)
|
|
63
|
+
from .advanced_feature_engineering import (
|
|
64
|
+
create_interaction_features,
|
|
65
|
+
create_aggregation_features,
|
|
66
|
+
engineer_text_features,
|
|
67
|
+
auto_feature_engineering
|
|
68
|
+
)
|
|
69
|
+
|
|
70
|
+
# Advanced Preprocessing Tools (3)
|
|
71
|
+
from .advanced_preprocessing import (
|
|
72
|
+
handle_imbalanced_data,
|
|
73
|
+
perform_feature_scaling,
|
|
74
|
+
split_data_strategically
|
|
75
|
+
)
|
|
76
|
+
|
|
77
|
+
# Advanced Training Tools (3)
|
|
78
|
+
from .advanced_training import (
|
|
79
|
+
hyperparameter_tuning,
|
|
80
|
+
train_ensemble_models,
|
|
81
|
+
perform_cross_validation
|
|
82
|
+
)
|
|
83
|
+
|
|
84
|
+
# Business Intelligence Tools (4)
|
|
85
|
+
from .business_intelligence import (
|
|
86
|
+
perform_cohort_analysis,
|
|
87
|
+
perform_rfm_analysis,
|
|
88
|
+
detect_causal_relationships,
|
|
89
|
+
generate_business_insights
|
|
90
|
+
)
|
|
91
|
+
|
|
92
|
+
# Computer Vision Tools (3)
|
|
93
|
+
from .computer_vision import (
|
|
94
|
+
extract_image_features,
|
|
95
|
+
perform_image_clustering,
|
|
96
|
+
analyze_tabular_image_hybrid
|
|
97
|
+
)
|
|
98
|
+
|
|
99
|
+
# NLP/Text Analytics Tools (4)
|
|
100
|
+
from .nlp_text_analytics import (
|
|
101
|
+
perform_topic_modeling,
|
|
102
|
+
perform_named_entity_recognition,
|
|
103
|
+
analyze_sentiment_advanced,
|
|
104
|
+
perform_text_similarity
|
|
105
|
+
)
|
|
106
|
+
|
|
107
|
+
# Production/MLOps Tools (5 + 2 new)
|
|
108
|
+
from .production_mlops import (
|
|
109
|
+
monitor_model_drift,
|
|
110
|
+
explain_predictions,
|
|
111
|
+
generate_model_card,
|
|
112
|
+
perform_ab_test_analysis,
|
|
113
|
+
detect_feature_leakage,
|
|
114
|
+
monitor_drift_evidently, # NEW: Evidently drift reports
|
|
115
|
+
explain_with_dtreeviz # NEW: Decision tree visualization
|
|
116
|
+
)
|
|
117
|
+
|
|
118
|
+
# Time Series Tools (3)
|
|
119
|
+
from .time_series import (
|
|
120
|
+
forecast_time_series,
|
|
121
|
+
detect_seasonality_trends,
|
|
122
|
+
create_time_series_features
|
|
123
|
+
)
|
|
124
|
+
|
|
125
|
+
# Advanced Insights Tools (6) - NEW
|
|
126
|
+
from .advanced_insights import (
|
|
127
|
+
analyze_root_cause,
|
|
128
|
+
detect_trends_and_seasonality,
|
|
129
|
+
detect_anomalies_advanced,
|
|
130
|
+
perform_hypothesis_testing,
|
|
131
|
+
analyze_distribution,
|
|
132
|
+
perform_segment_analysis
|
|
133
|
+
)
|
|
134
|
+
|
|
135
|
+
# Automated Pipeline Tools (2) - NEW
|
|
136
|
+
from .auto_pipeline import (
|
|
137
|
+
auto_ml_pipeline,
|
|
138
|
+
auto_feature_selection
|
|
139
|
+
)
|
|
140
|
+
|
|
141
|
+
# Visualization Tools (5) - NEW
|
|
142
|
+
from .visualization_engine import (
|
|
143
|
+
generate_all_plots,
|
|
144
|
+
generate_data_quality_plots,
|
|
145
|
+
generate_eda_plots,
|
|
146
|
+
generate_model_performance_plots,
|
|
147
|
+
generate_feature_importance_plot
|
|
148
|
+
)
|
|
149
|
+
|
|
150
|
+
# Interactive Plotly Visualizations (6) - NEW PHASE 2
|
|
151
|
+
from .plotly_visualizations import (
|
|
152
|
+
generate_interactive_scatter,
|
|
153
|
+
generate_interactive_histogram,
|
|
154
|
+
generate_interactive_correlation_heatmap,
|
|
155
|
+
generate_interactive_box_plots,
|
|
156
|
+
generate_interactive_time_series,
|
|
157
|
+
generate_plotly_dashboard
|
|
158
|
+
)
|
|
159
|
+
|
|
160
|
+
# EDA Report Generation (2) - NEW PHASE 2
|
|
161
|
+
from .eda_reports import (
|
|
162
|
+
generate_ydata_profiling_report,
|
|
163
|
+
generate_sweetviz_report # NEW: Sweetviz EDA with comparison
|
|
164
|
+
)
|
|
165
|
+
|
|
166
|
+
# Code Interpreter (2) - NEW PHASE 2 - CRITICAL for True AI Agent
|
|
167
|
+
from .code_interpreter import (
|
|
168
|
+
execute_python_code,
|
|
169
|
+
execute_code_from_file
|
|
170
|
+
)
|
|
171
|
+
|
|
172
|
+
# Cloud Data Sources (4) - NEW: BigQuery Integration
|
|
173
|
+
from .cloud_data_sources import (
|
|
174
|
+
load_bigquery_table,
|
|
175
|
+
write_bigquery_table,
|
|
176
|
+
profile_bigquery_table,
|
|
177
|
+
query_bigquery
|
|
178
|
+
)
|
|
179
|
+
|
|
180
|
+
from .tools_registry import TOOLS, get_tool_by_name, get_all_tool_names
|
|
181
|
+
|
|
182
|
+
from .enhanced_feature_engineering import (
|
|
183
|
+
create_ratio_features,
|
|
184
|
+
create_statistical_features,
|
|
185
|
+
create_log_features,
|
|
186
|
+
create_binned_features,
|
|
187
|
+
)
|
|
188
|
+
|
|
189
|
+
__all__ = [
|
|
190
|
+
# Basic Data Profiling (4 + 1 new) - UPDATED
|
|
191
|
+
"profile_dataset",
|
|
192
|
+
"detect_data_quality_issues",
|
|
193
|
+
"analyze_correlations",
|
|
194
|
+
"get_smart_summary", # NEW
|
|
195
|
+
"detect_label_errors", # NEW: cleanlab
|
|
196
|
+
|
|
197
|
+
# Basic Data Cleaning (3)
|
|
198
|
+
"clean_missing_values",
|
|
199
|
+
"handle_outliers",
|
|
200
|
+
"fix_data_types",
|
|
201
|
+
|
|
202
|
+
# Data Type Conversion (2)
|
|
203
|
+
"force_numeric_conversion",
|
|
204
|
+
"smart_type_inference",
|
|
205
|
+
|
|
206
|
+
# Data Wrangling (3) - NEW
|
|
207
|
+
"merge_datasets",
|
|
208
|
+
"concat_datasets",
|
|
209
|
+
"reshape_dataset",
|
|
210
|
+
|
|
211
|
+
# Basic Feature Engineering (2)
|
|
212
|
+
"create_time_features",
|
|
213
|
+
"encode_categorical",
|
|
214
|
+
|
|
215
|
+
# Basic Model Training (2)
|
|
216
|
+
"train_baseline_models",
|
|
217
|
+
"generate_model_report",
|
|
218
|
+
|
|
219
|
+
# AutoGluon Training (9) - NEW
|
|
220
|
+
"train_with_autogluon",
|
|
221
|
+
"predict_with_autogluon",
|
|
222
|
+
"forecast_with_autogluon",
|
|
223
|
+
"optimize_autogluon_model",
|
|
224
|
+
"analyze_autogluon_model",
|
|
225
|
+
"extend_autogluon_training",
|
|
226
|
+
"train_multilabel_autogluon",
|
|
227
|
+
"backtest_timeseries",
|
|
228
|
+
"analyze_timeseries_model",
|
|
229
|
+
|
|
230
|
+
# Advanced Analysis (5)
|
|
231
|
+
"perform_eda_analysis",
|
|
232
|
+
"detect_model_issues",
|
|
233
|
+
"detect_anomalies",
|
|
234
|
+
"detect_and_handle_multicollinearity",
|
|
235
|
+
"perform_statistical_tests",
|
|
236
|
+
|
|
237
|
+
# Advanced Feature Engineering (4)
|
|
238
|
+
"create_interaction_features",
|
|
239
|
+
"create_aggregation_features",
|
|
240
|
+
"engineer_text_features",
|
|
241
|
+
"auto_feature_engineering",
|
|
242
|
+
|
|
243
|
+
# Advanced Preprocessing (3)
|
|
244
|
+
"handle_imbalanced_data",
|
|
245
|
+
"perform_feature_scaling",
|
|
246
|
+
"split_data_strategically",
|
|
247
|
+
|
|
248
|
+
# Advanced Training (3)
|
|
249
|
+
"hyperparameter_tuning",
|
|
250
|
+
"train_ensemble_models",
|
|
251
|
+
"perform_cross_validation",
|
|
252
|
+
|
|
253
|
+
# Business Intelligence (4)
|
|
254
|
+
"perform_cohort_analysis",
|
|
255
|
+
"perform_rfm_analysis",
|
|
256
|
+
"detect_causal_relationships",
|
|
257
|
+
"generate_business_insights",
|
|
258
|
+
|
|
259
|
+
# Computer Vision (3)
|
|
260
|
+
"extract_image_features",
|
|
261
|
+
"perform_image_clustering",
|
|
262
|
+
"analyze_tabular_image_hybrid",
|
|
263
|
+
|
|
264
|
+
# NLP/Text Analytics (4)
|
|
265
|
+
"perform_topic_modeling",
|
|
266
|
+
"perform_named_entity_recognition",
|
|
267
|
+
"analyze_sentiment_advanced",
|
|
268
|
+
"perform_text_similarity",
|
|
269
|
+
|
|
270
|
+
# Production/MLOps (5 + 2 new)
|
|
271
|
+
"monitor_model_drift",
|
|
272
|
+
"explain_predictions",
|
|
273
|
+
"generate_model_card",
|
|
274
|
+
"perform_ab_test_analysis",
|
|
275
|
+
"detect_feature_leakage",
|
|
276
|
+
"monitor_drift_evidently", # NEW: Evidently
|
|
277
|
+
"explain_with_dtreeviz", # NEW: dtreeviz
|
|
278
|
+
|
|
279
|
+
# Time Series (3)
|
|
280
|
+
"forecast_time_series",
|
|
281
|
+
"detect_seasonality_trends",
|
|
282
|
+
"create_time_series_features",
|
|
283
|
+
|
|
284
|
+
# Advanced Insights (6) - NEW
|
|
285
|
+
"analyze_root_cause",
|
|
286
|
+
"detect_trends_and_seasonality",
|
|
287
|
+
"detect_anomalies_advanced",
|
|
288
|
+
"perform_hypothesis_testing",
|
|
289
|
+
"analyze_distribution",
|
|
290
|
+
"perform_segment_analysis",
|
|
291
|
+
|
|
292
|
+
# Automated Pipeline (2) - NEW
|
|
293
|
+
"auto_ml_pipeline",
|
|
294
|
+
"auto_feature_selection",
|
|
295
|
+
|
|
296
|
+
# Visualization (5) - NEW
|
|
297
|
+
"generate_all_plots",
|
|
298
|
+
"generate_data_quality_plots",
|
|
299
|
+
"generate_eda_plots",
|
|
300
|
+
"generate_model_performance_plots",
|
|
301
|
+
"generate_feature_importance_plot",
|
|
302
|
+
|
|
303
|
+
# Interactive Plotly Visualizations (6) - NEW PHASE 2
|
|
304
|
+
"generate_interactive_scatter",
|
|
305
|
+
"generate_interactive_histogram",
|
|
306
|
+
"generate_interactive_correlation_heatmap",
|
|
307
|
+
"generate_interactive_box_plots",
|
|
308
|
+
"generate_interactive_time_series",
|
|
309
|
+
"generate_plotly_dashboard",
|
|
310
|
+
|
|
311
|
+
# EDA Report Generation (2) - NEW PHASE 2
|
|
312
|
+
"generate_ydata_profiling_report",
|
|
313
|
+
"generate_sweetviz_report", # NEW: Sweetviz
|
|
314
|
+
|
|
315
|
+
# Code Interpreter (2) - NEW PHASE 2 - CRITICAL for True AI Agent
|
|
316
|
+
"execute_python_code",
|
|
317
|
+
"execute_code_from_file",
|
|
318
|
+
|
|
319
|
+
# Cloud Data Sources (4) - NEW: BigQuery Integration
|
|
320
|
+
"load_bigquery_table",
|
|
321
|
+
"write_bigquery_table",
|
|
322
|
+
"profile_bigquery_table",
|
|
323
|
+
"query_bigquery",
|
|
324
|
+
|
|
325
|
+
# Enhanced Feature Engineering (4) - NEW
|
|
326
|
+
"create_ratio_features",
|
|
327
|
+
"create_statistical_features",
|
|
328
|
+
"create_log_features",
|
|
329
|
+
"create_binned_features",
|
|
330
|
+
|
|
331
|
+
# Registry
|
|
332
|
+
"TOOLS",
|
|
333
|
+
"get_tool_by_name",
|
|
334
|
+
"get_all_tool_names",
|
|
335
|
+
]
|