ds-agent-cli 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/ds-agent.js +451 -0
- package/ds_agent/__init__.py +8 -0
- package/package.json +28 -0
- package/requirements.txt +126 -0
- package/setup.py +35 -0
- package/src/__init__.py +7 -0
- package/src/_compress_tool_result.py +118 -0
- package/src/api/__init__.py +4 -0
- package/src/api/app.py +1626 -0
- package/src/cache/__init__.py +5 -0
- package/src/cache/cache_manager.py +561 -0
- package/src/cli.py +2886 -0
- package/src/dynamic_prompts.py +281 -0
- package/src/orchestrator.py +4799 -0
- package/src/progress_manager.py +139 -0
- package/src/reasoning/__init__.py +332 -0
- package/src/reasoning/business_summary.py +431 -0
- package/src/reasoning/data_understanding.py +356 -0
- package/src/reasoning/model_explanation.py +383 -0
- package/src/reasoning/reasoning_trace.py +239 -0
- package/src/registry/__init__.py +3 -0
- package/src/registry/tools_registry.py +3 -0
- package/src/session_memory.py +448 -0
- package/src/session_store.py +370 -0
- package/src/storage/__init__.py +19 -0
- package/src/storage/artifact_store.py +620 -0
- package/src/storage/helpers.py +116 -0
- package/src/storage/huggingface_storage.py +694 -0
- package/src/storage/r2_storage.py +0 -0
- package/src/storage/user_files_service.py +288 -0
- package/src/tools/__init__.py +335 -0
- package/src/tools/advanced_analysis.py +823 -0
- package/src/tools/advanced_feature_engineering.py +708 -0
- package/src/tools/advanced_insights.py +578 -0
- package/src/tools/advanced_preprocessing.py +549 -0
- package/src/tools/advanced_training.py +906 -0
- package/src/tools/agent_tool_mapping.py +326 -0
- package/src/tools/auto_pipeline.py +420 -0
- package/src/tools/autogluon_training.py +1480 -0
- package/src/tools/business_intelligence.py +860 -0
- package/src/tools/cloud_data_sources.py +581 -0
- package/src/tools/code_interpreter.py +390 -0
- package/src/tools/computer_vision.py +614 -0
- package/src/tools/data_cleaning.py +614 -0
- package/src/tools/data_profiling.py +593 -0
- package/src/tools/data_type_conversion.py +268 -0
- package/src/tools/data_wrangling.py +433 -0
- package/src/tools/eda_reports.py +284 -0
- package/src/tools/enhanced_feature_engineering.py +241 -0
- package/src/tools/feature_engineering.py +302 -0
- package/src/tools/matplotlib_visualizations.py +1327 -0
- package/src/tools/model_training.py +520 -0
- package/src/tools/nlp_text_analytics.py +761 -0
- package/src/tools/plotly_visualizations.py +497 -0
- package/src/tools/production_mlops.py +852 -0
- package/src/tools/time_series.py +507 -0
- package/src/tools/tools_registry.py +2133 -0
- package/src/tools/visualization_engine.py +559 -0
- package/src/utils/__init__.py +42 -0
- package/src/utils/error_recovery.py +313 -0
- package/src/utils/parallel_executor.py +402 -0
- package/src/utils/polars_helpers.py +248 -0
- package/src/utils/schema_extraction.py +132 -0
- package/src/utils/semantic_layer.py +392 -0
- package/src/utils/token_budget.py +411 -0
- package/src/utils/validation.py +377 -0
- package/src/workflow_state.py +154 -0
|
@@ -0,0 +1,614 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Computer Vision & Image Analytics Tools
|
|
3
|
+
|
|
4
|
+
Advanced computer vision tools for image feature extraction, clustering,
|
|
5
|
+
and hybrid tabular-image analysis.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
import polars as pl
|
|
9
|
+
import numpy as np
|
|
10
|
+
from typing import Dict, Any, List, Optional, Tuple
|
|
11
|
+
from pathlib import Path
|
|
12
|
+
import json
|
|
13
|
+
|
|
14
|
+
# Core CV libraries (optional)
|
|
15
|
+
try:
|
|
16
|
+
from PIL import Image
|
|
17
|
+
import cv2
|
|
18
|
+
CV2_AVAILABLE = True
|
|
19
|
+
except ImportError:
|
|
20
|
+
CV2_AVAILABLE = False
|
|
21
|
+
|
|
22
|
+
try:
|
|
23
|
+
import torch
|
|
24
|
+
import torchvision
|
|
25
|
+
from torchvision import models, transforms
|
|
26
|
+
TORCH_AVAILABLE = True
|
|
27
|
+
except ImportError:
|
|
28
|
+
TORCH_AVAILABLE = False
|
|
29
|
+
|
|
30
|
+
# ML libraries
|
|
31
|
+
try:
|
|
32
|
+
from sklearn.cluster import KMeans, DBSCAN
|
|
33
|
+
from sklearn.decomposition import PCA
|
|
34
|
+
from sklearn.preprocessing import StandardScaler
|
|
35
|
+
from sklearn.manifold import TSNE
|
|
36
|
+
except ImportError:
|
|
37
|
+
pass
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def extract_image_features(
    image_paths: List[str],
    method: str = "cnn",
    model_name: str = "resnet50",
    color_spaces: Optional[List[str]] = None,
    include_histograms: bool = True,
    histogram_bins: int = 256
) -> Dict[str, Any]:
    """
    Extract features from images using CNN embeddings, color histograms, and other methods.

    Args:
        image_paths: List of paths to image files.
        method: Feature extraction method ('cnn', 'color', 'texture', 'hybrid').
            'cnn' falls back to the color branch when PyTorch is unavailable.
        model_name: Pre-trained model for CNN features ('resnet50', 'efficientnet_b0', 'vgg16').
        color_spaces: Color spaces for histograms (subset of ['rgb', 'hsv', 'lab']);
            defaults to ['rgb', 'hsv'].
        include_histograms: Whether to include per-channel color histograms in the
            'color'/'hybrid' feature vector (basic image stats are always included).
        histogram_bins: Number of bins for each channel histogram.

    Returns:
        Dict with 'method', 'n_images', 'features' (one record per readable image,
        each with 'image_path', 'feature_vector', 'feature_dim'), 'feature_dim',
        and 'failed_images' (per-image error records).

    Raises:
        ValueError: If image_paths is empty or the method/model is unknown.
        ImportError: If method='texture' and OpenCV is not installed.
    """
    print(f"🔍 Extracting image features using {method} method...")

    if not image_paths:
        raise ValueError("No image paths provided")

    result = {
        "method": method,
        "n_images": len(image_paths),
        "features": [],
        "feature_dim": 0,
        "failed_images": []
    }

    try:
        if method == "cnn" and TORCH_AVAILABLE:
            print(f" Using CNN model: {model_name}")

            # Load a pre-trained backbone and strip its classification head so
            # the forward pass yields an embedding instead of class logits.
            if model_name == "resnet50":
                model = models.resnet50(pretrained=True)
                model = torch.nn.Sequential(*list(model.children())[:-1])
            elif model_name == "efficientnet_b0":
                model = models.efficientnet_b0(pretrained=True)
                model = torch.nn.Sequential(*list(model.children())[:-1])
            elif model_name == "vgg16":
                model = models.vgg16(pretrained=True)
                # VGG keeps its head in .classifier; drop only the final linear layer.
                model.classifier = torch.nn.Sequential(*list(model.classifier.children())[:-1])
            else:
                raise ValueError(f"Unknown model '{model_name}'")

            model.eval()

            # Standard ImageNet preprocessing (matches the pretrained weights).
            preprocess = transforms.Compose([
                transforms.Resize(256),
                transforms.CenterCrop(224),
                transforms.ToTensor(),
                transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
            ])

            for img_path in image_paths:
                try:
                    img = Image.open(img_path).convert('RGB')
                    img_tensor = preprocess(img).unsqueeze(0)

                    with torch.no_grad():
                        features = model(img_tensor)
                        features = features.squeeze().numpy()

                    result["features"].append({
                        "image_path": img_path,
                        "feature_vector": features.tolist(),
                        "feature_dim": len(features)
                    })

                except Exception as e:
                    result["failed_images"].append({"path": img_path, "error": str(e)})

        # BUGFIX: the original condition was
        #   `method in ["color", "hybrid"] or not TORCH_AVAILABLE`
        # which silently rerouted 'texture' (and even unknown methods) into this
        # color branch whenever PyTorch was missing. Only 'cnn' should fall back.
        elif method in ("color", "hybrid") or (method == "cnn" and not TORCH_AVAILABLE):
            print(" Using color histogram features...")

            if not CV2_AVAILABLE:
                print("⚠️ OpenCV not available. Using PIL for basic features...")
                return _extract_features_basic(image_paths)

            color_spaces = color_spaces or ['rgb', 'hsv']

            for img_path in image_paths:
                try:
                    # Read image
                    img = cv2.imread(img_path)
                    if img is None:
                        raise ValueError(f"Could not read image: {img_path}")

                    feature_vector = []

                    # BUGFIX: `include_histograms` was accepted but ignored;
                    # honor it (default True preserves previous behavior).
                    if include_histograms:
                        # NOTE: cv2.imread yields BGR, so the 'rgb' histograms
                        # are per-channel in BGR order (kept for compatibility).
                        if 'rgb' in color_spaces:
                            for i in range(3):
                                hist = cv2.calcHist([img], [i], None, [histogram_bins], [0, 256])
                                feature_vector.extend(hist.flatten().tolist())

                        if 'hsv' in color_spaces:
                            hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
                            for i in range(3):
                                hist = cv2.calcHist([hsv], [i], None, [histogram_bins], [0, 256])
                                feature_vector.extend(hist.flatten().tolist())

                        if 'lab' in color_spaces:
                            lab = cv2.cvtColor(img, cv2.COLOR_BGR2LAB)
                            for i in range(3):
                                hist = cv2.calcHist([lab], [i], None, [histogram_bins], [0, 256])
                                feature_vector.extend(hist.flatten().tolist())

                    # Basic image stats
                    feature_vector.extend([
                        img.shape[0],  # height
                        img.shape[1],  # width
                        img.mean(),    # mean pixel value
                        img.std()      # std pixel value
                    ])

                    result["features"].append({
                        "image_path": img_path,
                        "feature_vector": feature_vector,
                        "feature_dim": len(feature_vector)
                    })

                except Exception as e:
                    result["failed_images"].append({"path": img_path, "error": str(e)})

        elif method == "texture":
            print(" Extracting texture features...")

            if not CV2_AVAILABLE:
                raise ImportError("OpenCV required for texture features")

            for img_path in image_paths:
                try:
                    img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
                    if img is None:
                        raise ValueError(f"Could not read image: {img_path}")

                    # Canny edge map summarizes local texture/structure.
                    edges = cv2.Canny(img, 100, 200)

                    feature_vector = [
                        edges.mean(),
                        edges.std(),
                        np.count_nonzero(edges) / edges.size,  # edge density
                        img.mean(),
                        img.std()
                    ]

                    result["features"].append({
                        "image_path": img_path,
                        "feature_vector": feature_vector,
                        "feature_dim": len(feature_vector)
                    })

                except Exception as e:
                    result["failed_images"].append({"path": img_path, "error": str(e)})

        else:
            raise ValueError(f"Unknown method '{method}' or required libraries not available")

        # All vectors in a run share one layout, so the first record fixes the
        # reported dimensionality (this check was duplicated per-branch before).
        if result["features"]:
            result["feature_dim"] = result["features"][0]["feature_dim"]

        print(f"✅ Feature extraction complete!")
        print(f" Processed: {len(result['features'])} images")
        print(f" Failed: {len(result['failed_images'])} images")
        print(f" Feature dimension: {result['feature_dim']}")

        return result

    except Exception as e:
        print(f"❌ Error during feature extraction: {str(e)}")
        raise
|
|
230
|
+
|
|
231
|
+
|
|
232
|
+
def _extract_features_basic(image_paths: List[str]) -> Dict[str, Any]:
|
|
233
|
+
"""Fallback feature extraction using PIL when OpenCV/PyTorch not available."""
|
|
234
|
+
|
|
235
|
+
result = {
|
|
236
|
+
"method": "basic_pil",
|
|
237
|
+
"n_images": len(image_paths),
|
|
238
|
+
"features": [],
|
|
239
|
+
"feature_dim": 0,
|
|
240
|
+
"failed_images": []
|
|
241
|
+
}
|
|
242
|
+
|
|
243
|
+
for img_path in image_paths:
|
|
244
|
+
try:
|
|
245
|
+
img = Image.open(img_path).convert('RGB')
|
|
246
|
+
img_array = np.array(img)
|
|
247
|
+
|
|
248
|
+
# Basic statistics per channel
|
|
249
|
+
feature_vector = []
|
|
250
|
+
for channel in range(3):
|
|
251
|
+
channel_data = img_array[:, :, channel]
|
|
252
|
+
feature_vector.extend([
|
|
253
|
+
channel_data.mean(),
|
|
254
|
+
channel_data.std(),
|
|
255
|
+
channel_data.min(),
|
|
256
|
+
channel_data.max()
|
|
257
|
+
])
|
|
258
|
+
|
|
259
|
+
# Image dimensions
|
|
260
|
+
feature_vector.extend([img_array.shape[0], img_array.shape[1]])
|
|
261
|
+
|
|
262
|
+
result["features"].append({
|
|
263
|
+
"image_path": img_path,
|
|
264
|
+
"feature_vector": feature_vector,
|
|
265
|
+
"feature_dim": len(feature_vector)
|
|
266
|
+
})
|
|
267
|
+
|
|
268
|
+
except Exception as e:
|
|
269
|
+
result["failed_images"].append({"path": img_path, "error": str(e)})
|
|
270
|
+
|
|
271
|
+
if result["features"]:
|
|
272
|
+
result["feature_dim"] = result["features"][0]["feature_dim"]
|
|
273
|
+
|
|
274
|
+
result["note"] = "Install torch, torchvision, and opencv for advanced features"
|
|
275
|
+
|
|
276
|
+
return result
|
|
277
|
+
|
|
278
|
+
|
|
279
|
+
def perform_image_clustering(
    features: Dict[str, Any],
    n_clusters: int = 5,
    method: str = "kmeans",
    reduce_dimensions: bool = True,
    target_dim: int = 50,
    return_similar_pairs: bool = True,
    top_k: int = 10
) -> Dict[str, Any]:
    """
    Group images by feature similarity and optionally report near-duplicate pairs.

    Args:
        features: Output from extract_image_features.
        n_clusters: Number of clusters (KMeans only; DBSCAN infers its own).
        method: Clustering method ('kmeans', 'dbscan').
        reduce_dimensions: Whether to PCA-reduce features before clustering.
        target_dim: Target dimensionality for the PCA reduction.
        return_similar_pairs: Whether to return the most similar image pairs.
        top_k: How many top similar pairs to report.

    Returns:
        Dict with per-cluster assignments (and KMeans centroids/representatives),
        optional 'similar_pairs', and 'tsne_embeddings' when >= 30 images.

    Raises:
        ValueError: If no features are supplied or the method is unknown.
    """
    print(f"🔍 Clustering images using {method}...")

    if not features.get("features"):
        raise ValueError("No features provided for clustering")

    records = features["features"]
    matrix = np.array([rec["feature_vector"] for rec in records])
    paths = [rec["image_path"] for rec in records]

    print(f" Feature matrix shape: {matrix.shape}")

    result = {
        "method": method,
        "n_images": len(paths),
        "n_clusters": n_clusters,
        "clusters": []
    }

    try:
        # Standardize so no single feature dominates the distance metric.
        scaled = StandardScaler().fit_transform(matrix)

        # Optional PCA reduction before clustering.
        if reduce_dimensions and scaled.shape[1] > target_dim:
            print(f" Reducing dimensions from {scaled.shape[1]} to {target_dim}...")
            reducer = PCA(n_components=target_dim)
            reduced = reducer.fit_transform(scaled)
            result["explained_variance"] = float(reducer.explained_variance_ratio_.sum())
            print(f" Explained variance: {result['explained_variance']:.3f}")
        else:
            reduced = scaled

        # Fit the requested clusterer.
        if method == "kmeans":
            clusterer = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
            labels = clusterer.fit_predict(reduced)
            result["cluster_centers"] = clusterer.cluster_centers_.tolist()
            result["inertia"] = float(clusterer.inertia_)
        elif method == "dbscan":
            clusterer = DBSCAN(eps=0.5, min_samples=5)
            labels = clusterer.fit_predict(reduced)
            # DBSCAN decides the cluster count itself; -1 marks noise.
            n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
            result["n_clusters"] = n_clusters
            result["n_noise_points"] = int((labels == -1).sum())
        else:
            raise ValueError(f"Unknown method '{method}'. Use 'kmeans' or 'dbscan'")

        # Summarize each cluster (noise label -1 included for DBSCAN).
        for label in sorted(set(labels)):
            member_idx = np.where(labels == label)[0]
            member_paths = [paths[i] for i in member_idx]

            entry = {
                "cluster_id": int(label),
                "size": len(member_paths),
                "images": member_paths[:100]  # Limit to first 100
            }

            if method == "kmeans":
                # The members closest to the centroid serve as exemplars.
                center = clusterer.cluster_centers_[label]
                dist = np.linalg.norm(reduced[member_idx] - center, axis=1)
                entry["representative_images"] = [
                    member_paths[i] for i in dist.argsort()[:5]
                ]

            result["clusters"].append(entry)

        # Optional near-duplicate report via cosine similarity.
        if return_similar_pairs:
            print(f" Finding top {top_k} similar image pairs...")

            from sklearn.metrics.pairwise import cosine_similarity

            sim = cosine_similarity(reduced)
            # Upper triangle only: skips self-pairs and mirrored duplicates.
            rows, cols = np.triu_indices(len(paths), k=1)
            pair_scores = sim[(rows, cols)]
            best = pair_scores.argsort()[-top_k:][::-1]

            result["similar_pairs"] = [
                {
                    "image1": paths[rows[idx]],
                    "image2": paths[cols[idx]],
                    "similarity": float(pair_scores[idx])
                }
                for idx in best
            ]

        # 2-D embedding for plotting, only with enough samples for t-SNE.
        if len(paths) >= 30:
            print(" Computing t-SNE for visualization...")
            projector = TSNE(n_components=2, random_state=42, perplexity=min(30, len(paths) - 1))
            result["tsne_embeddings"] = projector.fit_transform(reduced).tolist()

        print(f"✅ Clustering complete!")
        print(f" Clusters: {len(result['clusters'])}")
        for cluster in result["clusters"]:
            print(f" Cluster {cluster['cluster_id']}: {cluster['size']} images")

        return result

    except Exception as e:
        print(f"❌ Error during clustering: {str(e)}")
        raise
|
|
423
|
+
|
|
424
|
+
|
|
425
|
+
def analyze_tabular_image_hybrid(
    tabular_data: "pl.DataFrame",
    image_column: str,
    target_column: Optional[str] = None,
    tabular_features: Optional[List[str]] = None,
    fusion_method: str = "concatenate",
    model_type: str = "classification",
    test_size: float = 0.2
) -> Dict[str, Any]:
    """
    Analyze datasets with both tabular and image data using multi-modal learning.

    Args:
        tabular_data: DataFrame with tabular features and image paths.
        image_column: Column containing image file paths.
        target_column: Target variable column (if supervised learning).
        tabular_features: List of tabular feature columns (if None, uses all
            except the image/target columns).
        fusion_method: How to combine features ('concatenate', 'early', 'late').
        model_type: Type of task ('classification', 'regression').
        test_size: Proportion of data for testing.

    Returns:
        Dict with model performance, feature importance, and fusion metadata.
        'n_used_samples' reports rows remaining after unreadable images are dropped.

    Raises:
        ValueError: If a named column is missing, the fusion method or model type
            is unknown, or rows cannot be aligned after image failures.
    """
    print(f"🔍 Analyzing hybrid tabular-image data...")

    # Validate input
    if image_column not in tabular_data.columns:
        raise ValueError(f"Image column '{image_column}' not found in DataFrame")

    if target_column and target_column not in tabular_data.columns:
        raise ValueError(f"Target column '{target_column}' not found in DataFrame")

    # Determine tabular features
    if tabular_features is None:
        exclude_cols = [image_column]
        if target_column:
            exclude_cols.append(target_column)
        tabular_features = [col for col in tabular_data.columns if col not in exclude_cols]

    print(f" Tabular features: {len(tabular_features)}")
    print(f" Image column: {image_column}")
    print(f" Target column: {target_column}")

    result = {
        "n_samples": tabular_data.shape[0],
        "n_tabular_features": len(tabular_features),
        "fusion_method": fusion_method,
        "model_type": model_type
    }

    try:
        # Step 1: Extract image features
        print("\n Step 1: Extracting image features...")
        image_paths = tabular_data[image_column].to_list()

        # Use CNN features if available, otherwise color histograms
        method = "cnn" if TORCH_AVAILABLE else "color"
        # BUGFIX: model_name was `None` when torch was unavailable, violating
        # the str parameter contract; the color branch ignores it, so always
        # pass a valid name.
        image_features_result = extract_image_features(
            image_paths,
            method=method,
            model_name="resnet50"
        )

        # BUGFIX: the original stacked image features (successful images only)
        # against tabular rows for ALL images, silently misaligning every row
        # after the first failed image. Drop the failed rows on the tabular
        # side too so the matrices stay row-aligned.
        failed_paths = {f["path"] for f in image_features_result["failed_images"]}
        keep_rows = np.array(
            [i for i, p in enumerate(image_paths) if p not in failed_paths],
            dtype=int
        )
        if failed_paths:
            print(f" ⚠️ Dropping {tabular_data.shape[0] - len(keep_rows)} rows with unreadable images")
        result["n_used_samples"] = int(len(keep_rows))

        image_feature_matrix = np.array([
            f["feature_vector"] for f in image_features_result["features"]
        ])

        # Duplicate paths with mixed success could still desynchronize rows;
        # fail loudly rather than train on misaligned data.
        if image_feature_matrix.shape[0] != len(keep_rows):
            raise ValueError(
                "Could not align image features with tabular rows "
                f"({image_feature_matrix.shape[0]} feature rows vs {len(keep_rows)} kept rows)"
            )

        print(f" Image features shape: {image_feature_matrix.shape}")

        # Step 2: Prepare tabular features
        print("\n Step 2: Preparing tabular features...")
        tabular_feature_matrix = tabular_data.select(tabular_features).to_numpy()[keep_rows]

        # Handle missing values (mean imputation over the rows actually used)
        from sklearn.impute import SimpleImputer
        imputer = SimpleImputer(strategy='mean')
        tabular_feature_matrix = imputer.fit_transform(tabular_feature_matrix)

        print(f" Tabular features shape: {tabular_feature_matrix.shape}")

        # Step 3: Fusion
        print(f"\n Step 3: Fusing features using '{fusion_method}' method...")

        if fusion_method in ("concatenate", "early"):
            # Simple concatenation
            combined_features = np.hstack([tabular_feature_matrix, image_feature_matrix])
            result["combined_feature_dim"] = combined_features.shape[1]
        elif fusion_method == "late":
            # Late-fusion placeholder: train on tabular features only and
            # record the image dimensionality separately.
            combined_features = tabular_feature_matrix
            result["combined_feature_dim"] = tabular_feature_matrix.shape[1]
            result["image_feature_dim"] = image_feature_matrix.shape[1]
        else:
            raise ValueError(f"Unknown fusion method '{fusion_method}'")

        print(f" Combined features shape: {combined_features.shape}")

        # Step 4: Train model (if target provided)
        if target_column:
            print(f"\n Step 4: Training {model_type} model...")

            target = tabular_data[target_column].to_numpy()[keep_rows]

            # Split data
            from sklearn.model_selection import train_test_split

            X_train, X_test, y_train, y_test = train_test_split(
                combined_features, target, test_size=test_size, random_state=42
            )

            # Train model
            if model_type == "classification":
                from sklearn.ensemble import RandomForestClassifier
                from sklearn.metrics import accuracy_score, classification_report

                model = RandomForestClassifier(n_estimators=100, random_state=42)
                model.fit(X_train, y_train)

                train_pred = model.predict(X_train)
                test_pred = model.predict(X_test)

                result["train_accuracy"] = float(accuracy_score(y_train, train_pred))
                result["test_accuracy"] = float(accuracy_score(y_test, test_pred))
                result["classification_report"] = classification_report(
                    y_test, test_pred, output_dict=True
                )

            elif model_type == "regression":
                from sklearn.ensemble import RandomForestRegressor
                from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

                model = RandomForestRegressor(n_estimators=100, random_state=42)
                model.fit(X_train, y_train)

                train_pred = model.predict(X_train)
                test_pred = model.predict(X_test)

                result["train_rmse"] = float(np.sqrt(mean_squared_error(y_train, train_pred)))
                result["test_rmse"] = float(np.sqrt(mean_squared_error(y_test, test_pred)))
                result["train_r2"] = float(r2_score(y_train, train_pred))
                result["test_r2"] = float(r2_score(y_test, test_pred))
                result["test_mae"] = float(mean_absolute_error(y_test, test_pred))

            else:
                # BUGFIX: an unknown model_type previously fell through and
                # surfaced later as a confusing NameError on `model`.
                raise ValueError(f"Unknown model type '{model_type}'")

            # Feature importance — BUGFIX: 'early' fusion builds the identical
            # concatenated matrix but was previously excluded from this report.
            if fusion_method in ("concatenate", "early"):
                feature_names = tabular_features + [
                    f"image_feat_{i}" for i in range(image_feature_matrix.shape[1])
                ]

                # Top 20 most important features
                importances = model.feature_importances_
                top_indices = importances.argsort()[-20:][::-1]

                result["top_features"] = [
                    {
                        "feature": feature_names[i],
                        "importance": float(importances[i])
                    }
                    for i in top_indices
                ]

                # Compare tabular vs image feature importance
                tabular_importance = importances[:len(tabular_features)].sum()
                image_importance = importances[len(tabular_features):].sum()

                result["feature_importance_split"] = {
                    "tabular": float(tabular_importance),
                    "image": float(image_importance),
                    "tabular_percentage": float(tabular_importance / importances.sum() * 100),
                    "image_percentage": float(image_importance / importances.sum() * 100)
                }

        print(f"\n✅ Hybrid analysis complete!")
        if target_column:
            if model_type == "classification":
                print(f" Test accuracy: {result['test_accuracy']:.4f}")
            else:
                print(f" Test R²: {result['test_r2']:.4f}")
                print(f" Test RMSE: {result['test_rmse']:.4f}")

        return result

    except Exception as e:
        print(f"❌ Error during hybrid analysis: {str(e)}")
        raise
|