ds-agent-cli 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. package/bin/ds-agent.js +451 -0
  2. package/ds_agent/__init__.py +8 -0
  3. package/package.json +28 -0
  4. package/requirements.txt +126 -0
  5. package/setup.py +35 -0
  6. package/src/__init__.py +7 -0
  7. package/src/_compress_tool_result.py +118 -0
  8. package/src/api/__init__.py +4 -0
  9. package/src/api/app.py +1626 -0
  10. package/src/cache/__init__.py +5 -0
  11. package/src/cache/cache_manager.py +561 -0
  12. package/src/cli.py +2886 -0
  13. package/src/dynamic_prompts.py +281 -0
  14. package/src/orchestrator.py +4799 -0
  15. package/src/progress_manager.py +139 -0
  16. package/src/reasoning/__init__.py +332 -0
  17. package/src/reasoning/business_summary.py +431 -0
  18. package/src/reasoning/data_understanding.py +356 -0
  19. package/src/reasoning/model_explanation.py +383 -0
  20. package/src/reasoning/reasoning_trace.py +239 -0
  21. package/src/registry/__init__.py +3 -0
  22. package/src/registry/tools_registry.py +3 -0
  23. package/src/session_memory.py +448 -0
  24. package/src/session_store.py +370 -0
  25. package/src/storage/__init__.py +19 -0
  26. package/src/storage/artifact_store.py +620 -0
  27. package/src/storage/helpers.py +116 -0
  28. package/src/storage/huggingface_storage.py +694 -0
  29. package/src/storage/r2_storage.py +0 -0
  30. package/src/storage/user_files_service.py +288 -0
  31. package/src/tools/__init__.py +335 -0
  32. package/src/tools/advanced_analysis.py +823 -0
  33. package/src/tools/advanced_feature_engineering.py +708 -0
  34. package/src/tools/advanced_insights.py +578 -0
  35. package/src/tools/advanced_preprocessing.py +549 -0
  36. package/src/tools/advanced_training.py +906 -0
  37. package/src/tools/agent_tool_mapping.py +326 -0
  38. package/src/tools/auto_pipeline.py +420 -0
  39. package/src/tools/autogluon_training.py +1480 -0
  40. package/src/tools/business_intelligence.py +860 -0
  41. package/src/tools/cloud_data_sources.py +581 -0
  42. package/src/tools/code_interpreter.py +390 -0
  43. package/src/tools/computer_vision.py +614 -0
  44. package/src/tools/data_cleaning.py +614 -0
  45. package/src/tools/data_profiling.py +593 -0
  46. package/src/tools/data_type_conversion.py +268 -0
  47. package/src/tools/data_wrangling.py +433 -0
  48. package/src/tools/eda_reports.py +284 -0
  49. package/src/tools/enhanced_feature_engineering.py +241 -0
  50. package/src/tools/feature_engineering.py +302 -0
  51. package/src/tools/matplotlib_visualizations.py +1327 -0
  52. package/src/tools/model_training.py +520 -0
  53. package/src/tools/nlp_text_analytics.py +761 -0
  54. package/src/tools/plotly_visualizations.py +497 -0
  55. package/src/tools/production_mlops.py +852 -0
  56. package/src/tools/time_series.py +507 -0
  57. package/src/tools/tools_registry.py +2133 -0
  58. package/src/tools/visualization_engine.py +559 -0
  59. package/src/utils/__init__.py +42 -0
  60. package/src/utils/error_recovery.py +313 -0
  61. package/src/utils/parallel_executor.py +402 -0
  62. package/src/utils/polars_helpers.py +248 -0
  63. package/src/utils/schema_extraction.py +132 -0
  64. package/src/utils/semantic_layer.py +392 -0
  65. package/src/utils/token_budget.py +411 -0
  66. package/src/utils/validation.py +377 -0
  67. package/src/workflow_state.py +154 -0
@@ -0,0 +1,614 @@
1
+ """
2
+ Computer Vision & Image Analytics Tools
3
+
4
+ Advanced computer vision tools for image feature extraction, clustering,
5
+ and hybrid tabular-image analysis.
6
+ """
7
+
8
+ import polars as pl
9
+ import numpy as np
10
+ from typing import Dict, Any, List, Optional, Tuple
11
+ from pathlib import Path
12
+ import json
13
+
14
+ # Core CV libraries (optional)
15
+ try:
16
+ from PIL import Image
17
+ import cv2
18
+ CV2_AVAILABLE = True
19
+ except ImportError:
20
+ CV2_AVAILABLE = False
21
+
22
+ try:
23
+ import torch
24
+ import torchvision
25
+ from torchvision import models, transforms
26
+ TORCH_AVAILABLE = True
27
+ except ImportError:
28
+ TORCH_AVAILABLE = False
29
+
30
+ # ML libraries
31
+ try:
32
+ from sklearn.cluster import KMeans, DBSCAN
33
+ from sklearn.decomposition import PCA
34
+ from sklearn.preprocessing import StandardScaler
35
+ from sklearn.manifold import TSNE
36
+ except ImportError:
37
+ pass
38
+
39
+
40
def extract_image_features(
    image_paths: List[str],
    method: str = "cnn",
    model_name: str = "resnet50",
    color_spaces: Optional[List[str]] = None,
    include_histograms: bool = True,
    histogram_bins: int = 256
) -> Dict[str, Any]:
    """
    Extract features from images using CNN embeddings, color histograms, and other methods.

    Args:
        image_paths: List of paths to image files
        method: Feature extraction method ('cnn', 'color', 'texture', 'hybrid')
        model_name: Pre-trained model for CNN features ('resnet50', 'efficientnet_b0', 'vgg16')
        color_spaces: Color spaces for histograms (['rgb', 'hsv', 'lab']); defaults to rgb + hsv
        include_histograms: Whether to include color histograms (color/hybrid methods only)
        histogram_bins: Number of bins per channel histogram

    Returns:
        Dictionary with 'method', 'n_images', 'features' (one entry per image
        that was processed successfully: 'image_path', 'feature_vector',
        'feature_dim'), overall 'feature_dim', and 'failed_images'
        (path + error string for each failure).

    Raises:
        ValueError: If image_paths is empty, or method/model_name is unknown.
        ImportError: If method='texture' and OpenCV is not installed.
    """
    print(f"🔍 Extracting image features using {method} method...")

    if not image_paths:
        raise ValueError("No image paths provided")

    result = {
        "method": method,
        "n_images": len(image_paths),
        "features": [],
        "feature_dim": 0,
        "failed_images": []
    }

    try:
        if method == "cnn" and TORCH_AVAILABLE:
            print(f" Using CNN model: {model_name}")

            # Load a pre-trained model and strip the classification head so the
            # network outputs an embedding rather than class logits.
            # NOTE(review): `pretrained=True` is deprecated in torchvision >= 0.13
            # in favor of `weights=...`; kept for compatibility with older versions.
            if model_name == "resnet50":
                model = models.resnet50(pretrained=True)
                # Remove final classification layer
                model = torch.nn.Sequential(*list(model.children())[:-1])
            elif model_name == "efficientnet_b0":
                model = models.efficientnet_b0(pretrained=True)
                model = torch.nn.Sequential(*list(model.children())[:-1])
            elif model_name == "vgg16":
                # VGG keeps its conv backbone; only the last classifier layer is dropped
                model = models.vgg16(pretrained=True)
                model.classifier = torch.nn.Sequential(*list(model.classifier.children())[:-1])
            else:
                raise ValueError(f"Unknown model '{model_name}'")

            model.eval()

            # Standard ImageNet preprocessing pipeline
            preprocess = transforms.Compose([
                transforms.Resize(256),
                transforms.CenterCrop(224),
                transforms.ToTensor(),
                transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
            ])

            # Extract features image-by-image; per-image failures are recorded,
            # not fatal.
            # NOTE(review): this path uses PIL's `Image`, which is imported in
            # the same try-block as cv2 — if PIL is installed but cv2 is not,
            # `Image` is undefined here and every image will land in
            # failed_images with a NameError. Consider importing PIL separately.
            for img_path in image_paths:
                try:
                    img = Image.open(img_path).convert('RGB')
                    img_tensor = preprocess(img).unsqueeze(0)

                    with torch.no_grad():
                        features = model(img_tensor)
                        features = features.squeeze().numpy()

                    result["features"].append({
                        "image_path": img_path,
                        "feature_vector": features.tolist(),
                        "feature_dim": len(features)
                    })

                except Exception as e:
                    result["failed_images"].append({"path": img_path, "error": str(e)})

            if result["features"]:
                result["feature_dim"] = result["features"][0]["feature_dim"]

        elif method in ["color", "hybrid"] or not TORCH_AVAILABLE:
            print(" Using color histogram features...")

            if not CV2_AVAILABLE:
                print("⚠️ OpenCV not available. Using PIL for basic features...")
                return _extract_features_basic(image_paths)

            color_spaces = color_spaces or ['rgb', 'hsv']

            for img_path in image_paths:
                try:
                    # Read image (BGR channel order, per OpenCV convention)
                    img = cv2.imread(img_path)
                    if img is None:
                        raise ValueError(f"Could not read image: {img_path}")

                    feature_vector = []

                    # Color histograms.
                    # FIX: previously computed unconditionally — the
                    # `include_histograms` flag was accepted but ignored.
                    if include_histograms:
                        if 'rgb' in color_spaces:
                            for i in range(3):
                                hist = cv2.calcHist([img], [i], None, [histogram_bins], [0, 256])
                                feature_vector.extend(hist.flatten().tolist())

                        if 'hsv' in color_spaces:
                            # NOTE(review): OpenCV hue is in [0, 180); using a
                            # [0, 256] range leaves the upper hue bins empty.
                            # Harmless but wastes resolution — confirm intent.
                            hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
                            for i in range(3):
                                hist = cv2.calcHist([hsv], [i], None, [histogram_bins], [0, 256])
                                feature_vector.extend(hist.flatten().tolist())

                        if 'lab' in color_spaces:
                            lab = cv2.cvtColor(img, cv2.COLOR_BGR2LAB)
                            for i in range(3):
                                hist = cv2.calcHist([lab], [i], None, [histogram_bins], [0, 256])
                                feature_vector.extend(hist.flatten().tolist())

                    # Basic image stats
                    feature_vector.extend([
                        img.shape[0],  # height
                        img.shape[1],  # width
                        img.mean(),    # mean pixel value
                        img.std()      # std pixel value
                    ])

                    result["features"].append({
                        "image_path": img_path,
                        "feature_vector": feature_vector,
                        "feature_dim": len(feature_vector)
                    })

                except Exception as e:
                    result["failed_images"].append({"path": img_path, "error": str(e)})

            if result["features"]:
                result["feature_dim"] = result["features"][0]["feature_dim"]

        elif method == "texture":
            print(" Extracting texture features...")

            if not CV2_AVAILABLE:
                raise ImportError("OpenCV required for texture features")

            for img_path in image_paths:
                try:
                    img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
                    if img is None:
                        raise ValueError(f"Could not read image: {img_path}")

                    # Edge detection (Canny with fixed 100/200 thresholds)
                    edges = cv2.Canny(img, 100, 200)

                    # Texture features: edge statistics + grayscale statistics
                    feature_vector = [
                        edges.mean(),
                        edges.std(),
                        np.count_nonzero(edges) / edges.size,  # edge density
                        img.mean(),
                        img.std()
                    ]

                    result["features"].append({
                        "image_path": img_path,
                        "feature_vector": feature_vector,
                        "feature_dim": len(feature_vector)
                    })

                except Exception as e:
                    result["failed_images"].append({"path": img_path, "error": str(e)})

            if result["features"]:
                result["feature_dim"] = result["features"][0]["feature_dim"]

        else:
            raise ValueError(f"Unknown method '{method}' or required libraries not available")

        print(f"✅ Feature extraction complete!")
        print(f" Processed: {len(result['features'])} images")
        print(f" Failed: {len(result['failed_images'])} images")
        print(f" Feature dimension: {result['feature_dim']}")

        return result

    except Exception as e:
        print(f"❌ Error during feature extraction: {str(e)}")
        raise
230
+
231
+
232
def _extract_features_basic(image_paths: List[str]) -> Dict[str, Any]:
    """Fallback feature extraction using PIL when OpenCV/PyTorch not available."""

    extracted: List[Dict[str, Any]] = []
    failures: List[Dict[str, str]] = []

    for path in image_paths:
        try:
            pixels = np.array(Image.open(path).convert('RGB'))

            # Per-channel summary statistics (mean/std/min/max for R, G, B)
            stats = []
            for channel in range(3):
                band = pixels[:, :, channel]
                stats.extend([band.mean(), band.std(), band.min(), band.max()])

            # Image dimensions (height, width)
            stats.extend([pixels.shape[0], pixels.shape[1]])

            extracted.append({
                "image_path": path,
                "feature_vector": stats,
                "feature_dim": len(stats)
            })
        except Exception as exc:
            failures.append({"path": path, "error": str(exc)})

    return {
        "method": "basic_pil",
        "n_images": len(image_paths),
        "features": extracted,
        "feature_dim": extracted[0]["feature_dim"] if extracted else 0,
        "failed_images": failures,
        "note": "Install torch, torchvision, and opencv for advanced features"
    }
277
+
278
+
279
def perform_image_clustering(
    features: Dict[str, Any],
    n_clusters: int = 5,
    method: str = "kmeans",
    reduce_dimensions: bool = True,
    target_dim: int = 50,
    return_similar_pairs: bool = True,
    top_k: int = 10
) -> Dict[str, Any]:
    """
    Cluster images based on extracted features and find similar images.

    Args:
        features: Output from extract_image_features
        n_clusters: Number of clusters (kmeans only; clamped to the image count)
        method: Clustering method ('kmeans', 'dbscan')
        reduce_dimensions: Whether to PCA-reduce features before clustering
        target_dim: Target dimensionality for PCA reduction
        return_similar_pairs: Whether to return most similar image pairs
        top_k: Number of top similar pairs to return

    Returns:
        Dictionary with 'clusters' (per-cluster membership and, for kmeans,
        representative images), 'labels' (per-image assignment aligned with
        the input feature order), kmeans centroids/inertia or dbscan noise
        counts, and optional 'similar_pairs' / 'tsne_embeddings'.

    Raises:
        ValueError: If `features` contains no feature entries or method is unknown.
    """
    print(f"🔍 Clustering images using {method}...")

    if not features.get("features"):
        raise ValueError("No features provided for clustering")

    # Extract feature vectors
    feature_vectors = np.array([f["feature_vector"] for f in features["features"]])
    image_paths = [f["image_path"] for f in features["features"]]

    print(f" Feature matrix shape: {feature_vectors.shape}")

    # FIX: KMeans raises an opaque sklearn error when n_clusters exceeds the
    # number of samples; clamp it up-front so small inputs still cluster.
    if method == "kmeans" and n_clusters > len(image_paths):
        print(f" ⚠️ Reducing n_clusters from {n_clusters} to {len(image_paths)}")
        n_clusters = len(image_paths)

    result = {
        "method": method,
        "n_images": len(image_paths),
        "n_clusters": n_clusters,
        "clusters": []
    }

    try:
        # Normalize features so no single dimension dominates the distance metric
        scaler = StandardScaler()
        feature_vectors_scaled = scaler.fit_transform(feature_vectors)

        # Dimensionality reduction
        if reduce_dimensions and feature_vectors_scaled.shape[1] > target_dim:
            print(f" Reducing dimensions from {feature_vectors_scaled.shape[1]} to {target_dim}...")
            pca = PCA(n_components=target_dim)
            feature_vectors_reduced = pca.fit_transform(feature_vectors_scaled)
            result["explained_variance"] = float(pca.explained_variance_ratio_.sum())
            print(f" Explained variance: {result['explained_variance']:.3f}")
        else:
            feature_vectors_reduced = feature_vectors_scaled

        # Clustering
        if method == "kmeans":
            clusterer = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
            labels = clusterer.fit_predict(feature_vectors_reduced)

            result["cluster_centers"] = clusterer.cluster_centers_.tolist()
            result["inertia"] = float(clusterer.inertia_)

        elif method == "dbscan":
            clusterer = DBSCAN(eps=0.5, min_samples=5)
            labels = clusterer.fit_predict(feature_vectors_reduced)

            # DBSCAN labels noise as -1; don't count noise as a cluster
            n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
            result["n_clusters"] = n_clusters
            result["n_noise_points"] = int((labels == -1).sum())

        else:
            raise ValueError(f"Unknown method '{method}'. Use 'kmeans' or 'dbscan'")

        # Per-image cluster assignments, aligned with the input feature order
        result["labels"] = [int(label) for label in labels]

        # Organize results by cluster
        for cluster_id in sorted(set(labels)):
            cluster_indices = np.where(labels == cluster_id)[0]
            cluster_images = [image_paths[i] for i in cluster_indices]

            cluster_info = {
                "cluster_id": int(cluster_id),
                "size": len(cluster_images),
                "images": cluster_images[:100]  # Limit to first 100
            }

            if method == "kmeans":
                # Calculate distances to centroid
                centroid = clusterer.cluster_centers_[cluster_id]
                distances = np.linalg.norm(feature_vectors_reduced[cluster_indices] - centroid, axis=1)

                # Representative images (closest to centroid)
                representative_indices = distances.argsort()[:5]
                cluster_info["representative_images"] = [
                    cluster_images[i] for i in representative_indices
                ]

            result["clusters"].append(cluster_info)

        # Find similar image pairs
        if return_similar_pairs:
            print(f" Finding top {top_k} similar image pairs...")

            from sklearn.metrics.pairwise import cosine_similarity

            similarity_matrix = cosine_similarity(feature_vectors_reduced)

            # Upper-triangle indices only: skips self-similarity and mirrored pairs
            triu_indices = np.triu_indices(len(image_paths), k=1)
            similarities = similarity_matrix[triu_indices]

            # Get top K most similar pairs
            top_indices = similarities.argsort()[-top_k:][::-1]

            similar_pairs = []
            for idx in top_indices:
                i, j = triu_indices[0][idx], triu_indices[1][idx]
                similar_pairs.append({
                    "image1": image_paths[i],
                    "image2": image_paths[j],
                    "similarity": float(similarities[idx])
                })

            result["similar_pairs"] = similar_pairs

        # Visualize with t-SNE (if enough samples; perplexity must be < n_samples)
        if len(image_paths) >= 30:
            print(" Computing t-SNE for visualization...")
            tsne = TSNE(n_components=2, random_state=42, perplexity=min(30, len(image_paths)-1))
            embeddings_2d = tsne.fit_transform(feature_vectors_reduced)

            result["tsne_embeddings"] = embeddings_2d.tolist()

        print(f"✅ Clustering complete!")
        print(f" Clusters: {len(result['clusters'])}")
        for cluster in result["clusters"]:
            print(f" Cluster {cluster['cluster_id']}: {cluster['size']} images")

        return result

    except Exception as e:
        print(f"❌ Error during clustering: {str(e)}")
        raise
423
+
424
+
425
def analyze_tabular_image_hybrid(
    tabular_data: pl.DataFrame,
    image_column: str,
    target_column: Optional[str] = None,
    tabular_features: Optional[List[str]] = None,
    fusion_method: str = "concatenate",
    model_type: str = "classification",
    test_size: float = 0.2
) -> Dict[str, Any]:
    """
    Analyze datasets with both tabular and image data using multi-modal learning.

    Args:
        tabular_data: DataFrame with tabular features and image paths
        image_column: Column containing image file paths
        target_column: Target variable column (if supervised learning)
        tabular_features: List of tabular feature columns (if None, uses all except image/target)
        fusion_method: How to combine features. 'concatenate'/'early' stack the
            tabular and image matrices side by side. 'late' currently trains on
            tabular features only — the separate image-side model is not
            implemented yet.
        model_type: Type of task ('classification', 'regression')
        test_size: Proportion of data for testing

    Returns:
        Dictionary containing model performance, feature importance split
        between tabular and image features, and metadata (sample counts,
        dropped rows, fusion method).

    Raises:
        ValueError: If required columns are missing, the fusion method is
            unknown, or no images could be processed.
    """
    print(f"🔍 Analyzing hybrid tabular-image data...")

    # Validate input
    if image_column not in tabular_data.columns:
        raise ValueError(f"Image column '{image_column}' not found in DataFrame")

    if target_column and target_column not in tabular_data.columns:
        raise ValueError(f"Target column '{target_column}' not found in DataFrame")

    # Determine tabular features
    if tabular_features is None:
        exclude_cols = [image_column]
        if target_column:
            exclude_cols.append(target_column)
        tabular_features = [col for col in tabular_data.columns if col not in exclude_cols]

    print(f" Tabular features: {len(tabular_features)}")
    print(f" Image column: {image_column}")
    print(f" Target column: {target_column}")

    result = {
        "n_samples": tabular_data.shape[0],
        "n_tabular_features": len(tabular_features),
        "fusion_method": fusion_method,
        "model_type": model_type
    }

    try:
        # Step 1: Extract image features
        print("\n Step 1: Extracting image features...")
        image_paths = tabular_data[image_column].to_list()

        # Use CNN features if available, otherwise color histograms
        # (model_name is ignored by the color fallback path)
        method = "cnn" if TORCH_AVAILABLE else "color"
        image_features_result = extract_image_features(
            image_paths,
            method=method,
            model_name="resnet50"
        )

        # FIX: extract_image_features only returns entries for images it could
        # process. Previously, any failed image silently misaligned the image
        # feature matrix against the tabular rows, corrupting every downstream
        # result. Drop the rows whose images failed so the two stay aligned.
        failed_paths = {f["path"] for f in image_features_result["failed_images"]}
        if failed_paths:
            keep_mask = [p not in failed_paths for p in image_paths]
            n_dropped = keep_mask.count(False)
            print(f" ⚠️ Dropping {n_dropped} rows with unreadable images")
            tabular_data = tabular_data.filter(pl.Series(keep_mask))
            result["n_dropped_rows"] = n_dropped

        if not image_features_result["features"]:
            raise ValueError("No images could be processed for feature extraction")

        # Build image feature matrix (row order matches the surviving rows)
        image_feature_matrix = np.array([
            f["feature_vector"] for f in image_features_result["features"]
        ])

        print(f" Image features shape: {image_feature_matrix.shape}")

        # Step 2: Prepare tabular features
        print("\n Step 2: Preparing tabular features...")
        tabular_feature_matrix = tabular_data.select(tabular_features).to_numpy()

        # Handle missing values
        # NOTE(review): mean imputation assumes all tabular feature columns are
        # numeric — non-numeric columns will make SimpleImputer raise.
        from sklearn.impute import SimpleImputer
        imputer = SimpleImputer(strategy='mean')
        tabular_feature_matrix = imputer.fit_transform(tabular_feature_matrix)

        print(f" Tabular features shape: {tabular_feature_matrix.shape}")

        # Step 3: Fusion
        print(f"\n Step 3: Fusing features using '{fusion_method}' method...")

        if fusion_method == "concatenate" or fusion_method == "early":
            # Simple concatenation
            combined_features = np.hstack([tabular_feature_matrix, image_feature_matrix])
            result["combined_feature_dim"] = combined_features.shape[1]

        elif fusion_method == "late":
            # Late fusion placeholder: only the tabular model is trained here;
            # the image-side model and prediction combining are not implemented.
            combined_features = tabular_feature_matrix
            result["combined_feature_dim"] = tabular_feature_matrix.shape[1]
            result["image_feature_dim"] = image_feature_matrix.shape[1]

        else:
            raise ValueError(f"Unknown fusion method '{fusion_method}'")

        print(f" Combined features shape: {combined_features.shape}")

        # Step 4: Train model (if target provided)
        if target_column:
            print(f"\n Step 4: Training {model_type} model...")

            target = tabular_data[target_column].to_numpy()

            # Split data
            from sklearn.model_selection import train_test_split

            X_train, X_test, y_train, y_test = train_test_split(
                combined_features, target, test_size=test_size, random_state=42
            )

            # Train model
            if model_type == "classification":
                from sklearn.ensemble import RandomForestClassifier
                model = RandomForestClassifier(n_estimators=100, random_state=42)
                model.fit(X_train, y_train)

                # Evaluate
                from sklearn.metrics import accuracy_score, classification_report

                train_pred = model.predict(X_train)
                test_pred = model.predict(X_test)

                result["train_accuracy"] = float(accuracy_score(y_train, train_pred))
                result["test_accuracy"] = float(accuracy_score(y_test, test_pred))

                # Classification report
                report = classification_report(y_test, test_pred, output_dict=True)
                result["classification_report"] = report

            elif model_type == "regression":
                from sklearn.ensemble import RandomForestRegressor
                model = RandomForestRegressor(n_estimators=100, random_state=42)
                model.fit(X_train, y_train)

                # Evaluate
                from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

                train_pred = model.predict(X_train)
                test_pred = model.predict(X_test)

                result["train_rmse"] = float(np.sqrt(mean_squared_error(y_train, train_pred)))
                result["test_rmse"] = float(np.sqrt(mean_squared_error(y_test, test_pred)))
                result["train_r2"] = float(r2_score(y_train, train_pred))
                result["test_r2"] = float(r2_score(y_test, test_pred))
                result["test_mae"] = float(mean_absolute_error(y_test, test_pred))

            # Feature importance
            if fusion_method == "concatenate":
                feature_names = tabular_features + [f"image_feat_{i}" for i in range(image_feature_matrix.shape[1])]

                # Top 20 most important features
                importances = model.feature_importances_
                top_indices = importances.argsort()[-20:][::-1]

                result["top_features"] = [
                    {
                        "feature": feature_names[i],
                        "importance": float(importances[i])
                    }
                    for i in top_indices
                ]

                # Compare tabular vs image feature importance
                tabular_importance = importances[:len(tabular_features)].sum()
                image_importance = importances[len(tabular_features):].sum()

                result["feature_importance_split"] = {
                    "tabular": float(tabular_importance),
                    "image": float(image_importance),
                    "tabular_percentage": float(tabular_importance / importances.sum() * 100),
                    "image_percentage": float(image_importance / importances.sum() * 100)
                }

        print(f"\n✅ Hybrid analysis complete!")
        if target_column:
            if model_type == "classification":
                print(f" Test accuracy: {result['test_accuracy']:.4f}")
            else:
                print(f" Test R²: {result['test_r2']:.4f}")
                print(f" Test RMSE: {result['test_rmse']:.4f}")

        return result

    except Exception as e:
        print(f"❌ Error during hybrid analysis: {str(e)}")
        raise