ds-agent-cli 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. package/bin/ds-agent.js +451 -0
  2. package/ds_agent/__init__.py +8 -0
  3. package/package.json +28 -0
  4. package/requirements.txt +126 -0
  5. package/setup.py +35 -0
  6. package/src/__init__.py +7 -0
  7. package/src/_compress_tool_result.py +118 -0
  8. package/src/api/__init__.py +4 -0
  9. package/src/api/app.py +1626 -0
  10. package/src/cache/__init__.py +5 -0
  11. package/src/cache/cache_manager.py +561 -0
  12. package/src/cli.py +2886 -0
  13. package/src/dynamic_prompts.py +281 -0
  14. package/src/orchestrator.py +4799 -0
  15. package/src/progress_manager.py +139 -0
  16. package/src/reasoning/__init__.py +332 -0
  17. package/src/reasoning/business_summary.py +431 -0
  18. package/src/reasoning/data_understanding.py +356 -0
  19. package/src/reasoning/model_explanation.py +383 -0
  20. package/src/reasoning/reasoning_trace.py +239 -0
  21. package/src/registry/__init__.py +3 -0
  22. package/src/registry/tools_registry.py +3 -0
  23. package/src/session_memory.py +448 -0
  24. package/src/session_store.py +370 -0
  25. package/src/storage/__init__.py +19 -0
  26. package/src/storage/artifact_store.py +620 -0
  27. package/src/storage/helpers.py +116 -0
  28. package/src/storage/huggingface_storage.py +694 -0
  29. package/src/storage/r2_storage.py +0 -0
  30. package/src/storage/user_files_service.py +288 -0
  31. package/src/tools/__init__.py +335 -0
  32. package/src/tools/advanced_analysis.py +823 -0
  33. package/src/tools/advanced_feature_engineering.py +708 -0
  34. package/src/tools/advanced_insights.py +578 -0
  35. package/src/tools/advanced_preprocessing.py +549 -0
  36. package/src/tools/advanced_training.py +906 -0
  37. package/src/tools/agent_tool_mapping.py +326 -0
  38. package/src/tools/auto_pipeline.py +420 -0
  39. package/src/tools/autogluon_training.py +1480 -0
  40. package/src/tools/business_intelligence.py +860 -0
  41. package/src/tools/cloud_data_sources.py +581 -0
  42. package/src/tools/code_interpreter.py +390 -0
  43. package/src/tools/computer_vision.py +614 -0
  44. package/src/tools/data_cleaning.py +614 -0
  45. package/src/tools/data_profiling.py +593 -0
  46. package/src/tools/data_type_conversion.py +268 -0
  47. package/src/tools/data_wrangling.py +433 -0
  48. package/src/tools/eda_reports.py +284 -0
  49. package/src/tools/enhanced_feature_engineering.py +241 -0
  50. package/src/tools/feature_engineering.py +302 -0
  51. package/src/tools/matplotlib_visualizations.py +1327 -0
  52. package/src/tools/model_training.py +520 -0
  53. package/src/tools/nlp_text_analytics.py +761 -0
  54. package/src/tools/plotly_visualizations.py +497 -0
  55. package/src/tools/production_mlops.py +852 -0
  56. package/src/tools/time_series.py +507 -0
  57. package/src/tools/tools_registry.py +2133 -0
  58. package/src/tools/visualization_engine.py +559 -0
  59. package/src/utils/__init__.py +42 -0
  60. package/src/utils/error_recovery.py +313 -0
  61. package/src/utils/parallel_executor.py +402 -0
  62. package/src/utils/polars_helpers.py +248 -0
  63. package/src/utils/schema_extraction.py +132 -0
  64. package/src/utils/semantic_layer.py +392 -0
  65. package/src/utils/token_budget.py +411 -0
  66. package/src/utils/validation.py +377 -0
  67. package/src/workflow_state.py +154 -0
@@ -0,0 +1,578 @@
1
+ """
2
+ Advanced Insights Tools
3
+ Tools for root cause analysis, trend detection, anomaly detection, and statistical testing.
4
+ """
5
+
6
+ import polars as pl
7
+ import numpy as np
8
+ import pandas as pd
9
+ from typing import Dict, Any, List, Optional, Tuple
10
+ from pathlib import Path
11
+ import sys
12
+ import os
13
+ from scipy import stats
14
+ from scipy.signal import find_peaks
15
+ from sklearn.ensemble import IsolationForest
16
+ from sklearn.cluster import KMeans
17
+ from sklearn.preprocessing import StandardScaler
18
+ import json
19
+
20
+ # Add parent directory to path
21
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
22
+
23
+ from ds_agent.utils.polars_helpers import load_dataframe, get_numeric_columns
24
+ from ds_agent.utils.validation import validate_file_exists, validate_file_format
25
+
26
+
27
def analyze_root_cause(file_path: str,
                       target_col: str,
                       time_col: Optional[str] = None,
                       threshold_drop: float = 0.15) -> Dict[str, Any]:
    """
    Perform root cause analysis to identify why a metric dropped.

    Args:
        file_path: Path to dataset
        target_col: Column to analyze (e.g., 'sales')
        time_col: Optional time column for trend analysis
        threshold_drop: Fractional drop to flag as significant (default 0.15 = 15%)

    Returns:
        Dictionary with root cause insights, or an error dictionary
        ({"status": "error", "message": ...}) when target_col is missing.
    """
    validate_file_exists(file_path)
    df = load_dataframe(file_path)

    # Convert to pandas for easier analysis
    df_pd = df.to_pandas()

    results = {
        "target_column": target_col,
        "analysis_type": "root_cause",
        "insights": [],
        "correlations": {},
        "top_factors": []
    }

    # Check if target exists
    if target_col not in df_pd.columns:
        return {"status": "error", "message": f"Column '{target_col}' not found"}

    # If time column exists, analyze temporal patterns
    if time_col and time_col in df_pd.columns:
        try:
            df_pd[time_col] = pd.to_datetime(df_pd[time_col])
            df_sorted = df_pd.sort_values(time_col)

            # Calculate period-over-period changes (needs enough rows to split)
            if len(df_sorted) > 10:
                mid_point = len(df_sorted) // 2
                first_half_mean = df_sorted[target_col].iloc[:mid_point].mean()
                second_half_mean = df_sorted[target_col].iloc[mid_point:].mean()

                # BUGFIX: guard against ZeroDivisionError when the first
                # period's average is exactly 0.
                if first_half_mean != 0:
                    change_pct = ((second_half_mean - first_half_mean) / first_half_mean) * 100

                    if abs(change_pct) > threshold_drop * 100:
                        insight = f"📉 Significant change detected: {change_pct:+.1f}% between periods"
                        results["insights"].append(insight)
                        results["period_change"] = {
                            "first_period_avg": float(first_half_mean),
                            "second_period_avg": float(second_half_mean),
                            "change_percentage": float(change_pct)
                        }
        except Exception as e:
            results["insights"].append(f"⚠️ Could not analyze time series: {str(e)}")

    # Find correlations of numeric columns with the target
    numeric_cols = df_pd.select_dtypes(include=[np.number]).columns.tolist()
    target_is_numeric = target_col in numeric_cols
    if target_is_numeric:
        numeric_cols.remove(target_col)

    if numeric_cols:
        correlations = {}
        for col in numeric_cols[:20]:  # Limit to top 20 for performance
            try:
                corr = df_pd[target_col].corr(df_pd[col])
                if not np.isnan(corr):
                    correlations[col] = float(corr)
            except (TypeError, ValueError):
                # Column not comparable with the target; skip it instead of
                # aborting the whole analysis (was a bare `except`).
                pass

        # Sort by absolute correlation strength
        sorted_corrs = sorted(correlations.items(), key=lambda x: abs(x[1]), reverse=True)
        results["correlations"] = dict(sorted_corrs[:10])

        # Identify top factors (|corr| > 0.3 is treated as meaningful)
        top_factors = []
        for col, corr in sorted_corrs[:5]:
            if abs(corr) > 0.3:
                direction = "positively" if corr > 0 else "negatively"
                top_factors.append({
                    "factor": col,
                    "correlation": float(corr),
                    "description": f"{col} is {direction} correlated ({corr:.3f}) with {target_col}"
                })

        results["top_factors"] = top_factors

        if top_factors:
            results["insights"].append(f"🔍 Found {len(top_factors)} significant factors influencing {target_col}")

    # Outlier detection in target via the 1.5*IQR rule.
    # BUGFIX: only attempt this for numeric targets — quantile() on a
    # non-numeric column would raise.
    if target_is_numeric:
        Q1 = df_pd[target_col].quantile(0.25)
        Q3 = df_pd[target_col].quantile(0.75)
        IQR = Q3 - Q1
        outliers = df_pd[(df_pd[target_col] < Q1 - 1.5 * IQR) | (df_pd[target_col] > Q3 + 1.5 * IQR)]

        if len(outliers) > 0:
            outlier_pct = (len(outliers) / len(df_pd)) * 100
            results["insights"].append(f"⚠️ {len(outliers)} outliers detected ({outlier_pct:.1f}% of data)")
            results["outlier_count"] = len(outliers)

    return results
137
+
138
+
139
def detect_trends_and_seasonality(file_path: str,
                                  value_col: str,
                                  time_col: str,
                                  seasonal_period: Optional[int] = None) -> Dict[str, Any]:
    """
    Detect trends and seasonal patterns in time series data.

    Args:
        file_path: Path to dataset
        value_col: Column with values to analyze
        time_col: Column with timestamps
        seasonal_period: Expected seasonal period in observations. When given,
            the autocorrelation at exactly this lag is tested; when None the
            period is auto-detected from autocorrelation peaks.

    Returns:
        Dictionary with trend and seasonality insights, or an error
        dictionary when the columns are missing or unparseable.
    """
    validate_file_exists(file_path)
    df = load_dataframe(file_path).to_pandas()

    results = {
        "value_column": value_col,
        "time_column": time_col,
        "trend_detected": False,
        "seasonality_detected": False,
        "insights": []
    }

    # Validate columns
    if value_col not in df.columns or time_col not in df.columns:
        return {"status": "error", "message": "Columns not found"}

    # Convert to datetime and sort
    try:
        df[time_col] = pd.to_datetime(df[time_col])
        df = df.sort_values(time_col).reset_index(drop=True)
    except (ValueError, TypeError):  # was a bare `except`
        return {"status": "error", "message": f"Could not parse {time_col} as datetime"}

    # BUGFIX: drop NaNs first — a single NaN previously propagated through
    # linregress and silently produced "no trend detected".
    values = df[value_col].dropna().values
    if len(values) < 3:
        return {"status": "error", "message": "Not enough data points for trend analysis"}

    # Trend detection via simple linear regression on the observation index
    x = np.arange(len(values))
    slope, intercept, r_value, p_value, std_err = stats.linregress(x, values)

    if p_value < 0.05:  # Statistically significant trend
        results["trend_detected"] = True
        results["trend_slope"] = float(slope)
        results["trend_r_squared"] = float(r_value ** 2)

        direction = "upward" if slope > 0 else "downward"
        results["insights"].append(f"📈 {direction.capitalize()} trend detected (slope: {slope:.4f}, R²: {r_value**2:.3f})")
        results["trend_direction"] = direction
    else:
        results["insights"].append("📊 No significant trend detected")

    # Seasonality detection using autocorrelation
    if len(values) > 20:
        from statsmodels.tsa.stattools import acf

        try:
            autocorr = acf(values, nlags=min(len(values) // 2, 50), fft=True)

            if seasonal_period is not None and 0 < seasonal_period < len(autocorr):
                # BUGFIX: honor the documented seasonal_period argument
                # (previously ignored) by testing autocorrelation at that lag.
                if autocorr[seasonal_period] > 0.3:
                    results["seasonality_detected"] = True
                    results["seasonal_period"] = int(seasonal_period)
                    results["insights"].append(f"🔄 Seasonality confirmed at period of {seasonal_period} observations")
                else:
                    results["insights"].append(f"📊 No strong seasonality at period {seasonal_period}")
            else:
                # Find peaks in autocorrelation (excluding lag 0)
                peaks, properties = find_peaks(autocorr[1:], height=0.3)

                if len(peaks) > 0:
                    # Most prominent peak indicates the seasonal period
                    peak_lag = peaks[np.argmax(properties['peak_heights'])] + 1
                    results["seasonality_detected"] = True
                    results["seasonal_period"] = int(peak_lag)
                    results["insights"].append(f"🔄 Seasonality detected with period of {peak_lag} observations")
                else:
                    results["insights"].append("📊 No strong seasonality pattern detected")
        except Exception as e:
            results["insights"].append(f"⚠️ Could not analyze seasonality: {str(e)}")

    # Summary statistics over the analyzed (non-null) values
    results["statistics"] = {
        "mean": float(np.mean(values)),
        "std": float(np.std(values)),
        "min": float(np.min(values)),
        "max": float(np.max(values)),
        "range": float(np.max(values) - np.min(values))
    }

    return results
228
+
229
+
230
def detect_anomalies_advanced(file_path: str,
                              columns: Optional[List[str]] = None,
                              contamination: float = 0.1,
                              method: str = "isolation_forest") -> Dict[str, Any]:
    """
    Detect anomalies with confidence scores using advanced methods.

    Args:
        file_path: Path to dataset
        columns: Columns to analyze (all numeric if None)
        contamination: Expected proportion of outliers (isolation forest only)
        method: 'isolation_forest' or 'statistical' (any column |z| > 3)

    Returns:
        Dictionary with anomaly indices, scores and summary insights, or an
        error dictionary when no usable numeric data is found.
    """
    validate_file_exists(file_path)
    df = load_dataframe(file_path)
    df_pd = df.to_pandas()

    # Select numeric columns
    if columns is None:
        numeric_cols = df_pd.select_dtypes(include=[np.number]).columns.tolist()
    else:
        numeric_cols = [c for c in columns if c in df_pd.columns]

    if not numeric_cols:
        return {"status": "error", "message": "No numeric columns found"}

    # Impute missing values with the column mean so detectors can run
    X = df_pd[numeric_cols].fillna(df_pd[numeric_cols].mean())

    # BUGFIX: an empty frame previously caused ZeroDivisionError below
    if len(X) == 0:
        return {"status": "error", "message": "Dataset has no rows to analyze"}

    results = {
        "method": method,
        "columns_analyzed": numeric_cols,
        "total_rows": len(X),
        "anomaly_indices": [],
        "anomaly_scores": []
    }

    if method == "isolation_forest":
        # Isolation Forest; lower score_samples = more anomalous
        clf = IsolationForest(contamination=contamination, random_state=42)
        predictions = clf.fit_predict(X)
        scores = clf.score_samples(X)

        anomaly_mask = predictions == -1
        results["anomalies_detected"] = int(anomaly_mask.sum())
        results["anomaly_percentage"] = float((anomaly_mask.sum() / len(X)) * 100)
        results["anomaly_indices"] = np.where(anomaly_mask)[0].tolist()
        results["anomaly_scores"] = scores[anomaly_mask].tolist()

        results["insights"] = [
            f"🔍 Detected {results['anomalies_detected']} anomalies ({results['anomaly_percentage']:.2f}% of data)",
            f"📊 Using Isolation Forest with contamination={contamination}"
        ]

    else:  # Statistical method: flag rows where any column has |z| > 3
        z_scores = np.abs(np.asarray(stats.zscore(X, nan_policy='omit')))
        anomaly_mask = (z_scores > 3).any(axis=1)

        results["anomalies_detected"] = int(anomaly_mask.sum())
        results["anomaly_percentage"] = float((anomaly_mask.sum() / len(X)) * 100)
        results["anomaly_indices"] = np.where(anomaly_mask)[0].tolist()
        # CONSISTENCY FIX: the isolation-forest branch reports scores but this
        # branch previously left anomaly_scores empty; report each flagged
        # row's worst |z| as its score (NaNs from constant columns ignored).
        if anomaly_mask.any():
            results["anomaly_scores"] = np.nanmax(z_scores[anomaly_mask], axis=1).tolist()

        results["insights"] = [
            f"🔍 Detected {results['anomalies_detected']} anomalies ({results['anomaly_percentage']:.2f}% of data)",
            f"📊 Using statistical method (Z-score > 3)"
        ]

    return results
301
+
302
+
303
def perform_hypothesis_testing(file_path: str,
                               group_col: str,
                               value_col: str,
                               test_type: str = "auto") -> Dict[str, Any]:
    """
    Perform statistical hypothesis testing.

    Args:
        file_path: Path to dataset
        group_col: Column defining groups
        value_col: Column with values to compare
        test_type: 't-test', 'chi-square', 'anova', or 'auto' (t-test for
            2 groups, ANOVA for more)

    Returns:
        Dictionary with test statistic, p-value, significance flag and a
        plain-language interpretation, or an error dictionary.
    """
    validate_file_exists(file_path)
    df = load_dataframe(file_path).to_pandas()

    if group_col not in df.columns or value_col not in df.columns:
        return {"status": "error", "message": "Columns not found"}

    # Get groups
    groups = df.groupby(group_col)[value_col].apply(list).to_dict()
    group_names = list(groups.keys())

    if len(group_names) < 2:
        return {"status": "error", "message": "Need at least 2 groups for comparison"}

    # Auto-detect test type
    if test_type == "auto":
        test_type = "t-test" if len(group_names) == 2 else "anova"

    # CONSISTENCY FIX: record the resolved test, not the literal "auto"
    results = {
        "group_column": group_col,
        "value_column": value_col,
        "test_type": test_type
    }

    # Perform test
    if test_type == "t-test":
        # Compares the first two groups only (reflected in groups_compared)
        group1_data = groups[group_names[0]]
        group2_data = groups[group_names[1]]

        statistic, p_value = stats.ttest_ind(group1_data, group2_data)

        results["test_statistic"] = float(statistic)
        results["p_value"] = float(p_value)
        results["significant"] = p_value < 0.05
        results["groups_compared"] = [group_names[0], group_names[1]]

        results["interpretation"] = (
            f"{'Significant' if p_value < 0.05 else 'No significant'} difference "
            f"between {group_names[0]} and {group_names[1]} (p={p_value:.4f})"
        )

        # Effect size (Cohen's d) — BUGFIX: use the sample standard deviation
        # (ddof=1), the conventional estimator for Cohen's d, instead of the
        # population std (np.std default ddof=0).
        mean1, mean2 = np.mean(group1_data), np.mean(group2_data)
        std1 = np.std(group1_data, ddof=1)
        std2 = np.std(group2_data, ddof=1)
        pooled_std = np.sqrt((std1**2 + std2**2) / 2)
        cohens_d = (mean1 - mean2) / pooled_std if pooled_std > 0 else 0

        results["effect_size"] = float(cohens_d)
        results["group_means"] = {group_names[0]: float(mean1), group_names[1]: float(mean2)}

    elif test_type == "anova":
        group_data = [groups[g] for g in group_names]
        statistic, p_value = stats.f_oneway(*group_data)

        results["test_statistic"] = float(statistic)
        results["p_value"] = float(p_value)
        results["significant"] = p_value < 0.05
        results["groups_compared"] = group_names

        results["interpretation"] = (
            f"{'Significant' if p_value < 0.05 else 'No significant'} difference "
            f"among {len(group_names)} groups (p={p_value:.4f})"
        )

        # Group means
        results["group_means"] = {g: float(np.mean(groups[g])) for g in group_names}

    elif test_type == "chi-square":
        # BUGFIX: 'chi-square' was documented but never implemented, so
        # callers silently got a dict with no test results. Test independence
        # of the two categorical columns via a contingency table.
        contingency = pd.crosstab(df[group_col], df[value_col])
        statistic, p_value, dof, _ = stats.chi2_contingency(contingency)

        results["test_statistic"] = float(statistic)
        results["p_value"] = float(p_value)
        results["degrees_of_freedom"] = int(dof)
        results["significant"] = p_value < 0.05
        results["groups_compared"] = group_names

        results["interpretation"] = (
            f"{'Significant' if p_value < 0.05 else 'No significant'} association "
            f"between {group_col} and {value_col} (p={p_value:.4f})"
        )

    else:
        # ROBUSTNESS: previously an unknown test_type fell through and
        # returned a result dict containing no test at all.
        return {"status": "error",
                "message": f"Unknown test_type '{test_type}'. Use 't-test', 'chi-square', 'anova', or 'auto'."}

    return results
389
+
390
+
391
def analyze_distribution(file_path: str,
                         column: str,
                         tests: Optional[List[str]] = None) -> Dict[str, Any]:
    """
    Analyze the distribution of a numeric column.

    Args:
        file_path: Path to dataset
        column: Column to analyze
        tests: Tests to perform; any of "normality", "skewness".
            Defaults to both.

    Returns:
        Dictionary with summary statistics, test results and insights, or an
        error dictionary when the column is missing or empty.
    """
    # BUGFIX: mutable default argument (["normality", "skewness"]) replaced
    # with the None-sentinel idiom; default behavior is unchanged.
    if tests is None:
        tests = ["normality", "skewness"]

    validate_file_exists(file_path)
    df = load_dataframe(file_path).to_pandas()

    if column not in df.columns:
        return {"status": "error", "message": f"Column '{column}' not found"}

    data = df[column].dropna()

    # BUGFIX: an empty/all-null column previously crashed in the statistics
    if len(data) == 0:
        return {"status": "error", "message": f"Column '{column}' has no non-null values"}

    results = {
        "column": column,
        "n_values": len(data),
        "n_missing": int(df[column].isna().sum()),
        "tests_performed": tests,
        "insights": []
    }

    # Basic statistics
    results["statistics"] = {
        "mean": float(data.mean()),
        "median": float(data.median()),
        "std": float(data.std()),
        "min": float(data.min()),
        "max": float(data.max()),
        "q25": float(data.quantile(0.25)),
        "q75": float(data.quantile(0.75))
    }

    # Normality test — Shapiro-Wilk requires at least 3 observations
    if "normality" in tests and len(data) >= 3:
        # Sample for performance; seeded so repeated runs agree
        sample = data.sample(min(5000, len(data)), random_state=42)
        statistic, p_value = stats.shapiro(sample)
        results["normality_test"] = {
            "test": "Shapiro-Wilk",
            "statistic": float(statistic),
            "p_value": float(p_value),
            "is_normal": p_value > 0.05
        }

        if p_value > 0.05:
            results["insights"].append(f"✅ Data appears normally distributed (p={p_value:.4f})")
        else:
            results["insights"].append(f"⚠️ Data is NOT normally distributed (p={p_value:.4f})")

    # Skewness and kurtosis
    if "skewness" in tests:
        skewness = float(stats.skew(data))
        kurtosis = float(stats.kurtosis(data))

        results["skewness"] = skewness
        results["kurtosis"] = kurtosis

        if abs(skewness) < 0.5:
            skew_desc = "approximately symmetric"
        elif skewness > 0:
            skew_desc = "right-skewed (positive skew)"
        else:
            skew_desc = "left-skewed (negative skew)"

        results["insights"].append(f"📊 Distribution is {skew_desc} (skewness={skewness:.3f})")

    return results
465
+
466
+
467
def perform_segment_analysis(file_path: str,
                             n_segments: int = 5,
                             features: Optional[List[str]] = None,
                             method: str = "kmeans") -> Dict[str, Any]:
    """
    Perform cluster-based segment analysis.

    Args:
        file_path: Path to dataset
        n_segments: Number of segments to create (ignored for HDBSCAN)
        features: Features to use for clustering (all numeric if None)
        method: Clustering method ('kmeans' or 'hdbscan'; falls back to
            kmeans when HDBSCAN is unavailable)

    Returns:
        Dictionary with per-segment profiles and insights, or an error
        dictionary when no numeric features are available.
    """
    validate_file_exists(file_path)
    df = load_dataframe(file_path).to_pandas()

    # Select features
    if features is None:
        features = df.select_dtypes(include=[np.number]).columns.tolist()
    else:
        features = [f for f in features if f in df.columns]

    if not features:
        return {"status": "error", "message": "No numeric features found for clustering"}

    # Prepare data: mean-impute missing values
    X = df[features].fillna(df[features].mean())

    # ROBUSTNESS: an empty frame cannot be clustered
    if len(X) == 0:
        return {"status": "error", "message": "Dataset has no rows to cluster"}

    # Scale features so distance-based clustering is not dominated by units
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    # BUGFIX: n_noise was only bound inside the HDBSCAN success path; define
    # it up front so the reporting code below is always safe to run.
    n_noise = 0

    # Perform clustering
    if method == "hdbscan":
        try:
            from sklearn.cluster import HDBSCAN as SklearnHDBSCAN

            print("🔍 Using HDBSCAN for density-based segmentation...")
            clusterer = SklearnHDBSCAN(
                min_cluster_size=max(5, len(X) // 50),
                min_samples=max(3, len(X) // 100),
                cluster_selection_method='eom'
            )
            labels = clusterer.fit_predict(X_scaled)

            # HDBSCAN assigns -1 to noise points
            n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
            n_noise = int((labels == -1).sum())
            n_segments = n_clusters

            print(f"   Found {n_clusters} clusters + {n_noise} noise points")

        except ImportError:
            print("⚠️ HDBSCAN not available (requires scikit-learn >= 1.3). Falling back to KMeans.")
            method = "kmeans"

    if method == "kmeans":
        # BUGFIX: KMeans raises if n_clusters exceeds the sample count (or
        # is below 1); clamp to the valid range.
        n_segments = max(1, min(n_segments, len(X)))
        kmeans = KMeans(n_clusters=n_segments, random_state=42, n_init=10)
        labels = kmeans.fit_predict(X_scaled)

    # Add cluster labels to dataframe
    df['segment'] = labels

    # Analyze segments (include noise cluster -1 for HDBSCAN)
    unique_labels = sorted(set(labels))
    segment_profiles = []
    for label in unique_labels:
        segment_data = df[df['segment'] == label]
        profile = {
            "segment_id": int(label),
            "label": "noise" if label == -1 else f"cluster_{label}",
            "size": len(segment_data),
            "percentage": float((len(segment_data) / len(df)) * 100),
            "characteristics": {}
        }

        # Per-feature mean/std profile for the segment
        for feat in features:
            profile["characteristics"][feat] = {
                "mean": float(segment_data[feat].mean()),
                "std": float(segment_data[feat].std())
            }

        segment_profiles.append(profile)

    results = {
        "method": method,
        "n_segments": n_segments,
        "features_used": features,
        "total_samples": len(df),
        "segments": segment_profiles,
        "insights": [
            f"🎯 Created {n_segments} segments from {len(df)} samples using {method.upper()}",
            f"📊 Used {len(features)} features for segmentation"
        ]
    }

    if method == "hdbscan" and n_noise > 0:
        results["noise_points"] = n_noise
        results["insights"].append(f"🔇 {n_noise} samples classified as noise (outliers)")

    # Per-segment size summary (noise cluster excluded)
    for profile in segment_profiles:
        if profile["segment_id"] != -1:
            results["insights"].append(
                f"Segment {profile['segment_id']}: {profile['size']} samples ({profile['percentage']:.1f}%)"
            )

    return results