ds-agent-cli 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/ds-agent.js +451 -0
- package/ds_agent/__init__.py +8 -0
- package/package.json +28 -0
- package/requirements.txt +126 -0
- package/setup.py +35 -0
- package/src/__init__.py +7 -0
- package/src/_compress_tool_result.py +118 -0
- package/src/api/__init__.py +4 -0
- package/src/api/app.py +1626 -0
- package/src/cache/__init__.py +5 -0
- package/src/cache/cache_manager.py +561 -0
- package/src/cli.py +2886 -0
- package/src/dynamic_prompts.py +281 -0
- package/src/orchestrator.py +4799 -0
- package/src/progress_manager.py +139 -0
- package/src/reasoning/__init__.py +332 -0
- package/src/reasoning/business_summary.py +431 -0
- package/src/reasoning/data_understanding.py +356 -0
- package/src/reasoning/model_explanation.py +383 -0
- package/src/reasoning/reasoning_trace.py +239 -0
- package/src/registry/__init__.py +3 -0
- package/src/registry/tools_registry.py +3 -0
- package/src/session_memory.py +448 -0
- package/src/session_store.py +370 -0
- package/src/storage/__init__.py +19 -0
- package/src/storage/artifact_store.py +620 -0
- package/src/storage/helpers.py +116 -0
- package/src/storage/huggingface_storage.py +694 -0
- package/src/storage/r2_storage.py +0 -0
- package/src/storage/user_files_service.py +288 -0
- package/src/tools/__init__.py +335 -0
- package/src/tools/advanced_analysis.py +823 -0
- package/src/tools/advanced_feature_engineering.py +708 -0
- package/src/tools/advanced_insights.py +578 -0
- package/src/tools/advanced_preprocessing.py +549 -0
- package/src/tools/advanced_training.py +906 -0
- package/src/tools/agent_tool_mapping.py +326 -0
- package/src/tools/auto_pipeline.py +420 -0
- package/src/tools/autogluon_training.py +1480 -0
- package/src/tools/business_intelligence.py +860 -0
- package/src/tools/cloud_data_sources.py +581 -0
- package/src/tools/code_interpreter.py +390 -0
- package/src/tools/computer_vision.py +614 -0
- package/src/tools/data_cleaning.py +614 -0
- package/src/tools/data_profiling.py +593 -0
- package/src/tools/data_type_conversion.py +268 -0
- package/src/tools/data_wrangling.py +433 -0
- package/src/tools/eda_reports.py +284 -0
- package/src/tools/enhanced_feature_engineering.py +241 -0
- package/src/tools/feature_engineering.py +302 -0
- package/src/tools/matplotlib_visualizations.py +1327 -0
- package/src/tools/model_training.py +520 -0
- package/src/tools/nlp_text_analytics.py +761 -0
- package/src/tools/plotly_visualizations.py +497 -0
- package/src/tools/production_mlops.py +852 -0
- package/src/tools/time_series.py +507 -0
- package/src/tools/tools_registry.py +2133 -0
- package/src/tools/visualization_engine.py +559 -0
- package/src/utils/__init__.py +42 -0
- package/src/utils/error_recovery.py +313 -0
- package/src/utils/parallel_executor.py +402 -0
- package/src/utils/polars_helpers.py +248 -0
- package/src/utils/schema_extraction.py +132 -0
- package/src/utils/semantic_layer.py +392 -0
- package/src/utils/token_budget.py +411 -0
- package/src/utils/validation.py +377 -0
- package/src/workflow_state.py +154 -0
|
@@ -0,0 +1,852 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Production & MLOps Tools
|
|
3
|
+
Tools for model monitoring, explainability, governance, and production readiness.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
import polars as pl
|
|
7
|
+
import numpy as np
|
|
8
|
+
from typing import Dict, Any, List, Optional, Tuple
|
|
9
|
+
from pathlib import Path
|
|
10
|
+
import sys
|
|
11
|
+
import os
|
|
12
|
+
import json
|
|
13
|
+
import warnings
|
|
14
|
+
from datetime import datetime
|
|
15
|
+
import joblib
|
|
16
|
+
|
|
17
|
+
warnings.filterwarnings('ignore')
|
|
18
|
+
|
|
19
|
+
# Add parent directory to path for imports
|
|
20
|
+
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
|
21
|
+
|
|
22
|
+
from scipy import stats
|
|
23
|
+
from scipy.stats import ks_2samp, pearsonr
|
|
24
|
+
import shap
|
|
25
|
+
from lime import lime_tabular
|
|
26
|
+
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
|
|
27
|
+
|
|
28
|
+
from ds_agent.utils.polars_helpers import load_dataframe, get_numeric_columns, split_features_target
|
|
29
|
+
from ds_agent.utils.validation import validate_file_exists, validate_file_format, validate_dataframe, validate_column_exists
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def monitor_model_drift(
    reference_data_path: str,
    current_data_path: str,
    target_col: Optional[str] = None,
    threshold_psi: float = 0.2,
    threshold_ks: float = 0.05,
    output_path: Optional[str] = None
) -> Dict[str, Any]:
    """
    Detect data drift and concept drift in production models.

    Compares a reference (training) dataset against a current (production)
    dataset feature-by-feature using PSI and the Kolmogorov-Smirnov test,
    and optionally checks the target column for concept drift.

    Args:
        reference_data_path: Path to training/reference dataset
        current_data_path: Path to production/current dataset
        target_col: Target column (for concept drift detection)
        threshold_psi: PSI threshold (>0.2 = significant drift)
        threshold_ks: KS test p-value threshold (<0.05 = significant drift)
        output_path: Path to save drift report (JSON)

    Returns:
        Dictionary with drift metrics, alerts, and a retraining recommendation
    """
    # Validation
    validate_file_exists(reference_data_path)
    validate_file_exists(current_data_path)

    # Load data
    ref_df = load_dataframe(reference_data_path)
    curr_df = load_dataframe(current_data_path)

    validate_dataframe(ref_df)
    validate_dataframe(curr_df)

    print("🔍 Analyzing data drift...")

    # Only compare features present in both datasets, excluding the target
    common_cols = list(set(ref_df.columns) & set(curr_df.columns))
    numeric_cols = [col for col in get_numeric_columns(ref_df) if col in common_cols and col != target_col]

    # Calculate PSI (Population Stability Index) for each feature
    drift_results = {}
    alerts = []

    for col in numeric_cols:
        try:
            ref_data = ref_df[col].drop_nulls().to_numpy()
            curr_data = curr_df[col].drop_nulls().to_numpy()

            # PSI calculation: bin edges come from the reference deciles so
            # both samples are histogrammed against the same bins
            bins = np.percentile(ref_data, [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100])
            bins = np.unique(bins)  # Remove duplicate edges caused by ties

            ref_counts, _ = np.histogram(ref_data, bins=bins)
            curr_counts, _ = np.histogram(curr_data, bins=bins)

            # Add small constant to avoid division by zero / log(0)
            ref_props = (ref_counts + 1e-6) / (len(ref_data) + len(bins) * 1e-6)
            curr_props = (curr_counts + 1e-6) / (len(curr_data) + len(bins) * 1e-6)

            psi = np.sum((curr_props - ref_props) * np.log(curr_props / ref_props))

            # KS test (Kolmogorov-Smirnov)
            ks_stat, ks_pval = ks_2samp(ref_data, curr_data)

            # Distribution statistics. The relative shift is measured against
            # the *magnitude* of the reference mean; using the signed mean
            # (as before) blew up / flipped sign for negative reference means.
            ref_mean = float(np.mean(ref_data))
            curr_mean = float(np.mean(curr_data))
            mean_shift = float(abs(curr_mean - ref_mean) / (abs(ref_mean) + 1e-10))

            drift_results[col] = {
                'psi': float(psi),
                'ks_statistic': float(ks_stat),
                'ks_pvalue': float(ks_pval),
                'ref_mean': ref_mean,
                'curr_mean': curr_mean,
                'mean_shift_pct': mean_shift * 100,
                'drift_detected': psi > threshold_psi or ks_pval < threshold_ks
            }

            # Generate alerts (a feature can raise both a PSI and a KS alert)
            if psi > threshold_psi:
                alerts.append({
                    'feature': col,
                    'type': 'data_drift',
                    'severity': 'high' if psi > 0.5 else 'medium',
                    'metric': 'PSI',
                    'value': float(psi),
                    'message': f"PSI = {psi:.3f} exceeds threshold {threshold_psi}"
                })

            if ks_pval < threshold_ks:
                alerts.append({
                    'feature': col,
                    'type': 'data_drift',
                    'severity': 'high',
                    'metric': 'KS_test',
                    'value': float(ks_pval),
                    'message': f"KS test p-value = {ks_pval:.4f} < {threshold_ks}"
                })

        except Exception as e:
            # Best-effort per column: constant columns etc. are skipped
            print(f"⚠️ Could not calculate drift for {col}: {str(e)}")

    # Concept drift (target distribution change)
    concept_drift_result = None
    if target_col and target_col in common_cols:
        try:
            ref_target = ref_df[target_col].drop_nulls().to_numpy()
            curr_target = curr_df[target_col].drop_nulls().to_numpy()

            # Heuristic: fewer than 20 distinct values => categorical target
            if len(np.unique(ref_target)) < 20:
                # Categorical target - compare class distributions
                ref_dist = {str(val): np.sum(ref_target == val) / len(ref_target) for val in np.unique(ref_target)}
                curr_dist = {str(val): np.sum(curr_target == val) / len(curr_target) for val in np.unique(curr_target)}

                concept_drift_result = {
                    'ref_distribution': ref_dist,
                    'curr_distribution': curr_dist,
                    # Symmetric difference: a class that appears OR disappears
                    # both signal drift (previously only disappearing classes
                    # were detected)
                    'drift_detected': bool(set(ref_dist) ^ set(curr_dist))
                }
            else:
                # Numeric target
                ks_stat, ks_pval = ks_2samp(ref_target, curr_target)
                concept_drift_result = {
                    'ks_statistic': float(ks_stat),
                    'ks_pvalue': float(ks_pval),
                    'drift_detected': ks_pval < threshold_ks
                }

            if concept_drift_result['drift_detected']:
                alerts.append({
                    'feature': target_col,
                    'type': 'concept_drift',
                    'severity': 'critical',
                    'message': 'Target distribution has changed - model may need retraining'
                })
        except Exception as e:
            print(f"⚠️ Could not detect concept drift: {str(e)}")

    # Summary
    drifted_features = [col for col, result in drift_results.items() if result['drift_detected']]

    print(f"🚨 {len(alerts)} drift alerts | {len(drifted_features)} features with significant drift")

    # Save report
    report = {
        'timestamp': datetime.now().isoformat(),
        'reference_samples': len(ref_df),
        'current_samples': len(curr_df),
        'features_analyzed': len(numeric_cols),
        'drift_results': drift_results,
        'concept_drift': concept_drift_result,
        'alerts': alerts,
        'drifted_features': drifted_features
    }

    if output_path:
        # dirname is '' for a bare filename; os.makedirs('') raises, so guard
        out_dir = os.path.dirname(output_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        with open(output_path, 'w') as f:
            json.dump(report, f, indent=2)
        print(f"💾 Drift report saved to: {output_path}")

    return {
        'status': 'success',
        'features_analyzed': len(numeric_cols),
        'drifted_features': drifted_features,
        'n_alerts': len(alerts),
        'alerts': alerts,
        'concept_drift_detected': concept_drift_result['drift_detected'] if concept_drift_result else False,
        'recommendation': 'Retrain model' if len(alerts) > 0 else 'No action needed',
        'report_path': output_path
    }
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
def explain_predictions(
    model_path: str,
    data_path: str,
    instance_indices: List[int],
    method: str = "shap",
    output_dir: Optional[str] = None
) -> Dict[str, Any]:
    """
    Generate explainability reports for individual predictions using SHAP or LIME.

    Args:
        model_path: Path to trained model (.pkl)
        data_path: Path to dataset
        instance_indices: List of row indices to explain
        method: Explanation method ('shap', 'lime', or 'both')
        output_dir: Directory to save explanation plots

    Returns:
        Dictionary with explanations and feature importance
    """
    # Validation
    validate_file_exists(model_path)
    validate_file_exists(data_path)

    # Load model and data
    model = joblib.load(model_path)
    df = load_dataframe(data_path)
    validate_dataframe(df)

    print(f"🔍 Generating {method} explanations for {len(instance_indices)} instances...")

    X = df.to_numpy()
    feature_names = df.columns

    # Create the output directory up front so BOTH branches can write to it.
    # Previously only the SHAP branch called makedirs, so a LIME-only run
    # with a fresh output_dir crashed when saving its HTML files.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    explanations = []

    # SHAP explanations
    if method in ["shap", "both"]:
        try:
            # Create SHAP explainer (auto-selects the algorithm for the model)
            explainer = shap.Explainer(model, X)
            shap_values = explainer(X[instance_indices])

            for idx, instance_idx in enumerate(instance_indices):
                shap_exp = {
                    'instance_idx': instance_idx,
                    'method': 'shap',
                    'prediction': model.predict(X[instance_idx:instance_idx+1])[0],
                    'feature_contributions': {
                        feature_names[i]: float(shap_values.values[idx, i])
                        for i in range(len(feature_names))
                    },
                    'top_5_positive': sorted(
                        [(feature_names[i], float(shap_values.values[idx, i]))
                         for i in range(len(feature_names))],
                        key=lambda x: x[1], reverse=True
                    )[:5],
                    'top_5_negative': sorted(
                        [(feature_names[i], float(shap_values.values[idx, i]))
                         for i in range(len(feature_names))],
                        key=lambda x: x[1]
                    )[:5]
                }
                explanations.append(shap_exp)

            # Save force plots if output_dir provided (directory created above)
            if output_dir:
                for idx, instance_idx in enumerate(instance_indices):
                    plot_path = os.path.join(output_dir, f"shap_force_plot_instance_{instance_idx}.html")
                    shap.save_html(plot_path, shap.force_plot(
                        explainer.expected_value,
                        shap_values.values[idx],
                        X[instance_idx],
                        feature_names=feature_names
                    ))
                print(f"💾 SHAP plots saved to: {output_dir}")

        except Exception as e:
            # Best-effort: SHAP does not support every model type
            print(f"⚠️ SHAP failed: {str(e)}")

    # LIME explanations
    if method in ["lime", "both"]:
        try:
            # Create LIME explainer; mode inferred from predict_proba presence
            explainer = lime_tabular.LimeTabularExplainer(
                X,
                feature_names=feature_names,
                mode='classification' if hasattr(model, 'predict_proba') else 'regression'
            )

            for instance_idx in instance_indices:
                exp = explainer.explain_instance(
                    X[instance_idx],
                    model.predict_proba if hasattr(model, 'predict_proba') else model.predict,
                    num_features=len(feature_names)
                )

                lime_exp = {
                    'instance_idx': instance_idx,
                    'method': 'lime',
                    'prediction': model.predict(X[instance_idx:instance_idx+1])[0],
                    'feature_contributions': dict(exp.as_list()),
                    'top_features': exp.as_list()[:10]
                }
                explanations.append(lime_exp)

                # Save HTML if output_dir provided (directory created above)
                if output_dir:
                    plot_path = os.path.join(output_dir, f"lime_explanation_instance_{instance_idx}.html")
                    exp.save_to_file(plot_path)

        except Exception as e:
            print(f"⚠️ LIME failed: {str(e)}")

    print(f"✅ Generated {len(explanations)} explanations")

    return {
        'status': 'success',
        'method': method,
        'n_explanations': len(explanations),
        'explanations': explanations,
        'output_dir': output_dir
    }
|
|
332
|
+
|
|
333
|
+
|
|
334
|
+
def generate_model_card(
    model_path: str,
    train_data_path: str,
    test_data_path: str,
    target_col: str,
    model_name: str,
    model_description: str,
    intended_use: str,
    sensitive_attributes: Optional[List[str]] = None,
    output_path: Optional[str] = None
) -> Dict[str, Any]:
    """
    Generate comprehensive model card for governance and compliance.

    Args:
        model_path: Path to trained model
        train_data_path: Path to training data
        test_data_path: Path to test data
        target_col: Target column name
        model_name: Name of the model
        model_description: Description of model architecture
        intended_use: Intended use case
        sensitive_attributes: List of sensitive columns for fairness analysis
        output_path: Path to save model card (JSON)

    Returns:
        Dictionary with model card information
    """
    # Regression metrics are needed in two places below (overall performance
    # and per-group fairness); import once up front instead of inside a branch
    from sklearn.metrics import mean_squared_error, r2_score

    # Load model and data
    model = joblib.load(model_path)
    train_df = load_dataframe(train_data_path)
    test_df = load_dataframe(test_data_path)

    X_train, y_train = split_features_target(train_df, target_col)
    X_test, y_test = split_features_target(test_df, target_col)

    print("📋 Generating model card...")

    # Model performance on the held-out test set
    y_pred = model.predict(X_test)

    # Heuristic: fewer than 20 distinct target values => classification
    task_type = "classification" if len(np.unique(y_test)) < 20 else "regression"

    if task_type == "classification":
        performance = {
            'accuracy': float(accuracy_score(y_test, y_pred)),
            'classification_report': classification_report(y_test, y_pred, output_dict=True)
        }
    else:
        performance = {
            'rmse': float(np.sqrt(mean_squared_error(y_test, y_pred))),
            'r2': float(r2_score(y_test, y_pred))
        }

    # Fairness metrics: per-group performance for each sensitive attribute
    fairness_metrics = {}
    if sensitive_attributes:
        for attr in sensitive_attributes:
            if attr in test_df.columns:
                try:
                    groups = test_df[attr].unique().to_list()
                    group_metrics = {}

                    for group in groups:
                        mask = test_df[attr].to_numpy() == group
                        group_pred = y_pred[mask]
                        group_true = y_test[mask]

                        if task_type == "classification":
                            group_metrics[str(group)] = {
                                'accuracy': float(accuracy_score(group_true, group_pred)),
                                'sample_size': int(np.sum(mask))
                            }
                        else:
                            group_metrics[str(group)] = {
                                'rmse': float(np.sqrt(mean_squared_error(group_true, group_pred))),
                                'sample_size': int(np.sum(mask))
                            }

                    fairness_metrics[attr] = group_metrics
                except Exception as e:
                    print(f"⚠️ Could not compute fairness for {attr}: {str(e)}")

    # Model card
    model_card = {
        'model_details': {
            'name': model_name,
            'description': model_description,
            'version': '1.0',
            'type': str(type(model).__name__),
            'created_date': datetime.now().isoformat(),
            'intended_use': intended_use
        },
        'training_data': {
            'n_samples': len(train_df),
            'n_features': len(train_df.columns) - 1,
            'target_column': target_col
        },
        'performance': performance,
        'fairness_metrics': fairness_metrics,
        'limitations': [
            f"Trained on {len(train_df)} samples",
            "Performance may degrade on out-of-distribution data",
            "Regular monitoring recommended"
        ],
        'ethical_considerations': [
            "Model should not be used for discriminatory purposes",
            "Predictions should be reviewed by domain experts",
            "Consider societal impact before deployment"
        ]
    }

    # Save model card
    if output_path:
        # dirname is '' for a bare filename; os.makedirs('') raises, so guard
        out_dir = os.path.dirname(output_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        with open(output_path, 'w') as f:
            json.dump(model_card, f, indent=2)
        print(f"💾 Model card saved to: {output_path}")

    return {
        'status': 'success',
        'model_card': model_card,
        'output_path': output_path
    }
|
|
459
|
+
|
|
460
|
+
|
|
461
|
+
def perform_ab_test_analysis(
    control_data_path: str,
    treatment_data_path: str,
    metric_col: str,
    alpha: float = 0.05,
    power: float = 0.8
) -> Dict[str, Any]:
    """
    Perform A/B test statistical analysis with confidence intervals.

    Args:
        control_data_path: Path to control group data
        treatment_data_path: Path to treatment group data
        metric_col: Metric column to compare
        alpha: Significance level (default 0.05)
        power: Statistical power (default 0.8)

    Returns:
        Dictionary with A/B test results
    """
    # Load data
    control_df = load_dataframe(control_data_path)
    treatment_df = load_dataframe(treatment_data_path)

    validate_column_exists(control_df, metric_col)
    validate_column_exists(treatment_df, metric_col)

    control = control_df[metric_col].drop_nulls().to_numpy()
    treatment = treatment_df[metric_col].drop_nulls().to_numpy()

    print("📊 Performing A/B test analysis...")

    # Calculate statistics
    control_mean = float(np.mean(control))
    treatment_mean = float(np.mean(treatment))

    control_std = float(np.std(control, ddof=1))
    treatment_std = float(np.std(treatment, ddof=1))

    # T-test (Student's t; assumes equal variances between groups)
    from scipy.stats import ttest_ind
    t_stat, p_value = ttest_ind(treatment, control)

    # Effect size (Cohen's d). Guard against zero pooled variance
    # (both groups constant) which previously raised ZeroDivisionError.
    pooled_std = np.sqrt(((len(control)-1)*control_std**2 + (len(treatment)-1)*treatment_std**2) / (len(control)+len(treatment)-2))
    cohens_d = (treatment_mean - control_mean) / (pooled_std if pooled_std > 0 else 1e-10)

    # Confidence intervals (t-distribution, level 1 - alpha)
    from scipy import stats as scipy_stats
    control_ci = scipy_stats.t.interval(1-alpha, len(control)-1, loc=control_mean, scale=control_std/np.sqrt(len(control)))
    treatment_ci = scipy_stats.t.interval(1-alpha, len(treatment)-1, loc=treatment_mean, scale=treatment_std/np.sqrt(len(treatment)))

    # Relative uplift. Guard the zero-mean control case which previously
    # raised ZeroDivisionError.
    uplift_denom = control_mean if control_mean != 0 else 1e-10
    relative_uplift = ((treatment_mean - control_mean) / uplift_denom) * 100

    # Sample size recommendation (two-sided z-approximation)
    from scipy.stats import norm
    z_alpha = norm.ppf(1 - alpha/2)
    z_beta = norm.ppf(power)

    required_n = 2 * ((z_alpha + z_beta) * pooled_std / (treatment_mean - control_mean + 1e-10))**2

    # Statistical significance
    is_significant = p_value < alpha

    result = {
        'control_group': {
            'n_samples': len(control),
            'mean': control_mean,
            'std': control_std,
            'ci_95': [float(control_ci[0]), float(control_ci[1])]
        },
        'treatment_group': {
            'n_samples': len(treatment),
            'mean': treatment_mean,
            'std': treatment_std,
            'ci_95': [float(treatment_ci[0]), float(treatment_ci[1])]
        },
        'test_results': {
            't_statistic': float(t_stat),
            'p_value': float(p_value),
            'is_significant': is_significant,
            'alpha': alpha
        },
        'effect_size': {
            'cohens_d': float(cohens_d),
            'interpretation': 'large' if abs(cohens_d) > 0.8 else 'medium' if abs(cohens_d) > 0.5 else 'small'
        },
        'business_impact': {
            'absolute_lift': float(treatment_mean - control_mean),
            'relative_lift_pct': float(relative_uplift)
        },
        'sample_size_recommendation': {
            'current_total': len(control) + len(treatment),
            'recommended_per_group': int(required_n),
            'is_sufficient': len(control) >= required_n and len(treatment) >= required_n
        },
        'conclusion': f"Treatment {'significantly' if is_significant else 'does not significantly'} outperform control (p={p_value:.4f})"
    }

    print(f"{'✅' if is_significant else '❌'} {result['conclusion']}")
    print(f"📈 Relative lift: {relative_uplift:+.2f}%")

    return {
        'status': 'success',
        **result
    }
|
|
568
|
+
|
|
569
|
+
|
|
570
|
+
def detect_feature_leakage(
    data_path: str,
    target_col: str,
    time_col: Optional[str] = None,
    correlation_threshold: float = 0.95
) -> Dict[str, Any]:
    """
    Detect potential feature leakage (target leakage and temporal leakage).

    Args:
        data_path: Path to dataset
        target_col: Target column name
        time_col: Time column for temporal leakage detection
        correlation_threshold: Correlation threshold for leakage detection

    Returns:
        Dictionary with potential leakage issues
    """
    # Load data
    df = load_dataframe(data_path)
    validate_dataframe(df)
    validate_column_exists(df, target_col)

    print("🔍 Detecting feature leakage...")

    # Get numeric columns
    numeric_cols = [col for col in get_numeric_columns(df) if col != target_col]

    # Target leakage detection (suspiciously high correlation with target)
    target_leakage = []

    for col in numeric_cols:
        try:
            # Drop nulls *jointly* per (feature, target) pair so the two
            # arrays stay row-aligned. The previous approach filtered each
            # column independently and truncated to the shorter length,
            # which paired values from different rows and produced
            # meaningless correlations whenever nulls were present.
            pair = df.select([col, target_col]).drop_nulls()
            col_data = pair[col].to_numpy()
            tgt_data = pair[target_col].to_numpy()

            corr, pval = pearsonr(tgt_data, col_data)

            if abs(corr) > correlation_threshold:
                target_leakage.append({
                    'feature': col,
                    'correlation': float(corr),
                    'p_value': float(pval),
                    'severity': 'critical' if abs(corr) > 0.99 else 'high',
                    'recommendation': f'Remove or investigate {col} - suspiciously high correlation with target'
                })
        except Exception:
            # Constant columns / non-numeric targets make pearsonr fail; skip
            pass

    # Temporal leakage detection
    temporal_leakage = []
    if time_col and time_col in df.columns:
        # Name-based heuristic for features that encode future information
        # and therefore wouldn't be available at prediction time
        potential_future_cols = [col for col in df.columns if any(keyword in col.lower() for keyword in ['future', 'next', 'after', 'later'])]

        if potential_future_cols:
            temporal_leakage.append({
                'features': potential_future_cols,
                'issue': 'potential_future_information',
                'recommendation': 'Verify these features are available at prediction time'
            })

    # Perfect predictors: zero variance within each target class is a strong
    # leakage signal (the feature is a deterministic function of the target)
    perfect_predictors = []
    for col in numeric_cols:
        try:
            grouped_variance = df.group_by(target_col).agg(pl.col(col).var())
            if (grouped_variance[col].drop_nulls() < 1e-10).all():
                perfect_predictors.append({
                    'feature': col,
                    'issue': 'zero_variance_per_class',
                    'recommendation': f'{col} has zero variance within each target class - likely leakage'
                })
        except Exception:
            # Grouping fails for unsupported dtypes; skip those columns
            pass

    # Summary
    total_issues = len(target_leakage) + len(temporal_leakage) + len(perfect_predictors)

    print(f"🚨 Found {total_issues} potential leakage issues")

    return {
        'status': 'success',
        'target_leakage': target_leakage,
        'temporal_leakage': temporal_leakage,
        'perfect_predictors': perfect_predictors,
        'total_issues': total_issues,
        'recommendation': 'Review and remove suspicious features before training' if total_issues > 0 else 'No obvious leakage detected'
    }
|
|
662
|
+
|
|
663
|
+
|
|
664
|
+
def monitor_drift_evidently(
    reference_data_path: str,
    current_data_path: str,
    output_path: Optional[str] = None
) -> Dict[str, Any]:
    """
    Generate a comprehensive data drift report using Evidently AI.

    Evidently provides production-grade drift detection with:
    - Statistical tests per feature (KS, Chi-squared, Jensen-Shannon)
    - Data quality metrics
    - Interactive HTML dashboard

    Args:
        reference_data_path: Path to training/reference dataset
        current_data_path: Path to production/current dataset
        output_path: Path to save HTML drift report (defaults to
            ./outputs/reports/evidently_drift_report.html)

    Returns:
        Dictionary with drift metrics and report path on success, or an
        error dict with 'status': 'error' if evidently is not installed.
    """
    # Soft dependency: degrade to an error dict rather than raising, so the
    # agent can surface an actionable install hint.
    try:
        from evidently.report import Report
        from evidently.metric_preset import DataDriftPreset, DataQualityPreset
    except ImportError:
        return {
            'status': 'error',
            'message': 'evidently not installed. Install with: pip install evidently>=0.4'
        }

    validate_file_exists(reference_data_path)
    validate_file_exists(current_data_path)

    # Load data as pandas (evidently requires pandas, not polars)
    ref_df = load_dataframe(reference_data_path).to_pandas()
    curr_df = load_dataframe(current_data_path).to_pandas()

    print("🔍 Generating Evidently drift report...")

    # Create drift report combining drift + quality presets
    report = Report(metrics=[
        DataDriftPreset(),
        DataQualityPreset()
    ])

    report.run(reference_data=ref_df, current_data=curr_df)

    # Save HTML report
    if output_path is None:
        output_path = "./outputs/reports/evidently_drift_report.html"

    # dirname is '' for bare filenames; os.makedirs('') raises, so guard it.
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    report.save_html(output_path)

    # Extract results as dict
    report_dict = report.as_dict()

    # Parse drift results: walk metrics looking for the per-column drift map
    drift_metrics = report_dict.get('metrics', [])

    drifted_features = []
    total_features = 0
    for metric in drift_metrics:
        result_data = metric.get('result', {})
        if 'drift_by_columns' in result_data:
            for col_name, col_data in result_data['drift_by_columns'].items():
                total_features += 1
                if col_data.get('drift_detected', False):
                    drifted_features.append(col_name)

    print(f"✅ Evidently report saved to: {output_path}")
    print(f"   📊 {len(drifted_features)}/{total_features} features with drift detected")

    return {
        'status': 'success',
        'report_path': output_path,
        'total_features_analyzed': total_features,
        'drifted_features': drifted_features,
        'n_drifted': len(drifted_features),
        'recommendation': 'Retrain model' if drifted_features else 'No significant drift detected'
    }
|
|
747
|
+
|
|
748
|
+
|
|
749
|
+
def explain_with_dtreeviz(
    model_path: str,
    data_path: str,
    target_col: str,
    feature_names: Optional[List[str]] = None,
    instance_index: int = 0,
    output_path: Optional[str] = None
) -> Dict[str, Any]:
    """
    Generate tree visualization using dtreeviz for tree-based models.

    Creates publication-quality decision tree visualizations showing:
    - Decision path for individual predictions
    - Feature distributions at each node
    - Split thresholds with data histograms

    Args:
        model_path: Path to trained tree-based model (.pkl)
        data_path: Path to dataset
        target_col: Target column name
        feature_names: List of feature names (auto-detected if None)
        instance_index: Index of instance to trace through tree
        output_path: Path to save SVG visualization (defaults to
            ./outputs/reports/dtreeviz_tree.svg)

    Returns:
        Dictionary with visualization path and tree info, or an error dict
        with 'status': 'error' when dtreeviz is missing, the index is out
        of range, or visualization fails.
    """
    # Soft dependency: return an actionable error dict instead of raising.
    try:
        import dtreeviz
    except ImportError:
        return {
            'status': 'error',
            'message': 'dtreeviz not installed. Install with: pip install dtreeviz>=2.2'
        }

    validate_file_exists(model_path)
    validate_file_exists(data_path)

    model = joblib.load(model_path)
    df = load_dataframe(data_path)
    validate_dataframe(df)

    # Prepare data (convert to pandas; dtreeviz expects pandas/numpy inputs)
    if target_col in df.columns:
        X = df.drop(target_col).to_pandas()
        y = df[target_col].to_pandas()
    else:
        X = df.to_pandas()
        y = None

    if feature_names is None:
        feature_names = X.columns.tolist()

    # Validate the requested row up front: a clear error beats the opaque
    # IndexError .iloc would raise later.
    if not (-len(X) <= instance_index < len(X)):
        return {
            'status': 'error',
            'message': f'instance_index {instance_index} out of range for dataset with {len(X)} rows'
        }

    print("🌳 Generating dtreeviz visualization...")

    if output_path is None:
        output_path = "./outputs/reports/dtreeviz_tree.svg"

    # dirname is '' for bare filenames; os.makedirs('') raises, so guard it.
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    try:
        # For ensemble models (RandomForest etc.), visualize the first
        # estimator — dtreeviz renders a single tree at a time.
        tree_model = model
        if hasattr(model, 'estimators_'):
            tree_model = model.estimators_[0]
            print("   📌 Using first estimator from ensemble for visualization")

        # Heuristic task-type detection: classifiers expose predict_proba.
        is_classifier = hasattr(model, 'predict_proba')

        # Create visualization model
        viz_model = dtreeviz.model(
            tree_model,
            X_train=X,
            y_train=y,
            feature_names=feature_names,
            target_name=target_col,
            class_names=list(map(str, sorted(y.unique()))) if is_classifier and y is not None else None
        )

        # Trace the chosen instance through the tree and save as SVG
        v = viz_model.view(x=X.iloc[instance_index])
        v.save(output_path)

        print(f"✅ Tree visualization saved to: {output_path}")

        return {
            'status': 'success',
            'visualization_path': output_path,
            'model_type': type(model).__name__,
            'n_features': len(feature_names),
            'instance_explained': instance_index,
            'tree_depth': tree_model.get_depth() if hasattr(tree_model, 'get_depth') else 'unknown'
        }

    except Exception as e:
        return {
            'status': 'error',
            'message': f'dtreeviz visualization failed: {str(e)}. Ensure model is tree-based (DecisionTree, RandomForest, XGBoost).'
        }
|