@defai.digital/cli 13.4.3 → 13.4.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bundled/agents/aerospace-scientist.json +114 -0
- package/bundled/agents/architecture.json +96 -0
- package/bundled/agents/backend.json +125 -0
- package/bundled/agents/blockchain-developer.json +114 -0
- package/bundled/agents/ceo.json +113 -0
- package/bundled/agents/creative-marketer.json +114 -0
- package/bundled/agents/cto.json +96 -0
- package/bundled/agents/data-scientist.json +96 -0
- package/bundled/agents/devops.json +98 -0
- package/bundled/agents/frontend.json +118 -0
- package/bundled/agents/fullstack.json +99 -0
- package/bundled/agents/ml-engineer.json +114 -0
- package/bundled/agents/mlops-engineer.json +102 -0
- package/bundled/agents/mobile.json +96 -0
- package/bundled/agents/product.json +96 -0
- package/bundled/agents/quality.json +95 -0
- package/bundled/agents/quantum-engineer.json +114 -0
- package/bundled/agents/researcher.json +123 -0
- package/bundled/agents/reviewer.json +107 -0
- package/bundled/agents/security.json +79 -0
- package/bundled/agents/standard.json +82 -0
- package/bundled/agents/writer.json +78 -0
- package/bundled/templates/monorepo/contract-index.ts.hbs +7 -0
- package/bundled/templates/monorepo/contract-test.ts.hbs +130 -0
- package/bundled/templates/monorepo/contracts-package.json.hbs +29 -0
- package/bundled/templates/monorepo/domain-index.ts.hbs +115 -0
- package/bundled/templates/monorepo/domain-package.json.hbs +27 -0
- package/bundled/templates/monorepo/gitignore.hbs +32 -0
- package/bundled/templates/monorepo/invariants.md.hbs +43 -0
- package/bundled/templates/monorepo/package.json.hbs +28 -0
- package/bundled/templates/monorepo/pnpm-workspace.yaml.hbs +5 -0
- package/bundled/templates/monorepo/schema.ts.hbs +82 -0
- package/bundled/templates/monorepo/template.json +106 -0
- package/bundled/templates/monorepo/tsconfig.json.hbs +22 -0
- package/bundled/templates/standalone/contract-index.ts.hbs +5 -0
- package/bundled/templates/standalone/contract-test.ts.hbs +95 -0
- package/bundled/templates/standalone/contracts-root-index.ts.hbs +7 -0
- package/bundled/templates/standalone/domain-index.ts.hbs +6 -0
- package/bundled/templates/standalone/domain-repository.ts.hbs +44 -0
- package/bundled/templates/standalone/domain-service.ts.hbs +102 -0
- package/bundled/templates/standalone/gitignore.hbs +27 -0
- package/bundled/templates/standalone/invariants.md.hbs +35 -0
- package/bundled/templates/standalone/package.json.hbs +41 -0
- package/bundled/templates/standalone/schema.ts.hbs +61 -0
- package/bundled/templates/standalone/src-index.ts.hbs +11 -0
- package/bundled/templates/standalone/template.json +91 -0
- package/bundled/templates/standalone/tsconfig.json.hbs +20 -0
- package/bundled/templates/standalone/vitest.config.ts.hbs +8 -0
- package/bundled/workflows/adversarial-debate.yaml +222 -0
- package/bundled/workflows/analyst.yaml +115 -0
- package/bundled/workflows/assistant.yaml +74 -0
- package/bundled/workflows/code-review-discussion.yaml +166 -0
- package/bundled/workflows/code-reviewer.yaml +94 -0
- package/bundled/workflows/contract-first-project.yaml +356 -0
- package/bundled/workflows/debugger.yaml +107 -0
- package/bundled/workflows/designer.yaml +113 -0
- package/bundled/workflows/developer.yaml +105 -0
- package/bundled/workflows/discuss-step-examples.yaml +153 -0
- package/bundled/workflows/infrastructure-automation.yaml +283 -0
- package/bundled/workflows/ml-ab-testing.yaml +311 -0
- package/bundled/workflows/ml-experiment-tracker.yaml +150 -0
- package/bundled/workflows/ml-feature-engineering.yaml +242 -0
- package/bundled/workflows/ml-model-evaluation.yaml +234 -0
- package/bundled/workflows/ml-model-monitoring.yaml +227 -0
- package/bundled/workflows/ml-model-registry.yaml +232 -0
- package/bundled/workflows/mlops-deployment.yaml +267 -0
- package/bundled/workflows/mobile-development.yaml +312 -0
- package/bundled/workflows/multi-model-discussion.yaml +243 -0
- package/bundled/workflows/product-discovery.yaml +295 -0
- package/bundled/workflows/qa-specialist.yaml +116 -0
- package/bundled/workflows/refactoring.yaml +105 -0
- package/bundled/workflows/security-audit.yaml +135 -0
- package/bundled/workflows/std/analysis.yaml +190 -0
- package/bundled/workflows/std/code-review.yaml +117 -0
- package/bundled/workflows/std/debugging.yaml +155 -0
- package/bundled/workflows/std/documentation.yaml +180 -0
- package/bundled/workflows/std/implementation.yaml +197 -0
- package/bundled/workflows/std/refactoring.yaml +180 -0
- package/bundled/workflows/std/testing.yaml +200 -0
- package/bundled/workflows/strategic-planning.yaml +235 -0
- package/bundled/workflows/technology-research.yaml +239 -0
- package/dist/commands/agent.d.ts.map +1 -1
- package/dist/commands/agent.js +8 -2
- package/dist/commands/agent.js.map +1 -1
- package/dist/commands/scaffold.d.ts.map +1 -1
- package/dist/commands/scaffold.js +6 -3
- package/dist/commands/scaffold.js.map +1 -1
- package/dist/web/api.d.ts.map +1 -1
- package/dist/web/api.js +13 -6
- package/dist/web/api.js.map +1 -1
- package/package.json +23 -22
|
@@ -0,0 +1,234 @@
|
|
|
1
|
+
workflowId: ml-model-evaluation
|
|
2
|
+
name: Model Evaluation Pipeline
|
|
3
|
+
description: Comprehensive model evaluation with performance, fairness, robustness, and production-readiness checks
|
|
4
|
+
version: "1.0.0"
|
|
5
|
+
category: machine-learning
|
|
6
|
+
tags:
|
|
7
|
+
- ml
|
|
8
|
+
- evaluation
|
|
9
|
+
- fairness
|
|
10
|
+
- robustness
|
|
11
|
+
- production
|
|
12
|
+
|
|
13
|
+
metadata:
|
|
14
|
+
requiredAbilities:
|
|
15
|
+
- machine-learning
|
|
16
|
+
- statistical-analysis
|
|
17
|
+
- data-analysis
|
|
18
|
+
estimatedDuration: 600
|
|
19
|
+
complexity: high
|
|
20
|
+
|
|
21
|
+
steps:
|
|
22
|
+
- stepId: performance-metrics
|
|
23
|
+
name: Calculate Performance Metrics
|
|
24
|
+
type: prompt
|
|
25
|
+
timeout: 180000
|
|
26
|
+
config:
|
|
27
|
+
agent: data-scientist
|
|
28
|
+
task: |
|
|
29
|
+
Evaluate model performance on the test dataset.
|
|
30
|
+
|
|
31
|
+
## Classification Metrics (if applicable)
|
|
32
|
+
- Accuracy
|
|
33
|
+
- Precision (macro, micro, weighted)
|
|
34
|
+
- Recall (macro, micro, weighted)
|
|
35
|
+
- F1 Score (macro, micro, weighted)
|
|
36
|
+
- AUC-ROC (one-vs-rest for multiclass)
|
|
37
|
+
- AUC-PR (Precision-Recall curve)
|
|
38
|
+
- Log Loss
|
|
39
|
+
- Confusion Matrix
|
|
40
|
+
- Per-class metrics breakdown
|
|
41
|
+
|
|
42
|
+
## Regression Metrics (if applicable)
|
|
43
|
+
- Mean Squared Error (MSE)
|
|
44
|
+
- Root Mean Squared Error (RMSE)
|
|
45
|
+
- Mean Absolute Error (MAE)
|
|
46
|
+
- R-squared (Coefficient of Determination)
|
|
47
|
+
- Mean Absolute Percentage Error (MAPE)
|
|
48
|
+
- Residual analysis
|
|
49
|
+
|
|
50
|
+
## Confidence Intervals
|
|
51
|
+
- Bootstrap 95% CI for primary metric
|
|
52
|
+
- Report standard deviation across folds
|
|
53
|
+
|
|
54
|
+
Provide structured output with all applicable metrics.
|
|
55
|
+
|
|
56
|
+
- stepId: fairness-audit
|
|
57
|
+
name: Fairness & Bias Audit
|
|
58
|
+
type: prompt
|
|
59
|
+
timeout: 180000
|
|
60
|
+
config:
|
|
61
|
+
agent: data-scientist
|
|
62
|
+
task: |
|
|
63
|
+
Conduct a comprehensive fairness and bias audit.
|
|
64
|
+
|
|
65
|
+
## Demographic Analysis
|
|
66
|
+
For each protected attribute (gender, age, race, etc.):
|
|
67
|
+
|
|
68
|
+
1. **Demographic Parity**:
|
|
69
|
+
- Selection rate across groups
|
|
70
|
+
- Disparate impact ratio (should be >= 0.8, per the four-fifths rule)
|
|
71
|
+
|
|
72
|
+
2. **Equal Opportunity**:
|
|
73
|
+
- True positive rate across groups
|
|
74
|
+
- False negative rate disparity
|
|
75
|
+
|
|
76
|
+
3. **Equalized Odds**:
|
|
77
|
+
- TPR and FPR across groups
|
|
78
|
+
- Maximum disparity
|
|
79
|
+
|
|
80
|
+
4. **Calibration**:
|
|
81
|
+
- Predicted probability vs actual outcome by group
|
|
82
|
+
|
|
83
|
+
## Bias Detection
|
|
84
|
+
- Identify any systematic prediction bias
|
|
85
|
+
- Check for proxy discrimination
|
|
86
|
+
- Analyze error distribution across groups
|
|
87
|
+
|
|
88
|
+
## Mitigation Recommendations
|
|
89
|
+
If bias is detected:
|
|
90
|
+
- Recommend specific mitigation strategies
|
|
91
|
+
- Estimate impact of mitigation on overall performance
|
|
92
|
+
|
|
93
|
+
## Compliance Check
|
|
94
|
+
- Flag any potential regulatory concerns (GDPR, CCPA, ECOA)
|
|
95
|
+
|
|
96
|
+
- stepId: robustness-testing
|
|
97
|
+
name: Robustness & Edge Case Testing
|
|
98
|
+
type: prompt
|
|
99
|
+
timeout: 180000
|
|
100
|
+
config:
|
|
101
|
+
agent: ml-engineer
|
|
102
|
+
task: |
|
|
103
|
+
Test model robustness and behavior on edge cases.
|
|
104
|
+
|
|
105
|
+
## Input Perturbation Testing
|
|
106
|
+
1. **Noise Sensitivity**:
|
|
107
|
+
- Add Gaussian noise to numerical features
|
|
108
|
+
- Measure performance degradation curve
|
|
109
|
+
|
|
110
|
+
2. **Missing Values**:
|
|
111
|
+
- Simulate missing data patterns
|
|
112
|
+
- Test model behavior with incomplete inputs
|
|
113
|
+
|
|
114
|
+
3. **Out-of-Range Values**:
|
|
115
|
+
- Test with extreme values
|
|
116
|
+
- Check for catastrophic failures
|
|
117
|
+
|
|
118
|
+
## Distribution Shift Testing
|
|
119
|
+
1. **Covariate Shift**:
|
|
120
|
+
- Test on data with shifted feature distributions
|
|
121
|
+
- Measure performance drop
|
|
122
|
+
|
|
123
|
+
2. **Label Shift**:
|
|
124
|
+
- Test with different class proportions
|
|
125
|
+
- Check calibration under shift
|
|
126
|
+
|
|
127
|
+
## Edge Cases
|
|
128
|
+
1. Boundary conditions
|
|
129
|
+
2. Rare class handling
|
|
130
|
+
3. Adversarial-like examples (if applicable)
|
|
131
|
+
|
|
132
|
+
## Out-of-Distribution Detection
|
|
133
|
+
- Can the model identify when inputs are OOD?
|
|
134
|
+
- Confidence calibration on OOD samples
|
|
135
|
+
|
|
136
|
+
- stepId: latency-profiling
|
|
137
|
+
name: Latency & Resource Profiling
|
|
138
|
+
type: prompt
|
|
139
|
+
timeout: 120000
|
|
140
|
+
config:
|
|
141
|
+
agent: ml-engineer
|
|
142
|
+
task: |
|
|
143
|
+
Profile model for production deployment.
|
|
144
|
+
|
|
145
|
+
## Inference Latency
|
|
146
|
+
- Single sample latency (p50, p95, p99)
|
|
147
|
+
- Batch inference latency by batch size
|
|
148
|
+
- Cold start latency
|
|
149
|
+
- Warm cache latency
|
|
150
|
+
|
|
151
|
+
## Resource Requirements
|
|
152
|
+
- Memory footprint (model size)
|
|
153
|
+
- Peak memory during inference
|
|
154
|
+
- CPU utilization pattern
|
|
155
|
+
- GPU utilization (if applicable)
|
|
156
|
+
- GPU memory requirements
|
|
157
|
+
|
|
158
|
+
## Scalability Analysis
|
|
159
|
+
- Throughput (requests/second)
|
|
160
|
+
- Scaling behavior under load
|
|
161
|
+
- Recommended instance type
|
|
162
|
+
- Cost per 1M predictions
|
|
163
|
+
|
|
164
|
+
## Optimization Opportunities
|
|
165
|
+
- Quantization potential
|
|
166
|
+
- Pruning opportunities
|
|
167
|
+
- Distillation candidates
|
|
168
|
+
- Caching strategies
|
|
169
|
+
|
|
170
|
+
- stepId: evaluation-report
|
|
171
|
+
name: Generate Evaluation Report
|
|
172
|
+
type: prompt
|
|
173
|
+
timeout: 120000
|
|
174
|
+
config:
|
|
175
|
+
agent: data-scientist
|
|
176
|
+
task: |
|
|
177
|
+
Create a comprehensive model evaluation report.
|
|
178
|
+
|
|
179
|
+
## Executive Summary
|
|
180
|
+
- Model name and version
|
|
181
|
+
- One-paragraph assessment
|
|
182
|
+
- Go/No-Go recommendation with confidence level
|
|
183
|
+
|
|
184
|
+
## Performance Summary
|
|
185
|
+
- Primary metric with confidence interval
|
|
186
|
+
- Comparison to baseline
|
|
187
|
+
- Key strengths and weaknesses
|
|
188
|
+
|
|
189
|
+
## Fairness Summary
|
|
190
|
+
- Overall fairness assessment (Pass/Fail/Warning)
|
|
191
|
+
- Groups with highest disparity
|
|
192
|
+
- Required mitigations before deployment
|
|
193
|
+
|
|
194
|
+
## Robustness Summary
|
|
195
|
+
- Robustness score (1-10)
|
|
196
|
+
- Critical failure modes identified
|
|
197
|
+
- Recommended guardrails
|
|
198
|
+
|
|
199
|
+
## Production Readiness
|
|
200
|
+
- Latency requirements: Met/Not Met
|
|
201
|
+
- Resource requirements: Acceptable/High/Excessive
|
|
202
|
+
- Scaling assessment
|
|
203
|
+
|
|
204
|
+
## Recommendations
|
|
205
|
+
1. **If Go**: Deployment checklist
|
|
206
|
+
2. **If No-Go**: Required improvements ranked by priority
|
|
207
|
+
3. **Monitoring requirements** for production
|
|
208
|
+
|
|
209
|
+
## Appendix
|
|
210
|
+
- Full metrics tables
|
|
211
|
+
- Visualization references
|
|
212
|
+
- Test dataset description
|
|
213
|
+
|
|
214
|
+
- stepId: store-evaluation
|
|
215
|
+
name: Store Evaluation Results
|
|
216
|
+
type: tool
|
|
217
|
+
timeout: 10000
|
|
218
|
+
tool: memory_store
|
|
219
|
+
config:
|
|
220
|
+
namespace: ml-evaluations
|
|
221
|
+
key: "{{model_name}}/{{model_version}}/evaluation"
|
|
222
|
+
ttl: 15552000
|
|
223
|
+
value:
|
|
224
|
+
model_name: "{{model_name}}"
|
|
225
|
+
model_version: "{{model_version}}"
|
|
226
|
+
evaluation_id: "{{evaluation_id}}"
|
|
227
|
+
recommendation: "{{go_no_go}}"
|
|
228
|
+
performance_metrics: "{{performance_metrics}}"
|
|
229
|
+
fairness_results: "{{fairness_results}}"
|
|
230
|
+
robustness_score: "{{robustness_score}}"
|
|
231
|
+
latency_profile: "{{latency_profile}}"
|
|
232
|
+
full_report: "{{evaluation_report}}"
|
|
233
|
+
evaluated_at: "{{timestamp}}"
|
|
234
|
+
evaluated_by: "{{user}}"
|
|
@@ -0,0 +1,227 @@
|
|
|
1
|
+
workflowId: ml-model-monitoring
|
|
2
|
+
name: Production Model Monitoring
|
|
3
|
+
description: Detect drift, degradation, and anomalies in production models
|
|
4
|
+
version: "1.0.0"
|
|
5
|
+
category: machine-learning
|
|
6
|
+
tags:
|
|
7
|
+
- ml
|
|
8
|
+
- monitoring
|
|
9
|
+
- drift
|
|
10
|
+
- production
|
|
11
|
+
|
|
12
|
+
metadata:
|
|
13
|
+
requiredAbilities:
|
|
14
|
+
- machine-learning
|
|
15
|
+
- statistical-analysis
|
|
16
|
+
- data-analysis
|
|
17
|
+
estimatedDuration: 300
|
|
18
|
+
complexity: high
|
|
19
|
+
schedule: "0 */6 * * *" # Every 6 hours
|
|
20
|
+
|
|
21
|
+
steps:
|
|
22
|
+
- stepId: fetch-baseline
|
|
23
|
+
name: Fetch Baseline Statistics
|
|
24
|
+
type: tool
|
|
25
|
+
timeout: 10000
|
|
26
|
+
tool: memory_retrieve
|
|
27
|
+
config:
|
|
28
|
+
namespace: ml-model-registry
|
|
29
|
+
key: "{{model_name}}/versions/{{production_version}}/baseline_stats"
|
|
30
|
+
|
|
31
|
+
- stepId: fetch-production-data
|
|
32
|
+
name: Fetch Recent Production Data
|
|
33
|
+
type: tool
|
|
34
|
+
timeout: 30000
|
|
35
|
+
tool: memory_search
|
|
36
|
+
config:
|
|
37
|
+
namespace: ml-production-predictions
|
|
38
|
+
query: "model:{{model_name}} AND timestamp:[{{window_start}} TO {{window_end}}]"
|
|
39
|
+
limit: 10000
|
|
40
|
+
|
|
41
|
+
- stepId: data-drift-detection
|
|
42
|
+
name: Detect Data Drift
|
|
43
|
+
type: prompt
|
|
44
|
+
timeout: 180000
|
|
45
|
+
config:
|
|
46
|
+
agent: data-scientist
|
|
47
|
+
task: |
|
|
48
|
+
Analyze production data for drift compared to training data.
|
|
49
|
+
|
|
50
|
+
## Feature Distribution Analysis
|
|
51
|
+
For each feature:
|
|
52
|
+
|
|
53
|
+
1. **Statistical Tests**:
|
|
54
|
+
- Kolmogorov-Smirnov test (continuous features)
|
|
55
|
+
- Chi-squared test (categorical features)
|
|
56
|
+
- Population Stability Index (PSI)
|
|
57
|
+
|
|
58
|
+
2. **Drift Severity**:
|
|
59
|
+
- No Drift: PSI < 0.1
|
|
60
|
+
- Moderate Drift: 0.1 <= PSI < 0.25
|
|
61
|
+
- Significant Drift: PSI >= 0.25
|
|
62
|
+
|
|
63
|
+
3. **Per-Feature Report**:
|
|
64
|
+
- Feature name
|
|
65
|
+
- Drift score
|
|
66
|
+
- Baseline mean/distribution
|
|
67
|
+
- Current mean/distribution
|
|
68
|
+
- Visual: distribution comparison
|
|
69
|
+
|
|
70
|
+
## Overall Assessment
|
|
71
|
+
- Total features with significant drift
|
|
72
|
+
- Most drifted features (top 5)
|
|
73
|
+
- Drift trend (increasing/stable/decreasing)
|
|
74
|
+
|
|
75
|
+
## Alert Level
|
|
76
|
+
- GREEN: No significant drift
|
|
77
|
+
- YELLOW: Moderate drift, monitor closely
|
|
78
|
+
- RED: Significant drift, action required
|
|
79
|
+
|
|
80
|
+
- stepId: prediction-drift
|
|
81
|
+
name: Detect Prediction Drift
|
|
82
|
+
type: prompt
|
|
83
|
+
timeout: 120000
|
|
84
|
+
config:
|
|
85
|
+
agent: data-scientist
|
|
86
|
+
task: |
|
|
87
|
+
Analyze prediction distribution for drift.
|
|
88
|
+
|
|
89
|
+
## Prediction Distribution Analysis
|
|
90
|
+
|
|
91
|
+
1. **Output Distribution**:
|
|
92
|
+
- Compare current vs baseline prediction distribution
|
|
93
|
+
- For classification: class probability distributions
|
|
94
|
+
- For regression: prediction value distribution
|
|
95
|
+
|
|
96
|
+
2. **Confidence Scores**:
|
|
97
|
+
- Average confidence: baseline vs current
|
|
98
|
+
- Confidence distribution shift
|
|
99
|
+
- Low-confidence prediction rate
|
|
100
|
+
|
|
101
|
+
3. **Prediction Patterns**:
|
|
102
|
+
- Class balance (classification)
|
|
103
|
+
- Prediction range (regression)
|
|
104
|
+
- Unusual prediction clusters
|
|
105
|
+
|
|
106
|
+
## Anomaly Detection
|
|
107
|
+
- Identify sudden prediction shifts
|
|
108
|
+
- Detect prediction patterns not seen in training
|
|
109
|
+
- Flag potential model failures
|
|
110
|
+
|
|
111
|
+
- stepId: performance-degradation
|
|
112
|
+
name: Check Performance Degradation
|
|
113
|
+
type: prompt
|
|
114
|
+
timeout: 120000
|
|
115
|
+
config:
|
|
116
|
+
agent: ml-engineer
|
|
117
|
+
task: |
|
|
118
|
+
Compare current model performance against baseline.
|
|
119
|
+
|
|
120
|
+
## Metric Comparison (if ground truth available)
|
|
121
|
+
|
|
122
|
+
| Metric | Baseline | Current | Delta | Status |
|
|
123
|
+
|--------|----------|---------|-------|--------|
|
|
124
|
+
| Accuracy | X | Y | Z% | OK/WARN/ALERT |
|
|
125
|
+
| Precision | X | Y | Z% | OK/WARN/ALERT |
|
|
126
|
+
| Recall | X | Y | Z% | OK/WARN/ALERT |
|
|
127
|
+
|
|
128
|
+
## Thresholds
|
|
129
|
+
- OK: Less than 5% degradation from baseline
|
|
130
|
+
- WARN: 5-10% degradation
|
|
131
|
+
- ALERT: >10% degradation
|
|
132
|
+
|
|
133
|
+
## Latency Analysis
|
|
134
|
+
- P50 latency: baseline vs current
|
|
135
|
+
- P99 latency: baseline vs current
|
|
136
|
+
- Timeout rate
|
|
137
|
+
|
|
138
|
+
## Error Analysis
|
|
139
|
+
- Error rate trend
|
|
140
|
+
- Error type distribution
|
|
141
|
+
- New error patterns
|
|
142
|
+
|
|
143
|
+
- stepId: retraining-decision
|
|
144
|
+
name: Retraining Recommendation
|
|
145
|
+
type: prompt
|
|
146
|
+
timeout: 60000
|
|
147
|
+
config:
|
|
148
|
+
agent: ml-engineer
|
|
149
|
+
task: |
|
|
150
|
+
Based on drift and degradation analysis, recommend action.
|
|
151
|
+
|
|
152
|
+
## Decision Matrix
|
|
153
|
+
|
|
154
|
+
| Data Drift | Performance Drop | Recommendation |
|
|
155
|
+
|------------|------------------|----------------|
|
|
156
|
+
| None | None | NO_ACTION |
|
|
157
|
+
| Moderate | None | MONITOR |
|
|
158
|
+
| Significant | None | INVESTIGATE |
|
|
159
|
+
| None | Moderate | INVESTIGATE |
|
|
160
|
+
| Moderate | Moderate | RETRAIN |
|
|
161
|
+
| Significant | Moderate | RETRAIN_URGENT |
|
|
162
|
+
| Any | Significant | RETRAIN_URGENT |
|
|
163
|
+
|
|
164
|
+
## Recommendation Output
|
|
165
|
+
- **Action**: NO_ACTION / MONITOR / INVESTIGATE / RETRAIN / RETRAIN_URGENT
|
|
166
|
+
- **Confidence**: HIGH / MEDIUM / LOW
|
|
167
|
+
- **Reasoning**: Explanation
|
|
168
|
+
|
|
169
|
+
## If RETRAIN Recommended
|
|
170
|
+
- Suggested training data window
|
|
171
|
+
- Features to focus on
|
|
172
|
+
- Hyperparameter adjustments to consider
|
|
173
|
+
|
|
174
|
+
## If INVESTIGATE Recommended
|
|
175
|
+
- Likely root causes to check
|
|
176
|
+
- Data quality checks to run
|
|
177
|
+
- External factors to consider
|
|
178
|
+
|
|
179
|
+
- stepId: generate-alerts
|
|
180
|
+
name: Generate Monitoring Alerts
|
|
181
|
+
type: conditional
|
|
182
|
+
config:
|
|
183
|
+
condition: "{{action}} != 'NO_ACTION'"
|
|
184
|
+
then:
|
|
185
|
+
- stepId: store-alert
|
|
186
|
+
type: tool
|
|
187
|
+
tool: memory_store
|
|
188
|
+
config:
|
|
189
|
+
namespace: ml-alerts
|
|
190
|
+
key: "{{model_name}}/{{timestamp}}"
|
|
191
|
+
value:
|
|
192
|
+
alert_id: "{{alert_id}}"
|
|
193
|
+
model_name: "{{model_name}}"
|
|
194
|
+
model_version: "{{production_version}}"
|
|
195
|
+
severity: "{{severity}}"
|
|
196
|
+
action_required: "{{action}}"
|
|
197
|
+
drift_summary:
|
|
198
|
+
data_drift_level: "{{data_drift_level}}"
|
|
199
|
+
prediction_drift_level: "{{prediction_drift_level}}"
|
|
200
|
+
top_drifted_features: "{{top_drifted_features}}"
|
|
201
|
+
performance_summary:
|
|
202
|
+
degradation_detected: "{{degradation_detected}}"
|
|
203
|
+
metrics_affected: "{{metrics_affected}}"
|
|
204
|
+
recommendation: "{{recommendation}}"
|
|
205
|
+
created_at: "{{timestamp}}"
|
|
206
|
+
acknowledged: false
|
|
207
|
+
|
|
208
|
+
- stepId: store-monitoring-report
|
|
209
|
+
name: Store Monitoring Report
|
|
210
|
+
type: tool
|
|
211
|
+
timeout: 10000
|
|
212
|
+
tool: memory_store
|
|
213
|
+
config:
|
|
214
|
+
namespace: ml-monitoring-reports
|
|
215
|
+
key: "{{model_name}}/{{report_date}}"
|
|
216
|
+
ttl: 2592000
|
|
217
|
+
value:
|
|
218
|
+
model_name: "{{model_name}}"
|
|
219
|
+
model_version: "{{production_version}}"
|
|
220
|
+
report_period:
|
|
221
|
+
start: "{{window_start}}"
|
|
222
|
+
end: "{{window_end}}"
|
|
223
|
+
data_drift_report: "{{data_drift_report}}"
|
|
224
|
+
prediction_drift_report: "{{prediction_drift_report}}"
|
|
225
|
+
performance_report: "{{performance_report}}"
|
|
226
|
+
action_taken: "{{action}}"
|
|
227
|
+
generated_at: "{{timestamp}}"
|
|
@@ -0,0 +1,232 @@
|
|
|
1
|
+
workflowId: ml-model-registry
|
|
2
|
+
name: Model Registry Management
|
|
3
|
+
description: Version, store, and manage ML models through their lifecycle
|
|
4
|
+
version: "1.0.0"
|
|
5
|
+
category: machine-learning
|
|
6
|
+
tags:
|
|
7
|
+
- ml
|
|
8
|
+
- model-registry
|
|
9
|
+
- versioning
|
|
10
|
+
- deployment
|
|
11
|
+
|
|
12
|
+
metadata:
|
|
13
|
+
requiredAbilities:
|
|
14
|
+
- machine-learning
|
|
15
|
+
estimatedDuration: 120
|
|
16
|
+
complexity: low
|
|
17
|
+
|
|
18
|
+
steps:
|
|
19
|
+
- stepId: validate-model
|
|
20
|
+
name: Validate Model Artifact
|
|
21
|
+
type: prompt
|
|
22
|
+
timeout: 60000
|
|
23
|
+
config:
|
|
24
|
+
agent: ml-engineer
|
|
25
|
+
task: |
|
|
26
|
+
Validate the model artifact before registration.
|
|
27
|
+
|
|
28
|
+
## Validation Checks
|
|
29
|
+
1. **Artifact Integrity**:
|
|
30
|
+
- Model file exists and is readable
|
|
31
|
+
- Checksum matches expected value
|
|
32
|
+
|
|
33
|
+
2. **Schema Validation**:
|
|
34
|
+
- Input schema is defined
|
|
35
|
+
- Output schema is defined
|
|
36
|
+
- Feature names match training data
|
|
37
|
+
|
|
38
|
+
3. **Metadata Completeness**:
|
|
39
|
+
- Training dataset version recorded
|
|
40
|
+
- Hyperparameters recorded
|
|
41
|
+
- Evaluation metrics recorded
|
|
42
|
+
- Experiment ID linked
|
|
43
|
+
|
|
44
|
+
4. **Dependency Check**:
|
|
45
|
+
- Required libraries documented
|
|
46
|
+
- Version compatibility verified
|
|
47
|
+
|
|
48
|
+
Return validation status: PASS or FAIL with details.
|
|
49
|
+
|
|
50
|
+
- stepId: register-model
|
|
51
|
+
name: Register Model Version
|
|
52
|
+
type: tool
|
|
53
|
+
timeout: 10000
|
|
54
|
+
tool: memory_store
|
|
55
|
+
config:
|
|
56
|
+
namespace: ml-model-registry
|
|
57
|
+
key: "{{model_name}}/versions/{{version}}"
|
|
58
|
+
value:
|
|
59
|
+
model_name: "{{model_name}}"
|
|
60
|
+
version: "{{version}}"
|
|
61
|
+
description: "{{description}}"
|
|
62
|
+
|
|
63
|
+
# Lineage
|
|
64
|
+
experiment_id: "{{experiment_id}}"
|
|
65
|
+
training_dataset: "{{dataset_version}}"
|
|
66
|
+
parent_model: "{{parent_version}}"
|
|
67
|
+
|
|
68
|
+
# Artifact
|
|
69
|
+
artifact_path: "{{artifact_path}}"
|
|
70
|
+
artifact_checksum: "{{checksum}}"
|
|
71
|
+
artifact_size_bytes: "{{size}}"
|
|
72
|
+
|
|
73
|
+
# Schemas
|
|
74
|
+
input_schema: "{{input_schema}}"
|
|
75
|
+
output_schema: "{{output_schema}}"
|
|
76
|
+
feature_names: "{{feature_names}}"
|
|
77
|
+
|
|
78
|
+
# Performance
|
|
79
|
+
metrics: "{{metrics}}"
|
|
80
|
+
evaluation_id: "{{evaluation_id}}"
|
|
81
|
+
|
|
82
|
+
# Dependencies
|
|
83
|
+
framework: "{{framework}}"
|
|
84
|
+
framework_version: "{{framework_version}}"
|
|
85
|
+
dependencies: "{{dependencies}}"
|
|
86
|
+
|
|
87
|
+
# Lifecycle
|
|
88
|
+
status: "staged"
|
|
89
|
+
created_at: "{{timestamp}}"
|
|
90
|
+
created_by: "{{user}}"
|
|
91
|
+
|
|
92
|
+
# Tags
|
|
93
|
+
tags: "{{tags}}"
|
|
94
|
+
|
|
95
|
+
- stepId: update-model-index
|
|
96
|
+
name: Update Model Index
|
|
97
|
+
type: tool
|
|
98
|
+
timeout: 10000
|
|
99
|
+
tool: memory_store
|
|
100
|
+
config:
|
|
101
|
+
namespace: ml-model-registry
|
|
102
|
+
key: "{{model_name}}/index"
|
|
103
|
+
value:
|
|
104
|
+
model_name: "{{model_name}}"
|
|
105
|
+
latest_version: "{{version}}"
|
|
106
|
+
production_version: "{{production_version}}"
|
|
107
|
+
all_versions: "{{version_list}}"
|
|
108
|
+
updated_at: "{{timestamp}}"
|
|
109
|
+
|
|
110
|
+
- stepId: compare-versions
|
|
111
|
+
name: Compare Model Versions
|
|
112
|
+
type: prompt
|
|
113
|
+
timeout: 120000
|
|
114
|
+
config:
|
|
115
|
+
agent: data-scientist
|
|
116
|
+
task: |
|
|
117
|
+
Compare the new model version against the current production version.
|
|
118
|
+
|
|
119
|
+
## Comparison Dimensions
|
|
120
|
+
|
|
121
|
+
1. **Performance Delta**:
|
|
122
|
+
- Metric-by-metric comparison
|
|
123
|
+
- Statistical significance of differences
|
|
124
|
+
- Confidence intervals
|
|
125
|
+
|
|
126
|
+
2. **Schema Compatibility**:
|
|
127
|
+
- Input schema changes (breaking/non-breaking)
|
|
128
|
+
- Output schema changes
|
|
129
|
+
- Feature additions/removals
|
|
130
|
+
|
|
131
|
+
3. **Resource Changes**:
|
|
132
|
+
- Model size difference
|
|
133
|
+
- Inference latency change
|
|
134
|
+
- Memory requirement change
|
|
135
|
+
|
|
136
|
+
4. **Risk Assessment**:
|
|
137
|
+
- Breaking changes identified
|
|
138
|
+
- Rollback complexity
|
|
139
|
+
- Required client updates
|
|
140
|
+
|
|
141
|
+
## Recommendation
|
|
142
|
+
- **PROMOTE**: Ready for production rollout
|
|
143
|
+
- **CANARY**: Recommend gradual rollout with monitoring
|
|
144
|
+
- **HOLD**: Needs more evaluation
|
|
145
|
+
- **REJECT**: Does not meet requirements
|
|
146
|
+
|
|
147
|
+
- stepId: handle-promote
|
|
148
|
+
name: Promote to Production
|
|
149
|
+
type: conditional
|
|
150
|
+
config:
|
|
151
|
+
condition: "{{recommendation}} == 'PROMOTE'"
|
|
152
|
+
then:
|
|
153
|
+
- stepId: update-production-status
|
|
154
|
+
type: tool
|
|
155
|
+
tool: memory_store
|
|
156
|
+
config:
|
|
157
|
+
namespace: ml-model-registry
|
|
158
|
+
key: "{{model_name}}/versions/{{version}}"
|
|
159
|
+
merge: true
|
|
160
|
+
value:
|
|
161
|
+
status: "production"
|
|
162
|
+
promoted_at: "{{timestamp}}"
|
|
163
|
+
promoted_by: "{{user}}"
|
|
164
|
+
|
|
165
|
+
- stepId: handle-canary
|
|
166
|
+
name: Set Canary Status
|
|
167
|
+
type: conditional
|
|
168
|
+
config:
|
|
169
|
+
condition: "{{recommendation}} == 'CANARY'"
|
|
170
|
+
then:
|
|
171
|
+
- stepId: update-canary-status
|
|
172
|
+
type: tool
|
|
173
|
+
tool: memory_store
|
|
174
|
+
config:
|
|
175
|
+
namespace: ml-model-registry
|
|
176
|
+
key: "{{model_name}}/versions/{{version}}"
|
|
177
|
+
merge: true
|
|
178
|
+
value:
|
|
179
|
+
status: "canary"
|
|
180
|
+
canary_started_at: "{{timestamp}}"
|
|
181
|
+
canary_traffic_percent: 10
|
|
182
|
+
|
|
183
|
+
- stepId: notify-promotion
|
|
184
|
+
name: Generate Promotion Notification
|
|
185
|
+
type: prompt
|
|
186
|
+
timeout: 60000
|
|
187
|
+
config:
|
|
188
|
+
agent: writer
|
|
189
|
+
task: |
|
|
190
|
+
Create a model promotion notification for stakeholders.
|
|
191
|
+
|
|
192
|
+
## Notification Content
|
|
193
|
+
|
|
194
|
+
**Subject**: Model Promotion: {{model_name}} v{{version}}
|
|
195
|
+
|
|
196
|
+
**Summary**:
|
|
197
|
+
- Previous production version: {{old_version}}
|
|
198
|
+
- New production version: {{version}}
|
|
199
|
+
- Promotion type: {{promotion_type}} (immediate/canary)
|
|
200
|
+
|
|
201
|
+
**Key Changes**:
|
|
202
|
+
- Performance improvement: {{performance_delta}}
|
|
203
|
+
- Notable changes: {{changes}}
|
|
204
|
+
|
|
205
|
+
**Rollback Plan**:
|
|
206
|
+
- Rollback command/procedure
|
|
207
|
+
- Rollback criteria
|
|
208
|
+
- On-call contact
|
|
209
|
+
|
|
210
|
+
**Timeline**:
|
|
211
|
+
- Promotion start: {{timestamp}}
|
|
212
|
+
- Full rollout (if canary): {{full_rollout_date}}
|
|
213
|
+
- Monitoring period: {{monitoring_period}}
|
|
214
|
+
|
|
215
|
+
- stepId: store-promotion-event
|
|
216
|
+
name: Log Promotion Event
|
|
217
|
+
type: tool
|
|
218
|
+
timeout: 10000
|
|
219
|
+
tool: memory_store
|
|
220
|
+
config:
|
|
221
|
+
namespace: ml-model-events
|
|
222
|
+
key: "{{model_name}}/promotions/{{timestamp}}"
|
|
223
|
+
ttl: 31536000
|
|
224
|
+
value:
|
|
225
|
+
event_type: "promotion"
|
|
226
|
+
model_name: "{{model_name}}"
|
|
227
|
+
from_version: "{{old_production_version}}"
|
|
228
|
+
to_version: "{{version}}"
|
|
229
|
+
promotion_type: "{{promotion_type}}"
|
|
230
|
+
initiated_by: "{{user}}"
|
|
231
|
+
timestamp: "{{timestamp}}"
|
|
232
|
+
notification_sent: true
|