@defai.digital/cli 13.4.3 → 13.4.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91)
  1. package/bundled/agents/aerospace-scientist.json +114 -0
  2. package/bundled/agents/architecture.json +96 -0
  3. package/bundled/agents/backend.json +125 -0
  4. package/bundled/agents/blockchain-developer.json +114 -0
  5. package/bundled/agents/ceo.json +113 -0
  6. package/bundled/agents/creative-marketer.json +114 -0
  7. package/bundled/agents/cto.json +96 -0
  8. package/bundled/agents/data-scientist.json +96 -0
  9. package/bundled/agents/devops.json +98 -0
  10. package/bundled/agents/frontend.json +118 -0
  11. package/bundled/agents/fullstack.json +99 -0
  12. package/bundled/agents/ml-engineer.json +114 -0
  13. package/bundled/agents/mlops-engineer.json +102 -0
  14. package/bundled/agents/mobile.json +96 -0
  15. package/bundled/agents/product.json +96 -0
  16. package/bundled/agents/quality.json +95 -0
  17. package/bundled/agents/quantum-engineer.json +114 -0
  18. package/bundled/agents/researcher.json +123 -0
  19. package/bundled/agents/reviewer.json +107 -0
  20. package/bundled/agents/security.json +79 -0
  21. package/bundled/agents/standard.json +82 -0
  22. package/bundled/agents/writer.json +78 -0
  23. package/bundled/templates/monorepo/contract-index.ts.hbs +7 -0
  24. package/bundled/templates/monorepo/contract-test.ts.hbs +130 -0
  25. package/bundled/templates/monorepo/contracts-package.json.hbs +29 -0
  26. package/bundled/templates/monorepo/domain-index.ts.hbs +115 -0
  27. package/bundled/templates/monorepo/domain-package.json.hbs +27 -0
  28. package/bundled/templates/monorepo/gitignore.hbs +32 -0
  29. package/bundled/templates/monorepo/invariants.md.hbs +43 -0
  30. package/bundled/templates/monorepo/package.json.hbs +28 -0
  31. package/bundled/templates/monorepo/pnpm-workspace.yaml.hbs +5 -0
  32. package/bundled/templates/monorepo/schema.ts.hbs +82 -0
  33. package/bundled/templates/monorepo/template.json +106 -0
  34. package/bundled/templates/monorepo/tsconfig.json.hbs +22 -0
  35. package/bundled/templates/standalone/contract-index.ts.hbs +5 -0
  36. package/bundled/templates/standalone/contract-test.ts.hbs +95 -0
  37. package/bundled/templates/standalone/contracts-root-index.ts.hbs +7 -0
  38. package/bundled/templates/standalone/domain-index.ts.hbs +6 -0
  39. package/bundled/templates/standalone/domain-repository.ts.hbs +44 -0
  40. package/bundled/templates/standalone/domain-service.ts.hbs +102 -0
  41. package/bundled/templates/standalone/gitignore.hbs +27 -0
  42. package/bundled/templates/standalone/invariants.md.hbs +35 -0
  43. package/bundled/templates/standalone/package.json.hbs +41 -0
  44. package/bundled/templates/standalone/schema.ts.hbs +61 -0
  45. package/bundled/templates/standalone/src-index.ts.hbs +11 -0
  46. package/bundled/templates/standalone/template.json +91 -0
  47. package/bundled/templates/standalone/tsconfig.json.hbs +20 -0
  48. package/bundled/templates/standalone/vitest.config.ts.hbs +8 -0
  49. package/bundled/workflows/adversarial-debate.yaml +222 -0
  50. package/bundled/workflows/analyst.yaml +115 -0
  51. package/bundled/workflows/assistant.yaml +74 -0
  52. package/bundled/workflows/code-review-discussion.yaml +166 -0
  53. package/bundled/workflows/code-reviewer.yaml +94 -0
  54. package/bundled/workflows/contract-first-project.yaml +356 -0
  55. package/bundled/workflows/debugger.yaml +107 -0
  56. package/bundled/workflows/designer.yaml +113 -0
  57. package/bundled/workflows/developer.yaml +105 -0
  58. package/bundled/workflows/discuss-step-examples.yaml +153 -0
  59. package/bundled/workflows/infrastructure-automation.yaml +283 -0
  60. package/bundled/workflows/ml-ab-testing.yaml +311 -0
  61. package/bundled/workflows/ml-experiment-tracker.yaml +150 -0
  62. package/bundled/workflows/ml-feature-engineering.yaml +242 -0
  63. package/bundled/workflows/ml-model-evaluation.yaml +234 -0
  64. package/bundled/workflows/ml-model-monitoring.yaml +227 -0
  65. package/bundled/workflows/ml-model-registry.yaml +232 -0
  66. package/bundled/workflows/mlops-deployment.yaml +267 -0
  67. package/bundled/workflows/mobile-development.yaml +312 -0
  68. package/bundled/workflows/multi-model-discussion.yaml +243 -0
  69. package/bundled/workflows/product-discovery.yaml +295 -0
  70. package/bundled/workflows/qa-specialist.yaml +116 -0
  71. package/bundled/workflows/refactoring.yaml +105 -0
  72. package/bundled/workflows/security-audit.yaml +135 -0
  73. package/bundled/workflows/std/analysis.yaml +190 -0
  74. package/bundled/workflows/std/code-review.yaml +117 -0
  75. package/bundled/workflows/std/debugging.yaml +155 -0
  76. package/bundled/workflows/std/documentation.yaml +180 -0
  77. package/bundled/workflows/std/implementation.yaml +197 -0
  78. package/bundled/workflows/std/refactoring.yaml +180 -0
  79. package/bundled/workflows/std/testing.yaml +200 -0
  80. package/bundled/workflows/strategic-planning.yaml +235 -0
  81. package/bundled/workflows/technology-research.yaml +239 -0
  82. package/dist/commands/agent.d.ts.map +1 -1
  83. package/dist/commands/agent.js +8 -2
  84. package/dist/commands/agent.js.map +1 -1
  85. package/dist/commands/scaffold.d.ts.map +1 -1
  86. package/dist/commands/scaffold.js +6 -3
  87. package/dist/commands/scaffold.js.map +1 -1
  88. package/dist/web/api.d.ts.map +1 -1
  89. package/dist/web/api.js +13 -6
  90. package/dist/web/api.js.map +1 -1
  91. package/package.json +23 -22
@@ -0,0 +1,234 @@
1
+ workflowId: ml-model-evaluation
2
+ name: Model Evaluation Pipeline
3
+ description: Comprehensive model evaluation with performance, fairness, robustness, and production-readiness checks
4
+ version: "1.0.0"
5
+ category: machine-learning
6
+ tags:
7
+ - ml
8
+ - evaluation
9
+ - fairness
10
+ - robustness
11
+ - production
12
+
13
+ metadata:
14
+ requiredAbilities:
15
+ - machine-learning
16
+ - statistical-analysis
17
+ - data-analysis
18
+ estimatedDuration: 600
19
+ complexity: high
20
+
21
+ steps:
22
+ - stepId: performance-metrics
23
+ name: Calculate Performance Metrics
24
+ type: prompt
25
+ timeout: 180000
26
+ config:
27
+ agent: data-scientist
28
+ task: |
29
+ Evaluate model performance on the test dataset.
30
+
31
+ ## Classification Metrics (if applicable)
32
+ - Accuracy
33
+ - Precision (macro, micro, weighted)
34
+ - Recall (macro, micro, weighted)
35
+ - F1 Score (macro, micro, weighted)
36
+ - AUC-ROC (one-vs-rest for multiclass)
37
+ - AUC-PR (Precision-Recall curve)
38
+ - Log Loss
39
+ - Confusion Matrix
40
+ - Per-class metrics breakdown
41
+
42
+ ## Regression Metrics (if applicable)
43
+ - Mean Squared Error (MSE)
44
+ - Root Mean Squared Error (RMSE)
45
+ - Mean Absolute Error (MAE)
46
+ - R-squared (Coefficient of Determination)
47
+ - Mean Absolute Percentage Error (MAPE)
48
+ - Residual analysis
49
+
50
+ ## Confidence Intervals
51
+ - Bootstrap 95% CI for primary metric
52
+ - Report standard deviation across folds
53
+
54
+ Provide structured output with all applicable metrics.
55
+
56
+ - stepId: fairness-audit
57
+ name: Fairness & Bias Audit
58
+ type: prompt
59
+ timeout: 180000
60
+ config:
61
+ agent: data-scientist
62
+ task: |
63
+ Conduct a comprehensive fairness and bias audit.
64
+
65
+ ## Demographic Analysis
66
+ For each protected attribute (gender, age, race, etc.):
67
+
68
+ 1. **Demographic Parity**:
69
+ - Selection rate across groups
70
+ - Disparate impact ratio (should be >= 0.8, per the four-fifths rule)
71
+
72
+ 2. **Equal Opportunity**:
73
+ - True positive rate across groups
74
+ - False negative rate disparity
75
+
76
+ 3. **Equalized Odds**:
77
+ - TPR and FPR across groups
78
+ - Maximum disparity
79
+
80
+ 4. **Calibration**:
81
+ - Predicted probability vs actual outcome by group
82
+
83
+ ## Bias Detection
84
+ - Identify any systematic prediction bias
85
+ - Check for proxy discrimination
86
+ - Analyze error distribution across groups
87
+
88
+ ## Mitigation Recommendations
89
+ If bias is detected:
90
+ - Recommend specific mitigation strategies
91
+ - Estimate impact of mitigation on overall performance
92
+
93
+ ## Compliance Check
94
+ - Flag any potential regulatory concerns (GDPR, CCPA, ECOA)
95
+
96
+ - stepId: robustness-testing
97
+ name: Robustness & Edge Case Testing
98
+ type: prompt
99
+ timeout: 180000
100
+ config:
101
+ agent: ml-engineer
102
+ task: |
103
+ Test model robustness and behavior on edge cases.
104
+
105
+ ## Input Perturbation Testing
106
+ 1. **Noise Sensitivity**:
107
+ - Add Gaussian noise to numerical features
108
+ - Measure performance degradation curve
109
+
110
+ 2. **Missing Values**:
111
+ - Simulate missing data patterns
112
+ - Test model behavior with incomplete inputs
113
+
114
+ 3. **Out-of-Range Values**:
115
+ - Test with extreme values
116
+ - Check for catastrophic failures
117
+
118
+ ## Distribution Shift Testing
119
+ 1. **Covariate Shift**:
120
+ - Test on data with shifted feature distributions
121
+ - Measure performance drop
122
+
123
+ 2. **Label Shift**:
124
+ - Test with different class proportions
125
+ - Check calibration under shift
126
+
127
+ ## Edge Cases
128
+ 1. Boundary conditions
129
+ 2. Rare class handling
130
+ 3. Adversarial-like examples (if applicable)
131
+
132
+ ## Out-of-Distribution Detection
133
+ - Can the model identify when inputs are OOD?
134
+ - Confidence calibration on OOD samples
135
+
136
+ - stepId: latency-profiling
137
+ name: Latency & Resource Profiling
138
+ type: prompt
139
+ timeout: 120000
140
+ config:
141
+ agent: ml-engineer
142
+ task: |
143
+ Profile model for production deployment.
144
+
145
+ ## Inference Latency
146
+ - Single sample latency (p50, p95, p99)
147
+ - Batch inference latency by batch size
148
+ - Cold start latency
149
+ - Warm cache latency
150
+
151
+ ## Resource Requirements
152
+ - Memory footprint (model size)
153
+ - Peak memory during inference
154
+ - CPU utilization pattern
155
+ - GPU utilization (if applicable)
156
+ - GPU memory requirements
157
+
158
+ ## Scalability Analysis
159
+ - Throughput (requests/second)
160
+ - Scaling behavior under load
161
+ - Recommended instance type
162
+ - Cost per 1M predictions
163
+
164
+ ## Optimization Opportunities
165
+ - Quantization potential
166
+ - Pruning opportunities
167
+ - Distillation candidates
168
+ - Caching strategies
169
+
170
+ - stepId: evaluation-report
171
+ name: Generate Evaluation Report
172
+ type: prompt
173
+ timeout: 120000
174
+ config:
175
+ agent: data-scientist
176
+ task: |
177
+ Create a comprehensive model evaluation report.
178
+
179
+ ## Executive Summary
180
+ - Model name and version
181
+ - One-paragraph assessment
182
+ - Go/No-Go recommendation with confidence level
183
+
184
+ ## Performance Summary
185
+ - Primary metric with confidence interval
186
+ - Comparison to baseline
187
+ - Key strengths and weaknesses
188
+
189
+ ## Fairness Summary
190
+ - Overall fairness assessment (Pass/Fail/Warning)
191
+ - Groups with highest disparity
192
+ - Required mitigations before deployment
193
+
194
+ ## Robustness Summary
195
+ - Robustness score (1-10)
196
+ - Critical failure modes identified
197
+ - Recommended guardrails
198
+
199
+ ## Production Readiness
200
+ - Latency requirements: Met/Not Met
201
+ - Resource requirements: Acceptable/High/Excessive
202
+ - Scaling assessment
203
+
204
+ ## Recommendations
205
+ 1. **If Go**: Deployment checklist
206
+ 2. **If No-Go**: Required improvements ranked by priority
207
+ 3. **Monitoring requirements** for production
208
+
209
+ ## Appendix
210
+ - Full metrics tables
211
+ - Visualization references
212
+ - Test dataset description
213
+
214
+ - stepId: store-evaluation
215
+ name: Store Evaluation Results
216
+ type: tool
217
+ timeout: 10000
218
+ tool: memory_store
219
+ config:
220
+ namespace: ml-evaluations
221
+ key: "{{model_name}}/{{model_version}}/evaluation"
222
+ ttl: 15552000
223
+ value:
224
+ model_name: "{{model_name}}"
225
+ model_version: "{{model_version}}"
226
+ evaluation_id: "{{evaluation_id}}"
227
+ recommendation: "{{go_no_go}}"
228
+ performance_metrics: "{{performance_metrics}}"
229
+ fairness_results: "{{fairness_results}}"
230
+ robustness_score: "{{robustness_score}}"
231
+ latency_profile: "{{latency_profile}}"
232
+ full_report: "{{evaluation_report}}"
233
+ evaluated_at: "{{timestamp}}"
234
+ evaluated_by: "{{user}}"
@@ -0,0 +1,227 @@
1
+ workflowId: ml-model-monitoring
2
+ name: Production Model Monitoring
3
+ description: Detect drift, degradation, and anomalies in production models
4
+ version: "1.0.0"
5
+ category: machine-learning
6
+ tags:
7
+ - ml
8
+ - monitoring
9
+ - drift
10
+ - production
11
+
12
+ metadata:
13
+ requiredAbilities:
14
+ - machine-learning
15
+ - statistical-analysis
16
+ - data-analysis
17
+ estimatedDuration: 300
18
+ complexity: high
19
+ schedule: "0 */6 * * *" # Every 6 hours
20
+
21
+ steps:
22
+ - stepId: fetch-baseline
23
+ name: Fetch Baseline Statistics
24
+ type: tool
25
+ timeout: 10000
26
+ tool: memory_retrieve
27
+ config:
28
+ namespace: ml-model-registry
29
+ key: "{{model_name}}/versions/{{production_version}}/baseline_stats"
30
+
31
+ - stepId: fetch-production-data
32
+ name: Fetch Recent Production Data
33
+ type: tool
34
+ timeout: 30000
35
+ tool: memory_search
36
+ config:
37
+ namespace: ml-production-predictions
38
+ query: "model:{{model_name}} AND timestamp:[{{window_start}} TO {{window_end}}]"
39
+ limit: 10000
40
+
41
+ - stepId: data-drift-detection
42
+ name: Detect Data Drift
43
+ type: prompt
44
+ timeout: 180000
45
+ config:
46
+ agent: data-scientist
47
+ task: |
48
+ Analyze production data for drift compared to training data.
49
+
50
+ ## Feature Distribution Analysis
51
+ For each feature:
52
+
53
+ 1. **Statistical Tests**:
54
+ - Kolmogorov-Smirnov test (continuous features)
55
+ - Chi-squared test (categorical features)
56
+ - Population Stability Index (PSI)
57
+
58
+ 2. **Drift Severity**:
59
+ - No Drift: PSI < 0.1
60
+ - Moderate Drift: 0.1 <= PSI < 0.25
61
+ - Significant Drift: PSI >= 0.25
62
+
63
+ 3. **Per-Feature Report**:
64
+ - Feature name
65
+ - Drift score
66
+ - Baseline mean/distribution
67
+ - Current mean/distribution
68
+ - Visual: distribution comparison
69
+
70
+ ## Overall Assessment
71
+ - Total features with significant drift
72
+ - Most drifted features (top 5)
73
+ - Drift trend (increasing/stable/decreasing)
74
+
75
+ ## Alert Level
76
+ - GREEN: No significant drift
77
+ - YELLOW: Moderate drift, monitor closely
78
+ - RED: Significant drift, action required
79
+
80
+ - stepId: prediction-drift
81
+ name: Detect Prediction Drift
82
+ type: prompt
83
+ timeout: 120000
84
+ config:
85
+ agent: data-scientist
86
+ task: |
87
+ Analyze prediction distribution for drift.
88
+
89
+ ## Prediction Distribution Analysis
90
+
91
+ 1. **Output Distribution**:
92
+ - Compare current vs baseline prediction distribution
93
+ - For classification: class probability distributions
94
+ - For regression: prediction value distribution
95
+
96
+ 2. **Confidence Scores**:
97
+ - Average confidence: baseline vs current
98
+ - Confidence distribution shift
99
+ - Low-confidence prediction rate
100
+
101
+ 3. **Prediction Patterns**:
102
+ - Class balance (classification)
103
+ - Prediction range (regression)
104
+ - Unusual prediction clusters
105
+
106
+ ## Anomaly Detection
107
+ - Identify sudden prediction shifts
108
+ - Detect prediction patterns not seen in training
109
+ - Flag potential model failures
110
+
111
+ - stepId: performance-degradation
112
+ name: Check Performance Degradation
113
+ type: prompt
114
+ timeout: 120000
115
+ config:
116
+ agent: ml-engineer
117
+ task: |
118
+ Compare current model performance against baseline.
119
+
120
+ ## Metric Comparison (if ground truth available)
121
+
122
+ | Metric | Baseline | Current | Delta | Status |
123
+ |--------|----------|---------|-------|--------|
124
+ | Accuracy | X | Y | Z% | OK/WARN/ALERT |
125
+ | Precision | X | Y | Z% | OK/WARN/ALERT |
126
+ | Recall | X | Y | Z% | OK/WARN/ALERT |
127
+
128
+ ## Thresholds
129
+ - OK: Within 5% of baseline
130
+ - WARN: 5-10% degradation
131
+ - ALERT: >10% degradation
132
+
133
+ ## Latency Analysis
134
+ - P50 latency: baseline vs current
135
+ - P99 latency: baseline vs current
136
+ - Timeout rate
137
+
138
+ ## Error Analysis
139
+ - Error rate trend
140
+ - Error type distribution
141
+ - New error patterns
142
+
143
+ - stepId: retraining-decision
144
+ name: Retraining Recommendation
145
+ type: prompt
146
+ timeout: 60000
147
+ config:
148
+ agent: ml-engineer
149
+ task: |
150
+ Based on drift and degradation analysis, recommend action.
151
+
152
+ ## Decision Matrix
153
+
154
+ | Data Drift | Performance Drop | Recommendation |
155
+ |------------|------------------|----------------|
156
+ | None | None | NO_ACTION |
157
+ | Moderate | None | MONITOR |
158
+ | Significant | None | INVESTIGATE |
159
+ | None | Moderate | INVESTIGATE |
160
+ | Moderate | Moderate | RETRAIN |
161
+ | Significant | Moderate | RETRAIN_URGENT |
162
+ | Any | Significant | RETRAIN_URGENT |
163
+
164
+ ## Recommendation Output
165
+ - **Action**: NO_ACTION / MONITOR / INVESTIGATE / RETRAIN / RETRAIN_URGENT
166
+ - **Confidence**: HIGH / MEDIUM / LOW
167
+ - **Reasoning**: Explanation
168
+
169
+ ## If RETRAIN Recommended
170
+ - Suggested training data window
171
+ - Features to focus on
172
+ - Hyperparameter adjustments to consider
173
+
174
+ ## If INVESTIGATE Recommended
175
+ - Likely root causes to check
176
+ - Data quality checks to run
177
+ - External factors to consider
178
+
179
+ - stepId: generate-alerts
180
+ name: Generate Monitoring Alerts
181
+ type: conditional
182
+ config:
183
+ condition: "{{action}} != 'NO_ACTION'"
184
+ then:
185
+ - stepId: store-alert
186
+ type: tool
187
+ tool: memory_store
188
+ config:
189
+ namespace: ml-alerts
190
+ key: "{{model_name}}/{{timestamp}}"
191
+ value:
192
+ alert_id: "{{alert_id}}"
193
+ model_name: "{{model_name}}"
194
+ model_version: "{{production_version}}"
195
+ severity: "{{severity}}"
196
+ action_required: "{{action}}"
197
+ drift_summary:
198
+ data_drift_level: "{{data_drift_level}}"
199
+ prediction_drift_level: "{{prediction_drift_level}}"
200
+ top_drifted_features: "{{top_drifted_features}}"
201
+ performance_summary:
202
+ degradation_detected: "{{degradation_detected}}"
203
+ metrics_affected: "{{metrics_affected}}"
204
+ recommendation: "{{recommendation}}"
205
+ created_at: "{{timestamp}}"
206
+ acknowledged: false
207
+
208
+ - stepId: store-monitoring-report
209
+ name: Store Monitoring Report
210
+ type: tool
211
+ timeout: 10000
212
+ tool: memory_store
213
+ config:
214
+ namespace: ml-monitoring-reports
215
+ key: "{{model_name}}/{{report_date}}"
216
+ ttl: 2592000
217
+ value:
218
+ model_name: "{{model_name}}"
219
+ model_version: "{{production_version}}"
220
+ report_period:
221
+ start: "{{window_start}}"
222
+ end: "{{window_end}}"
223
+ data_drift_report: "{{data_drift_report}}"
224
+ prediction_drift_report: "{{prediction_drift_report}}"
225
+ performance_report: "{{performance_report}}"
226
+ action_taken: "{{action}}"
227
+ generated_at: "{{timestamp}}"
@@ -0,0 +1,232 @@
1
+ workflowId: ml-model-registry
2
+ name: Model Registry Management
3
+ description: Version, store, and manage ML models through their lifecycle
4
+ version: "1.0.0"
5
+ category: machine-learning
6
+ tags:
7
+ - ml
8
+ - model-registry
9
+ - versioning
10
+ - deployment
11
+
12
+ metadata:
13
+ requiredAbilities:
14
+ - machine-learning
15
+ estimatedDuration: 120
16
+ complexity: low
17
+
18
+ steps:
19
+ - stepId: validate-model
20
+ name: Validate Model Artifact
21
+ type: prompt
22
+ timeout: 60000
23
+ config:
24
+ agent: ml-engineer
25
+ task: |
26
+ Validate the model artifact before registration.
27
+
28
+ ## Validation Checks
29
+ 1. **Artifact Integrity**:
30
+ - Model file exists and is readable
31
+ - Checksum matches expected value
32
+
33
+ 2. **Schema Validation**:
34
+ - Input schema is defined
35
+ - Output schema is defined
36
+ - Feature names match training data
37
+
38
+ 3. **Metadata Completeness**:
39
+ - Training dataset version recorded
40
+ - Hyperparameters recorded
41
+ - Evaluation metrics recorded
42
+ - Experiment ID linked
43
+
44
+ 4. **Dependency Check**:
45
+ - Required libraries documented
46
+ - Version compatibility verified
47
+
48
+ Return validation status: PASS or FAIL with details.
49
+
50
+ - stepId: register-model
51
+ name: Register Model Version
52
+ type: tool
53
+ timeout: 10000
54
+ tool: memory_store
55
+ config:
56
+ namespace: ml-model-registry
57
+ key: "{{model_name}}/versions/{{version}}"
58
+ value:
59
+ model_name: "{{model_name}}"
60
+ version: "{{version}}"
61
+ description: "{{description}}"
62
+
63
+ # Lineage
64
+ experiment_id: "{{experiment_id}}"
65
+ training_dataset: "{{dataset_version}}"
66
+ parent_model: "{{parent_version}}"
67
+
68
+ # Artifact
69
+ artifact_path: "{{artifact_path}}"
70
+ artifact_checksum: "{{checksum}}"
71
+ artifact_size_bytes: "{{size}}"
72
+
73
+ # Schemas
74
+ input_schema: "{{input_schema}}"
75
+ output_schema: "{{output_schema}}"
76
+ feature_names: "{{feature_names}}"
77
+
78
+ # Performance
79
+ metrics: "{{metrics}}"
80
+ evaluation_id: "{{evaluation_id}}"
81
+
82
+ # Dependencies
83
+ framework: "{{framework}}"
84
+ framework_version: "{{framework_version}}"
85
+ dependencies: "{{dependencies}}"
86
+
87
+ # Lifecycle
88
+ status: "staged"
89
+ created_at: "{{timestamp}}"
90
+ created_by: "{{user}}"
91
+
92
+ # Tags
93
+ tags: "{{tags}}"
94
+
95
+ - stepId: update-model-index
96
+ name: Update Model Index
97
+ type: tool
98
+ timeout: 10000
99
+ tool: memory_store
100
+ config:
101
+ namespace: ml-model-registry
102
+ key: "{{model_name}}/index"
103
+ value:
104
+ model_name: "{{model_name}}"
105
+ latest_version: "{{version}}"
106
+ production_version: "{{production_version}}"
107
+ all_versions: "{{version_list}}"
108
+ updated_at: "{{timestamp}}"
109
+
110
+ - stepId: compare-versions
111
+ name: Compare Model Versions
112
+ type: prompt
113
+ timeout: 120000
114
+ config:
115
+ agent: data-scientist
116
+ task: |
117
+ Compare the new model version against the current production version.
118
+
119
+ ## Comparison Dimensions
120
+
121
+ 1. **Performance Delta**:
122
+ - Metric-by-metric comparison
123
+ - Statistical significance of differences
124
+ - Confidence intervals
125
+
126
+ 2. **Schema Compatibility**:
127
+ - Input schema changes (breaking/non-breaking)
128
+ - Output schema changes
129
+ - Feature additions/removals
130
+
131
+ 3. **Resource Changes**:
132
+ - Model size difference
133
+ - Inference latency change
134
+ - Memory requirement change
135
+
136
+ 4. **Risk Assessment**:
137
+ - Breaking changes identified
138
+ - Rollback complexity
139
+ - Required client updates
140
+
141
+ ## Recommendation
142
+ - **PROMOTE**: Ready for production rollout
143
+ - **CANARY**: Recommend gradual rollout with monitoring
144
+ - **HOLD**: Needs more evaluation
145
+ - **REJECT**: Does not meet requirements
146
+
147
+ - stepId: handle-promote
148
+ name: Promote to Production
149
+ type: conditional
150
+ config:
151
+ condition: "{{recommendation}} == 'PROMOTE'"
152
+ then:
153
+ - stepId: update-production-status
154
+ type: tool
155
+ tool: memory_store
156
+ config:
157
+ namespace: ml-model-registry
158
+ key: "{{model_name}}/versions/{{version}}"
159
+ merge: true
160
+ value:
161
+ status: "production"
162
+ promoted_at: "{{timestamp}}"
163
+ promoted_by: "{{user}}"
164
+
165
+ - stepId: handle-canary
166
+ name: Set Canary Status
167
+ type: conditional
168
+ config:
169
+ condition: "{{recommendation}} == 'CANARY'"
170
+ then:
171
+ - stepId: update-canary-status
172
+ type: tool
173
+ tool: memory_store
174
+ config:
175
+ namespace: ml-model-registry
176
+ key: "{{model_name}}/versions/{{version}}"
177
+ merge: true
178
+ value:
179
+ status: "canary"
180
+ canary_started_at: "{{timestamp}}"
181
+ canary_traffic_percent: 10
182
+
183
+ - stepId: notify-promotion
184
+ name: Generate Promotion Notification
185
+ type: prompt
186
+ timeout: 60000
187
+ config:
188
+ agent: writer
189
+ task: |
190
+ Create a model promotion notification for stakeholders.
191
+
192
+ ## Notification Content
193
+
194
+ **Subject**: Model Promotion: {{model_name}} v{{version}}
195
+
196
+ **Summary**:
197
+ - Previous production version: {{old_version}}
198
+ - New production version: {{version}}
199
+ - Promotion type: {{promotion_type}} (immediate/canary)
200
+
201
+ **Key Changes**:
202
+ - Performance improvement: {{performance_delta}}
203
+ - Notable changes: {{changes}}
204
+
205
+ **Rollback Plan**:
206
+ - Rollback command/procedure
207
+ - Rollback criteria
208
+ - On-call contact
209
+
210
+ **Timeline**:
211
+ - Promotion start: {{timestamp}}
212
+ - Full rollout (if canary): {{full_rollout_date}}
213
+ - Monitoring period: {{monitoring_period}}
214
+
215
+ - stepId: store-promotion-event
216
+ name: Log Promotion Event
217
+ type: tool
218
+ timeout: 10000
219
+ tool: memory_store
220
+ config:
221
+ namespace: ml-model-events
222
+ key: "{{model_name}}/promotions/{{timestamp}}"
223
+ ttl: 31536000
224
+ value:
225
+ event_type: "promotion"
226
+ model_name: "{{model_name}}"
227
+ from_version: "{{old_production_version}}"
228
+ to_version: "{{version}}"
229
+ promotion_type: "{{promotion_type}}"
230
+ initiated_by: "{{user}}"
231
+ timestamp: "{{timestamp}}"
232
+ notification_sent: true