@defai.digital/cli 13.4.4 → 13.4.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bundled/agents/architect.json +117 -0
- package/bundled/agents/auditor.json +114 -0
- package/bundled/agents/bug-hunter.json +128 -0
- package/bundled/agents/builder.json +128 -0
- package/bundled/agents/ceo.json +6 -1
- package/bundled/agents/executor.json +150 -0
- package/bundled/agents/fullstack.json +10 -2
- package/bundled/agents/operator.json +119 -0
- package/bundled/agents/researcher.json +42 -13
- package/bundled/agents/reviewer.json +90 -42
- package/bundled/templates/monorepo/contract-index.ts.hbs +7 -0
- package/bundled/templates/monorepo/contract-test.ts.hbs +130 -0
- package/bundled/templates/monorepo/contracts-package.json.hbs +29 -0
- package/bundled/templates/monorepo/domain-index.ts.hbs +115 -0
- package/bundled/templates/monorepo/domain-package.json.hbs +27 -0
- package/bundled/templates/monorepo/gitignore.hbs +32 -0
- package/bundled/templates/monorepo/invariants.md.hbs +43 -0
- package/bundled/templates/monorepo/package.json.hbs +28 -0
- package/bundled/templates/monorepo/pnpm-workspace.yaml.hbs +5 -0
- package/bundled/templates/monorepo/schema.ts.hbs +82 -0
- package/bundled/templates/monorepo/template.json +106 -0
- package/bundled/templates/monorepo/tsconfig.json.hbs +22 -0
- package/bundled/templates/standalone/contract-index.ts.hbs +5 -0
- package/bundled/templates/standalone/contract-test.ts.hbs +95 -0
- package/bundled/templates/standalone/contracts-root-index.ts.hbs +7 -0
- package/bundled/templates/standalone/domain-index.ts.hbs +6 -0
- package/bundled/templates/standalone/domain-repository.ts.hbs +44 -0
- package/bundled/templates/standalone/domain-service.ts.hbs +102 -0
- package/bundled/templates/standalone/gitignore.hbs +27 -0
- package/bundled/templates/standalone/invariants.md.hbs +35 -0
- package/bundled/templates/standalone/package.json.hbs +41 -0
- package/bundled/templates/standalone/schema.ts.hbs +61 -0
- package/bundled/templates/standalone/src-index.ts.hbs +11 -0
- package/bundled/templates/standalone/template.json +91 -0
- package/bundled/templates/standalone/tsconfig.json.hbs +20 -0
- package/bundled/templates/standalone/vitest.config.ts.hbs +8 -0
- package/bundled/workflows/adversarial-debate.yaml +222 -0
- package/bundled/workflows/analyst.yaml +115 -0
- package/bundled/workflows/assistant.yaml +74 -0
- package/bundled/workflows/code-review-discussion.yaml +166 -0
- package/bundled/workflows/code-reviewer.yaml +94 -0
- package/bundled/workflows/contract-first-project.yaml +356 -0
- package/bundled/workflows/debugger.yaml +107 -0
- package/bundled/workflows/designer.yaml +113 -0
- package/bundled/workflows/developer.yaml +105 -0
- package/bundled/workflows/discuss-step-examples.yaml +153 -0
- package/bundled/workflows/infrastructure-automation.yaml +283 -0
- package/bundled/workflows/ml-ab-testing.yaml +311 -0
- package/bundled/workflows/ml-experiment-tracker.yaml +150 -0
- package/bundled/workflows/ml-feature-engineering.yaml +242 -0
- package/bundled/workflows/ml-model-evaluation.yaml +234 -0
- package/bundled/workflows/ml-model-monitoring.yaml +227 -0
- package/bundled/workflows/ml-model-registry.yaml +232 -0
- package/bundled/workflows/mlops-deployment.yaml +267 -0
- package/bundled/workflows/mobile-development.yaml +312 -0
- package/bundled/workflows/multi-model-discussion.yaml +243 -0
- package/bundled/workflows/product-discovery.yaml +295 -0
- package/bundled/workflows/qa-specialist.yaml +116 -0
- package/bundled/workflows/refactoring.yaml +105 -0
- package/bundled/workflows/security-audit.yaml +135 -0
- package/bundled/workflows/std/analysis.yaml +190 -0
- package/bundled/workflows/std/code-review.yaml +117 -0
- package/bundled/workflows/std/debugging.yaml +155 -0
- package/bundled/workflows/std/documentation.yaml +180 -0
- package/bundled/workflows/std/implementation.yaml +197 -0
- package/bundled/workflows/std/refactoring.yaml +180 -0
- package/bundled/workflows/std/testing.yaml +200 -0
- package/bundled/workflows/strategic-planning.yaml +235 -0
- package/bundled/workflows/technology-research.yaml +239 -0
- package/dist/bootstrap.d.ts.map +1 -1
- package/dist/bootstrap.js +10 -6
- package/dist/bootstrap.js.map +1 -1
- package/dist/commands/discuss.d.ts.map +1 -1
- package/dist/commands/discuss.js +4 -1
- package/dist/commands/discuss.js.map +1 -1
- package/dist/commands/doctor.d.ts +1 -1
- package/dist/commands/doctor.js +3 -3
- package/dist/commands/doctor.js.map +1 -1
- package/dist/commands/init.d.ts.map +1 -1
- package/dist/commands/init.js +65 -5
- package/dist/commands/init.js.map +1 -1
- package/dist/commands/monitor.d.ts.map +1 -1
- package/dist/commands/monitor.js +29 -1
- package/dist/commands/monitor.js.map +1 -1
- package/dist/commands/scaffold.d.ts.map +1 -1
- package/dist/commands/scaffold.js +6 -3
- package/dist/commands/scaffold.js.map +1 -1
- package/dist/commands/setup.d.ts.map +1 -1
- package/dist/commands/setup.js +119 -3
- package/dist/commands/setup.js.map +1 -1
- package/dist/commands/status.d.ts +10 -0
- package/dist/commands/status.d.ts.map +1 -1
- package/dist/commands/status.js +151 -49
- package/dist/commands/status.js.map +1 -1
- package/dist/commands/update.d.ts.map +1 -1
- package/dist/commands/update.js +1 -43
- package/dist/commands/update.js.map +1 -1
- package/dist/web/api.d.ts +18 -0
- package/dist/web/api.d.ts.map +1 -1
- package/dist/web/api.js +480 -39
- package/dist/web/api.js.map +1 -1
- package/dist/web/dashboard.d.ts.map +1 -1
- package/dist/web/dashboard.js +1449 -132
- package/dist/web/dashboard.js.map +1 -1
- package/package.json +21 -21
|
@@ -0,0 +1,311 @@
|
|
|
1
|
+
workflowId: ml-ab-testing
|
|
2
|
+
name: Model A/B Testing
|
|
3
|
+
description: Statistical comparison of model variants in production
|
|
4
|
+
version: "1.0.0"
|
|
5
|
+
category: machine-learning
|
|
6
|
+
tags:
|
|
7
|
+
- ml
|
|
8
|
+
- ab-testing
|
|
9
|
+
- experimentation
|
|
10
|
+
- statistics
|
|
11
|
+
|
|
12
|
+
metadata:
|
|
13
|
+
requiredAbilities:
|
|
14
|
+
- machine-learning
|
|
15
|
+
- statistical-analysis
|
|
16
|
+
- data-analysis
|
|
17
|
+
estimatedDuration: 300
|
|
18
|
+
complexity: high
|
|
19
|
+
|
|
20
|
+
steps:
|
|
21
|
+
- stepId: experiment-design
|
|
22
|
+
name: Design A/B Experiment
|
|
23
|
+
type: prompt
|
|
24
|
+
timeout: 180000
|
|
25
|
+
config:
|
|
26
|
+
agent: data-scientist
|
|
27
|
+
task: |
|
|
28
|
+
Design a statistically rigorous A/B test for model comparison.
|
|
29
|
+
|
|
30
|
+
## Hypothesis Definition
|
|
31
|
+
|
|
32
|
+
1. **Null Hypothesis (H0)**:
|
|
33
|
+
- State: "There is no difference between model A and model B"
|
|
34
|
+
|
|
35
|
+
2. **Alternative Hypothesis (H1)**:
|
|
36
|
+
- State: "Model B performs better than model A"
|
|
37
|
+
- Test type: one-tailed or two-tailed
|
|
38
|
+
|
|
39
|
+
## Sample Size Calculation
|
|
40
|
+
|
|
41
|
+
Given:
|
|
42
|
+
- Baseline conversion rate / metric: {{baseline_rate}}
|
|
43
|
+
- Minimum detectable effect (MDE): {{mde}}
|
|
44
|
+
- Significance level (alpha): 0.05
|
|
45
|
+
- Statistical power (1-beta): 0.80
|
|
46
|
+
|
|
47
|
+
Calculate:
|
|
48
|
+
- Required sample size per variant
|
|
49
|
+
- Estimated test duration based on traffic
|
|
50
|
+
|
|
51
|
+
## Metric Definition
|
|
52
|
+
|
|
53
|
+
1. **Primary Metric**:
|
|
54
|
+
- Metric name
|
|
55
|
+
- Definition
|
|
56
|
+
- Success direction (higher/lower is better)
|
|
57
|
+
|
|
58
|
+
2. **Secondary Metrics**:
|
|
59
|
+
- List of supporting metrics
|
|
60
|
+
|
|
61
|
+
3. **Guardrail Metrics**:
|
|
62
|
+
- Metrics that must not degrade
|
|
63
|
+
- Degradation threshold
|
|
64
|
+
|
|
65
|
+
## Randomization Strategy
|
|
66
|
+
- Randomization unit (user, session, request)
|
|
67
|
+
- Stratification variables (if any)
|
|
68
|
+
- Holdout percentage
|
|
69
|
+
|
|
70
|
+
- stepId: traffic-allocation
|
|
71
|
+
name: Define Traffic Allocation
|
|
72
|
+
type: prompt
|
|
73
|
+
timeout: 60000
|
|
74
|
+
config:
|
|
75
|
+
agent: ml-engineer
|
|
76
|
+
task: |
|
|
77
|
+
Plan the traffic allocation strategy.
|
|
78
|
+
|
|
79
|
+
## Traffic Split
|
|
80
|
+
|
|
81
|
+
| Variant | Description | Traffic % |
|
|
82
|
+
|---------|-------------|-----------|
|
|
83
|
+
| Control | Current production model | {{control_pct}}% |
|
|
84
|
+
| Treatment | New model candidate | {{treatment_pct}}% |
|
|
85
|
+
| Holdout | No model (baseline) | {{holdout_pct}}% |
|
|
86
|
+
|
|
87
|
+
## Ramping Strategy
|
|
88
|
+
|
|
89
|
+
- **Day 1-2**: 1% treatment (burn-in period)
|
|
90
|
+
- **Day 3-7**: 10% treatment (early signal)
|
|
91
|
+
- **Day 8-14**: 50% treatment (full test)
|
|
92
|
+
|
|
93
|
+
## Safeguards
|
|
94
|
+
|
|
95
|
+
1. **Auto-Rollback Triggers**:
|
|
96
|
+
- Error rate > {{error_threshold}}%
|
|
97
|
+
- Latency P99 > {{latency_threshold}}ms
|
|
98
|
+
- Guardrail metric degradation > {{guardrail_threshold}}%
|
|
99
|
+
|
|
100
|
+
2. **Manual Review Triggers**:
|
|
101
|
+
- Unusual traffic patterns
|
|
102
|
+
- Unexpected metric movements
|
|
103
|
+
|
|
104
|
+
- stepId: store-experiment-config
|
|
105
|
+
name: Store Experiment Configuration
|
|
106
|
+
type: tool
|
|
107
|
+
timeout: 10000
|
|
108
|
+
tool: memory_store
|
|
109
|
+
config:
|
|
110
|
+
namespace: ml-ab-experiments
|
|
111
|
+
key: "{{experiment_id}}/config"
|
|
112
|
+
value:
|
|
113
|
+
experiment_id: "{{experiment_id}}"
|
|
114
|
+
experiment_name: "{{experiment_name}}"
|
|
115
|
+
status: "configured"
|
|
116
|
+
hypothesis:
|
|
117
|
+
null_hypothesis: "{{h0}}"
|
|
118
|
+
alternative_hypothesis: "{{h1}}"
|
|
119
|
+
variants:
|
|
120
|
+
control:
|
|
121
|
+
model_name: "{{control_model}}"
|
|
122
|
+
model_version: "{{control_version}}"
|
|
123
|
+
traffic_percent: "{{control_pct}}"
|
|
124
|
+
treatment:
|
|
125
|
+
model_name: "{{treatment_model}}"
|
|
126
|
+
model_version: "{{treatment_version}}"
|
|
127
|
+
traffic_percent: "{{treatment_pct}}"
|
|
128
|
+
metrics:
|
|
129
|
+
primary: "{{primary_metric}}"
|
|
130
|
+
secondary: "{{secondary_metrics}}"
|
|
131
|
+
guardrails: "{{guardrail_metrics}}"
|
|
132
|
+
statistical_params:
|
|
133
|
+
significance_level: 0.05
|
|
134
|
+
power: 0.80
|
|
135
|
+
mde: "{{mde}}"
|
|
136
|
+
required_sample_size: "{{sample_size}}"
|
|
137
|
+
schedule:
|
|
138
|
+
start_date: "{{start_date}}"
|
|
139
|
+
estimated_end_date: "{{end_date}}"
|
|
140
|
+
ramping_schedule: "{{ramp_schedule}}"
|
|
141
|
+
created_at: "{{timestamp}}"
|
|
142
|
+
created_by: "{{user}}"
|
|
143
|
+
|
|
144
|
+
- stepId: monitor-experiment
|
|
145
|
+
name: Monitor Running Experiment
|
|
146
|
+
type: prompt
|
|
147
|
+
timeout: 120000
|
|
148
|
+
config:
|
|
149
|
+
agent: data-scientist
|
|
150
|
+
task: |
|
|
151
|
+
Monitor the running A/B experiment for issues.
|
|
152
|
+
|
|
153
|
+
## Health Checks
|
|
154
|
+
|
|
155
|
+
1. **Sample Ratio Mismatch (SRM)**:
|
|
156
|
+
- Expected ratio: {{expected_ratio}}
|
|
157
|
+
- Observed ratio: {{observed_ratio}}
|
|
158
|
+
- Chi-squared test for SRM
|
|
159
|
+
- ALERT if p-value < 0.001
|
|
160
|
+
|
|
161
|
+
2. **Traffic Distribution**:
|
|
162
|
+
- Verify randomization is working
|
|
163
|
+
- Check for selection bias
|
|
164
|
+
|
|
165
|
+
3. **Guardrail Metrics**:
|
|
166
|
+
- Current values vs baseline
|
|
167
|
+
- Flag any degradation
|
|
168
|
+
|
|
169
|
+
4. **Data Quality**:
|
|
170
|
+
- Missing data rate
|
|
171
|
+
- Logging issues
|
|
172
|
+
|
|
173
|
+
## Early Stopping Check
|
|
174
|
+
|
|
175
|
+
- **Sequential testing**: Can we stop early?
|
|
176
|
+
- Current confidence level
|
|
177
|
+
- Projected final sample size
|
|
178
|
+
|
|
179
|
+
## Issues Found
|
|
180
|
+
- List any anomalies
|
|
181
|
+
- Recommended actions
|
|
182
|
+
|
|
183
|
+
- stepId: analyze-results
|
|
184
|
+
name: Analyze Experiment Results
|
|
185
|
+
type: prompt
|
|
186
|
+
timeout: 180000
|
|
187
|
+
config:
|
|
188
|
+
agent: data-scientist
|
|
189
|
+
task: |
|
|
190
|
+
Perform statistical analysis of A/B test results.
|
|
191
|
+
|
|
192
|
+
## Primary Metric Analysis
|
|
193
|
+
|
|
194
|
+
1. **Descriptive Statistics**:
|
|
195
|
+
| Variant | N | Mean | Std | Median |
|
|
196
|
+
|---------|---|------|-----|--------|
|
|
197
|
+
| Control | | | | |
|
|
198
|
+
| Treatment | | | | |
|
|
199
|
+
|
|
200
|
+
2. **Effect Size**:
|
|
201
|
+
- Absolute difference: {{treatment_mean}} - {{control_mean}}
|
|
202
|
+
- Relative lift: (T - C) / C * 100%
|
|
203
|
+
- 95% Confidence Interval: [{{ci_lower}}, {{ci_upper}}]
|
|
204
|
+
|
|
205
|
+
3. **Statistical Significance**:
|
|
206
|
+
- Test used: t-test / Mann-Whitney / Chi-squared
|
|
207
|
+
- Test statistic: {{test_stat}}
|
|
208
|
+
- P-value: {{p_value}}
|
|
209
|
+
- Significant at alpha=0.05: Yes/No
|
|
210
|
+
|
|
211
|
+
4. **Practical Significance**:
|
|
212
|
+
- Is the effect size meaningful?
|
|
213
|
+
- Does it meet MDE threshold?
|
|
214
|
+
|
|
215
|
+
## Secondary Metrics
|
|
216
|
+
|
|
217
|
+
| Metric | Control | Treatment | Lift | P-value | Significant |
|
|
218
|
+
|--------|---------|-----------|------|---------|-------------|
|
|
219
|
+
|
|
220
|
+
## Guardrail Metrics
|
|
221
|
+
|
|
222
|
+
| Metric | Control | Treatment | Change | Status |
|
|
223
|
+
|--------|---------|-----------|--------|--------|
|
|
224
|
+
|
|
225
|
+
## Segment Analysis
|
|
226
|
+
|
|
227
|
+
Analyze effect by key segments:
|
|
228
|
+
- New vs returning users
|
|
229
|
+
- Mobile vs desktop
|
|
230
|
+
- Geographic regions
|
|
231
|
+
- User cohorts
|
|
232
|
+
|
|
233
|
+
## Novelty/Primacy Effects
|
|
234
|
+
|
|
235
|
+
- Plot metric over time
|
|
236
|
+
- Check for effect decay
|
|
237
|
+
- Estimate steady-state effect
|
|
238
|
+
|
|
239
|
+
- stepId: make-decision
|
|
240
|
+
name: Make Experiment Decision
|
|
241
|
+
type: prompt
|
|
242
|
+
timeout: 120000
|
|
243
|
+
config:
|
|
244
|
+
agent: ml-engineer
|
|
245
|
+
task: |
|
|
246
|
+
Based on analysis, make a decision about the experiment.
|
|
247
|
+
|
|
248
|
+
## Decision Framework
|
|
249
|
+
|
|
250
|
+
| Significance | Practical Impact | Guardrails | Decision |
|
|
251
|
+
|--------------|------------------|------------|----------|
|
|
252
|
+
| Yes | Positive | Pass | SHIP |
|
|
253
|
+
| Yes | Negative | - | ROLLBACK |
|
|
254
|
+
| No | - | Pass | EXTEND or NO_SHIP |
|
|
255
|
+
| - | - | Fail | ROLLBACK |
|
|
256
|
+
|
|
257
|
+
## Decision
|
|
258
|
+
|
|
259
|
+
**Recommendation**: {{decision}}
|
|
260
|
+
|
|
261
|
+
**Confidence**: HIGH / MEDIUM / LOW
|
|
262
|
+
|
|
263
|
+
**Reasoning**:
|
|
264
|
+
- Statistical evidence: {{stat_reasoning}}
|
|
265
|
+
- Business impact: {{business_reasoning}}
|
|
266
|
+
- Risk assessment: {{risk_reasoning}}
|
|
267
|
+
|
|
268
|
+
## If SHIP
|
|
269
|
+
- Rollout plan (gradual vs immediate)
|
|
270
|
+
- Monitoring requirements
|
|
271
|
+
- Success criteria for full rollout
|
|
272
|
+
|
|
273
|
+
## If ROLLBACK
|
|
274
|
+
- Root cause hypothesis
|
|
275
|
+
- Recommended investigation
|
|
276
|
+
- Next experiment suggestions
|
|
277
|
+
|
|
278
|
+
## If EXTEND
|
|
279
|
+
- Additional sample size needed
|
|
280
|
+
- New end date
|
|
281
|
+
- Any design changes
|
|
282
|
+
|
|
283
|
+
- stepId: store-results
|
|
284
|
+
name: Store Experiment Results
|
|
285
|
+
type: tool
|
|
286
|
+
timeout: 10000
|
|
287
|
+
tool: memory_store
|
|
288
|
+
config:
|
|
289
|
+
namespace: ml-ab-experiments
|
|
290
|
+
key: "{{experiment_id}}/results"
|
|
291
|
+
ttl: 31536000
|
|
292
|
+
value:
|
|
293
|
+
experiment_id: "{{experiment_id}}"
|
|
294
|
+
status: "completed"
|
|
295
|
+
results:
|
|
296
|
+
primary_metric:
|
|
297
|
+
control_mean: "{{control_mean}}"
|
|
298
|
+
treatment_mean: "{{treatment_mean}}"
|
|
299
|
+
relative_lift: "{{lift}}"
|
|
300
|
+
confidence_interval: "{{ci}}"
|
|
301
|
+
p_value: "{{p_value}}"
|
|
302
|
+
significant: "{{significant}}"
|
|
303
|
+
secondary_metrics: "{{secondary_results}}"
|
|
304
|
+
guardrail_metrics: "{{guardrail_results}}"
|
|
305
|
+
segment_analysis: "{{segment_results}}"
|
|
306
|
+
decision:
|
|
307
|
+
recommendation: "{{decision}}"
|
|
308
|
+
confidence: "{{confidence}}"
|
|
309
|
+
reasoning: "{{reasoning}}"
|
|
310
|
+
completed_at: "{{timestamp}}"
|
|
311
|
+
decided_by: "{{user}}"
|
|
@@ -0,0 +1,150 @@
|
|
|
1
|
+
workflowId: ml-experiment-tracker
|
|
2
|
+
name: ML Experiment Tracker
|
|
3
|
+
description: Track, compare, and manage ML experiments for reproducibility
|
|
4
|
+
version: "1.0.0"
|
|
5
|
+
category: machine-learning
|
|
6
|
+
tags:
|
|
7
|
+
- ml
|
|
8
|
+
- experiments
|
|
9
|
+
- tracking
|
|
10
|
+
- reproducibility
|
|
11
|
+
|
|
12
|
+
metadata:
|
|
13
|
+
requiredAbilities:
|
|
14
|
+
- machine-learning
|
|
15
|
+
- statistical-analysis
|
|
16
|
+
- data-analysis
|
|
17
|
+
estimatedDuration: 300
|
|
18
|
+
complexity: medium
|
|
19
|
+
|
|
20
|
+
steps:
|
|
21
|
+
- stepId: define-experiment
|
|
22
|
+
name: Define Experiment
|
|
23
|
+
type: prompt
|
|
24
|
+
timeout: 60000
|
|
25
|
+
config:
|
|
26
|
+
agent: ml-engineer
|
|
27
|
+
task: |
|
|
28
|
+
Define the ML experiment with the following structure:
|
|
29
|
+
|
|
30
|
+
## Experiment Definition
|
|
31
|
+
|
|
32
|
+
1. **Hypothesis**: What are we testing?
|
|
33
|
+
2. **Baseline Model**: What is the current best model?
|
|
34
|
+
3. **Metrics to Track**:
|
|
35
|
+
- Primary metric (optimization target)
|
|
36
|
+
- Secondary metrics (guardrails)
|
|
37
|
+
4. **Success Criteria**: What improvement is significant?
|
|
38
|
+
5. **Dataset Version**: Which data split are we using?
|
|
39
|
+
|
|
40
|
+
Provide structured output for logging.
|
|
41
|
+
|
|
42
|
+
- stepId: log-parameters
|
|
43
|
+
name: Log Experiment Parameters
|
|
44
|
+
type: tool
|
|
45
|
+
timeout: 10000
|
|
46
|
+
tool: memory_store
|
|
47
|
+
config:
|
|
48
|
+
namespace: ml-experiments
|
|
49
|
+
key: "{{experiment_id}}/parameters"
|
|
50
|
+
value:
|
|
51
|
+
experiment_id: "{{experiment_id}}"
|
|
52
|
+
experiment_name: "{{experiment_name}}"
|
|
53
|
+
hypothesis: "{{hypothesis}}"
|
|
54
|
+
model_type: "{{model_type}}"
|
|
55
|
+
hyperparameters: "{{hyperparameters}}"
|
|
56
|
+
dataset_version: "{{dataset_version}}"
|
|
57
|
+
baseline_model: "{{baseline_model}}"
|
|
58
|
+
success_criteria: "{{success_criteria}}"
|
|
59
|
+
created_at: "{{timestamp}}"
|
|
60
|
+
created_by: "{{user}}"
|
|
61
|
+
status: "running"
|
|
62
|
+
|
|
63
|
+
- stepId: log-metrics
|
|
64
|
+
name: Log Training Metrics
|
|
65
|
+
type: tool
|
|
66
|
+
timeout: 10000
|
|
67
|
+
tool: memory_store
|
|
68
|
+
config:
|
|
69
|
+
namespace: ml-experiments
|
|
70
|
+
key: "{{experiment_id}}/metrics"
|
|
71
|
+
value:
|
|
72
|
+
experiment_id: "{{experiment_id}}"
|
|
73
|
+
metrics:
|
|
74
|
+
accuracy: "{{accuracy}}"
|
|
75
|
+
precision: "{{precision}}"
|
|
76
|
+
recall: "{{recall}}"
|
|
77
|
+
f1_score: "{{f1_score}}"
|
|
78
|
+
auc_roc: "{{auc_roc}}"
|
|
79
|
+
loss: "{{loss}}"
|
|
80
|
+
training_metrics:
|
|
81
|
+
training_time_seconds: "{{training_time}}"
|
|
82
|
+
epochs_completed: "{{epochs}}"
|
|
83
|
+
early_stopping_epoch: "{{early_stop_epoch}}"
|
|
84
|
+
resource_usage:
|
|
85
|
+
peak_memory_mb: "{{peak_memory}}"
|
|
86
|
+
gpu_utilization: "{{gpu_util}}"
|
|
87
|
+
logged_at: "{{timestamp}}"
|
|
88
|
+
|
|
89
|
+
- stepId: compare-experiments
|
|
90
|
+
name: Compare with Baseline
|
|
91
|
+
type: prompt
|
|
92
|
+
timeout: 120000
|
|
93
|
+
config:
|
|
94
|
+
agent: data-scientist
|
|
95
|
+
task: |
|
|
96
|
+
Compare experiment {{experiment_id}} against the baseline model.
|
|
97
|
+
|
|
98
|
+
## Analysis Required
|
|
99
|
+
|
|
100
|
+
1. **Metric Comparison**:
|
|
101
|
+
- Calculate absolute and relative improvement
|
|
102
|
+
- For each metric: baseline vs experiment
|
|
103
|
+
|
|
104
|
+
2. **Statistical Significance**:
|
|
105
|
+
- Is the improvement statistically significant?
|
|
106
|
+
- Calculate p-value if applicable
|
|
107
|
+
- Report confidence intervals
|
|
108
|
+
|
|
109
|
+
3. **Trade-off Analysis**:
|
|
110
|
+
- Accuracy vs inference latency
|
|
111
|
+
- Model complexity vs performance
|
|
112
|
+
- Training cost vs improvement
|
|
113
|
+
|
|
114
|
+
4. **Recommendation**:
|
|
115
|
+
- PROMOTE: Significant improvement, ready for production
|
|
116
|
+
- ITERATE: Promising but needs refinement
|
|
117
|
+
- REJECT: No improvement or regression
|
|
118
|
+
|
|
119
|
+
Provide structured recommendation with justification.
|
|
120
|
+
|
|
121
|
+
- stepId: update-status
|
|
122
|
+
name: Update Experiment Status
|
|
123
|
+
type: tool
|
|
124
|
+
timeout: 10000
|
|
125
|
+
tool: memory_store
|
|
126
|
+
config:
|
|
127
|
+
namespace: ml-experiments
|
|
128
|
+
key: "{{experiment_id}}/status"
|
|
129
|
+
value:
|
|
130
|
+
experiment_id: "{{experiment_id}}"
|
|
131
|
+
status: "{{recommendation}}"
|
|
132
|
+
comparison_summary: "{{comparison_summary}}"
|
|
133
|
+
statistical_significance: "{{p_value}}"
|
|
134
|
+
recommendation_rationale: "{{rationale}}"
|
|
135
|
+
completed_at: "{{timestamp}}"
|
|
136
|
+
|
|
137
|
+
- stepId: store-comparison-report
|
|
138
|
+
name: Store Comparison Report
|
|
139
|
+
type: tool
|
|
140
|
+
timeout: 10000
|
|
141
|
+
tool: memory_store
|
|
142
|
+
config:
|
|
143
|
+
namespace: ml-experiment-reports
|
|
144
|
+
key: "{{experiment_id}}/comparison"
|
|
145
|
+
ttl: 7776000
|
|
146
|
+
value:
|
|
147
|
+
experiment_id: "{{experiment_id}}"
|
|
148
|
+
baseline_id: "{{baseline_model}}"
|
|
149
|
+
full_report: "{{comparison_report}}"
|
|
150
|
+
created_at: "{{timestamp}}"
|
|
@@ -0,0 +1,242 @@
|
|
|
1
|
+
workflowId: ml-feature-engineering
|
|
2
|
+
name: Feature Engineering Pipeline
|
|
3
|
+
description: Systematic feature creation, validation, and selection
|
|
4
|
+
version: "1.0.0"
|
|
5
|
+
category: machine-learning
|
|
6
|
+
tags:
|
|
7
|
+
- ml
|
|
8
|
+
- features
|
|
9
|
+
- feature-engineering
|
|
10
|
+
- feature-selection
|
|
11
|
+
|
|
12
|
+
metadata:
|
|
13
|
+
requiredAbilities:
|
|
14
|
+
- machine-learning
|
|
15
|
+
- data-analysis
|
|
16
|
+
- feature-engineering
|
|
17
|
+
estimatedDuration: 600
|
|
18
|
+
complexity: high
|
|
19
|
+
|
|
20
|
+
steps:
|
|
21
|
+
- stepId: feature-ideation
|
|
22
|
+
name: Feature Ideation
|
|
23
|
+
type: prompt
|
|
24
|
+
timeout: 180000
|
|
25
|
+
config:
|
|
26
|
+
agent: data-scientist
|
|
27
|
+
task: |
|
|
28
|
+
Given the prediction target and available data, brainstorm candidate features.
|
|
29
|
+
|
|
30
|
+
## Feature Categories to Consider
|
|
31
|
+
|
|
32
|
+
1. **Raw Features**:
|
|
33
|
+
- Direct columns from source data
|
|
34
|
+
- Basic transformations (log, sqrt, power)
|
|
35
|
+
|
|
36
|
+
2. **Domain Knowledge Features**:
|
|
37
|
+
- Business logic features
|
|
38
|
+
- Industry-specific indicators
|
|
39
|
+
- Expert-derived calculations
|
|
40
|
+
|
|
41
|
+
3. **Interaction Features**:
|
|
42
|
+
- Feature products (A * B)
|
|
43
|
+
- Feature ratios (A / B)
|
|
44
|
+
- Feature differences (A - B)
|
|
45
|
+
|
|
46
|
+
4. **Time-Based Features** (if temporal data):
|
|
47
|
+
- Lag features
|
|
48
|
+
- Rolling statistics (mean, std, min, max)
|
|
49
|
+
- Time since event
|
|
50
|
+
- Seasonal indicators
|
|
51
|
+
- Trend features
|
|
52
|
+
|
|
53
|
+
5. **Aggregation Features**:
|
|
54
|
+
- Group-by statistics
|
|
55
|
+
- Entity-level aggregations
|
|
56
|
+
- Window aggregations
|
|
57
|
+
|
|
58
|
+
6. **Text Features** (if text data):
|
|
59
|
+
- TF-IDF
|
|
60
|
+
- Word embeddings
|
|
61
|
+
- Sentiment scores
|
|
62
|
+
- Named entities
|
|
63
|
+
|
|
64
|
+
7. **Categorical Encodings**:
|
|
65
|
+
- One-hot encoding
|
|
66
|
+
- Target encoding
|
|
67
|
+
- Frequency encoding
|
|
68
|
+
- Embedding encoding
|
|
69
|
+
|
|
70
|
+
## Output Format
|
|
71
|
+
For each candidate feature:
|
|
72
|
+
- Feature name
|
|
73
|
+
- Description
|
|
74
|
+
- Calculation logic
|
|
75
|
+
- Expected predictive value (hypothesis)
|
|
76
|
+
- Implementation complexity (low/medium/high)
|
|
77
|
+
|
|
78
|
+
- stepId: feature-implementation
|
|
79
|
+
name: Implement Features
|
|
80
|
+
type: prompt
|
|
81
|
+
timeout: 300000
|
|
82
|
+
config:
|
|
83
|
+
agent: data-scientist
|
|
84
|
+
task: |
|
|
85
|
+
Implement the candidate features with production-quality code.
|
|
86
|
+
|
|
87
|
+
## For Each Feature
|
|
88
|
+
|
|
89
|
+
1. **Transformation Code**:
|
|
90
|
+
```python
|
|
91
|
+
def compute_feature_name(df: pd.DataFrame) -> pd.Series:
|
|
92
|
+
'''
|
|
93
|
+
Description of what this feature represents.
|
|
94
|
+
|
|
95
|
+
Args:
|
|
96
|
+
df: Input dataframe with required columns
|
|
97
|
+
|
|
98
|
+
Returns:
|
|
99
|
+
pd.Series: Computed feature values
|
|
100
|
+
'''
|
|
101
|
+
# Implementation
|
|
102
|
+
pass
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
2. **Missing Value Handling**:
|
|
106
|
+
- Strategy: drop / fill_mean / fill_median / fill_mode / fill_constant
|
|
107
|
+
- Justification for chosen strategy
|
|
108
|
+
|
|
109
|
+
3. **Edge Case Handling**:
|
|
110
|
+
- Division by zero
|
|
111
|
+
- Zero or negative values for log transforms
|
|
112
|
+
- Null propagation
|
|
113
|
+
|
|
114
|
+
4. **Data Type**:
|
|
115
|
+
- Output dtype
|
|
116
|
+
- Value range
|
|
117
|
+
|
|
118
|
+
## Feature Pipeline
|
|
119
|
+
Create a combined feature pipeline function that:
|
|
120
|
+
- Takes raw data
|
|
121
|
+
- Applies all transformations
|
|
122
|
+
- Returns feature matrix
|
|
123
|
+
|
|
124
|
+
- stepId: feature-validation
|
|
125
|
+
name: Validate Features
|
|
126
|
+
type: prompt
|
|
127
|
+
timeout: 180000
|
|
128
|
+
config:
|
|
129
|
+
agent: data-scientist
|
|
130
|
+
task: |
|
|
131
|
+
Validate feature quality and check for common issues.
|
|
132
|
+
|
|
133
|
+
## Data Leakage Check
|
|
134
|
+
|
|
135
|
+
1. **Target Leakage**:
|
|
136
|
+
- Does any feature contain information from the target?
|
|
137
|
+
- Is any feature computed using future data?
|
|
138
|
+
- Check correlation with target on train vs test
|
|
139
|
+
|
|
140
|
+
2. **Train-Test Leakage**:
|
|
141
|
+
- Are features computed using test data statistics?
|
|
142
|
+
- Is any global normalization applied before split?
|
|
143
|
+
|
|
144
|
+
## Feature Quality Checks
|
|
145
|
+
|
|
146
|
+
1. **Missing Values**:
|
|
147
|
+
- Missing rate per feature
|
|
148
|
+
- Flag features with >50% missing
|
|
149
|
+
|
|
150
|
+
2. **Cardinality**:
|
|
151
|
+
- Unique value count
|
|
152
|
+
- Flag high-cardinality categoricals
|
|
153
|
+
|
|
154
|
+
3. **Distribution**:
|
|
155
|
+
- Check for extreme skewness
|
|
156
|
+
- Identify outliers
|
|
157
|
+
- Check for constant features
|
|
158
|
+
|
|
159
|
+
4. **Correlation Analysis**:
|
|
160
|
+
- Correlation with target
|
|
161
|
+
- Inter-feature correlation matrix
|
|
162
|
+
- Flag highly correlated pairs (absolute correlation >0.95)
|
|
163
|
+
|
|
164
|
+
## Validation Report
|
|
165
|
+
For each feature:
|
|
166
|
+
- Leakage risk: SAFE / WARNING / DANGEROUS
|
|
167
|
+
- Quality score: 1-10
|
|
168
|
+
- Issues found
|
|
169
|
+
- Recommendations
|
|
170
|
+
|
|
171
|
+
- stepId: feature-selection
|
|
172
|
+
name: Select Final Features
|
|
173
|
+
type: prompt
|
|
174
|
+
timeout: 180000
|
|
175
|
+
config:
|
|
176
|
+
agent: ml-engineer
|
|
177
|
+
task: |
|
|
178
|
+
Select the final feature set for modeling.
|
|
179
|
+
|
|
180
|
+
## Selection Methods
|
|
181
|
+
|
|
182
|
+
1. **Importance-Based Selection**:
|
|
183
|
+
- SHAP values from baseline model
|
|
184
|
+
- Permutation importance
|
|
185
|
+
- Random forest feature importance
|
|
186
|
+
|
|
187
|
+
2. **Statistical Selection**:
|
|
188
|
+
- Mutual information with target
|
|
189
|
+
- Chi-squared test (categorical)
|
|
190
|
+
- ANOVA F-value (continuous)
|
|
191
|
+
|
|
192
|
+
3. **Iterative Selection**:
|
|
193
|
+
- Forward selection results
|
|
194
|
+
- Backward elimination results
|
|
195
|
+
- Recursive feature elimination
|
|
196
|
+
|
|
197
|
+
## Selection Criteria
|
|
198
|
+
|
|
199
|
+
- Remove features with:
|
|
200
|
+
- Near-zero variance
|
|
201
|
+
- High missing rate (>50%)
|
|
202
|
+
- High correlation with other features (absolute correlation >0.95)
|
|
203
|
+
- Low importance (bottom 10%)
|
|
204
|
+
- Data leakage risk
|
|
205
|
+
|
|
206
|
+
- Keep features with:
|
|
207
|
+
- High target correlation
|
|
208
|
+
- Domain importance
|
|
209
|
+
- Unique information
|
|
210
|
+
|
|
211
|
+
## Final Feature Set
|
|
212
|
+
|
|
213
|
+
| Feature | Importance Rank | Rationale |
|
|
214
|
+
|---------|-----------------|-----------|
|
|
215
|
+
| feature_1 | 1 | ... |
|
|
216
|
+
| feature_2 | 2 | ... |
|
|
217
|
+
|
|
218
|
+
Total features selected: X out of Y candidates
|
|
219
|
+
|
|
220
|
+
- stepId: store-feature-definitions
|
|
221
|
+
name: Store Feature Definitions
|
|
222
|
+
type: tool
|
|
223
|
+
timeout: 10000
|
|
224
|
+
tool: memory_store
|
|
225
|
+
config:
|
|
226
|
+
namespace: ml-feature-store
|
|
227
|
+
key: "{{feature_set_name}}/{{version}}"
|
|
228
|
+
value:
|
|
229
|
+
feature_set_name: "{{feature_set_name}}"
|
|
230
|
+
version: "{{version}}"
|
|
231
|
+
description: "{{description}}"
|
|
232
|
+
target_variable: "{{target}}"
|
|
233
|
+
features: "{{feature_definitions}}"
|
|
234
|
+
validation_results:
|
|
235
|
+
leakage_check: "{{leakage_status}}"
|
|
236
|
+
quality_scores: "{{quality_scores}}"
|
|
237
|
+
selection_summary:
|
|
238
|
+
total_candidates: "{{total_candidates}}"
|
|
239
|
+
selected_count: "{{selected_count}}"
|
|
240
|
+
selection_method: "{{selection_method}}"
|
|
241
|
+
created_at: "{{timestamp}}"
|
|
242
|
+
created_by: "{{user}}"
|