orchestr8 2.5.0 → 2.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54) hide show
  1. package/.blueprint/agents/AGENT_BA_CASS.md +42 -19
  2. package/.blueprint/agents/AGENT_DEVELOPER_CODEY.md +42 -38
  3. package/.blueprint/agents/AGENT_SPECIFICATION_ALEX.md +45 -0
  4. package/.blueprint/agents/AGENT_TESTER_NIGEL.md +42 -21
  5. package/.blueprint/features/feature_adaptive-retry/FEATURE_SPEC.md +239 -0
  6. package/.blueprint/features/feature_adaptive-retry/IMPLEMENTATION_PLAN.md +48 -0
  7. package/.blueprint/features/feature_adaptive-retry/story-prompt-modification.md +85 -0
  8. package/.blueprint/features/feature_adaptive-retry/story-retry-config.md +89 -0
  9. package/.blueprint/features/feature_adaptive-retry/story-should-retry.md +98 -0
  10. package/.blueprint/features/feature_adaptive-retry/story-strategy-recommendation.md +85 -0
  11. package/.blueprint/features/feature_agent-guardrails/FEATURE_SPEC.md +328 -0
  12. package/.blueprint/features/feature_agent-guardrails/IMPLEMENTATION_PLAN.md +90 -0
  13. package/.blueprint/features/feature_agent-guardrails/story-citation-requirements.md +50 -0
  14. package/.blueprint/features/feature_agent-guardrails/story-confidentiality.md +50 -0
  15. package/.blueprint/features/feature_agent-guardrails/story-escalation-protocol.md +55 -0
  16. package/.blueprint/features/feature_agent-guardrails/story-source-restrictions.md +50 -0
  17. package/.blueprint/features/feature_feedback-loop/FEATURE_SPEC.md +347 -0
  18. package/.blueprint/features/feature_feedback-loop/IMPLEMENTATION_PLAN.md +71 -0
  19. package/.blueprint/features/feature_feedback-loop/story-feedback-collection.md +63 -0
  20. package/.blueprint/features/feature_feedback-loop/story-feedback-config.md +61 -0
  21. package/.blueprint/features/feature_feedback-loop/story-feedback-insights.md +63 -0
  22. package/.blueprint/features/feature_feedback-loop/story-quality-gates.md +57 -0
  23. package/.blueprint/features/feature_pipeline-history/FEATURE_SPEC.md +239 -0
  24. package/.blueprint/features/feature_pipeline-history/IMPLEMENTATION_PLAN.md +71 -0
  25. package/.blueprint/features/feature_pipeline-history/story-clear-history.md +73 -0
  26. package/.blueprint/features/feature_pipeline-history/story-display-history.md +75 -0
  27. package/.blueprint/features/feature_pipeline-history/story-record-execution.md +76 -0
  28. package/.blueprint/features/feature_pipeline-history/story-show-statistics.md +85 -0
  29. package/.blueprint/features/feature_pipeline-insights/FEATURE_SPEC.md +288 -0
  30. package/.blueprint/features/feature_pipeline-insights/IMPLEMENTATION_PLAN.md +65 -0
  31. package/.blueprint/features/feature_pipeline-insights/story-anomaly-detection.md +71 -0
  32. package/.blueprint/features/feature_pipeline-insights/story-bottleneck-analysis.md +75 -0
  33. package/.blueprint/features/feature_pipeline-insights/story-failure-patterns.md +75 -0
  34. package/.blueprint/features/feature_pipeline-insights/story-json-output.md +75 -0
  35. package/.blueprint/features/feature_pipeline-insights/story-trend-analysis.md +78 -0
  36. package/.blueprint/features/feature_validate-command/FEATURE_SPEC.md +209 -0
  37. package/.blueprint/features/feature_validate-command/IMPLEMENTATION_PLAN.md +59 -0
  38. package/.blueprint/features/feature_validate-command/story-failure-output.md +61 -0
  39. package/.blueprint/features/feature_validate-command/story-node-version-check.md +52 -0
  40. package/.blueprint/features/feature_validate-command/story-run-validation.md +59 -0
  41. package/.blueprint/features/feature_validate-command/story-success-output.md +50 -0
  42. package/.blueprint/system_specification/SYSTEM_SPEC.md +248 -0
  43. package/README.md +182 -38
  44. package/SKILL.md +333 -23
  45. package/bin/cli.js +128 -20
  46. package/package.json +2 -2
  47. package/src/feedback.js +171 -0
  48. package/src/history.js +306 -0
  49. package/src/index.js +57 -2
  50. package/src/init.js +2 -6
  51. package/src/insights.js +504 -0
  52. package/src/retry.js +274 -0
  53. package/src/validate.js +172 -0
  54. package/src/skills.js +0 -93
@@ -0,0 +1,504 @@
1
+ const { readHistoryFile, formatDuration } = require('./history');
2
+
3
/**
 * Stage names inspected by the analyses in this module.
 *
 * The array order matters: the "slowest stage" and "most common failure stage"
 * searches walk it front to back with a strict `>` comparison, so the earliest
 * listed stage wins ties. Frozen so no analysis can alter the shared list.
 */
const STAGES = Object.freeze(['alex', 'cass', 'nigel', 'codey-plan', 'codey-implement']);
4
+
5
/**
 * Arithmetic mean of a list of numbers.
 *
 * @param {number[]} values - Samples to average.
 * @returns {number} The mean, or 0 when `values` is empty.
 */
function calculateMean(values) {
  const count = values.length;
  if (count === 0) {
    return 0;
  }

  let total = 0;
  values.forEach((value) => {
    total += value;
  });
  return total / count;
}
9
+
10
/**
 * Population standard deviation (divides by N, not N - 1).
 *
 * @param {number[]} values - Samples.
 * @param {number} [mean] - Precomputed mean of `values`. When omitted it is
 *   derived from `values`, so the function no longer returns NaN for
 *   single-argument calls.
 * @returns {number} The standard deviation, or 0 when `values` is empty.
 */
function calculateStdDev(values, mean) {
  if (values.length === 0) return 0;

  const center = mean === undefined
    ? values.reduce((sum, v) => sum + v, 0) / values.length
    : mean;
  const variance = values.reduce((sum, v) => sum + Math.pow(v - center, 2), 0) / values.length;
  return Math.sqrt(variance);
}
15
+
16
/**
 * Finds the stage with the highest average duration across successful runs.
 *
 * Only entries with `status === 'success'` and a `stages` object are used, and
 * a stage contributes a sample only when its `durationMs` is truthy. The
 * reported percentage is the slowest stage's average divided by the sum of all
 * per-stage averages.
 *
 * @param {Array<object>} history - History entries.
 * @returns {object} `{ insufficientData: true, message }` when fewer than three
 *   usable runs exist or none of them recorded a stage duration; otherwise
 *   `{ stages, bottleneckStage, avgDurationMs, percentage, isBottleneck, recommendation }`.
 */
function analyzeBottlenecks(history) {
  const successRuns = history.filter(e => e.status === 'success' && e.stages);
  if (successRuns.length < 3) {
    return { insufficientData: true, message: 'Insufficient data for bottleneck analysis (need 3+ runs)' };
  }

  const stageAvgs = {};
  let totalAvgDuration = 0;
  let maxStage = null;
  let maxAvg = 0;

  for (const stage of STAGES) {
    const durations = successRuns
      .map(entry => entry.stages[stage] && entry.stages[stage].durationMs)
      .filter(Boolean);
    const avg = calculateMean(durations);

    stageAvgs[stage] = avg;
    totalAvgDuration += avg;

    // Strict comparison: the first stage in STAGES order wins ties.
    if (avg > maxAvg) {
      maxAvg = avg;
      maxStage = stage;
    }
  }

  // No stage recorded a duration, so there is nothing to rank. Report that
  // instead of a "null" bottleneck stage at 0%.
  if (maxStage === null) {
    return {
      insufficientData: true,
      message: 'Insufficient data for bottleneck analysis (no stage durations recorded)'
    };
  }

  const percentage = totalAvgDuration > 0 ? (maxAvg / totalAvgDuration) * 100 : 0;
  // A stage is flagged above 35% of the pipeline; advice is only given above 40%.
  const isBottleneck = percentage > 35;
  const recommendation = percentage > 40
    ? `Consider optimizing ${maxStage} stage to improve pipeline throughput`
    : null;

  return {
    stages: stageAvgs,
    bottleneckStage: maxStage,
    avgDurationMs: maxAvg,
    percentage: Math.round(percentage * 10) / 10,
    isBottleneck,
    recommendation
  };
}
67
+
68
/**
 * Summarises failed runs: which stage fails most, which features fail
 * repeatedly, and the overall failure rate.
 *
 * @param {Array<object>} history - History entries; entries with
 *   `status === 'failed'` are analysed, and the failure rate is measured
 *   against all entries.
 * @returns {object} `{ noFailures: true, message }` when nothing failed;
 *   otherwise `{ failuresByStage, mostCommonStage, failureCount,
 *   repeatedFeatures, failureRate, isHighFailureRate, recommendation }`.
 *   `mostCommonStage` is null only when no failed entry names a stage.
 */
function analyzeFailures(history) {
  const failedRuns = history.filter(e => e.status === 'failed');
  if (failedRuns.length === 0) {
    return { noFailures: true, message: 'No failures recorded' };
  }

  const failuresByStage = {};
  const featureFailures = {};

  for (const entry of failedRuns) {
    if (entry.failedStage) {
      failuresByStage[entry.failedStage] = (failuresByStage[entry.failedStage] || 0) + 1;
    }
    if (entry.slug) {
      featureFailures[entry.slug] = (featureFailures[entry.slug] || 0) + 1;
    }
  }

  // Known stages are checked first, in STAGES order, so they keep winning ties.
  // Stage names that are recorded in history but missing from STAGES are
  // checked afterwards, so they are no longer silently ignored.
  const extraStages = Object.keys(failuresByStage).filter(stage => !STAGES.includes(stage));
  let mostCommonStage = null;
  let maxCount = 0;
  for (const stage of [...STAGES, ...extraStages]) {
    const count = failuresByStage[stage] || 0;
    if (count > maxCount) {
      maxCount = count;
      mostCommonStage = stage;
    }
  }

  const repeatedFeatures = Object.entries(featureFailures)
    .filter(([, count]) => count > 1)
    .map(([slug, count]) => ({ slug, count }));

  const failureRate = (failedRuns.length / history.length) * 100;
  // The rate is flagged as high above 15%; advice is only given above 20%.
  const isHighFailureRate = failureRate > 15;

  let recommendation = null;
  if (failureRate > 20) {
    // Avoid interpolating "null" when no failed entry recorded its stage.
    recommendation = mostCommonStage === null
      ? 'High failure rate detected. Review recent failures for common issues'
      : `High failure rate detected. Review ${mostCommonStage} stage for common issues`;
  }

  return {
    failuresByStage,
    mostCommonStage,
    failureCount: maxCount,
    repeatedFeatures,
    failureRate: Math.round(failureRate * 10) / 10,
    isHighFailureRate,
    recommendation
  };
}
117
+
118
/**
 * Flags recent stage executions that ran unusually long.
 *
 * For each stage, the mean and standard deviation are computed over every
 * entry that has a `stages` object and a truthy `durationMs` for that stage.
 * The last 10 such entries are then checked against `mean + 2 * stddev`.
 *
 * Note: the candidate run is part of the sample it is compared against, and
 * the deviation is a population standard deviation. By Samuelson's inequality
 * a stage therefore needs at least 6 recorded durations before any single run
 * can exceed the two-sigma threshold.
 *
 * @param {Array<object>} history - History entries.
 * @returns {object} `{ insufficientData: true, message }` with fewer than three
 *   entries carrying stage data, `{ noAnomalies: true, message }` when nothing
 *   is flagged, otherwise `{ anomalies, recommendation }` where each anomaly is
 *   `{ slug, stage, actual, expected, deviation }`.
 */
function detectAnomalies(history) {
  const runsWithStages = history.filter(e => e.stages);
  if (runsWithStages.length < 3) {
    return { insufficientData: true, message: 'Insufficient data for anomaly detection' };
  }

  const recentRuns = runsWithStages.slice(-10);
  const anomalies = [];

  for (const stage of STAGES) {
    const durations = [];
    for (const entry of runsWithStages) {
      const durationMs = entry.stages[stage] && entry.stages[stage].durationMs;
      if (durationMs) {
        durations.push(durationMs);
      }
    }

    const mean = calculateMean(durations);
    const stddev = calculateStdDev(durations, mean);

    // Zero (or NaN) spread means no run can be an outlier for this stage.
    if (!(stddev > 0)) {
      continue;
    }

    const threshold = mean + 2 * stddev;
    for (const entry of recentRuns) {
      const stageInfo = entry.stages[stage];
      if (stageInfo && stageInfo.durationMs > threshold) {
        const actual = stageInfo.durationMs;
        anomalies.push({
          slug: entry.slug,
          stage,
          actual,
          expected: Math.round(mean),
          deviation: Math.round(((actual - mean) / stddev) * 10) / 10
        });
      }
    }
  }

  if (anomalies.length === 0) {
    return { noAnomalies: true, message: 'No anomalies detected in recent runs' };
  }

  return {
    anomalies,
    recommendation: 'Review flagged runs for unusual conditions or environment issues'
  };
}
175
+
176
/**
 * Compares the first half of the history with the second half.
 *
 * The history is split at `Math.floor(length / 2)`, so with an odd length the
 * second half holds the extra entry. A change of more than 10 percentage
 * points in success rate, or more than 10% in average total duration, marks a
 * trend as improving or degrading; anything else is 'stable'.
 *
 * @param {Array<object>} history - History entries. NOTE(review): the
 *   comparison only makes sense if entries are ordered oldest first — confirm
 *   against the history writer.
 * @returns {object} `{ insufficientData: true, message }` with fewer than six
 *   entries, otherwise `{ successRate, duration, recommendation }`.
 */
function analyzeTrends(history) {
  if (history.length < 6) {
    return { insufficientData: true, message: 'Insufficient data for trend analysis (need 6+ runs)' };
  }

  const midpoint = Math.floor(history.length / 2);
  const firstHalf = history.slice(0, midpoint);
  const secondHalf = history.slice(midpoint);

  const successRateOf = (runs) =>
    runs.filter(e => e.status === 'success').length / runs.length * 100;
  const durationsOf = (runs) =>
    runs.filter(e => e.totalDurationMs).map(e => e.totalDurationMs);

  // Success rate trend
  const firstSuccessRate = successRateOf(firstHalf);
  const secondSuccessRate = successRateOf(secondHalf);
  const successRateChange = secondSuccessRate - firstSuccessRate;

  let successTrend = 'stable';
  if (successRateChange > 10) successTrend = 'improving';
  else if (successRateChange < -10) successTrend = 'degrading';

  // Duration trend
  const firstDurations = durationsOf(firstHalf);
  const secondDurations = durationsOf(secondHalf);
  const firstAvgDuration = calculateMean(firstDurations);
  const secondAvgDuration = calculateMean(secondDurations);

  // A half without any recorded duration averages to 0. Comparing against
  // that would report a bogus -100% "improvement", so a change is only
  // computed when both halves have samples.
  const canCompareDurations =
    firstDurations.length > 0 && secondDurations.length > 0 && firstAvgDuration > 0;
  const durationChange = canCompareDurations
    ? ((secondAvgDuration - firstAvgDuration) / firstAvgDuration) * 100
    : 0;

  let durationTrend = 'stable';
  if (durationChange < -10) durationTrend = 'improving';
  else if (durationChange > 10) durationTrend = 'degrading';

  // A declining success rate takes priority over a slowdown.
  let recommendation = null;
  if (successTrend === 'degrading') {
    recommendation = 'Pipeline success rate is declining. Review recent changes for regressions';
  } else if (durationTrend === 'degrading') {
    recommendation = 'Pipeline duration is increasing. Consider performance optimization';
  }

  return {
    successRate: {
      trend: successTrend,
      change: Math.round(successRateChange * 10) / 10,
      first: Math.round(firstSuccessRate * 10) / 10,
      second: Math.round(secondSuccessRate * 10) / 10
    },
    duration: {
      trend: durationTrend,
      change: Math.round(durationChange * 10) / 10,
      first: Math.round(firstAvgDuration),
      second: Math.round(secondAvgDuration)
    },
    recommendation
  };
}
231
+
232
/** Body lines of the BOTTLENECK ANALYSIS section. */
function bottleneckSectionLines(bottlenecks) {
  if (bottlenecks.insufficientData) {
    return [` ${bottlenecks.message}`];
  }
  // A result without a slowest stage has nothing to report; avoid printing "null".
  if (bottlenecks.bottleneckStage == null) {
    return [' No stage durations recorded'];
  }

  const lines = [
    ` Slowest stage: ${bottlenecks.bottleneckStage} (${bottlenecks.percentage}% of pipeline)`,
    ` Average duration: ${formatDuration(bottlenecks.avgDurationMs)}`
  ];
  if (bottlenecks.isBottleneck) {
    lines.push(' Status: BOTTLENECK DETECTED');
  }
  if (bottlenecks.recommendation) {
    lines.push(` Recommendation: ${bottlenecks.recommendation}`);
  }
  return lines;
}

/** Body lines of the FAILURE PATTERNS section. */
function failureSectionLines(failures) {
  if (failures.noFailures) {
    return [` ${failures.message}`];
  }

  // The stage is null when no failed run recorded where it failed.
  const stageName = failures.mostCommonStage ?? 'unknown';
  const lines = [
    ` Most common failure stage: ${stageName} (${failures.failureCount} failures)`,
    ` Overall failure rate: ${failures.failureRate}%`
  ];
  if (failures.repeatedFeatures.length > 0) {
    lines.push(' Features with repeated failures:');
    for (const f of failures.repeatedFeatures) {
      lines.push(` - ${f.slug} (${f.count} failures)`);
    }
  }
  if (failures.recommendation) {
    lines.push(` Recommendation: ${failures.recommendation}`);
  }
  return lines;
}

/** Body lines of the ANOMALY DETECTION section. */
function anomalySectionLines(anomalies) {
  if (anomalies.insufficientData || anomalies.noAnomalies) {
    return [` ${anomalies.message}`];
  }

  const lines = [' Anomalous runs detected:'];
  for (const a of anomalies.anomalies) {
    lines.push(` - ${a.slug}/${a.stage}: ${formatDuration(a.actual)} (expected ~${formatDuration(a.expected)}, ${a.deviation}x stddev)`);
  }
  if (anomalies.recommendation) {
    lines.push(` Recommendation: ${anomalies.recommendation}`);
  }
  return lines;
}

/** Body lines of the TREND ANALYSIS section. */
function trendSectionLines(trends) {
  if (trends.insufficientData) {
    return [` ${trends.message}`];
  }

  // Positive changes get an explicit '+'; negative numbers carry their own sign.
  const signed = (change) => `${change > 0 ? '+' : ''}${change}`;
  const lines = [
    ` Success rate: ${trends.successRate.trend} (${signed(trends.successRate.change)}%)`,
    ` Duration: ${trends.duration.trend} (${signed(trends.duration.change)}%)`
  ];
  if (trends.recommendation) {
    lines.push(` Recommendation: ${trends.recommendation}`);
  }
  return lines;
}

/**
 * Renders the analysis as a human-readable text report.
 *
 * @param {object} analysis - Object with `bottlenecks`, `failures`, `anomalies`
 *   and `trends` results; only the requested sections are read.
 * @param {string[]} [sections=[]] - Section keys to include. An empty or
 *   omitted list renders every section.
 * @returns {string} Report text; each section is a title line, its body lines
 *   and one blank line.
 */
function formatTextOutput(analysis, sections = []) {
  const showAll = sections.length === 0;
  const renderers = [
    ['bottlenecks', 'BOTTLENECK ANALYSIS', bottleneckSectionLines],
    ['failures', 'FAILURE PATTERNS', failureSectionLines],
    ['anomalies', 'ANOMALY DETECTION', anomalySectionLines],
    ['trends', 'TREND ANALYSIS', trendSectionLines]
  ];

  const lines = ['\nPipeline Insights\n'];
  for (const [key, title, render] of renderers) {
    if (showAll || sections.includes(key)) {
      lines.push(title, ...render(analysis[key]), '');
    }
  }
  return lines.join('\n');
}
314
+
315
/**
 * Renders the analysis as pretty-printed JSON (2-space indent).
 *
 * @param {object} analysis - Object with `bottlenecks`, `failures`, `anomalies`
 *   and `trends` results.
 * @param {string[]} [sections=[]] - Section keys to include. An empty or
 *   omitted list includes every section.
 * @returns {string} JSON text whose keys appear in the fixed order
 *   bottlenecks, failures, anomalies, trends.
 */
function formatJsonOutput(analysis, sections = []) {
  const showAll = sections.length === 0;
  const output = {};

  for (const key of ['bottlenecks', 'failures', 'anomalies', 'trends']) {
    if (showAll || sections.includes(key)) {
      output[key] = analysis[key];
    }
  }

  return JSON.stringify(output, null, 2);
}
334
+
335
/**
 * Reads the pipeline history, runs every analysis and prints the report.
 *
 * @param {object} [options] - Display options.
 * @param {boolean} [options.json] - Print JSON instead of text.
 * @param {boolean} [options.bottlenecks] - Include the bottleneck section.
 * @param {boolean} [options.failures] - Include the failure section.
 * @param {boolean} [options.anomalies] - Include the anomaly section.
 * @param {boolean} [options.trends] - Include the trend section.
 *   When no section flag is set, every section is printed.
 */
function displayInsights(options = {}) {
  const history = readHistoryFile();

  // Optional chaining keeps a null/undefined history from throwing here; it
  // is reported by the "no history" branch below instead.
  if (history?.error === 'corrupted') {
    console.log("Warning: History file is corrupted. Run 'orchestr8 history clear' to reset.");
    return;
  }

  if (!history || history.length === 0) {
    console.log('No pipeline history found.');
    return;
  }

  const analysis = {
    bottlenecks: analyzeBottlenecks(history),
    failures: analyzeFailures(history),
    anomalies: detectAnomalies(history),
    trends: analyzeTrends(history)
  };

  // Every section the formatters understand can be requested by its flag.
  const sections = ['bottlenecks', 'failures', 'anomalies', 'trends']
    .filter(name => options[name]);

  if (options.json) {
    console.log(formatJsonOutput(analysis, sections));
  } else {
    console.log(formatTextOutput(analysis, sections));
  }
}
365
+
366
/**
 * Scores how well an agent's feedback rating anticipated the run outcome.
 * Per FEATURE_SPEC.md:Rule 4.
 *
 * A rating of 3 or higher counts as a prediction of success; the score is the
 * share of rated runs where that prediction matched `status === 'success'`.
 *
 * @param {string} agent - Agent name (alex, cass, nigel)
 * @param {Array} history - History entries
 * @returns {number|null} Calibration score 0-1, or null when fewer than 10
 *   entries carry feedback for the agent
 */
function calculateCalibration(agent, history) {
  const rated = history.filter((entry) => entry.stages?.[agent]?.feedback);
  if (rated.length < 10) {
    return null;
  }

  const agreeing = rated.reduce((tally, entry) => {
    const predictedSuccess = entry.stages[agent].feedback.rating >= 3;
    const actualSuccess = entry.status === 'success';
    return predictedSuccess === actualSuccess ? tally + 1 : tally;
  }, 0);

  return agreeing / rated.length;
}
386
+
387
/**
 * Correlates issue codes with failure rates.
 *
 * Every occurrence of an issue code in any stage's `feedback.issues` array is
 * counted; the correlation is the share of those occurrences that belong to
 * an entry with `status === 'failed'`.
 *
 * @param {Array} history - History entries
 * @returns {object} Map of issue code to failure correlation (0-1)
 */
function correlateIssues(history) {
  // A Map keeps codes such as "constructor" or "toString" from colliding with
  // properties inherited from Object.prototype while counting.
  const tallies = new Map();

  for (const entry of history) {
    const failed = entry.status === 'failed';
    for (const stage of Object.values(entry.stages || {})) {
      // Tolerate null stage records and feedback without an issue list.
      const issues = stage?.feedback?.issues;
      if (!Array.isArray(issues)) continue;

      for (const issue of issues) {
        // Coerce to string so codes are keyed like object property names.
        const code = String(issue);
        const tally = tallies.get(code) || { count: 0, failures: 0 };
        tally.count += 1;
        if (failed) tally.failures += 1;
        tallies.set(code, tally);
      }
    }
  }

  return Object.fromEntries(
    [...tallies].map(([code, { count, failures }]) => [code, failures / count])
  );
}
415
+
416
/**
 * Recommends a rating threshold based on historical data.
 *
 * Each candidate threshold (2, 2.5, 3, 3.5, 4) predicts success when the cass
 * feedback rating is at or above it; a missing or falsy rating is treated as
 * 3. The highest candidate whose predictions are correct for more than 70% of
 * all entries is returned, and 3.0 is the fallback when none qualifies.
 *
 * @param {Array} history - History entries
 * @returns {number} Recommended threshold value
 */
function recommendThreshold(history) {
  const candidates = [2, 2.5, 3, 3.5, 4];
  const requiredCorrect = history.length * 0.7;
  let recommended = 3.0;

  for (const candidate of candidates) {
    let correct = 0;
    for (const entry of history) {
      const rating = entry.stages?.cass?.feedback?.rating || 3;
      const predictedSuccess = rating >= candidate;
      const succeeded = entry.status === 'success';
      if (predictedSuccess === succeeded) {
        correct += 1;
      }
    }
    // Candidates ascend, so a later qualifying candidate replaces an earlier one.
    if (correct > requiredCorrect) {
      recommended = candidate;
    }
  }

  return recommended;
}
433
+
434
/**
 * Displays feedback-specific insights: per-agent calibration, issue/failure
 * correlations (highest failure rate first) and, once at least 10 entries
 * carry feedback on any stage, a suggested rating threshold.
 *
 * @param {object} options - Display options (currently not read)
 */
function displayFeedbackInsights(options = {}) {
  const history = readHistoryFile();

  // Optional chaining keeps a null/undefined history from throwing here; it
  // is reported by the "no history" branch below instead.
  if (history?.error === 'corrupted') {
    console.log("Warning: History file is corrupted.");
    return;
  }

  if (!history || history.length === 0) {
    console.log('No pipeline history found.');
    return;
  }

  console.log('\nFeedback Insights\n');

  // Agent calibration
  console.log('AGENT CALIBRATION');
  for (const agent of ['alex', 'cass', 'nigel']) {
    const calibration = calculateCalibration(agent, history);
    if (calibration === null) {
      console.log(` ${agent.padEnd(8)}: Insufficient data (<10 runs)`);
    } else {
      const pct = Math.round(calibration * 100);
      console.log(` ${agent.padEnd(8)}: ${pct}% accuracy`);
    }
  }
  console.log('');

  // Issue correlations
  const correlations = correlateIssues(history);
  if (Object.keys(correlations).length > 0) {
    console.log('ISSUE CORRELATIONS');
    const sorted = Object.entries(correlations)
      .sort(([, a], [, b]) => b - a);
    for (const [issue, corr] of sorted) {
      const pct = Math.round(corr * 100);
      console.log(` ${issue.padEnd(24)}: ${pct}% failure rate`);
    }
    console.log('');
  }

  // Threshold recommendation
  const entriesWithFeedback = history.filter(e =>
    // `s?.` tolerates stage records that are null.
    Object.values(e.stages || {}).some(s => s?.feedback)
  );
  if (entriesWithFeedback.length >= 10) {
    const recommended = recommendThreshold(history);
    console.log('RECOMMENDATIONS');
    console.log(` Suggested minRatingThreshold: ${recommended}`);
    console.log('');
  }
}
490
+
491
// Pipeline insights: the CLI entry point, the individual analyses and the
// statistics helpers they share.
const pipelineInsightsApi = {
  displayInsights,
  analyzeBottlenecks,
  analyzeFailures,
  detectAnomalies,
  analyzeTrends,
  calculateMean,
  calculateStdDev
};

// Feedback analysis exports
const feedbackInsightsApi = {
  calculateCalibration,
  correlateIssues,
  recommendThreshold,
  displayFeedbackInsights
};

// Spread order keeps the exported keys in the same order as before.
module.exports = { ...pipelineInsightsApi, ...feedbackInsightsApi };