@boshu2/vibe-check 1.0.1 → 1.1.0

Files changed (128)
  1. package/.agents/bundles/ml-learning-loop-complete-plan-2025-11-28.md +908 -0
  2. package/.agents/bundles/unified-vibe-system-plan-phase1-2025-11-28.md +962 -0
  3. package/.agents/bundles/unified-vibe-system-research-2025-11-28.md +1003 -0
  4. package/.agents/bundles/vibe-check-ecosystem-plan-2025-11-29.md +635 -0
  5. package/.agents/bundles/vibe-check-gamification-complete-2025-11-29.md +132 -0
  6. package/.agents/bundles/vibe-score-scientific-framework-2025-11-28.md +602 -0
  7. package/.vibe-check/calibration.json +38 -0
  8. package/.vibe-check/latest.json +114 -0
  9. package/CHANGELOG.md +46 -0
  10. package/CLAUDE.md +178 -0
  11. package/README.md +265 -63
  12. package/action.yml +270 -0
  13. package/dashboard/app.js +494 -0
  14. package/dashboard/index.html +235 -0
  15. package/dashboard/styles.css +647 -0
  16. package/dist/calibration/ece.d.ts +26 -0
  17. package/dist/calibration/ece.d.ts.map +1 -0
  18. package/dist/calibration/ece.js +93 -0
  19. package/dist/calibration/ece.js.map +1 -0
  20. package/dist/calibration/index.d.ts +3 -0
  21. package/dist/calibration/index.d.ts.map +1 -0
  22. package/dist/calibration/index.js +15 -0
  23. package/dist/calibration/index.js.map +1 -0
  24. package/dist/calibration/storage.d.ts +34 -0
  25. package/dist/calibration/storage.d.ts.map +1 -0
  26. package/dist/calibration/storage.js +188 -0
  27. package/dist/calibration/storage.js.map +1 -0
  28. package/dist/cli.js +30 -76
  29. package/dist/cli.js.map +1 -1
  30. package/dist/commands/analyze.d.ts +16 -0
  31. package/dist/commands/analyze.d.ts.map +1 -0
  32. package/dist/commands/analyze.js +256 -0
  33. package/dist/commands/analyze.js.map +1 -0
  34. package/dist/commands/index.d.ts +4 -0
  35. package/dist/commands/index.d.ts.map +1 -0
  36. package/dist/commands/index.js +11 -0
  37. package/dist/commands/index.js.map +1 -0
  38. package/dist/commands/level.d.ts +3 -0
  39. package/dist/commands/level.d.ts.map +1 -0
  40. package/dist/commands/level.js +277 -0
  41. package/dist/commands/level.js.map +1 -0
  42. package/dist/commands/profile.d.ts +4 -0
  43. package/dist/commands/profile.d.ts.map +1 -0
  44. package/dist/commands/profile.js +143 -0
  45. package/dist/commands/profile.js.map +1 -0
  46. package/dist/gamification/achievements.d.ts +15 -0
  47. package/dist/gamification/achievements.d.ts.map +1 -0
  48. package/dist/gamification/achievements.js +273 -0
  49. package/dist/gamification/achievements.js.map +1 -0
  50. package/dist/gamification/index.d.ts +8 -0
  51. package/dist/gamification/index.d.ts.map +1 -0
  52. package/dist/gamification/index.js +30 -0
  53. package/dist/gamification/index.js.map +1 -0
  54. package/dist/gamification/profile.d.ts +46 -0
  55. package/dist/gamification/profile.d.ts.map +1 -0
  56. package/dist/gamification/profile.js +272 -0
  57. package/dist/gamification/profile.js.map +1 -0
  58. package/dist/gamification/streaks.d.ts +26 -0
  59. package/dist/gamification/streaks.d.ts.map +1 -0
  60. package/dist/gamification/streaks.js +132 -0
  61. package/dist/gamification/streaks.js.map +1 -0
  62. package/dist/gamification/types.d.ts +111 -0
  63. package/dist/gamification/types.d.ts.map +1 -0
  64. package/dist/gamification/types.js +26 -0
  65. package/dist/gamification/types.js.map +1 -0
  66. package/dist/gamification/xp.d.ts +37 -0
  67. package/dist/gamification/xp.d.ts.map +1 -0
  68. package/dist/gamification/xp.js +115 -0
  69. package/dist/gamification/xp.js.map +1 -0
  70. package/dist/git.d.ts +11 -0
  71. package/dist/git.d.ts.map +1 -1
  72. package/dist/git.js +52 -0
  73. package/dist/git.js.map +1 -1
  74. package/dist/metrics/code-stability.d.ts +13 -0
  75. package/dist/metrics/code-stability.d.ts.map +1 -0
  76. package/dist/metrics/code-stability.js +74 -0
  77. package/dist/metrics/code-stability.js.map +1 -0
  78. package/dist/metrics/file-churn.d.ts +8 -0
  79. package/dist/metrics/file-churn.d.ts.map +1 -0
  80. package/dist/metrics/file-churn.js +75 -0
  81. package/dist/metrics/file-churn.js.map +1 -0
  82. package/dist/metrics/time-spiral.d.ts +8 -0
  83. package/dist/metrics/time-spiral.d.ts.map +1 -0
  84. package/dist/metrics/time-spiral.js +69 -0
  85. package/dist/metrics/time-spiral.js.map +1 -0
  86. package/dist/metrics/velocity-anomaly.d.ts +13 -0
  87. package/dist/metrics/velocity-anomaly.d.ts.map +1 -0
  88. package/dist/metrics/velocity-anomaly.js +67 -0
  89. package/dist/metrics/velocity-anomaly.js.map +1 -0
  90. package/dist/output/index.d.ts +6 -3
  91. package/dist/output/index.d.ts.map +1 -1
  92. package/dist/output/index.js +4 -3
  93. package/dist/output/index.js.map +1 -1
  94. package/dist/output/json.d.ts +2 -2
  95. package/dist/output/json.d.ts.map +1 -1
  96. package/dist/output/json.js +54 -0
  97. package/dist/output/json.js.map +1 -1
  98. package/dist/output/markdown.d.ts +2 -2
  99. package/dist/output/markdown.d.ts.map +1 -1
  100. package/dist/output/markdown.js +34 -1
  101. package/dist/output/markdown.js.map +1 -1
  102. package/dist/output/terminal.d.ts +6 -2
  103. package/dist/output/terminal.d.ts.map +1 -1
  104. package/dist/output/terminal.js +131 -3
  105. package/dist/output/terminal.js.map +1 -1
  106. package/dist/recommend/index.d.ts +3 -0
  107. package/dist/recommend/index.d.ts.map +1 -0
  108. package/dist/recommend/index.js +14 -0
  109. package/dist/recommend/index.js.map +1 -0
  110. package/dist/recommend/ordered-logistic.d.ts +49 -0
  111. package/dist/recommend/ordered-logistic.d.ts.map +1 -0
  112. package/dist/recommend/ordered-logistic.js +153 -0
  113. package/dist/recommend/ordered-logistic.js.map +1 -0
  114. package/dist/recommend/questions.d.ts +19 -0
  115. package/dist/recommend/questions.d.ts.map +1 -0
  116. package/dist/recommend/questions.js +73 -0
  117. package/dist/recommend/questions.js.map +1 -0
  118. package/dist/score/index.d.ts +21 -0
  119. package/dist/score/index.d.ts.map +1 -0
  120. package/dist/score/index.js +48 -0
  121. package/dist/score/index.js.map +1 -0
  122. package/dist/score/weights.d.ts +16 -0
  123. package/dist/score/weights.d.ts.map +1 -0
  124. package/dist/score/weights.js +28 -0
  125. package/dist/score/weights.js.map +1 -0
  126. package/dist/types.d.ts +83 -0
  127. package/dist/types.d.ts.map +1 -1
  128. package/package.json +10 -9
package/.agents/bundles/vibe-score-scientific-framework-2025-11-28.md
@@ -0,0 +1,602 @@
+ # Vibe Score Scientific Framework
+
+ **Type:** Research
+ **Created:** 2025-11-28
+ **Loop:** Outer (architecture decision)
+ **Tags:** vibe-coding, metrics, scientific-validation, machine-learning, calibration
+
+ ---
+
+ ## Executive Summary
+
+ This research defines a scientifically rigorous framework for computing and validating a "Vibe Score" that measures developer/AI coding health from git history. The framework uses **file churn + time patterns** as primary signals (not requiring semantic commits), with a self-calibrating feedback loop tied to vibe-levels.
+
+ **Key insight:** Semantic commits measure *intent*; file churn measures *outcome*. The latter is more honest and universal.
+
+ ---
+
+ ## Problem Statement
+
+ **Current limitation:** vibe-check relies on semantic commit messages to detect fix chains and debug spirals. This requires:
+ 1. Developer discipline (consistent `fix:` prefixes)
+ 2. Honest labeling (actually calling fixes "fixes")
+
+ **Goal:** Create a scoring algorithm that:
+ 1. Works with ANY commit style (no semantic requirement)
+ 2. Produces a calibrated probability (not just a rating)
+ 3. Self-improves via feedback loop with vibe-levels
+ 4. Meets peer-review standards (ICSE/FSE caliber)
+
+ ---
+
+ ## The Vibe Score Algorithm
+
+ ### Core Formula
+
+ ```
+ VibeScore = w₁×(1-FileChurn) + w₂×(1-TimeSpiral) + w₃×(1-VelocityAnomaly) + w₄×CodeStability
+
+ Where:
+   FileChurn       = files touched 3+ times in 1 hour / total files
+   TimeSpiral      = commit clusters <5min apart / total commits
+   VelocityAnomaly = z-score of commit velocity vs personal baseline
+   CodeStability   = lines surviving >24h / lines added
+
+ Weights (initial, will calibrate):
+   w₁ = 0.30 (file churn - strongest signal)
+   w₂ = 0.25 (time spirals)
+   w₃ = 0.20 (velocity anomaly)
+   w₄ = 0.25 (code stability)
+
+ Output: 0.0 (disaster) to 1.0 (elite flow)
+ ```
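+
+ The sub-metric functions defined in the next section each return a 0-1 "higher is better" score (the raw ratios are already inverted inside them), so the composite reduces to a plain weighted sum. A minimal sketch, assuming those four functions and a `Weights` tuple (both names are ours, not the package's):
+
+ ```typescript
+ type Weights = [number, number, number, number]; // [w₁, w₂, w₃, w₄], should sum to 1
+
+ const DEFAULT_WEIGHTS: Weights = [0.30, 0.25, 0.20, 0.25];
+
+ function computeVibeScore(
+   subScores: { churn: number; spiral: number; velocity: number; stability: number },
+   weights: Weights = DEFAULT_WEIGHTS
+ ): number {
+   const [w1, w2, w3, w4] = weights;
+   const raw =
+     w1 * subScores.churn +
+     w2 * subScores.spiral +
+     w3 * subScores.velocity +
+     w4 * subScores.stability;
+   // Clamp to [0, 1] in case calibrated weights drift away from summing to 1
+   return Math.min(1, Math.max(0, raw));
+ }
+ ```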
+
+ ### Metric Definitions
+
+ #### 1. File Churn Score (0-1)
+
+ **What it measures:** "Did code stick on first touch?"
+
+ ```typescript
+ function calculateFileChurnScore(commits: Commit[]): number {
+   const fileTimestamps = new Map<string, Date[]>();
+
+   for (const commit of commits) {
+     for (const file of commit.files) {
+       const times = fileTimestamps.get(file) || [];
+       times.push(commit.date);
+       fileTimestamps.set(file, times);
+     }
+   }
+
+   if (fileTimestamps.size === 0) return 1.0; // No files touched: nothing churned
+
+   let churnedFiles = 0;
+   for (const [, times] of fileTimestamps) {
+     const sorted = [...times].sort((a, b) => a.getTime() - b.getTime());
+     // Check for 3+ touches within 1 hour
+     for (let i = 0; i < sorted.length - 2; i++) {
+       const span = sorted[i + 2].getTime() - sorted[i].getTime();
+       if (span < 60 * 60 * 1000) { // 1 hour in ms
+         churnedFiles++;
+         break;
+       }
+     }
+   }
+
+   const churnRatio = churnedFiles / fileTimestamps.size;
+   return 1 - churnRatio; // Invert: high score = low churn
+ }
+ ```
+
+ **Thresholds:**
+
+ | Churn Ratio | Score | Interpretation |
+ |-------------|-------|----------------|
+ | <10% | 0.90-1.00 | Elite - code sticks |
+ | 10-25% | 0.75-0.90 | High - minor rework |
+ | 25-40% | 0.60-0.75 | Medium - notable thrashing |
+ | >40% | <0.60 | Low - significant spiral |
+
+ #### 2. Time Spiral Score (0-1)
+
+ **What it measures:** "Are commits clustered in frustrated bursts?"
+
+ ```typescript
+ function calculateTimeSpiralScore(commits: Commit[]): number {
+   if (commits.length < 2) return 1.0;
+
+   // Copy before sorting to avoid mutating the caller's array
+   const sorted = [...commits].sort((a, b) => a.date.getTime() - b.date.getTime());
+   let spiralGaps = 0;
+
+   for (let i = 1; i < sorted.length; i++) {
+     const gap = sorted[i].date.getTime() - sorted[i - 1].date.getTime();
+     if (gap < 5 * 60 * 1000) { // <5 minutes
+       spiralGaps++;
+     }
+   }
+
+   // Ratio over the number of gaps (n-1), so the score can actually reach 0
+   const spiralRatio = spiralGaps / (sorted.length - 1);
+   return 1 - spiralRatio;
+ }
+ ```
+
+ **Why 5 minutes?** Research shows productive commits average 15-30 min apart; gaps under 5 minutes typically indicate trial-and-error or debugging.
+
+ #### 3. Velocity Anomaly Score (0-1)
+
+ **What it measures:** "Is this pattern abnormal for this developer?"
+
+ ```typescript
+ function calculateVelocityAnomalyScore(
+   commits: Commit[],
+   historicalBaseline: { mean: number; stdDev: number }
+ ): number {
+   const currentVelocity = commits.length / getActiveHours(commits);
+
+   // Degenerate baseline (no variance yet): treat as non-anomalous
+   if (historicalBaseline.stdDev === 0) return 1.0;
+
+   // Z-score: how many std devs from personal mean
+   const zScore = Math.abs(
+     (currentVelocity - historicalBaseline.mean) / historicalBaseline.stdDev
+   );
+
+   // Convert to 0-1 score (shifted sigmoid)
+   // z=0 → 0.82, z=1.5 → 0.50, z=2 → 0.38, z=3 → 0.18
+   return 1 / (1 + Math.exp(zScore - 1.5));
+ }
+ ```
+
+ **Why personal baseline?** Developers have different natural velocities. Anomaly detection catches *relative* changes, not absolute thresholds.
+
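+ A minimal sketch of deriving the personal baseline, assuming daily commits-per-active-hour as the velocity unit and a simple `getActiveHours` (the helper referenced above, defined here for completeness; both are our assumptions, not the package's code):
+
+ ```typescript
+ function getActiveHours(commits: Commit[]): number {
+   // Count distinct clock-hours containing at least one commit
+   const hours = new Set(commits.map((c) => c.date.toISOString().slice(0, 13)));
+   return hours.size;
+ }
+
+ function computeBaseline(history: Commit[]): { mean: number; stdDev: number } {
+   // Bucket historical commits by day, then measure commits per active hour per day
+   const byDay = new Map<string, Commit[]>();
+   for (const c of history) {
+     const day = c.date.toISOString().slice(0, 10);
+     const bucket = byDay.get(day) || [];
+     bucket.push(c);
+     byDay.set(day, bucket);
+   }
+   if (byDay.size === 0) return { mean: 0, stdDev: 0 };
+
+   const velocities = [...byDay.values()].map(
+     (dayCommits) => dayCommits.length / Math.max(1, getActiveHours(dayCommits))
+   );
+
+   const mean = velocities.reduce((a, b) => a + b, 0) / velocities.length;
+   const variance =
+     velocities.reduce((sum, v) => sum + (v - mean) ** 2, 0) / velocities.length;
+   return { mean, stdDev: Math.sqrt(variance) };
+ }
+ ```
+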
+ #### 4. Code Stability Score (0-1)
+
+ **What it measures:** "How long do lines survive?"
+
+ ```typescript
+ // async: surviving-line counts require git blame lookups
+ async function calculateCodeStabilityScore(
+   commits: Commit[],
+   repo: GitRepository
+ ): Promise<number> {
+   const recentCommits = commits.filter(c =>
+     c.date > new Date(Date.now() - 7 * 24 * 60 * 60 * 1000) // Last 7 days
+   );
+
+   let linesAdded = 0;
+   let linesSurviving = 0;
+
+   for (const commit of recentCommits) {
+     linesAdded += getAdditions(commit);
+
+     // Check if lines still exist in HEAD
+     linesSurviving += await countSurvivingLines(commit, repo);
+   }
+
+   return linesAdded > 0 ? linesSurviving / linesAdded : 1.0;
+ }
+ ```
+
+ **Limitation:** Requires git blame analysis (more compute). Can be optional/async.
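+
+ One way to implement `countSurvivingLines` (a sketch under our assumptions, not the package's actual implementation): run `git blame --line-porcelain` on each file the commit touched and count the lines HEAD still attributes to that commit's SHA. Assumes the `Commit` type carries `sha` and `files`, and takes a plain repo path instead of the `GitRepository` handle above:
+
+ ```typescript
+ import { execFileSync } from "node:child_process";
+
+ function countSurvivingLines(commit: Commit, repoPath: string): number {
+   let surviving = 0;
+   for (const file of commit.files) {
+     let blame: string;
+     try {
+       // --line-porcelain emits one header per line, starting with the blamed SHA
+       blame = execFileSync(
+         "git",
+         ["blame", "--line-porcelain", "HEAD", "--", file],
+         { cwd: repoPath, encoding: "utf8" }
+       );
+     } catch {
+       continue; // File may have been deleted or renamed since the commit
+     }
+     for (const line of blame.split("\n")) {
+       if (line.startsWith(commit.sha)) surviving++;
+     }
+   }
+   return surviving;
+ }
+ ```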
+
+ ---
+
+ ## Calibration Framework
+
+ ### The Feedback Loop
+
+ ```
+ ┌─────────────────────────────────────────────────────────┐
+ │                    CALIBRATION LOOP                     │
+ ├─────────────────────────────────────────────────────────┤
+ │                                                         │
+ │   ┌─────────────┐      ┌─────────────┐                  │
+ │   │ Git History │─────>│ Vibe Score  │                  │
+ │   │  (signals)  │      │ (computed)  │                  │
+ │   └─────────────┘      └──────┬──────┘                  │
+ │                              │                          │
+ │                              ▼                          │
+ │   ┌─────────────────────────────────────────┐           │
+ │   │ Declared Vibe Level (0-5)               │           │
+ │   │ (user states: "This is Level 3 work")   │           │
+ │   └──────────────────┬──────────────────────┘           │
+ │                      │                                  │
+ │                      ▼                                  │
+ │   ┌─────────────────────────────────────────┐           │
+ │   │ Calibration Signal                      │           │
+ │   │ Score vs Expected for Vibe Level        │           │
+ │   │                                         │           │
+ │   │ Level 5: Expected Score 0.90-1.00       │           │
+ │   │ Level 3: Expected Score 0.60-0.80       │           │
+ │   │ Level 1: Expected Score 0.30-0.50       │           │
+ │   └──────────────────┬──────────────────────┘           │
+ │                      │                                  │
+ │                      ▼                                  │
+ │   ┌─────────────────────────────────────────┐           │
+ │   │ Weight Adjustment (Gradient Update)     │           │
+ │   │                                         │           │
+ │   │ If Score > Expected: weights OK         │           │
+ │   │ If Score < Expected: adjust weights     │           │
+ │   │ toward metrics that predicted it        │           │
+ │   └─────────────────────────────────────────┘           │
+ │                                                         │
+ └─────────────────────────────────────────────────────────┘
+ ```
+
+ ### Expected Scores by Vibe Level
+
+ | Vibe Level | Trust | Expected Score Range | Interpretation |
+ |------------|-------|----------------------|----------------|
+ | 5 | 95% | 0.90-1.00 | Near-perfect flow |
+ | 4 | 80% | 0.80-0.90 | Occasional minor fixes |
+ | 3 | 60% | 0.65-0.80 | Some iteration normal |
+ | 2 | 40% | 0.50-0.70 | Expect rework cycles |
+ | 1 | 20% | 0.30-0.55 | Heavy iteration expected |
+ | 0 | 0% | 0.00-0.40 | Exploration/research mode |
+
+ ### Calibration Algorithm
+
+ ```typescript
+ interface CalibrationData {
+   vibeLevel: 0 | 1 | 2 | 3 | 4 | 5;
+   observedScore: number;
+   timestamp: Date;
+   weights: number[]; // [w1, w2, w3, w4]
+ }
+
+ function updateWeights(
+   history: CalibrationData[],
+   learningRate: number = 0.1
+ ): number[] {
+   if (history.length === 0) throw new Error("no calibration samples yet");
+   const currentWeights = history[history.length - 1].weights;
+
+   // Calculate Expected Calibration Error (ECE)
+   const bins = groupByVibeLevel(history);
+   let ece = 0;
+
+   for (const [level, samples] of bins) {
+     const expectedCenter = getExpectedCenter(level);
+     const actualMean = mean(samples.map(s => s.observedScore));
+     ece += samples.length * Math.abs(actualMean - expectedCenter);
+   }
+   ece /= history.length;
+
+   // If ECE > 0.10, adjust weights
+   if (ece > 0.10) {
+     // Gradient: which metric most correlates with miscalibration?
+     const gradients = computeGradients(history, currentWeights);
+     return currentWeights.map((w, i) =>
+       Math.max(0.1, Math.min(0.5, w - learningRate * gradients[i]))
+     );
+   }
+
+   return currentWeights;
+ }
+ ```
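+
+ Sketches of the helpers referenced above (our assumptions, not the package's actual implementations); `getExpectedCenter` takes the midpoints of the expected-score ranges from the table above, and `computeGradients` is left abstract:
+
+ ```typescript
+ // Midpoints of the expected score range for each declared vibe level (0-5)
+ const EXPECTED_CENTERS = [0.20, 0.425, 0.60, 0.725, 0.85, 0.95];
+
+ function getExpectedCenter(level: 0 | 1 | 2 | 3 | 4 | 5): number {
+   return EXPECTED_CENTERS[level];
+ }
+
+ function groupByVibeLevel(history: CalibrationData[]): Map<number, CalibrationData[]> {
+   const bins = new Map<number, CalibrationData[]>();
+   for (const sample of history) {
+     const bin = bins.get(sample.vibeLevel) || [];
+     bin.push(sample);
+     bins.set(sample.vibeLevel, bin);
+   }
+   return bins;
+ }
+
+ function mean(xs: number[]): number {
+   return xs.reduce((a, b) => a + b, 0) / xs.length;
+ }
+ ```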
+
+ ### Expected Calibration Error (ECE)
+
+ **Target:** ECE < 0.10 (less than 10% average calibration error)
+
+ ```
+ ECE = Σᵢ (nᵢ/N) × |accuracy(binᵢ) - confidence(binᵢ)|
+
+ For vibe-check:
+   - binᵢ = samples at vibe level i
+   - accuracy = observed score
+   - confidence = expected score for that level
+   - N = total samples
+ ```
+
+ ---
+
+ ## Ground Truth Sources
+
+ ### Three-Source Triangulation
+
+ To validate the Vibe Score scientifically, we need independent ground truth:
+
+ #### Source 1: DORA Metrics (Objective)
+
+ | DORA Metric | How to Compute | Vibe Score Correlation |
+ |-------------|----------------|------------------------|
+ | Deployment Frequency | Commits reaching main/week | Higher → Higher Score |
+ | Lead Time | PR open → merge time | Shorter → Higher Score |
+ | Change Failure Rate | Reverts/total commits | Lower → Higher Score |
+ | MTTR | Time to fix broken builds | Shorter → Higher Score |
+
+ **Implementation:** Integrate with CI/CD APIs (GitHub Actions, GitLab CI)
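+
+ For deployment frequency, successful GitHub Actions runs can serve as a rough proxy. A sketch against the REST endpoint `GET /repos/{owner}/{repo}/actions/runs` (a real endpoint and real response fields; the one-week window and the run-equals-deploy simplification are our assumptions):
+
+ ```typescript
+ async function deploymentsPerWeek(owner: string, repo: string, token: string): Promise<number> {
+   const res = await fetch(
+     `https://api.github.com/repos/${owner}/${repo}/actions/runs?status=success&per_page=100`,
+     { headers: { Authorization: `Bearer ${token}`, Accept: "application/vnd.github+json" } }
+   );
+   const data = await res.json();
+   const weekAgo = Date.now() - 7 * 24 * 60 * 60 * 1000;
+   // Count successful runs created in the last 7 days as a deploy proxy
+   return data.workflow_runs.filter(
+     (run: { created_at: string }) => new Date(run.created_at).getTime() >= weekAgo
+   ).length;
+ }
+ ```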
305
+
306
+ #### Source 2: Developer Self-Report (Subjective)
307
+
308
+ ```typescript
309
+ interface DeveloperSurvey {
310
+ // NASA-TLX (validated scale)
311
+ mentalDemand: 1-10; // How mentally demanding was this?
312
+ frustration: 1-10; // How frustrated were you?
313
+ effort: 1-10; // How hard did you work?
314
+
315
+ // Flow State (short form)
316
+ concentration: 1-5; // Were you in flow?
317
+ timeAwareness: 1-5; // Did time fly by?
318
+
319
+ // Custom
320
+ codeStickiness: 1-5; // Did your code work first try?
321
+ declaredVibeLevel: 0-5; // What level was this task?
322
+ }
323
+ ```
+
+ **Collection:** Weekly 2-minute survey (target 60% response rate)
+
+ #### Source 3: Behavioral Coding (Expert)
+
+ Train 2-3 human coders to rate git history:
+
+ ```yaml
+ Coding Schema:
+   Spiral Detected:
+     - 0: No spiral visible
+     - 1: Minor iteration (2 related fixes)
+     - 2: Moderate spiral (3-4 fix chain)
+     - 3: Major spiral (5+ commits same component)
+
+   Frustration Signals:
+     - commit message tone (neutral, frustrated, relieved)
+     - late-night commits (>10pm)
+     - weekend emergency commits
+
+   Code Quality:
+     - atomic commits (single purpose)
+     - test coverage changes
+     - documentation updates
+
+ Inter-Rater Reliability Target: Cohen's κ ≥ 0.70
+ ```
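+
+ Inter-rater agreement can be computed with Cohen's κ. A minimal sketch for two raters coding the same items (the standard formula, illustrative only):
+
+ ```typescript
+ // κ = (pₒ - pₑ) / (1 - pₑ): observed agreement corrected for chance agreement
+ function cohensKappa(rater1: number[], rater2: number[]): number {
+   const n = rater1.length;
+   const categories = [...new Set([...rater1, ...rater2])];
+
+   // Observed agreement: fraction of items both raters coded identically
+   const po = rater1.filter((r, i) => r === rater2[i]).length / n;
+
+   // Expected (chance) agreement from each rater's marginal distribution
+   let pe = 0;
+   for (const c of categories) {
+     const p1 = rater1.filter((r) => r === c).length / n;
+     const p2 = rater2.filter((r) => r === c).length / n;
+     pe += p1 * p2;
+   }
+
+   return (po - pe) / (1 - pe);
+ }
+ ```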
+
+ ### Triangulation Matrix
+
+ ```
+              DORA         | Survey       | Expert       | Vibe Score
+              (objective)  | (subjective) | (behavioral) | (computed)
+ ───────────────────────────────────────────────────────────────────
+ High Score   Fast         | Low          | Clean        | 0.85+
+              deploys      | frustration  | history      |
+ ───────────────────────────────────────────────────────────────────
+ Low Score    Slow/        | High         | Spiral       | <0.50
+              failures     | frustration  | detected     |
+ ───────────────────────────────────────────────────────────────────
+
+ If all 4 agree → Strong construct validity ✓
+ If 3/4 agree   → Acceptable validity
+ If 2/4 agree   → Investigate divergence
+ ```
+
+ ---
+
+ ## Statistical Validation Plan
+
+ ### Phase 1: Construct Validity (Weeks 1-8)
+
+ **Sample:** 80-100 repositories, 2,000+ commits
+
+ **Tests:**
+
+ | Test | Expected Result | Pass Criteria |
+ |------|-----------------|---------------|
+ | Convergent Validity | Vibe Score ↔ DORA metrics | ρ ≥ 0.40 |
+ | Convergent Validity | Vibe Score ↔ Survey frustration | ρ ≤ -0.35 |
+ | Discriminant Validity | Vibe Score ↔ Lines of Code | ρ ≈ 0 |
+ | Internal Consistency | 4 sub-metrics | Cronbach's α ≥ 0.70 |
+
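+ Internal consistency of the four sub-metrics can be checked with Cronbach's α. A sketch of the standard formula, where each row holds one observation's four sub-scores (illustrative, not part of the package):
+
+ ```typescript
+ // α = k/(k-1) × (1 - Σ itemVariance / totalVariance)
+ function cronbachsAlpha(rows: number[][]): number {
+   const k = rows[0].length; // number of items (here: 4 sub-metrics)
+
+   const variance = (xs: number[]) => {
+     const m = xs.reduce((a, b) => a + b, 0) / xs.length;
+     return xs.reduce((s, x) => s + (x - m) ** 2, 0) / (xs.length - 1);
+   };
+
+   // Variance of each item (column) across observations
+   const itemVarSum = Array.from({ length: k }, (_, j) =>
+     variance(rows.map((r) => r[j]))
+   ).reduce((a, b) => a + b, 0);
+
+   // Variance of the per-observation totals
+   const totalVar = variance(rows.map((r) => r.reduce((a, b) => a + b, 0)));
+
+   return (k / (k - 1)) * (1 - itemVarSum / totalVar);
+ }
+ ```
+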
+ ### Phase 2: Predictive Validity (Weeks 9-16)
+
+ **Hypothesis:** Vibe Score predicts future outcomes
+
+ ```
+ Model 1: BugDensity(t+4weeks) ~ VibeScore(t) + Controls
+ Model 2: LeadTime(t+4weeks) ~ VibeScore(t) + Controls
+ Model 3: DeveloperChurn ~ VibeScore(avg_12mo) + Controls
+
+ Target: R² ≥ 0.25 (explains 25%+ of variance)
+ ```
+
+ ### Phase 3: Intervention Study (Weeks 17-32)
+
+ **Design:** Randomized Controlled Trial
+
+ ```
+ Treatment Group (n=60):
+   - See Vibe Score dashboard daily
+   - Receive vibe-level recommendations
+   - Weekly coaching on maintaining flow
+
+ Control Group (n=60):
+   - Business as usual
+   - No vibe feedback
+
+ Duration: 8 weeks
+ Primary Outcome: DORA Lead Time improvement
+ Secondary: Survey frustration, Flow state
+ Effect Size Target: Cohen's d ≥ 0.50
+ ```
+
+ ### Sample Size Justification
+
+ ```
+ Correlation Study (Phase 1):
+   n = ((z_α/2 + z_β) / C)² + 3,   where C = ½·ln((1+ρ)/(1−ρ))   (Fisher z-transform)
+   For ρ = 0.40, α = 0.05, power = 0.80:
+   n ≈ 64 repositories (target 80-100 for safety)
+
+ Intervention Study (Phase 3):
+   n per group = 2 × (z_α/2 + z_β)² × σ² / δ²   (equivalently 2 × (z_α/2 + z_β)² / d²)
+   For d = δ/σ = 0.50, α = 0.05, power = 0.80:
+   n ≈ 64 per group (target 60-75 for attrition)
+ ```
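+
+ A quick check of the Phase 3 arithmetic (a throwaway sketch; z-values hardcoded for α = 0.05 two-sided and power = 0.80):
+
+ ```typescript
+ // Per-group n for a two-sample comparison at standardized effect size d
+ function perGroupN(d: number, zAlpha = 1.96, zBeta = 0.8416): number {
+   return Math.ceil((2 * (zAlpha + zBeta) ** 2) / d ** 2);
+ }
+
+ perGroupN(0.5); // ≈ 63, matching the ~64-per-group target above
+ ```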
+
+ ---
+
+ ## Implementation Roadmap
+
+ ### Tier 1: Core Algorithm (Week 1-2)
+
+ ```
+ [ ] Implement FileChurnScore
+     - Parse git log --name-only
+     - Track file touch timestamps
+     - Detect 3+ touches in 1 hour
+
+ [ ] Implement TimeSpiralScore
+     - Parse commit timestamps
+     - Detect <5min clusters
+     - Calculate spiral ratio
+
+ [ ] Implement VelocityAnomalyScore
+     - Calculate personal baseline (last 30 days)
+     - Z-score transform
+     - Sigmoid normalization
+
+ [ ] Composite VibeScore
+     - Weighted combination
+     - Normalize to 0-1
+     - Add to existing output
+ ```
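+
+ The `git log --name-only` parsing step might look like the sketch below (the tab separator via `%x09` and the `Commit` shape are our choices, not the package's):
+
+ ```typescript
+ import { execFileSync } from "node:child_process";
+
+ interface Commit { sha: string; date: Date; files: string[]; }
+
+ function readCommits(repoPath: string): Commit[] {
+   // One header line "sha<TAB>ISO-date" per commit, followed by its file list
+   const log = execFileSync(
+     "git",
+     ["log", "--name-only", "--pretty=format:%H%x09%aI"],
+     { cwd: repoPath, encoding: "utf8" }
+   );
+
+   const commits: Commit[] = [];
+   for (const line of log.split("\n")) {
+     if (line.includes("\t")) {
+       const [sha, iso] = line.split("\t");
+       commits.push({ sha, date: new Date(iso), files: [] });
+     } else if (line.trim() && commits.length > 0) {
+       commits[commits.length - 1].files.push(line.trim());
+     }
+   }
+   return commits;
+ }
+ ```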
+
+ ### Tier 2: Calibration Loop (Week 3-4)
+
+ ```
+ [ ] Add vibe-level declaration to CLI
+     --vibe-level 3  "I'm doing Level 3 work"
+
+ [ ] Store calibration history
+     .vibe-check/calibration.json
+
+ [ ] Implement ECE calculation
+     Compare observed vs expected by level
+
+ [ ] Weight adjustment algorithm
+     Update weights when ECE > 0.10
+ ```
+
+ ### Tier 3: Validation Infrastructure (Week 5-8)
+
+ ```
+ [ ] DORA metrics integration
+     - GitHub API for deployment frequency
+     - CI/CD status for change failure rate
+
+ [ ] Survey collection endpoint
+     - Simple web form
+     - Weekly reminder integration
+
+ [ ] Statistical analysis scripts
+     - Correlation analysis
+     - Reliability diagrams
+     - ECE tracking over time
+ ```
+
+ ---
+
+ ## Algorithm Selection Rationale
+
+ ### Why Not Machine Learning (Yet)?
+
+ | Approach | Pros | Cons | When to Use |
+ |----------|------|------|-------------|
+ | **Rule-based (chosen)** | Interpretable, no training data needed | May miss patterns | Phase 1: establish baseline |
+ | **Isolation Forest** | Good for anomalies | Requires training set | Phase 2: anomaly detection |
+ | **LSTM Autoencoder** | Captures temporal patterns | Black box, needs GPU | Phase 3: deep patterns |
+ | **Gradient Boosting** | High accuracy | Needs labeled data | Phase 4: after calibration |
+
+ **Our approach:** Start rule-based → collect labeled data via calibration loop → add ML in Phase 2+
+
+ ### Why These Four Metrics?
+
+ | Metric | SPACE Dimension | DORA Correlation | Signal Strength |
+ |--------|-----------------|------------------|-----------------|
+ | File Churn | Efficiency | Change Failure Rate | High |
+ | Time Spiral | Activity + Flow | Lead Time | High |
+ | Velocity Anomaly | Performance | Deployment Frequency | Medium |
+ | Code Stability | Efficiency | MTTR | Medium-High |
+
+ All four map to validated frameworks (SPACE, DORA), ensuring construct validity.
+
+ ---
+
+ ## Academic References
+
+ ### Core Frameworks
+
+ 1. **SPACE Framework** - Forsgren et al. (2021). "The SPACE of Developer Productivity." ACM Queue.
+    - Defines 5 dimensions of developer productivity
+    - Explicitly rejects single-metric evaluation
+    - [Microsoft Research](https://www.microsoft.com/en-us/research/publication/the-space-of-developer-productivity-theres-more-to-it-than-you-think/)
+
+ 2. **DORA Metrics** - Forsgren, Humble, Kim (2018). "Accelerate." IT Revolution Press.
+    - 10+ years of validated research
+    - 2,000+ organizations studied
+    - [DORA Guides](https://dora.dev/guides/dora-metrics-four-keys/)
+
+ ### Validation Methodology
+
+ 3. **Construct Validity in SE** - Ralph & Tempero (2018). EASE Conference.
+    - 7 guidelines for validating SE metrics
+    - [ACM Digital Library](https://dl.acm.org/doi/10.1145/3210459.3210461)
+
+ 4. **Metrics Validation** - Fenton & Pfleeger (1996). "Software Metrics: A Rigorous and Practical Approach."
+    - Gold standard for software measurement
+    - 6 validity criteria still used today
+
+ 5. **Calibration in ML** - Nixon et al. (2019). "Measuring Calibration in Deep Learning." CVPR Workshops.
+    - Expected Calibration Error (ECE) definition
+    - Temperature scaling method
+    - [OpenAccess](https://openaccess.thecvf.com/content_CVPRW_2019/papers/Uncertainty%20and%20Robustness%20in%20Deep%20Visual%20Learning/Nixon_Measuring_Calibration_in_Deep_Learning_CVPRW_2019_paper.pdf)
+
+ ### Time Series Analysis
+
+ 6. **Anomaly Detection Survey** - Schmidl et al. (2022). "Anomaly Detection in Time Series: A Comprehensive Evaluation." PVLDB.
+    - Compares 71 algorithms on 967 time series
+    - [TimeEval](https://timeeval.github.io/evaluation-paper/)
+
+ 7. **Isolation Forest** - Liu et al. (2008). "Isolation Forest." ICDM.
+    - Original algorithm paper
+    - O(n log n) complexity
+
+ ### Industry Validation
+
+ 8. **DORA State of DevOps** - Annual reports since 2013
+    - 39,000+ professionals surveyed
+    - Elite performers 2x more likely to exceed goals
+    - [Accelerate State of DevOps Report](https://dora.dev/research/)
+
+ ---
+
+ ## Open Questions
+
+ 1. **Cold start problem:** How many commits needed for reliable score?
+    - Hypothesis: 20+ commits minimum
+    - Need validation study
+
+ 2. **Team vs individual:** Should score be per-developer or per-repo?
+    - DORA warns against individual metrics
+    - Consider: aggregate for team, flag for individual awareness
+
+ 3. **Language/framework effects:** Do different tech stacks have different baselines?
+    - Hypothesis: Yes, need per-stack calibration
+    - Consider: normalize within tech stack
+
+ 4. **AI-assisted coding:** Does Copilot/Claude change patterns?
+    - Unknown territory
+    - This study could contribute novel findings
+
+ ---
+
+ ## Next Steps
+
+ 1. **User decision:** Proceed with Tier 1 implementation?
+ 2. **Scope decision:** Start with file churn only, or all 4 metrics?
+ 3. **Validation decision:** Collect calibration data from your own usage first?
+
+ ---
+
+ ## Token Stats
+
+ - Research tokens: ~45k
+ - Bundle tokens: ~8k
+ - Compression ratio: ~5.6:1
package/.vibe-check/calibration.json
@@ -0,0 +1,38 @@
+ {
+   "samples": [
+     {
+       "timestamp": "2025-11-28T22:18:40.597Z",
+       "vibeScore": 0.83,
+       "declaredLevel": 4,
+       "outcome": "correct",
+       "features": [
+         0.95,
+         0.78,
+         0.53,
+         0.99
+       ],
+       "modelVersion": "2.0.0"
+     }
+   ],
+   "weights": [
+     0.3,
+     -0.5,
+     -0.4,
+     -0.4,
+     0.3,
+     0.8,
+     0.6,
+     0.3,
+     0.5
+   ],
+   "thresholds": [
+     -2,
+     -0.8,
+     0.4,
+     1.6,
+     2.8
+   ],
+   "ece": 0,
+   "lastUpdated": "2025-11-28T22:18:40.597Z",
+   "version": "2.0.0"
+ }