pi-crew 0.8.1 → 0.8.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/CHANGELOG.md CHANGED
@@ -1,5 +1,61 @@
1
1
  # Changelog
2
2
 
3
+ ## [0.8.2] — Skill confidence dead-code fix (T7) (2026-06-16)
4
+
5
+ Fixes a **real correctness bug** surfaced by the pi-extensions deep-dive
6
+ (pi-continuous-learning's tiered confidence model): pi-crew's skill
7
+ confidence system was effectively **inert**.
8
+
9
+ ### Bug fixed
10
+
11
+ `registerSkillEffectivenessHooks` had two defects that left every skill's
12
+ confidence stuck at ~0.3 regardless of outcomes:
13
+
14
+ 1. **`adjustConfidence()` was dead code.** The `task_completed` handler
15
+ hardcoded `confidence: computeInitialConfidence(1)` (= 0.3) on every
16
+ activation write. The function was defined and unit-tested in isolation,
17
+ but **never called in the recording path** — so every stored activation
18
+ had confidence 0.3, and `computeSkillMetrics.currentConfidence` (derived
19
+ from the last stored value + decay) never moved.
20
+ 2. **`task_failed` was a no-op.** Its comment claimed failures were "handled
21
+ by computeSkillMetrics", but `computeSkillMetrics` derives `passRate`
22
+ from *recorded* activations — and failed tasks recorded **nothing**, so a
23
+ failure never fed back into the confidence/decay loop.
24
+
25
+ Net effect: the entire confidence-weighted skill system was decorative.
26
+ Pass-rate, trend, and promotion-gate decisions were computed from a flat
27
+ 0.3 baseline.
28
+
29
+ ### Fix
30
+
31
+ New `computeNextActivationConfidence(skillId, activations, passed)` helper
32
+ computes the **rolling** confidence: it seeds the first activation of a
33
+ skill at 0.3, then applies `adjustConfidence` (+0.05 success / -0.1
34
+ failure, clamped [0.1, 0.95]) on the skill's last recorded confidence.
35
+
36
+ Both hooks now record activations with the rolling confidence:
37
+ - `task_completed` → records `passed:true` activations at the rolled-forward
38
+ confidence.
39
+ - `task_failed` → now records `passed:false` activations (was a no-op),
40
+ which lowers passRate AND applies the -0.1 contradicting delta to the
41
+ confidence stored on that same activation (a skill's first activation is still seeded at 0.3).
42
+
43
+ This unblocks the confidence-weighted skill selection (`getWeightedSkillsForRole`)
44
+ and the promotion gate (`evaluatePromotionGate`) — they now reflect real
45
+ outcome history. Existing `adjustConfidence`/`computeInitialConfidence`/
46
+ `computeSkillMetrics` tests are preserved unchanged (they asserted on the
47
+ intended contract; the recording path now honors it).
48
+
49
+ ### Files
50
+ - `src/runtime/skill-effectiveness.ts` — `computeNextActivationConfidence`
51
+ helper; both hooks rewired to record rolling-confidence activations.
52
+ - NEW `test/unit/t7-confidence-deadcode-fix.test.ts` (7 tests): rolling
53
+ confidence evolves across activations; failures feed back; `adjustConfidence`
54
+ is no longer dead.
55
+
56
+ typecheck clean; skill-effectiveness suite 44/44 pass. (One unrelated
57
+ `event-log-async` flake under local load passes 3/3 in isolation — clean on CI.)
58
+
3
59
  ## [0.8.1] — Subagent cold-start race fix (module-scoped import latch) (2026-06-16)
4
60
 
5
61
  Fixes a flaky, load-dependent crash that surfaced when launching multiple
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "pi-crew",
3
- "version": "0.8.1",
3
+ "version": "0.8.2",
4
4
  "description": "Pi extension for coordinated AI teams, workflows, worktrees, and async task orchestration",
5
5
  "author": "baphuongna",
6
6
  "license": "MIT",
@@ -162,6 +162,40 @@ export function adjustConfidence(current: number, passed: boolean): number {
162
162
  return Math.max(0.1, Math.min(0.95, current + delta)); // Clamp to [0.1, 0.95]
163
163
  }
164
164
 
165
+ /**
166
+ * Compute the rolling confidence to RECORD for a new activation of `skillId`,
167
+ * given the run's existing activations (filtered to `skillId` internally).
168
+ *
169
+ * Bug fixed (T7, v0.8.2): the `task_completed`/`task_failed` hooks used to
170
+ * hardcode `confidence: computeInitialConfidence(1)` (= 0.3) on every write,
171
+ * which made `adjustConfidence` dead code — every skill stayed stuck at ~0.3
172
+ * regardless of how often it succeeded or failed. The whole confidence
173
+ * system was effectively inert.
174
+ *
175
+ * Now the recorded confidence is the PRIOR rolling confidence for that skill
176
+ * adjusted by this outcome (+0.05 success / -0.1 failure), so the stored
177
+ * value evolves over the run. First activation seeds from the observation
178
+ * count (which is 0 prior + this one = 1 observation -> initial 0.3). This
179
+ * preserves the existing `adjustConfidence` clamp range [0.1, 0.95] and the
180
+ * existing tests (which assert on the stored numeric values).
181
+ */
182
+ export function computeNextActivationConfidence(
183
+ skillId: string,
184
+ activations: SkillActivation[],
185
+ passed: boolean,
186
+ ): number {
187
+ const prior = activations.filter((a) => a.skillId === skillId);
188
+ if (prior.length === 0) {
189
+ // First activation of this skill in the run: seed by observation count.
190
+ // (computeInitialConfidence(1) = 0.3 — the tentative floor.)
191
+ return computeInitialConfidence(1);
192
+ }
193
+ // Rolling confidence = last recorded confidence for this skill, adjusted.
194
+ const lastConfidence = prior[prior.length - 1]?.confidence
195
+ ?? computeInitialConfidence(prior.length);
196
+ return adjustConfidence(lastConfidence, passed);
197
+ }
198
+
165
199
  /**
166
200
  * Apply decay to confidence for skills not observed recently.
167
201
  */
@@ -424,7 +458,13 @@ export function registerSkillEffectivenessHooks(): void {
424
458
  if (hooksRegistered) return;
425
459
  hooksRegistered = true;
426
460
 
427
- // Track task completion for skill effectiveness
461
+ // Track task completion for skill effectiveness.
462
+ // T7 (v0.8.2): record the ROLLING adjusted confidence, not a hardcoded
463
+ // 0.3. computeNextActivationConfidence seeds the first activation and
464
+ // applies adjustConfidence (+0.05 success / -0.1 failure) on subsequent
465
+ // ones, so the stored confidence evolves across the run. Before this fix
466
+ // every activation was written with confidence 0.3, which made
467
+ // adjustConfidence dead code and left every skill stuck at ~0.3.
428
468
  crewHooks.register("task_completed", (event) => {
429
469
  const { taskId, runId, data } = event;
430
470
  if (!taskId || !runId) return;
@@ -433,8 +473,19 @@ export function registerSkillEffectivenessHooks(): void {
433
473
  const skillNames = (data?.skills as string[]) ?? [];
434
474
  const success = (data?.status as string) === "completed";
435
475
 
436
- // Record each skill activation
476
+ // cwd comes from the event payload (set by callers) so that the
477
+ // activation lands in the correct .pi/teams/ or .crew/state/runs/
478
+ // (see issue #29).
479
+ const eventCwd = (data?.cwd as string) ?? process.cwd();
480
+ const existingActivations = getSkillActivations(eventCwd, runId);
481
+
482
+ // Record each skill activation with its rolling confidence
437
483
  for (const skillId of skillNames) {
484
+ const confidence = computeNextActivationConfidence(
485
+ skillId,
486
+ existingActivations,
487
+ success,
488
+ );
438
489
  const activation: SkillActivation = {
439
490
  id: `act-${Date.now()}-${Math.random().toString(36).slice(2)}`,
440
491
  skillId,
@@ -443,23 +494,46 @@ export function registerSkillEffectivenessHooks(): void {
443
494
  taskId,
444
495
  timestamp: new Date().toISOString(),
445
496
  passed: success,
446
- confidence: computeInitialConfidence(1),
497
+ confidence,
447
498
  };
448
- // cwd comes from the event payload (set by callers) so that the
449
- // activation lands in the correct .pi/teams/ or .crew/state/runs/
450
- // (see issue #29).
451
- const eventCwd = (data?.cwd as string) ?? process.cwd();
452
499
  recordSkillActivation(eventCwd, activation);
453
500
  }
454
501
  });
455
502
 
456
- // Track task failures
503
+ // Track task failures.
504
+ // T7 (v0.8.2): this used to be a no-op ("handled by computeSkillMetrics"),
505
+ // but computeSkillMetrics derives passRate from recorded activations —
506
+ // and since failed tasks recorded NOTHING, the failure never fed back
507
+ // into the confidence/decay loop. Now we record a `passed:false`
508
+ // activation for each skill tied to the failed task, which both lowers
509
+ // passRate AND applies the -0.1 contradicting delta (computed by
510
+ // computeNextActivationConfidence) to that activation's stored confidence; a skill's first activation is still seeded at 0.3.
457
511
  crewHooks.register("task_failed", (event) => {
458
512
  const { taskId, runId, data } = event;
459
513
  if (!taskId || !runId) return;
460
514
 
461
- // Downgrade confidence for skills associated with failed tasks
462
- // This is handled by computeSkillMetrics when processing activations
515
+ const skillNames = (data?.skills as string[]) ?? [];
516
+ const eventCwd = (data?.cwd as string) ?? process.cwd();
517
+ const existingActivations = getSkillActivations(eventCwd, runId);
518
+
519
+ for (const skillId of skillNames) {
520
+ const confidence = computeNextActivationConfidence(
521
+ skillId,
522
+ existingActivations,
523
+ false,
524
+ );
525
+ const activation: SkillActivation = {
526
+ id: `act-${Date.now()}-${Math.random().toString(36).slice(2)}`,
527
+ skillId,
528
+ role: (data?.role as string) ?? "unknown",
529
+ runId,
530
+ taskId,
531
+ timestamp: new Date().toISOString(),
532
+ passed: false,
533
+ confidence,
534
+ };
535
+ recordSkillActivation(eventCwd, activation);
536
+ }
463
537
  });
464
538
  }
465
539