pi-crew 0.8.1 → 0.8.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +56 -0
- package/package.json +1 -1
- package/src/runtime/skill-effectiveness.ts +84 -10
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,61 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## [0.8.2] — Skill confidence dead-code fix (T7) (2026-06-16)
|
|
4
|
+
|
|
5
|
+
Fixes a **real correctness bug** surfaced by the pi-extensions deep-dive
|
|
6
|
+
(pi-continuous-learning's tiered confidence model): pi-crew's skill
|
|
7
|
+
confidence system was effectively **inert**.
|
|
8
|
+
|
|
9
|
+
### Bug fixed
|
|
10
|
+
|
|
11
|
+
`registerSkillEffectivenessHooks` had two defects that left every skill's
|
|
12
|
+
confidence stuck at ~0.3 regardless of outcomes:
|
|
13
|
+
|
|
14
|
+
1. **`adjustConfidence()` was dead code.** The `task_completed` handler
|
|
15
|
+
hardcoded `confidence: computeInitialConfidence(1)` (= 0.3) on every
|
|
16
|
+
activation write. The function was defined and unit-tested in isolation,
|
|
17
|
+
but **never called in the recording path** — so every stored activation
|
|
18
|
+
had confidence 0.3, and `computeSkillMetrics.currentConfidence` (derived
|
|
19
|
+
from the last stored value + decay) never moved.
|
|
20
|
+
2. **`task_failed` was a no-op.** Its comment claimed failures were "handled
|
|
21
|
+
by computeSkillMetrics", but `computeSkillMetrics` derives `passRate`
|
|
22
|
+
from *recorded* activations — and failed tasks recorded **nothing**, so a
|
|
23
|
+
failure never fed back into the confidence/decay loop.
|
|
24
|
+
|
|
25
|
+
Net effect: the entire confidence-weighted skill system was decorative.
|
|
26
|
+
Pass-rate, trend, and promotion-gate decisions were computed from a flat
|
|
27
|
+
0.3 baseline.
|
|
28
|
+
|
|
29
|
+
### Fix
|
|
30
|
+
|
|
31
|
+
New `computeNextActivationConfidence(skillId, activations, passed)` helper
|
|
32
|
+
computes the **rolling** confidence: it seeds the first activation of a
|
|
33
|
+
skill at 0.3, then applies `adjustConfidence` (+0.05 success / -0.1
|
|
34
|
+
failure, clamped [0.1, 0.95]) on the skill's last recorded confidence.
|
|
35
|
+
|
|
36
|
+
Both hooks now record activations with the rolling confidence:
|
|
37
|
+
- `task_completed` → records `passed:true` activations at the rolled-forward
|
|
38
|
+
confidence.
|
|
39
|
+
- `task_failed` → now records `passed:false` activations (was a no-op),
|
|
40
|
+
which lowers passRate AND applies the -0.1 contradicting delta to the
|
|
41
|
+
skill's last recorded confidence (a first activation is still seeded at 0.3).
|
|
42
|
+
|
|
43
|
+
This unblocks the confidence-weighted skill selection (`getWeightedSkillsForRole`)
|
|
44
|
+
and the promotion gate (`evaluatePromotionGate`) — they now reflect real
|
|
45
|
+
outcome history. Existing `adjustConfidence`/`computeInitialConfidence`/
|
|
46
|
+
`computeSkillMetrics` tests are preserved unchanged (they asserted on the
|
|
47
|
+
intended contract; the recording path now honors it).
|
|
48
|
+
|
|
49
|
+
### Files
|
|
50
|
+
- `src/runtime/skill-effectiveness.ts` — `computeNextActivationConfidence`
|
|
51
|
+
helper; both hooks rewired to record rolling-confidence activations.
|
|
52
|
+
- NEW `test/unit/t7-confidence-deadcode-fix.test.ts` (7 tests): rolling
|
|
53
|
+
confidence evolves across activations; failures feed back; `adjustConfidence`
|
|
54
|
+
is no longer dead.
|
|
55
|
+
|
|
56
|
+
typecheck clean; skill-effectiveness suite 44/44 pass. (One unrelated
|
|
57
|
+
`event-log-async` flake under local load passes 3/3 in isolation — clean on CI.)
|
|
58
|
+
|
|
3
59
|
## [0.8.1] — Subagent cold-start race fix (module-scoped import latch) (2026-06-16)
|
|
4
60
|
|
|
5
61
|
Fixes a flaky, load-dependent crash that surfaced when launching multiple
|
package/package.json
CHANGED
package/src/runtime/skill-effectiveness.ts
CHANGED
|
@@ -162,6 +162,40 @@ export function adjustConfidence(current: number, passed: boolean): number {
|
|
|
162
162
|
return Math.max(0.1, Math.min(0.95, current + delta)); // Clamp to [0.1, 0.95]
|
|
163
163
|
}
|
|
164
164
|
|
|
165
|
+
/**
|
|
166
|
+
* Compute the rolling confidence to RECORD for a new activation of `skillId`,
|
|
167
|
+
* given the existing activations for that skill in the run.
|
|
168
|
+
*
|
|
169
|
+
* Bug fixed (T7, v0.8.2): the `task_completed` hook (`task_failed` was a no-op) used to
|
|
170
|
+
* hardcode `confidence: computeInitialConfidence(1)` (= 0.3) on every write,
|
|
171
|
+
* which made `adjustConfidence` dead code — every skill stayed stuck at ~0.3
|
|
172
|
+
* regardless of how often it succeeded or failed. The whole confidence
|
|
173
|
+
* system was effectively inert.
|
|
174
|
+
*
|
|
175
|
+
* Now the recorded confidence is the PRIOR rolling confidence for that skill
|
|
176
|
+
* adjusted by this outcome (+0.05 success / -0.1 failure), so the stored
|
|
177
|
+
* value evolves over the run. First activation seeds from the observation
|
|
178
|
+
* count (which is 0 prior + this one = 1 observation -> initial 0.3). This
|
|
179
|
+
* preserves the existing `adjustConfidence` clamp range [0.1, 0.95] and the
|
|
180
|
+
* existing tests (which assert on the stored numeric values).
|
|
181
|
+
*/
|
|
182
|
+
export function computeNextActivationConfidence(
|
|
183
|
+
skillId: string,
|
|
184
|
+
activations: SkillActivation[],
|
|
185
|
+
passed: boolean,
|
|
186
|
+
): number {
|
|
187
|
+
const prior = activations.filter((a) => a.skillId === skillId);
|
|
188
|
+
if (prior.length === 0) {
|
|
189
|
+
// First activation of this skill in the run: seed by observation count.
|
|
190
|
+
// (computeInitialConfidence(1) = 0.3 — the tentative floor.)
|
|
191
|
+
return computeInitialConfidence(1);
|
|
192
|
+
}
|
|
193
|
+
// Rolling confidence = last recorded confidence for this skill, adjusted.
|
|
194
|
+
const lastConfidence = prior[prior.length - 1]?.confidence
|
|
195
|
+
?? computeInitialConfidence(prior.length);
|
|
196
|
+
return adjustConfidence(lastConfidence, passed);
|
|
197
|
+
}
|
|
198
|
+
|
|
165
199
|
/**
|
|
166
200
|
* Apply decay to confidence for skills not observed recently.
|
|
167
201
|
*/
|
|
@@ -424,7 +458,13 @@ export function registerSkillEffectivenessHooks(): void {
|
|
|
424
458
|
if (hooksRegistered) return;
|
|
425
459
|
hooksRegistered = true;
|
|
426
460
|
|
|
427
|
-
// Track task completion for skill effectiveness
|
|
461
|
+
// Track task completion for skill effectiveness.
|
|
462
|
+
// T7 (v0.8.2): record the ROLLING adjusted confidence, not a hardcoded
|
|
463
|
+
// 0.3. computeNextActivationConfidence seeds the first activation and
|
|
464
|
+
// applies adjustConfidence (+0.05 success / -0.1 failure) on subsequent
|
|
465
|
+
// ones, so the stored confidence evolves across the run. Before this fix
|
|
466
|
+
// every activation was written with confidence 0.3, which made
|
|
467
|
+
// adjustConfidence dead code and left every skill stuck at ~0.3.
|
|
428
468
|
crewHooks.register("task_completed", (event) => {
|
|
429
469
|
const { taskId, runId, data } = event;
|
|
430
470
|
if (!taskId || !runId) return;
|
|
@@ -433,8 +473,19 @@ export function registerSkillEffectivenessHooks(): void {
|
|
|
433
473
|
const skillNames = (data?.skills as string[]) ?? [];
|
|
434
474
|
const success = (data?.status as string) === "completed";
|
|
435
475
|
|
|
436
|
-
//
|
|
476
|
+
// cwd comes from the event payload (set by callers) so that the
|
|
477
|
+
// activation lands in the correct .pi/teams/ or .crew/state/runs/
|
|
478
|
+
// (see issue #29).
|
|
479
|
+
const eventCwd = (data?.cwd as string) ?? process.cwd();
|
|
480
|
+
const existingActivations = getSkillActivations(eventCwd, runId);
|
|
481
|
+
|
|
482
|
+
// Record each skill activation with its rolling confidence
|
|
437
483
|
for (const skillId of skillNames) {
|
|
484
|
+
const confidence = computeNextActivationConfidence(
|
|
485
|
+
skillId,
|
|
486
|
+
existingActivations,
|
|
487
|
+
success,
|
|
488
|
+
);
|
|
438
489
|
const activation: SkillActivation = {
|
|
439
490
|
id: `act-${Date.now()}-${Math.random().toString(36).slice(2)}`,
|
|
440
491
|
skillId,
|
|
@@ -443,23 +494,46 @@ export function registerSkillEffectivenessHooks(): void {
|
|
|
443
494
|
taskId,
|
|
444
495
|
timestamp: new Date().toISOString(),
|
|
445
496
|
passed: success,
|
|
446
|
-
confidence
|
|
497
|
+
confidence,
|
|
447
498
|
};
|
|
448
|
-
// cwd comes from the event payload (set by callers) so that the
|
|
449
|
-
// activation lands in the correct .pi/teams/ or .crew/state/runs/
|
|
450
|
-
// (see issue #29).
|
|
451
|
-
const eventCwd = (data?.cwd as string) ?? process.cwd();
|
|
452
499
|
recordSkillActivation(eventCwd, activation);
|
|
453
500
|
}
|
|
454
501
|
});
|
|
455
502
|
|
|
456
|
-
// Track task failures
|
|
503
|
+
// Track task failures.
|
|
504
|
+
// T7 (v0.8.2): this used to be a no-op ("handled by computeSkillMetrics"),
|
|
505
|
+
// but computeSkillMetrics derives passRate from recorded activations —
|
|
506
|
+
// and since failed tasks recorded NOTHING, the failure never fed back
|
|
507
|
+
// into the confidence/decay loop. Now we record a `passed:false`
|
|
508
|
+
// activation for each skill tied to the failed task, which both lowers
|
|
509
|
+
// passRate AND applies the -0.1 contradicting delta (via
|
|
510
|
+
// computeNextActivationConfidence) to the skill's prior confidence, if any.
|
|
457
511
|
crewHooks.register("task_failed", (event) => {
|
|
458
512
|
const { taskId, runId, data } = event;
|
|
459
513
|
if (!taskId || !runId) return;
|
|
460
514
|
|
|
461
|
-
|
|
462
|
-
|
|
515
|
+
const skillNames = (data?.skills as string[]) ?? [];
|
|
516
|
+
const eventCwd = (data?.cwd as string) ?? process.cwd();
|
|
517
|
+
const existingActivations = getSkillActivations(eventCwd, runId);
|
|
518
|
+
|
|
519
|
+
for (const skillId of skillNames) {
|
|
520
|
+
const confidence = computeNextActivationConfidence(
|
|
521
|
+
skillId,
|
|
522
|
+
existingActivations,
|
|
523
|
+
false,
|
|
524
|
+
);
|
|
525
|
+
const activation: SkillActivation = {
|
|
526
|
+
id: `act-${Date.now()}-${Math.random().toString(36).slice(2)}`,
|
|
527
|
+
skillId,
|
|
528
|
+
role: (data?.role as string) ?? "unknown",
|
|
529
|
+
runId,
|
|
530
|
+
taskId,
|
|
531
|
+
timestamp: new Date().toISOString(),
|
|
532
|
+
passed: false,
|
|
533
|
+
confidence,
|
|
534
|
+
};
|
|
535
|
+
recordSkillActivation(eventCwd, activation);
|
|
536
|
+
}
|
|
463
537
|
});
|
|
464
538
|
}
|
|
465
539
|
|