akm-cli 0.9.0-beta.56 → 0.9.0-beta.58

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. package/dist/assets/prompts/extract-session.md +5 -1
  2. package/dist/cli/config-migrate.js +7 -1
  3. package/dist/commands/config-cli.js +8 -11
  4. package/dist/commands/health/stash-exposure.js +46 -0
  5. package/dist/commands/health/windows.js +6 -7
  6. package/dist/commands/health.js +31 -10
  7. package/dist/commands/improve/collapse-detector.js +2 -1
  8. package/dist/commands/improve/consolidate.js +207 -159
  9. package/dist/commands/improve/distill/promote-memory.js +4 -3
  10. package/dist/commands/improve/distill/quality-gate.js +7 -4
  11. package/dist/commands/improve/distill-promotion-policy.js +826 -167
  12. package/dist/commands/improve/distill.js +26 -12
  13. package/dist/commands/improve/extract-prompt.js +16 -2
  14. package/dist/commands/improve/extract.js +16 -8
  15. package/dist/commands/improve/improve-auto-accept.js +22 -1
  16. package/dist/commands/improve/loop-stages.js +7 -2
  17. package/dist/commands/improve/memory/memory-belief.js +14 -15
  18. package/dist/commands/improve/memory/memory-contradiction-detect.js +60 -32
  19. package/dist/commands/improve/memory/memory-improve.js +27 -27
  20. package/dist/commands/improve/preparation.js +4 -0
  21. package/dist/commands/improve/procedural.js +1 -0
  22. package/dist/commands/improve/recombine.js +1 -0
  23. package/dist/commands/improve/reflect-noise.js +1 -1
  24. package/dist/commands/improve/reflect.js +4 -3
  25. package/dist/commands/improve/shared.js +9 -6
  26. package/dist/commands/proposal/drain-policies.js +4 -2
  27. package/dist/commands/read/remember-cli.js +1 -1
  28. package/dist/commands/read/show.js +15 -0
  29. package/dist/commands/remember.js +11 -12
  30. package/dist/commands/sources/init.js +5 -1
  31. package/dist/commands/sources/stash-skeleton.js +34 -0
  32. package/dist/core/asset/frontmatter.js +22 -0
  33. package/dist/core/common.js +1 -15
  34. package/dist/core/config/config-io.js +10 -1
  35. package/dist/core/config/config-migration.js +2 -15
  36. package/dist/core/config/config-schema.js +15 -3
  37. package/dist/core/config/config.js +22 -14
  38. package/dist/core/paths.js +4 -4
  39. package/dist/core/time.js +53 -0
  40. package/dist/indexer/db/db.js +51 -46
  41. package/dist/indexer/indexer.js +77 -65
  42. package/dist/indexer/search/db-search.js +41 -6
  43. package/dist/indexer/search/ranking-contributors.js +14 -8
  44. package/dist/indexer/search/search-source.js +15 -3
  45. package/dist/integrations/agent/profiles.js +7 -1
  46. package/dist/llm/feature-gate.js +4 -8
  47. package/dist/output/renderers.js +4 -0
  48. package/dist/scripts/migrate-storage.js +84 -60
  49. package/dist/scripts/migrations/import-fs-improve-runs-to-db.js +6 -0
  50. package/dist/storage/repositories/registry-cache.js +2 -1
  51. package/dist/storage/repositories/registry-index-cache-repository.js +46 -0
  52. package/dist/workflows/runtime/runs.js +6 -1
  53. package/package.json +1 -1
@@ -42,9 +42,6 @@ function deriveDescription(body, description) {
42
42
  ?.replace(/^#+\s*/, "")
43
43
  ?.slice(0, 160);
44
44
  }
45
- function memoryContent(frontmatter, body) {
46
- return ["---", ...frontmatter, "---", "", body, ""].join("\n");
47
- }
48
45
  export function deriveKnowledgeRef(inputRef) {
49
46
  const parsed = parseAssetRef(inputRef);
50
47
  return `knowledge:${parsed.name}`;
@@ -419,7 +416,7 @@ const METADATA_BASELINE = {
419
416
  };
420
417
  },
421
418
  };
422
- function selectPromotionPolicy(corpus) {
419
+ export function selectPromotionPolicy(corpus) {
423
420
  const trainingCases = corpus.filter((testCase) => (testCase.split ?? "train") === "train");
424
421
  const heldOutCases = corpus.filter((testCase) => (testCase.split ?? "train") === "heldout");
425
422
  let bestPolicy;
@@ -481,171 +478,833 @@ function selectPromotionPolicy(corpus) {
481
478
  strictlyBeatsBaselines,
482
479
  };
483
480
  }
484
- function benchmarkCase(name, expectPromote, split, frontmatter, body, feedbackSignals, outcome) {
485
- return {
486
- name,
487
- expectPromote,
488
- split,
489
- input: {
490
- inputRef: `memory:${name}`,
491
- assetContent: memoryContent(frontmatter, body),
492
- feedbackEvents: feedbackSignals.map((signal) => ({ metadata: { signal } })),
493
- },
494
- ...outcome,
495
- };
496
- }
497
- export const DEFAULT_PROMOTION_POLICY_CORPUS = [
498
- benchmarkCase("deploy-vpn-required", true, "train", [
499
- "description: VPN required before deploy",
500
- "source: skill:deploy",
501
- "observed_at: 2026-04-20",
502
- "confidence: 0.95",
503
- "tags: [deploy, ops]",
504
- ], "Always connect the VPN before starting production deploys.", ["positive", "positive"]),
505
- benchmarkCase("release-smoke-test", true, "train", [
506
- "description: Smoke test gates release",
507
- "quality: curated",
508
- "source: skill:release",
509
- "observed_at: 2026-04-18",
510
- "confidence: 0.85",
511
- ], "Run the smoke test before tagging a release candidate.", ["positive", "positive", "positive"]),
512
- benchmarkCase("kubernetes-rollout-check", true, "train", [
513
- "description: Verify rollout status after apply",
514
- "source: skill:k8s",
515
- "observed_at: 2026-04-15",
516
- "confidence: 0.95",
517
- "tags: [k8s]",
518
- ], "Check rollout status after kubectl apply before declaring success.", ["positive", "positive"]),
519
- benchmarkCase("incident-channel-rule", true, "train", [
520
- "description: Incident bridge stays single-threaded",
521
- "quality: curated",
522
- "source: skill:incident",
523
- "observed_at: 2026-04-12",
524
- "confidence: 0.95",
525
- ], "Keep one operator narrating decisions in the incident bridge to avoid conflicting instructions.", ["positive", "positive", "positive"]),
526
- benchmarkCase("weak-single-signal", false, "train", [
527
- "description: VPN required before deploy",
528
- "source: skill:deploy",
529
- "observed_at: 2026-04-20",
530
- "confidence: 0.95",
531
- "tags: [deploy]",
532
- ], "Always connect the VPN before starting production deploys.", ["positive"]),
533
- benchmarkCase("contested-fact", false, "train", [
534
- "description: VPN required before deploy",
535
- "quality: curated",
536
- "source: skill:deploy",
537
- "observed_at: 2026-04-20",
538
- "confidence: 0.95",
539
- ], "Always connect the VPN before starting production deploys.", ["positive", "negative", "positive"], { falsePromoteCost: 5 }),
540
- benchmarkCase("tentative-fact", false, "train", ["description: Deploy may require VPN", "source: skill:deploy", "observed_at: 2026-04-20", "confidence: 0.95"], "Maybe connect the VPN before starting production deploys.", ["positive", "positive"]),
541
- benchmarkCase("subjective-preference", false, "train", [
542
- "description: VPN required before deploy",
543
- "subjective: true",
544
- "source: skill:deploy",
545
- "observed_at: 2026-04-20",
546
- "confidence: 0.95",
547
- ], "I prefer connecting the VPN before starting production deploys.", ["positive", "positive"]),
548
- benchmarkCase("feedback-conflict", false, "train", ["description: VPN required before deploy", "source: skill:deploy", "observed_at: 2026-04-20", "confidence: 0.95"], "Always connect the VPN before starting production deploys.", ["positive", "positive"]),
549
- {
550
- name: "feedback-conflict",
551
- expectPromote: false,
552
- split: "train",
553
- input: {
554
- inputRef: "memory:feedback-conflict",
555
- assetContent: memoryContent([
556
- "description: VPN required before deploy",
557
- "source: skill:deploy",
558
- "observed_at: 2026-04-20",
559
- "confidence: 0.95",
560
- ], "Always connect the VPN before starting production deploys."),
561
- feedbackEvents: [{ metadata: { signal: "positive" } }, { metadata: { signal: "positive", conflict: true } }],
562
- },
481
+ // Frozen module-load selection. The promotion policy's selected model was
482
+ // historically computed by running `selectPromotionPolicy` over a large
483
+ // hardcoded benchmark corpus at import time (a full grid search on every module
484
+ // load). That corpus and the grid search now live in the bench test
485
+ // (tests/commands/distill/distill-promotion-policy.bench.test.ts), which
486
+ // re-runs `selectPromotionPolicy(DEFAULT_PROMOTION_POLICY_CORPUS)` and asserts
487
+ // deep equality with this constant so the freeze stays honest.
488
+ export const DEFAULT_PROMOTION_POLICY_SELECTION = {
489
+ corpusSize: 21,
490
+ trainingSize: 14,
491
+ heldOutSize: 7,
492
+ selectedModel: {
493
+ name: "balanced-evidence",
494
+ threshold: 3.8,
563
495
  },
564
- benchmarkCase("derived-memory", false, "train", ["description: VPN required before deploy", "source: skill:deploy", "confidence: 0.95"], "Always connect the VPN before starting production deploys.", ["positive", "positive"]),
565
- {
566
- name: "derived-memory",
567
- expectPromote: false,
568
- split: "train",
569
- input: {
570
- inputRef: "memory:derived-memory.derived",
571
- assetContent: memoryContent(["description: VPN required before deploy", "source: skill:deploy", "confidence: 0.95"], "Always connect the VPN before starting production deploys."),
572
- feedbackEvents: [{ metadata: { signal: "positive" } }, { metadata: { signal: "positive" } }],
573
- },
496
+ training: {
497
+ total: 14,
498
+ correct: 13,
499
+ falsePositives: 1,
500
+ falseNegatives: 0,
501
+ accuracy: 0.9285714285714286,
502
+ precision: 0.8333333333333334,
503
+ recall: 1,
504
+ f1: 0.9090909090909091,
505
+ truePositives: 5,
506
+ trueNegatives: 8,
507
+ netOutcomeScore: 11,
508
+ capturedPromoteValue: 15,
509
+ preventedFalsePromotionCost: 33,
510
+ results: [
511
+ {
512
+ name: "deploy-vpn-required",
513
+ expectPromote: true,
514
+ assessment: {
515
+ applicable: true,
516
+ promote: true,
517
+ score: 4.250000000000001,
518
+ threshold: 3.8,
519
+ knowledgeRef: "knowledge:deploy-vpn-required",
520
+ content: "---\ndescription: VPN required before deploy\ntags:\n - deploy\n - ops\nobserved_at: 2026-04-20\nsources:\n - memory:deploy-vpn-required\n - skill:deploy\n---\n\nAlways connect the VPN before starting production deploys.\n",
521
+ blockedBy: [],
522
+ positiveSignals: [
523
+ "2 positive feedback events",
524
+ "repeated reinforcement",
525
+ "strong confidence",
526
+ "linked source",
527
+ "observed_at present",
528
+ "description present",
529
+ "tagged memory",
530
+ "substantive body",
531
+ ],
532
+ negativeSignals: [],
533
+ modelName: "balanced-evidence",
534
+ },
535
+ passed: true,
536
+ },
537
+ {
538
+ name: "release-smoke-test",
539
+ expectPromote: true,
540
+ assessment: {
541
+ applicable: true,
542
+ promote: true,
543
+ score: 5.240000000000002,
544
+ threshold: 3.8,
545
+ knowledgeRef: "knowledge:release-smoke-test",
546
+ content: "---\ndescription: Smoke test gates release\nobserved_at: 2026-04-18\nsources:\n - memory:release-smoke-test\n - skill:release\n---\n\nRun the smoke test before tagging a release candidate.\n",
547
+ blockedBy: [],
548
+ positiveSignals: [
549
+ "3 positive feedback events",
550
+ "repeated reinforcement",
551
+ "curated quality",
552
+ "strong confidence",
553
+ "linked source",
554
+ "observed_at present",
555
+ "description present",
556
+ "substantive body",
557
+ ],
558
+ negativeSignals: [],
559
+ modelName: "balanced-evidence",
560
+ },
561
+ passed: true,
562
+ },
563
+ {
564
+ name: "kubernetes-rollout-check",
565
+ expectPromote: true,
566
+ assessment: {
567
+ applicable: true,
568
+ promote: true,
569
+ score: 4.250000000000001,
570
+ threshold: 3.8,
571
+ knowledgeRef: "knowledge:kubernetes-rollout-check",
572
+ content: "---\ndescription: Verify rollout status after apply\ntags:\n - k8s\nobserved_at: 2026-04-15\nsources:\n - memory:kubernetes-rollout-check\n - skill:k8s\n---\n\nCheck rollout status after kubectl apply before declaring success.\n",
573
+ blockedBy: [],
574
+ positiveSignals: [
575
+ "2 positive feedback events",
576
+ "repeated reinforcement",
577
+ "strong confidence",
578
+ "linked source",
579
+ "observed_at present",
580
+ "description present",
581
+ "tagged memory",
582
+ "substantive body",
583
+ ],
584
+ negativeSignals: [],
585
+ modelName: "balanced-evidence",
586
+ },
587
+ passed: true,
588
+ },
589
+ {
590
+ name: "incident-channel-rule",
591
+ expectPromote: true,
592
+ assessment: {
593
+ applicable: true,
594
+ promote: true,
595
+ score: 5.450000000000002,
596
+ threshold: 3.8,
597
+ knowledgeRef: "knowledge:incident-channel-rule",
598
+ content: "---\ndescription: Incident bridge stays single-threaded\nobserved_at: 2026-04-12\nsources:\n - memory:incident-channel-rule\n - skill:incident\n---\n\nKeep one operator narrating decisions in the incident bridge to avoid conflicting instructions.\n",
599
+ blockedBy: [],
600
+ positiveSignals: [
601
+ "3 positive feedback events",
602
+ "repeated reinforcement",
603
+ "curated quality",
604
+ "strong confidence",
605
+ "linked source",
606
+ "observed_at present",
607
+ "description present",
608
+ "substantive body",
609
+ ],
610
+ negativeSignals: [],
611
+ modelName: "balanced-evidence",
612
+ },
613
+ passed: true,
614
+ },
615
+ {
616
+ name: "weak-single-signal",
617
+ expectPromote: false,
618
+ assessment: {
619
+ applicable: true,
620
+ promote: false,
621
+ score: 2.1,
622
+ threshold: 3.8,
623
+ knowledgeRef: "knowledge:weak-single-signal",
624
+ blockedBy: [],
625
+ positiveSignals: [
626
+ "1 positive feedback event",
627
+ "strong confidence",
628
+ "linked source",
629
+ "observed_at present",
630
+ "description present",
631
+ "tagged memory",
632
+ "substantive body",
633
+ ],
634
+ negativeSignals: ["only one reinforcing feedback event"],
635
+ modelName: "balanced-evidence",
636
+ },
637
+ passed: true,
638
+ },
639
+ {
640
+ name: "contested-fact",
641
+ expectPromote: false,
642
+ assessment: {
643
+ applicable: true,
644
+ promote: false,
645
+ score: 2.65,
646
+ threshold: 3.8,
647
+ knowledgeRef: "knowledge:contested-fact",
648
+ blockedBy: [],
649
+ positiveSignals: [
650
+ "2 positive feedback events",
651
+ "repeated reinforcement",
652
+ "curated quality",
653
+ "strong confidence",
654
+ "linked source",
655
+ "observed_at present",
656
+ "description present",
657
+ "substantive body",
658
+ ],
659
+ negativeSignals: ["1 negative feedback event"],
660
+ modelName: "balanced-evidence",
661
+ },
662
+ passed: true,
663
+ },
664
+ {
665
+ name: "tentative-fact",
666
+ expectPromote: false,
667
+ assessment: {
668
+ applicable: true,
669
+ promote: false,
670
+ score: 3.0000000000000004,
671
+ threshold: 3.8,
672
+ knowledgeRef: "knowledge:tentative-fact",
673
+ blockedBy: [],
674
+ positiveSignals: [
675
+ "2 positive feedback events",
676
+ "repeated reinforcement",
677
+ "strong confidence",
678
+ "linked source",
679
+ "observed_at present",
680
+ "description present",
681
+ "substantive body",
682
+ ],
683
+ negativeSignals: ["tentative language"],
684
+ modelName: "balanced-evidence",
685
+ },
686
+ passed: true,
687
+ },
688
+ {
689
+ name: "subjective-preference",
690
+ expectPromote: false,
691
+ assessment: {
692
+ applicable: true,
693
+ promote: false,
694
+ score: 0,
695
+ threshold: 3.8,
696
+ knowledgeRef: "knowledge:subjective-preference",
697
+ blockedBy: ["subjective-memory"],
698
+ positiveSignals: [],
699
+ negativeSignals: [],
700
+ modelName: "balanced-evidence",
701
+ },
702
+ passed: true,
703
+ },
704
+ {
705
+ name: "feedback-conflict",
706
+ expectPromote: false,
707
+ assessment: {
708
+ applicable: true,
709
+ promote: true,
710
+ score: 4.1000000000000005,
711
+ threshold: 3.8,
712
+ knowledgeRef: "knowledge:feedback-conflict",
713
+ content: "---\ndescription: VPN required before deploy\nobserved_at: 2026-04-20\nsources:\n - memory:feedback-conflict\n - skill:deploy\n---\n\nAlways connect the VPN before starting production deploys.\n",
714
+ blockedBy: [],
715
+ positiveSignals: [
716
+ "2 positive feedback events",
717
+ "repeated reinforcement",
718
+ "strong confidence",
719
+ "linked source",
720
+ "observed_at present",
721
+ "description present",
722
+ "substantive body",
723
+ ],
724
+ negativeSignals: [],
725
+ modelName: "balanced-evidence",
726
+ },
727
+ passed: false,
728
+ },
729
+ {
730
+ name: "derived-memory",
731
+ expectPromote: false,
732
+ assessment: {
733
+ applicable: true,
734
+ promote: false,
735
+ score: 3.7,
736
+ threshold: 3.8,
737
+ knowledgeRef: "knowledge:derived-memory",
738
+ blockedBy: [],
739
+ positiveSignals: [
740
+ "2 positive feedback events",
741
+ "repeated reinforcement",
742
+ "strong confidence",
743
+ "linked source",
744
+ "description present",
745
+ "substantive body",
746
+ ],
747
+ negativeSignals: [],
748
+ modelName: "balanced-evidence",
749
+ },
750
+ passed: true,
751
+ },
752
+ {
753
+ name: "staging-cutover-order",
754
+ expectPromote: true,
755
+ assessment: {
756
+ applicable: true,
757
+ promote: true,
758
+ score: 4.840000000000001,
759
+ threshold: 3.8,
760
+ knowledgeRef: "knowledge:staging-cutover-order",
761
+ content: "---\ndescription: Cut over staging after migrations\ntags:\n - db\n - deploy\nobserved_at: 2026-04-10\nsources:\n - memory:staging-cutover-order\n - skill:database\n---\n\nRun database migrations before shifting staging traffic onto the new release.\n",
762
+ blockedBy: [],
763
+ positiveSignals: [
764
+ "3 positive feedback events",
765
+ "repeated reinforcement",
766
+ "strong confidence",
767
+ "linked source",
768
+ "observed_at present",
769
+ "description present",
770
+ "tagged memory",
771
+ "substantive body",
772
+ ],
773
+ negativeSignals: [],
774
+ modelName: "balanced-evidence",
775
+ },
776
+ passed: true,
777
+ },
778
+ {
779
+ name: "temporary-token-workaround",
780
+ expectPromote: false,
781
+ assessment: {
782
+ applicable: true,
783
+ promote: false,
784
+ score: 0,
785
+ threshold: 3.8,
786
+ knowledgeRef: "knowledge:temporary-token-workaround",
787
+ blockedBy: ["expiring-memory"],
788
+ positiveSignals: [],
789
+ negativeSignals: [],
790
+ modelName: "balanced-evidence",
791
+ },
792
+ passed: true,
793
+ },
794
+ {
795
+ name: "thin-metadata-memory",
796
+ expectPromote: false,
797
+ assessment: {
798
+ applicable: true,
799
+ promote: false,
800
+ score: 3,
801
+ threshold: 3.8,
802
+ knowledgeRef: "knowledge:thin-metadata-memory",
803
+ blockedBy: [],
804
+ positiveSignals: [
805
+ "2 positive feedback events",
806
+ "repeated reinforcement",
807
+ "linked source",
808
+ "description present",
809
+ "substantive body",
810
+ ],
811
+ negativeSignals: [],
812
+ modelName: "balanced-evidence",
813
+ },
814
+ passed: true,
815
+ },
816
+ {
817
+ name: "promoted-quality-memory",
818
+ expectPromote: false,
819
+ assessment: {
820
+ applicable: true,
821
+ promote: false,
822
+ score: 0,
823
+ threshold: 3.8,
824
+ knowledgeRef: "knowledge:promoted-quality-memory",
825
+ blockedBy: ["proposed-quality"],
826
+ positiveSignals: [],
827
+ negativeSignals: [],
828
+ modelName: "balanced-evidence",
829
+ },
830
+ passed: true,
831
+ },
832
+ ],
574
833
  },
575
- benchmarkCase("staging-cutover-order", true, "train", [
576
- "description: Cut over staging after migrations",
577
- "source: skill:database",
578
- "observed_at: 2026-04-10",
579
- "confidence: 0.85",
580
- "tags: [db, deploy]",
581
- ], "Run database migrations before shifting staging traffic onto the new release.", ["positive", "positive", "positive"]),
582
- benchmarkCase("temporary-token-workaround", false, "train", [
583
- "description: Temporary deploy token workaround",
584
- "source: skill:deploy",
585
- "observed_at: 2026-04-20",
586
- "confidence: 0.95",
587
- "expires: 2026-06-01",
588
- ], "Use the temporary deploy token workaround until the incident is closed.", ["positive", "positive"]),
589
- benchmarkCase("thin-metadata-memory", false, "train", ["description: VPN required before deploy", "source: skill:deploy"], "Always connect the VPN before starting production deploys.", ["positive", "positive"]),
590
- benchmarkCase("promoted-quality-memory", false, "train", [
591
- "description: VPN required before deploy",
592
- "quality: proposed",
593
- "source: skill:deploy",
594
- "observed_at: 2026-04-20",
595
- "confidence: 0.95",
596
- ], "Always connect the VPN before starting production deploys.", ["positive", "positive"]),
597
- benchmarkCase("kafka-rebalance-note", true, "heldout", [
598
- "description: Pause consumers during rebalance",
599
- "quality: curated",
600
- "source: skill:kafka",
601
- "observed_at: 2026-04-08",
602
- "confidence: 0.95",
603
- "tags: [kafka, ops]",
604
- ], "Pause consumers during partition rebalances to avoid duplicate processing while assignments settle.", ["positive", "positive", "positive"], { promoteValue: 4 }),
605
- benchmarkCase("gha-token-scope", true, "heldout", [
606
- "description: Minimize GitHub token scopes",
607
- "source: skill:github-actions",
608
- "observed_at: 2026-04-07",
609
- "confidence: 0.85",
610
- "tags: [gha, security]",
611
- ], "Use the narrowest GitHub token scope that still allows the workflow step to succeed.", ["positive", "positive"], { promoteValue: 4 }),
612
- benchmarkCase("helm-debug-guess", false, "heldout", [
613
- "description: Helm upgrade might need --debug",
614
- "source: skill:helm",
615
- "observed_at: 2026-04-05",
616
- "confidence: 0.85",
617
- ], "It might help to add --debug to helm upgrade output during failures.", ["positive", "positive"]),
618
- benchmarkCase("terraform-state-location", true, "heldout", [
619
- "description: Use remote state locks",
620
- "quality: curated",
621
- "source: skill:terraform",
622
- "observed_at: 2026-04-04",
623
- "confidence: 0.95",
624
- "tags: [terraform]",
625
- ], "Use remote state with locking enabled before applying shared Terraform stacks.", ["positive", "positive", "positive"]),
626
- benchmarkCase("mixed-signal-rollback", false, "heldout", [
627
- "description: Rollback the cluster immediately",
628
- "quality: curated",
629
- "source: skill:incident",
630
- "observed_at: 2026-04-03",
631
- "confidence: 0.95",
632
- ], "Rollback the cluster immediately after any 5xx spike.", ["positive", "negative", "positive"], { falsePromoteCost: 6 }),
633
- benchmarkCase("cache-ttl-fact", true, "heldout", [
634
- "description: Cache TTL defaults to five minutes",
635
- "source: skill:platform",
636
- "observed_at: 2026-04-02",
637
- "confidence: 0.95",
638
- "tags: [cache, platform]",
639
- ], "The shared platform cache TTL defaults to five minutes unless the service opts out.", ["positive", "positive"]),
640
- benchmarkCase("personal-shell-alias", false, "heldout", [
641
- "description: Preferred shell alias",
642
- "subjective: true",
643
- "source: skill:shell",
644
- "observed_at: 2026-04-01",
645
- "confidence: 0.95",
646
- ], "I prefer aliasing kubectl to k.", ["positive", "positive"]),
647
- ];
648
- export const DEFAULT_PROMOTION_POLICY_SELECTION = selectPromotionPolicy(DEFAULT_PROMOTION_POLICY_CORPUS);
834
+ heldOut: {
835
+ total: 7,
836
+ correct: 7,
837
+ falsePositives: 0,
838
+ falseNegatives: 0,
839
+ accuracy: 1,
840
+ precision: 1,
841
+ recall: 1,
842
+ f1: 1,
843
+ truePositives: 4,
844
+ trueNegatives: 3,
845
+ netOutcomeScore: 14,
846
+ capturedPromoteValue: 14,
847
+ preventedFalsePromotionCost: 14,
848
+ results: [
849
+ {
850
+ name: "kafka-rebalance-note",
851
+ expectPromote: true,
852
+ assessment: {
853
+ applicable: true,
854
+ promote: true,
855
+ score: 5.600000000000002,
856
+ threshold: 3.8,
857
+ knowledgeRef: "knowledge:kafka-rebalance-note",
858
+ content: "---\ndescription: Pause consumers during rebalance\ntags:\n - kafka\n - ops\nobserved_at: 2026-04-08\nsources:\n - memory:kafka-rebalance-note\n - skill:kafka\n---\n\nPause consumers during partition rebalances to avoid duplicate processing while assignments settle.\n",
859
+ blockedBy: [],
860
+ positiveSignals: [
861
+ "3 positive feedback events",
862
+ "repeated reinforcement",
863
+ "curated quality",
864
+ "strong confidence",
865
+ "linked source",
866
+ "observed_at present",
867
+ "description present",
868
+ "tagged memory",
869
+ "substantive body",
870
+ ],
871
+ negativeSignals: [],
872
+ modelName: "balanced-evidence",
873
+ },
874
+ passed: true,
875
+ },
876
+ {
877
+ name: "gha-token-scope",
878
+ expectPromote: true,
879
+ assessment: {
880
+ applicable: true,
881
+ promote: true,
882
+ score: 4.04,
883
+ threshold: 3.8,
884
+ knowledgeRef: "knowledge:gha-token-scope",
885
+ content: "---\ndescription: Minimize GitHub token scopes\ntags:\n - gha\n - security\nobserved_at: 2026-04-07\nsources:\n - memory:gha-token-scope\n - skill:github-actions\n---\n\nUse the narrowest GitHub token scope that still allows the workflow step to succeed.\n",
886
+ blockedBy: [],
887
+ positiveSignals: [
888
+ "2 positive feedback events",
889
+ "repeated reinforcement",
890
+ "strong confidence",
891
+ "linked source",
892
+ "observed_at present",
893
+ "description present",
894
+ "tagged memory",
895
+ "substantive body",
896
+ ],
897
+ negativeSignals: [],
898
+ modelName: "balanced-evidence",
899
+ },
900
+ passed: true,
901
+ },
902
+ {
903
+ name: "helm-debug-guess",
904
+ expectPromote: false,
905
+ assessment: {
906
+ applicable: true,
907
+ promote: false,
908
+ score: 2.7899999999999996,
909
+ threshold: 3.8,
910
+ knowledgeRef: "knowledge:helm-debug-guess",
911
+ blockedBy: [],
912
+ positiveSignals: [
913
+ "2 positive feedback events",
914
+ "repeated reinforcement",
915
+ "strong confidence",
916
+ "linked source",
917
+ "observed_at present",
918
+ "description present",
919
+ "substantive body",
920
+ ],
921
+ negativeSignals: ["tentative language"],
922
+ modelName: "balanced-evidence",
923
+ },
924
+ passed: true,
925
+ },
926
+ {
927
+ name: "terraform-state-location",
928
+ expectPromote: true,
929
+ assessment: {
930
+ applicable: true,
931
+ promote: true,
932
+ score: 5.600000000000002,
933
+ threshold: 3.8,
934
+ knowledgeRef: "knowledge:terraform-state-location",
935
+ content: "---\ndescription: Use remote state locks\ntags:\n - terraform\nobserved_at: 2026-04-04\nsources:\n - memory:terraform-state-location\n - skill:terraform\n---\n\nUse remote state with locking enabled before applying shared Terraform stacks.\n",
936
+ blockedBy: [],
937
+ positiveSignals: [
938
+ "3 positive feedback events",
939
+ "repeated reinforcement",
940
+ "curated quality",
941
+ "strong confidence",
942
+ "linked source",
943
+ "observed_at present",
944
+ "description present",
945
+ "tagged memory",
946
+ "substantive body",
947
+ ],
948
+ negativeSignals: [],
949
+ modelName: "balanced-evidence",
950
+ },
951
+ passed: true,
952
+ },
953
+ {
954
+ name: "mixed-signal-rollback",
955
+ expectPromote: false,
956
+ assessment: {
957
+ applicable: true,
958
+ promote: false,
959
+ score: 2.65,
960
+ threshold: 3.8,
961
+ knowledgeRef: "knowledge:mixed-signal-rollback",
962
+ blockedBy: [],
963
+ positiveSignals: [
964
+ "2 positive feedback events",
965
+ "repeated reinforcement",
966
+ "curated quality",
967
+ "strong confidence",
968
+ "linked source",
969
+ "observed_at present",
970
+ "description present",
971
+ "substantive body",
972
+ ],
973
+ negativeSignals: ["1 negative feedback event"],
974
+ modelName: "balanced-evidence",
975
+ },
976
+ passed: true,
977
+ },
978
+ {
979
+ name: "cache-ttl-fact",
980
+ expectPromote: true,
981
+ assessment: {
982
+ applicable: true,
983
+ promote: true,
984
+ score: 4.250000000000001,
985
+ threshold: 3.8,
986
+ knowledgeRef: "knowledge:cache-ttl-fact",
987
+ content: "---\ndescription: Cache TTL defaults to five minutes\ntags:\n - cache\n - platform\nobserved_at: 2026-04-02\nsources:\n - memory:cache-ttl-fact\n - skill:platform\n---\n\nThe shared platform cache TTL defaults to five minutes unless the service opts out.\n",
988
+ blockedBy: [],
989
+ positiveSignals: [
990
+ "2 positive feedback events",
991
+ "repeated reinforcement",
992
+ "strong confidence",
993
+ "linked source",
994
+ "observed_at present",
995
+ "description present",
996
+ "tagged memory",
997
+ "substantive body",
998
+ ],
999
+ negativeSignals: [],
1000
+ modelName: "balanced-evidence",
1001
+ },
1002
+ passed: true,
1003
+ },
1004
+ {
1005
+ name: "personal-shell-alias",
1006
+ expectPromote: false,
1007
+ assessment: {
1008
+ applicable: true,
1009
+ promote: false,
1010
+ score: 0,
1011
+ threshold: 3.8,
1012
+ knowledgeRef: "knowledge:personal-shell-alias",
1013
+ blockedBy: ["subjective-memory"],
1014
+ positiveSignals: [],
1015
+ negativeSignals: [],
1016
+ modelName: "balanced-evidence",
1017
+ },
1018
+ passed: true,
1019
+ },
1020
+ ],
1021
+ },
1022
+ baselines: [
1023
+ {
1024
+ name: "baseline-positive-feedback",
1025
+ heldOut: {
1026
+ total: 7,
1027
+ correct: 5,
1028
+ falsePositives: 2,
1029
+ falseNegatives: 0,
1030
+ accuracy: 0.7142857142857143,
1031
+ precision: 0.6666666666666666,
1032
+ recall: 1,
1033
+ f1: 0.8,
1034
+ truePositives: 4,
1035
+ trueNegatives: 1,
1036
+ netOutcomeScore: 4,
1037
+ capturedPromoteValue: 14,
1038
+ preventedFalsePromotionCost: 4,
1039
+ results: [
1040
+ {
1041
+ name: "kafka-rebalance-note",
1042
+ expectPromote: true,
1043
+ assessment: {
1044
+ applicable: true,
1045
+ promote: true,
1046
+ score: 3,
1047
+ threshold: 2,
1048
+ knowledgeRef: "knowledge:kafka-rebalance-note",
1049
+ content: "---\ndescription: Pause consumers during rebalance\ntags:\n - kafka\n - ops\nobserved_at: 2026-04-08\nsources:\n - memory:kafka-rebalance-note\n - skill:kafka\n---\n\nPause consumers during partition rebalances to avoid duplicate processing while assignments settle.\n",
1050
+ blockedBy: [],
1051
+ positiveSignals: ["baseline positive feedback rule"],
1052
+ negativeSignals: [],
1053
+ modelName: "baseline-positive-feedback",
1054
+ },
1055
+ passed: true,
1056
+ },
1057
+ {
1058
+ name: "gha-token-scope",
1059
+ expectPromote: true,
1060
+ assessment: {
1061
+ applicable: true,
1062
+ promote: true,
1063
+ score: 2,
1064
+ threshold: 2,
1065
+ knowledgeRef: "knowledge:gha-token-scope",
1066
+ content: "---\ndescription: Minimize GitHub token scopes\ntags:\n - gha\n - security\nobserved_at: 2026-04-07\nsources:\n - memory:gha-token-scope\n - skill:github-actions\n---\n\nUse the narrowest GitHub token scope that still allows the workflow step to succeed.\n",
1067
+ blockedBy: [],
1068
+ positiveSignals: ["baseline positive feedback rule"],
1069
+ negativeSignals: [],
1070
+ modelName: "baseline-positive-feedback",
1071
+ },
1072
+ passed: true,
1073
+ },
1074
+ {
1075
+ name: "helm-debug-guess",
1076
+ expectPromote: false,
1077
+ assessment: {
1078
+ applicable: true,
1079
+ promote: true,
1080
+ score: 2,
1081
+ threshold: 2,
1082
+ knowledgeRef: "knowledge:helm-debug-guess",
1083
+ content: "---\ndescription: Helm upgrade might need --debug\nobserved_at: 2026-04-05\nsources:\n - memory:helm-debug-guess\n - skill:helm\n---\n\nIt might help to add --debug to helm upgrade output during failures.\n",
1084
+ blockedBy: [],
1085
+ positiveSignals: ["baseline positive feedback rule"],
1086
+ negativeSignals: [],
1087
+ modelName: "baseline-positive-feedback",
1088
+ },
1089
+ passed: false,
1090
+ },
1091
+ {
1092
+ name: "terraform-state-location",
1093
+ expectPromote: true,
1094
+ assessment: {
1095
+ applicable: true,
1096
+ promote: true,
1097
+ score: 3,
1098
+ threshold: 2,
1099
+ knowledgeRef: "knowledge:terraform-state-location",
1100
+ content: "---\ndescription: Use remote state locks\ntags:\n - terraform\nobserved_at: 2026-04-04\nsources:\n - memory:terraform-state-location\n - skill:terraform\n---\n\nUse remote state with locking enabled before applying shared Terraform stacks.\n",
1101
+ blockedBy: [],
1102
+ positiveSignals: ["baseline positive feedback rule"],
1103
+ negativeSignals: [],
1104
+ modelName: "baseline-positive-feedback",
1105
+ },
1106
+ passed: true,
1107
+ },
1108
+ {
1109
+ name: "mixed-signal-rollback",
1110
+ expectPromote: false,
1111
+ assessment: {
1112
+ applicable: true,
1113
+ promote: true,
1114
+ score: 2,
1115
+ threshold: 2,
1116
+ knowledgeRef: "knowledge:mixed-signal-rollback",
1117
+ content: "---\ndescription: Rollback the cluster immediately\nobserved_at: 2026-04-03\nsources:\n - memory:mixed-signal-rollback\n - skill:incident\n---\n\nRollback the cluster immediately after any 5xx spike.\n",
1118
+ blockedBy: [],
1119
+ positiveSignals: ["baseline positive feedback rule"],
1120
+ negativeSignals: [],
1121
+ modelName: "baseline-positive-feedback",
1122
+ },
1123
+ passed: false,
1124
+ },
1125
+ {
1126
+ name: "cache-ttl-fact",
1127
+ expectPromote: true,
1128
+ assessment: {
1129
+ applicable: true,
1130
+ promote: true,
1131
+ score: 2,
1132
+ threshold: 2,
1133
+ knowledgeRef: "knowledge:cache-ttl-fact",
1134
+ content: "---\ndescription: Cache TTL defaults to five minutes\ntags:\n - cache\n - platform\nobserved_at: 2026-04-02\nsources:\n - memory:cache-ttl-fact\n - skill:platform\n---\n\nThe shared platform cache TTL defaults to five minutes unless the service opts out.\n",
1135
+ blockedBy: [],
1136
+ positiveSignals: ["baseline positive feedback rule"],
1137
+ negativeSignals: [],
1138
+ modelName: "baseline-positive-feedback",
1139
+ },
1140
+ passed: true,
1141
+ },
1142
+ {
1143
+ name: "personal-shell-alias",
1144
+ expectPromote: false,
1145
+ assessment: {
1146
+ applicable: true,
1147
+ promote: false,
1148
+ score: 0,
1149
+ threshold: 2,
1150
+ knowledgeRef: "knowledge:personal-shell-alias",
1151
+ blockedBy: ["subjective-memory"],
1152
+ positiveSignals: [],
1153
+ negativeSignals: [],
1154
+ modelName: "baseline-positive-feedback",
1155
+ },
1156
+ passed: true,
1157
+ },
1158
+ ],
1159
+ },
1160
+ noWorseThanSelected: true,
1161
+ strictWin: true,
1162
+ strictWinMetrics: ["f1", "netOutcomeScore", "accuracy"],
1163
+ },
1164
+ {
1165
+ name: "baseline-metadata",
1166
+ heldOut: {
1167
+ total: 7,
1168
+ correct: 5,
1169
+ falsePositives: 2,
1170
+ falseNegatives: 0,
1171
+ accuracy: 0.7142857142857143,
1172
+ precision: 0.6666666666666666,
1173
+ recall: 1,
1174
+ f1: 0.8,
1175
+ truePositives: 4,
1176
+ trueNegatives: 1,
1177
+ netOutcomeScore: 4,
1178
+ capturedPromoteValue: 14,
1179
+ preventedFalsePromotionCost: 4,
1180
+ results: [
1181
+ {
1182
+ name: "kafka-rebalance-note",
1183
+ expectPromote: true,
1184
+ assessment: {
1185
+ applicable: true,
1186
+ promote: true,
1187
+ score: 2,
1188
+ threshold: 3,
1189
+ knowledgeRef: "knowledge:kafka-rebalance-note",
1190
+ content: "---\ndescription: Pause consumers during rebalance\ntags:\n - kafka\n - ops\nobserved_at: 2026-04-08\nsources:\n - memory:kafka-rebalance-note\n - skill:kafka\n---\n\nPause consumers during partition rebalances to avoid duplicate processing while assignments settle.\n",
1191
+ blockedBy: [],
1192
+ positiveSignals: ["baseline metadata rule"],
1193
+ negativeSignals: [],
1194
+ modelName: "baseline-metadata",
1195
+ },
1196
+ passed: true,
1197
+ },
1198
+ {
1199
+ name: "gha-token-scope",
1200
+ expectPromote: true,
1201
+ assessment: {
1202
+ applicable: true,
1203
+ promote: true,
1204
+ score: 2,
1205
+ threshold: 3,
1206
+ knowledgeRef: "knowledge:gha-token-scope",
1207
+ content: "---\ndescription: Minimize GitHub token scopes\ntags:\n - gha\n - security\nobserved_at: 2026-04-07\nsources:\n - memory:gha-token-scope\n - skill:github-actions\n---\n\nUse the narrowest GitHub token scope that still allows the workflow step to succeed.\n",
1208
+ blockedBy: [],
1209
+ positiveSignals: ["baseline metadata rule"],
1210
+ negativeSignals: [],
1211
+ modelName: "baseline-metadata",
1212
+ },
1213
+ passed: true,
1214
+ },
1215
+ {
1216
+ name: "helm-debug-guess",
1217
+ expectPromote: false,
1218
+ assessment: {
1219
+ applicable: true,
1220
+ promote: true,
1221
+ score: 2,
1222
+ threshold: 3,
1223
+ knowledgeRef: "knowledge:helm-debug-guess",
1224
+ content: "---\ndescription: Helm upgrade might need --debug\nobserved_at: 2026-04-05\nsources:\n - memory:helm-debug-guess\n - skill:helm\n---\n\nIt might help to add --debug to helm upgrade output during failures.\n",
1225
+ blockedBy: [],
1226
+ positiveSignals: ["baseline metadata rule"],
1227
+ negativeSignals: [],
1228
+ modelName: "baseline-metadata",
1229
+ },
1230
+ passed: false,
1231
+ },
1232
+ {
1233
+ name: "terraform-state-location",
1234
+ expectPromote: true,
1235
+ assessment: {
1236
+ applicable: true,
1237
+ promote: true,
1238
+ score: 2,
1239
+ threshold: 3,
1240
+ knowledgeRef: "knowledge:terraform-state-location",
1241
+ content: "---\ndescription: Use remote state locks\ntags:\n - terraform\nobserved_at: 2026-04-04\nsources:\n - memory:terraform-state-location\n - skill:terraform\n---\n\nUse remote state with locking enabled before applying shared Terraform stacks.\n",
1242
+ blockedBy: [],
1243
+ positiveSignals: ["baseline metadata rule"],
1244
+ negativeSignals: [],
1245
+ modelName: "baseline-metadata",
1246
+ },
1247
+ passed: true,
1248
+ },
1249
+ {
1250
+ name: "mixed-signal-rollback",
1251
+ expectPromote: false,
1252
+ assessment: {
1253
+ applicable: true,
1254
+ promote: true,
1255
+ score: 2,
1256
+ threshold: 3,
1257
+ knowledgeRef: "knowledge:mixed-signal-rollback",
1258
+ content: "---\ndescription: Rollback the cluster immediately\nobserved_at: 2026-04-03\nsources:\n - memory:mixed-signal-rollback\n - skill:incident\n---\n\nRollback the cluster immediately after any 5xx spike.\n",
1259
+ blockedBy: [],
1260
+ positiveSignals: ["baseline metadata rule"],
1261
+ negativeSignals: [],
1262
+ modelName: "baseline-metadata",
1263
+ },
1264
+ passed: false,
1265
+ },
1266
+ {
1267
+ name: "cache-ttl-fact",
1268
+ expectPromote: true,
1269
+ assessment: {
1270
+ applicable: true,
1271
+ promote: true,
1272
+ score: 2,
1273
+ threshold: 3,
1274
+ knowledgeRef: "knowledge:cache-ttl-fact",
1275
+ content: "---\ndescription: Cache TTL defaults to five minutes\ntags:\n - cache\n - platform\nobserved_at: 2026-04-02\nsources:\n - memory:cache-ttl-fact\n - skill:platform\n---\n\nThe shared platform cache TTL defaults to five minutes unless the service opts out.\n",
1276
+ blockedBy: [],
1277
+ positiveSignals: ["baseline metadata rule"],
1278
+ negativeSignals: [],
1279
+ modelName: "baseline-metadata",
1280
+ },
1281
+ passed: true,
1282
+ },
1283
+ {
1284
+ name: "personal-shell-alias",
1285
+ expectPromote: false,
1286
+ assessment: {
1287
+ applicable: true,
1288
+ promote: false,
1289
+ score: 0,
1290
+ threshold: 2,
1291
+ knowledgeRef: "knowledge:personal-shell-alias",
1292
+ blockedBy: ["subjective-memory"],
1293
+ positiveSignals: [],
1294
+ negativeSignals: [],
1295
+ modelName: "baseline-metadata",
1296
+ },
1297
+ passed: true,
1298
+ },
1299
+ ],
1300
+ },
1301
+ noWorseThanSelected: true,
1302
+ strictWin: true,
1303
+ strictWinMetrics: ["f1", "netOutcomeScore", "accuracy"],
1304
+ },
1305
+ ],
1306
+ strictlyBeatsBaselines: true,
1307
+ };
649
1308
  const SELECTED_MODEL = CANDIDATE_MODELS.find((model) => model.name === DEFAULT_PROMOTION_POLICY_SELECTION.selectedModel.name);
650
1309
  export const DEFAULT_PROMOTION_POLICY = {
651
1310
  name: DEFAULT_PROMOTION_POLICY_SELECTION.selectedModel.name,