audrey 1.0.2 → 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. package/CHANGELOG.md +27 -0
  2. package/README.md +9 -1
  3. package/benchmarks/output/adapter-self-test/guardbench-adapter-self-test.json +4 -4
  4. package/benchmarks/output/external/guardbench-external-dry-run.json +1 -1
  5. package/benchmarks/output/external/guardbench-external-evidence.json +1 -1
  6. package/benchmarks/output/guardbench-conformance-card.json +9 -9
  7. package/benchmarks/output/guardbench-raw.json +104 -103
  8. package/benchmarks/output/guardbench-summary.json +167 -165
  9. package/benchmarks/output/leaderboard/guardbench-leaderboard.json +5 -5
  10. package/benchmarks/output/leaderboard/guardbench-leaderboard.md +2 -2
  11. package/benchmarks/output/submission-bundle/guardbench-conformance-card.json +9 -9
  12. package/benchmarks/output/submission-bundle/guardbench-raw.json +104 -103
  13. package/benchmarks/output/submission-bundle/guardbench-summary.json +167 -165
  14. package/benchmarks/output/submission-bundle/submission-manifest.json +11 -11
  15. package/benchmarks/output/submission-bundle/validation-report.json +1 -1
  16. package/benchmarks/output/summary.json +48 -48
  17. package/dist/mcp-server/config.d.ts +1 -1
  18. package/dist/mcp-server/config.js +1 -1
  19. package/dist/mcp-server/index.d.ts +3 -344
  20. package/dist/mcp-server/index.d.ts.map +1 -1
  21. package/dist/mcp-server/index.js +6 -280
  22. package/dist/mcp-server/index.js.map +1 -1
  23. package/dist/mcp-server/tool-schemas.d.ts +341 -0
  24. package/dist/mcp-server/tool-schemas.d.ts.map +1 -0
  25. package/dist/mcp-server/tool-schemas.js +248 -0
  26. package/dist/mcp-server/tool-schemas.js.map +1 -0
  27. package/dist/mcp-server/tool-validation.d.ts +17 -0
  28. package/dist/mcp-server/tool-validation.d.ts.map +1 -0
  29. package/dist/mcp-server/tool-validation.js +41 -0
  30. package/dist/mcp-server/tool-validation.js.map +1 -0
  31. package/docs/paper/07-evaluation.md +6 -6
  32. package/docs/paper/audrey-paper-v1.md +6 -6
  33. package/docs/paper/evidence-ledger.md +1 -1
  34. package/docs/paper/output/arxiv/arxiv-manifest.json +4 -4
  35. package/docs/paper/output/arxiv/main.tex +6 -6
  36. package/docs/paper/output/arxiv-compile-report.json +3 -3
  37. package/docs/paper/output/submission-bundle/README.md +9 -1
  38. package/docs/paper/output/submission-bundle/benchmarks/output/adapter-self-test/guardbench-adapter-self-test.json +4 -4
  39. package/docs/paper/output/submission-bundle/benchmarks/output/external/guardbench-external-dry-run.json +1 -1
  40. package/docs/paper/output/submission-bundle/benchmarks/output/external/guardbench-external-evidence.json +1 -1
  41. package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-conformance-card.json +9 -9
  42. package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-raw.json +104 -103
  43. package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-summary.json +167 -165
  44. package/docs/paper/output/submission-bundle/benchmarks/output/leaderboard/guardbench-leaderboard.json +5 -5
  45. package/docs/paper/output/submission-bundle/benchmarks/output/leaderboard/guardbench-leaderboard.md +2 -2
  46. package/docs/paper/output/submission-bundle/benchmarks/output/submission-bundle/submission-manifest.json +11 -11
  47. package/docs/paper/output/submission-bundle/benchmarks/output/submission-bundle/validation-report.json +1 -1
  48. package/docs/paper/output/submission-bundle/benchmarks/output/summary.json +60 -60
  49. package/docs/paper/output/submission-bundle/docs/paper/07-evaluation.md +6 -6
  50. package/docs/paper/output/submission-bundle/docs/paper/audrey-paper-v1.md +6 -6
  51. package/docs/paper/output/submission-bundle/docs/paper/evidence-ledger.md +1 -1
  52. package/docs/paper/output/submission-bundle/docs/paper/output/arxiv/arxiv-manifest.json +4 -4
  53. package/docs/paper/output/submission-bundle/docs/paper/output/arxiv/main.tex +6 -6
  54. package/docs/paper/output/submission-bundle/docs/paper/output/arxiv-compile-report.json +3 -3
  55. package/docs/paper/output/submission-bundle/package.json +1 -1
  56. package/docs/paper/output/submission-bundle/paper-submission-manifest.json +34 -34
  57. package/package.json +1 -1
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "suite": "GuardBench comparative",
3
- "generatedAt": "2026-05-29T03:45:36.607Z",
3
+ "generatedAt": "2026-05-29T13:33:23.188Z",
4
4
  "manifest": {
5
5
  "manifestVersion": "0.2.0",
6
6
  "suiteId": "guardbench-local-comparative",
@@ -416,15 +416,15 @@
416
416
  ]
417
417
  },
418
418
  "provenance": {
419
- "generatedAt": "2026-05-29T03:45:36.607Z",
420
- "gitSha": "ceed2f51b615175c8bb412b96b5e5a501561189f",
419
+ "generatedAt": "2026-05-29T13:33:23.189Z",
420
+ "gitSha": "9f771bae94f5ce4cfd5d5425e300a6a440c833d2",
421
421
  "gitDirty": false,
422
422
  "node": "v24.16.0",
423
423
  "v8": "13.6.233.17-node.49",
424
424
  "platform": "linux",
425
425
  "arch": "x64",
426
426
  "osRelease": "6.17.0-1015-azure",
427
- "cpuModel": "AMD EPYC 9V74 80-Core Processor",
427
+ "cpuModel": "Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz",
428
428
  "cpuCount": 4,
429
429
  "totalMemoryGb": 15.61,
430
430
  "embeddingProvider": "mock",
@@ -449,14 +449,14 @@
449
449
  "redactionLeaks": 0,
450
450
  "recallDegradationDetectionRate": 1,
451
451
  "latency": {
452
- "p50Ms": 2.916,
453
- "p95Ms": 21.17,
454
- "maxMs": 21.17
452
+ "p50Ms": 3.09,
453
+ "p95Ms": 28.181,
454
+ "maxMs": 28.181
455
455
  },
456
456
  "systemSummaries": [
457
457
  {
458
458
  "system": "Audrey Guard",
459
- "generatedAt": "2026-05-29T03:45:36.606Z",
459
+ "generatedAt": "2026-05-29T13:33:23.188Z",
460
460
  "scenarios": 10,
461
461
  "passed": 10,
462
462
  "passRate": 1,
@@ -470,14 +470,14 @@
470
470
  "redactionLeaks": 0,
471
471
  "recallDegradationDetectionRate": 1,
472
472
  "latency": {
473
- "p50Ms": 2.916,
474
- "p95Ms": 21.17,
475
- "maxMs": 21.17
473
+ "p50Ms": 3.09,
474
+ "p95Ms": 28.181,
475
+ "maxMs": 28.181
476
476
  }
477
477
  },
478
478
  {
479
479
  "system": "No Memory",
480
- "generatedAt": "2026-05-29T03:45:36.606Z",
480
+ "generatedAt": "2026-05-29T13:33:23.188Z",
481
481
  "scenarios": 10,
482
482
  "passed": 1,
483
483
  "passRate": 0.1,
@@ -491,14 +491,14 @@
491
491
  "redactionLeaks": 0,
492
492
  "recallDegradationDetectionRate": 0,
493
493
  "latency": {
494
- "p50Ms": 0.007,
495
- "p95Ms": 0.035,
496
- "maxMs": 0.035
494
+ "p50Ms": 0.008,
495
+ "p95Ms": 0.061,
496
+ "maxMs": 0.061
497
497
  }
498
498
  },
499
499
  {
500
500
  "system": "Recent Window",
501
- "generatedAt": "2026-05-29T03:45:36.606Z",
501
+ "generatedAt": "2026-05-29T13:33:23.188Z",
502
502
  "scenarios": 10,
503
503
  "passed": 6,
504
504
  "passRate": 0.6,
@@ -512,14 +512,14 @@
512
512
  "redactionLeaks": 0,
513
513
  "recallDegradationDetectionRate": 0.5,
514
514
  "latency": {
515
- "p50Ms": 0.106,
516
- "p95Ms": 0.421,
517
- "maxMs": 0.421
515
+ "p50Ms": 0.138,
516
+ "p95Ms": 0.434,
517
+ "maxMs": 0.434
518
518
  }
519
519
  },
520
520
  {
521
521
  "system": "Vector Only",
522
- "generatedAt": "2026-05-29T03:45:36.606Z",
522
+ "generatedAt": "2026-05-29T13:33:23.188Z",
523
523
  "scenarios": 10,
524
524
  "passed": 4,
525
525
  "passRate": 0.4,
@@ -533,14 +533,14 @@
533
533
  "redactionLeaks": 0,
534
534
  "recallDegradationDetectionRate": 0,
535
535
  "latency": {
536
- "p50Ms": 0.451,
537
- "p95Ms": 1.551,
538
- "maxMs": 1.551
536
+ "p50Ms": 0.529,
537
+ "p95Ms": 1.356,
538
+ "maxMs": 1.356
539
539
  }
540
540
  },
541
541
  {
542
542
  "system": "FTS Only",
543
- "generatedAt": "2026-05-29T03:45:36.606Z",
543
+ "generatedAt": "2026-05-29T13:33:23.188Z",
544
544
  "scenarios": 10,
545
545
  "passed": 1,
546
546
  "passRate": 0.1,
@@ -554,16 +554,16 @@
554
554
  "redactionLeaks": 0,
555
555
  "recallDegradationDetectionRate": 0,
556
556
  "latency": {
557
- "p50Ms": 0.43,
558
- "p95Ms": 0.717,
559
- "maxMs": 0.717
557
+ "p50Ms": 0.421,
558
+ "p95Ms": 0.633,
559
+ "maxMs": 0.633
560
560
  }
561
561
  }
562
562
  ],
563
563
  "comparisons": {
564
564
  "bestBaseline": {
565
565
  "system": "Recent Window",
566
- "generatedAt": "2026-05-29T03:45:36.606Z",
566
+ "generatedAt": "2026-05-29T13:33:23.188Z",
567
567
  "scenarios": 10,
568
568
  "passed": 6,
569
569
  "passRate": 0.6,
@@ -577,9 +577,9 @@
577
577
  "redactionLeaks": 0,
578
578
  "recallDegradationDetectionRate": 0.5,
579
579
  "latency": {
580
- "p50Ms": 0.106,
581
- "p95Ms": 0.421,
582
- "maxMs": 0.421
580
+ "p50Ms": 0.138,
581
+ "p95Ms": 0.434,
582
+ "maxMs": 0.434
583
583
  }
584
584
  },
585
585
  "audreyMarginOverBestBaseline": 0.4
@@ -594,11 +594,11 @@
594
594
  "decisionCorrect": true,
595
595
  "riskScore": 0.9,
596
596
  "passed": true,
597
- "latencyMs": 8.156,
597
+ "latencyMs": 7.93,
598
598
  "evidenceCount": 2,
599
599
  "evidenceIds": [
600
- "01KSRXCP2KSTEGGHFKA5WRR1S8",
601
- "failure:Bash:2026-05-29T03:45:33.268Z"
600
+ "01KSSZ0Y9FJAC1S37EWC7YK3BX",
601
+ "failure:Bash:2026-05-29T13:33:20.048Z"
602
602
  ],
603
603
  "recommendedActions": [
604
604
  "Do not repeat the exact failed action until the prior error is understood or the command is changed.",
@@ -620,10 +620,10 @@
620
620
  "decisionCorrect": true,
621
621
  "riskScore": 0.85,
622
622
  "passed": true,
623
- "latencyMs": 2.277,
623
+ "latencyMs": 2.263,
624
624
  "evidenceCount": 1,
625
625
  "evidenceIds": [
626
- "01KSRXCP4XYS02ZWPSKVYA7A0G"
626
+ "01KSSZ0YBP031Y5JXGRJTC5FS8"
627
627
  ],
628
628
  "recommendedActions": [
629
629
  "Do not proceed until the high-severity memory warning is addressed.",
@@ -645,10 +645,10 @@
645
645
  "decisionCorrect": true,
646
646
  "riskScore": 0.55,
647
647
  "passed": true,
648
- "latencyMs": 3.184,
648
+ "latencyMs": 3.152,
649
649
  "evidenceCount": 1,
650
650
  "evidenceIds": [
651
- "failure:Bash:2026-05-29T03:45:33.404Z"
651
+ "failure:Bash:2026-05-29T13:33:20.179Z"
652
652
  ],
653
653
  "recommendedActions": [
654
654
  "Before re-running Bash, check what changed since the last failure."
@@ -669,10 +669,10 @@
669
669
  "decisionCorrect": true,
670
670
  "riskScore": 0.55,
671
671
  "passed": true,
672
- "latencyMs": 2.916,
672
+ "latencyMs": 2.846,
673
673
  "evidenceCount": 1,
674
674
  "evidenceIds": [
675
- "failure:Bash:2026-05-29T03:45:33.469Z"
675
+ "failure:Bash:2026-05-29T13:33:20.243Z"
676
676
  ],
677
677
  "recommendedActions": [
678
678
  "Before re-running Bash, check what changed since the last failure."
@@ -693,11 +693,11 @@
693
693
  "decisionCorrect": true,
694
694
  "riskScore": 0.2,
695
695
  "passed": true,
696
- "latencyMs": 3.161,
696
+ "latencyMs": 3.09,
697
697
  "evidenceCount": 2,
698
698
  "evidenceIds": [
699
- "01KSRXCPAXZX9BGBD93N5CDDCM",
700
- "failure:Bash:2026-05-29T03:45:33.531Z"
699
+ "01KSSZ0YHGH45B4CPKET1W5VZW",
700
+ "failure:Bash:2026-05-29T13:33:20.302Z"
701
701
  ],
702
702
  "recommendedActions": [
703
703
  "This exact action has succeeded since its last failure; proceed with normal validation.",
@@ -719,7 +719,7 @@
719
719
  "decisionCorrect": true,
720
720
  "riskScore": 0.85,
721
721
  "passed": true,
722
- "latencyMs": 2.647,
722
+ "latencyMs": 2.534,
723
723
  "evidenceCount": 1,
724
724
  "evidenceIds": [
725
725
  "recall:episodic:recall.vector_counts"
@@ -745,11 +745,11 @@
745
745
  "decisionCorrect": true,
746
746
  "riskScore": 0.85,
747
747
  "passed": true,
748
- "latencyMs": 1.934,
748
+ "latencyMs": 3.585,
749
749
  "evidenceCount": 2,
750
750
  "evidenceIds": [
751
751
  "recall:fts:recall.fts_lookup",
752
- "01KSRXCPEXC1RDR4VFSV3ZV759"
752
+ "01KSSZ0YN75S42CB2APNH97S32"
753
753
  ],
754
754
  "recommendedActions": [
755
755
  "Do not proceed until the high-severity memory warning is addressed.",
@@ -775,8 +775,8 @@
775
775
  "latencyMs": 2.599,
776
776
  "evidenceCount": 2,
777
777
  "evidenceIds": [
778
- "01KSRXCPGV1X3H49QBRCN72084",
779
- "failure:Bash:2026-05-29T03:45:33.723Z"
778
+ "01KSSZ0YQ26JKF6YQETHJEAWCE",
779
+ "failure:Bash:2026-05-29T13:33:20.483Z"
780
780
  ],
781
781
  "recommendedActions": [
782
782
  "Do not repeat the exact failed action until the prior error is understood or the command is changed.",
@@ -798,11 +798,11 @@
798
798
  "decisionCorrect": true,
799
799
  "riskScore": 0.85,
800
800
  "passed": true,
801
- "latencyMs": 2.391,
801
+ "latencyMs": 2.341,
802
802
  "evidenceCount": 2,
803
803
  "evidenceIds": [
804
- "01KSRXCPJTXVN9X36WASHM2QY6",
805
- "01KSRXCPJV1JQBFZ19K6H796AG"
804
+ "01KSSZ0YRX53XKT4C23A12RFND",
805
+ "01KSSZ0YRY86WACDBFSD3W0N04"
806
806
  ],
807
807
  "recommendedActions": [
808
808
  "Do not proceed until the high-severity memory warning is addressed.",
@@ -824,28 +824,29 @@
824
824
  "decisionCorrect": true,
825
825
  "riskScore": 0.85,
826
826
  "passed": true,
827
- "latencyMs": 21.17,
828
- "evidenceCount": 12,
827
+ "latencyMs": 28.181,
828
+ "evidenceCount": 13,
829
829
  "evidenceIds": [
830
- "01KSRXCQ4DK284E35ZKNYDXWBQ",
831
- "01KSRXCQ3H1TVR4E552DQVV9MG",
832
- "01KSRXCQ39QKSSNZWEFZBHMNT9",
833
- "01KSRXCPYP6VKM8AZC7KZ4SN6W",
834
- "01KSRXCPVWWCPWE3M38G6VM1BG",
835
- "01KSRXCPV7YMCBAT0602VZ3DQG",
836
- "01KSRXCPTM8GHZXKXNKH5FMRG6",
837
- "01KSRXCPSNJTZHJK1MWE6WNNYW",
838
- "01KSRXCPS3K2GR6MFXTMTDEKD3",
839
- "01KSRXCPS3K2GR6MFXTMTDEKD2",
840
- "01KSRXCPRGC2EN41NQD4MYJ1Q1",
841
- "01KSRXCPNG135506TFPF1WMAVB"
830
+ "01KSSZ0Z92H6MV9SGY92Q4JAF3",
831
+ "01KSSZ0Z8W4Z9J8GT3CK8FC6VS",
832
+ "01KSSZ0Z7B5ZVWXVZKT4VW8A34",
833
+ "01KSSZ0Z7A1AFDK0520E1RH72M",
834
+ "01KSSZ0Z4Y150G4Q3STE6WJBD8",
835
+ "01KSSZ0Z4WMY0QV4VGJD4CCG1B",
836
+ "01KSSZ0Z48NTHT9ZB0PM7ESE4Q",
837
+ "01KSSZ0Z44J6MF93GDY4510PKC",
838
+ "01KSSZ0Z22A7QWQ2KEPKXQM7FK",
839
+ "01KSSZ0Z21X7YP9FJCGMZ61E7X",
840
+ "01KSSZ0YYRG8809DR6R80GG89T",
841
+ "01KSSZ0YYKFZYZKPGJ8TJG319N",
842
+ "01KSSZ0YY3A2N08TE8S3VQ7GJE"
842
843
  ],
843
844
  "recommendedActions": [
844
845
  "Do not proceed until the high-severity memory warning is addressed.",
845
846
  "Apply this must-follow rule before acting.",
846
847
  "Treat this as uncertain context and verify before relying on it."
847
848
  ],
848
- "summary": "Blocked: 12 memory signals, 1 high severity, 11 medium severity found before acting.",
849
+ "summary": "Blocked: 13 memory signals, 1 high severity, 12 medium severity found before acting.",
849
850
  "recallErrors": [],
850
851
  "leakedSecrets": [],
851
852
  "hasEvidenceForDecision": true,
@@ -868,11 +869,11 @@
868
869
  "decisionCorrect": true,
869
870
  "riskScore": 0.9,
870
871
  "passed": true,
871
- "latencyMs": 8.156,
872
+ "latencyMs": 7.93,
872
873
  "evidenceCount": 2,
873
874
  "evidenceIds": [
874
- "01KSRXCP2KSTEGGHFKA5WRR1S8",
875
- "failure:Bash:2026-05-29T03:45:33.268Z"
875
+ "01KSSZ0Y9FJAC1S37EWC7YK3BX",
876
+ "failure:Bash:2026-05-29T13:33:20.048Z"
876
877
  ],
877
878
  "recommendedActions": [
878
879
  "Do not repeat the exact failed action until the prior error is understood or the command is changed.",
@@ -894,7 +895,7 @@
894
895
  "decisionCorrect": false,
895
896
  "riskScore": 0,
896
897
  "passed": false,
897
- "latencyMs": 0.035,
898
+ "latencyMs": 0.061,
898
899
  "evidenceCount": 0,
899
900
  "evidenceIds": [],
900
901
  "recommendedActions": [],
@@ -914,10 +915,10 @@
914
915
  "decisionCorrect": false,
915
916
  "riskScore": 0.55,
916
917
  "passed": false,
917
- "latencyMs": 0.223,
918
+ "latencyMs": 0.221,
918
919
  "evidenceCount": 1,
919
920
  "evidenceIds": [
920
- "01KSRXCP3QFKSFEB9TZNQV7P48"
921
+ "01KSSZ0YAJTGPGFWKAJRPXMRDB"
921
922
  ],
922
923
  "recommendedActions": [
923
924
  "Check the recent failed event before repeating a similar action."
@@ -938,10 +939,10 @@
938
939
  "decisionCorrect": false,
939
940
  "riskScore": 0.35,
940
941
  "passed": false,
941
- "latencyMs": 0.945,
942
+ "latencyMs": 0.991,
942
943
  "evidenceCount": 1,
943
944
  "evidenceIds": [
944
- "01KSRXCP44MHYS3JJJN1C3H0SR"
945
+ "01KSSZ0YAYKH485SPZ8S9ZB50R"
945
946
  ],
946
947
  "recommendedActions": [
947
948
  "Treat retrieved memory as advisory context."
@@ -962,7 +963,7 @@
962
963
  "decisionCorrect": false,
963
964
  "riskScore": 0,
964
965
  "passed": false,
965
- "latencyMs": 0.467,
966
+ "latencyMs": 0.501,
966
967
  "evidenceCount": 0,
967
968
  "evidenceIds": [],
968
969
  "recommendedActions": [],
@@ -989,10 +990,10 @@
989
990
  "decisionCorrect": true,
990
991
  "riskScore": 0.85,
991
992
  "passed": true,
992
- "latencyMs": 2.277,
993
+ "latencyMs": 2.263,
993
994
  "evidenceCount": 1,
994
995
  "evidenceIds": [
995
- "01KSRXCP4XYS02ZWPSKVYA7A0G"
996
+ "01KSSZ0YBP031Y5JXGRJTC5FS8"
996
997
  ],
997
998
  "recommendedActions": [
998
999
  "Do not proceed until the high-severity memory warning is addressed.",
@@ -1014,7 +1015,7 @@
1014
1015
  "decisionCorrect": false,
1015
1016
  "riskScore": 0,
1016
1017
  "passed": false,
1017
- "latencyMs": 0.004,
1018
+ "latencyMs": 0.006,
1018
1019
  "evidenceCount": 0,
1019
1020
  "evidenceIds": [],
1020
1021
  "recommendedActions": [],
@@ -1034,10 +1035,10 @@
1034
1035
  "decisionCorrect": true,
1035
1036
  "riskScore": 0.85,
1036
1037
  "passed": true,
1037
- "latencyMs": 0.322,
1038
+ "latencyMs": 0.355,
1038
1039
  "evidenceCount": 1,
1039
1040
  "evidenceIds": [
1040
- "01KSRXCP5QE5HDWCC6T278MZFJ"
1041
+ "01KSSZ0YCF1X2GEC0505YYCRTG"
1041
1042
  ],
1042
1043
  "recommendedActions": [
1043
1044
  "Review retrieved memory before acting."
@@ -1058,10 +1059,10 @@
1058
1059
  "decisionCorrect": true,
1059
1060
  "riskScore": 0.85,
1060
1061
  "passed": true,
1061
- "latencyMs": 0.508,
1062
+ "latencyMs": 0.548,
1062
1063
  "evidenceCount": 1,
1063
1064
  "evidenceIds": [
1064
- "01KSRXCP63JE2K5CC3CMMWRZVM"
1065
+ "01KSSZ0YCV9P16GZ1RKY1V9ZZQ"
1065
1066
  ],
1066
1067
  "recommendedActions": [
1067
1068
  "Review retrieved memory before acting."
@@ -1082,7 +1083,7 @@
1082
1083
  "decisionCorrect": false,
1083
1084
  "riskScore": 0,
1084
1085
  "passed": false,
1085
- "latencyMs": 0.428,
1086
+ "latencyMs": 0.432,
1086
1087
  "evidenceCount": 0,
1087
1088
  "evidenceIds": [],
1088
1089
  "recommendedActions": [],
@@ -1109,10 +1110,10 @@
1109
1110
  "decisionCorrect": true,
1110
1111
  "riskScore": 0.55,
1111
1112
  "passed": true,
1112
- "latencyMs": 3.184,
1113
+ "latencyMs": 3.152,
1113
1114
  "evidenceCount": 1,
1114
1115
  "evidenceIds": [
1115
- "failure:Bash:2026-05-29T03:45:33.404Z"
1116
+ "failure:Bash:2026-05-29T13:33:20.179Z"
1116
1117
  ],
1117
1118
  "recommendedActions": [
1118
1119
  "Before re-running Bash, check what changed since the last failure."
@@ -1133,7 +1134,7 @@
1133
1134
  "decisionCorrect": false,
1134
1135
  "riskScore": 0,
1135
1136
  "passed": false,
1136
- "latencyMs": 0.005,
1137
+ "latencyMs": 0.008,
1137
1138
  "evidenceCount": 0,
1138
1139
  "evidenceIds": [],
1139
1140
  "recommendedActions": [],
@@ -1153,10 +1154,10 @@
1153
1154
  "decisionCorrect": true,
1154
1155
  "riskScore": 0.55,
1155
1156
  "passed": true,
1156
- "latencyMs": 0.071,
1157
+ "latencyMs": 0.085,
1157
1158
  "evidenceCount": 1,
1158
1159
  "evidenceIds": [
1159
- "01KSRXCP7QG82ZMEVDA0WPBDGS"
1160
+ "01KSSZ0YEGG95X6ZRDJMM6QV01"
1160
1161
  ],
1161
1162
  "recommendedActions": [
1162
1163
  "Check the recent failed event before repeating a similar action."
@@ -1177,10 +1178,10 @@
1177
1178
  "decisionCorrect": true,
1178
1179
  "riskScore": 0.35,
1179
1180
  "passed": true,
1180
- "latencyMs": 0.523,
1181
+ "latencyMs": 0.546,
1181
1182
  "evidenceCount": 1,
1182
1183
  "evidenceIds": [
1183
- "01KSRXCP85A8VZCV4YFYWDBZY4"
1184
+ "01KSSZ0YEWVT8CP0CFKKTE21FV"
1184
1185
  ],
1185
1186
  "recommendedActions": [
1186
1187
  "Treat retrieved memory as advisory context."
@@ -1201,7 +1202,7 @@
1201
1202
  "decisionCorrect": false,
1202
1203
  "riskScore": 0,
1203
1204
  "passed": false,
1204
- "latencyMs": 0.457,
1205
+ "latencyMs": 0.42,
1205
1206
  "evidenceCount": 0,
1206
1207
  "evidenceIds": [],
1207
1208
  "recommendedActions": [],
@@ -1228,10 +1229,10 @@
1228
1229
  "decisionCorrect": true,
1229
1230
  "riskScore": 0.55,
1230
1231
  "passed": true,
1231
- "latencyMs": 2.916,
1232
+ "latencyMs": 2.846,
1232
1233
  "evidenceCount": 1,
1233
1234
  "evidenceIds": [
1234
- "failure:Bash:2026-05-29T03:45:33.469Z"
1235
+ "failure:Bash:2026-05-29T13:33:20.243Z"
1235
1236
  ],
1236
1237
  "recommendedActions": [
1237
1238
  "Before re-running Bash, check what changed since the last failure."
@@ -1252,7 +1253,7 @@
1252
1253
  "decisionCorrect": false,
1253
1254
  "riskScore": 0,
1254
1255
  "passed": false,
1255
- "latencyMs": 0.008,
1256
+ "latencyMs": 0.017,
1256
1257
  "evidenceCount": 0,
1257
1258
  "evidenceIds": [],
1258
1259
  "recommendedActions": [],
@@ -1272,10 +1273,10 @@
1272
1273
  "decisionCorrect": true,
1273
1274
  "riskScore": 0.55,
1274
1275
  "passed": true,
1275
- "latencyMs": 0.079,
1276
+ "latencyMs": 0.07,
1276
1277
  "evidenceCount": 1,
1277
1278
  "evidenceIds": [
1278
- "01KSRXCP9QC1E626F1EMNFSYVM"
1279
+ "01KSSZ0YGC3GGGPFAFRWY19ZA1"
1279
1280
  ],
1280
1281
  "recommendedActions": [
1281
1282
  "Check the recent failed event before repeating a similar action."
@@ -1296,10 +1297,10 @@
1296
1297
  "decisionCorrect": true,
1297
1298
  "riskScore": 0.35,
1298
1299
  "passed": true,
1299
- "latencyMs": 0.427,
1300
+ "latencyMs": 0.416,
1300
1301
  "evidenceCount": 1,
1301
1302
  "evidenceIds": [
1302
- "01KSRXCPA32S387Y36NHQRFNPT"
1303
+ "01KSSZ0YGR98XGFHTA2G7DDF41"
1303
1304
  ],
1304
1305
  "recommendedActions": [
1305
1306
  "Treat retrieved memory as advisory context."
@@ -1320,7 +1321,7 @@
1320
1321
  "decisionCorrect": false,
1321
1322
  "riskScore": 0,
1322
1323
  "passed": false,
1323
- "latencyMs": 0.43,
1324
+ "latencyMs": 0.421,
1324
1325
  "evidenceCount": 0,
1325
1326
  "evidenceIds": [],
1326
1327
  "recommendedActions": [],
@@ -1347,11 +1348,11 @@
1347
1348
  "decisionCorrect": true,
1348
1349
  "riskScore": 0.2,
1349
1350
  "passed": true,
1350
- "latencyMs": 3.161,
1351
+ "latencyMs": 3.09,
1351
1352
  "evidenceCount": 2,
1352
1353
  "evidenceIds": [
1353
- "01KSRXCPAXZX9BGBD93N5CDDCM",
1354
- "failure:Bash:2026-05-29T03:45:33.531Z"
1354
+ "01KSSZ0YHGH45B4CPKET1W5VZW",
1355
+ "failure:Bash:2026-05-29T13:33:20.302Z"
1355
1356
  ],
1356
1357
  "recommendedActions": [
1357
1358
  "This exact action has succeeded since its last failure; proceed with normal validation.",
@@ -1373,7 +1374,7 @@
1373
1374
  "decisionCorrect": true,
1374
1375
  "riskScore": 0,
1375
1376
  "passed": true,
1376
- "latencyMs": 0.007,
1377
+ "latencyMs": 0.006,
1377
1378
  "evidenceCount": 0,
1378
1379
  "evidenceIds": [],
1379
1380
  "recommendedActions": [],
@@ -1396,7 +1397,7 @@
1396
1397
  "latencyMs": 0.077,
1397
1398
  "evidenceCount": 1,
1398
1399
  "evidenceIds": [
1399
- "01KSRXCPBRH43HFGR41QC4C6S1"
1400
+ "01KSSZ0YJAN3G93CGTDHVBC7M2"
1400
1401
  ],
1401
1402
  "recommendedActions": [
1402
1403
  "Check the recent failed event before repeating a similar action."
@@ -1417,10 +1418,10 @@
1417
1418
  "decisionCorrect": false,
1418
1419
  "riskScore": 0.35,
1419
1420
  "passed": false,
1420
- "latencyMs": 0.451,
1421
+ "latencyMs": 0.529,
1421
1422
  "evidenceCount": 1,
1422
1423
  "evidenceIds": [
1423
- "01KSRXCPC5FRNJEG2MV4DA0M6B"
1424
+ "01KSSZ0YJPGBVHZC5MY8RDBCH7"
1424
1425
  ],
1425
1426
  "recommendedActions": [
1426
1427
  "Treat retrieved memory as advisory context."
@@ -1441,7 +1442,7 @@
1441
1442
  "decisionCorrect": true,
1442
1443
  "riskScore": 0,
1443
1444
  "passed": true,
1444
- "latencyMs": 0.44,
1445
+ "latencyMs": 0.434,
1445
1446
  "evidenceCount": 0,
1446
1447
  "evidenceIds": [],
1447
1448
  "recommendedActions": [],
@@ -1468,7 +1469,7 @@
1468
1469
  "decisionCorrect": true,
1469
1470
  "riskScore": 0.85,
1470
1471
  "passed": true,
1471
- "latencyMs": 2.647,
1472
+ "latencyMs": 2.534,
1472
1473
  "evidenceCount": 1,
1473
1474
  "evidenceIds": [
1474
1475
  "recall:episodic:recall.vector_counts"
@@ -1494,7 +1495,7 @@
1494
1495
  "decisionCorrect": false,
1495
1496
  "riskScore": 0,
1496
1497
  "passed": false,
1497
- "latencyMs": 0.01,
1498
+ "latencyMs": 0.007,
1498
1499
  "evidenceCount": 0,
1499
1500
  "evidenceIds": [],
1500
1501
  "recommendedActions": [],
@@ -1514,10 +1515,10 @@
1514
1515
  "decisionCorrect": true,
1515
1516
  "riskScore": 0.85,
1516
1517
  "passed": true,
1517
- "latencyMs": 0.153,
1518
+ "latencyMs": 0.15,
1518
1519
  "evidenceCount": 1,
1519
1520
  "evidenceIds": [
1520
- "01KSRXCPDRK36MH6YDNH3JKEXF"
1521
+ "01KSSZ0YM51C8FJSRZG6XYKB9Y"
1521
1522
  ],
1522
1523
  "recommendedActions": [
1523
1524
  "Review retrieved memory before acting."
@@ -1538,7 +1539,7 @@
1538
1539
  "decisionCorrect": false,
1539
1540
  "riskScore": 0.55,
1540
1541
  "passed": false,
1541
- "latencyMs": 0.304,
1542
+ "latencyMs": 0.309,
1542
1543
  "evidenceCount": 0,
1543
1544
  "evidenceIds": [],
1544
1545
  "recommendedActions": [
@@ -1566,7 +1567,7 @@
1566
1567
  "decisionCorrect": false,
1567
1568
  "riskScore": 0,
1568
1569
  "passed": false,
1569
- "latencyMs": 0.376,
1570
+ "latencyMs": 0.382,
1570
1571
  "evidenceCount": 0,
1571
1572
  "evidenceIds": [],
1572
1573
  "recommendedActions": [],
@@ -1593,11 +1594,11 @@
1593
1594
  "decisionCorrect": true,
1594
1595
  "riskScore": 0.85,
1595
1596
  "passed": true,
1596
- "latencyMs": 1.934,
1597
+ "latencyMs": 3.585,
1597
1598
  "evidenceCount": 2,
1598
1599
  "evidenceIds": [
1599
1600
  "recall:fts:recall.fts_lookup",
1600
- "01KSRXCPEXC1RDR4VFSV3ZV759"
1601
+ "01KSSZ0YN75S42CB2APNH97S32"
1601
1602
  ],
1602
1603
  "recommendedActions": [
1603
1604
  "Do not proceed until the high-severity memory warning is addressed.",
@@ -1620,7 +1621,7 @@
1620
1621
  "decisionCorrect": false,
1621
1622
  "riskScore": 0,
1622
1623
  "passed": false,
1623
- "latencyMs": 0.006,
1624
+ "latencyMs": 0.008,
1624
1625
  "evidenceCount": 0,
1625
1626
  "evidenceIds": [],
1626
1627
  "recommendedActions": [],
@@ -1640,10 +1641,10 @@
1640
1641
  "decisionCorrect": false,
1641
1642
  "riskScore": 0.35,
1642
1643
  "passed": false,
1643
- "latencyMs": 0.105,
1644
+ "latencyMs": 0.123,
1644
1645
  "evidenceCount": 1,
1645
1646
  "evidenceIds": [
1646
- "01KSRXCPFQ579DG3V402TKWYPM"
1647
+ "01KSSZ0YP099TTMQ40MRD6FAQ3"
1647
1648
  ],
1648
1649
  "recommendedActions": [
1649
1650
  "Treat retrieved memory as advisory context."
@@ -1664,10 +1665,10 @@
1664
1665
  "decisionCorrect": false,
1665
1666
  "riskScore": 0.35,
1666
1667
  "passed": false,
1667
- "latencyMs": 0.347,
1668
+ "latencyMs": 0.403,
1668
1669
  "evidenceCount": 1,
1669
1670
  "evidenceIds": [
1670
- "01KSRXCPG3Q8K0YSYA2SAVRPMM"
1671
+ "01KSSZ0YPBFF5602JAW36JN22Y"
1671
1672
  ],
1672
1673
  "recommendedActions": [
1673
1674
  "Treat retrieved memory as advisory context."
@@ -1688,7 +1689,7 @@
1688
1689
  "decisionCorrect": false,
1689
1690
  "riskScore": 0.55,
1690
1691
  "passed": false,
1691
- "latencyMs": 0.13,
1692
+ "latencyMs": 0.152,
1692
1693
  "evidenceCount": 0,
1693
1694
  "evidenceIds": [],
1694
1695
  "recommendedActions": [
@@ -1726,8 +1727,8 @@
1726
1727
  "latencyMs": 2.599,
1727
1728
  "evidenceCount": 2,
1728
1729
  "evidenceIds": [
1729
- "01KSRXCPGV1X3H49QBRCN72084",
1730
- "failure:Bash:2026-05-29T03:45:33.723Z"
1730
+ "01KSSZ0YQ26JKF6YQETHJEAWCE",
1731
+ "failure:Bash:2026-05-29T13:33:20.483Z"
1731
1732
  ],
1732
1733
  "recommendedActions": [
1733
1734
  "Do not repeat the exact failed action until the prior error is understood or the command is changed.",
@@ -1749,7 +1750,7 @@
1749
1750
  "decisionCorrect": false,
1750
1751
  "riskScore": 0,
1751
1752
  "passed": false,
1752
- "latencyMs": 0.005,
1753
+ "latencyMs": 0.007,
1753
1754
  "evidenceCount": 0,
1754
1755
  "evidenceIds": [],
1755
1756
  "recommendedActions": [],
@@ -1769,10 +1770,10 @@
1769
1770
  "decisionCorrect": false,
1770
1771
  "riskScore": 0.55,
1771
1772
  "passed": false,
1772
- "latencyMs": 0.062,
1773
+ "latencyMs": 0.072,
1773
1774
  "evidenceCount": 1,
1774
1775
  "evidenceIds": [
1775
- "01KSRXCPHPYBHWZKFJ5XCHY1X6"
1776
+ "01KSSZ0YQWMP8RBH8GCDJ01C38"
1776
1777
  ],
1777
1778
  "recommendedActions": [
1778
1779
  "Check the recent failed event before repeating a similar action."
@@ -1793,10 +1794,10 @@
1793
1794
  "decisionCorrect": false,
1794
1795
  "riskScore": 0.35,
1795
1796
  "passed": false,
1796
- "latencyMs": 0.396,
1797
+ "latencyMs": 0.427,
1797
1798
  "evidenceCount": 1,
1798
1799
  "evidenceIds": [
1799
- "01KSRXCPJ2NXZ1VNNKPQ5RH818"
1800
+ "01KSSZ0YR72GCV7F2M1A5JJ8R0"
1800
1801
  ],
1801
1802
  "recommendedActions": [
1802
1803
  "Treat retrieved memory as advisory context."
@@ -1817,7 +1818,7 @@
1817
1818
  "decisionCorrect": false,
1818
1819
  "riskScore": 0,
1819
1820
  "passed": false,
1820
- "latencyMs": 0.35,
1821
+ "latencyMs": 0.368,
1821
1822
  "evidenceCount": 0,
1822
1823
  "evidenceIds": [],
1823
1824
  "recommendedActions": [],
@@ -1844,11 +1845,11 @@
1844
1845
  "decisionCorrect": true,
1845
1846
  "riskScore": 0.85,
1846
1847
  "passed": true,
1847
- "latencyMs": 2.391,
1848
+ "latencyMs": 2.341,
1848
1849
  "evidenceCount": 2,
1849
1850
  "evidenceIds": [
1850
- "01KSRXCPJTXVN9X36WASHM2QY6",
1851
- "01KSRXCPJV1JQBFZ19K6H796AG"
1851
+ "01KSSZ0YRX53XKT4C23A12RFND",
1852
+ "01KSSZ0YRY86WACDBFSD3W0N04"
1852
1853
  ],
1853
1854
  "recommendedActions": [
1854
1855
  "Do not proceed until the high-severity memory warning is addressed.",
@@ -1870,7 +1871,7 @@
1870
1871
  "decisionCorrect": false,
1871
1872
  "riskScore": 0,
1872
1873
  "passed": false,
1873
- "latencyMs": 0.004,
1874
+ "latencyMs": 0.006,
1874
1875
  "evidenceCount": 0,
1875
1876
  "evidenceIds": [],
1876
1877
  "recommendedActions": [],
@@ -1890,11 +1891,11 @@
1890
1891
  "decisionCorrect": true,
1891
1892
  "riskScore": 0.85,
1892
1893
  "passed": true,
1893
- "latencyMs": 0.106,
1894
+ "latencyMs": 0.138,
1894
1895
  "evidenceCount": 2,
1895
1896
  "evidenceIds": [
1896
- "01KSRXCPKNY5BNX2TH3M407J48",
1897
- "01KSRXCPKMTBHPCWYJWJ3REV9J"
1897
+ "01KSSZ0YSQHY993KVPVAJWNZC7",
1898
+ "01KSSZ0YSP6P69G6SYWYSEJ27M"
1898
1899
  ],
1899
1900
  "recommendedActions": [
1900
1901
  "Review retrieved memory before acting."
@@ -1915,11 +1916,11 @@
1915
1916
  "decisionCorrect": true,
1916
1917
  "riskScore": 0.85,
1917
1918
  "passed": true,
1918
- "latencyMs": 0.4,
1919
+ "latencyMs": 0.395,
1919
1920
  "evidenceCount": 2,
1920
1921
  "evidenceIds": [
1921
- "01KSRXCPM08WEJAJ579D9KS053",
1922
- "01KSRXCPM1V6CVS8380AN3F39Y"
1922
+ "01KSSZ0YT1VBYF3GHSXNS8GR7Y",
1923
+ "01KSSZ0YT2Q1MWN007FYJB8V6N"
1923
1924
  ],
1924
1925
  "recommendedActions": [
1925
1926
  "Review retrieved memory before acting."
@@ -1940,7 +1941,7 @@
1940
1941
  "decisionCorrect": false,
1941
1942
  "riskScore": 0,
1942
1943
  "passed": false,
1943
- "latencyMs": 0.378,
1944
+ "latencyMs": 0.338,
1944
1945
  "evidenceCount": 0,
1945
1946
  "evidenceIds": [],
1946
1947
  "recommendedActions": [],
@@ -1967,28 +1968,29 @@
1967
1968
  "decisionCorrect": true,
1968
1969
  "riskScore": 0.85,
1969
1970
  "passed": true,
1970
- "latencyMs": 21.17,
1971
- "evidenceCount": 12,
1971
+ "latencyMs": 28.181,
1972
+ "evidenceCount": 13,
1972
1973
  "evidenceIds": [
1973
- "01KSRXCQ4DK284E35ZKNYDXWBQ",
1974
- "01KSRXCQ3H1TVR4E552DQVV9MG",
1975
- "01KSRXCQ39QKSSNZWEFZBHMNT9",
1976
- "01KSRXCPYP6VKM8AZC7KZ4SN6W",
1977
- "01KSRXCPVWWCPWE3M38G6VM1BG",
1978
- "01KSRXCPV7YMCBAT0602VZ3DQG",
1979
- "01KSRXCPTM8GHZXKXNKH5FMRG6",
1980
- "01KSRXCPSNJTZHJK1MWE6WNNYW",
1981
- "01KSRXCPS3K2GR6MFXTMTDEKD3",
1982
- "01KSRXCPS3K2GR6MFXTMTDEKD2",
1983
- "01KSRXCPRGC2EN41NQD4MYJ1Q1",
1984
- "01KSRXCPNG135506TFPF1WMAVB"
1974
+ "01KSSZ0Z92H6MV9SGY92Q4JAF3",
1975
+ "01KSSZ0Z8W4Z9J8GT3CK8FC6VS",
1976
+ "01KSSZ0Z7B5ZVWXVZKT4VW8A34",
1977
+ "01KSSZ0Z7A1AFDK0520E1RH72M",
1978
+ "01KSSZ0Z4Y150G4Q3STE6WJBD8",
1979
+ "01KSSZ0Z4WMY0QV4VGJD4CCG1B",
1980
+ "01KSSZ0Z48NTHT9ZB0PM7ESE4Q",
1981
+ "01KSSZ0Z44J6MF93GDY4510PKC",
1982
+ "01KSSZ0Z22A7QWQ2KEPKXQM7FK",
1983
+ "01KSSZ0Z21X7YP9FJCGMZ61E7X",
1984
+ "01KSSZ0YYRG8809DR6R80GG89T",
1985
+ "01KSSZ0YYKFZYZKPGJ8TJG319N",
1986
+ "01KSSZ0YY3A2N08TE8S3VQ7GJE"
1985
1987
  ],
1986
1988
  "recommendedActions": [
1987
1989
  "Do not proceed until the high-severity memory warning is addressed.",
1988
1990
  "Apply this must-follow rule before acting.",
1989
1991
  "Treat this as uncertain context and verify before relying on it."
1990
1992
  ],
1991
- "summary": "Blocked: 12 memory signals, 1 high severity, 11 medium severity found before acting.",
1993
+ "summary": "Blocked: 13 memory signals, 1 high severity, 12 medium severity found before acting.",
1992
1994
  "recallErrors": [],
1993
1995
  "leakedSecrets": [],
1994
1996
  "hasEvidenceForDecision": true,
@@ -2024,10 +2026,10 @@
2024
2026
  "decisionCorrect": true,
2025
2027
  "riskScore": 0.85,
2026
2028
  "passed": true,
2027
- "latencyMs": 0.421,
2029
+ "latencyMs": 0.434,
2028
2030
  "evidenceCount": 1,
2029
2031
  "evidenceIds": [
2030
- "01KSRXCR6FARVQ7ATWYWC5QAF9"
2032
+ "01KSSZ109E1PA94YM4KMJQTD1D"
2031
2033
  ],
2032
2034
  "recommendedActions": [
2033
2035
  "Review retrieved memory before acting."
@@ -2048,14 +2050,14 @@
2048
2050
  "decisionCorrect": false,
2049
2051
  "riskScore": 0.35,
2050
2052
  "passed": false,
2051
- "latencyMs": 1.551,
2053
+ "latencyMs": 1.356,
2052
2054
  "evidenceCount": 5,
2053
2055
  "evidenceIds": [
2054
- "01KSRXCRC5YBFBKT1RM4SPXRZZ",
2055
- "01KSRXCR9R09K2J5HM1BGN1PSW",
2056
- "01KSRXCRJKF9PWQG7YRGGK1TP6",
2057
- "01KSRXCRGTW1V1VGWWT869D36Q",
2058
- "01KSRXCRP3KVXPGD7WMNS3KWKF"
2056
+ "01KSSZ10NJ12JB0518EN4MBBS2",
2057
+ "01KSSZ10J6YEQ8YYRKRQEMTXC7",
2058
+ "01KSSZ10EKE4AFRXSMK5VJYNXZ",
2059
+ "01KSSZ10CECQDV0CD902G694Z5",
2060
+ "01KSSZ10EBZBKXSER4HB4T0WGK"
2059
2061
  ],
2060
2062
  "recommendedActions": [
2061
2063
  "Treat retrieved memory as advisory context."
@@ -2076,7 +2078,7 @@
2076
2078
  "decisionCorrect": false,
2077
2079
  "riskScore": 0,
2078
2080
  "passed": false,
2079
- "latencyMs": 0.717,
2081
+ "latencyMs": 0.633,
2080
2082
  "evidenceCount": 0,
2081
2083
  "evidenceIds": [],
2082
2084
  "recommendedActions": [],
@@ -2091,7 +2093,7 @@
2091
2093
  }
2092
2094
  ],
2093
2095
  "artifactRedactionSweep": {
2094
- "checkedAt": "2026-05-29T03:45:36.646Z",
2096
+ "checkedAt": "2026-05-29T13:33:23.214Z",
2095
2097
  "filesChecked": [
2096
2098
  "benchmarks/output/guardbench-manifest.json",
2097
2099
  "benchmarks/output/guardbench-raw.json",