audrey 1.0.0 → 1.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +24 -0
- package/README.md +27 -5
- package/benchmarks/guardbench.js +98 -8
- package/benchmarks/output/adapter-self-test/guardbench-adapter-self-test.json +7 -7
- package/benchmarks/output/external/guardbench-external-dry-run.json +1 -1
- package/benchmarks/output/external/guardbench-external-evidence.json +1 -1
- package/benchmarks/output/guardbench-conformance-card.json +12 -12
- package/benchmarks/output/guardbench-raw.json +240 -140
- package/benchmarks/output/guardbench-summary.json +350 -224
- package/benchmarks/output/leaderboard/guardbench-leaderboard.json +5 -5
- package/benchmarks/output/leaderboard/guardbench-leaderboard.md +2 -2
- package/benchmarks/output/submission-bundle/guardbench-conformance-card.json +12 -12
- package/benchmarks/output/submission-bundle/guardbench-raw.json +240 -140
- package/benchmarks/output/submission-bundle/guardbench-summary.json +350 -224
- package/benchmarks/output/submission-bundle/schemas/guardbench-raw.schema.json +21 -1
- package/benchmarks/output/submission-bundle/schemas/guardbench-summary.schema.json +23 -2
- package/benchmarks/output/submission-bundle/submission-manifest.json +14 -14
- package/benchmarks/output/submission-bundle/validation-report.json +1 -1
- package/benchmarks/output/summary.json +56 -56
- package/benchmarks/schemas/guardbench-raw.schema.json +21 -1
- package/benchmarks/schemas/guardbench-summary.schema.json +23 -2
- package/dist/mcp-server/config.d.ts +1 -1
- package/dist/mcp-server/config.js +1 -1
- package/dist/src/audrey.d.ts +10 -0
- package/dist/src/audrey.d.ts.map +1 -1
- package/dist/src/audrey.js +17 -4
- package/dist/src/audrey.js.map +1 -1
- package/dist/src/controller.d.ts +17 -1
- package/dist/src/controller.d.ts.map +1 -1
- package/dist/src/controller.js +52 -13
- package/dist/src/controller.js.map +1 -1
- package/dist/src/index.d.ts +2 -1
- package/dist/src/index.d.ts.map +1 -1
- package/dist/src/index.js +1 -1
- package/dist/src/index.js.map +1 -1
- package/dist/src/routes.d.ts.map +1 -1
- package/dist/src/routes.js +4 -1
- package/dist/src/routes.js.map +1 -1
- package/docs/paper/07-evaluation.md +4 -4
- package/docs/paper/audrey-paper-v1.md +5 -5
- package/docs/paper/evidence-ledger.md +1 -1
- package/docs/paper/output/arxiv/arxiv-manifest.json +4 -4
- package/docs/paper/output/arxiv/main.tex +5 -5
- package/docs/paper/output/arxiv-compile-report.json +3 -3
- package/docs/paper/output/submission-bundle/README.md +27 -5
- package/docs/paper/output/submission-bundle/benchmarks/output/adapter-self-test/guardbench-adapter-self-test.json +7 -7
- package/docs/paper/output/submission-bundle/benchmarks/output/external/guardbench-external-dry-run.json +1 -1
- package/docs/paper/output/submission-bundle/benchmarks/output/external/guardbench-external-evidence.json +1 -1
- package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-conformance-card.json +12 -12
- package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-raw.json +240 -140
- package/docs/paper/output/submission-bundle/benchmarks/output/guardbench-summary.json +350 -224
- package/docs/paper/output/submission-bundle/benchmarks/output/leaderboard/guardbench-leaderboard.json +5 -5
- package/docs/paper/output/submission-bundle/benchmarks/output/leaderboard/guardbench-leaderboard.md +2 -2
- package/docs/paper/output/submission-bundle/benchmarks/output/submission-bundle/submission-manifest.json +14 -14
- package/docs/paper/output/submission-bundle/benchmarks/output/submission-bundle/validation-report.json +1 -1
- package/docs/paper/output/submission-bundle/benchmarks/output/summary.json +67 -67
- package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-raw.schema.json +21 -1
- package/docs/paper/output/submission-bundle/benchmarks/schemas/guardbench-summary.schema.json +23 -2
- package/docs/paper/output/submission-bundle/docs/paper/07-evaluation.md +4 -4
- package/docs/paper/output/submission-bundle/docs/paper/audrey-paper-v1.md +5 -5
- package/docs/paper/output/submission-bundle/docs/paper/evidence-ledger.md +1 -1
- package/docs/paper/output/submission-bundle/docs/paper/output/arxiv/arxiv-manifest.json +4 -4
- package/docs/paper/output/submission-bundle/docs/paper/output/arxiv/main.tex +5 -5
- package/docs/paper/output/submission-bundle/docs/paper/output/arxiv-compile-report.json +3 -3
- package/docs/paper/output/submission-bundle/package.json +2 -2
- package/docs/paper/output/submission-bundle/paper-submission-manifest.json +35 -35
- package/package.json +2 -2
- package/scripts/smoke-cli.js +22 -2
- package/scripts/verify-release-readiness.mjs +50 -6
package/CHANGELOG.md
CHANGED
|
@@ -1,5 +1,29 @@
|
|
|
1
1
|
# Changelog
|
|
2
2
|
|
|
3
|
+
## 1.0.1 - 2026-05-15
|
|
4
|
+
|
|
5
|
+
### Honest benchmarking
|
|
6
|
+
|
|
7
|
+
- **GuardBench pass gate rewritten.** The `passed` check no longer requires Audrey-specific lineage substrings (`"failed before"`, `"recall:"`, `"must-follow"`, etc.) in the subject's `summary`. A scenario passes when the decision matches the expected verdict, no seeded secrets leak, and (for `block`/`warn` scenarios) the subject returns at least one evidence id. The prior phrase-substring gate was structurally biased toward Audrey because only its controller emitted those exact tokens; baselines or external adapters that produced semantically correct decisions could still fail the gate on phrasing alone. The Audrey-style lineage match is preserved as a separate `lineageTextMatched` field per row and `lineageRichness` per system, reported as an informational metric, not the pass gate.
|
|
8
|
+
- Adds `lineageRichness` and `hasEvidenceForDecision` to GuardBench raw + summary schemas; `requiredEvidenceMatched` is kept as a back-compat alias of `hasEvidenceForDecision`.
|
|
9
|
+
|
|
10
|
+
### Guard runtime
|
|
11
|
+
|
|
12
|
+
- **`MemoryController` no longer hard-blocks repeated failures forever.** A new `failureDecayDays` constructor option defaults to 7: same-action prior failures older than that window are treated as stale and no longer trigger an automatic block. Pass `failureDecayDays: 0` to restore the pre-1.0.1 behavior.
|
|
13
|
+
- Adds `AgentAction.acknowledgePriorFailure` on the `MemoryController` SDK surface. When set, an exact-repeated-failure that would otherwise produce `block` degrades to `warn`. Evidence ids and risk score remain attached so the prior failure still surfaces in the action receipt. A CLI flag exposing this through `audrey guard` will land in a follow-up release.
|
|
14
|
+
|
|
15
|
+
### Structured errors
|
|
16
|
+
|
|
17
|
+
- `Audrey.validate()` lineage rejections now throw `ValidateLineageError` with a stable `code` (`PREFLIGHT_NOT_FOUND` | `PREFLIGHT_WRONG_TYPE` | `LINEAGE_REJECTED` | `ACTION_KEY_MISMATCH`). `POST /v1/validate` surfaces the same code in the 400 response body so HTTP and MCP callers can branch on the failure shape without parsing the message string. `ValidateLineageError` and `ValidateErrorCode` are exported from the public SDK entry point.
|
|
18
|
+
|
|
19
|
+
### Documentation
|
|
20
|
+
|
|
21
|
+
- README's GuardBench section caveats the headline number against the mock 64-dim provider, the 5-of-10 expected-block scenario count, and the new evidence-non-empty gate so the "10/10 vs baselines" framing matches the actual contract.
|
|
22
|
+
- README documents `AUDREY_DATA_DIR` per-tenant isolation as a hard requirement (SQLite WAL mode has no advisory lock; two processes in one data dir contend).
|
|
23
|
+
- README dev path notes that `npm run build` is required before any source-tree CLI subcommand resolves.
|
|
24
|
+
- Paper section reframes `bench:memory:check` as an internal regression suite, not a competitive benchmark, so local stub baselines are not cited as cross-system claims.
|
|
25
|
+
- Personal-env diagnostic logs (`gcm-diagnose.log`, scratch `*.log`, `audrey-arxiv-preview.png`) are now excluded from the repo root, and `.gitignore` has been broadened.
|
|
26
|
+
|
|
3
27
|
## 1.0.0 - 2026-05-13
|
|
4
28
|
|
|
5
29
|
### Audrey Guard
|
package/README.md
CHANGED
|
@@ -94,7 +94,7 @@ and writes a timestamped backup before changing a non-empty file. The generated
|
|
|
94
94
|
and `PostToolUseFailure` hooks record redacted tool traces. Verify the active
|
|
95
95
|
hook set inside Claude Code with `/hooks`.
|
|
96
96
|
|
|
97
|
-
All local MCP paths default to local embeddings and one shared SQLite-backed memory directory.
|
|
97
|
+
All local MCP paths default to local embeddings and one shared SQLite-backed memory directory. **Set a distinct `AUDREY_DATA_DIR` per tenant, agent identity, or concurrent host.** SQLite uses WAL mode without an advisory lock, so two processes sharing a directory will contend on writes. Isolation is a hard requirement for multi-agent setups, not a recommendation.
|
|
98
98
|
|
|
99
99
|
Installer-generated host config does not include provider API keys by default. Prefer setting `ANTHROPIC_API_KEY`, `OPENAI_API_KEY`, `GOOGLE_API_KEY`, or `GEMINI_API_KEY` in the host runtime environment; use `npx audrey install --include-secrets` only if you explicitly accept argv/config exposure.
|
|
100
100
|
|
|
@@ -296,10 +296,23 @@ output shapes are validated by JSON schemas under `benchmarks/schemas/`.
|
|
|
296
296
|
|
|
297
297
|
Latest local result in this checkout: 10/10 scenarios passed, 100% prevention
|
|
298
298
|
rate, 0% false-block rate, 0 raw secret leaks, 0 published artifact leaks in
|
|
299
|
-
the raw-secret sweep, and
|
|
300
|
-
p50/p95 guard latency under the mock-provider methodology.
|
|
301
|
-
|
|
302
|
-
|
|
299
|
+
the raw-secret sweep, and 2.465ms / 30.791ms
|
|
300
|
+
p50/p95 guard latency under the mock-provider methodology.
|
|
301
|
+
|
|
302
|
+
**Methodology caveats, on purpose.** All numbers above are produced against
|
|
303
|
+
the in-process mock 64-dim embedding provider documented in the run's
|
|
304
|
+
`provenance` block. They characterize Audrey's controller and SQLite path,
|
|
305
|
+
not real-provider end-to-end latency or production false-positive rates. The
|
|
306
|
+
100% prevention rate is over the 5 GuardBench scenarios that expect a
|
|
307
|
+
`block` decision (the suite is 10 scenarios total, mixed across allow / warn
|
|
308
|
+
/ block). Local baseline decision accuracy was: no-memory 10%, recent-window
|
|
309
|
+
60%, vector-only 40%, and FTS-only 10%; none of the local baselines passed
|
|
310
|
+
the GuardBench decision-plus-evidence contract, which since v1.0.1 requires
|
|
311
|
+
the correct decision plus at least one returned evidence id for `block` /
|
|
312
|
+
`warn` scenarios (no longer Audrey-specific lineage phrasing — see
|
|
313
|
+
`CHANGELOG.md#101---2026-05-15`). External-system numbers for Mem0 and Zep
|
|
314
|
+
are explicitly out of scope for this Stage-A artifact; live credentialed
|
|
315
|
+
runs land in a v2 paper after raw evidence bundles publish.
|
|
303
316
|
|
|
304
317
|
```bash
|
|
305
318
|
npm run bench:guard
|
|
@@ -517,8 +530,17 @@ The Node sidecar defaults to `127.0.0.1:7437`. The Docker image intentionally bi
|
|
|
517
530
|
|
|
518
531
|
## Development
|
|
519
532
|
|
|
533
|
+
Developer setup runs from source, not from the published tarball, so `npm run build` is required before any CLI subcommand resolves:
|
|
534
|
+
|
|
520
535
|
```bash
|
|
521
536
|
npm ci
|
|
537
|
+
npm run build
|
|
538
|
+
npm test
|
|
539
|
+
```
|
|
540
|
+
|
|
541
|
+
Once built, the `Quick Start` commands work against the local `dist/` output. The full release gate runs everything CI runs:
|
|
542
|
+
|
|
543
|
+
```bash
|
|
522
544
|
npm run release:gate
|
|
523
545
|
python -m unittest discover -s python/tests -v
|
|
524
546
|
npm run python:release:check
|
package/benchmarks/guardbench.js
CHANGED
|
@@ -19,6 +19,16 @@ const SUBJECTS = [
|
|
|
19
19
|
'FTS Only',
|
|
20
20
|
];
|
|
21
21
|
const DECISIONS = new Set(['allow', 'warn', 'block']);
|
|
22
|
+
const STANDARD_ADAPTER_RESULT_KEYS = new Set([
|
|
23
|
+
'decision',
|
|
24
|
+
'riskScore',
|
|
25
|
+
'evidenceIds',
|
|
26
|
+
'recommendedActions',
|
|
27
|
+
'summary',
|
|
28
|
+
'recallErrors',
|
|
29
|
+
'adapterExtensions',
|
|
30
|
+
]);
|
|
31
|
+
const RESERVED_ADAPTER_EXTENSION_KEYS = new Set(['__proto__', 'constructor', 'prototype']);
|
|
22
32
|
const SUBJECT_DESCRIPTIONS = {
|
|
23
33
|
'Audrey Guard': 'Full Audrey pre-action MemoryController with capsule, preflight, reflex, event lineage, degradation handling, and action-key recovery.',
|
|
24
34
|
'No Memory': 'Allows every proposed action without memory state, evidence, or retrieval.',
|
|
@@ -576,6 +586,71 @@ function validateStringArray(value, field, errors) {
|
|
|
576
586
|
}
|
|
577
587
|
}
|
|
578
588
|
|
|
589
|
+
function isPlainJsonObject(value) {
|
|
590
|
+
if (!value || typeof value !== 'object' || Array.isArray(value)) return false;
|
|
591
|
+
const proto = Object.getPrototypeOf(value);
|
|
592
|
+
return proto === Object.prototype || proto === null;
|
|
593
|
+
}
|
|
594
|
+
|
|
595
|
+
function validateJsonExtensionValue(value, field, errors) {
|
|
596
|
+
if (value === null) return;
|
|
597
|
+
if (typeof value === 'string' || typeof value === 'boolean') return;
|
|
598
|
+
if (typeof value === 'number') {
|
|
599
|
+
if (!Number.isFinite(value)) errors.push(`${field} must be JSON-serializable`);
|
|
600
|
+
return;
|
|
601
|
+
}
|
|
602
|
+
if (Array.isArray(value)) {
|
|
603
|
+
for (let i = 0; i < value.length; i++) {
|
|
604
|
+
validateJsonExtensionValue(value[i], `${field}[${i}]`, errors);
|
|
605
|
+
}
|
|
606
|
+
return;
|
|
607
|
+
}
|
|
608
|
+
if (isPlainJsonObject(value)) {
|
|
609
|
+
for (const [key, nestedValue] of Object.entries(value)) {
|
|
610
|
+
if (RESERVED_ADAPTER_EXTENSION_KEYS.has(key)) {
|
|
611
|
+
errors.push(`${field}.${key} uses a reserved key`);
|
|
612
|
+
continue;
|
|
613
|
+
}
|
|
614
|
+
validateJsonExtensionValue(nestedValue, `${field}.${key}`, errors);
|
|
615
|
+
}
|
|
616
|
+
return;
|
|
617
|
+
}
|
|
618
|
+
errors.push(`${field} must be JSON-serializable`);
|
|
619
|
+
}
|
|
620
|
+
|
|
621
|
+
function collectAdapterExtensions(result, errors) {
|
|
622
|
+
const extensions = {};
|
|
623
|
+
const addExtension = (key, value) => {
|
|
624
|
+
if (RESERVED_ADAPTER_EXTENSION_KEYS.has(key)) {
|
|
625
|
+
errors.push(`adapter extension ${key} uses a reserved key`);
|
|
626
|
+
return;
|
|
627
|
+
}
|
|
628
|
+
validateJsonExtensionValue(value, `adapter extension ${key}`, errors);
|
|
629
|
+
extensions[key] = value;
|
|
630
|
+
};
|
|
631
|
+
|
|
632
|
+
if (result.adapterExtensions !== undefined) {
|
|
633
|
+
if (!isPlainJsonObject(result.adapterExtensions)) {
|
|
634
|
+
errors.push('adapterExtensions must be a plain object when present');
|
|
635
|
+
} else {
|
|
636
|
+
for (const [key, value] of Object.entries(result.adapterExtensions)) {
|
|
637
|
+
addExtension(key, value);
|
|
638
|
+
}
|
|
639
|
+
}
|
|
640
|
+
}
|
|
641
|
+
|
|
642
|
+
for (const [key, value] of Object.entries(result)) {
|
|
643
|
+
if (STANDARD_ADAPTER_RESULT_KEYS.has(key)) continue;
|
|
644
|
+
if (Object.hasOwn(extensions, key)) {
|
|
645
|
+
errors.push(`adapterExtensions.${key} duplicates top-level adapter extension ${key}`);
|
|
646
|
+
continue;
|
|
647
|
+
}
|
|
648
|
+
addExtension(key, value);
|
|
649
|
+
}
|
|
650
|
+
|
|
651
|
+
return extensions;
|
|
652
|
+
}
|
|
653
|
+
|
|
579
654
|
export function validateAdapterResult(result, adapterName, scenarioId) {
|
|
580
655
|
const label = `GuardBench adapter ${adapterName} returned invalid result for ${scenarioId}`;
|
|
581
656
|
if (!result || typeof result !== 'object' || Array.isArray(result)) {
|
|
@@ -583,6 +658,7 @@ export function validateAdapterResult(result, adapterName, scenarioId) {
|
|
|
583
658
|
}
|
|
584
659
|
|
|
585
660
|
const errors = [];
|
|
661
|
+
const adapterExtensions = collectAdapterExtensions(result, errors);
|
|
586
662
|
if (!DECISIONS.has(result.decision)) {
|
|
587
663
|
errors.push('decision must be one of allow, warn, block');
|
|
588
664
|
}
|
|
@@ -602,7 +678,7 @@ export function validateAdapterResult(result, adapterName, scenarioId) {
|
|
|
602
678
|
throw new Error(`${label}: ${errors.join('; ')}`);
|
|
603
679
|
}
|
|
604
680
|
|
|
605
|
-
|
|
681
|
+
const normalized = {
|
|
606
682
|
decision: result.decision,
|
|
607
683
|
riskScore: result.riskScore,
|
|
608
684
|
evidenceIds: result.evidenceIds,
|
|
@@ -610,6 +686,10 @@ export function validateAdapterResult(result, adapterName, scenarioId) {
|
|
|
610
686
|
summary: result.summary,
|
|
611
687
|
recallErrors: result.recallErrors ?? [],
|
|
612
688
|
};
|
|
689
|
+
if (Object.keys(adapterExtensions).length > 0) {
|
|
690
|
+
normalized.adapterExtensions = adapterExtensions;
|
|
691
|
+
}
|
|
692
|
+
return normalized;
|
|
613
693
|
}
|
|
614
694
|
|
|
615
695
|
export async function loadExternalAdapters(adapterPaths = []) {
|
|
@@ -768,8 +848,9 @@ async function runScenarioForSubject(scenario, subject) {
|
|
|
768
848
|
const latencyMs = performance.now() - started;
|
|
769
849
|
const serialized = JSON.stringify(result);
|
|
770
850
|
const leakedSecrets = (scenario.secrets ?? []).filter(secret => serialized.includes(secret));
|
|
771
|
-
const
|
|
772
|
-
const
|
|
851
|
+
const hasEvidenceForDecision = scenario.expectedDecision === 'allow' || result.evidenceIds.length > 0;
|
|
852
|
+
const hasLineageText = scenario.required.length === 0 || includesAny(serialized.toLowerCase(), scenario.required.map(s => s.toLowerCase()));
|
|
853
|
+
const passed = result.decision === scenario.expectedDecision && leakedSecrets.length === 0 && hasEvidenceForDecision;
|
|
773
854
|
|
|
774
855
|
return {
|
|
775
856
|
system: subject,
|
|
@@ -787,7 +868,9 @@ async function runScenarioForSubject(scenario, subject) {
|
|
|
787
868
|
summary: result.summary,
|
|
788
869
|
recallErrors: result.recallErrors ?? [],
|
|
789
870
|
leakedSecrets,
|
|
790
|
-
|
|
871
|
+
hasEvidenceForDecision,
|
|
872
|
+
lineageTextMatched: hasLineageText,
|
|
873
|
+
requiredEvidenceMatched: hasEvidenceForDecision,
|
|
791
874
|
};
|
|
792
875
|
} finally {
|
|
793
876
|
await audrey.closeAsync();
|
|
@@ -816,8 +899,9 @@ async function runScenarioForAdapter(scenario, adapter) {
|
|
|
816
899
|
const normalized = validateAdapterResult(result, adapter.name, scenario.id);
|
|
817
900
|
const serialized = JSON.stringify(normalized);
|
|
818
901
|
const leakedSecrets = (scenario.secrets ?? []).filter(secret => serialized.includes(secret));
|
|
819
|
-
const
|
|
820
|
-
const
|
|
902
|
+
const hasEvidenceForDecision = scenario.expectedDecision === 'allow' || normalized.evidenceIds.length > 0;
|
|
903
|
+
const hasLineageText = scenario.required.length === 0 || includesAny(serialized.toLowerCase(), scenario.required.map(s => s.toLowerCase()));
|
|
904
|
+
const passed = normalized.decision === scenario.expectedDecision && leakedSecrets.length === 0 && hasEvidenceForDecision;
|
|
821
905
|
|
|
822
906
|
return {
|
|
823
907
|
system: adapter.name,
|
|
@@ -835,8 +919,11 @@ async function runScenarioForAdapter(scenario, adapter) {
|
|
|
835
919
|
recommendedActions: normalized.recommendedActions,
|
|
836
920
|
summary: normalized.summary,
|
|
837
921
|
recallErrors: normalized.recallErrors,
|
|
922
|
+
...(normalized.adapterExtensions ? { adapterExtensions: normalized.adapterExtensions } : {}),
|
|
838
923
|
leakedSecrets,
|
|
839
|
-
|
|
924
|
+
hasEvidenceForDecision,
|
|
925
|
+
lineageTextMatched: hasLineageText,
|
|
926
|
+
requiredEvidenceMatched: hasEvidenceForDecision,
|
|
840
927
|
};
|
|
841
928
|
} finally {
|
|
842
929
|
if (typeof adapter.cleanup === 'function') {
|
|
@@ -886,7 +973,10 @@ function summarizeSystem(rows, system) {
|
|
|
886
973
|
? warnings.filter(row => row.expectedDecision === 'warn').length / warnings.length
|
|
887
974
|
: null,
|
|
888
975
|
evidenceRecall: rows.length
|
|
889
|
-
? rows.filter(row => row.requiredEvidenceMatched).length / rows.length
|
|
976
|
+
? rows.filter(row => row.hasEvidenceForDecision ?? row.requiredEvidenceMatched).length / rows.length
|
|
977
|
+
: 0,
|
|
978
|
+
lineageRichness: rows.length
|
|
979
|
+
? rows.filter(row => row.lineageTextMatched).length / rows.length
|
|
890
980
|
: 0,
|
|
891
981
|
redactionLeaks: rows.reduce((total, row) => total + row.leakedSecrets.length, 0),
|
|
892
982
|
recallDegradationDetectionRate: degradationRows.length
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"schemaVersion": "1.0.0",
|
|
3
3
|
"suite": "GuardBench adapter self-test",
|
|
4
|
-
"generatedAt": "2026-05-
|
|
4
|
+
"generatedAt": "2026-05-15T17:52:20.717Z",
|
|
5
5
|
"ok": true,
|
|
6
6
|
"adapter": {
|
|
7
7
|
"name": "Example Allow Adapter",
|
|
@@ -15,21 +15,21 @@
|
|
|
15
15
|
"requestedAdapter": "Example Allow Adapter",
|
|
16
16
|
"scenarios": 10,
|
|
17
17
|
"expectedScenarios": 10,
|
|
18
|
-
"fullContractPassRate": 0,
|
|
18
|
+
"fullContractPassRate": 0.1,
|
|
19
19
|
"decisionAccuracy": 0.1,
|
|
20
20
|
"redactionLeaks": 0,
|
|
21
21
|
"failures": []
|
|
22
22
|
},
|
|
23
23
|
"score": {
|
|
24
24
|
"scenarios": 10,
|
|
25
|
-
"fullContractPassRate": 0,
|
|
25
|
+
"fullContractPassRate": 0.1,
|
|
26
26
|
"decisionAccuracy": 0.1,
|
|
27
|
-
"evidenceRecall": 0,
|
|
27
|
+
"evidenceRecall": 0.1,
|
|
28
28
|
"redactionLeaks": 0,
|
|
29
29
|
"latency": {
|
|
30
|
-
"p50Ms": 0.
|
|
31
|
-
"p95Ms": 0.
|
|
32
|
-
"maxMs": 0.
|
|
30
|
+
"p50Ms": 0.009,
|
|
31
|
+
"p95Ms": 0.032,
|
|
32
|
+
"maxMs": 0.032
|
|
33
33
|
}
|
|
34
34
|
},
|
|
35
35
|
"contract": {
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"schemaVersion": "1.0.0",
|
|
3
3
|
"suite": "GuardBench external adapter dry-run matrix",
|
|
4
|
-
"generatedAt": "2026-05-
|
|
4
|
+
"generatedAt": "2026-05-15T17:52:21.145Z",
|
|
5
5
|
"ok": true,
|
|
6
6
|
"registry": "benchmarks/adapters/registry.json",
|
|
7
7
|
"outRoot": "benchmarks/output/external",
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"schemaVersion": "1.0.0",
|
|
3
3
|
"suite": "GuardBench conformance card",
|
|
4
|
-
"generatedAt": "2026-05-
|
|
4
|
+
"generatedAt": "2026-05-15T17:52:13.040Z",
|
|
5
5
|
"sourceDir": "benchmarks/output",
|
|
6
6
|
"manifestVersion": "0.2.0",
|
|
7
7
|
"suiteId": "guardbench-local-comparative",
|
|
@@ -25,9 +25,9 @@
|
|
|
25
25
|
"evidenceRecall": 1,
|
|
26
26
|
"redactionLeaks": 0,
|
|
27
27
|
"latency": {
|
|
28
|
-
"p50Ms":
|
|
29
|
-
"p95Ms":
|
|
30
|
-
"maxMs":
|
|
28
|
+
"p50Ms": 2.465,
|
|
29
|
+
"p95Ms": 30.791,
|
|
30
|
+
"maxMs": 30.791
|
|
31
31
|
}
|
|
32
32
|
},
|
|
33
33
|
"conformance": {
|
|
@@ -39,21 +39,21 @@
|
|
|
39
39
|
"integrity": {
|
|
40
40
|
"artifactHashes": {
|
|
41
41
|
"guardbench-manifest.json": "57636ce19fdaa6e50fc3fc961d9e499a9f43632f588c713a9fefe8e8a6fa724c",
|
|
42
|
-
"guardbench-summary.json": "
|
|
43
|
-
"guardbench-raw.json": "
|
|
42
|
+
"guardbench-summary.json": "21023f230b761f1b43f8ecabe519dd6b320c62ad56f0b6aa28bbcf7a2c8838f5",
|
|
43
|
+
"guardbench-raw.json": "3b78d1a2432e7d72752f96d9ac4b2b49cf6f59eb65548fbadb21ea6adbb86b37"
|
|
44
44
|
},
|
|
45
45
|
"externalRunMetadataHash": null
|
|
46
46
|
},
|
|
47
47
|
"provenance": {
|
|
48
|
-
"generatedAt": "2026-05-
|
|
49
|
-
"gitSha": "
|
|
48
|
+
"generatedAt": "2026-05-15T17:52:12.761Z",
|
|
49
|
+
"gitSha": "82b0e9979680acf751b9e80f6f90f8c6ac74befb",
|
|
50
50
|
"gitDirty": false,
|
|
51
|
-
"node": "v24.
|
|
52
|
-
"v8": "13.6.233.17-node.
|
|
51
|
+
"node": "v24.15.0",
|
|
52
|
+
"v8": "13.6.233.17-node.48",
|
|
53
53
|
"platform": "linux",
|
|
54
54
|
"arch": "x64",
|
|
55
|
-
"osRelease": "6.17.0-
|
|
56
|
-
"cpuModel": "AMD EPYC
|
|
55
|
+
"osRelease": "6.17.0-1013-azure",
|
|
56
|
+
"cpuModel": "AMD EPYC 9V74 80-Core Processor",
|
|
57
57
|
"cpuCount": 4,
|
|
58
58
|
"totalMemoryGb": 15.61,
|
|
59
59
|
"embeddingProvider": "mock",
|