@ryuenn3123/agentic-senior-core 2.0.5 → 2.0.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.agent-context/blueprints/mobile-app.md +91 -21
- package/.agent-context/profiles/platform.md +13 -13
- package/.agent-context/profiles/regulated.md +13 -13
- package/.agent-context/profiles/startup.md +13 -13
- package/.agent-context/review-checklists/frontend-skill-parity.md +28 -28
- package/.agent-context/review-checklists/frontend-usability.md +33 -33
- package/.agent-context/review-checklists/release-operations.md +29 -29
- package/.agent-context/skills/README.md +62 -62
- package/.agent-context/skills/backend/README.md +67 -67
- package/.agent-context/skills/backend/architecture.md +360 -360
- package/.agent-context/skills/backend/compatibility-manifest.json +8 -8
- package/.agent-context/skills/backend/data-access.md +230 -230
- package/.agent-context/skills/backend/errors.md +137 -137
- package/.agent-context/skills/backend/validation.md +116 -116
- package/.agent-context/skills/backend.md +28 -28
- package/.agent-context/skills/cli/README.md +55 -49
- package/.agent-context/skills/cli/compatibility-manifest.json +8 -8
- package/.agent-context/skills/cli/init.md +37 -37
- package/.agent-context/skills/cli/output.md +35 -35
- package/.agent-context/skills/cli/safety-telemetry.md +39 -0
- package/.agent-context/skills/cli/upgrade.md +37 -37
- package/.agent-context/skills/cli.md +31 -28
- package/.agent-context/skills/distribution/.evidence/compatibility-manifest.json +9 -0
- package/.agent-context/skills/distribution/.evidence/sbom-excerpt.json +6 -0
- package/.agent-context/skills/distribution/.evidence/test-report.json +8 -0
- package/.agent-context/skills/distribution/CHANGELOG.md +7 -0
- package/.agent-context/skills/distribution/README.md +27 -19
- package/.agent-context/skills/distribution/compatibility-manifest.json +8 -8
- package/.agent-context/skills/distribution/compatibility.md +31 -31
- package/.agent-context/skills/distribution/package.json +5 -0
- package/.agent-context/skills/distribution/provenance-attestation.md +47 -0
- package/.agent-context/skills/distribution/publish.md +36 -36
- package/.agent-context/skills/distribution/rollback.md +31 -31
- package/.agent-context/skills/distribution/tests/.gitkeep +1 -0
- package/.agent-context/skills/distribution.md +31 -28
- package/.agent-context/skills/frontend/.evidence/compatibility-manifest.json +9 -0
- package/.agent-context/skills/frontend/.evidence/sbom-excerpt.json +6 -0
- package/.agent-context/skills/frontend/.evidence/test-report.json +8 -0
- package/.agent-context/skills/frontend/CHANGELOG.md +7 -0
- package/.agent-context/skills/frontend/README.md +49 -36
- package/.agent-context/skills/frontend/accessibility.md +107 -107
- package/.agent-context/skills/frontend/compatibility-manifest.json +8 -8
- package/.agent-context/skills/frontend/conversion-clarity.md +51 -0
- package/.agent-context/skills/frontend/motion.md +66 -66
- package/.agent-context/skills/frontend/package.json +5 -0
- package/.agent-context/skills/frontend/performance.md +62 -62
- package/.agent-context/skills/frontend/responsive-delivery.md +41 -0
- package/.agent-context/skills/frontend/tests/.gitkeep +1 -0
- package/.agent-context/skills/frontend/ui-architecture.md +128 -128
- package/.agent-context/skills/frontend.md +35 -29
- package/.agent-context/skills/fullstack/.evidence/compatibility-manifest.json +9 -0
- package/.agent-context/skills/fullstack/.evidence/sbom-excerpt.json +6 -0
- package/.agent-context/skills/fullstack/.evidence/test-report.json +8 -0
- package/.agent-context/skills/fullstack/CHANGELOG.md +7 -0
- package/.agent-context/skills/fullstack/README.md +27 -19
- package/.agent-context/skills/fullstack/compatibility-manifest.json +8 -8
- package/.agent-context/skills/fullstack/contracts.md +52 -52
- package/.agent-context/skills/fullstack/end-to-end.md +41 -41
- package/.agent-context/skills/fullstack/feature-slicing.md +64 -64
- package/.agent-context/skills/fullstack/package.json +5 -0
- package/.agent-context/skills/fullstack/release-coordination.md +51 -0
- package/.agent-context/skills/fullstack/tests/.gitkeep +1 -0
- package/.agent-context/skills/fullstack.md +29 -26
- package/.agent-context/skills/index.json +107 -107
- package/.agent-context/skills/review-quality/.evidence/compatibility-manifest.json +9 -0
- package/.agent-context/skills/review-quality/.evidence/sbom-excerpt.json +6 -0
- package/.agent-context/skills/review-quality/.evidence/test-report.json +8 -0
- package/.agent-context/skills/review-quality/CHANGELOG.md +7 -0
- package/.agent-context/skills/review-quality/README.md +27 -19
- package/.agent-context/skills/review-quality/benchmark.md +29 -29
- package/.agent-context/skills/review-quality/compatibility-manifest.json +8 -8
- package/.agent-context/skills/review-quality/package.json +5 -0
- package/.agent-context/skills/review-quality/planning.md +37 -37
- package/.agent-context/skills/review-quality/release-decision.md +49 -0
- package/.agent-context/skills/review-quality/security.md +33 -33
- package/.agent-context/skills/review-quality/tests/.gitkeep +1 -0
- package/.agent-context/skills/review-quality.md +30 -27
- package/.agent-context/stacks/flutter.md +16 -16
- package/.agent-context/stacks/react-native.md +16 -16
- package/.agent-context/state/architecture-map.md +25 -25
- package/.agent-context/state/benchmark-analysis.json +431 -431
- package/.agent-context/state/benchmark-thresholds.json +10 -10
- package/.agent-context/state/benchmark-watchlist.json +19 -19
- package/.agent-context/state/dependency-map.md +32 -32
- package/.agent-context/state/quality-trend-report.json +16 -6
- package/.agent-context/state/skill-platform.json +38 -38
- package/.agent-context/state/weekly-governance-report.json +126 -0
- package/.agent-override.md +36 -36
- package/.cursorrules +1 -1
- package/.gemini/instructions.md +20 -20
- package/.github/ISSUE_TEMPLATE/v1.7-frontend-work-item.yml +54 -54
- package/.github/copilot-instructions.md +20 -20
- package/.github/workflows/benchmark-detection.yml +38 -38
- package/.github/workflows/benchmark-intelligence.yml +50 -50
- package/.github/workflows/frontend-usability-gate.yml +36 -36
- package/.github/workflows/governance-weekly-report.yml +43 -0
- package/.github/workflows/release-gate.yml +32 -32
- package/.github/workflows/sbom-compliance.yml +32 -32
- package/.windsurfrules +1 -1
- package/AGENTS.md +27 -27
- package/README.md +383 -368
- package/lib/cli/commands/optimize.mjs +171 -171
- package/lib/cli/compatibility.mjs +124 -124
- package/lib/cli/constants.mjs +35 -0
- package/lib/cli/token-optimization.mjs +275 -275
- package/lib/cli/utils.mjs +4 -1
- package/mcp.json +92 -92
- package/package.json +2 -1
- package/scripts/benchmark-gate.mjs +121 -121
- package/scripts/benchmark-intelligence.mjs +140 -140
- package/scripts/detection-benchmark.mjs +138 -138
- package/scripts/frontend-usability-audit.mjs +87 -87
- package/scripts/generate-sbom.mjs +61 -61
- package/scripts/governance-weekly-report.mjs +293 -0
- package/scripts/init-project.ps1 +104 -104
- package/scripts/llm-judge.mjs +664 -664
- package/scripts/quality-trend-report.mjs +288 -288
- package/scripts/release-gate.mjs +261 -259
- package/scripts/skill-tier-policy.mjs +75 -75
- package/scripts/token-optimization-benchmark.mjs +252 -252
- package/scripts/validate.mjs +874 -865
package/mcp.json
CHANGED
|
@@ -1,92 +1,92 @@
|
|
|
1
|
-
{
|
|
2
|
-
"$schema": "https://modelcontextprotocol.io/schemas/mcp.json",
|
|
3
|
-
"version": "1.0",
|
|
4
|
-
"name": "agentic-senior-core",
|
|
5
|
-
"description": "MCP configuration for governance-aware diagnostics and self-healing workflows with full knowledge injection.",
|
|
6
|
-
"knowledgeLayers": {
|
|
7
|
-
"enabled": true,
|
|
8
|
-
"description": "8-layer unified knowledge injection for AI agents",
|
|
9
|
-
"layers": {
|
|
10
|
-
"rules": {
|
|
11
|
-
"path": ".agent-context/rules",
|
|
12
|
-
"count": 14,
|
|
13
|
-
"autoLoad": true
|
|
14
|
-
},
|
|
15
|
-
"stacks": {
|
|
16
|
-
"path": ".agent-context/stacks",
|
|
17
|
-
"count": 10,
|
|
18
|
-
"autoLoad": true
|
|
19
|
-
},
|
|
20
|
-
"blueprints": {
|
|
21
|
-
"path": ".agent-context/blueprints",
|
|
22
|
-
"count": 14,
|
|
23
|
-
"autoLoad": true
|
|
24
|
-
},
|
|
25
|
-
"skills": {
|
|
26
|
-
"path": ".agent-context/skills",
|
|
27
|
-
"count": 6,
|
|
28
|
-
"autoLoad": true,
|
|
29
|
-
"packs": ["backend", "frontend", "cli", "distribution", "fullstack", "review-quality"]
|
|
30
|
-
},
|
|
31
|
-
"prompts": {
|
|
32
|
-
"path": ".agent-context/prompts",
|
|
33
|
-
"count": 3,
|
|
34
|
-
"autoLoad": true,
|
|
35
|
-
"templates": ["init-project", "refactor", "review-code"]
|
|
36
|
-
},
|
|
37
|
-
"profiles": {
|
|
38
|
-
"path": ".agent-context/profiles",
|
|
39
|
-
"count": 3,
|
|
40
|
-
"autoLoad": true,
|
|
41
|
-
"governance": ["platform", "regulated", "startup"]
|
|
42
|
-
},
|
|
43
|
-
"state": {
|
|
44
|
-
"path": ".agent-context/state",
|
|
45
|
-
"count": 6,
|
|
46
|
-
"autoLoad": true
|
|
47
|
-
},
|
|
48
|
-
"policies": {
|
|
49
|
-
"path": ".agent-context/policies",
|
|
50
|
-
"count": 1,
|
|
51
|
-
"autoLoad": true
|
|
52
|
-
}
|
|
53
|
-
}
|
|
54
|
-
},
|
|
55
|
-
"servers": {
|
|
56
|
-
"lint": {
|
|
57
|
-
"command": "node",
|
|
58
|
-
"args": ["./scripts/validate.mjs"],
|
|
59
|
-
"transport": "stdio"
|
|
60
|
-
},
|
|
61
|
-
"test": {
|
|
62
|
-
"command": "node",
|
|
63
|
-
"args": ["--test", "./tests/cli-smoke.test.mjs", "./tests/llm-judge.test.mjs"],
|
|
64
|
-
"transport": "stdio"
|
|
65
|
-
}
|
|
66
|
-
},
|
|
67
|
-
"workflows": {
|
|
68
|
-
"self-heal-on-failure": {
|
|
69
|
-
"trigger": ["ci.failure", "lint.failure"],
|
|
70
|
-
"steps": [
|
|
71
|
-
"collect_logs",
|
|
72
|
-
"map_violation_to_rule",
|
|
73
|
-
"propose_patch",
|
|
74
|
-
"run_checklist_review"
|
|
75
|
-
]
|
|
76
|
-
},
|
|
77
|
-
"full-knowledge-injection": {
|
|
78
|
-
"trigger": ["workspace.initialize", "agent.start"],
|
|
79
|
-
"steps": [
|
|
80
|
-
"load_all_knowledge_layers",
|
|
81
|
-
"inject_rules",
|
|
82
|
-
"inject_stacks",
|
|
83
|
-
"inject_blueprints",
|
|
84
|
-
"inject_skills",
|
|
85
|
-
"inject_prompts",
|
|
86
|
-
"inject_profiles",
|
|
87
|
-
"inject_state",
|
|
88
|
-
"inject_policies"
|
|
89
|
-
]
|
|
90
|
-
}
|
|
91
|
-
}
|
|
92
|
-
}
|
|
1
|
+
{
|
|
2
|
+
"$schema": "https://modelcontextprotocol.io/schemas/mcp.json",
|
|
3
|
+
"version": "1.0",
|
|
4
|
+
"name": "agentic-senior-core",
|
|
5
|
+
"description": "MCP configuration for governance-aware diagnostics and self-healing workflows with full knowledge injection.",
|
|
6
|
+
"knowledgeLayers": {
|
|
7
|
+
"enabled": true,
|
|
8
|
+
"description": "8-layer unified knowledge injection for AI agents",
|
|
9
|
+
"layers": {
|
|
10
|
+
"rules": {
|
|
11
|
+
"path": ".agent-context/rules",
|
|
12
|
+
"count": 14,
|
|
13
|
+
"autoLoad": true
|
|
14
|
+
},
|
|
15
|
+
"stacks": {
|
|
16
|
+
"path": ".agent-context/stacks",
|
|
17
|
+
"count": 10,
|
|
18
|
+
"autoLoad": true
|
|
19
|
+
},
|
|
20
|
+
"blueprints": {
|
|
21
|
+
"path": ".agent-context/blueprints",
|
|
22
|
+
"count": 14,
|
|
23
|
+
"autoLoad": true
|
|
24
|
+
},
|
|
25
|
+
"skills": {
|
|
26
|
+
"path": ".agent-context/skills",
|
|
27
|
+
"count": 6,
|
|
28
|
+
"autoLoad": true,
|
|
29
|
+
"packs": ["backend", "frontend", "cli", "distribution", "fullstack", "review-quality"]
|
|
30
|
+
},
|
|
31
|
+
"prompts": {
|
|
32
|
+
"path": ".agent-context/prompts",
|
|
33
|
+
"count": 3,
|
|
34
|
+
"autoLoad": true,
|
|
35
|
+
"templates": ["init-project", "refactor", "review-code"]
|
|
36
|
+
},
|
|
37
|
+
"profiles": {
|
|
38
|
+
"path": ".agent-context/profiles",
|
|
39
|
+
"count": 3,
|
|
40
|
+
"autoLoad": true,
|
|
41
|
+
"governance": ["platform", "regulated", "startup"]
|
|
42
|
+
},
|
|
43
|
+
"state": {
|
|
44
|
+
"path": ".agent-context/state",
|
|
45
|
+
"count": 6,
|
|
46
|
+
"autoLoad": true
|
|
47
|
+
},
|
|
48
|
+
"policies": {
|
|
49
|
+
"path": ".agent-context/policies",
|
|
50
|
+
"count": 1,
|
|
51
|
+
"autoLoad": true
|
|
52
|
+
}
|
|
53
|
+
}
|
|
54
|
+
},
|
|
55
|
+
"servers": {
|
|
56
|
+
"lint": {
|
|
57
|
+
"command": "node",
|
|
58
|
+
"args": ["./scripts/validate.mjs"],
|
|
59
|
+
"transport": "stdio"
|
|
60
|
+
},
|
|
61
|
+
"test": {
|
|
62
|
+
"command": "node",
|
|
63
|
+
"args": ["--test", "./tests/cli-smoke.test.mjs", "./tests/llm-judge.test.mjs"],
|
|
64
|
+
"transport": "stdio"
|
|
65
|
+
}
|
|
66
|
+
},
|
|
67
|
+
"workflows": {
|
|
68
|
+
"self-heal-on-failure": {
|
|
69
|
+
"trigger": ["ci.failure", "lint.failure"],
|
|
70
|
+
"steps": [
|
|
71
|
+
"collect_logs",
|
|
72
|
+
"map_violation_to_rule",
|
|
73
|
+
"propose_patch",
|
|
74
|
+
"run_checklist_review"
|
|
75
|
+
]
|
|
76
|
+
},
|
|
77
|
+
"full-knowledge-injection": {
|
|
78
|
+
"trigger": ["workspace.initialize", "agent.start"],
|
|
79
|
+
"steps": [
|
|
80
|
+
"load_all_knowledge_layers",
|
|
81
|
+
"inject_rules",
|
|
82
|
+
"inject_stacks",
|
|
83
|
+
"inject_blueprints",
|
|
84
|
+
"inject_skills",
|
|
85
|
+
"inject_prompts",
|
|
86
|
+
"inject_profiles",
|
|
87
|
+
"inject_state",
|
|
88
|
+
"inject_policies"
|
|
89
|
+
]
|
|
90
|
+
}
|
|
91
|
+
}
|
|
92
|
+
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@ryuenn3123/agentic-senior-core",
|
|
3
|
-
"version": "2.0.5",
|
|
3
|
+
"version": "2.0.7",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"description": "Force your AI Agent to code like a Staff Engineer, not a Junior.",
|
|
6
6
|
"bin": {
|
|
@@ -51,6 +51,7 @@
|
|
|
51
51
|
"benchmark:gate": "node ./scripts/benchmark-gate.mjs",
|
|
52
52
|
"benchmark:intelligence": "node ./scripts/benchmark-intelligence.mjs",
|
|
53
53
|
"report:quality-trend": "node ./scripts/quality-trend-report.mjs",
|
|
54
|
+
"report:governance-weekly": "node ./scripts/governance-weekly-report.mjs",
|
|
54
55
|
"validate": "node ./scripts/validate.mjs",
|
|
55
56
|
"test": "node --test ./tests/cli-smoke.test.mjs ./tests/llm-judge.test.mjs ./tests/enterprise-ops.test.mjs ./tests/skill-tier-gate.test.mjs"
|
|
56
57
|
}
|
|
package/scripts/benchmark-gate.mjs
CHANGED
|
@@ -1,121 +1,121 @@
|
|
|
1
|
-
#!/usr/bin/env node
|
|
2
|
-
|
|
3
|
-
/**
|
|
4
|
-
* benchmark-gate.mjs
|
|
5
|
-
*
|
|
6
|
-
* Anti-regression gate for benchmark quality signals.
|
|
7
|
-
* Fails when benchmark metrics drop below configured thresholds.
|
|
8
|
-
*/
|
|
9
|
-
|
|
10
|
-
import { existsSync, readFileSync } from 'node:fs';
|
|
11
|
-
import { dirname, join, resolve } from 'node:path';
|
|
12
|
-
import { fileURLToPath } from 'node:url';
|
|
13
|
-
import { execFileSync } from 'node:child_process';
|
|
14
|
-
|
|
15
|
-
const SCRIPT_FILE_PATH = fileURLToPath(import.meta.url);
|
|
16
|
-
const SCRIPT_DIR = dirname(SCRIPT_FILE_PATH);
|
|
17
|
-
const REPOSITORY_ROOT = resolve(SCRIPT_DIR, '..');
|
|
18
|
-
const BENCHMARK_THRESHOLD_PATH = join(REPOSITORY_ROOT, '.agent-context', 'state', 'benchmark-thresholds.json');
|
|
19
|
-
const DETECTION_BENCHMARK_PATH = join(REPOSITORY_ROOT, 'scripts', 'detection-benchmark.mjs');
|
|
20
|
-
|
|
21
|
-
function readThresholdConfiguration() {
|
|
22
|
-
if (!existsSync(BENCHMARK_THRESHOLD_PATH)) {
|
|
23
|
-
return {
|
|
24
|
-
minimumTop1Accuracy: 0.9,
|
|
25
|
-
maximumManualCorrectionRate: 0.12,
|
|
26
|
-
maximumTop1AccuracyDrop: 0.02,
|
|
27
|
-
maximumManualCorrectionIncrease: 0.03,
|
|
28
|
-
previousReleaseBaseline: {
|
|
29
|
-
top1Accuracy: 0.9167,
|
|
30
|
-
manualCorrectionRate: 0.0833,
|
|
31
|
-
},
|
|
32
|
-
};
|
|
33
|
-
}
|
|
34
|
-
|
|
35
|
-
return JSON.parse(readFileSync(BENCHMARK_THRESHOLD_PATH, 'utf8'));
|
|
36
|
-
}
|
|
37
|
-
|
|
38
|
-
function runDetectionBenchmark() {
|
|
39
|
-
const benchmarkRawOutput = execFileSync('node', [DETECTION_BENCHMARK_PATH], {
|
|
40
|
-
cwd: REPOSITORY_ROOT,
|
|
41
|
-
encoding: 'utf8',
|
|
42
|
-
});
|
|
43
|
-
|
|
44
|
-
return JSON.parse(benchmarkRawOutput);
|
|
45
|
-
}
|
|
46
|
-
|
|
47
|
-
function buildCheckResult(checkName, passed, details) {
|
|
48
|
-
return {
|
|
49
|
-
checkName,
|
|
50
|
-
passed,
|
|
51
|
-
details,
|
|
52
|
-
};
|
|
53
|
-
}
|
|
54
|
-
|
|
55
|
-
function runBenchmarkGate() {
|
|
56
|
-
const thresholdConfiguration = readThresholdConfiguration();
|
|
57
|
-
const benchmarkResult = runDetectionBenchmark();
|
|
58
|
-
const benchmarkChecks = [];
|
|
59
|
-
|
|
60
|
-
const top1AccuracyPassed = benchmarkResult.top1Accuracy >= thresholdConfiguration.minimumTop1Accuracy;
|
|
61
|
-
benchmarkChecks.push(
|
|
62
|
-
buildCheckResult(
|
|
63
|
-
'minimum-top1-accuracy',
|
|
64
|
-
top1AccuracyPassed,
|
|
65
|
-
`top1Accuracy=${benchmarkResult.top1Accuracy} minimum=${thresholdConfiguration.minimumTop1Accuracy}`,
|
|
66
|
-
),
|
|
67
|
-
);
|
|
68
|
-
|
|
69
|
-
const manualCorrectionPassed = benchmarkResult.manualCorrectionRate <= thresholdConfiguration.maximumManualCorrectionRate;
|
|
70
|
-
benchmarkChecks.push(
|
|
71
|
-
buildCheckResult(
|
|
72
|
-
'maximum-manual-correction-rate',
|
|
73
|
-
manualCorrectionPassed,
|
|
74
|
-
`manualCorrectionRate=${benchmarkResult.manualCorrectionRate} maximum=${thresholdConfiguration.maximumManualCorrectionRate}`,
|
|
75
|
-
),
|
|
76
|
-
);
|
|
77
|
-
|
|
78
|
-
const previousReleaseBaseline = thresholdConfiguration.previousReleaseBaseline;
|
|
79
|
-
if (previousReleaseBaseline && typeof previousReleaseBaseline === 'object') {
|
|
80
|
-
const top1AccuracyDrop = Number((previousReleaseBaseline.top1Accuracy - benchmarkResult.top1Accuracy).toFixed(4));
|
|
81
|
-
const manualCorrectionIncrease = Number((benchmarkResult.manualCorrectionRate - previousReleaseBaseline.manualCorrectionRate).toFixed(4));
|
|
82
|
-
|
|
83
|
-
const top1AccuracyDropPassed = top1AccuracyDrop <= thresholdConfiguration.maximumTop1AccuracyDrop;
|
|
84
|
-
benchmarkChecks.push(
|
|
85
|
-
buildCheckResult(
|
|
86
|
-
'maximum-top1-accuracy-drop',
|
|
87
|
-
top1AccuracyDropPassed,
|
|
88
|
-
`drop=${top1AccuracyDrop} maximum=${thresholdConfiguration.maximumTop1AccuracyDrop}`,
|
|
89
|
-
),
|
|
90
|
-
);
|
|
91
|
-
|
|
92
|
-
const manualCorrectionIncreasePassed = manualCorrectionIncrease <= thresholdConfiguration.maximumManualCorrectionIncrease;
|
|
93
|
-
benchmarkChecks.push(
|
|
94
|
-
buildCheckResult(
|
|
95
|
-
'maximum-manual-correction-increase',
|
|
96
|
-
manualCorrectionIncreasePassed,
|
|
97
|
-
`increase=${manualCorrectionIncrease} maximum=${thresholdConfiguration.maximumManualCorrectionIncrease}`,
|
|
98
|
-
),
|
|
99
|
-
);
|
|
100
|
-
}
|
|
101
|
-
|
|
102
|
-
const failedCheckCount = benchmarkChecks.filter((benchmarkCheck) => !benchmarkCheck.passed).length;
|
|
103
|
-
const benchmarkGateReport = {
|
|
104
|
-
generatedAt: new Date().toISOString(),
|
|
105
|
-
gateName: 'benchmark-gate',
|
|
106
|
-
passed: failedCheckCount === 0,
|
|
107
|
-
failureCount: failedCheckCount,
|
|
108
|
-
benchmarkResult: {
|
|
109
|
-
fixtureCount: benchmarkResult.fixtureCount,
|
|
110
|
-
top1Accuracy: benchmarkResult.top1Accuracy,
|
|
111
|
-
manualCorrectionRate: benchmarkResult.manualCorrectionRate,
|
|
112
|
-
},
|
|
113
|
-
thresholds: thresholdConfiguration,
|
|
114
|
-
results: benchmarkChecks,
|
|
115
|
-
};
|
|
116
|
-
|
|
117
|
-
console.log(JSON.stringify(benchmarkGateReport, null, 2));
|
|
118
|
-
process.exit(benchmarkGateReport.passed ? 0 : 1);
|
|
119
|
-
}
|
|
120
|
-
|
|
121
|
-
runBenchmarkGate();
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
|
|
3
|
+
/**
|
|
4
|
+
* benchmark-gate.mjs
|
|
5
|
+
*
|
|
6
|
+
* Anti-regression gate for benchmark quality signals.
|
|
7
|
+
* Fails when benchmark metrics drop below configured thresholds.
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
import { existsSync, readFileSync } from 'node:fs';
|
|
11
|
+
import { dirname, join, resolve } from 'node:path';
|
|
12
|
+
import { fileURLToPath } from 'node:url';
|
|
13
|
+
import { execFileSync } from 'node:child_process';
|
|
14
|
+
|
|
15
|
+
const SCRIPT_FILE_PATH = fileURLToPath(import.meta.url);
|
|
16
|
+
const SCRIPT_DIR = dirname(SCRIPT_FILE_PATH);
|
|
17
|
+
const REPOSITORY_ROOT = resolve(SCRIPT_DIR, '..');
|
|
18
|
+
const BENCHMARK_THRESHOLD_PATH = join(REPOSITORY_ROOT, '.agent-context', 'state', 'benchmark-thresholds.json');
|
|
19
|
+
const DETECTION_BENCHMARK_PATH = join(REPOSITORY_ROOT, 'scripts', 'detection-benchmark.mjs');
|
|
20
|
+
|
|
21
|
+
function readThresholdConfiguration() {
|
|
22
|
+
if (!existsSync(BENCHMARK_THRESHOLD_PATH)) {
|
|
23
|
+
return {
|
|
24
|
+
minimumTop1Accuracy: 0.9,
|
|
25
|
+
maximumManualCorrectionRate: 0.12,
|
|
26
|
+
maximumTop1AccuracyDrop: 0.02,
|
|
27
|
+
maximumManualCorrectionIncrease: 0.03,
|
|
28
|
+
previousReleaseBaseline: {
|
|
29
|
+
top1Accuracy: 0.9167,
|
|
30
|
+
manualCorrectionRate: 0.0833,
|
|
31
|
+
},
|
|
32
|
+
};
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
return JSON.parse(readFileSync(BENCHMARK_THRESHOLD_PATH, 'utf8'));
|
|
36
|
+
}
|
|
37
|
+
|
|
38
|
+
function runDetectionBenchmark() {
|
|
39
|
+
const benchmarkRawOutput = execFileSync('node', [DETECTION_BENCHMARK_PATH], {
|
|
40
|
+
cwd: REPOSITORY_ROOT,
|
|
41
|
+
encoding: 'utf8',
|
|
42
|
+
});
|
|
43
|
+
|
|
44
|
+
return JSON.parse(benchmarkRawOutput);
|
|
45
|
+
}
|
|
46
|
+
|
|
47
|
+
function buildCheckResult(checkName, passed, details) {
|
|
48
|
+
return {
|
|
49
|
+
checkName,
|
|
50
|
+
passed,
|
|
51
|
+
details,
|
|
52
|
+
};
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
function runBenchmarkGate() {
|
|
56
|
+
const thresholdConfiguration = readThresholdConfiguration();
|
|
57
|
+
const benchmarkResult = runDetectionBenchmark();
|
|
58
|
+
const benchmarkChecks = [];
|
|
59
|
+
|
|
60
|
+
const top1AccuracyPassed = benchmarkResult.top1Accuracy >= thresholdConfiguration.minimumTop1Accuracy;
|
|
61
|
+
benchmarkChecks.push(
|
|
62
|
+
buildCheckResult(
|
|
63
|
+
'minimum-top1-accuracy',
|
|
64
|
+
top1AccuracyPassed,
|
|
65
|
+
`top1Accuracy=${benchmarkResult.top1Accuracy} minimum=${thresholdConfiguration.minimumTop1Accuracy}`,
|
|
66
|
+
),
|
|
67
|
+
);
|
|
68
|
+
|
|
69
|
+
const manualCorrectionPassed = benchmarkResult.manualCorrectionRate <= thresholdConfiguration.maximumManualCorrectionRate;
|
|
70
|
+
benchmarkChecks.push(
|
|
71
|
+
buildCheckResult(
|
|
72
|
+
'maximum-manual-correction-rate',
|
|
73
|
+
manualCorrectionPassed,
|
|
74
|
+
`manualCorrectionRate=${benchmarkResult.manualCorrectionRate} maximum=${thresholdConfiguration.maximumManualCorrectionRate}`,
|
|
75
|
+
),
|
|
76
|
+
);
|
|
77
|
+
|
|
78
|
+
const previousReleaseBaseline = thresholdConfiguration.previousReleaseBaseline;
|
|
79
|
+
if (previousReleaseBaseline && typeof previousReleaseBaseline === 'object') {
|
|
80
|
+
const top1AccuracyDrop = Number((previousReleaseBaseline.top1Accuracy - benchmarkResult.top1Accuracy).toFixed(4));
|
|
81
|
+
const manualCorrectionIncrease = Number((benchmarkResult.manualCorrectionRate - previousReleaseBaseline.manualCorrectionRate).toFixed(4));
|
|
82
|
+
|
|
83
|
+
const top1AccuracyDropPassed = top1AccuracyDrop <= thresholdConfiguration.maximumTop1AccuracyDrop;
|
|
84
|
+
benchmarkChecks.push(
|
|
85
|
+
buildCheckResult(
|
|
86
|
+
'maximum-top1-accuracy-drop',
|
|
87
|
+
top1AccuracyDropPassed,
|
|
88
|
+
`drop=${top1AccuracyDrop} maximum=${thresholdConfiguration.maximumTop1AccuracyDrop}`,
|
|
89
|
+
),
|
|
90
|
+
);
|
|
91
|
+
|
|
92
|
+
const manualCorrectionIncreasePassed = manualCorrectionIncrease <= thresholdConfiguration.maximumManualCorrectionIncrease;
|
|
93
|
+
benchmarkChecks.push(
|
|
94
|
+
buildCheckResult(
|
|
95
|
+
'maximum-manual-correction-increase',
|
|
96
|
+
manualCorrectionIncreasePassed,
|
|
97
|
+
`increase=${manualCorrectionIncrease} maximum=${thresholdConfiguration.maximumManualCorrectionIncrease}`,
|
|
98
|
+
),
|
|
99
|
+
);
|
|
100
|
+
}
|
|
101
|
+
|
|
102
|
+
const failedCheckCount = benchmarkChecks.filter((benchmarkCheck) => !benchmarkCheck.passed).length;
|
|
103
|
+
const benchmarkGateReport = {
|
|
104
|
+
generatedAt: new Date().toISOString(),
|
|
105
|
+
gateName: 'benchmark-gate',
|
|
106
|
+
passed: failedCheckCount === 0,
|
|
107
|
+
failureCount: failedCheckCount,
|
|
108
|
+
benchmarkResult: {
|
|
109
|
+
fixtureCount: benchmarkResult.fixtureCount,
|
|
110
|
+
top1Accuracy: benchmarkResult.top1Accuracy,
|
|
111
|
+
manualCorrectionRate: benchmarkResult.manualCorrectionRate,
|
|
112
|
+
},
|
|
113
|
+
thresholds: thresholdConfiguration,
|
|
114
|
+
results: benchmarkChecks,
|
|
115
|
+
};
|
|
116
|
+
|
|
117
|
+
console.log(JSON.stringify(benchmarkGateReport, null, 2));
|
|
118
|
+
process.exit(benchmarkGateReport.passed ? 0 : 1);
|
|
119
|
+
}
|
|
120
|
+
|
|
121
|
+
runBenchmarkGate();
|