@prism-d1/cli 1.0.26 → 1.0.28
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/assets/eval-harness/README.md +114 -0
- package/dist/assets/eval-harness/eval-config.json +10 -0
- package/dist/assets/eval-harness/rubrics/agent-quality.json +79 -0
- package/dist/assets/eval-harness/rubrics/api-response-quality.json +45 -0
- package/dist/assets/eval-harness/rubrics/code-quality.json +98 -0
- package/dist/assets/eval-harness/rubrics/security-compliance.json +145 -0
- package/dist/assets/eval-harness/rubrics/spec-compliance.json +67 -0
- package/dist/assets/eval-harness/run-eval.sh +122 -0
- package/dist/assets/github-workflows/README.md +110 -0
- package/dist/assets/github-workflows/prism-agent-eval.yml +313 -0
- package/dist/assets/github-workflows/prism-ai-metrics.yml +261 -0
- package/dist/assets/github-workflows/prism-dora-weekly.yml +334 -0
- package/dist/assets/github-workflows/prism-eval-gate.yml +310 -0
- package/dist/assets/infra/bin/app.ts +56 -0
- package/dist/assets/infra/cdk.json +12 -0
- package/dist/assets/infra/lib/api-stack.ts +347 -0
- package/dist/assets/infra/lib/constructs/bedrock-guardrail-construct.ts +201 -0
- package/dist/assets/infra/lib/constructs/guardrail-enforcer-construct.ts +59 -0
- package/dist/assets/infra/lib/constructs/prism-vpc-construct.ts +75 -0
- package/dist/assets/infra/lib/constructs/security-agent-construct.ts +266 -0
- package/dist/assets/infra/lib/dashboard-stack.ts +1392 -0
- package/dist/assets/infra/lib/lambda/api-handler.ts +477 -0
- package/dist/assets/infra/lib/lambda/defect-correlator.ts +142 -0
- package/dist/assets/infra/lib/lambda/exfiltration-detector.ts +100 -0
- package/dist/assets/infra/lib/lambda/layers/guardrail-enforcer/nodejs/guardrail-enforcer.js +53 -0
- package/dist/assets/infra/lib/lambda/metrics-processor.ts +748 -0
- package/dist/assets/infra/lib/lambda/security-agent-processor.ts +231 -0
- package/dist/assets/infra/lib/lambda/security-remediation-tracker.ts +120 -0
- package/dist/assets/infra/lib/lambda/security-response-automator.ts +130 -0
- package/dist/assets/infra/lib/lambda/spec-to-code-calculator.ts +123 -0
- package/dist/assets/infra/lib/metrics-pipeline-stack.ts +701 -0
- package/dist/assets/infra/package.json +23 -0
- package/dist/assets/infra/tsconfig.json +24 -0
- package/dist/src/commands/bootstrapper/install-eval-harness.d.ts.map +1 -1
- package/dist/src/commands/bootstrapper/install-eval-harness.js +3 -4
- package/dist/src/commands/bootstrapper/install-eval-harness.js.map +1 -1
- package/dist/src/commands/bootstrapper/install-git-hooks.d.ts.map +1 -1
- package/dist/src/commands/bootstrapper/install-git-hooks.js +2 -5
- package/dist/src/commands/bootstrapper/install-git-hooks.js.map +1 -1
- package/dist/src/commands/securityagent/setup.d.ts.map +1 -1
- package/dist/src/commands/securityagent/setup.js +2 -3
- package/dist/src/commands/securityagent/setup.js.map +1 -1
- package/dist/src/commands/workshop/deploy-infra.d.ts.map +1 -1
- package/dist/src/commands/workshop/deploy-infra.js +2 -3
- package/dist/src/commands/workshop/deploy-infra.js.map +1 -1
- package/dist/src/commands/workshop/generate-demo-data.d.ts.map +1 -1
- package/dist/src/commands/workshop/generate-demo-data.js +3 -8
- package/dist/src/commands/workshop/generate-demo-data.js.map +1 -1
- package/dist/src/commands/workshop/perform-pen-test.d.ts.map +1 -1
- package/dist/src/commands/workshop/perform-pen-test.js +5 -14
- package/dist/src/commands/workshop/perform-pen-test.js.map +1 -1
- package/dist/src/utils/root.d.ts +6 -0
- package/dist/src/utils/root.d.ts.map +1 -1
- package/dist/src/utils/root.js +29 -0
- package/dist/src/utils/root.js.map +1 -1
- package/package.json +2 -2
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
# Eval Harness
|
|
2
|
+
|
|
3
|
+
Evaluate AI-generated code against rubrics using Amazon Bedrock. Runs per-file, calculates weighted scores client-side, and integrates with the PRISM eval gate workflow.
|
|
4
|
+
|
|
5
|
+
## Install
|
|
6
|
+
|
|
7
|
+
```bash
|
|
8
|
+
# Workshop mode — empty rubrics, create your own
|
|
9
|
+
bash prism-cli bootstrapper install-eval-harness
|
|
10
|
+
|
|
11
|
+
# Production mode — includes all 5 rubrics
|
|
12
|
+
bash prism-cli bootstrapper install-eval-harness --with-rubrics
|
|
13
|
+
|
|
14
|
+
# Non-interactive
|
|
15
|
+
bash prism-cli bootstrapper install-eval-harness --model us.anthropic.claude-haiku-4-5-20251001-v1:0 --threshold 0.82 --with-rubrics
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
This installs into your repo:
|
|
19
|
+
- `.prism/eval-harness/run-eval.sh` — evaluation script
|
|
20
|
+
- `.prism/eval-harness/eval-config.json` — model, threshold, region
|
|
21
|
+
- `.prism/eval-harness/rubrics/` — rubric JSON files
|
|
22
|
+
- `.github/workflows/prism-eval-gate.yml` — CI workflow
|
|
23
|
+
|
|
24
|
+
## Usage
|
|
25
|
+
|
|
26
|
+
```bash
|
|
27
|
+
# Evaluate a single file
|
|
28
|
+
./.prism/eval-harness/run-eval.sh .prism/eval-harness/rubrics/code-quality.json src/handler.ts
|
|
29
|
+
|
|
30
|
+
# With a spec file (for spec-compliance rubric)
|
|
31
|
+
./.prism/eval-harness/run-eval.sh .prism/eval-harness/rubrics/spec-compliance.json src/api.ts --spec specs/api.md
|
|
32
|
+
```
|
|
33
|
+
|
|
34
|
+
### Output
|
|
35
|
+
|
|
36
|
+
```
|
|
37
|
+
correctness: 0.9 — Handles all inputs correctly including edge cases
|
|
38
|
+
readability: 0.85 — Clear naming, minor style inconsistency in helper
|
|
39
|
+
...
|
|
40
|
+
|
|
41
|
+
Score: 0.8720
|
|
42
|
+
Result: PASS
|
|
43
|
+
Hallucinations: 0
|
|
44
|
+
```
|
|
45
|
+
|
|
46
|
+
Exit codes: `0` = pass, `1` = fail, `2` = error.
|
|
47
|
+
|
|
48
|
+
## Configuration
|
|
49
|
+
|
|
50
|
+
`eval-config.json`:
|
|
51
|
+
|
|
52
|
+
| Field | Description | Default |
|
|
53
|
+
|---|---|---|
|
|
54
|
+
| `pass_threshold` | Minimum score to pass (0-1) | `0.82` |
|
|
55
|
+
| `eval_model_id` | Bedrock model for evaluation | `us.anthropic.claude-haiku-4-5-20251001-v1:0` |
|
|
56
|
+
| `aws_region` | AWS region | `us-west-2` |
|
|
57
|
+
| `event_bus` | EventBridge bus name | `prism-d1-metrics` |
|
|
58
|
+
| `emit_to_eventbridge` | Emit events (workflow handles this) | `true` |
|
|
59
|
+
|
|
60
|
+
## Rubrics
|
|
61
|
+
|
|
62
|
+
Five production rubrics are available:
|
|
63
|
+
|
|
64
|
+
| Rubric | Auto-selected when file path matches |
|
|
65
|
+
|---|---|
|
|
66
|
+
| `code-quality.json` | Default fallback |
|
|
67
|
+
| `api-response-quality.json` | `api`, `handler`, `route`, `controller` |
|
|
68
|
+
| `agent-quality.json` | `agent`, `assistant`, `orchestrat`, `workflow`, `chain` |
|
|
69
|
+
| `security-compliance.json` | `auth`, `security`, `guard`, `policy`, `iam`, `crypto` |
|
|
70
|
+
| `spec-compliance.json` | Used when commit has `Spec-Ref:` trailer |
|
|
71
|
+
|
|
72
|
+
### Creating a Custom Rubric
|
|
73
|
+
|
|
74
|
+
```json
|
|
75
|
+
{
|
|
76
|
+
"rubric_name": "my-rubric",
|
|
77
|
+
"criteria": [
|
|
78
|
+
{
|
|
79
|
+
"name": "criterion_name",
|
|
80
|
+
"weight": 0.30,
|
|
81
|
+
"description": "What this measures",
|
|
82
|
+
"scoring": "How to score 0.0-1.0"
|
|
83
|
+
}
|
|
84
|
+
]
|
|
85
|
+
}
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
Weights must sum to 1.0. The script calculates the weighted average client-side (does not trust the LLM to do math).
|
|
89
|
+
|
|
90
|
+
## CI Workflow
|
|
91
|
+
|
|
92
|
+
The `prism-eval-gate.yml` workflow:
|
|
93
|
+
|
|
94
|
+
1. Detects commits with `AI-Origin:` trailers
|
|
95
|
+
2. Identifies changed source files from those commits
|
|
96
|
+
3. Auto-selects a rubric per file based on path
|
|
97
|
+
4. Runs `run-eval.sh` per file
|
|
98
|
+
5. Posts a PR comment with per-file scores
|
|
99
|
+
6. Waits for AWS Security Agent review (if installed)
|
|
100
|
+
7. Emits `prism.d1.eval` event to EventBridge
|
|
101
|
+
8. Fails the check if any file scores below threshold or Security Agent finds issues
|
|
102
|
+
|
|
103
|
+
### Requirements
|
|
104
|
+
|
|
105
|
+
- OIDC provider configured for GitHub Actions
|
|
106
|
+
- IAM role with `bedrock:InvokeModel` + `events:PutEvents`
|
|
107
|
+
- Repository secret `PRISM_METRICS_ROLE_ARN`
|
|
108
|
+
- `.prism/config.json` with `team_id`
|
|
109
|
+
|
|
110
|
+
## Uninstall
|
|
111
|
+
|
|
112
|
+
```bash
|
|
113
|
+
bash prism-cli bootstrapper install-eval-harness --uninstall
|
|
114
|
+
```
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
{
|
|
2
|
+
"pass_threshold": 0.82,
|
|
3
|
+
"model_id": "anthropic.claude-sonnet-4-20250514",
|
|
4
|
+
"eval_model_id": "anthropic.claude-sonnet-4-20250514",
|
|
5
|
+
"event_bus": "prism-d1-metrics",
|
|
6
|
+
"aws_region": "us-west-2",
|
|
7
|
+
"rubrics_dir": "rubrics",
|
|
8
|
+
"output_dir": ".prism/eval-results",
|
|
9
|
+
"emit_to_eventbridge": true
|
|
10
|
+
}
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
{
|
|
2
|
+
"rubric_name": "agent-quality",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "Evaluates the quality of agentic workflow outputs — task completion, tool efficiency, guardrail adherence, reasoning clarity, and error recovery.",
|
|
5
|
+
"criteria": [
|
|
6
|
+
{
|
|
7
|
+
"name": "task_completion",
|
|
8
|
+
"weight": 0.30,
|
|
9
|
+
"description": "Did the agent achieve the stated goal completely and correctly?",
|
|
10
|
+
"scoring": {
|
|
11
|
+
"5": "Goal fully achieved. All required outputs produced with correct content. No manual follow-up needed.",
|
|
12
|
+
"4": "Goal achieved with minor gaps. Outputs are correct but may be missing optional enhancements.",
|
|
13
|
+
"3": "Goal partially achieved. Core deliverable produced but with notable omissions or inaccuracies.",
|
|
14
|
+
"2": "Goal mostly unmet. Agent produced some output but missed the primary objective.",
|
|
15
|
+
"1": "Goal not achieved. Agent failed to produce a meaningful result or produced entirely wrong output."
|
|
16
|
+
},
|
|
17
|
+
"threshold": 4
|
|
18
|
+
},
|
|
19
|
+
{
|
|
20
|
+
"name": "tool_usage_efficiency",
|
|
21
|
+
"weight": 0.20,
|
|
22
|
+
"description": "Were tools called efficiently without unnecessary steps, redundant invocations, or wasted iterations?",
|
|
23
|
+
"scoring": {
|
|
24
|
+
"5": "Optimal tool usage. Every invocation was necessary and well-targeted. No redundant calls or unnecessary retries.",
|
|
25
|
+
"4": "Efficient tool usage with minor redundancy. One or two calls could have been avoided.",
|
|
26
|
+
"3": "Acceptable efficiency. Some unnecessary tool calls or suboptimal ordering, but did not significantly impact outcome.",
|
|
27
|
+
"2": "Inefficient. Multiple redundant calls, unnecessary retries, or poor tool selection that wasted tokens and time.",
|
|
28
|
+
"1": "Highly inefficient. Excessive tool calls, looping behavior, or called tools irrelevant to the task."
|
|
29
|
+
},
|
|
30
|
+
"threshold": 3
|
|
31
|
+
},
|
|
32
|
+
{
|
|
33
|
+
"name": "guardrail_compliance",
|
|
34
|
+
"weight": 0.20,
|
|
35
|
+
"description": "Did the agent stay within defined boundaries — prohibited actions, data access scopes, and safety constraints?",
|
|
36
|
+
"scoring": {
|
|
37
|
+
"5": "Perfect compliance. Agent operated strictly within all defined guardrails. No boundary violations.",
|
|
38
|
+
"4": "Near-perfect compliance. Agent stayed within guardrails with only cosmetic deviations (e.g., verbose output).",
|
|
39
|
+
"3": "Mostly compliant. Minor guardrail stretch that did not cause harm but should be tightened.",
|
|
40
|
+
"2": "Guardrail violation occurred. Agent performed a restricted action or accessed out-of-scope data.",
|
|
41
|
+
"1": "Serious guardrail breach. Agent took prohibited actions, accessed sensitive data, or made irreversible changes without approval."
|
|
42
|
+
},
|
|
43
|
+
"threshold": 5
|
|
44
|
+
},
|
|
45
|
+
{
|
|
46
|
+
"name": "reasoning_trace_quality",
|
|
47
|
+
"weight": 0.15,
|
|
48
|
+
"description": "Is the reasoning chain clear, coherent, and auditable? Can a human reviewer follow the agent's decision-making?",
|
|
49
|
+
"scoring": {
|
|
50
|
+
"5": "Reasoning trace is complete, step-by-step, and clearly explains every decision. Fully auditable.",
|
|
51
|
+
"4": "Reasoning trace is clear for most steps. Minor gaps in explaining tool selection or branching decisions.",
|
|
52
|
+
"3": "Reasoning trace present but incomplete. Some steps lack explanation or context.",
|
|
53
|
+
"2": "Reasoning trace is sparse or confusing. Difficult to understand why certain actions were taken.",
|
|
54
|
+
"1": "No meaningful reasoning trace. Agent actions are opaque and unauditable."
|
|
55
|
+
},
|
|
56
|
+
"threshold": 3
|
|
57
|
+
},
|
|
58
|
+
{
|
|
59
|
+
"name": "error_recovery",
|
|
60
|
+
"weight": 0.15,
|
|
61
|
+
"description": "Did the agent handle failures gracefully — retrying, falling back, or reporting errors without crashing or looping?",
|
|
62
|
+
"scoring": {
|
|
63
|
+
"5": "Excellent error handling. Agent retried appropriately, used fallback strategies, and reported failures clearly.",
|
|
64
|
+
"4": "Good error handling. Agent recovered from failures with minor issues (e.g., slightly verbose error reporting).",
|
|
65
|
+
"3": "Adequate error handling. Agent recovered from common failures but may have struggled with unusual error conditions.",
|
|
66
|
+
"2": "Poor error handling. Agent crashed, looped, or silently dropped errors for some failure modes.",
|
|
67
|
+
"1": "No error handling. Agent failed on first error with no retry, fallback, or meaningful error message."
|
|
68
|
+
},
|
|
69
|
+
"threshold": 3
|
|
70
|
+
}
|
|
71
|
+
],
|
|
72
|
+
"pass_threshold": 0.82,
|
|
73
|
+
"output_format": {
|
|
74
|
+
"overall_score": "float 0-1 (weighted average of criteria scores normalized to 0-1)",
|
|
75
|
+
"criteria_scores": "object mapping criteria name to { score: 1-5, reasoning: string }",
|
|
76
|
+
"pass": "boolean",
|
|
77
|
+
"summary": "string"
|
|
78
|
+
}
|
|
79
|
+
}
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
{
|
|
2
|
+
"rubric_name": "api_spec_compliance",
|
|
3
|
+
"description": "Evaluates whether API endpoint implementations match their specs",
|
|
4
|
+
"global_threshold": 0.8,
|
|
5
|
+
"criteria": [
|
|
6
|
+
{
|
|
7
|
+
"name": "acceptance_criteria_coverage",
|
|
8
|
+
"description": "Every acceptance criterion in the spec must have a corresponding implementation. Check each Given/When/Then statement and verify the code handles that exact scenario.",
|
|
9
|
+
"scoring": "Count the acceptance criteria in the spec. For each one, determine if the code implements it (1) or not (0). Score = implemented / total.",
|
|
10
|
+
"threshold": 1.0,
|
|
11
|
+
"weight": 0.35,
|
|
12
|
+
"requires_spec": true
|
|
13
|
+
},
|
|
14
|
+
{
|
|
15
|
+
"name": "api_contract_fidelity",
|
|
16
|
+
"description": "The implemented endpoint must match the spec's API contract exactly: correct HTTP method, path, request body schema, response body schema, and status codes.",
|
|
17
|
+
"scoring": "Check 5 elements: method, path, request schema, response schema, status codes. Score 0.2 for each correct element.",
|
|
18
|
+
"threshold": 0.8,
|
|
19
|
+
"weight": 0.25,
|
|
20
|
+
"requires_spec": true
|
|
21
|
+
},
|
|
22
|
+
{
|
|
23
|
+
"name": "error_handling",
|
|
24
|
+
"description": "All error cases defined in the spec must be handled with the correct HTTP status code and response format (RFC 7807 Problem Details).",
|
|
25
|
+
"scoring": "For each error case in the spec, check: (a) is it handled? (b) correct status code? (c) Problem Details format? Score = correct_checks / (total_error_cases * 3)",
|
|
26
|
+
"threshold": 0.7,
|
|
27
|
+
"weight": 0.20,
|
|
28
|
+
"requires_spec": true
|
|
29
|
+
},
|
|
30
|
+
{
|
|
31
|
+
"name": "no_hallucinated_dependencies",
|
|
32
|
+
"description": "The code must not import modules, call functions, or reference APIs that do not exist in the project. Every import must resolve to a real file or installed package.",
|
|
33
|
+
"scoring": "Binary: 1.0 if all imports and function calls reference real modules/packages, 0.0 if any hallucinated dependency is found.",
|
|
34
|
+
"threshold": 1.0,
|
|
35
|
+
"weight": 0.10
|
|
36
|
+
},
|
|
37
|
+
{
|
|
38
|
+
"name": "type_safety",
|
|
39
|
+
"description": "The code must use proper TypeScript types. No 'any' types, no type assertions without justification, all function parameters and return types annotated.",
|
|
40
|
+
"scoring": "Count type violations: any usage, missing annotations, unsafe assertions. Score = max(0, 1.0 - (violations * 0.1))",
|
|
41
|
+
"threshold": 0.8,
|
|
42
|
+
"weight": 0.10
|
|
43
|
+
}
|
|
44
|
+
]
|
|
45
|
+
}
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
{
|
|
2
|
+
"rubric_name": "code-quality",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "Evaluates the overall quality of generated or modified code.",
|
|
5
|
+
"criteria": [
|
|
6
|
+
{
|
|
7
|
+
"name": "correctness",
|
|
8
|
+
"weight": 0.25,
|
|
9
|
+
"description": "Code produces correct results for all specified inputs and edge cases.",
|
|
10
|
+
"scoring": {
|
|
11
|
+
"5": "Correct for all inputs including edge cases. Handles all specified error conditions.",
|
|
12
|
+
"4": "Correct for all normal inputs. Minor edge case gaps.",
|
|
13
|
+
"3": "Correct for happy path. Some error conditions unhandled.",
|
|
14
|
+
"2": "Partially correct. Fails for some normal inputs.",
|
|
15
|
+
"1": "Fundamentally incorrect. Does not produce expected results."
|
|
16
|
+
}
|
|
17
|
+
},
|
|
18
|
+
{
|
|
19
|
+
"name": "readability",
|
|
20
|
+
"weight": 0.15,
|
|
21
|
+
"description": "Code is easy to read, understand, and maintain.",
|
|
22
|
+
"scoring": {
|
|
23
|
+
"5": "Clear naming, consistent style, appropriate comments, well-structured. A new developer could understand it quickly.",
|
|
24
|
+
"4": "Generally readable with minor style inconsistencies.",
|
|
25
|
+
"3": "Readable but requires some effort. Missing context in complex sections.",
|
|
26
|
+
"2": "Hard to follow. Inconsistent naming or structure.",
|
|
27
|
+
"1": "Very difficult to read. No clear structure or naming convention."
|
|
28
|
+
}
|
|
29
|
+
},
|
|
30
|
+
{
|
|
31
|
+
"name": "maintainability",
|
|
32
|
+
"weight": 0.15,
|
|
33
|
+
"description": "Code is modular, follows SOLID principles, and is easy to extend.",
|
|
34
|
+
"scoring": {
|
|
35
|
+
"5": "Well-decomposed into small functions/classes. Clear separation of concerns. Easy to extend.",
|
|
36
|
+
"4": "Good structure with minor coupling issues.",
|
|
37
|
+
"3": "Reasonable structure but some tight coupling or large functions.",
|
|
38
|
+
"2": "Monolithic or tightly coupled. Changes would require touching many files.",
|
|
39
|
+
"1": "No modularity. Everything in one function or deeply entangled."
|
|
40
|
+
}
|
|
41
|
+
},
|
|
42
|
+
{
|
|
43
|
+
"name": "error_handling",
|
|
44
|
+
"weight": 0.15,
|
|
45
|
+
"description": "Errors are handled gracefully without swallowing, leaking, or crashing.",
|
|
46
|
+
"scoring": {
|
|
47
|
+
"5": "All error paths handled with appropriate actions. No swallowed errors. Clear error propagation.",
|
|
48
|
+
"4": "Most errors handled. Minor gaps in edge case error handling.",
|
|
49
|
+
"3": "Common errors handled. Some paths may swallow or ignore errors.",
|
|
50
|
+
"2": "Inconsistent error handling. Some unhandled exceptions.",
|
|
51
|
+
"1": "Minimal or no error handling."
|
|
52
|
+
}
|
|
53
|
+
},
|
|
54
|
+
{
|
|
55
|
+
"name": "testing",
|
|
56
|
+
"weight": 0.15,
|
|
57
|
+
"description": "Code is accompanied by meaningful tests.",
|
|
58
|
+
"scoring": {
|
|
59
|
+
"5": "Comprehensive tests covering happy path, errors, and edge cases. Tests are readable and maintainable.",
|
|
60
|
+
"4": "Good test coverage with minor gaps in edge cases.",
|
|
61
|
+
"3": "Basic tests covering happy path and some error cases.",
|
|
62
|
+
"2": "Minimal tests. Only happy path or superficial assertions.",
|
|
63
|
+
"1": "No tests."
|
|
64
|
+
}
|
|
65
|
+
},
|
|
66
|
+
{
|
|
67
|
+
"name": "performance",
|
|
68
|
+
"weight": 0.10,
|
|
69
|
+
"description": "Code does not introduce unnecessary performance issues.",
|
|
70
|
+
"scoring": {
|
|
71
|
+
"5": "Efficient algorithms and data structures. No N+1 queries, unnecessary allocations, or blocking calls.",
|
|
72
|
+
"4": "Generally efficient with minor optimization opportunities.",
|
|
73
|
+
"3": "Acceptable performance but some inefficiencies (e.g., redundant computations).",
|
|
74
|
+
"2": "Noticeable performance issues that would affect production.",
|
|
75
|
+
"1": "Severe performance problems (infinite loops, O(n^3) where O(n) suffices, etc.)."
|
|
76
|
+
}
|
|
77
|
+
},
|
|
78
|
+
{
|
|
79
|
+
"name": "documentation",
|
|
80
|
+
"weight": 0.05,
|
|
81
|
+
"description": "Public interfaces are documented; complex logic has explanatory comments.",
|
|
82
|
+
"scoring": {
|
|
83
|
+
"5": "All public functions/methods documented. Complex logic explained. No obvious missing docs.",
|
|
84
|
+
"4": "Most public interfaces documented. Minor gaps.",
|
|
85
|
+
"3": "Some documentation present but incomplete.",
|
|
86
|
+
"2": "Minimal documentation.",
|
|
87
|
+
"1": "No documentation."
|
|
88
|
+
}
|
|
89
|
+
}
|
|
90
|
+
],
|
|
91
|
+
"pass_threshold": 0.82,
|
|
92
|
+
"output_format": {
|
|
93
|
+
"overall_score": "float 0-1 (weighted average of criteria scores normalized to 0-1)",
|
|
94
|
+
"criteria_scores": "object mapping criteria name to { score: 1-5, reasoning: string }",
|
|
95
|
+
"pass": "boolean",
|
|
96
|
+
"summary": "string"
|
|
97
|
+
}
|
|
98
|
+
}
|
|
@@ -0,0 +1,145 @@
|
|
|
1
|
+
{
|
|
2
|
+
"rubric_name": "security-compliance",
|
|
3
|
+
"version": "2.0.0",
|
|
4
|
+
"description": "Evaluates code for security best practices and compliance requirements. Aligned with AWS AI-DLC Security Baseline (SECURITY-01 through SECURITY-08).",
|
|
5
|
+
"criteria": [
|
|
6
|
+
{
|
|
7
|
+
"name": "authentication_authorization",
|
|
8
|
+
"weight": 0.16,
|
|
9
|
+
"description": "Access control is properly implemented and enforced.",
|
|
10
|
+
"references": ["SECURITY-01"],
|
|
11
|
+
"scoring": {
|
|
12
|
+
"5": "All endpoints enforce authentication. Authorization checks verify resource ownership. No privilege escalation paths.",
|
|
13
|
+
"4": "Auth properly implemented with minor gaps in edge case authorization.",
|
|
14
|
+
"3": "Basic auth present but some endpoints or operations lack proper authorization checks.",
|
|
15
|
+
"2": "Auth inconsistently applied. Some endpoints accessible without proper credentials.",
|
|
16
|
+
"1": "No authentication or authorization. Resources accessible by anyone."
|
|
17
|
+
}
|
|
18
|
+
},
|
|
19
|
+
{
|
|
20
|
+
"name": "injection_prevention",
|
|
21
|
+
"weight": 0.16,
|
|
22
|
+
"description": "Code is protected against injection attacks (SQL, NoSQL, command, XSS). All API parameters validated for type, length, and format.",
|
|
23
|
+
"references": ["SECURITY-05"],
|
|
24
|
+
"scoring": {
|
|
25
|
+
"5": "All queries use parameterized statements. All user input validated (type, length, format) and sanitized for output context. Request body size limits configured.",
|
|
26
|
+
"4": "Parameterized queries used consistently. Input validation present with minor gaps in format checking.",
|
|
27
|
+
"3": "Most queries parameterized but some string interpolation present. Inconsistent input validation.",
|
|
28
|
+
"2": "Inconsistent use of parameterized queries. Missing input validation on some endpoints.",
|
|
29
|
+
"1": "Raw string concatenation in queries or commands. No input validation."
|
|
30
|
+
}
|
|
31
|
+
},
|
|
32
|
+
{
|
|
33
|
+
"name": "secret_management",
|
|
34
|
+
"weight": 0.15,
|
|
35
|
+
"description": "Secrets are stored securely and never exposed in code, logs, or errors.",
|
|
36
|
+
"references": ["SECURITY-03", "SECURITY-07"],
|
|
37
|
+
"scoring": {
|
|
38
|
+
"5": "No secrets in code. Credentials from Secrets Manager/SSM. Secrets excluded from logs and error responses. No PII in log output.",
|
|
39
|
+
"4": "Secrets properly managed with minor logging gaps.",
|
|
40
|
+
"3": "Secrets not in code but may appear in verbose logs or error messages.",
|
|
41
|
+
"2": "Some secrets hard-coded or in configuration files.",
|
|
42
|
+
"1": "Secrets visible in source code, logs, or error responses."
|
|
43
|
+
}
|
|
44
|
+
},
|
|
45
|
+
{
|
|
46
|
+
"name": "data_protection",
|
|
47
|
+
"weight": 0.15,
|
|
48
|
+
"description": "Sensitive data is encrypted in transit (TLS 1.2+) and at rest (KMS). PII is handled per policy.",
|
|
49
|
+
"references": ["SECURITY-01"],
|
|
50
|
+
"scoring": {
|
|
51
|
+
"5": "All data encrypted in transit (TLS 1.2+). Sensitive data encrypted at rest with KMS. PII identified and handled per policy. No unnecessary data retention. Object storage rejects non-TLS requests.",
|
|
52
|
+
"4": "Encryption properly configured. Minor gaps in PII handling or data retention.",
|
|
53
|
+
"3": "Basic encryption present. PII handling incomplete. Some stores missing encryption config.",
|
|
54
|
+
"2": "Encryption inconsistent. Some sensitive data unprotected.",
|
|
55
|
+
"1": "No encryption. Sensitive data stored or transmitted in plaintext."
|
|
56
|
+
}
|
|
57
|
+
},
|
|
58
|
+
{
|
|
59
|
+
"name": "least_privilege",
|
|
60
|
+
"weight": 0.10,
|
|
61
|
+
"description": "IAM roles and permissions follow least-privilege principle with specific resource ARNs and actions.",
|
|
62
|
+
"references": ["SECURITY-06"],
|
|
63
|
+
"scoring": {
|
|
64
|
+
"5": "All IAM policies scoped to specific resources and actions. No wildcard permissions. Roles are service-specific. Read and write permissions separated.",
|
|
65
|
+
"4": "Mostly least-privilege. Minor over-permission in non-critical areas.",
|
|
66
|
+
"3": "Some wildcard actions or broad resource scopes present. No documented exceptions.",
|
|
67
|
+
"2": "Over-permissive policies. Wildcard actions on broad resource sets.",
|
|
68
|
+
"1": "Admin-level permissions or *:* policies."
|
|
69
|
+
}
|
|
70
|
+
},
|
|
71
|
+
{
|
|
72
|
+
"name": "error_information_leakage",
|
|
73
|
+
"weight": 0.07,
|
|
74
|
+
"description": "Error responses do not leak internal implementation details, stack traces, or technology identifiers.",
|
|
75
|
+
"references": ["SECURITY-07"],
|
|
76
|
+
"scoring": {
|
|
77
|
+
"5": "All errors return generic messages to clients. Stack traces, file paths, version numbers, and internal details logged server-side only. No technology identifiers in responses.",
|
|
78
|
+
"4": "Errors generally safe. Minor information leakage in edge cases.",
|
|
79
|
+
"3": "Some error responses include internal details like file paths or library names.",
|
|
80
|
+
"2": "Stack traces or database errors exposed to clients in some cases.",
|
|
81
|
+
"1": "Detailed stack traces, SQL errors, or internal paths exposed to clients."
|
|
82
|
+
}
|
|
83
|
+
},
|
|
84
|
+
{
|
|
85
|
+
"name": "http_security_headers",
|
|
86
|
+
"weight": 0.04,
|
|
87
|
+
"description": "HTTP security headers are set on all HTML-serving endpoints (CSP, HSTS, X-Content-Type-Options, X-Frame-Options, Referrer-Policy).",
|
|
88
|
+
"references": ["SECURITY-04"],
|
|
89
|
+
"scoring": {
|
|
90
|
+
"5": "All required headers set: CSP (no inline or eval directives), HSTS (max-age >= 31536000), X-Content-Type-Options: nosniff, X-Frame-Options: DENY, Referrer-Policy: strict-origin-when-cross-origin.",
|
|
91
|
+
"4": "Most headers set. Minor gaps in CSP strictness.",
|
|
92
|
+
"3": "Some security headers present. CSP or HSTS missing.",
|
|
93
|
+
"2": "Only 1-2 security headers configured.",
|
|
94
|
+
"1": "No security headers set."
|
|
95
|
+
}
|
|
96
|
+
},
|
|
97
|
+
{
|
|
98
|
+
"name": "structured_logging",
|
|
99
|
+
"weight": 0.04,
|
|
100
|
+
"description": "Application uses structured logging with correlation IDs, directed to a centralized log service.",
|
|
101
|
+
"references": ["SECURITY-02", "SECURITY-03"],
|
|
102
|
+
"scoring": {
|
|
103
|
+
"5": "Structured logging framework configured. Every log entry has timestamp, correlation ID, log level. Output directed to CloudWatch/centralized service. Access logging enabled on all network intermediaries.",
|
|
104
|
+
"4": "Structured logging present. Minor gaps in correlation ID propagation.",
|
|
105
|
+
"3": "Logging present but not structured. Missing correlation IDs or centralized routing.",
|
|
106
|
+
"2": "Ad-hoc console.log/print used as primary logging. No centralized routing.",
|
|
107
|
+
"1": "No logging infrastructure. Ad-hoc print statements only."
|
|
108
|
+
}
|
|
109
|
+
},
|
|
110
|
+
{
|
|
111
|
+
"name": "dependency_security",
|
|
112
|
+
"weight": 0.04,
|
|
113
|
+
"description": "No known vulnerable dependencies. Versions pinned. Unused dependencies removed.",
|
|
114
|
+
"references": ["SECURITY-08"],
|
|
115
|
+
"scoring": {
|
|
116
|
+
"5": "No known vulnerabilities in dependencies. All versions pinned in lockfile. Unused dependencies removed. Transitive deps reviewed.",
|
|
117
|
+
"4": "No critical vulnerabilities. Versions mostly pinned. Minor unused deps remain.",
|
|
118
|
+
"3": "Some moderate vulnerabilities present. Lockfile exists but not all versions pinned.",
|
|
119
|
+
"2": "Known vulnerabilities in dependencies. No lockfile or unpinned versions.",
|
|
120
|
+
"1": "Critical vulnerabilities in dependencies. No dependency management."
|
|
121
|
+
}
|
|
122
|
+
},
|
|
123
|
+
{
|
|
124
|
+
"name": "security_agent_findings",
|
|
125
|
+
"weight": 0.09,
|
|
126
|
+
"description": "No unresolved Critical/High findings from AWS Security Agent on this codebase.",
|
|
127
|
+
"references": ["SECURITY-09"],
|
|
128
|
+
"scoring": {
|
|
129
|
+
"5": "No open findings from Security Agent. All historical findings remediated within SLA.",
|
|
130
|
+
"4": "No Critical/High findings. Minor (LOW/MEDIUM) findings with active remediation plans.",
|
|
131
|
+
"3": "Open HIGH findings with remediation in progress. No Critical.",
|
|
132
|
+
"2": "Open CRITICAL findings or HIGH findings past remediation SLA.",
|
|
133
|
+
"1": "Multiple CRITICAL findings or validated pen test exploits unresolved."
|
|
134
|
+
}
|
|
135
|
+
}
|
|
136
|
+
],
|
|
137
|
+
"pass_threshold": 0.82,
|
|
138
|
+
"output_format": {
|
|
139
|
+
"overall_score": "float 0-1 (weighted average of criteria scores normalized to 0-1)",
|
|
140
|
+
"criteria_scores": "object mapping criteria name to { score: 1-5, reasoning: string }",
|
|
141
|
+
"pass": "boolean",
|
|
142
|
+
"summary": "string",
|
|
143
|
+
"security_findings": "array of { rule_id: string, severity: string, description: string }"
|
|
144
|
+
}
|
|
145
|
+
}
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
{
|
|
2
|
+
"rubric_name": "spec-compliance",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"description": "Evaluates whether generated code faithfully implements the requirements defined in a spec.",
|
|
5
|
+
"criteria": [
|
|
6
|
+
{
|
|
7
|
+
"name": "requirement_coverage",
|
|
8
|
+
"weight": 0.35,
|
|
9
|
+
"description": "All requirements and acceptance criteria from the spec are implemented.",
|
|
10
|
+
"scoring": {
|
|
11
|
+
"5": "Every requirement and acceptance criterion is implemented. No gaps.",
|
|
12
|
+
"4": "All major requirements implemented. Minor acceptance criteria partially addressed.",
|
|
13
|
+
"3": "Most requirements implemented. Some acceptance criteria missing.",
|
|
14
|
+
"2": "Significant requirements missing. Only happy path covered.",
|
|
15
|
+
"1": "Most requirements unimplemented or fundamentally misunderstood."
|
|
16
|
+
},
|
|
17
|
+
"requires_spec": true
|
|
18
|
+
},
|
|
19
|
+
{
|
|
20
|
+
"name": "interface_adherence",
|
|
21
|
+
"weight": 0.25,
|
|
22
|
+
"description": "Code matches the interfaces, signatures, and contracts defined in the spec.",
|
|
23
|
+
"scoring": {
|
|
24
|
+
"5": "All function signatures, API contracts, and data structures exactly match the spec.",
|
|
25
|
+
"4": "Interfaces match with minor naming or type differences that don't break contracts.",
|
|
26
|
+
"3": "Most interfaces match. Some deviations from spec'd contracts.",
|
|
27
|
+
"2": "Significant deviations from specified interfaces.",
|
|
28
|
+
"1": "Interfaces bear little resemblance to the spec."
|
|
29
|
+
},
|
|
30
|
+
"requires_spec": true
|
|
31
|
+
},
|
|
32
|
+
{
|
|
33
|
+
"name": "edge_case_handling",
|
|
34
|
+
"weight": 0.20,
|
|
35
|
+
"description": "Edge cases and error scenarios described in the spec are handled.",
|
|
36
|
+
"scoring": {
|
|
37
|
+
"5": "All edge cases and error scenarios from the spec are handled gracefully.",
|
|
38
|
+
"4": "Most edge cases handled. Minor spec-defined scenarios missed.",
|
|
39
|
+
"3": "Common edge cases handled. Some spec-defined error paths missing.",
|
|
40
|
+
"2": "Only happy path. Most spec-defined edge cases unhandled.",
|
|
41
|
+
"1": "No edge case handling despite spec requirements."
|
|
42
|
+
},
|
|
43
|
+
"requires_spec": true
|
|
44
|
+
},
|
|
45
|
+
{
|
|
46
|
+
"name": "spec_intent_fidelity",
|
|
47
|
+
"weight": 0.20,
|
|
48
|
+
"description": "Implementation captures the intent and spirit of the spec, not just the letter.",
|
|
49
|
+
"scoring": {
|
|
50
|
+
"5": "Implementation fully captures the spec's intent, including implied behaviors and user expectations.",
|
|
51
|
+
"4": "Captures intent well. Minor misalignment with spec's broader goals.",
|
|
52
|
+
"3": "Technically meets requirements but misses some of the spec's intent.",
|
|
53
|
+
"2": "Follows spec literally but misses the underlying purpose.",
|
|
54
|
+
"1": "Implementation contradicts the spec's intent."
|
|
55
|
+
},
|
|
56
|
+
"requires_spec": true
|
|
57
|
+
}
|
|
58
|
+
],
|
|
59
|
+
"pass_threshold": 0.82,
|
|
60
|
+
"output_format": {
|
|
61
|
+
"overall_score": "float 0-1 (weighted average of criteria scores normalized to 0-1)",
|
|
62
|
+
"criteria_scores": "object mapping criteria name to { score: 1-5, reasoning: string }",
|
|
63
|
+
"pass": "boolean",
|
|
64
|
+
"summary": "string",
|
|
65
|
+
"spec_gaps": "array of strings describing unimplemented requirements"
|
|
66
|
+
}
|
|
67
|
+
}
|