@prism-d1/cli 1.0.27 → 1.0.29

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. package/dist/assets/eval-harness/README.md +114 -0
  2. package/dist/assets/eval-harness/eval-config.json +10 -0
  3. package/dist/assets/eval-harness/rubrics/agent-quality.json +79 -0
  4. package/dist/assets/eval-harness/rubrics/api-response-quality.json +45 -0
  5. package/dist/assets/eval-harness/rubrics/code-quality.json +98 -0
  6. package/dist/assets/eval-harness/rubrics/security-compliance.json +145 -0
  7. package/dist/assets/eval-harness/rubrics/spec-compliance.json +67 -0
  8. package/dist/assets/eval-harness/run-eval.sh +122 -0
  9. package/dist/assets/github-workflows/README.md +110 -0
  10. package/dist/assets/github-workflows/prism-agent-eval.yml +313 -0
  11. package/dist/assets/github-workflows/prism-ai-metrics.yml +261 -0
  12. package/dist/assets/github-workflows/prism-dora-weekly.yml +334 -0
  13. package/dist/assets/github-workflows/prism-eval-gate.yml +310 -0
  14. package/dist/assets/infra/bin/app.ts +56 -0
  15. package/dist/assets/infra/cdk.json +12 -0
  16. package/dist/assets/infra/lib/api-stack.ts +347 -0
  17. package/dist/assets/infra/lib/constructs/bedrock-guardrail-construct.ts +201 -0
  18. package/dist/assets/infra/lib/constructs/guardrail-enforcer-construct.ts +59 -0
  19. package/dist/assets/infra/lib/constructs/prism-vpc-construct.ts +75 -0
  20. package/dist/assets/infra/lib/constructs/security-agent-construct.ts +266 -0
  21. package/dist/assets/infra/lib/dashboard-stack.ts +1392 -0
  22. package/dist/assets/infra/lib/lambda/api-handler.ts +477 -0
  23. package/dist/assets/infra/lib/lambda/defect-correlator.ts +142 -0
  24. package/dist/assets/infra/lib/lambda/exfiltration-detector.ts +100 -0
  25. package/dist/assets/infra/lib/lambda/layers/guardrail-enforcer/nodejs/guardrail-enforcer.js +53 -0
  26. package/dist/assets/infra/lib/lambda/metrics-processor.ts +748 -0
  27. package/dist/assets/infra/lib/lambda/security-agent-processor.ts +231 -0
  28. package/dist/assets/infra/lib/lambda/security-remediation-tracker.ts +120 -0
  29. package/dist/assets/infra/lib/lambda/security-response-automator.ts +130 -0
  30. package/dist/assets/infra/lib/lambda/spec-to-code-calculator.ts +123 -0
  31. package/dist/assets/infra/lib/metrics-pipeline-stack.ts +701 -0
  32. package/dist/assets/infra/package.json +23 -0
  33. package/dist/assets/infra/tsconfig.json +24 -0
  34. package/dist/src/commands/bootstrapper/install-eval-harness.d.ts.map +1 -1
  35. package/dist/src/commands/bootstrapper/install-eval-harness.js +3 -4
  36. package/dist/src/commands/bootstrapper/install-eval-harness.js.map +1 -1
  37. package/dist/src/commands/bootstrapper/install-git-hooks.d.ts.map +1 -1
  38. package/dist/src/commands/bootstrapper/install-git-hooks.js +2 -5
  39. package/dist/src/commands/bootstrapper/install-git-hooks.js.map +1 -1
  40. package/dist/src/commands/securityagent/setup.d.ts.map +1 -1
  41. package/dist/src/commands/securityagent/setup.js +2 -3
  42. package/dist/src/commands/securityagent/setup.js.map +1 -1
  43. package/dist/src/commands/workshop/deploy-infra.d.ts.map +1 -1
  44. package/dist/src/commands/workshop/deploy-infra.js +2 -3
  45. package/dist/src/commands/workshop/deploy-infra.js.map +1 -1
  46. package/dist/src/commands/workshop/generate-demo-data.d.ts.map +1 -1
  47. package/dist/src/commands/workshop/generate-demo-data.js +3 -8
  48. package/dist/src/commands/workshop/generate-demo-data.js.map +1 -1
  49. package/dist/src/commands/workshop/perform-pen-test.d.ts.map +1 -1
  50. package/dist/src/commands/workshop/perform-pen-test.js +5 -14
  51. package/dist/src/commands/workshop/perform-pen-test.js.map +1 -1
  52. package/dist/src/utils/root.d.ts +6 -0
  53. package/dist/src/utils/root.d.ts.map +1 -1
  54. package/dist/src/utils/root.js +29 -0
  55. package/dist/src/utils/root.js.map +1 -1
  56. package/package.json +2 -2
@@ -0,0 +1,122 @@
1
+ #!/usr/bin/env bash
2
+ # run-eval.sh — Evaluate a single file against a rubric using Bedrock.
3
+ #
4
+ # Usage: ./run-eval.sh <rubric-file> <input-file> [--spec <spec-file>]
5
+ #
6
+ # Output (stdout, parsed by workflow):
7
+ # Score: <0-1>
8
+ # Result: PASS|FAIL|SKIP (followed by a "Hallucinations: <count>" line)
9
+ #
10
+ # Exit: 0=pass or skip, 1=fail, 2=error
11
+
12
+ set -euo pipefail
13
+
14
+ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
15
+ CONFIG_FILE="${SCRIPT_DIR}/eval-config.json"
16
+
17
+ # --- Args ---
18
+ RUBRIC_FILE=""
19
+ INPUT_FILE=""
20
+ SPEC_FILE=""
21
+
22
+ while [[ $# -gt 0 ]]; do
23
+ case "$1" in
24
+ --spec) [[ $# -ge 2 ]] || { echo "Error: --spec requires a file argument" >&2; exit 2; }; SPEC_FILE="$2"; shift 2 ;; # a bare --spec would otherwise trip set -u/shift and exit 1, which callers read as an eval FAIL
25
+ *) if [[ -z "${RUBRIC_FILE}" ]]; then RUBRIC_FILE="$1"
26
+ elif [[ -z "${INPUT_FILE}" ]]; then INPUT_FILE="$1"
27
+ fi; shift ;;
28
+ esac
29
+ done
30
+
31
+ [[ -f "${RUBRIC_FILE}" ]] || { echo "Error: rubric not found: ${RUBRIC_FILE}" >&2; exit 2; }
32
+ [[ -f "${INPUT_FILE}" ]] || { echo "Error: file not found: ${INPUT_FILE}" >&2; exit 2; }
33
+ [[ -f "${CONFIG_FILE}" ]] || { echo "Error: eval-config.json not found" >&2; exit 2; }
34
+
35
+ # --- Config ---
36
+ EVAL_MODEL=$(jq -r '.eval_model_id' "${CONFIG_FILE}")
37
+ PASS_THRESHOLD=$(jq -r '.pass_threshold' "${CONFIG_FILE}")
38
+ AWS_REGION=$(jq -r '.aws_region' "${CONFIG_FILE}")
39
+
40
+ RUBRIC_NAME=$(jq -r '.rubric_name' "${RUBRIC_FILE}")
41
+ CODE_CONTENT=$(cat "${INPUT_FILE}")
42
+
43
+ # --- Filter criteria: skip requires_spec criteria when no spec provided ---
44
+ CRITERIA_FILTER='if $spec == "" then .criteria | map(select(.requires_spec != true)) else .criteria end'
45
+ ACTIVE_CRITERIA=$(jq --arg spec "${SPEC_FILE}" "${CRITERIA_FILTER}" "${RUBRIC_FILE}")
46
+ ACTIVE_COUNT=$(echo "${ACTIVE_CRITERIA}" | jq 'length')
47
+
48
+ if [[ "${ACTIVE_COUNT}" -eq 0 ]]; then
49
+ echo "Skipped: ${RUBRIC_NAME} (all criteria require spec)"
50
+ echo "Score: 0"
51
+ echo "Result: SKIP"
52
+ echo "Hallucinations: 0"
53
+ exit 0
54
+ fi
55
+
56
+ # --- Build rubric text for prompt ---
57
+ RUBRIC_CRITERIA=$(echo "${ACTIVE_CRITERIA}" | jq -r '.[] | "- \(.name) [weight=\(.weight)]: \(.description)\n Scoring: \(.scoring | if type == "object" then to_entries | map("\(.key): \(.value)") | join("; ") else . end)"')
58
+
59
+ SPEC_SECTION="No spec provided. Evaluate based on code quality criteria only."
60
+ if [[ -n "${SPEC_FILE}" && -f "${SPEC_FILE}" ]]; then
61
+ SPEC_SECTION=$(cat "${SPEC_FILE}")
62
+ fi
63
+
64
+ # --- Prompt (ask for per-criterion scores, we calculate weighted average) ---
65
+ EVAL_PROMPT="You are a code quality evaluator. Evaluate the following code against the rubric criteria.
66
+
67
+ ## Spec
68
+ ${SPEC_SECTION}
69
+
70
+ ## Code Under Evaluation
71
+ --- FILE: ${INPUT_FILE} ---
72
+ ${CODE_CONTENT}
73
+
74
+ ## Rubric Criteria
75
+ ${RUBRIC_CRITERIA}
76
+
77
+ Respond in this exact JSON format (no other text):
78
+ {\"evaluations\": [{\"criterion\": \"<name>\", \"score\": <0.0-1.0>, \"rationale\": \"<brief>\"}]}"
79
+
80
+ # --- Call Bedrock (CLI stdout metadata is discarded; stderr is kept so AWS errors reach the caller) ---
81
+ BODY_FILE=$(mktemp)
82
+ RESP_FILE=$(mktemp)
83
+ trap 'rm -f "${BODY_FILE}" "${RESP_FILE}"' EXIT
84
+
85
+ jq -n --arg p "${EVAL_PROMPT}" \
86
+ '{anthropic_version:"bedrock-2023-05-31",max_tokens:2000,messages:[{role:"user",content:$p}]}' > "${BODY_FILE}"
87
+
88
+ aws bedrock-runtime invoke-model \
89
+ --region "${AWS_REGION}" \
90
+ --model-id "${EVAL_MODEL}" \
91
+ --content-type "application/json" \
92
+ --accept "application/json" \
93
+ --body "fileb://${BODY_FILE}" \
94
+ "${RESP_FILE}" >/dev/null || { echo "Error: Bedrock invoke failed" >&2; exit 2; }
95
+
96
+ # --- Parse response ---
97
+ EVAL_JSON=$(jq -r '.content[0].text' "${RESP_FILE}" 2>/dev/null | sed -n '/^{/,/^}/p') || true
98
+ [[ -n "${EVAL_JSON}" ]] || { echo "Error: could not parse model response" >&2; exit 2; }
99
+
100
+ # --- Calculate weighted score (client-side, don't trust LLM math) ---
101
+ OVERALL=$(echo "${EVAL_JSON}" | jq --argjson criteria "${ACTIVE_CRITERIA}" '
102
+ ($criteria | map(.weight) | add) as $total_weight |
103
+ [.evaluations[] as $e |
104
+ ($criteria[] | select(.name == $e.criterion)) as $c |
105
+ ($e.score * $c.weight)
106
+ ] | (add // 0) / $total_weight')
107
+
108
+ # --- Detect hallucinations (keyword match on each rationale; a missing or non-string rationale counts as no match) ---
109
+ HALLUCINATIONS=$(echo "${EVAL_JSON}" | jq '[.evaluations[] | select((.rationale // "") | tostring | test("hallucinated|does not exist|not found"; "i"))] | length')
110
+
111
+ # --- Output (workflow parses these lines) ---
112
+ echo "${EVAL_JSON}" | jq -r '.evaluations[] | "\(.criterion): \(.score) — \(.rationale)"'
113
+ echo ""
114
+ printf "Score: %.4f\n" "${OVERALL}"
115
+ echo "Result: $(echo "${OVERALL} >= ${PASS_THRESHOLD}" | bc -l | grep -q '^1' && echo "PASS" || echo "FAIL")"
116
+ echo "Hallucinations: ${HALLUCINATIONS}"
117
+
118
+ # --- Exit ---
119
+ if (( $(echo "${OVERALL} < ${PASS_THRESHOLD}" | bc -l) )); then
120
+ exit 1
121
+ fi
122
+ exit 0
@@ -0,0 +1,110 @@
1
+ # GitHub Actions Workflows
2
+
3
+ Reusable GitHub Actions workflows for PRISM D1 Velocity metric collection.
4
+
5
+ ## Workflows
6
+
7
+ | Workflow | Trigger | Purpose |
8
+ |---|---|---|
9
+ | `prism-ai-metrics.yml` | PR merge to main | Calculates AI-to-merge ratio, token usage, lead time. Emits `prism.d1.pr` + `prism.d1.deploy` events |
10
+ | `prism-eval-gate.yml` | PR open/update | Evaluates AI-generated code per-file with auto-selected rubrics, waits for Security Agent, blocks merge on failure |
11
+ | `prism-agent-eval.yml` | PR modifying agent code | Runs agent in mock mode, evaluates output with agent-quality rubric |
12
+ | `prism-dora-weekly.yml` | Weekly (Monday 09:00 UTC) | Calculates DORA + AI-DORA metrics, emits to EventBridge + CloudWatch |
13
+
14
+ ## Setup
15
+
16
+ ### 1. Configure AWS OIDC
17
+
18
+ ```bash
19
+ bash prism-cli bootstrapper setup-github-oidc
20
+ ```
21
+
22
+ This interactively creates:
23
+ - OIDC identity provider for `token.actions.githubusercontent.com`
24
+ - IAM role `GitHubActions-<repo>` with trust policy scoped to your repo
25
+ - Inline policy with `events:PutEvents` and `bedrock:InvokeModel`
26
+
27
+ For the weekly workflow, manually add `cloudwatch:PutMetricData` to the role policy.
28
+
29
+ ### 2. Set Repository Secret
30
+
31
+ In GitHub: Settings → Secrets and variables → Actions → New repository secret:
32
+
33
+ | Secret | Value |
34
+ |---|---|
35
+ | `PRISM_METRICS_ROLE_ARN` | ARN printed by `setup-github-oidc` |
36
+
37
+ ### 3. Install Git Hooks + Config
38
+
39
+ ```bash
40
+ bash prism-cli bootstrapper install-git-hooks
41
+ ```
42
+
43
+ Creates `.prism/config.json` with your team ID (read by all workflows).
44
+
45
+ ### 4. Install Eval Harness
46
+
47
+ ```bash
48
+ # Workshop mode — bring your own rubric
49
+ bash prism-cli bootstrapper install-eval-harness
50
+
51
+ # Production mode — includes all 5 rubrics
52
+ bash prism-cli bootstrapper install-eval-harness --with-rubrics
53
+ ```
54
+
55
+ This copies `.prism/eval-harness/`, `eval-config.json`, rubrics, and the `prism-eval-gate.yml` workflow.
56
+
57
+ ### 5. Copy Remaining Workflows
58
+
59
+ ```bash
60
+ mkdir -p .github/workflows
61
+ cp bootstrapper/github-workflows/prism-ai-metrics.yml .github/workflows/
62
+ cp bootstrapper/github-workflows/prism-dora-weekly.yml .github/workflows/
63
+ # Optional — only if you have agents with --mock support:
64
+ cp bootstrapper/github-workflows/prism-agent-eval.yml .github/workflows/
65
+ ```
66
+
67
+ ## IAM Permissions
68
+
69
+ The OIDC role needs:
70
+
71
+ | Permission | Used by |
72
+ |---|---|
73
+ | `events:PutEvents` | All workflows |
74
+ | `bedrock:InvokeModel` | eval-gate, agent-eval |
75
+ | `cloudwatch:PutMetricData` | dora-weekly |
76
+
77
+ ## Customization
78
+
79
+ | Setting | How |
80
+ |---|---|
81
+ | Branch | Edit `branches` in each workflow |
82
+ | AWS region | Edit `aws-region` field + EventBridge commands |
83
+ | Eval threshold | Edit `.prism/eval-harness/eval-config.json` → `pass_threshold` |
84
+ | Eval model | Edit `.prism/eval-harness/eval-config.json` → `eval_model_id` |
85
+ | Weekly schedule | Edit cron in `prism-dora-weekly.yml` (default: `0 9 * * 1`) |
86
+
87
+ ## Events Emitted
88
+
89
+ | Detail Type | Source Workflow | Destination |
90
+ |---|---|---|
91
+ | `prism.d1.pr` | ai-metrics | EventBridge |
92
+ | `prism.d1.deploy` | ai-metrics | EventBridge |
93
+ | `prism.d1.eval` | eval-gate | EventBridge |
94
+ | `prism.d1.agent.eval` | agent-eval | EventBridge |
95
+ | `prism.d1.assessment` | dora-weekly | EventBridge |
96
+ | `prism.d1.security.code_review` | eval-gate (Security Agent) | EventBridge |
97
+ | `AIAdoptionRate`, `SpecCoverage`, tool counts | dora-weekly | CloudWatch |
98
+
99
+ All EventBridge events use source `prism.d1.velocity` and bus `prism-d1-metrics`.
100
+
101
+ ## Troubleshooting
102
+
103
+ | Issue | Solution |
104
+ |---|---|
105
+ | OIDC auth fails | Verify trust policy `sub` matches `repo:org/repo:*` |
106
+ | EventBridge put fails | Check `events:PutEvents` on bus ARN |
107
+ | Eval gate always skips | Ensure commits have `AI-Origin:` trailers (install git hooks) |
108
+ | Weekly not running | Workflow must exist on default branch; test with `workflow_dispatch` |
109
+ | Agent mock run skipped | No entry point found at `agent/main.py`, `agents/main.py`, or `agent.py` — add one that supports `--mock` |
110
+ | Security Agent timeout | Agent takes 2+ min to start; workflow waits up to 12 min total |
@@ -0,0 +1,313 @@
1
+ # prism-agent-eval.yml -- Evaluate agent outputs on PRs that modify agent code.
2
+ #
3
+ # Copy this file to .github/workflows/prism-agent-eval.yml in your repo.
4
+ #
5
+ # Required setup:
6
+ # - OIDC provider configured for GitHub Actions in your AWS account
7
+ # - IAM role with bedrock:InvokeModel and events:PutEvents permissions
8
+ # - Repository secret PRISM_METRICS_ROLE_ARN set to the OIDC role ARN
9
+ # - .prism/config.json committed to the repo (created by: bash prism-cli bootstrapper install-git-hooks)
10
+ # - .prism/eval-harness/ installed (run: bash prism-cli bootstrapper install-eval-harness)
11
+ # - (Optional) Agent entry point supports --mock flag for dry-run execution
12
+
13
+ name: PRISM Agent Eval
14
+
15
+ on:
16
+ pull_request:
17
+ types: [opened, synchronize, reopened]
18
+ branches: [main, master]
19
+ paths:
20
+ - 'agent/**'
21
+ - 'agents/**'
22
+ - 'mcp-servers/**'
23
+ - '**/agent*.py'
24
+ - '**/agent*.ts'
25
+
26
+ permissions:
27
+ id-token: write
28
+ contents: read
29
+ pull-requests: write
30
+
31
+ jobs:
32
+ agent-eval:
33
+ runs-on: ubuntu-latest
34
+ steps:
35
+ - name: Checkout
36
+ uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
37
+ with:
38
+ fetch-depth: 0
39
+
40
+ - name: Configure AWS credentials (OIDC)
41
+ uses: aws-actions/configure-aws-credentials@b47578312673ae6fa5b5096b330d9fbac3d116df # v4.2.1
42
+ with:
43
+ role-to-assume: ${{ secrets.PRISM_METRICS_ROLE_ARN }}
44
+ aws-region: us-west-2
45
+
46
+ - name: Read team ID from .prism/config.json
47
+ id: prism-config
48
+ run: |
49
+ if [[ -f .prism/config.json ]]; then
50
+ TEAM_ID=$(jq -r '.team_id' .prism/config.json)
51
+ else # config missing: point at the same install command the setup notes in this file's header use
52
+ echo "::error::.prism/config.json not found. Run: bash prism-cli bootstrapper install-git-hooks"
53
+ exit 1
54
+ fi
55
+ echo "team_id=${TEAM_ID}" >> "$GITHUB_OUTPUT"
56
+
57
+ - name: Set up Python
58
+ uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
59
+ with:
60
+ python-version: '3.12'
61
+
62
+ - name: Install dependencies
63
+ run: |
64
+ sudo apt-get update -qq && sudo apt-get install -y -qq jq bc
65
+ pip install --quiet strands-agents strands-agents-tools requests
66
+
67
+ - name: Identify agent changes
68
+ id: detect-agent
69
+ run: |
70
+ BASE_SHA="${{ github.event.pull_request.base.sha }}"
71
+ HEAD_SHA="${{ github.sha }}"
72
+
73
+ # Find changed files in agent-related paths
74
+ AGENT_FILES=$(git diff --name-only "${BASE_SHA}..${HEAD_SHA}" \
75
+ | grep -E '(^agent/|^agents/|^mcp-servers/|agent.*\.(py|ts)$)' || true)
76
+
77
+ if [[ -z "${AGENT_FILES}" ]]; then
78
+ echo "No agent-related files changed."
79
+ echo "has_agent_changes=false" >> "$GITHUB_OUTPUT"
80
+ exit 0
81
+ fi
82
+
83
+ echo "has_agent_changes=true" >> "$GITHUB_OUTPUT"
84
+ echo "${AGENT_FILES}" > /tmp/agent-changed-files.txt
85
+
86
+ FILE_COUNT=$(echo "${AGENT_FILES}" | wc -l | tr -d ' ')
87
+ echo "Found ${FILE_COUNT} changed agent files."
88
+ echo "file_count=${FILE_COUNT}" >> "$GITHUB_OUTPUT"
89
+
90
+ - name: Run agent in mock mode
91
+ id: agent-run
92
+ if: steps.detect-agent.outputs.has_agent_changes == 'true'
93
+ run: |
94
+ # Look for agent entry points and run them with --mock flag
95
+ # The --mock flag should make the agent execute its workflow
96
+ # without real side effects, using stubbed tool responses.
97
+
98
+ AGENT_ENTRY=""
99
+ for CANDIDATE in agent/main.py agents/main.py agent.py; do
100
+ if [[ -f "${CANDIDATE}" ]]; then
101
+ AGENT_ENTRY="${CANDIDATE}"
102
+ break
103
+ fi
104
+ done
105
+
106
+ if [[ -z "${AGENT_ENTRY}" ]]; then
107
+ echo "No agent entry point found. Skipping mock run."
108
+ echo "mock_ran=false" >> "$GITHUB_OUTPUT"
109
+ : > /tmp/agent-output.json # keep it empty: a "{}" placeholder passes the eval step's -s test and gets scored instead of the changed source file
110
+ exit 0
111
+ fi
112
+
113
+ echo "Running agent in mock mode: ${AGENT_ENTRY}"
114
+ echo "mock_ran=true" >> "$GITHUB_OUTPUT"
115
+
116
+ set +e
117
+ python "${AGENT_ENTRY}" --mock > /tmp/agent-output.json 2>/tmp/agent-stderr.log
118
+ EXIT_CODE=$?
119
+ set -e
120
+
121
+ if [[ "${EXIT_CODE}" -ne 0 ]]; then
122
+ echo "Agent mock run exited with code ${EXIT_CODE}"
123
+ echo "agent_exit_code=${EXIT_CODE}" >> "$GITHUB_OUTPUT"
124
+ # Still continue to evaluate whatever output was produced
125
+ else
126
+ echo "agent_exit_code=0" >> "$GITHUB_OUTPUT"
127
+ fi
128
+
129
+ - name: Evaluate agent output
130
+ id: eval
131
+ if: steps.detect-agent.outputs.has_agent_changes == 'true'
132
+ run: |
133
+ set -o pipefail
134
+ chmod +x .prism/eval-harness/run-eval.sh
135
+
136
+ RUBRIC=".prism/eval-harness/rubrics/agent-quality.json"
137
+
138
+ # If agent produced output, evaluate it
139
+ if [[ -s /tmp/agent-output.json ]]; then
140
+ EVAL_TARGET="/tmp/agent-output.json"
141
+ else
142
+ # Fall back to evaluating the agent source code itself
143
+ EVAL_TARGET=$(head -1 /tmp/agent-changed-files.txt)
144
+ fi
145
+
146
+ echo "Evaluating: ${EVAL_TARGET} against ${RUBRIC}"
147
+
148
+ set +e
149
+ OUTPUT=$(./.prism/eval-harness/run-eval.sh "${RUBRIC}" "${EVAL_TARGET}" 2>&1)
150
+ EXIT_CODE=$?
151
+ set -e
152
+
153
+ SCORE=$(echo "${OUTPUT}" | grep '^Score:' | awk '{print $2}' || echo "0")
154
+ RESULT=$(echo "${OUTPUT}" | grep '^Result:' | awk '{print $2}' || echo "UNKNOWN")
155
+
156
+ echo "score=${SCORE}" >> "$GITHUB_OUTPUT"
157
+ echo "result=${RESULT}" >> "$GITHUB_OUTPUT"
158
+
159
+ # Extract per-criteria scores if available
160
+ TASK_COMPLETION=$(echo "${OUTPUT}" | grep 'task_completion' | awk '{print $NF}' || echo "N/A")
161
+ TOOL_EFFICIENCY=$(echo "${OUTPUT}" | grep 'tool_usage_efficiency' | awk '{print $NF}' || echo "N/A")
162
+ GUARDRAIL_COMPLIANCE=$(echo "${OUTPUT}" | grep 'guardrail_compliance' | awk '{print $NF}' || echo "N/A")
163
+ REASONING_QUALITY=$(echo "${OUTPUT}" | grep 'reasoning_trace_quality' | awk '{print $NF}' || echo "N/A")
164
+ ERROR_RECOVERY=$(echo "${OUTPUT}" | grep 'error_recovery' | awk '{print $NF}' || echo "N/A")
165
+
166
+ echo "task_completion=${TASK_COMPLETION}" >> "$GITHUB_OUTPUT"
167
+ echo "tool_efficiency=${TOOL_EFFICIENCY}" >> "$GITHUB_OUTPUT"
168
+ echo "guardrail_compliance=${GUARDRAIL_COMPLIANCE}" >> "$GITHUB_OUTPUT"
169
+ echo "reasoning_quality=${REASONING_QUALITY}" >> "$GITHUB_OUTPUT"
170
+ echo "error_recovery=${ERROR_RECOVERY}" >> "$GITHUB_OUTPUT"
171
+
172
+ - name: Post PR comment
173
+ if: steps.detect-agent.outputs.has_agent_changes == 'true'
174
+ uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
175
+ with:
176
+ script: |
177
+ const score = '${{ steps.eval.outputs.score }}';
178
+ const result = '${{ steps.eval.outputs.result }}';
179
+ const mockRan = '${{ steps.agent-run.outputs.mock_ran }}';
180
+ const agentExit = '${{ steps.agent-run.outputs.agent_exit_code }}';
181
+ const fileCount = '${{ steps.detect-agent.outputs.file_count }}';
182
+ const emoji = result === 'PASS' ? ':white_check_mark:' : ':x:';
183
+
184
+ const taskCompletion = '${{ steps.eval.outputs.task_completion }}';
185
+ const toolEfficiency = '${{ steps.eval.outputs.tool_efficiency }}';
186
+ const guardrailCompliance = '${{ steps.eval.outputs.guardrail_compliance }}';
187
+ const reasoningQuality = '${{ steps.eval.outputs.reasoning_quality }}';
188
+ const errorRecovery = '${{ steps.eval.outputs.error_recovery }}';
189
+
190
+ const mockStatus = mockRan === 'true'
191
+ ? (agentExit === '0' ? ':white_check_mark: Mock run succeeded' : `:warning: Mock run exited with code ${agentExit}`)
192
+ : ':information_source: No agent entry point found — evaluated source code only';
193
+
194
+ const body = `## ${emoji} PRISM Agent Eval — ${result}
195
+
196
+ **Overall Score**: ${score} | **Agent Files Changed**: ${fileCount}
197
+
198
+ ${mockStatus}
199
+
200
+ ### Criteria Breakdown
201
+
202
+ | Criteria | Score | Weight |
203
+ |---|---|---|
204
+ | Task Completion | ${taskCompletion} | 0.30 |
205
+ | Tool Usage Efficiency | ${toolEfficiency} | 0.20 |
206
+ | Guardrail Compliance | ${guardrailCompliance} | 0.20 |
207
+ | Reasoning Trace Quality | ${reasoningQuality} | 0.15 |
208
+ | Error Recovery | ${errorRecovery} | 0.15 |
209
+
210
+ > Threshold: 0.82 | Rubric: agent-quality.json | Evaluated by Amazon Bedrock
211
+ > [PRISM D1 Velocity](${{ github.server_url }}/${{ github.repository }})`;
212
+
213
+ const { data: comments } = await github.rest.issues.listComments({
214
+ owner: context.repo.owner,
215
+ repo: context.repo.repo,
216
+ issue_number: context.issue.number,
217
+ });
218
+
219
+ const existingComment = comments.find(c => c.body.includes('PRISM Agent Eval'));
220
+
221
+ if (existingComment) {
222
+ await github.rest.issues.updateComment({
223
+ owner: context.repo.owner,
224
+ repo: context.repo.repo,
225
+ comment_id: existingComment.id,
226
+ body: body,
227
+ });
228
+ } else {
229
+ await github.rest.issues.createComment({
230
+ owner: context.repo.owner,
231
+ repo: context.repo.repo,
232
+ issue_number: context.issue.number,
233
+ body: body,
234
+ });
235
+ }
236
+
237
+ - name: Post skip comment
238
+ if: steps.detect-agent.outputs.has_agent_changes != 'true'
239
+ uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
240
+ with:
241
+ script: |
242
+ const { data: comments } = await github.rest.issues.listComments({
243
+ owner: context.repo.owner,
244
+ repo: context.repo.repo,
245
+ issue_number: context.issue.number,
246
+ });
247
+ const existing = comments.find(c => c.body.includes('PRISM Agent Eval'));
248
+ if (!existing) {
249
+ await github.rest.issues.createComment({
250
+ owner: context.repo.owner,
251
+ repo: context.repo.repo,
252
+ issue_number: context.issue.number,
253
+ body: ':white_check_mark: **PRISM Agent Eval** — No agent code changes detected. Evaluation skipped.',
254
+ });
255
+ }
256
+
257
+ - name: Emit prism.d1.agent.eval event
258
+ if: steps.detect-agent.outputs.has_agent_changes == 'true'
259
+ env:
260
+ SCORE: ${{ steps.eval.outputs.score }}
261
+ run: |
262
+ TIMESTAMP=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
263
+
264
+ EVAL_EVENT=$(jq -n \
265
+ --arg team_id "${{ steps.prism-config.outputs.team_id }}" \
266
+ --arg repo "${{ github.repository }}" \
267
+ --arg timestamp "${TIMESTAMP}" \
268
+ --argjson score "${SCORE:-0}" \
269
+ --arg result "${{ steps.eval.outputs.result }}" \
270
+ --argjson file_count "${{ steps.detect-agent.outputs.file_count }}" \
271
+ --arg pr_number "${{ github.event.pull_request.number }}" \
272
+ --arg mock_ran "${{ steps.agent-run.outputs.mock_ran }}" \
273
+ '{
274
+ "team_id": $team_id,
275
+ "repo": $repo,
276
+ "timestamp": $timestamp,
277
+ "prism_level": 3,
278
+ "metric": {
279
+ "name": "agent_eval_score",
280
+ "value": $score,
281
+ "unit": "score"
282
+ },
283
+ "ai_context": {
284
+ "tool": "bedrock-eval",
285
+ "model": "anthropic.claude-sonnet-4-20250514",
286
+ "origin": "ai-generated"
287
+ },
288
+ "ai_dora": {
289
+ "eval_gate_pass_rate": (if $result == "PASS" then 1.0 else 0.0 end)
290
+ },
291
+ "agent_eval": {
292
+ "file_count": $file_count,
293
+ "result": $result,
294
+ "mock_ran": ($mock_ran == "true"),
295
+ "pr_number": ($pr_number | tonumber),
296
+ "rubric": "agent-quality"
297
+ }
298
+ }')
299
+
300
+ aws events put-events \
301
+ --region us-west-2 \
302
+ --entries "[{
303
+ \"Source\": \"prism.d1.velocity\",
304
+ \"DetailType\": \"prism.d1.agent.eval\",
305
+ \"EventBusName\": \"prism-d1-metrics\",
306
+ \"Detail\": $(echo "${EVAL_EVENT}" | jq -c '.' | jq -Rs '.')
307
+ }]"
308
+
309
+ - name: Fail if eval gate failed
310
+ if: steps.eval.outputs.result == 'FAIL'
311
+ run: |
312
+ echo "Agent eval gate FAILED. Agent output scored below threshold."
313
+ exit 1