@prism-d1/cli 1.0.26 → 1.0.28
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/assets/eval-harness/README.md +114 -0
- package/dist/assets/eval-harness/eval-config.json +10 -0
- package/dist/assets/eval-harness/rubrics/agent-quality.json +79 -0
- package/dist/assets/eval-harness/rubrics/api-response-quality.json +45 -0
- package/dist/assets/eval-harness/rubrics/code-quality.json +98 -0
- package/dist/assets/eval-harness/rubrics/security-compliance.json +145 -0
- package/dist/assets/eval-harness/rubrics/spec-compliance.json +67 -0
- package/dist/assets/eval-harness/run-eval.sh +122 -0
- package/dist/assets/github-workflows/README.md +110 -0
- package/dist/assets/github-workflows/prism-agent-eval.yml +313 -0
- package/dist/assets/github-workflows/prism-ai-metrics.yml +261 -0
- package/dist/assets/github-workflows/prism-dora-weekly.yml +334 -0
- package/dist/assets/github-workflows/prism-eval-gate.yml +310 -0
- package/dist/assets/infra/bin/app.ts +56 -0
- package/dist/assets/infra/cdk.json +12 -0
- package/dist/assets/infra/lib/api-stack.ts +347 -0
- package/dist/assets/infra/lib/constructs/bedrock-guardrail-construct.ts +201 -0
- package/dist/assets/infra/lib/constructs/guardrail-enforcer-construct.ts +59 -0
- package/dist/assets/infra/lib/constructs/prism-vpc-construct.ts +75 -0
- package/dist/assets/infra/lib/constructs/security-agent-construct.ts +266 -0
- package/dist/assets/infra/lib/dashboard-stack.ts +1392 -0
- package/dist/assets/infra/lib/lambda/api-handler.ts +477 -0
- package/dist/assets/infra/lib/lambda/defect-correlator.ts +142 -0
- package/dist/assets/infra/lib/lambda/exfiltration-detector.ts +100 -0
- package/dist/assets/infra/lib/lambda/layers/guardrail-enforcer/nodejs/guardrail-enforcer.js +53 -0
- package/dist/assets/infra/lib/lambda/metrics-processor.ts +748 -0
- package/dist/assets/infra/lib/lambda/security-agent-processor.ts +231 -0
- package/dist/assets/infra/lib/lambda/security-remediation-tracker.ts +120 -0
- package/dist/assets/infra/lib/lambda/security-response-automator.ts +130 -0
- package/dist/assets/infra/lib/lambda/spec-to-code-calculator.ts +123 -0
- package/dist/assets/infra/lib/metrics-pipeline-stack.ts +701 -0
- package/dist/assets/infra/package.json +23 -0
- package/dist/assets/infra/tsconfig.json +24 -0
- package/dist/src/commands/bootstrapper/install-eval-harness.d.ts.map +1 -1
- package/dist/src/commands/bootstrapper/install-eval-harness.js +3 -4
- package/dist/src/commands/bootstrapper/install-eval-harness.js.map +1 -1
- package/dist/src/commands/bootstrapper/install-git-hooks.d.ts.map +1 -1
- package/dist/src/commands/bootstrapper/install-git-hooks.js +2 -5
- package/dist/src/commands/bootstrapper/install-git-hooks.js.map +1 -1
- package/dist/src/commands/securityagent/setup.d.ts.map +1 -1
- package/dist/src/commands/securityagent/setup.js +2 -3
- package/dist/src/commands/securityagent/setup.js.map +1 -1
- package/dist/src/commands/workshop/deploy-infra.d.ts.map +1 -1
- package/dist/src/commands/workshop/deploy-infra.js +2 -3
- package/dist/src/commands/workshop/deploy-infra.js.map +1 -1
- package/dist/src/commands/workshop/generate-demo-data.d.ts.map +1 -1
- package/dist/src/commands/workshop/generate-demo-data.js +3 -8
- package/dist/src/commands/workshop/generate-demo-data.js.map +1 -1
- package/dist/src/commands/workshop/perform-pen-test.d.ts.map +1 -1
- package/dist/src/commands/workshop/perform-pen-test.js +5 -14
- package/dist/src/commands/workshop/perform-pen-test.js.map +1 -1
- package/dist/src/utils/root.d.ts +6 -0
- package/dist/src/utils/root.d.ts.map +1 -1
- package/dist/src/utils/root.js +29 -0
- package/dist/src/utils/root.js.map +1 -1
- package/package.json +2 -2
|
@@ -0,0 +1,122 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
# run-eval.sh — Evaluate a single file against a rubric using Bedrock.
|
|
3
|
+
#
|
|
4
|
+
# Usage: ./run-eval.sh <rubric-file> <input-file> [--spec <spec-file>]
|
|
5
|
+
#
|
|
6
|
+
# Output (stdout, parsed by workflow):
|
|
7
|
+
# Score: <0-1>
|
|
8
|
+
# Result: PASS|FAIL|SKIP
# Hallucinations: <count>
|
|
9
|
+
#
|
|
10
|
+
# Exit: 0=pass (or skipped: every criterion requires a spec), 1=fail, 2=error
|
|
11
|
+
|
|
12
|
+
set -euo pipefail
|
|
13
|
+
|
|
14
|
+
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
|
|
15
|
+
CONFIG_FILE="${SCRIPT_DIR}/eval-config.json"
|
|
16
|
+
|
|
17
|
+
# --- Args ---
|
|
18
|
+
RUBRIC_FILE=""
INPUT_FILE=""
SPEC_FILE=""

#######################################
# Parse the command line into RUBRIC_FILE, INPUT_FILE and SPEC_FILE.
# Globals:   RUBRIC_FILE, INPUT_FILE, SPEC_FILE (written)
# Arguments: <rubric-file> <input-file> [--spec <spec-file>], in any order;
#            positionals beyond the first two are ignored.
# Returns:   exits 2 (error) when --spec is given without a value. Without
#            this check the bare "$2" trips `set -u` and the script exits 1,
#            which the header documents as an evaluation failure.
#######################################
parse_args() {
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --spec)
        if [[ $# -lt 2 ]]; then
          echo "Error: --spec requires a file argument" >&2
          exit 2
        fi
        SPEC_FILE="$2"
        shift 2
        ;;
      *)
        if [[ -z "${RUBRIC_FILE}" ]]; then
          RUBRIC_FILE="$1"
        elif [[ -z "${INPUT_FILE}" ]]; then
          INPUT_FILE="$1"
        fi
        shift
        ;;
    esac
  done
}

parse_args "$@"
|
|
30
|
+
|
|
31
|
+
[[ -f "${RUBRIC_FILE}" ]] || { echo "Error: rubric not found: ${RUBRIC_FILE}" >&2; exit 2; }
|
|
32
|
+
[[ -f "${INPUT_FILE}" ]] || { echo "Error: file not found: ${INPUT_FILE}" >&2; exit 2; }
|
|
33
|
+
[[ -f "${CONFIG_FILE}" ]] || { echo "Error: eval-config.json not found" >&2; exit 2; }
|
|
34
|
+
|
|
35
|
+
# --- Config ---
|
|
36
|
+
EVAL_MODEL=$(jq -r '.eval_model_id' "${CONFIG_FILE}")
|
|
37
|
+
PASS_THRESHOLD=$(jq -r '.pass_threshold' "${CONFIG_FILE}")
|
|
38
|
+
AWS_REGION=$(jq -r '.aws_region' "${CONFIG_FILE}")
|
|
39
|
+
|
|
40
|
+
RUBRIC_NAME=$(jq -r '.rubric_name' "${RUBRIC_FILE}")
|
|
41
|
+
CODE_CONTENT=$(cat "${INPUT_FILE}")
|
|
42
|
+
|
|
43
|
+
# --- Filter criteria: skip requires_spec criteria when no spec provided ---
|
|
44
|
+
CRITERIA_FILTER='if $spec == "" then .criteria | map(select(.requires_spec != true)) else .criteria end'
|
|
45
|
+
ACTIVE_CRITERIA=$(jq --arg spec "${SPEC_FILE}" "${CRITERIA_FILTER}" "${RUBRIC_FILE}")
|
|
46
|
+
ACTIVE_COUNT=$(echo "${ACTIVE_CRITERIA}" | jq 'length')
|
|
47
|
+
|
|
48
|
+
if [[ "${ACTIVE_COUNT}" -eq 0 ]]; then
|
|
49
|
+
echo "Skipped: ${RUBRIC_NAME} (all criteria require spec)"
|
|
50
|
+
echo "Score: 0"
|
|
51
|
+
echo "Result: SKIP"
|
|
52
|
+
echo "Hallucinations: 0"
|
|
53
|
+
exit 0
|
|
54
|
+
fi
|
|
55
|
+
|
|
56
|
+
# --- Build rubric text for prompt ---
|
|
57
|
+
RUBRIC_CRITERIA=$(echo "${ACTIVE_CRITERIA}" | jq -r '.[] | "- \(.name) [weight=\(.weight)]: \(.description)\n Scoring: \(.scoring | if type == "object" then to_entries | map("\(.key): \(.value)") | join("; ") else . end)"')
|
|
58
|
+
|
|
59
|
+
SPEC_SECTION="No spec provided. Evaluate based on code quality criteria only."
|
|
60
|
+
if [[ -n "${SPEC_FILE}" && -f "${SPEC_FILE}" ]]; then
|
|
61
|
+
SPEC_SECTION=$(cat "${SPEC_FILE}")
|
|
62
|
+
fi
|
|
63
|
+
|
|
64
|
+
# --- Prompt (ask for per-criterion scores, we calculate weighted average) ---
|
|
65
|
+
EVAL_PROMPT="You are a code quality evaluator. Evaluate the following code against the rubric criteria.
|
|
66
|
+
|
|
67
|
+
## Spec
|
|
68
|
+
${SPEC_SECTION}
|
|
69
|
+
|
|
70
|
+
## Code Under Evaluation
|
|
71
|
+
--- FILE: ${INPUT_FILE} ---
|
|
72
|
+
${CODE_CONTENT}
|
|
73
|
+
|
|
74
|
+
## Rubric Criteria
|
|
75
|
+
${RUBRIC_CRITERIA}
|
|
76
|
+
|
|
77
|
+
Respond in this exact JSON format (no other text):
|
|
78
|
+
{\"evaluations\": [{\"criterion\": \"<name>\", \"score\": <0.0-1.0>, \"rationale\": \"<brief>\"}]}"
|
|
79
|
+
|
|
80
|
+
# --- Call Bedrock ---
|
|
81
|
+
BODY_FILE=$(mktemp)
|
|
82
|
+
RESP_FILE=$(mktemp)
|
|
83
|
+
trap 'rm -f "${BODY_FILE}" "${RESP_FILE}"' EXIT
|
|
84
|
+
|
|
85
|
+
jq -n --arg p "${EVAL_PROMPT}" \
|
|
86
|
+
'{anthropic_version:"bedrock-2023-05-31",max_tokens:2000,messages:[{role:"user",content:$p}]}' > "${BODY_FILE}"
|
|
87
|
+
|
|
88
|
+
aws bedrock-runtime invoke-model \
|
|
89
|
+
--region "${AWS_REGION}" \
|
|
90
|
+
--model-id "${EVAL_MODEL}" \
|
|
91
|
+
--content-type "application/json" \
|
|
92
|
+
--accept "application/json" \
|
|
93
|
+
--body "fileb://${BODY_FILE}" \
|
|
94
|
+
"${RESP_FILE}" 2>/dev/null || { echo "Error: Bedrock invoke failed" >&2; exit 2; }
|
|
95
|
+
|
|
96
|
+
# --- Parse response ---
|
|
97
|
+
# The model is asked for bare JSON but may wrap it in prose or a Markdown fence.
# Two candidate spans are tried in order and the first one that jq accepts as an
# object with an "evaluations" key wins:
#   1. the line range from a line starting with "{" to a line starting with "}"
#   2. everything from the first "{" to the last "}" in the text
# Candidate 1 alone is not enough: for single-line JSON the sed range never
# closes, so any trailing fence/prose lines are included and jq rejects the span.
RAW_TEXT=$(jq -r '.content[0].text // empty' "${RESP_FILE}" 2>/dev/null) || RAW_TEXT=""

LINE_SPAN=$(sed -n '/^{/,/^}/p' <<<"${RAW_TEXT}")

LBRACE='{'
RBRACE='}'
BRACE_SPAN=""
if [[ "${RAW_TEXT}" == *"${LBRACE}"*"${RBRACE}"* ]]; then
  BRACE_SPAN="${RAW_TEXT#*"${LBRACE}"}"
  BRACE_SPAN="${LBRACE}${BRACE_SPAN%"${RBRACE}"*}${RBRACE}"
fi

EVAL_JSON=""
for CANDIDATE in "${LINE_SPAN}" "${BRACE_SPAN}"; do
  [[ -n "${CANDIDATE}" ]] || continue
  # jq exits non-zero on trailing garbage or when the select yields nothing (-e).
  if EVAL_JSON=$(jq -ce 'select(type == "object" and has("evaluations"))' <<<"${CANDIDATE}" 2>/dev/null); then
    break
  fi
  EVAL_JSON=""
done

[[ -n "${EVAL_JSON}" ]] || { echo "Error: could not parse model response" >&2; exit 2; }
|
|
99
|
+
|
|
100
|
+
# --- Calculate weighted score (client-side, don't trust LLM math) ---
|
|
101
|
+
OVERALL=$(echo "${EVAL_JSON}" | jq --argjson criteria "${ACTIVE_CRITERIA}" '
|
|
102
|
+
($criteria | map(.weight) | add) as $total_weight |
|
|
103
|
+
[.evaluations[] as $e |
|
|
104
|
+
($criteria[] | select(.name == $e.criterion)) as $c |
|
|
105
|
+
($e.score * $c.weight)
|
|
106
|
+
] | (add // 0) / $total_weight')
|
|
107
|
+
|
|
108
|
+
# --- Detect hallucinations ---
|
|
109
|
+
# Count evaluations whose rationale mentions a hallucination marker. A missing or
# non-string rationale is treated as empty text; jq's test() raises an error on
# null input, which would otherwise abort the script under `set -e`.
HALLUCINATIONS=$(echo "${EVAL_JSON}" | jq '[.evaluations[] | select((.rationale // "" | tostring) | test("hallucinated|does not exist|not found"; "i"))] | length')
|
|
110
|
+
|
|
111
|
+
# --- Output (workflow parses these lines) ---
|
|
112
|
+
echo "${EVAL_JSON}" | jq -r '.evaluations[] | "\(.criterion): \(.score) — \(.rationale)"'
|
|
113
|
+
echo ""
|
|
114
|
+
printf "Score: %.4f\n" "${OVERALL}"
|
|
115
|
+
echo "Result: $(echo "${OVERALL} >= ${PASS_THRESHOLD}" | bc -l | grep -q '^1' && echo "PASS" || echo "FAIL")"
|
|
116
|
+
echo "Hallucinations: ${HALLUCINATIONS}"
|
|
117
|
+
|
|
118
|
+
# --- Exit ---
|
|
119
|
+
if (( $(echo "${OVERALL} < ${PASS_THRESHOLD}" | bc -l) )); then
|
|
120
|
+
exit 1
|
|
121
|
+
fi
|
|
122
|
+
exit 0
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
# GitHub Actions Workflows
|
|
2
|
+
|
|
3
|
+
Reusable GitHub Actions workflows for PRISM D1 Velocity metric collection.
|
|
4
|
+
|
|
5
|
+
## Workflows
|
|
6
|
+
|
|
7
|
+
| Workflow | Trigger | Purpose |
|
|
8
|
+
|---|---|---|
|
|
9
|
+
| `prism-ai-metrics.yml` | PR merge to main | Calculates AI-to-merge ratio, token usage, lead time. Emits `prism.d1.pr` + `prism.d1.deploy` events |
|
|
10
|
+
| `prism-eval-gate.yml` | PR open/update | Evaluates AI-generated code per-file with auto-selected rubrics, waits for Security Agent, blocks merge on failure |
|
|
11
|
+
| `prism-agent-eval.yml` | PR modifying agent code | Runs agent in mock mode, evaluates output with agent-quality rubric |
|
|
12
|
+
| `prism-dora-weekly.yml` | Weekly (Monday 09:00 UTC) | Calculates DORA + AI-DORA metrics, emits to EventBridge + CloudWatch |
|
|
13
|
+
|
|
14
|
+
## Setup
|
|
15
|
+
|
|
16
|
+
### 1. Configure AWS OIDC
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
bash prism-cli bootstrapper setup-github-oidc
|
|
20
|
+
```
|
|
21
|
+
|
|
22
|
+
This interactively creates:
|
|
23
|
+
- OIDC identity provider for `token.actions.githubusercontent.com`
|
|
24
|
+
- IAM role `GitHubActions-<repo>` with trust policy scoped to your repo
|
|
25
|
+
- Inline policy with `events:PutEvents` and `bedrock:InvokeModel`
|
|
26
|
+
|
|
27
|
+
For the weekly workflow, manually add `cloudwatch:PutMetricData` to the role policy.
|
|
28
|
+
|
|
29
|
+
### 2. Set Repository Secret
|
|
30
|
+
|
|
31
|
+
In GitHub: Settings → Secrets and variables → Actions → New repository secret:
|
|
32
|
+
|
|
33
|
+
| Secret | Value |
|
|
34
|
+
|---|---|
|
|
35
|
+
| `PRISM_METRICS_ROLE_ARN` | ARN printed by `setup-github-oidc` |
|
|
36
|
+
|
|
37
|
+
### 3. Install Git Hooks + Config
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
bash prism-cli bootstrapper install-git-hooks
|
|
41
|
+
```
|
|
42
|
+
|
|
43
|
+
Creates `.prism/config.json` with your team ID (read by all workflows).
|
|
44
|
+
|
|
45
|
+
### 4. Install Eval Harness
|
|
46
|
+
|
|
47
|
+
```bash
|
|
48
|
+
# Workshop mode — bring your own rubric
|
|
49
|
+
bash prism-cli bootstrapper install-eval-harness
|
|
50
|
+
|
|
51
|
+
# Production mode — includes all 5 rubrics
|
|
52
|
+
bash prism-cli bootstrapper install-eval-harness --with-rubrics
|
|
53
|
+
```
|
|
54
|
+
|
|
55
|
+
This copies `.prism/eval-harness/`, `eval-config.json`, rubrics, and the `prism-eval-gate.yml` workflow.
|
|
56
|
+
|
|
57
|
+
### 5. Copy Remaining Workflows
|
|
58
|
+
|
|
59
|
+
```bash
|
|
60
|
+
mkdir -p .github/workflows
|
|
61
|
+
cp bootstrapper/github-workflows/prism-ai-metrics.yml .github/workflows/
|
|
62
|
+
cp bootstrapper/github-workflows/prism-dora-weekly.yml .github/workflows/
|
|
63
|
+
# Optional — only if you have agents with --mock support:
|
|
64
|
+
cp bootstrapper/github-workflows/prism-agent-eval.yml .github/workflows/
|
|
65
|
+
```
|
|
66
|
+
|
|
67
|
+
## IAM Permissions
|
|
68
|
+
|
|
69
|
+
The OIDC role needs:
|
|
70
|
+
|
|
71
|
+
| Permission | Used by |
|
|
72
|
+
|---|---|
|
|
73
|
+
| `events:PutEvents` | All workflows |
|
|
74
|
+
| `bedrock:InvokeModel` | eval-gate, agent-eval |
|
|
75
|
+
| `cloudwatch:PutMetricData` | dora-weekly |
|
|
76
|
+
|
|
77
|
+
## Customization
|
|
78
|
+
|
|
79
|
+
| Setting | How |
|
|
80
|
+
|---|---|
|
|
81
|
+
| Branch | Edit `branches` in each workflow |
|
|
82
|
+
| AWS region | Edit `aws-region` field + EventBridge commands |
|
|
83
|
+
| Eval threshold | Edit `.prism/eval-harness/eval-config.json` → `pass_threshold` |
|
|
84
|
+
| Eval model | Edit `.prism/eval-harness/eval-config.json` → `eval_model_id` |
|
|
85
|
+
| Weekly schedule | Edit cron in `prism-dora-weekly.yml` (default: `0 9 * * 1`) |
|
|
86
|
+
|
|
87
|
+
## Events Emitted
|
|
88
|
+
|
|
89
|
+
| Detail Type | Source Workflow | Destination |
|
|
90
|
+
|---|---|---|
|
|
91
|
+
| `prism.d1.pr` | ai-metrics | EventBridge |
|
|
92
|
+
| `prism.d1.deploy` | ai-metrics | EventBridge |
|
|
93
|
+
| `prism.d1.eval` | eval-gate | EventBridge |
|
|
94
|
+
| `prism.d1.agent.eval` | agent-eval | EventBridge |
|
|
95
|
+
| `prism.d1.assessment` | dora-weekly | EventBridge |
|
|
96
|
+
| `prism.d1.security.code_review` | eval-gate (Security Agent) | EventBridge |
|
|
97
|
+
| `AIAdoptionRate`, `SpecCoverage`, tool counts | dora-weekly | CloudWatch |
|
|
98
|
+
|
|
99
|
+
All EventBridge events use source `prism.d1.velocity` and bus `prism-d1-metrics`.
|
|
100
|
+
|
|
101
|
+
## Troubleshooting
|
|
102
|
+
|
|
103
|
+
| Issue | Solution |
|
|
104
|
+
|---|---|
|
|
105
|
+
| OIDC auth fails | Verify trust policy `sub` matches `repo:org/repo:*` |
|
|
106
|
+
| EventBridge put fails | Check `events:PutEvents` on bus ARN |
|
|
107
|
+
| Eval gate always skips | Ensure commits have `AI-Origin:` trailers (install git hooks) |
|
|
108
|
+
| Weekly not running | Workflow must exist on default branch; test with `workflow_dispatch` |
|
|
109
|
+
| Agent eval skips | No `agent/main.py` found — add `--mock` support to your agent |
|
|
110
|
+
| Security Agent timeout | Agent takes 2+ min to start; workflow waits up to 12 min total |
|
|
@@ -0,0 +1,313 @@
|
|
|
1
|
+
# prism-agent-eval.yml -- Evaluate agent outputs on PRs that modify agent code.
|
|
2
|
+
#
|
|
3
|
+
# Copy this file to .github/workflows/prism-agent-eval.yml in your repo.
|
|
4
|
+
#
|
|
5
|
+
# Required setup:
|
|
6
|
+
# - OIDC provider configured for GitHub Actions in your AWS account
|
|
7
|
+
# - IAM role with bedrock:InvokeModel and events:PutEvents permissions
|
|
8
|
+
# - Repository secret PRISM_METRICS_ROLE_ARN set to the OIDC role ARN
|
|
9
|
+
# - .prism/config.json committed to the repo (created by: bash prism-cli bootstrapper install-git-hooks)
|
|
10
|
+
# - .prism/eval-harness/ installed (run: bash prism-cli bootstrapper install-eval-harness)
|
|
11
|
+
# - (Optional) Agent entry point supports --mock flag for dry-run execution
|
|
12
|
+
|
|
13
|
+
name: PRISM Agent Eval
|
|
14
|
+
|
|
15
|
+
on:
|
|
16
|
+
pull_request:
|
|
17
|
+
types: [opened, synchronize, reopened]
|
|
18
|
+
branches: [main, master]
|
|
19
|
+
paths:
|
|
20
|
+
- 'agent/**'
|
|
21
|
+
- 'agents/**'
|
|
22
|
+
- 'mcp-servers/**'
|
|
23
|
+
- '**/agent*.py'
|
|
24
|
+
- '**/agent*.ts'
|
|
25
|
+
|
|
26
|
+
permissions:
|
|
27
|
+
id-token: write
|
|
28
|
+
contents: read
|
|
29
|
+
pull-requests: write
|
|
30
|
+
|
|
31
|
+
jobs:
|
|
32
|
+
agent-eval:
|
|
33
|
+
runs-on: ubuntu-latest
|
|
34
|
+
steps:
|
|
35
|
+
- name: Checkout
|
|
36
|
+
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
|
37
|
+
with:
|
|
38
|
+
fetch-depth: 0
|
|
39
|
+
|
|
40
|
+
- name: Configure AWS credentials (OIDC)
|
|
41
|
+
uses: aws-actions/configure-aws-credentials@b47578312673ae6fa5b5096b330d9fbac3d116df # v4.2.1
|
|
42
|
+
with:
|
|
43
|
+
role-to-assume: ${{ secrets.PRISM_METRICS_ROLE_ARN }}
|
|
44
|
+
aws-region: us-west-2
|
|
45
|
+
|
|
46
|
+
- name: Read team ID from .prism/config.json
|
|
47
|
+
id: prism-config
|
|
48
|
+
run: |
|
|
49
|
+
if [[ -f .prism/config.json ]]; then
|
|
50
|
+
TEAM_ID=$(jq -r '.team_id' .prism/config.json)
|
|
51
|
+
else
|
|
52
|
+
# Point at the same installer command that this workflow's header names as the creator of .prism/config.json.
echo "::error::.prism/config.json not found. Run 'bash prism-cli bootstrapper install-git-hooks' first."
|
|
53
|
+
exit 1
|
|
54
|
+
fi
|
|
55
|
+
echo "team_id=${TEAM_ID}" >> "$GITHUB_OUTPUT"
|
|
56
|
+
|
|
57
|
+
- name: Set up Python
|
|
58
|
+
uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5.6.0
|
|
59
|
+
with:
|
|
60
|
+
python-version: '3.12'
|
|
61
|
+
|
|
62
|
+
- name: Install dependencies
|
|
63
|
+
run: |
|
|
64
|
+
sudo apt-get update -qq && sudo apt-get install -y -qq jq bc
|
|
65
|
+
pip install --quiet strands-agents strands-agents-tools requests
|
|
66
|
+
|
|
67
|
+
- name: Identify agent changes
|
|
68
|
+
id: detect-agent
|
|
69
|
+
run: |
|
|
70
|
+
BASE_SHA="${{ github.event.pull_request.base.sha }}"
|
|
71
|
+
HEAD_SHA="${{ github.sha }}"
|
|
72
|
+
|
|
73
|
+
# Find changed files in agent-related paths
|
|
74
|
+
AGENT_FILES=$(git diff --name-only "${BASE_SHA}..${HEAD_SHA}" \
|
|
75
|
+
| grep -E '(^agent/|^agents/|^mcp-servers/|agent.*\.(py|ts)$)' || true)
|
|
76
|
+
|
|
77
|
+
if [[ -z "${AGENT_FILES}" ]]; then
|
|
78
|
+
echo "No agent-related files changed."
|
|
79
|
+
echo "has_agent_changes=false" >> "$GITHUB_OUTPUT"
|
|
80
|
+
exit 0
|
|
81
|
+
fi
|
|
82
|
+
|
|
83
|
+
echo "has_agent_changes=true" >> "$GITHUB_OUTPUT"
|
|
84
|
+
echo "${AGENT_FILES}" > /tmp/agent-changed-files.txt
|
|
85
|
+
|
|
86
|
+
FILE_COUNT=$(echo "${AGENT_FILES}" | wc -l | tr -d ' ')
|
|
87
|
+
echo "Found ${FILE_COUNT} changed agent files."
|
|
88
|
+
echo "file_count=${FILE_COUNT}" >> "$GITHUB_OUTPUT"
|
|
89
|
+
|
|
90
|
+
- name: Run agent in mock mode
|
|
91
|
+
id: agent-run
|
|
92
|
+
if: steps.detect-agent.outputs.has_agent_changes == 'true'
|
|
93
|
+
run: |
|
|
94
|
+
# Look for agent entry points and run them with --mock flag
|
|
95
|
+
# The --mock flag should make the agent execute its workflow
|
|
96
|
+
# without real side effects, using stubbed tool responses.
|
|
97
|
+
|
|
98
|
+
AGENT_ENTRY=""
|
|
99
|
+
for CANDIDATE in agent/main.py agents/main.py agent.py; do
|
|
100
|
+
if [[ -f "${CANDIDATE}" ]]; then
|
|
101
|
+
AGENT_ENTRY="${CANDIDATE}"
|
|
102
|
+
break
|
|
103
|
+
fi
|
|
104
|
+
done
|
|
105
|
+
|
|
106
|
+
if [[ -z "${AGENT_ENTRY}" ]]; then
|
|
107
|
+
echo "No agent entry point found. Skipping mock run."
|
|
108
|
+
echo "mock_ran=false" >> "$GITHUB_OUTPUT"
|
|
109
|
+
# Leave the output file empty: the eval step picks it whenever it is non-empty (-s),
# so writing "{}" here would get "{}" evaluated instead of the changed agent source.
: > /tmp/agent-output.json
|
|
110
|
+
exit 0
|
|
111
|
+
fi
|
|
112
|
+
|
|
113
|
+
echo "Running agent in mock mode: ${AGENT_ENTRY}"
|
|
114
|
+
echo "mock_ran=true" >> "$GITHUB_OUTPUT"
|
|
115
|
+
|
|
116
|
+
set +e
|
|
117
|
+
python "${AGENT_ENTRY}" --mock > /tmp/agent-output.json 2>/tmp/agent-stderr.log
|
|
118
|
+
EXIT_CODE=$?
|
|
119
|
+
set -e
|
|
120
|
+
|
|
121
|
+
if [[ "${EXIT_CODE}" -ne 0 ]]; then
|
|
122
|
+
echo "Agent mock run exited with code ${EXIT_CODE}"
|
|
123
|
+
echo "agent_exit_code=${EXIT_CODE}" >> "$GITHUB_OUTPUT"
|
|
124
|
+
# Still continue to evaluate whatever output was produced
|
|
125
|
+
else
|
|
126
|
+
echo "agent_exit_code=0" >> "$GITHUB_OUTPUT"
|
|
127
|
+
fi
|
|
128
|
+
|
|
129
|
+
- name: Evaluate agent output
|
|
130
|
+
id: eval
|
|
131
|
+
if: steps.detect-agent.outputs.has_agent_changes == 'true'
|
|
132
|
+
run: |
|
|
133
|
+
set -o pipefail
|
|
134
|
+
chmod +x .prism/eval-harness/run-eval.sh
|
|
135
|
+
|
|
136
|
+
RUBRIC=".prism/eval-harness/rubrics/agent-quality.json"
|
|
137
|
+
|
|
138
|
+
# If agent produced output, evaluate it
|
|
139
|
+
if [[ -s /tmp/agent-output.json ]]; then
|
|
140
|
+
EVAL_TARGET="/tmp/agent-output.json"
|
|
141
|
+
else
|
|
142
|
+
# Fall back to evaluating the agent source code itself
|
|
143
|
+
EVAL_TARGET=$(head -1 /tmp/agent-changed-files.txt)
|
|
144
|
+
fi
|
|
145
|
+
|
|
146
|
+
echo "Evaluating: ${EVAL_TARGET} against ${RUBRIC}"
|
|
147
|
+
|
|
148
|
+
set +e
|
|
149
|
+
OUTPUT=$(./.prism/eval-harness/run-eval.sh "${RUBRIC}" "${EVAL_TARGET}" 2>&1)
|
|
150
|
+
EXIT_CODE=$?
|
|
151
|
+
set -e
|
|
152
|
+
|
|
153
|
+
SCORE=$(echo "${OUTPUT}" | grep '^Score:' | awk '{print $2}' || echo "0")
|
|
154
|
+
RESULT=$(echo "${OUTPUT}" | grep '^Result:' | awk '{print $2}' || echo "UNKNOWN")
|
|
155
|
+
|
|
156
|
+
echo "score=${SCORE}" >> "$GITHUB_OUTPUT"
|
|
157
|
+
echo "result=${RESULT}" >> "$GITHUB_OUTPUT"
|
|
158
|
+
|
|
159
|
+
# Extract per-criterion scores if available.
# run-eval.sh prints one line per criterion as "<criterion>: <score> — <rationale>",
# so the score is the second whitespace-separated field; the last field is only the
# final word of the rationale. Only the first line whose first field is exactly
# "<criterion>:" is used, and anything that is not a plain number becomes "N/A" so
# model-written free text never ends up in the step outputs.
# Arguments: $1 - criterion name. Reads OUTPUT. Prints the score or "N/A".
criterion_score() {
  awk -v name="$1" '
    $1 == name ":" {
      if ($2 ~ /^[0-9]+(\.[0-9]+)?$/) score = $2
      exit
    }
    END { print (score == "" ? "N/A" : score) }
  ' <<<"${OUTPUT}"
}

TASK_COMPLETION=$(criterion_score 'task_completion')
TOOL_EFFICIENCY=$(criterion_score 'tool_usage_efficiency')
GUARDRAIL_COMPLIANCE=$(criterion_score 'guardrail_compliance')
REASONING_QUALITY=$(criterion_score 'reasoning_trace_quality')
ERROR_RECOVERY=$(criterion_score 'error_recovery')
|
|
165
|
+
|
|
166
|
+
echo "task_completion=${TASK_COMPLETION}" >> "$GITHUB_OUTPUT"
|
|
167
|
+
echo "tool_efficiency=${TOOL_EFFICIENCY}" >> "$GITHUB_OUTPUT"
|
|
168
|
+
echo "guardrail_compliance=${GUARDRAIL_COMPLIANCE}" >> "$GITHUB_OUTPUT"
|
|
169
|
+
echo "reasoning_quality=${REASONING_QUALITY}" >> "$GITHUB_OUTPUT"
|
|
170
|
+
echo "error_recovery=${ERROR_RECOVERY}" >> "$GITHUB_OUTPUT"
|
|
171
|
+
|
|
172
|
+
- name: Post PR comment
|
|
173
|
+
if: steps.detect-agent.outputs.has_agent_changes == 'true'
|
|
174
|
+
uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
|
|
175
|
+
with:
|
|
176
|
+
script: |
|
|
177
|
+
const score = '${{ steps.eval.outputs.score }}';
|
|
178
|
+
const result = '${{ steps.eval.outputs.result }}';
|
|
179
|
+
const mockRan = '${{ steps.agent-run.outputs.mock_ran }}';
|
|
180
|
+
const agentExit = '${{ steps.agent-run.outputs.agent_exit_code }}';
|
|
181
|
+
const fileCount = '${{ steps.detect-agent.outputs.file_count }}';
|
|
182
|
+
const emoji = result === 'PASS' ? ':white_check_mark:' : ':x:';
|
|
183
|
+
|
|
184
|
+
const taskCompletion = '${{ steps.eval.outputs.task_completion }}';
|
|
185
|
+
const toolEfficiency = '${{ steps.eval.outputs.tool_efficiency }}';
|
|
186
|
+
const guardrailCompliance = '${{ steps.eval.outputs.guardrail_compliance }}';
|
|
187
|
+
const reasoningQuality = '${{ steps.eval.outputs.reasoning_quality }}';
|
|
188
|
+
const errorRecovery = '${{ steps.eval.outputs.error_recovery }}';
|
|
189
|
+
|
|
190
|
+
const mockStatus = mockRan === 'true'
|
|
191
|
+
? (agentExit === '0' ? ':white_check_mark: Mock run succeeded' : `:warning: Mock run exited with code ${agentExit}`)
|
|
192
|
+
: ':information_source: No agent entry point found — evaluated source code only';
|
|
193
|
+
|
|
194
|
+
const body = `## ${emoji} PRISM Agent Eval — ${result}
|
|
195
|
+
|
|
196
|
+
**Overall Score**: ${score} | **Agent Files Changed**: ${fileCount}
|
|
197
|
+
|
|
198
|
+
${mockStatus}
|
|
199
|
+
|
|
200
|
+
### Criteria Breakdown
|
|
201
|
+
|
|
202
|
+
| Criteria | Score | Weight |
|
|
203
|
+
|---|---|---|
|
|
204
|
+
| Task Completion | ${taskCompletion} | 0.30 |
|
|
205
|
+
| Tool Usage Efficiency | ${toolEfficiency} | 0.20 |
|
|
206
|
+
| Guardrail Compliance | ${guardrailCompliance} | 0.20 |
|
|
207
|
+
| Reasoning Trace Quality | ${reasoningQuality} | 0.15 |
|
|
208
|
+
| Error Recovery | ${errorRecovery} | 0.15 |
|
|
209
|
+
|
|
210
|
+
> Threshold: 0.82 | Rubric: agent-quality.json | Evaluated by Amazon Bedrock
|
|
211
|
+
> [PRISM D1 Velocity](${{ github.server_url }}/${{ github.repository }})`;
|
|
212
|
+
|
|
213
|
+
const { data: comments } = await github.rest.issues.listComments({
|
|
214
|
+
owner: context.repo.owner,
|
|
215
|
+
repo: context.repo.repo,
|
|
216
|
+
issue_number: context.issue.number,
|
|
217
|
+
});
|
|
218
|
+
|
|
219
|
+
const existingComment = comments.find(c => c.body.includes('PRISM Agent Eval'));
|
|
220
|
+
|
|
221
|
+
if (existingComment) {
|
|
222
|
+
await github.rest.issues.updateComment({
|
|
223
|
+
owner: context.repo.owner,
|
|
224
|
+
repo: context.repo.repo,
|
|
225
|
+
comment_id: existingComment.id,
|
|
226
|
+
body: body,
|
|
227
|
+
});
|
|
228
|
+
} else {
|
|
229
|
+
await github.rest.issues.createComment({
|
|
230
|
+
owner: context.repo.owner,
|
|
231
|
+
repo: context.repo.repo,
|
|
232
|
+
issue_number: context.issue.number,
|
|
233
|
+
body: body,
|
|
234
|
+
});
|
|
235
|
+
}
|
|
236
|
+
|
|
237
|
+
- name: Post skip comment
|
|
238
|
+
if: steps.detect-agent.outputs.has_agent_changes != 'true'
|
|
239
|
+
uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1
|
|
240
|
+
with:
|
|
241
|
+
script: |
|
|
242
|
+
const { data: comments } = await github.rest.issues.listComments({
|
|
243
|
+
owner: context.repo.owner,
|
|
244
|
+
repo: context.repo.repo,
|
|
245
|
+
issue_number: context.issue.number,
|
|
246
|
+
});
|
|
247
|
+
const existing = comments.find(c => c.body.includes('PRISM Agent Eval'));
|
|
248
|
+
if (!existing) {
|
|
249
|
+
await github.rest.issues.createComment({
|
|
250
|
+
owner: context.repo.owner,
|
|
251
|
+
repo: context.repo.repo,
|
|
252
|
+
issue_number: context.issue.number,
|
|
253
|
+
body: ':white_check_mark: **PRISM Agent Eval** — No agent code changes detected. Evaluation skipped.',
|
|
254
|
+
});
|
|
255
|
+
}
|
|
256
|
+
|
|
257
|
+
- name: Emit prism.d1.agent.eval event
|
|
258
|
+
if: steps.detect-agent.outputs.has_agent_changes == 'true'
|
|
259
|
+
env:
|
|
260
|
+
SCORE: ${{ steps.eval.outputs.score }}
|
|
261
|
+
run: |
|
|
262
|
+
TIMESTAMP=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
|
|
263
|
+
|
|
264
|
+
EVAL_EVENT=$(jq -n \
|
|
265
|
+
--arg team_id "${{ steps.prism-config.outputs.team_id }}" \
|
|
266
|
+
--arg repo "${{ github.repository }}" \
|
|
267
|
+
--arg timestamp "${TIMESTAMP}" \
|
|
268
|
+
--argjson score "${SCORE:-0}" \
|
|
269
|
+
--arg result "${{ steps.eval.outputs.result }}" \
|
|
270
|
+
--argjson file_count "${{ steps.detect-agent.outputs.file_count }}" \
|
|
271
|
+
--arg pr_number "${{ github.event.pull_request.number }}" \
|
|
272
|
+
--arg mock_ran "${{ steps.agent-run.outputs.mock_ran }}" \
|
|
273
|
+
'{
|
|
274
|
+
"team_id": $team_id,
|
|
275
|
+
"repo": $repo,
|
|
276
|
+
"timestamp": $timestamp,
|
|
277
|
+
"prism_level": 3,
|
|
278
|
+
"metric": {
|
|
279
|
+
"name": "agent_eval_score",
|
|
280
|
+
"value": $score,
|
|
281
|
+
"unit": "score"
|
|
282
|
+
},
|
|
283
|
+
"ai_context": {
|
|
284
|
+
"tool": "bedrock-eval",
|
|
285
|
+
"model": "anthropic.claude-sonnet-4-20250514",
|
|
286
|
+
"origin": "ai-generated"
|
|
287
|
+
},
|
|
288
|
+
"ai_dora": {
|
|
289
|
+
"eval_gate_pass_rate": (if $result == "PASS" then 1.0 else 0.0 end)
|
|
290
|
+
},
|
|
291
|
+
"agent_eval": {
|
|
292
|
+
"file_count": $file_count,
|
|
293
|
+
"result": $result,
|
|
294
|
+
"mock_ran": ($mock_ran == "true"),
|
|
295
|
+
"pr_number": ($pr_number | tonumber),
|
|
296
|
+
"rubric": "agent-quality"
|
|
297
|
+
}
|
|
298
|
+
}')
|
|
299
|
+
|
|
300
|
+
aws events put-events \
|
|
301
|
+
--region us-west-2 \
|
|
302
|
+
--entries "[{
|
|
303
|
+
\"Source\": \"prism.d1.velocity\",
|
|
304
|
+
\"DetailType\": \"prism.d1.agent.eval\",
|
|
305
|
+
\"EventBusName\": \"prism-d1-metrics\",
|
|
306
|
+
\"Detail\": $(echo "${EVAL_EVENT}" | jq -c '.' | jq -Rs '.')
|
|
307
|
+
}]"
|
|
308
|
+
|
|
309
|
+
- name: Fail if eval gate failed
|
|
310
|
+
if: steps.eval.outputs.result == 'FAIL'
|
|
311
|
+
run: |
|
|
312
|
+
echo "Agent eval gate FAILED. Agent output scored below threshold."
|
|
313
|
+
exit 1
|