@guilz-dev/sdlc-gh 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.github/CODEOWNERS +5 -0
- package/.github/ISSUE_TEMPLATE/bug_report.yml +68 -0
- package/.github/ISSUE_TEMPLATE/config.yml +1 -0
- package/.github/ISSUE_TEMPLATE/feature_request.yml +39 -0
- package/.github/ISSUE_TEMPLATE/support.yml +56 -0
- package/.github/ISSUE_TEMPLATE/task.yml +89 -0
- package/.github/agents/implementer.agent.md +17 -0
- package/.github/agents/reviewer.agent.md +18 -0
- package/.github/agents/triager.agent.md +13 -0
- package/.github/aw/actions-lock.json +9 -0
- package/.github/copilot-instructions.md +35 -0
- package/.github/hooks/hooks.json +12 -0
- package/.github/instructions/core.instructions.md +11 -0
- package/.github/instructions/profiles/go.instructions.md +10 -0
- package/.github/instructions/profiles/php.instructions.md +11 -0
- package/.github/instructions/profiles/python.instructions.md +11 -0
- package/.github/instructions/profiles/ruby.instructions.md +11 -0
- package/.github/instructions/profiles/typescript.instructions.md +11 -0
- package/.github/labels.yml +55 -0
- package/.github/pull_request_template.md +33 -0
- package/.github/ruleset.example.json +33 -0
- package/.github/ruleset.harness-eval.example.json +29 -0
- package/.github/skills/quality-loop/SKILL.md +23 -0
- package/.github/workflows/agent-retry-orchestrator.yml +161 -0
- package/.github/workflows/copilot-setup-steps.yml +64 -0
- package/.github/workflows/eval-ci.yml +169 -0
- package/.github/workflows/eval-drift.yml +75 -0
- package/.github/workflows/gh-aw-dogfood-ci.yml +73 -0
- package/.github/workflows/harness-ci.yml +244 -0
- package/.github/workflows/harness-sync.yml +28 -0
- package/.github/workflows/l1-readiness-check.yml +45 -0
- package/.github/workflows/labels-sync.yml +24 -0
- package/.github/workflows/nightly-harness-review.lock.yml +1643 -0
- package/.github/workflows/nightly-harness-review.md +87 -0
- package/.github/workflows/nightly-harness-review.yml +63 -0
- package/.github/workflows/npm-publish.yml +49 -0
- package/.github/workflows/pr-context-comment.yml +138 -0
- package/.github/workflows/product-ci-go.yml +33 -0
- package/.github/workflows/product-ci-php.yml +39 -0
- package/.github/workflows/product-ci-python.yml +34 -0
- package/.github/workflows/product-ci-ruby.yml +35 -0
- package/.github/workflows/product-ci-ts.yml +37 -0
- package/.github/workflows/task-issue-label-sync.yml +50 -0
- package/.github/workflows/weekly-redteam.lock.yml +1571 -0
- package/.github/workflows/weekly-redteam.md +76 -0
- package/.github/zizmor.yml +11 -0
- package/AGENTS.md +54 -0
- package/LICENSE +21 -0
- package/README.md +366 -0
- package/config/stacks.json +55 -0
- package/docs/adoption.md +126 -0
- package/docs/arch.md +535 -0
- package/docs/auth-boundaries.md +16 -0
- package/docs/coding-agent-l1.md +152 -0
- package/docs/exceptions/README.md +25 -0
- package/docs/exceptions/TEMPLATE.md +8 -0
- package/docs/failure-taxonomy.md +23 -0
- package/docs/gh-aw-dogfood.md +109 -0
- package/docs/kpi-baseline.md +9 -0
- package/docs/nightly-harness-review.md +94 -0
- package/docs/operations.md +108 -0
- package/docs/publishing.md +79 -0
- package/docs/revert-playbook.md +44 -0
- package/docs/shared-config.md +30 -0
- package/docs/telemetry-artifacts.md +78 -0
- package/docs/telemetry-schema.md +60 -0
- package/evals/.score-baseline.json +6 -0
- package/evals/e2e-bench/README.md +28 -0
- package/evals/e2e-bench/manifest.json +16 -0
- package/evals/e2e-bench/tasks/e2e-001.yml +10 -0
- package/evals/e2e-bench/tasks/e2e-002.yml +11 -0
- package/evals/e2e-bench/tasks/e2e-003.yml +10 -0
- package/evals/e2e-bench/tasks/e2e-004.yml +14 -0
- package/evals/e2e-bench/tasks/e2e-005.yml +11 -0
- package/evals/e2e-bench/tasks/e2e-006.yml +10 -0
- package/evals/e2e-bench/tasks/e2e-007.yml +10 -0
- package/evals/e2e-bench/tasks/e2e-008.yml +10 -0
- package/evals/e2e-bench/tasks/e2e-009.yml +10 -0
- package/evals/trajectories/rubric.md +12 -0
- package/evals/trajectories/test_harness_conventions.py +271 -0
- package/infra/README.md +49 -0
- package/infra/langfuse/docker-compose.yml +25 -0
- package/infra/otel/collector-config.yml +24 -0
- package/infra/samples/gh-aw-dogfood-report.json +44 -0
- package/infra/samples/harness-review-routing-plan.json +19 -0
- package/infra/samples/harness-review-summary.json +61 -0
- package/infra/samples/telemetry-artifact.json +29 -0
- package/infra/samples/telemetry-payload.json +19 -0
- package/package.json +85 -0
- package/prompts/triager-classify.prompt.yml +10 -0
- package/sample/go/add.go +5 -0
- package/sample/go/add_test.go +9 -0
- package/sample/go/go.mod +3 -0
- package/sample/php/composer.json +26 -0
- package/sample/php/composer.lock +1881 -0
- package/sample/php/phpunit.xml +8 -0
- package/sample/php/src/Add.php +13 -0
- package/sample/php/tests/AddTest.php +16 -0
- package/sample/python/requirements-dev.txt +2 -0
- package/sample/python/src/__init__.py +0 -0
- package/sample/python/src/greet.py +3 -0
- package/sample/python/tests/conftest.py +4 -0
- package/sample/python/tests/test_greet.py +5 -0
- package/sample/ruby/.rubocop.yml +10 -0
- package/sample/ruby/Gemfile +6 -0
- package/sample/ruby/Gemfile.lock +58 -0
- package/sample/ruby/lib/add.rb +9 -0
- package/sample/ruby/spec/add_spec.rb +11 -0
- package/sample/ts/biome.json +6 -0
- package/sample/ts/package-lock.json +1763 -0
- package/sample/ts/package.json +15 -0
- package/sample/ts/src/add.ts +3 -0
- package/sample/ts/tests/add.test.ts +8 -0
- package/sample/ts/tsconfig.json +12 -0
- package/scripts/aggregate-harness-review.mjs +48 -0
- package/scripts/bootstrap-harness.sh +411 -0
- package/scripts/check-diff-size.mjs +46 -0
- package/scripts/check-e2e-manifest.mjs +35 -0
- package/scripts/check-eval-score-drift.mjs +31 -0
- package/scripts/check-gh-aw-dogfood-scope.mjs +51 -0
- package/scripts/check-issue-spec.mjs +215 -0
- package/scripts/check-l1-readiness.mjs +82 -0
- package/scripts/check-open-pr-limit.mjs +34 -0
- package/scripts/doctor.mjs +177 -0
- package/scripts/emit-gh-aw-dogfood-report.mjs +112 -0
- package/scripts/emit-telemetry-artifact.mjs +99 -0
- package/scripts/fetch-telemetry-artifacts.mjs +176 -0
- package/scripts/harness-drift-report.mjs +99 -0
- package/scripts/lib/bootstrap-copy.mjs +123 -0
- package/scripts/lib/ccsd-contract.mjs +212 -0
- package/scripts/lib/diff-size.mjs +103 -0
- package/scripts/lib/doctor-local.mjs +179 -0
- package/scripts/lib/e2e-manifest.mjs +76 -0
- package/scripts/lib/gh-aw-dogfood.mjs +293 -0
- package/scripts/lib/github-config.mjs +94 -0
- package/scripts/lib/harness-ci-fragments.mjs +98 -0
- package/scripts/lib/harness-review-routing.mjs +244 -0
- package/scripts/lib/harness-review.mjs +388 -0
- package/scripts/lib/issue-form-label-sync.mjs +56 -0
- package/scripts/lib/l1-readiness.mjs +258 -0
- package/scripts/lib/merge-harness-package.mjs +36 -0
- package/scripts/lib/npm-package.mjs +129 -0
- package/scripts/lib/setup-wizard.mjs +224 -0
- package/scripts/lib/stacks.mjs +138 -0
- package/scripts/lib/telemetry-artifact.mjs +253 -0
- package/scripts/lib/template-root.mjs +39 -0
- package/scripts/merge-harness-package.mjs +14 -0
- package/scripts/route-harness-review.mjs +168 -0
- package/scripts/run-e2e-bench.mjs +216 -0
- package/scripts/sdlc-gh-cli.mjs +91 -0
- package/scripts/select-eval-jobs.mjs +41 -0
- package/scripts/setup-github.mjs +242 -0
- package/scripts/setup-github.sh +4 -0
- package/scripts/setup-wizard.mjs +426 -0
- package/scripts/test-bootstrap-guidance-scenarios.mjs +94 -0
- package/scripts/test-diff-size-scenarios.mjs +88 -0
- package/scripts/test-doctor-scenarios.mjs +70 -0
- package/scripts/test-e2e-manifest-scenarios.mjs +65 -0
- package/scripts/test-gh-aw-dogfood-scenarios.mjs +74 -0
- package/scripts/test-harness-review-routing-scenarios.mjs +130 -0
- package/scripts/test-harness-review-scenarios.mjs +92 -0
- package/scripts/test-hooks-scenarios.mjs +44 -0
- package/scripts/test-issue-form-label-sync-scenarios.mjs +48 -0
- package/scripts/test-issue-spec-scenarios.mjs +258 -0
- package/scripts/test-l1-readiness-scenarios.mjs +204 -0
- package/scripts/test-merge-harness-package-scenarios.mjs +53 -0
- package/scripts/test-npm-package-scenarios.mjs +31 -0
- package/scripts/test-sdlc-gh-cli-scenarios.mjs +54 -0
- package/scripts/test-setup-github-scenarios.mjs +103 -0
- package/scripts/test-setup-wizard-scenarios.mjs +114 -0
- package/scripts/test-telemetry-artifact-scenarios.mjs +69 -0
- package/scripts/trim-harness-ci.mjs +18 -0
- package/scripts/validate-gh-aw-compile.mjs +64 -0
- package/scripts/validate-harness.mjs +199 -0
- package/scripts/validate-telemetry.mjs +21 -0
- package/scripts/verify-bootstrap-stacks.sh +192 -0
|
@@ -0,0 +1,161 @@
|
|
|
1
|
+
# Agent retry orchestrator.
#
# Runs when a check suite completes; the single job is skipped unless the
# suite's conclusion is `failure`. It resolves the first pull request linked to
# the suite, decides whether another automated retry is allowed, records that
# decision on the PR (a `retry:<n>` label plus a comment carrying a
# `failure_sig`), and finally emits and uploads a telemetry artifact.
name: Agent retry orchestrator

on:
  check_suite:
    types: [completed]

# Declaring any permission sets every unlisted scope to `none`, so
# `contents: read` has to be listed for actions/checkout to fetch the repo.
permissions:
  contents: read
  issues: write
  pull-requests: write
  checks: read

jobs:
  retry:
    if: github.event.check_suite.conclusion == 'failure'
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      # Outputs: skip_telemetry, pr_number, pr_labels, pr_body,
      # wall_failure_type, retry_count, final_outcome.
      - name: Evaluate retry policy
        id: retry
        uses: actions/github-script@v7
        with:
          script: |
            // Upper bound on retries tracked through the `retry:<n>` PR label.
            const MAX_RETRIES = 3;
            const suite = context.payload.check_suite;
            const linked = suite.pull_requests || [];
            const pr = linked[0];
            if (!pr) {
              // Nothing to annotate; later steps are gated on skip_telemetry.
              core.info('No PR associated with failed check suite');
              core.setOutput('skip_telemetry', '1');
              return;
            }
            const prNumber = pr.number;
            if (!prNumber) {
              core.info('Could not resolve PR number');
              core.setOutput('skip_telemetry', '1');
              return;
            }
            core.setOutput('pr_number', String(prNumber));

            // The webhook payload only carries a PR stub; fetch the full PR
            // to read its labels and body.
            const { data: fullPr } = await github.rest.pulls.get({
              owner: context.repo.owner,
              repo: context.repo.repo,
              pull_number: prNumber,
            });
            const labels = fullPr.labels.map((l) => l.name);
            core.setOutput('pr_labels', labels.join(','));
            core.setOutput('pr_body', fullPr.body || '');

            // Current retry count is encoded in a label such as `retry:2`.
            const retryLabel = labels.find((l) => l.startsWith('retry:'));
            const count = retryLabel ? parseInt(retryLabel.split(':')[1], 10) || 0 : 0;

            // Paginate: a single page holds at most 30 check runs by default,
            // which would hide failures on commits with many checks.
            const checkRuns = await github.paginate(github.rest.checks.listForRef, {
              owner: context.repo.owner,
              repo: context.repo.repo,
              ref: suite.head_sha,
              per_page: 100,
            });
            const failed = checkRuns.filter((c) => c.conclusion === 'failure');
            const failureType = failed[0]?.name || suite.app?.slug || 'ci';
            // Order-independent signature of every failed check run; falls
            // back to failureType when no failed run is listed.
            const sig = failed.map((c) => `${c.name}:${c.conclusion}`).sort().join('|') || failureType;
            core.setOutput('wall_failure_type', failureType);
            core.setOutput('retry_count', String(count));

            // Security-related failures are escalated without any retry.
            const noRetry = /security|zizmor|codeql/i.test(failureType);
            if (noRetry) {
              core.setOutput('final_outcome', 'escalated');
              await github.rest.issues.createComment({
                owner: context.repo.owner,
                repo: context.repo.repo,
                issue_number: prNumber,
                body: `⛔ \`${failureType}\` failures are not auto-retried. Human escalation required.\n\nfailure_sig: \`${sig}\``,
              });
              return;
            }

            // Paginate so an earlier comment with the same signature is still
            // found on PRs that have more than one page of comments.
            const comments = await github.paginate(github.rest.issues.listComments, {
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number: prNumber,
              per_page: 100,
            });
            const sigMatches = comments.filter((c) => c.body?.includes(`failure_sig: \`${sig}\``));
            if (sigMatches.length >= 1) {
              // This comment deliberately omits the `failure_sig: ` prefix.
              core.setOutput('final_outcome', 'escalated');
              await github.rest.issues.createComment({
                owner: context.repo.owner,
                repo: context.repo.repo,
                issue_number: prNumber,
                body: `⛔ Same failure signature detected twice (\`${sig}\`). Stopping auto-retry per docs/operations.md.`,
              });
              return;
            }

            if (count >= MAX_RETRIES) {
              core.setOutput('final_outcome', 'escalated');
              await github.rest.issues.createComment({
                owner: context.repo.owner,
                repo: context.repo.repo,
                issue_number: prNumber,
                body: `⛔ Max retries (${MAX_RETRIES}) reached for \`${failureType}\`. Human escalation required.\n\nfailure_sig: \`${sig}\``,
              });
              return;
            }

            // Retry allowed: bump the counter label and leave a comment that
            // records the signature for the duplicate check on the next run.
            const newCount = count + 1;
            core.setOutput('retry_count', String(newCount));
            core.setOutput('final_outcome', 'in_progress');
            if (retryLabel) {
              await github.rest.issues.removeLabel({
                owner: context.repo.owner,
                repo: context.repo.repo,
                issue_number: prNumber,
                name: retryLabel,
              });
            }
            await github.rest.issues.addLabels({
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number: prNumber,
              labels: [`retry:${newCount}`],
            });
            await github.rest.issues.createComment({
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number: prNumber,
              body: [
                `🔄 CI failed (\`${failureType}\`). Retry **${newCount}/${MAX_RETRIES}**.`,
                '',
                'Re-trigger coding agent or push a fix to the same PR.',
                `wall_failure_type: ${failureType}`,
                `failure_sig: \`${sig}\``,
              ].join('\n'),
            });

      - uses: actions/setup-node@v4
        if: steps.retry.outputs.skip_telemetry != '1'
        with:
          node-version: "22"

      # Step outputs are handed over as environment variables rather than
      # being interpolated into the command line.
      - name: Emit retry telemetry artifact
        if: steps.retry.outputs.skip_telemetry != '1'
        env:
          TELEMETRY_SOURCE: agent-retry-orchestrator
          GITHUB_REPOSITORY: ${{ github.repository }}
          GITHUB_RUN_ID: ${{ github.run_id }}
          GITHUB_RUN_ATTEMPT: ${{ github.run_attempt }}
          GITHUB_WORKFLOW: ${{ github.workflow }}
          GITHUB_EVENT_NAME: ${{ github.event_name }}
          PR_NUMBER: ${{ steps.retry.outputs.pr_number }}
          PR_BODY: ${{ steps.retry.outputs.pr_body }}
          PR_LABELS: ${{ steps.retry.outputs.pr_labels }}
          WALL_FAILURE_TYPE: ${{ steps.retry.outputs.wall_failure_type }}
          RETRY_COUNT: ${{ steps.retry.outputs.retry_count }}
          FINAL_OUTCOME: ${{ steps.retry.outputs.final_outcome }}
          AGENT_TYPE: orchestrator
          EXECUTION_MODE: gh_aw
        run: node scripts/emit-telemetry-artifact.mjs

      - name: Upload retry telemetry artifact
        if: steps.retry.outputs.skip_telemetry != '1'
        uses: actions/upload-artifact@v4
        with:
          name: retry-telemetry-${{ github.run_id }}
          path: telemetry-artifacts/
          if-no-files-found: error
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
# Copilot setup.
#
# Manually dispatched workflow that prepares toolchains and installs the
# dependencies of each sample project. Every language-specific step is guarded
# by hashFiles() on that sample's manifest/lockfile, so a stack whose sample
# directory is absent is skipped.
name: Copilot setup

on:
  workflow_dispatch:

# Least privilege: the steps only check out the repository and install
# packages, so read access to contents is all the token needs.
permissions:
  contents: read

jobs:
  copilot-setup-steps:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      # --- Toolchains -------------------------------------------------------
      - uses: actions/setup-node@v4
        if: hashFiles('sample/ts/package-lock.json') != ''
        with:
          node-version: "22"

      - uses: actions/setup-python@v5
        if: hashFiles('sample/python/requirements-dev.txt') != ''
        with:
          python-version: "3.12"

      - uses: actions/setup-go@v5
        if: hashFiles('sample/go/go.mod') != ''
        with:
          go-version: "1.22"

      - uses: ruby/setup-ruby@v1
        if: hashFiles('sample/ruby/Gemfile.lock') != ''
        with:
          ruby-version: "3.3"
          bundler-cache: true
          working-directory: sample/ruby

      - uses: shivammathur/setup-php@v2
        if: hashFiles('sample/php/composer.lock') != ''
        with:
          php-version: "8.2"
          coverage: none
          tools: composer

      # --- Sample dependencies ----------------------------------------------
      - name: Install TS sample deps
        if: hashFiles('sample/ts/package-lock.json') != ''
        working-directory: sample/ts
        run: npm ci

      - name: Install Python sample deps
        if: hashFiles('sample/python/requirements-dev.txt') != ''
        working-directory: sample/python
        run: pip install -r requirements-dev.txt

      - name: Install Go sample deps
        if: hashFiles('sample/go/go.mod') != ''
        working-directory: sample/go
        run: go mod download

      - name: Install Ruby sample deps
        if: hashFiles('sample/ruby/Gemfile.lock') != ''
        working-directory: sample/ruby
        run: bundle install

      - name: Install PHP sample deps
        if: hashFiles('sample/php/composer.lock') != ''
        working-directory: sample/php
        run: composer install --no-interaction --prefer-dist
|
|
@@ -0,0 +1,169 @@
|
|
|
1
|
+
# Eval CI.
#
# On pull requests touching harness/prompt/eval paths, and on a weekly
# schedule, the `select` job publishes a `jobs` output; each eval job runs only
# when its own name is contained in that output. `e2e-bench` runs on schedule
# only, and `telemetry` runs on pull requests regardless of upstream results.
name: Eval CI

on:
  pull_request:
    paths:
      - ".github/**"
      - "AGENTS.md"
      - "prompts/**"
      - "evals/**"
  schedule:
    - cron: "0 6 * * 1"

# Least-privilege default for every job; prompt-eval widens it below.
permissions:
  contents: read

jobs:
  select:
    runs-on: ubuntu-latest
    outputs:
      jobs: ${{ steps.sel.outputs.jobs }}
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0
      - uses: actions/setup-node@v4
        with:
          node-version: "22"
      - id: sel
        env:
          # NOTE(review): github.base_ref is empty outside pull_request
          # events, so on schedule runs this is the bare string `origin/`;
          # confirm select-eval-jobs.mjs tolerates that.
          BASE_SHA: origin/${{ github.base_ref }}
        run: node scripts/select-eval-jobs.mjs

  prompt-eval:
    needs: select
    if: contains(needs.select.outputs.jobs, 'prompt-eval')
    runs-on: ubuntu-latest
    # `gh models` calls GitHub Models, which needs the `models: read` scope.
    permissions:
      contents: read
      models: read
    steps:
      - uses: actions/checkout@v4
      - name: Prompt eval (gh models)
        env:
          # The GitHub CLI refuses to run inside Actions without a token.
          GH_TOKEN: ${{ github.token }}
        run: |
          shopt -s nullglob
          files=(prompts/*.prompt.yml)
          if [ ${#files[@]} -eq 0 ]; then
            echo "No .prompt.yml files; skipping"
            exit 0
          fi
          if ! command -v gh >/dev/null; then
            echo "::error::gh CLI required when prompt files exist"
            exit 1
          fi
          # `gh models` is provided by an extension that is not part of the
          # base CLI; install it when it is not already present.
          if ! gh extension list | grep -q 'github/gh-models'; then
            gh extension install github/gh-models
          fi
          for f in "${files[@]}"; do
            echo "Evaluating $f"
            gh models eval "$f"
          done

  agent-policy:
    needs: select
    if: contains(needs.select.outputs.jobs, 'agent-policy')
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-node@v4
        with:
          node-version: "22"
      - name: Validate agent definitions
        run: node scripts/validate-harness.mjs

  trajectory-conventions:
    needs: select
    if: contains(needs.select.outputs.jobs, 'trajectory-conventions')
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: "3.12"
      - name: Harness convention tests
        run: |
          pip install pytest
          pytest evals/trajectories -q

  # NOTE(review): runs the same pytest target as trajectory-conventions;
  # confirm whether a distinct test selection was intended.
  trajectory-task:
    needs: select
    if: contains(needs.select.outputs.jobs, 'trajectory-task')
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: "3.12"
      - name: Skill / task rubric tests
        run: |
          pip install pytest
          pytest evals/trajectories -q

  meta-eval:
    needs: select
    if: contains(needs.select.outputs.jobs, 'meta-eval')
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-node@v4
        with:
          node-version: "22"
      - uses: actions/setup-python@v5
        with:
          python-version: "3.12"
      - name: E2E manifest and task definitions
        run: |
          node scripts/check-e2e-manifest.mjs
          node scripts/run-e2e-bench.mjs
          pip install pytest
          pytest evals/trajectories -q

  e2e-bench:
    if: github.event_name == 'schedule'
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-node@v4
        with:
          node-version: "22"
      - name: E2E bench manifest and tasks
        run: |
          node scripts/check-e2e-manifest.mjs
          node scripts/run-e2e-bench.mjs

  telemetry:
    name: telemetry-artifact
    runs-on: ubuntu-latest
    # always(): still run when upstream jobs failed or were skipped, so the
    # artifact can carry their results via JOB_RESULTS.
    if: always() && github.event_name == 'pull_request'
    needs:
      - select
      - prompt-eval
      - agent-policy
      - trajectory-conventions
      - trajectory-task
      - meta-eval
      - e2e-bench
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - uses: actions/setup-node@v4
        with:
          node-version: "22"

      # PR-controlled text (body, labels) is passed through the environment
      # rather than interpolated into the command line.
      - name: Emit eval CI telemetry artifact
        env:
          TELEMETRY_SOURCE: eval-ci
          GITHUB_REPOSITORY: ${{ github.repository }}
          GITHUB_RUN_ID: ${{ github.run_id }}
          GITHUB_RUN_ATTEMPT: ${{ github.run_attempt }}
          GITHUB_WORKFLOW: ${{ github.workflow }}
          GITHUB_EVENT_NAME: ${{ github.event_name }}
          PR_NUMBER: ${{ github.event.pull_request.number || 0 }}
          PR_BODY: ${{ github.event.pull_request.body }}
          PR_LABELS: ${{ join(github.event.pull_request.labels.*.name, ',') }}
          BASE_SHA: origin/${{ github.base_ref }}
          JOB_RESULTS: ${{ toJSON(needs) }}
          AGENT_TYPE: eval
          EXECUTION_MODE: ci
          FINAL_OUTCOME: in_progress
        run: node scripts/emit-telemetry-artifact.mjs

      - name: Upload eval telemetry artifact
        uses: actions/upload-artifact@v4
        with:
          name: eval-telemetry-${{ github.run_id }}
          path: telemetry-artifacts/
          if-no-files-found: error
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
# Eval drift check.
#
# Weekly (and on manual dispatch) runs two checks and opens a labelled
# tracking issue for each one that trips, unless an open issue with the same
# title and the `harness:eval-drift` label already exists.
name: Eval drift check

on:
  schedule:
    - cron: "0 7 * * 1"
  workflow_dispatch:

permissions:
  issues: write
  contents: read

jobs:
  e2e-staleness:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-node@v4
        with:
          node-version: "22"

      # Sets output `stale=true` when the script's combined output contains
      # the word "warning"; the script's own exit status is ignored.
      - name: Check E2E manifest freshness
        id: e2e
        run: |
          node scripts/check-e2e-manifest.mjs 2>&1 | tee /tmp/e2e.out || true
          if grep -q warning /tmp/e2e.out; then
            echo "stale=true" >> "$GITHUB_OUTPUT"
          fi

      # Sets output `drift=true` only when the script exits with status 2;
      # pipefail makes `$?` reflect node rather than tee.
      - name: Check eval score drift (15pt threshold)
        id: score
        run: |
          set +e
          set -o pipefail
          node scripts/check-eval-score-drift.mjs 2>&1 | tee /tmp/score.out
          SC=$?
          if [ "$SC" = "2" ]; then
            echo "drift=true" >> "$GITHUB_OUTPUT"
          fi

      - name: Open bench review issue if stale
        if: steps.e2e.outputs.stale == 'true'
        uses: actions/github-script@v7
        with:
          script: |
            const title = 'E2E bench rotation due (quarterly 20%)';
            const { owner, repo } = context.repo;
            // Look for an already-open issue with this exact title.
            const { data: openIssues } = await github.rest.issues.listForRepo({
              owner,
              repo,
              state: 'open',
              labels: 'harness:eval-drift',
            });
            const alreadyOpen = openIssues.some((issue) => issue.title === title);
            if (alreadyOpen) {
              return;
            }
            await github.rest.issues.create({
              owner,
              repo,
              title,
              labels: ['harness:eval-drift', 'task:infra'],
              body: 'E2E manifest not rotated in 90 days. Review evals/e2e-bench/manifest.json per docs/operations.md.',
            });

      - name: Open eval score drift issue
        if: steps.score.outputs.drift == 'true'
        uses: actions/github-script@v7
        with:
          script: |
            const title = 'Eval/production acceptance gap exceeds 15pt';
            const { owner, repo } = context.repo;
            // Look for an already-open issue with this exact title.
            const { data: openIssues } = await github.rest.issues.listForRepo({
              owner,
              repo,
              state: 'open',
              labels: 'harness:eval-drift',
            });
            const alreadyOpen = openIssues.some((issue) => issue.title === title);
            if (alreadyOpen) {
              return;
            }
            await github.rest.issues.create({
              owner,
              repo,
              title,
              labels: ['harness:eval-drift', 'task:infra'],
              body: 'Eval pass rate exceeds production acceptance by >15pt. Review evals/.score-baseline.json and e2e bench per docs/operations.md.',
            });
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
# gh-aw dogfood CI.
#
# Validates the gh-aw workflow track: installs a pinned gh-aw CLI extension,
# runs the dogfood scenario tests, enforces the path scope on pull requests,
# runs the compile validation script, and uploads an evaluation report.
name: gh-aw dogfood CI

on:
  pull_request:
    paths:
      - ".github/workflows/nightly-harness-review.md"
      - ".github/workflows/nightly-harness-review.lock.yml"
      - ".github/workflows/weekly-redteam.md"
      - ".github/workflows/weekly-redteam.lock.yml"
      - ".github/workflows/gh-aw-dogfood-ci.yml"
      - ".github/labels.yml"
      - ".github/aw/**"
      - "scripts/**gh-aw**"
      - "scripts/lib/gh-aw-dogfood.mjs"
      - "docs/gh-aw-dogfood.md"
      - "docs/nightly-harness-review.md"
  workflow_dispatch:

permissions:
  contents: read

jobs:
  dogfood:
    name: validate-gh-aw-track
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - uses: actions/setup-node@v4
        with:
          node-version: "22"

      - name: Install gh aw extension
        env:
          GH_AW_VERSION: v0.81.6
          # The GitHub CLI refuses to run inside Actions without a token.
          GH_TOKEN: ${{ github.token }}
        run: |
          # `gh extension upgrade` has no --pin option, so a copy that is
          # already present is removed and the pinned version installed fresh.
          if gh extension list 2>/dev/null | grep -q 'github/gh-aw'; then
            gh extension remove gh-aw
          fi
          gh extension install github/gh-aw --pin "$GH_AW_VERSION"

      - name: Dogfood scenario tests
        run: node scripts/test-gh-aw-dogfood-scenarios.mjs

      - name: Enforce dogfood path scope
        if: github.event_name == 'pull_request'
        env:
          PR_LABELS: ${{ join(github.event.pull_request.labels.*.name, ',') }}
          BASE_SHA: origin/${{ github.base_ref }}
        run: node scripts/check-gh-aw-dogfood-scope.mjs

      - name: Validate gh-aw compile (when CLI available)
        env:
          GH_AW_COMPILE_REQUIRED: "0"
        run: node scripts/validate-gh-aw-compile.mjs

      - name: Emit dogfood evaluation report
        env:
          GITHUB_REPOSITORY: ${{ github.repository }}
          PR_LABELS: ${{ join(github.event.pull_request.labels.*.name, ',') }}
          # NOTE(review): github.base_ref is empty on workflow_dispatch, so
          # this is the bare string `origin/` there; confirm the script copes.
          BASE_SHA: origin/${{ github.base_ref }}
          DOGFOOD_REPORT_DIR: dogfood-report
        run: node scripts/emit-gh-aw-dogfood-report.mjs

      - name: Upload dogfood report
        uses: actions/upload-artifact@v4
        with:
          name: gh-aw-dogfood-${{ github.run_id }}
          path: dogfood-report/
          if-no-files-found: error
|