@guilz-dev/sdlc-gh 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (176)
  1. package/.github/CODEOWNERS +5 -0
  2. package/.github/ISSUE_TEMPLATE/bug_report.yml +68 -0
  3. package/.github/ISSUE_TEMPLATE/config.yml +1 -0
  4. package/.github/ISSUE_TEMPLATE/feature_request.yml +39 -0
  5. package/.github/ISSUE_TEMPLATE/support.yml +56 -0
  6. package/.github/ISSUE_TEMPLATE/task.yml +89 -0
  7. package/.github/agents/implementer.agent.md +17 -0
  8. package/.github/agents/reviewer.agent.md +18 -0
  9. package/.github/agents/triager.agent.md +13 -0
  10. package/.github/aw/actions-lock.json +9 -0
  11. package/.github/copilot-instructions.md +35 -0
  12. package/.github/hooks/hooks.json +12 -0
  13. package/.github/instructions/core.instructions.md +11 -0
  14. package/.github/instructions/profiles/go.instructions.md +10 -0
  15. package/.github/instructions/profiles/php.instructions.md +11 -0
  16. package/.github/instructions/profiles/python.instructions.md +11 -0
  17. package/.github/instructions/profiles/ruby.instructions.md +11 -0
  18. package/.github/instructions/profiles/typescript.instructions.md +11 -0
  19. package/.github/labels.yml +55 -0
  20. package/.github/pull_request_template.md +33 -0
  21. package/.github/ruleset.example.json +33 -0
  22. package/.github/ruleset.harness-eval.example.json +29 -0
  23. package/.github/skills/quality-loop/SKILL.md +23 -0
  24. package/.github/workflows/agent-retry-orchestrator.yml +161 -0
  25. package/.github/workflows/copilot-setup-steps.yml +64 -0
  26. package/.github/workflows/eval-ci.yml +169 -0
  27. package/.github/workflows/eval-drift.yml +75 -0
  28. package/.github/workflows/gh-aw-dogfood-ci.yml +73 -0
  29. package/.github/workflows/harness-ci.yml +244 -0
  30. package/.github/workflows/harness-sync.yml +28 -0
  31. package/.github/workflows/l1-readiness-check.yml +45 -0
  32. package/.github/workflows/labels-sync.yml +24 -0
  33. package/.github/workflows/nightly-harness-review.lock.yml +1643 -0
  34. package/.github/workflows/nightly-harness-review.md +87 -0
  35. package/.github/workflows/nightly-harness-review.yml +63 -0
  36. package/.github/workflows/npm-publish.yml +49 -0
  37. package/.github/workflows/pr-context-comment.yml +138 -0
  38. package/.github/workflows/product-ci-go.yml +33 -0
  39. package/.github/workflows/product-ci-php.yml +39 -0
  40. package/.github/workflows/product-ci-python.yml +34 -0
  41. package/.github/workflows/product-ci-ruby.yml +35 -0
  42. package/.github/workflows/product-ci-ts.yml +37 -0
  43. package/.github/workflows/task-issue-label-sync.yml +50 -0
  44. package/.github/workflows/weekly-redteam.lock.yml +1571 -0
  45. package/.github/workflows/weekly-redteam.md +76 -0
  46. package/.github/zizmor.yml +11 -0
  47. package/AGENTS.md +54 -0
  48. package/LICENSE +21 -0
  49. package/README.md +366 -0
  50. package/config/stacks.json +55 -0
  51. package/docs/adoption.md +126 -0
  52. package/docs/arch.md +535 -0
  53. package/docs/auth-boundaries.md +16 -0
  54. package/docs/coding-agent-l1.md +152 -0
  55. package/docs/exceptions/README.md +25 -0
  56. package/docs/exceptions/TEMPLATE.md +8 -0
  57. package/docs/failure-taxonomy.md +23 -0
  58. package/docs/gh-aw-dogfood.md +109 -0
  59. package/docs/kpi-baseline.md +9 -0
  60. package/docs/nightly-harness-review.md +94 -0
  61. package/docs/operations.md +108 -0
  62. package/docs/publishing.md +79 -0
  63. package/docs/revert-playbook.md +44 -0
  64. package/docs/shared-config.md +30 -0
  65. package/docs/telemetry-artifacts.md +78 -0
  66. package/docs/telemetry-schema.md +60 -0
  67. package/evals/.score-baseline.json +6 -0
  68. package/evals/e2e-bench/README.md +28 -0
  69. package/evals/e2e-bench/manifest.json +16 -0
  70. package/evals/e2e-bench/tasks/e2e-001.yml +10 -0
  71. package/evals/e2e-bench/tasks/e2e-002.yml +11 -0
  72. package/evals/e2e-bench/tasks/e2e-003.yml +10 -0
  73. package/evals/e2e-bench/tasks/e2e-004.yml +14 -0
  74. package/evals/e2e-bench/tasks/e2e-005.yml +11 -0
  75. package/evals/e2e-bench/tasks/e2e-006.yml +10 -0
  76. package/evals/e2e-bench/tasks/e2e-007.yml +10 -0
  77. package/evals/e2e-bench/tasks/e2e-008.yml +10 -0
  78. package/evals/e2e-bench/tasks/e2e-009.yml +10 -0
  79. package/evals/trajectories/rubric.md +12 -0
  80. package/evals/trajectories/test_harness_conventions.py +271 -0
  81. package/infra/README.md +49 -0
  82. package/infra/langfuse/docker-compose.yml +25 -0
  83. package/infra/otel/collector-config.yml +24 -0
  84. package/infra/samples/gh-aw-dogfood-report.json +44 -0
  85. package/infra/samples/harness-review-routing-plan.json +19 -0
  86. package/infra/samples/harness-review-summary.json +61 -0
  87. package/infra/samples/telemetry-artifact.json +29 -0
  88. package/infra/samples/telemetry-payload.json +19 -0
  89. package/package.json +85 -0
  90. package/prompts/triager-classify.prompt.yml +10 -0
  91. package/sample/go/add.go +5 -0
  92. package/sample/go/add_test.go +9 -0
  93. package/sample/go/go.mod +3 -0
  94. package/sample/php/composer.json +26 -0
  95. package/sample/php/composer.lock +1881 -0
  96. package/sample/php/phpunit.xml +8 -0
  97. package/sample/php/src/Add.php +13 -0
  98. package/sample/php/tests/AddTest.php +16 -0
  99. package/sample/python/requirements-dev.txt +2 -0
  100. package/sample/python/src/__init__.py +0 -0
  101. package/sample/python/src/greet.py +3 -0
  102. package/sample/python/tests/conftest.py +4 -0
  103. package/sample/python/tests/test_greet.py +5 -0
  104. package/sample/ruby/.rubocop.yml +10 -0
  105. package/sample/ruby/Gemfile +6 -0
  106. package/sample/ruby/Gemfile.lock +58 -0
  107. package/sample/ruby/lib/add.rb +9 -0
  108. package/sample/ruby/spec/add_spec.rb +11 -0
  109. package/sample/ts/biome.json +6 -0
  110. package/sample/ts/package-lock.json +1763 -0
  111. package/sample/ts/package.json +15 -0
  112. package/sample/ts/src/add.ts +3 -0
  113. package/sample/ts/tests/add.test.ts +8 -0
  114. package/sample/ts/tsconfig.json +12 -0
  115. package/scripts/aggregate-harness-review.mjs +48 -0
  116. package/scripts/bootstrap-harness.sh +411 -0
  117. package/scripts/check-diff-size.mjs +46 -0
  118. package/scripts/check-e2e-manifest.mjs +35 -0
  119. package/scripts/check-eval-score-drift.mjs +31 -0
  120. package/scripts/check-gh-aw-dogfood-scope.mjs +51 -0
  121. package/scripts/check-issue-spec.mjs +215 -0
  122. package/scripts/check-l1-readiness.mjs +82 -0
  123. package/scripts/check-open-pr-limit.mjs +34 -0
  124. package/scripts/doctor.mjs +177 -0
  125. package/scripts/emit-gh-aw-dogfood-report.mjs +112 -0
  126. package/scripts/emit-telemetry-artifact.mjs +99 -0
  127. package/scripts/fetch-telemetry-artifacts.mjs +176 -0
  128. package/scripts/harness-drift-report.mjs +99 -0
  129. package/scripts/lib/bootstrap-copy.mjs +123 -0
  130. package/scripts/lib/ccsd-contract.mjs +212 -0
  131. package/scripts/lib/diff-size.mjs +103 -0
  132. package/scripts/lib/doctor-local.mjs +179 -0
  133. package/scripts/lib/e2e-manifest.mjs +76 -0
  134. package/scripts/lib/gh-aw-dogfood.mjs +293 -0
  135. package/scripts/lib/github-config.mjs +94 -0
  136. package/scripts/lib/harness-ci-fragments.mjs +98 -0
  137. package/scripts/lib/harness-review-routing.mjs +244 -0
  138. package/scripts/lib/harness-review.mjs +388 -0
  139. package/scripts/lib/issue-form-label-sync.mjs +56 -0
  140. package/scripts/lib/l1-readiness.mjs +258 -0
  141. package/scripts/lib/merge-harness-package.mjs +36 -0
  142. package/scripts/lib/npm-package.mjs +129 -0
  143. package/scripts/lib/setup-wizard.mjs +224 -0
  144. package/scripts/lib/stacks.mjs +138 -0
  145. package/scripts/lib/telemetry-artifact.mjs +253 -0
  146. package/scripts/lib/template-root.mjs +39 -0
  147. package/scripts/merge-harness-package.mjs +14 -0
  148. package/scripts/route-harness-review.mjs +168 -0
  149. package/scripts/run-e2e-bench.mjs +216 -0
  150. package/scripts/sdlc-gh-cli.mjs +91 -0
  151. package/scripts/select-eval-jobs.mjs +41 -0
  152. package/scripts/setup-github.mjs +242 -0
  153. package/scripts/setup-github.sh +4 -0
  154. package/scripts/setup-wizard.mjs +426 -0
  155. package/scripts/test-bootstrap-guidance-scenarios.mjs +94 -0
  156. package/scripts/test-diff-size-scenarios.mjs +88 -0
  157. package/scripts/test-doctor-scenarios.mjs +70 -0
  158. package/scripts/test-e2e-manifest-scenarios.mjs +65 -0
  159. package/scripts/test-gh-aw-dogfood-scenarios.mjs +74 -0
  160. package/scripts/test-harness-review-routing-scenarios.mjs +130 -0
  161. package/scripts/test-harness-review-scenarios.mjs +92 -0
  162. package/scripts/test-hooks-scenarios.mjs +44 -0
  163. package/scripts/test-issue-form-label-sync-scenarios.mjs +48 -0
  164. package/scripts/test-issue-spec-scenarios.mjs +258 -0
  165. package/scripts/test-l1-readiness-scenarios.mjs +204 -0
  166. package/scripts/test-merge-harness-package-scenarios.mjs +53 -0
  167. package/scripts/test-npm-package-scenarios.mjs +31 -0
  168. package/scripts/test-sdlc-gh-cli-scenarios.mjs +54 -0
  169. package/scripts/test-setup-github-scenarios.mjs +103 -0
  170. package/scripts/test-setup-wizard-scenarios.mjs +114 -0
  171. package/scripts/test-telemetry-artifact-scenarios.mjs +69 -0
  172. package/scripts/trim-harness-ci.mjs +18 -0
  173. package/scripts/validate-gh-aw-compile.mjs +64 -0
  174. package/scripts/validate-harness.mjs +199 -0
  175. package/scripts/validate-telemetry.mjs +21 -0
  176. package/scripts/verify-bootstrap-stacks.sh +192 -0
@@ -0,0 +1,161 @@
# Agent retry orchestrator
#
# Reacts to a completed check suite whose conclusion is `failure`, resolves the
# first pull request linked to that suite, and applies the retry policy encoded
# in the script below:
#   * failures whose name matches security|zizmor|codeql are never retried;
#   * a failure signature that already appears in a PR comment stops retrying;
#   * otherwise the `retry:<n>` label is bumped until MAX_RETRIES is reached.
# A telemetry artifact is emitted and uploaded unless no PR could be resolved.
#
# NOTE(review): GitHub documents that `check_suite` does not trigger workflows
# for check suites created by GitHub Actions itself, so this presumably only
# fires for checks reported by other apps -- confirm this matches intent.
name: Agent retry orchestrator

on:
  check_suite:
    types: [completed]

permissions:
  # Once any permission is listed, unlisted scopes are dropped to `none`;
  # actions/checkout needs read access to contents on private repositories.
  contents: read
  issues: write
  pull-requests: write
  checks: read

jobs:
  retry:
    if: github.event.check_suite.conclusion == 'failure'
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Evaluate retry policy
        id: retry
        uses: actions/github-script@v7
        with:
          script: |
            const MAX_RETRIES = 3;
            const suite = context.payload.check_suite;
            const linked = suite.pull_requests || [];
            const pr = linked[0];
            if (!pr) {
              core.info('No PR associated with failed check suite');
              core.setOutput('skip_telemetry', '1');
              return;
            }
            const prNumber = pr.number;
            if (!prNumber) {
              core.info('Could not resolve PR number');
              core.setOutput('skip_telemetry', '1');
              return;
            }
            core.setOutput('pr_number', String(prNumber));
            const { data: fullPr } = await github.rest.pulls.get({
              owner: context.repo.owner,
              repo: context.repo.repo,
              pull_number: prNumber,
            });
            const labels = fullPr.labels.map((l) => l.name);
            core.setOutput('pr_labels', labels.join(','));
            core.setOutput('pr_body', fullPr.body || '');
            // The current attempt number is carried in a `retry:<n>` label.
            const retryLabel = labels.find((l) => l.startsWith('retry:'));
            const count = retryLabel ? parseInt(retryLabel.split(':')[1], 10) || 0 : 0;

            // Paginate: a single page holds 30 check runs by default, which
            // would truncate the failure signature on commits with many checks.
            const checkRuns = await github.paginate(github.rest.checks.listForRef, {
              owner: context.repo.owner,
              repo: context.repo.repo,
              ref: suite.head_sha,
              per_page: 100,
            });
            const failed = checkRuns.filter((c) => c.conclusion === 'failure');
            const failureType = failed[0]?.name || suite.app?.slug || 'ci';
            // Sorted so the signature does not depend on API ordering.
            const sig = failed.map((c) => `${c.name}:${c.conclusion}`).sort().join('|') || failureType;
            core.setOutput('wall_failure_type', failureType);
            core.setOutput('retry_count', String(count));

            const noRetry = /security|zizmor|codeql/i.test(failureType);
            if (noRetry) {
              core.setOutput('final_outcome', 'escalated');
              await github.rest.issues.createComment({
                owner: context.repo.owner,
                repo: context.repo.repo,
                issue_number: prNumber,
                body: `⛔ \`${failureType}\` failures are not auto-retried. Human escalation required.\n\nfailure_sig: \`${sig}\``,
              });
              return;
            }

            // Paginate: earlier retry comments must stay visible on PRs with
            // more than one page of comments, or the duplicate-signature guard
            // below silently stops working.
            const comments = await github.paginate(github.rest.issues.listComments, {
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number: prNumber,
              per_page: 100,
            });
            const sigMatches = comments.filter((c) => c.body?.includes(`failure_sig: \`${sig}\``));
            if (sigMatches.length >= 1) {
              core.setOutput('final_outcome', 'escalated');
              await github.rest.issues.createComment({
                owner: context.repo.owner,
                repo: context.repo.repo,
                issue_number: prNumber,
                body: `⛔ Same failure signature detected twice (\`${sig}\`). Stopping auto-retry per docs/operations.md.`,
              });
              return;
            }

            if (count >= MAX_RETRIES) {
              core.setOutput('final_outcome', 'escalated');
              await github.rest.issues.createComment({
                owner: context.repo.owner,
                repo: context.repo.repo,
                issue_number: prNumber,
                body: `⛔ Max retries (${MAX_RETRIES}) reached for \`${failureType}\`. Human escalation required.\n\nfailure_sig: \`${sig}\``,
              });
              return;
            }
            const newCount = count + 1;
            core.setOutput('retry_count', String(newCount));
            core.setOutput('final_outcome', 'in_progress');
            // Swap the old `retry:<n>` label for the incremented one.
            if (retryLabel) {
              await github.rest.issues.removeLabel({
                owner: context.repo.owner,
                repo: context.repo.repo,
                issue_number: prNumber,
                name: retryLabel,
              });
            }
            await github.rest.issues.addLabels({
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number: prNumber,
              labels: [`retry:${newCount}`],
            });
            await github.rest.issues.createComment({
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number: prNumber,
              body: [
                `🔄 CI failed (\`${failureType}\`). Retry **${newCount}/${MAX_RETRIES}**.`,
                '',
                'Re-trigger coding agent or push a fix to the same PR.',
                `wall_failure_type: ${failureType}`,
                `failure_sig: \`${sig}\``,
              ].join('\n'),
            });

      - uses: actions/setup-node@v4
        if: steps.retry.outputs.skip_telemetry != '1'
        with:
          node-version: "22"

      - name: Emit retry telemetry artifact
        if: steps.retry.outputs.skip_telemetry != '1'
        env:
          TELEMETRY_SOURCE: agent-retry-orchestrator
          GITHUB_REPOSITORY: ${{ github.repository }}
          GITHUB_RUN_ID: ${{ github.run_id }}
          GITHUB_RUN_ATTEMPT: ${{ github.run_attempt }}
          GITHUB_WORKFLOW: ${{ github.workflow }}
          GITHUB_EVENT_NAME: ${{ github.event_name }}
          PR_NUMBER: ${{ steps.retry.outputs.pr_number }}
          # Passed through env rather than interpolated into the script, so the
          # PR body is never evaluated as shell.
          PR_BODY: ${{ steps.retry.outputs.pr_body }}
          PR_LABELS: ${{ steps.retry.outputs.pr_labels }}
          WALL_FAILURE_TYPE: ${{ steps.retry.outputs.wall_failure_type }}
          RETRY_COUNT: ${{ steps.retry.outputs.retry_count }}
          FINAL_OUTCOME: ${{ steps.retry.outputs.final_outcome }}
          AGENT_TYPE: orchestrator
          EXECUTION_MODE: gh_aw
        run: node scripts/emit-telemetry-artifact.mjs

      - name: Upload retry telemetry artifact
        if: steps.retry.outputs.skip_telemetry != '1'
        uses: actions/upload-artifact@v4
        with:
          name: retry-telemetry-${{ github.run_id }}
          path: telemetry-artifacts/
          if-no-files-found: error
@@ -0,0 +1,64 @@
# Copilot setup
#
# Manually dispatched workflow that provisions language toolchains and installs
# the dependencies of each sample project. Every toolchain step and every
# install step is guarded by hashFiles() on that sample's manifest or lock
# file, so a stack whose sample directory is absent is skipped.
name: Copilot setup

on:
  workflow_dispatch:

jobs:
  copilot-setup-steps:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      # --- Toolchains -------------------------------------------------------
      - if: hashFiles('sample/ts/package-lock.json') != ''
        uses: actions/setup-node@v4
        with:
          node-version: "22"

      - if: hashFiles('sample/python/requirements-dev.txt') != ''
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      - if: hashFiles('sample/go/go.mod') != ''
        uses: actions/setup-go@v5
        with:
          go-version: "1.22"

      - if: hashFiles('sample/ruby/Gemfile.lock') != ''
        uses: ruby/setup-ruby@v1
        with:
          ruby-version: "3.3"
          bundler-cache: true
          working-directory: sample/ruby

      - if: hashFiles('sample/php/composer.lock') != ''
        uses: shivammathur/setup-php@v2
        with:
          php-version: "8.2"
          coverage: none
          tools: composer

      # --- Sample dependencies ----------------------------------------------
      - name: Install TS sample deps
        if: hashFiles('sample/ts/package-lock.json') != ''
        run: npm ci
        working-directory: sample/ts

      - name: Install Python sample deps
        if: hashFiles('sample/python/requirements-dev.txt') != ''
        run: pip install -r requirements-dev.txt
        working-directory: sample/python

      - name: Install Go sample deps
        if: hashFiles('sample/go/go.mod') != ''
        run: go mod download
        working-directory: sample/go

      - name: Install Ruby sample deps
        if: hashFiles('sample/ruby/Gemfile.lock') != ''
        run: bundle install
        working-directory: sample/ruby

      - name: Install PHP sample deps
        if: hashFiles('sample/php/composer.lock') != ''
        run: composer install --no-interaction --prefer-dist
        working-directory: sample/php
@@ -0,0 +1,169 @@
# Eval CI
#
# Runs on pull requests touching harness paths and on a weekly schedule.
# The `select` job runs scripts/select-eval-jobs.mjs and exposes its `jobs`
# output; each evaluation job runs only when its own name appears in that
# output. `e2e-bench` runs on the schedule only, and `telemetry` runs on pull
# requests regardless of the other jobs' results.
name: Eval CI

on:
  pull_request:
    paths:
      - ".github/**"
      - "AGENTS.md"
      - "prompts/**"
      - "evals/**"
  schedule:
    - cron: "0 6 * * 1"

jobs:
  select:
    runs-on: ubuntu-latest
    outputs:
      jobs: ${{ steps.sel.outputs.jobs }}
    steps:
      - uses: actions/checkout@v4
        with:
          # Full history so the script can see the base ref.
          fetch-depth: 0
      - uses: actions/setup-node@v4
        with:
          node-version: "22"
      - id: sel
        env:
          # NOTE(review): github.base_ref is empty on `schedule` events, so this
          # becomes "origin/" there -- confirm the script tolerates that.
          BASE_SHA: origin/${{ github.base_ref }}
        run: node scripts/select-eval-jobs.mjs

  prompt-eval:
    needs: select
    if: contains(needs.select.outputs.jobs, 'prompt-eval')
    runs-on: ubuntu-latest
    permissions:
      contents: read
      # GitHub Models inference with GITHUB_TOKEN requires this scope.
      models: read
    steps:
      - uses: actions/checkout@v4
      - name: Prompt eval (gh models)
        env:
          # The gh CLI does not pick up a token automatically inside Actions.
          GH_TOKEN: ${{ github.token }}
        run: |
          shopt -s nullglob
          files=(prompts/*.prompt.yml)
          if [ ${#files[@]} -eq 0 ]; then
            echo "No .prompt.yml files; skipping"
            exit 0
          fi
          if ! command -v gh >/dev/null; then
            echo "::error::gh CLI required when prompt files exist"
            exit 1
          fi
          # `gh models` is provided by an extension; install it when missing.
          if ! gh extension list 2>/dev/null | grep -q 'github/gh-models'; then
            gh extension install github/gh-models
          fi
          for f in "${files[@]}"; do
            echo "Evaluating $f"
            gh models eval "$f"
          done

  agent-policy:
    needs: select
    if: contains(needs.select.outputs.jobs, 'agent-policy')
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-node@v4
        with:
          node-version: "22"
      - name: Validate agent definitions
        run: node scripts/validate-harness.mjs

  trajectory-conventions:
    needs: select
    if: contains(needs.select.outputs.jobs, 'trajectory-conventions')
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: "3.12"
      - name: Harness convention tests
        run: |
          pip install pytest
          pytest evals/trajectories -q

  # NOTE(review): runs the same pytest command as trajectory-conventions;
  # confirm whether a narrower test selection was intended here.
  trajectory-task:
    needs: select
    if: contains(needs.select.outputs.jobs, 'trajectory-task')
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: "3.12"
      - name: Skill / task rubric tests
        run: |
          pip install pytest
          pytest evals/trajectories -q

  meta-eval:
    needs: select
    if: contains(needs.select.outputs.jobs, 'meta-eval')
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-node@v4
        with:
          node-version: "22"
      - uses: actions/setup-python@v5
        with:
          python-version: "3.12"
      - name: E2E manifest and task definitions
        run: |
          node scripts/check-e2e-manifest.mjs
          node scripts/run-e2e-bench.mjs
          pip install pytest
          pytest evals/trajectories -q

  e2e-bench:
    if: github.event_name == 'schedule'
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-node@v4
        with:
          node-version: "22"
      - name: E2E bench manifest and tasks
        run: |
          node scripts/check-e2e-manifest.mjs
          node scripts/run-e2e-bench.mjs

  telemetry:
    name: telemetry-artifact
    runs-on: ubuntu-latest
    # always(): still report when upstream jobs failed or were skipped.
    if: always() && github.event_name == 'pull_request'
    needs:
      - select
      - prompt-eval
      - agent-policy
      - trajectory-conventions
      - trajectory-task
      - meta-eval
      - e2e-bench
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - uses: actions/setup-node@v4
        with:
          node-version: "22"

      - name: Emit eval CI telemetry artifact
        env:
          TELEMETRY_SOURCE: eval-ci
          GITHUB_REPOSITORY: ${{ github.repository }}
          GITHUB_RUN_ID: ${{ github.run_id }}
          GITHUB_RUN_ATTEMPT: ${{ github.run_attempt }}
          GITHUB_WORKFLOW: ${{ github.workflow }}
          GITHUB_EVENT_NAME: ${{ github.event_name }}
          PR_NUMBER: ${{ github.event.pull_request.number || 0 }}
          PR_BODY: ${{ github.event.pull_request.body }}
          PR_LABELS: ${{ join(github.event.pull_request.labels.*.name, ',') }}
          BASE_SHA: origin/${{ github.base_ref }}
          # Results of every job listed under `needs`, serialized as JSON.
          JOB_RESULTS: ${{ toJSON(needs) }}
          AGENT_TYPE: eval
          EXECUTION_MODE: ci
          FINAL_OUTCOME: in_progress
        run: node scripts/emit-telemetry-artifact.mjs

      - name: Upload eval telemetry artifact
        uses: actions/upload-artifact@v4
        with:
          name: eval-telemetry-${{ github.run_id }}
          path: telemetry-artifacts/
          if-no-files-found: error
@@ -0,0 +1,75 @@
# Eval drift check
#
# Weekly (and manually dispatchable) job that runs two checks and opens a
# tracking issue for each one that trips:
#   * check-e2e-manifest.mjs: any output containing "warning" marks the
#     manifest as stale;
#   * check-eval-score-drift.mjs: exit status 2 marks score drift.
# An issue is only created when no open issue labelled `harness:eval-drift`
# already carries the same title.
name: Eval drift check

on:
  schedule:
    - cron: "0 7 * * 1"
  workflow_dispatch:

permissions:
  issues: write
  contents: read

jobs:
  e2e-staleness:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-node@v4
        with:
          node-version: "22"

      - name: Check E2E manifest freshness
        id: e2e
        run: |
          # `|| true`: a failure of this pipeline must not fail the step.
          node scripts/check-e2e-manifest.mjs 2>&1 | tee /tmp/e2e.out || true
          if grep -q warning /tmp/e2e.out; then
            echo "stale=true" >> "$GITHUB_OUTPUT"
          fi

      - name: Check eval score drift (15pt threshold)
        id: score
        run: |
          # Disable errexit but keep the script's own exit status through tee.
          set +e
          set -o pipefail
          node scripts/check-eval-score-drift.mjs 2>&1 | tee /tmp/score.out
          SC=$?
          if [ "$SC" = "2" ]; then
            echo "drift=true" >> "$GITHUB_OUTPUT"
          fi

      - name: Open bench review issue if stale
        if: steps.e2e.outputs.stale == 'true'
        uses: actions/github-script@v7
        with:
          script: |
            const title = 'E2E bench rotation due (quarterly 20%)';
            const driftLabel = 'harness:eval-drift';
            const { owner, repo } = context.repo;
            const { data: openIssues } = await github.rest.issues.listForRepo({
              owner,
              repo,
              state: 'open',
              labels: driftLabel,
            });
            // Skip when an open issue with this exact title already exists.
            const alreadyOpen = openIssues.some((issue) => issue.title === title);
            if (alreadyOpen) return;
            await github.rest.issues.create({
              owner,
              repo,
              title,
              labels: [driftLabel, 'task:infra'],
              body: 'E2E manifest not rotated in 90 days. Review evals/e2e-bench/manifest.json per docs/operations.md.',
            });

      - name: Open eval score drift issue
        if: steps.score.outputs.drift == 'true'
        uses: actions/github-script@v7
        with:
          script: |
            const title = 'Eval/production acceptance gap exceeds 15pt';
            const driftLabel = 'harness:eval-drift';
            const { owner, repo } = context.repo;
            const { data: openIssues } = await github.rest.issues.listForRepo({
              owner,
              repo,
              state: 'open',
              labels: driftLabel,
            });
            // Skip when an open issue with this exact title already exists.
            const alreadyOpen = openIssues.some((issue) => issue.title === title);
            if (alreadyOpen) return;
            await github.rest.issues.create({
              owner,
              repo,
              title,
              labels: [driftLabel, 'task:infra'],
              body: 'Eval pass rate exceeds production acceptance by >15pt. Review evals/.score-baseline.json and e2e bench per docs/operations.md.',
            });
@@ -0,0 +1,73 @@
# gh-aw dogfood CI
#
# Validates the gh-aw ("agentic workflow") track on pull requests that touch
# the listed paths, or on manual dispatch: installs a pinned gh-aw extension,
# runs the dogfood scenario tests, enforces the path scope on pull requests,
# runs the compile validation script, then emits and uploads a report.
name: gh-aw dogfood CI

on:
  pull_request:
    paths:
      - ".github/workflows/nightly-harness-review.md"
      - ".github/workflows/nightly-harness-review.lock.yml"
      - ".github/workflows/weekly-redteam.md"
      - ".github/workflows/weekly-redteam.lock.yml"
      - ".github/workflows/gh-aw-dogfood-ci.yml"
      - ".github/labels.yml"
      - ".github/aw/**"
      - "scripts/**gh-aw**"
      - "scripts/lib/gh-aw-dogfood.mjs"
      - "docs/gh-aw-dogfood.md"
      - "docs/nightly-harness-review.md"
  workflow_dispatch:

permissions:
  contents: read

jobs:
  dogfood:
    name: validate-gh-aw-track
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - uses: actions/setup-node@v4
        with:
          node-version: "22"

      - name: Install gh aw extension
        env:
          GH_AW_VERSION: v0.81.6
          # The gh CLI needs an explicit token inside Actions to reach GitHub
          # when installing or upgrading an extension.
          GH_TOKEN: ${{ github.token }}
        run: |
          if gh extension list 2>/dev/null | grep -q 'github/gh-aw'; then
            gh extension upgrade github/gh-aw --pin "$GH_AW_VERSION"
          else
            gh extension install github/gh-aw --pin "$GH_AW_VERSION"
          fi

      - name: Dogfood scenario tests
        run: node scripts/test-gh-aw-dogfood-scenarios.mjs

      - name: Enforce dogfood path scope
        if: github.event_name == 'pull_request'
        env:
          PR_LABELS: ${{ join(github.event.pull_request.labels.*.name, ',') }}
          BASE_SHA: origin/${{ github.base_ref }}
        run: node scripts/check-gh-aw-dogfood-scope.mjs

      - name: Validate gh-aw compile (when CLI available)
        env:
          GH_AW_COMPILE_REQUIRED: "0"
        run: node scripts/validate-gh-aw-compile.mjs

      - name: Emit dogfood evaluation report
        env:
          GITHUB_REPOSITORY: ${{ github.repository }}
          PR_LABELS: ${{ join(github.event.pull_request.labels.*.name, ',') }}
          # NOTE(review): github.base_ref is empty on workflow_dispatch, so this
          # becomes "origin/" there -- confirm the script tolerates that.
          BASE_SHA: origin/${{ github.base_ref }}
          DOGFOOD_REPORT_DIR: dogfood-report
        run: node scripts/emit-gh-aw-dogfood-report.mjs

      - name: Upload dogfood report
        uses: actions/upload-artifact@v4
        with:
          name: gh-aw-dogfood-${{ github.run_id }}
          path: dogfood-report/
          if-no-files-found: error