@htekdev/actions-debugger 1.0.124 → 1.0.125
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/errors/caching-artifacts/caching-artifacts-073.yml +100 -0
- package/errors/caching-artifacts/caching-artifacts-074.yml +117 -0
- package/errors/known-unsolved/known-unsolved-071.yml +122 -0
- package/errors/known-unsolved/known-unsolved-072.yml +143 -0
- package/errors/permissions-auth/permissions-auth-071.yml +144 -0
- package/errors/permissions-auth/permissions-auth-072.yml +112 -0
- package/errors/permissions-auth/permissions-auth-073.yml +127 -0
- package/errors/permissions-auth/permissions-auth-074.yml +106 -0
- package/errors/permissions-auth/permissions-auth-075.yml +137 -0
- package/errors/runner-environment/runner-environment-227.yml +106 -0
- package/errors/runner-environment/runner-environment-228.yml +117 -0
- package/errors/runner-environment/runner-environment-229.yml +119 -0
- package/errors/runner-environment/runner-environment-230.yml +129 -0
- package/errors/runner-environment/runner-environment-231.yml +90 -0
- package/errors/runner-environment/runner-environment-232.yml +131 -0
- package/errors/runner-environment/runner-environment-233.yml +90 -0
- package/errors/runner-environment/runner-environment-234.yml +114 -0
- package/errors/runner-environment/runner-environment-235.yml +151 -0
- package/errors/silent-failures/silent-failures-112.yml +97 -0
- package/errors/silent-failures/silent-failures-113.yml +110 -0
- package/errors/silent-failures/silent-failures-114.yml +116 -0
- package/errors/silent-failures/silent-failures-115.yml +130 -0
- package/errors/silent-failures/silent-failures-116.yml +117 -0
- package/errors/silent-failures/silent-failures-117.yml +137 -0
- package/errors/silent-failures/silent-failures-118.yml +156 -0
- package/errors/yaml-syntax/yaml-syntax-075.yml +128 -0
- package/errors/yaml-syntax/yaml-syntax-076.yml +107 -0
- package/package.json +1 -1
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
id: runner-environment-233
|
|
2
|
+
title: 'actions/checkout < v6.0.3 fails on SHA-256 repositories — fatal: mismatched algorithms: client sha1; server sha256'
|
|
3
|
+
category: runner-environment
|
|
4
|
+
severity: error
|
|
5
|
+
tags:
|
|
6
|
+
- checkout
|
|
7
|
+
- sha256
|
|
8
|
+
- git
|
|
9
|
+
- object-format
|
|
10
|
+
- hash-algorithm
|
|
11
|
+
- repository
|
|
12
|
+
- v6
|
|
13
|
+
patterns:
|
|
14
|
+
- regex: 'fatal: mismatched algorithms: client sha1; server sha256'
|
|
15
|
+
flags: 'i'
|
|
16
|
+
- regex: 'fatal: mismatched algorithms: client sha256; server sha1'
|
|
17
|
+
flags: 'i'
|
|
18
|
+
- regex: 'mismatched algorithms.*sha1.*sha256'
|
|
19
|
+
flags: 'i'
|
|
20
|
+
error_messages:
|
|
21
|
+
- 'fatal: mismatched algorithms: client sha1; server sha256'
|
|
22
|
+
- "The process '/usr/bin/git' failed with exit code 128"
|
|
23
|
+
root_cause: |
|
|
24
|
+
GitHub is progressively rolling out support for SHA-256 object-format repositories
|
|
25
|
+
(also called NewHash or sha256-mode repos). When a repository is SHA-256-mode,
|
|
26
|
+
every git object reference is a 64-character hex SHA-256 hash instead of the
|
|
27
|
+
traditional 40-character SHA-1 hash.
|
|
28
|
+
|
|
29
|
+
Before actions/checkout v6.0.3, the checkout action always initialised the local
|
|
30
|
+
working-tree repository with a plain `git init`, which defaults to SHA-1 object
|
|
31
|
+
format. When the action then attempts to `git fetch` from a SHA-256 remote, git
|
|
32
|
+
detects that the local and remote repositories use different hash algorithms and
|
|
33
|
+
immediately aborts:
|
|
34
|
+
|
|
35
|
+
fatal: mismatched algorithms: client sha1; server sha256
|
|
36
|
+
|
|
37
|
+
The mismatch is unrecoverable — the local repository must be created with
|
|
38
|
+
`git init --object-format=sha256` BEFORE the first fetch. There is no in-place
|
|
39
|
+
conversion.
|
|
40
|
+
|
|
41
|
+
This only affects workflows where the target GitHub repository has been migrated
|
|
42
|
+
to or created as SHA-256 mode. GitHub began rolling out SHA-256 repository creation
|
|
43
|
+
in late 2025; adoption will grow over time. Previously-created SHA-1 repositories
|
|
44
|
+
are unaffected until they are explicitly migrated.
|
|
45
|
+
|
|
46
|
+
The fix shipped in actions/checkout v6.0.3 (June 2, 2026, commit 1cce339).
|
|
47
|
+
It calls the GitHub REST endpoint `GET /repos/{owner}/{repo}/hash-algorithm`
|
|
48
|
+
before `git init` and passes `--object-format=sha256` when the repository is
|
|
49
|
+
SHA-256 mode.
|
|
50
|
+
|
|
51
|
+
Source: actions/checkout#2160, PR#2439.
|
|
52
|
+
fix: |
|
|
53
|
+
Upgrade to actions/checkout@v6.0.3 or later. Version v6.0.3 detects the
|
|
54
|
+
repository's object format via the GitHub API before initialising the local
|
|
55
|
+
git repository and passes the correct `--object-format` flag to `git init`.
|
|
56
|
+
|
|
57
|
+
If you cannot upgrade immediately, set the GIT_DEFAULT_HASH environment variable
|
|
58
|
+
to sha256 before running checkout, then reinitialise manually — but upgrading
|
|
59
|
+
the action is strongly preferred.
|
|
60
|
+
|
|
61
|
+
Note: v6.0.3 introduced a new REST API call (`GET /repos/.../hash-algorithm`) on
|
|
62
|
+
every checkout, which can cause secondary rate-limit issues for organisations with
|
|
63
|
+
very high parallel job counts. See runner-environment-206 for that separate issue.
|
|
64
|
+
fix_code:
|
|
65
|
+
- language: yaml
|
|
66
|
+
label: 'Upgrade to actions/checkout v6.0.3+ to fix SHA-256 repository support'
|
|
67
|
+
code: |
|
|
68
|
+
- uses: actions/checkout@v6.0.3
|
|
69
|
+
with:
|
|
70
|
+
fetch-depth: 1
|
|
71
|
+
- language: yaml
|
|
72
|
+
label: 'Pin to v6.0.3+ by SHA for reproducibility (recommended for security-sensitive workflows)'
|
|
73
|
+
code: |
|
|
74
|
+
# Pin to the exact commit SHA of v6.0.3 to lock the version
|
|
75
|
+
- uses: actions/checkout@1cce3390c2bfda521930d01229c073c7ff920824 # v6.0.3
|
|
76
|
+
with:
|
|
77
|
+
fetch-depth: 1
|
|
78
|
+
prevention:
|
|
79
|
+
- 'Always pin to a specific patch version of actions/checkout (e.g., @v6.0.3) rather than a floating major tag (@v6), so breaking changes from new GitHub repository features land on your own schedule'
|
|
80
|
+
- 'If your organisation creates new repositories, check whether SHA-256 mode is the default; SHA-256 repos break every workflow using checkout < v6.0.3 with a hard `fatal: mismatched algorithms` error at fetch time'
|
|
81
|
+
- 'Monitor the actions/checkout changelog when upgrading; v6.0.3 also introduced a new API call that can affect rate limits in high-parallelism environments (see runner-environment-206)'
|
|
82
|
+
docs:
|
|
83
|
+
- url: 'https://github.com/actions/checkout/issues/2160'
|
|
84
|
+
label: 'actions/checkout#2160 — hashing mismatch error on SHA-256 repository'
|
|
85
|
+
- url: 'https://github.com/actions/checkout/pull/2439'
|
|
86
|
+
label: 'actions/checkout PR#2439 — Fix checkout init for SHA-256 repositories (merged June 2026)'
|
|
87
|
+
- url: 'https://github.com/actions/checkout/releases/tag/v6.0.3'
|
|
88
|
+
label: 'actions/checkout v6.0.3 release — SHA-256 repository support'
|
|
89
|
+
- url: 'https://git-scm.com/docs/hash-function-transition'
|
|
90
|
+
label: 'Git documentation — SHA-256 hash function transition'
|
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
id: runner-environment-234
|
|
2
|
+
title: "actions/checkout v6 credential helper overrides embedded PAT in cross-repo git push — 403 Permission denied to github-actions[bot]"
|
|
3
|
+
category: runner-environment
|
|
4
|
+
severity: error
|
|
5
|
+
tags:
|
|
6
|
+
- checkout
|
|
7
|
+
- v6
|
|
8
|
+
- credential-helper
|
|
9
|
+
- cross-repo
|
|
10
|
+
- PAT
|
|
11
|
+
- git-push
|
|
12
|
+
- extraheader
|
|
13
|
+
- 403
|
|
14
|
+
- permission-denied
|
|
15
|
+
patterns:
|
|
16
|
+
- regex: 'remote: Permission to .+\.git denied to github-actions\[bot\]'
|
|
17
|
+
flags: 'i'
|
|
18
|
+
- regex: 'fatal: unable to access .+github\.com.+: The requested URL returned error: 403'
|
|
19
|
+
flags: 'i'
|
|
20
|
+
- regex: 'Permission denied to github-actions\[bot\]\.'
|
|
21
|
+
flags: 'i'
|
|
22
|
+
error_messages:
|
|
23
|
+
- 'remote: Permission to owner/nightly-releases.git denied to github-actions[bot].'
|
|
24
|
+
- 'fatal: unable to access ''https://github.com/owner/repo/'': The requested URL returned error: 403'
|
|
25
|
+
root_cause: |
|
|
26
|
+
When a workflow uses actions/checkout v6 and then manually adds a second git remote
|
|
27
|
+
whose URL embeds a Personal Access Token (PAT), the cross-repo git push fails with
|
|
28
|
+
a 403 even though the PAT is valid and has the required scope.
|
|
29
|
+
|
|
30
|
+
actions/checkout injects a GitHub token (GITHUB_TOKEN by default) into the local
|
|
31
|
+
git configuration as an HTTP Authorization header for `https://github.com/` URLs
|
|
32
|
+
via `http.https://github.com/.extraheader`. A common pattern to push to a separate
|
|
33
|
+
repository using a PAT is to:
|
|
34
|
+
1. Add a remote with the PAT embedded in the URL:
|
|
35
|
+
git remote add nightly_repo https://TOKEN@github.com/owner/repo.git
|
|
36
|
+
2. Remove the checkout-injected extraheader to prevent it from conflicting:
|
|
37
|
+
git config --unset-all http.https://github.com/.extraheader
|
|
38
|
+
3. Push via the PAT URL remote.
|
|
39
|
+
|
|
40
|
+
This pattern worked correctly with actions/checkout v4 and v5. In v6, the sequence
|
|
41
|
+
fails because the action now also registers a git credential helper
|
|
42
|
+
(`credential.https://github.com.helper`) alongside the extraheader. When the
|
|
43
|
+
extraheader is manually unset in step 2, the credential helper remains configured.
|
|
44
|
+
Git consults the credential helper for all `https://github.com/` requests —
|
|
45
|
+
including the push to the PAT URL remote — and the helper returns GITHUB_TOKEN
|
|
46
|
+
(authenticated as `github-actions[bot]`). This token overrides the PAT embedded
|
|
47
|
+
in the remote URL, and since `github-actions[bot]` does not have write access to
|
|
48
|
+
the target repository, the push fails with HTTP 403.
|
|
49
|
+
|
|
50
|
+
The root distinction from earlier versions: unsetting `http.extraheader` alone is
|
|
51
|
+
no longer sufficient to fully remove checkout's credential injection in v6.
|
|
52
|
+
|
|
53
|
+
Source: actions/checkout#2424 (April 2026, open).
|
|
54
|
+
fix: |
|
|
55
|
+
**Option 1 (recommended): Use `persist-credentials: false` in checkout.**
|
|
56
|
+
This prevents checkout from configuring any git credentials at all. The subsequent
|
|
57
|
+
git push uses the PAT embedded in the remote URL without interference.
|
|
58
|
+
|
|
59
|
+
**Option 2: Explicitly clear the credential helper after checkout.**
|
|
60
|
+
After the checkout step, unset both the extraheader AND the credential helper:
|
|
61
|
+
git config --unset-all http.https://github.com/.extraheader || true
|
|
62
|
+
git config --global --unset credential.https://github.com.helper || true
|
|
63
|
+
git config --global --unset credential.helper || true
|
|
64
|
+
|
|
65
|
+
Note: the exact helper key may vary; inspect `git config --list` to see all keys
|
|
66
|
+
set by checkout before unsetting.
|
|
67
|
+
|
|
68
|
+
**Option 3: Provide the PAT via the checkout `token:` input instead.**
|
|
69
|
+
If the PAT has access to both the primary repo and the target cross-repo, pass it
|
|
70
|
+
as the `token:` input so checkout configures it as the credential for all
|
|
71
|
+
github.com pushes. This avoids the need to inject a second credential entirely.
|
|
72
|
+
fix_code:
|
|
73
|
+
- language: yaml
|
|
74
|
+
label: 'Recommended: use persist-credentials: false and embed PAT in remote URL'
|
|
75
|
+
code: |
|
|
76
|
+
- uses: actions/checkout@v6
|
|
77
|
+
with:
|
|
78
|
+
persist-credentials: false # prevents checkout from injecting any git credentials
|
|
79
|
+
|
|
80
|
+
- name: Push tags to nightly releases repository
|
|
81
|
+
env:
|
|
82
|
+
NIGHTLY_PAT: ${{ secrets.NIGHTLY_PAT }}
|
|
83
|
+
run: |
|
|
84
|
+
git remote add nightly_repo \
|
|
85
|
+
"https://${NIGHTLY_PAT}@github.com/owner/nightly-releases.git"
|
|
86
|
+
git tag -f nightly
|
|
87
|
+
git push -f nightly_repo nightly
|
|
88
|
+
- language: yaml
|
|
89
|
+
label: 'Alternative: pass PAT to checkout token: input (when PAT has access to primary repo)'
|
|
90
|
+
code: |
|
|
91
|
+
- uses: actions/checkout@v6
|
|
92
|
+
with:
|
|
93
|
+
# If CROSS_REPO_PAT has access to the checked-out repo as well,
|
|
94
|
+
# checkout will configure it for all github.com credential requests.
|
|
95
|
+
token: ${{ secrets.CROSS_REPO_PAT }}
|
|
96
|
+
|
|
97
|
+
- name: Push tags to nightly releases repository
|
|
98
|
+
run: |
|
|
99
|
+
git remote add nightly_repo \
|
|
100
|
+
"https://github.com/owner/nightly-releases.git"
|
|
101
|
+
git tag -f nightly
|
|
102
|
+
git push -f nightly_repo nightly
|
|
103
|
+
prevention:
|
|
104
|
+
- 'Use `persist-credentials: false` in any checkout step that is followed by a cross-repo git push using a PAT, to avoid checkout credential helpers interfering with the push'
|
|
105
|
+
- 'When debugging 403 errors on git push, check `git config --list` to see all credential-related configuration; look for any helper or extraheader keys referencing github.com'
|
|
106
|
+
- 'Avoid relying on unsetting only `http.https://github.com/.extraheader` to remove checkout credentials in v6+; the set of keys checkout writes has changed across versions'
|
|
107
|
+
- 'When migrating from actions/checkout v4/v5 to v6, test all workflows that manually manipulate git credentials or push to cross-repo remotes'
|
|
108
|
+
docs:
|
|
109
|
+
- url: 'https://github.com/actions/checkout/issues/2424'
|
|
110
|
+
label: 'actions/checkout#2424 — Unable to create tags in another repo with v6 (403 Permission denied)'
|
|
111
|
+
- url: 'https://github.com/actions/checkout#usage'
|
|
112
|
+
label: 'actions/checkout README — persist-credentials input'
|
|
113
|
+
- url: 'https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication'
|
|
114
|
+
label: 'GitHub Docs — GITHUB_TOKEN automatic token authentication'
|
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
id: runner-environment-235
|
|
2
|
+
title: 'Action download fails with HTTP 429 Too Many Requests during action resolution — "Failed to download action ... Response status code does not indicate success: 429"'
|
|
3
|
+
category: runner-environment
|
|
4
|
+
severity: error
|
|
5
|
+
tags:
|
|
6
|
+
- action-download
|
|
7
|
+
- rate-limit
|
|
8
|
+
- 429
|
|
9
|
+
- action-resolution
|
|
10
|
+
- tarball
|
|
11
|
+
- github-api
|
|
12
|
+
- outage
|
|
13
|
+
- retry
|
|
14
|
+
patterns:
|
|
15
|
+
- regex: 'Failed to download action.*429|Response status code does not indicate success: 429 \(Too Many Requests\)'
|
|
16
|
+
flags: 'i'
|
|
17
|
+
- regex: 'Error: Response status code does not indicate success: 429'
|
|
18
|
+
flags: 'i'
|
|
19
|
+
- regex: 'Warning: Failed to download action.*tarball.*429'
|
|
20
|
+
flags: 'i'
|
|
21
|
+
error_messages:
|
|
22
|
+
- "Warning: Failed to download action 'https://api.github.com/repos/actions/checkout/tarball/11bd71901bbe5b1630ceea73d27597364c9af683'. Error: Response status code does not indicate success: 429 (Too Many Requests)."
|
|
23
|
+
- "Error: Response status code does not indicate success: 429 (Too Many Requests)."
|
|
24
|
+
- "Download action repository 'actions/checkout@v4'"
|
|
25
|
+
root_cause: |
|
|
26
|
+
When a GitHub Actions job starts, the runner downloads each action referenced in the
|
|
27
|
+
workflow by fetching the action's source tarball from the GitHub API endpoint:
|
|
28
|
+
GET https://api.github.com/repos/{owner}/{repo}/tarball/{sha}
|
|
29
|
+
|
|
30
|
+
This endpoint is subject to GitHub's API rate limits. When many runners start
|
|
31
|
+
simultaneously — during periods of high load, GitHub infrastructure incidents, or
|
|
32
|
+
large-scale rollouts — the tarball download requests can be rate-limited, returning
|
|
33
|
+
HTTP 429 Too Many Requests.
|
|
34
|
+
|
|
35
|
+
The runner logs a warning and retries with exponential backoff:
|
|
36
|
+
"Warning: Failed to download action '...'. Error: Response status code does not
|
|
37
|
+
indicate success: 429 (Too Many Requests). <correlationId>"
|
|
38
|
+
|
|
39
|
+
If the rate limit persists through all retry attempts, the job setup fails and the
|
|
40
|
+
entire job is marked as failed with "##[error]" on the "Set up job" step. The failure
|
|
41
|
+
is NOT a problem with the workflow configuration itself — it is a transient GitHub
|
|
42
|
+
infrastructure issue.
|
|
43
|
+
|
|
44
|
+
Contributing factors:
|
|
45
|
+
1. **GitHub outages or degraded performance** — action downloads share API quota with
|
|
46
|
+
all other GitHub API traffic from the runner's IP range.
|
|
47
|
+
2. **High parallel job counts** — organizations running hundreds of parallel jobs on
|
|
48
|
+
GitHub-hosted runners can exhaust rate limit buckets across a runner fleet.
|
|
49
|
+
3. **Composite actions with many nested uses** — deeply nested composite actions make
|
|
50
|
+
many download calls per job, multiplying the per-job API request count. PR #4296
|
|
51
|
+
in actions/runner (merged 2026-03) adds batching to reduce this.
|
|
52
|
+
4. **Self-hosted runners without caching** — runners that are freshly provisioned per
|
|
53
|
+
job always re-download all actions; runners with persistent `_work/_tool` directories
|
|
54
|
+
can cache downloads across jobs.
|
|
55
|
+
|
|
56
|
+
Note: this is distinct from runner-environment-202 (repeated downloads caused by
|
|
57
|
+
case-sensitivity mismatch in v2.334.0), which causes multiple SUCCESSFUL downloads
|
|
58
|
+
rather than 429 failures.
|
|
59
|
+
|
|
60
|
+
Source: actions/runner#4232 (Feb 2026, open), actions/checkout#2230.
|
|
61
|
+
fix: |
|
|
62
|
+
**Immediate:** Retry the failed workflow run. 429 errors during action downloads
|
|
63
|
+
are almost always transient — a re-run a few minutes later usually succeeds.
|
|
64
|
+
|
|
65
|
+
**Short-term: Reduce per-job download requests.**
|
|
66
|
+
Pin actions to a specific commit SHA rather than a mutable tag. Runners maintain an
|
|
67
|
+
action download cache keyed by commit SHA; pinned SHAs hit the cache more reliably
|
|
68
|
+
than mutable tags which may require a fresh lookup each run.
|
|
69
|
+
|
|
70
|
+
**For self-hosted runners: persist the action cache directory.**
|
|
71
|
+
The runner caches downloaded actions in `<runner-root>/_work/_tool`. If your runner
|
|
72
|
+
instances are ephemeral (e.g., provisioned fresh per job on EC2/GKE), mount a shared
|
|
73
|
+
volume or EFS/EBS on this path to share the cache across instances, reducing
|
|
74
|
+
download traffic dramatically.
|
|
75
|
+
|
|
76
|
+
**For high-parallelism organizations: stagger workflow start times.**
|
|
77
|
+
Spread out scheduled workflows or push triggers to avoid thousands of jobs starting
|
|
78
|
+
simultaneously. Use concurrency groups or scheduled-at offsets to distribute load.
|
|
79
|
+
|
|
80
|
+
**Self-hosted runners on GHES / GHAE:** Use GHES's internal action caching to serve
|
|
81
|
+
action downloads locally instead of hitting GitHub.com — this avoids rate limiting
|
|
82
|
+
entirely.
|
|
83
|
+
|
|
84
|
+
**Monitor GitHub status:** Check https://www.githubstatus.com/ when 429 errors appear
|
|
85
|
+
— they often correlate with incidents listed on the status page.
|
|
86
|
+
fix_code:
|
|
87
|
+
- language: yaml
|
|
88
|
+
label: 'Pin actions to commit SHA to maximize cache hits and reduce download requests'
|
|
89
|
+
code: |
|
|
90
|
+
# Instead of mutable tags (re-downloaded each time if tag moves):
|
|
91
|
+
# - uses: actions/checkout@v4
|
|
92
|
+
# - uses: actions/setup-node@v4
|
|
93
|
+
|
|
94
|
+
# Pin to immutable commit SHAs for reliable caching:
|
|
95
|
+
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
|
|
96
|
+
- uses: actions/setup-node@39370e3970a6d050c480ffad4ff0ed4d3fdee5af # v4.1.0
|
|
97
|
+
- language: yaml
|
|
98
|
+
label: 'Self-hosted runner: configure RUNNER_TOOL_CACHE to a persistent shared path'
|
|
99
|
+
code: |
|
|
100
|
+
# In the runner environment, set the tool cache path before starting the runner:
|
|
101
|
+
# export RUNNER_TOOL_CACHE=/mnt/shared/runner-tool-cache
|
|
102
|
+
# ./run.sh
|
|
103
|
+
|
|
104
|
+
# Or in the runner .env file (config/.env relative to the runner root):
|
|
105
|
+
# RUNNER_TOOL_CACHE=/mnt/shared/runner-tool-cache
|
|
106
|
+
|
|
107
|
+
# Docker-based self-hosted runner: mount the cache directory:
|
|
108
|
+
# docker run -v /mnt/shared/runner-tool-cache:/opt/hostedtoolcache ...
|
|
109
|
+
- language: yaml
|
|
110
|
+
label: 'Workflow-level retry: re-run the job automatically on setup failure'
|
|
111
|
+
code: |
|
|
112
|
+
# Actions does not have a built-in retry for setup failures.
|
|
113
|
+
# A common workaround: use a retry action in the first step that validates setup.
|
|
114
|
+
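# Alternatively, set up a workflow_run trigger to retry on failure (fires on ANY failure, not just 429; the target workflow must declare `workflow_dispatch`; cap retries yourself to avoid a retry loop):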
|
|
115
|
+
on:
|
|
116
|
+
workflow_run:
|
|
117
|
+
workflows: ["CI"]
|
|
118
|
+
types: [completed]
|
|
119
|
+
|
|
120
|
+
jobs:
|
|
121
|
+
retry-on-429:
|
|
122
|
+
if: github.event.workflow_run.conclusion == 'failure'
|
|
123
|
+
runs-on: ubuntu-latest
|
|
124
|
+
steps:
|
|
125
|
+
- name: Re-trigger the failed workflow
|
|
126
|
+
uses: actions/github-script@v7
|
|
127
|
+
with:
|
|
128
|
+
script: |
|
|
129
|
+
await github.rest.actions.createWorkflowDispatch({
|
|
130
|
+
owner: context.repo.owner,
|
|
131
|
+
repo: context.repo.repo,
|
|
132
|
+
workflow_id: 'ci.yml',
|
|
133
|
+
ref: context.payload.workflow_run.head_branch
|
|
134
|
+
});
|
|
135
|
+
prevention:
|
|
136
|
+
- 'Pin all `uses:` references to immutable commit SHAs — this maximises the chance that the runner serves the action from its local download cache rather than calling the GitHub API'
|
|
137
|
+
- 'For self-hosted runners provisioned per job (ephemeral), mount a persistent shared volume at the runner tool cache path so action downloads are reused across job invocations'
|
|
138
|
+
- 'Upgrade to actions/runner v2.335.0+ which includes PR#4296 action resolution batching — this reduces the number of download requests per job for workflows using composite actions'
|
|
139
|
+
- 'When you see 429 errors, check https://www.githubstatus.com/ — they often coincide with GitHub API rate limit incidents affecting the entire platform'
|
|
140
|
+
- 'Spread scheduled workflows across different minutes using cron expressions like `17 4 * * *` rather than `0 4 * * *` to avoid the stampede effect when many organizations schedule jobs at round-number times'
|
|
141
|
+
docs:
|
|
142
|
+
- url: 'https://github.com/actions/runner/issues/4232'
|
|
143
|
+
label: 'actions/runner#4232: Warning: Failed to download action ... 429 Too Many Requests'
|
|
144
|
+
- url: 'https://github.com/actions/checkout/issues/2230'
|
|
145
|
+
label: 'actions/checkout#2230: checkout fails with 429 Too Many Requests'
|
|
146
|
+
- url: 'https://github.com/actions/runner/pull/4296'
|
|
147
|
+
label: 'actions/runner PR#4296: Batch and deduplicate action resolution across composite depths (reduces 429 risk)'
|
|
148
|
+
- url: 'https://docs.github.com/en/rest/overview/rate-limits-for-the-rest-api'
|
|
149
|
+
label: 'GitHub Docs: REST API rate limits'
|
|
150
|
+
- url: 'https://www.githubstatus.com/'
|
|
151
|
+
label: 'GitHub Status page — check for active incidents when 429 errors appear'
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
id: silent-failures-112
|
|
2
|
+
title: 'Non-ephemeral self-hosted runner REST API busy flag desync causes auto-scaler to kill active job'
|
|
3
|
+
category: silent-failures
|
|
4
|
+
severity: silent-failure
|
|
5
|
+
tags:
|
|
6
|
+
- self-hosted
|
|
7
|
+
- non-ephemeral
|
|
8
|
+
- rest-api
|
|
9
|
+
- autoscaling
|
|
10
|
+
- broker
|
|
11
|
+
- busy-flag
|
|
12
|
+
- desync
|
|
13
|
+
- aws
|
|
14
|
+
patterns:
|
|
15
|
+
- regex: 'The runner has received a shutdown signal'
|
|
16
|
+
flags: 'i'
|
|
17
|
+
- regex: '"busy":\s*false'
|
|
18
|
+
flags: 'i'
|
|
19
|
+
- regex: 'broker\.actions\.githubusercontent\.com.*busy.*JobState'
|
|
20
|
+
flags: 'i'
|
|
21
|
+
error_messages:
|
|
22
|
+
- '##[error]The runner has received a shutdown signal.'
|
|
23
|
+
- '"busy": false ← REST API shows idle while broker actively renewing job lease'
|
|
24
|
+
- 'Successfully renew job, valid till ...'
|
|
25
|
+
root_cause: |
|
|
26
|
+
On non-ephemeral self-hosted runners (runners that execute multiple sequential jobs on the same
|
|
27
|
+
instance), there is a state desynchronization between the broker layer
|
|
28
|
+
(broker.actions.githubusercontent.com) and the REST API endpoint
|
|
29
|
+
GET /repos/{owner}/{repo}/actions/runners/{runner_id}.
|
|
30
|
+
|
|
31
|
+
The failure sequence:
|
|
32
|
+
1. Runner completes Job A and briefly enters the Online/idle state.
|
|
33
|
+
2. Runner picks up Job B, transitions to Busy, and reports JobState: Busy to the broker.
|
|
34
|
+
3. The broker acknowledges the job and successfully renews the lease every 60 seconds
|
|
35
|
+
(logged as "Successfully renew job, valid till ...").
|
|
36
|
+
4. Despite (3), the REST API returns "busy": false — sometimes immediately when Job B starts,
|
|
37
|
+
sometimes after tracking correctly for several minutes before spontaneously flipping.
|
|
38
|
+
5. Auto-scaling infrastructure (AWS Lambda scaler, GKE controllers, terraform-aws-github-runner,
|
|
39
|
+
or custom polling services) queries the REST API, sees the runner as idle, and terminates the
|
|
40
|
+
EC2/VM instance mid-job.
|
|
41
|
+
6. Job B fails with "The runner has received a shutdown signal." No error is surfaced by GitHub's
|
|
42
|
+
UI indicating the runner was inappropriately killed.
|
|
43
|
+
|
|
44
|
+
The desync appears to occur at the Busy→Online transition boundary between jobs. The broker and
|
|
45
|
+
the REST API maintain separate state stores; in certain timing windows the REST API does not pick
|
|
46
|
+
up the Job B start event, leaving its state stale from the inter-job idle period.
|
|
47
|
+
fix: |
|
|
48
|
+
Option 1 — Use ephemeral runners (recommended). Each runner instance handles exactly one job
|
|
49
|
+
and then terminates. There is no second job pick-up, so the busy-flag desync window never exists.
|
|
50
|
+
|
|
51
|
+
Option 2 — Do not rely solely on "busy": false from the REST API as the termination signal.
|
|
52
|
+
Cross-reference with a job-completion event (CloudWatch, Pub/Sub, broker callback) or add a
|
|
53
|
+
grace period of 60-120 seconds after the REST API first reports idle before terminating.
|
|
54
|
+
|
|
55
|
+
Option 3 — Use the Webhook-based ARC (Actions Runner Controller) instead of REST-poll-based
|
|
56
|
+
autoscaling. ARC receives direct runner lifecycle events and does not depend on the REST API
|
|
57
|
+
busy state for scale-down decisions.
|
|
58
|
+
fix_code:
|
|
59
|
+
- language: hcl
|
|
60
|
+
label: 'Enable ephemeral runners in terraform-aws-github-runner to prevent multi-job desync'
|
|
61
|
+
code: |
|
|
62
|
+
module "runners" {
|
|
63
|
+
source  = "github-aws-runners/github-runner/aws"
|
|
64
|
+
version = "~> 6.6.0"
|
|
65
|
+
|
|
66
|
+
# Each instance executes exactly one job then terminates.
|
|
67
|
+
# Eliminates the busy-flag desync window between sequential jobs.
|
|
68
|
+
enable_ephemeral_runners = true
|
|
69
|
+
|
|
70
|
+
# ... other config
|
|
71
|
+
}
|
|
72
|
+
|
|
73
|
+
- language: yaml
|
|
74
|
+
label: 'Declare ephemeral runner label in the workflow'
|
|
75
|
+
code: |
|
|
76
|
+
jobs:
|
|
77
|
+
build:
|
|
78
|
+
# Request a fresh ephemeral runner — one job per instance.
|
|
79
|
+
# Requires your runner pool to be configured for ephemeral mode.
|
|
80
|
+
runs-on: [self-hosted, ephemeral, linux, x64]
|
|
81
|
+
steps:
|
|
82
|
+
- uses: actions/checkout@v4
|
|
83
|
+
- run: ./build.sh
|
|
84
|
+
prevention:
|
|
85
|
+
- 'Prefer ephemeral runners over reuse runners in autoscaling environments — one job per instance eliminates the busy-flag desync window entirely.'
|
|
86
|
+
- 'Never use "busy": false from the REST API as the sole signal for terminating a non-ephemeral runner instance.'
|
|
87
|
+
- 'Add a minimum 90-second grace period after the REST API first reports busy=false before terminating any non-ephemeral runner.'
|
|
88
|
+
- 'Monitor broker lease-renewal logs ("Successfully renew job") to track active job state independently of the REST API.'
|
|
89
|
+
docs:
|
|
90
|
+
- url: 'https://github.com/actions/runner/issues/4422'
|
|
91
|
+
label: 'actions/runner#4422 — /runners REST API reports busy: false while broker says busy (open)'
|
|
92
|
+
- url: 'https://github.com/github-aws-runners/terraform-aws-github-runner'
|
|
93
|
+
label: 'terraform-aws-github-runner — ephemeral runner module'
|
|
94
|
+
- url: 'https://docs.github.com/en/rest/actions/self-hosted-runners#list-self-hosted-runners-for-a-repository'
|
|
95
|
+
label: 'GitHub Docs — REST API for self-hosted runners'
|
|
96
|
+
- url: 'https://docs.github.com/en/actions/hosting-your-own-runners/managing-self-hosted-runners/about-self-hosted-runners'
|
|
97
|
+
label: 'GitHub Docs — About self-hosted runners'
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
id: silent-failures-113
|
|
2
|
+
title: 'Org-level self-hosted runner with correct group access never dispatched — job stays Queued indefinitely with no error'
|
|
3
|
+
category: silent-failures
|
|
4
|
+
severity: silent-failure
|
|
5
|
+
tags:
|
|
6
|
+
- self-hosted
|
|
7
|
+
- runner-group
|
|
8
|
+
- org-runner
|
|
9
|
+
- dispatch
|
|
10
|
+
- queued
|
|
11
|
+
- v2-broker
|
|
12
|
+
- silent
|
|
13
|
+
patterns:
|
|
14
|
+
- regex: 'runner_group_id.*null'
|
|
15
|
+
flags: 'i'
|
|
16
|
+
- regex: 'Waiting for a runner to pick up this job'
|
|
17
|
+
flags: 'i'
|
|
18
|
+
error_messages:
|
|
19
|
+
- 'Waiting for a runner to pick up this job'
|
|
20
|
+
- '(job stays in Queued state indefinitely — no error message is displayed)'
|
|
21
|
+
root_cause: |
|
|
22
|
+
When a self-hosted runner is registered at the **organization level** (not the
|
|
23
|
+
repository level) and added to a runner group with explicit per-repository access,
|
|
24
|
+
a bug in the V2 broker flow (`useV2Flow: true`,
|
|
25
|
+
`serverUrl: broker.actions.githubusercontent.com`) can cause the dispatcher to fail
|
|
26
|
+
to resolve the runner group membership against the target repository.
|
|
27
|
+
|
|
28
|
+
The result: the queued job never receives a `runner_group_id` assignment
|
|
29
|
+
(confirmed via `GET /repos/{owner}/{repo}/actions/runs/{run_id}/jobs` API, which
|
|
30
|
+
returns `runner_id: null`, `runner_name: null`, `runner_group_id: null` throughout the
|
|
31
|
+
entire queue duration). The runner is online and idle throughout — it simply is
|
|
32
|
+
never offered the job.
|
|
33
|
+
|
|
34
|
+
From the UI perspective, the job shows the standard "Waiting for a runner to pick
|
|
35
|
+
up this job" message with no indication that the runner group resolution failed.
|
|
36
|
+
No error annotation is ever produced — the workflow just hangs until cancelled.
|
|
37
|
+
|
|
38
|
+
This failure mode has been observed under these conditions:
|
|
39
|
+
- Runner registered at organization scope
|
|
40
|
+
- Runner group has explicit per-repository access (not "All repositories")
|
|
41
|
+
- Runner version 2.334.0, V2 broker protocol enabled
|
|
42
|
+
- GitHub Team plan
|
|
43
|
+
- GitHub Enterprise Cloud is NOT required to reproduce
|
|
44
|
+
|
|
45
|
+
Repository-level runners using identical labels and configuration dispatch
|
|
46
|
+
correctly, confirming the bug is specific to the org-level + runner-group + V2
|
|
47
|
+
broker path.
|
|
48
|
+
|
|
49
|
+
The precise broker-side failure point is unknown; the hypothesis is an inconsistency
|
|
50
|
+
in how V2 broker resolves org runner group → repository access grant at dispatch time.
|
|
51
|
+
fix: |
|
|
52
|
+
Immediate workaround: re-register the runner at the repository level instead of
|
|
53
|
+
the organization level.
|
|
54
|
+
|
|
55
|
+
1. Stop and unregister the org-level runner:
|
|
56
|
+
cd <runner-dir> && ./config.sh remove --token <removal-token>
|
|
57
|
+
|
|
58
|
+
2. Re-register at repository level:
|
|
59
|
+
./config.sh --url https://github.com/<org>/<repo> --token <repo-token>
|
|
60
|
+
|
|
61
|
+
3. Trigger the workflow again — dispatch should proceed immediately.
|
|
62
|
+
|
|
63
|
+
If you need the runner to serve multiple repositories, repeat the registration
|
|
64
|
+
for each repository, or use runner groups with "All repositories" scope as an
|
|
65
|
+
interim workaround until GitHub resolves the V2 broker dispatch bug.
|
|
66
|
+
|
|
67
|
+
Track actions/runner#4429 for an official fix.
|
|
68
|
+
fix_code:
|
|
69
|
+
- language: bash
|
|
70
|
+
label: 'Re-register runner at repository level instead of org level'
|
|
71
|
+
code: |
|
|
72
|
+
cd /path/to/runner
|
|
73
|
+
|
|
74
|
+
# Remove org-level registration
|
|
75
|
+
./config.sh remove --token <ORG_REMOVAL_TOKEN>
|
|
76
|
+
|
|
77
|
+
# Re-register at repository level
|
|
78
|
+
./config.sh \
|
|
79
|
+
--url https://github.com/<org>/<repo> \
|
|
80
|
+
--token <REPO_RUNNER_TOKEN> \
|
|
81
|
+
--name my-runner \
|
|
82
|
+
--labels my-label \
|
|
83
|
+
--unattended
|
|
84
|
+
|
|
85
|
+
# Start the runner
|
|
86
|
+
./run.sh
|
|
87
|
+
|
|
88
|
+
- language: yaml
|
|
89
|
+
label: 'Workflow — no workflow change needed; fix is in runner registration scope'
|
|
90
|
+
code: |
|
|
91
|
+
# No changes needed in the workflow YAML itself.
|
|
92
|
+
# The runs-on label works correctly once the runner is repo-scoped.
|
|
93
|
+
jobs:
|
|
94
|
+
build:
|
|
95
|
+
runs-on: [self-hosted, my-label]
|
|
96
|
+
steps:
|
|
97
|
+
- uses: actions/checkout@v4
|
|
98
|
+
- run: echo "Runner dispatched correctly"
|
|
99
|
+
prevention:
|
|
100
|
+
- 'Prefer repository-level runner registration for single-repo pipelines; use org-level runners only when the runner must serve many repos, and test dispatch before rolling out.'
|
|
101
|
+
- 'After registering an org-level runner, verify dispatch with a simple echo workflow before building production pipelines on top of it.'
|
|
102
|
+
- 'Monitor actions/runner#4429 for a fix to the V2 broker org-runner-group dispatch resolution bug.'
|
|
103
|
+
- 'When investigating stuck jobs, use the REST API (GET /repos/{owner}/{repo}/actions/runs/{run_id}/jobs) to check if `runner_group_id` is null — this confirms dispatch resolution failure rather than a resource-wait.'
|
|
104
|
+
docs:
|
|
105
|
+
- url: 'https://github.com/actions/runner/issues/4429'
|
|
106
|
+
label: 'actions/runner#4429 — Org-level self-hosted runner never dispatched despite correct runner group repo access'
|
|
107
|
+
- url: 'https://docs.github.com/en/actions/hosting-your-own-runners/managing-self-hosted-runners/adding-self-hosted-runners'
|
|
108
|
+
label: 'GitHub Docs — Adding self-hosted runners (org vs repo level)'
|
|
109
|
+
- url: 'https://docs.github.com/en/rest/actions/self-hosted-runners'
|
|
110
|
+
label: 'GitHub REST API — Self-hosted runners'
|