@htekdev/actions-debugger 1.0.4 → 1.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/errors/concurrency-timing/runner-group-not-found-race.yml +97 -0
- package/errors/runner-environment/checkout-safe-directory-container.yml +84 -0
- package/errors/runner-environment/self-hosted-runner-version-deprecated.yml +99 -0
- package/errors/silent-failures/cache-hit-restore-keys-misleading.yml +82 -0
- package/package.json +1 -1
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
id: "concurrency-timing-008"
|
|
2
|
+
title: "Intermittent 'Required runner group not found' when ephemeral runner registers after job dispatch"
|
|
3
|
+
category: "concurrency-timing"
|
|
4
|
+
severity: "error"
|
|
5
|
+
tags:
|
|
6
|
+
- "runner-group"
|
|
7
|
+
- "self-hosted"
|
|
8
|
+
- "ephemeral"
|
|
9
|
+
- "race-condition"
|
|
10
|
+
- "dispatch"
|
|
11
|
+
- "organization"
|
|
12
|
+
patterns:
|
|
13
|
+
- regex: "Required runner group '.*' not found"
|
|
14
|
+
flags: "i"
|
|
15
|
+
- regex: "runner_group_id.*null"
|
|
16
|
+
flags: "i"
|
|
17
|
+
error_messages:
|
|
18
|
+
- "Required runner group 'x' not found"
|
|
19
|
+
root_cause: |
|
|
20
|
+
In autoscaling self-hosted runner setups (EC2, GKE ephemeral runners, ARC), the runner
|
|
21
|
+
must register with GitHub BEFORE the job dispatcher resolves runner group membership.
|
|
22
|
+
|
|
23
|
+
The race condition occurs when:
|
|
24
|
+
1. A workflow is triggered and GitHub's broker immediately tries to assign the job
|
|
25
|
+
2. The ephemeral runner is still initializing (EC2 bootstrap, container pull; roughly 2.5 minutes)
|
|
26
|
+
3. The broker resolves runner group membership before the new runner completes registration
|
|
27
|
+
4. The broker reports "Required runner group 'X' not found" and fails the job
|
|
28
|
+
|
|
29
|
+
This is intermittent: matrix jobs expose it clearly because some cells get
|
|
30
|
+
already-running (pre-registered) runners while others need a fresh runner, triggering
|
|
31
|
+
the race. Inspecting the failed job via the Jobs API shows `runner_group_id: null`
|
|
32
|
+
and `runner_name: null` throughout the queue duration even though the runner group
|
|
33
|
+
exists and has the correct repository access.
|
|
34
|
+
|
|
35
|
+
A separate but related pattern occurs with org-level runner group repository access
|
|
36
|
+
grants not propagating to the broker V2 protocol in time, causing identical symptoms
|
|
37
|
+
regardless of runner initialization speed.
|
|
38
|
+
|
|
39
|
+
Reported upstream: https://github.com/actions/runner/issues/4252
|
|
40
|
+
Related: https://github.com/actions/runner/issues/4429
|
|
41
|
+
fix: |
|
|
42
|
+
For ephemeral autoscaling runners:
|
|
43
|
+
Implement a registration wait loop that polls the GitHub Runners API before signaling
|
|
44
|
+
the runner as available. The runner should only become eligible for jobs after the
|
|
45
|
+
broker has acknowledged its registration.
|
|
46
|
+
|
|
47
|
+
For org-level runner group access issues:
|
|
48
|
+
Verify that the target repository is in the runner group's allowed repositories list
|
|
49
|
+
via API. If misconfigured, re-registering the runner at repository level instead of
|
|
50
|
+
org level is a reliable workaround.
|
|
51
|
+
|
|
52
|
+
General mitigation: Add timeout-minutes to all jobs on self-hosted runners so stuck
|
|
53
|
+
queued jobs fail fast rather than waiting until the 6-hour workflow timeout.
|
|
54
|
+
fix_code:
|
|
55
|
+
- language: yaml
|
|
56
|
+
label: "Add timeout-minutes to detect stuck queued jobs quickly"
|
|
57
|
+
code: |
|
|
58
|
+
jobs:
|
|
59
|
+
build:
|
|
60
|
+
runs-on:
|
|
61
|
+
group: my-runner-group
|
|
62
|
+
labels: [self-hosted, linux]
|
|
63
|
+
timeout-minutes: 10 # Fail fast if runner never picks up the job
|
|
64
|
+
steps:
|
|
65
|
+
- uses: actions/checkout@v4
|
|
66
|
+
- run: echo "Runner assigned successfully"
|
|
67
|
+
- language: yaml
|
|
68
|
+
label: "Verify runner group repository access via API"
|
|
69
|
+
code: |
|
|
70
|
+
jobs:
|
|
71
|
+
debug-runner-group:
|
|
72
|
+
runs-on: ubuntu-latest
|
|
73
|
+
steps:
|
|
74
|
+
- name: Check runner group repository access
|
|
75
|
+
env:
|
|
76
|
+
GH_TOKEN: ${{ secrets.ORG_RUNNER_READ_TOKEN }}
|
|
77
|
+
run: |
|
|
78
|
+
echo "Runner groups and their visibility:"
|
|
79
|
+
gh api /orgs/${{ github.repository_owner }}/actions/runner-groups \
|
|
80
|
+
--jq '.runner_groups[] | "\(.name) (id: \(.id)) — visibility: \(.visibility)"'
|
|
81
|
+
|
|
82
|
+
echo "Repositories allowed for runner group ID 1:"
|
|
83
|
+
gh api /orgs/${{ github.repository_owner }}/actions/runner-groups/1/repositories \
|
|
84
|
+
--jq '.repositories[].full_name'
|
|
85
|
+
prevention:
|
|
86
|
+
- "Add timeout-minutes to all jobs using self-hosted runners so stuck-queued jobs fail fast instead of waiting for the 6h workflow limit"
|
|
87
|
+
- "For ephemeral runners (EC2/ARC), implement a registration health check that polls the Runners API before the runner accepts jobs"
|
|
88
|
+
- "For org-level runners, verify group repository access via API after any runner group configuration change"
|
|
89
|
+
- "For matrix jobs with ephemeral runners, keep N idle pre-registered runners to avoid cold-start races"
|
|
90
|
+
- "Monitor runner_group_id via the Jobs API to detect dispatch failures early in autoscaling pipelines"
|
|
91
|
+
docs:
|
|
92
|
+
- url: "https://github.com/actions/runner/issues/4252"
|
|
93
|
+
label: "actions/runner #4252 — Intermittent runner group not found"
|
|
94
|
+
- url: "https://github.com/actions/runner/issues/4429"
|
|
95
|
+
label: "actions/runner #4429 — Org-level runner never dispatched"
|
|
96
|
+
- url: "https://docs.github.com/en/actions/hosting-your-own-runners/managing-self-hosted-runners/managing-access-to-self-hosted-runners-using-groups"
|
|
97
|
+
label: "GitHub Docs — Managing runner group access"
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
id: "runner-environment-022"
|
|
2
|
+
title: "actions/checkout set-safe-directory only runs in post step — container jobs get dubious ownership errors"
|
|
3
|
+
category: "runner-environment"
|
|
4
|
+
severity: "error"
|
|
5
|
+
tags:
|
|
6
|
+
- "actions/checkout"
|
|
7
|
+
- "safe-directory"
|
|
8
|
+
- "container"
|
|
9
|
+
- "dubious-ownership"
|
|
10
|
+
- "CVE-2022-24765"
|
|
11
|
+
- "post-step"
|
|
12
|
+
patterns:
|
|
13
|
+
- regex: "fatal: detected dubious ownership in repository"
|
|
14
|
+
flags: "i"
|
|
15
|
+
- regex: "safe\\.directory.*not.*owned"
|
|
16
|
+
flags: "i"
|
|
17
|
+
error_messages:
|
|
18
|
+
- "fatal: detected dubious ownership in repository at '/github/workspace'"
|
|
19
|
+
- "hint: git config --global --add safe.directory /github/workspace"
|
|
20
|
+
root_cause: |
|
|
21
|
+
The `actions/checkout` action configures `safe.directory` to allow git operations in
|
|
22
|
+
the workspace. However, this configuration only runs in the **post step** (cleanup
|
|
23
|
+
phase), not during the main execution step.
|
|
24
|
+
|
|
25
|
+
In container jobs, the workspace is mounted from the host and may be owned by a
|
|
26
|
+
different UID than the user running inside the container. Git's safe.directory
|
|
27
|
+
protection (introduced in Git 2.35.2 for CVE-2022-24765) blocks access when the
|
|
28
|
+
directory owner differs from the running user.
|
|
29
|
+
|
|
30
|
+
Because safe.directory is only written during the post step — after all workflow
|
|
31
|
+
steps have already run — any git operations in the job's main steps
|
|
32
|
+
fail with "fatal: detected dubious ownership". This includes third-party actions
|
|
33
|
+
that internally invoke git (reviewdog, gitops tools, semantic-release, etc.).
|
|
34
|
+
|
|
35
|
+
Reported upstream: https://github.com/actions/checkout/issues/2031
|
|
36
|
+
fix: |
|
|
37
|
+
Add an explicit safe.directory configuration step immediately after `actions/checkout`
|
|
38
|
+
in any container job that performs git operations. This ensures the directory is
|
|
39
|
+
trusted before any subsequent steps run.
|
|
40
|
+
fix_code:
|
|
41
|
+
- language: yaml
|
|
42
|
+
label: "Add safe.directory config step after checkout in container jobs"
|
|
43
|
+
code: |
|
|
44
|
+
jobs:
|
|
45
|
+
build:
|
|
46
|
+
runs-on: ubuntu-latest
|
|
47
|
+
container: node:20-bookworm
|
|
48
|
+
steps:
|
|
49
|
+
- uses: actions/checkout@v4
|
|
50
|
+
|
|
51
|
+
# Workaround: post step safe.directory config doesn't help in container jobs
|
|
52
|
+
- name: Mark workspace as safe for git
|
|
53
|
+
run: git config --global --add safe.directory "$GITHUB_WORKSPACE"
|
|
54
|
+
|
|
55
|
+
- name: Run git-dependent steps
|
|
56
|
+
run: git log --oneline -5
|
|
57
|
+
- language: yaml
|
|
58
|
+
label: "Use wildcard to mark all directories safe in container workflows"
|
|
59
|
+
code: |
|
|
60
|
+
jobs:
|
|
61
|
+
build:
|
|
62
|
+
runs-on: ubuntu-latest
|
|
63
|
+
container: python:3.12-slim
|
|
64
|
+
steps:
|
|
65
|
+
- uses: actions/checkout@v4
|
|
66
|
+
|
|
67
|
+
- name: Configure git safe directories
|
|
68
|
+
run: |
|
|
69
|
+
git config --global --add safe.directory '*'
|
|
70
|
+
|
|
71
|
+
- name: Lint with pre-commit
|
|
72
|
+
run: pre-commit run --all-files
|
|
73
|
+
prevention:
|
|
74
|
+
- "Always add a safe.directory config step after checkout when using container jobs"
|
|
75
|
+
- "Audit third-party actions in container jobs — reviewdog, semantic-release, and gitops tools invoke git internally"
|
|
76
|
+
- "Consider running without a container and using docker run explicitly if git safety is complex to manage"
|
|
77
|
+
- "Track https://github.com/actions/checkout/issues/2031 for an official fix from the actions team"
|
|
78
|
+
docs:
|
|
79
|
+
- url: "https://github.com/actions/checkout/issues/2031"
|
|
80
|
+
label: "actions/checkout #2031 — safe.directory only set in post step"
|
|
81
|
+
- url: "https://github.blog/2022-04-12-git-security-vulnerability-announced/"
|
|
82
|
+
label: "Git CVE-2022-24765 — safe.directory background"
|
|
83
|
+
- url: "https://docs.github.com/en/actions/writing-workflows/choosing-where-your-workflow-runs/running-jobs-in-a-container"
|
|
84
|
+
label: "GitHub Docs — Running jobs in a container"
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
id: "runner-environment-023"
|
|
2
|
+
title: "Self-hosted runner on deprecated version stops receiving jobs"
|
|
3
|
+
category: "runner-environment"
|
|
4
|
+
severity: "error"
|
|
5
|
+
tags:
|
|
6
|
+
- "self-hosted"
|
|
7
|
+
- "runner"
|
|
8
|
+
- "deprecated"
|
|
9
|
+
- "version"
|
|
10
|
+
- "cannot-receive-messages"
|
|
11
|
+
- "maintenance"
|
|
12
|
+
patterns:
|
|
13
|
+
- regex: "Runner version v\\d+\\.\\d+\\.\\d+ is deprecated and cannot receive messages"
|
|
14
|
+
flags: "i"
|
|
15
|
+
- regex: "WRITE ERROR.*runner.*deprecated"
|
|
16
|
+
flags: "i"
|
|
17
|
+
error_messages:
|
|
18
|
+
- "Runner version v2.332.0 is deprecated and cannot receive messages."
|
|
19
|
+
- "WRITE ERROR: An error occured: Runner version v2.XXX.X is deprecated and cannot receive messages."
|
|
20
|
+
root_cause: |
|
|
21
|
+
GitHub periodically deprecates older self-hosted runner versions. Once a runner version
|
|
22
|
+
is past its deprecation deadline, the runner agent can no longer communicate with the
|
|
23
|
+
GitHub Actions broker service.
|
|
24
|
+
|
|
25
|
+
The runner process stays alive, appears online in the GitHub UI (Settings → Actions →
|
|
26
|
+
Runners), and is listed as "Active" — but it can no longer receive job assignments.
|
|
27
|
+
Jobs queued for a runner group containing only deprecated-version runners will either
|
|
28
|
+
stay "Queued" indefinitely or time out without a clear error in the workflow UI.
|
|
29
|
+
|
|
30
|
+
This is a silent failure mode: the runner shows as online, no workflow error is
|
|
31
|
+
surfaced, but jobs never start. The deprecation schedule is published in the GitHub
|
|
32
|
+
Changelog and actions/runner releases but teams often miss it without automated
|
|
33
|
+
update pipelines.
|
|
34
|
+
|
|
35
|
+
As of 2026, GitHub requires runners to be within approximately the last 6 months of
|
|
36
|
+
releases. Related issues: actions/runner #4305, actions/runner #4442
|
|
37
|
+
fix: |
|
|
38
|
+
Update the runner binary to a currently supported version.
|
|
39
|
+
|
|
40
|
+
For manually managed runners:
|
|
41
|
+
1. SSH to the runner host
|
|
42
|
+
2. Stop the runner service: ./svc.sh stop
|
|
43
|
+
3. Download the latest runner from https://github.com/actions/runner/releases/latest
|
|
44
|
+
4. Extract to the runner directory (configuration is preserved via .runner file)
|
|
45
|
+
5. Restart: ./svc.sh start
|
|
46
|
+
|
|
47
|
+
For ARC (Actions Runner Controller) or autoscaling solutions: bump the runner image
|
|
48
|
+
version tag in your HelmRelease or Deployment manifest and redeploy.
|
|
49
|
+
fix_code:
|
|
50
|
+
- language: yaml
|
|
51
|
+
label: "Scheduled workflow to alert on outdated runner versions"
|
|
52
|
+
code: |
|
|
53
|
+
name: Check self-hosted runner versions
|
|
54
|
+
on:
|
|
55
|
+
schedule:
|
|
56
|
+
- cron: '0 9 * * 1' # Weekly, Mondays at 09:00 UTC (schedules run in UTC)
|
|
57
|
+
|
|
58
|
+
jobs:
|
|
59
|
+
check-runners:
|
|
60
|
+
runs-on: ubuntu-latest # GitHub-hosted runner for this diagnostic
|
|
61
|
+
steps:
|
|
62
|
+
- name: List runner versions via API
|
|
63
|
+
env:
|
|
64
|
+
GH_TOKEN: ${{ secrets.RUNNER_READ_TOKEN }}
|
|
65
|
+
run: |
|
|
66
|
+
echo "=== Org runners ==="
|
|
67
|
+
gh api /orgs/${{ github.repository_owner }}/actions/runners \
|
|
68
|
+
--jq '.runners[] | "\(.name): v\(.version) (\(.status))"'
|
|
69
|
+
|
|
70
|
+
- name: Check latest available version
|
|
71
|
+
run: |
|
|
72
|
+
LATEST=$(curl -sf https://api.github.com/repos/actions/runner/releases/latest | jq -r .tag_name)
|
|
73
|
+
echo "Latest runner version: $LATEST"
|
|
74
|
+
echo "Compare against your registered runners above"
|
|
75
|
+
- language: yaml
|
|
76
|
+
label: "ARC — bump runner version in HelmRelease"
|
|
77
|
+
code: |
|
|
78
|
+
# In your ARC HelmRelease or values.yaml
|
|
79
|
+
githubConfigUrl: "https://github.com/myorg"
|
|
80
|
+
template:
|
|
81
|
+
spec:
|
|
82
|
+
containers:
|
|
83
|
+
- name: runner
|
|
84
|
+
image: ghcr.io/actions/actions-runner:2.335.0 # Bump this regularly
|
|
85
|
+
prevention:
|
|
86
|
+
- "Subscribe to the GitHub Changelog (https://github.blog/changelog/) or watch actions/runner releases for deprecation notices"
|
|
87
|
+
- "Use Actions Runner Controller (ARC) or an autoscaling solution to automate runner lifecycle management"
|
|
88
|
+
- "Schedule a weekly cron workflow that checks registered runner versions via the Runners API and alerts if any are outdated"
|
|
89
|
+
- "Pin runner version in IaC (Terraform, Ansible) and include a runner version bump in your monthly maintenance checklist"
|
|
90
|
+
- "Set up Dependabot or Renovate to auto-update runner image tags in Docker/ARC manifests"
|
|
91
|
+
docs:
|
|
92
|
+
- url: "https://github.com/actions/runner/releases"
|
|
93
|
+
label: "actions/runner releases — version history and changelogs"
|
|
94
|
+
- url: "https://github.com/actions/runner/issues/4305"
|
|
95
|
+
label: "actions/runner #4305 — runner deprecated cannot receive messages"
|
|
96
|
+
- url: "https://github.com/actions/runner/issues/4442"
|
|
97
|
+
label: "actions/runner #4442 — version deprecation notice"
|
|
98
|
+
- url: "https://docs.github.com/en/actions/hosting-your-own-runners/managing-self-hosted-runners/about-self-hosted-runners"
|
|
99
|
+
label: "GitHub Docs — About self-hosted runners"
|
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
id: "silent-failures-010"
|
|
2
|
+
title: "cache-hit output is 'true' on restore-keys partial match, not just exact key"
|
|
3
|
+
category: "silent-failures"
|
|
4
|
+
severity: "silent-failure"
|
|
5
|
+
tags:
|
|
6
|
+
- "actions/cache"
|
|
7
|
+
- "cache-hit"
|
|
8
|
+
- "restore-keys"
|
|
9
|
+
- "partial-match"
|
|
10
|
+
- "output"
|
|
11
|
+
- "exact-match"
|
|
12
|
+
patterns:
|
|
13
|
+
- regex: "cache-hit.*true"
|
|
14
|
+
flags: "i"
|
|
15
|
+
error_messages:
|
|
16
|
+
- "cache-hit: true"
|
|
17
|
+
root_cause: |
|
|
18
|
+
The `cache-hit` output of `actions/cache` is documented to return `true` for an exact
|
|
19
|
+
cache key match. In practice, `cache-hit` also returns `true` when the key matched via
|
|
20
|
+
`restore-keys` (a partial/fallback match), not only for exact key hits.
|
|
21
|
+
|
|
22
|
+
Workflows that gate post-build steps on `steps.cache.outputs.cache-hit == 'true'` to
|
|
23
|
+
skip dependency installs assume an exact match. When a restore-keys match occurs,
|
|
24
|
+
`cache-hit` is `true` even though the cache may be stale or from a different branch.
|
|
25
|
+
The correct exact-match indicator is the `exact-match` output (available in
|
|
26
|
+
actions/cache v4+) or comparing `cache-matched-key` (an output of `actions/cache/restore`) against the computed key.
|
|
27
|
+
|
|
28
|
+
Reported upstream: https://github.com/actions/cache/issues/1675
|
|
29
|
+
fix: |
|
|
30
|
+
Use the `exact-match` output (actions/cache v4+) to determine if the restore was an
|
|
31
|
+
exact key match. `cache-hit` alone does not distinguish between exact and partial
|
|
32
|
+
(restore-keys) matches.
|
|
33
|
+
|
|
34
|
+
Alternatively, compare the `cache-matched-key` output against the expected key to
|
|
35
|
+
determine if the restore was exact.
|
|
36
|
+
fix_code:
|
|
37
|
+
- language: yaml
|
|
38
|
+
label: "Use exact-match output (actions/cache v4+)"
|
|
39
|
+
code: |
|
|
40
|
+
- name: Restore cache
|
|
41
|
+
id: cache
|
|
42
|
+
uses: actions/cache@v4
|
|
43
|
+
with:
|
|
44
|
+
path: ~/.npm
|
|
45
|
+
key: ${{ runner.os }}-node-${{ hashFiles('**/package-lock.json') }}
|
|
46
|
+
restore-keys: |
|
|
47
|
+
${{ runner.os }}-node-
|
|
48
|
+
|
|
49
|
+
# Only skip install on EXACT key match, not partial restore-keys hit
|
|
50
|
+
- name: Install dependencies
|
|
51
|
+
if: steps.cache.outputs.exact-match != 'true'
|
|
52
|
+
run: npm ci
|
|
53
|
+
- language: yaml
|
|
54
|
+
label: "Compare cache-matched-key to verify exact hit"
|
|
55
|
+
code: |
|
|
56
|
+
- name: Restore cache
|
|
57
|
+
id: cache
|
|
58
|
+
uses: actions/cache@v4
|
|
59
|
+
with:
|
|
60
|
+
path: ~/.npm
|
|
61
|
+
key: npm-${{ hashFiles('**/package-lock.json') }}
|
|
62
|
+
restore-keys: npm-
|
|
63
|
+
|
|
64
|
+
- name: Install or skip dependencies
|
|
65
|
+
run: |
|
|
66
|
+
EXPECTED_KEY="npm-${{ hashFiles('**/package-lock.json') }}"
|
|
67
|
+
if [ "${{ steps.cache.outputs.cache-matched-key }}" = "$EXPECTED_KEY" ]; then
|
|
68
|
+
echo "Exact cache hit — skipping npm ci"
|
|
69
|
+
else
|
|
70
|
+
echo "Partial/stale restore-keys hit — running npm ci"
|
|
71
|
+
npm ci
|
|
72
|
+
fi
|
|
73
|
+
prevention:
|
|
74
|
+
- "Never rely on cache-hit == 'true' alone to skip dependency installs; it fires on partial restore-keys matches too"
|
|
75
|
+
- "Use the exact-match output (actions/cache v4+) when you need to distinguish exact vs partial cache hits"
|
|
76
|
+
- "Use cache-matched-key output to log or compare the actual key that was restored"
|
|
77
|
+
- "If using restore-keys, always validate that your skip-install condition handles partial matches correctly"
|
|
78
|
+
docs:
|
|
79
|
+
- url: "https://github.com/actions/cache/issues/1675"
|
|
80
|
+
label: "actions/cache #1675 — cache-hit true on restore-keys match"
|
|
81
|
+
- url: "https://github.com/actions/cache#outputs"
|
|
82
|
+
label: "actions/cache outputs documentation"
|
package/package.json
CHANGED