@htekdev/actions-debugger 1.0.4 → 1.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,97 @@
1
+ id: "concurrency-timing-008"
2
+ title: "Intermittent 'Required runner group not found' when ephemeral runner registers after job dispatch"
3
+ category: "concurrency-timing"
4
+ severity: "error"
5
+ tags:
6
+ - "runner-group"
7
+ - "self-hosted"
8
+ - "ephemeral"
9
+ - "race-condition"
10
+ - "dispatch"
11
+ - "organization"
12
+ patterns:
13
+ - regex: "Required runner group '.*' not found"
14
+ flags: "i"
15
+ - regex: "runner_group_id.*null"
16
+ flags: "i"
17
+ error_messages:
18
+ - "Required runner group 'x' not found"
19
+ root_cause: |
20
+ In autoscaling self-hosted runner setups (EC2, GKE ephemeral runners, ARC), the runner
21
+ must register with GitHub BEFORE the job dispatcher resolves runner group membership.
22
+
23
+ The race condition occurs when:
24
+ 1. A workflow is triggered and GitHub's broker immediately tries to assign the job
25
+ 2. The ephemeral runner is still initializing (EC2 bootstrap, container pull; roughly 2.5 minutes)
26
+ 3. The broker resolves runner group membership before the new runner completes registration
27
+ 4. The broker reports "Required runner group 'X' not found" and fails the job
28
+
29
+ This is intermittent: matrix jobs expose it clearly because some cells get
30
+ already-running (pre-registered) runners while others need a fresh runner, triggering
31
+ the race. Inspecting the failed job via the Jobs API shows `runner_group_id: null`
32
+ and `runner_name: null` throughout the queue duration even though the runner group
33
+ exists and has the correct repository access.
34
+
35
+ A separate but related pattern occurs with org-level runner group repository access
36
+ grants not propagating to the broker V2 protocol in time, causing identical symptoms
37
+ regardless of runner initialization speed.
38
+
39
+ Reported upstream: https://github.com/actions/runner/issues/4252
40
+ Related: https://github.com/actions/runner/issues/4429
41
+ fix: |
42
+ For ephemeral autoscaling runners:
43
+ Implement a registration wait loop that polls the GitHub Runners API before signaling
44
+ the runner as available. The runner should only become eligible for jobs after the
45
+ broker has acknowledged its registration.
46
+
47
+ For org-level runner group access issues:
48
+ Verify that the target repository is in the runner group's allowed repositories list
49
+ via API. If misconfigured, re-registering the runner at repository level instead of
50
+ org level is a reliable workaround.
51
+
52
+ General mitigation: Add timeout-minutes to all jobs on self-hosted runners so stuck
53
+ queued jobs fail fast rather than waiting until the 6-hour workflow timeout.
54
+ fix_code:
55
+ - language: yaml
56
+ label: "Add timeout-minutes to detect stuck queued jobs quickly"
57
+ code: |
58
+ jobs:
59
+ build:
60
+ runs-on:
61
+ group: my-runner-group
62
+ labels: [self-hosted, linux]
63
+ timeout-minutes: 10 # Fail fast if runner never picks up the job
64
+ steps:
65
+ - uses: actions/checkout@v4
66
+ - run: echo "Runner assigned successfully"
67
+ - language: yaml
68
+ label: "Verify runner group repository access via API"
69
+ code: |
70
+ jobs:
71
+ debug-runner-group:
72
+ runs-on: ubuntu-latest
73
+ steps:
74
+ - name: Check runner group repository access
75
+ env:
76
+ GH_TOKEN: ${{ secrets.ORG_RUNNER_READ_TOKEN }}
77
+ run: |
78
+ echo "Runner groups and their visibility:"
79
+ gh api /orgs/${{ github.repository_owner }}/actions/runner-groups \
80
+ --jq '.runner_groups[] | "\(.name) (id: \(.id)) — visibility: \(.visibility)"'
81
+
82
+ echo "Repositories allowed for runner group ID 1:"
83
+ gh api /orgs/${{ github.repository_owner }}/actions/runner-groups/1/repositories \
84
+ --jq '.repositories[].full_name'
85
+ prevention:
86
+ - "Add timeout-minutes to all jobs using self-hosted runners so stuck-queued jobs fail fast instead of waiting for the 6h workflow limit"
87
+ - "For ephemeral runners (EC2/ARC), implement a registration health check that polls the Runners API before the runner accepts jobs"
88
+ - "For org-level runners, verify group repository access via API after any runner group configuration change"
89
+ - "For matrix jobs with ephemeral runners, keep N idle pre-registered runners to avoid cold-start races"
90
+ - "Monitor runner_group_id via the Jobs API to detect dispatch failures early in autoscaling pipelines"
91
+ docs:
92
+ - url: "https://github.com/actions/runner/issues/4252"
93
+ label: "actions/runner #4252 — Intermittent runner group not found"
94
+ - url: "https://github.com/actions/runner/issues/4429"
95
+ label: "actions/runner #4429 — Org-level runner never dispatched"
96
+ - url: "https://docs.github.com/en/actions/hosting-your-own-runners/managing-self-hosted-runners/managing-access-to-self-hosted-runners-using-groups"
97
+ label: "GitHub Docs — Managing runner group access"
@@ -0,0 +1,84 @@
1
+ id: "runner-environment-022"
2
+ title: "actions/checkout set-safe-directory only runs in post step — container jobs get dubious ownership errors"
3
+ category: "runner-environment"
4
+ severity: "error"
5
+ tags:
6
+ - "actions/checkout"
7
+ - "safe-directory"
8
+ - "container"
9
+ - "dubious-ownership"
10
+ - "CVE-2022-24765"
11
+ - "post-step"
12
+ patterns:
13
+ - regex: "fatal: detected dubious ownership in repository"
14
+ flags: "i"
15
+ - regex: "safe\\.directory.*not.*owned"
16
+ flags: "i"
17
+ error_messages:
18
+ - "fatal: detected dubious ownership in repository at '/github/workspace'"
19
+ - "hint: git config --global --add safe.directory /github/workspace"
20
+ root_cause: |
21
+ The `actions/checkout` action configures `safe.directory` to allow git operations in
22
+ the workspace. However, this configuration only runs in the **post step** (cleanup
23
+ phase), not during the main execution step.
24
+
25
+ In container jobs, the workspace is mounted from the host and may be owned by a
26
+ different UID than the user running inside the container. Git's safe.directory
27
+ protection (introduced in Git 2.35.2 for CVE-2022-24765) blocks access when the
28
+ directory owner differs from the running user.
29
+
30
+ Because safe.directory is only written during the post step — after all workflow
31
+ steps have already run — any git operations that follow checkout in the job's main steps
32
+ fail with "fatal: detected dubious ownership". This includes third-party actions
33
+ that internally invoke git (reviewdog, gitops tools, semantic-release, etc.).
34
+
35
+ Reported upstream: https://github.com/actions/checkout/issues/2031
36
+ fix: |
37
+ Add an explicit safe.directory configuration step immediately after `actions/checkout`
38
+ in any container job that performs git operations. This ensures the directory is
39
+ trusted before any subsequent steps run.
40
+ fix_code:
41
+ - language: yaml
42
+ label: "Add safe.directory config step after checkout in container jobs"
43
+ code: |
44
+ jobs:
45
+ build:
46
+ runs-on: ubuntu-latest
47
+ container: node:20-bookworm
48
+ steps:
49
+ - uses: actions/checkout@v4
50
+
51
+ # Workaround: post step safe.directory config doesn't help in container jobs
52
+ - name: Mark workspace as safe for git
53
+ run: git config --global --add safe.directory "$GITHUB_WORKSPACE"
54
+
55
+ - name: Run git-dependent steps
56
+ run: git log --oneline -5
57
+ - language: yaml
58
+ label: "Use wildcard to mark all directories safe in container workflows"
59
+ code: |
60
+ jobs:
61
+ build:
62
+ runs-on: ubuntu-latest
63
+ container: python:3.12-slim
64
+ steps:
65
+ - uses: actions/checkout@v4
66
+
67
+ - name: Configure git safe directories
68
+ run: |
69
+ git config --global --add safe.directory '*'
70
+
71
+ - name: Lint with pre-commit
72
+ run: pre-commit run --all-files
73
+ prevention:
74
+ - "Always add a safe.directory config step after checkout when using container jobs"
75
+ - "Audit third-party actions in container jobs — reviewdog, semantic-release, and gitops tools invoke git internally"
76
+ - "Consider running without a container and using docker run explicitly if git safety is complex to manage"
77
+ - "Track https://github.com/actions/checkout/issues/2031 for an official fix from the actions team"
78
+ docs:
79
+ - url: "https://github.com/actions/checkout/issues/2031"
80
+ label: "actions/checkout #2031 — safe.directory only set in post step"
81
+ - url: "https://github.blog/2022-04-12-git-security-vulnerability-announced/"
82
+ label: "Git CVE-2022-24765 — safe.directory background"
83
+ - url: "https://docs.github.com/en/actions/writing-workflows/choosing-where-your-workflow-runs/running-jobs-in-a-container"
84
+ label: "GitHub Docs — Running jobs in a container"
@@ -0,0 +1,99 @@
1
+ id: "runner-environment-023"
2
+ title: "Self-hosted runner on deprecated version stops receiving jobs"
3
+ category: "runner-environment"
4
+ severity: "error"
5
+ tags:
6
+ - "self-hosted"
7
+ - "runner"
8
+ - "deprecated"
9
+ - "version"
10
+ - "cannot-receive-messages"
11
+ - "maintenance"
12
+ patterns:
13
+ - regex: "Runner version v\\d+\\.\\d+\\.\\d+ is deprecated and cannot receive messages"
14
+ flags: "i"
15
+ - regex: "WRITE ERROR.*runner.*deprecated"
16
+ flags: "i"
17
+ error_messages:
18
+ - "Runner version v2.332.0 is deprecated and cannot receive messages."
19
+ - "WRITE ERROR: An error occured: Runner version v2.XXX.X is deprecated and cannot receive messages."
20
+ root_cause: |
21
+ GitHub periodically deprecates older self-hosted runner versions. Once a runner version
22
+ is past its deprecation deadline, the runner agent can no longer communicate with the
23
+ GitHub Actions broker service.
24
+
25
+ The runner process stays alive, appears online in the GitHub UI (Settings → Actions →
26
+ Runners), and is listed as "Active" — but it can no longer receive job assignments.
27
+ Jobs queued for a runner group containing only deprecated-version runners will either
28
+ stay "Queued" indefinitely or time out without a clear error in the workflow UI.
29
+
30
+ This is a silent failure mode: the runner shows as online, no workflow error is
31
+ surfaced, but jobs never start. The deprecation schedule is published in the GitHub
32
+ Changelog and actions/runner releases but teams often miss it without automated
33
+ update pipelines.
34
+
35
+ As of 2026, GitHub requires runners to be within approximately the last 6 months of
36
+ releases. Related issues: actions/runner #4305, actions/runner #4442
37
+ fix: |
38
+ Update the runner binary to a currently supported version.
39
+
40
+ For manually managed runners:
41
+ 1. SSH to the runner host
42
+ 2. Stop the runner service: ./svc.sh stop
43
+ 3. Download the latest runner from https://github.com/actions/runner/releases/latest
44
+ 4. Extract to the runner directory (configuration is preserved via .runner file)
45
+ 5. Restart: ./svc.sh start
46
+
47
+ For ARC (Actions Runner Controller) or autoscaling solutions: bump the runner image
48
+ version tag in your HelmRelease or Deployment manifest and redeploy.
49
+ fix_code:
50
+ - language: yaml
51
+ label: "Scheduled workflow to alert on outdated runner versions"
52
+ code: |
53
+ name: Check self-hosted runner versions
54
+ on:
55
+ schedule:
56
+ - cron: '0 9 * * 1' # Weekly Monday 9 AM
57
+
58
+ jobs:
59
+ check-runners:
60
+ runs-on: ubuntu-latest # GitHub-hosted runner for this diagnostic
61
+ steps:
62
+ - name: List runner versions via API
63
+ env:
64
+ GH_TOKEN: ${{ secrets.RUNNER_READ_TOKEN }}
65
+ run: |
66
+ echo "=== Org runners ==="
67
+ gh api /orgs/${{ github.repository_owner }}/actions/runners \
68
+ --jq '.runners[] | "\(.name): v\(.version) (\(.status))"'
69
+
70
+ - name: Check latest available version
71
+ run: |
72
+ LATEST=$(curl -sf https://api.github.com/repos/actions/runner/releases/latest | jq -r .tag_name)
73
+ echo "Latest runner version: $LATEST"
74
+ echo "Compare against your registered runners above"
75
+ - language: yaml
76
+ label: "ARC — bump runner version in HelmRelease"
77
+ code: |
78
+ # In your ARC HelmRelease or values.yaml
79
+ githubConfigUrl: "https://github.com/myorg"
80
+ template:
81
+ spec:
82
+ containers:
83
+ - name: runner
84
+ image: ghcr.io/actions/actions-runner:2.335.0 # Bump this regularly
85
+ prevention:
86
+ - "Subscribe to the GitHub Changelog (https://github.blog/changelog/) or watch actions/runner releases for deprecation notices"
87
+ - "Use Actions Runner Controller (ARC) or an autoscaling solution to automate runner lifecycle management"
88
+ - "Schedule a weekly cron workflow that checks registered runner versions via the Runners API and alerts if any are outdated"
89
+ - "Pin runner version in IaC (Terraform, Ansible) and include a runner version bump in your monthly maintenance checklist"
90
+ - "Set up Dependabot or Renovate to auto-update runner image tags in Docker/ARC manifests"
91
+ docs:
92
+ - url: "https://github.com/actions/runner/releases"
93
+ label: "actions/runner releases — version history and changelogs"
94
+ - url: "https://github.com/actions/runner/issues/4305"
95
+ label: "actions/runner #4305 — runner deprecated cannot receive messages"
96
+ - url: "https://github.com/actions/runner/issues/4442"
97
+ label: "actions/runner #4442 — version deprecation notice"
98
+ - url: "https://docs.github.com/en/actions/hosting-your-own-runners/managing-self-hosted-runners/about-self-hosted-runners"
99
+ label: "GitHub Docs — About self-hosted runners"
@@ -0,0 +1,82 @@
1
+ id: "silent-failures-010"
2
+ title: "cache-hit output is 'true' on restore-keys partial match, not just exact key"
3
+ category: "silent-failures"
4
+ severity: "silent-failure"
5
+ tags:
6
+ - "actions/cache"
7
+ - "cache-hit"
8
+ - "restore-keys"
9
+ - "partial-match"
10
+ - "output"
11
+ - "exact-match"
12
+ patterns:
13
+ - regex: "cache-hit.*true"
14
+ flags: "i"
15
+ error_messages:
16
+ - "cache-hit: true"
17
+ root_cause: |
18
+ The `cache-hit` output of `actions/cache` is documented to return `true` for an exact
19
+ cache key match. In practice, `cache-hit` also returns `true` when the key matched via
20
+ `restore-keys` (a partial/fallback match), not only for exact key hits.
21
+
22
+ Workflows that gate post-restore steps on `steps.cache.outputs.cache-hit == 'true'` to
23
+ skip dependency installs assume an exact match. When a restore-keys match occurs,
24
+ `cache-hit` is `true` even though the cache may be stale or from a different branch.
25
+ The correct exact-match indicator is the `exact-match` output (available in
26
+ actions/cache v4+) or comparing `cache-matched-key` against the computed key.
27
+
28
+ Reported upstream: https://github.com/actions/cache/issues/1675
29
+ fix: |
30
+ Use the `exact-match` output (actions/cache v4+) to determine if the restore was an
31
+ exact key match. `cache-hit` alone does not distinguish between exact and partial
32
+ (restore-keys) matches.
33
+
34
+ Alternatively, compare the `cache-matched-key` output against the expected key to
35
+ determine if the restore was exact.
36
+ fix_code:
37
+ - language: yaml
38
+ label: "Use exact-match output (actions/cache v4+)"
39
+ code: |
40
+ - name: Restore cache
41
+ id: cache
42
+ uses: actions/cache@v4
43
+ with:
44
+ path: ~/.npm
45
+ key: ${{ runner.os }}-node-${{ hashFiles('**/package-lock.json') }}
46
+ restore-keys: |
47
+ ${{ runner.os }}-node-
48
+
49
+ # Only skip install on EXACT key match, not partial restore-keys hit
50
+ - name: Install dependencies
51
+ if: steps.cache.outputs.exact-match != 'true'
52
+ run: npm ci
53
+ - language: yaml
54
+ label: "Compare cache-matched-key to verify exact hit"
55
+ code: |
56
+ - name: Restore cache
57
+ id: cache
58
+ uses: actions/cache@v4
59
+ with:
60
+ path: ~/.npm
61
+ key: npm-${{ hashFiles('**/package-lock.json') }}
62
+ restore-keys: npm-
63
+
64
+ - name: Install or skip dependencies
65
+ run: |
66
+ EXPECTED_KEY="npm-${{ hashFiles('**/package-lock.json') }}"
67
+ if [ "${{ steps.cache.outputs.cache-matched-key }}" = "$EXPECTED_KEY" ]; then
68
+ echo "Exact cache hit — skipping npm ci"
69
+ else
70
+ echo "Partial/stale restore-keys hit — running npm ci"
71
+ npm ci
72
+ fi
73
+ prevention:
74
+ - "Never rely on cache-hit == 'true' alone to skip dependency installs; it fires on partial restore-keys matches too"
75
+ - "Use the exact-match output (actions/cache v4+) when you need to distinguish exact vs partial cache hits"
76
+ - "Use cache-matched-key output to log or compare the actual key that was restored"
77
+ - "If using restore-keys, always validate that your skip-install condition handles partial matches correctly"
78
+ docs:
79
+ - url: "https://github.com/actions/cache/issues/1675"
80
+ label: "actions/cache #1675 — cache-hit true on restore-keys match"
81
+ - url: "https://github.com/actions/cache#outputs"
82
+ label: "actions/cache outputs documentation"
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@htekdev/actions-debugger",
3
- "version": "1.0.4",
3
+ "version": "1.0.5",
4
4
  "description": "65+ real GitHub Actions errors, queryable by agents. MCP server + Copilot skills + error database.",
5
5
  "type": "module",
6
6
  "main": "./dist/index.js",