@htekdev/actions-debugger 1.0.116 → 1.0.117

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,107 @@
1
+ id: caching-artifacts-068
2
+ title: '`actions/cache` key containing Windows backslash path separators never matches on restore — cache miss every run'
3
+ category: caching-artifacts
4
+ severity: silent-failure
5
+ tags:
6
+ - cache
7
+ - windows
8
+ - path-separator
9
+ - backslash
10
+ - cache-miss
11
+ - cross-platform
12
+ patterns:
13
+ - regex: 'Cache not found for input keys'
14
+ flags: 'i'
15
+ - regex: 'cache.*miss.*windows|windows.*cache.*miss'
16
+ flags: 'i'
17
+ error_messages:
18
+ - 'Cache not found for input keys:'
19
+ - 'cache hit for key: false'
20
+ root_cause: |
21
+ GitHub Actions cache keys are plain strings matched byte-for-byte. On Windows
22
+ runners, several common patterns produce cache keys containing backslashes (`\`):
23
+
24
+ 1. Embedding ${{ runner.temp }} or ${{ github.workspace }} directly in the key:
25
+ key: ${{ runner.os }}-${{ runner.temp }}-${{ hashFiles('**/package-lock.json') }}
26
+ On Windows, runner.temp resolves to `D:\a\_temp` producing a key with `\`.
27
+
28
+ 2. Using hashFiles() with backslash glob patterns:
29
+ key: ${{ hashFiles('**\node_modules\**') }}
30
+ hashFiles() on Windows sometimes receives backslash-escaped paths in its
31
+ argument, encoding them into the resulting hash string.
32
+
33
+ 3. String concatenation with Windows path variables set in earlier steps:
34
+ TOOL_PATH: C:\hostedtoolcache\node\18\x64
35
+ When $TOOL_PATH is embedded in the cache key, the `\` chars are literal.
36
+
37
+ The cache service stores and retrieves keys as exact string matches. A cache
38
+ saved with key `npm-D:\a\_temp-abc123` is never found by a lookup for
39
+ `npm-D:/a/_temp-abc123` or by a lookup on a different run where the runner.temp
40
+ path differs.
41
+
42
+ On Linux/macOS runners, paths always use `/` so this issue is Windows-specific.
43
+ Cross-platform matrix workflows are especially affected: the Linux cache is found
44
+ on restore; the Windows cache is always missed.
45
+ fix: |
46
+ Normalize all path separators to forward slashes before including in cache keys.
47
+
48
+ Option 1 — Avoid runner.temp and github.workspace in cache keys entirely.
49
+ Use runner.os and a fixed path pattern instead:
50
+ key: ${{ runner.os }}-node-${{ hashFiles('**/package-lock.json') }}
51
+
52
+ Option 2 — If you must embed a path, normalize in a prior step:
53
+ - name: Set normalized cache path
54
+ shell: bash
55
+ run: echo "CACHE_PATH=$(echo '${{ runner.temp }}' | tr '\\' '/')" >> $GITHUB_ENV
56
+ Then:
57
+ key: ${{ runner.os }}-${{ env.CACHE_PATH }}-${{ hashFiles(...) }}
58
+
59
+ Option 3 — Always use forward-slash glob patterns in hashFiles():
60
+ key: ${{ hashFiles('**/package-lock.json') }} # correct
61
+ # NOT: ${{ hashFiles('**\package-lock.json') }} # Windows-style — avoid
62
+
63
+ Option 4 — Use a bash shell step to compute the key and store in GITHUB_ENV,
64
+ ensuring all path manipulation happens in a POSIX shell that normalizes separators.
65
+ fix_code:
66
+ - language: yaml
67
+ label: 'Bad: runner.temp in cache key produces backslashes on Windows'
68
+ code: |
69
+ # ❌ runner.temp on Windows = "D:\a\_temp" — backslashes in key → never matches
70
+ - uses: actions/cache@v4
71
+ with:
72
+ path: ${{ runner.temp }}/cache
73
+ key: ${{ runner.os }}-${{ runner.temp }}-${{ hashFiles('**/lock.json') }}
74
+ - language: yaml
75
+ label: 'Good: use runner.os and a fixed path — no path variables in key'
76
+ code: |
77
+ # ✅ No Windows path separators in key
78
+ - uses: actions/cache@v4
79
+ with:
80
+ path: ~/.npm
81
+ key: ${{ runner.os }}-npm-${{ hashFiles('**/package-lock.json') }}
82
+ - language: yaml
83
+ label: 'Good: normalize path separators with bash tr before embedding in key'
84
+ code: |
85
+ - name: Normalize cache path
86
+ shell: bash
87
+ run: |
88
+ NORM_PATH=$(echo '${{ runner.tool_cache }}' | tr '\\' '/')
89
+ echo "NORM_TOOL_CACHE=$NORM_PATH" >> $GITHUB_ENV
90
+
91
+ - uses: actions/cache@v4
92
+ with:
93
+ path: ${{ runner.tool_cache }}
94
+ key: ${{ runner.os }}-tools-${{ env.NORM_TOOL_CACHE }}-${{ hashFiles('**/tool-versions') }}
95
+ prevention:
96
+ - 'Never include runner.temp, runner.tool_cache, or github.workspace in cache keys — these resolve to OS-specific paths with backslashes on Windows'
97
+ - 'Always use runner.os (e.g., "Windows") as the OS discriminator in cross-platform cache keys, not path-based variables'
98
+ - 'Use forward-slash glob patterns in hashFiles() expressions on all platforms'
99
+ - 'When path variables must appear in a cache key, normalize separators to forward slashes in a bash step first and use the env variable'
100
+ - 'Test cache hit rate explicitly in CI by checking the cache-hit output of the restore step on Windows runners'
101
+ docs:
102
+ - url: 'https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/caching-dependencies-to-speed-up-workflows'
103
+ label: 'GitHub Docs: Caching dependencies to speed up workflows'
104
+ - url: 'https://github.com/actions/cache/issues/671'
105
+ label: 'actions/cache#671: Cache key with Windows path separators causes cache miss'
106
+ - url: 'https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/store-information-in-variables#default-environment-variables'
107
+ label: 'GitHub Docs: Default environment variables (runner.temp, etc.)'
@@ -0,0 +1,89 @@
1
+ id: concurrency-timing-054
2
+ title: '"Re-run failed jobs" bypasses concurrency group — runs in parallel with a newly triggered run for the same group'
3
+ category: concurrency-timing
4
+ severity: silent-failure
5
+ tags:
6
+ - concurrency
7
+ - rerun
8
+ - re-run-failed-jobs
9
+ - parallel
10
+ - cancel-in-progress
11
+ patterns:
12
+ - regex: 'Re-run failed jobs'
13
+ flags: 'i'
14
+ - regex: 'This run was automatically cancelled'
15
+ flags: 'i'
16
+ error_messages:
17
+ - 'This run was automatically cancelled'
18
+ root_cause: |
19
+ When a developer clicks "Re-run failed jobs" (as opposed to "Re-run all jobs"),
20
+ GitHub Actions does not evaluate the workflow's concurrency group before starting
21
+ the rerun. The rerun begins immediately and runs in parallel with any currently
22
+ in-progress or newly triggered run that occupies the same concurrency group.
23
+
24
+ This is distinct from "Re-run all jobs", which DOES re-trigger the concurrency
25
+ check and will cancel the currently in-progress run if cancel-in-progress is true,
26
+ or queue behind it if cancel-in-progress is false.
27
+
28
+ Common scenario:
29
+ 1. A push triggers run A, which starts and partially fails.
30
+ 2. A second push triggers run B for the same branch (same concurrency group).
31
+ Run B is queued or cancels run A depending on cancel-in-progress setting.
32
+ 3. Developer hits "Re-run failed jobs" on run A.
33
+ 4. Run A's rerun starts immediately — now run A (rerun) and run B are both
34
+ executing concurrently, violating the intent of the concurrency group.
35
+
36
+ Side effects:
37
+ - Two deploys to the same environment can run simultaneously.
38
+ - A self-hosted runner with a single slot gets double-booked.
39
+ - Race conditions between two concurrent jobs writing to the same artifact.
40
+
41
+ Root cause: GitHub's "Re-run failed jobs" path was implemented as a targeted job
42
+ restart that bypasses workflow-level orchestration including concurrency evaluation.
43
+ This behavior is tracked in actions/runner#2294 (open).
44
+ fix: |
45
+ Option 1 — Use "Re-run all jobs" instead of "Re-run failed jobs" when the workflow
46
+ has a concurrency group. "Re-run all jobs" triggers a fresh run that participates
47
+ in concurrency evaluation normally.
48
+
49
+ Option 2 — Include github.run_attempt in the concurrency group key so each attempt
50
+ gets its own group slot, preventing cancellation loops during reruns while still
51
+ serializing independent triggers:
52
+ group: ${{ github.workflow }}-${{ github.ref }}-${{ github.run_attempt }}
53
+ cancel-in-progress: false
54
+
55
+ Option 3 — For deployment workflows, use environment protection rules as the
56
+ serialization mechanism instead of (or in addition to) concurrency groups. A
57
+ pending environment review gate serializes deployments even when concurrency is
58
+ bypassed.
59
+
60
+ Option 4 — Accept the behavior: if the failed jobs don't interact with shared
61
+ resources, concurrent partial reruns may be safe. Restrict "Re-run failed jobs" to
62
+ jobs that are idempotent and isolated.
63
+ fix_code:
64
+ - language: yaml
65
+ label: 'Option 2: include run_attempt in group key — each attempt gets its own slot'
66
+ code: |
67
+ concurrency:
68
+ group: ${{ github.workflow }}-${{ github.ref }}-${{ github.run_attempt }}
69
+ cancel-in-progress: false
70
+ - language: yaml
71
+ label: 'Option 3: environment protection to serialize deployments independent of concurrency'
72
+ code: |
73
+ jobs:
74
+ deploy:
75
+ environment: production # Requires reviewer approval; serializes even without concurrency
76
+ steps:
77
+ - run: ./deploy.sh
78
+ prevention:
79
+ - 'Prefer "Re-run all jobs" over "Re-run failed jobs" when your workflow uses a concurrency group to prevent duplicate concurrent runs'
80
+ - 'Include ${{ github.run_attempt }} in the concurrency group key to give each attempt its own slot and avoid silent bypass'
81
+ - 'For deployment workflows with shared infrastructure, combine concurrency groups with environment protection rules for defense-in-depth serialization'
82
+ - 'Document in your workflow YAML that "Re-run failed jobs" bypasses concurrency to alert future maintainers'
83
+ docs:
84
+ - url: 'https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/using-concurrency'
85
+ label: 'GitHub Docs: Using concurrency'
86
+ - url: 'https://github.com/actions/runner/issues/2294'
87
+ label: 'actions/runner#2294: Re-run failed jobs bypasses concurrency group (open)'
88
+ - url: 'https://docs.github.com/en/actions/managing-workflow-runs-and-deployments/managing-workflow-runs/re-running-workflows-and-jobs'
89
+ label: 'GitHub Docs: Re-running workflows and jobs'
@@ -0,0 +1,108 @@
1
+ id: known-unsolved-064
2
+ title: 'No conditional matrix skip — an empty matrix from `fromJSON([])` always fails the workflow with a validation error'
3
+ category: known-unsolved
4
+ severity: limitation
5
+ tags:
6
+ - matrix
7
+ - dynamic-matrix
8
+ - fromJSON
9
+ - conditional
10
+ - strategy
11
+ - limitation
12
+ patterns:
13
+ - regex: 'The strategy/matrix must contain at least one'
14
+ flags: 'i'
15
+ - regex: 'matrix must define at least one vector'
16
+ flags: 'i'
17
+ - regex: 'Error when evaluating .strategy. for job.*matrix.*empty'
18
+ flags: 'i'
19
+ error_messages:
20
+ - 'The strategy/matrix must contain at least one vector'
21
+ - 'matrix must define at least one vector'
22
+ - 'Error when evaluating ''strategy'' for job'
23
+ root_cause: |
24
+ GitHub Actions validates that a matrix strategy always produces at least one job.
25
+ When a prior step outputs an empty JSON array and the matrix job uses
26
+ `fromJSON(steps.generate.outputs.matrix)`, the workflow fails at plan time with
27
+ a validation error before any jobs run.
28
+
29
+ There is NO supported way to:
30
+ - Provide an `if:` condition on the matrix strategy to skip it entirely
31
+ - Use `strategy: if: ${{ condition }}` syntax (does not exist)
32
+ - Have a job run zero times when its matrix is empty
33
+
34
+ This is a fundamental platform limitation: matrix jobs must always expand to at
35
+ least one job instance. The validation runs before job execution, so even an `if:`
36
+ on the job itself does not help — the matrix must be non-empty regardless.
37
+
38
+ Common scenarios:
39
+ - CI that builds only changed packages: if no packages changed, the matrix
40
+ is empty and the entire workflow fails.
41
+ - Release workflows that conditionally matrix over artifacts: if nothing was
42
+ built, the matrix is empty.
43
+ - PR labeler workflows that matrix over affected services: a trivial change
44
+ affects no services, producing an empty list.
45
+
46
+ Note: yaml-syntax-008 covers the technical "fromJSON parse error" message. This
47
+ entry covers the LIMITATION: there is no native mechanism to conditionally skip
48
+ matrix jobs or provide a zero-matrix strategy.
49
+ fix: |
50
+ Workaround 1 — Always include at least one sentinel/dummy entry and guard with if:
51
+ In the matrix-generating step, append a sentinel entry when the list is empty:
52
+ matrix=$(echo "$matrix" | jq 'if length == 0 then [{"skip":true}] else map(. + {"skip":false}) end')
53
+ Then on each step (a job-level if: cannot read the matrix context):
54
+ if: ${{ !matrix.skip }}
55
+
56
+ Workaround 2 — Use a separate check job with needs: to gate the matrix job:
57
+ Add an upstream job that outputs a boolean "has_work". The matrix job then uses
58
+ `needs: check` and reads `needs.check.outputs.has_work` in an `if:` condition.
59
+ This still requires the matrix to be non-empty (use sentinel), but the `if:`
60
+ prevents actual work from running.
61
+
62
+ Workaround 3 — Always include a no-op entry in the matrix output:
63
+ Ensure the matrix-generating script never outputs an empty array by always
64
+ appending a `{"target":"none"}` entry, then on each step:
65
+ if: matrix.target != 'none'
66
+
67
+ None of these workarounds are elegant. This is a known limitation with no native
68
+ fix scheduled. Tracked in actions/runner#1502 (96+ reactions, open since 2021).
69
+ fix_code:
70
+ - language: yaml
71
+ label: 'Workaround: inject sentinel entry when matrix is empty, skip in job if:'
72
+ code: |
73
+ jobs:
74
+ generate-matrix:
75
+ runs-on: ubuntu-latest
76
+ outputs:
77
+ matrix: ${{ steps.set-matrix.outputs.matrix }}
78
+ steps:
79
+ - id: set-matrix
80
+ run: |
81
+ # Build your matrix list; inject sentinel when empty
82
+ matrix='[]' # Replace with real generation logic
83
+ if [ "$(echo "$matrix" | jq 'length')" -eq 0 ]; then
84
+ matrix='[{"target":"__skip__"}]'
85
+ fi
86
+ echo "matrix=$matrix" >> $GITHUB_OUTPUT
87
+
88
+ build:
89
+ needs: generate-matrix
90
+ if: ${{ fromJSON(needs.generate-matrix.outputs.matrix)[0].target != '__skip__' }}
91
+ strategy:
92
+ matrix:
93
+ include: ${{ fromJSON(needs.generate-matrix.outputs.matrix) }}
94
+ runs-on: ubuntu-latest
95
+ steps:
96
+ - run: echo "Building ${{ matrix.target }}"
97
+ prevention:
98
+ - 'Never pass a potentially empty array directly to fromJSON() in a matrix strategy — always guard with a sentinel entry'
99
+ - 'Add an upstream check job that validates matrix size before the matrix job runs'
100
+ - 'Document the sentinel pattern in your workflow so future maintainers understand why a dummy entry exists'
101
+ - 'Vote/watch actions/runner#1502 for native conditional matrix support'
102
+ docs:
103
+ - url: 'https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/running-variations-of-jobs-in-a-workflow'
104
+ label: 'GitHub Docs: Running variations of jobs in a workflow (matrix)'
105
+ - url: 'https://github.com/actions/runner/issues/1502'
106
+ label: 'actions/runner#1502: Support for empty matrix (96+ reactions, open)'
107
+ - url: 'https://docs.github.com/en/actions/writing-workflows/workflow-syntax-for-github-actions#jobsjob_idstrategymatrix'
108
+ label: 'GitHub Docs: jobs.<job_id>.strategy.matrix'
@@ -0,0 +1,100 @@
1
+ id: runner-environment-206
2
+ title: 'actions/checkout v6.0.3 regression — new /hash-algorithm API call on every checkout exhausts rate limits in high-volume orgs'
3
+ category: runner-environment
4
+ severity: error
5
+ tags:
6
+ - checkout
7
+ - rate-limit
8
+ - api-rate-limit
9
+ - v6
10
+ - PAT
11
+ - regression
12
+ - hash-algorithm
13
+ patterns:
14
+ - regex: 'API rate limit exceeded'
15
+ flags: 'i'
16
+ - regex: 'You have exceeded a secondary rate limit'
17
+ flags: 'i'
18
+ - regex: 'HttpError.*API rate limit exceeded'
19
+ flags: 'i'
20
+ - regex: 'Rate limit.*exceeded.*403'
21
+ flags: 'i'
22
+ error_messages:
23
+ - 'Error: HttpError: API rate limit exceeded for user ID'
24
+ - 'You have exceeded a secondary rate limit and have been temporarily blocked from content creation'
25
+ - 'remote: Repository not found.'
26
+ - 'fatal: repository ''https://github.com/owner/repo/'' not found'
27
+ root_cause: |
28
+ actions/checkout v6.0.3 (released June 2, 2026, commit 1cce339) introduced a new
29
+ REST API call to GET /repos/{owner}/{repo}/hash-algorithm on every checkout operation
30
+ to determine the repository's object hashing algorithm.
31
+
32
+ In organizations with high-concurrency workflows, this additional API call multiplies
33
+ rate-limit consumption significantly. A matrix build with 30 parallel jobs each running
34
+ checkout makes 30 additional API calls per push event. At the per-user rate limit of
35
+ 5,000 requests/hour, large orgs using PATs (Personal Access Tokens) for cross-repo
36
+ checkout quickly exhaust their quota across many concurrent pipelines.
37
+
38
+ When the /hash-algorithm endpoint returns HTTP 403 (rate limited), the checkout action
39
+ may interpret the 403 as a resource-not-found condition, producing misleading errors
40
+ such as "remote: Repository not found" that mask the true rate-limit cause.
41
+
42
+ GITHUB_TOKEN is not affected in the same way because it carries per-repository rate
43
+ limits (15,000 requests/hour for Actions) rather than per-user limits.
44
+ Source: actions/checkout#2450.
45
+ fix: |
46
+ Immediate fix — pin to actions/checkout@v6.0.2 until the upstream regression is
47
+ addressed:
48
+ uses: actions/checkout@v6.0.2
49
+
50
+ Use GITHUB_TOKEN instead of PAT where possible:
51
+ GITHUB_TOKEN rate limits (15,000 req/hr for GitHub Actions) are scoped per
52
+ repository and do not aggregate across your organization's other workflows.
53
+ Reserve PATs for cross-repo or cross-org checkouts only.
54
+
55
+ Reduce parallel checkout volume:
56
+ Add fetch-depth: 1 and/or sparse-checkout to minimize the API surface area
57
+ per checkout call, reducing the number of API requests triggered per job.
58
+
59
+ Monitor actions/checkout#2450 for the upstream fix (caching the hash-algorithm
60
+ result within a workflow run, or making the call conditional).
61
+ fix_code:
62
+ - language: yaml
63
+ label: 'Pin to v6.0.2 to avoid the regression until upstream fix is released'
64
+ code: |
65
+ - uses: actions/checkout@v6.0.2
66
+ with:
67
+ # Prefer GITHUB_TOKEN over a PAT to use per-repo rate limits
68
+ token: ${{ secrets.GITHUB_TOKEN }}
69
+ fetch-depth: 1 # Shallow fetch reduces ancillary API calls
70
+ - language: yaml
71
+ label: 'Cross-repo checkout — use dedicated PAT only where necessary, pin version'
72
+ code: |
73
+ - uses: actions/checkout@v6.0.2
74
+ with:
75
+ repository: org/other-repo
76
+ token: ${{ secrets.CROSS_REPO_PAT }} # PAT required here; rate-limited per user
77
+ fetch-depth: 1
78
+ - language: yaml
79
+ label: 'Monitor rate limit headers in a preflight step (diagnostic aid)'
80
+ code: |
81
+ - name: Check remaining GitHub API rate limit
82
+ run: |
83
+ remaining=$(curl -s -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \
84
+ https://api.github.com/rate_limit | jq '.rate.remaining')
85
+ echo "Remaining API calls: $remaining"
86
+ if (( remaining < 500 )); then
87
+ echo "::warning::Low API rate limit — $remaining calls remaining"
88
+ fi
89
+ prevention:
90
+ - 'Pin actions/checkout to a specific patch version (e.g., @v6.0.2) in high-volume orgs — patch updates can introduce API regressions like this one'
91
+ - 'Prefer GITHUB_TOKEN over org-wide PATs for checkout; GITHUB_TOKEN rate limits are per-repository and isolated from other workflows'
92
+ - 'Monitor your org REST API usage under Settings > Insights > API Requests to detect unexpected call spikes from action updates before they hit production'
93
+ - 'Add a rate-limit check step to long-running or high-parallelism pipelines to catch exhaustion before it causes misleading errors'
94
+ docs:
95
+ - url: 'https://github.com/actions/checkout/issues/2450'
96
+ label: 'actions/checkout#2450: New /hash-algorithm API call causing rate limiting failures in v6.0.3'
97
+ - url: 'https://docs.github.com/en/rest/using-the-rest-api/rate-limits-for-the-rest-api'
98
+ label: 'GitHub Docs: Rate limits for the REST API'
99
+ - url: 'https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#permissions-for-the-github_token'
100
+ label: 'GitHub Docs: GITHUB_TOKEN permissions and rate limits'
@@ -0,0 +1,126 @@
1
+ id: runner-environment-205
2
+ title: 'macOS self-hosted Runner.Listener silently stalls after AAD credential-refresh — ghost-busy state blocks queue'
3
+ category: runner-environment
4
+ severity: silent-failure
5
+ tags:
6
+ - self-hosted
7
+ - macos
8
+ - apple-silicon
9
+ - listener
10
+ - aad
11
+ - ghost-busy
12
+ - broker-reconnect
13
+ - credential-refresh
14
+ patterns:
15
+ - regex: 'AAD Correlation ID for this token request:\s*Unknown'
16
+ flags: 'i'
17
+ - regex: 'RSAFileKeyManager.*Loading RSA key parameters from file.*credentials_rsaparams'
18
+ flags: 'i'
19
+ - regex: 'GitHubActionsService.*AAD Correlation ID.*Unknown'
20
+ flags: 'i'
21
+ error_messages:
22
+ - '[INFO RSAFileKeyManager] Loading RSA key parameters from file .../.credentials_rsaparams'
23
+ - '[INFO GitHubActionsService] AAD Correlation ID for this token request: Unknown'
24
+ root_cause: |
25
+ On long-lived self-hosted macOS runners (v2.334.0+, Apple Silicon), the
26
+ Runner.Listener process can permanently stall after an AAD (Azure Active Directory)
27
+ credential-refresh event coincides with a broker session disconnect.
28
+
29
+ Normal broker long-poll timeouts produce "SocketException (89): Operation canceled"
30
+ entries and the listener successfully reconnects. However, when a broker disconnect
31
+ occurs at the same time as an AAD credential refresh, the listener logs its final
32
+ diagnostic sequence and then goes permanently silent:
33
+ [INFO RSAFileKeyManager] Loading RSA key parameters from file .../.credentials_rsaparams
34
+ [INFO GitHubActionsService] AAD Correlation ID for this token request: Unknown
35
+
36
+ After these lines, the main thread parks in pthread_cond_wait with no further diag
37
+ log output and no TCP ESTABLISHED connection to the broker. The OS process stays alive
38
+ (visible in ps/Activity Monitor/launchctl list), so launchd does not restart it. The
39
+ broker-side agent state continues to show the runner as "busy" from its last completed
40
+ job, stalling all subsequent queued jobs behind the phantom runner until an external
41
+ restart clears the state.
42
+
43
+ The trigger requires: runner lifetime longer than several hours (so a credential
44
+ refresh occurs), plus a broker disconnect at or immediately after the refresh boundary.
45
+ Observed simultaneously affecting all 4 of 4 macOS ARM64 runners on a single host
46
+ within a 32-minute window, causing a 4-hour queue stall. Source: actions/runner#4446.
47
+ fix: |
48
+ No platform-side fix available as of June 2026 (open issue).
49
+
50
+ Workaround — implement an out-of-band watchdog script/cron that:
51
+ 1. Confirms the Runner.Listener PID has an ESTABLISHED TCP socket to the broker
52
+ (check with `lsof -p <pid> -i TCP | grep ESTABLISHED`)
53
+ 2. Confirms the most recent entry in _diag/Runner_*.log is less than N minutes old
54
+ (e.g., 10 minutes)
55
+ 3. If both checks fail, restarts the runner service:
56
+ - macOS (launchd):
57
+ launchctl bootout gui/$(id -u) ~/Library/LaunchAgents/actions.runner.<owner-repo>.<name>.plist
58
+ sleep 5
59
+ launchctl bootstrap gui/$(id -u) ~/Library/LaunchAgents/actions.runner.<owner-repo>.<name>.plist
60
+ - Linux (systemd):
61
+ systemctl --user restart actions.runner.<owner-repo>.<name>.service
62
+ 4. Optionally clear the broker-side ghost-busy state via REST API:
63
+ curl -X DELETE \
64
+ -H "Authorization: Bearer $GH_TOKEN" \
65
+ "https://api.github.com/repos/<owner>/<repo>/actions/runners/<runner_id>"
66
+ This forces re-registration and clears the stale busy state immediately.
67
+
68
+ Long-term: run macOS runners as ephemeral (config.sh --ephemeral; --once is deprecated) with a process supervisor
69
+ that restarts after each completed job, eliminating the multi-hour lifetime
70
+ that triggers the credential-refresh race.
71
+ fix_code:
72
+ - language: yaml
73
+ label: 'Watchdog workflow on separate runner — detect and restart stalled listeners'
74
+ code: |
75
+ # Separate monitoring workflow on a non-affected runner
76
+ # Runs every 15 minutes via cron
77
+ on:
78
+ schedule:
79
+ - cron: '*/15 * * * *'
80
+ jobs:
81
+ watchdog:
82
+ runs-on: ubuntu-latest # Use a separate hosted runner for the watchdog
83
+ steps:
84
+ - name: Check and restart stalled macOS listeners
85
+ env:
86
+ GH_TOKEN: ${{ secrets.RUNNER_MGMT_PAT }}
87
+ run: |
88
+ # List all self-hosted runners and check for stuck-busy ones
89
+ gh api /repos/${{ github.repository }}/actions/runners \
90
+ --jq '.runners[] | select(.status=="online" and .busy==true) | .id' \
91
+ | while read runner_id; do
92
+ echo "Runner $runner_id showing busy — may need investigation"
93
+ # Add custom liveness check here (SSH to host, check log freshness)
94
+ done
95
+ - language: bash
96
+ label: 'Shell watchdog — check listener log freshness and restart via launchctl'
97
+ code: |
98
+ #!/bin/bash
99
+ # Run on the macOS runner host via cron every 10 minutes
100
+ RUNNER_LABEL="owner-repo-runner-name"
101
+ PLIST="$HOME/Library/LaunchAgents/actions.runner.${RUNNER_LABEL}.plist"
102
+ DIAG_DIR="$HOME/actions-runner/_diag"
103
+ STALE_MINUTES=10
104
+
105
+ latest_log=$(ls -t "${DIAG_DIR}/Runner_"*.log 2>/dev/null | head -1)
106
+ if [[ -z "$latest_log" ]]; then exit 0; fi
107
+
108
+ age_minutes=$(( ($(date +%s) - $(stat -f %m "$latest_log")) / 60 ))
109
+ if (( age_minutes > STALE_MINUTES )); then
110
+ echo "Runner diag log stale for ${age_minutes}min — restarting..."
111
+ launchctl bootout "gui/$(id -u)" "$PLIST" 2>/dev/null
112
+ sleep 5
113
+ launchctl bootstrap "gui/$(id -u)" "$PLIST"
114
+ fi
115
+ prevention:
116
+ - 'Run macOS self-hosted runners as ephemeral (--ephemeral) with a process supervisor — this eliminates the multi-hour lifetime needed to trigger the credential-refresh race condition'
117
+ - 'Implement a log-freshness watchdog that monitors _diag/Runner_*.log modification time and restarts the launchd service if no new entries appear for > 10 minutes'
118
+ - 'Monitor GET /repos/{owner}/{repo}/actions/runners and alert on runners with busy=true for longer than your longest expected job duration'
119
+ - 'Limit runner lifetime with a cron-triggered scheduled restart between jobs (e.g., nightly) to reduce the window where credential refresh coincides with a broker disconnect'
120
+ docs:
121
+ - url: 'https://github.com/actions/runner/issues/4446'
122
+ label: 'actions/runner#4446: Listener silently exits broker-reconnect loop after AAD credential-refresh (ghost-busy)'
123
+ - url: 'https://docs.github.com/en/actions/hosting-your-own-runners/managing-self-hosted-runners/monitoring-and-troubleshooting-self-hosted-runners'
124
+ label: 'GitHub Docs: Monitoring and troubleshooting self-hosted runners'
125
+ - url: 'https://docs.github.com/en/rest/actions/self-hosted-runners'
126
+ label: 'GitHub REST API: Self-hosted runners'
@@ -0,0 +1,103 @@
1
+ id: runner-environment-204
2
+ title: 'setup-node fails with EBADDEVENGINES when devEngines.packageManager requires a newer npm than Node ships'
3
+ category: runner-environment
4
+ severity: error
5
+ tags:
6
+ - setup-node
7
+ - npm
8
+ - devEngines
9
+ - EBADDEVENGINES
10
+ - package-manager
11
+ - node-22
12
+ - package-json
13
+ patterns:
14
+ - regex: 'npm error code EBADDEVENGINES'
15
+ flags: 'i'
16
+ - regex: 'EBADDEVENGINES.*Invalid devEngines\.packageManager'
17
+ flags: 'i'
18
+ - regex: 'Invalid semver version.*does not match.*for "packageManager"'
19
+ flags: 'i'
20
+ - regex: 'npm config get cache.*EBADDEVENGINES'
21
+ flags: 'si'
22
+ error_messages:
23
+ - 'npm error code EBADDEVENGINES'
24
+ - 'npm error EBADDEVENGINES The developer of this package has specified the following through devEngines'
25
+ - 'npm error EBADDEVENGINES Invalid devEngines.packageManager'
26
+ - 'npm error EBADDEVENGINES Invalid semver version "^11.10.0" does not match "10.9.7" for "packageManager"'
27
+ - '/opt/hostedtoolcache/node/22.22.2/x64/bin/npm config get cache'
28
+ root_cause: |
29
+ actions/setup-node detects the package manager by running `npm config get cache`
30
+ early in its initialization — before any user-specified npm upgrade step can run.
31
+ When package.json contains a `devEngines.packageManager` field specifying a minimum
32
+ npm version higher than the version bundled with the selected Node.js release, npm
33
+ enforces the devEngines constraint and exits with EBADDEVENGINES during that preflight
34
+ `npm config get cache` call, causing setup-node to fail immediately.
35
+
36
+ Example: Node.js v22 ships with npm v10. If package.json requires:
37
+ "devEngines": { "packageManager": { "name": "npm", "version": "^11.10.0" } }
38
+ then npm v10 raises EBADDEVENGINES, aborting setup-node before the user can run
39
+ `npm install --global npm@latest` in a subsequent step.
40
+
41
+ The `devEngines` field (npm RFC) was introduced around the Node.js 22 release and allows
42
+ packages to declare their required toolchain. npm 10.x enforces it strictly by
43
+ default. Source: actions/setup-node#1553.
44
+ fix: |
45
+ Option 1 — Use the npm-version input on setup-node (recommended):
46
+ If setup-node applies the npm-version upgrade before its package manager detection,
47
+ the correct npm version will already be present when devEngines is checked.
48
+ Test this first; behaviour may depend on setup-node version.
49
+
50
+ Option 2 — Set devEngines.packageManager.onFail to "warn" in package.json:
51
+ npm 10.9+ supports an `onFail` sub-field that downgrades the constraint violation
52
+ from fatal error to warning. This allows CI to proceed and install the required
53
+ npm in a subsequent step.
54
+
55
+ Option 3 — Use explicit node-version instead of node-version-file:
56
+ When node-version-file is used, setup-node reads package.json to determine the
57
+ Node version, which may trigger the devEngines check. Using an explicit
58
+ node-version string avoids reading package.json entirely for version resolution.
59
+
60
+ Option 4 — Remove or relax devEngines.packageManager for CI use:
61
+ Consider whether the devEngines constraint is necessary for CI vs local dev.
62
+ A `.npmrc` with `engine-strict=false` in the repo will disable enforcement
63
+ for all npm operations in that directory, but this also affects local installs.
64
+ fix_code:
65
+ - language: yaml
66
+ label: 'Option 1 — use npm-version input to pre-install required npm'
67
+ code: |
68
+ - uses: actions/setup-node@v6
69
+ with:
70
+ node-version-file: package.json # Reads engines.node
71
+ npm-version: '11' # Pre-installs npm 11 before devEngines check
72
+ - language: yaml
73
+ label: 'Option 3 — use explicit node-version to skip package.json parsing'
74
+ code: |
75
+ - uses: actions/setup-node@v6
76
+ with:
77
+ node-version: '22' # Explicit version; does not read package.json
78
+ # Install required npm explicitly in a later step
79
+ - run: npm install --global npm@^11.10.0
80
+ - language: json
81
+ label: 'Option 2 — set onFail: warn in package.json to prevent fatal error'
82
+ code: |
83
+ {
84
+ "devEngines": {
85
+ "packageManager": {
86
+ "name": "npm",
87
+ "version": "^11.10.0",
88
+ "onFail": "warn"
89
+ }
90
+ }
91
+ }
92
+ prevention:
93
+ - 'When adopting devEngines.packageManager in package.json, test the CI setup-node step immediately — it may fail before your npm upgrade step can run'
94
+ - 'Set devEngines.packageManager.onFail: "warn" in any repo where CI installs the required npm version dynamically rather than having it pre-installed'
95
+ - 'Use the npm-version input on setup-node when your package.json requires a specific npm version via devEngines'
96
+ - 'Avoid relying on node-version-file if your package.json contains strict devEngines constraints and you need to upgrade the package manager in CI'
97
+ docs:
98
+ - url: 'https://github.com/actions/setup-node/issues/1553'
99
+ label: 'actions/setup-node#1553: npm config get cache fails with EBADDEVENGINES'
100
+ - url: 'https://docs.npmjs.com/cli/v11/configuring-npm/package-json#devengines'
101
+ label: 'npm docs: devEngines field in package.json'
102
+ - url: 'https://github.com/nodejs/package-maintenance/issues/539'
103
+ label: 'Node.js RFC: devEngines field specification'
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@htekdev/actions-debugger",
3
- "version": "1.0.116",
3
+ "version": "1.0.117",
4
4
  "description": "65+ real GitHub Actions errors, queryable by agents. CLI + MCP server + Copilot skills + error database.",
5
5
  "type": "module",
6
6
  "main": "./dist/index.js",