@htekdev/actions-debugger 1.0.115 → 1.0.117
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/errors/caching-artifacts/cache-key-windows-path-separator-never-matches.yml +107 -0
- package/errors/concurrency-timing/rerun-failed-jobs-bypasses-concurrency-group.yml +89 -0
- package/errors/known-unsolved/empty-matrix-fromjson-workflow-failure-no-conditional-skip.yml +108 -0
- package/errors/runner-environment/arc-ephemeral-runner-oom-kill-session-conflict.yml +129 -0
- package/errors/runner-environment/checkout-v603-hash-algorithm-api-rate-limiting.yml +100 -0
- package/errors/runner-environment/macos-26-homebrew-python313-removed-stdlib-modules.yml +113 -0
- package/errors/runner-environment/macos-26-openssl3-legacy-cipher-p12-import-failure.yml +102 -0
- package/errors/runner-environment/macos-self-hosted-listener-aad-ghost-busy-stall.yml +126 -0
- package/errors/runner-environment/runner-v2334-action-download-repeated-case-sensitivity.yml +100 -0
- package/errors/runner-environment/setup-node-ebaddevengines-devengines-packagemanager.yml +103 -0
- package/errors/silent-failures/paths-filter-before-field-missing-workflow-run.yml +105 -0
- package/errors/silent-failures/windows-11-arm-bash-shell-intermittent-zero-output.yml +99 -0
- package/package.json +1 -1
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
id: caching-artifacts-068
|
|
2
|
+
title: '`actions/cache` key containing Windows backslash path separators never matches on restore — cache miss every run'
|
|
3
|
+
category: caching-artifacts
|
|
4
|
+
severity: silent-failure
|
|
5
|
+
tags:
|
|
6
|
+
- cache
|
|
7
|
+
- windows
|
|
8
|
+
- path-separator
|
|
9
|
+
- backslash
|
|
10
|
+
- cache-miss
|
|
11
|
+
- cross-platform
|
|
12
|
+
patterns:
|
|
13
|
+
- regex: 'Cache not found for input keys'
|
|
14
|
+
flags: 'i'
|
|
15
|
+
- regex: 'cache.*miss.*windows|windows.*cache.*miss'
|
|
16
|
+
flags: 'i'
|
|
17
|
+
error_messages:
|
|
18
|
+
- 'Cache not found for input keys:'
|
|
19
|
+
- 'cache hit for key: false'
|
|
20
|
+
root_cause: |
|
|
21
|
+
GitHub Actions cache keys are plain strings matched byte-for-byte. On Windows
|
|
22
|
+
runners, several common patterns produce cache keys containing backslashes (`\`):
|
|
23
|
+
|
|
24
|
+
1. Embedding ${{ runner.temp }} or ${{ github.workspace }} directly in the key:
|
|
25
|
+
key: ${{ runner.os }}-${{ runner.temp }}-${{ hashFiles('**/package-lock.json') }}
|
|
26
|
+
On Windows, runner.temp resolves to `D:\a\_temp` producing a key with `\`.
|
|
27
|
+
|
|
28
|
+
2. Using hashFiles() with backslash glob patterns:
|
|
29
|
+
key: ${{ hashFiles('**\node_modules\**') }}
|
|
30
|
+
hashFiles() on Windows sometimes receives backslash-escaped paths in its
|
|
31
|
+
argument, encoding them into the resulting hash string.
|
|
32
|
+
|
|
33
|
+
3. String concatenation with Windows path variables set in earlier steps:
|
|
34
|
+
TOOL_PATH: C:\hostedtoolcache\node\18\x64
|
|
35
|
+
When $TOOL_PATH is embedded in the cache key, the `\` chars are literal.
|
|
36
|
+
|
|
37
|
+
The cache service stores and retrieves keys as exact string matches. A cache
|
|
38
|
+
saved with key `npm-D:\a\_temp-abc123` is never found by a lookup for
|
|
39
|
+
`npm-D:/a/_temp-abc123` or by a lookup on a different run where the runner.temp
|
|
40
|
+
path differs.
|
|
41
|
+
|
|
42
|
+
On Linux/macOS runners, paths always use `/` so this issue is Windows-specific.
|
|
43
|
+
Cross-platform matrix workflows are especially affected: the Linux cache is found
|
|
44
|
+
on restore; the Windows cache is always missed.
|
|
45
|
+
fix: |
|
|
46
|
+
Normalize all path separators to forward slashes before including in cache keys.
|
|
47
|
+
|
|
48
|
+
Option 1 — Avoid runner.temp and github.workspace in cache keys entirely.
|
|
49
|
+
Use runner.os and a fixed path pattern instead:
|
|
50
|
+
key: ${{ runner.os }}-node-${{ hashFiles('**/package-lock.json') }}
|
|
51
|
+
|
|
52
|
+
Option 2 — If you must embed a path, normalize in a prior step:
|
|
53
|
+
- name: Set normalized cache path
|
|
54
|
+
shell: bash
|
|
55
|
+
run: echo "CACHE_PATH=$(echo '${{ runner.temp }}' | tr '\\' '/')" >> $GITHUB_ENV
|
|
56
|
+
Then:
|
|
57
|
+
key: ${{ runner.os }}-${{ env.CACHE_PATH }}-${{ hashFiles(...) }}
|
|
58
|
+
|
|
59
|
+
Option 3 — Always use forward-slash glob patterns in hashFiles():
|
|
60
|
+
key: ${{ hashFiles('**/package-lock.json') }} # correct
|
|
61
|
+
# NOT: ${{ hashFiles('**\package-lock.json') }} # Windows-style — avoid
|
|
62
|
+
|
|
63
|
+
Option 4 — Use a bash shell step to compute the key and store in GITHUB_ENV,
|
|
64
|
+
ensuring all path manipulation happens in a POSIX shell that normalizes separators.
|
|
65
|
+
fix_code:
|
|
66
|
+
- language: yaml
|
|
67
|
+
label: 'Bad: runner.temp in cache key produces backslashes on Windows'
|
|
68
|
+
code: |
|
|
69
|
+
# ❌ runner.temp on Windows = "D:\a\_temp" — backslashes in key → never matches
|
|
70
|
+
- uses: actions/cache@v4
|
|
71
|
+
with:
|
|
72
|
+
path: ${{ runner.temp }}/cache
|
|
73
|
+
key: ${{ runner.os }}-${{ runner.temp }}-${{ hashFiles('**/lock.json') }}
|
|
74
|
+
- language: yaml
|
|
75
|
+
label: 'Good: use runner.os and a fixed path — no path variables in key'
|
|
76
|
+
code: |
|
|
77
|
+
# ✅ No Windows path separators in key
|
|
78
|
+
- uses: actions/cache@v4
|
|
79
|
+
with:
|
|
80
|
+
path: ~/.npm
|
|
81
|
+
key: ${{ runner.os }}-npm-${{ hashFiles('**/package-lock.json') }}
|
|
82
|
+
- language: yaml
|
|
83
|
+
label: 'Good: normalize path separators with bash tr before embedding in key'
|
|
84
|
+
code: |
|
|
85
|
+
- name: Normalize cache path
|
|
86
|
+
shell: bash
|
|
87
|
+
run: |
|
|
88
|
+
NORM_PATH=$(echo '${{ runner.tool_cache }}' | tr '\\' '/')
|
|
89
|
+
echo "NORM_TOOL_CACHE=$NORM_PATH" >> $GITHUB_ENV
|
|
90
|
+
|
|
91
|
+
- uses: actions/cache@v4
|
|
92
|
+
with:
|
|
93
|
+
path: ${{ runner.tool_cache }}
|
|
94
|
+
key: ${{ runner.os }}-tools-${{ env.NORM_TOOL_CACHE }}-${{ hashFiles('**/tool-versions') }}
|
|
95
|
+
prevention:
|
|
96
|
+
- 'Never include runner.temp, runner.tool_cache, or github.workspace in cache keys — these resolve to OS-specific paths with backslashes on Windows'
|
|
97
|
+
- 'Always use runner.os (e.g., "Windows") as the OS discriminator in cross-platform cache keys, not path-based variables'
|
|
98
|
+
- 'Use forward-slash glob patterns in hashFiles() expressions on all platforms'
|
|
99
|
+
- 'When path variables must appear in a cache key, normalize separators to forward slashes in a bash step first and use the env variable'
|
|
100
|
+
- 'Test cache hit rate explicitly in CI by checking the cache-hit output of the restore step on Windows runners'
|
|
101
|
+
docs:
|
|
102
|
+
- url: 'https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/caching-dependencies-to-speed-up-workflows'
|
|
103
|
+
label: 'GitHub Docs: Caching dependencies to speed up workflows'
|
|
104
|
+
- url: 'https://github.com/actions/cache/issues/671'
|
|
105
|
+
label: 'actions/cache#671: Cache key with Windows path separators causes cache miss'
|
|
106
|
+
- url: 'https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/store-information-in-variables#default-environment-variables'
|
|
107
|
+
label: 'GitHub Docs: Default environment variables (runner.temp, etc.)'
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
id: concurrency-timing-054
|
|
2
|
+
title: '"Re-run failed jobs" bypasses concurrency group — runs in parallel with a newly triggered run for the same group'
|
|
3
|
+
category: concurrency-timing
|
|
4
|
+
severity: silent-failure
|
|
5
|
+
tags:
|
|
6
|
+
- concurrency
|
|
7
|
+
- rerun
|
|
8
|
+
- re-run-failed-jobs
|
|
9
|
+
- parallel
|
|
10
|
+
- cancel-in-progress
|
|
11
|
+
patterns:
|
|
12
|
+
- regex: 'Re-run failed jobs'
|
|
13
|
+
flags: 'i'
|
|
14
|
+
- regex: 'This run was automatically cancelled'
|
|
15
|
+
flags: 'i'
|
|
16
|
+
error_messages:
|
|
17
|
+
- 'This run was automatically cancelled'
|
|
18
|
+
root_cause: |
|
|
19
|
+
When a developer clicks "Re-run failed jobs" (as opposed to "Re-run all jobs"),
|
|
20
|
+
GitHub Actions does not evaluate the workflow's concurrency group before starting
|
|
21
|
+
the rerun. The rerun begins immediately and runs in parallel with any currently
|
|
22
|
+
in-progress or newly triggered run that occupies the same concurrency group.
|
|
23
|
+
|
|
24
|
+
This is distinct from "Re-run all jobs", which DOES re-trigger the concurrency
|
|
25
|
+
check and will cancel the currently in-progress run if cancel-in-progress is true,
|
|
26
|
+
or queue behind it if cancel-in-progress is false.
|
|
27
|
+
|
|
28
|
+
Common scenario:
|
|
29
|
+
1. A push triggers run A, which starts and partially fails.
|
|
30
|
+
2. A second push triggers run B for the same branch (same concurrency group).
|
|
31
|
+
Run B is queued or cancels run A depending on cancel-in-progress setting.
|
|
32
|
+
3. Developer hits "Re-run failed jobs" on run A.
|
|
33
|
+
4. Run A's rerun starts immediately — now run A (rerun) and run B are both
|
|
34
|
+
executing concurrently, violating the intent of the concurrency group.
|
|
35
|
+
|
|
36
|
+
Side effects:
|
|
37
|
+
- Two deploys to the same environment can run simultaneously.
|
|
38
|
+
- A self-hosted runner with a single slot gets double-booked.
|
|
39
|
+
- Race conditions between two concurrent jobs writing to the same artifact.
|
|
40
|
+
|
|
41
|
+
Root cause: GitHub's "Re-run failed jobs" path was implemented as a targeted job
|
|
42
|
+
restart that bypasses workflow-level orchestration including concurrency evaluation.
|
|
43
|
+
This behavior is tracked in actions/runner#2294 (open).
|
|
44
|
+
fix: |
|
|
45
|
+
Option 1 — Use "Re-run all jobs" instead of "Re-run failed jobs" when the workflow
|
|
46
|
+
has a concurrency group. "Re-run all jobs" triggers a fresh run that participates
|
|
47
|
+
in concurrency evaluation normally.
|
|
48
|
+
|
|
49
|
+
Option 2 — Include github.run_attempt in the concurrency group key so each attempt
|
|
50
|
+
gets its own group slot, preventing cancellation loops during reruns while still
|
|
51
|
+
serializing independent triggers:
|
|
52
|
+
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.run_attempt }}
|
|
53
|
+
cancel-in-progress: false
|
|
54
|
+
|
|
55
|
+
Option 3 — For deployment workflows, use environment protection rules as the
|
|
56
|
+
serialization mechanism instead of (or in addition to) concurrency groups. A
|
|
57
|
+
pending environment review gate serializes deployments even when concurrency is
|
|
58
|
+
bypassed.
|
|
59
|
+
|
|
60
|
+
Option 4 — Accept the behavior: if the failed jobs don't interact with shared
|
|
61
|
+
resources, concurrent partial reruns may be safe. Restrict "Re-run failed jobs" to
|
|
62
|
+
jobs that are idempotent and isolated.
|
|
63
|
+
fix_code:
|
|
64
|
+
- language: yaml
|
|
65
|
+
label: 'Option 2: include run_attempt in group key — each attempt gets its own slot'
|
|
66
|
+
code: |
|
|
67
|
+
concurrency:
|
|
68
|
+
group: ${{ github.workflow }}-${{ github.ref }}-${{ github.run_attempt }}
|
|
69
|
+
cancel-in-progress: false
|
|
70
|
+
- language: yaml
|
|
71
|
+
label: 'Option 3: environment protection to serialize deployments independent of concurrency'
|
|
72
|
+
code: |
|
|
73
|
+
jobs:
|
|
74
|
+
deploy:
|
|
75
|
+
environment: production # Requires reviewer approval; serializes even without concurrency
|
|
76
|
+
steps:
|
|
77
|
+
- run: ./deploy.sh
|
|
78
|
+
prevention:
|
|
79
|
+
- 'Prefer "Re-run all jobs" over "Re-run failed jobs" when your workflow uses a concurrency group to prevent duplicate concurrent runs'
|
|
80
|
+
- 'Include ${{ github.run_attempt }} in the concurrency group key to give each attempt its own slot and avoid silent bypass'
|
|
81
|
+
- 'For deployment workflows with shared infrastructure, combine concurrency groups with environment protection rules for defense-in-depth serialization'
|
|
82
|
+
- 'Document in your workflow YAML that "Re-run failed jobs" bypasses concurrency to alert future maintainers'
|
|
83
|
+
docs:
|
|
84
|
+
- url: 'https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/using-concurrency'
|
|
85
|
+
label: 'GitHub Docs: Using concurrency'
|
|
86
|
+
- url: 'https://github.com/actions/runner/issues/2294'
|
|
87
|
+
label: 'actions/runner#2294: Re-run failed jobs bypasses concurrency group (open)'
|
|
88
|
+
- url: 'https://docs.github.com/en/actions/managing-workflow-runs-and-deployments/managing-workflow-runs/re-running-workflows-and-jobs'
|
|
89
|
+
label: 'GitHub Docs: Re-running workflows and jobs'
|
|
@@ -0,0 +1,108 @@
|
|
|
1
|
+
id: known-unsolved-064
|
|
2
|
+
title: 'No conditional matrix skip — an empty matrix from `fromJSON([])` always fails the workflow with a validation error'
|
|
3
|
+
category: known-unsolved
|
|
4
|
+
severity: limitation
|
|
5
|
+
tags:
|
|
6
|
+
- matrix
|
|
7
|
+
- dynamic-matrix
|
|
8
|
+
- fromJSON
|
|
9
|
+
- conditional
|
|
10
|
+
- strategy
|
|
11
|
+
- limitation
|
|
12
|
+
patterns:
|
|
13
|
+
- regex: 'The strategy/matrix must contain at least one'
|
|
14
|
+
flags: 'i'
|
|
15
|
+
- regex: 'matrix must define at least one vector'
|
|
16
|
+
flags: 'i'
|
|
17
|
+
- regex: 'Error when evaluating .strategy. for job.*matrix.*empty'
|
|
18
|
+
flags: 'i'
|
|
19
|
+
error_messages:
|
|
20
|
+
- 'The strategy/matrix must contain at least one vector'
|
|
21
|
+
- 'matrix must define at least one vector'
|
|
22
|
+
- 'Error when evaluating ''strategy'' for job'
|
|
23
|
+
root_cause: |
|
|
24
|
+
GitHub Actions validates that a matrix strategy always produces at least one job.
|
|
25
|
+
When an upstream job outputs an empty JSON array and the matrix job uses
|
|
26
|
+
`fromJSON(needs.generate.outputs.matrix)`, the workflow fails at plan time with
|
|
27
|
+
a validation error before any jobs run.
|
|
28
|
+
|
|
29
|
+
There is NO supported way to:
|
|
30
|
+
- Provide an `if:` condition on the matrix strategy to skip it entirely
|
|
31
|
+
- Use `strategy: if: ${{ condition }}` syntax (does not exist)
|
|
32
|
+
- Have a job run zero times when its matrix is empty
|
|
33
|
+
|
|
34
|
+
This is a fundamental platform limitation: matrix jobs must always expand to at
|
|
35
|
+
least one job instance. The validation runs before job execution, so even an `if:`
|
|
36
|
+
on the job itself does not help — the matrix must be non-empty regardless.
|
|
37
|
+
|
|
38
|
+
Common scenarios:
|
|
39
|
+
- CI that builds only changed packages: if no packages changed, the matrix
|
|
40
|
+
is empty and the entire workflow fails.
|
|
41
|
+
- Release workflows that conditionally matrix over artifacts: if nothing was
|
|
42
|
+
built, the matrix is empty.
|
|
43
|
+
- PR labeler workflows that matrix over affected services: a trivial change
|
|
44
|
+
affects no services, producing an empty list.
|
|
45
|
+
|
|
46
|
+
Note: yaml-syntax-008 covers the technical "fromJSON parse error" message. This
|
|
47
|
+
entry covers the LIMITATION: there is no native mechanism to conditionally skip
|
|
48
|
+
matrix jobs or provide a zero-matrix strategy.
|
|
49
|
+
fix: |
|
|
50
|
+
Workaround 1 — Always include at least one sentinel/dummy entry and guard with if:
|
|
51
|
+
In the matrix-generating step, append a sentinel entry when the list is empty:
|
|
52
|
+
matrix=$(echo "$matrix" | jq 'if length == 0 then [{"skip":true}] else map(. + {"skip":false}) end')
|
|
53
|
+
Then guard each step of the job (the matrix context is not available in a job-level `if:`):
|
|
54
|
+
if: ${{ !matrix.skip }}
|
|
55
|
+
|
|
56
|
+
Workaround 2 — Use a separate check job with needs: to gate the matrix job:
|
|
57
|
+
Add an upstream job that outputs a boolean "has_work". The matrix job then uses
|
|
58
|
+
`needs: check` and reads `needs.check.outputs.has_work` in an `if:` condition.
|
|
59
|
+
This still requires the matrix to be non-empty (use sentinel), but the `if:`
|
|
60
|
+
prevents actual work from running.
|
|
61
|
+
|
|
62
|
+
Workaround 3 — Always include a no-op entry in the matrix output:
|
|
63
|
+
Ensure the matrix-generating script never outputs an empty array by always
|
|
64
|
+
appending a `{"target":"none"}` entry, then guard each step with:
|
|
65
|
+
if: matrix.target != 'none'
|
|
66
|
+
|
|
67
|
+
None of these workarounds are elegant. This is a known limitation with no native
|
|
68
|
+
fix scheduled. Tracked in actions/runner#1502 (96+ reactions, open since 2021).
|
|
69
|
+
fix_code:
|
|
70
|
+
- language: yaml
|
|
71
|
+
label: 'Workaround: inject sentinel entry when matrix is empty, skip in job if:'
|
|
72
|
+
code: |
|
|
73
|
+
jobs:
|
|
74
|
+
generate-matrix:
|
|
75
|
+
runs-on: ubuntu-latest
|
|
76
|
+
outputs:
|
|
77
|
+
matrix: ${{ steps.set-matrix.outputs.matrix }}
|
|
78
|
+
steps:
|
|
79
|
+
- id: set-matrix
|
|
80
|
+
run: |
|
|
81
|
+
# Build your matrix list; inject sentinel when empty
|
|
82
|
+
matrix='[]' # Replace with real generation logic
|
|
83
|
+
if [ "$(echo "$matrix" | jq 'length')" -eq 0 ]; then
|
|
84
|
+
matrix='[{"target":"__skip__"}]'
|
|
85
|
+
fi
|
|
86
|
+
echo "matrix=$matrix" >> $GITHUB_OUTPUT
|
|
87
|
+
|
|
88
|
+
build:
|
|
89
|
+
needs: generate-matrix
|
|
90
|
+
if: ${{ fromJSON(needs.generate-matrix.outputs.matrix)[0].target != '__skip__' }}
|
|
91
|
+
strategy:
|
|
92
|
+
matrix:
|
|
93
|
+
include: ${{ fromJSON(needs.generate-matrix.outputs.matrix) }}
|
|
94
|
+
runs-on: ubuntu-latest
|
|
95
|
+
steps:
|
|
96
|
+
- run: echo "Building ${{ matrix.target }}"
|
|
97
|
+
prevention:
|
|
98
|
+
- 'Never pass a potentially empty array directly to fromJSON() in a matrix strategy — always guard with a sentinel entry'
|
|
99
|
+
- 'Add an upstream check job that validates matrix size before the matrix job runs'
|
|
100
|
+
- 'Document the sentinel pattern in your workflow so future maintainers understand why a dummy entry exists'
|
|
101
|
+
- 'Vote/watch actions/runner#1502 for native conditional matrix support'
|
|
102
|
+
docs:
|
|
103
|
+
- url: 'https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/running-variations-of-jobs-in-a-workflow'
|
|
104
|
+
label: 'GitHub Docs: Running variations of jobs in a workflow (matrix)'
|
|
105
|
+
- url: 'https://github.com/actions/runner/issues/1502'
|
|
106
|
+
label: 'actions/runner#1502: Support for empty matrix (96+ reactions, open)'
|
|
107
|
+
- url: 'https://docs.github.com/en/actions/writing-workflows/workflow-syntax-for-github-actions#jobsjob_idstrategymatrix'
|
|
108
|
+
label: 'GitHub Docs: jobs.<job_id>.strategy.matrix'
|
|
@@ -0,0 +1,129 @@
|
|
|
1
|
+
id: runner-environment-203
|
|
2
|
+
title: 'ARC EphemeralRunner Stuck in Running State After OOM Kill — Scale Set Blocked'
|
|
3
|
+
category: runner-environment
|
|
4
|
+
severity: error
|
|
5
|
+
tags:
|
|
6
|
+
- arc
|
|
7
|
+
- actions-runner-controller
|
|
8
|
+
- ephemeral-runner
|
|
9
|
+
- oomkill
|
|
10
|
+
- kubernetes
|
|
11
|
+
- scale-set
|
|
12
|
+
- session-conflict
|
|
13
|
+
- stuck
|
|
14
|
+
- self-hosted
|
|
15
|
+
patterns:
|
|
16
|
+
- regex: 'RunnerScaleSetSessionConflictException'
|
|
17
|
+
flags: 'i'
|
|
18
|
+
- regex: 'TaskAgentSessionConflictException'
|
|
19
|
+
flags: 'i'
|
|
20
|
+
- regex: 'A session for this runner already exists'
|
|
21
|
+
flags: 'i'
|
|
22
|
+
- regex: 'Runner connect error: Error: Conflict\. Retrying until reconnected'
|
|
23
|
+
flags: 'i'
|
|
24
|
+
error_messages:
|
|
25
|
+
- 'RunnerScaleSetSessionConflictException: there is already an active session'
|
|
26
|
+
- 'TaskAgentSessionConflictException: Error: Conflict'
|
|
27
|
+
- 'A session for this runner already exists.'
|
|
28
|
+
- '2026-XX-XX HH:MM:SSZ: Runner connect error: Error: Conflict. Retrying until reconnected.'
|
|
29
|
+
root_cause: |
|
|
30
|
+
When an ARC (Actions Runner Controller) ephemeral runner pod is OOM-killed by the
|
|
31
|
+
Kubernetes kubelet (memory limit exceeded), the pod terminates abruptly without going
|
|
32
|
+
through the runner's graceful shutdown path. The runner never sends a "job completed" or
|
|
33
|
+
"runner offline" signal to the GitHub broker.
|
|
34
|
+
|
|
35
|
+
As a result:
|
|
36
|
+
1. The EphemeralRunner custom resource stays in phase `Running` indefinitely.
|
|
37
|
+
2. The ARC scale-set controller still counts the dead runner as "in use", so it does not
|
|
38
|
+
spin up a replacement pod to service the next queued job.
|
|
39
|
+
3. New jobs remain stuck at "Waiting for a runner to pick up this job..." until the
|
|
40
|
+
stale EphemeralRunner CR is manually deleted.
|
|
41
|
+
|
|
42
|
+
When ARC tries to restart the runner (either via a controller health check or manually),
|
|
43
|
+
the new runner pod connects to the GitHub broker using the same JIT token/session, and
|
|
44
|
+
the broker responds HTTP 409 Conflict because the old session is still registered:
|
|
45
|
+
|
|
46
|
+
TaskAgentSessionConflictException: Error: Conflict
|
|
47
|
+
A session for this runner already exists.
|
|
48
|
+
Runner connect error: Error: Conflict. Retrying until reconnected.
|
|
49
|
+
|
|
50
|
+
The runner retries every 30 seconds. After the broker's session lease expires (~2-3 min
|
|
51
|
+
in most cases), the conflict resolves and the runner connects — but the session timeout
|
|
52
|
+
window varies and can leave the scale set blocked for longer periods.
|
|
53
|
+
|
|
54
|
+
Versions affected:
|
|
55
|
+
- Reproducible across ARC v0.9.x - v0.12.x; partial mitigation added in ARC v0.12.0
|
|
56
|
+
(stale EphemeralRunner detection), but OOM kills on active runners can still bypass it.
|
|
57
|
+
- Frequently triggered by Vitest --coverage, Jest with large test suites, or any
|
|
58
|
+
memory-intensive build tool running without memory limits in the runner container.
|
|
59
|
+
fix: |
|
|
60
|
+
Immediate recovery: delete the stuck EphemeralRunner CR to release the scale-set slot:
|
|
61
|
+
|
|
62
|
+
kubectl delete ephemeralrunner -n <namespace> <runner-name>
|
|
63
|
+
|
|
64
|
+
After deletion, ARC will spin up a new runner pod and pick up the queued job.
|
|
65
|
+
|
|
66
|
+
Root fix (prevent recurrence):
|
|
67
|
+
1. Set memory limits on runner containers that match actual job requirements with headroom.
|
|
68
|
+
2. Add job-level `timeout-minutes:` to ensure jobs terminate and release the runner
|
|
69
|
+
if they run too long.
|
|
70
|
+
3. Upgrade ARC to v0.12.0+ for improved stale EphemeralRunner detection.
|
|
71
|
+
4. Configure `terminationGracePeriodSeconds: 90` (or longer) on runner pods to give the
|
|
72
|
+
runner process time to deregister gracefully before the kubelet force-kills it.
|
|
73
|
+
fix_code:
|
|
74
|
+
- language: yaml
|
|
75
|
+
label: 'Set memory limits and timeout on runner pods to prevent OOM kills'
|
|
76
|
+
code: |
|
|
77
|
+
# In your HelmRelease / values.yaml for actions-runner-controller
|
|
78
|
+
githubConfigUrl: "https://github.com/your-org/your-repo"
|
|
79
|
+
maxRunners: 4
|
|
80
|
+
minRunners: 0
|
|
81
|
+
template:
|
|
82
|
+
spec:
|
|
83
|
+
terminationGracePeriodSeconds: 90
|
|
84
|
+
containers:
|
|
85
|
+
- name: runner
|
|
86
|
+
image: ghcr.io/actions/actions-runner:latest
|
|
87
|
+
resources:
|
|
88
|
+
requests:
|
|
89
|
+
memory: "2Gi"
|
|
90
|
+
cpu: "500m"
|
|
91
|
+
limits:
|
|
92
|
+
memory: "4Gi" # Set appropriate limit; OOM kill occurs when exceeded
|
|
93
|
+
cpu: "2"
|
|
94
|
+
- language: yaml
|
|
95
|
+
label: 'Add job-level timeout to guarantee runner release even on hang'
|
|
96
|
+
code: |
|
|
97
|
+
jobs:
|
|
98
|
+
test:
|
|
99
|
+
runs-on: arc-runner-set
|
|
100
|
+
timeout-minutes: 30 # Runner is released after 30 min even if job hangs
|
|
101
|
+
steps:
|
|
102
|
+
- uses: actions/checkout@v4
|
|
103
|
+
- run: npm test -- --coverage
|
|
104
|
+
- language: bash
|
|
105
|
+
label: 'Manual recovery: delete stuck EphemeralRunner CR'
|
|
106
|
+
code: |
|
|
107
|
+
# List stuck EphemeralRunners
|
|
108
|
+
kubectl get ephemeralrunner -n arc-systems
|
|
109
|
+
|
|
110
|
+
# Delete the stuck one (ARC will create a new pod automatically)
|
|
111
|
+
kubectl delete ephemeralrunner -n arc-systems <stuck-runner-name>
|
|
112
|
+
|
|
113
|
+
# Alternatively, delete every EphemeralRunner in phase Running (WARNING: this also removes healthy runners with in-flight jobs)
|
|
114
|
+
kubectl delete ephemeralrunner -n arc-systems --field-selector='status.phase=Running'
|
|
115
|
+
prevention:
|
|
116
|
+
- 'Always set memory limits on ARC runner containers; without limits, a single job can consume all node memory and OOM-kill other runners.'
|
|
117
|
+
- 'Set timeout-minutes: at the job level for all ARC-backed workflows to guarantee the runner is eventually released.'
|
|
118
|
+
- 'Upgrade ARC to v0.12.0+ for automatic stale EphemeralRunner cleanup.'
|
|
119
|
+
- 'Monitor EphemeralRunner phase distribution; a growing count of Running CRs with no corresponding pods is a leading indicator of this issue.'
|
|
120
|
+
- 'Add terminationGracePeriodSeconds: 90+ to runner pod templates so graceful shutdown signals have time to deregister the runner.'
|
|
121
|
+
docs:
|
|
122
|
+
- url: 'https://github.com/actions/actions-runner-controller/issues/4155'
|
|
123
|
+
label: 'EphemeralRunner and its pods left stuck Running after runner OOMKILL (15 reactions)'
|
|
124
|
+
- url: 'https://github.com/actions/actions-runner-controller/issues/3922'
|
|
125
|
+
label: 'Scaleset controllers stuck with RunnerScaleSetSessionConflictException (12 reactions)'
|
|
126
|
+
- url: 'https://github.com/actions/runner/issues/4312'
|
|
127
|
+
label: 'Self-hosted runner gets stuck in active state, blocking queued jobs across multiple repositories'
|
|
128
|
+
- url: 'https://docs.github.com/en/actions/hosting-your-own-runners/managing-self-hosted-runners-with-actions-runner-controller'
|
|
129
|
+
label: 'Managing self-hosted runners with Actions Runner Controller'
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
id: runner-environment-206
|
|
2
|
+
title: 'actions/checkout v6.0.3 regression — new /hash-algorithm API call on every checkout exhausts rate limits in high-volume orgs'
|
|
3
|
+
category: runner-environment
|
|
4
|
+
severity: error
|
|
5
|
+
tags:
|
|
6
|
+
- checkout
|
|
7
|
+
- rate-limit
|
|
8
|
+
- api-rate-limit
|
|
9
|
+
- v6
|
|
10
|
+
- PAT
|
|
11
|
+
- regression
|
|
12
|
+
- hash-algorithm
|
|
13
|
+
patterns:
|
|
14
|
+
- regex: 'API rate limit exceeded'
|
|
15
|
+
flags: 'i'
|
|
16
|
+
- regex: 'You have exceeded a secondary rate limit'
|
|
17
|
+
flags: 'i'
|
|
18
|
+
- regex: 'HttpError.*API rate limit exceeded'
|
|
19
|
+
flags: 'i'
|
|
20
|
+
- regex: 'Rate limit.*exceeded.*403'
|
|
21
|
+
flags: 'i'
|
|
22
|
+
error_messages:
|
|
23
|
+
- 'Error: HttpError: API rate limit exceeded for user ID'
|
|
24
|
+
- 'You have exceeded a secondary rate limit and have been temporarily blocked from content creation'
|
|
25
|
+
- 'remote: Repository not found.'
|
|
26
|
+
- 'fatal: repository ''https://github.com/owner/repo/'' not found'
|
|
27
|
+
root_cause: |
|
|
28
|
+
actions/checkout v6.0.3 (released June 2, 2026, commit 1cce339) introduced a new
|
|
29
|
+
REST API call to GET /repos/{owner}/{repo}/hash-algorithm on every checkout operation
|
|
30
|
+
to determine the repository's object hashing algorithm.
|
|
31
|
+
|
|
32
|
+
In organizations with high-concurrency workflows, this additional API call multiplies
|
|
33
|
+
rate-limit consumption significantly. A matrix build with 30 parallel jobs each running
|
|
34
|
+
checkout makes 30 additional API calls per push event. At the per-user rate limit of
|
|
35
|
+
5,000 requests/hour, large orgs using PATs (Personal Access Tokens) for cross-repo
|
|
36
|
+
checkout quickly exhaust their quota across many concurrent pipelines.
|
|
37
|
+
|
|
38
|
+
When the /hash-algorithm endpoint returns HTTP 403 (rate limited), the checkout action
|
|
39
|
+
may interpret the 403 as a resource-not-found condition, producing misleading errors
|
|
40
|
+
such as "remote: Repository not found" that mask the true rate-limit cause.
|
|
41
|
+
|
|
42
|
+
GITHUB_TOKEN is not affected in the same way because it carries per-repository rate
|
|
43
|
+
limits (1,000 requests/hour per repository, or 15,000 for GitHub Enterprise Cloud resources) rather than per-user limits.
|
|
44
|
+
Source: actions/checkout#2450.
|
|
45
|
+
fix: |
|
|
46
|
+
Immediate fix — pin to actions/checkout@v6.0.2 until the upstream regression is
|
|
47
|
+
addressed:
|
|
48
|
+
uses: actions/checkout@v6.0.2
|
|
49
|
+
|
|
50
|
+
Use GITHUB_TOKEN instead of PAT where possible:
|
|
51
|
+
GITHUB_TOKEN rate limits (1,000 req/hr, or 15,000 req/hr on GitHub Enterprise Cloud) are scoped per
|
|
52
|
+
repository and do not aggregate across your organization's other workflows.
|
|
53
|
+
Reserve PATs for cross-repo or cross-org checkouts only.
|
|
54
|
+
|
|
55
|
+
Reduce parallel checkout volume:
|
|
56
|
+
Add fetch-depth: 1 and/or sparse-checkout to minimize the API surface area
|
|
57
|
+
per checkout call, reducing the number of API requests triggered per job.
|
|
58
|
+
|
|
59
|
+
Monitor actions/checkout#2450 for the upstream fix (caching the hash-algorithm
|
|
60
|
+
result within a workflow run, or making the call conditional).
|
|
61
|
+
fix_code:
|
|
62
|
+
- language: yaml
|
|
63
|
+
label: 'Pin to v6.0.2 to avoid the regression until upstream fix is released'
|
|
64
|
+
code: |
|
|
65
|
+
- uses: actions/checkout@v6.0.2
|
|
66
|
+
with:
|
|
67
|
+
# Prefer GITHUB_TOKEN over a PAT to use per-repo rate limits
|
|
68
|
+
token: ${{ secrets.GITHUB_TOKEN }}
|
|
69
|
+
fetch-depth: 1 # Shallow fetch reduces ancillary API calls
|
|
70
|
+
- language: yaml
|
|
71
|
+
label: 'Cross-repo checkout — use dedicated PAT only where necessary, pin version'
|
|
72
|
+
code: |
|
|
73
|
+
- uses: actions/checkout@v6.0.2
|
|
74
|
+
with:
|
|
75
|
+
repository: org/other-repo
|
|
76
|
+
token: ${{ secrets.CROSS_REPO_PAT }} # PAT required here; rate-limited per user
|
|
77
|
+
fetch-depth: 1
|
|
78
|
+
- language: yaml
|
|
79
|
+
label: 'Monitor rate limit headers in a preflight step (diagnostic aid)'
|
|
80
|
+
code: |
|
|
81
|
+
- name: Check remaining GitHub API rate limit
|
|
82
|
+
run: |
|
|
83
|
+
remaining=$(curl -s -H "Authorization: Bearer ${{ secrets.GITHUB_TOKEN }}" \
|
|
84
|
+
https://api.github.com/rate_limit | jq '.rate.remaining')
|
|
85
|
+
echo "Remaining API calls: $remaining"
|
|
86
|
+
if (( remaining < 500 )); then
|
|
87
|
+
echo "::warning::Low API rate limit — $remaining calls remaining"
|
|
88
|
+
fi
|
|
89
|
+
prevention:
|
|
90
|
+
- 'Pin actions/checkout to a specific patch version (e.g., @v6.0.2) in high-volume orgs — patch updates can introduce API regressions like this one'
|
|
91
|
+
- 'Prefer GITHUB_TOKEN over org-wide PATs for checkout; GITHUB_TOKEN rate limits are per-repository and isolated from other workflows'
|
|
92
|
+
- 'Monitor your org REST API usage under Settings > Insights > API Requests to detect unexpected call spikes from action updates before they hit production'
|
|
93
|
+
- 'Add a rate-limit check step to long-running or high-parallelism pipelines to catch exhaustion before it causes misleading errors'
|
|
94
|
+
docs:
|
|
95
|
+
- url: 'https://github.com/actions/checkout/issues/2450'
|
|
96
|
+
label: 'actions/checkout#2450: New /hash-algorithm API call causing rate limiting failures in v6.0.3'
|
|
97
|
+
- url: 'https://docs.github.com/en/rest/using-the-rest-api/rate-limits-for-the-rest-api'
|
|
98
|
+
label: 'GitHub Docs: Rate limits for the REST API'
|
|
99
|
+
- url: 'https://docs.github.com/en/actions/security-for-github-actions/security-guides/automatic-token-authentication#permissions-for-the-github_token'
|
|
100
|
+
label: 'GitHub Docs: GITHUB_TOKEN permissions and rate limits'
|
|
@@ -0,0 +1,113 @@
|
|
|
1
|
+
id: runner-environment-201
|
|
2
|
+
title: 'macOS 26 Homebrew python@3 ships Python 3.13 — removed stdlib modules (cgi, imghdr, aifc, telnetlib) cause ModuleNotFoundError'
|
|
3
|
+
category: runner-environment
|
|
4
|
+
severity: error
|
|
5
|
+
tags:
|
|
6
|
+
- macos
|
|
7
|
+
- macos-26
|
|
8
|
+
- python
|
|
9
|
+
- homebrew
|
|
10
|
+
- stdlib
|
|
11
|
+
- runner-image
|
|
12
|
+
- breaking-change
|
|
13
|
+
patterns:
|
|
14
|
+
- regex: 'ModuleNotFoundError: No module named ''(cgi|imghdr|aifc|chunk|nntplib|telnetlib|uu|xdrlib|sndhdr|sunau|mailcap|msilib|pipes|crypt|spwd|ossaudiodev)'''
|
|
15
|
+
flags: 'i'
|
|
16
|
+
- regex: 'ImportError.*No module named.*cgi|No module named.*imghdr|No module named.*telnetlib'
|
|
17
|
+
flags: 'i'
|
|
18
|
+
- regex: 'python.*3\.13.*deprecated.*module|removed.*python.*3\.13'
|
|
19
|
+
flags: 'i'
|
|
20
|
+
error_messages:
|
|
21
|
+
- 'ModuleNotFoundError: No module named ''cgi'''
|
|
22
|
+
- 'ModuleNotFoundError: No module named ''imghdr'''
|
|
23
|
+
- 'ModuleNotFoundError: No module named ''aifc'''
|
|
24
|
+
- 'ModuleNotFoundError: No module named ''telnetlib'''
|
|
25
|
+
- 'ModuleNotFoundError: No module named ''chunk'''
|
|
26
|
+
- 'ModuleNotFoundError: No module named ''nntplib'''
|
|
27
|
+
root_cause: |
|
|
28
|
+
macOS 26 runner images ship Homebrew python@3 pointing to Python 3.13.x. Python
|
|
29
|
+
3.13 removed the following stdlib modules that were deprecated since Python 3.11:
|
|
30
|
+
|
|
31
|
+
cgi, cgitb, aifc, chunk, crypt, imghdr, mailcap, msilib (Windows only),
|
|
32
|
+
nntplib, ossaudiodev, pipes, sndhdr, spwd, sunau, telnetlib, uu, xdrlib
|
|
33
|
+
|
|
34
|
+
Workflows that call bare python3 (resolved to Homebrew Python 3.13 on macOS 26)
|
|
35
|
+
and import any of these modules fail with ModuleNotFoundError at runtime.
|
|
36
|
+
|
|
37
|
+
This affects:
|
|
38
|
+
- Scripts using cgi or cgitb for HTTP form parsing
|
|
39
|
+
- Image-type detection using imghdr (commonly used with Pillow-based workflows)
|
|
40
|
+
- Legacy NNTP/Telnet clients using nntplib or telnetlib
|
|
41
|
+
- Audio file handling using aifc, sunau, or sndhdr
|
|
42
|
+
|
|
43
|
+
Workflows that previously ran on macos-14 or macos-15 (Homebrew Python 3.11/3.12)
|
|
44
|
+
are affected when the job label is macos-26 or when macos-latest migrates to
|
|
45
|
+
macOS 26. The failure is not immediately obvious because the error occurs at
|
|
46
|
+
import time inside Python, not at the runner level, and the runner step exits
|
|
47
|
+
with a non-zero code that may be mistaken for a test failure rather than an
|
|
48
|
+
environment regression.
|
|
49
|
+
|
|
50
|
+
Note: actions/setup-python@v5+ with an explicit python-version is unaffected —
|
|
51
|
+
this issue only affects scripts that rely on the system/Homebrew python3 binary.
|
|
52
|
+
fix: |
|
|
53
|
+
Option 1 (recommended) — Pin Python with actions/setup-python:
|
|
54
|
+
Always use actions/setup-python with an explicit version to get the exact
|
|
55
|
+
Python version your code requires. This bypasses the Homebrew python@3 symlink.
|
|
56
|
+
|
|
57
|
+
Option 2 — Replace removed modules with modern equivalents:
|
|
58
|
+
- cgi → urllib.parse + email.parser (or the 3rd-party 'legacy-cgi' backport on PyPI)
|
|
59
|
+
- imghdr → available as the 3rd-party 'standard-imghdr' backport on PyPI,
|
|
60
|
+
or use python-magic / filetype for image detection
|
|
61
|
+
- telnetlib → use telnetlib3 (PyPI) or asyncio-based Telnet
|
|
62
|
+
- aifc/sunau → use soundfile or wave for audio I/O
|
|
63
|
+
|
|
64
|
+
Option 3 — Pin Homebrew Python to 3.12 on macos-26 (temporary):
|
|
65
|
+
brew install python@3.12
|
|
66
|
+
brew link python@3.12 --force
|
|
67
|
+
echo "$(brew --prefix python@3.12)/libexec/bin" >> "$GITHUB_PATH"   # prefix is /opt/homebrew on Apple Silicon, /usr/local on Intel
|
|
68
|
+
fix_code:
|
|
69
|
+
- language: yaml
|
|
70
|
+
label: 'Fix: pin Python version with actions/setup-python to avoid Homebrew python@3'
|
|
71
|
+
code: |
|
|
72
|
+
- uses: actions/setup-python@v5
|
|
73
|
+
with:
|
|
74
|
+
python-version: '3.12' # pins to 3.12; immune to Homebrew python@3 upgrade
|
|
75
|
+
|
|
76
|
+
- name: Install dependencies
|
|
77
|
+
run: pip install -r requirements.txt
|
|
78
|
+
|
|
79
|
+
- name: Run script
|
|
80
|
+
run: python script.py # uses setup-python's 3.12, not Homebrew 3.13
|
|
81
|
+
- language: yaml
|
|
82
|
+
label: 'Fix: install removed modules from PyPI backports'
|
|
83
|
+
code: |
|
|
84
|
+
- uses: actions/setup-python@v5
|
|
85
|
+
with:
|
|
86
|
+
python-version: '3.13'
|
|
87
|
+
- name: Install backported removed modules
|
|
88
|
+
run: |
|
|
89
|
+
pip install standard-imghdr   # PyPI backport of imghdr for Python 3.13+
|
|
90
|
+
# pip install telnetlib3 # if using Telnet
|
|
91
|
+
- name: Run script
|
|
92
|
+
run: python script.py
|
|
93
|
+
- language: yaml
|
|
94
|
+
label: 'Temporary: install and use python@3.12 from Homebrew on macos-26'
|
|
95
|
+
code: |
|
|
96
|
+
- name: Pin Homebrew Python to 3.12
|
|
97
|
+
run: |
|
|
98
|
+
brew install python@3.12
|
|
99
|
+
echo "$(brew --prefix python@3.12)/libexec/bin" >> "$GITHUB_PATH"   # resolves /opt/homebrew on Apple Silicon runners
|
|
100
|
+
- name: Verify Python version
|
|
101
|
+
run: python3 --version # should print Python 3.12.x
|
|
102
|
+
prevention:
|
|
103
|
+
- 'Always use actions/setup-python with an explicit version — never rely on bare python3 pointing to Homebrew python@3'
|
|
104
|
+
- 'Audit scripts for imports of modules removed in Python 3.13: cgi, imghdr, aifc, telnetlib, nntplib, chunk, uu'
|
|
105
|
+
- 'Run the test suite on Python 3.12 with python -W error::DeprecationWarning before the macos-26 migration — the PEP 594 modules emit DeprecationWarning on import in 3.11/3.12'
|
|
106
|
+
- 'Add python --version to diagnostic steps to catch unexpected Python version changes early'
|
|
107
|
+
docs:
|
|
108
|
+
- url: 'https://docs.python.org/3/whatsnew/3.13.html#removed-modules'
|
|
109
|
+
label: 'Python 3.13: Removed modules (official docs)'
|
|
110
|
+
- url: 'https://peps.python.org/pep-0594/'
|
|
111
|
+
label: 'PEP 594: Removing dead batteries from the standard library'
|
|
112
|
+
- url: 'https://github.com/actions/setup-python'
|
|
113
|
+
label: 'actions/setup-python: Pin a specific Python version'
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
id: runner-environment-200
|
|
2
|
+
title: 'macOS 26 OpenSSL 3.x rejects legacy RC2 ciphers — p12 certificate import fails with inner_evp_generic_fetch unsupported'
|
|
3
|
+
category: runner-environment
|
|
4
|
+
severity: error
|
|
5
|
+
tags:
|
|
6
|
+
- macos
|
|
7
|
+
- macos-26
|
|
8
|
+
- openssl
|
|
9
|
+
- code-signing
|
|
10
|
+
- certificate
|
|
11
|
+
- ios
|
|
12
|
+
- legacy-cipher
|
|
13
|
+
patterns:
|
|
14
|
+
- regex: 'inner_evp_generic_fetch:unsupported'
|
|
15
|
+
flags: 'i'
|
|
16
|
+
- regex: 'Algorithm \(RC2-\d+-CBC.*\)'
|
|
17
|
+
flags: 'i'
|
|
18
|
+
- regex: 'Could not find certificate from.*stdin|Could not parse.*certificate'
|
|
19
|
+
flags: 'i'
|
|
20
|
+
- regex: 'openssl.*failed with return code:\s*1'
|
|
21
|
+
flags: 'i'
|
|
22
|
+
error_messages:
|
|
23
|
+
- '40CBBC50F87F0000:error:0308010C:digital envelope routines:inner_evp_generic_fetch:unsupported:crypto/evp/evp_fetch.c:355:Global default library context, Algorithm (RC2-40-CBC : 0), Properties ()'
|
|
24
|
+
- 'Could not find certificate from <stdin>'
|
|
25
|
+
- '##[error]Error: /usr/local/bin/openssl failed with return code: 1'
|
|
26
|
+
- '##[warning]Error parsing certificate. This might be caused by an unsupported algorithm. If you''re using old certificate with a new OpenSSL version try to set -legacy flag in opensslPkcsArgs input.'
|
|
27
|
+
root_cause: |
|
|
28
|
+
macOS 26 runner images ship OpenSSL 3.x (OpenSSL 3.6.x as of runner-images macOS 26
|
|
29
|
+
release notes). OpenSSL 3.x removed RC2-40-CBC, RC2-64-CBC, and RC2-128-CBC from
|
|
30
|
+
the default provider. These legacy ciphers were commonly used to encrypt PKCS#12
|
|
31
|
+
certificate bundles generated by macOS Keychain Access, Fastlane cert, older CI
|
|
32
|
+
pipelines, or Keychain import/export tools shipped before 2020.
|
|
33
|
+
|
|
34
|
+
When a workflow installs a p12 certificate using openssl pkcs12 (directly or via
|
|
35
|
+
apple-actions/import-codesign-certs@v1/v2), OpenSSL 3.x cannot decrypt the legacy
|
|
36
|
+
bundle and emits:
|
|
37
|
+
|
|
38
|
+
error:0308010C:digital envelope routines:inner_evp_generic_fetch:unsupported:
|
|
39
|
+
...Algorithm (RC2-40-CBC : 0), Properties ()
|
|
40
|
+
|
|
41
|
+
followed by "Could not find certificate from <stdin>" and openssl exit code 1.
|
|
42
|
+
|
|
43
|
+
The certificate is never imported into the keychain, causing downstream
|
|
44
|
+
codesign, xcodebuild archive, or notarytool steps to fail with "no identity found"
|
|
45
|
+
or "identity not in keychain" errors.
|
|
46
|
+
|
|
47
|
+
Workflows that ran successfully on macos-14 (OpenSSL 1.1.x) or macos-15
|
|
48
|
+
(OpenSSL 3.3.x with legacy provider enabled) may start failing when the job
|
|
49
|
+
label is macos-26 or when macos-latest migrates to macOS 26.
|
|
50
|
+
fix: |
|
|
51
|
+
Option 1 (quickest) — Pass opensslPkcsArgs: -legacy to apple-actions/import-codesign-certs:
|
|
52
|
+
The -legacy flag activates OpenSSL's legacy provider which re-enables RC2 and
|
|
53
|
+
other deprecated algorithms. Supported in apple-actions/import-codesign-certs v2+.
|
|
54
|
+
|
|
55
|
+
Option 2 (permanent) — Re-encode the p12 with a modern cipher on your local machine
|
|
56
|
+
before uploading to GitHub Secrets:
|
|
57
|
+
openssl pkcs12 -in legacy.p12 -out certs.pem -nodes -passin pass:OLDPASSWORD   # add -legacy if your local OpenSSL is 3.x
|
|
58
|
+
openssl pkcs12 -export -in certs.pem -out modern.p12 -passout pass:NEWPASSWORD \
|
|
59
|
+
-keypbe aes-256-cbc -certpbe aes-256-cbc -macalg sha256
|
|
60
|
+
base64 < modern.p12 | tr -d '\n' > modern_b64.txt   # portable: works with both GNU and macOS base64
|
|
61
|
+
# Upload content of modern_b64.txt as the new GitHub secret value
|
|
62
|
+
|
|
63
|
+
Option 3 — Regenerate certificates with modern tooling:
|
|
64
|
+
Use Fastlane match (>= 3.x) or Xcode 26 certificate export; both default to
|
|
65
|
+
AES-256-CBC which OpenSSL 3.x supports without -legacy.
|
|
66
|
+
fix_code:
|
|
67
|
+
- language: yaml
|
|
68
|
+
label: 'Fix: pass opensslPkcsArgs: -legacy (apple-actions/import-codesign-certs)'
|
|
69
|
+
code: |
|
|
70
|
+
- uses: apple-actions/import-codesign-certs@v3
|
|
71
|
+
with:
|
|
72
|
+
p12-file-base64: ${{ secrets.IOS_DISTRIBUTION_P12 }}
|
|
73
|
+
p12-password: ${{ secrets.IOS_DISTRIBUTION_P12_PASSWORD }}
|
|
74
|
+
opensslPkcsArgs: -legacy # required on macOS 26 / OpenSSL 3.x for RC2-encrypted p12
|
|
75
|
+
- language: yaml
|
|
76
|
+
label: 'Long-term fix: re-encode p12 with AES-256-CBC before updating the secret'
|
|
77
|
+
code: |
|
|
78
|
+
# Run these commands locally (not in CI) to produce a modern p12:
|
|
79
|
+
#
|
|
80
|
+
# openssl pkcs12 -in legacy.p12 -out certs.pem -nodes -passin pass:$OLD_PASS \
|
|
81
|
+
# -legacy # may need -legacy on your local OpenSSL 3.x too
|
|
82
|
+
# openssl pkcs12 -export \
|
|
83
|
+
# -in certs.pem -out modern.p12 \
|
|
84
|
+
# -passout pass:$NEW_PASS \
|
|
85
|
+
# -keypbe aes-256-cbc \
|
|
86
|
+
# -certpbe aes-256-cbc \
|
|
87
|
+
# -macalg sha256
|
|
88
|
+
# base64 -i modern.p12 | pbcopy # macOS: paste into GitHub Secrets (on Linux: base64 -w 0 modern.p12)
|
|
89
|
+
#
|
|
90
|
+
# After updating the secret, remove the opensslPkcsArgs: -legacy workaround.
|
|
91
|
+
prevention:
|
|
92
|
+
- 'Re-encode all p12 certificate bundles with AES-256-CBC before migrating to macos-26'
|
|
93
|
+
- 'Audit p12 cipher with: openssl pkcs12 -in cert.p12 -info -noout -passin pass:X 2>&1 | grep -i cipher'
|
|
94
|
+
- 'Pin macos-15 temporarily as a fallback while re-encoding certificates'
|
|
95
|
+
- 'Test certificate import on macos-26 runners before macos-latest label migration completes'
|
|
96
|
+
docs:
|
|
97
|
+
- url: 'https://github.com/actions/runner-images/issues/10934'
|
|
98
|
+
label: 'GitHub runner-images #10934: Intermittent SSL issue loading iOS certificates on macOS 26'
|
|
99
|
+
- url: 'https://www.openssl.org/docs/man3.0/man7/OSSL_PROVIDER-legacy.html'
|
|
100
|
+
label: 'OpenSSL 3.x legacy provider documentation'
|
|
101
|
+
- url: 'https://github.com/apple-actions/import-codesign-certs'
|
|
102
|
+
label: 'apple-actions/import-codesign-certs: opensslPkcsArgs input'
|
|
@@ -0,0 +1,126 @@
|
|
|
1
|
+
id: runner-environment-205
|
|
2
|
+
title: 'macOS self-hosted Runner.Listener silently stalls after AAD credential-refresh — ghost-busy state blocks queue'
|
|
3
|
+
category: runner-environment
|
|
4
|
+
severity: silent-failure
|
|
5
|
+
tags:
|
|
6
|
+
- self-hosted
|
|
7
|
+
- macos
|
|
8
|
+
- apple-silicon
|
|
9
|
+
- listener
|
|
10
|
+
- aad
|
|
11
|
+
- ghost-busy
|
|
12
|
+
- broker-reconnect
|
|
13
|
+
- credential-refresh
|
|
14
|
+
patterns:
|
|
15
|
+
- regex: 'AAD Correlation ID for this token request:\s*Unknown'
|
|
16
|
+
flags: 'i'
|
|
17
|
+
- regex: 'RSAFileKeyManager.*Loading RSA key parameters from file.*credentials_rsaparams'
|
|
18
|
+
flags: 'i'
|
|
19
|
+
- regex: 'GitHubActionsService.*AAD Correlation ID.*Unknown'
|
|
20
|
+
flags: 'i'
|
|
21
|
+
error_messages:
|
|
22
|
+
- '[INFO RSAFileKeyManager] Loading RSA key parameters from file .../.credentials_rsaparams'
|
|
23
|
+
- '[INFO GitHubActionsService] AAD Correlation ID for this token request: Unknown'
|
|
24
|
+
root_cause: |
|
|
25
|
+
On long-lived self-hosted macOS runners (v2.334.0+, Apple Silicon), the
|
|
26
|
+
Runner.Listener process can permanently stall after an AAD (Azure Active Directory)
|
|
27
|
+
credential-refresh event coincides with a broker session disconnect.
|
|
28
|
+
|
|
29
|
+
Normal broker long-poll timeouts produce "SocketException (89): Operation canceled"
|
|
30
|
+
entries and the listener successfully reconnects. However, when a broker disconnect
|
|
31
|
+
occurs at the same time as an AAD credential refresh, the listener logs its final
|
|
32
|
+
diagnostic sequence and then goes permanently silent:
|
|
33
|
+
[INFO RSAFileKeyManager] Loading RSA key parameters from file .../.credentials_rsaparams
|
|
34
|
+
[INFO GitHubActionsService] AAD Correlation ID for this token request: Unknown
|
|
35
|
+
|
|
36
|
+
After these lines, the main thread parks in pthread_cond_wait with no further diag
|
|
37
|
+
log output and no TCP ESTABLISHED connection to the broker. The OS process stays alive
|
|
38
|
+
(visible in ps/Activity Monitor/launchctl list), so launchd does not restart it. The
|
|
39
|
+
broker-side agent state continues to show the runner as "busy" from its last completed
|
|
40
|
+
job, stalling all subsequent queued jobs behind the phantom runner until an external
|
|
41
|
+
restart clears the state.
|
|
42
|
+
|
|
43
|
+
The trigger requires: runner lifetime longer than several hours (so a credential
|
|
44
|
+
refresh occurs), plus a broker disconnect at or immediately after the refresh boundary.
|
|
45
|
+
Observed simultaneously affecting all 4 of 4 macOS ARM64 runners on a single host
|
|
46
|
+
within a 32-minute window, causing a 4-hour queue stall. Source: actions/runner#4446.
|
|
47
|
+
fix: |
|
|
48
|
+
No platform-side fix available as of June 2026 (open issue).
|
|
49
|
+
|
|
50
|
+
Workaround — implement an out-of-band watchdog script/cron that:
|
|
51
|
+
1. Confirms the Runner.Listener PID has an ESTABLISHED TCP socket to the broker
|
|
52
|
+
(check with `lsof -p <pid> -i TCP | grep ESTABLISHED`)
|
|
53
|
+
2. Confirms the most recent entry in _diag/Runner_*.log is less than N minutes old
|
|
54
|
+
(e.g., 10 minutes)
|
|
55
|
+
3. If both checks fail, restarts the runner service:
|
|
56
|
+
- macOS (launchd):
|
|
57
|
+
launchctl bootout gui/$(id -u) ~/Library/LaunchAgents/actions.runner.<owner-repo>.<name>.plist
|
|
58
|
+
sleep 5
|
|
59
|
+
launchctl bootstrap gui/$(id -u) ~/Library/LaunchAgents/actions.runner.<owner-repo>.<name>.plist
|
|
60
|
+
- Linux (systemd):
|
|
61
|
+
sudo systemctl restart actions.runner.<owner-repo>.<name>.service
|
|
62
|
+
4. Optionally clear the broker-side ghost-busy state via REST API:
|
|
63
|
+
curl -X DELETE \
|
|
64
|
+
-H "Authorization: Bearer $GH_TOKEN" \
|
|
65
|
+
"https://api.github.com/repos/<owner>/<repo>/actions/runners/<runner_id>"
|
|
66
|
+
This removes the runner registration (clearing the stale busy state); re-run config.sh with a new registration token before the runner can accept jobs again.
|
|
67
|
+
|
|
68
|
+
Long-term: register macOS runners as ephemeral (config.sh --ephemeral) with a process supervisor
|
|
69
|
+
that restarts after each completed job, eliminating the multi-hour lifetime
|
|
70
|
+
that triggers the credential-refresh race.
|
|
71
|
+
fix_code:
|
|
72
|
+
- language: yaml
|
|
73
|
+
label: 'Watchdog workflow on separate runner — detect and restart stalled listeners'
|
|
74
|
+
code: |
|
|
75
|
+
# Separate monitoring workflow on a non-affected runner
|
|
76
|
+
# Runs every 15 minutes via cron
|
|
77
|
+
on:
|
|
78
|
+
schedule:
|
|
79
|
+
- cron: '*/15 * * * *'
|
|
80
|
+
jobs:
|
|
81
|
+
watchdog:
|
|
82
|
+
runs-on: ubuntu-latest # Use a separate hosted runner for the watchdog
|
|
83
|
+
steps:
|
|
84
|
+
- name: Check and restart stalled macOS listeners
|
|
85
|
+
env:
|
|
86
|
+
GH_TOKEN: ${{ secrets.RUNNER_MGMT_PAT }}
|
|
87
|
+
run: |
|
|
88
|
+
# List all self-hosted runners and check for stuck-busy ones
|
|
89
|
+
gh api /repos/${{ github.repository }}/actions/runners \
|
|
90
|
+
--jq '.runners[] | select(.status=="online" and .busy==true) | .id' \
|
|
91
|
+
| while read runner_id; do
|
|
92
|
+
echo "Runner $runner_id showing busy — may need investigation"
|
|
93
|
+
# Add custom liveness check here (SSH to host, check log freshness)
|
|
94
|
+
done
|
|
95
|
+
- language: bash
|
|
96
|
+
label: 'Shell watchdog — check listener log freshness and restart via launchctl'
|
|
97
|
+
code: |
|
|
98
|
+
#!/bin/bash
|
|
99
|
+
# Run on the macOS runner host via cron every 10 minutes
|
|
100
|
+
RUNNER_LABEL="owner-repo-runner-name"
|
|
101
|
+
PLIST="$HOME/Library/LaunchAgents/actions.runner.${RUNNER_LABEL}.plist"
|
|
102
|
+
DIAG_DIR="$HOME/actions-runner/_diag"
|
|
103
|
+
STALE_MINUTES=10
|
|
104
|
+
|
|
105
|
+
latest_log=$(ls -t "${DIAG_DIR}/Runner_"*.log 2>/dev/null | head -1)
|
|
106
|
+
if [[ -z "$latest_log" ]]; then exit 0; fi
|
|
107
|
+
|
|
108
|
+
age_minutes=$(( ($(date +%s) - $(stat -f %m "$latest_log")) / 60 ))
|
|
109
|
+
if (( age_minutes > STALE_MINUTES )); then
|
|
110
|
+
echo "Runner diag log stale for ${age_minutes}min — restarting..."
|
|
111
|
+
launchctl bootout "gui/$(id -u)" "$PLIST" 2>/dev/null
|
|
112
|
+
sleep 5
|
|
113
|
+
launchctl bootstrap "gui/$(id -u)" "$PLIST"
|
|
114
|
+
fi
|
|
115
|
+
prevention:
|
|
116
|
+
- 'Register macOS self-hosted runners as ephemeral (config.sh --ephemeral) with a process supervisor — this eliminates the multi-hour lifetime needed to trigger the credential-refresh race condition'
|
|
117
|
+
- 'Implement a log-freshness watchdog that monitors _diag/Runner_*.log modification time and restarts the launchd service if no new entries appear for > 10 minutes'
|
|
118
|
+
- 'Monitor GET /repos/{owner}/{repo}/actions/runners and alert on runners with busy=true for longer than your longest expected job duration'
|
|
119
|
+
- 'Limit runner lifetime with a cron-triggered scheduled restart between jobs (e.g., nightly) to reduce the window where credential refresh coincides with a broker disconnect'
|
|
120
|
+
docs:
|
|
121
|
+
- url: 'https://github.com/actions/runner/issues/4446'
|
|
122
|
+
label: 'actions/runner#4446: Listener silently exits broker-reconnect loop after AAD credential-refresh (ghost-busy)'
|
|
123
|
+
- url: 'https://docs.github.com/en/actions/hosting-your-own-runners/managing-self-hosted-runners/monitoring-and-troubleshooting-self-hosted-runners'
|
|
124
|
+
label: 'GitHub Docs: Monitoring and troubleshooting self-hosted runners'
|
|
125
|
+
- url: 'https://docs.github.com/en/rest/actions/self-hosted-runners'
|
|
126
|
+
label: 'GitHub REST API: Self-hosted runners'
|
|
@@ -0,0 +1,100 @@
|
|
|
1
|
+
id: runner-environment-202
|
|
2
|
+
title: 'Runner v2.334.0 "Download action repository" repeats for same action when references use different letter casing'
|
|
3
|
+
category: runner-environment
|
|
4
|
+
severity: warning
|
|
5
|
+
tags:
|
|
6
|
+
- runner
|
|
7
|
+
- action-resolution
|
|
8
|
+
- performance
|
|
9
|
+
- v2.334
|
|
10
|
+
- composite-actions
|
|
11
|
+
- case-sensitivity
|
|
12
|
+
patterns:
|
|
13
|
+
- regex: 'Download action repository.*already been downloaded'
|
|
14
|
+
flags: 'i'
|
|
15
|
+
- regex: 'Download action repository.*\d+\.\d+s.*Download action repository'
|
|
16
|
+
flags: 'si'
|
|
17
|
+
- regex: 'Resolving action.*batching.*dedup.*failed|action.*resolution.*duplicate.*case'
|
|
18
|
+
flags: 'i'
|
|
19
|
+
error_messages:
|
|
20
|
+
- 'Download action repository ''actions/checkout@v4'' (SHA:abc123...) 18.35s'
|
|
21
|
+
- 'Download action repository ''Actions/Checkout@v4'' (SHA:abc123...) 19.12s'
|
|
22
|
+
- 'Download action repository ''ACTIONS/CHECKOUT@v4'' (SHA:abc123...) 17.88s'
|
|
23
|
+
root_cause: |
|
|
24
|
+
Runner v2.334.0 introduced "batch and deduplicate action resolution across composite
|
|
25
|
+
depths" to speed up jobs with many actions. The deduplication logic uses a
|
|
26
|
+
case-sensitive equality check on the action reference string (owner/repo@ref). When
|
|
27
|
+
the same action is referenced with different capitalizations — for example,
|
|
28
|
+
actions/checkout@v4 in one workflow step and Actions/Checkout@v4 inside a composite
|
|
29
|
+
action — the deduplication logic treats them as distinct actions and downloads both.
|
|
30
|
+
|
|
31
|
+
This regression is tracked in actions/runner#3731. Each duplicate download takes
|
|
32
|
+
15-20 seconds, which multiplies across all case-variant references in a job. In
|
|
33
|
+
workflows with many composite actions or large action dependency trees, this can add
|
|
34
|
+
several minutes of silent overhead.
|
|
35
|
+
|
|
36
|
+
The issue does not cause a build failure — the downloads resolve to the same SHA
|
|
37
|
+
and the action runs once. The cost is purely in duplicate network traffic, API
|
|
38
|
+
calls, and elapsed time. Repeated downloads also count against GitHub API rate
|
|
39
|
+
limits for private runners or GHES installations.
|
|
40
|
+
|
|
41
|
+
Most commonly observed when:
|
|
42
|
+
- Composite actions use uppercase or title-case owners in their uses: fields
|
|
43
|
+
- Third-party composite actions reference actions/cache or actions/setup-node with
|
|
44
|
+
inconsistent casing
|
|
45
|
+
- Workflows mix community-contributed composite actions with differing conventions
|
|
46
|
+
fix: |
|
|
47
|
+
Canonicalize all action references to lowercase owner/repo throughout your
|
|
48
|
+
workflow files and composite action.yml files. GitHub API lookups are
|
|
49
|
+
case-insensitive, so actions/checkout and Actions/Checkout resolve identically,
|
|
50
|
+
but the v2.334.0 deduplication cache is case-sensitive.
|
|
51
|
+
|
|
52
|
+
After fixing, run the job again to confirm "Download action repository" appears
|
|
53
|
+
only once per unique action in the logs.
|
|
54
|
+
|
|
55
|
+
A fix for the runner-side deduplication (case-insensitive cache key) is tracked
|
|
56
|
+
in actions/runner#3731 and may be included in a future runner release.
|
|
57
|
+
fix_code:
|
|
58
|
+
- language: yaml
|
|
59
|
+
label: 'Bug: mixed-case action references cause repeated downloads'
|
|
60
|
+
code: |
|
|
61
|
+
# workflow.yml — uses lowercase
|
|
62
|
+
steps:
|
|
63
|
+
- uses: actions/checkout@v4
|
|
64
|
+
|
|
65
|
+
# internal/composite/action.yml — uses title-case (different casing)
|
|
66
|
+
runs:
|
|
67
|
+
using: composite
|
|
68
|
+
steps:
|
|
69
|
+
- uses: Actions/Checkout@v4 # triggers a second download in v2.334.0
|
|
70
|
+
- language: yaml
|
|
71
|
+
label: 'Fix: canonicalize to lowercase owner/repo in all workflow and composite action files'
|
|
72
|
+
code: |
|
|
73
|
+
# workflow.yml
|
|
74
|
+
steps:
|
|
75
|
+
- uses: actions/checkout@v4 # lowercase
|
|
76
|
+
|
|
77
|
+
# internal/composite/action.yml
|
|
78
|
+
runs:
|
|
79
|
+
using: composite
|
|
80
|
+
steps:
|
|
81
|
+
- uses: actions/checkout@v4 # lowercase matches — deduplicated correctly
|
|
82
|
+
- language: yaml
|
|
83
|
+
label: 'Diagnostic: detect duplicate downloads by searching job logs'
|
|
84
|
+
code: |
|
|
85
|
+
# In the Actions UI, open the job log and search for "Download action repository"
|
|
86
|
+
# If the same action appears more than once (even with different casing), you
|
|
87
|
+
# have duplicate downloads.
|
|
88
|
+
#
|
|
89
|
+
# CLI equivalent (with gh):
|
|
90
|
+
# gh run view <run-id> --log | grep -o "Download action repository '[^']*'" | tr '[:upper:]' '[:lower:]' | sort | uniq -c | sort -rn
|
|
91
|
+
prevention:
|
|
92
|
+
- 'Use consistent lowercase owner/repo for all action references across workflow files and composite actions'
|
|
93
|
+
- 'Add a CI lint step that greps workflow and composite action.yml files for uses: references with uppercase characters in owner/repo'
|
|
94
|
+
- 'Audit composite action.yml files for uppercase uses: references, as they are the most common source'
|
|
95
|
+
- 'Pin runner version to v2.333.0 as a temporary workaround while waiting for runner#3731 fix'
|
|
96
|
+
docs:
|
|
97
|
+
- url: 'https://github.com/actions/runner/issues/3731'
|
|
98
|
+
label: 'GitHub runner#3731: Download action repository called repeatedly for same action (case-sensitivity)'
|
|
99
|
+
- url: 'https://github.com/actions/runner/releases/tag/v2.334.0'
|
|
100
|
+
label: 'Runner v2.334.0 release notes: batch and deduplicate action resolution'
|
|
@@ -0,0 +1,103 @@
|
|
|
1
|
+
id: runner-environment-204
|
|
2
|
+
title: 'setup-node fails with EBADDEVENGINES when devEngines.packageManager requires a newer npm than Node ships'
|
|
3
|
+
category: runner-environment
|
|
4
|
+
severity: error
|
|
5
|
+
tags:
|
|
6
|
+
- setup-node
|
|
7
|
+
- npm
|
|
8
|
+
- devEngines
|
|
9
|
+
- EBADDEVENGINES
|
|
10
|
+
- package-manager
|
|
11
|
+
- node-22
|
|
12
|
+
- package-json
|
|
13
|
+
patterns:
|
|
14
|
+
- regex: 'npm error code EBADDEVENGINES'
|
|
15
|
+
flags: 'i'
|
|
16
|
+
- regex: 'EBADDEVENGINES.*Invalid devEngines\.packageManager'
|
|
17
|
+
flags: 'i'
|
|
18
|
+
- regex: 'Invalid semver version.*does not match.*for "packageManager"'
|
|
19
|
+
flags: 'i'
|
|
20
|
+
- regex: 'npm config get cache.*EBADDEVENGINES'
|
|
21
|
+
flags: 'si'
|
|
22
|
+
error_messages:
|
|
23
|
+
- 'npm error code EBADDEVENGINES'
|
|
24
|
+
- 'npm error EBADDEVENGINES The developer of this package has specified the following through devEngines'
|
|
25
|
+
- 'npm error EBADDEVENGINES Invalid devEngines.packageManager'
|
|
26
|
+
- 'npm error EBADDEVENGINES Invalid semver version "^11.10.0" does not match "10.9.7" for "packageManager"'
|
|
27
|
+
- '/opt/hostedtoolcache/node/22.22.2/x64/bin/npm config get cache'
|
|
28
|
+
root_cause: |
|
|
29
|
+
actions/setup-node detects the package manager by running `npm config get cache`
|
|
30
|
+
early in its initialization — before any user-specified npm upgrade step can run.
|
|
31
|
+
When package.json contains a `devEngines.packageManager` field specifying a minimum
|
|
32
|
+
npm version higher than the version bundled with the selected Node.js release, npm
|
|
33
|
+
enforces the devEngines constraint and exits with EBADDEVENGINES during that preflight
|
|
34
|
+
`npm config get cache` call, causing setup-node to fail immediately.
|
|
35
|
+
|
|
36
|
+
Example: Node.js v22 ships with npm v10. If package.json requires:
|
|
37
|
+
"devEngines": { "packageManager": { "name": "npm", "version": "^11.10.0" } }
|
|
38
|
+
then npm v10 raises EBADDEVENGINES, aborting setup-node before the user can run
|
|
39
|
+
`npm install --global npm@latest` in a subsequent step.
|
|
40
|
+
|
|
41
|
+
The `devEngines` field (npm RFC) shipped in npm 10.9.0 (bundled with Node.js 22) and allows
|
|
42
|
+
packages to declare their required toolchain. npm 10.x enforces it strictly by
|
|
43
|
+
default. Source: actions/setup-node#1553.
|
|
44
|
+
fix: |
|
|
45
|
+
Option 1 — Use the npm-version input on setup-node (recommended):
|
|
46
|
+
If setup-node applies the npm-version upgrade before its package manager detection,
|
|
47
|
+
the correct npm version will already be present when devEngines is checked.
|
|
48
|
+
Test this first; behaviour may depend on setup-node version.
|
|
49
|
+
|
|
50
|
+
Option 2 — Set devEngines.packageManager.onFail to "warn" in package.json:
|
|
51
|
+
npm 10.9+ supports an `onFail` sub-field that downgrades the constraint violation
|
|
52
|
+
from fatal error to warning. This allows CI to proceed and install the required
|
|
53
|
+
npm in a subsequent step.
|
|
54
|
+
|
|
55
|
+
Option 3 — Use explicit node-version instead of node-version-file:
|
|
56
|
+
When node-version-file is used, setup-node reads package.json to determine the
|
|
57
|
+
Node version, which may trigger the devEngines check. Using an explicit
|
|
58
|
+
node-version string avoids reading package.json entirely for version resolution.
|
|
59
|
+
|
|
60
|
+
Option 4 — Remove or relax devEngines.packageManager for CI use:
|
|
61
|
+
Consider whether the devEngines constraint is necessary for CI vs local dev.
|
|
62
|
+
A `.npmrc` with `engine-strict=false` in the repo will disable enforcement
|
|
63
|
+
for all npm operations in that directory, but this also affects local installs.
|
|
64
|
+
fix_code:
|
|
65
|
+
- language: yaml
|
|
66
|
+
label: 'Option 1 — use npm-version input to pre-install required npm'
|
|
67
|
+
code: |
|
|
68
|
+
- uses: actions/setup-node@v6
|
|
69
|
+
with:
|
|
70
|
+
node-version-file: package.json # Reads engines.node
|
|
71
|
+
npm-version: '11' # Intended to pre-install npm 11 before the devEngines check; confirm your setup-node version supports this input
|
|
72
|
+
- language: yaml
|
|
73
|
+
label: 'Option 3 — use explicit node-version to skip package.json parsing'
|
|
74
|
+
code: |
|
|
75
|
+
- uses: actions/setup-node@v6
|
|
76
|
+
with:
|
|
77
|
+
node-version: '22' # Explicit version; does not read package.json
|
|
78
|
+
# Install required npm explicitly in a later step
|
|
79
|
+
- run: npm install --global npm@^11.10.0
|
|
80
|
+
- language: json
|
|
81
|
+
label: 'Option 2 — set onFail: warn in package.json to prevent fatal error'
|
|
82
|
+
code: |
|
|
83
|
+
{
|
|
84
|
+
"devEngines": {
|
|
85
|
+
"packageManager": {
|
|
86
|
+
"name": "npm",
|
|
87
|
+
"version": "^11.10.0",
|
|
88
|
+
"onFail": "warn"
|
|
89
|
+
}
|
|
90
|
+
}
|
|
91
|
+
}
|
|
92
|
+
prevention:
|
|
93
|
+
- 'When adopting devEngines.packageManager in package.json, test the CI setup-node step immediately — it may fail before your npm upgrade step can run'
|
|
94
|
+
- 'Set devEngines.packageManager.onFail: "warn" in any repo where CI installs the required npm version dynamically rather than having it pre-installed'
|
|
95
|
+
- 'Use the npm-version input on setup-node when your package.json requires a specific npm version via devEngines'
|
|
96
|
+
- 'Avoid relying on node-version-file if your package.json contains strict devEngines constraints and you need to upgrade the package manager in CI'
|
|
97
|
+
docs:
|
|
98
|
+
- url: 'https://github.com/actions/setup-node/issues/1553'
|
|
99
|
+
label: 'actions/setup-node#1553: npm config get cache fails with EBADDEVENGINES'
|
|
100
|
+
- url: 'https://docs.npmjs.com/cli/v11/configuring-npm/package-json#devengines'
|
|
101
|
+
label: 'npm docs: devEngines field in package.json'
|
|
102
|
+
- url: 'https://github.com/nodejs/package-maintenance/issues/539'
|
|
103
|
+
label: 'Node.js RFC: devEngines field specification'
|
|
@@ -0,0 +1,105 @@
|
|
|
1
|
+
id: silent-failures-107
|
|
2
|
+
title: "dorny/paths-filter Reports All Files Changed When 'before' Field Missing on workflow_run"
|
|
3
|
+
category: silent-failures
|
|
4
|
+
severity: silent-failure
|
|
5
|
+
tags:
|
|
6
|
+
- paths-filter
|
|
7
|
+
- workflow_run
|
|
8
|
+
- before-field
|
|
9
|
+
- silent-failure
|
|
10
|
+
- fork-pr
|
|
11
|
+
- changed-files
|
|
12
|
+
- wrong-output
|
|
13
|
+
patterns:
|
|
14
|
+
- regex: '''before'' field is missing in event payload'
|
|
15
|
+
flags: 'i'
|
|
16
|
+
- regex: 'changes will be detected from last commit'
|
|
17
|
+
flags: 'i'
|
|
18
|
+
- regex: 'paths-filter.*workflow.run'
|
|
19
|
+
flags: 'i'
|
|
20
|
+
error_messages:
|
|
21
|
+
- "Warning: 'before' field is missing in event payload - changes will be detected from last commit"
|
|
22
|
+
root_cause: |
|
|
23
|
+
`dorny/paths-filter` (v3 and earlier) contains an internal code path that requires a
|
|
24
|
+
`before` SHA field from the GitHub event payload to compute the diff base. When used in
|
|
25
|
+
a `workflow_run`-triggered workflow, the event payload does not contain a `before` field
|
|
26
|
+
(it only has `head_sha`, `head_branch`, etc.) — so the action falls into a fallback path
|
|
27
|
+
that calls `getChangesInLastCommit()` instead of performing a proper merge-base diff.
|
|
28
|
+
|
|
29
|
+
The result: **every file in the repository is reported as "changed"**, defeating any
|
|
30
|
+
attempt to use paths-filter as a change detection gate in `workflow_run` workflows.
|
|
31
|
+
|
|
32
|
+
Root cause in source code (paths-filter v3):
|
|
33
|
+
The action's `getChangedFilesFromGit()` logic normalizes both `base` and `head` to short
|
|
34
|
+
branch names. On `workflow_run`, `github.context.ref` resolves to the same ref as the
|
|
35
|
+
user-supplied `base:` (e.g. both become "main"). The `isBaseSameAsHead` check triggers
|
|
36
|
+
the fallback: if `base === head` AND `beforeSha` is null, the action emits the warning
|
|
37
|
+
and returns only the last commit's changes — which in a merge-queue or PR workflow is
|
|
38
|
+
completely wrong.
|
|
39
|
+
|
|
40
|
+
The same bug occurs when:
|
|
41
|
+
- Using paths-filter in a `workflow_run` workflow without explicitly passing `ref:` as a SHA.
|
|
42
|
+
- Running from a forked PR where the `before` field is absent from the push payload.
|
|
43
|
+
- Any workflow_run where `github.context.ref` resolves to the same branch name as the
|
|
44
|
+
`base:` input after short-name normalization.
|
|
45
|
+
fix: |
|
|
46
|
+
Pass `ref:` explicitly as the HEAD SHA of the triggering workflow run. Using a full SHA
|
|
47
|
+
prevents the `isBaseSameAsHead` short-circuit because a SHA never equals a branch name
|
|
48
|
+
after normalization:
|
|
49
|
+
|
|
50
|
+
- uses: dorny/paths-filter@v3
|
|
51
|
+
with:
|
|
52
|
+
base: 'main'
|
|
53
|
+
ref: ${{ github.event.workflow_run.head_sha }}
|
|
54
|
+
filters: |
|
|
55
|
+
backend:
|
|
56
|
+
- 'src/**'
|
|
57
|
+
|
|
58
|
+
Also ensure the prior `actions/checkout` step uses `fetch-depth: 0` so the action has
|
|
59
|
+
both `base` and `ref` locally available for `getChangesSinceMergeBase`.
|
|
60
|
+
|
|
61
|
+
For fork PRs: use `ref: ${{ github.event.workflow_run.head_sha }}` and
|
|
62
|
+
`base: ${{ github.event.repository.default_branch }}` (the `workflow_run` payload has no `base_branch` field).
|
|
63
|
+
|
|
64
|
+
Alternative: upgrade to dorny/paths-filter v4.0.1+ which adds native `merge_group`
|
|
65
|
+
support and has improved ref handling (check release notes for workflow_run fixes).
|
|
66
|
+
fix_code:
|
|
67
|
+
- language: yaml
|
|
68
|
+
label: 'Pass ref as SHA to avoid isBaseSameAsHead false-positive on workflow_run'
|
|
69
|
+
code: |
|
|
70
|
+
on:
|
|
71
|
+
workflow_run:
|
|
72
|
+
workflows: ["CI"]
|
|
73
|
+
types: [completed]
|
|
74
|
+
|
|
75
|
+
jobs:
|
|
76
|
+
check-changes:
|
|
77
|
+
runs-on: ubuntu-latest
|
|
78
|
+
outputs:
|
|
79
|
+
backend: ${{ steps.filter.outputs.backend }}
|
|
80
|
+
steps:
|
|
81
|
+
- uses: actions/checkout@v4
|
|
82
|
+
with:
|
|
83
|
+
ref: ${{ github.event.workflow_run.head_sha }}
|
|
84
|
+
fetch-depth: 0
|
|
85
|
+
|
|
86
|
+
- uses: dorny/paths-filter@v3
|
|
87
|
+
id: filter
|
|
88
|
+
with:
|
|
89
|
+
base: ${{ github.event.repository.default_branch }}
|
|
90
|
+
# REQUIRED: pass ref as SHA, not branch name
|
|
91
|
+
# Without this, base == head after normalization → all files reported as changed
|
|
92
|
+
ref: ${{ github.event.workflow_run.head_sha }}
|
|
93
|
+
filters: |
|
|
94
|
+
backend:
|
|
95
|
+
- 'src/**'
|
|
96
|
+
prevention:
|
|
97
|
+
- 'Always pass ref: ${{ github.event.workflow_run.head_sha }} when using paths-filter in workflow_run contexts.'
|
|
98
|
+
- 'Never rely on implicit ref resolution in workflow_run workflows — the context.ref is the default branch, not the PR head.'
|
|
99
|
+
- 'After fixing, add a smoke-test PR that changes only an unmonitored path and verify paths-filter returns false for the guarded path.'
|
|
100
|
+
- 'Consider tj-actions/changed-files as an alternative; check its workflow_run documentation before switching.'
|
|
101
|
+
docs:
|
|
102
|
+
- url: 'https://github.com/dorny/paths-filter/issues/261'
|
|
103
|
+
label: "Bug: 'before' field is missing when it should not even be used (11 reactions, open)"
|
|
104
|
+
- url: 'https://github.com/dorny/paths-filter'
|
|
105
|
+
label: 'dorny/paths-filter — conditionally run actions based on modified files'
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
id: silent-failures-106
|
|
2
|
+
title: 'windows-11-arm shell: bash Steps Intermittently Produce Zero Output and Exit 0'
|
|
3
|
+
category: silent-failures
|
|
4
|
+
severity: silent-failure
|
|
5
|
+
tags:
|
|
6
|
+
- windows-11-arm
|
|
7
|
+
- bash
|
|
8
|
+
- composite-action
|
|
9
|
+
- arm64
|
|
10
|
+
- wow64
|
|
11
|
+
- silent-failure
|
|
12
|
+
- intermittent
|
|
13
|
+
patterns:
|
|
14
|
+
- regex: 'error: no such command: `[a-z]'
|
|
15
|
+
flags: 'i'
|
|
16
|
+
- regex: 'error: no such subcommand: `[a-z]'
|
|
17
|
+
flags: 'i'
|
|
18
|
+
- regex: 'error\[E0463\]: can''t find crate for `[a-z]'
|
|
19
|
+
flags: 'i'
|
|
20
|
+
error_messages:
|
|
21
|
+
- 'error: no such command: `make`'
|
|
22
|
+
- 'error: no such command: `expand`'
|
|
23
|
+
- 'error: no such subcommand: `make`'
|
|
24
|
+
root_cause: |
|
|
25
|
+
On `windows-11-arm` runners, `shell: bash` resolves to `C:\Program Files\Git\bin\bash.EXE`,
|
|
26
|
+
which is an x86_64 binary running under WoW64 (Windows-on-Windows 64-bit) ARM64
|
|
27
|
+
emulation. The `actions/runner` binary on this platform is a **native ARM64** process.
|
|
28
|
+
When the native ARM64 runner spawns the x86_64 bash.EXE as a child process, an
|
|
29
|
+
intermittent failure in the WoW64 cross-architecture process launch causes the bash
|
|
30
|
+
process to start but never execute its script body. The step logs the command invocation
|
|
31
|
+
and environment dump, then exits 0 with zero script output — the script never ran.
|
|
32
|
+
|
|
33
|
+
Key characteristics:
|
|
34
|
+
- Failure is transient per-process-invocation: two consecutive bash steps in the same
|
|
35
|
+
job can have one succeed and one fail, ruling out image or runner misconfiguration.
|
|
36
|
+
- Zero-output steps take 0-3 seconds; successful steps take 2-7 seconds.
|
|
37
|
+
- Only occurs on `windows-11-arm`; `windows-latest` (amd64) is unaffected.
|
|
38
|
+
- The downstream step then fails with "no such command" or "not installed" errors because
|
|
39
|
+
the tool that was supposed to be installed by the bash step is missing.
|
|
40
|
+
- Reported across multiple independent Microsoft repositories
|
|
41
|
+
(microsoft/Windows-rust-driver-samples, microsoft/windows-drivers-rs), confirming
|
|
42
|
+
this is a platform-level runner bug, not a workflow configuration issue.
|
|
43
|
+
|
|
44
|
+
Root cause: intermittent x86_64 process execution failure under ARM64 WoW64 emulation
|
|
45
|
+
when launched from a native ARM64 parent process. No fix from the platform side as of
|
|
46
|
+
April 2026; the tracking issue is still open in actions/partner-runner-images (#169).
|
|
47
|
+
fix: |
|
|
48
|
+
Option 1 (recommended): invoke bash from PowerShell as a workaround. Wrapping the bash
|
|
49
|
+
call in a pwsh step bypasses the native-arm64 → x86_64 WoW64 launch that triggers the
|
|
50
|
+
bug, because PowerShell on windows-11-arm is itself x86_64, so spawning x86_64 bash
|
|
51
|
+
from x86_64 pwsh does not cross the ARM64/x86_64 boundary:
|
|
52
|
+
|
|
53
|
+
- name: Run script via pwsh workaround
|
|
54
|
+
shell: pwsh
|
|
55
|
+
run: |
|
|
56
|
+
& 'C:\Program Files\Git\bin\bash.EXE' -c "./your-script.sh"
|
|
57
|
+
if ($LASTEXITCODE -ne 0) { exit $LASTEXITCODE }
|
|
58
|
+
|
|
59
|
+
Option 2: use `shell: pwsh` for the entire step and rewrite the logic in PowerShell.
|
|
60
|
+
|
|
61
|
+
Option 3: add a retry wrapper that re-runs the composite action step if the output
|
|
62
|
+
is unexpectedly empty (e.g. check for absence of the installed binary before proceeding
|
|
63
|
+
and re-invoke if missing).
|
|
64
|
+
|
|
65
|
+
Option 4: avoid `windows-11-arm` in the runner matrix until the platform bug is fixed
|
|
66
|
+
upstream, or pin to `windows-latest` (amd64) for bash-heavy composite actions.
|
|
67
|
+
fix_code:
|
|
68
|
+
- language: yaml
|
|
69
|
+
label: 'Invoke bash from pwsh to avoid the WoW64 ARM64 launch bug'
|
|
70
|
+
code: |
|
|
71
|
+
- name: Install tool (windows-11-arm workaround)
|
|
72
|
+
shell: pwsh
|
|
73
|
+
run: |
|
|
74
|
+
# Call x86_64 bash from x86_64 pwsh — avoids native-arm64 → x86_64 WoW64 issue
|
|
75
|
+
& 'C:\Program Files\Git\bin\bash.EXE' --noprofile --norc `
|
|
76
|
+
"${env:GITHUB_ACTION_PATH}/main.sh"
|
|
77
|
+
if ($LASTEXITCODE -ne 0) { exit $LASTEXITCODE }
|
|
78
|
+
- language: yaml
|
|
79
|
+
label: 'Exclude windows-11-arm from bash-heavy matrix jobs'
|
|
80
|
+
code: |
|
|
81
|
+
jobs:
|
|
82
|
+
build:
|
|
83
|
+
strategy:
|
|
84
|
+
matrix:
|
|
85
|
+
os: [windows-latest, ubuntu-latest, macos-latest]
|
|
86
|
+
# windows-11-arm excluded: shell: bash intermittent zero-output bug
|
|
87
|
+
# See: https://github.com/actions/partner-runner-images/issues/169
|
|
88
|
+
prevention:
|
|
89
|
+
- 'Test composite actions on windows-11-arm at scale before relying on them in production CI.'
|
|
90
|
+
- 'Add post-step verification that checks for the installed binary; fail explicitly rather than silently continuing.'
|
|
91
|
+
- 'Monitor taiki-e/install-action release notes for the official workaround (pwsh retry wrapper) if using that action.'
|
|
92
|
+
- 'Prefer shell: pwsh over shell: bash for setup/install scripts in composite actions targeting windows-11-arm.'
|
|
93
|
+
docs:
|
|
94
|
+
- url: 'https://github.com/actions/partner-runner-images/issues/169'
|
|
95
|
+
label: '[windows-11-arm] Composite action bash steps intermittently produce zero output and exit 0'
|
|
96
|
+
- url: 'https://github.com/taiki-e/install-action/issues/1562'
|
|
97
|
+
label: 'On windows-11-arm, sometimes does nothing, but succeeds'
|
|
98
|
+
- url: 'https://github.com/taiki-e/install-action/pull/1647'
|
|
99
|
+
label: 'Call main.sh from pwsh on Windows to work around windows-11-arm runner bug'
|
package/package.json
CHANGED