@htekdev/actions-debugger 1.0.14 → 1.0.16
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/db/search.js +3 -1
- package/dist/db/search.js.map +1 -1
- package/dist/tools/suggest-fix.d.ts.map +1 -1
- package/dist/tools/suggest-fix.js +5 -1
- package/dist/tools/suggest-fix.js.map +1 -1
- package/errors/caching-artifacts/cache-key-too-long.yml +93 -0
- package/errors/caching-artifacts/cache-path-not-exist-skipped.yml +152 -0
- package/errors/caching-artifacts/cache-save-same-key-html-conflict.yml +109 -0
- package/errors/caching-artifacts/docker-buildx-gha-cache-capacity.yml +107 -0
- package/errors/caching-artifacts/setup-ruby-bundler-ephemeral-workdir-cache-miss.yml +147 -0
- package/errors/caching-artifacts/upload-artifact-v3-retirement-blocked.yml +123 -0
- package/errors/caching-artifacts/upload-artifact-v4-large-file-macos-hang.yml +111 -0
- package/errors/concurrency-timing/always-cleanup-5min-forced-kill.yml +140 -0
- package/errors/concurrency-timing/concurrency-group-env-context-undefined.yml +99 -0
- package/errors/concurrency-timing/required-check-pending-path-filter-skip.yml +160 -0
- package/errors/concurrency-timing/wait-timer-cancel-in-progress-starvation.yml +125 -0
- package/errors/known-unsolved/composite-action-step-timeout-minutes-ignored.yml +146 -0
- package/errors/known-unsolved/reusable-workflow-no-composite-action-call.yml +116 -0
- package/errors/known-unsolved/schedule-trigger-default-branch-only.yml +113 -0
- package/errors/known-unsolved/secrets-not-allowed-in-if-conditions.yml +149 -0
- package/errors/known-unsolved/workflow-50-rerun-limit.yml +110 -0
- package/errors/permissions-auth/check-run-status-modification-blocked.yml +134 -0
- package/errors/permissions-auth/dependabot-pr-secrets-unavailable.yml +133 -0
- package/errors/permissions-auth/fine-grained-pat-deployment-write-required.yml +146 -0
- package/errors/permissions-auth/github-app-installation-token-new-format.yml +124 -0
- package/errors/permissions-auth/github-packages-read-requires-packages-permission.yml +128 -0
- package/errors/permissions-auth/oidc-id-token-write-permission-missing.yml +169 -0
- package/errors/permissions-auth/permissions-empty-block-removes-contents-read.yml +97 -0
- package/errors/permissions-auth/reusable-workflow-permissions-not-inherited.yml +114 -0
- package/errors/runner-environment/checkout-windows-ebusy-lock.yml +124 -0
- package/errors/runner-environment/deprecated-action-version-auto-rejected.yml +89 -0
- package/errors/runner-environment/github-hosted-runner-disk-space-full.yml +85 -0
- package/errors/runner-environment/github-path-same-step-not-found.yml +114 -0
- package/errors/runner-environment/github-script-v6-octokit-rest-actions-not-function.yml +87 -0
- package/errors/runner-environment/macos-13-deprecation-brownout.yml +93 -0
- package/errors/runner-environment/macos-15-mono-nuget-removed.yml +151 -0
- package/errors/runner-environment/macos-15-xcode-simulator-sdk-policy.yml +141 -0
- package/errors/runner-environment/multi-runtime-nov2025-removal.yml +120 -0
- package/errors/runner-environment/runner-oom-exit-code-137.yml +117 -0
- package/errors/runner-environment/setup-go-go123-telemetry-cache-failure.yml +92 -0
- package/errors/runner-environment/setup-java-distribution-required.yml +108 -0
- package/errors/runner-environment/ubuntu-2004-retirement-brownout.yml +107 -0
- package/errors/runner-environment/windows-latest-d-drive-removed.yml +104 -0
- package/errors/runner-environment/windows-vs2026-cuda-host-compiler-unsupported.yml +145 -0
- package/errors/silent-failures/event-commits-empty-on-workflow-dispatch.yml +110 -0
- package/errors/silent-failures/fetch-tags-depth-one-silent-no-op.yml +77 -0
- package/errors/silent-failures/github-env-multiline-value-truncated.yml +127 -0
- package/errors/silent-failures/github-sha-pr-merge-commit-not-head.yml +150 -0
- package/errors/silent-failures/job-output-masked-as-secret-empty.yml +147 -0
- package/errors/silent-failures/upload-artifact-permissions-stripped.yml +98 -0
- package/errors/triggers/pull-request-branches-filter-matches-base-not-head.yml +140 -0
- package/errors/triggers/push-event-fires-on-branch-delete.yml +129 -0
- package/errors/triggers/push-first-commit-before-sha-zeros.yml +160 -0
- package/errors/yaml-syntax/continue-on-error-env-context-rejected.yml +130 -0
- package/errors/yaml-syntax/fromjson-empty-string-crash.yml +99 -0
- package/errors/yaml-syntax/if-bang-negation-yaml-tag.yml +145 -0
- package/errors/yaml-syntax/local-action-path-always-top-level.yml +142 -0
- package/package.json +1 -1
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
id: caching-artifacts-018
|
|
2
|
+
title: "setup-ruby Bundler Cache Always Misses on Ephemeral Self-Hosted Runners Due to Workdir in Cache Key"
|
|
3
|
+
category: caching-artifacts
|
|
4
|
+
severity: silent-failure
|
|
5
|
+
tags:
|
|
6
|
+
- ruby
|
|
7
|
+
- setup-ruby
|
|
8
|
+
- bundler
|
|
9
|
+
- cache
|
|
10
|
+
- self-hosted
|
|
11
|
+
- ephemeral
|
|
12
|
+
- cache-miss
|
|
13
|
+
- cache-key
|
|
14
|
+
patterns:
|
|
15
|
+
- regex: "Cache not found for.*setup-ruby-bundler-cache.*wd-.*[0-9]{8,}"
|
|
16
|
+
flags: "i"
|
|
17
|
+
- regex: "setup-ruby-bundler-cache.*wd-\\/.*[a-f0-9]{8,}.*Gemfile\\.lock"
|
|
18
|
+
flags: "i"
|
|
19
|
+
- regex: "No cache found.*setup-ruby-bundler.*workdir.*ephemeral"
|
|
20
|
+
flags: "i"
|
|
21
|
+
- regex: "Cache miss.*bundler.*setup-ruby.*self-hosted"
|
|
22
|
+
flags: "i"
|
|
23
|
+
error_messages:
|
|
24
|
+
- "Cache not found for input keys: setup-ruby-bundler-cache-v6-ubuntu-22.04-x64-ruby-3.3.6-wd-/codebuild/output/src1813367680/src/actions-runner/_work/myapp/myapp-with--without--only--Gemfile.lock-3f96ad38..."
|
|
25
|
+
- "No cache found for key: setup-ruby-bundler-cache-v6-..."
|
|
26
|
+
root_cause: |
|
|
27
|
+
`ruby/setup-ruby` includes the absolute working directory path (`wd-<path>`) as a
|
|
28
|
+
component of the Bundler cache key. The full cache key format is:
|
|
29
|
+
|
|
30
|
+
setup-ruby-bundler-cache-v{VERSION}-{OS}-{ARCH}-ruby-{RUBY_VERSION}-wd-{WORKDIR}-...{GEMFILE_HASH}
|
|
31
|
+
|
|
32
|
+
On GitHub-hosted runners, the working directory is identical on every run
|
|
33
|
+
(`/home/runner/work/{repo}/{repo}` on Linux), so the cache key is stable across runs
|
|
34
|
+
and the cache is reused correctly.
|
|
35
|
+
|
|
36
|
+
On **ephemeral self-hosted runners** (e.g., AWS CodeBuild, container-based runners with
|
|
37
|
+
unique workspace paths, or any runner that generates a unique working directory path
|
|
38
|
+
per run for isolation), the `wd-` component changes with each run. This makes every
|
|
39
|
+
cache lookup a miss — the Bundler gems are reinstalled from scratch on every run,
|
|
40
|
+
completely defeating the purpose of caching.
|
|
41
|
+
|
|
42
|
+
This affects:
|
|
43
|
+
- AWS CodeBuild with GitHub Actions runners (CodeBuild generates unique src paths per build)
|
|
44
|
+
- Kubernetes-based ephemeral runners where the pod workspace path includes a job ID
|
|
45
|
+
- Any custom runner setup that includes a timestamp or job ID in the workspace path
|
|
46
|
+
|
|
47
|
+
The issue is open (ruby/setup-ruby#904, April 2026) and has no upstream fix yet as of
|
|
48
|
+
mid-2026. The workdir was included in the cache key to allow multiple Ruby projects to
|
|
49
|
+
have separate caches within the same repository, but it breaks ephemeral runners as a
|
|
50
|
+
side effect.
|
|
51
|
+
|
|
52
|
+
Note: This is a **silent failure** — no error is thrown; the workflow succeeds but Bundler
|
|
53
|
+
installs all gems on every run, causing slow CI with no visible warning about cache
|
|
54
|
+
effectiveness.
|
|
55
|
+
fix: |
|
|
56
|
+
**Workaround 1 — Disable setup-ruby's built-in bundler cache, use actions/cache manually**:
|
|
57
|
+
Set `bundler-cache: false` in setup-ruby and manage the Bundler cache yourself with
|
|
58
|
+
`actions/cache`, caching the `BUNDLE_PATH` directory under a key derived from the `Gemfile.lock` hash
|
|
59
|
+
(no workdir component). This is the most reliable fix for ephemeral runners.
|
|
60
|
+
|
|
61
|
+
**Workaround 2 — Normalize the working directory** (if your runner supports it):
|
|
62
|
+
Configure your runner to use a fixed, predictable working directory path instead of
|
|
63
|
+
a unique-per-job path. This makes the setup-ruby cache key stable.
|
|
64
|
+
|
|
65
|
+
**Workaround 3 — Cache the Ruby gems directory directly**:
|
|
66
|
+
Cache `~/.bundle` or the Bundler install path rather than the per-project `vendor/bundle`,
|
|
67
|
+
since the home directory path is typically stable even on ephemeral runners.
|
|
68
|
+
fix_code:
|
|
69
|
+
- language: yaml
|
|
70
|
+
label: "Disable setup-ruby cache and use actions/cache with stable key"
|
|
71
|
+
code: |
|
|
72
|
+
jobs:
|
|
73
|
+
test:
|
|
74
|
+
runs-on: self-hosted # ephemeral runner
|
|
75
|
+
steps:
|
|
76
|
+
- uses: actions/checkout@v4
|
|
77
|
+
|
|
78
|
+
- uses: ruby/setup-ruby@v1
|
|
79
|
+
with:
|
|
80
|
+
ruby-version: '3.3'
|
|
81
|
+
bundler-cache: false # Disable built-in cache (broken on ephemeral runners)
|
|
82
|
+
|
|
83
|
+
# Cache gems using a workdir-independent key
|
|
84
|
+
- uses: actions/cache@v4
|
|
85
|
+
id: bundle-cache
|
|
86
|
+
with:
|
|
87
|
+
path: ~/.bundle/cache
|
|
88
|
+
key: ${{ runner.os }}-bundle-${{ hashFiles('**/Gemfile.lock') }}
|
|
89
|
+
restore-keys: |
|
|
90
|
+
${{ runner.os }}-bundle-
|
|
91
|
+
|
|
92
|
+
- name: Install gems
|
|
93
|
+
run: bundle install --path ~/.bundle/cache
|
|
94
|
+
if: steps.bundle-cache.outputs.cache-hit != 'true'
|
|
95
|
+
|
|
96
|
+
- name: Bundle check
|
|
97
|
+
run: bundle check || bundle install
|
|
98
|
+
- language: yaml
|
|
99
|
+
label: "Cache vendor/bundle with BUNDLE_PATH and stable key (no workdir)"
|
|
100
|
+
code: |
|
|
101
|
+
jobs:
|
|
102
|
+
test:
|
|
103
|
+
runs-on: self-hosted
|
|
104
|
+
steps:
|
|
105
|
+
- uses: actions/checkout@v4
|
|
106
|
+
|
|
107
|
+
- uses: ruby/setup-ruby@v1
|
|
108
|
+
with:
|
|
109
|
+
ruby-version: '3.3'
|
|
110
|
+
bundler-cache: false
|
|
111
|
+
|
|
112
|
+
- uses: actions/cache@v4
|
|
113
|
+
with:
|
|
114
|
+
path: vendor/bundle
|
|
115
|
+
key: ${{ runner.os }}-gems-${{ hashFiles('**/Gemfile.lock') }}
|
|
116
|
+
restore-keys: |
|
|
117
|
+
${{ runner.os }}-gems-
|
|
118
|
+
|
|
119
|
+
- name: Install gems
|
|
120
|
+
run: |
|
|
121
|
+
bundle config path vendor/bundle
|
|
122
|
+
bundle install --jobs 4 --retry 3
|
|
123
|
+
- language: yaml
|
|
124
|
+
label: "GitHub-hosted runner — built-in cache works fine (no workaround needed)"
|
|
125
|
+
code: |
|
|
126
|
+
jobs:
|
|
127
|
+
test:
|
|
128
|
+
runs-on: ubuntu-latest # GitHub-hosted: stable workdir, cache works
|
|
129
|
+
steps:
|
|
130
|
+
- uses: actions/checkout@v4
|
|
131
|
+
|
|
132
|
+
- uses: ruby/setup-ruby@v1
|
|
133
|
+
with:
|
|
134
|
+
ruby-version: '3.3'
|
|
135
|
+
bundler-cache: true # Works correctly on GitHub-hosted runners
|
|
136
|
+
prevention:
|
|
137
|
+
- "Always use `bundler-cache: false` with `ruby/setup-ruby` on ephemeral self-hosted runners (CodeBuild, ephemeral Kubernetes runners) and manage caching manually."
|
|
138
|
+
- "Verify cache effectiveness by checking the `cache-hit` output and monitoring job duration across runs — a stable cache hit means no Bundler reinstall."
|
|
139
|
+
- "Use workdir-independent cache keys: `${{ runner.os }}-bundle-${{ hashFiles('**/Gemfile.lock') }}` instead of paths that include dynamic segments."
|
|
140
|
+
- "Track ruby/setup-ruby#904 for an upstream fix that makes the cache key workdir-independent by default."
|
|
141
|
+
docs:
|
|
142
|
+
- url: "https://github.com/ruby/setup-ruby/issues/904"
|
|
143
|
+
label: "ruby/setup-ruby#904: Bundler cache not working due to ephemeral workdir (open)"
|
|
144
|
+
- url: "https://github.com/ruby/setup-ruby#caching-bundle-install-automatically"
|
|
145
|
+
label: "setup-ruby README: Caching bundle install automatically"
|
|
146
|
+
- url: "https://github.com/actions/cache/blob/main/README.md"
|
|
147
|
+
label: "actions/cache README: manual caching approach"
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
id: caching-artifacts-016
|
|
2
|
+
title: "actions/upload-artifact v3 Automatically Blocked After January 2025 Retirement"
|
|
3
|
+
category: caching-artifacts
|
|
4
|
+
severity: error
|
|
5
|
+
tags:
|
|
6
|
+
- upload-artifact
|
|
7
|
+
- download-artifact
|
|
8
|
+
- v3
|
|
9
|
+
- deprecated
|
|
10
|
+
- retirement
|
|
11
|
+
- brownout
|
|
12
|
+
patterns:
|
|
13
|
+
- regex: "automatically failed.*deprecated.*version.*upload-artifact"
|
|
14
|
+
flags: "i"
|
|
15
|
+
- regex: "This request has been automatically failed because it uses a deprecated version"
|
|
16
|
+
flags: "i"
|
|
17
|
+
- regex: "upload-artifact.*v3.*deprecated"
|
|
18
|
+
flags: "i"
|
|
19
|
+
- regex: "download-artifact.*v3.*deprecated"
|
|
20
|
+
flags: "i"
|
|
21
|
+
error_messages:
|
|
22
|
+
- "This request has been automatically failed because it uses a deprecated version of actions/upload-artifact: v3"
|
|
23
|
+
- "This request has been automatically failed because it uses a deprecated version of actions/download-artifact: v3"
|
|
24
|
+
root_cause: |
|
|
25
|
+
GitHub retired **actions/upload-artifact@v3** and **actions/download-artifact@v3**
|
|
26
|
+
on January 30, 2025. After the retirement date, any workflow still calling v3 of
|
|
27
|
+
these actions receives an immediate hard failure at the step level — the action
|
|
28
|
+
does not run; instead, the runner returns:
|
|
29
|
+
|
|
30
|
+
"This request has been automatically failed because it uses a deprecated
|
|
31
|
+
version of actions/upload-artifact: v3"
|
|
32
|
+
|
|
33
|
+
**Timeline:**
|
|
34
|
+
- April 16, 2024 — GitHub announced v3 deprecation and scheduled retirement
|
|
35
|
+
- November 2024 → January 2025 — Brownout periods (scheduled windows of temporary failures)
|
|
36
|
+
- January 30, 2025 — Full retirement: all v3 calls blocked unconditionally
|
|
37
|
+
|
|
38
|
+
**Why repos are still affected:**
|
|
39
|
+
- Many CI configurations were written before the deprecation announcement and
|
|
40
|
+
were never updated
|
|
41
|
+
- Reusable workflows called from other orgs/repos may reference v3 internally
|
|
42
|
+
- Third-party Marketplace actions that internally use the v3 artifact actions
|
|
43
|
+
were broken until their own maintainers upgraded
|
|
44
|
+
- Workflows with infrequent trigger schedules (e.g., monthly releases) only hit
|
|
45
|
+
the brownout windows occasionally, masking the problem until full retirement
|
|
46
|
+
|
|
47
|
+
Source: GitHub Changelog 2024-04-16; GitHub Community discussion #149325
|
|
48
|
+
fix: |
|
|
49
|
+
Upgrade both upload and download steps to v4 simultaneously. Do NOT mix v3 and v4
|
|
50
|
+
in the same workflow — they use different artifact backends and are not cross-compatible.
|
|
51
|
+
|
|
52
|
+
**Key v4 behavior changes to be aware of:**
|
|
53
|
+
- Artifact names must be unique per workflow run (v4 does NOT overwrite; throws 409)
|
|
54
|
+
- Hidden files (dotfiles) are excluded by default — set `include-hidden-files: true`
|
|
55
|
+
if you need them
|
|
56
|
+
- Cross-repo artifact access requires explicit permissions
|
|
57
|
+
- GHES instances older than 3.15 do not support v4 — pin to v3 only if on old GHES
|
|
58
|
+
(but old GHES has its own known issues)
|
|
59
|
+
fix_code:
|
|
60
|
+
- language: yaml
|
|
61
|
+
label: "Migrate upload and download to v4 (minimal change)"
|
|
62
|
+
code: |
|
|
63
|
+
jobs:
|
|
64
|
+
build:
|
|
65
|
+
runs-on: ubuntu-latest
|
|
66
|
+
steps:
|
|
67
|
+
- uses: actions/checkout@v4
|
|
68
|
+
- run: npm run build
|
|
69
|
+
|
|
70
|
+
# ❌ Retired — will auto-fail after Jan 30, 2025
|
|
71
|
+
# - uses: actions/upload-artifact@v3
|
|
72
|
+
# with:
|
|
73
|
+
# name: dist
|
|
74
|
+
# path: dist/
|
|
75
|
+
|
|
76
|
+
# ✅ Use v4
|
|
77
|
+
- uses: actions/upload-artifact@v4
|
|
78
|
+
with:
|
|
79
|
+
name: dist
|
|
80
|
+
path: dist/
|
|
81
|
+
|
|
82
|
+
deploy:
|
|
83
|
+
needs: build
|
|
84
|
+
runs-on: ubuntu-latest
|
|
85
|
+
steps:
|
|
86
|
+
# ✅ Download also on v4 — must match upload version
|
|
87
|
+
- uses: actions/download-artifact@v4
|
|
88
|
+
with:
|
|
89
|
+
name: dist
|
|
90
|
+
path: dist/
|
|
91
|
+
- language: yaml
|
|
92
|
+
label: "Handle v4 duplicate-name conflict if multiple jobs upload the same name"
|
|
93
|
+
code: |
|
|
94
|
+
jobs:
|
|
95
|
+
build:
|
|
96
|
+
strategy:
|
|
97
|
+
matrix:
|
|
98
|
+
target: [linux, windows, macos]
|
|
99
|
+
runs-on: ubuntu-latest
|
|
100
|
+
steps:
|
|
101
|
+
- run: echo "Build ${{ matrix.target }}" > output.txt
|
|
102
|
+
|
|
103
|
+
# v4: artifact names must be unique per run
|
|
104
|
+
- uses: actions/upload-artifact@v4
|
|
105
|
+
with:
|
|
106
|
+
# Append matrix value to keep names unique
|
|
107
|
+
name: output-${{ matrix.target }}
|
|
108
|
+
path: output.txt
|
|
109
|
+
prevention:
|
|
110
|
+
- "Run `grep -r 'upload-artifact@v3\\|download-artifact@v3' .github/` periodically to catch stale version pins."
|
|
111
|
+
- "Use Dependabot or Renovate to automatically open PRs when GitHub-maintained actions release new major versions."
|
|
112
|
+
- "Subscribe to the GitHub Changelog (https://github.blog/changelog/) for deprecation notices."
|
|
113
|
+
- "When upgrading to v4, test artifact names for uniqueness — v4 throws HTTP 409 when the same name is uploaded twice in one run."
|
|
114
|
+
- "Set `retention-days` explicitly on v4 artifacts; default retention changed between v3 and v4."
|
|
115
|
+
docs:
|
|
116
|
+
- url: "https://github.blog/changelog/2024-04-16-deprecation-notice-v3-of-the-artifact-actions"
|
|
117
|
+
label: "GitHub Changelog: Deprecation notice — v3 of the artifact actions"
|
|
118
|
+
- url: "https://github.com/orgs/community/discussions/149325"
|
|
119
|
+
label: "Community discussion — workflows failing after artifact v3 retirement"
|
|
120
|
+
- url: "https://github.com/actions/upload-artifact/blob/main/docs/MIGRATION.md"
|
|
121
|
+
label: "actions/upload-artifact — v3 to v4 migration guide"
|
|
122
|
+
- url: "https://docs.github.com/en/actions/using-workflows/storing-workflow-data-as-artifacts"
|
|
123
|
+
label: "GitHub Docs: Storing workflow data as artifacts"
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
id: caching-artifacts-021
|
|
2
|
+
title: "upload-artifact v4 Silently Hangs on Large Files (500MB+) on macOS Runners"
|
|
3
|
+
category: caching-artifacts
|
|
4
|
+
severity: silent-failure
|
|
5
|
+
tags:
|
|
6
|
+
- upload-artifact
|
|
7
|
+
- macos
|
|
8
|
+
- large-file
|
|
9
|
+
- hang
|
|
10
|
+
- timeout
|
|
11
|
+
- silent-failure
|
|
12
|
+
- v4
|
|
13
|
+
patterns:
|
|
14
|
+
- regex: "Uploaded bytes \\d+"
|
|
15
|
+
flags: "i"
|
|
16
|
+
- regex: "upload-artifact.*stall|stall.*upload-artifact"
|
|
17
|
+
flags: "i"
|
|
18
|
+
- regex: "The operation was cancelled.*upload|upload.*operation was cancelled"
|
|
19
|
+
flags: "i"
|
|
20
|
+
- regex: "Error: The process.*took too long.*upload-artifact"
|
|
21
|
+
flags: "i"
|
|
22
|
+
error_messages:
|
|
23
|
+
- "Uploaded bytes 8388608"
|
|
24
|
+
- "The runner has received a shutdown signal. This can happen when the runner service is stopped, or a manually started runner is canceled."
|
|
25
|
+
root_cause: |
|
|
26
|
+
`actions/upload-artifact@v4` intermittently stalls during upload on macOS GitHub-hosted
|
|
27
|
+
runners (macos-13-xl-arm64, macos-14-xlarge, macos-15) when artifact size is approximately
|
|
28
|
+
500 MB or larger. The stall manifests as the upload progress halting after logging "Uploaded
|
|
29
|
+
bytes XXXXXXXXX" with no further output — the job then exceeds its timeout and is cancelled
|
|
30
|
+
by GitHub without an explicit error message.
|
|
31
|
+
|
|
32
|
+
This behavior was reported and tracked in actions/upload-artifact#527. The hang appears
|
|
33
|
+
intermittent (roughly 30–50% failure rate for affected workflows), which makes it difficult
|
|
34
|
+
to diagnose in standard CI logs — the workflow shows as "cancelled" rather than "failed",
|
|
35
|
+
masking the root cause.
|
|
36
|
+
|
|
37
|
+
Contributing factors observed in community reports:
|
|
38
|
+
- Large uncompressed artifacts (binaries, build artifacts, test reports with raw data)
|
|
39
|
+
- macOS ARM64 hosted runners appear more susceptible than Linux or Windows runners
|
|
40
|
+
- Compression level settings do not consistently prevent the hang
|
|
41
|
+
|
|
42
|
+
This is a silent failure because:
|
|
43
|
+
1. The upload simply stops with no error log entry
|
|
44
|
+
2. The job shows as "cancelled" (not "failed") in the GitHub UI
|
|
45
|
+
3. Downstream artifact-download steps fail with "no artifact found" — the real cause is upstream
|
|
46
|
+
fix: |
|
|
47
|
+
Use one or more of these mitigations while the issue is tracked by the actions team:
|
|
48
|
+
|
|
49
|
+
1. **Split large artifacts**: Break large upload paths into multiple smaller upload steps, each
|
|
50
|
+
under ~200MB. This reduces the risk of hitting the hang threshold.
|
|
51
|
+
|
|
52
|
+
2. **Add an explicit timeout**: Set `timeout-minutes` on the upload step to detect hangs faster
|
|
53
|
+
and fail with a clear error rather than waiting for the job-level timeout.
|
|
54
|
+
|
|
55
|
+
3. **Retry the upload**: Wrap the upload step in a retry loop or use a community action like
|
|
56
|
+
`nick-fields/retry` to automatically re-attempt on failure.
|
|
57
|
+
|
|
58
|
+
4. **Use direct storage for very large artifacts**: For artifacts over 1GB, upload directly to
|
|
59
|
+
S3, Azure Blob Storage, or GCS using provider CLI tools. Use upload-artifact only for test
|
|
60
|
+
reports and smaller build outputs.
|
|
61
|
+
|
|
62
|
+
5. **Switch to Linux runners**: If macOS-specific features are not required for the upload
|
|
63
|
+
phase, run artifact collection on an ubuntu-latest runner where the hang does not occur.
|
|
64
|
+
fix_code:
|
|
65
|
+
- language: yaml
|
|
66
|
+
label: "Add timeout and retry to upload step"
|
|
67
|
+
code: |
|
|
68
|
+
- name: Upload large artifact
|
|
69
|
+
uses: actions/upload-artifact@v4
|
|
70
|
+
timeout-minutes: 10 # fail fast instead of waiting for job timeout
|
|
71
|
+
with:
|
|
72
|
+
name: release-binaries
|
|
73
|
+
path: dist/
|
|
74
|
+
compression-level: 6
|
|
75
|
+
retention-days: 7
|
|
76
|
+
- language: yaml
|
|
77
|
+
label: "Split large artifact into parts to reduce hang risk"
|
|
78
|
+
code: |
|
|
79
|
+
- name: Upload binaries (part 1)
|
|
80
|
+
uses: actions/upload-artifact@v4
|
|
81
|
+
with:
|
|
82
|
+
name: binaries-part1
|
|
83
|
+
path: dist/platform-a/
|
|
84
|
+
|
|
85
|
+
- name: Upload binaries (part 2)
|
|
86
|
+
uses: actions/upload-artifact@v4
|
|
87
|
+
with:
|
|
88
|
+
name: binaries-part2
|
|
89
|
+
path: dist/platform-b/
|
|
90
|
+
- language: yaml
|
|
91
|
+
label: "Upload to S3 for very large artifacts (bypass upload-artifact)"
|
|
92
|
+
code: |
|
|
93
|
+
- name: Upload large artifact to S3
|
|
94
|
+
env:
|
|
95
|
+
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
|
|
96
|
+
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
|
|
97
|
+
run: |
|
|
98
|
+
aws s3 cp dist/release.tar.gz s3://my-bucket/artifacts/${{ github.sha }}/release.tar.gz
|
|
99
|
+
echo "Uploaded to s3://my-bucket/artifacts/${{ github.sha }}/release.tar.gz"
|
|
100
|
+
prevention:
|
|
101
|
+
- "Keep individual artifact uploads under 200MB per upload step on macOS runners to avoid the hang threshold."
|
|
102
|
+
- "Always set `timeout-minutes` on upload steps for large files so the CI job fails fast with a clear error instead of silently timing out."
|
|
103
|
+
- "Monitor for `cancelled` status on jobs that use upload-artifact on macOS — these may be silently failing uploads."
|
|
104
|
+
- "For release artifacts exceeding 1GB, use cloud storage (S3, Azure Blob, GCS) directly rather than upload-artifact."
|
|
105
|
+
docs:
|
|
106
|
+
- url: "https://github.com/actions/upload-artifact/issues/527"
|
|
107
|
+
label: "actions/upload-artifact#527 — macOS large-file upload hang report and discussion"
|
|
108
|
+
- url: "https://github.com/actions/upload-artifact"
|
|
109
|
+
label: "actions/upload-artifact — official repository and documentation"
|
|
110
|
+
- url: "https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/storing-workflow-data-as-artifacts"
|
|
111
|
+
label: "GitHub Docs — storing workflow data as artifacts"
|
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
id: concurrency-timing-011
|
|
2
|
+
title: "always() Cleanup Jobs Forcibly Killed After 5-Minute Cancellation Timeout"
|
|
3
|
+
category: concurrency-timing
|
|
4
|
+
severity: warning
|
|
5
|
+
tags:
|
|
6
|
+
- always
|
|
7
|
+
- cancellation
|
|
8
|
+
- cleanup
|
|
9
|
+
- forced-termination
|
|
10
|
+
- notification
|
|
11
|
+
- timeout
|
|
12
|
+
- teardown
|
|
13
|
+
patterns:
|
|
14
|
+
- regex: "The runner has received a shutdown signal"
|
|
15
|
+
flags: "i"
|
|
16
|
+
- regex: "Job was cancelled"
|
|
17
|
+
flags: "i"
|
|
18
|
+
- regex: "The operation was canceled"
|
|
19
|
+
flags: "i"
|
|
20
|
+
error_messages:
|
|
21
|
+
- "The runner has received a shutdown signal. This can happen when the runner service is stopped, a new job is started, or the runner is in the process of shutting down."
|
|
22
|
+
- "Job was cancelled"
|
|
23
|
+
root_cause: |
|
|
24
|
+
When a workflow run is cancelled (manually or via `cancel-in-progress`), GitHub Actions
|
|
25
|
+
re-evaluates the `if:` condition for every currently running job. Jobs marked with
|
|
26
|
+
`if: always()` continue running — this is the intended mechanism for cleanup, notifications,
|
|
27
|
+
and teardown steps.
|
|
28
|
+
|
|
29
|
+
However, GitHub enforces a **5-minute hard termination window** after cancellation is
|
|
30
|
+
initiated. Once 5 minutes have elapsed since the cancellation signal, ALL remaining jobs
|
|
31
|
+
are forcibly killed by the server, regardless of their `if:` conditions — including jobs
|
|
32
|
+
explicitly marked `if: always()`.
|
|
33
|
+
|
|
34
|
+
This means:
|
|
35
|
+
- Cleanup jobs that take more than 5 minutes (Terraform destroy, test result uploads,
|
|
36
|
+
Slack notifications with retries, database teardown) will be killed mid-execution.
|
|
37
|
+
- The job may appear partially completed in the logs with no clear failure message —
|
|
38
|
+
it simply stops, often leaving infrastructure in a partial or inconsistent state.
|
|
39
|
+
- Developers are surprised that `always()` does not guarantee the job completes after
|
|
40
|
+
a workflow cancellation.
|
|
41
|
+
|
|
42
|
+
Common failure scenarios:
|
|
43
|
+
- Artifact upload in an `if: always()` post-job step when the upload is slow
|
|
44
|
+
- Terraform `destroy` as a cleanup job when a long-running deployment is cancelled
|
|
45
|
+
- Notification jobs that retry on transient failures and consume more time than expected
|
|
46
|
+
- Integration test teardown (database resets, container removal) that exceeds 5 minutes
|
|
47
|
+
|
|
48
|
+
Source: GitHub Docs — Canceling a workflow: "After the 5 minute cancellation timeout
|
|
49
|
+
period, the server will forcibly terminate all jobs that are still running."
|
|
50
|
+
fix: |
|
|
51
|
+
Design `always()` cleanup jobs to complete well within 5 minutes. Add a job-level
|
|
52
|
+
`timeout-minutes: 4` to any cleanup job that runs after cancellation so it fails
|
|
53
|
+
cleanly rather than being force-killed at an unpredictable point.
|
|
54
|
+
|
|
55
|
+
For teardown that cannot be shortened, trigger cleanup from a separate workflow using
|
|
56
|
+
the `workflow_run` trigger with `types: [completed]` — it runs after the cancelled run fully settles and is
|
|
57
|
+
not subject to the 5-minute window.
|
|
58
|
+
|
|
59
|
+
Use the `cancelled()` expression to detect cancellation and take a fast code path.
|
|
60
|
+
fix_code:
|
|
61
|
+
- language: yaml
|
|
62
|
+
label: "Guard cleanup job with timeout-minutes to fail fast before forced kill"
|
|
63
|
+
code: |
|
|
64
|
+
jobs:
|
|
65
|
+
deploy:
|
|
66
|
+
runs-on: ubuntu-latest
|
|
67
|
+
timeout-minutes: 60
|
|
68
|
+
steps:
|
|
69
|
+
- uses: actions/checkout@v4
|
|
70
|
+
- run: ./deploy.sh
|
|
71
|
+
|
|
72
|
+
cleanup:
|
|
73
|
+
needs: deploy
|
|
74
|
+
if: always()
|
|
75
|
+
runs-on: ubuntu-latest
|
|
76
|
+
timeout-minutes: 4 # Stay under the 5-min forced-kill window
|
|
77
|
+
steps:
|
|
78
|
+
- name: Teardown infrastructure
|
|
79
|
+
run: ./teardown.sh
|
|
80
|
+
timeout-minutes: 3 # Per-step guard too
|
|
81
|
+
|
|
82
|
+
- language: yaml
|
|
83
|
+
label: "Use cancelled() to take a fast notification path on cancellation"
|
|
84
|
+
code: |
|
|
85
|
+
jobs:
|
|
86
|
+
build:
|
|
87
|
+
runs-on: ubuntu-latest
|
|
88
|
+
steps:
|
|
89
|
+
- run: ./slow-build.sh
|
|
90
|
+
|
|
91
|
+
notify:
|
|
92
|
+
needs: build
|
|
93
|
+
if: always()
|
|
94
|
+
runs-on: ubuntu-latest
|
|
95
|
+
steps:
|
|
96
|
+
- name: Quick notification (cancellation — must be fast)
|
|
97
|
+
if: cancelled()
|
|
98
|
+
run: |
|
|
99
|
+
curl -s -X POST "$SLACK_WEBHOOK" \
|
|
100
|
+
-H 'Content-type: application/json' \
|
|
101
|
+
-d '{"text":"⚠️ Workflow cancelled — cleanup may be incomplete"}'
|
|
102
|
+
|
|
103
|
+
- name: Full notification (success or failure path — has time)
|
|
104
|
+
if: "!cancelled()"
|
|
105
|
+
run: ./full-notify.sh "${{ needs.build.result }}"
|
|
106
|
+
|
|
107
|
+
- language: yaml
|
|
108
|
+
label: "Post-cancellation teardown via workflow_run — not subject to 5-min window"
|
|
109
|
+
code: |
|
|
110
|
+
# cleanup.yml — separate workflow triggered after any completion including cancellation
|
|
111
|
+
on:
|
|
112
|
+
workflow_run:
|
|
113
|
+
workflows: ["Deploy"]
|
|
114
|
+
types: [completed]
|
|
115
|
+
|
|
116
|
+
jobs:
|
|
117
|
+
teardown:
|
|
118
|
+
runs-on: ubuntu-latest
|
|
119
|
+
steps:
|
|
120
|
+
- uses: actions/checkout@v4
|
|
121
|
+
|
|
122
|
+
- name: Emergency cleanup when deploy was cancelled
|
|
123
|
+
if: github.event.workflow_run.conclusion == 'cancelled'
|
|
124
|
+
run: ./emergency-teardown.sh
|
|
125
|
+
|
|
126
|
+
- name: Normal cleanup on success or failure
|
|
127
|
+
if: github.event.workflow_run.conclusion != 'cancelled'
|
|
128
|
+
run: ./standard-teardown.sh
|
|
129
|
+
prevention:
|
|
130
|
+
- "Keep `if: always()` cleanup jobs under 4 minutes — add `timeout-minutes: 4` as a safety guard."
|
|
131
|
+
- "Use `if: cancelled()` to detect cancellation and take a fast code path rather than the full teardown path."
|
|
132
|
+
- "For cleanup that takes longer than 5 minutes, use a separate `workflow_run: [completed]` workflow that runs outside the cancellation window."
|
|
133
|
+
- "Test cancellation behavior by manually cancelling a long-running workflow and verifying cleanup jobs complete before 5 minutes."
|
|
134
|
+
docs:
|
|
135
|
+
- url: "https://docs.github.com/en/actions/managing-workflow-runs-and-deployments/managing-workflow-runs/canceling-a-workflow"
|
|
136
|
+
label: "GitHub Docs: Canceling a workflow (5-minute forced termination)"
|
|
137
|
+
- url: "https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/evaluate-expressions-in-workflows-and-actions#status-check-functions"
|
|
138
|
+
label: "Status check functions: always(), cancelled()"
|
|
139
|
+
- url: "https://docs.github.com/en/actions/writing-workflows/choosing-when-your-workflow-runs/events-that-trigger-workflows#workflow_run"
|
|
140
|
+
label: "workflow_run event — trigger cleanup after completed workflows"
|
|
@@ -0,0 +1,99 @@
|
|
|
1
|
+
id: concurrency-timing-010
|
|
2
|
+
title: "env Context Unavailable in Concurrency Group Expression Collapses All Runs"
|
|
3
|
+
category: concurrency-timing
|
|
4
|
+
severity: silent-failure
|
|
5
|
+
tags:
|
|
6
|
+
- concurrency
|
|
7
|
+
- env-context
|
|
8
|
+
- expression
|
|
9
|
+
- silent-failure
|
|
10
|
+
- group-collision
|
|
11
|
+
patterns:
|
|
12
|
+
- regex: "Canceling since a higher priority waiting"
|
|
13
|
+
flags: "i"
|
|
14
|
+
- regex: "concurrency.*group.*\"\""
|
|
15
|
+
flags: "i"
|
|
16
|
+
error_messages:
|
|
17
|
+
- "Canceling since a higher priority waiting request for '' exists"
|
|
18
|
+
- "Canceling since a higher priority waiting run was found for ''"
|
|
19
|
+
root_cause: |
|
|
20
|
+
The `concurrency.group` expression is evaluated at workflow scheduling time, before
|
|
21
|
+
most runtime contexts are available. The `env` context is one of the contexts that
|
|
22
|
+
is NOT available when concurrency expressions are evaluated.
|
|
23
|
+
|
|
24
|
+
When you use `${{ env.MY_VAR }}` in a concurrency group key:
|
|
25
|
+
- The expression silently evaluates to an empty string `""`
|
|
26
|
+
- Every workflow run (across all branches, all events) shares the same group: the key with the `env` part dropped (e.g. `deploy-`), or `""` when the key is only the `env` reference
|
|
27
|
+
- Runs from completely unrelated branches cancel each other unexpectedly
|
|
28
|
+
- The runner may emit "Canceling since a higher priority waiting request for '' exists"
|
|
29
|
+
with an empty group name — which is the giveaway
|
|
30
|
+
|
|
31
|
+
Contexts available in workflow-level `concurrency.group`: `github`, `inputs`, `vars`
|
|
32
|
+
Contexts NOT available: `env`, `steps`, `job`, `runner`, `secrets` (`needs`, `strategy`, and `matrix` are available only in job-level `jobs.<job_id>.concurrency`)
|
|
33
|
+
|
|
34
|
+
This is a documented limitation but easy to miss because the expression evaluates
|
|
35
|
+
silently without error — it just returns an empty string.
|
|
36
|
+
|
|
37
|
+
Sources: GitHub Community #26308, #45734, #69704
|
|
38
|
+
fix: |
|
|
39
|
+
Replace `env` context references in concurrency group expressions with supported
|
|
40
|
+
contexts. Use `github` (event properties, ref, workflow name), `inputs` (for
|
|
41
|
+
workflow_dispatch or workflow_call), or `vars` (repository/org variables).
|
|
42
|
+
|
|
43
|
+
For environment-specific group keys, use `github.event_name`, `github.ref_name`,
|
|
44
|
+
`github.workflow`, or pass an explicit input to workflow_dispatch.
|
|
45
|
+
fix_code:
|
|
46
|
+
- language: yaml
|
|
47
|
+
label: "Broken — env context evaluates to empty string in concurrency group"
|
|
48
|
+
code: |
|
|
49
|
+
# ❌ BROKEN: ${{ env.ENVIRONMENT }} returns "" at scheduling time
|
|
50
|
+
env:
|
|
51
|
+
ENVIRONMENT: production
|
|
52
|
+
|
|
53
|
+
concurrency:
|
|
54
|
+
group: deploy-${{ env.ENVIRONMENT }} # Always evaluates to "deploy-"
|
|
55
|
+
cancel-in-progress: false
|
|
56
|
+
- language: yaml
|
|
57
|
+
label: "Fixed — use github context or vars instead of env"
|
|
58
|
+
code: |
|
|
59
|
+
# ✅ FIXED: use github context properties (available at scheduling time)
|
|
60
|
+
concurrency:
|
|
61
|
+
group: deploy-${{ github.ref_name }}-${{ github.workflow }}
|
|
62
|
+
cancel-in-progress: false
|
|
63
|
+
- language: yaml
|
|
64
|
+
label: "Fixed — pass environment as workflow_dispatch input for dynamic group key"
|
|
65
|
+
code: |
|
|
66
|
+
# ✅ FIXED: expose the value as an input so it's available via `inputs` context
|
|
67
|
+
on:
|
|
68
|
+
workflow_dispatch:
|
|
69
|
+
inputs:
|
|
70
|
+
environment:
|
|
71
|
+
required: true
|
|
72
|
+
type: choice
|
|
73
|
+
options: [production, staging]
|
|
74
|
+
|
|
75
|
+
concurrency:
|
|
76
|
+
group: deploy-${{ inputs.environment }}
|
|
77
|
+
cancel-in-progress: false
|
|
78
|
+
- language: yaml
|
|
79
|
+
label: "Fixed — use repository variable (vars context is available)"
|
|
80
|
+
code: |
|
|
81
|
+
# ✅ FIXED: vars context is available in concurrency expressions
|
|
82
|
+
concurrency:
|
|
83
|
+
group: deploy-${{ vars.DEPLOY_ENV }}-${{ github.ref_name }}
|
|
84
|
+
cancel-in-progress: false
|
|
85
|
+
prevention:
|
|
86
|
+
- "Only use `github`, `inputs`, and `vars` contexts in workflow-level `concurrency.group` expressions (job-level concurrency additionally allows `needs`, `strategy`, and `matrix`)."
|
|
87
|
+
- "If you see runs from unrelated branches cancelling each other, inspect the concurrency group key for empty-string evaluation."
|
|
88
|
+
- "Test concurrency group expressions by adding a step that echoes the group key: `run: echo 'group=${{ github.workflow }}-${{ github.ref_name }}'`."
|
|
89
|
+
- "If concurrency cancellation messages show an empty group name `''`, the expression evaluated to an empty string."
|
|
90
|
+
- "Use `vars` (repository/org variables) rather than `env` when you need a configured value in the group key."
|
|
91
|
+
docs:
|
|
92
|
+
- url: "https://docs.github.com/en/actions/writing-workflows/choosing-when-your-workflow-runs/using-concurrency"
|
|
93
|
+
label: "Using concurrency — supported expression contexts"
|
|
94
|
+
- url: "https://github.com/orgs/community/discussions/26308"
|
|
95
|
+
label: "GitHub Community #26308 — env context not available in concurrency"
|
|
96
|
+
- url: "https://github.com/orgs/community/discussions/69704"
|
|
97
|
+
label: "GitHub Community #69704 — concurrency group context limitations"
|
|
98
|
+
- url: "https://github.com/orgs/community/discussions/45734"
|
|
99
|
+
label: "GitHub Community #45734 — concurrency expression supported contexts"
|