@htekdev/actions-debugger 1.0.123 → 1.0.125

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32) hide show
  1. package/errors/caching-artifacts/caching-artifacts-073.yml +100 -0
  2. package/errors/caching-artifacts/caching-artifacts-074.yml +117 -0
  3. package/errors/known-unsolved/known-unsolved-070.yml +83 -0
  4. package/errors/known-unsolved/known-unsolved-071.yml +122 -0
  5. package/errors/known-unsolved/known-unsolved-072.yml +143 -0
  6. package/errors/permissions-auth/permissions-auth-071.yml +144 -0
  7. package/errors/permissions-auth/permissions-auth-072.yml +112 -0
  8. package/errors/permissions-auth/permissions-auth-073.yml +127 -0
  9. package/errors/permissions-auth/permissions-auth-074.yml +106 -0
  10. package/errors/permissions-auth/permissions-auth-075.yml +137 -0
  11. package/errors/runner-environment/runner-environment-224.yml +74 -0
  12. package/errors/runner-environment/runner-environment-225.yml +85 -0
  13. package/errors/runner-environment/runner-environment-226.yml +91 -0
  14. package/errors/runner-environment/runner-environment-227.yml +106 -0
  15. package/errors/runner-environment/runner-environment-228.yml +117 -0
  16. package/errors/runner-environment/runner-environment-229.yml +119 -0
  17. package/errors/runner-environment/runner-environment-230.yml +129 -0
  18. package/errors/runner-environment/runner-environment-231.yml +90 -0
  19. package/errors/runner-environment/runner-environment-232.yml +131 -0
  20. package/errors/runner-environment/runner-environment-233.yml +90 -0
  21. package/errors/runner-environment/runner-environment-234.yml +114 -0
  22. package/errors/runner-environment/runner-environment-235.yml +151 -0
  23. package/errors/silent-failures/silent-failures-112.yml +97 -0
  24. package/errors/silent-failures/silent-failures-113.yml +110 -0
  25. package/errors/silent-failures/silent-failures-114.yml +116 -0
  26. package/errors/silent-failures/silent-failures-115.yml +130 -0
  27. package/errors/silent-failures/silent-failures-116.yml +117 -0
  28. package/errors/silent-failures/silent-failures-117.yml +137 -0
  29. package/errors/silent-failures/silent-failures-118.yml +156 -0
  30. package/errors/yaml-syntax/yaml-syntax-075.yml +128 -0
  31. package/errors/yaml-syntax/yaml-syntax-076.yml +107 -0
  32. package/package.json +1 -1
@@ -0,0 +1,151 @@
1
+ id: runner-environment-235
2
+ title: 'Action download fails with HTTP 429 Too Many Requests during action resolution — "Failed to download action ... Response status code does not indicate success: 429"'
3
+ category: runner-environment
4
+ severity: error
5
+ tags:
6
+ - action-download
7
+ - rate-limit
8
+ - '429'
9
+ - action-resolution
10
+ - tarball
11
+ - github-api
12
+ - outage
13
+ - retry
14
+ patterns:
15
+ - regex: 'Failed to download action.*429|Response status code does not indicate success: 429 \(Too Many Requests\)'
16
+ flags: 'i'
17
+ - regex: 'Error: Response status code does not indicate success: 429'
18
+ flags: 'i'
19
+ - regex: 'Warning: Failed to download action.*tarball'
20
+ flags: 'i'
21
+ error_messages:
22
+ - "Warning: Failed to download action 'https://api.github.com/repos/actions/checkout/tarball/11bd71901bbe5b1630ceea73d27597364c9af683'. Error: Response status code does not indicate success: 429 (Too Many Requests)."
23
+ - "Error: Response status code does not indicate success: 429 (Too Many Requests)."
24
+ - "Download action repository 'actions/checkout@v4'"
25
+ root_cause: |
26
+ When a GitHub Actions job starts, the runner downloads each action referenced in the
27
+ workflow by fetching the action's source tarball from the GitHub API endpoint:
28
+ GET https://api.github.com/repos/{owner}/{repo}/tarball/{sha}
29
+
30
+ This endpoint is subject to GitHub's API rate limits. When many runners start
31
+ simultaneously — during periods of high load, GitHub infrastructure incidents, or
32
+ large-scale rollouts — the tarball download requests can be rate-limited, returning
33
+ HTTP 429 Too Many Requests.
34
+
35
+ The runner logs a warning and retries with exponential backoff:
36
+ "Warning: Failed to download action '...'. Error: Response status code does not
37
+ indicate success: 429 (Too Many Requests). <correlationId>"
38
+
39
+ If the rate limit persists through all retry attempts, the job setup fails and the
40
+ entire job is marked as failed with "##[error]" on the "Set up job" step. The failure
41
+ is NOT a problem with the workflow configuration itself — it is a transient GitHub
42
+ infrastructure issue.
43
+
44
+ Contributing factors:
45
+ 1. **GitHub outages or degraded performance** — action downloads share API quota with
46
+ all other GitHub API traffic from the runner's IP range.
47
+ 2. **High parallel job counts** — organizations running hundreds of parallel jobs on
48
+ GitHub-hosted runners can exhaust rate limit buckets across a runner fleet.
49
+ 3. **Composite actions with many nested uses** — deeply nested composite actions make
50
+ many download calls per job, multiplying the per-job API request count. PR #4296
51
+ in actions/runner (merged 2026-03) adds batching to reduce this.
52
+ 4. **Self-hosted runners without caching** — runners that are freshly provisioned per
53
+ job always re-download all actions; runners with persistent `_work/_tool` directories
54
+ can cache downloads across jobs.
55
+
56
+ Note: this is distinct from runner-environment-202 (repeated downloads caused by
57
+ case-sensitivity mismatch in v2.334.0), which causes multiple SUCCESSFUL downloads
58
+ rather than 429 failures.
59
+
60
+ Source: actions/runner#4232 (Feb 2026, open), actions/checkout#2230.
61
+ fix: |
62
+ **Immediate:** Retry the failed workflow run. 429 errors during action downloads
63
+ are almost always transient — a re-run a few minutes later usually succeeds.
64
+
65
+ **Short-term: Reduce per-job download requests.**
66
+ Pin actions to a specific commit SHA rather than a mutable tag. Runners maintain an
67
+ action download cache keyed by commit SHA; pinned SHAs hit the cache more reliably
68
+ than mutable tags which may require a fresh lookup each run.
69
+
70
+ **For self-hosted runners: persist the action cache directory.**
71
+ The runner caches downloaded actions in `<runner-root>/_work/_tool`. If your runner
72
+ instances are ephemeral (e.g., provisioned fresh per job on EC2/GKE), mount a shared
73
+ volume or EFS/EBS on this path to share the cache across instances, reducing
74
+ download traffic dramatically.
75
+
76
+ **For high-parallelism organizations: stagger workflow start times.**
77
+ Spread out scheduled workflows or push triggers to avoid thousands of jobs starting
78
+ simultaneously. Use concurrency groups or staggered cron minute offsets to distribute load.
79
+
80
+ **Self-hosted runners on GHES / GHAE:** Use GHES's internal action caching to serve
81
+ action downloads locally instead of hitting GitHub.com — this avoids rate limiting
82
+ entirely.
83
+
84
+ **Monitor GitHub status:** Check https://www.githubstatus.com/ when 429 errors appear
85
+ — they often correlate with incidents listed on the status page.
86
+ fix_code:
87
+ - language: yaml
88
+ label: 'Pin actions to commit SHA to maximize cache hits and reduce download requests'
89
+ code: |
90
+ # Instead of mutable tags (re-downloaded each time if tag moves):
91
+ # - uses: actions/checkout@v4
92
+ # - uses: actions/setup-node@v4
93
+
94
+ # Pin to immutable commit SHAs for reliable caching:
95
+ - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
96
+ - uses: actions/setup-node@39370e3970a6d050c480ffad4ff0ed4d3fdee5af # v4.1.0
97
+ - language: yaml
98
+ label: 'Self-hosted runner: configure RUNNER_TOOL_CACHE to a persistent shared path'
99
+ code: |
100
+ # In the runner environment, set the tool cache path before starting the runner:
101
+ # export RUNNER_TOOL_CACHE=/mnt/shared/runner-tool-cache
102
+ # ./run.sh
103
+
104
+ # Or in the runner's .env file (located in the runner root directory, next to run.sh):
105
+ # RUNNER_TOOL_CACHE=/mnt/shared/runner-tool-cache
106
+
107
+ # Docker-based self-hosted runner: mount the cache directory:
108
+ # docker run -v /mnt/shared/runner-tool-cache:/opt/hostedtoolcache ...
109
+ - language: yaml
110
+ label: 'Workflow-level retry: re-run the job automatically on setup failure'
111
+ code: |
112
+ # Actions does not have a built-in retry for setup failures.
113
+ # A common workaround: use a retry action in the first step that validates setup.
114
+ # Alternatively, set up a workflow_run trigger to retry on failure:
115
+ on:
116
+ workflow_run:
117
+ workflows: ["CI"]
118
+ types: [completed]
119
+
120
+ jobs:
121
+ retry-on-429:
122
+ if: github.event.workflow_run.conclusion == 'failure'
123
+ runs-on: ubuntu-latest
124
+ steps:
125
+ - name: Re-trigger the failed workflow
126
+ uses: actions/github-script@v7
127
+ with:
128
+ script: |
129
+ await github.rest.actions.createWorkflowDispatch({
130
+ owner: context.repo.owner,
131
+ repo: context.repo.repo,
132
+ workflow_id: 'ci.yml',
133
+ ref: context.payload.workflow_run.head_branch
134
+ });
135
+ prevention:
136
+ - 'Pin all `uses:` references to immutable commit SHAs — this maximises the chance that the runner serves the action from its local download cache rather than calling the GitHub API'
137
+ - 'For self-hosted runners provisioned per job (ephemeral), mount a persistent shared volume at the runner tool cache path so action downloads are reused across job invocations'
138
+ - 'Upgrade to actions/runner v2.335.0+ which includes PR#4296 action resolution batching — this reduces the number of download requests per job for workflows using composite actions'
139
+ - 'When you see 429 errors, check https://www.githubstatus.com/ — they often coincide with GitHub API rate limit incidents affecting the entire platform'
140
+ - 'Spread scheduled workflows across different minutes using cron expressions like `17 4 * * *` rather than `0 4 * * *` to avoid the stampede effect when many organizations schedule jobs at round-number times'
141
+ docs:
142
+ - url: 'https://github.com/actions/runner/issues/4232'
143
+ label: 'actions/runner#4232: Warning: Failed to download action ... 429 Too Many Requests'
144
+ - url: 'https://github.com/actions/checkout/issues/2230'
145
+ label: 'actions/checkout#2230: checkout fails with 429 Too Many Requests'
146
+ - url: 'https://github.com/actions/runner/pull/4296'
147
+ label: 'actions/runner PR#4296: Batch and deduplicate action resolution across composite depths (reduces 429 risk)'
148
+ - url: 'https://docs.github.com/en/rest/overview/rate-limits-for-the-rest-api'
149
+ label: 'GitHub Docs: REST API rate limits'
150
+ - url: 'https://www.githubstatus.com/'
151
+ label: 'GitHub Status page — check for active incidents when 429 errors appear'
@@ -0,0 +1,97 @@
1
+ id: silent-failures-112
2
+ title: 'Non-ephemeral self-hosted runner REST API busy flag desync causes auto-scaler to kill active job'
3
+ category: silent-failures
4
+ severity: silent-failure
5
+ tags:
6
+ - self-hosted
7
+ - non-ephemeral
8
+ - rest-api
9
+ - autoscaling
10
+ - broker
11
+ - busy-flag
12
+ - desync
13
+ - aws
14
+ patterns:
15
+ - regex: 'The runner has received a shutdown signal'
16
+ flags: 'i'
17
+ - regex: '"busy":\s*false'
18
+ flags: 'i'
19
+ - regex: 'broker\.actions\.githubusercontent\.com.*busy.*JobState'
20
+ flags: 'i'
21
+ error_messages:
22
+ - '##[error]The runner has received a shutdown signal.'
23
+ - '"busy": false ← REST API shows idle while broker actively renewing job lease'
24
+ - 'Successfully renew job, valid till ...'
25
+ root_cause: |
26
+ On non-ephemeral self-hosted runners (runners that execute multiple sequential jobs on the same
27
+ instance), there is a state desynchronization between the broker layer
28
+ (broker.actions.githubusercontent.com) and the REST API endpoint
29
+ GET /repos/{owner}/{repo}/actions/runners/{runner_id}.
30
+
31
+ The failure sequence:
32
+ 1. Runner completes Job A and briefly enters the Online/idle state.
33
+ 2. Runner picks up Job B, transitions to Busy, and reports JobState: Busy to the broker.
34
+ 3. The broker acknowledges the job and successfully renews the lease every 60 seconds
35
+ (logged as "Successfully renew job, valid till ...").
36
+ 4. Despite (3), the REST API returns "busy": false — sometimes immediately when Job B starts,
37
+ sometimes after tracking correctly for several minutes before spontaneously flipping.
38
+ 5. Auto-scaling infrastructure (AWS Lambda scaler, GKE controllers, terraform-aws-github-runner,
39
+ or custom polling services) queries the REST API, sees the runner as idle, and terminates the
40
+ EC2/VM instance mid-job.
41
+ 6. Job B fails with "The runner has received a shutdown signal." No error is surfaced by GitHub's
42
+ UI indicating the runner was inappropriately killed.
43
+
44
+ The desync appears to occur in the idle gap between jobs (the Busy→Online→Busy transition). The broker and
45
+ the REST API maintain separate state stores; in certain timing windows the REST API does not pick
46
+ up the Job B start event, leaving its state stale from the inter-job idle period.
47
+ fix: |
48
+ Option 1 — Use ephemeral runners (recommended). Each runner instance handles exactly one job
49
+ and then terminates. There is no second job pick-up, so the busy-flag desync window never exists.
50
+
51
+ Option 2 — Do not rely solely on "busy": false from the REST API as the termination signal.
52
+ Cross-reference with a job-completion event (CloudWatch, Pub/Sub, broker callback) or add a
53
+ grace period of 60-120 seconds after the REST API first reports idle before terminating.
54
+
55
+ Option 3 — Use the Webhook-based ARC (Actions Runner Controller) instead of REST-poll-based
56
+ autoscaling. ARC receives direct runner lifecycle events and does not depend on the REST API
57
+ busy state for scale-down decisions.
58
+ fix_code:
59
+ - language: hcl
60
+ label: 'Enable ephemeral runners in terraform-aws-github-runner to prevent multi-job desync'
61
+ code: |
62
+ module "runners" {
63
+ source = "github-aws-runners/github-runner/aws"
64
+ version = "~> 6.6.0"
65
+
66
+ # Each instance executes exactly one job then terminates.
67
+ # Eliminates the busy-flag desync window between sequential jobs.
68
+ enable_ephemeral_runners = true
69
+
70
+ # ... other config
71
+ }
72
+
73
+ - language: yaml
74
+ label: 'Declare ephemeral runner label in the workflow'
75
+ code: |
76
+ jobs:
77
+ build:
78
+ # Request a fresh ephemeral runner — one job per instance.
79
+ # Requires your runner pool to be configured for ephemeral mode.
80
+ runs-on: [self-hosted, ephemeral, linux, x64]
81
+ steps:
82
+ - uses: actions/checkout@v4
83
+ - run: ./build.sh
84
+ prevention:
85
+ - 'Prefer ephemeral runners over reusable (non-ephemeral) runners in autoscaling environments — one job per instance eliminates the busy-flag desync window entirely.'
86
+ - 'Never use "busy": false from the REST API as the sole signal for terminating a non-ephemeral runner instance.'
87
+ - 'Add a minimum 90-second grace period after the REST API first reports busy=false before terminating any non-ephemeral runner.'
88
+ - 'Monitor broker lease-renewal logs ("Successfully renew job") to track active job state independently of the REST API.'
89
+ docs:
90
+ - url: 'https://github.com/actions/runner/issues/4422'
91
+ label: 'actions/runner#4422 — /runners REST API reports busy: false while broker says busy (open)'
92
+ - url: 'https://github.com/github-aws-runners/terraform-aws-github-runner'
93
+ label: 'terraform-aws-github-runner — ephemeral runner module'
94
+ - url: 'https://docs.github.com/en/rest/actions/self-hosted-runners#list-self-hosted-runners-for-a-repository'
95
+ label: 'GitHub Docs — REST API for self-hosted runners'
96
+ - url: 'https://docs.github.com/en/actions/hosting-your-own-runners/managing-self-hosted-runners/about-self-hosted-runners'
97
+ label: 'GitHub Docs — About self-hosted runners'
@@ -0,0 +1,110 @@
1
+ id: silent-failures-113
2
+ title: 'Org-level self-hosted runner with correct group access never dispatched — job stays Queued indefinitely with no error'
3
+ category: silent-failures
4
+ severity: silent-failure
5
+ tags:
6
+ - self-hosted
7
+ - runner-group
8
+ - org-runner
9
+ - dispatch
10
+ - queued
11
+ - v2-broker
12
+ - silent
13
+ patterns:
14
+ - regex: 'runner_group_id.*null'
15
+ flags: 'i'
16
+ - regex: 'Waiting for a runner to pick up this job'
17
+ flags: 'i'
18
+ error_messages:
19
+ - 'Waiting for a runner to pick up this job'
20
+ - '(job stays in Queued state indefinitely — no error message is displayed)'
21
+ root_cause: |
22
+ When a self-hosted runner is registered at the **organization level** (not the
23
+ repository level) and added to a runner group with explicit per-repository access,
24
+ a bug in the V2 broker flow (`useV2Flow: true`,
25
+ `serverUrl: broker.actions.githubusercontent.com`) can cause the dispatcher to fail
26
+ to resolve the runner group membership against the target repository.
27
+
28
+ The result: the queued job never receives a `runner_group_id` assignment
29
+ (confirmed via `GET /repos/{owner}/{repo}/actions/runs/{run_id}/jobs` API, which
30
+ returns `runner_id: null`, `runner_name: null`, `runner_group_id: null` throughout the
31
+ entire queue duration). The runner is online and idle throughout — it simply is
32
+ never offered the job.
33
+
34
+ From the UI perspective, the job shows the standard "Waiting for a runner to pick
35
+ up this job" message with no indication that the runner group resolution failed.
36
+ No error annotation is ever produced — the workflow just hangs until cancelled.
37
+
38
+ This failure mode has been observed under these conditions:
39
+ - Runner registered at organization scope
40
+ - Runner group has explicit per-repository access (not "All repositories")
41
+ - Runner version 2.334.0, V2 broker protocol enabled
42
+ - GitHub Team plan
43
+ - GitHub Enterprise Cloud is NOT required to reproduce
44
+
45
+ Repository-level runners using identical labels and configuration dispatch
46
+ correctly, confirming the bug is specific to the org-level + runner-group + V2
47
+ broker path.
48
+
49
+ The precise broker-side failure point is unknown; the hypothesis is an inconsistency
50
+ in how V2 broker resolves org runner group → repository access grant at dispatch time.
51
+ fix: |
52
+ Immediate workaround: re-register the runner at the repository level instead of
53
+ the organization level.
54
+
55
+ 1. Stop and unregister the org-level runner:
56
+ cd <runner-dir> && ./config.sh remove --token <removal-token>
57
+
58
+ 2. Re-register at repository level:
59
+ ./config.sh --url https://github.com/<org>/<repo> --token <repo-token>
60
+
61
+ 3. Trigger the workflow again — dispatch should proceed immediately.
62
+
63
+ If you need the runner to serve multiple repositories, repeat the registration
64
+ for each repository, or use runner groups with "All repositories" scope as an
65
+ interim workaround until GitHub resolves the V2 broker dispatch bug.
66
+
67
+ Track actions/runner#4429 for an official fix.
68
+ fix_code:
69
+ - language: bash
70
+ label: 'Re-register runner at repository level instead of org level'
71
+ code: |
72
+ cd /path/to/runner
73
+
74
+ # Remove org-level registration
75
+ ./config.sh remove --token <ORG_REMOVAL_TOKEN>
76
+
77
+ # Re-register at repository level
78
+ ./config.sh \
79
+ --url https://github.com/<org>/<repo> \
80
+ --token <REPO_RUNNER_TOKEN> \
81
+ --name my-runner \
82
+ --labels my-label \
83
+ --unattended
84
+
85
+ # Start the runner
86
+ ./run.sh
87
+
88
+ - language: yaml
89
+ label: 'Workflow — no workflow change needed; fix is in runner registration scope'
90
+ code: |
91
+ # No changes needed in the workflow YAML itself.
92
+ # The runs-on label works correctly once the runner is repo-scoped.
93
+ jobs:
94
+ build:
95
+ runs-on: [self-hosted, my-label]
96
+ steps:
97
+ - uses: actions/checkout@v4
98
+ - run: echo "Runner dispatched correctly"
99
+ prevention:
100
+ - 'Prefer repository-level runner registration for single-repo pipelines; use org-level runners only when the runner must serve many repos and test dispatch before rolling out.'
101
+ - 'After registering an org-level runner, verify dispatch with a simple echo workflow before building production pipelines on top of it.'
102
+ - 'Monitor actions/runner#4429 for a fix to the V2 broker org-runner-group dispatch resolution bug.'
103
+ - 'When investigating stuck jobs, use the REST API (GET /repos/{owner}/{repo}/actions/runs/{run_id}/jobs) to check if runner_group_id is null — this confirms dispatch resolution failure rather than a resource-wait.'
104
+ docs:
105
+ - url: 'https://github.com/actions/runner/issues/4429'
106
+ label: 'actions/runner#4429 — Org-level self-hosted runner never dispatched despite correct runner group repo access'
107
+ - url: 'https://docs.github.com/en/actions/hosting-your-own-runners/managing-self-hosted-runners/adding-self-hosted-runners'
108
+ label: 'GitHub Docs — Adding self-hosted runners (org vs repo level)'
109
+ - url: 'https://docs.github.com/en/rest/actions/self-hosted-runners'
110
+ label: 'GitHub REST API — Self-hosted runners'
@@ -0,0 +1,116 @@
1
+ id: silent-failures-114
2
+ title: 'upload-artifact@v7 archive:false uploads artifact using filename instead of name input — causes name conflict across jobs'
3
+ category: silent-failures
4
+ severity: silent-failure
5
+ tags:
6
+ - upload-artifact
7
+ - archive
8
+ - artifact-name
9
+ - v7
10
+ - matrix
11
+ - parallel-jobs
12
+ - name-conflict
13
+ patterns:
14
+ - regex: 'An artifact with this name already exists on the run'
15
+ flags: 'i'
16
+ - regex: 'Artifact name conflict.*archive.*false'
17
+ flags: 'i'
18
+ - regex: 'Failed to CreateArtifact.*already exists'
19
+ flags: 'i'
20
+ error_messages:
21
+ - 'An artifact with this name already exists on the run'
22
+ - 'Failed to CreateArtifact: artifact with name already exists'
23
+ root_cause: |
24
+ In `actions/upload-artifact@v7`, when `archive: false` is set, the action uploads
25
+ individual files as separate artifact entries rather than bundling them into a zip
26
+ archive. In this mode, a bug causes the action to use the **uploaded filename** as
27
+ the artifact name instead of respecting the `name:` input parameter.
28
+
29
+ For example, if two parallel jobs both upload `report.html` with `name: report` and
30
+ `archive: false`, the first job creates an artifact named `report.html` (not `report`).
31
+ The second job then tries to create another artifact named `report.html` and hits a
32
+ name collision error:
33
+ "An artifact with this name already exists on the run"
34
+
35
+ In the `archive: true` (default) mode, the `name:` input works correctly and the
36
+ artifact is a zip named after the `name:` value. The bug is specific to the
37
+ unarchived upload path introduced in v7.
38
+
39
+ This failure is partially silent because:
40
+ - The first job's upload succeeds (just with the wrong name)
41
+ - The error only manifests on the second+ job where the collision is detected
42
+ - The artifact appears in the Actions UI under the filename, not the user-provided name
43
+ - Users downloading by the `name:` value will find no artifact with that name
44
+
45
+ This bug was filed against v7.0.0 and remained unfixed at the time of writing.
46
+ fix: |
47
+ Option 1 — Remove archive:false and use the default archive:true behavior.
48
+ The name: input works correctly when archiving is enabled. This is the recommended
49
+ fix until the v7 archive:false bug is resolved.
50
+
51
+ Option 2 — Use unique name: values per job.
52
+ If archive:false is required (e.g., for browser-preview of HTML artifacts), give
53
+ each job a unique artifact name using the matrix value or job index to avoid the
54
+ filename-collision bug manifesting:
55
+ name: report-${{ matrix.os }}
56
+
57
+ Option 3 — Pin to upload-artifact@v6 or earlier.
58
+ The archive:false feature did not exist in v6; artifacts were always archived.
59
+
60
+ Track actions/upload-artifact#785 for a fix.
61
+ fix_code:
62
+ - language: yaml
63
+ label: 'Remove archive:false — use default archive:true (name: input works correctly)'
64
+ code: |
65
+ jobs:
66
+ lint:
67
+ runs-on: ubuntu-latest
68
+ steps:
69
+ - uses: actions/checkout@v4
70
+ - run: ./lint.sh > report.html
71
+ - uses: actions/upload-artifact@v7
72
+ with:
73
+ name: lint-report # works correctly with default archive: true
74
+ path: report.html
75
+ # archive: false <-- remove this line
76
+
77
+ test:
78
+ runs-on: ubuntu-latest
79
+ steps:
80
+ - uses: actions/checkout@v4
81
+ - run: ./test.sh > report.html
82
+ - uses: actions/upload-artifact@v7
83
+ with:
84
+ name: test-report # different name per job — no collision
85
+ path: report.html
86
+
87
+ - language: yaml
88
+ label: 'Workaround — unique name per job if archive:false is required'
89
+ code: |
90
+ jobs:
91
+ check:
92
+ strategy:
93
+ matrix:
94
+ component: [lint, test, typecheck]
95
+ runs-on: ubuntu-latest
96
+ steps:
97
+ - uses: actions/checkout@v4
98
+ - run: ./${{ matrix.component }}.sh > report.html
99
+ - uses: actions/upload-artifact@v7
100
+ with:
101
+ # Include matrix value in name to avoid the archive:false collision bug
102
+ name: report-${{ matrix.component }}
103
+ path: report.html
104
+ archive: false
105
+ prevention:
106
+ - 'When uploading the same filename from multiple parallel jobs, always use a unique name: per job (e.g., name: report-${{ matrix.component }}) regardless of the archive: setting.'
107
+ - 'Test artifact upload in a single-job workflow first to verify the artifact appears under the expected name before rolling out to parallel matrix jobs.'
108
+ - 'Monitor actions/upload-artifact#785 and the v7 changelog for a fix to the archive:false + name: input handling bug.'
109
+ - 'Default to archive:true (the default) unless you specifically need individual file browsing in the UI — the default mode has fewer edge-case bugs.'
110
+ docs:
111
+ - url: 'https://github.com/actions/upload-artifact/issues/785'
112
+ label: 'actions/upload-artifact#785 — archive:false does not respect artifact name'
113
+ - url: 'https://github.com/actions/upload-artifact/blob/main/README.md#inputs'
114
+ label: 'upload-artifact README — inputs (name, path, archive)'
115
+ - url: 'https://github.com/actions/upload-artifact/releases/tag/v7.0.0'
116
+ label: 'upload-artifact v7.0.0 release notes'
@@ -0,0 +1,130 @@
1
+ id: silent-failures-115
2
+ title: 'actions/cache save exits 0 and logs "Cache saved" despite 503 backend upload failures'
3
+ category: silent-failures
4
+ severity: silent-failure
5
+ tags:
6
+ - actions/cache
7
+ - cache-save
8
+ - '503'
9
+ - backend-error
10
+ - silent-success
11
+ - false-positive
12
+ - upload
13
+ patterns:
14
+ - regex: 'Cache service responded with 503'
15
+ flags: 'i'
16
+ - regex: 'uploadChunk .+ failed: Cache service responded with'
17
+ flags: 'i'
18
+ - regex: 'Warning: Failed to save: uploadChunk .+ failed'
19
+ flags: 'i'
20
+ error_messages:
21
+ - 'Warning: Failed to save: uploadChunk (start: 67108864, end: 100663295) failed: Cache service responded with 503'
22
+ - 'Cache saved with key: Linux-<run_id>-build'
23
+ root_cause: |
24
+ When `actions/cache` (or `actions/cache/save@v4`) uploads a cache, it
25
+ splits the archive into chunks and uploads them in parallel using the Azure
26
+ SDK `BlobClient`. If the backend returns HTTP 503 errors for individual
27
+ chunks, the action retries once but ultimately logs the failure as a
28
+ `Warning:` line and continues.
29
+
30
+ The critical flaw is that the action then calls `commitCache()` to finalize
31
+ the cache entry regardless of whether all chunks uploaded successfully.
32
+ The cache entry is committed to the Actions service database even though
33
+ the underlying blob storage is corrupt or incomplete.
34
+
35
+ This results in a silently false "cache saved" state:
36
+ - The action logs `Cache saved with key: <key>` (green success)
37
+ - The job exits with code 0 (no failure)
38
+ - BUT the stored cache entry is corrupt or truncated
39
+
40
+ On the next run, `actions/cache/restore` (or the inline `actions/cache`
41
+ restore phase) either fails with a decompression/extraction error or
42
+ silently falls back to "Cache not found" — depending on the extent of
43
+ corruption. The developer sees a cache miss on the restore side and
44
+ investigates there, never realizing the root cause was a silent save failure
45
+ several runs earlier.
46
+
47
+ This pattern is especially harmful when:
48
+ - The cache backend is temporarily degraded (503 storms during peak usage)
49
+ - The cache key includes the run_id, making the corrupt entry unique and
50
+ never overwritten by a subsequent run with the same key
51
+ - The project has long build times that the cache was supposed to skip
52
+ fix: |
53
+ 1. **Add a post-save validation step** that calls the GitHub Actions cache
54
+ REST API to confirm the cache entry was actually committed:
55
+
56
+ ```bash
57
+ gh api "/repos/${{ github.repository }}/actions/caches?key=${{ steps.cache.outputs.cache-primary-key }}" \
58
+ | jq -e '.actions_caches | length > 0' || { echo "Cache save failed — no entry in API"; exit 1; }
59
+ ```
60
+
61
+ 2. **Use `save-always: false` (the default)** — only invoke `actions/cache`
62
+ for save when you know the prior job succeeded. If you use the `save`
63
+ sub-action with `save-always: true`, be aware that backend failures are
64
+ swallowed.
65
+
66
+ 3. **Separate save from restore** using the `actions/cache/save` and
67
+ `actions/cache/restore` sub-actions so you can add explicit error
68
+ handling around the save step via `continue-on-error: false`.
69
+
70
+ 4. **Monitor for the upstream fix** in actions/cache#1416. Once the action
71
+ propagates chunk upload failures to the overall exit code, the job will
72
+ correctly fail on corrupt saves.
73
+ fix_code:
74
+ - language: yaml
75
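+ label: 'Validate cache was committed after save using the REST API (actions/cache saves in its post-job step, so run the check in a downstream job or after an explicit actions/cache/save step)'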
76
+ code: |
77
+ - name: Cache node modules
78
+ id: cache
79
+ uses: actions/cache@v4
80
+ with:
81
+ path: ~/.npm
82
+ key: ${{ runner.os }}-${{ hashFiles('**/package-lock.json') }}
83
+
84
+ - name: Verify cache entry was committed
85
+ if: steps.cache.outputs.cache-hit != 'true'
86
+ env:
87
+ GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
88
+ run: |
89
+ KEY="${{ runner.os }}-${{ hashFiles('**/package-lock.json') }}"
90
+ COUNT=$(gh api \
91
+ "/repos/${{ github.repository }}/actions/caches?key=${KEY}" \
92
+ --jq '.actions_caches | length')
93
+ if [ "$COUNT" -eq 0 ]; then
94
+ echo "::error::Cache save reported success but no entry found in API. Backend may have returned 503."
95
+ exit 1
96
+ fi
97
+ - language: yaml
98
+ label: 'Separate save from restore to add explicit error handling'
99
+ code: |
100
+ # In your build job:
101
+ - name: Restore cache
102
+ id: cache-restore
103
+ uses: actions/cache/restore@v4
104
+ with:
105
+ path: ~/.npm
106
+ key: ${{ runner.os }}-${{ hashFiles('**/package-lock.json') }}
107
+
108
+ - run: npm ci
109
+
110
+ - name: Save cache
111
+ if: steps.cache-restore.outputs.cache-hit != 'true'
112
+ uses: actions/cache/save@v4
113
+ with:
114
+ path: ~/.npm
115
+ key: ${{ runner.os }}-${{ hashFiles('**/package-lock.json') }}
116
+ # NOTE: If this step shows "Warning: Failed to save: uploadChunk..."
117
+ # but still exits 0, the cache is corrupt. Add the REST API validation
118
+ # step above to catch this scenario.
119
+ prevention:
120
+ - 'Watch for "Warning: Failed to save: uploadChunk ... failed: Cache service responded with 503" in your workflow logs — this indicates a silent corrupt save even though the step shows green.'
121
+ - 'If you see intermittent cache misses that cannot be explained by key changes, check whether the previous save step logged any 503 chunk-upload warnings.'
122
+ - 'Use time-bounded cache keys (e.g. including the week number) so a corrupt entry from a bad save is superseded by a fresh entry under a new key rather than being restored indefinitely — cache entries are immutable and cannot be overwritten in place.'
123
+ - 'For critical caches that must succeed, add a post-save REST API validation step to fail fast when the backend silently corrupted the save.'
124
+ docs:
125
+ - url: 'https://github.com/actions/cache/issues/1416'
126
+ label: 'actions/cache#1416 — actions/cache and actions/cache/save consider cache uploaded successfully even with backend errors'
127
+ - url: 'https://docs.github.com/en/rest/actions/cache?apiVersion=2022-11-28#list-github-actions-caches-for-a-repository'
128
+ label: 'GitHub REST API — List GitHub Actions caches for a repository'
129
+ - url: 'https://github.com/actions/cache/blob/main/tips-and-workarounds.md'
130
+ label: 'actions/cache — Tips and Workarounds'