@htekdev/actions-debugger 1.0.110 → 1.0.112
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/errors/caching-artifacts/buildkit-gha-cache-legacy-api-v1-self-hosted.yml +123 -0
- package/errors/concurrency-timing/merge-group-pr-number-concurrency-null-collapse.yml +85 -0
- package/errors/concurrency-timing/push-schedule-shared-concurrency-silent-cancel.yml +84 -0
- package/errors/known-unsolved/ephemeral-jit-runner-lost-communication-false-positive.yml +93 -0
- package/errors/known-unsolved/vars-context-reusable-workflow-reflects-caller-repo.yml +91 -0
- package/errors/permissions-auth/security-events-write-required-for-sarif-upload.yml +104 -0
- package/errors/runner-environment/runner-environment-180.yml +132 -0
- package/errors/runner-environment/runner-environment-181.yml +114 -0
- package/errors/runner-environment/runner-registration-token-502-burst.yml +102 -0
- package/errors/runner-environment/setup-java-sudo-strips-java-home-env-reset.yml +111 -0
- package/errors/runner-environment/setup-python-free-threaded-arm64-broken-symlink.yml +98 -0
- package/errors/runner-environment/sparse-checkout-cone-mode-not-honored-git-pre-237.yml +121 -0
- package/errors/silent-failures/matrix-array-in-object-serialized-as-Array-string.yml +127 -0
- package/errors/silent-failures/silent-failures-099.yml +104 -0
- package/errors/silent-failures/sparse-checkout-non-cone-gitignore-depth-extra-paths.yml +96 -0
- package/errors/silent-failures/upload-download-artifact-silent-failure-windows-heap-corruption.yml +90 -0
- package/package.json +1 -1
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
id: 'caching-artifacts-065'
|
|
2
|
+
title: 'Docker BuildKit type=gha cache fails on self-hosted runners with Buildx < 0.21.0 or BuildKit < 0.20.0 after legacy cache service v1 shutdown'
|
|
3
|
+
category: 'caching-artifacts'
|
|
4
|
+
severity: 'error'
|
|
5
|
+
tags:
|
|
6
|
+
- docker
|
|
7
|
+
- buildkit
|
|
8
|
+
- buildx
|
|
9
|
+
- gha-cache
|
|
10
|
+
- self-hosted
|
|
11
|
+
- cache-api-v2
|
|
12
|
+
- legacy-service
|
|
13
|
+
patterns:
|
|
14
|
+
- regex: 'This legacy service is shutting down'
|
|
15
|
+
flags: 'i'
|
|
16
|
+
- regex: 'failed to solve.*legacy.*service'
|
|
17
|
+
flags: 'i'
|
|
18
|
+
- regex: 'gha-cache-sunset'
|
|
19
|
+
flags: 'i'
|
|
20
|
+
- regex: 'cache.*type=gha.*legacy'
|
|
21
|
+
flags: 'i'
|
|
22
|
+
error_messages:
|
|
23
|
+
- "ERROR: failed to solve: This legacy service is shutting down, effective April 15, 2025. Migrate to the new service ASAP. For more information: https://gh.io/gha-cache-sunset"
|
|
24
|
+
- "This legacy service is shutting down, effective April 15, 2025."
|
|
25
|
+
- "importing cache manifest from gha"
|
|
26
|
+
- "exporting to GitHub Actions Cache"
|
|
27
|
+
root_cause: |
|
|
28
|
+
GitHub decommissioned the legacy GitHub Actions Cache service v1 API on
|
|
29
|
+
April 15, 2025. All cache operations now require the new v2 API.
|
|
30
|
+
|
|
31
|
+
Docker BuildKit uses a separate Go-based GitHub Actions cache client
|
|
32
|
+
(`tonistiigi/go-actions-cache`) — distinct from the `@actions/cache` npm
|
|
33
|
+
package used by `actions/cache`. The `go-actions-cache` library added support
|
|
34
|
+
for cache API v2 in BuildKit 0.20.0 and Buildx 0.21.0.
|
|
35
|
+
|
|
36
|
+
Self-hosted runners that have not been updated since before the migration
|
|
37
|
+
still ship Docker Engine with an older BuildKit version (< 0.20.0) or an
|
|
38
|
+
older Docker Buildx binary (< 0.21.0). When workflows on these runners use
|
|
39
|
+
`cache-from: type=gha` or `cache-to: type=gha`, the old BuildKit communicates
|
|
40
|
+
with the now-decommissioned v1 API endpoint and receives the shutdown error:
|
|
41
|
+
|
|
42
|
+
ERROR: failed to solve: This legacy service is shutting down, effective
|
|
43
|
+
April 15, 2025. Migrate to the new service ASAP.
|
|
44
|
+
|
|
45
|
+
GitHub-hosted runners (ubuntu-latest, etc.) are automatically up to date and
|
|
46
|
+
are not affected. Only self-hosted runners or custom BuildKit setups that
|
|
47
|
+
pin an old Buildx/BuildKit version are impacted.
|
|
48
|
+
|
|
49
|
+
Note: this is distinct from `actions/cache` action failing due to stale
|
|
50
|
+
ACTIONS_CACHE_URL overrides (caching-artifacts-031), which is a different
|
|
51
|
+
failure mode involving the npm-based cache client.
|
|
52
|
+
fix: |
|
|
53
|
+
Option 1 (recommended): Add docker/setup-buildx-action (version: latest) to install the latest
|
|
54
|
+
Buildx on the self-hosted runner before each build. This ensures Buildx >=
|
|
55
|
+
0.21.0 is always used regardless of the runner's system Buildx version.
|
|
56
|
+
|
|
57
|
+
Option 2: Update the self-hosted runner's Docker Engine to >= 28.0.0 and
|
|
58
|
+
Buildx to >= 0.21.0 at the system level.
|
|
59
|
+
|
|
60
|
+
Option 3: Switch to registry-based cache (type=registry) if upgrading Buildx
|
|
61
|
+
is not feasible. GHCR registry cache is not subject to the API version
|
|
62
|
+
requirement and is free for public repositories.
|
|
63
|
+
|
|
64
|
+
Minimum versions required for cache API v2:
|
|
65
|
+
- Docker Buildx >= 0.21.0
|
|
66
|
+
- BuildKit >= 0.20.0
|
|
67
|
+
- Docker Compose >= 2.33.1
|
|
68
|
+
- Docker Engine >= 28.0.0 (when using containerd image store)
|
|
69
|
+
fix_code:
|
|
70
|
+
- language: yaml
|
|
71
|
+
label: 'Use setup-buildx-action to install latest Buildx on self-hosted runner'
|
|
72
|
+
code: |
|
|
73
|
+
jobs:
|
|
74
|
+
build:
|
|
75
|
+
runs-on: [self-hosted, linux, x64]
|
|
76
|
+
steps:
|
|
77
|
+
- uses: actions/checkout@v4
|
|
78
|
+
|
|
79
|
+
# Always install the latest Buildx — ensures >= 0.21.0 for cache API v2
|
|
80
|
+
- name: Set up Docker Buildx
|
|
81
|
+
uses: docker/setup-buildx-action@v3
|
|
82
|
+
with:
|
|
83
|
+
version: latest # Overrides the runner's system Buildx version
|
|
84
|
+
|
|
85
|
+
- name: Build and push
|
|
86
|
+
uses: docker/build-push-action@v7
|
|
87
|
+
with:
|
|
88
|
+
push: ${{ github.ref == 'refs/heads/main' }}
|
|
89
|
+
tags: ghcr.io/${{ github.repository }}:latest
|
|
90
|
+
cache-from: type=gha
|
|
91
|
+
cache-to: type=gha,mode=max
|
|
92
|
+
- language: yaml
|
|
93
|
+
label: 'Fallback: use GHCR registry cache instead of type=gha'
|
|
94
|
+
code: |
|
|
95
|
+
- name: Log in to GHCR
|
|
96
|
+
uses: docker/login-action@v3
|
|
97
|
+
with:
|
|
98
|
+
registry: ghcr.io
|
|
99
|
+
username: ${{ github.actor }}
|
|
100
|
+
password: ${{ secrets.GITHUB_TOKEN }}
|
|
101
|
+
|
|
102
|
+
- name: Build and push (registry cache — no API version dependency)
|
|
103
|
+
uses: docker/build-push-action@v7
|
|
104
|
+
with:
|
|
105
|
+
push: ${{ github.ref == 'refs/heads/main' }}
|
|
106
|
+
tags: ghcr.io/${{ github.repository }}:latest
|
|
107
|
+
cache-from: type=registry,ref=ghcr.io/${{ github.repository }}:buildcache
|
|
108
|
+
cache-to: type=registry,ref=ghcr.io/${{ github.repository }}:buildcache,mode=max
|
|
109
|
+
prevention:
|
|
110
|
+
- "Always add docker/setup-buildx-action with 'version: latest' before any Docker build step on self-hosted runners."
|
|
111
|
+
- "Periodically update self-hosted runner Docker Engine and Buildx to latest stable versions."
|
|
112
|
+
- "Subscribe to the GitHub Changelog (github.blog/changelog) for advance notice of cache service API changes."
|
|
113
|
+
- "Verify Buildx version in CI: add 'docker buildx version' as a diagnostic step to catch version drift early."
|
|
114
|
+
- "Consider using registry cache (type=registry) as a more portable alternative to type=gha on self-hosted runners."
|
|
115
|
+
docs:
|
|
116
|
+
- url: 'https://github.com/docker/build-push-action/issues/1345'
|
|
117
|
+
label: 'docker/build-push-action#1345 — type=gha cache failure after legacy service shutdown (Apr 2025)'
|
|
118
|
+
- url: 'https://docs.docker.com/build/ci/github-actions/cache/#github-cache'
|
|
119
|
+
label: 'Docker Docs — GitHub Actions cache backend (note: legacy v1 API shut down April 15, 2025)'
|
|
120
|
+
- url: 'https://github.com/docker/setup-buildx-action'
|
|
121
|
+
label: 'docker/setup-buildx-action — Install latest Buildx on any runner'
|
|
122
|
+
- url: 'https://gh.io/gha-cache-sunset'
|
|
123
|
+
label: 'GitHub — GHA cache service v1 sunset announcement'
|
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
id: concurrency-timing-051
|
|
2
|
+
title: 'merge_group event has no pull_request.number — PR-number-keyed concurrency group collapses all merge queue runs to the same slot'
|
|
3
|
+
category: concurrency-timing
|
|
4
|
+
severity: silent-failure
|
|
5
|
+
tags:
|
|
6
|
+
- concurrency
|
|
7
|
+
- merge-queue
|
|
8
|
+
- merge_group
|
|
9
|
+
- pull_request
|
|
10
|
+
- cancel-in-progress
|
|
11
|
+
- null-key
|
|
12
|
+
patterns:
|
|
13
|
+
- regex: 'merge_group'
|
|
14
|
+
flags: 'i'
|
|
15
|
+
- regex: 'pull_request\.number'
|
|
16
|
+
flags: 'i'
|
|
17
|
+
- regex: 'Canceling since a higher priority waiting run was found'
|
|
18
|
+
flags: 'i'
|
|
19
|
+
error_messages:
|
|
20
|
+
- 'This run was automatically cancelled'
|
|
21
|
+
- 'Canceling since a higher priority waiting run was found'
|
|
22
|
+
- 'Required status checks did not pass'
|
|
23
|
+
root_cause: |
|
|
24
|
+
The merge_group event payload does not include a pull_request object. When a workflow
|
|
25
|
+
uses ${{ github.event.pull_request.number }} in a concurrency group key — a common
|
|
26
|
+
pattern to scope CI runs per open PR — the expression evaluates to an empty string
|
|
27
|
+
for all merge_group-triggered runs.
|
|
28
|
+
|
|
29
|
+
All merge queue validation runs therefore share an identical concurrency group key
|
|
30
|
+
(e.g., ci- instead of ci-42). With cancel-in-progress: true, each new merge queue
|
|
31
|
+
batch immediately cancels the currently running validation for all other batches in
|
|
32
|
+
the queue. With cancel-in-progress: false, only the most-recently-queued batch runs;
|
|
33
|
+
earlier batches are ejected from the merge queue with 'Required status checks did
|
|
34
|
+
not pass'.
|
|
35
|
+
|
|
36
|
+
This silently serializes or destroys merge queue throughput and commonly occurs when
|
|
37
|
+
developers add merge_group to an existing pull_request workflow that already uses
|
|
38
|
+
PR-number-based concurrency without updating the group expression.
|
|
39
|
+
fix: |
|
|
40
|
+
Use a conditional expression that resolves to a unique key for both event types.
|
|
41
|
+
For merge_group events, github.event.merge_group.head_sha provides a value that is
|
|
42
|
+
unique per merge queue batch:
|
|
43
|
+
|
|
44
|
+
group: ci-${{ github.event.pull_request.number || github.event.merge_group.head_sha }}
|
|
45
|
+
fix_code:
|
|
46
|
+
- language: yaml
|
|
47
|
+
label: 'Broken: merge_group runs collapse to ci- (empty PR number)'
|
|
48
|
+
code: |
|
|
49
|
+
on:
|
|
50
|
+
pull_request:
|
|
51
|
+
merge_group:
|
|
52
|
+
|
|
53
|
+
concurrency:
|
|
54
|
+
group: ci-${{ github.event.pull_request.number }}
|
|
55
|
+
cancel-in-progress: true
|
|
56
|
+
- language: yaml
|
|
57
|
+
label: 'Fixed: fallback to merge_group.head_sha for merge queue runs'
|
|
58
|
+
code: |
|
|
59
|
+
on:
|
|
60
|
+
pull_request:
|
|
61
|
+
merge_group:
|
|
62
|
+
|
|
63
|
+
concurrency:
|
|
64
|
+
# pull_request: keyed by PR number (e.g., ci-42)
|
|
65
|
+
# merge_group: keyed by batch SHA (unique per merge queue batch)
|
|
66
|
+
group: ci-${{ github.event.pull_request.number || github.event.merge_group.head_sha }}
|
|
67
|
+
cancel-in-progress: true
|
|
68
|
+
|
|
69
|
+
jobs:
|
|
70
|
+
test:
|
|
71
|
+
runs-on: ubuntu-latest
|
|
72
|
+
steps:
|
|
73
|
+
- uses: actions/checkout@v4
|
|
74
|
+
- run: make test
|
|
75
|
+
prevention:
|
|
76
|
+
- 'When adding merge_group to an existing pull_request workflow, audit every concurrency group expression for github.event.pull_request.* references — all are null on merge_group events'
|
|
77
|
+
- 'Use github.event.merge_group.head_sha as the fallback identifier for merge queue runs; it is unique per batch'
|
|
78
|
+
- 'Verify the resolved concurrency key by adding a debug step that prints the evaluated expression for each event type'
|
|
79
|
+
docs:
|
|
80
|
+
- url: 'https://docs.github.com/en/actions/writing-workflows/choosing-when-your-workflow-runs/events-that-trigger-workflows#merge_group'
|
|
81
|
+
label: 'GitHub Docs: merge_group event'
|
|
82
|
+
- url: 'https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/using-concurrency'
|
|
83
|
+
label: 'GitHub Docs: Using concurrency'
|
|
84
|
+
- url: 'https://docs.github.com/en/repositories/configuring-branches-and-merges-in-your-repository/configuring-pull-request-merges/managing-a-merge-queue'
|
|
85
|
+
label: 'GitHub Docs: Managing a merge queue'
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
id: concurrency-timing-050
|
|
2
|
+
title: 'push and schedule sharing a concurrency group silently cancel each other on the default branch'
|
|
3
|
+
category: concurrency-timing
|
|
4
|
+
severity: silent-failure
|
|
5
|
+
tags:
|
|
6
|
+
- concurrency
|
|
7
|
+
- push
|
|
8
|
+
- schedule
|
|
9
|
+
- cancel-in-progress
|
|
10
|
+
- nightly-build
|
|
11
|
+
- silent-failure
|
|
12
|
+
patterns:
|
|
13
|
+
- regex: 'Canceling since a higher priority waiting run was found'
|
|
14
|
+
flags: 'i'
|
|
15
|
+
- regex: 'This run was automatically cancelled'
|
|
16
|
+
flags: 'i'
|
|
17
|
+
error_messages:
|
|
18
|
+
- 'This run was automatically cancelled'
|
|
19
|
+
- 'Canceling since a higher priority waiting run was found'
|
|
20
|
+
root_cause: |
|
|
21
|
+
When a workflow responds to both on.push and on.schedule, and uses a concurrency
|
|
22
|
+
group key that does not include github.event_name, all push and schedule runs share
|
|
23
|
+
the same concurrency slot.
|
|
24
|
+
|
|
25
|
+
Both a push to the default branch and a scheduled run targeting the default branch
|
|
26
|
+
evaluate github.ref to refs/heads/main (or the configured default branch name).
|
|
27
|
+
A concurrency group of ${{ github.workflow }}-${{ github.ref }} therefore produces
|
|
28
|
+
an identical key for both event types.
|
|
29
|
+
|
|
30
|
+
With cancel-in-progress: true, any push commit that lands while a scheduled run
|
|
31
|
+
(nightly build, report, data sync) is executing immediately cancels that run with
|
|
32
|
+
no error notification. With cancel-in-progress: false, the one pending-run slot is
|
|
33
|
+
taken by the push run, silently displacing any queued scheduled run.
|
|
34
|
+
|
|
35
|
+
This is distinct from the workflow_dispatch + schedule pattern (concurrency-timing-047)
|
|
36
|
+
because push events are emitted automatically on every commit and are far more
|
|
37
|
+
frequent than manual dispatches, making accidental cancellation of scheduled runs a
|
|
38
|
+
routine occurrence rather than an occasional manual event.
|
|
39
|
+
fix: |
|
|
40
|
+
Include github.event_name in the concurrency group key so that push runs and
|
|
41
|
+
schedule runs each occupy a separate, independent slot:
|
|
42
|
+
|
|
43
|
+
group: ${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}
|
|
44
|
+
fix_code:
|
|
45
|
+
- language: yaml
|
|
46
|
+
label: 'Broken: push and schedule share one slot'
|
|
47
|
+
code: |
|
|
48
|
+
on:
|
|
49
|
+
push:
|
|
50
|
+
branches: [main]
|
|
51
|
+
schedule:
|
|
52
|
+
- cron: '0 2 * * *'
|
|
53
|
+
|
|
54
|
+
concurrency:
|
|
55
|
+
group: ${{ github.workflow }}-${{ github.ref }}
|
|
56
|
+
cancel-in-progress: true
|
|
57
|
+
- language: yaml
|
|
58
|
+
label: 'Fixed: include github.event_name to isolate push and schedule concurrency slots'
|
|
59
|
+
code: |
|
|
60
|
+
on:
|
|
61
|
+
push:
|
|
62
|
+
branches: [main]
|
|
63
|
+
schedule:
|
|
64
|
+
- cron: '0 2 * * *'
|
|
65
|
+
|
|
66
|
+
concurrency:
|
|
67
|
+
group: ${{ github.workflow }}-${{ github.event_name }}-${{ github.ref }}
|
|
68
|
+
cancel-in-progress: true
|
|
69
|
+
|
|
70
|
+
jobs:
|
|
71
|
+
ci:
|
|
72
|
+
runs-on: ubuntu-latest
|
|
73
|
+
steps:
|
|
74
|
+
- uses: actions/checkout@v4
|
|
75
|
+
- run: make test
|
|
76
|
+
prevention:
|
|
77
|
+
- 'Always include ${{ github.event_name }} in the concurrency group key for workflows triggered by both event-driven (push/pull_request) and scheduled events'
|
|
78
|
+
- 'After adding on.schedule to an existing push-triggered workflow, audit the concurrency group key to confirm scheduled runs receive their own independent slot'
|
|
79
|
+
- 'Monitor the Actions tab for unexpected gaps in scheduled run history — silent cancellations leave no failed run, only a missing run for that time slot'
|
|
80
|
+
docs:
|
|
81
|
+
- url: 'https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/using-concurrency'
|
|
82
|
+
label: 'GitHub Docs: Using concurrency'
|
|
83
|
+
- url: 'https://docs.github.com/en/actions/writing-workflows/choosing-when-your-workflow-runs/events-that-trigger-workflows#schedule'
|
|
84
|
+
label: 'GitHub Docs: schedule event'
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
id: known-unsolved-058
|
|
2
|
+
title: 'Ephemeral JIT Runner Reports "Lost Communication" Despite Successful Job Completion'
|
|
3
|
+
category: known-unsolved
|
|
4
|
+
severity: error
|
|
5
|
+
tags:
|
|
6
|
+
- self-hosted
|
|
7
|
+
- ephemeral
|
|
8
|
+
- jit-runner
|
|
9
|
+
- lost-communication
|
|
10
|
+
- false-positive
|
|
11
|
+
- broker
|
|
12
|
+
patterns:
|
|
13
|
+
- regex: 'The self-hosted runner lost communication with the server'
|
|
14
|
+
flags: 'i'
|
|
15
|
+
- regex: 'messageQueueLoopTokenSource|Stop message queue looping'
|
|
16
|
+
flags: 'i'
|
|
17
|
+
error_messages:
|
|
18
|
+
- 'The self-hosted runner lost communication with the server'
|
|
19
|
+
- 'GET request to broker.actions.githubusercontent.com/message ... has been cancelled'
|
|
20
|
+
- 'TaskCanceledException: The operation was canceled'
|
|
21
|
+
root_cause: |
|
|
22
|
+
In Runner.cs line 576, after a one-time-use (ephemeral/JIT) job completes, the message queue
|
|
23
|
+
is cancelled immediately with zero grace period:
|
|
24
|
+
|
|
25
|
+
messageQueueLoopTokenSource.Cancel();
|
|
26
|
+
|
|
27
|
+
This tears down the in-flight broker long-poll (GET broker.actions.githubusercontent.com/message)
|
|
28
|
+
immediately after CompleteJobAsync returns. GitHub's broker health monitor detects the TCP
|
|
29
|
+
disconnect and flags the runner as "lost communication" — racing against the pipeline service
|
|
30
|
+
that just received the successful completion.
|
|
31
|
+
|
|
32
|
+
Two independent GitHub backend systems race:
|
|
33
|
+
1. Pipeline service — received CompleteJobAsync, knows the job succeeded
|
|
34
|
+
2. Broker health monitor — sees TCP disconnect, flags runner as "lost communication"
|
|
35
|
+
|
|
36
|
+
When the broker monitor wins (which happens on ~5-10% of short jobs), the UI shows
|
|
37
|
+
"The self-hosted runner lost communication with the server" even though the worker exited
|
|
38
|
+
with code 100 (success) and the runner itself exited with return code 0.
|
|
39
|
+
|
|
40
|
+
The runner logs show all events at identical timestamps — zero delay between
|
|
41
|
+
"Received job status event. JobState: Online" and "messageQueueLoopTokenSource.Cancel()".
|
|
42
|
+
|
|
43
|
+
No user-side configuration can fix this. A code change to Runner.cs is required
|
|
44
|
+
(add ~5s grace delay before cancel). Source: actions/runner#4309 (March 2026, open bug,
|
|
45
|
+
affects runner v2.331.0+, ~5-10% failure rate on short ephemeral jobs).
|
|
46
|
+
fix: |
|
|
47
|
+
There is no user-side configuration fix. The root cause is a zero-grace-period teardown
|
|
48
|
+
in Runner.cs that must be patched by GitHub.
|
|
49
|
+
|
|
50
|
+
Mitigations:
|
|
51
|
+
1. Verify the diagnostic logs — the job actually succeeded. Check _diag/Runner_*.log
|
|
52
|
+
for "return code 0" and "result: Succeeded" to confirm before concluding failure.
|
|
53
|
+
2. Retry the failed job — the "lost communication" is a false positive; re-running
|
|
54
|
+
produces a clean success.
|
|
55
|
+
3. Do not use ephemeral JIT runners as required status checks until actions/runner#4309
|
|
56
|
+
is resolved, or implement a workflow-level retry wrapper.
|
|
57
|
+
4. Alert on job result=="failure" not on "lost communication" text alone — false positives
|
|
58
|
+
from this race should not page on-call engineers.
|
|
59
|
+
5. Batch small jobs — the race is more likely on jobs shorter than 60 seconds. Combining
|
|
60
|
+
work into longer-running jobs reduces false positive frequency.
|
|
61
|
+
fix_code:
|
|
62
|
+
- language: yaml
|
|
63
|
+
label: 'Confirm false positive by reading runner diagnostic log before retrying'
|
|
64
|
+
code: |
|
|
65
|
+
# After "lost communication" on an ephemeral JIT runner, check:
|
|
66
|
+
# _diag/Runner_YYYYMMDD-hhmmss-utc.log
|
|
67
|
+
#
|
|
68
|
+
# Successful completion signs (all at identical timestamps):
|
|
69
|
+
# [INFO] finish job request for job {id} with result: Succeeded
|
|
70
|
+
# [INFO] Job X completed with result: Succeeded
|
|
71
|
+
# [INFO] Received job status event. JobState: Online
|
|
72
|
+
# [INFO] Runner execution has finished with return code 0
|
|
73
|
+
#
|
|
74
|
+
# If these appear immediately before TaskCanceledException, the job DID succeed.
|
|
75
|
+
# Simply re-run the workflow — the retry will show a clean green result.
|
|
76
|
+
jobs:
|
|
77
|
+
build:
|
|
78
|
+
runs-on: [self-hosted, ephemeral, linux]
|
|
79
|
+
# Note: retry-on-failure logic can wrap this job in the caller workflow.
|
|
80
|
+
steps:
|
|
81
|
+
- uses: actions/checkout@v4
|
|
82
|
+
- run: make build
|
|
83
|
+
prevention:
|
|
84
|
+
- 'Treat "lost communication" on ephemeral JIT runners as a potential false positive — check _diag/Runner_*.log for "return code 0" before escalating.'
|
|
85
|
+
- 'Do not gate required status checks on ephemeral JIT runners until actions/runner#4309 is resolved upstream.'
|
|
86
|
+
- 'Alert on job result=="failure", not on the "lost communication" message text alone.'
|
|
87
|
+
- 'Batch work into longer jobs (>60s total) to reduce the broker teardown race frequency.'
|
|
88
|
+
- 'Watch actions/runner#4309 for an upstream fix (proposed: 5-second grace period before messageQueueLoopTokenSource.Cancel()).'
|
|
89
|
+
docs:
|
|
90
|
+
- url: 'https://github.com/actions/runner/issues/4309'
|
|
91
|
+
label: 'actions/runner#4309: Ephemeral/JIT runner reports "lost communication" despite successful completion (March 2026)'
|
|
92
|
+
- url: 'https://docs.github.com/en/actions/hosting-your-own-runners/managing-self-hosted-runners/autoscaling-with-self-hosted-runners#using-just-in-time-runners'
|
|
93
|
+
label: 'GitHub Docs: Just-in-time (JIT) runners'
|
|
@@ -0,0 +1,91 @@
|
|
|
1
|
+
id: known-unsolved-059
|
|
2
|
+
title: 'vars context in reusable workflows reflects the CALLER repository''s variables — the called workflow repository''s own repository-level vars are inaccessible'
|
|
3
|
+
category: known-unsolved
|
|
4
|
+
severity: limitation
|
|
5
|
+
tags:
|
|
6
|
+
- vars
|
|
7
|
+
- reusable-workflow
|
|
8
|
+
- workflow_call
|
|
9
|
+
- variables
|
|
10
|
+
- configuration
|
|
11
|
+
- known-limitation
|
|
12
|
+
patterns:
|
|
13
|
+
- regex: 'vars\.[A-Z_]+'
|
|
14
|
+
flags: 'i'
|
|
15
|
+
- regex: 'workflow_call'
|
|
16
|
+
flags: 'i'
|
|
17
|
+
error_messages:
|
|
18
|
+
- '(no runtime error — vars context silently returns caller repository values instead of called workflow repository values)'
|
|
19
|
+
root_cause: |
|
|
20
|
+
GitHub Actions populates the vars context in a reusable workflow called via
|
|
21
|
+
workflow_call using the CALLER repository's organization-, repository-, and environment-level
|
|
22
|
+
variables — not those defined in the repository that hosts the reusable workflow.
|
|
23
|
+
|
|
24
|
+
This means repository-level variables set in the reusable workflow repository are
|
|
25
|
+
not accessible when the workflow executes in another repository context. The vars
|
|
26
|
+
context in the called workflow reflects only:
|
|
27
|
+
- Organization-level variables visible to the caller
|
|
28
|
+
- The CALLER repository-level variables
|
|
29
|
+
- Environment-scoped variables from environments the caller deploys to
|
|
30
|
+
|
|
31
|
+
Unlike secrets (which support secrets: inherit), there is no vars: inherit mechanism.
|
|
32
|
+
A shared reusable workflow library that stores its own configuration in
|
|
33
|
+
repository-level variables cannot access those values at runtime when called
|
|
34
|
+
from another repository.
|
|
35
|
+
|
|
36
|
+
The workflow runs without error but references wrong or empty variable values
|
|
37
|
+
silently, producing incorrect behavior that is difficult to diagnose because
|
|
38
|
+
the runner logs show no permission error.
|
|
39
|
+
fix: |
|
|
40
|
+
There is no built-in mechanism to make the called workflow repository's repository-level
|
|
41
|
+
variables available at runtime.
|
|
42
|
+
|
|
43
|
+
Available workarounds:
|
|
44
|
+
1. Promote shared configuration to organization-level variables — visible to
|
|
45
|
+
all member repositories and available in both caller and called contexts.
|
|
46
|
+
2. Pass required values as explicit workflow_call inputs from the caller:
|
|
47
|
+
with:
|
|
48
|
+
registry_url: ${{ vars.REGISTRY_URL }}
|
|
49
|
+
3. Store shared configuration in a committed file in the called workflow
|
|
50
|
+
repository and read it via a step after checkout.
|
|
51
|
+
fix_code:
|
|
52
|
+
- language: yaml
|
|
53
|
+
label: 'Workaround: pass configuration values as explicit workflow_call inputs'
|
|
54
|
+
code: |
|
|
55
|
+
# Caller repository workflow
|
|
56
|
+
jobs:
|
|
57
|
+
deploy:
|
|
58
|
+
uses: org/shared-workflows/.github/workflows/deploy.yml@main
|
|
59
|
+
with:
|
|
60
|
+
deploy_env: ${{ vars.DEPLOY_ENV }}
|
|
61
|
+
registry_url: ${{ vars.REGISTRY_URL }}
|
|
62
|
+
secrets: inherit
|
|
63
|
+
|
|
64
|
+
---
|
|
65
|
+
# Called reusable workflow (in org/shared-workflows repo)
|
|
66
|
+
on:
|
|
67
|
+
workflow_call:
|
|
68
|
+
inputs:
|
|
69
|
+
deploy_env:
|
|
70
|
+
type: string
|
|
71
|
+
required: true
|
|
72
|
+
registry_url:
|
|
73
|
+
type: string
|
|
74
|
+
required: true
|
|
75
|
+
jobs:
|
|
76
|
+
deploy:
|
|
77
|
+
runs-on: ubuntu-latest
|
|
78
|
+
steps:
|
|
79
|
+
- run: echo "Deploying to ${{ inputs.deploy_env }}"
|
|
80
|
+
prevention:
|
|
81
|
+
- 'Do not rely on repository-level vars in a shared reusable workflow library repo — they will not be available when the workflow runs in another repository context'
|
|
82
|
+
- 'Use organization-level variables for shared configuration that must be accessible from reusable workflows across all member repositories'
|
|
83
|
+
- 'Document all required configuration values as explicit workflow_call inputs with type and description so callers know exactly what to pass'
|
|
84
|
+
- 'Audit shared workflow libraries for vars. references that assume the called repo context — they will silently read caller repo values or return empty strings'
|
|
85
|
+
docs:
|
|
86
|
+
- url: 'https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/accessing-contextual-information-about-workflow-runs#vars-context'
|
|
87
|
+
label: 'GitHub Docs: vars context'
|
|
88
|
+
- url: 'https://docs.github.com/en/actions/sharing-automations/reusing-workflows#passing-inputs-and-secrets-to-a-reusable-workflow'
|
|
89
|
+
label: 'GitHub Docs: Passing inputs and secrets to a reusable workflow'
|
|
90
|
+
- url: 'https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/store-information-in-variables'
|
|
91
|
+
label: 'GitHub Docs: Store information in variables'
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
id: permissions-auth-063
|
|
2
|
+
title: 'security-events: write required for SARIF upload — missing permission causes Resource not accessible by integration'
|
|
3
|
+
category: permissions-auth
|
|
4
|
+
severity: error
|
|
5
|
+
tags:
|
|
6
|
+
- security-events
|
|
7
|
+
- sarif
|
|
8
|
+
- code-scanning
|
|
9
|
+
- permissions
|
|
10
|
+
- upload-sarif
|
|
11
|
+
- github-token
|
|
12
|
+
patterns:
|
|
13
|
+
- regex: 'Resource not accessible by integration'
|
|
14
|
+
flags: 'i'
|
|
15
|
+
- regex: 'HttpError: Resource not accessible by integration'
|
|
16
|
+
flags: 'i'
|
|
17
|
+
- regex: 'Code scanning is not enabled for this repository'
|
|
18
|
+
flags: 'i'
|
|
19
|
+
- regex: 'Advanced Security must be enabled for this repository to use code scanning'
|
|
20
|
+
flags: 'i'
|
|
21
|
+
error_messages:
|
|
22
|
+
- 'Resource not accessible by integration'
|
|
23
|
+
- 'HttpError: Resource not accessible by integration'
|
|
24
|
+
- 'Code scanning is not enabled for this repository'
|
|
25
|
+
- 'Advanced Security must be enabled for this repository to use code scanning'
|
|
26
|
+
root_cause: |
|
|
27
|
+
The github/codeql-action/upload-sarif action and third-party security scanning
|
|
28
|
+
actions (Trivy, Snyk, Semgrep, osv-scanner, etc.) submit SARIF results to the
|
|
29
|
+
GitHub code scanning API endpoint POST /repos/{owner}/{repo}/code-scanning/sarifs.
|
|
30
|
+
This endpoint requires the security-events: write permission on the GITHUB_TOKEN.
|
|
31
|
+
|
|
32
|
+
Since February 2023, newly created repositories default GITHUB_TOKEN to read-only
|
|
33
|
+
permissions. A workflow that calls upload-sarif without an explicit
|
|
34
|
+
security-events: write permission block will receive 403 Resource not accessible by
|
|
35
|
+
integration from the API and the scan results will not appear in the Security tab.
|
|
36
|
+
|
|
37
|
+
This error is commonly overlooked because:
|
|
38
|
+
1. The scanning step (CodeQL analyze, Trivy scan, Semgrep scan) often completes
|
|
39
|
+
successfully — the red X appears only on the upload step.
|
|
40
|
+
2. Example workflows published before the read-only-default policy may omit the
|
|
41
|
+
security-events: write permission line.
|
|
42
|
+
3. For fork pull requests, the pull_request trigger cannot write security-events
|
|
43
|
+
even with the permission declared — a workflow_run split pattern is required.
|
|
44
|
+
fix: |
|
|
45
|
+
Add security-events: write to the job permissions block. Include contents: read
|
|
46
|
+
for checkout and actions: read if the workflow uses CodeQL (which needs to read
|
|
47
|
+
workflow definitions).
|
|
48
|
+
fix_code:
|
|
49
|
+
- language: yaml
|
|
50
|
+
label: 'Add security-events: write for SARIF upload'
|
|
51
|
+
code: |
|
|
52
|
+
jobs:
|
|
53
|
+
scan:
|
|
54
|
+
runs-on: ubuntu-latest
|
|
55
|
+
permissions:
|
|
56
|
+
security-events: write # required for upload-sarif
|
|
57
|
+
contents: read # required for checkout
|
|
58
|
+
actions: read # required by CodeQL to read workflow info
|
|
59
|
+
steps:
|
|
60
|
+
- uses: actions/checkout@v4
|
|
61
|
+
- name: Run Trivy vulnerability scanner
|
|
62
|
+
uses: aquasecurity/trivy-action@master
|
|
63
|
+
with:
|
|
64
|
+
scan-type: 'fs'
|
|
65
|
+
format: 'sarif'
|
|
66
|
+
output: 'trivy-results.sarif'
|
|
67
|
+
- name: Upload Trivy results to GitHub Security tab
|
|
68
|
+
uses: github/codeql-action/upload-sarif@v3
|
|
69
|
+
with:
|
|
70
|
+
sarif_file: 'trivy-results.sarif'
|
|
71
|
+
- language: yaml
|
|
72
|
+
label: 'Fork PRs: upload SARIF in a workflow_run triggered workflow (has write access)'
|
|
73
|
+
code: |
|
|
74
|
+
# Scanning step runs in pull_request context (no write access from fork)
|
|
75
|
+
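# Upload step runs in workflow_run context (has security-events: write). NOTE: download-artifact@v4 with run-id also needs a github-token input and actions: read to fetch artifacts from another run.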
|
|
76
|
+
on:
|
|
77
|
+
workflow_run:
|
|
78
|
+
workflows: ['Security Scan']
|
|
79
|
+
types: [completed]
|
|
80
|
+
jobs:
|
|
81
|
+
upload-results:
|
|
82
|
+
runs-on: ubuntu-latest
|
|
83
|
+
permissions:
|
|
84
|
+
security-events: write
|
|
85
|
+
steps:
|
|
86
|
+
- name: Download SARIF artifact
|
|
87
|
+
uses: actions/download-artifact@v4
|
|
88
|
+
with:
|
|
89
|
+
run-id: ${{ github.event.workflow_run.id }}
|
|
90
|
+
name: sarif-results
|
|
91
|
+
- uses: github/codeql-action/upload-sarif@v3
|
|
92
|
+
with:
|
|
93
|
+
sarif_file: results.sarif
|
|
94
|
+
prevention:
|
|
95
|
+
- 'Add security-events: write to every job that calls github/codeql-action/upload-sarif or any third-party SARIF upload action'
|
|
96
|
+
- 'For fork pull request scanning, use the workflow_run + artifact pattern — the pull_request trigger cannot write security-events from fork contexts'
|
|
97
|
+
- 'Check the Security tab after a scan workflow runs — absence of scan results even though the scan step itself passed is the primary indicator of missing security-events permission'
|
|
98
|
+
docs:
|
|
99
|
+
- url: 'https://docs.github.com/en/code-security/code-scanning/integrating-with-code-scanning/uploading-a-sarif-file-to-github'
|
|
100
|
+
label: 'GitHub Docs: Uploading a SARIF file to GitHub'
|
|
101
|
+
- url: 'https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/assigning-permissions-to-jobs'
|
|
102
|
+
label: 'GitHub Docs: Assigning permissions to jobs'
|
|
103
|
+
- url: 'https://docs.github.com/en/code-security/code-scanning/troubleshooting-code-scanning/results-are-different-than-expected'
|
|
104
|
+
label: 'GitHub Docs: Code scanning troubleshooting — results are different than expected'
|