@htekdev/actions-debugger 1.0.117 → 1.0.119

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31) hide show
  1. package/errors/caching-artifacts/caching-artifacts-069.yml +133 -0
  2. package/errors/caching-artifacts/caching-artifacts-070.yml +94 -0
  3. package/errors/concurrency-timing/concurrency-timing-056.yml +127 -0
  4. package/errors/concurrency-timing/concurrency-timing-057.yml +115 -0
  5. package/errors/concurrency-timing/workflow-run-head-branch-null-schedule-dispatch-concurrency.yml +135 -0
  6. package/errors/known-unsolved/known-unsolved-067.yml +117 -0
  7. package/errors/known-unsolved/known-unsolved-068.yml +124 -0
  8. package/errors/known-unsolved/node-action-post-step-wrong-inputs-nested-composite.yml +133 -0
  9. package/errors/known-unsolved/ubuntu-24-04-arm64-missing-binder-ashmem-kernel-modules.yml +149 -0
  10. package/errors/permissions-auth/permissions-auth-069.yml +161 -0
  11. package/errors/runner-environment/arc-autoscalinglistener-ephemeralrunnerset-stale-after-upgrade.yml +134 -0
  12. package/errors/runner-environment/broker-server-socket-exception-nat-timeout-linux.yml +114 -0
  13. package/errors/runner-environment/runner-environment-210.yml +105 -0
  14. package/errors/runner-environment/runner-environment-213.yml +142 -0
  15. package/errors/runner-environment/runner-environment-214.yml +107 -0
  16. package/errors/runner-environment/runner-environment-215.yml +93 -0
  17. package/errors/runner-environment/runner-environment-216.yml +82 -0
  18. package/errors/runner-environment/runner-environment-217.yml +99 -0
  19. package/errors/runner-environment/runner-environment-218.yml +111 -0
  20. package/errors/runner-environment/ubuntu-24-man-db-dpkg-trigger-apt-install-stall.yml +94 -0
  21. package/errors/runner-environment/ubuntu-26-04-missing-preinstalled-tools.yml +178 -0
  22. package/errors/runner-environment/upload-artifact-v6-proxy-headers-leak-strict-proxy-fail.yml +101 -0
  23. package/errors/silent-failures/silent-failures-108.yml +108 -0
  24. package/errors/silent-failures/silent-failures-109.yml +119 -0
  25. package/errors/silent-failures/silent-failures-110.yml +91 -0
  26. package/errors/silent-failures/silent-failures-111.yml +107 -0
  27. package/errors/triggers/pull-request-labeled-fires-all-labels-no-name-filter.yml +110 -0
  28. package/errors/yaml-syntax/duplicate-step-id-within-job-scope-validation-error.yml +130 -0
  29. package/errors/yaml-syntax/yaml-syntax-072.yml +93 -0
  30. package/errors/yaml-syntax/yaml-syntax-073.yml +103 -0
  31. package/package.json +1 -1
@@ -0,0 +1,161 @@
1
+ id: permissions-auth-069
2
+ title: 'OIDC trust policy silently fails for repos missing required custom property claim — repository_property:* absent when property unset'
3
+ category: permissions-auth
4
+ severity: error
5
+ tags:
6
+ - oidc
7
+ - custom-properties
8
+ - trust-policy
9
+ - aws
10
+ - azure
11
+ - gcp
12
+ - repository-property
13
+ - april-2026
14
+ patterns:
15
+ - regex: 'Not authorized to perform sts:AssumeRoleWithWebIdentity'
16
+ flags: 'i'
17
+ - regex: 'AccessDenied.*AssumeRoleWithWebIdentity|WebIdentityErr.*AccessDenied'
18
+ flags: 'i'
19
+ - regex: 'Couldn''t retrieve OIDC token.*403|OIDC token.*invalid.*claim'
20
+ flags: 'i'
21
+ - regex: 'Error: Credentials could not be loaded.*OIDC'
22
+ flags: 'i'
23
+ error_messages:
24
+ - 'Error: Not authorized to perform sts:AssumeRoleWithWebIdentity'
25
+ - 'AccessDenied: User: arn:aws:sts::... is not authorized to perform: sts:AssumeRoleWithWebIdentity'
26
+ - 'Error: Credentials could not be loaded, please check your action inputs: Could not load credentials from any providers'
27
+ - 'google.auth.exceptions.DefaultCredentialsError: OIDC token condition not satisfied'
28
+ root_cause: |
29
+ GitHub Actions OIDC tokens now include `repository_property:{name}` claims for each
30
+ custom property set on the repository (generally available from April 2026). This
31
+ feature lets organizations create finer-grained cloud trust policies — for example,
32
+ only allowing OIDC authentication for repos whose `deploy_tier` custom property is
33
+ set to `production`.
34
+
35
+ However, the claim is **absent from the OIDC token when the property is not set on
36
+ the repository**. Cloud providers (AWS IAM, Azure AD, Google Cloud) evaluate a
37
+ missing claim as a condition failure:
38
+
39
+ - **AWS IAM** `Condition: { StringEquals: { "...repository_property:deploy_tier": "production" } }`
40
+ → `AccessDenied` if the repo has no `deploy_tier` property set
41
+ - **GCP** workload identity attribute conditions on `attribute.repository_property_*`
42
+ → condition evaluates false, token exchange rejected
43
+
44
+ Common scenarios that trigger this:
45
+ 1. **Org-wide trust policy** uses a custom property claim, but individual repos have
46
+ not been tagged with the required property.
47
+ 2. **Property renamed or deleted** — the trust policy still references the old
48
+ property name; the token no longer includes the old claim.
49
+ 3. **Fork PRs** — forked repositories do not inherit the parent org's custom
50
+ properties; OIDC tokens from fork CI lack the expected claims.
51
+ 4. **New repo** — a repository was added to the org after the trust policy was
52
+ configured; the property has not yet been applied to it.
53
+
54
+ The error message (`Not authorized to perform sts:AssumeRoleWithWebIdentity`) is
55
+ identical to other OIDC failures (wrong `sub`, wrong `aud`, expired token) and gives
56
+ no indication that a missing custom property claim is the cause.
57
+ fix: |
58
+ 1. **Verify the claim is present in the token**: Use the GitHub OIDC debugger or
59
+ print the decoded token payload in a workflow step to confirm the
60
+ `repository_property:{name}` claim exists and has the expected value.
61
+
62
+ 2. **Ensure the custom property is set on all target repos**: In the org settings,
63
+ verify that every repository expected to use the trust policy has the required
64
+ property configured. Newly added repos will not have it by default.
65
+
66
+ 3. **Make the condition optional (if the property may not always be set)**:
67
+ In AWS IAM, use `StringEqualsIfExists` (a `StringLike` wildcard still fails when the
68
+ claim is absent) or remove the custom-property condition from the trust policy;
69
+ use a separate, more permissive role for repos without the property.
70
+
71
+ 4. **For fork PRs**: custom properties on the upstream org do not flow to forks.
72
+ Avoid trust policies that require custom property claims in workflows triggered
73
+ by `pull_request` from external forks.
74
+ fix_code:
75
+ - language: yaml
76
+ label: 'Debug step — print OIDC token claims to diagnose missing custom property'
77
+ code: |
78
+ jobs:
79
+ debug-oidc:
80
+ runs-on: ubuntu-latest
81
+ permissions:
82
+ id-token: write
83
+ steps:
84
+ - name: Fetch OIDC token and decode payload
85
+ run: |
86
+ TOKEN=$(curl -sH "Authorization: bearer $ACTIONS_ID_TOKEN_REQUEST_TOKEN" \
87
+ "$ACTIONS_ID_TOKEN_REQUEST_URL&audience=sts.amazonaws.com" | jq -r '.value')
88
+ # Decode payload (second segment of JWT, base64url-encoded)
89
+ echo "$TOKEN" | cut -d. -f2 | tr '_-' '/+' \
90
+ | base64 -d 2>/dev/null | jq .
91
+ # Look for "repository_property:your_property_name" in the output.
92
+ # If the claim is missing, the repo does not have that property set.
93
+
94
+ - language: yaml
95
+ label: 'AWS IAM trust policy — correct use of repository_property claim'
96
+ code: |
97
+ # AWS IAM role trust policy (JSON, not YAML — shown here for reference)
98
+ # Only allow OIDC from repos where custom property "deploy_tier" = "production"
99
+ {
100
+ "Version": "2012-10-17",
101
+ "Statement": [{
102
+ "Effect": "Allow",
103
+ "Principal": { "Federated": "arn:aws:iam::ACCOUNT:oidc-provider/token.actions.githubusercontent.com" },
104
+ "Action": "sts:AssumeRoleWithWebIdentity",
105
+ "Condition": {
106
+ "StringEquals": {
107
+ "token.actions.githubusercontent.com:aud": "sts.amazonaws.com",
108
+ "token.actions.githubusercontent.com:repository_property:deploy_tier": "production"
109
+ }
110
+ }
111
+ }]
112
+ }
113
+ # IMPORTANT: Every repo that runs this workflow MUST have the "deploy_tier"
114
+ # custom property set to "production" in org settings. If the property is
115
+ # unset or absent, the token will not include the claim and the assume-role
116
+ # call will return AccessDenied.
117
+
118
+ - language: yaml
119
+ label: 'Workflow — ensure the custom property claim is available before assuming role'
120
+ code: |
121
+ jobs:
122
+ deploy:
123
+ runs-on: ubuntu-latest
124
+ permissions:
125
+ id-token: write
126
+ contents: read
127
+ steps:
128
+ - uses: actions/checkout@v4
129
+
130
+ # Verify the custom property claim is present before assuming the role
131
+ - name: Validate OIDC custom property claim
132
+ run: |
133
+ TOKEN=$(curl -sH "Authorization: bearer $ACTIONS_ID_TOKEN_REQUEST_TOKEN" \
134
+ "$ACTIONS_ID_TOKEN_REQUEST_URL&audience=sts.amazonaws.com" | jq -r '.value')
135
+ PAYLOAD=$(echo "$TOKEN" | cut -d. -f2 | tr '_-' '/+' | base64 -d 2>/dev/null)
136
+ TIER=$(echo "$PAYLOAD" | jq -r '."repository_property:deploy_tier" // "MISSING"')
137
+ echo "deploy_tier claim: $TIER"
138
+ if [[ "$TIER" != "production" ]]; then
139
+ echo "::error::Repository custom property 'deploy_tier' is not set to 'production'. Set it in org settings before running this workflow."
140
+ exit 1
141
+ fi
142
+
143
+ - name: Configure AWS credentials via OIDC
144
+ uses: aws-actions/configure-aws-credentials@v4
145
+ with:
146
+ role-to-assume: arn:aws:iam::123456789012:role/my-production-deploy-role
147
+ aws-region: us-east-1
148
+
149
+ prevention:
150
+ - 'Maintain a registry of which repositories have each custom property set — before applying a trust policy that requires a custom property claim, verify all target repos have the property configured.'
151
+ - 'When adding a new repo to an org that uses OIDC custom property trust policies, immediately apply the required custom properties before running any workflows that assume cloud roles.'
152
+ - 'Do not use repository custom property claims in OIDC trust policies for workflows triggered by external fork pull requests — forks do not inherit the upstream org''s custom properties.'
153
+ - 'Add a preflight validation step to workflows that assume cloud roles — verify the expected repository_property:* claim is present in the OIDC token before calling the cloud provider.'
154
+ - 'If a custom property is renamed or removed, update all OIDC trust policies before the change takes effect to avoid sudden AccessDenied failures.'
155
+ docs:
156
+ - url: 'https://github.blog/changelog/2026-04-02-github-actions-early-april-2026-updates/#actions-oidc-tokens-now-support-repository-custom-properties'
157
+ label: 'GitHub Changelog: OIDC tokens now support repository custom properties (April 2026)'
158
+ - url: 'https://docs.github.com/en/actions/security-for-github-actions/security-hardening-your-deployments/about-security-hardening-with-openid-connect#customizing-the-token-claims'
159
+ label: 'GitHub Docs: Customizing OIDC token claims — repository custom properties'
160
+ - url: 'https://docs.github.com/en/organizations/managing-organization-settings/managing-custom-properties-for-repositories-in-your-organization'
161
+ label: 'GitHub Docs: Managing custom properties for repositories in your organization'
@@ -0,0 +1,134 @@
1
+ id: runner-environment-211
2
+ title: 'ARC Controller Upgrade Leaves Stale AutoscalingListener and EphemeralRunnerSet — Manual Intervention Required'
3
+ category: runner-environment
4
+ severity: error
5
+ tags:
6
+ - arc
7
+ - actions-runner-controller
8
+ - kubernetes
9
+ - upgrade
10
+ - autoscaling
11
+ - helm
12
+ - stale-controller
13
+ patterns:
14
+ - regex: 'AutoscalingListener.*spec\.image.*old.*version|spec\.image.*ghcr\.io/actions.*stale'
15
+ flags: 'i'
16
+ - regex: 'app\.kubernetes\.io/version.*mismatch|helm\.sh/chart.*stale.*controller'
17
+ flags: 'i'
18
+ - regex: 'RunnerScaleSet.*stale.*image|EphemeralRunnerSet.*old.*version'
19
+ flags: 'i'
20
+ error_messages:
21
+ - 'AutoscalingListener CRs retain stale controller image after controller upgrade'
22
+ - 'EphemeralRunnerSet retains stale version labels after controller upgrade'
23
+ - 'spec.image still points to old controller version after helm upgrade'
24
+ root_cause: |
25
+ When upgrading the `gha-runner-scale-set-controller` Helm chart, two
26
+ controller-managed objects are NOT updated to reflect the new version:
27
+
28
+ 1. **AutoscalingListener CRs** — retain the old controller image in `spec.image`
29
+ 2. **EphemeralRunnerSet objects** — retain old version labels
30
+ (`app.kubernetes.io/version`, `helm.sh/chart`)
31
+
32
+ The root cause is that the controller gates reconciliation on a **spec hash**.
33
+ A controller-only upgrade does not change any `AutoscalingRunnerSet` spec, so
34
+ the hash is identical and the controller skips reconciliation of both objects
35
+ entirely. The `updateStrategy` flag only governs spec-change rollout, not
36
+ controller version upgrades.
37
+
38
+ **Object hierarchy affected:**
39
+ ```
40
+ AutoscalingRunnerSet (Helm-managed)
41
+ ├── AutoscalingListener ← stale image after controller upgrade
42
+ └── EphemeralRunnerSet ← stale labels/spec after controller upgrade
43
+ └── EphemeralRunner
44
+ ```
45
+
46
+ For minor version bumps (e.g. 0.14.1 → 0.14.2) the staleness may appear
47
+ cosmetic. For **major upgrades** where the `EphemeralRunnerSet` or
48
+ `EphemeralRunner` spec has breaking changes (new required fields, removed
49
+ fields, changed defaults), stale objects under a new controller can cause
50
+ runtime failures — jobs queued but never dispatched, runner pods using the old
51
+ image's entrypoint, or scale-set reporting incorrect capacity.
52
+
53
+ **Version confirmed affected:** controller 0.14.2 / scale-set 0.14.2,
54
+ Kubernetes RKE2 (reproducible on any Kubernetes distribution).
55
+ fix: |
56
+ Two separate manual steps are required after every controller upgrade where
57
+ AutoscalingListener or EphemeralRunnerSet spec has changed:
58
+
59
+ **Step 1 — Delete all AutoscalingListener CRs** (the controller recreates them
60
+ immediately with the new image; runner pods are unaffected):
61
+
62
+ ```bash
63
+ kubectl delete autoscalinglisteners -A --all
64
+ ```
65
+
66
+ Verify the new version is used after recreation:
67
+ ```bash
68
+ kubectl get autoscalinglisteners -A \
69
+ -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.spec.image}{"\n"}{end}'
70
+ ```
71
+
72
+ **Step 2 — Trigger EphemeralRunnerSet reconciliation** via a dummy annotation
73
+ under `spec.template.metadata.annotations` (a change to `minRunners` alone is NOT
74
+ sufficient — it only affects the AutoscalingListener hash, not the
75
+ EphemeralRunnerSet hash):
76
+
77
+ ```yaml
78
+ spec:
79
+ template:
80
+ metadata:
81
+ annotations:
82
+ upgrade-trigger: "0.14.2" # bump on each controller upgrade
83
+ ```
84
+
85
+ This triggers a graceful transition: the old EphemeralRunnerSet drains
86
+ (in-progress jobs complete) while the new one starts accepting jobs
87
+ immediately. Remove the annotation in a follow-up commit.
88
+
89
+ **Verify EphemeralRunnerSet version labels are updated:**
90
+ ```bash
91
+ kubectl get ephemeralrunnersets -A \
92
+ -o custom-columns='NAME:.metadata.name,VERSION:.metadata.labels.app\.kubernetes\.io/version'
93
+ ```
94
+ fix_code:
95
+ - language: yaml
96
+ label: 'Trigger EphemeralRunnerSet reconciliation via dummy annotation (per scale set)'
97
+ code: |
98
+ # In your AutoscalingRunnerSet HelmRelease or values.yaml:
99
+ # Add a dummy annotation under spec.template.metadata.annotations to force EphemeralRunnerSet
100
+ # reconciliation after a controller upgrade.
101
+ # Remove the annotation in a follow-up commit once migration is confirmed.
102
+ spec:
103
+ template:
104
+ metadata:
105
+ annotations:
106
+ upgrade-trigger: "0.14.2" # bump to new controller version
107
+ - language: yaml
108
+ label: 'Post-upgrade runbook as a one-off Job'
109
+ code: |
110
+ # After upgrading the controller chart, run this in CI or manually:
111
+ # Step 1: delete stale AutoscalingListener CRs (controller recreates immediately)
112
+ # kubectl delete autoscalinglisteners -A --all
113
+ #
114
+ # Step 2: force EphemeralRunnerSet reconciliation by bumping the dummy
115
+ # annotation under spec.template.metadata.annotations in your values and
116
+ # re-applying the scale set release (see the values approach above).
117
+ #
118
+ # Note: the following command does NOT work as a shortcut:
119
+ # kubectl annotate autoscalingrunnersets -A --all upgrade-trigger=$(date +%s) --overwrite
120
+ # kubectl annotate updates the object's own metadata.annotations, not
121
+ # spec.template.metadata.annotations, so it does NOT trigger EphemeralRunnerSet reconciliation.
122
+ prevention:
123
+ - 'After every ARC controller upgrade, check AutoscalingListener images and EphemeralRunnerSet labels before routing production traffic.'
124
+ - 'Add a post-upgrade step to your CI/CD pipeline that deletes AutoscalingListener CRs and adds a dummy upgrade-trigger annotation.'
125
+ - 'Pin a dummy annotation like `upgrade-trigger: "<version>"` in your HelmRelease values and bump it with each controller upgrade.'
126
+ - 'Subscribe to actions/actions-runner-controller releases and review EphemeralRunnerSet spec changes before upgrading.'
127
+ - 'Track the upstream issue at actions/actions-runner-controller#4513 for a platform-side fix.'
128
+ docs:
129
+ - url: 'https://github.com/actions/actions-runner-controller/issues/4513'
130
+ label: 'ARC #4513 — AutoscalingListener and EphemeralRunnerSet retain stale controller image/labels after upgrade (open Jun 2026)'
131
+ - url: 'https://github.com/actions/actions-runner-controller/blob/master/TROUBLESHOOTING.md'
132
+ label: 'ARC Troubleshooting Guide'
133
+ - url: 'https://docs.github.com/en/actions/hosting-your-own-runners/managing-self-hosted-runners-with-actions-runner-controller/about-actions-runner-controller'
134
+ label: 'About Actions Runner Controller'
@@ -0,0 +1,114 @@
1
+ id: runner-environment-209
2
+ title: 'Self-hosted runner BrokerServer TaskCanceledException / SocketException — runner stuck in Busy, jobs delayed'
3
+ category: runner-environment
4
+ severity: error
5
+ tags:
6
+ - self-hosted
7
+ - broker
8
+ - TaskCanceledException
9
+ - SocketException
10
+ - NAT
11
+ - kubernetes
12
+ - ARC
13
+ - busy-state
14
+ patterns:
15
+ - regex: 'BrokerServer.*TaskCanceledException'
16
+ flags: 'i'
17
+ - regex: 'SocketException \(125\): Operation canceled'
18
+ flags: 'i'
19
+ - regex: 'GET request to https://broker\.actions\.githubusercontent\.com.*has been cancelled'
20
+ flags: 'i'
21
+ error_messages:
22
+ - '[ERR BrokerServer] System.Threading.Tasks.TaskCanceledException: The operation was canceled.'
23
+ - '[ERR BrokerServer] System.IO.IOException: Unable to read data from the transport connection: Operation canceled.'
24
+ - '[ERR BrokerServer] System.Net.Sockets.SocketException (125): Operation canceled'
25
+ - '[WARN GitHubActionsService] GET request to https://broker.actions.githubusercontent.com/message?sessionId=...&status=Busy&runnerVersion=... has been cancelled.'
26
+ - '[WARN BrokerServer] Back off 6.934 seconds before next retry. 4 attempt left.'
27
+ root_cause: |
28
+ The GitHub Actions runner (on Linux, macOS, Kubernetes, and ARC) maintains a
29
+ persistent long-poll HTTPS connection to `broker.actions.githubusercontent.com`
30
+ to receive job dispatch messages. This connection is kept open by a blocking
31
+ GET request that the server holds for up to 90 seconds before responding.
32
+
33
+ When the runner operates behind a **NAT gateway, stateful firewall, or cloud
34
+ provider network** (common in Kubernetes/ARC deployments on EKS, GKE, AKS, or
35
+ on-premise k8s), the network layer's connection tracking table can expire the
36
+ idle TLS socket before the server responds. Some NAT devices and firewalls use
37
+ idle timeouts as short as 30–60 seconds — shorter than the runner's 90-second
38
+ poll interval.
39
+
40
+ When the NAT table entry expires:
41
+ 1. The next packet the runner sends receives an RST from the network (or is
42
+ silently dropped), causing the underlying `SslStream.ReadAsyncInternal`
43
+ to throw `SocketException (125): Operation canceled`
44
+ 2. The exception propagates as `TaskCanceledException` through the
45
+ `BrokerHttpClient.GetRunnerMessageAsync` call chain
46
+ 3. The runner logs `ERR BrokerServer` and backs off exponentially (6–60 s)
47
+ 4. During the back-off, the runner remains in **Busy** status from the broker's
48
+ perspective, preventing new jobs from being dispatched
49
+
50
+ The back-off recovers automatically but delays job pickup by minutes. Under
51
+ high-frequency job dispatch (CI matrix builds), this causes jobs to queue
52
+ while the runner is technically idle.
53
+
54
+ **Distinct from re-199** (Windows V2 broker listener stops polling after the
55
+ first job — Windows-specific software bug): this issue affects Linux/macOS/K8s
56
+ and recovers automatically; re-199 causes a permanent stall requiring service
57
+ restart.
58
+ fix: |
59
+ **1. Enable TCP keepalive on the runner host (most effective):**
60
+
61
+ Configure the OS to send TCP keepalive probes before the NAT table expires:
62
+ ```bash
63
+ # Linux — reduce keepalive idle time from default 7200s to 30s
64
+ sudo sysctl -w net.ipv4.tcp_keepalive_time=30
65
+ sudo sysctl -w net.ipv4.tcp_keepalive_intvl=10
66
+ sudo sysctl -w net.ipv4.tcp_keepalive_probes=3
67
+ # Make persistent:
68
+ printf '%s\n' net.ipv4.tcp_keepalive_time=30 net.ipv4.tcp_keepalive_intvl=10 net.ipv4.tcp_keepalive_probes=3 | sudo tee -a /etc/sysctl.conf
69
+ ```
70
+
71
+ **2. Increase NAT idle timeout (infrastructure change):**
72
+
73
+ - **AWS EKS:** Set `--conntrack-tcp-timeout-established=300` on kube-proxy,
74
+ or add a NAT gateway connection tracking timeout of 350 s
75
+ - **GKE:** Use Cloud NAT with `--nat-tcp-established-idle-timeout=350`
76
+ - **Azure AKS:** Set `--load-balancer-idle-timeout-in-minutes=10` (default is 4 min)
77
+ - **On-premise k8s:** Increase `conntrack` timeout or set up a keepalive proxy
78
+
79
+ **3. Use a runner proxy with keepalive support:**
80
+
81
+ Route runner outbound traffic through an application-level proxy that
82
+ maintains the connection, preventing the NAT table from expiring the socket.
83
+
84
+ **4. Upgrade runner version:**
85
+
86
+ Runner v2.326.0+ includes improved broker reconnect logic that reduces the
87
+ window where the runner stays in Busy state after a socket reset.
88
+ fix_code:
89
+ - language: yaml
90
+ label: 'Runner DaemonSet init container — set TCP keepalive before runner starts'
91
+ code: |
92
+ # In your ARC runner DaemonSet / Pod spec
93
+ initContainers:
94
+ - name: set-sysctl
95
+ image: busybox
96
+ securityContext:
97
+ privileged: true
98
+ command:
99
+ - sh
100
+ - -c
101
+ - |
102
+ sysctl -w net.ipv4.tcp_keepalive_time=30
103
+ sysctl -w net.ipv4.tcp_keepalive_intvl=10
104
+ sysctl -w net.ipv4.tcp_keepalive_probes=3
105
+ prevention:
106
+ - 'Set tcp_keepalive_time to 30 s on all Linux self-hosted runner hosts, especially those in Kubernetes'
107
+ - 'For ARC scale sets in EKS/GKE/AKS, explicitly configure NAT idle timeout to at least 350 seconds'
108
+ - 'Monitor runner diagnostic logs (Runner_<date>-utc.log) for repeated BrokerServer ERR lines — they indicate this issue'
109
+ - 'Upgrade runner to v2.326.0+ which has improved back-off and reconnect behavior'
110
+ docs:
111
+ - url: 'https://github.com/actions/runner/issues/3904'
112
+ label: 'actions/runner#3904 — Runner fails to connect to broker, TaskCanceledException / SocketException (17 reactions)'
113
+ - url: 'https://docs.github.com/en/actions/hosting-your-own-runners/managing-self-hosted-runners/about-self-hosted-runners#communication-between-self-hosted-runners-and-github'
114
+ label: 'GitHub Docs — Self-hosted runner communication requirements'
@@ -0,0 +1,105 @@
1
+ id: runner-environment-210
2
+ title: 'Runner step-log and summary uploads silently stall behind egress-only firewall — .NET BlobClient ignores HTTPS_PROXY'
3
+ category: runner-environment
4
+ severity: silent-failure
5
+ tags:
6
+ - proxy
7
+ - https-proxy
8
+ - blob-storage
9
+ - self-hosted
10
+ - egress-firewall
11
+ - logs-missing
12
+ - azure-blob
13
+ patterns:
14
+ - regex: 'productionresultssa\d+\.blob\.core\.windows\.net'
15
+ flags: 'i'
16
+ - regex: 'ua=azsdk-net-Storage\.Blobs.*latency=[0-9]{2,3}\.'
17
+ flags: 'i'
18
+ - regex: 'step summary.*not.*available|summary.*upload.*failed|diagnostic.*log.*missing'
19
+ flags: 'i'
20
+ error_messages:
21
+ - 'Step logs not visible in Actions UI — log upload stalled silently'
22
+ - 'Job completed but step summary is blank or missing'
23
+ - 'latency=74.999634s ua=azsdk-net-Storage.Blobs/12.27.0 (.NET 8.0)'
24
+ - 'host=productionresultssa6.blob.core.windows.net:443 latency=74.99s'
25
+ root_cause: |
26
+ The GitHub Actions runner process (v2.333.1 and earlier) creates Azure SDK
27
+ `BlobClient` instances for uploading step logs, workflow summaries, and diagnostic
28
+ logs without configuring an `HttpClientTransport` that honours the `HTTPS_PROXY`
29
+ environment variable.
30
+
31
+ In `ResultsHttpClient.cs`, the `GetBlobClient()` and `GetAppendBlobClient()` methods
32
+ pass only retry/timeout options to `BlobClientOptions` — no `Transport`. The Azure SDK
33
+ therefore falls back to its internal default HTTP pipeline, which does **not** inherit
34
+ the runner's `RunnerWebProxy` configuration.
35
+
36
+ The impact differs depending on network topology:
37
+
38
+ - **Without an egress firewall**: The runner's BlobClient connects directly to
39
+ `*.blob.core.windows.net` and `*.actions.githubusercontent.com`, bypassing the
40
+ proxy entirely. Direct egress works, so no error is visible.
41
+
42
+ - **With a deny-all egress firewall (proxy-only)**: The BlobClient attempts a direct
43
+ connection that the firewall silently drops. Each attempt stalls for ~75 seconds
44
+ (the TCP connect timeout), then times out. Since log uploads are non-fatal, the job
45
+ eventually completes — but step logs are absent from the Actions UI and the job
46
+ takes 5–15 minutes longer than expected.
47
+
48
+ This is separate from the `upload-artifact@v6` proxy CONNECT-headers regression
49
+ (re-208), which affects the Node.js artifact upload path.
50
+ fix: |
51
+ Add the Azure Blob Storage and Actions results endpoints to the `NO_PROXY` (or
52
+ `no_proxy`) environment variable so the runner bypasses the proxy for those hosts
53
+ and connects to them directly.
54
+
55
+ This requires that the runner's egress firewall allows direct connections to
56
+ `*.blob.core.windows.net` and `results-receiver.actions.githubusercontent.com`.
57
+ If only proxy egress is available, the workaround is to configure the proxy to
58
+ pass through those hosts without TLS inspection.
59
+
60
+ A proper fix (runner-side, not yet released): the `BlobClientOptions.Transport` in
61
+ `ResultsHttpClient.cs` should be configured with an `HttpClientTransport` wrapping
62
+ the runner's `RunnerWebProxy` — tracked in actions/runner#4351.
63
+ fix_code:
64
+ - language: yaml
65
+ label: 'Self-hosted runner — set NO_PROXY to bypass proxy for Azure Blob endpoints'
66
+ code: |
67
+ # Set at the OS level or in the runner's .env file before starting the runner service.
68
+ # This allows the BlobClient to connect directly while other traffic goes through the proxy.
69
+ #
70
+ # On Linux/macOS (add to /etc/environment or runner startup script):
71
+ # NO_PROXY=.blob.core.windows.net,.actions.githubusercontent.com,results-receiver.actions.githubusercontent.com
72
+ #
73
+ # In a workflow (if runner is ephemeral and you can set env per job):
74
+ jobs:
75
+ build:
76
+ runs-on: self-hosted
77
+ env:
78
+ HTTPS_PROXY: http://proxy.corp.example.com:3128
79
+ NO_PROXY: '.blob.core.windows.net,.actions.githubusercontent.com,results-receiver.actions.githubusercontent.com'
80
+ steps:
81
+ - uses: actions/checkout@v4
82
+ - run: echo "Logs will upload correctly now"
83
+ - language: yaml
84
+ label: 'ARC / Kubernetes — set NO_PROXY in RunnerDeployment or RunnerScaleSet'
85
+ code: |
86
+ apiVersion: actions.summerwind.dev/v1alpha1
87
+ kind: RunnerDeployment
88
+ spec:
89
+ template:
90
+ spec:
91
+ env:
92
+ - name: HTTPS_PROXY
93
+ value: http://proxy.corp.example.com:3128
94
+ - name: NO_PROXY
95
+ value: '.blob.core.windows.net,.actions.githubusercontent.com,results-receiver.actions.githubusercontent.com'
96
+ prevention:
97
+ - 'When deploying self-hosted runners behind a forward proxy with deny-all egress, always set NO_PROXY to include Azure Blob Storage endpoints — the runner BlobClient does not inherit HTTPS_PROXY.'
98
+ - 'Monitor Actions step log visibility alongside job exit codes — missing logs with a successful exit often indicate a proxy or network configuration issue, not a code failure.'
99
+ - 'Run a proxy connectivity diagnostic on a new runner host: confirm .blob.core.windows.net is reachable (directly or via proxy) before routing real workloads to it.'
100
+ - 'Track actions/runner#4351 for a first-party fix that configures the BlobClient transport to use the runner proxy settings.'
101
+ docs:
102
+ - url: 'https://github.com/actions/runner/issues/4351'
103
+ label: 'actions/runner #4351 — BlobClient uploads stall through HTTPS proxy (Apr 2026)'
104
+ - url: 'https://docs.github.com/en/actions/hosting-your-own-runners/managing-self-hosted-runners/using-a-proxy-server-with-self-hosted-runners'
105
+ label: 'GitHub Docs: Using a proxy server with self-hosted runners'
@@ -0,0 +1,142 @@
1
+ id: runner-environment-213
2
+ title: 'Self-Hosted Runner Stuck in Active State Indefinitely When Job Process Hangs Without Exiting'
3
+ category: runner-environment
4
+ severity: error
5
+ tags:
6
+ - self-hosted
7
+ - runner
8
+ - active-state
9
+ - hung-process
10
+ - child-process
11
+ - stuck
12
+ - service-restart
13
+ - timeout-minutes
14
+ patterns:
15
+ - regex: 'Waiting for a runner to pick up this job'
16
+ flags: 'i'
17
+ - regex: 'Runner\.Worker.*hung|Worker.*process.*running|active.*runner.*blocking'
18
+ flags: 'i'
19
+ - regex: 'sudo systemctl restart actions\.runner'
20
+ flags: 'i'
21
+ error_messages:
22
+ - 'Waiting for a runner to pick up this job...'
23
+ - 'Runner shows as Active in GitHub UI; new jobs remain Queued indefinitely'
24
+ root_cause: |
25
+ On self-hosted runners, the Runner.Worker process tracks the lifecycle of the running
26
+ job. When a job step spawns a child process (e.g., a test runner like `vitest --coverage`,
27
+ a long network operation, or a background daemon) that does not exit cleanly, the
28
+ Runner.Worker stays alive waiting for the child to terminate.
29
+
30
+ While Runner.Worker is alive, the parent Runner.Listener considers the runner slot
31
+ occupied (busy=true) and does not accept new job messages from the broker. The runner
32
+ appears as "Active" in the GitHub UI and all queued jobs for that runner remain
33
+ in "Waiting for a runner to pick up this job..." state indefinitely.
34
+
35
+ This differs from:
36
+ - GitHub-hosted runners: these have a hard 6-hour job timeout enforced by the platform;
37
+ the job is cancelled and the slot freed automatically
38
+ - Self-hosted runners WITH timeout-minutes set: once timeout-minutes elapses the
39
+ job is cancelled and the runner sends SIGTERM to the worker — but if the child
40
+ process ignores SIGTERM (common with some test runners), the worker still hangs
41
+
42
+ Common triggers:
43
+ - vitest --coverage, jest run without --forceExit, pytest hanging on unclosed resources
44
+ - npm/yarn scripts that spawn background processes not tied to the shell session
45
+ - Docker commands (docker run without --rm) that keep running after the step exits
46
+ - Network calls blocked by firewall with no connection timeout
47
+ - Interactive prompts waiting for stdin input in a CI non-interactive context
48
+
49
+ Automatic recovery does NOT occur. The runner stays Active until either:
50
+ 1. The hung child process eventually exits on its own
51
+ 2. An operator manually restarts the runner service
52
+ 3. A watchdog script kills the orphaned worker process
53
+ fix: |
54
+ Immediate recovery: restart the runner service to free the stuck slot.
55
+ sudo systemctl restart actions.runner.<scope>.<name>.service # Linux systemd
56
+ ./svc.sh stop && ./svc.sh start # macOS (run from the runner directory)
57
+ Restart-Service "actions.runner.*" # Windows (elevated PowerShell)
58
+
59
+ Prevention (preferred):
60
+ 1. Add timeout-minutes to ALL jobs on self-hosted runners to cap maximum runtime.
61
+ Even if the worker hangs, the platform cancels the job and sends SIGTERM after
62
+ the timeout. Pair with process group kill to catch SIGTERM-resistant children.
63
+
64
+ 2. Ensure test commands force-exit when done:
65
+ - vitest: has no --forceExit flag; use `vitest run` with --teardownTimeout, and --reporter=hanging-process to find open handles
66
+ - jest: use jest --forceExit or --detectOpenHandles to identify hanging handles
67
+ - pytest: add timeout fixtures via pytest-timeout plugin
68
+
69
+ 3. Use process groups (setsid / start new session) so SIGTERM cascades to children:
70
+ run: |
71
+ setsid bash -c 'npm test' &
72
+ CHILD_PID=$!
73
+ wait $CHILD_PID
74
+
75
+ 4. Deploy a runner watchdog that monitors Worker processes with no active child
76
+ CPU activity for > N minutes and kills them:
77
+ - Check elapsed time + zero CPU descendants
78
+ - SIGKILL stale Worker processes
79
+ - Trigger runner service restart via systemd or equivalent
80
+ fix_code:
81
+ - language: yaml
82
+ label: 'Add timeout-minutes to prevent indefinite runner lock'
83
+ code: |
84
+ jobs:
85
+ test:
86
+ runs-on: self-hosted
87
+ timeout-minutes: 30 # Always set on self-hosted runners
88
+ steps:
89
+ - uses: actions/checkout@v4
90
+ - name: Run tests
91
+ run: npm test
92
+ - language: yaml
93
+ label: 'Force-exit test runner so worker process completes cleanly'
94
+ code: |
95
+ jobs:
96
+ test:
97
+ runs-on: self-hosted
98
+ timeout-minutes: 30
99
+ steps:
100
+ - name: Run Vitest tests
101
+ run: npx vitest run --teardownTimeout=10000
102
+
103
+ - name: Run Jest tests
104
+ run: npx jest --forceExit
105
+
106
+ - name: Run pytest with timeout
107
+ run: pytest --timeout=300
108
+ - language: yaml
109
+ label: 'Watchdog step — kill orphaned background processes after main step'
110
+ code: |
111
+ jobs:
112
+ test:
113
+ runs-on: self-hosted
114
+ timeout-minutes: 30
115
+ steps:
116
+ - name: Run tests
117
+ run: npm test
118
+ # No continue-on-error here: the cleanup step uses if: always(), and test failures must still fail the job
119
+ - name: Kill orphaned processes
120
+ if: always()
121
+ run: |
122
+ # Kill any remaining node processes owned by this runner user
123
+ pkill -u "$(whoami)" -f "vitest|jest|mocha" || true
124
+ prevention:
125
+ - 'Always set an explicit timeout-minutes on self-hosted runner jobs — without it only the
126
+ 360-minute default applies, and a hung process can hold the runner for hours'
127
+ - 'Use --forceExit with Jest and --teardownTimeout with Vitest; use --timeout with pytest; audit any test suite
128
+ that takes longer than expected for open handles (jest --detectOpenHandles)'
129
+ - 'Avoid spawning background daemons in run: steps without explicit cleanup in an
130
+ if: always() cleanup step'
131
+ - 'Consider running self-hosted runners as ephemeral (ephemeral: true with ARC or
132
+ JIT tokens) — an ephemeral runner terminates after one job, so a hung runner
133
+ does not affect other jobs (a new runner pod is provisioned for each job)'
134
+ - 'Monitor runner Active state duration via GitHub REST API (GET /repos/{owner}/{repo}/actions/runners)
135
+ and alert when busy: true persists beyond expected max job duration'
136
+ docs:
137
+ - url: 'https://github.com/actions/runner/issues/4312'
138
+ label: 'actions/runner#4312 — Self-hosted runner gets stuck in active state, blocking queued jobs'
139
+ - url: 'https://docs.github.com/en/actions/hosting-your-own-runners/managing-self-hosted-runners/monitoring-and-troubleshooting-self-hosted-runners'
140
+ label: 'GitHub Docs — Monitoring and troubleshooting self-hosted runners'
141
+ - url: 'https://docs.github.com/en/actions/writing-workflows/workflow-syntax-for-github-actions#jobsjob_idtimeout-minutes'
142
+ label: 'GitHub Docs — timeout-minutes syntax reference'