@htekdev/actions-debugger 1.0.23 → 1.0.24
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/errors/caching-artifacts/artifact-minimum-retention-one-day.yml +153 -0
- package/errors/caching-artifacts/cache-api-propagation-delay-post-save.yml +128 -0
- package/errors/caching-artifacts/cache-backend-internal-error-skipped.yml +75 -0
- package/errors/caching-artifacts/cache-hit-step-id-case-sensitive-mismatch.yml +95 -0
- package/errors/caching-artifacts/cache-save-post-step-skipped-on-failure.yml +114 -0
- package/errors/concurrency-timing/deploy-pages-in-progress-deployment-wedged.yml +70 -0
- package/errors/concurrency-timing/deployment-review-timeout-expired.yml +88 -0
- package/errors/concurrency-timing/job-concurrency-scope-per-run-not-global.yml +81 -0
- package/errors/concurrency-timing/merge-queue-concurrency-cancel-blocks-all.yml +86 -0
- package/errors/concurrency-timing/reusable-workflow-github-workflow-context-cancel.yml +124 -0
- package/errors/concurrency-timing/runner-scale-set-jobs-never-start.yml +123 -0
- package/errors/concurrency-timing/runner-temp-dir-race-concurrent-workers.yml +90 -0
- package/errors/known-unsolved/artifact-download-url-unauthenticated-404.yml +98 -0
- package/errors/known-unsolved/checkout-v6-credentials-docker-run-manual.yml +105 -0
- package/errors/known-unsolved/concurrency-groups-repo-scoped-only.yml +138 -0
- package/errors/known-unsolved/matrix-256-job-limit.yml +142 -0
- package/errors/known-unsolved/merge-group-paths-filter-not-supported.yml +137 -0
- package/errors/known-unsolved/no-job-allow-failure.yml +73 -0
- package/errors/known-unsolved/schedule-cron-hours-long-queue-drift.yml +101 -0
- package/errors/permissions-auth/checkout-persist-credentials-token-write.yml +90 -0
- package/errors/permissions-auth/create-github-app-token-cross-job-token-revoked.yml +95 -0
- package/errors/permissions-auth/github-token-contents-write-missing-git-push.yml +117 -0
- package/errors/permissions-auth/org-actions-policy-blocks-unapproved-action.yml +106 -0
- package/errors/runner-environment/codeql-action-v2-deprecated.yml +110 -0
- package/errors/runner-environment/macos-26-openssl-3-system-library-breaking.yml +114 -0
- package/errors/runner-environment/macos-26-ruby-34-default-upgrade.yml +114 -0
- package/errors/runner-environment/macos-26-xcode-default-265-pin-required.yml +99 -0
- package/errors/runner-environment/macos-latest-label-switches-to-macos26.yml +127 -0
- package/errors/runner-environment/node20-removed-toolcache-default-node22.yml +104 -0
- package/errors/runner-environment/powershell-74-76-threadjob-module-rename.yml +124 -0
- package/errors/runner-environment/self-hosted-runner-not-found.yml +134 -0
- package/errors/runner-environment/self-hosted-runner-selinux-service-exec-failure.yml +116 -0
- package/errors/runner-environment/service-container-no-healthcheck.yml +158 -0
- package/errors/runner-environment/setup-node-v5-corepack-pnpm-not-found.yml +101 -0
- package/errors/runner-environment/setup-node-yarn-not-installed-self-hosted.yml +76 -0
- package/errors/runner-environment/setup-python-externally-managed-env-error.yml +95 -0
- package/errors/runner-environment/windows-2019-runner-retired-june2025.yml +118 -0
- package/errors/runner-environment/windows-2022-docker-daemon-not-started.yml +108 -0
- package/errors/silent-failures/cache-hit-output-string-not-boolean.yml +96 -0
- package/errors/silent-failures/checkout-lfs-pointer-not-content.yml +105 -0
- package/errors/silent-failures/reusable-workflow-output-skipped-contains-secret.yml +115 -0
- package/errors/silent-failures/setup-node-silent-download-exit-zero.yml +105 -0
- package/errors/silent-failures/setup-python-truncated-manifest-silent-exit.yml +111 -0
- package/errors/silent-failures/undefined-env-expression-empty-string-silent.yml +115 -0
- package/errors/silent-failures/windows-powershell-github-output-bash-syntax.yml +118 -0
- package/errors/triggers/fork-pr-first-time-contributor-approval-required.yml +142 -0
- package/errors/triggers/on-push-branches-glob-star-no-slash-match.yml +78 -0
- package/errors/triggers/pull-request-target-env-protection-default-branch-eval.yml +117 -0
- package/errors/triggers/required-status-check-renamed-never-passes.yml +87 -0
- package/errors/triggers/schedule-cron-self-hosted-runner-not-triggered.yml +107 -0
- package/errors/yaml-syntax/composite-action-run-shell-missing.yml +90 -0
- package/errors/yaml-syntax/composite-action-secrets-context-unavailable.yml +99 -0
- package/errors/yaml-syntax/github-script-octokit-renamed-to-github.yml +130 -0
- package/errors/yaml-syntax/labeler-v5-config-format-breaking.yml +67 -0
- package/errors/yaml-syntax/runs-on-expression-array-syntax-error.yml +121 -0
- package/errors/yaml-syntax/setup-go-matrix-version-float-coercion.yml +69 -0
- package/package.json +1 -1
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
id: "concurrency-timing-021"
|
|
2
|
+
title: "Deployment job expires when environment review timeout lapses"
|
|
3
|
+
category: "concurrency-timing"
|
|
4
|
+
severity: "error"
|
|
5
|
+
tags:
|
|
6
|
+
- "deployment"
|
|
7
|
+
- "environment"
|
|
8
|
+
- "protection-rule"
|
|
9
|
+
- "review-timeout"
|
|
10
|
+
- "pending"
|
|
11
|
+
- "expired"
|
|
12
|
+
patterns:
|
|
13
|
+
- regex: "was not approved before the review period expired"
|
|
14
|
+
flags: "i"
|
|
15
|
+
- regex: "deployment.*review period expired"
|
|
16
|
+
flags: "i"
|
|
17
|
+
- regex: "Your deployment to .+ expired"
|
|
18
|
+
flags: "i"
|
|
19
|
+
error_messages:
|
|
20
|
+
- "Your deployment to [environment] was not approved before the review period expired."
|
|
21
|
+
root_cause: |
|
|
22
|
+
Environment protection rules that require manual reviewers have a configurable review
|
|
23
|
+
timeout (default 30 days; configurable from 1 to 30 days). When a workflow job waits
|
|
24
|
+
for approval and no reviewer acts within that window, the deployment job automatically
|
|
25
|
+
fails with a timeout error.
|
|
26
|
+
|
|
27
|
+
This creates a silent cascade: while the job is pending, it appears as "Waiting" in
|
|
28
|
+
the UI with no failure indicator. Needs-dependent jobs that run after the deployment
|
|
29
|
+
also remain queued. When the timeout fires, the deployment job fails, which then
|
|
30
|
+
cascade-fails all downstream jobs simultaneously — a large batch of failures with a
|
|
31
|
+
confusing single root cause.
|
|
32
|
+
|
|
33
|
+
Common triggers:
|
|
34
|
+
- Teams shorten the timeout from 30 days to 1-7 days for security compliance.
|
|
35
|
+
- Approvers are on leave or in different time zones and miss the notification.
|
|
36
|
+
- A deployment is triggered late Friday; nobody approves over the weekend.
|
|
37
|
+
- The notification email was caught by spam filters or lost to notification fatigue.
|
|
38
|
+
fix: |
|
|
39
|
+
1. Re-run the failed workflow — a fresh run starts a new review clock.
|
|
40
|
+
2. Increase the review timeout in: repo Settings → Environments → [env]
|
|
41
|
+
→ Protection rules → Required reviewers → "Timeout".
|
|
42
|
+
3. Add more reviewers or a team with guaranteed availability across time zones.
|
|
43
|
+
4. Send a proactive notification (Slack, Teams, PagerDuty) at deploy-queue time
|
|
44
|
+
so reviewers don't have to poll GitHub.
|
|
45
|
+
fix_code:
|
|
46
|
+
- language: yaml
|
|
47
|
+
label: "Notify reviewers immediately when the deployment enters the queue"
|
|
48
|
+
code: |
|
|
49
|
+
jobs:
|
|
50
|
+
notify-pending:
|
|
51
|
+
runs-on: ubuntu-latest
|
|
52
|
+
steps:
|
|
53
|
+
- name: Notify reviewers that approval is needed
|
|
54
|
+
uses: slackapi/slack-github-action@v2
|
|
55
|
+
with:
|
|
56
|
+
webhook: ${{ secrets.SLACK_WEBHOOK_URL }}
|
|
57
|
+
webhook-type: incoming-webhook
|
|
58
|
+
payload: |
|
|
59
|
+
{
|
|
60
|
+
"text": ":rocket: Deployment to *production* is waiting for review.\n<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|Approve here>"
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
deploy:
|
|
64
|
+
needs: notify-pending
|
|
65
|
+
runs-on: ubuntu-latest
|
|
66
|
+
environment: production
|
|
67
|
+
steps:
|
|
68
|
+
- name: Deploy
|
|
69
|
+
run: ./deploy.sh
|
|
70
|
+
|
|
71
|
+
- language: yaml
|
|
72
|
+
label: "Programmatic re-run via GitHub CLI after expiry (escape hatch)"
|
|
73
|
+
code: |
|
|
74
|
+
# After a timeout expiry, re-run the failed workflow via GitHub CLI:
|
|
75
|
+
# gh run rerun <run-id> --repo owner/repo --failed
|
|
76
|
+
# Or push a no-op commit to trigger a fresh run.
|
|
77
|
+
prevention:
|
|
78
|
+
- "Configure Slack/Teams/email notifications that fire when a deployment enters the pending approval state — do not rely on reviewers proactively checking GitHub."
|
|
79
|
+
- "Add multiple reviewers or use a team as reviewer so a single person's absence doesn't block the pipeline."
|
|
80
|
+
- "Do not shorten the review timeout below what your team can realistically respond to (e.g., account for weekends and holidays)."
|
|
81
|
+
- "Use the wait-timer setting only if you need a minimum delay; set it lower than the review timeout."
|
|
82
|
+
docs:
|
|
83
|
+
- url: "https://docs.github.com/en/actions/managing-workflow-runs/reviewing-deployments"
|
|
84
|
+
label: "GitHub Docs — Reviewing deployments"
|
|
85
|
+
- url: "https://docs.github.com/en/actions/deployment/targeting-different-environments/using-environments-for-deployment#required-reviewers"
|
|
86
|
+
label: "GitHub Docs — Environment protection rules: required reviewers and timeout"
|
|
87
|
+
- url: "https://github.com/orgs/community/discussions/72259"
|
|
88
|
+
label: "GitHub Community #72259 — Approving environment deployment in Actions"
|
|
@@ -0,0 +1,81 @@
|
|
|
1
|
+
id: concurrency-timing-016
|
|
2
|
+
title: "Job-Level Concurrency Does Not Prevent Parallel Runs Across Different Workflow Runs"
|
|
3
|
+
category: concurrency-timing
|
|
4
|
+
severity: silent-failure
|
|
5
|
+
tags:
|
|
6
|
+
- concurrency
|
|
7
|
+
- job-level
|
|
8
|
+
- cross-run
|
|
9
|
+
- mutex
|
|
10
|
+
- deployment
|
|
11
|
+
patterns:
|
|
12
|
+
- regex: 'jobs\.[a-zA-Z_-]+\.concurrency'
|
|
13
|
+
flags: "i"
|
|
14
|
+
- regex: 'concurrency:\s*\n\s+group:'
|
|
15
|
+
flags: "im"
|
|
16
|
+
error_messages:
|
|
17
|
+
- "Multiple deployments running simultaneously"
|
|
18
|
+
- "Concurrent job executions detected"
|
|
19
|
+
root_cause: |
|
|
20
|
+
When the `concurrency:` block is placed at the job level (`jobs.<job_id>.concurrency`)
|
|
21
|
+
rather than the workflow level, the concurrency group is scoped to a single workflow
|
|
22
|
+
run. This means:
|
|
23
|
+
- Within one run: duplicate jobs queue against each other (per-run serialization).
|
|
24
|
+
- Across different runs: jobs with identical concurrency keys execute in parallel with
|
|
25
|
+
no queuing or cancellation.
|
|
26
|
+
|
|
27
|
+
Developers expecting a global mutex — for example, preventing two simultaneous
|
|
28
|
+
deployments triggered by separate PRs — are surprised when both deploys run
|
|
29
|
+
concurrently. The GitHub Docs note explicitly states: "If concurrency is specified
|
|
30
|
+
at a job level, the concurrency group is scoped to that job within the workflow run."
|
|
31
|
+
|
|
32
|
+
This is especially confusing because the workflow-level concurrency block syntax is
|
|
33
|
+
nearly identical to the job-level one, so copy-paste errors are common.
|
|
34
|
+
fix: |
|
|
35
|
+
Move the concurrency block from the job level to the workflow level (top of the
|
|
36
|
+
workflow file, alongside `on:` and `name:`). Workflow-level concurrency provides
|
|
37
|
+
cross-run serialization and prevents parallel executions across all workflow runs.
|
|
38
|
+
|
|
39
|
+
For deployment workflows, use cancel-in-progress: false to queue rather than cancel
|
|
40
|
+
in-flight deployments, preventing accidental skips of intermediate releases.
|
|
41
|
+
fix_code:
|
|
42
|
+
- language: yaml
|
|
43
|
+
label: "Wrong: job-level concurrency (only per-run, not cross-run)"
|
|
44
|
+
code: |
|
|
45
|
+
# DO NOT USE for cross-run mutex — this only queues within one run
|
|
46
|
+
jobs:
|
|
47
|
+
deploy:
|
|
48
|
+
runs-on: ubuntu-latest
|
|
49
|
+
concurrency:
|
|
50
|
+
group: deploy-${{ github.ref }}
|
|
51
|
+
cancel-in-progress: true
|
|
52
|
+
steps:
|
|
53
|
+
- run: ./deploy.sh
|
|
54
|
+
- language: yaml
|
|
55
|
+
label: "Correct: workflow-level concurrency (true cross-run mutex)"
|
|
56
|
+
code: |
|
|
57
|
+
name: Deploy
|
|
58
|
+
on: push
|
|
59
|
+
|
|
60
|
+
# Workflow-level: prevents parallel deploys across ALL simultaneous runs
|
|
61
|
+
concurrency:
|
|
62
|
+
group: deploy-${{ github.ref }}
|
|
63
|
+
cancel-in-progress: false # queue deploys, don't skip intermediate releases
|
|
64
|
+
|
|
65
|
+
jobs:
|
|
66
|
+
deploy:
|
|
67
|
+
runs-on: ubuntu-latest
|
|
68
|
+
steps:
|
|
69
|
+
- uses: actions/checkout@v4
|
|
70
|
+
- run: ./deploy.sh
|
|
71
|
+
prevention:
|
|
72
|
+
- "Always use workflow-level concurrency when the goal is preventing simultaneous deployments across PRs or pushes."
|
|
73
|
+
- "Use job-level concurrency only for intra-run fan-out limiting — document intent with a comment."
|
|
74
|
+
- "Verify cross-run mutex behavior by triggering two simultaneous push runs and checking the Actions tab."
|
|
75
|
+
- "Prefer cancel-in-progress: false for deploy workflows to avoid skipping intermediate releases."
|
|
76
|
+
- "Search your workflows for `jobs.<name>.concurrency` and audit whether cross-run isolation is the intent."
|
|
77
|
+
docs:
|
|
78
|
+
- url: "https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/controlling-concurrency"
|
|
79
|
+
label: "GitHub Docs: Controlling concurrency"
|
|
80
|
+
- url: "https://docs.github.com/en/actions/writing-workflows/workflow-syntax-for-github-actions#concurrency"
|
|
81
|
+
label: "Workflow syntax: concurrency"
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
id: concurrency-timing-017
|
|
2
|
+
title: "merge_group Trigger With cancel-in-progress Cascades Cancellations Across Queued Merges"
|
|
3
|
+
category: concurrency-timing
|
|
4
|
+
severity: error
|
|
5
|
+
tags:
|
|
6
|
+
- merge-queue
|
|
7
|
+
- merge_group
|
|
8
|
+
- concurrency
|
|
9
|
+
- cancel-in-progress
|
|
10
|
+
- branch-protection
|
|
11
|
+
patterns:
|
|
12
|
+
- regex: "merge_group"
|
|
13
|
+
flags: "i"
|
|
14
|
+
- regex: 'cancel-in-progress:\s*true'
|
|
15
|
+
flags: "i"
|
|
16
|
+
error_messages:
|
|
17
|
+
- "Some checks were not successful"
|
|
18
|
+
- "This check was cancelled"
|
|
19
|
+
- "Run was cancelled by a newer queued run"
|
|
20
|
+
root_cause: |
|
|
21
|
+
GitHub's merge queue processes PRs by grouping them and running CI on each queued
|
|
22
|
+
batch via the `merge_group` event. When a workflow triggered by `merge_group` uses a
|
|
23
|
+
shared concurrency group key with `cancel-in-progress: true`, each new PR entry in
|
|
24
|
+
the queue triggers a new workflow run. Because the concurrency key is shared (e.g.,
|
|
25
|
+
based on the branch name or workflow name), the new run cancels the already-running
|
|
26
|
+
check for the previous queued PR.
|
|
27
|
+
|
|
28
|
+
The cancelled PR's required check fails or is marked cancelled, causing GitHub to
|
|
29
|
+
remove that PR from the merge queue and requeue it. The requeued PR triggers a new
|
|
30
|
+
run — which cancels the next PR's check — creating an infinite cascade. No PR
|
|
31
|
+
successfully merges.
|
|
32
|
+
|
|
33
|
+
The merge queue is designed to serialize merges safely. Adding `cancel-in-progress:
|
|
34
|
+
true` to `merge_group` workflows defeats the serialization and creates cascading
|
|
35
|
+
failures.
|
|
36
|
+
fix: |
|
|
37
|
+
For `merge_group` events, always use `cancel-in-progress: false` (or omit it).
|
|
38
|
+
Each merge queue entry must complete independently. To keep cancellation for regular
|
|
39
|
+
pull_request events (to cancel superseded draft checks), use a conditional expression:
|
|
40
|
+
|
|
41
|
+
concurrency:
|
|
42
|
+
group: ci-${{ github.event.merge_group.head_sha || github.head_ref || github.ref }}
|
|
43
|
+
cancel-in-progress: ${{ github.event_name == 'pull_request' }}
|
|
44
|
+
|
|
45
|
+
This cancels in-progress runs for updated PRs but never cancels merge queue checks.
|
|
46
|
+
fix_code:
|
|
47
|
+
- language: yaml
|
|
48
|
+
label: "Safe: conditional cancel-in-progress for merge_group + pull_request"
|
|
49
|
+
code: |
|
|
50
|
+
on:
|
|
51
|
+
pull_request:
|
|
52
|
+
merge_group:
|
|
53
|
+
|
|
54
|
+
concurrency:
|
|
55
|
+
# Use merge_group head SHA for unique per-entry key; fall back for PR events
|
|
56
|
+
group: ci-${{ github.event.merge_group.head_sha || github.head_ref || github.ref }}
|
|
57
|
+
# Only cancel for PRs (draft updates), NEVER for merge queue entries
|
|
58
|
+
cancel-in-progress: ${{ github.event_name == 'pull_request' }}
|
|
59
|
+
|
|
60
|
+
jobs:
|
|
61
|
+
test:
|
|
62
|
+
runs-on: ubuntu-latest
|
|
63
|
+
steps:
|
|
64
|
+
- uses: actions/checkout@v4
|
|
65
|
+
- run: npm test
|
|
66
|
+
- language: yaml
|
|
67
|
+
label: "Broken: cancel-in-progress true on merge_group causes cascade"
|
|
68
|
+
code: |
|
|
69
|
+
# DO NOT USE — cancels queued merge checks, nothing merges
|
|
70
|
+
on:
|
|
71
|
+
merge_group:
|
|
72
|
+
concurrency:
|
|
73
|
+
group: ${{ github.workflow }}-${{ github.ref }}
|
|
74
|
+
cancel-in-progress: true
|
|
75
|
+
prevention:
|
|
76
|
+
- "Never use cancel-in-progress: true when the workflow is triggered by the merge_group event."
|
|
77
|
+
- "Use github.event.merge_group.head_sha in the concurrency key to ensure each queue entry gets a unique group."
|
|
78
|
+
- "Test merge queue behavior by simultaneously queuing 2-3 PRs and confirming all checks complete."
|
|
79
|
+
- "If combining pull_request and merge_group triggers, make cancel-in-progress conditional on the event type."
|
|
80
|
+
docs:
|
|
81
|
+
- url: "https://docs.github.com/en/repositories/configuring-branches-and-merges-in-your-repository/configuring-pull-request-merges/managing-a-merge-queue"
|
|
82
|
+
label: "GitHub Docs: Managing a merge queue"
|
|
83
|
+
- url: "https://docs.github.com/en/actions/writing-workflows/choosing-when-your-workflow-runs/events-that-trigger-workflows#merge_group"
|
|
84
|
+
label: "Events that trigger workflows: merge_group"
|
|
85
|
+
- url: "https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/controlling-concurrency"
|
|
86
|
+
label: "GitHub Docs: Controlling concurrency"
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
id: concurrency-timing-019
|
|
2
|
+
title: "Reusable Workflow Inherits Caller's ${{ github.workflow }} — Concurrency Group Collision Cancels Caller"
|
|
3
|
+
category: concurrency-timing
|
|
4
|
+
severity: error
|
|
5
|
+
tags:
|
|
6
|
+
- reusable-workflow
|
|
7
|
+
- workflow-call
|
|
8
|
+
- concurrency
|
|
9
|
+
- github-workflow-context
|
|
10
|
+
- cancel-in-progress
|
|
11
|
+
- context-inheritance
|
|
12
|
+
patterns:
|
|
13
|
+
- regex: "Run was cancelled|Canceling since a higher priority waiting request"
|
|
14
|
+
flags: "i"
|
|
15
|
+
- regex: "called workflow.*cancel|reusable.*workflow.*concurrency"
|
|
16
|
+
flags: "i"
|
|
17
|
+
- regex: "github\\.workflow.*concurrency group|concurrency.*reusable"
|
|
18
|
+
flags: "i"
|
|
19
|
+
error_messages:
|
|
20
|
+
- "Run was cancelled"
|
|
21
|
+
- "Canceling since a higher priority waiting request for '...' exists"
|
|
22
|
+
- "This run was cancelled because another run in the same concurrency group was started"
|
|
23
|
+
root_cause: |
|
|
24
|
+
When a called (reusable) workflow uses `${{ github.workflow }}` in its `concurrency.group`
|
|
25
|
+
expression, it inherits the CALLER workflow's name — not its own file name. This causes both
|
|
26
|
+
the caller and the called workflow to join the same concurrency group, leading to cascading
|
|
27
|
+
cancellations.
|
|
28
|
+
|
|
29
|
+
Example: Caller workflow named "CI" calls "Deploy" as a reusable workflow. Both define:
|
|
30
|
+
concurrency:
|
|
31
|
+
group: ${{ github.workflow }}-${{ github.ref }}
|
|
32
|
+
cancel-in-progress: true
|
|
33
|
+
|
|
34
|
+
At runtime, `github.workflow` inside "Deploy" evaluates to "CI" — the same string as the
|
|
35
|
+
caller. When the called workflow starts, `cancel-in-progress: true` fires and cancels the
|
|
36
|
+
still-running caller workflow.
|
|
37
|
+
|
|
38
|
+
This is documented behavior: the GitHub Actions runner passes the caller workflow's
|
|
39
|
+
`github` context to all called workflows. The called workflow has no way to determine its
|
|
40
|
+
own filename via context expressions alone.
|
|
41
|
+
|
|
42
|
+
The GitHub Docs now include an explicit warning: "A called workflow uses the name of its
|
|
43
|
+
caller workflow in ${{ github.workflow }}, so using this context as the value of
|
|
44
|
+
jobs.<job_id>.concurrency.group in both caller and called workflows will cause the caller
|
|
45
|
+
workflow to be cancelled when the called workflow runs."
|
|
46
|
+
|
|
47
|
+
Source: actions/runner#3205 (github.workflow context variable causes child workflow runs to
|
|
48
|
+
be prematurely canceled — opened Mar 2024, closed as stale Apr 2025, no official fix)
|
|
49
|
+
Source: GitHub Docs — Using concurrency with reusable workflows
|
|
50
|
+
fix: |
|
|
51
|
+
Do not use `${{ github.workflow }}` in concurrency group expressions inside reusable
|
|
52
|
+
(workflow_call) workflows.
|
|
53
|
+
|
|
54
|
+
Option A — Hard-code a unique identifier for the called workflow:
|
|
55
|
+
Use a static string that cannot match the caller's concurrency group.
|
|
56
|
+
|
|
57
|
+
Option B — Pass caller context as an explicit input:
|
|
58
|
+
Accept a `concurrency_key` input from the caller and use that in the called
|
|
59
|
+
workflow's concurrency group, giving the caller full control.
|
|
60
|
+
|
|
61
|
+
Option C — Remove concurrency from the called workflow entirely:
|
|
62
|
+
If the caller manages concurrency at the job level, the called workflow's internal
|
|
63
|
+
concurrency is often redundant and causes more harm than good.
|
|
64
|
+
fix_code:
|
|
65
|
+
- language: yaml
|
|
66
|
+
label: "caller.yml — standard concurrency group using github.workflow (safe here)"
|
|
67
|
+
code: |
|
|
68
|
+
name: CI
|
|
69
|
+
|
|
70
|
+
on:
|
|
71
|
+
push:
|
|
72
|
+
branches: [main]
|
|
73
|
+
|
|
74
|
+
concurrency:
|
|
75
|
+
group: ${{ github.workflow }}-${{ github.ref }}
|
|
76
|
+
cancel-in-progress: true
|
|
77
|
+
|
|
78
|
+
jobs:
|
|
79
|
+
test:
|
|
80
|
+
uses: ./.github/workflows/deploy.yml
|
|
81
|
+
with:
|
|
82
|
+
environment: staging
|
|
83
|
+
|
|
84
|
+
- language: yaml
|
|
85
|
+
label: "deploy.yml (called) — DO NOT use github.workflow; use a static name instead"
|
|
86
|
+
code: |
|
|
87
|
+
name: Deploy
|
|
88
|
+
# NOTE: when called, github.workflow returns "CI" (caller name), NOT "Deploy"
|
|
89
|
+
|
|
90
|
+
on:
|
|
91
|
+
workflow_call:
|
|
92
|
+
inputs:
|
|
93
|
+
environment:
|
|
94
|
+
type: string
|
|
95
|
+
required: true
|
|
96
|
+
|
|
97
|
+
# ✅ CORRECT: static unique identifier — never collides with caller group
|
|
98
|
+
concurrency:
|
|
99
|
+
group: deploy-called-${{ inputs.environment }}-${{ github.ref }}
|
|
100
|
+
cancel-in-progress: false
|
|
101
|
+
|
|
102
|
+
# ❌ WRONG: "CI-refs/heads/main" == caller's group → caller gets cancelled
|
|
103
|
+
# concurrency:
|
|
104
|
+
# group: ${{ github.workflow }}-${{ github.ref }}
|
|
105
|
+
# cancel-in-progress: true
|
|
106
|
+
|
|
107
|
+
jobs:
|
|
108
|
+
deploy:
|
|
109
|
+
runs-on: ubuntu-latest
|
|
110
|
+
steps:
|
|
111
|
+
- uses: actions/checkout@v4
|
|
112
|
+
- run: ./deploy.sh ${{ inputs.environment }}
|
|
113
|
+
prevention:
|
|
114
|
+
- "Never use `${{ github.workflow }}` in concurrency group expressions inside reusable (workflow_call) workflows."
|
|
115
|
+
- "Use hard-coded or input-derived identifiers in called workflow concurrency groups to ensure uniqueness."
|
|
116
|
+
- "If the caller already manages concurrency at the job level, omit concurrency from the called workflow entirely."
|
|
117
|
+
- "Test by triggering two back-to-back runs and confirming neither cancels the other unexpectedly."
|
|
118
|
+
docs:
|
|
119
|
+
- url: "https://docs.github.com/en/actions/sharing-automations/reusing-workflows"
|
|
120
|
+
label: "GitHub Docs: Reusing workflows — concurrency warning"
|
|
121
|
+
- url: "https://docs.github.com/en/actions/writing-workflows/choosing-when-your-workflow-runs/using-concurrency"
|
|
122
|
+
label: "GitHub Docs: Using concurrency in GitHub Actions"
|
|
123
|
+
- url: "https://github.com/actions/runner/issues/3205"
|
|
124
|
+
label: "actions/runner#3205: github.workflow context causes child workflow cancellation"
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
id: concurrency-timing-018
|
|
2
|
+
title: "Jobs Dispatched to Runner Scale Set Never Start — Label Mismatch or Registration Failure"
|
|
3
|
+
category: concurrency-timing
|
|
4
|
+
severity: error
|
|
5
|
+
tags:
|
|
6
|
+
- runner-scale-set
|
|
7
|
+
- self-hosted
|
|
8
|
+
- job-routing
|
|
9
|
+
- label
|
|
10
|
+
- registration
|
|
11
|
+
- arc
|
|
12
|
+
- actions-runner-controller
|
|
13
|
+
- stuck
|
|
14
|
+
patterns:
|
|
15
|
+
- regex: "Waiting for a runner to pick up this job"
|
|
16
|
+
flags: "i"
|
|
17
|
+
- regex: "runner scale set.*not found"
|
|
18
|
+
flags: "im"
|
|
19
|
+
- regex: "No runner matching the required labels .*\\[.*\\] is online"
|
|
20
|
+
flags: "i"
|
|
21
|
+
- regex: "Could not find any online and idle runners that match the requested labels"
|
|
22
|
+
flags: "i"
|
|
23
|
+
error_messages:
|
|
24
|
+
- "Waiting for a runner to pick up this job..."
|
|
25
|
+
- "No runner matching the required labels '[self-hosted, production-scale-set]' is online."
|
|
26
|
+
- "Could not find any online and idle runners that match the requested labels."
|
|
27
|
+
- "This job was lost while waiting for a runner. Please re-run the job."
|
|
28
|
+
root_cause: |
|
|
29
|
+
GitHub Actions runner scale sets (introduced via Actions Runner Controller / ARC and the
|
|
30
|
+
newer Go-based runner scale set client) dispatch jobs to a dynamically scaling pool of
|
|
31
|
+
runners. When a job is queued but no runner starts to process it, several misconfiguration
|
|
32
|
+
scenarios cause the job to wait indefinitely:
|
|
33
|
+
|
|
34
|
+
1. **Label mismatch**: The workflow's `runs-on` label does not exactly match the
|
|
35
|
+
runner scale set's configured name/label. Labels are case-sensitive. A workflow
|
|
36
|
+
specifying `runs-on: prod-scale-set` will never match a scale set registered as
|
|
37
|
+
`Prod-Scale-Set` or `production-scale-set`.
|
|
38
|
+
|
|
39
|
+
2. **Registration failure**: The scale set client failed to register with GitHub
|
|
40
|
+
(bad `githubConfigUrl`, expired PAT, GitHub App credentials rotated) but no alert
|
|
41
|
+
was generated. The runner group appears in the UI but receives no jobs because it
|
|
42
|
+
has zero healthy, registered instances.
|
|
43
|
+
|
|
44
|
+
3. **Min runners set to 0, scale-up blocked**: The scale set is configured with
|
|
45
|
+
`minRunners: 0` and depends on job events to scale up, but a network policy,
|
|
46
|
+
firewall rule, or Kubernetes resource quota prevents new runner pods from starting.
|
|
47
|
+
The broker sees a healthy scale set but new runners cannot provision.
|
|
48
|
+
|
|
49
|
+
4. **Runner listener terminated unexpectedly**: The runner listener process (the
|
|
50
|
+
manager component) crashed silently. GitHub shows the scale set as registered but
|
|
51
|
+
it has no workers accepting the dispatch message.
|
|
52
|
+
|
|
53
|
+
5. **Org/repo runner group restriction**: The runner group containing the scale set
|
|
54
|
+
has repository-access restrictions — the requesting repo is not in the allowed list.
|
|
55
|
+
|
|
56
|
+
Jobs in this state do not produce an explicit error in the Actions log; they are
|
|
57
|
+
stuck in "Waiting for a runner" indefinitely until they hit the 35-day job timeout
|
|
58
|
+
or are manually cancelled.
|
|
59
|
+
fix: |
|
|
60
|
+
1. **Verify label exact-match**: Compare `runs-on:` value in the workflow against
|
|
61
|
+
the registered runner scale set name in Settings → Actions → Runners. Must be
|
|
62
|
+
an exact case-sensitive match.
|
|
63
|
+
|
|
64
|
+
2. **Check registration health**: In the runner scale set client logs (or ARC
|
|
65
|
+
controller pod logs), look for registration errors. Re-create credentials if
|
|
66
|
+
the GitHub App or PAT has been rotated.
|
|
67
|
+
|
|
68
|
+
3. **Inspect runner group access**: Settings → Actions → Runner groups → the group
|
|
69
|
+
containing your scale set → confirm the requesting repo is listed under
|
|
70
|
+
"Repository access".
|
|
71
|
+
|
|
72
|
+
4. **Check scale-up events**: Review Kubernetes events (`kubectl get events -n <ns>`)
|
|
73
|
+
for resource quota exceeded, image pull failures, or scheduling constraints that
|
|
74
|
+
prevent runner pods from starting.
|
|
75
|
+
|
|
76
|
+
5. **Restart the runner listener**: If the manager/listener process crashed, restart
|
|
77
|
+
the scale set client deployment or the ARC AutoscalingRunnerSet resource.
|
|
78
|
+
fix_code:
|
|
79
|
+
- language: yaml
|
|
80
|
+
label: "Workflow: verify exact label matches your scale set registration name"
|
|
81
|
+
code: |
|
|
82
|
+
jobs:
|
|
83
|
+
build:
|
|
84
|
+
# Must exactly match the scale set name registered in Settings → Actions → Runners
|
|
85
|
+
# e.g., registered as "prod-k8s-runners" → use exactly "prod-k8s-runners"
|
|
86
|
+
runs-on: prod-k8s-runners # ← exact case-sensitive match required
|
|
87
|
+
steps:
|
|
88
|
+
- uses: actions/checkout@v4
|
|
89
|
+
- run: echo "Running on scale set runner"
|
|
90
|
+
- language: yaml
|
|
91
|
+
label: "Diagnose label and runner group access via GitHub API"
|
|
92
|
+
code: |
|
|
93
|
+
# List runner groups for an org (requires admin PAT):
|
|
94
|
+
# GET /orgs/{org}/actions/runner-groups
|
|
95
|
+
#
|
|
96
|
+
# List repos allowed to use a specific runner group:
|
|
97
|
+
# GET /orgs/{org}/actions/runner-groups/{group_id}/repositories
|
|
98
|
+
#
|
|
99
|
+
# List self-hosted runners in a group:
|
|
100
|
+
# GET /orgs/{org}/actions/runner-groups/{group_id}/runners
|
|
101
|
+
#
|
|
102
|
+
# Using gh CLI:
|
|
103
|
+
#   gh api /orgs/MY-ORG/actions/runner-groups --jq '.runner_groups[].runners_url'
|
|
104
|
+
- name: Print runner labels (inside the running job)
|
|
105
|
+
run: |
|
|
106
|
+
echo "Runner name: ${{ runner.name }}"
|
|
107
|
+
echo "Runner OS: ${{ runner.os }}"
|
|
108
|
+
echo "Runner arch: ${{ runner.arch }}"
|
|
109
|
+
prevention:
|
|
110
|
+
- "Treat runner scale set names as API contracts — document them in a shared wiki and never rename without updating all workflow files."
|
|
111
|
+
- "Set up monitoring on the runner scale set listener/controller pod — alert if it crashes or stops processing dispatches."
|
|
112
|
+
- "Configure `minRunners: 1` on production scale sets to keep at least one warm runner active; `minRunners: 0` requires flawless scale-up on every job dispatch."
|
|
113
|
+
- "Add a `timeout-minutes` on jobs dispatched to scale sets so stuck jobs fail fast rather than consuming the 35-day wait limit."
|
|
114
|
+
- "After rotating GitHub App credentials or PATs used by the scale set client, immediately verify a test workflow completes on the scale set."
|
|
115
|
+
docs:
|
|
116
|
+
- url: "https://docs.github.com/en/actions/hosting-your-own-runners/managing-self-hosted-runners-with-actions-runner-controller/deploying-runner-scale-sets-with-actions-runner-controller"
|
|
117
|
+
label: "GitHub Docs: Deploying runner scale sets with Actions Runner Controller"
|
|
118
|
+
- url: "https://github.blog/changelog/2026-02-05-github-actions-early-february-2026-updates/"
|
|
119
|
+
label: "GitHub Changelog: Actions early February 2026 updates (runner scale set client)"
|
|
120
|
+
- url: "https://github.com/orgs/community/discussions/171291"
|
|
121
|
+
label: "Community Discussion: Runner scale set jobs not being dispatched"
|
|
122
|
+
- url: "https://docs.github.com/en/actions/hosting-your-own-runners/managing-self-hosted-runners/managing-access-to-self-hosted-runners-using-groups"
|
|
123
|
+
label: "GitHub Docs: Managing access to self-hosted runners using groups"
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
id: concurrency-timing-022
|
|
2
|
+
title: "Concurrent Self-Hosted Runner Workers Share _temp — Cancelled Job Wipes Active Job Files"
|
|
3
|
+
category: concurrency-timing
|
|
4
|
+
severity: error
|
|
5
|
+
tags:
|
|
6
|
+
- self-hosted-runner
|
|
7
|
+
- temp-directory
|
|
8
|
+
- race-condition
|
|
9
|
+
- worker
|
|
10
|
+
- non-ephemeral
|
|
11
|
+
patterns:
|
|
12
|
+
- regex: "Missing file at path:.*_temp.*_runner_file_commands.*set_output"
|
|
13
|
+
flags: "i"
|
|
14
|
+
- regex: "TempDirectoryManager.*Cleaning runner temp folder"
|
|
15
|
+
flags: "i"
|
|
16
|
+
- regex: "We are not yet checking the state of jobrequest.*Cancel running worker right away"
|
|
17
|
+
flags: "i"
|
|
18
|
+
- regex: "Worker finished for job.*Code: 102"
|
|
19
|
+
flags: "i"
|
|
20
|
+
error_messages:
|
|
21
|
+
- "Error: Missing file at path: /home/runner/work/_temp/_runner_file_commands/set_output_<uuid>"
|
|
22
|
+
- "[TempDirectoryManager] Cleaning runner temp folder: /home/runner/work/_temp"
|
|
23
|
+
- "[JobDispatcher] We are not yet checking the state of jobrequest ... Cancel running worker right away."
|
|
24
|
+
- "Worker finished for job ... Code: 102"
|
|
25
|
+
root_cause: |
|
|
26
|
+
When two jobs are dispatched to the same non-ephemeral self-hosted runner in quick succession,
|
|
27
|
+
the runner cancels the first Worker and immediately spawns a second Worker — both run
|
|
28
|
+
simultaneously for several seconds. Both Workers share the same _temp directory
|
|
29
|
+
(_work/_temp). The cancelled Worker's TempDirectoryManager cleanup fires 10-30 seconds
|
|
30
|
+
after cancellation, which is after the new Worker has already created its file-command
|
|
31
|
+
pipes (_runner_file_commands/set_output_*, step_summary_*) in the shared _temp.
|
|
32
|
+
|
|
33
|
+
The cleanup unconditionally wipes the entire _temp folder, deleting the new Worker's active
|
|
34
|
+
file-command pipes mid-step. The new job exits with code 102
|
|
35
|
+
(runner infrastructure failure), not a workflow step failure.
|
|
36
|
+
|
|
37
|
+
Root issue: JobDispatcher does not wait for the previous Worker process to exit before
|
|
38
|
+
spawning the new Worker. Both PIDs coexist on the same temp path.
|
|
39
|
+
Reported in actions/runner#4357, Apr 2026; fix proposed in PR #4371.
|
|
40
|
+
fix: |
|
|
41
|
+
Primary fix (recommended): Use ephemeral runners.
|
|
42
|
+
Ephemeral runners (--ephemeral) start fresh for each job and never receive a second job
|
|
43
|
+
assignment, eliminating the race entirely.
|
|
44
|
+
|
|
45
|
+
Alternative: Serialize jobs with a workflow concurrency group.
|
|
46
|
+
If ephemeral mode is not an option, configure the workflow concurrency group to prevent
|
|
47
|
+
two workflows from targeting the same runner label simultaneously.
|
|
48
|
+
|
|
49
|
+
Alternative: Use a per-job container.
|
|
50
|
+
Run each job inside an isolated Docker container on the self-hosted runner so
|
|
51
|
+
the file paths do not collide between concurrent jobs.
|
|
52
|
+
fix_code:
|
|
53
|
+
- language: yaml
|
|
54
|
+
label: "Use ephemeral self-hosted runner (recommended)"
|
|
55
|
+
code: |
|
|
56
|
+
# Register your runner with --ephemeral flag:
|
|
57
|
+
# ./config.sh --url https://github.com/org/repo --token TOKEN --ephemeral
|
|
58
|
+
#
|
|
59
|
+
# In your workflow, the runner label stays the same:
|
|
60
|
+
jobs:
|
|
61
|
+
build:
|
|
62
|
+
runs-on: [self-hosted, linux, x64]
|
|
63
|
+
steps:
|
|
64
|
+
- uses: actions/checkout@v4
|
|
65
|
+
- run: make build
|
|
66
|
+
- language: yaml
|
|
67
|
+
label: "Limit per-runner concurrency via workflow concurrency group"
|
|
68
|
+
code: |
|
|
69
|
+
concurrency:
|
|
70
|
+
group: self-hosted-runner-linux-x64  # constant per runner label (not per ref) so runs from different branches cannot overlap
|
|
71
|
+
cancel-in-progress: false # Queue instead of cancel
|
|
72
|
+
|
|
73
|
+
jobs:
|
|
74
|
+
build:
|
|
75
|
+
runs-on: [self-hosted, linux, x64]
|
|
76
|
+
steps:
|
|
77
|
+
- uses: actions/checkout@v4
|
|
78
|
+
- run: make build
|
|
79
|
+
prevention:
|
|
80
|
+
- "Use ephemeral runners for all self-hosted CI workloads — they also improve security and reduce state leakage between jobs."
|
|
81
|
+
- "Avoid non-ephemeral runners for workflows that run frequently; shorter intervals increase the chance of dispatch overlap."
|
|
82
|
+
- "Monitor for exit code 102 in workflow logs — it indicates a runner infrastructure failure, not a step failure."
|
|
83
|
+
- "Track actions/runner#4357 for an official fix to TempDirectoryManager shared-path cleanup behavior."
|
|
84
|
+
docs:
|
|
85
|
+
- url: "https://github.com/actions/runner/issues/4357"
|
|
86
|
+
label: "actions/runner#4357 — TempDirectoryManager race condition (Apr 2026)"
|
|
87
|
+
- url: "https://docs.github.com/en/actions/hosting-your-own-runners/managing-self-hosted-runners/autoscaling-with-self-hosted-runners#using-ephemeral-runners-for-autoscaling"
|
|
88
|
+
label: "Ephemeral runners for autoscaling"
|
|
89
|
+
- url: "https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/using-concurrency"
|
|
90
|
+
label: "Using concurrency"
|