@htekdev/actions-debugger 1.0.23 → 1.0.24

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. package/errors/caching-artifacts/artifact-minimum-retention-one-day.yml +153 -0
  2. package/errors/caching-artifacts/cache-api-propagation-delay-post-save.yml +128 -0
  3. package/errors/caching-artifacts/cache-backend-internal-error-skipped.yml +75 -0
  4. package/errors/caching-artifacts/cache-hit-step-id-case-sensitive-mismatch.yml +95 -0
  5. package/errors/caching-artifacts/cache-save-post-step-skipped-on-failure.yml +114 -0
  6. package/errors/concurrency-timing/deploy-pages-in-progress-deployment-wedged.yml +70 -0
  7. package/errors/concurrency-timing/deployment-review-timeout-expired.yml +88 -0
  8. package/errors/concurrency-timing/job-concurrency-scope-per-run-not-global.yml +81 -0
  9. package/errors/concurrency-timing/merge-queue-concurrency-cancel-blocks-all.yml +86 -0
  10. package/errors/concurrency-timing/reusable-workflow-github-workflow-context-cancel.yml +124 -0
  11. package/errors/concurrency-timing/runner-scale-set-jobs-never-start.yml +123 -0
  12. package/errors/concurrency-timing/runner-temp-dir-race-concurrent-workers.yml +90 -0
  13. package/errors/known-unsolved/artifact-download-url-unauthenticated-404.yml +98 -0
  14. package/errors/known-unsolved/checkout-v6-credentials-docker-run-manual.yml +105 -0
  15. package/errors/known-unsolved/concurrency-groups-repo-scoped-only.yml +138 -0
  16. package/errors/known-unsolved/matrix-256-job-limit.yml +142 -0
  17. package/errors/known-unsolved/merge-group-paths-filter-not-supported.yml +137 -0
  18. package/errors/known-unsolved/no-job-allow-failure.yml +73 -0
  19. package/errors/known-unsolved/schedule-cron-hours-long-queue-drift.yml +101 -0
  20. package/errors/permissions-auth/checkout-persist-credentials-token-write.yml +90 -0
  21. package/errors/permissions-auth/create-github-app-token-cross-job-token-revoked.yml +95 -0
  22. package/errors/permissions-auth/github-token-contents-write-missing-git-push.yml +117 -0
  23. package/errors/permissions-auth/org-actions-policy-blocks-unapproved-action.yml +106 -0
  24. package/errors/runner-environment/codeql-action-v2-deprecated.yml +110 -0
  25. package/errors/runner-environment/macos-26-openssl-3-system-library-breaking.yml +114 -0
  26. package/errors/runner-environment/macos-26-ruby-34-default-upgrade.yml +114 -0
  27. package/errors/runner-environment/macos-26-xcode-default-265-pin-required.yml +99 -0
  28. package/errors/runner-environment/macos-latest-label-switches-to-macos26.yml +127 -0
  29. package/errors/runner-environment/node20-removed-toolcache-default-node22.yml +104 -0
  30. package/errors/runner-environment/powershell-74-76-threadjob-module-rename.yml +124 -0
  31. package/errors/runner-environment/self-hosted-runner-not-found.yml +134 -0
  32. package/errors/runner-environment/self-hosted-runner-selinux-service-exec-failure.yml +116 -0
  33. package/errors/runner-environment/service-container-no-healthcheck.yml +158 -0
  34. package/errors/runner-environment/setup-node-v5-corepack-pnpm-not-found.yml +101 -0
  35. package/errors/runner-environment/setup-node-yarn-not-installed-self-hosted.yml +76 -0
  36. package/errors/runner-environment/setup-python-externally-managed-env-error.yml +95 -0
  37. package/errors/runner-environment/windows-2019-runner-retired-june2025.yml +118 -0
  38. package/errors/runner-environment/windows-2022-docker-daemon-not-started.yml +108 -0
  39. package/errors/silent-failures/cache-hit-output-string-not-boolean.yml +96 -0
  40. package/errors/silent-failures/checkout-lfs-pointer-not-content.yml +105 -0
  41. package/errors/silent-failures/reusable-workflow-output-skipped-contains-secret.yml +115 -0
  42. package/errors/silent-failures/setup-node-silent-download-exit-zero.yml +105 -0
  43. package/errors/silent-failures/setup-python-truncated-manifest-silent-exit.yml +111 -0
  44. package/errors/silent-failures/undefined-env-expression-empty-string-silent.yml +115 -0
  45. package/errors/silent-failures/windows-powershell-github-output-bash-syntax.yml +118 -0
  46. package/errors/triggers/fork-pr-first-time-contributor-approval-required.yml +142 -0
  47. package/errors/triggers/on-push-branches-glob-star-no-slash-match.yml +78 -0
  48. package/errors/triggers/pull-request-target-env-protection-default-branch-eval.yml +117 -0
  49. package/errors/triggers/required-status-check-renamed-never-passes.yml +87 -0
  50. package/errors/triggers/schedule-cron-self-hosted-runner-not-triggered.yml +107 -0
  51. package/errors/yaml-syntax/composite-action-run-shell-missing.yml +90 -0
  52. package/errors/yaml-syntax/composite-action-secrets-context-unavailable.yml +99 -0
  53. package/errors/yaml-syntax/github-script-octokit-renamed-to-github.yml +130 -0
  54. package/errors/yaml-syntax/labeler-v5-config-format-breaking.yml +67 -0
  55. package/errors/yaml-syntax/runs-on-expression-array-syntax-error.yml +121 -0
  56. package/errors/yaml-syntax/setup-go-matrix-version-float-coercion.yml +69 -0
  57. package/package.json +1 -1
@@ -0,0 +1,88 @@
1
+ id: "concurrency-timing-021"
2
+ title: "Deployment job expires when environment review timeout lapses"
3
+ category: "concurrency-timing"
4
+ severity: "error"
5
+ tags:
6
+ - "deployment"
7
+ - "environment"
8
+ - "protection-rule"
9
+ - "review-timeout"
10
+ - "pending"
11
+ - "expired"
12
+ patterns:
13
+ - regex: "was not approved before the review period expired"
14
+ flags: "i"
15
+ - regex: "deployment.*review period expired"
16
+ flags: "i"
17
+ - regex: "Your deployment to .+ expired"
18
+ flags: "i"
19
+ error_messages:
20
+ - "Your deployment to [environment] was not approved before the review period expired."
21
+ root_cause: |
22
+ Environment protection rules that require manual reviewers have a configurable review
23
+ timeout (default 30 days; configurable from 1 to 30 days). When a workflow job waits
24
+ for approval and no reviewer acts within that window, the deployment job automatically
25
+ fails with a timeout error.
26
+
27
+ This creates a silent cascade: while the job is pending, it appears as "Waiting" in
28
+ the UI with no failure indicator. Needs-dependent jobs that run after the deployment
29
+ also remain queued. When the timeout fires, the deployment job fails, which then
30
+ cascade-fails all downstream jobs simultaneously — a large batch of failures with a
31
+ confusing single root cause.
32
+
33
+ Common triggers:
34
+ - Teams shorten the timeout from 30 days to 1-7 days for security compliance.
35
+ - Approvers are on leave or in different time zones and miss the notification.
36
+ - A deployment is triggered late Friday; nobody approves over the weekend.
37
+ - The notification email was caught by spam filters or notification fatigue.
38
+ fix: |
39
+ 1. Re-run the failed workflow — a fresh run starts a new review clock.
40
+ 2. Increase the review timeout in: repo Settings → Environments → [env]
41
+ → Protection rules → Required reviewers → "Timeout".
42
+ 3. Add more reviewers or a team with guaranteed availability across time zones.
43
+ 4. Send a proactive notification (Slack, Teams, PagerDuty) at deploy-queue time
44
+ so reviewers don't have to poll GitHub.
45
+ fix_code:
46
+ - language: yaml
47
+ label: "Notify reviewers immediately when the deployment enters the queue"
48
+ code: |
49
+ jobs:
50
+ notify-pending:
51
+ runs-on: ubuntu-latest
52
+ steps:
53
+ - name: Notify reviewers that approval is needed
54
+ uses: slackapi/slack-github-action@v2
55
+ with:
56
+ webhook: ${{ secrets.SLACK_WEBHOOK_URL }}
57
+ webhook-type: incoming-webhook
58
+ payload: |
59
+ {
60
+ "text": ":rocket: Deployment to *production* is waiting for review.\n<${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|Approve here>"
61
+ }
62
+
63
+ deploy:
64
+ needs: notify-pending
65
+ runs-on: ubuntu-latest
66
+ environment: production
67
+ steps:
68
+ - name: Deploy
69
+ run: ./deploy.sh
70
+
71
+ - language: yaml
72
+ label: "Re-run the expired workflow via GitHub CLI or a no-op commit (escape hatch)"
73
+ code: |
74
+ # After a timeout expiry, re-run the failed workflow via GitHub CLI:
75
+ # gh run rerun <run-id> --repo owner/repo --failed
76
+ # Or push a no-op commit to trigger a fresh run.
77
+ prevention:
78
+ - "Configure Slack/Teams/email notifications that fire when a deployment enters the pending approval state — do not rely on reviewers proactively checking GitHub."
79
+ - "Add multiple reviewers or use a team as reviewer so a single person's absence doesn't block the pipeline."
80
+ - "Do not shorten the review timeout below what your team can realistically respond to (e.g., account for weekends and holidays)."
81
+ - "Use the wait-timer setting only if you need a minimum delay; set it lower than the review timeout."
82
+ docs:
83
+ - url: "https://docs.github.com/en/actions/managing-workflow-runs/reviewing-deployments"
84
+ label: "GitHub Docs — Reviewing deployments"
85
+ - url: "https://docs.github.com/en/actions/deployment/targeting-different-environments/using-environments-for-deployment#required-reviewers"
86
+ label: "GitHub Docs — Environment protection rules: required reviewers and timeout"
87
+ - url: "https://github.com/orgs/community/discussions/72259"
88
+ label: "GitHub Community #72259 — Approving environment deployment in Actions"
@@ -0,0 +1,81 @@
1
+ id: concurrency-timing-016
2
+ title: "Job-Level Concurrency Does Not Prevent Parallel Runs Across Different Workflow Runs"
3
+ category: concurrency-timing
4
+ severity: silent-failure
5
+ tags:
6
+ - concurrency
7
+ - job-level
8
+ - cross-run
9
+ - mutex
10
+ - deployment
11
+ patterns:
12
+ - regex: 'jobs\.[a-zA-Z_-]+\.concurrency'
13
+ flags: "i"
14
+ - regex: 'concurrency:\s*\n\s+group:'
15
+ flags: "im"
16
+ error_messages:
17
+ - "Multiple deployments running simultaneously"
18
+ - "Concurrent job executions detected"
19
+ root_cause: |
20
+ When the `concurrency:` block is placed at the job level (`jobs.<job_id>.concurrency`)
21
+ rather than the workflow level, the concurrency group is scoped to a single workflow
22
+ run. This means:
23
+ - Within one run: duplicate jobs queue against each other (per-run serialization).
24
+ - Across different runs: jobs with identical concurrency keys execute in parallel with
25
+ no queuing or cancellation.
26
+
27
+ Developers expecting a global mutex — for example, preventing two simultaneous
28
+ deployments triggered by separate PRs — are surprised when both deploys run
29
+ concurrently. The GitHub Docs note explicitly states: "If concurrency is specified
30
+ at a job level, the concurrency group is scoped to that job within the workflow run."
31
+
32
+ This is especially confusing because the workflow-level concurrency block syntax is
33
+ nearly identical to the job-level one, so copy-paste errors are common.
34
+ fix: |
35
+ Move the concurrency block from the job level to the workflow level (top of the
36
+ workflow file, alongside `on:` and `name:`). Workflow-level concurrency provides
37
+ cross-run serialization and prevents parallel executions across all workflow runs.
38
+
39
+ For deployment workflows, use cancel-in-progress: false to queue rather than cancel
40
+ in-flight deployments, preventing accidental skips of intermediate releases.
41
+ fix_code:
42
+ - language: yaml
43
+ label: "Wrong: job-level concurrency (only per-run, not cross-run)"
44
+ code: |
45
+ # DO NOT USE for cross-run mutex — this only queues within one run
46
+ jobs:
47
+ deploy:
48
+ runs-on: ubuntu-latest
49
+ concurrency:
50
+ group: deploy-${{ github.ref }}
51
+ cancel-in-progress: true
52
+ steps:
53
+ - run: ./deploy.sh
54
+ - language: yaml
55
+ label: "Correct: workflow-level concurrency (true cross-run mutex)"
56
+ code: |
57
+ name: Deploy
58
+ on: push
59
+
60
+ # Workflow-level: prevents parallel deploys across ALL simultaneous runs
61
+ concurrency:
62
+ group: deploy-${{ github.ref }}
63
+ cancel-in-progress: false # queue deploys, don't skip intermediate releases
64
+
65
+ jobs:
66
+ deploy:
67
+ runs-on: ubuntu-latest
68
+ steps:
69
+ - uses: actions/checkout@v4
70
+ - run: ./deploy.sh
71
+ prevention:
72
+ - "Always use workflow-level concurrency when the goal is preventing simultaneous deployments across PRs or pushes."
73
+ - "Use job-level concurrency only for intra-run fan-out limiting — document intent with a comment."
74
+ - "Verify cross-run mutex behavior by triggering two simultaneous push runs and checking the Actions tab."
75
+ - "Prefer cancel-in-progress: false for deploy workflows to avoid skipping intermediate releases."
76
+ - "Search your workflows for `jobs.<name>.concurrency` and audit whether cross-run isolation is the intent."
77
+ docs:
78
+ - url: "https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/controlling-concurrency"
79
+ label: "GitHub Docs: Controlling concurrency"
80
+ - url: "https://docs.github.com/en/actions/writing-workflows/workflow-syntax-for-github-actions#concurrency"
81
+ label: "Workflow syntax: concurrency"
@@ -0,0 +1,86 @@
1
+ id: concurrency-timing-017
2
+ title: "merge_group Trigger With cancel-in-progress Cascades Cancellations Across Queued Merges"
3
+ category: concurrency-timing
4
+ severity: error
5
+ tags:
6
+ - merge-queue
7
+ - merge_group
8
+ - concurrency
9
+ - cancel-in-progress
10
+ - branch-protection
11
+ patterns:
12
+ - regex: "merge_group"
13
+ flags: "i"
14
+ - regex: 'cancel-in-progress:\s*true'
15
+ flags: "i"
16
+ error_messages:
17
+ - "Some checks were not successful"
18
+ - "This check was cancelled"
19
+ - "Run was cancelled by a newer queued run"
20
+ root_cause: |
21
+ GitHub's merge queue processes PRs by grouping them and running CI on each queued
22
+ batch via the `merge_group` event. When a workflow triggered by `merge_group` uses a
23
+ shared concurrency group key with `cancel-in-progress: true`, each new PR entry in
24
+ the queue triggers a new workflow run. Because the concurrency key is shared (e.g.,
25
+ based on the branch name or workflow name), the new run cancels the already-running
26
+ check for the previous queued PR.
27
+
28
+ The cancelled PR's required check fails or is marked cancelled, causing GitHub to
29
+ remove that PR from the merge queue and requeue it. The requeued PR triggers a new
30
+ run — which cancels the next PR's check — creating an infinite cascade. No PR
31
+ successfully merges.
32
+
33
+ The merge queue is designed to serialize merges safely. Adding `cancel-in-progress:
34
+ true` to `merge_group` workflows defeats the serialization and creates cascading
35
+ failures.
36
+ fix: |
37
+ For `merge_group` events, always use `cancel-in-progress: false` (or omit it).
38
+ Each merge queue entry must complete independently. To keep cancellation for regular
39
+ pull_request events (to cancel superseded draft checks), use a conditional expression:
40
+
41
+ concurrency:
42
+ group: ci-${{ github.event.merge_group.head_sha || github.head_ref || github.ref }}
43
+ cancel-in-progress: ${{ github.event_name == 'pull_request' }}
44
+
45
+ This cancels in-progress runs for updated PRs but never cancels merge queue checks.
46
+ fix_code:
47
+ - language: yaml
48
+ label: "Safe: conditional cancel-in-progress for merge_group + pull_request"
49
+ code: |
50
+ on:
51
+ pull_request:
52
+ merge_group:
53
+
54
+ concurrency:
55
+ # Use merge_group head SHA for unique per-entry key; fall back for PR events
56
+ group: ci-${{ github.event.merge_group.head_sha || github.head_ref || github.ref }}
57
+ # Only cancel for PRs (draft updates), NEVER for merge queue entries
58
+ cancel-in-progress: ${{ github.event_name == 'pull_request' }}
59
+
60
+ jobs:
61
+ test:
62
+ runs-on: ubuntu-latest
63
+ steps:
64
+ - uses: actions/checkout@v4
65
+ - run: npm test
66
+ - language: yaml
67
+ label: "Broken: cancel-in-progress true on merge_group causes cascade"
68
+ code: |
69
+ # DO NOT USE — cancels queued merge checks, nothing merges
70
+ on:
71
+ merge_group:
72
+ concurrency:
73
+ group: ${{ github.workflow }}-${{ github.ref }}
74
+ cancel-in-progress: true
75
+ prevention:
76
+ - "Never use cancel-in-progress: true when the workflow is triggered by the merge_group event."
77
+ - "Use github.event.merge_group.head_sha in the concurrency key to ensure each queue entry gets a unique group."
78
+ - "Test merge queue behavior by simultaneously queuing 2-3 PRs and confirming all checks complete."
79
+ - "If combining pull_request and merge_group triggers, make cancel-in-progress conditional on the event type."
80
+ docs:
81
+ - url: "https://docs.github.com/en/repositories/configuring-branches-and-merges-in-your-repository/configuring-pull-request-merges/managing-a-merge-queue"
82
+ label: "GitHub Docs: Managing a merge queue"
83
+ - url: "https://docs.github.com/en/actions/writing-workflows/choosing-when-your-workflow-runs/events-that-trigger-workflows#merge_group"
84
+ label: "Events that trigger workflows: merge_group"
85
+ - url: "https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/controlling-concurrency"
86
+ label: "GitHub Docs: Controlling concurrency"
@@ -0,0 +1,124 @@
1
+ id: concurrency-timing-019
2
+ title: "Reusable Workflow Inherits Caller's ${{ github.workflow }} — Concurrency Group Collision Cancels Caller"
3
+ category: concurrency-timing
4
+ severity: error
5
+ tags:
6
+ - reusable-workflow
7
+ - workflow-call
8
+ - concurrency
9
+ - github-workflow-context
10
+ - cancel-in-progress
11
+ - context-inheritance
12
+ patterns:
13
+ - regex: "Run was cancelled|Canceling since a higher priority waiting request"
14
+ flags: "i"
15
+ - regex: "called workflow.*cancel|reusable.*workflow.*concurrency"
16
+ flags: "i"
17
+ - regex: "github\\.workflow.*concurrency group|concurrency.*reusable"
18
+ flags: "i"
19
+ error_messages:
20
+ - "Run was cancelled"
21
+ - "Canceling since a higher priority waiting request for '...' exists"
22
+ - "This run was cancelled because another run in the same concurrency group was started"
23
+ root_cause: |
24
+ When a called (reusable) workflow uses `${{ github.workflow }}` in its `concurrency.group`
25
+ expression, it inherits the CALLER workflow's name — not its own file name. This causes both
26
+ the caller and the called workflow to join the same concurrency group, leading to cascading
27
+ cancellations.
28
+
29
+ Example: Caller workflow named "CI" calls "Deploy" as a reusable workflow. Both define:
30
+ concurrency:
31
+ group: ${{ github.workflow }}-${{ github.ref }}
32
+ cancel-in-progress: true
33
+
34
+ At runtime, `github.workflow` inside "Deploy" evaluates to "CI" — the same string as the
35
+ caller. When the called workflow starts, `cancel-in-progress: true` fires and cancels the
36
+ still-running caller workflow.
37
+
38
+ This is documented behavior: the GitHub Actions runner passes the caller workflow's
39
+ `github` context to all called workflows. The called workflow has no way to determine its
40
+ own filename via context expressions alone.
41
+
42
+ The GitHub Docs now include an explicit warning: "A called workflow uses the name of its
43
+ caller workflow in ${{ github.workflow }}, so using this context as the value of
44
+ jobs.<job_id>.concurrency.group in both caller and called workflows will cause the caller
45
+ workflow to be cancelled when the called workflow runs."
46
+
47
+ Source: actions/runner#3205 (github.workflow context variable causes child workflow runs to
48
+ be prematurely canceled — opened Mar 2024, closed as stale Apr 2025 with no official fix)
49
+ Source: GitHub Docs — Using concurrency with reusable workflows
50
+ fix: |
51
+ Do not use `${{ github.workflow }}` in concurrency group expressions inside reusable
52
+ (workflow_call) workflows.
53
+
54
+ Option A — Hard-code a unique identifier for the called workflow:
55
+ Use a static string that cannot match the caller's concurrency group.
56
+
57
+ Option B — Pass caller context as an explicit input:
58
+ Accept a `concurrency_key` input from the caller and use that in the called
59
+ workflow's concurrency group, giving the caller full control.
60
+
61
+ Option C — Remove concurrency from the called workflow entirely:
62
+ If the caller manages concurrency at the job level, the called workflow's internal
63
+ concurrency is often redundant and causes more harm than good.
64
+ fix_code:
65
+ - language: yaml
66
+ label: "caller.yml — standard concurrency group using github.workflow (safe here)"
67
+ code: |
68
+ name: CI
69
+
70
+ on:
71
+ push:
72
+ branches: [main]
73
+
74
+ concurrency:
75
+ group: ${{ github.workflow }}-${{ github.ref }}
76
+ cancel-in-progress: true
77
+
78
+ jobs:
79
+ test:
80
+ uses: ./.github/workflows/deploy.yml
81
+ with:
82
+ environment: staging
83
+
84
+ - language: yaml
85
+ label: "deploy.yml (called) — DO NOT use github.workflow; use a static name instead"
86
+ code: |
87
+ name: Deploy
88
+ # NOTE: when called, github.workflow returns "CI" (caller name), NOT "Deploy"
89
+
90
+ on:
91
+ workflow_call:
92
+ inputs:
93
+ environment:
94
+ type: string
95
+ required: true
96
+
97
+ # ✅ CORRECT: static unique identifier — never collides with caller group
98
+ concurrency:
99
+ group: deploy-called-${{ inputs.environment }}-${{ github.ref }}
100
+ cancel-in-progress: false
101
+
102
+ # ❌ WRONG: "CI-refs/heads/main" == caller's group → caller gets cancelled
103
+ # concurrency:
104
+ # group: ${{ github.workflow }}-${{ github.ref }}
105
+ # cancel-in-progress: true
106
+
107
+ jobs:
108
+ deploy:
109
+ runs-on: ubuntu-latest
110
+ steps:
111
+ - uses: actions/checkout@v4
112
+ - run: ./deploy.sh ${{ inputs.environment }}
113
+ prevention:
114
+ - "Never use `${{ github.workflow }}` in concurrency group expressions inside reusable (workflow_call) workflows."
115
+ - "Use hard-coded or input-derived identifiers in called workflow concurrency groups to ensure uniqueness."
116
+ - "If the caller already manages concurrency at the job level, omit concurrency from the called workflow entirely."
117
+ - "Test by triggering two back-to-back runs and confirming neither cancels the other unexpectedly."
118
+ docs:
119
+ - url: "https://docs.github.com/en/actions/sharing-automations/reusing-workflows"
120
+ label: "GitHub Docs: Reusing workflows — concurrency warning"
121
+ - url: "https://docs.github.com/en/actions/writing-workflows/choosing-when-your-workflow-runs/using-concurrency"
122
+ label: "GitHub Docs: Using concurrency in GitHub Actions"
123
+ - url: "https://github.com/actions/runner/issues/3205"
124
+ label: "actions/runner#3205: github.workflow context causes child workflow cancellation"
@@ -0,0 +1,123 @@
1
+ id: concurrency-timing-018
2
+ title: "Jobs Dispatched to Runner Scale Set Never Start — Label Mismatch or Registration Failure"
3
+ category: concurrency-timing
4
+ severity: error
5
+ tags:
6
+ - runner-scale-set
7
+ - self-hosted
8
+ - job-routing
9
+ - label
10
+ - registration
11
+ - arc
12
+ - actions-runner-controller
13
+ - stuck
14
+ patterns:
15
+ - regex: "Waiting for a runner to pick up this job"
16
+ flags: "i"
17
+ - regex: "runner scale set.*not found"
18
+ flags: "im"
19
+ - regex: "No runner matching the required labels .*\\[.*\\] is online"
20
+ flags: "i"
21
+ - regex: "Could not find any online and idle runners that match the requested labels"
22
+ flags: "i"
23
+ error_messages:
24
+ - "Waiting for a runner to pick up this job..."
25
+ - "No runner matching the required labels '[self-hosted, production-scale-set]' is online."
26
+ - "Could not find any online and idle runners that match the requested labels."
27
+ - "This job was lost while waiting for a runner. Please re-run the job."
28
+ root_cause: |
29
+ GitHub Actions runner scale sets (introduced via Actions Runner Controller / ARC and the
30
+ newer Go-based runner scale set client) dispatch jobs to a dynamically scaling pool of
31
+ runners. When a job is queued but no runner starts to process it, several misconfiguration
32
+ scenarios cause the job to wait indefinitely:
33
+
34
+ 1. **Label mismatch**: The workflow's `runs-on` label does not exactly match the
35
+ runner scale set's configured name/label. Labels are case-sensitive. A workflow
36
+ specifying `runs-on: prod-scale-set` will never match a scale set registered as
37
+ `Prod-Scale-Set` or `production-scale-set`.
38
+
39
+ 2. **Registration failure**: The scale set client failed to register with GitHub
40
+ (bad `githubConfigUrl`, expired PAT, GitHub App credentials rotated) but no alert
41
+ was generated. The runner group appears in the UI but receives no jobs because it
42
+ has zero healthy, registered instances.
43
+
44
+ 3. **Min runners set to 0, scale-up blocked**: The scale set is configured with
45
+ `minRunners: 0` and depends on job events to scale up, but a network policy,
46
+ firewall rule, or Kubernetes resource quota prevents new runner pods from starting.
47
+ The broker sees a healthy scale set but new runners cannot provision.
48
+
49
+ 4. **Runner listener terminated unexpectedly**: The runner listener process (the
50
+ manager component) crashed silently. GitHub shows the scale set as registered but
51
+ it has no workers accepting the dispatch message.
52
+
53
+ 5. **Org/repo runner group restriction**: The runner group containing the scale set
54
+ has repository-access restrictions — the requesting repo is not in the allowed list.
55
+
56
+ Jobs in this state do not produce an explicit error in the Actions log; they are
57
+ stuck in "Waiting for a runner" indefinitely until they hit the 35-day job timeout
58
+ or are manually cancelled.
59
+ fix: |
60
+ 1. **Verify label exact-match**: Compare `runs-on:` value in the workflow against
61
+ the registered runner scale set name in Settings → Actions → Runners. Must be
62
+ an exact case-sensitive match.
63
+
64
+ 2. **Check registration health**: In the runner scale set client logs (or ARC
65
+ controller pod logs), look for registration errors. Re-create credentials if
66
+ the GitHub App or PAT has been rotated.
67
+
68
+ 3. **Inspect runner group access**: Settings → Actions → Runner groups → the group
69
+ containing your scale set → confirm the requesting repo is listed under
70
+ "Repository access".
71
+
72
+ 4. **Check scale-up events**: Review Kubernetes events (`kubectl get events -n <ns>`)
73
+ for resource quota exceeded, image pull failures, or scheduling constraints that
74
+ prevent runner pods from starting.
75
+
76
+ 5. **Restart the runner listener**: If the manager/listener process crashed, restart
77
+ the scale set client deployment or the ARC AutoscalingRunnerSet resource.
78
+ fix_code:
79
+ - language: yaml
80
+ label: "Workflow: verify exact label matches your scale set registration name"
81
+ code: |
82
+ jobs:
83
+ build:
84
+ # Must exactly match the scale set name registered in Settings → Actions → Runners
85
+ # e.g., registered as "prod-k8s-runners" → use exactly "prod-k8s-runners"
86
+ runs-on: prod-k8s-runners # ← exact case-sensitive match required
87
+ steps:
88
+ - uses: actions/checkout@v4
89
+ - run: echo "Running on scale set runner"
90
+ - language: yaml
91
+ label: "Diagnose label and runner group access via GitHub API"
92
+ code: |
93
+ # List runner groups for an org (requires admin PAT):
94
+ # GET /orgs/{org}/actions/runner-groups
95
+ #
96
+ # List repos allowed to use a specific runner group:
97
+ # GET /orgs/{org}/actions/runner-groups/{group_id}/repositories
98
+ #
99
+ # List self-hosted runners in a group:
100
+ # GET /orgs/{org}/actions/runner-groups/{group_id}/runners
101
+ #
102
+ # Using gh CLI:
103
+ # gh api /orgs/MY-ORG/actions/runner-groups --jq '.runner_groups[].runners_url'
104
+ - name: Print runner name, OS, and architecture (inside the running job)
105
+ run: |
106
+ echo "Runner name: ${{ runner.name }}"
107
+ echo "Runner OS: ${{ runner.os }}"
108
+ echo "Runner arch: ${{ runner.arch }}"
109
+ prevention:
110
+ - "Treat runner scale set names as API contracts — document them in a shared wiki and never rename without updating all workflow files."
111
+ - "Set up monitoring on the runner scale set listener/controller pod — alert if it crashes or stops processing dispatches."
112
+ - "Configure `minRunners: 1` on production scale sets to keep at least one warm runner active; `minRunners: 0` requires flawless scale-up on every job dispatch."
113
+ - "Add a `timeout-minutes` on jobs dispatched to scale sets so stuck jobs fail fast rather than consuming the 35-day wait limit."
114
+ - "After rotating GitHub App credentials or PATs used by the scale set client, immediately verify a test workflow completes on the scale set."
115
+ docs:
116
+ - url: "https://docs.github.com/en/actions/hosting-your-own-runners/managing-self-hosted-runners-with-actions-runner-controller/deploying-runner-scale-sets-with-actions-runner-controller"
117
+ label: "GitHub Docs: Deploying runner scale sets with Actions Runner Controller"
118
+ - url: "https://github.blog/changelog/2026-02-05-github-actions-early-february-2026-updates/"
119
+ label: "GitHub Changelog: Actions early February 2026 updates (runner scale set client)"
120
+ - url: "https://github.com/orgs/community/discussions/171291"
121
+ label: "Community Discussion: Runner scale set jobs not being dispatched"
122
+ - url: "https://docs.github.com/en/actions/hosting-your-own-runners/managing-self-hosted-runners/managing-access-to-self-hosted-runners-using-groups"
123
+ label: "GitHub Docs: Managing access to self-hosted runners using groups"
@@ -0,0 +1,90 @@
1
+ id: concurrency-timing-022
2
+ title: "Concurrent Self-Hosted Runner Workers Share _temp — Cancelled Job Wipes Active Job Files"
3
+ category: concurrency-timing
4
+ severity: error
5
+ tags:
6
+ - self-hosted-runner
7
+ - temp-directory
8
+ - race-condition
9
+ - worker
10
+ - non-ephemeral
11
+ patterns:
12
+ - regex: "Missing file at path:.*_temp.*_runner_file_commands.*set_output"
13
+ flags: "i"
14
+ - regex: "TempDirectoryManager.*Cleaning runner temp folder"
15
+ flags: "i"
16
+ - regex: "We are not yet checking the state of jobrequest.*Cancel running worker right away"
17
+ flags: "i"
18
+ - regex: "Worker finished for job.*Code: 102"
19
+ flags: "i"
20
+ error_messages:
21
+ - "Error: Missing file at path: /home/runner/work/_temp/_runner_file_commands/set_output_<uuid>"
22
+ - "[TempDirectoryManager] Cleaning runner temp folder: /home/runner/work/_temp"
23
+ - "[JobDispatcher] We are not yet checking the state of jobrequest ... Cancel running worker right away."
24
+ - "Worker finished for job ... Code: 102"
25
+ root_cause: |
26
+ When two jobs are dispatched to the same non-ephemeral self-hosted runner in quick succession,
27
+ the runner cancels the first Worker and immediately spawns a second Worker — both run
28
+ simultaneously for several seconds. Both Workers share the same _temp directory
29
+ (_work/_temp). The cancelled Worker's TempDirectoryManager cleanup fires 10-30 seconds
30
+ after cancellation, which is after the new Worker has already created its file-command
31
+ pipes (_runner_file_commands/set_output_*, step_summary_*) in the shared _temp.
32
+
33
+ The cleanup unconditionally wipes the entire _temp folder, deleting the new Worker's active
34
+ file command pipes mid-step. The new job exits with code 102
35
+ (runner infrastructure failure), not a workflow step failure.
36
+
37
+ Root issue: JobDispatcher does not wait for the previous Worker process to exit before
38
+ spawning the new Worker. Both PIDs coexist on the same temp path.
39
+ Reported in actions/runner#4357, Apr 2026; fix proposed in PR #4371.
40
+ fix: |
41
+ Primary fix (recommended): Use ephemeral runners.
42
+ Ephemeral runners (--ephemeral) start fresh for each job and never receive a second job
43
+ assignment, eliminating the race entirely.
44
+
45
+ Alternative: Limit concurrency on the runner group.
46
+ If ephemeral mode is not an option, configure the workflow concurrency group to prevent
47
+ two workflows from targeting the same runner label simultaneously.
48
+
49
+ Alternative: Use a per-job container.
50
+ Run each job inside an isolated Docker container on the self-hosted runner so
51
+ the file paths do not collide between concurrent jobs.
52
+ fix_code:
53
+ - language: yaml
54
+ label: "Use ephemeral self-hosted runner (recommended)"
55
+ code: |
56
+ # Register your runner with --ephemeral flag:
57
+ # ./config.sh --url https://github.com/org/repo --token TOKEN --ephemeral
58
+ #
59
+ # In your workflow, the runner label stays the same:
60
+ jobs:
61
+ build:
62
+ runs-on: [self-hosted, linux, x64]
63
+ steps:
64
+ - uses: actions/checkout@v4
65
+ - run: make build
66
+ - language: yaml
67
+ label: "Limit per-runner concurrency via workflow concurrency group"
68
+ code: |
69
+ concurrency:
70
+ group: self-hosted-runner-${{ github.ref }}
71
+ cancel-in-progress: false # Queue instead of cancel
72
+
73
+ jobs:
74
+ build:
75
+ runs-on: [self-hosted, linux, x64]
76
+ steps:
77
+ - uses: actions/checkout@v4
78
+ - run: make build
79
+ prevention:
80
+ - "Use ephemeral runners for all self-hosted CI workloads — they also improve security and reduce state leakage between jobs."
81
+ - "Avoid non-ephemeral runners for workflows that run frequently; shorter intervals increase the chance of dispatch overlap."
82
+ - "Monitor for exit code 102 in workflow logs — it indicates a runner infrastructure failure, not a step failure."
83
+ - "Track actions/runner#4357 for an official fix to TempDirectoryManager shared-path cleanup behavior."
84
+ docs:
85
+ - url: "https://github.com/actions/runner/issues/4357"
86
+ label: "actions/runner#4357 — TempDirectoryManager race condition (Apr 2026)"
87
+ - url: "https://docs.github.com/en/actions/hosting-your-own-runners/managing-self-hosted-runners/autoscaling-with-self-hosted-runners#using-ephemeral-runners-for-autoscaling"
88
+ label: "Ephemeral runners for autoscaling"
89
+ - url: "https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/using-concurrency"
90
+ label: "Using concurrency"