@htekdev/actions-debugger 1.0.28 → 1.0.30

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,78 @@
1
+ id: 'caching-artifacts-028'
2
+ title: 'hashFiles() returns empty string when no files match pattern, causing cache key collision across all runs'
3
+ category: caching-artifacts
4
+ severity: silent-failure
5
+ tags:
6
+ - hashfiles
7
+ - cache-key
8
+ - empty-string
9
+ - collision
10
+ - lock-file
11
+ - monorepo
12
+ patterns:
13
+ - regex: 'hashFiles\('
14
+ flags: 'i'
15
+ error_messages:
16
+ - 'Cache hit for key:'
17
+ - 'hashFiles result is empty string'
18
+ root_cause: |
19
+ When hashFiles('**/package-lock.json') finds no matching files, it returns an empty
20
+ string instead of failing. A cache key like:
21
+ ${{ runner.os }}-node-${{ hashFiles('**/package-lock.json') }}
22
+ evaluates to:
23
+ Linux-node-
24
+ (no hash suffix). All workflow runs — regardless of their actual dependency state —
25
+ then share this single cache key. The first run to complete saves its node_modules
26
+ under this key; every subsequent run gets a stale cache hit with potentially outdated
27
+ or wrong dependencies.
28
+
29
+ The cache-hit output shows 'true' and the step succeeds with no warning. Developers
30
+ see unexpectedly fast runs (cache always hits) but may encounter subtle dependency
31
+ staleness bugs.
32
+
33
+ Behavior varies across versions: actions/toolkit prior to 1.9.0 threw an exception
34
+ on empty results; later versions silently return empty string, making the collapsing
35
+ key the default behavior for repositories without the expected lock file.
36
+ fix: |
37
+ Guard hashFiles() with a fallback value so the cache key is never incomplete when
38
+   no matching files exist. Using || github.sha (unique per commit) or || github.run_id
39
+   (unique per run) keeps the key distinct when no lock file is present, preventing stale cache collisions.
40
+ fix_code:
41
+ - language: yaml
42
+ label: 'Add fallback to hashFiles to prevent empty cache key'
43
+ code: |
44
+ - uses: actions/cache@v4
45
+ with:
46
+ path: ~/.npm
47
+ # Fallback to github.sha when no lock file exists — prevents key collision
48
+ key: ${{ runner.os }}-node-${{ hashFiles('**/package-lock.json') || github.sha }}
49
+ restore-keys: |
50
+ ${{ runner.os }}-node-
51
+ - language: yaml
52
+ label: 'Verify cache is populated before relying on it'
53
+ code: |
54
+ - uses: actions/cache@v4
55
+ id: npm-cache
56
+ with:
57
+ path: ~/.npm
58
+ key: ${{ runner.os }}-node-${{ hashFiles('**/package-lock.json') || github.sha }}
59
+
60
+ - name: Confirm cache restored correctly
61
+ if: steps.npm-cache.outputs.cache-hit == 'true'
62
+ run: |
63
+ if [ ! -d ~/.npm ]; then
64
+ echo "Cache hit claimed but directory missing — likely empty-key collision"
65
+ exit 1
66
+ fi
67
+ prevention:
68
+ - 'Always add || github.sha fallback after hashFiles() in cache keys'
69
+   - 'Use the built-in caching of setup actions (e.g. setup-node with cache: npm), which fails with an explicit error instead of caching silently when no lock file is found'
70
+ - 'In monorepos without a root-level lock file, construct keys from per-package hash patterns'
71
+ - 'Test cache behavior in branches or forks where lock files might not yet exist'
72
+ docs:
73
+ - url: 'https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/evaluate-expressions-in-workflows-and-actions#hashfiles'
74
+ label: 'GitHub Docs: hashFiles() function'
75
+ - url: 'https://github.com/actions/cache/issues/1175'
76
+ label: 'actions/cache#1175: hashFiles empty result causes key collision'
77
+ - url: 'https://github.com/actions/toolkit/blob/main/packages/glob/README.md'
78
+ label: 'actions/toolkit: glob — hashFiles behavior on no match'
@@ -0,0 +1,80 @@
1
+ id: 'caching-artifacts-027'
2
+ title: 'restore-keys fallback matches cross-OS or cross-architecture cache, restoring incompatible binaries'
3
+ category: caching-artifacts
4
+ severity: silent-failure
5
+ tags:
6
+ - cache
7
+ - restore-keys
8
+ - cross-platform
9
+ - architecture
10
+ - arm64
11
+ - runner-os
12
+ patterns:
13
+ - regex: 'restore-keys:'
14
+ flags: 'i'
15
+ error_messages:
16
+ - 'Cache restored from key'
17
+ - 'Exec format error'
18
+ - 'cannot execute binary file: Exec format error'
19
+ root_cause: |
20
+ restore-keys performs prefix matching against ALL cached entries in the repository,
21
+ regardless of operating system or CPU architecture. When a restore-keys prefix is
22
+ shorter than the primary cache key and omits runner.os or runner.arch, a cache
23
+ saved on one platform can be silently restored on a different one.
24
+
25
+   Example: primary key Linux-x64-node-abc123 with a restore-keys entry of Linux- will
26
+   match a Linux ARM64 cache saved as Linux-arm64-node-xyz789. The ARM64 node_modules
27
+ contains native addon binaries (esbuild, sqlite3, etc.) compiled for ARM64; when
28
+ restored on an x64 runner, they fail at runtime with "Exec format error."
29
+
30
+ This became a widespread issue after GitHub introduced macOS ARM64 (M1/M2) runners
31
+ in 2023 and Linux ARM64 runners in 2024. Teams adding new runner architectures
32
+ to existing matrix builds often expose this silently.
33
+
34
+   The restore step still succeeds on a cross-architecture restore-keys match (cache-hit
35
+   is 'true' only for an exact primary-key match), providing no indication that the restored content may be incompatible.
36
+ fix: |
37
+ Always include runner.os AND runner.arch in every level of restore-keys, mirroring
38
+ whatever isolation is present in the primary cache key. No restore-keys prefix should
39
+ ever be shorter than the architecture scope of the primary key.
40
+ fix_code:
41
+ - language: yaml
42
+ label: 'Include runner.os and runner.arch in all restore-keys levels'
43
+ code: |
44
+ - uses: actions/cache@v4
45
+ with:
46
+ path: ~/.npm
47
+ # Primary key includes full OS and architecture isolation
48
+ key: ${{ runner.os }}-${{ runner.arch }}-node-${{ hashFiles('**/package-lock.json') }}
49
+ # Every fallback level maintains OS plus architecture isolation
50
+ restore-keys: |
51
+ ${{ runner.os }}-${{ runner.arch }}-node-
52
+ ${{ runner.os }}-${{ runner.arch }}-
53
+ - language: yaml
54
+ label: 'Matrix build with per-arch cache keys'
55
+ code: |
56
+ strategy:
57
+ matrix:
58
+ os: [ubuntu-latest, macos-latest, windows-latest]
59
+ arch: [x64, arm64]
60
+ steps:
61
+ - uses: actions/cache@v4
62
+ with:
63
+ path: |
64
+ ~/.cargo/registry
65
+ target/
66
+ key: ${{ matrix.os }}-${{ matrix.arch }}-rust-${{ hashFiles('**/Cargo.lock') }}
67
+ restore-keys: |
68
+ ${{ matrix.os }}-${{ matrix.arch }}-rust-
69
+ prevention:
70
+ - 'Always include runner.os AND runner.arch in every level of restore-keys'
71
+ - 'Audit cache configurations when adding new runner OS or arch combinations to matrix builds'
72
+ - 'Add a verification step after cache restore to confirm a native binary executes correctly'
73
+ - 'When migrating from x64-only to multi-arch, update all restore-keys at the same time'
74
+ docs:
75
+ - url: 'https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/caching-dependencies-to-speed-up-workflows#matching-a-cache-key'
76
+ label: 'GitHub Docs: Matching a cache key'
77
+ - url: 'https://github.com/actions/cache#inputs'
78
+ label: 'actions/cache README: restore-keys input'
79
+ - url: 'https://github.com/actions/cache/issues/1660'
80
+ label: 'actions/cache#1660: restore-keys cross-architecture match'
@@ -0,0 +1,86 @@
1
+ id: 'concurrency-timing-023'
2
+ title: 'Cleanup jobs with if: cancelled() do not run when workflow is canceled by concurrency group'
3
+ category: concurrency-timing
4
+ severity: silent-failure
5
+ tags:
6
+ - concurrency
7
+ - cancelled
8
+ - cleanup
9
+ - cancel-in-progress
10
+ - if-condition
11
+ patterns:
12
+ - regex: 'cancel-in-progress:\s*true'
13
+ flags: 'i'
14
+ - regex: 'if:\s*cancelled\(\)'
15
+ flags: 'i'
16
+ error_messages:
17
+ - 'This run has been cancelled.'
18
+ - 'Job cancelled by a newer workflow run'
19
+ root_cause: |
20
+ When cancel-in-progress: true cancels a workflow run because a new run was queued in the same
21
+ concurrency group, GitHub cancels the entire workflow run at the infrastructure level before
22
+ individual job-level if: conditions are evaluated. As a result, jobs with if: cancelled() or
23
+ if: always() defined to run after a canceled parent job are themselves canceled before they
24
+ can be dispatched to a runner.
25
+
26
+ This is distinct from a job failing or being manually canceled: concurrency-group cancellation
27
+   is an external platform signal. In practice, cleanup jobs that rely on if: cancelled() are
28
+   either never dispatched or, if they happen to be in-flight when the cancel propagates,
29
+   are killed mid-execution.
30
+ fix: |
31
+ Use a separate workflow triggered by workflow_run with types: [completed] as the cleanup
32
+ trigger rather than relying on in-workflow if: cancelled() jobs. The workflow_run approach
33
+ fires reliably regardless of how the parent workflow ended.
34
+
35
+ If the in-workflow approach is required, use if: always() rather than if: cancelled() and
36
+ ensure the cleanup job starts quickly (lightweight first step) to reduce the window during
37
+ which the cancellation signal can reach it.
38
+ fix_code:
39
+ - language: yaml
40
+ label: 'Reliable cleanup via separate workflow_run trigger'
41
+ code: |
42
+ # .github/workflows/cleanup.yml
43
+ on:
44
+ workflow_run:
45
+ workflows: ['CI']
46
+ types: [completed]
47
+
48
+ jobs:
49
+ cleanup:
50
+ runs-on: ubuntu-latest
51
+ if: >-
52
+ ${{ github.event.workflow_run.conclusion == 'cancelled' ||
53
+ github.event.workflow_run.conclusion == 'failure' }}
54
+ steps:
55
+ - name: Run cleanup
56
+ run: echo "Cleaning up after ${{ github.event.workflow_run.conclusion }}"
57
+ - language: yaml
58
+ label: 'Best-effort if:always() with fast first step'
59
+ code: |
60
+ jobs:
61
+ build:
62
+ runs-on: ubuntu-latest
63
+ steps:
64
+ - run: ./run-tests.sh
65
+
66
+ cleanup:
67
+ needs: build
68
+ if: always()
69
+ runs-on: ubuntu-latest
70
+ steps:
71
+ - name: Signal start immediately
72
+ run: echo "Cleanup starting"
73
+ - name: Do cleanup
74
+ run: ./cleanup.sh
75
+ prevention:
76
+ - 'Do not rely solely on if: cancelled() for critical cleanup when cancel-in-progress: true is active'
77
+ - 'Use a separate workflow_run: completed trigger for guaranteed post-run cleanup logic'
78
+ - 'Use if: always() instead of if: cancelled() for broader coverage'
79
+ - 'Keep cleanup steps inside the main job where possible — step-level if: always() is more reliable than job-level when concurrency cancels the run'
80
+ docs:
81
+ - url: 'https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/using-concurrency'
82
+ label: 'GitHub Docs: Using concurrency'
83
+ - url: 'https://docs.github.com/en/actions/writing-workflows/choosing-when-your-workflow-runs/events-that-trigger-workflows#workflow_run'
84
+ label: 'GitHub Docs: workflow_run event'
85
+ - url: 'https://github.com/orgs/community/discussions/13655'
86
+ label: 'GitHub Community: cleanup jobs not running after concurrency cancellation'
@@ -0,0 +1,72 @@
1
+ id: 'concurrency-timing-024'
2
+ title: 'timeout-minutes applies to job execution only, not queue wait time — jobs can wait indefinitely'
3
+ category: concurrency-timing
4
+ severity: warning
5
+ tags:
6
+ - timeout
7
+ - queue
8
+ - self-hosted
9
+ - runner
10
+ - wait-time
11
+ patterns:
12
+ - regex: 'timeout-minutes:\s*\d+'
13
+ flags: 'i'
14
+ error_messages:
15
+ - 'The job running on runner has exceeded the maximum execution time of'
16
+ - 'The operation was canceled.'
17
+ root_cause: |
18
+ timeout-minutes only counts elapsed time from when a job actually begins executing on
19
+ a runner — not from when it enters the queue. A job waiting for an available runner
20
+ slot (including jobs waiting in a concurrency group queue) can sit pending for hours
21
+ or indefinitely without any timeout being applied.
22
+
23
+ This is particularly impactful with:
24
+ - Self-hosted runners under heavy load with limited runner capacity
25
+ - Concurrency groups with cancel-in-progress: false that accumulate queued jobs
26
+   - Account-level concurrent job limits on GitHub-hosted runners during peak usage
27
+
28
+ Developers are often surprised that a job with timeout-minutes: 30 waited 4+ hours
29
+ before starting, then proceeded to run for its full 30-minute budget.
30
+ fix: |
31
+ There is no native queue-timeout setting in GitHub Actions. Recommended workarounds:
32
+
33
+ 1. Set cancel-in-progress: true in concurrency groups to drop stale queued jobs
34
+ when newer commits arrive, preventing queue accumulation.
35
+ 2. Monitor queue depth using the GitHub REST API /repos/{owner}/{repo}/actions/runs
36
+ and set up external alerting for runs stuck in 'queued' status too long.
37
+ 3. Ensure adequate self-hosted runner pool capacity relative to expected parallelism.
38
+   4. Use GitHub-hosted runners for time-sensitive jobs to avoid self-hosted queue depth issues.
39
+ fix_code:
40
+ - language: yaml
41
+ label: 'Prevent queue accumulation with cancel-in-progress'
42
+ code: |
43
+ jobs:
44
+ build:
45
+ runs-on: self-hosted
46
+ timeout-minutes: 30 # Only counts execution time, NOT queue wait time
47
+ concurrency:
48
+ group: ${{ github.workflow }}-${{ github.ref }}
49
+ cancel-in-progress: true # Drop stale queued jobs on new push
50
+ steps:
51
+ - uses: actions/checkout@v4
52
+ - run: ./build.sh
53
+ - language: yaml
54
+ label: 'External queue monitoring via API'
55
+ code: |
56
+ # Monitor for stuck queued runs via GitHub API
57
+ # GET /repos/{owner}/{repo}/actions/runs?status=queued
58
+ # Alert if any run has been queued for more than N minutes
59
+ # (implement in a separate monitoring workflow or external system)
60
+ prevention:
61
+ - 'Do not assume timeout-minutes prevents jobs from waiting indefinitely in the runner queue'
62
+ - 'Use cancel-in-progress: true for CI workflows to prevent queue accumulation'
63
+ - 'Size self-hosted runner pools to handle expected peak concurrency'
64
+ - 'Monitor workflow run queue depth separately via the GitHub REST API'
65
+ - 'Document queue wait behavior in team CI runbooks so on-call engineers know what to expect'
66
+ docs:
67
+ - url: 'https://docs.github.com/en/actions/writing-workflows/workflow-syntax-for-github-actions#jobsjob_idtimeout-minutes'
68
+ label: 'GitHub Docs: timeout-minutes'
69
+ - url: 'https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/using-concurrency'
70
+ label: 'GitHub Docs: Using concurrency'
71
+ - url: 'https://docs.github.com/en/rest/actions/workflow-runs'
72
+ label: 'GitHub REST API: Workflow runs'
@@ -0,0 +1,78 @@
1
+ id: 'known-unsolved-030'
2
+ title: 'core.getInput cannot distinguish unset input from explicitly-empty input'
3
+ category: known-unsolved
4
+ severity: limitation
5
+ tags:
6
+ - toolkit
7
+ - core-getInput
8
+ - composite-action
9
+ - empty-string
10
+ - null-input
11
+ - fork-secrets
12
+ patterns:
13
+ - regex: 'core\.getInput\('
14
+ flags: 'i'
15
+ error_messages:
16
+ - 'No way to detect if input was provided vs set to empty string'
17
+ root_cause: |
18
+ The `core.getInput(name)` function in `@actions/core` always returns an empty string ('')
19
+ when an input is either not provided by the caller or explicitly set to the empty string.
20
+ There is no `core.hasInput(name)` API or other mechanism to distinguish these two cases.
21
+
22
+ This creates several practical problems for action authors:
23
+   - A required input set to '' is rejected by getInput(name, { required: true }) exactly like an omitted one, so an explicitly empty value can never be accepted as supplied
24
+ - Fork pull requests inject secrets as empty strings (secrets are unavailable); an action cannot tell
25
+ if a secret input was omitted vs provided-as-empty-because-fork
26
+ - Composite action callers cannot express "I intentionally leave this blank" vs "I don't provide this at all"
27
+ - YAML null or ~ values (e.g., with: my_val: ~) are coerced to '' by the runner before the action sees them
28
+
29
+ Upstream GitHub toolkit issue #940 has been open since 2022 with 22 upvotes and no fix planned.
30
+ fix: |
31
+ No direct fix exists — there is no core.hasInput() API. Workarounds depend on the use case:
32
+ - For sentinel detection: document a convention like 'none' or '__unset__' as the explicit absent value
33
+ and check getInput('x') === 'none'
34
+ - For fork secret detection: check github.event.pull_request.head.repo.fork == true and gate on that
35
+ rather than on whether the secret is empty
36
+ - For optional inputs: provide a well-documented default value in action.yml so callers always get a
37
+ predictable non-empty string when they omit the input
38
+ - For composite actions: use ${{ inputs.my_input != '' }} in if: conditions, documenting that
39
+ callers must pass a non-empty string to opt in
40
+ fix_code:
41
+ - language: yaml
42
+ label: 'Use sentinel value convention to detect absent input'
43
+ code: |
44
+ # action.yml — declare sentinel default
45
+ inputs:
46
+ deploy_env:
47
+ description: 'Target environment (leave blank to skip deployment)'
48
+ required: false
49
+ default: '__unset__'
50
+
51
+ # In composite action steps
52
+ steps:
53
+ - name: Deploy
54
+ if: ${{ inputs.deploy_env != '__unset__' && inputs.deploy_env != '' }}
55
+ run: echo "Deploying to ${{ inputs.deploy_env }}"
56
+ - language: yaml
57
+ label: 'Detect fork PR to guard secret-gated steps instead of empty-check'
58
+ code: |
59
+ steps:
60
+ - name: Publish (skip on fork PRs)
61
+ if: >-
62
+ ${{ github.event_name != 'pull_request' ||
63
+ github.event.pull_request.head.repo.full_name == github.repository }}
64
+ run: echo "$NPM_TOKEN" | npm login --registry https://registry.npmjs.org
65
+ env:
66
+ NPM_TOKEN: ${{ secrets.NPM_TOKEN }}
67
+ prevention:
68
+ - 'Document in action.yml that empty string and absent are treated identically by core.getInput'
69
+ - 'Use a non-empty sentinel default value (e.g. __unset__) instead of relying on empty-check logic'
70
+ - 'Never gate fork-secret logic on secret emptiness — use fork-detection via event context instead'
71
+ - 'For required inputs that must be non-empty, add an explicit validation step that fails with a helpful message'
72
+ docs:
73
+ - url: 'https://github.com/actions/toolkit/issues/940'
74
+ label: 'actions/toolkit#940: Impossible to detect unset inputs from inputs set as empty string'
75
+ - url: 'https://github.com/actions/toolkit/tree/main/packages/core'
76
+ label: 'actions/toolkit core package — getInput API'
77
+ - url: 'https://docs.github.com/en/actions/sharing-automations/creating-actions/metadata-syntax-for-github-actions#inputs'
78
+ label: 'GitHub Docs: Action metadata — inputs'
@@ -0,0 +1,90 @@
1
+ id: 'runner-environment-083'
2
+ title: 'Actions Runner Controller pods intermittently fail with "A task was cancelled" during large matrix jobs'
3
+ category: runner-environment
4
+ severity: error
5
+ tags:
6
+ - arc
7
+ - kubernetes
8
+ - matrix
9
+ - task-cancelled
10
+ - pod-eviction
11
+ - self-hosted
12
+ - keda
13
+ patterns:
14
+ - regex: 'A task was cancell?ed\.'
15
+ flags: 'i'
16
+ - regex: 'The operation was canceled\.'
17
+ flags: 'i'
18
+ error_messages:
19
+ - 'Error: A task was cancelled.'
20
+ - 'Error: The operation was canceled.'
21
+ root_cause: |
22
+ When using Actions Runner Controller (ARC) to run GitHub Actions on Kubernetes, ephemeral
23
+ runner pods can be evicted or preempted mid-job by the Kubernetes scheduler, producing
24
+ a "A task was cancelled" or "The operation was canceled" error with no application-level
25
+ log output before the failure.
26
+
27
+ Common causes:
28
+ - Kubernetes resource pressure: if a node is under memory or CPU pressure, the kubelet
29
+ evicts lower-priority pods. ARC runner pods have no PriorityClass by default and are
30
+ among the first to be evicted
31
+ - Node autoscaling: Cluster Autoscaler draining nodes for scale-down triggers eviction of
32
+ runner pods that have been running longer than the scale-down grace period
33
+ - KEDA queue-length scaling: KEDA scaling down the runner Deployment while jobs are
34
+ in-flight terminates runner pods before jobs complete
35
+ - OOM kills: matrix jobs that each consume significant memory can saturate node memory,
36
+ causing the OOM killer to terminate runner pods
37
+
38
+ The issue is especially common with large matrix builds (10+ parallel jobs) because the
39
+ aggregate resource demand spike can trigger autoscaler or eviction behavior. Because ARC
40
+ runner pods are ephemeral and have no restart policy, the job is permanently failed when
41
+ the pod is evicted.
42
+ fix: |
43
+ Assign a high PriorityClass to ARC runner pods so the Kubernetes scheduler avoids evicting
44
+ them during resource pressure. Also set adequate resource requests/limits and configure
45
+ terminationGracePeriodSeconds to at least the expected maximum job duration.
46
+ fix_code:
47
+ - language: yaml
48
+ label: 'Create a high-priority PriorityClass for ARC runner pods'
49
+ code: |
50
+ apiVersion: scheduling.k8s.io/v1
51
+ kind: PriorityClass
52
+ metadata:
53
+ name: github-runner-high
54
+ value: 1000000
55
+ globalDefault: false
56
+ description: 'High priority for GitHub Actions runner pods to prevent eviction'
57
+ - language: yaml
58
+ label: 'Reference PriorityClass and set resource limits in ARC AutoscalingRunnerSet values'
59
+ code: |
60
+ # helm values for actions-runner-controller AutoscalingRunnerSet chart
61
+ template:
62
+ spec:
63
+ priorityClassName: github-runner-high
64
+ # Allow jobs up to 1 hour to finish before pod is force-terminated
65
+ terminationGracePeriodSeconds: 3600
66
+ containers:
67
+ - name: runner
68
+ resources:
69
+ requests:
70
+ memory: '2Gi'
71
+ cpu: '500m'
72
+ limits:
73
+ memory: '4Gi'
74
+ cpu: '2000m'
75
+ prevention:
76
+ - 'Assign a PriorityClass to ARC runner pods to prevent eviction under resource pressure'
77
+ - 'Set terminationGracePeriodSeconds to at least the expected maximum single-job duration'
78
+ - 'Set explicit resource requests and limits to avoid OOM kills during large matrix builds'
79
+ - 'Configure KEDA scale-down stabilization windows to prevent scaling down while jobs run'
80
+ - 'Monitor node resource utilization and right-size cluster nodes for peak matrix concurrency'
81
+   - 'Enable PodDisruptionBudgets for runner workloads to limit voluntary evictions during node drains'
82
+ docs:
83
+ - url: 'https://github.com/actions/runner/issues/3819'
84
+ label: 'actions/runner#3819: A lot of random "A task was cancelled" errors'
85
+ - url: 'https://kubernetes.io/docs/concepts/scheduling-eviction/pod-priority-preemption/'
86
+ label: 'Kubernetes: Pod Priority and Preemption'
87
+ - url: 'https://docs.github.com/en/actions/hosting-your-own-runners/managing-self-hosted-runners/autoscaling-with-self-hosted-runners'
88
+ label: 'GitHub Docs: Autoscaling with self-hosted runners'
89
+ - url: 'https://github.com/actions/actions-runner-controller'
90
+ label: 'Actions Runner Controller (ARC) GitHub repository'
@@ -0,0 +1,83 @@
1
+ id: 'runner-environment-084'
2
+ title: 'Bash/sh script handler does not quote script path — fails when path contains spaces'
3
+ category: runner-environment
4
+ severity: error
5
+ tags:
6
+ - bash
7
+ - shell
8
+ - path-spaces
9
+ - job-hooks
10
+ - self-hosted
11
+ - macos
12
+ - tart-vm
13
+ patterns:
14
+ - regex: 'ACTIONS_RUNNER_HOOK_JOB_(?:STARTED|COMPLETED)'
15
+ flags: 'i'
16
+ - regex: 'bash.*No such file or directory'
17
+ flags: 'i'
18
+ error_messages:
19
+ - 'bash: /Volumes/My Shared Files/hook.sh: No such file or directory'
20
+ - 'sh: /path with spaces/script.sh: not found'
21
+ - '/path/with: not found'
22
+ root_cause: |
23
+ The GitHub Actions runner bash/sh script handler does not quote the script path
24
+ placeholder when building the shell invocation. The default bash arguments in
25
+ ScriptHandlerHelpers.cs are:
26
+ --noprofile --norc -e -o pipefail {0}
27
+ When {0} is replaced with a path containing spaces — e.g.,
28
+ /Volumes/My Shared Files/hook.sh
29
+   bash receives the path as several separate arguments due to word splitting:
30
+ bash --noprofile --norc -e -o pipefail /Volumes/My Shared Files/hook.sh
31
+ This causes a "No such file or directory" error for the first word-split token.
32
+
33
+ By contrast, the PowerShell and cmd handlers correctly quote the path:
34
+ pwsh: -command "& '{0}'"
35
+ powershell: -command ". '{0}'"
36
+ cmd: /D /E:ON /V:OFF /S /C "CALL "{0}""
37
+ Only bash and sh are affected (runner#4404, unresolved as of June 2026).
38
+
39
+ Practical impact:
40
+ - Job hooks (ACTIONS_RUNNER_HOOK_JOB_STARTED, ACTIONS_RUNNER_HOOK_JOB_COMPLETED) placed in
41
+ shared directories with spaces — common on macOS Tart VMs mounted at /Volumes/My Shared Files/
42
+ - Self-hosted runner workspaces on paths containing spaces (less common but possible)
43
+ - Any run: step using a working-directory with spaces in the resolved path
44
+ fix: |
45
+ Ensure hook script paths and runner working directories never contain spaces.
46
+ On macOS Tart VMs, place hook scripts under a path without spaces (e.g., /Users/runner/hooks/).
47
+
48
+ Use a wrapper script at a space-free path that exec-delegates to the actual script if
49
+ it must reside under a shared mount with spaces in its path.
50
+
51
+ Monitor actions/runner#4404 for the upstream fix and upgrade when a patched runner ships.
52
+ fix_code:
53
+ - language: yaml
54
+ label: 'Configure job hook at a space-free path (environment variable)'
55
+ code: |
56
+ # In the runner .env file (e.g., /home/runner/actions-runner/.env):
57
+ # Point hook variables to a path WITHOUT spaces
58
+ ACTIONS_RUNNER_HOOK_JOB_STARTED=/Users/runner/hooks/job-started.sh
59
+ ACTIONS_RUNNER_HOOK_JOB_COMPLETED=/Users/runner/hooks/job-completed.sh
60
+ #
61
+ # Avoid paths like:
62
+ # /Volumes/My Shared Files/hooks/ ← spaces cause bash word-splitting error
63
+ - language: yaml
64
+ label: 'Wrapper script at space-free path delegates to actual hook in shared mount'
65
+ code: |
66
+ #!/bin/bash
67
+ # /Users/runner/hooks/job-started.sh (space-free path — registered as the hook)
68
+ #
69
+ # Exec-delegates to the actual hook that lives under a shared volume with spaces.
70
+ # Using exec preserves exit codes and avoids a subprocess layer.
71
+ exec "/Volumes/My Shared Files/hooks/actual-job-started.sh" "$@"
72
+ prevention:
73
+ - 'Never place runner hooks, workspace paths, or working-directories in paths containing spaces'
74
+ - 'On macOS Tart VMs, configure shared mounts to use space-free mount points (e.g., /Volumes/SharedFiles)'
75
+ - 'Test runner hook invocations explicitly on macOS or Windows deployments with shared mounts'
76
+ - 'Watch actions/runner#4404 for the upstream fix; upgrade the runner version when it ships'
77
+ docs:
78
+ - url: 'https://github.com/actions/runner/issues/4404'
79
+ label: 'actions/runner#4404: Bash script handler does not quote script path — breaks with spaces'
80
+ - url: 'https://docs.github.com/en/actions/hosting-your-own-runners/managing-self-hosted-runners/running-scripts-before-or-after-a-job'
81
+ label: 'GitHub Docs: Running scripts before or after a job (hooks)'
82
+ - url: 'https://github.com/actions/runner/blob/main/src/Runner.Worker/Handlers/ScriptHandlerHelpers.cs'
83
+ label: 'Runner source: ScriptHandlerHelpers.cs (unquoted bash path template)'
@@ -0,0 +1,109 @@
1
+ id: 'runner-environment-082'
2
+ title: 'Self-hosted runner gets stuck "Waiting for a runner to pick up this job" between jobs in the same workflow'
3
+ category: runner-environment
4
+ severity: error
5
+ tags:
6
+ - self-hosted
7
+ - runner
8
+ - multi-job
9
+ - queued
10
+ - stuck
11
+ - windows
12
+ - auto-update
13
+ patterns:
14
+ - regex: 'Waiting for a runner to pick up this job'
15
+ flags: 'i'
16
+ error_messages:
17
+ - 'Waiting for a runner to pick up this job...'
18
+ root_cause: |
19
+ After completing the first job in a multi-job workflow, a self-hosted runner sometimes
20
+ fails to pick up subsequent jobs in the same workflow run. The subsequent jobs remain
21
+ in queued status indefinitely, with no timeout and no automatic retry.
22
+
23
+ Common root causes:
24
+ - Runner auto-update race condition: when the runner auto-updates between jobs, the post-job
25
+ cleanup of the first job can leave the runner in a state where it reports idle to the
26
+ broker but cannot accept new job messages
27
+ - Windows service restart latency: on Windows hosts, if the runner was auto-updated or
28
+ the service restarted between jobs, the new process may not have fully re-registered
29
+ with the GitHub Actions broker before the second job is dispatched
30
+ - JIT token expiry: in ephemeral/JIT runner setups, the registration token can expire
31
+ between jobs if the first job runs for a long time, and the runner cannot re-register
32
+ - Broker disconnect: a transient network interruption between jobs severs the long-poll
33
+ connection; the runner reconnects but the already-dispatched job message is missed
34
+
35
+ Manually cancelling and re-running the workflow, or restarting the runner service,
36
+ resolves the issue immediately, confirming the runner is functional but lost broker contact.
37
+ fix: |
38
+ Immediate workaround: cancel the stuck workflow run and re-trigger it, or restart the
39
+ runner service:
40
+ Windows: Restart-Service "actions.runner.*"
41
+ Linux: sudo systemctl restart actions.runner.*.<name>.service
42
+
43
+ Long-term fixes:
44
+   - Disable runner auto-update during active workflows by registering the runner with the
45
+     --disableupdate flag (./config.sh --disableupdate) and pinning a specific runner version
46
+ - Use ephemeral runners (--ephemeral flag) so each job dispatches a fresh runner that
47
+ registers anew with the broker, eliminating the between-job reconnect window
48
+ - Split long workflows into separate workflow files triggered via workflow_run or
49
+ repository_dispatch so each workflow gets an independent runner session
50
+ - On Windows: ensure the runner service account has the "Log on as a service" right
51
+ and that antivirus is not blocking runner binary updates
52
+ fix_code:
53
+ - language: yaml
54
+ label: 'Use ephemeral runners to avoid stuck-between-jobs on self-hosted'
55
+ code: |
56
+ # When configuring the runner, use the --ephemeral flag:
57
+ # ./config.sh --url https://github.com/OWNER/REPO --token TOKEN --ephemeral
58
+ #
59
+ # For ARC (Actions Runner Controller), set runnerScaleSetSettings:
60
+ # spec:
61
+ # template:
62
+ # metadata:
63
+ # labels:
64
+ # ephemeral: 'true'
65
+ #
66
+ # Each job gets a freshly-registered runner; no between-job broker reconnect issues.
67
+ - language: yaml
68
+ label: 'Split multi-job workflow into two workflows triggered by workflow_run'
69
+ code: |
70
+       name: Phase 1  # phase1.yml — workflow_run matches on this name, not the filename
71
+       on:
72
+         push:
73
+       jobs:
74
+         build:
75
+           runs-on: [self-hosted, linux]
76
+           steps:
77
+             - uses: actions/checkout@v4
78
+             - run: make build
79
+             - uses: actions/upload-artifact@v4
80
+               with:
81
+                 name: build-output
82
+                 path: dist/
83
+
84
+       # phase2.yml (fresh runner registration — no stuck-between-jobs risk)
85
+       on:
86
+         workflow_run:
87
+           workflows: ['Phase 1']
88
+ types: [completed]
89
+ jobs:
90
+ test:
91
+ if: ${{ github.event.workflow_run.conclusion == 'success' }}
92
+ runs-on: [self-hosted, linux]
93
+ steps:
94
+ - uses: actions/download-artifact@v4
95
+ with:
96
+ name: build-output
97
+ - run: make test
98
+ prevention:
99
+ - 'Use ephemeral runners (--ephemeral) to ensure each job gets a fresh broker registration'
100
+ - 'Configure the runner service with Restart=on-failure to auto-recover from crashes between jobs'
101
+ - 'Pin runner versions and suppress auto-updates in production to prevent mid-workflow upgrades'
102
+ - 'Monitor for stuck runs via the GitHub Actions API and alert or auto-cancel them'
103
+ docs:
104
+ - url: 'https://github.com/actions/runner/issues/3609'
105
+ label: 'actions/runner#3609: Self-hosted runner stuck on "Waiting for a runner to pick up this job"'
106
+ - url: 'https://docs.github.com/en/actions/hosting-your-own-runners/managing-self-hosted-runners/about-self-hosted-runners'
107
+ label: 'GitHub Docs: About self-hosted runners'
108
+ - url: 'https://docs.github.com/en/actions/hosting-your-own-runners/managing-self-hosted-runners/autoscaling-with-self-hosted-runners'
109
+ label: 'GitHub Docs: Autoscaling with self-hosted runners'
@@ -0,0 +1,88 @@
1
+ id: 'triggers-026'
2
+ title: 'workflow_run does not fire when triggering workflow is skipped by paths or branches filter'
3
+ category: triggers
4
+ severity: silent-failure
5
+ tags:
6
+ - workflow-run
7
+ - skipped
8
+ - paths-filter
9
+ - branches-filter
10
+ - trigger-chain
11
+ patterns:
12
+ - regex: 'workflow_run'
13
+ flags: 'i'
14
+ - regex: 'types:\s*\[.*completed.*\]'
15
+ flags: 'i'
16
+ error_messages:
17
+ - 'This run was triggered by a workflow_run event but the parent workflow was not found'
18
+ root_cause: |
19
+ When a workflow is skipped because its on.push.paths or on.push.branches filter does not
20
+ match the pushed commit, GitHub does not create a workflow run record and therefore does
21
+ not emit a workflow_run completion event. A downstream workflow that listens for
22
+ on.workflow_run.workflows: [UpstreamWorkflow] with types: [completed] silently never fires.
23
+
24
+ This breaks fan-out CI/CD architectures where a primary workflow is gated by path/branch
25
+ filters, and secondary workflows (deploy, notify, publish) depend on its completion.
26
+ When the paths filter causes the primary workflow to be skipped entirely, the downstream
27
+ chain is dropped with no error message.
28
+
29
+ The issue affects both on.push.paths and on.push.branches filters. It does not affect
30
+ workflows that run but exit early via an if: condition on a job — only skipped runs
31
+ (which never appear in the GitHub Actions run list) cause the downstream gap.
32
+ fix: |
33
+ Replace trigger-level on.push.paths filtering with in-workflow job-level path detection
34
+ using an action like dorny/paths-filter. This ensures the upstream workflow always
35
+ creates a run (triggering the workflow_run event), while individual jobs are skipped
36
+ when paths do not match.
37
+ fix_code:
38
+ - language: yaml
39
+ label: 'Replace trigger-level paths filter with in-workflow detection'
40
+ code: |
41
+ # upstream.yml
42
+ # BAD: on.push.paths silently skips the run — workflow_run downstream never fires
43
+ # on:
44
+ # push:
45
+ # paths: ['src/**']
46
+
47
+ # GOOD: Always run, detect paths inside the workflow
48
+ on: [push]
49
+
50
+ jobs:
51
+ detect-changes:
52
+ runs-on: ubuntu-latest
53
+ outputs:
54
+ src-changed: ${{ steps.filter.outputs.src }}
55
+ steps:
56
+ - uses: actions/checkout@v4
57
+ - uses: dorny/paths-filter@v3
58
+ id: filter
59
+ with:
60
+ filters: |
61
+ src:
62
+ - 'src/**'
63
+
64
+ build:
65
+ needs: detect-changes
66
+ if: ${{ needs.detect-changes.outputs.src-changed == 'true' }}
67
+ runs-on: ubuntu-latest
68
+ steps:
69
+ - uses: actions/checkout@v4
70
+ - run: ./build.sh
71
+
72
+ # downstream.yml — now reliably fires on every push
73
+ # on:
74
+ # workflow_run:
75
+ # workflows: ['Upstream CI']  # must equal the top-level name: declared in upstream.yml
76
+ # types: [completed]
77
+ prevention:
78
+ - 'Do not combine on.push.paths/branches filters with workflow_run downstream dependencies'
79
+ - 'Use dorny/paths-filter or tj-actions/changed-files inside always-running workflows instead'
80
+ - 'Test the full trigger chain end-to-end by pushing commits that both match and do not match the filter'
81
+ - 'Document the skipped-runs gap in team CI docs for anyone building workflow_run chains'
82
+ docs:
83
+ - url: 'https://docs.github.com/en/actions/writing-workflows/choosing-when-your-workflow-runs/events-that-trigger-workflows#workflow_run'
84
+ label: 'GitHub Docs: workflow_run — workflow file must exist on the default branch'
85
+ - url: 'https://github.com/dorny/paths-filter'
86
+ label: 'dorny/paths-filter — job-level path filtering'
87
+ - url: 'https://github.com/orgs/community/discussions/23710'
88
+ label: 'GitHub Community: workflow_run not triggered when upstream workflow is skipped'
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@htekdev/actions-debugger",
3
- "version": "1.0.28",
3
+ "version": "1.0.30",
4
4
  "description": "65+ real GitHub Actions errors, queryable by agents. CLI + MCP server + Copilot skills + error database.",
5
5
  "type": "module",
6
6
  "main": "./dist/index.js",