npm - @htekdev/actions-debugger - Versions diffs - 1.0.34 → 1.0.36 - Mend

@htekdev/actions-debugger 1.0.34 → 1.0.36

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

package/errors/caching-artifacts/cache-service-429-upload-ebadf-crash.yml ADDED Viewed

@@ -0,0 +1,108 @@
+id: 'caching-artifacts-030'
+title: "Cache Service 429 Rate Limit During Upload Causes EBADF File Stream Crash"
+category: caching-artifacts
+severity: error
+tags:
+  - cache
+  - rate-limit
+  - 429
+  - ebadf
+  - post-step
+  - upload
+  - toolkit
+patterns:
+  - regex: 'Cache service responded with 429'
+    flags: 'i'
+  - regex: 'Cache upload failed because file read failed with EBADF'
+    flags: 'i'
+  - regex: 'Failed to save:.*429'
+    flags: 'i'
+error_messages:
+  - "Warning: Failed to save: Cache service responded with 429 during upload chunk."
+  - "Error: Cache upload failed because file read failed with EBADF: bad file descriptor, read"
+root_cause: |
+  When the GitHub cache service rate-limits a cache upload request with HTTP 429
+  Too Many Requests, the actions/toolkit cache implementation does not cleanly
+  handle the rate-limit response. It emits a warning but continues attempting
+  to read from the underlying file stream. Because the HTTP upload connection was
+  already torn down after the 429, the file stream is left in a bad state. A
+  subsequent read on the orphaned stream raises EBADF (bad file descriptor), which
+  surfaces as a hard crash in the cleanup or post step of any action that uses
+  the toolkit cache library — including actions/setup-java, actions/setup-node,
+  actions/setup-python, and direct actions/cache usage.
+  The root cause is missing retry-with-backoff logic for 429 responses in the
+  toolkit's cache upload path. 429 responses include a Retry-After header that
+  the toolkit ignores entirely.
+  Most commonly triggered on large matrix builds (20+ concurrent jobs) where many
+  jobs save large caches simultaneously and collectively exhaust the cache service
+  rate limit. Individual jobs that hit the rate limit fail with the EBADF crash
+  rather than retrying or gracefully degrading.
+  Open since November 2023 (actions/toolkit#1589). Multiple large open-source
+  projects have reported it: apache/beam, techmatters/terraso-mobile-client,
+  synapsecns/sanguine, and others.
+fix: |
+  No upstream fix is available — this is an open bug in actions/toolkit since
+  November 2023. The rate-limit retry path is not implemented.
+  Workarounds:
+  1. Re-run the failed job from the GitHub Actions UI — on re-run, the cache
+     service rate limit has usually recovered and the upload succeeds.
+  2. Reduce concurrent cache saves: split large matrix builds into smaller
+     batches using strategy.max-parallel to stagger cache upload timing.
+  3. Stay on the latest release of actions/cache for your pinned major
+     version — GitHub occasionally ships partial reliability fixes.
+  4. Use an explicit actions/cache/save step (optionally with if: always())
+     and set continue-on-error: true on it: the step may still warn on 429,
+     but a failed upload no longer fails the job.
+  5. If you are on actions/cache v3 or older, upgrade: v4+ has the most recent reliability fixes.
+fix_code:
+  - language: yaml
+    label: "Limit concurrent cache saves with max-parallel to avoid rate limiting"
+    code: |
+      jobs:
+        build:
+          strategy:
+            matrix:
+              os: [ubuntu-latest, macos-latest, windows-latest]
+              node: [18, 20, 22]
+            max-parallel: 4   # Stagger cache saves — avoid 20+ simultaneous uploads
+          runs-on: ${{ matrix.os }}
+          steps:
+            - uses: actions/checkout@v4
+            - uses: actions/setup-node@v4
+              with:
+                node-version: ${{ matrix.node }}
+                cache: npm
+            - run: npm ci
+            - run: npm test
+  - language: yaml
+    label: "Accept cache-save failure gracefully with continue-on-error"
+    code: |
+      steps:
+        - uses: actions/cache/restore@v4
+          with:
+            path: ~/.m2/repository
+            key: ${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }}
+        - name: Build
+          run: mvn --batch-mode package
+        - name: Save cache (continue even if 429 occurs)
+          uses: actions/cache/save@v4
+          continue-on-error: true   # Prevents EBADF crash from failing the job
+          with:
+            path: ~/.m2/repository
+            key: ${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }}
+prevention:
+  - "Use strategy.max-parallel to limit concurrent matrix jobs and stagger cache upload timing."
+  - "Prefer actions/cache@v4 (latest) which includes the most recent reliability patches."
+  - "Add continue-on-error: true to explicit cache save steps to prevent EBADF from failing the workflow."
+  - "Monitor large matrix builds for recurring 429 errors — they indicate you need to reduce concurrency or shard differently."
+docs:
+  - url: "https://github.com/actions/toolkit/issues/1589"
+    label: "actions/toolkit#1589: Cache upload does not handle 429 error (open since Nov 2023, 6 reactions)"
+  - url: "https://github.com/actions/setup-java/issues/543"
+    label: "actions/setup-java#543: Transient 429 error fail upload cache cause workflow failure"
+  - url: "https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/caching-dependencies-to-speed-up-workflows"
+    label: "GitHub Docs: Caching dependencies to speed up workflows"

package/errors/known-unsolved/checkout-eu-runner-timeout-regional-degradation.yml ADDED Viewed

@@ -0,0 +1,97 @@
+id: known-unsolved-033
+title: 'actions/checkout Hangs or Times Out From EU GitHub-Hosted Runners (Regional Degradation)'
+category: known-unsolved
+severity: error
+tags:
+  - checkout
+  - performance
+  - eu-runners
+  - timeout
+  - regional
+  - infrastructure
+patterns:
+  - regex: 'fatal: unable to access.*https://github\.com.*timed out'
+    flags: 'i'
+  - regex: 'fatal: unable to access.*https://github\.com.*Could not resolve host'
+    flags: 'i'
+  - regex: 'Error: Process completed with exit code 128'
+    flags: 'i'
+error_messages:
+  - "fatal: unable to access 'https://github.com/owner/repo/': Operation timed out"
+  - "Error: Process completed with exit code 128."
+  - 'actions/checkout step hanging for 5-30 minutes with no output'
+root_cause: |
+  Starting May 19, 2026, workflows running on GitHub-hosted runners in European
+  data centers began experiencing severely degraded actions/checkout performance.
+  The fetch/clone phase hangs silently for 5-30 minutes before either completing
+  slowly or timing out, regardless of repository size. Workflows that previously
+  completed checkout in 10-30 seconds are affected.
+  The root cause is an infrastructure-level degradation on GitHub's side
+  affecting the European runner subnet's connectivity to GitHub's Smart HTTP
+  server or CDN endpoints. This is distinct from general large-repo slowness:
+  even tiny repositories with shallow clones exhibit the hang. GitHub has not
+  published a root cause analysis or resolution timeline as of June 2026.
+  Notably, runners in US regions (us-east-1, us-west-2) are not affected —
+  the issue is specific to EU runner region routing. The error manifests as
+  either a silent hang (no log output during the fetch phase) or an eventual
+  "Operation timed out" exit code 128.
+  Source: actions/checkout#2441 (52 reactions, opened May 24, 2026, still open).
+fix: |
+  No upstream fix available — this is a GitHub infrastructure issue with no
+  workaround that completely eliminates the problem. Mitigations to reduce
+  impact:
+  1. Add timeout-minutes to checkout steps to prevent indefinite hangs and
+     fail fast with a clear error rather than a silent stuck pipeline.
+  2. Use fetch-depth: 1 (shallow clone) to reduce transfer size, which may
+     reduce hang duration even if it does not eliminate it.
+  3. Use sparse-checkout to limit the files transferred from the CDN.
+  4. For critical pipelines, consider temporarily switching to ubuntu-latest
+     with an explicit us-east-1 runner label if your GitHub plan supports
+     regional runner selection.
+  5. Subscribe to GitHub Status (githubstatus.com) for EU infrastructure
+     degradation notices — incidents affecting this region are tracked there.
+fix_code:
+  - language: yaml
+    label: 'Shallow clone with timeout to fail fast during EU degradation'
+    code: |
+      steps:
+        - name: Checkout
+          uses: actions/checkout@v4
+          timeout-minutes: 5      # fail fast instead of hanging for 30+ minutes
+          with:
+            fetch-depth: 1        # shallow clone reduces CDN transfer size
+  - language: yaml
+    label: 'Sparse checkout to minimize data fetched during regional degradation'
+    code: |
+      steps:
+        - name: Sparse checkout
+          uses: actions/checkout@v4
+          timeout-minutes: 5
+          with:
+            fetch-depth: 1
+            sparse-checkout: |
+              src/
+              tests/
+              package.json
+              go.mod
+prevention:
+  - 'Always specify fetch-depth: 1 for workflows that do not require full commit history'
+  - 'Add timeout-minutes to every checkout step to prevent indefinite pipeline hangs'
+  - 'Monitor p99 checkout duration from EU runners as a CI health SLI'
+  - 'Subscribe to GitHub Status page (githubstatus.com) for EU infrastructure degradation notices'
+  - 'Use sparse-checkout in large monorepos to reduce CDN dependency during fetch'
+docs:
+  - url: 'https://github.com/actions/checkout/issues/2441'
+    label: 'actions/checkout #2441: Checkouts extremely slow or timing out from EU (52 reactions, May 2026)'
+  - url: 'https://www.githubstatus.com/'
+    label: 'GitHub Status page for infrastructure incidents'
+  - url: 'https://github.com/actions/checkout#usage'
+    label: 'actions/checkout README: usage (fetch-depth and sparse-checkout inputs)'

package/errors/known-unsolved/merge-queue-ejected-pr-runs-not-auto-cancelled.yml ADDED Viewed

@@ -0,0 +1,144 @@
+id: 'known-unsolved-032'
+title: "Ejecting a PR from the Merge Queue Does Not Cancel Its Running Workflow Runs"
+category: known-unsolved
+severity: limitation
+tags:
+  - merge-queue
+  - merge_group
+  - cancellation
+  - orphaned-runs
+  - ci-minutes
+  - limitation
+  - waste
+patterns:
+  - regex: 'gh-readonly-queue/'
+    flags: 'i'
+  - regex: 'on:\s*\n\s*merge_group'
+    flags: 'im'
+error_messages:
+  - "Workflow run continues after PR is ejected from the merge queue"
+  - "No automatic cancellation for gh-readonly-queue/... runs on PR removal"
+root_cause: |
+  When GitHub's merge queue ejects a PR — due to a failing required check,
+  a merge conflict with another PR in the batch, or manual removal — GitHub
+  does NOT automatically cancel the workflow runs that were started for the
+  merge group batch that contained that PR.
+  Those runs continue to execute and consume CI minutes even though the
+  associated PR will never be merged via that queue entry. In active
+  repositories with large merge queues (especially monorepos or high-velocity
+  teams), a single failed check can cause a wave of orphaned runs as PRs are
+  rebatched and re-queued, multiplying wasted CI time.
+  Merge group workflow runs are triggered on ephemeral
+  `gh-readonly-queue/<base-branch>/pr-<number>-<sha>` refs. When the queue
+  ejects a PR, the ref is deleted but in-progress runs are not signalled.
+  GitHub has acknowledged this as working-as-designed behavior: workflow run
+  cancellation on merge queue ejection must be managed by the repository owner.
+  Source: dotCMS/core#34592 (GitHub merge queue orphaned workflow runs waste
+  CI resources, Feb 2026, open).
+fix: |
+  No built-in automatic cancellation mechanism exists. Available workarounds:
+  Option 1 — Scoped concurrency group per merge queue entry:
+  Set a concurrency group scoped to the workflow and the merge group ref. This
+  prevents a single PR from accumulating multiple parallel runs as it is
+  rebatched, but does NOT cancel runs when the PR is ejected.
+  Option 2 — Differentiated cancel-in-progress by event:
+  Use cancel-in-progress only for pull_request events (not merge_group events)
+  to avoid cancelling sibling PRs in the same batch while still cancelling
+  redundant PR-branch runs.
+  Option 3 — External cleanup script:
+  A separate monitoring workflow on schedule or repository_dispatch can call
+  the Actions API to cancel in-progress runs on refs that no longer exist.
+  This is operationally complex but achieves true cleanup.
+fix_code:
+  - language: yaml
+    label: "Differentiated concurrency — cancel PR runs but not merge queue runs"
+    code: |
+      on:
+        push:
+          branches: [main]
+        pull_request:
+        merge_group:
+      concurrency:
+        # Include workflow name to avoid cross-workflow cancellation
+        group: ${{ github.workflow }}-${{ github.ref }}
+        # Cancel duplicate PR branch runs, but do NOT cancel merge queue runs
+        # (cancelling merge_group runs ejects sibling PRs from the queue)
+        cancel-in-progress: ${{ github.event_name == 'pull_request' }}
+      jobs:
+        ci:
+          runs-on: ubuntu-latest
+          steps:
+            - uses: actions/checkout@v4
+            - run: npm test
+  - language: yaml
+    label: "Monitoring workflow to cancel orphaned merge queue runs (advanced)"
+    code: |
+      # This workflow runs periodically and cancels in-progress runs
+      # on merge queue refs that no longer exist as active queue entries.
+      # Requires: contents: read, actions: write
+      on:
+        schedule:
+          - cron: '*/15 * * * *'   # Every 15 minutes
+        workflow_dispatch:
+      permissions:
+        actions: write
+        contents: read
+      jobs:
+        cleanup-orphaned-runs:
+          runs-on: ubuntu-latest
+          steps:
+            - name: Cancel orphaned merge queue runs
+              uses: actions/github-script@v7
+              with:
+                script: |
+                  // Paginate: a busy repository can have more than 100 runs in progress
+                  const runs = await github.paginate(github.rest.actions.listWorkflowRunsForRepo, {
+                    owner: context.repo.owner,
+                    repo: context.repo.repo,
+                    status: 'in_progress',
+                    per_page: 100,
+                  });
+                  for (const run of runs) {
+                    if (run.head_branch?.startsWith('gh-readonly-queue/')) {
+                      // Verify the queue ref still exists
+                      try {
+                        await github.rest.git.getRef({
+                          owner: context.repo.owner,
+                          repo: context.repo.repo,
+                          ref: `heads/${run.head_branch}`,
+                        });
+                      } catch (e) {
+                        if (e.status === 404) {
+                          // Ref gone — cancel the orphaned run
+                          await github.rest.actions.cancelWorkflowRun({
+                            owner: context.repo.owner,
+                            repo: context.repo.repo,
+                            run_id: run.id,
+                          });
+                          console.log(`Cancelled orphaned run ${run.id} for ${run.head_branch}`);
+                        } else {
+                          // Not a missing ref (rate limit, outage, ...): report it and leave the run alone
+                          core.warning(`Could not verify ref for run ${run.id} (${run.head_branch}): ${e.message}`);
+                        }
+                      }
+                    }
+                  }
+prevention:
+  - "Accept that some CI minutes will be wasted on ejected PRs — this is a known platform constraint with no first-class solution."
+  - "Monitor total merge queue depth in high-velocity repos; if the queue frequently rebatches, orphaned runs accumulate quickly."
+  - "Use differentiated cancel-in-progress (disabled for merge_group events) to at least avoid accidentally ejecting sibling PRs while managing PR-branch redundancy."
+  - "Consider a periodic cleanup workflow using the Actions API to cancel in-progress runs on deleted merge queue refs."
+docs:
+  - url: "https://github.com/dotCMS/core/issues/34592"
+    label: "dotCMS/core#34592: Merge queue orphaned workflow runs waste CI resources (open, Feb 2026)"
+  - url: "https://docs.github.com/en/repositories/configuring-branches-and-merges-in-your-repository/configuring-pull-request-merges/managing-a-merge-queue"
+    label: "GitHub Docs: Managing a merge queue"
+  - url: "https://docs.github.com/en/actions/writing-workflows/choosing-when-your-workflow-runs/events-that-trigger-workflows#merge_group"
+    label: "GitHub Docs: merge_group event"

package/errors/permissions-auth/checkout-v6-includif-symlinked-work-credential-failure.yml ADDED Viewed

@@ -0,0 +1,93 @@
+id: permissions-auth-032
+title: 'checkout@v6 Credential Injection Fails on Self-Hosted Runners With Symlinked _work Directory'
+category: permissions-auth
+severity: error
+tags:
+  - checkout-v6
+  - self-hosted
+  - symlink
+  - credentials
+  - includeif
+  - macos
+patterns:
+  - regex: 'fatal: could not read Username for.*terminal prompts disabled'
+    flags: 'i'
+  - regex: 'includeIf.*gitdir.*_work'
+    flags: 'i'
+  - regex: 'fatal: repository.*not found'
+    flags: 'i'
+error_messages:
+  - "fatal: could not read Username for 'https://github.com': terminal prompts disabled"
+  - 'Error: fatal: repository not found'
+  - 'Authentication failed'
+root_cause: |
+  actions/checkout@v6 changed credential injection from writing directly into
+  the repository configuration file as http.https://github.com/.extraheader
+  (the v5 approach) to using includeIf "gitdir:..." directives that reference
+  a temporary credentials file stored in _work/_temp/.
+  v6 writes the includeIf path using the symlink path of the runner _work
+  directory. However, Git evaluates gitdir: conditions
+  against the resolved (real) absolute path — it follows symlinks when
+  determining the current repository's directory.
+  When the runner _work directory is a symlink to an external volume (a common
+  setup for macOS Apple Silicon runners using external SSD storage), the
+  includeIf path written by v6 uses the symlink path
+  (e.g., /Users/runner/actions-runner-N/_work/repo/.git) but the actual
+  resolved path is different
+  (e.g., /Volumes/External/actions-runner-N-work/repo/.git).
+  These never match, so the credentials config file is never loaded and the
+  fetch step fails with "terminal prompts disabled."
+  v5 is unaffected because it injects credentials directly into the repository
+  configuration file rather than using conditional includes.
+  Source: actions/checkout#2393 (open March 2026, macOS Apple Silicon).
+fix: |
+  Option 1 (recommended): Pin to actions/checkout@v5 for workflows running on
+  self-hosted runners with symlinked _work directories. v5 injects credentials
+  directly and is not affected by this symlink resolution issue.
+  Option 2: Reconfigure the runner to use the real volume path directly.
+  Remove the symlink from _work and mount the external volume at the actual
+  runner work path location. This eliminates the symlink entirely.
+  Option 3: Use persist-credentials: false with a separate authentication
+  step that does not rely on the includeIf mechanism.
+fix_code:
+  - language: yaml
+    label: 'Pin to v5 as workaround for symlinked _work runners (checkout#2393)'
+    code: |
+      steps:
+        - name: Checkout
+          # Pinned to v5 — v6 includeIf credential injection fails when runner
+          # _work directory is a symlink to an external volume (checkout#2393)
+          uses: actions/checkout@v5
+          with:
+            token: ${{ secrets.GITHUB_TOKEN }}
+  - language: yaml
+    label: 'Use persist-credentials false with explicit token for subsequent steps'
+    code: |
+      steps:
+        - name: Checkout without credential persistence
+          uses: actions/checkout@v6
+          with:
+            persist-credentials: false
+        - name: Subsequent steps using explicit token
+          env:
+            GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          run: |
+            echo "Use GITHUB_TOKEN env var in subsequent authenticated operations"
+prevention:
+  - 'Audit self-hosted runner _work paths for symlinks before upgrading from checkout@v5 to v6'
+  - 'Avoid symlinking the runner _work directory — use bind mounts or configure the real path'
+  - 'Test checkout behavior on self-hosted runners in a canary workflow before rolling out v6'
+  - 'Check the resolved path differs from the symlink path when debugging "terminal prompts disabled" errors'
+docs:
+  - url: 'https://github.com/actions/checkout/issues/2393'
+    label: 'actions/checkout #2393: v6 includeIf credential matching fails on symlinked _work (open March 2026)'
+  - url: 'https://github.com/actions/checkout/issues/2313'
+    label: 'actions/checkout #2313: v6 breaks Docker actions using credential auth (related, closed)'
+  - url: 'https://docs.github.com/en/actions/hosting-your-own-runners/managing-self-hosted-runners/configuring-the-self-hosted-runner-application-as-a-service'
+    label: 'GitHub Docs: Configuring the self-hosted runner as a service'

package/errors/runner-environment/ephemeral-runner-not-found-after-register.yml ADDED Viewed

@@ -0,0 +1,115 @@
+id: runner-environment-090
+title: 'Ephemeral Self-Hosted Runner Fails Immediately With "An error occurred: Runner not found"'
+category: runner-environment
+severity: error
+tags:
+  - self-hosted
+  - ephemeral
+  - runner-not-found
+  - registration
+  - broker
+  - jit
+patterns:
+  - regex: 'An error occurred: Runner not found'
+    flags: 'i'
+  - regex: 'RunnerNotFoundException'
+    flags: 'i'
+error_messages:
+  - 'An error occurred: Runner not found'
+  - 'GitHub.Actions.RunService.WebApi.RunnerNotFoundException'
+  - 'Listening for Jobs'
+root_cause: |
+  GitHub's broker endpoint returns RunnerNotFoundException immediately after
+  a successful registration and connection for ephemeral self-hosted runners
+  configured with replace mode. The runner completes registration ("Successfully
+  replaced the runner"), establishes connection ("Runner connection is good"),
+  starts listening for jobs, then receives a RunnerNotFoundException from the
+  broker HTTP client within seconds.
+  The error originates in BrokerHttpClient.cs where the broker API returns a
+  404/RunnerNotFoundException for the registered runner slot. This can occur
+  when the broker has stale slot state from the previous ephemeral runner
+  iteration that collides with the newly registered runner identity during the
+  brief window between registration and first poll.
+  The runner has no graceful retry handling for this condition — it exits with
+  status 1, causing systemd to restart it repeatedly, rapidly exhausting GitHub
+  App installation tokens through frequent re-registration cycles.
+  Affects all architectures (x86_64, aarch64, s390x) on various runner versions.
+  Spikes during periods of elevated load on GitHub broker infrastructure.
+  Source: actions/runner#3857 (116 reactions, open May 2025).
+fix: |
+  1. Switch from replace-mode ephemeral runners to JIT (Just-In-Time) runner
+     tokens. JIT runners receive a pre-assigned job ID and avoid the broker
+     slot replacement race entirely.
+  2. Update the runner to the latest version (v2.334.0+) which improves retry
+     behavior around transient broker errors.
+  3. Add restart delay in the systemd service unit to prevent token exhaustion
+     on rapid restart loops:
+       RestartSec=30
+       StartLimitIntervalSec=300
+       StartLimitBurst=5
+  4. Monitor runner diagnostic logs in _diag/Runner_*.log for the
+     RunnerNotFoundException pattern to distinguish broker errors from
+     configuration issues.
+fix_code:
+  - language: ini
+    label: 'Systemd service unit with restart throttle to prevent token exhaustion'
+    code: |
+      # /etc/systemd/system/actions-runner.service
+      [Unit]
+      Description=GitHub Actions Self-Hosted Runner
+      After=network-online.target
+      [Service]
+      ExecStart=/home/runner/actions-runner/run.sh
+      Restart=on-failure
+      RestartSec=30
+      StartLimitIntervalSec=300
+      StartLimitBurst=5
+      User=runner
+      [Install]
+      WantedBy=multi-user.target
+  - language: yaml
+    label: 'Workflow using JIT runner token to avoid broker slot collision'
+    code: |
+      jobs:
+        provision-runner:
+          runs-on: ubuntu-latest
+          steps:
+            - name: Generate JIT runner token
+              id: jit
+              uses: actions/github-script@v7
+              with:
+                # Creating a JIT config needs repository admin access, which the
+                # default GITHUB_TOKEN cannot be granted — supply an admin-capable token.
+                github-token: ${{ secrets.RUNNER_ADMIN_TOKEN }}
+                script: |
+                  const { data } = await github.rest.actions.generateRunnerJitconfigForRepo({
+                    owner: context.repo.owner,
+                    repo: context.repo.repo,
+                    // Runner names must be unique, so derive one per run attempt
+                    name: `ephemeral-jit-runner-${context.runId}-${process.env.GITHUB_RUN_ATTEMPT}`,
+                    runner_group_id: 1,
+                    labels: ['self-hosted', 'ephemeral', 'linux']
+                  });
+                  // The JIT config is a credential: keep it out of the logs
+                  core.setSecret(data.encoded_jit_config);
+                  core.setOutput('encoded_jit_config', data.encoded_jit_config);
+            - name: Start a runner with the JIT config
+              env:
+                JIT_CONFIG: ${{ steps.jit.outputs.encoded_jit_config }}
+              run: |
+                # Hand $JIT_CONFIG to your provisioner. The runner host must start with:
+                #   ./run.sh --jitconfig "$JIT_CONFIG"
+                echo "Provision the runner host with JIT_CONFIG here"
+        build:
+          needs: provision-runner
+          runs-on: [self-hosted, ephemeral, linux]
+          steps:
+            - uses: actions/checkout@v4
+prevention:
+  - 'Use JIT runner tokens instead of replace-mode registration to eliminate broker slot race'
+  - 'Set systemd RestartSec to at least 30 seconds to avoid GitHub App token exhaustion'
+  - 'Monitor _diag/Runner_*.log for RunnerNotFoundException patterns and alert on restart frequency'
+  - 'Keep runner version current — broker compatibility fixes are regularly backported'
+  - 'Consider Kubernetes ARC ephemeral runners where pod lifecycle handles registration cleanly'
+docs:
+  - url: 'https://github.com/actions/runner/issues/3857'
+    label: 'actions/runner #3857: An error occurred: Runner not found (116 reactions, open May 2025)'
+  - url: 'https://docs.github.com/en/actions/hosting-your-own-runners/managing-self-hosted-runners/autoscaling-with-self-hosted-runners#using-just-in-time-runners'
+    label: 'GitHub Docs: Just-in-time runners (JIT)'
+  - url: 'https://docs.github.com/en/rest/actions/self-hosted-runners#create-configuration-for-a-just-in-time-runner-for-a-repository'
+    label: 'GitHub REST API: Generate JIT runner config'

package/errors/runner-environment/worker-wedged-task-orchestration-job-not-found.yml ADDED Viewed

@@ -0,0 +1,119 @@
+id: runner-environment-091
+title: 'Self-Hosted Runner Worker Wedges Indefinitely After TaskOrchestrationJobNotFoundException'
+category: runner-environment
+severity: error
+tags:
+  - self-hosted
+  - runner-worker
+  - wedged
+  - slot-starvation
+  - v2-runservice
+  - macos
+  - apple-silicon
+patterns:
+  - regex: 'TaskOrchestrationJobNotFoundException.*workflow instance not found'
+    flags: 'i'
+  - regex: 'Job not found:.*workflow instance not found'
+    flags: 'i'
+  - regex: 'CompleteJobAsync.*TaskOrchestrationJobNotFoundException'
+    flags: 'i'
+error_messages:
+  - 'GitHub.DistributedTask.WebApi.TaskOrchestrationJobNotFoundException: Job not found: <job-guid>. workflow instance not found'
+  - 'TaskOrchestrationJobNotFoundException: workflow instance not found'
+  - 'System.AggregateException: One or more errors occurred. (Job not found:'
+root_cause: |
+  When a self-hosted runner Worker process calls CompleteJobAsync in the V2
+  RunService path (useV2Flow: true, RunServiceHttpClient.CompleteJobAsync)
+  and the GitHub orchestrator has discarded the job record (e.g., due to a
+  server-side timeout, infrastructure failover, or job cancellation during
+  finalization), the Worker receives TaskOrchestrationJobNotFoundException
+  with "workflow instance not found."
+  After exhausting configured retry attempts (default maxAttempts), the Worker
+  logs the exception and stops processing — but critically, it fails to call
+  Environment.Exit and the process remains alive at ~0.1% CPU with no active
+  work, no child processes, and no job cleanup activity.
+  The parent Runner.Listener treats the still-running Worker process as a busy
+  runner slot and refuses to spawn a new Worker. This causes runner slot
+  starvation: the affected runner stops accepting new jobs until the wedged
+  Worker is externally terminated (kill, reboot, or watchdog).
+  On one 3-host Apple Silicon runner pool (v2.334.0), this affected 32.8% of
+  Worker invocations (50 of 152) over three weeks, with one incident wedging
+  all three Workers simultaneously and blocking CI for 3+ hours.
+  Source: actions/runner#4418 (open May 2026).
+fix: |
+  No upstream fix available — the Worker does not exit on non-retryable
+  CompleteJobAsync failures. Mitigations:
+  1. Deploy a watchdog that monitors _diag/Worker_*.log for the
+     TaskOrchestrationJobNotFoundException pattern and kills the wedged
+     Worker process by PID.
+  2. Use Kubernetes ARC ephemeral runners where the pod lifecycle replaces
+     the entire runner environment after each job — a wedged Worker is
+     automatically cleaned up when the pod is recycled.
+  3. Configure a hard systemd runtime limit (RuntimeMaxSec) that terminates
+     any runner process exceeding your longest expected job duration plus a
+     safety margin.
+  4. Add an external health-check cron that queries the GitHub API for runner
+     status and restarts the runner service if slots show "busy" longer than
+     expected.
+fix_code:
+  - language: yaml
+    label: 'Kubernetes ARC ephemeral runner configuration (avoids wedged Worker state)'
+    code: |
+      # ARC RunnerDeployment — pods are recycled after each job
+      apiVersion: actions.summerwind.dev/v1alpha1
+      kind: RunnerDeployment
+      metadata:
+        name: ephemeral-runner-deployment
+      spec:
+        replicas: 3
+        template:
+          spec:
+            ephemeral: true          # pod recycled after each job, no wedge possible
+            repository: owner/repo
+            labels:
+              - self-hosted
+              - ephemeral
+  - language: yaml
+    label: 'Scheduled watchdog workflow to detect stalled runner slots via API'
+    code: |
+      on:
+        schedule:
+          - cron: '*/15 * * * *'    # every 15 minutes
+      jobs:
+        runner-health-check:
+          runs-on: ubuntu-latest
+          steps:
+            - name: Detect stalled self-hosted runners
+              uses: actions/github-script@v7
+              with:
+                # Listing self-hosted runners needs repository admin access, which the
+                # default GITHUB_TOKEN cannot be granted — supply an admin-capable token.
+                github-token: ${{ secrets.RUNNER_ADMIN_TOKEN }}
+                script: |
+                  const { owner, repo } = context.repo;
+                  const runners = await github.paginate(
+                    github.rest.actions.listSelfHostedRunnersForRepo,
+                    { owner, repo, per_page: 100 }
+                  );
+                  const offline = runners.filter(r => r.status === 'offline');
+                  // A wedged Worker keeps its runner slot busy, so also look for
+                  // runners that report busy while no job is actually running on them.
+                  const busy = runners.filter(r => r.status === 'online' && r.busy);
+                  const active = new Set();
+                  if (busy.length > 0) {
+                    const runs = await github.paginate(
+                      github.rest.actions.listWorkflowRunsForRepo,
+                      { owner, repo, status: 'in_progress', per_page: 100 }
+                    );
+                    for (const run of runs) {
+                      const jobs = await github.paginate(
+                        github.rest.actions.listJobsForWorkflowRun,
+                        { owner, repo, run_id: run.id, per_page: 100 }
+                      );
+                      for (const job of jobs) {
+                        if (job.status === 'in_progress' && job.runner_name) {
+                          active.add(job.runner_name);
+                        }
+                      }
+                    }
+                  }
+                  const wedged = busy.filter(r => !active.has(r.name));
+                  if (offline.length > 0 || wedged.length > 0) {
+                    const names = [...offline, ...wedged].map(r => r.name).join(', ');
+                    core.warning('Offline/stalled runners: ' + names);
+                    // Trigger your runner restart webhook here
+                  }
+prevention:
+  - 'Use ephemeral Kubernetes ARC runners — pod recycle eliminates wedged Worker slot starvation'
+  - 'Monitor _diag/Worker_*.log for TaskOrchestrationJobNotFoundException patterns'
+  - 'Set systemd RuntimeMaxSec to maximum expected job duration plus 30 minutes'
+  - 'Track runner slot busy duration — sudden sustained busy state with no job output indicates wedge'
+  - 'Deploy a watchdog process alongside the runner that monitors Worker PID lifetime'
+docs:
+  - url: 'https://github.com/actions/runner/issues/4418'
+    label: 'actions/runner #4418: Worker wedges after TaskOrchestrationJobNotFoundException (open May 2026)'
+  - url: 'https://docs.github.com/en/actions/hosting-your-own-runners/managing-self-hosted-runners/autoscaling-with-self-hosted-runners'
+    label: 'GitHub Docs: Autoscaling with self-hosted runners'
+  - url: 'https://github.com/actions/actions-runner-controller'
+    label: 'actions/actions-runner-controller: Kubernetes ARC runner controller'

package/errors/triggers/issue-comment-default-branch-context-no-pr-checkout.yml ADDED Viewed

@@ -0,0 +1,132 @@
+id: 'triggers-029'
+title: "issue_comment Trigger Runs in Default Branch Context — PR Code Not Checked Out"
+category: triggers
+severity: silent-failure
+tags:
+  - issue_comment
+  - pull_request
+  - checkout
+  - default-branch
+  - context
+  - ref
+  - pr-comment
+patterns:
+  - regex: 'on:\s*\n\s*issue_comment'
+    flags: 'im'
+  - regex: 'github\.event\.issue\.pull_request'
+    flags: 'i'
+error_messages:
+  - "Warning: This commit is not necessarily the head of this branch."
+  - "issue_comment event fires in the default branch context, not the PR branch"
+root_cause: |
+  When a comment is posted on a pull request, GitHub fires the `issue_comment`
+  event in the **default branch context**, not in the PR branch context.
+  Specifically:
+  - github.ref = refs/heads/main (or the repo default branch)
+  - github.sha = the HEAD commit of the default branch
+  - github.event.pull_request is undefined (issue_comment doesn't include PR data)
+  A workflow triggered by `issue_comment` that runs `actions/checkout` without
+  an explicit `ref:` will silently check out the **default branch**, not the
+  PR's code. The workflow appears to succeed — but it ran against stale or
+  unrelated code, not the PR changes being discussed.
+  This is a well-known footgun, documented in an issue with 67 reactions. Developers
+  commonly add /deploy, /test, or /approve slash command workflows on PR comments,
+  expecting the workflow to run against the PR's code.
+  A second consequence: `github.event.pull_request` is undefined in this context.
+  Workflows that assume `github.event.pull_request.head.sha` exists will error
+  with "Cannot read properties of undefined (reading 'head')".
+  Source: actions/checkout#331 (67 reactions, open since Aug 2020).
+fix: |
+  Explicitly retrieve the PR details from the GitHub API and check out the PR
+  head commit using the pull request number from the issue comment event payload.
+  The PR number is available at: github.event.issue.number
+  (issue_comment events on PRs use the issue number, which matches the PR number)
+  Two approaches:
+  1. Use actions/github-script to fetch the PR head SHA, then checkout with
+     that explicit ref.
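+  2. Use pull_request_target instead of issue_comment for workflows that should
+     run on PR activity rather than on comments. Note that pull_request_target
+     also runs in the base branch context, so an explicit `ref:` is still needed
+     to check out PR code — and be aware of the security implications
+     (pull_request_target runs with write permissions in the base repo context,
+     even for fork PRs).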
+  Always gate on `github.event.issue.pull_request` in an if: condition to
+  distinguish PR comments from plain issue comments.
+fix_code:
+  - language: yaml
+    label: "Checkout PR head commit from issue_comment event"
+    code: |
+      on:
+        issue_comment:
+          types: [created]
+      # issue_comment runs in the base repository context, so restrict the
+      # token to read-only before executing any code that comes from the PR.
+      permissions:
+        contents: read
+        pull-requests: read
+      jobs:
+        run-on-pr-comment:
+          runs-on: ubuntu-latest
+          # Only run on PR comments (not plain issue comments), and only when
+          # the commenter is trusted; otherwise anyone able to comment could
+          # make this workflow check out and run code from a fork.
+          if: |
+            github.event.issue.pull_request != null &&
+            contains(fromJSON('["OWNER", "MEMBER", "COLLABORATOR"]'), github.event.comment.author_association)
+          steps:
+            - name: Get PR head SHA
+              id: pr
+              uses: actions/github-script@v7
+              with:
+                script: |
+                  // github.event.pull_request is absent for issue_comment, so
+                  // look the PR up by the issue number from the event payload.
+                  const pr = await github.rest.pulls.get({
+                    owner: context.repo.owner,
+                    repo: context.repo.repo,
+                    pull_number: context.issue.number,
+                  });
+                  core.setOutput('head_sha', pr.data.head.sha);
+                  core.setOutput('head_ref', pr.data.head.ref);
+            - uses: actions/checkout@v4
+              with:
+                # Explicit ref: without it checkout uses the default branch.
+                ref: ${{ steps.pr.outputs.head_sha }}
+            - name: Run tests on PR code
+              run: npm test
+  - language: yaml
+    label: "Slash command pattern — only act on specific PR comment text"
+    code: |
+      on:
+        issue_comment:
+          types: [created]
+      jobs:
+        slash-command:
+          runs-on: ubuntu-latest
+          # Three gates, all required:
+          #   1. the comment is on a pull request, not a plain issue
+          #   2. the comment starts with the command (startsWith, so quoting
+          #      "/deploy" in the middle of a reply does not trigger a deploy)
+          #   3. the commenter is trusted; without this any user who can
+          #      comment could deploy code from a fork
+          if: |
+            github.event.issue.pull_request != null &&
+            startsWith(github.event.comment.body, '/deploy') &&
+            contains(fromJSON('["OWNER", "MEMBER", "COLLABORATOR"]'), github.event.comment.author_association)
+          steps:
+            - name: Get PR head SHA
+              id: pr
+              uses: actions/github-script@v7
+              with:
+                script: |
+                  // github.event.pull_request is absent for issue_comment, so
+                  // look the PR up by the issue number from the event payload.
+                  const pr = await github.rest.pulls.get({
+                    owner: context.repo.owner,
+                    repo: context.repo.repo,
+                    pull_number: context.issue.number,
+                  });
+                  core.setOutput('head_sha', pr.data.head.sha);
+            - uses: actions/checkout@v4
+              with:
+                # Explicit ref: without it checkout uses the default branch.
+                ref: ${{ steps.pr.outputs.head_sha }}
+            - run: ./scripts/deploy.sh
+prevention:
+  - "Always add `if: github.event.issue.pull_request != null` to distinguish PR comments from issue comments."
+  - "Never rely on github.ref or github.sha in issue_comment workflows — they point to the default branch, not the PR."
+  - "Use actions/github-script to fetch the PR head SHA via the pulls.get API, then pass it to actions/checkout as ref."
+  - "Consider pull_request_target for PR-triggered workflows, but audit for untrusted code execution risk first."
+docs:
+  - url: "https://github.com/actions/checkout/issues/331"
+    label: "actions/checkout#331: Any way to checkout PR from issue_comment event? (67 reactions)"
+  - url: "https://docs.github.com/en/actions/writing-workflows/choosing-when-your-workflow-runs/events-that-trigger-workflows#issue_comment"
+    label: "GitHub Docs: issue_comment event trigger"
+  - url: "https://docs.github.com/en/actions/writing-workflows/choosing-when-your-workflow-runs/events-that-trigger-workflows#pull_request_target"
+    label: "GitHub Docs: pull_request_target event — alternative with write access"

package/errors/yaml-syntax/workflow-call-outputs-jobs-result-empty.yml ADDED Viewed

@@ -0,0 +1,125 @@
+id: 'yaml-syntax-032'
+title: "jobs.<id>.result Always Returns Empty String in on.workflow_call.outputs..value"
+category: yaml-syntax
+severity: silent-failure
+tags:
+  - reusable-workflow
+  - workflow_call
+  - outputs
+  - jobs-context
+  - result
+  - expression
+  - empty-string
+patterns:
+  - regex: 'jobs\.\w+\.result'
+    flags: 'i'
+  - regex: 'on:\s*\n\s*workflow_call:\s*\n[\s\S]*?outputs:'
+    flags: 'im'
+error_messages:
+  - "jobs.<job_id>.result evaluates to empty string in on.workflow_call.outputs value"
+  - "needs.<reusable_workflow>.outputs.<output> is empty string instead of success/failure/skipped"
+root_cause: |
+  In a reusable workflow, the expression `${{ jobs.<id>.result }}` silently
+  evaluates to an empty string when referenced inside an
+  `on.workflow_call.outputs.<output_name>.value` expression.
+  Despite the GitHub documentation stating that the `jobs` context is available
+  in that expression scope, direct property access to the `.result` field of a
+  job object returns empty string rather than the expected outcome value
+  (`success`, `failure`, `cancelled`, or `skipped`).
+  This causes downstream caller workflows that test the reusable workflow's
+  output (e.g., `needs.reusable.outputs.my-result == 'skipped'`) to silently
+  receive an empty string. Conditions that gate follow-up jobs on the result
+  of the reusable workflow never evaluate to true.
+  Open since January 2024 (actions/runner#3087, 11 reactions). The issue
+  affects all GitHub-hosted runners. A workaround exists using
+  `fromJSON(toJSON(jobs.<id>)).result` which forces the expression through JSON
+  serialization and correctly surfaces the result value.
+  Note: This is distinct from the more common mistake of referencing outputs
+  without declaring them in the job's outputs: block. The bug occurs even when
+  the job produces no step outputs and you simply want to surface whether the
+  job ran, was skipped, or failed.
+fix: |
+  Two approaches:
+  Option 1 — fromJSON/toJSON workaround (quickest):
+  Replace `${{ jobs.build.result }}` with
+  `${{ fromJSON(toJSON(jobs.build)).result }}`. The JSON round-trip forces
+  full evaluation of the jobs context object and correctly returns the result
+  string.
+  Option 2 — Surface result via step output (most reliable):
+  Add an explicit step with `if: always()` that writes `job.status` to
+  GITHUB_OUTPUT. Reference the step output in the job's outputs block and
+  in the top-level workflow_call outputs. This avoids the expression evaluation
+  quirk entirely.
+fix_code:
+  - language: yaml
+    label: "Broken pattern vs fromJSON/toJSON workaround"
+    code: |
+      # Two alternative versions of the same reusable-workflow header, shown as
+      # separate YAML documents so the snippet has no duplicate `on:` key.
+      # BROKEN: jobs.build.result returns empty string
+      on:
+        workflow_call:
+          outputs:
+            job-result:
+              value: ${{ jobs.build.result }}   # Always empty string
+      ---
+      # FIXED: fromJSON/toJSON forces expression evaluation
+      on:
+        workflow_call:
+          outputs:
+            job-result:
+              value: ${{ fromJSON(toJSON(jobs.build)).result }}   # Returns success/failure/cancelled/skipped
+  - language: yaml
+    label: "Preferred fix — surface result via explicit step output"
+    code: |
+      on:
+        workflow_call:
+          outputs:
+            job-result:
+              description: "The build job outcome"
+              # Reads the job's declared output, not jobs.build.result.
+              value: ${{ jobs.build.outputs.result }}
+      jobs:
+        build:
+          runs-on: ubuntu-latest
+          outputs:
+            result: ${{ steps.capture-result.outputs.result }}
+          steps:
+            - uses: actions/checkout@v4
+            - name: Build
+              run: npm run build
+            # always() makes this step run even after an earlier step failed.
+            # job.status is success, failure, or cancelled; if the whole job
+            # is skipped no step runs, so the output stays an empty string.
+            - name: Capture job result
+              id: capture-result
+              if: always()
+              run: echo "result=${{ job.status }}" >> "$GITHUB_OUTPUT"
+  - language: yaml
+    label: "Caller workflow checking reusable output"
+    code: |
+      jobs:
+        reusable:
+          uses: ./.github/workflows/build.yml
+          secrets: inherit
+        deploy:
+          needs: reusable
+          runs-on: ubuntu-latest
+          # This condition now works correctly with either fix above
+          if: needs.reusable.outputs.job-result == 'success'
+          steps:
+            - run: echo "Deploying after successful build"
+prevention:
+  - "Never rely on direct `jobs.<id>.result` access in workflow_call top-level outputs — use the fromJSON(toJSON()) workaround or explicit step outputs."
+  - "Add integration tests for reusable workflows that verify output values are non-empty strings."
+  - "Use job.status (available inside step runs) rather than jobs.<id>.result (the problematic context) when capturing job outcome."
+docs:
+  - url: "https://github.com/actions/runner/issues/3087"
+    label: "actions/runner#3087: Cannot access jobs.<id>.result from on.workflow_call.outputs (open since Jan 2024, 11 reactions)"
+  - url: "https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/passing-information-between-jobs"
+    label: "GitHub Docs: Passing information between jobs"
+  - url: "https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/contexts#jobs-context"
+    label: "GitHub Docs: jobs context (documents result property)"

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@htekdev/actions-debugger",
-  "version": "1.0.34",
+  "version": "1.0.36",
   "description": "65+ real GitHub Actions errors, queryable by agents. CLI + MCP server + Copilot skills + error database.",
   "type": "module",
   "main": "./dist/index.js",