npm - @htekdev/actions-debugger - Versions diffs - 1.0.122 → 1.0.123 - Mend

@htekdev/actions-debugger 1.0.122 → 1.0.123

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (4) hide show

package/errors/caching-artifacts/caching-artifacts-072.yml +137 -0
package/errors/runner-environment/runner-environment-222.yml +140 -0
package/errors/runner-environment/runner-environment-223.yml +149 -0
package/package.json +1 -1

package/errors/caching-artifacts/caching-artifacts-072.yml ADDED Viewed

@@ -0,0 +1,137 @@
+id: caching-artifacts-072
+title: 'actions/cache@v5 Restore Rate Limit (429) Silently Treated as Cache Miss'
+category: caching-artifacts
+severity: silent-failure
+tags:
+  - cache
+  - rate-limit
+  - 429
+  - restore
+  - cache-miss
+  - silent-failure
+  - v5
+  - performance
+patterns:
+  - regex: 'Warning: You''ve hit a rate limit, your rate limit will reset in \d+ seconds'
+    flags: 'i'
+  - regex: 'Failed to restore:.*GetCacheEntryDownloadURL.*Rate [Ll]imited'
+    flags: 'i'
+  - regex: 'Failed request: \(429\) Too Many Requests: rate limit exceeded'
+    flags: 'i'
+error_messages:
+  - "Warning: You've hit a rate limit, your rate limit will reset in 18 seconds"
+  - "Warning: Failed to restore: Failed to GetCacheEntryDownloadURL: Rate Limited: Failed request: (429) Too Many Requests: rate limit exceeded"
+  - "Cache not found for input keys: goofy-b41b01ad3312fe1358359b7522c43860bfdad754166c7f1d385e51766e57b4c0"
+root_cause: |
+  When the GitHub cache service rate-limits a cache restore lookup request with HTTP
+  429 Too Many Requests, actions/cache@v5 does NOT retry the request. Instead, it
+  prints a warning and immediately treats the response as a cache miss, proceeding
+  with a full build from scratch.
+  The cache service includes a Retry-After header in the 429 response that tells the
+  client exactly how many seconds to wait before retrying (often ≤60 seconds). The
+  actions/cache implementation ignores this header entirely — no retry, no backoff,
+  no configurable behavior. The job simply never gets its cached dependencies.
+  This is a silent failure in the sense that:
+  1. The job succeeds — it just rebuilds everything from scratch.
+  2. No annotation or error is surfaced in the Actions UI. Only a Warning line in
+     the step log reveals what happened.
+  3. The resulting build artifacts are correct, but the CI run takes 2–10x longer
+     than expected, masking the real cause.
+  Most commonly triggered in large matrix builds (20+ parallel jobs) where many jobs
+  simultaneously query the cache service and collectively exhaust the per-repo or
+  per-org cache API rate limit. Also reported on repos with heavy cross-job cache
+  sharing patterns.
+  Distinct from caching-artifacts-030 (cache-service-429-upload-ebadf-crash.yml):
+  that entry covers 429 during the cache UPLOAD phase which crashes with EBADF.
+  This entry covers 429 during the cache RESTORE/lookup phase which silently misses
+  — different operation, different error message, different impact, different fix path.
+  Source: actions/cache#1758 (May 2026, open); also reported in
+  oxidecomputer/hubris#2535 "CI fails intermittently on Windows while restoring cache"
+  (May 2026).
+fix: |
+  There is no complete fix — this is an open upstream bug (actions/cache#1758).
+  The rate-limit retry path is not implemented in actions/cache. Workarounds:
+  Option 1 — Reduce cache API pressure by staggering matrix jobs:
+    strategy:
+      matrix: ...
+      max-parallel: 5   # Limit to 5 concurrent jobs instead of all at once
+  This reduces the burst of simultaneous restore calls and lowers the chance of
+  hitting the rate limit.
+  Option 2 — Add restore-keys fallbacks and a warning step so misses are visible (this does not retry the 429):
+    - uses: actions/cache@v5
+      id: cache
+      with:
+        key: ${{ runner.os }}-deps-${{ hashFiles('**/lockfile') }}
+        restore-keys: |
+          ${{ runner.os }}-deps-
+    - name: Warn on cache rate limit miss
+      if: steps.cache.outputs.cache-hit != 'true'
+      run: |
+        echo "::warning::Cache miss — may be rate limited. Check step log for 429."
+  Option 3 — Switch to a self-hosted cache backend to bypass GitHub's rate limits:
+    env:
+      ACTIONS_CACHE_URL: https://your-cache-backend.example.com/
+      ACTIONS_RUNTIME_TOKEN: ${{ secrets.CACHE_TOKEN }}
+  Option 4 — Accept it and add monitoring. If you frequently see the rate limit
+  warning, consider filing a support ticket to request a higher cache API rate limit
+  for your organization.
+fix_code:
+  - language: yaml
+    label: 'Reduce parallelism to lower cache restore burst pressure'
+    code: |
+      jobs:
+        build:
+          strategy:
+            matrix:
+              target: [linux-x64, linux-arm64, windows-x64, macos-x64, macos-arm64]
+            max-parallel: 4   # Stagger jobs to reduce simultaneous cache restore calls
+          steps:
+            - uses: actions/cache@v5
+              with:
+                path: ~/.cargo/registry
+                key: ${{ runner.os }}-cargo-${{ hashFiles('**/Cargo.lock') }}
+  - language: yaml
+    label: 'Add explicit warning step to surface rate limit cache misses clearly'
+    code: |
+      steps:
+        - uses: actions/cache@v5
+          id: cache-restore
+          with:
+            path: |
+              ~/.npm
+              node_modules
+            key: ${{ runner.os }}-node-${{ hashFiles('**/package-lock.json') }}
+            restore-keys: |
+              ${{ runner.os }}-node-
+        - name: Check for cache rate limit miss
+          if: steps.cache-restore.outputs.cache-hit != 'true'
+          run: |
+            echo "::warning::Cache miss detected — check step log for '429 Too Many Requests' to distinguish rate-limit miss from genuine cache absence."
+prevention:
+  - 'Set max-parallel on matrix strategies to limit simultaneous cache restore API calls and avoid triggering the per-org/per-repo cache rate limit.'
+  - 'Monitor the cache restore step logs for the warning message "You''ve hit a rate limit" to distinguish rate-limit misses from genuine cache absences when diagnosing slow CI runs.'
+  - 'Consider using restore-keys fallback chains so that even a rate-limited primary key miss may still succeed with a partial restore from a broader key.'
+  - 'Report rate limit occurrences to GitHub Support with your org name to request a higher cache API rate limit if you encounter this regularly in large workflows.'
+docs:
+  - url: 'https://github.com/actions/cache/issues/1758'
+    label: 'actions/cache#1758 — Handle rate limit (open, May 2026)'
+  - url: 'https://github.com/oxidecomputer/hubris/issues/2535'
+    label: 'oxidecomputer/hubris#2535 — CI fails intermittently on Windows while restoring cache (May 2026)'
+  - url: 'https://docs.github.com/en/actions/using-workflows/caching-dependencies-to-speed-up-workflows#usage-limits-and-eviction-policy'
+    label: 'GitHub Docs — Caching usage limits and eviction policy'

package/errors/runner-environment/runner-environment-222.yml ADDED Viewed

@@ -0,0 +1,140 @@
+id: runner-environment-222
+title: 'Windows Self-Hosted Runner V2 Broker Listener Stops Polling After First Job Completion'
+category: runner-environment
+severity: error
+tags:
+  - self-hosted
+  - windows
+  - broker
+  - v2-flow
+  - listener
+  - polling
+  - idle
+  - 2.334.0
+  - BrokerMessageListener
+patterns:
+  - regex: 'BrokerMessageListener.*Get messages has been cancelled using local token source\. Continue to get messages with new status\.'
+    flags: 'i'
+  - regex: 'BrokerMessageListener.*Received job status event\. JobState: Online'
+    flags: 'i'
+error_messages:
+  - "[2026-05-21 14:48:13Z INFO BrokerMessageListener] Get messages has been cancelled using local token source. Continue to get messages with new status."
+  - "[INFO BrokerMessageListener] Received job status event. JobState: Online"
+  - "[INFO BrokerMessageListener] Session created."
+root_cause: |
+  On Windows self-hosted runners using the V2 broker protocol (useV2Flow: true,
+  serverUrlV2: broker.actions.githubusercontent.com), a race condition in the
+  BrokerMessageListener causes the runner to permanently stop polling the broker
+  after the first job completes.
+  The sequence that triggers the hang:
+  1. Runner starts, creates a broker session, and begins polling for messages.
+  2. First job arrives → BrokerMessageListener logs "JobState: Busy".
+  3. Job finishes → BrokerMessageListener logs "JobState: Online".
+  4. The Online state transition triggers a cancellation of the current polling
+     loop via a local token source ("Get messages has been cancelled using
+     local token source. Continue to get messages with new status.").
+  5. The listener is supposed to create a new polling loop with fresh state, but
+     due to a bug in the V2 flow state machine, the new polling loop is never
+     started. No further GET /message requests are ever issued.
+  The runner process stays alive. OAuth token refreshes continue on schedule (so
+  credentials are not the problem). The runner shows as "Idle" in the GitHub UI.
+  However, it will never pick up another job until the service is manually restarted.
+  The bug was introduced in or around v2.334.0 on Windows. It does not affect:
+  - Linux runners (different socket layer — see broker-server-socket-exception-nat-timeout-linux.yml)
+  - macOS runners (see macos-self-hosted-listener-aad-ghost-busy-stall.yml for a
+    separate macOS stall pattern)
+  - V1 flow runners (useV2Flow: false)
+  - GitHub-hosted runners (not affected by self-hosted listener bugs)
+  Source: actions/runner#4444 (May 2026, open). Reported on Windows Server 2022
+  x64, v2.334.0, V2 flow. Three reproducible occurrences in 22 hours on a
+  previously stable 6+ day continuous runner.
+fix: |
+  Immediate fix — Restart the runner service to recover:
+    Restart-Service actions.runner.*
+  Or via the runner management interface:
+    1. Go to repo/org Settings → Actions → Runners
+    2. Force-remove the stale runner registration
+    3. Re-register and restart
+  Structural workarounds:
+  Option 1 — Switch to ephemeral runners (recommended for most use cases):
+  Ephemeral runners register once, run one job, and exit cleanly. No stale state.
+    ./config.cmd --url https://github.com/ORG/REPO --token TOKEN --ephemeral
+    ./run.cmd
+  Or with Actions Runner Controller (ARC):
+    autoscaling.runnerScaleSetListener.minRunners: 1
+  Option 2 — Revert to V1 broker flow if ephemeral is not an option:
+  The V1 flow (long-polling, non-broker) does not exhibit this specific hang.
+  Edit the .runner config file and set useV2Flow: false, then restart the service.
+  Note: V1 is deprecated and will eventually be removed.
+  Option 3 — Add an automatic service recovery watchdog:
+    # Windows Task Scheduler: check every 5 minutes if runner has been Idle >20min
+    # and restart the service if it's stuck
+    $runner = Get-Service "actions.runner.*"
+    if ($runner.Status -eq "Running") {
+      # Placeholder: implement an API check (idle >20min AND jobs queued) before this line; an unconditional restart kills in-progress jobs
+      Restart-Service "actions.runner.*"
+    }
+  Option 4 — Pin to a known-good runner version:
+  If v2.334.0+ reliably triggers this, pin the runner to v2.333.x by editing
+  the .runner config and disabling auto-update. Note: outdated versions
+  eventually stop being able to receive messages.
+fix_code:
+  - language: yaml
+    label: 'Use ephemeral runners to avoid stale listener state entirely'
+    code: |
+      # In your workflow:
+      jobs:
+        build:
+          runs-on: [self-hosted, windows-x64]   # Labels for your runner pool
+      # Register runners as ephemeral:
+      # ./config.cmd --url https://github.com/ORG/REPO --token TOKEN --ephemeral
+      # Each runner exits after completing one job; a process manager (NSSM, task
+      # scheduler, or ARC) restarts it to accept the next job.
+  - language: yaml
+    label: 'Add watchdog step to detect stale listener symptom (queue depth check)'
+    code: |
+      # Optional diagnostic: surface "no runners picked up job for >N minutes" via API
+      # Run this in a separate monitoring workflow:
+      jobs:
+        watchdog:
+          runs-on: ubuntu-latest
+          steps:
+            - name: Check for stuck self-hosted Windows runners
+              env:
+                GH_TOKEN: ${{ secrets.RUNNER_ADMIN_PAT }}
+              run: |
+                # List queued jobs older than 10 minutes that are assigned to self-hosted
+                gh api repos/${{ github.repository }}/actions/runs \
+                  --jq '.workflow_runs[] | select(.status=="queued") | .id' \
+                | while read run_id; do
+                    echo "Queued run: $run_id"
+                  done
+prevention:
+  - 'Use ephemeral self-hosted runners — they register, run one job, and exit. No stale listener state can accumulate.'
+  - 'If using long-lived runners on Windows with V2 broker flow, add monitoring to detect runners stuck in the Idle state with queued jobs.'
+  - 'Set up automatic service recovery for the runner service on Windows (e.g., via Windows Service recovery actions: restart after 1st failure).'
+  - 'Monitor actions/runner release notes for a fix to the V2 listener polling regression introduced around v2.334.0.'
+docs:
+  - url: 'https://github.com/actions/runner/issues/4444'
+    label: 'actions/runner#4444 — Listener stops polling broker after first job''s Busy→Online transition (2.334.0, Windows, V2 flow)'
+  - url: 'https://docs.github.com/en/actions/hosting-your-own-runners/managing-self-hosted-runners/running-scripts-before-or-after-a-job'
+    label: 'GitHub Docs — Running scripts before or after a job (self-hosted runners)'
+  - url: 'https://docs.github.com/en/actions/hosting-your-own-runners/managing-self-hosted-runners/autoscaling-with-self-hosted-runners'
+    label: 'GitHub Docs — Autoscaling with self-hosted runners (ephemeral runner pattern)'

package/errors/runner-environment/runner-environment-223.yml ADDED Viewed

@@ -0,0 +1,149 @@
+id: runner-environment-223
+title: 'macOS-15 Arm64 brew update Fails with Stale lockf Lock When Run Twice'
+category: runner-environment
+severity: error
+tags:
+  - macos
+  - homebrew
+  - brew-update
+  - lockf
+  - arm64
+  - macos-15
+  - regression
+  - concurrent
+patterns:
+  - regex: 'lockf: 200: already locked'
+    flags: 'i'
+  - regex: 'Error: Another `brew update` process is already running\.'
+    flags: 'i'
+  - regex: 'lockf:.*already locked\s*\nError: Another.*brew update.*process is already running'
+    flags: 'im'
+error_messages:
+  - "lockf: 200: already locked"
+  - "Error: Another `brew update` process is already running."
+  - "Please wait for it to finish or terminate it to continue."
+  - "Error: Process completed with exit code 1."
+root_cause: |
+  On macOS-15 Arm64 GitHub Actions hosted runners starting with image version
+  20260422.526 (released ~April 22, 2026), running `brew update` more than once
+  within the same workflow — or across two steps that both call `brew update` —
+  fails on the second invocation with a stale lockf lock error.
+  Homebrew guards updates with a lock file in `/opt/homebrew/var/homebrew/locks/`,
+  opened on file descriptor 200 and locked with `lockf`. In the affected image versions,
+  the first `brew update` completes successfully but leaves the lock in a
+  state that subsequent `brew update` calls cannot acquire. `lockf` then prints
+  "200: already locked" (200 is the descriptor number, not an errno), which Homebrew surfaces as
+  "Another brew update process is already running."
+  This is a regression — the same workflow step pattern worked correctly on image
+  version 20260415.520 and earlier.
+  Common trigger patterns:
+  1. Explicit double-update in a single step: `brew update && brew update`
+  2. Two separate steps that each call `brew update` before installing different tools
+  3. Parallel jobs on the same runner image that both run `brew update` (less common
+     since each hosted runner job gets a fresh VM, but affects matrix jobs in the
+     same workflow when they share a Homebrew setup step via the action cache)
+  4. A step script that calls `brew update` internally AND the user also calls it
+  Note: This affects macOS-15 Arm64 specifically. macOS-14, macOS-26, and x86_64
+  variants were NOT marked as affected in the original bug report (runner-images#13965).
+  Source: actions/runner-images#13965 (April 2026, open, under investigation by
+  GitHub runner-images team). Reported with reproducible case from the
+  mullvad/mullvadvpn-app CI pipeline.
+fix: |
+  Option 1 — Run brew update only once per job (preferred):
+  Consolidate all your brew installations into a single step and call brew update
+  exactly once before them:
+    - name: Install dependencies
+      run: |
+        brew update
+        brew install cmake ninja pkg-config
+  Option 2 — Use HOMEBREW_NO_AUTO_UPDATE=1 on steps that don't need fresh formulae:
+  If you only need brew update for specific steps, set the env var on all other
+  brew-using steps to prevent automatic update attempts:
+    - name: Install specific tool
+      env:
+        HOMEBREW_NO_AUTO_UPDATE: '1'
+      run: brew install your-tool   # Skips the implicit brew update
+  Option 3 — Make the second brew update non-fatal so a held lock does not fail the job:
+    - name: Safe brew update
+      run: |
+        # flock is not installed on macOS runners by default, so tolerate the failure instead
+        brew update || echo "::warning::brew update skipped (lock already held)"
+  Option 4 — Use brew upgrade instead of repeated brew update:
+  If you need the latest formula versions, run brew update once and then
+  use brew upgrade to update installed packages:
+    - name: Update and upgrade Homebrew
+      run: |
+        brew update        # Run exactly once
+        brew upgrade       # Upgrades installed formulae to latest
+  Option 5 — Check for the regression in your image version and pin:
+  If you need to pin to a known-good image version while the fix is pending,
+  see GitHub's runner-images documentation for image version pinning options
+  (note: pinning is not officially supported for GitHub-hosted standard runners).
+fix_code:
+  - language: yaml
+    label: 'Broken — two brew update calls in same workflow (second fails on affected image)'
+    code: |
+      # This fails on macOS-15 Arm64 image 20260422.526+ with lockf: 200: already locked:
+      steps:
+        - name: Install build tools
+          run: |
+            brew update
+            brew install cmake ninja
+        - name: Install test tools
+          run: |
+            brew update        # FAILS: Another brew update process is already running
+            brew install lcov
+  - language: yaml
+    label: 'Fixed — single brew update before consolidated installs'
+    code: |
+      # Consolidate into one brew update call at the start:
+      steps:
+        - name: Install all Homebrew tools
+          run: |
+            brew update   # Only call once per job
+            brew install cmake ninja lcov
+  - language: yaml
+    label: 'Fixed — use HOMEBREW_NO_AUTO_UPDATE=1 on subsequent brew steps'
+    code: |
+      # Or prevent auto-update on steps after the first:
+      steps:
+        - name: Install build tools
+          run: |
+            brew update
+            brew install cmake ninja
+        - name: Install test tools (no re-update needed)
+          env:
+            HOMEBREW_NO_AUTO_UPDATE: '1'
+          run: brew install lcov   # Uses existing formula cache; no brew update call
+prevention:
+  - 'Call brew update at most once per job. Consolidate all Homebrew installations into a single step with one brew update at the top.'
+  - 'Set HOMEBREW_NO_AUTO_UPDATE=1 as a job-level env var and call brew update explicitly only in the one step that needs it.'
+  - 'Pin to macOS-14 (macos-14-xlarge) or use macOS-26 (which has different Homebrew behavior) if the regression is blocking critical workflows while runner-images#13965 is open.'
+  - 'Check your CI logs for the "lockf: 200: already locked" error if macOS-15 Arm64 workflows started failing around late April 2026 — this regression is the likely cause.'
+docs:
+  - url: 'https://github.com/actions/runner-images/issues/13965'
+    label: 'actions/runner-images#13965 — Running brew update twice in one workflow breaks (open, April 2026)'
+  - url: 'https://docs.brew.sh/Manpage#environment'
+    label: 'Homebrew docs — HOMEBREW_NO_AUTO_UPDATE environment variable'
+  - url: 'https://github.com/mullvad/mullvadvpn-app/actions/runs/24890005834'
+    label: 'mullvad/mullvadvpn-app — Example failing run (regression confirmed between image 20260415 and 20260422)'

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@htekdev/actions-debugger",
-  "version": "1.0.122",
+  "version": "1.0.123",
   "description": "65+ real GitHub Actions errors, queryable by agents. CLI + MCP server + Copilot skills + error database.",
   "type": "module",
   "main": "./dist/index.js",