ctrlrelay 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,319 @@
1
+ """GitHub Issue Poller for ctrlrelay."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ import json
7
+ from dataclasses import dataclass, field
8
+ from datetime import datetime, timezone
9
+ from pathlib import Path
10
+ from typing import Any, Awaitable, Callable
11
+
12
+ from ctrlrelay.core.github import GitHubCLI, GitHubError
13
+ from ctrlrelay.core.obs import get_logger, log_event
14
+
15
# Module-level logger used by every log call in this module.
_logger = get_logger("core.poller")

# Exceptions that are transient and should skip the current repo/iteration
# rather than tear the whole poll loop down. asyncio.CancelledError is
# deliberately excluded so a shutdown signal still propagates.
#
# GitHubError is included because we can't distinguish transient (rate
# limit, 5xx, network) from permanent (bad repo name, expired auth, 404)
# without fragile error-message parsing — classifying both as skip avoids
# crashes. A persistent-failure counter (threshold defined just below)
# makes permanent misconfiguration visible even though it's technically
# skipped here.
_TRANSIENT_POLL_ERRORS = (TimeoutError, GitHubError, OSError)

# After this many consecutive per-repo failures, escalate log level to
# WARNING so a persistent misconfiguration (expired auth, renamed repo,
# revoked access) stops hiding behind routine "transient" skip logs.
_REPO_FAILURE_WARN_THRESHOLD = 3
32
+
33
+
34
@dataclass
class IssuePoller:
    """Polls GitHub repos for newly assigned issues.

    Maintains a set of seen issue numbers per repo so that only genuinely new
    issues are surfaced on each call to ``poll()``. The seen-set is loaded
    from ``state_file`` when the instance is constructed (replacing any
    ``seen_issues`` passed in, if the file exists) and written back whenever
    it changes.
    """

    github: GitHubCLI
    username: str
    repos: list[str]
    state_file: Path
    seen_issues: dict[str, set[int]] = field(default_factory=dict)
    # Per-repo consecutive-skip counter; populated at runtime by poll() /
    # seed_current(). Not persisted — intentionally resets on daemon
    # restart so an operator fix is exercised before we re-escalate.
    _repo_failure_counts: dict[str, int] = field(default_factory=dict, repr=False)

    def __post_init__(self) -> None:
        self._load_state()

    # ------------------------------------------------------------------
    # State persistence
    # ------------------------------------------------------------------

    def _load_state(self) -> None:
        """Load seen issues from the JSON state file (if it exists).

        A missing file leaves ``seen_issues`` untouched. A file that cannot
        be read, is not valid JSON, or is valid JSON of the wrong shape
        (top-level non-object, ``null`` issue list, non-numeric entries)
        resets ``seen_issues`` to empty instead of raising: this runs from
        ``__post_init__``, so an exception here would make the poller
        impossible to construct until someone deletes the file by hand.
        """
        if not self.state_file.exists():
            return
        try:
            data = json.loads(self.state_file.read_text())
            raw = data.get("seen_issues", {})
            # Coerce to int so the stored numbers compare equal to the
            # ``int(issue["number"])`` values poll() checks against.
            self.seen_issues = {
                repo: {int(number) for number in numbers}
                for repo, numbers in raw.items()
            }
        except (ValueError, TypeError, AttributeError, OSError):
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError;
            # TypeError/AttributeError cover wrongly-shaped payloads.
            # Corrupt or unreadable state — start fresh.
            self.seen_issues = {}

    def _save_state(self) -> None:
        """Persist seen issues and a ``last_poll`` timestamp to the state file.

        Creates the parent directory if needed.

        Raises:
            OSError: if the directory or file cannot be written.
        """
        data = {
            "seen_issues": {
                repo: sorted(numbers) for repo, numbers in self.seen_issues.items()
            },
            "last_poll": datetime.now(timezone.utc).isoformat(),
        }
        self.state_file.parent.mkdir(parents=True, exist_ok=True)
        self.state_file.write_text(json.dumps(data, indent=2))

    def _save_state_best_effort(self) -> None:
        """Try to persist state; log and continue on disk errors.

        Callers MUST NOT let a _save_state failure propagate out of poll() —
        doing so would drop the new-issues list on the floor while the
        in-memory seen_issues set has already been mutated, silently
        abandoning the work until the daemon restarts.
        """
        try:
            self._save_state()
        except OSError as e:
            log_event(
                _logger,
                "poll.save_state.failed",
                reason=type(e).__name__,
                error=str(e)[:200],
                state_file=str(self.state_file),
            )

    def _record_repo_failure(
        self,
        repo: str,
        exc: Exception,
        *,
        phase: str = "poll",
    ) -> None:
        """Bump the consecutive-failure counter for ``repo`` and log the skip.

        Once the counter reaches ``_REPO_FAILURE_WARN_THRESHOLD`` the event
        is emitted at WARNING level with ``persistent=True``. ``phase``
        distinguishes poll-time vs seed-time skips in the event payload.
        """
        count = self._repo_failure_counts.get(repo, 0) + 1
        self._repo_failure_counts[repo] = count
        fields = {
            "repo": repo,
            "reason": type(exc).__name__,
            "error": str(exc)[:200],
            "consecutive_failures": count,
            "phase": phase,
        }
        if count >= _REPO_FAILURE_WARN_THRESHOLD:
            fields["persistent"] = True
            _logger.warning("poll.repo.skipped", extra=fields)
        else:
            log_event(_logger, "poll.repo.skipped", **fields)

    def _clear_repo_failure(self, repo: str) -> None:
        """Reset the failure counter after a successful repo lookup."""
        self._repo_failure_counts.pop(repo, None)

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    async def poll(self) -> list[dict[str, Any]]:
        """Poll all configured repos for new issues assigned to ``self.username``.

        Returns:
            A list of ``{"repo": str, "issue": dict}`` entries for issues that
            have not been seen before. Updates ``seen_issues`` and persists
            state to disk (best-effort).

        Per-repo resilience: a failure on one repo (network timeout, ``gh``
        exit, OS error, or any other unexpected exception) is logged and
        skipped so the other repos still get polled. Only
        ``asyncio.CancelledError`` escapes, which allows a clean shutdown
        signal to propagate.
        """
        new_issues: list[dict[str, Any]] = []

        for repo in self.repos:
            try:
                issues = await self.github.list_assigned_issues(
                    repo, assignee=self.username
                )
            except asyncio.CancelledError:
                raise
            except Exception as e:
                # Transient-ish (TimeoutError/GitHubError/OSError) goes through
                # the failure counter so persistent misconfig escalates; any
                # other unexpected exception is logged as a skip too so the
                # surrounding repos still get processed AND new_issues from
                # prior repos reaches the caller. Without this catch, a later
                # repo exploding would leave earlier repos' seen_issues
                # mutated but their new_issues list unreturned.
                if isinstance(e, _TRANSIENT_POLL_ERRORS):
                    self._record_repo_failure(repo, e, phase="poll")
                else:
                    log_event(
                        _logger,
                        "poll.repo.unexpected_error",
                        repo=repo,
                        reason=type(e).__name__,
                        error=str(e)[:200],
                        phase="poll",
                    )
                continue

            # Successful lookup — clear any accumulated failure count.
            self._clear_repo_failure(repo)

            seen_for_repo = self.seen_issues.setdefault(repo, set())
            for issue in issues:
                # Per-issue guard so ONE malformed payload (missing 'number',
                # wrong type, non-dict entry) doesn't poison the remaining
                # good issues in the same repo's batch.
                try:
                    number = int(issue["number"])
                except asyncio.CancelledError:
                    raise
                except Exception as e:
                    log_event(
                        _logger,
                        "poll.issue.malformed",
                        repo=repo,
                        reason=type(e).__name__,
                        error=str(e)[:200],
                    )
                    continue
                if number not in seen_for_repo:
                    new_issues.append({"repo": repo, "issue": issue})
                    seen_for_repo.add(number)

        # Never propagate a save_state disk failure out of poll() — the
        # caller has work to do with new_issues. Log and move on.
        self._save_state_best_effort()
        return new_issues

    def mark_seen(self, repo: str, issue_number: int) -> None:
        """Mark an issue as seen without triggering a poll.

        Useful for pre-seeding state from external sources (e.g. resuming
        after a crash where work was already started).

        Raises:
            OSError: if the state file cannot be written. Unlike
                ``unmark_seen``, the save here is not best-effort.
        """
        self.seen_issues.setdefault(repo, set()).add(issue_number)
        self._save_state()

    def unmark_seen(self, repo: str, issue_number: int) -> None:
        """Remove an issue from the seen-set so the next poll picks it up again.

        Use this when a handler failed for a transient reason that retrying
        would fix — the canonical case is a per-repo lock conflict with a
        concurrent secops sweep. Without this, the issue would be silently
        dropped forever because ``poll()`` marks issues seen **before**
        handing them to the handler, so a single handler failure is fatal by
        default. Disk-save is best-effort; a failed save is logged but never
        propagates. A no-op when the issue was not marked seen.
        """
        seen = self.seen_issues.get(repo)
        if seen and issue_number in seen:
            seen.discard(issue_number)
            self._save_state_best_effort()

    async def seed_current(self) -> None:
        """Seed seen_issues with all currently assigned issues.

        Call this on first startup to avoid treating existing assignments
        as new. Only issues assigned AFTER this seed will trigger handlers.

        Failure mode: if a per-repo lookup fails transiently, the seed skips
        that repo and logs ``poll.repo.skipped``. The consequence is that on
        next poll, any currently-assigned issues on the skipped repo will be
        treated as new and picked up — that's safer than crashing first-run.

        Issue numbers are coerced with ``int()`` exactly as ``poll()`` does,
        and a malformed entry is logged as ``poll.issue.malformed`` and
        skipped. Storing the raw value would let a non-int number slip past
        ``poll()``'s membership check (re-triggering the handler) and could
        make the sorted state dump fail on mixed types.
        """
        for repo in self.repos:
            try:
                issues = await self.github.list_assigned_issues(
                    repo, assignee=self.username
                )
            except asyncio.CancelledError:
                raise
            except _TRANSIENT_POLL_ERRORS as e:
                self._record_repo_failure(repo, e, phase="seed")
                continue
            self._clear_repo_failure(repo)
            seen_for_repo = self.seen_issues.setdefault(repo, set())
            for issue in issues:
                # Same broad per-issue guard as poll(): one bad payload must
                # not abort seeding for the remaining issues and repos.
                try:
                    seen_for_repo.add(int(issue["number"]))
                except Exception as e:
                    log_event(
                        _logger,
                        "poll.issue.malformed",
                        repo=repo,
                        reason=type(e).__name__,
                        error=str(e)[:200],
                        phase="seed",
                    )
        self._save_state_best_effort()
257
+
258
+
259
async def run_poll_loop(
    poller: IssuePoller,
    handler: Callable[[str, dict[str, Any]], Awaitable[None]],
    interval: int = 300,
    max_iterations: int | None = None,
) -> None:
    """Drive ``poller`` forever (or for ``max_iterations`` cycles).

    Args:
        poller: IssuePoller instance whose ``poll()`` yields new issues.
        handler: Async callable invoked as ``handler(repo, issue)`` for each
            new issue.
        interval: Seconds to sleep between cycles. No sleep happens after
            the final cycle.
        max_iterations: Number of cycles to run; ``None`` means run until
            cancelled.

    Resilience: any non-cancellation exception from ``poll()`` is logged as
    ``poll.iteration.failed`` and the cycle proceeds with an empty batch;
    any non-cancellation exception from a handler is logged as
    ``poll.handler.failed`` and the remaining issues in the batch are still
    dispatched. ``asyncio.CancelledError`` is always re-raised.
    """
    iterations = 0
    while True:
        if max_iterations is not None and iterations >= max_iterations:
            return

        # poll() is guarded on its own so that a failing poll degrades to
        # "nothing new this cycle" rather than ending the loop.
        try:
            batch = await poller.poll()
        except asyncio.CancelledError:
            raise
        except Exception as poll_exc:
            log_event(
                _logger,
                "poll.iteration.failed",
                iteration=iterations,
                phase="poll",
                reason=type(poll_exc).__name__,
                error=str(poll_exc)[:200],
            )
            batch = []

        # One try per handler call: the poller has already recorded these
        # issues as seen, so skipping the rest of the batch after a single
        # failure would lose them.
        for entry in batch:
            try:
                await handler(entry["repo"], entry["issue"])
            except asyncio.CancelledError:
                raise
            except Exception as handler_exc:
                payload = entry.get("issue") or {}
                log_event(
                    _logger,
                    "poll.handler.failed",
                    repo=entry.get("repo"),
                    issue_number=payload.get("number"),
                    reason=type(handler_exc).__name__,
                    error=str(handler_exc)[:200],
                )

        iterations += 1
        if max_iterations is not None and iterations >= max_iterations:
            return
        await asyncio.sleep(interval)
@@ -0,0 +1,177 @@
1
+ """PR verification: wait for CI and confirm mergeability before hand-off."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ from dataclasses import dataclass, field
7
+ from typing import Any
8
+
9
+ from ctrlrelay.core.github import GitHubCLI
10
+
11
# `gh pr checks --json bucket` returns one of: pass, fail, pending, skipping, cancel.
# Treat skipping as pass (skipped jobs don't block a merge) and everything except
# pending as "terminal".
_PENDING_BUCKETS = frozenset({"pending"})
_PASSING_BUCKETS = frozenset({"pass", "skipping"})
# Values of the PR ``mergeable`` field that end the mergeability polling in
# ``PRVerifier.verify``; any other value causes another poll attempt.
_TERMINAL_MERGEABLE_VALUES = frozenset({"MERGEABLE", "CONFLICTING"})

# The dev pipeline's contract is "open a PR and hand it to humans for review",
# not "merge the PR". So the verifier does NOT require the PR be in a directly-
# mergeable state — human-gated states (awaiting review, unresolved comments,
# pending deployments, merge-queue requirements) are all expected terminal
# states for us. What we DO reject is state that (a) indicates broken code
# (failing checks) or (b) the orchestrator can itself fix before hand-off
# (conflicts, behind base). All other mergeStateStatus values are accepted.
_REBASE_REQUIRED_MERGE_STATE_STATUS = frozenset({"BEHIND"})

# Require 2 consecutive empty check-list responses separated by a poll interval
# before concluding "no CI configured". GitHub registers check runs a few
# seconds after `gh pr create`, so a single-shot empty read is unreliable.
_EMPTY_CHECKS_CONFIRM_POLLS = 2
31
+
32
+
33
+ @dataclass
34
+ class VerificationResult:
35
+ """Outcome of verifying a PR is ready for hand-off."""
36
+
37
+ ready: bool
38
+ reason: str = ""
39
+ failing_checks: list[dict[str, Any]] = field(default_factory=list)
40
+ pending_checks: list[dict[str, Any]] = field(default_factory=list)
41
+ # Set True when wait_for_checks returned with pending entries (timeout hit
42
+ # while CI was still running). Callers use this to distinguish "needs a
43
+ # fix" from "just slow CI" and avoid burning retry budget on the latter.
44
+ timed_out: bool = False
45
+ mergeable: str | None = None
46
+ merge_state_status: str | None = None
47
+
48
+
49
+ @dataclass
50
+ class PRVerifier:
51
+ """Verifies a PR is green and conflict-free before declaring a dev task done."""
52
+
53
+ github: GitHubCLI
54
+ poll_interval: int = 30
55
+ check_timeout: int = 1800
56
+ mergeable_poll_attempts: int = 10
57
+
58
+ async def wait_for_checks(
59
+ self,
60
+ repo: str,
61
+ pr_number: int,
62
+ timeout: int | None = None,
63
+ ) -> list[dict[str, Any]]:
64
+ """Poll PR checks until every check has left the 'pending' bucket or
65
+ the timeout is reached.
66
+
67
+ Empty-check handling: GitHub registers check runs asynchronously after
68
+ `gh pr create` so a single empty read is ambiguous — the repo might
69
+ have no CI, or CI just hasn't registered yet. We require
70
+ `_EMPTY_CHECKS_CONFIRM_POLLS` consecutive empty reads separated by
71
+ `poll_interval` before concluding "no CI configured"."""
72
+ limit = self.check_timeout if timeout is None else timeout
73
+ elapsed = 0
74
+ empty_streak = 0
75
+ checks: list[dict[str, Any]] = []
76
+ while True:
77
+ checks = await self.github.get_pr_checks(repo, pr_number)
78
+ if not checks:
79
+ empty_streak += 1
80
+ if empty_streak >= _EMPTY_CHECKS_CONFIRM_POLLS:
81
+ return checks
82
+ else:
83
+ empty_streak = 0
84
+ if all(c.get("bucket") not in _PENDING_BUCKETS for c in checks):
85
+ return checks
86
+ if elapsed >= limit:
87
+ return checks
88
+ await asyncio.sleep(self.poll_interval)
89
+ elapsed += self.poll_interval
90
+
91
+ async def verify(
92
+ self,
93
+ repo: str,
94
+ pr_number: int,
95
+ timeout: int | None = None,
96
+ ) -> VerificationResult:
97
+ """Wait for CI, then check mergeability. Report ready only when both are green."""
98
+ checks = await self.wait_for_checks(repo, pr_number, timeout=timeout)
99
+ pending = [c for c in checks if c.get("bucket") in _PENDING_BUCKETS]
100
+ failing = [
101
+ c for c in checks
102
+ if c.get("bucket") not in _PENDING_BUCKETS
103
+ and c.get("bucket") not in _PASSING_BUCKETS
104
+ ]
105
+ # Failing checks take priority over pending. A matrix where lint
106
+ # already failed but a long integration run is still pending must be
107
+ # reported as broken, not timed out — otherwise the caller would hand
108
+ # off a known-bad PR.
109
+ if failing:
110
+ names = ", ".join(c.get("name", "?") for c in failing)
111
+ return VerificationResult(
112
+ ready=False,
113
+ reason=f"{len(failing)} check(s) failing: {names}",
114
+ failing_checks=failing,
115
+ pending_checks=pending,
116
+ )
117
+ if pending:
118
+ # All failing paths ruled out; we simply hit the timeout while
119
+ # everything still in flight was healthy. Don't ask Claude to
120
+ # "fix" slow CI — surface it as a distinct outcome so the caller
121
+ # hands off the PR as-is.
122
+ names = ", ".join(c.get("name", "?") for c in pending)
123
+ return VerificationResult(
124
+ ready=False,
125
+ timed_out=True,
126
+ reason=(
127
+ f"CI still running after timeout: {len(pending)} "
128
+ f"check(s) pending ({names})"
129
+ ),
130
+ pending_checks=pending,
131
+ )
132
+
133
+ mergeable: str | None = None
134
+ merge_state: str | None = None
135
+ for _ in range(self.mergeable_poll_attempts):
136
+ state = await self.github.get_pr_state(repo, pr_number)
137
+ mergeable = state.get("mergeable")
138
+ merge_state = state.get("mergeStateStatus")
139
+ if mergeable in _TERMINAL_MERGEABLE_VALUES:
140
+ break
141
+ await asyncio.sleep(self.poll_interval)
142
+
143
+ if mergeable == "CONFLICTING":
144
+ return VerificationResult(
145
+ ready=False,
146
+ reason="PR has merge conflicts with the base branch",
147
+ mergeable=mergeable,
148
+ merge_state_status=merge_state,
149
+ )
150
+ if mergeable != "MERGEABLE":
151
+ return VerificationResult(
152
+ ready=False,
153
+ reason=f"PR mergeable state unresolved: {mergeable}",
154
+ mergeable=mergeable,
155
+ merge_state_status=merge_state,
156
+ )
157
+ if merge_state in _REBASE_REQUIRED_MERGE_STATE_STATUS:
158
+ return VerificationResult(
159
+ ready=False,
160
+ reason=(
161
+ "PR is behind the base branch and must be rebased before "
162
+ "merge (mergeStateStatus=BEHIND)"
163
+ ),
164
+ mergeable=mergeable,
165
+ merge_state_status=merge_state,
166
+ )
167
+
168
+ # Any remaining state (CLEAN, HAS_HOOKS, BLOCKED, UNSTABLE, DRAFT,
169
+ # etc.) is accepted. CI is verified green above, conflicts and
170
+ # behind-base are handled explicitly, so what's left is either
171
+ # directly mergeable or human-gated — both are valid hand-off states
172
+ # for a pipeline that never auto-merges.
173
+ return VerificationResult(
174
+ ready=True,
175
+ mergeable=mergeable,
176
+ merge_state_status=merge_state,
177
+ )
@@ -0,0 +1,121 @@
1
+ """PR merge watcher for monitoring PR state."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import asyncio
6
+ from dataclasses import dataclass
7
+ from typing import Awaitable, Callable
8
+
9
+ from ctrlrelay.core.github import GitHubCLI, GitHubError
10
+ from ctrlrelay.core.obs import get_logger, log_event
11
+
12
+ _logger = get_logger("core.pr_watcher")
13
+
14
# After this many CONSECUTIVE transient failures we give up on the watch.
# gh can raise "transient-looking" errors (GitHubError, TimeoutError,
# OSError) for permanent problems too — bad repo, expired auth,
# permission change, missing gh binary. Without a cap, those would
# silently loop for the whole watch timeout (``wait_for_merge`` defaults
# to 24h; callers may pass a longer, multi-day window) and never surface
# the problem.
#
# Sizing: at the default 60s poll interval, 60 consecutive failures
# covers a ~1-hour outage window. That's enough slack for routine VPN
# flaps, GitHub incidents, or auth token rotations without abandoning
# the watch, while still bounding truly permanent failures (deleted
# repo, revoked credentials) to about an hour instead of the full watch
# timeout. A successful poll resets the counter, so genuinely
# intermittent failures never accumulate.
_TRANSIENT_FAILURE_CAP = 60
29
+
30
+
31
@dataclass
class PRWatcher:
    """Watches PRs for merge events."""

    github: GitHubCLI
    # Seconds slept between merge checks in wait_for_merge.
    poll_interval: int = 60

    async def check_merged(self, repo: str, pr_number: int) -> bool:
        """Check if a PR has been merged.

        Args:
            repo: Repository name (owner/repo)
            pr_number: PR number

        Returns:
            True if the PR state reported by ``github.get_pr_state`` is
            ``"MERGED"``, False otherwise.
        """
        pr_state = await self.github.get_pr_state(repo, pr_number)
        return pr_state.get("state") == "MERGED"

    async def wait_for_merge(
        self,
        repo: str,
        pr_number: int,
        timeout: int = 86400,
        on_poll: Callable[[], Awaitable[None]] | None = None,
    ) -> bool:
        """Wait for a PR to be merged.

        Args:
            repo: Repository name
            pr_number: PR number
            timeout: Max seconds to wait (default 24h). Only time spent
                sleeping between polls is counted. A timeout of zero or
                less returns False without polling.
            on_poll: Optional callback awaited after each poll that did not
                find the PR merged (including polls that failed
                transiently). Its failures are logged as
                ``pr_watch.on_poll_failed`` and never abort the watch.

        Returns:
            True if merged within timeout, False otherwise

        Raises:
            GitHubError, TimeoutError, OSError: the most recent error, once
                ``_TRANSIENT_FAILURE_CAP`` consecutive polls have failed.
            asyncio.CancelledError: always propagated so a clean shutdown
                works.

        Transient-failure handling: individual ``gh`` failures
        (``GitHubError``, ``TimeoutError``, network-level ``OSError``)
        during a long watch MUST NOT abort the loop — otherwise a
        single flaky poll cycle permanently stops monitoring the PR.
        Each one is logged as a structured ``pr_watch.transient_error``
        event and polling continues until the consecutive-failure cap.
        """
        elapsed = 0
        consecutive_failures = 0
        while elapsed < timeout:
            try:
                if await self.check_merged(repo, pr_number):
                    return True
                consecutive_failures = 0  # successful poll resets the counter
            except asyncio.CancelledError:
                raise
            except (GitHubError, TimeoutError, OSError) as e:
                consecutive_failures += 1
                log_event(
                    _logger, "pr_watch.transient_error",
                    repo=repo, pr_number=pr_number,
                    reason=type(e).__name__,
                    error=str(e)[:200],
                    elapsed=elapsed,
                    consecutive_failures=consecutive_failures,
                )
                if consecutive_failures >= _TRANSIENT_FAILURE_CAP:
                    # Likely permanent: bad repo, expired auth, 404,
                    # missing gh binary. Fail fast instead of zombie-
                    # sleeping for the rest of the timeout.
                    log_event(
                        _logger, "pr_watch.abandoned_after_too_many_errors",
                        repo=repo, pr_number=pr_number,
                        consecutive_failures=consecutive_failures,
                        last_reason=type(e).__name__,
                        last_error=str(e)[:200],
                    )
                    raise
                # Fall through to the sleep + retry.

            if on_poll:
                try:
                    await on_poll()
                except asyncio.CancelledError:
                    raise
                except Exception as cb_exc:
                    # on_poll is best-effort diagnostic plumbing: it must
                    # never stop the watch, but a broken callback should
                    # still be visible in the logs rather than vanish.
                    log_event(
                        _logger, "pr_watch.on_poll_failed",
                        repo=repo, pr_number=pr_number,
                        reason=type(cb_exc).__name__,
                        error=str(cb_exc)[:200],
                    )

            await asyncio.sleep(self.poll_interval)
            elapsed += self.poll_interval

        return False