PyPI - ctrlrelay - Versions diffs - 0.1.5__py3-none-any.whl - Mend

ctrlrelay 0.1.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (36) hide show

ctrlrelay/__init__.py +8 -0
ctrlrelay/bridge/__init__.py +21 -0
ctrlrelay/bridge/__main__.py +69 -0
ctrlrelay/bridge/protocol.py +75 -0
ctrlrelay/bridge/server.py +285 -0
ctrlrelay/bridge/telegram_handler.py +117 -0
ctrlrelay/cli.py +1449 -0
ctrlrelay/core/__init__.py +54 -0
ctrlrelay/core/audit.py +257 -0
ctrlrelay/core/checkpoint.py +155 -0
ctrlrelay/core/config.py +291 -0
ctrlrelay/core/dispatcher.py +202 -0
ctrlrelay/core/github.py +272 -0
ctrlrelay/core/obs.py +118 -0
ctrlrelay/core/poller.py +319 -0
ctrlrelay/core/pr_verifier.py +177 -0
ctrlrelay/core/pr_watcher.py +121 -0
ctrlrelay/core/scheduler.py +337 -0
ctrlrelay/core/state.py +167 -0
ctrlrelay/core/worktree.py +673 -0
ctrlrelay/dashboard/__init__.py +5 -0
ctrlrelay/dashboard/client.py +159 -0
ctrlrelay/pipelines/__init__.py +15 -0
ctrlrelay/pipelines/base.py +50 -0
ctrlrelay/pipelines/dev.py +562 -0
ctrlrelay/pipelines/post_merge.py +279 -0
ctrlrelay/pipelines/secops.py +379 -0
ctrlrelay/transports/__init__.py +33 -0
ctrlrelay/transports/base.py +47 -0
ctrlrelay/transports/file_mock.py +94 -0
ctrlrelay/transports/socket_client.py +180 -0
ctrlrelay-0.1.5.dist-info/METADATA +251 -0
ctrlrelay-0.1.5.dist-info/RECORD +36 -0
ctrlrelay-0.1.5.dist-info/WHEEL +4 -0
ctrlrelay-0.1.5.dist-info/entry_points.txt +2 -0
ctrlrelay-0.1.5.dist-info/licenses/LICENSE +201 -0

ctrlrelay/core/poller.py ADDED Viewed

@@ -0,0 +1,319 @@
+"""GitHub Issue Poller for ctrlrelay."""
+from __future__ import annotations
+import asyncio
+import json
+from dataclasses import dataclass, field
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any, Awaitable, Callable
+from ctrlrelay.core.github import GitHubCLI, GitHubError
+from ctrlrelay.core.obs import get_logger, log_event
+# Module-level logger used by IssuePoller and run_poll_loop for every
+# structured ``poll.*`` event emitted from this module.
+_logger = get_logger("core.poller")
+# Exceptions that are transient and should skip the current repo/iteration
+# rather than tear the whole poll loop down. asyncio.CancelledError is
+# deliberately excluded so a shutdown signal still propagates.
+#
+# GitHubError is included because we can't distinguish transient (rate
+# limit, 5xx, network) from permanent (bad repo name, expired auth, 404)
+# without fragile error-message parsing — classifying both as skip avoids
+# crashes. A persistent-failure counter (see below) makes permanent
+# misconfiguration visible even though it's technically skipped here.
+_TRANSIENT_POLL_ERRORS = (TimeoutError, GitHubError, OSError)
+# After this many consecutive per-repo failures, escalate log level to
+# WARNING so a persistent misconfiguration (expired auth, renamed repo,
+# revoked access) stops hiding behind routine "transient" skip logs.
+# The comparison is ``>=``, so the third consecutive failure is the first
+# one logged at WARNING.
+_REPO_FAILURE_WARN_THRESHOLD = 3
+@dataclass
+class IssuePoller:
+    """Polls GitHub repos for newly assigned issues.
+    Maintains a set of seen issue numbers per repo so that only genuinely new
+    issues are surfaced on each call to ``poll()``.
+    Construction has a side effect: ``__post_init__`` immediately tries to
+    load a previously persisted seen-set from ``state_file``.
+    """
+    # Client whose ``list_assigned_issues`` is awaited for every repo.
+    github: GitHubCLI
+    # Assignee passed to every ``list_assigned_issues`` call.
+    username: str
+    # Repositories to poll; visited in list order.
+    repos: list[str]
+    # JSON file holding ``seen_issues`` plus a ``last_poll`` timestamp.
+    state_file: Path
+    # Repo -> issue numbers already surfaced. If the state file exists, the
+    # value passed here is replaced by whatever the file load produces.
+    seen_issues: dict[str, set[int]] = field(default_factory=dict)
+    # Per-repo consecutive-skip counter; populated at runtime by poll() /
+    # seed_current(). Not persisted — intentionally resets on daemon
+    # restart so an operator fix is exercised before we re-escalate.
+    _repo_failure_counts: dict[str, int] = field(default_factory=dict, repr=False)
+    def __post_init__(self) -> None:
+        """Restore persisted state right after dataclass initialisation."""
+        self._load_state()
+    # ------------------------------------------------------------------
+    # State persistence
+    # ------------------------------------------------------------------
+    def _load_state(self) -> None:
+        """Load seen issues from the JSON state file (if it exists)."""
+        if not self.state_file.exists():
+            return
+        try:
+            data = json.loads(self.state_file.read_text())
+            raw = data.get("seen_issues", {})
+            self.seen_issues = {repo: set(numbers) for repo, numbers in raw.items()}
+        except (json.JSONDecodeError, OSError):
+            # Corrupt or unreadable state — start fresh
+            self.seen_issues = {}
+    def _save_state(self) -> None:
+        """Persist seen issues and a ``last_poll`` timestamp to the state file."""
+        data = {
+            "seen_issues": {
+                repo: sorted(numbers) for repo, numbers in self.seen_issues.items()
+            },
+            "last_poll": datetime.now(timezone.utc).isoformat(),
+        }
+        self.state_file.parent.mkdir(parents=True, exist_ok=True)
+        self.state_file.write_text(json.dumps(data, indent=2))
+    def _save_state_best_effort(self) -> None:
+        """Try to persist state; log and continue on disk errors.
+        Callers MUST NOT let a _save_state failure propagate out of poll() —
+        doing so would drop the new-issues list on the floor while the
+        in-memory seen_issues set has already been mutated, silently
+        abandoning the work until the daemon restarts.
+        """
+        try:
+            self._save_state()
+        except OSError as e:
+            log_event(
+                _logger,
+                "poll.save_state.failed",
+                reason=type(e).__name__,
+                error=str(e)[:200],
+                state_file=str(self.state_file),
+            )
+    def _record_repo_failure(
+        self,
+        repo: str,
+        exc: Exception,
+        *,
+        phase: str = "poll",
+    ) -> None:
+        """Bump the consecutive-failure counter and log with an escalated
+        level once the threshold is reached. ``phase`` distinguishes
+        poll-time vs seed-time skips in the event payload."""
+        count = self._repo_failure_counts.get(repo, 0) + 1
+        self._repo_failure_counts[repo] = count
+        fields = {
+            "repo": repo,
+            "reason": type(exc).__name__,
+            "error": str(exc)[:200],
+            "consecutive_failures": count,
+            "phase": phase,
+        }
+        if count >= _REPO_FAILURE_WARN_THRESHOLD:
+            fields["persistent"] = True
+            _logger.warning("poll.repo.skipped", extra=fields)
+        else:
+            log_event(_logger, "poll.repo.skipped", **fields)
+    def _clear_repo_failure(self, repo: str) -> None:
+        """Reset the failure counter after a successful repo lookup."""
+        self._repo_failure_counts.pop(repo, None)
+    # ------------------------------------------------------------------
+    # Public API
+    # ------------------------------------------------------------------
+    async def poll(self) -> list[dict[str, Any]]:
+        """Poll all configured repos for new issues assigned to ``self.username``.
+        Returns:
+            A list of ``{"repo": str, "issue": dict}`` entries for issues that
+            have not been seen before. Updates ``seen_issues`` and persists
+            state to disk (best-effort: a failed save is logged, not raised).
+        Note that an issue is added to ``seen_issues`` *before* it is
+        returned, so a caller that fails to process an entry must call
+        ``unmark_seen`` if it wants the issue surfaced again.
+        Per-repo resilience: a failure on one repo (network timeout, ``gh``
+        exit, OS error, or any other non-cancellation exception) is logged
+        and skipped so the other repos still get polled. Only
+        ``asyncio.CancelledError`` escapes, which allows a clean shutdown
+        signal to propagate.
+        """
+        new_issues: list[dict[str, Any]] = []
+        for repo in self.repos:
+            try:
+                issues = await self.github.list_assigned_issues(
+                    repo, assignee=self.username
+                )
+            except asyncio.CancelledError:
+                raise
+            except Exception as e:
+                # Transient-ish (TimeoutError/GitHubError/OSError) goes through
+                # the failure counter so persistent misconfig escalates; any
+                # other unexpected exception is logged as a skip too so the
+                # surrounding repos still get processed AND new_issues from
+                # prior repos reaches the caller. Without this catch, a later
+                # repo exploding would leave earlier repos' seen_issues
+                # mutated but their new_issues list unreturned.
+                if isinstance(e, _TRANSIENT_POLL_ERRORS):
+                    self._record_repo_failure(repo, e, phase="poll")
+                else:
+                    # Unexpected errors do not touch the failure counter.
+                    log_event(
+                        _logger,
+                        "poll.repo.unexpected_error",
+                        repo=repo,
+                        reason=type(e).__name__,
+                        error=str(e)[:200],
+                        phase="poll",
+                    )
+                continue
+            # Successful lookup — clear any accumulated failure count.
+            self._clear_repo_failure(repo)
+            seen_for_repo = self.seen_issues.setdefault(repo, set())
+            for issue in issues:
+                # Per-issue guard so ONE malformed payload (missing 'number',
+                # wrong type, non-dict entry) doesn't poison the remaining
+                # good issues in the same repo's batch.
+                try:
+                    number = int(issue["number"])
+                except asyncio.CancelledError:
+                    raise
+                except Exception as e:
+                    log_event(
+                        _logger,
+                        "poll.issue.malformed",
+                        repo=repo,
+                        reason=type(e).__name__,
+                        error=str(e)[:200],
+                    )
+                    continue
+                # Membership is checked on the int-normalised number; the
+                # original issue payload is what gets handed to the caller.
+                if number not in seen_for_repo:
+                    new_issues.append({"repo": repo, "issue": issue})
+                    seen_for_repo.add(number)
+        # Never propagate a save_state disk failure out of poll() — the
+        # caller has work to do with new_issues. Log and move on.
+        self._save_state_best_effort()
+        return new_issues
+    def mark_seen(self, repo: str, issue_number: int) -> None:
+        """Mark an issue as seen without triggering a poll.
+        Useful for pre-seeding state from external sources (e.g. resuming
+        after a crash where work was already started).
+        """
+        self.seen_issues.setdefault(repo, set()).add(issue_number)
+        self._save_state()
+    def unmark_seen(self, repo: str, issue_number: int) -> None:
+        """Remove an issue from the seen-set so the next poll picks it up
+        again. Use this when a handler failed for a transient reason that
+        retrying would fix — the canonical case is a per-repo lock
+        conflict with a concurrent secops sweep. Without this, the
+        issue would be silently dropped forever because
+        ``poll()`` marks issues seen **before** handing them to the
+        handler, so a single handler failure is fatal by default.
+        Disk-save is best-effort; a failed save is logged but never
+        propagates."""
+        seen = self.seen_issues.get(repo)
+        if seen and issue_number in seen:
+            seen.discard(issue_number)
+            self._save_state_best_effort()
+    async def seed_current(self) -> None:
+        """Seed seen_issues with all currently assigned issues.
+        Call this on first startup to avoid treating existing assignments
+        as new. Only issues assigned AFTER this seed will trigger handlers.
+        Failure mode: if a per-repo lookup fails transiently, the seed skips
+        that repo and logs ``poll.repo.skipped``. The consequence is that on
+        next poll, any currently-assigned issues on the skipped repo will be
+        treated as new and picked up — that's safer than crashing first-run.
+        """
+        for repo in self.repos:
+            try:
+                issues = await self.github.list_assigned_issues(
+                    repo, assignee=self.username
+                )
+            except asyncio.CancelledError:
+                raise
+            except _TRANSIENT_POLL_ERRORS as e:
+                self._record_repo_failure(repo, e, phase="seed")
+                continue
+            self._clear_repo_failure(repo)
+            seen_for_repo = self.seen_issues.setdefault(repo, set())
+            for issue in issues:
+                seen_for_repo.add(issue["number"])
+        self._save_state_best_effort()
+async def run_poll_loop(
+    poller: IssuePoller,
+    handler: Callable[[str, dict[str, Any]], Awaitable[None]],
+    interval: int = 300,
+    max_iterations: int | None = None,
+) -> None:
+    """Run the polling loop.
+    Args:
+        poller: IssuePoller instance
+        handler: Async function to call for each new issue (repo, issue)
+        interval: Seconds between polls
+        max_iterations: Max iterations (None = infinite)
+    Iteration resilience: a non-cancellation exception from ``poller.poll()``
+    is logged as ``poll.iteration.failed`` and the iteration proceeds with
+    an empty batch; a non-cancellation exception from a handler call is
+    logged as ``poll.handler.failed`` and the remaining issues in the batch
+    are still dispatched. Either way the loop sleeps and continues. This
+    keeps a single bad cycle (slow network, one flaky handler) from
+    crashing the daemon and forcing a launchd restart.
+    ``asyncio.CancelledError`` is always re-raised. No sleep happens after
+    the final iteration when ``max_iterations`` is set.
+    """
+    iterations = 0
+    while max_iterations is None or iterations < max_iterations:
+        # Guard poll() separately from the handler dispatch: a malformed
+        # poll result shouldn't lose queued work, and a handler failure
+        # shouldn't skip the rest of the batch.
+        try:
+            new_issues = await poller.poll()
+        except asyncio.CancelledError:
+            raise
+        except Exception as e:
+            log_event(
+                _logger,
+                "poll.iteration.failed",
+                iteration=iterations,
+                phase="poll",
+                reason=type(e).__name__,
+                error=str(e)[:200],
+            )
+            new_issues = []
+        # Each handler invocation is isolated. A failure on one issue must
+        # not cancel the remaining already-seen-and-persisted issues — those
+        # would otherwise be silently dropped until daemon restart.
+        for item in new_issues:
+            try:
+                await handler(item["repo"], item["issue"])
+            except asyncio.CancelledError:
+                raise
+            except Exception as e:
+                # Defensive lookups: the log payload must not itself raise
+                # if the item lacks a usable "issue" entry.
+                issue = item.get("issue") or {}
+                log_event(
+                    _logger,
+                    "poll.handler.failed",
+                    repo=item.get("repo"),
+                    issue_number=issue.get("number"),
+                    reason=type(e).__name__,
+                    error=str(e)[:200],
+                )
+        iterations += 1
+        # Skip the trailing sleep once the iteration budget is exhausted.
+        if max_iterations is None or iterations < max_iterations:
+            await asyncio.sleep(interval)

ctrlrelay/core/pr_verifier.py ADDED Viewed

@@ -0,0 +1,177 @@
+"""PR verification: wait for CI and confirm mergeability before hand-off."""
+from __future__ import annotations
+import asyncio
+from dataclasses import dataclass, field
+from typing import Any
+from ctrlrelay.core.github import GitHubCLI
+# `gh pr checks --json bucket` returns one of: pass, fail, pending, skipping, cancel.
+# Treat skipping as pass (skipped jobs don't block a merge) and everything except
+# pending as "terminal".
+_PENDING_BUCKETS = frozenset({"pending"})
+_PASSING_BUCKETS = frozenset({"pass", "skipping"})
+# ``mergeable`` values that end PRVerifier.verify()'s polling loop. Any other
+# value (including a missing one) is polled again until the attempt budget
+# runs out.
+_TERMINAL_MERGEABLE_VALUES = frozenset({"MERGEABLE", "CONFLICTING"})
+# The dev pipeline's contract is "open a PR and hand it to humans for review",
+# not "merge the PR". So the verifier does NOT require the PR be in a directly-
+# mergeable state — human-gated states (awaiting review, unresolved comments,
+# pending deployments, merge-queue requirements) are all expected terminal
+# states for us. What we DO reject is state that (a) indicates broken code
+# (failing checks) or (b) the orchestrator can itself fix before hand-off
+# (conflicts, behind base). All other mergeStateStatus values are accepted.
+_REBASE_REQUIRED_MERGE_STATE_STATUS = frozenset({"BEHIND"})
+# Require 2 consecutive empty check-list responses separated by a poll interval
+# before concluding "no CI configured". GitHub registers check runs a few
+# seconds after `gh pr create`, so a single-shot empty read is unreliable.
+_EMPTY_CHECKS_CONFIRM_POLLS = 2
+@dataclass
+class VerificationResult:
+    """Outcome of verifying a PR is ready for hand-off."""
+    # True only when PRVerifier.verify() found nothing blocking hand-off.
+    ready: bool
+    # Human-readable explanation; left empty on the ready path.
+    reason: str = ""
+    # Check entries whose bucket is neither pending nor passing.
+    failing_checks: list[dict[str, Any]] = field(default_factory=list)
+    # Check entries still in the pending bucket when waiting stopped.
+    pending_checks: list[dict[str, Any]] = field(default_factory=list)
+    # Set True when wait_for_checks returned with pending entries (timeout hit
+    # while CI was still running). Callers use this to distinguish "needs a
+    # fix" from "just slow CI" and avoid burning retry budget on the latter.
+    timed_out: bool = False
+    # Last ``mergeable`` value read from the PR state; None if never read.
+    mergeable: str | None = None
+    # Last ``mergeStateStatus`` value read from the PR state; None if never read.
+    merge_state_status: str | None = None
+@dataclass
+class PRVerifier:
+    """Verifies a PR is green and conflict-free before declaring a dev task done."""
+    github: GitHubCLI
+    poll_interval: int = 30
+    check_timeout: int = 1800
+    mergeable_poll_attempts: int = 10
+    async def wait_for_checks(
+        self,
+        repo: str,
+        pr_number: int,
+        timeout: int | None = None,
+    ) -> list[dict[str, Any]]:
+        """Poll PR checks until every check has left the 'pending' bucket or
+        the timeout is reached.
+        Empty-check handling: GitHub registers check runs asynchronously after
+        `gh pr create` so a single empty read is ambiguous — the repo might
+        have no CI, or CI just hasn't registered yet. We require
+        `_EMPTY_CHECKS_CONFIRM_POLLS` consecutive empty reads separated by
+        `poll_interval` before concluding "no CI configured"."""
+        limit = self.check_timeout if timeout is None else timeout
+        elapsed = 0
+        empty_streak = 0
+        checks: list[dict[str, Any]] = []
+        while True:
+            checks = await self.github.get_pr_checks(repo, pr_number)
+            if not checks:
+                empty_streak += 1
+                if empty_streak >= _EMPTY_CHECKS_CONFIRM_POLLS:
+                    return checks
+            else:
+                empty_streak = 0
+                if all(c.get("bucket") not in _PENDING_BUCKETS for c in checks):
+                    return checks
+            if elapsed >= limit:
+                return checks
+            await asyncio.sleep(self.poll_interval)
+            elapsed += self.poll_interval
+    async def verify(
+        self,
+        repo: str,
+        pr_number: int,
+        timeout: int | None = None,
+    ) -> VerificationResult:
+        """Wait for CI, then check mergeability. Report ready only when both are green."""
+        checks = await self.wait_for_checks(repo, pr_number, timeout=timeout)
+        pending = [c for c in checks if c.get("bucket") in _PENDING_BUCKETS]
+        failing = [
+            c for c in checks
+            if c.get("bucket") not in _PENDING_BUCKETS
+            and c.get("bucket") not in _PASSING_BUCKETS
+        ]
+        # Failing checks take priority over pending. A matrix where lint
+        # already failed but a long integration run is still pending must be
+        # reported as broken, not timed out — otherwise the caller would hand
+        # off a known-bad PR.
+        if failing:
+            names = ", ".join(c.get("name", "?") for c in failing)
+            return VerificationResult(
+                ready=False,
+                reason=f"{len(failing)} check(s) failing: {names}",
+                failing_checks=failing,
+                pending_checks=pending,
+            )
+        if pending:
+            # All failing paths ruled out; we simply hit the timeout while
+            # everything still in flight was healthy. Don't ask Claude to
+            # "fix" slow CI — surface it as a distinct outcome so the caller
+            # hands off the PR as-is.
+            names = ", ".join(c.get("name", "?") for c in pending)
+            return VerificationResult(
+                ready=False,
+                timed_out=True,
+                reason=(
+                    f"CI still running after timeout: {len(pending)} "
+                    f"check(s) pending ({names})"
+                ),
+                pending_checks=pending,
+            )
+        mergeable: str | None = None
+        merge_state: str | None = None
+        for _ in range(self.mergeable_poll_attempts):
+            state = await self.github.get_pr_state(repo, pr_number)
+            mergeable = state.get("mergeable")
+            merge_state = state.get("mergeStateStatus")
+            if mergeable in _TERMINAL_MERGEABLE_VALUES:
+                break
+            await asyncio.sleep(self.poll_interval)
+        if mergeable == "CONFLICTING":
+            return VerificationResult(
+                ready=False,
+                reason="PR has merge conflicts with the base branch",
+                mergeable=mergeable,
+                merge_state_status=merge_state,
+            )
+        if mergeable != "MERGEABLE":
+            return VerificationResult(
+                ready=False,
+                reason=f"PR mergeable state unresolved: {mergeable}",
+                mergeable=mergeable,
+                merge_state_status=merge_state,
+            )
+        if merge_state in _REBASE_REQUIRED_MERGE_STATE_STATUS:
+            return VerificationResult(
+                ready=False,
+                reason=(
+                    "PR is behind the base branch and must be rebased before "
+                    "merge (mergeStateStatus=BEHIND)"
+                ),
+                mergeable=mergeable,
+                merge_state_status=merge_state,
+            )
+        # Any remaining state (CLEAN, HAS_HOOKS, BLOCKED, UNSTABLE, DRAFT,
+        # etc.) is accepted. CI is verified green above, conflicts and
+        # behind-base are handled explicitly, so what's left is either
+        # directly mergeable or human-gated — both are valid hand-off states
+        # for a pipeline that never auto-merges.
+        return VerificationResult(
+            ready=True,
+            mergeable=mergeable,
+            merge_state_status=merge_state,
+        )

ctrlrelay/core/pr_watcher.py ADDED Viewed

@@ -0,0 +1,121 @@
+"""PR merge watcher for monitoring PR state."""
+from __future__ import annotations
+import asyncio
+from dataclasses import dataclass
+from typing import Awaitable, Callable
+from ctrlrelay.core.github import GitHubCLI, GitHubError
+from ctrlrelay.core.obs import get_logger, log_event
+_logger = get_logger("core.pr_watcher")
+# After this many CONSECUTIVE transient failures we give up on the watch.
+# gh can raise "transient-looking" errors (GitHubError, TimeoutError,
+# OSError) for permanent problems too — bad repo, expired auth,
+# permission change, missing gh binary. Without a cap, those would
+# silently loop for the full watch timeout (``wait_for_merge`` defaults
+# to 24h; callers may pass longer, e.g. 7 days) and never surface the
+# problem.
+#
+# Sizing: at the default 60s poll interval, 60 consecutive failures
+# covers a ~1-hour outage window. That's enough slack for routine VPN
+# flaps, GitHub incidents, or auth token rotations without abandoning
+# the watch, while still bounding truly permanent failures (deleted
+# repo, revoked credentials) to an hour instead of the full watch
+# timeout. A successful poll resets the counter, so genuinely
+# intermittent failures never accumulate.
+_TRANSIENT_FAILURE_CAP = 60
+@dataclass
+class PRWatcher:
+    """Watches PRs for merge events."""
+    github: GitHubCLI
+    poll_interval: int = 60
+    async def check_merged(self, repo: str, pr_number: int) -> bool:
+        """Check if a PR has been merged.
+        Args:
+            repo: Repository name (owner/repo)
+            pr_number: PR number
+        Returns:
+            True if merged, False otherwise
+        """
+        pr_state = await self.github.get_pr_state(repo, pr_number)
+        return pr_state.get("state") == "MERGED"
+    async def wait_for_merge(
+        self,
+        repo: str,
+        pr_number: int,
+        timeout: int = 86400,
+        on_poll: Callable[[], Awaitable[None]] | None = None,
+    ) -> bool:
+        """Wait for a PR to be merged.
+        Args:
+            repo: Repository name
+            pr_number: PR number
+            timeout: Max seconds to wait (default 24h)
+            on_poll: Optional callback after each poll
+        Returns:
+            True if merged within timeout, False otherwise
+        Transient-failure handling: individual ``gh`` failures
+        (``GitHubError``, ``TimeoutError``, network-level ``OSError``)
+        during a multi-day watch MUST NOT abort the loop — otherwise a
+        single flaky poll cycle permanently stops monitoring the PR.
+        Log a structured ``pr_watch.transient_error`` event and keep
+        polling. ``asyncio.CancelledError`` is always re-raised so a
+        clean shutdown propagates.
+        """
+        elapsed = 0
+        consecutive_failures = 0
+        while elapsed < timeout:
+            try:
+                if await self.check_merged(repo, pr_number):
+                    return True
+                consecutive_failures = 0  # successful poll resets the counter
+            except asyncio.CancelledError:
+                raise
+            except (GitHubError, TimeoutError, OSError) as e:
+                consecutive_failures += 1
+                log_event(
+                    _logger, "pr_watch.transient_error",
+                    repo=repo, pr_number=pr_number,
+                    reason=type(e).__name__,
+                    error=str(e)[:200],
+                    elapsed=elapsed,
+                    consecutive_failures=consecutive_failures,
+                )
+                if consecutive_failures >= _TRANSIENT_FAILURE_CAP:
+                    # Likely permanent: bad repo, expired auth, 404,
+                    # missing gh binary. Fail fast instead of zombie-
+                    # sleeping for 7 days.
+                    log_event(
+                        _logger, "pr_watch.abandoned_after_too_many_errors",
+                        repo=repo, pr_number=pr_number,
+                        consecutive_failures=consecutive_failures,
+                        last_reason=type(e).__name__,
+                        last_error=str(e)[:200],
+                    )
+                    raise
+                # Fall through to the sleep + retry.
+            if on_poll:
+                try:
+                    await on_poll()
+                except asyncio.CancelledError:
+                    raise
+                except Exception:
+                    pass  # on_poll is best-effort diagnostic plumbing
+            await asyncio.sleep(self.poll_interval)
+            elapsed += self.poll_interval
+        return False