interloper-scheduler 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,15 @@
1
+ from interloper_scheduler.cron import CronController
2
+ from interloper_scheduler.executor import RunExecutor
3
+ from interloper_scheduler.launcher import InProcessLauncher, Launcher, build_launcher
4
+ from interloper_scheduler.queue import QueueController
5
+ from interloper_scheduler.reaper import Reaper
6
+
7
+ __all__ = [
8
+ "CronController",
9
+ "InProcessLauncher",
10
+ "Launcher",
11
+ "QueueController",
12
+ "Reaper",
13
+ "RunExecutor",
14
+ "build_launcher",
15
+ ]
@@ -0,0 +1,201 @@
1
+ """Cron controller: evaluates cron jobs and creates queued runs."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import datetime as dt
6
+ import logging
7
+ import os
8
+ from datetime import datetime, timezone
9
+ from threading import Event
10
+ from typing import cast
11
+
12
+ from croniter import croniter
13
+ from interloper_db import Store, get_engine
14
+ from interloper_db.models import Backfill, Job, Run
15
+ from sqlalchemy import or_
16
+ from sqlmodel import Session, col, select
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
class CronController:
    """Evaluates cron jobs and creates queued runs.

    Each evaluation cycle runs in a single transaction:
        1. ``SELECT FOR UPDATE SKIP LOCKED``       (lock due jobs)
        2. ``UPDATE next_run_at``                  (calculate next)
        3. ``INSERT run`` with ``status='queued'`` (create run)
        4. ``COMMIT``                              (release locks)
    """

    def __init__(
        self,
        store: Store | None = None,
        reconcile_interval: int | None = None,
        max_execution_delay: int | None = None,
        batch_size: int = 50,
    ) -> None:
        """Initialize the cron controller.

        Args:
            store: The Store kept on the instance. A default one backed by
                ``Catalog.from_settings()`` is created if not provided.
            reconcile_interval: Seconds between cron evaluation cycles. A
                falsy value (``None`` or ``0``) falls back to the
                ``JOB_RECONCILE_INTERVAL`` environment variable (default 10).
            max_execution_delay: Max seconds a scheduled job can be late. A
                falsy value falls back to ``MAX_JOB_EXECUTION_DELAY``, which
                itself defaults to the reconcile interval.
            batch_size: Number of jobs to process per cycle.

        Raises:
            ConfigError: If the max execution delay is smaller than the
                reconcile interval.
        """
        if store is None:
            from interloper.catalog import Catalog

            store = Store(catalog=Catalog.from_settings())
        self._store = store
        self._batch_size = batch_size
        self._reconcile_interval = reconcile_interval or int(os.getenv("JOB_RECONCILE_INTERVAL", "10"))
        self._max_execution_delay = max_execution_delay or int(
            os.getenv("MAX_JOB_EXECUTION_DELAY", str(self._reconcile_interval))
        )
        if self._max_execution_delay < self._reconcile_interval:
            from interloper.errors import ConfigError

            raise ConfigError("MAX_JOB_EXECUTION_DELAY must be >= JOB_RECONCILE_INTERVAL")
        self._stop_event = Event()

    def start(self) -> None:
        """Run the cron evaluation loop until stopped."""
        logger.info("Starting cron controller...")

        try:
            while not self._stop_event.is_set():
                logger.info("Evaluating cron jobs...")
                try:
                    self._process_jobs()
                except Exception as e:
                    # _process_jobs already logged the traceback; keep the loop alive.
                    logger.error("Failed to process jobs: %s", e)

                # wait() returns True as soon as stop() is called.
                if self._stop_event.wait(self._reconcile_interval):
                    break
        except KeyboardInterrupt:
            logger.info("Shutting down cron controller...")

    def stop(self) -> None:
        """Signal the loop to stop."""
        self._stop_event.set()

    def _process_jobs(self) -> None:
        """Process a batch of due jobs in a single transaction.

        Raises:
            Exception: Any error raised while querying or writing is logged,
                the transaction is rolled back, and the error is re-raised.
        """
        session = Session(get_engine())

        try:
            now = datetime.now(timezone.utc)

            # Due jobs plus never-scheduled jobs (next_run_at IS NULL); rows
            # locked by another controller are skipped.
            statement = (
                select(Job)
                .where(Job.enabled)
                .where(or_(col(Job.next_run_at) <= now, col(Job.next_run_at).is_(None)))
                .order_by(col(Job.next_run_at).asc().nulls_last())
                .limit(self._batch_size)
                .with_for_update(skip_locked=True)
            )

            jobs = session.exec(statement).all()
            if not jobs:
                return

            logger.info("Found %d job(s) ready to run", len(jobs))

            for job in jobs:
                self._process_job(session, job, now)

            session.commit()
            logger.info("Processed %d job(s)", len(jobs))

        except Exception as e:
            logger.exception("Error processing jobs: %s", e)
            session.rollback()
            raise
        finally:
            session.close()

    def _process_job(self, session: Session, job: Job, now: datetime) -> None:
        """Advance one job's schedule and queue its run(s) when it is due.

        Args:
            session: The open session holding the row lock on *job*.
            job: The job selected for this cycle.
            now: The reference time of the current cycle (UTC).
        """
        try:
            next_run_at = self._calculate_next_run(job.cron, now)
        except (ValueError, TypeError) as exc:
            # croniter's error classes derive from ValueError / TypeError. One
            # bad expression must not roll back the whole batch and block
            # every other due job on every cycle, so skip just this job.
            logger.error("Skipping job '%s' - invalid cron expression %r: %s", job.name, job.cron, exc)
            return

        # New job: schedule for the future, don't run yet
        if job.next_run_at is None:
            self._set_next_run(session, job, next_run_at)
            logger.info("Scheduling new job '%s' for %s", job.name, next_run_at)
            return

        # Stored timestamps may come back naive; treat them as UTC.
        scheduled_time = job.next_run_at
        if scheduled_time.tzinfo is None:
            scheduled_time = scheduled_time.replace(tzinfo=timezone.utc)

        # Check if too old to execute
        delay_seconds = (now - scheduled_time).total_seconds()
        if delay_seconds > self._max_execution_delay:
            logger.warning(
                "Skipping job '%s' - too late (%ds > %ds)",
                job.name,
                int(delay_seconds),
                self._max_execution_delay,
            )
            self._set_next_run(session, job, next_run_at)
            return

        self._set_next_run(session, job, next_run_at)
        self._queue_runs(session, job, now)

    @staticmethod
    def _set_next_run(session: Session, job: Job, next_run_at: datetime) -> None:
        """Store the job's next scheduled time and flush it to the database."""
        job.next_run_at = next_run_at
        session.add(job)
        session.flush()

    @staticmethod
    def _queue_runs(session: Session, job: Job, now: datetime) -> None:
        """Insert the queued run(s) for a due job.

        Partitioned jobs with ``backfill_days`` get one ``Backfill`` row and
        one run per day for the ``backfill_days`` days ending yesterday (UTC).
        Every other job gets a single run.
        """
        if job.partitioned and job.backfill_days:
            end_date = now.date() - dt.timedelta(days=1)
            start_date = end_date - dt.timedelta(days=job.backfill_days - 1)
            backfill = Backfill(
                org_id=job.org_id,
                job_id=job.id,
                start_date=start_date,
                end_date=end_date,
                status="running",
                started_at=now,
            )
            session.add(backfill)
            # Flush before creating runs because they reference backfill.id.
            session.flush()

            count = 0
            current = start_date
            while current <= end_date:
                session.add(
                    Run(
                        job_id=job.id,
                        org_id=job.org_id,
                        backfill_id=backfill.id,
                        status="queued",
                        partition_date=current,
                    )
                )
                count += 1
                current += dt.timedelta(days=1)
            backfill.partitions = count
            session.add(backfill)
        else:
            session.add(
                Run(
                    job_id=job.id,
                    org_id=job.org_id,
                    status="queued",
                )
            )

    def _calculate_next_run(self, cron_expr: str, base_time: datetime) -> datetime:
        """Calculate the next run time from a cron expression.

        Args:
            cron_expr: Cron expression string.
            base_time: The reference time.

        Returns:
            The next scheduled datetime; a naive result is tagged as UTC.
        """
        itr = croniter(cron_expr, base_time)
        next_run = cast(datetime, itr.get_next(datetime))
        if next_run.tzinfo is None:
            return next_run.replace(tzinfo=timezone.utc)
        return next_run
@@ -0,0 +1,195 @@
1
+ """Run executor: loads a run from DB, builds the DAG, and executes it."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import datetime as dt
6
+ import logging
7
+ from typing import Any
8
+ from uuid import UUID
9
+
10
+ import interloper as il
11
+ from interloper.runner import ExecutionStatus
12
+ from interloper.runner.sync_runner import SyncRunner
13
+ from interloper_db import Store, get_engine
14
+ from interloper_db.models import AssetDependency, Job, Run, Source
15
+ from sqlalchemy.orm import selectinload
16
+ from sqlmodel import Session, col, select
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
+ # TODO: cache source and asset hydrations
22
+
23
+
24
class RunExecutor:
    """Runs a single stored run end to end.

    The run is loaded from the database, its assets are hydrated through the
    ``Store``, a DAG is built and executed, and the outcome is written back
    via ``Store.complete_run``.
    """

    def __init__(
        self,
        store: Store | None = None,
        runner_type: type[SyncRunner] = il.MultiThreadRunner,
        runner_kwargs: dict[str, Any] | None = None,
    ) -> None:
        """Initialize the executor.

        Args:
            store: Store used for hydration and run bookkeeping. A default
                one backed by ``Catalog.from_settings()`` is created if omitted.
            runner_type: Runner class instantiated for each run.
            runner_kwargs: Extra keyword arguments for the runner constructor.
        """
        if store is None:
            from interloper.catalog import Catalog

            store = Store(catalog=Catalog.from_settings())
        self._store = store
        self._runner_type = runner_type
        self._runner_kwargs = runner_kwargs if runner_kwargs else {}

    def execute(self, run_id: UUID) -> bool:
        """Execute a run with full lifecycle tracking.

        Args:
            run_id: The run UUID to execute.

        Returns:
            ``True`` if the run completed successfully, ``False`` otherwise
            (including when the run or its job cannot be loaded).
        """
        # Kept outside the try so the failure path can report them.
        org_id: UUID | None = None
        backfill_id: str | None = None

        try:
            logger.info("Starting run %s", run_id)

            with Session(get_engine()) as session:
                record = self._load_run(session, run_id)
                if not (record and record.job):
                    logger.info("Run %s not found, skipping", run_id)
                    return False

                org_id = record.org_id
                backfill_id = None if not record.backfill_id else str(record.backfill_id)

                self._mark_running(session, record)

                assets = self._hydrate_job_assets(record.job)
                if not assets:
                    # Nothing to materialize counts as a successful run.
                    logger.info("No sources or assets for run %s, marking success", run_id)
                    self._store.complete_run(run_id, success=True)
                    return True

                self._resolve_upstream_deps(record.job, assets)

                dag = il.DAG(*assets)
                partition = None
                if record.partition_date:
                    partition = il.TimePartition(record.partition_date)

                result = self._run_dag(
                    dag,
                    partition,
                    org_id=org_id,
                    run_id=run_id,
                    backfill_id=backfill_id,
                )

                succeeded = result.status == ExecutionStatus.COMPLETED
                logger.info("Run %s completed: %s", run_id, result.status.name)
                self._store.complete_run(run_id, success=succeeded)
                return succeeded

        except Exception as e:
            logger.exception("Run %s failed: %s", run_id, e)
            self._record_failure(run_id, org_id, backfill_id, e)
            return False

    # ------------------------------------------------------------------
    # Helpers
    # ------------------------------------------------------------------

    def _record_failure(
        self,
        run_id: UUID,
        org_id: UUID | None,
        backfill_id: str | None,
        error: Exception,
    ) -> None:
        """Best-effort: emit ``RUN_FAILED`` (when the org is known) and mark the run failed."""
        try:
            details: dict[str, Any] = {
                "run_id": str(run_id),
                "backfill_id": backfill_id,
                "error": str(error),
            }
            if org_id is not None:
                failure = il.Event(type=il.EventType.RUN_FAILED, metadata=details)
                self._store.save_event(failure, org_id=org_id, run_id=run_id)
            self._store.complete_run(run_id, success=False)
        except Exception:
            logger.exception("Failed to mark run %s as failed", run_id)

    @staticmethod
    def _load_run(session: Session, run_id: UUID) -> Run | None:
        """Fetch the run with its job, the job's sources (and their assets) and the job's assets."""
        eager = [
            selectinload(Run.job).selectinload(Job.sources).selectinload(Source.assets),  # type: ignore[arg-type]
            selectinload(Run.job).selectinload(Job.assets),  # type: ignore[arg-type]
        ]
        return session.get(Run, run_id, options=eager)

    @staticmethod
    def _mark_running(session: Session, db_run: Run) -> None:
        """Set the run to ``running`` with a UTC start time and commit."""
        started = dt.datetime.now(dt.timezone.utc)
        db_run.status = "running"
        db_run.started_at = started
        session.add(db_run)
        session.commit()

    def _hydrate_job_assets(self, db_job: Job) -> list[il.Asset]:
        """Hydrate job sources/assets and return only DB-registered assets."""
        collected: list[il.Asset] = []

        # Source-owned assets: load the whole source, keep only the assets
        # whose class-level key is registered for that source in the DB.
        for source_row in db_job.sources:
            assert source_row.id is not None
            hydrated = self._store.load_source(source_row.id)
            wanted = {asset_row.key for asset_row in source_row.assets}
            collected.extend(item for item in hydrated.assets if type(item).key in wanted)

        # Assets attached directly to the job.
        for asset_row in db_job.assets:
            assert asset_row.id is not None
            collected.append(self._store.load_asset(asset_row.id))

        return collected

    def _resolve_upstream_deps(self, db_job: Job, assets: list[il.Asset]) -> None:
        """Add transitive upstream deps to *assets* as non-materializable."""
        job_asset_ids: set[UUID] = set()
        source_asset_rows = [row for src in db_job.sources for row in src.assets]
        for row in source_asset_rows + list(db_job.assets):
            assert row.id is not None
            job_asset_ids.add(row.id)

        # Breadth-first walk over AssetDependency rows, one query per level.
        pending = list(job_asset_ids)
        seen = set(job_asset_ids)
        with Session(get_engine()) as session:
            while pending:
                query = select(AssetDependency).where(col(AssetDependency.asset_id).in_(pending))
                upcoming: list[UUID] = []
                for edge in session.exec(query).all():
                    upstream_id = edge.upstream_asset_id
                    if upstream_id in seen:
                        continue
                    seen.add(upstream_id)
                    upcoming.append(upstream_id)
                    upstream = self._store.load_asset(upstream_id)
                    # Upstream assets are added to the DAG but not materialized.
                    upstream.materializable = False
                    assets.append(upstream)
                pending = upcoming

    def _run_dag(
        self,
        dag: il.DAG,
        partition: il.TimePartition | None,
        *,
        org_id: UUID,
        run_id: UUID,
        backfill_id: str | None,
    ) -> il.RunResult:
        """Run *dag* with a fresh runner, persisting every emitted event."""

        def persist(event: il.Event) -> None:
            self._store.save_event(event, org_id=org_id, run_id=run_id)  # type: ignore[arg-type]

        run_metadata = {
            "run_id": str(run_id),
            "backfill_id": backfill_id,
        }
        with self._runner_type(**self._runner_kwargs, on_event=persist) as runner:
            return runner.run(dag, partition, metadata=run_metadata)
@@ -0,0 +1,203 @@
1
+ """Launcher interface and in-process implementation."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ import threading
7
+ from abc import ABC, abstractmethod
8
+ from dataclasses import dataclass
9
+ from enum import Enum
10
+ from typing import TYPE_CHECKING, Any
11
+ from uuid import UUID
12
+
13
+ from interloper_db import Store
14
+
15
+ if TYPE_CHECKING:
16
+ from interloper.catalog.base import Catalog
17
+ from interloper.settings import LauncherSettings, PostgresSettings, RunnerSettings
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+
22
class RunStatus(str, Enum):
    """Status a launcher reports for a run it launched.

    Members are also ``str`` instances, so they compare equal to their
    plain-string values.
    """

    # Non-terminal state.
    RUNNING = "running"

    # Terminal states.
    SUCCEEDED = "succeeded"
    FAILED = "failed"

    # The launcher has no record of the run.
    NOT_FOUND = "not_found"
29
+
30
+
31
@dataclass
class RunState:
    """Snapshot of a launched run as reported by its launcher."""

    # The status reported by the launcher.
    status: RunStatus
    # Optional error text supplied by the launcher; ``None`` when not given.
    error: str | None = None
37
+
38
+
39
+ def build_launcher(
40
+ launcher: LauncherSettings,
41
+ *,
42
+ postgres: PostgresSettings,
43
+ runner: RunnerSettings,
44
+ catalog: Catalog,
45
+ store: Any | None = None,
46
+ ) -> Any:
47
+ """Build a launcher instance from settings.
48
+
49
+ The runner configuration is always forwarded so every launcher type
50
+ respects ``RunnerSettings`` uniformly.
51
+
52
+ Args:
53
+ launcher: Launcher settings (type + type-specific config).
54
+ postgres: Postgres settings forwarded to launchers that spawn
55
+ isolated processes (e.g. Docker).
56
+ runner: Runner settings forwarded to every launcher.
57
+ catalog: Catalog forwarded to launchers that spawn isolated
58
+ processes so they can reproduce an identical catalog.
59
+ store: Optional Store instance shared with in-process launchers.
60
+
61
+ Returns:
62
+ A scheduler ``Launcher`` instance.
63
+
64
+ Raises:
65
+ ValueError: If the launcher type is unknown.
66
+ """
67
+ match launcher.type:
68
+ case "in_process":
69
+ from interloper_scheduler import InProcessLauncher
70
+
71
+ return InProcessLauncher(
72
+ store=store,
73
+ runner_type=runner.type,
74
+ runner_config=runner.config,
75
+ )
76
+ case "docker":
77
+ from interloper_docker import DockerLauncher
78
+
79
+ postgres_kwargs = {
80
+ "postgres_host": postgres.host,
81
+ "postgres_port": postgres.port,
82
+ "postgres_user": postgres.user,
83
+ "postgres_password": postgres.password,
84
+ "postgres_database": postgres.database,
85
+ }
86
+ kwargs = {**postgres_kwargs, **launcher.config}
87
+ return DockerLauncher(
88
+ catalog=catalog,
89
+ runner_type=runner.type,
90
+ runner_config=runner.config,
91
+ **kwargs,
92
+ )
93
+ case "kubernetes":
94
+ try:
95
+ from interloper_k8s import KubernetesLauncher
96
+ except ImportError as exc:
97
+ raise ValueError(
98
+ "Launcher 'kubernetes' requires the 'interloper-k8s' package to be installed."
99
+ ) from exc
100
+
101
+ postgres_kwargs = {
102
+ "postgres_host": postgres.host,
103
+ "postgres_port": postgres.port,
104
+ "postgres_user": postgres.user,
105
+ "postgres_password": postgres.password,
106
+ "postgres_database": postgres.database,
107
+ }
108
+ kwargs = {**postgres_kwargs, **launcher.config}
109
+ return KubernetesLauncher(
110
+ catalog=catalog,
111
+ runner_type=runner.type,
112
+ runner_config=runner.config,
113
+ **kwargs,
114
+ )
115
+ case _:
116
+ raise ValueError(f"Unknown launcher: {launcher.type!r}. Available: in_process, docker, kubernetes")
117
+
118
+
119
class Launcher(ABC):
    """Base class for everything that can launch a run.

    Subclasses choose *where* a run executes (in-process, Docker,
    Kubernetes, ...). The runner type and config stored here describe *how*
    the DAG is executed once it reaches that environment.
    """

    def __init__(
        self,
        runner_type: str = "multi_thread",
        runner_config: dict[str, Any] | None = None,
    ) -> None:
        """Store the runner selection for later use by ``launch``.

        Args:
            runner_type: Runner type name (``serial``, ``multi_thread``, ``multi_process``).
            runner_config: Runner-specific kwargs forwarded to the runner
                constructor; a falsy value becomes an empty dict.
        """
        self._runner_config = runner_config if runner_config else {}
        self._runner_type = runner_type

    @abstractmethod
    def launch(self, run_id: UUID) -> None:
        """Launch a run for execution.

        Args:
            run_id: The run UUID to execute.
        """

    def describe_run(self, run_id: UUID) -> RunState | None:
        """Return the authoritative state of a launched run.

        This base implementation performs no lookup and always returns
        ``None``; launchers that can inspect their runs override it.

        Args:
            run_id: The run UUID to describe.

        Returns:
            The run's authoritative state, or ``None`` if the launcher
            cannot introspect its runs.
        """
        return None
160
+
161
+
162
class InProcessLauncher(Launcher):
    """Runs each launched run on a daemon thread via ``RunExecutor``.

    An optional ``store`` can be supplied so every executor shares the same
    persistence layer instead of each one building a fresh default.
    """

    def __init__(
        self,
        runner_type: str = "multi_thread",
        runner_config: dict[str, Any] | None = None,
        store: Store | None = None,
    ) -> None:
        """Initialize the launcher.

        Args:
            runner_type: Runner type name (``serial``, ``multi_thread``, ``multi_process``).
            runner_config: Runner-specific kwargs forwarded to the runner constructor.
            store: Optional Store instance handed to every executor.
        """
        super().__init__(runner_type=runner_type, runner_config=runner_config)
        self._store = store

    def launch(self, run_id: UUID) -> None:
        """Start executing a run on a background thread and return immediately.

        Args:
            run_id: The run UUID to execute.
        """
        # Imported here rather than at module level, as in the original code.
        from interloper.runner import build_runner

        from interloper_scheduler.executor import RunExecutor

        resolved_cls, resolved_kwargs = build_runner(self._runner_type, self._runner_config)
        run_executor = RunExecutor(
            store=self._store,
            runner_type=resolved_cls,
            runner_kwargs=resolved_kwargs,
        )

        # Daemon thread: not joined, and it does not keep the process alive.
        worker = threading.Thread(
            target=run_executor.execute,
            args=(run_id,),
            daemon=True,
        )
        worker.start()
        logger.info("Launched run %s in background thread", run_id)
@@ -0,0 +1,88 @@
1
+ """Queue controller: polls for queued runs and dispatches them."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import datetime as dt
6
+ import logging
7
+ import time
8
+ from threading import Event
9
+
10
+ from interloper_db import get_engine
11
+ from interloper_db.models import Run
12
+ from sqlmodel import Session, col, select
13
+
14
+ from interloper_scheduler.launcher import InProcessLauncher, Launcher
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
class QueueController:
    """Polls the runs table for queued runs and dispatches them.

    Uses ``SELECT FOR UPDATE SKIP LOCKED`` for safe concurrent polling.
    """

    # Seconds to back off after an unexpected error in the polling loop.
    _ERROR_BACKOFF_SECONDS = 5

    def __init__(
        self,
        launcher: Launcher | None = None,
        poll_interval: int = 5,
    ) -> None:
        """Initialize the queue controller.

        Args:
            launcher: The launcher to use for dispatching runs. Defaults to
                a new ``InProcessLauncher``.
            poll_interval: Seconds to wait before polling again when no
                queued run was found.
        """
        self._launcher = launcher or InProcessLauncher()
        self._poll_interval = poll_interval
        self._stop_event = Event()

    def start(self) -> None:
        """Run the polling loop until stopped.

        Each iteration claims the oldest queued run, marks it ``dispatched``
        and hands it to the launcher. A run whose launch raises is marked
        ``failed``. Any other error is logged and followed by a short
        backoff that ``stop()`` can interrupt.
        """
        logger.info("Starting queue controller...")

        while not self._stop_event.is_set():
            logger.info("Polling for queued runs...")

            try:
                run_id = None
                with Session(get_engine()) as session:
                    statement = (
                        select(Run)
                        .where(Run.status == "queued")
                        .order_by(col(Run.created_at).asc())
                        .limit(1)
                        .with_for_update(skip_locked=True)
                    )
                    run = session.exec(statement).first()

                    if run and run.id:
                        run_id = run.id
                        run.status = "dispatched"
                        session.add(run)
                        session.commit()
                        logger.info("Dispatched run %s", run_id)

                # The session is closed before waiting or launching, so no
                # connection sits idle in a transaction in the meantime.
                if run_id is None:
                    if self._stop_event.wait(self._poll_interval):
                        break
                    continue

                try:
                    logger.info("Launching run %s", run_id)
                    self._launcher.launch(run_id)
                except Exception as e:
                    logger.exception("Failed to launch run %s: %s", run_id, e)
                    with Session(get_engine()) as failure_session:
                        failed_run = failure_session.get(Run, run_id)
                        if failed_run:
                            failed_run.status = "failed"
                            failed_run.completed_at = dt.datetime.now(dt.timezone.utc)
                            failure_session.add(failed_run)
                            failure_session.commit()

            except Exception as e:
                logger.exception("Queue controller error: %s", e)
                # Wait on the stop event rather than sleeping, so stop() is
                # honoured immediately during the backoff.
                if self._stop_event.wait(self._ERROR_BACKOFF_SECONDS):
                    break

    def stop(self) -> None:
        """Signal the loop to stop."""
        self._stop_event.set()
@@ -0,0 +1,182 @@
1
+ """Reaper: detects terminated runs via the launcher and marks them failed.
2
+
3
+ The reaper is a single background thread that periodically checks every
4
+ ``dispatched`` run's authoritative state via
5
+ :meth:`~interloper_scheduler.launcher.Launcher.describe_run`:
6
+
7
+ - ``RUNNING`` → leave alone
8
+ - ``SUCCEEDED`` → weird (container said it succeeded but didn't update
9
+ the DB) — mark as failed with a descriptive error
10
+ - ``FAILED`` → mark as failed immediately with the launcher's error
11
+ - ``NOT_FOUND`` → container is gone without a trace — not failed immediately; left to the ``timeout`` fallback described below
12
+
13
+ A ``timeout`` fallback catches runs the launcher can't see (e.g. when
14
+ the launcher itself doesn't implement ``describe_run``, or the
15
+ infrastructure API is unreachable). Runs older than ``timeout``
16
+ seconds in ``dispatched`` status are reaped regardless.
17
+
18
+ The pattern scales flat: one SQL query per poll cycle, plus one
19
+ launcher API call per dispatched run (which K8s/Docker can serve
20
+ from their local daemon cheaply).
21
+ """
22
+
23
+ from __future__ import annotations
24
+
25
+ import datetime as dt
26
+ import logging
27
+ from threading import Event
28
+ from typing import TYPE_CHECKING
29
+
30
+ import interloper as il
31
+ from interloper_db import Store, get_engine
32
+ from interloper_db.models import Run
33
+ from sqlmodel import Session, select
34
+
35
+ from interloper_scheduler.launcher import RunStatus
36
+
37
+ if TYPE_CHECKING:
38
+ from interloper_scheduler.launcher import Launcher
39
+
40
+ logger = logging.getLogger(__name__)
41
+
42
+
43
class Reaper:
    """Periodically reconciles dispatched runs with the launcher's truth.

    Designed to run in a background thread alongside the
    :class:`~interloper_scheduler.queue.QueueController`::

        reaper = Reaper(store=store, launcher=launcher)
        thread = threading.Thread(target=reaper.start, daemon=True)
        thread.start()
    """

    def __init__(
        self,
        store: Store,
        launcher: Launcher | None = None,
        timeout: int = 600,
        poll_interval: int = 10,
    ) -> None:
        """Initialize the reaper.

        Args:
            store: Store used to persist the failure event and update
                the run status.
            launcher: Optional launcher consulted each poll cycle for
                authoritative run state. Launchers without
                introspection (e.g. in-process) fall back to timeout.
            timeout: Fallback: seconds after a run's ``created_at`` at
                which a still-``dispatched`` run is reaped, unless the
                launcher reports it as running.
            poll_interval: Seconds between reaper scans.
        """
        self._store = store
        self._launcher = launcher
        self._timeout = timeout
        self._poll_interval = poll_interval
        self._stop_event = Event()

    def start(self) -> None:
        """Run the reaper loop until stopped."""
        logger.info(
            "Starting reaper (poll=%ds, timeout=%ds)",
            self._poll_interval,
            self._timeout,
        )

        while not self._stop_event.is_set():
            try:
                reaped = self._reap()
                if reaped:
                    logger.info("Reaped %d dispatched run(s)", reaped)
            except Exception:
                logger.exception("Reaper error")

            # wait() returns True as soon as stop() is called.
            if self._stop_event.wait(self._poll_interval):
                break

    def stop(self) -> None:
        """Signal the loop to stop."""
        self._stop_event.set()

    def _reap(self) -> int:
        """Scan dispatched runs and reap any that have terminated.

        Returns:
            Number of runs reaped this cycle.
        """
        now = dt.datetime.now(dt.timezone.utc)
        timeout_cutoff = now - dt.timedelta(seconds=self._timeout)

        with Session(get_engine()) as session:
            dispatched_runs = list(session.exec(select(Run).where(Run.status == "dispatched")).all())

        # The session is closed here; the loaded rows are only read below.
        return sum(1 for run in dispatched_runs if self._reap_run(run, now, timeout_cutoff))

    def _reap_run(self, run: Run, now: dt.datetime, timeout_cutoff: dt.datetime) -> bool:
        """Decide whether to reap a single run and do so if needed.

        Args:
            run: A run currently in ``dispatched`` status.
            now: Reference time of the current scan (currently unused).
            timeout_cutoff: Runs created before this instant are timed out.

        Returns:
            ``True`` if the run was reaped.
        """
        assert run.id is not None

        # 1. Authoritative launcher state (preferred)
        state = None
        if self._launcher is not None:
            try:
                state = self._launcher.describe_run(run.id)
            except Exception:
                logger.exception("Failed to describe run %s", run.id)

        if state is not None:
            if state.status == RunStatus.RUNNING:
                return False  # Trust the launcher — still alive

            if state.status == RunStatus.SUCCEEDED:
                error = "Run container reported SUCCEEDED but never updated the DB. Possible connectivity issue."
                self._fail_run(run, error)
                return True

            if state.status == RunStatus.FAILED:
                error = state.error or "Run failed (no error reported by launcher)"
                self._fail_run(run, error)
                return True

            # NOT_FOUND (or any other status) is not failed immediately; it
            # falls through to the timeout check below.

        # 2. Timeout fallback — for launchers without introspection,
        #    NOT_FOUND runs, or anything else.
        created_at = run.created_at
        if created_at is not None and created_at.tzinfo is None:
            # A naive timestamp cannot be compared with the aware cutoff
            # (TypeError, which would abort the whole scan), so treat it as UTC.
            created_at = created_at.replace(tzinfo=dt.timezone.utc)

        # NOTE(review): the timeout is measured from created_at, not from the
        # moment the run was dispatched, so a run that waited in the queue
        # longer than the timeout may be reaped right after dispatch — confirm
        # whether a dispatch timestamp is available on the model.
        if created_at is not None and created_at < timeout_cutoff:
            self._fail_run(run, f"Run timed out after {self._timeout}s (still 'dispatched')")
            return True

        return False

    def _fail_run(self, run: Run, error: str) -> None:
        """Mark a run as failed and emit a ``RUN_FAILED`` event.

        Both steps are best-effort and independent: a failure in either is
        logged and does not prevent the other or propagate to the caller.
        """
        assert run.id is not None
        logger.warning("Reaping run %s: %s", run.id, error)

        try:
            event = il.Event(
                type=il.EventType.RUN_FAILED,
                metadata={
                    "run_id": str(run.id),
                    "backfill_id": str(run.backfill_id) if run.backfill_id else None,
                    "error": error,
                },
            )
            self._store.save_event(event, org_id=run.org_id, run_id=run.id)
        except Exception:
            logger.exception("Failed to save RUN_FAILED event for run %s", run.id)

        try:
            self._store.complete_run(run.id, success=False)
        except Exception:
            logger.exception("Failed to mark run %s as failed", run.id)
@@ -0,0 +1,16 @@
1
+ Metadata-Version: 2.3
2
+ Name: interloper-scheduler
3
+ Version: 0.3.0
4
+ Summary: Interloper cron scheduler and queue worker
5
+ Author: Guillaume Onfroy
6
+ Author-email: Guillaume Onfroy <guillaume@digitlcloud.com>
7
+ Requires-Dist: interloper-core
8
+ Requires-Dist: interloper-db
9
+ Requires-Dist: croniter>=5.0.0
10
+ Requires-Dist: interloper-docker ; extra == 'docker'
11
+ Requires-Dist: interloper-k8s ; extra == 'k8s'
12
+ Requires-Python: >=3.10
13
+ Provides-Extra: docker
14
+ Provides-Extra: k8s
15
+ Description-Content-Type: text/markdown
16
+
@@ -0,0 +1,9 @@
1
+ interloper_scheduler/__init__.py,sha256=-MXLlIjqyZzxnz5tGJF8wXMdAk0G55Avcu1KP2-Wahg,451
2
+ interloper_scheduler/cron.py,sha256=9k_9dNMP_lK9z0Jwol3YSeytMvJI9ZWgnQ9NuAhh5bk,7198
3
+ interloper_scheduler/executor.py,sha256=L5NweV1o96q0ZXCvCMDynjbmJ2Wk-A29pBDD87ztv7o,7308
4
+ interloper_scheduler/launcher.py,sha256=Dgm4KPMFm66ll7_RIyudtEun346outeQnYE5DXR5Y5U,6569
5
+ interloper_scheduler/queue.py,sha256=Uhq9-uY-vEHUn6C1HTUgZZZNoYpGfMWdrWaauKLB4yo,2976
6
+ interloper_scheduler/reaper.py,sha256=bFramgDeUtWMNcUlpiRflWY67CCDBLgYm2ZI_2nX-eE,6403
7
+ interloper_scheduler-0.3.0.dist-info/WHEEL,sha256=f5fWSvWsg5Knq5GWa6t1nJIug0Tqo69GqAWD_9LbBKw,81
8
+ interloper_scheduler-0.3.0.dist-info/METADATA,sha256=cVuarCM8daBVUw0cVjuU6jBiiqXNTgqkkVuEs9QMero,499
9
+ interloper_scheduler-0.3.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: uv 0.11.16
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any