interloper-k8s 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,9 @@
1
+ """Interloper Kubernetes integration for Job-based asset execution."""
2
+
3
+ from interloper_k8s.backfiller import KubernetesBackfiller
4
+ from interloper_k8s.runner import KubernetesRunner
5
+
6
# Public API of the package: the two Kubernetes-backed execution classes.
__all__ = ["KubernetesBackfiller", "KubernetesRunner"]
@@ -0,0 +1,503 @@
1
+ """Kubernetes Backfiller implementation for Interloper.
2
+
3
+ This backfiller starts Kubernetes Jobs and invokes the Interloper CLI inside them
4
+ using an inline JSON config. Each partition/window runs as a separate Job, with
5
+ asset scheduling delegated to the configured runner in the inline config.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import threading
11
+ from collections.abc import Callable
12
+ from time import sleep
13
+ from typing import Any, cast
14
+
15
+ from interloper.backfillers.base import Backfiller
16
+ from interloper.cli.config import Config
17
+ from interloper.dag.base import DAG
18
+ from interloper.errors import PartitionError, RunnerError
19
+ from interloper.events.base import Event, EventBus, parse_event_from_log_line
20
+ from interloper.partitioning.base import Partition, PartitionWindow
21
+ from interloper.partitioning.time import TimePartition, TimePartitionWindow
22
+ from interloper.runners.base import Runner
23
+ from interloper.runners.results import ExecutionStatus, RunResult
24
+ from interloper.serialization.backfiller import BackfillerSpec
25
+ from kubernetes import client, config, watch
26
+ from kubernetes.client import V1Job
27
+
28
+
29
class KubernetesBackfiller(Backfiller[str]):
    """Run Interloper DAG partitions as individual Kubernetes Jobs.

    Each partition/window is executed in its own Job. The image must contain
    the `interloper` package (CLI available on PATH).

    Job names act as handles: `_submit_run` returns the name of the Job it
    created, and `_wait_any` / `_cancel_all` receive lists of such names. The
    partition a Job belongs to is stored in the Job's `interloper.partition`
    annotation and resolved back to a state partition by `_find_partition`.
    """

    def __init__(
        self,
        image: str,
        namespace: str = "default",
        max_jobs: int = 4,
        env_vars: dict[str, str] | None = None,
        service_account: str | None = None,
        image_pull_policy: str | None = None,
        image_pull_secrets: list[str] | None = None,
        resources: dict[str, dict[str, str]] | None = None,
        node_selector: dict[str, str] | None = None,
        tolerations: list[dict[str, Any]] | None = None,
        ttl_seconds_after_finished: int = 300,
        runner: Runner | None = None,
        on_event: Callable[[Event], None] | None = None,
    ) -> None:
        """Initialize the KubernetesBackfiller.

        Args:
            image: Container image to use for job execution.
            namespace: Kubernetes namespace to create jobs in.
            max_jobs: Maximum number of concurrent jobs.
            env_vars: Environment variables to set in the container.
            service_account: Service account name to use for the job.
            image_pull_policy: Image pull policy ("Always", "IfNotPresent", or "Never").
            image_pull_secrets: List of image pull secret names.
            resources: Resource requests/limits dict with 'requests' and 'limits' keys.
            node_selector: Node selector labels for pod scheduling.
            tolerations: List of toleration dicts for pod scheduling.
            ttl_seconds_after_finished: TTL for completed jobs cleanup.
            runner: Runner to use for running assets inside the container.
            on_event: Optional event handler for lifecycle events.
        """
        super().__init__(runner=runner, on_event=on_event)

        # Force the runner to re-raise exceptions to propagate container exit codes.
        self.runner._reraise = True

        self._image = image
        self._namespace = namespace
        self._max_jobs = max_jobs
        self._env_vars = env_vars or {}
        self._service_account = service_account
        self._image_pull_policy = image_pull_policy
        self._image_pull_secrets = image_pull_secrets or []
        self._resources = resources
        self._node_selector = node_selector
        self._tolerations = tolerations or []
        self._ttl_seconds_after_finished = ttl_seconds_after_finished

        # API clients are created lazily in `_on_start`, once a kube config is loaded.
        self._batch_v1: client.BatchV1Api | None = None
        self._core_v1: client.CoreV1Api | None = None

        # Track log streaming threads for cleanup (keyed by job name).
        self._log_threads: dict[str, threading.Thread] = {}
        self._stop_log_streaming = threading.Event()

    @property
    def _capacity(self) -> int:
        """Maximum number of concurrent jobs."""
        return self._max_jobs

    def _on_start(self) -> None:
        """Initialize the Kubernetes clients.

        The in-cluster configuration is tried first; the local kube config is
        the fallback.

        Raises:
            RunnerError: If neither configuration can be loaded.
        """
        try:
            config.load_incluster_config()
        except config.ConfigException:
            try:
                config.load_kube_config()
            except Exception as e:
                raise RunnerError(f"Failed to load Kubernetes config: {e}") from e

        self._batch_v1 = client.BatchV1Api()
        self._core_v1 = client.CoreV1Api()
        self._stop_log_streaming.clear()

    def _on_end(self) -> None:
        """Signal all log streaming threads to stop and wait briefly for them."""
        self._stop_log_streaming.set()

        # Bounded join: the threads are daemons, so a stuck stream cannot block exit.
        for thread in self._log_threads.values():
            thread.join(timeout=2.0)
        self._log_threads.clear()

    def _build_command(
        self,
        dag: DAG,
        partition_or_window: Partition | PartitionWindow | None,
        backfill_id: str,
    ) -> list[str]:
        """Build the CLI command for a partition.

        Args:
            dag: The DAG to execute
            partition_or_window: The partition or window
            backfill_id: The backfill ID

        Returns:
            Command list for the container

        Raises:
            PartitionError: If the partition/window is neither `None` nor a
                time-based partition or window.
        """
        cfg = Config(dag=dag, runner=self.runner)

        cmd = [
            "interloper",
            "run",
            "--format=inline",
            f"--backfill-id={backfill_id}",
            cfg.to_json(),
        ]

        if partition_or_window is None:
            return cmd

        if isinstance(partition_or_window, TimePartition):
            cmd.extend(["--date", partition_or_window.value.strftime("%Y-%m-%d")])
        elif isinstance(partition_or_window, TimePartitionWindow):
            cmd.extend(
                [
                    "--start-date",
                    partition_or_window.start.strftime("%Y-%m-%d"),
                    "--end-date",
                    partition_or_window.end.strftime("%Y-%m-%d"),
                ]
            )
        else:
            raise PartitionError("Unsupported partition or window type")
        return cmd

    def _build_env(self) -> list[client.V1EnvVar]:
        """Build the environment variables for the container."""
        env_vars = [client.V1EnvVar(name=k, value=v) for k, v in self._env_vars.items()]
        # Enable log-based event streaming
        env_vars.append(client.V1EnvVar(name="INTERLOPER_EVENTS_TO_STDERR", value="true"))
        return env_vars

    def _build_resources(self) -> client.V1ResourceRequirements | None:
        """Build the resource requirements for the container, if any were configured."""
        if not self._resources:
            return None
        return client.V1ResourceRequirements(
            requests=self._resources.get("requests"),
            limits=self._resources.get("limits"),
        )

    def _build_tolerations(self) -> list[client.V1Toleration]:
        """Build tolerations for pod scheduling."""
        return [
            client.V1Toleration(
                key=t.get("key"),
                operator=t.get("operator", "Equal"),
                value=t.get("value"),
                effect=t.get("effect"),
            )
            for t in self._tolerations
        ]

    def _build_job_name(self, partition_or_window: Partition | PartitionWindow | None) -> str:
        """Build the name for the Kubernetes job.

        The name is capped at 63 characters, lower-cased, and has ":" and "_"
        replaced by "-".
        """
        name = f"interloper-backfill-{self.state.backfill_id[:8]}"
        if partition_or_window is not None:
            name = f"{name}-{partition_or_window.id}"
        name = name[:63].replace(":", "-").replace("_", "-").lower()
        # Truncation can cut right after a separator; Kubernetes names must
        # end with an alphanumeric character, so drop trailing separators.
        return name.rstrip("-.")

    def _build_labels(self, partition_or_window: Partition | PartitionWindow | None) -> dict[str, str]:
        """Build the labels for the Kubernetes job."""
        return {
            "interloper.backfill_id": self.state.backfill_id[:8],
        }

    def _build_annotations(self, partition_or_window: Partition | PartitionWindow | None) -> dict[str, str]:
        """Build the annotations for the Kubernetes job.

        The partition id is stored so a finished or cancelled Job can be mapped
        back to its partition; un-partitioned runs get no annotation.
        """
        annotations = {}
        if partition_or_window is not None:
            annotations["interloper.partition"] = partition_or_window.id
        return annotations

    def _find_partition(self, partition_id: str | None) -> Partition | PartitionWindow | None:
        """Resolve a partition id read from a Job annotation to a state partition.

        Args:
            partition_id: Value of the `interloper.partition` annotation, or
                `None` when the Job carries no such annotation.

        Returns:
            The matching partition/window, or `None` for an un-partitioned run.

        Raises:
            PartitionError: If the id matches no partition in the state.
        """
        if partition_id is None:
            return None
        for candidate in self.state.partitions:
            if candidate is not None and candidate.id == partition_id:
                return candidate
        raise PartitionError(f"Partition {partition_id} not found in state")

    def _print_job_logs(self, job_name: str) -> None:
        """Print the logs of the first pod of a job, for debugging (best effort)."""
        assert self._core_v1 is not None
        try:
            pods = self._core_v1.list_namespaced_pod(
                namespace=self._namespace,
                label_selector=f"job-name={job_name}",
            )
            if not pods.items:
                return
            pod = pods.items[0]
            if pod.metadata is None or pod.metadata.name is None:
                return
            logs = self._core_v1.read_namespaced_pod_log(
                name=pod.metadata.name,
                namespace=self._namespace,
            )
            if logs:
                print("=============== START OF RUN JOB LOGS ==================")
                print(logs)
                print("================ END OF RUN JOB LOGS ===================")
        except Exception:
            # Logs are a debugging aid only; the pod may already be gone.
            pass

    def _start_log_streaming(self, job_name: str) -> None:
        """Start a background thread to stream logs and parse events from a job's pod.

        Args:
            job_name: The Kubernetes job name to stream logs from
        """
        assert self._core_v1 is not None
        # Capture reference for use in closure
        core_v1 = self._core_v1

        def stream_logs() -> None:
            try:
                # Wait for pod to be created and running
                pod_name: str | None = None
                while not self._stop_log_streaming.is_set():
                    try:
                        pods = core_v1.list_namespaced_pod(
                            namespace=self._namespace,
                            label_selector=f"job-name={job_name}",
                        )
                        if pods.items:
                            pod = pods.items[0]
                            if pod.metadata and pod.metadata.name:
                                pod_name = pod.metadata.name
                                # Check if pod is ready for log streaming
                                if pod.status and pod.status.phase in ("Running", "Succeeded", "Failed"):
                                    break
                    except Exception:
                        # Transient API errors: retry on the next poll.
                        pass
                    # Wait on the stop event rather than sleeping so that a
                    # shutdown request interrupts the pause immediately.
                    self._stop_log_streaming.wait(0.5)

                if pod_name is None or self._stop_log_streaming.is_set():
                    return

                # Stream logs from the pod
                w = watch.Watch()
                try:
                    for line in w.stream(
                        core_v1.read_namespaced_pod_log,
                        name=pod_name,
                        namespace=self._namespace,
                        follow=True,
                    ):
                        if self._stop_log_streaming.is_set():
                            break

                        try:
                            # watch.Watch.stream() returns strings for log streaming
                            if isinstance(line, str):
                                event = parse_event_from_log_line(line)
                                if event is not None:
                                    EventBus.get_instance().emit(event)
                        except Exception:
                            # Ignore parsing errors, continue streaming
                            pass
                except Exception:
                    # Pod may have been removed or completed
                    pass
                finally:
                    w.stop()
            except Exception:
                # Job or pod may have been removed
                pass

        thread = threading.Thread(target=stream_logs, daemon=True)
        thread.start()
        self._log_threads[job_name] = thread

    def _stop_job_log_streaming(self, job_name: str) -> None:
        """Stop tracking the log streaming thread for a job.

        Args:
            job_name: The Kubernetes job name to stop streaming for
        """
        thread = self._log_threads.pop(job_name, None)
        if thread is not None:
            # Thread will stop on next iteration due to stop flag or pod completion
            thread.join(timeout=1.0)

    def _submit_run(
        self,
        dag: DAG,
        partition_or_window: Partition | PartitionWindow | None,
    ) -> str:
        """Submit execution of a run as a Kubernetes Job.

        Args:
            dag: The DAG to execute
            partition_or_window: Either a Partition or PartitionWindow object

        Returns:
            The job name for tracking
        """
        cmd = self._build_command(dag, partition_or_window, self.state.backfill_id)
        job_name = self._build_job_name(partition_or_window)
        env = self._build_env()
        resources = self._build_resources()
        tolerations = self._build_tolerations()
        labels = self._build_labels(partition_or_window)
        annotations = self._build_annotations(partition_or_window)

        # The first element is the executable, the rest are its arguments.
        container = client.V1Container(
            name="interloper",
            image=self._image,
            image_pull_policy=self._image_pull_policy,
            command=cmd[:1],
            args=cmd[1:],
            env=env if env else None,
            resources=resources,
        )

        pod_spec = client.V1PodSpec(
            containers=[container],
            restart_policy="Never",
            service_account_name=self._service_account,
            node_selector=self._node_selector if self._node_selector else None,
            tolerations=tolerations if tolerations else None,
            image_pull_secrets=[client.V1LocalObjectReference(name=s) for s in self._image_pull_secrets]
            if self._image_pull_secrets
            else None,
        )

        # backoff_limit=0 together with restart_policy="Never": one attempt per Job.
        job_spec = client.V1JobSpec(
            template=client.V1PodTemplateSpec(
                metadata=client.V1ObjectMeta(
                    labels=labels,
                    annotations=annotations,
                ),
                spec=pod_spec,
            ),
            backoff_limit=0,
            ttl_seconds_after_finished=self._ttl_seconds_after_finished,
        )

        job = client.V1Job(
            api_version="batch/v1",
            kind="Job",
            metadata=client.V1ObjectMeta(
                name=job_name,
                namespace=self._namespace,
                labels=labels,
                annotations=annotations,
            ),
            spec=job_spec,
        )

        self.state.mark_run_running(partition_or_window)

        assert self._batch_v1 is not None
        self._batch_v1.create_namespaced_job(namespace=self._namespace, body=job)

        # Start log streaming for event collection
        self._start_log_streaming(job_name)

        return job_name

    def _wait_any(self, handles: list[str]) -> str:
        """Wait for any job to finish by polling.

        Args:
            handles: List of job names to wait for

        Returns:
            The job name that finished

        Raises:
            RunnerError: If `handles` is empty.
            PartitionError: If a finished job's partition annotation matches
                no partition in the state.
        """
        assert self._batch_v1 is not None
        assert self._core_v1 is not None

        if not handles:
            # Polling an empty list would loop forever without ever returning.
            raise RunnerError("No Kubernetes jobs to wait for")

        while True:
            for job_name in handles:
                # Refresh job status
                updated_job = cast(
                    V1Job,
                    self._batch_v1.read_namespaced_job_status(name=job_name, namespace=self._namespace),
                )

                assert updated_job.status is not None
                status = updated_job.status
                is_complete = status.succeeded is not None and status.succeeded > 0
                is_failed = status.failed is not None and status.failed > 0

                if not (is_complete or is_failed):
                    continue

                # Stop log streaming for this job
                self._stop_job_log_streaming(job_name)

                # Un-partitioned runs are created with no annotations, which the
                # API may report back as None rather than an empty mapping.
                metadata = updated_job.metadata
                annotations = (metadata.annotations if metadata is not None else None) or {}
                partition = self._find_partition(annotations.get("interloper.partition"))

                if is_complete:
                    # TODO: This is not the true RunResult, we need to get it from the container?
                    # Missing the asset_executions.
                    result = RunResult(partition, ExecutionStatus.COMPLETED)
                    self.state.mark_run_completed(partition, result)
                else:
                    error_msg = f"Job {job_name} failed"
                    self.state.mark_run_failed(partition, error_msg)

                    # Try to get pod logs for debugging
                    self._print_job_logs(job_name)

                return job_name

            sleep(1.0)

    def _cancel_all(self, handles: list[str]) -> None:
        """Cancel all running jobs.

        Deletion and state updates are best effort: a failure for one job does
        not prevent the remaining jobs from being cancelled.

        Args:
            handles: List of job names to cancel
        """
        assert self._batch_v1 is not None

        for job_name in handles:
            # Stop log streaming for this job
            self._stop_job_log_streaming(job_name)

            job: V1Job | None = None
            try:
                # Get job to retrieve partition from annotations
                job = cast(
                    V1Job,
                    self._batch_v1.read_namespaced_job(name=job_name, namespace=self._namespace),
                )
                self._batch_v1.delete_namespaced_job(
                    name=job_name,
                    namespace=self._namespace,
                    body=client.V1DeleteOptions(propagation_policy="Background"),
                )
            except Exception:
                pass
            finally:
                if job is not None:
                    try:
                        metadata = job.metadata
                        annotations = (metadata.annotations if metadata is not None else None) or {}
                        # Match on the partition id, which is what the
                        # annotation was written from at submission time.
                        partition = self._find_partition(annotations.get("interloper.partition"))
                        self.state.mark_run_cancelled(partition)
                    except Exception:
                        pass

    def to_spec(self) -> BackfillerSpec:
        """Convert to serializable spec."""
        return BackfillerSpec(
            path=self.path,
            init=dict(
                image=self._image,
                namespace=self._namespace,
                max_jobs=self._max_jobs,
                env_vars=self._env_vars,
                service_account=self._service_account,
                image_pull_policy=self._image_pull_policy,
                image_pull_secrets=self._image_pull_secrets,
                resources=self._resources,
                node_selector=self._node_selector,
                tolerations=self._tolerations,
                ttl_seconds_after_finished=self._ttl_seconds_after_finished,
            ),
        )
@@ -0,0 +1,379 @@
1
+ """Kubernetes-based runner that runs each asset in its own Job.
2
+
3
+ Each submitted asset is executed inside a Kubernetes Job. To allow an asset
4
+ to resolve its upstream dependencies from IO without recomputing them, we pass
5
+ to the container a mini-DAG consisting of the target asset plus all its
6
+ upstream ancestors. The container runs the Interloper CLI with an inline
7
+ config, similar to the `DockerRunner`.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import time
13
+ from collections.abc import Callable
14
+ from typing import Any, cast
15
+
16
+ from interloper.assets.base import Asset
17
+ from interloper.cli.config import Config
18
+ from interloper.dag.base import DAG
19
+ from interloper.errors import PartitionError, RunnerError
20
+ from interloper.events.base import Event
21
+ from interloper.partitioning.base import Partition, PartitionWindow
22
+ from interloper.partitioning.time import TimePartition, TimePartitionWindow
23
+ from interloper.runners.base import Runner
24
+ from interloper.serialization.runner import RunnerSpec
25
+ from kubernetes import client, config
26
+ from kubernetes.client import V1Job
27
+
28
+
29
class KubernetesRunner(Runner[str]):
    """Execute assets as individual Kubernetes Jobs.

    For each asset, constructs a mini-DAG comprising the asset and all its
    upstream ancestors. The mini-DAG is sent to the container via inline JSON.
    Inside the container, all non-target assets are marked as
    `materializable=False` prior to execution to avoid recomputation while
    still enabling IO-based dependency resolution.

    Job names act as handles; the asset a Job belongs to is stored in the
    Job's `interloper.asset_key` annotation.
    """

    def __init__(
        self,
        image: str,
        namespace: str = "default",
        max_jobs: int = 4,
        env_vars: dict[str, str] | None = None,
        service_account: str | None = None,
        image_pull_policy: str | None = None,
        image_pull_secrets: list[str] | None = None,
        resources: dict[str, dict[str, str]] | None = None,
        node_selector: dict[str, str] | None = None,
        tolerations: list[dict[str, Any]] | None = None,
        poll_interval: float = 1.0,
        ttl_seconds_after_finished: int = 300,
        fail_fast: bool = False,
        reraise: bool = False,
        on_event: Callable[[Event], None] | None = None,
    ) -> None:
        """Initialize the KubernetesRunner.

        Args:
            image: Container image to use for job execution.
            namespace: Kubernetes namespace to create jobs in.
            max_jobs: Maximum number of concurrent jobs.
            env_vars: Environment variables to set in the container.
            service_account: Service account name to use for the job.
            image_pull_policy: Image pull policy ("Always", "IfNotPresent", or "Never").
            image_pull_secrets: List of image pull secret names.
            resources: Resource requests/limits dict with 'requests' and 'limits' keys.
            node_selector: Node selector labels for pod scheduling.
            tolerations: List of toleration dicts for pod scheduling.
            poll_interval: Interval in seconds between job status polls.
            ttl_seconds_after_finished: TTL for completed jobs cleanup.
            fail_fast: Stop execution on first failure.
            reraise: Re-raise exceptions.
            on_event: Optional event handler for lifecycle events.
        """
        super().__init__(fail_fast=fail_fast, reraise=reraise, on_event=on_event)
        self._image = image
        self._namespace = namespace
        self._max_jobs = max_jobs
        self._env_vars = env_vars or {}
        self._service_account = service_account
        self._image_pull_policy = image_pull_policy
        self._image_pull_secrets = image_pull_secrets or []
        self._resources = resources
        self._node_selector = node_selector
        self._tolerations = tolerations or []
        self._poll_interval = poll_interval
        self._ttl_seconds_after_finished = ttl_seconds_after_finished

        # API clients are created lazily in `_on_start`, once a kube config is loaded.
        self._batch_v1: client.BatchV1Api | None = None
        self._core_v1: client.CoreV1Api | None = None

    def _on_start(self) -> None:
        """Initialize the Kubernetes clients.

        The in-cluster configuration is tried first; the local kube config is
        the fallback.
        """
        try:
            config.load_incluster_config()
        except config.ConfigException:
            config.load_kube_config()

        self._batch_v1 = client.BatchV1Api()
        self._core_v1 = client.CoreV1Api()

    @property
    def _capacity(self) -> int:
        """Maximum number of concurrent jobs."""
        return self._max_jobs

    def _build_command(
        self,
        dag: DAG,
        partition_or_window: Partition | PartitionWindow | None,
        run_id: str,
    ) -> list[str]:
        """Build the command to execute in the container.

        Args:
            dag: The (mini-)DAG to serialize into the inline config
            partition_or_window: The partition or window, or `None` for an
                un-partitioned run
            run_id: The run ID

        Returns:
            Command list for the container

        Raises:
            PartitionError: If the partition/window is neither `None` nor a
                time-based partition or window.
        """
        cfg = Config(dag=dag)

        cmd = [
            "interloper",
            "run",
            "--format",
            "inline",
            f"--run-id={run_id}",
            cfg.to_json(),
        ]

        # Un-partitioned runs need no date arguments.
        if partition_or_window is None:
            return cmd

        if isinstance(partition_or_window, TimePartition):
            cmd.extend(["--date", partition_or_window.value.strftime("%Y-%m-%d")])
        elif isinstance(partition_or_window, TimePartitionWindow):
            cmd.extend(
                [
                    "--start-date",
                    partition_or_window.start.strftime("%Y-%m-%d"),
                    "--end-date",
                    partition_or_window.end.strftime("%Y-%m-%d"),
                ]
            )
        else:
            raise PartitionError("Unsupported partition or window type")
        return cmd

    def _build_env(self) -> list[client.V1EnvVar]:
        """Build the environment variables for the container."""
        return [client.V1EnvVar(name=k, value=v) for k, v in self._env_vars.items()]

    def _build_resources(self) -> client.V1ResourceRequirements | None:
        """Build the resource requirements for the container, if any were configured."""
        if not self._resources:
            return None
        return client.V1ResourceRequirements(
            requests=self._resources.get("requests"),
            limits=self._resources.get("limits"),
        )

    def _build_job_name(self, asset: Asset) -> str:
        """Build the name for the Kubernetes job.

        The name is capped at 63 characters; "." and "_" in the asset key are
        replaced by "-" and the key is lower-cased.
        """
        # K8s names must be lowercase, alphanumeric, and can contain hyphens
        safe_key = asset.instance_key.replace(".", "-").replace("_", "-").lower()
        # Truncation can cut right after a hyphen; names must end alphanumeric.
        return f"interloper-{self.state.run_id[:8]}-{safe_key}"[:63].rstrip("-")

    def _build_labels(self, asset: Asset) -> dict[str, str]:
        """Build the labels shared by the Job and its pod template."""
        # Label values are limited to 63 characters and must start and end
        # with an alphanumeric character.
        asset_label = asset.instance_key.replace(".", "-").lower()[:63].strip("-_.")
        return {
            "interloper.asset_key": asset_label,
            "interloper.run_id": self.state.run_id[:8],
        }

    def _build_tolerations(self) -> list[client.V1Toleration]:
        """Build tolerations for pod scheduling."""
        return [
            client.V1Toleration(
                key=t.get("key"),
                operator=t.get("operator", "Equal"),
                value=t.get("value"),
                effect=t.get("effect"),
            )
            for t in self._tolerations
        ]

    def _print_job_logs(self, job_name: str) -> None:
        """Print the logs of the first pod of a job, for debugging (best effort)."""
        assert self._core_v1 is not None
        try:
            pods = self._core_v1.list_namespaced_pod(
                namespace=self._namespace,
                label_selector=f"job-name={job_name}",
            )
            if not pods.items:
                return
            pod = pods.items[0]
            logs = self._core_v1.read_namespaced_pod_log(
                name=pod.metadata.name,
                namespace=self._namespace,
            )
            if logs:
                print("=============== START OF ASSET JOB LOGS ================")
                print(logs)
                print("================ END OF ASSET JOB LOGS =================")
        except Exception:
            # Logs are a debugging aid only; the pod may already be gone.
            pass

    def _submit_asset(
        self,
        asset: Asset,
        partition_or_window: Partition | PartitionWindow | None,
    ) -> str:
        """Submit execution of an asset and return the job name for completion tracking.

        IMPORTANT: this method is not calling the `_execute_asset` method of the base class.
        Therefore, the state has to be updated manually here and in `_wait_any` below.

        Args:
            asset: The asset to execute
            partition_or_window: Either a Partition or PartitionWindow object

        Returns:
            The job name (string) for the asset execution
        """
        # Build a mini-DAG: target asset + its parents (non-materializable)
        mini_dag = self.state.dag.mini_dag(asset.instance_key)

        cmd = self._build_command(mini_dag, partition_or_window, self.state.run_id)
        job_name = self._build_job_name(asset)
        env = self._build_env()
        resources = self._build_resources()
        tolerations = self._build_tolerations()
        labels = self._build_labels(asset)

        # Build container spec: first element is the executable, the rest its arguments.
        container = client.V1Container(
            name="interloper",
            image=self._image,
            image_pull_policy=self._image_pull_policy,
            command=cmd[:1],
            args=cmd[1:],
            env=env if env else None,
            resources=resources,
        )

        # Build pod spec
        pod_spec = client.V1PodSpec(
            containers=[container],
            restart_policy="Never",
            service_account_name=self._service_account,
            node_selector=self._node_selector if self._node_selector else None,
            tolerations=tolerations if tolerations else None,
            image_pull_secrets=[client.V1LocalObjectReference(name=s) for s in self._image_pull_secrets]
            if self._image_pull_secrets
            else None,
        )

        # Build job spec (backoff_limit=0 with restart_policy="Never": one attempt per Job)
        job_spec = client.V1JobSpec(
            template=client.V1PodTemplateSpec(
                metadata=client.V1ObjectMeta(labels=dict(labels)),
                spec=pod_spec,
            ),
            backoff_limit=0,
            ttl_seconds_after_finished=self._ttl_seconds_after_finished,
        )

        # Build job object; the annotation keeps the unmodified asset key so
        # the Job can be mapped back to its asset.
        job = client.V1Job(
            api_version="batch/v1",
            kind="Job",
            metadata=client.V1ObjectMeta(
                name=job_name,
                namespace=self._namespace,
                labels=dict(labels),
                annotations={
                    "interloper.asset_key": asset.instance_key,
                },
            ),
            spec=job_spec,
        )

        self.state.mark_asset_running(asset)

        # Create the job in Kubernetes
        assert self._batch_v1 is not None
        self._batch_v1.create_namespaced_job(namespace=self._namespace, body=job)

        return job_name

    def _wait_any(self, handles: list[str]) -> str:
        """Wait for any job to finish by polling.

        IMPORTANT: the `_execute_asset` method of the base class is not called by `_submit_asset`.
        Therefore, the state has to be updated manually here and in `_submit_asset` above.

        Args:
            handles: List of job names to wait for

        Returns:
            The job name that finished

        Raises:
            RunnerError: If `handles` is empty, if a finished job cannot be
                mapped back to an asset, or if a job failed while `reraise`
                or `fail_fast` is enabled.
        """
        assert self._batch_v1 is not None
        assert self._core_v1 is not None

        if not handles:
            # Polling an empty list would loop forever without ever returning.
            raise RunnerError("No Kubernetes jobs to wait for")

        while True:
            for job_name in handles:
                # Refresh job status
                updated_job = cast(
                    V1Job,
                    self._batch_v1.read_namespaced_job_status(name=job_name, namespace=self._namespace),
                )

                assert updated_job.status is not None
                status = updated_job.status
                is_complete = status.succeeded is not None and status.succeeded > 0
                is_failed = status.failed is not None and status.failed > 0

                if not (is_complete or is_failed):
                    continue

                # Map back to asset; missing metadata/annotations is reported
                # as a mapping failure below.
                metadata = updated_job.metadata
                annotations = (metadata.annotations if metadata is not None else None) or {}
                asset_key = annotations.get("interloper.asset_key")
                if asset_key is None or asset_key not in self.state.dag.asset_map:
                    raise RunnerError("Failed to map job to asset")
                asset = self.state.dag.asset_map[asset_key]

                if is_complete:
                    self.state.mark_asset_completed(asset)
                else:
                    error_msg = f"Job {job_name} failed"

                    # Try to get pod logs for debugging
                    self._print_job_logs(job_name)

                    self.state.mark_asset_failed(asset, error_msg)

                    if self._reraise or self._fail_fast:
                        raise RunnerError(error_msg)

                return job_name

            time.sleep(self._poll_interval)

    def _cancel_all(self, handles: list[str]) -> None:
        """Cancel all running jobs.

        Deletion and state updates are best effort: a failure for one job does
        not prevent the remaining jobs from being cancelled.

        Args:
            handles: List of job names to cancel
        """
        assert self._batch_v1 is not None

        for job_name in handles:
            job: V1Job | None = None
            try:
                # Get job to retrieve asset key from annotations
                job = cast(
                    V1Job,
                    self._batch_v1.read_namespaced_job(name=job_name, namespace=self._namespace),
                )
                self._batch_v1.delete_namespaced_job(
                    name=job_name,
                    namespace=self._namespace,
                    body=client.V1DeleteOptions(propagation_policy="Background"),
                )
            except Exception:
                pass
            finally:
                if job is not None:
                    try:
                        metadata = job.metadata
                        annotations = (metadata.annotations if metadata is not None else None) or {}
                        asset_key = annotations.get("interloper.asset_key")
                        if asset_key and asset_key in self.state.dag.asset_map:
                            asset = self.state.dag.asset_map[asset_key]
                            self.state.mark_asset_cancelled(asset)
                    except Exception:
                        pass

    def to_spec(self) -> RunnerSpec:
        """Convert to serializable spec."""
        return RunnerSpec(
            path=self.path,
            init=dict(
                image=self._image,
                namespace=self._namespace,
                max_jobs=self._max_jobs,
                env_vars=self._env_vars,
                service_account=self._service_account,
                image_pull_policy=self._image_pull_policy,
                image_pull_secrets=self._image_pull_secrets,
                resources=self._resources,
                node_selector=self._node_selector,
                tolerations=self._tolerations,
                poll_interval=self._poll_interval,
                ttl_seconds_after_finished=self._ttl_seconds_after_finished,
                fail_fast=self._fail_fast,
                reraise=self._reraise,
            ),
        )
@@ -0,0 +1,14 @@
1
+ Metadata-Version: 2.3
2
+ Name: interloper-k8s
3
+ Version: 0.2.0
4
+ Summary: Interloper Kubernetes integration
5
+ Author: Guillaume Onfroy
6
+ Author-email: Guillaume Onfroy <guillaume@digitlcloud.com>
7
+ Requires-Dist: kubernetes>=31.0.0
8
+ Requires-Dist: interloper-core
9
+ Requires-Python: >=3.10
10
+ Description-Content-Type: text/markdown
11
+
12
+ # interloper-k8s
13
+
14
+ Kubernetes integration for Interloper.
@@ -0,0 +1,6 @@
1
+ interloper_k8s/__init__.py,sha256=471cXCLCZ9UIjiqqEtxLxIlele6zvpPgdZvf4UIWVUg,249
2
+ interloper_k8s/backfiller.py,sha256=skBf4kDGjXVJDW_L0ldUwu9DyYEKT0-bE8fslkx4AJI,19921
3
+ interloper_k8s/runner.py,sha256=8_1BJ42iCiPgHrhYCxyza-4Ys7A9b0s7tMV96Dwpdw0,15103
4
+ interloper_k8s-0.2.0.dist-info/WHEEL,sha256=01-mvBXsCWcapci73Y4TRTWrxqv9JijDtCFiicuPHXE,80
5
+ interloper_k8s-0.2.0.dist-info/METADATA,sha256=wyElv2Nto0MRIoIMXlmFxh9kpQuk38zCtNAZo-iYwAI,372
6
+ interloper_k8s-0.2.0.dist-info/RECORD,,
@@ -0,0 +1,4 @@
1
+ Wheel-Version: 1.0
2
+ Generator: uv 0.10.8
3
+ Root-Is-Purelib: true
4
+ Tag: py3-none-any