interloper-docker 0.2.0__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {interloper_docker-0.2.0 → interloper_docker-0.3.0}/PKG-INFO +2 -4
- interloper_docker-0.3.0/README.md +0 -0
- {interloper_docker-0.2.0 → interloper_docker-0.3.0}/pyproject.toml +5 -10
- {interloper_docker-0.2.0 → interloper_docker-0.3.0}/src/interloper_docker/__init__.py +2 -2
- interloper_docker-0.3.0/src/interloper_docker/launcher.py +165 -0
- interloper_docker-0.3.0/src/interloper_docker/runner.py +343 -0
- interloper_docker-0.2.0/README.md +0 -3
- interloper_docker-0.2.0/src/interloper_docker/backfiller.py +0 -336
- interloper_docker-0.2.0/src/interloper_docker/runner.py +0 -253
|
@@ -1,14 +1,12 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: interloper-docker
|
|
3
|
-
Version: 0.2.0
|
|
3
|
+
Version: 0.3.0
|
|
4
4
|
Summary: Interloper Docker integration
|
|
5
5
|
Author: Guillaume Onfroy
|
|
6
6
|
Author-email: Guillaume Onfroy <guillaume@digitlcloud.com>
|
|
7
7
|
Requires-Dist: docker>=7.1.0
|
|
8
8
|
Requires-Dist: interloper-core
|
|
9
|
+
Requires-Dist: interloper-scheduler
|
|
9
10
|
Requires-Python: >=3.10
|
|
10
11
|
Description-Content-Type: text/markdown
|
|
11
12
|
|
|
12
|
-
# interloper-docker
|
|
13
|
-
|
|
14
|
-
Docker execution support for Interloper.
|
|
File without changes
|
|
@@ -3,24 +3,20 @@
|
|
|
3
3
|
# ###############
|
|
4
4
|
[project]
|
|
5
5
|
name = "interloper-docker"
|
|
6
|
-
version = "0.2.0"
|
|
6
|
+
version = "0.3.0"
|
|
7
7
|
description = "Interloper Docker integration"
|
|
8
8
|
readme = "README.md"
|
|
9
9
|
authors = [{ name = "Guillaume Onfroy", email = "guillaume@digitlcloud.com" }]
|
|
10
10
|
requires-python = ">=3.10"
|
|
11
|
-
dependencies = [
|
|
12
|
-
"docker>=7.1.0",
|
|
13
|
-
"interloper-core",
|
|
14
|
-
]
|
|
15
|
-
|
|
16
|
-
[project.optional-dependencies]
|
|
11
|
+
dependencies = ["docker>=7.1.0", "interloper-core", "interloper-scheduler"]
|
|
17
12
|
|
|
18
13
|
[build-system]
|
|
19
|
-
requires = ["uv_build>=0.
|
|
14
|
+
requires = ["uv_build>=0.11.5,<0.12"]
|
|
20
15
|
build-backend = "uv_build"
|
|
21
16
|
|
|
22
17
|
[tool.uv.sources]
|
|
23
18
|
interloper-core = { workspace = true }
|
|
19
|
+
interloper-scheduler = { workspace = true }
|
|
24
20
|
|
|
25
21
|
# ###############
|
|
26
22
|
# RUFF
|
|
@@ -33,7 +29,6 @@ extend-select = ["E", "I", "UP", "ANN001", "ANN201", "ANN202"]
|
|
|
33
29
|
|
|
34
30
|
[tool.ruff.lint.per-file-ignores]
|
|
35
31
|
"__init__.py" = ["F401", "F403"]
|
|
36
|
-
"**/schemas/**" = ["E501"]
|
|
37
32
|
"tests/**" = ["ANN", "F811"]
|
|
38
33
|
|
|
39
34
|
# ###############
|
|
@@ -43,4 +38,4 @@ extend-select = ["E", "I", "UP", "ANN001", "ANN201", "ANN202"]
|
|
|
43
38
|
include = ["src"]
|
|
44
39
|
typeCheckingMode = "basic"
|
|
45
40
|
reportMissingParameterType = true
|
|
46
|
-
ignore = ["
|
|
41
|
+
ignore = ["tests/**"]
|
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
"""Interloper Docker integration for container-based asset execution."""
|
|
2
2
|
|
|
3
|
-
from interloper_docker.backfiller import DockerBackfiller
|
|
3
|
+
from interloper_docker.launcher import DockerLauncher
|
|
4
4
|
from interloper_docker.runner import DockerRunner
|
|
5
5
|
|
|
6
6
|
__all__ = [
|
|
7
|
-
"
|
|
7
|
+
"DockerLauncher",
|
|
8
8
|
"DockerRunner",
|
|
9
9
|
]
|
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
"""Docker launcher: runs each job in its own container."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import json
|
|
6
|
+
import logging
|
|
7
|
+
import os
|
|
8
|
+
from typing import Any
|
|
9
|
+
from uuid import UUID
|
|
10
|
+
|
|
11
|
+
import docker
|
|
12
|
+
from interloper.catalog.base import Catalog
|
|
13
|
+
from interloper_scheduler.launcher import Launcher, RunState, RunStatus
|
|
14
|
+
|
|
15
|
+
logger = logging.getLogger(__name__)
|
|
16
|
+
|
|
17
|
+
# TODO: Implement a grace period to check if the container is running before returning in order to avoid
|
|
18
|
+
# stale run statuses due to container startup errors?
|
|
19
|
+
|
|
20
|
+
# TODO: `launch` command should support --catalog option to pass the catalog as a serialized string
|
|
21
|
+
# Then use this instead of INTERLOPER_CATALOG
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class DockerLauncher(Launcher):
|
|
25
|
+
"""Launches each run in its own Docker container.
|
|
26
|
+
|
|
27
|
+
The container executes the ``interloper launch <run_id>`` CLI command,
|
|
28
|
+
which hydrates the DAG from the database and runs it to completion.
|
|
29
|
+
|
|
30
|
+
Postgres connection parameters are passed as plain values. The caller
|
|
31
|
+
(``_build_launcher``) injects the app-level defaults; any overrides
|
|
32
|
+
from the launcher YAML config take precedence.
|
|
33
|
+
"""
|
|
34
|
+
|
|
35
|
+
def __init__(
|
|
36
|
+
self,
|
|
37
|
+
catalog: Catalog,
|
|
38
|
+
postgres_host: str,
|
|
39
|
+
postgres_port: int,
|
|
40
|
+
postgres_user: str,
|
|
41
|
+
postgres_password: str,
|
|
42
|
+
postgres_database: str,
|
|
43
|
+
image: str = "interloper:latest-scheduler",
|
|
44
|
+
runner_type: str = "multi_thread",
|
|
45
|
+
runner_config: dict[str, Any] | None = None,
|
|
46
|
+
volumes: dict[str, dict[str, str]] | None = None,
|
|
47
|
+
) -> None:
|
|
48
|
+
"""Initialize the Docker launcher.
|
|
49
|
+
|
|
50
|
+
Args:
|
|
51
|
+
catalog: Catalog to inject into the container so it builds an identical catalog.
|
|
52
|
+
postgres_host: Postgres host to inject into the container.
|
|
53
|
+
postgres_port: Postgres port to inject into the container.
|
|
54
|
+
postgres_user: Postgres user to inject into the container.
|
|
55
|
+
postgres_password: Postgres password to inject into the container.
|
|
56
|
+
postgres_database: Postgres database to inject into the container.
|
|
57
|
+
image: Docker image to use.
|
|
58
|
+
runner_type: Runner type name forwarded to the container.
|
|
59
|
+
runner_config: Runner-specific kwargs forwarded to the container.
|
|
60
|
+
volumes: Volume mounts for the container. When
|
|
61
|
+
``runner_type`` is ``"docker"``, the Docker socket is
|
|
62
|
+
mounted automatically if not already included.
|
|
63
|
+
"""
|
|
64
|
+
super().__init__(runner_type=runner_type, runner_config=runner_config)
|
|
65
|
+
self._client = docker.from_env()
|
|
66
|
+
self._catalog = catalog
|
|
67
|
+
self._image = image
|
|
68
|
+
self._postgres_host = postgres_host
|
|
69
|
+
self._postgres_port = postgres_port
|
|
70
|
+
self._postgres_user = postgres_user
|
|
71
|
+
self._postgres_password = postgres_password
|
|
72
|
+
self._postgres_database = postgres_database
|
|
73
|
+
self._volumes = dict(volumes or {})
|
|
74
|
+
if runner_type == "docker" and "/var/run/docker.sock" not in self._volumes:
|
|
75
|
+
self._volumes["/var/run/docker.sock"] = {"bind": "/var/run/docker.sock", "mode": "rw"}
|
|
76
|
+
|
|
77
|
+
def launch(self, run_id: UUID) -> None:
|
|
78
|
+
"""Start a container that executes a single run.
|
|
79
|
+
|
|
80
|
+
Args:
|
|
81
|
+
run_id: The run UUID to execute.
|
|
82
|
+
"""
|
|
83
|
+
environment = self._build_environment()
|
|
84
|
+
container_name = f"interloper_run_{str(run_id)[:8]}"
|
|
85
|
+
|
|
86
|
+
try:
|
|
87
|
+
container = self._client.containers.run(
|
|
88
|
+
image=self._image,
|
|
89
|
+
name=container_name,
|
|
90
|
+
command=["interloper", "launch", str(run_id)],
|
|
91
|
+
environment=environment,
|
|
92
|
+
volumes=self._volumes if self._volumes else None,
|
|
93
|
+
user="root" if self._runner_type == "docker" else None,
|
|
94
|
+
detach=True,
|
|
95
|
+
auto_remove=False,
|
|
96
|
+
labels={"interloper.run_id": str(run_id)},
|
|
97
|
+
)
|
|
98
|
+
logger.info("Started container %s for run %s", container.short_id, run_id)
|
|
99
|
+
except Exception:
|
|
100
|
+
logger.exception("Failed to start container for run %s", run_id)
|
|
101
|
+
raise
|
|
102
|
+
|
|
103
|
+
def describe_run(self, run_id: UUID) -> RunState:
|
|
104
|
+
"""Return the authoritative state of a run's container.
|
|
105
|
+
|
|
106
|
+
Used by the reaper to catch failed runs as soon as the
|
|
107
|
+
container terminates, without waiting for the fallback timeout.
|
|
108
|
+
|
|
109
|
+
Args:
|
|
110
|
+
run_id: The run UUID to describe.
|
|
111
|
+
|
|
112
|
+
Returns:
|
|
113
|
+
A :class:`RunState` indicating whether the container is
|
|
114
|
+
still running, has succeeded, has failed, or is gone.
|
|
115
|
+
"""
|
|
116
|
+
container_name = f"interloper_run_{str(run_id)[:8]}"
|
|
117
|
+
try:
|
|
118
|
+
container = self._client.containers.get(container_name)
|
|
119
|
+
except Exception:
|
|
120
|
+
return RunState(status=RunStatus.NOT_FOUND)
|
|
121
|
+
|
|
122
|
+
try:
|
|
123
|
+
container.reload()
|
|
124
|
+
except Exception:
|
|
125
|
+
return RunState(status=RunStatus.NOT_FOUND)
|
|
126
|
+
|
|
127
|
+
state = container.attrs.get("State", {}) if container.attrs else {}
|
|
128
|
+
docker_status = (state.get("Status") or container.status or "").lower()
|
|
129
|
+
|
|
130
|
+
# "running", "created", "restarting", "paused" — still alive
|
|
131
|
+
if docker_status in ("running", "created", "restarting", "paused"):
|
|
132
|
+
return RunState(status=RunStatus.RUNNING)
|
|
133
|
+
|
|
134
|
+
# Terminal states: "exited", "dead", "removing"
|
|
135
|
+
exit_code = state.get("ExitCode")
|
|
136
|
+
if exit_code == 0:
|
|
137
|
+
return RunState(status=RunStatus.SUCCEEDED)
|
|
138
|
+
|
|
139
|
+
# Anything else is a failure
|
|
140
|
+
parts = [f"Container {container.short_id} status={docker_status}"]
|
|
141
|
+
if exit_code is not None:
|
|
142
|
+
parts.append(f"exit_code={exit_code}")
|
|
143
|
+
if state.get("OOMKilled"):
|
|
144
|
+
parts.append("OOMKilled")
|
|
145
|
+
error = state.get("Error") or ""
|
|
146
|
+
if error:
|
|
147
|
+
parts.append(f"error={error}")
|
|
148
|
+
return RunState(status=RunStatus.FAILED, error=" ".join(parts))
|
|
149
|
+
|
|
150
|
+
def _build_environment(self) -> dict[str, str]:
|
|
151
|
+
"""Build environment variables for the container."""
|
|
152
|
+
environment: dict[str, str] = {
|
|
153
|
+
"INTERLOPER_POSTGRES_HOST": self._postgres_host,
|
|
154
|
+
"INTERLOPER_POSTGRES_PORT": str(self._postgres_port),
|
|
155
|
+
"INTERLOPER_POSTGRES_USER": self._postgres_user,
|
|
156
|
+
"INTERLOPER_POSTGRES_PASSWORD": self._postgres_password,
|
|
157
|
+
"INTERLOPER_POSTGRES_DATABASE": self._postgres_database,
|
|
158
|
+
"INTERLOPER_CATALOG": json.dumps(self._catalog.to_paths()),
|
|
159
|
+
"INTERLOPER_RUNNER_TYPE": self._runner_type,
|
|
160
|
+
"INTERLOPER_RUNNER_CONFIG": json.dumps(self._runner_config),
|
|
161
|
+
}
|
|
162
|
+
encryption_key = os.environ.get("SECRETS_ENCRYPTION_KEY")
|
|
163
|
+
if encryption_key:
|
|
164
|
+
environment["SECRETS_ENCRYPTION_KEY"] = encryption_key
|
|
165
|
+
return environment
|
|
@@ -0,0 +1,343 @@
|
|
|
1
|
+
"""Docker-based runner that executes each asset in its own container.
|
|
2
|
+
|
|
3
|
+
Each submitted asset runs inside a fresh container. A mini-DAG comprising
|
|
4
|
+
the target asset and all its upstream ancestors is sent to the container
|
|
5
|
+
via inline JSON so the asset can resolve its upstream dependencies from IO
|
|
6
|
+
without recomputing them.
|
|
7
|
+
|
|
8
|
+
**Real-time events** are streamed via **stderr** using the ``@EVENT:``
|
|
9
|
+
prefix (see :class:`~interloper.events.StderrEventHandler`). Events for
|
|
10
|
+
the target asset are forwarded to the host EventBus; events for
|
|
11
|
+
non-materializable parent assets and container-internal ``RUN_*`` events
|
|
12
|
+
are dropped. The host updates internal state with ``emit=False`` to
|
|
13
|
+
avoid duplicate events.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import json
|
|
19
|
+
import logging
|
|
20
|
+
import threading
|
|
21
|
+
from concurrent.futures import Future, ThreadPoolExecutor
|
|
22
|
+
from typing import Any
|
|
23
|
+
|
|
24
|
+
import docker
|
|
25
|
+
from docker.client import DockerClient
|
|
26
|
+
from docker.models.containers import Container
|
|
27
|
+
from interloper.asset.base import Asset
|
|
28
|
+
from interloper.errors import RunnerError
|
|
29
|
+
from interloper.events import EventBus, EventType
|
|
30
|
+
from interloper.events.event import parse_event_from_log_line
|
|
31
|
+
from interloper.partitioning.base import Partition, PartitionWindow
|
|
32
|
+
from interloper.partitioning.time import TimePartition, TimePartitionWindow
|
|
33
|
+
from interloper.runner.sync_runner import SyncRunner
|
|
34
|
+
from pydantic import Field, PrivateAttr
|
|
35
|
+
|
|
36
|
+
logger = logging.getLogger(__name__)
|
|
37
|
+
|
|
38
|
+
# Events emitted by the container's inner run — not forwarded to the host
|
|
39
|
+
# because the host manages its own run lifecycle.
|
|
40
|
+
_RUN_EVENTS = frozenset(
|
|
41
|
+
{
|
|
42
|
+
EventType.RUN_STARTED,
|
|
43
|
+
EventType.RUN_COMPLETED,
|
|
44
|
+
EventType.RUN_FAILED,
|
|
45
|
+
}
|
|
46
|
+
)
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class DockerRunner(SyncRunner):
|
|
50
|
+
"""Execute assets as individual Docker containers.
|
|
51
|
+
|
|
52
|
+
For each asset, constructs a mini-DAG comprising the asset and all its
|
|
53
|
+
upstream ancestors. The mini-DAG is sent to the container via inline JSON.
|
|
54
|
+
Inside the container, all non-target assets are marked as
|
|
55
|
+
``materializable=False`` to avoid recomputation while still enabling
|
|
56
|
+
IO-based dependency resolution.
|
|
57
|
+
|
|
58
|
+
Events are emitted by the container process and streamed to the host
|
|
59
|
+
via stderr. The host forwards them to its EventBus (for persistence
|
|
60
|
+
by the scheduler) and updates internal state silently (``emit=False``)
|
|
61
|
+
to avoid duplicate events.
|
|
62
|
+
|
|
63
|
+
Fully synchronous::
|
|
64
|
+
|
|
65
|
+
with DockerRunner(image="my-image", on_event=log_event) as runner:
|
|
66
|
+
result = runner.run(dag)
|
|
67
|
+
"""
|
|
68
|
+
|
|
69
|
+
image: str = "interloper:latest-worker"
|
|
70
|
+
max_containers: int = 4
|
|
71
|
+
env_vars: dict[str, str] = Field(default_factory=dict)
|
|
72
|
+
volumes: dict[str, dict[str, str]] | list[str] = Field(default_factory=dict)
|
|
73
|
+
fail_fast: bool = False
|
|
74
|
+
reraise: bool = False
|
|
75
|
+
auto_remove: bool = True
|
|
76
|
+
|
|
77
|
+
_docker: DockerClient = PrivateAttr()
|
|
78
|
+
_poll_pool: ThreadPoolExecutor | None = PrivateAttr(default=None)
|
|
79
|
+
_log_threads: dict[str, threading.Thread] = PrivateAttr(default_factory=dict)
|
|
80
|
+
_stop_log_streaming: threading.Event = PrivateAttr(default_factory=threading.Event)
|
|
81
|
+
_container_map: dict[Future[Any], Container] = PrivateAttr(default_factory=dict)
|
|
82
|
+
|
|
83
|
+
def model_post_init(self, context: Any) -> None:
|
|
84
|
+
"""Initialize Docker client after model initialization."""
|
|
85
|
+
super().model_post_init(context)
|
|
86
|
+
self._docker = docker.from_env()
|
|
87
|
+
|
|
88
|
+
# ------------------------------------------------------------------
|
|
89
|
+
# Scheduling primitives
|
|
90
|
+
# ------------------------------------------------------------------
|
|
91
|
+
|
|
92
|
+
@property
|
|
93
|
+
def _capacity(self) -> int:
|
|
94
|
+
"""Maximum number of concurrent containers."""
|
|
95
|
+
return self.max_containers
|
|
96
|
+
|
|
97
|
+
def _on_start(self) -> None:
|
|
98
|
+
"""Create the polling thread pool."""
|
|
99
|
+
self._stop_log_streaming.clear()
|
|
100
|
+
self._poll_pool = ThreadPoolExecutor(max_workers=self.max_containers)
|
|
101
|
+
|
|
102
|
+
def _on_end(self) -> None:
|
|
103
|
+
"""Shut down log streaming and the polling pool."""
|
|
104
|
+
self._stop_log_streaming.set()
|
|
105
|
+
for thread in self._log_threads.values():
|
|
106
|
+
thread.join(timeout=2.0)
|
|
107
|
+
self._log_threads.clear()
|
|
108
|
+
if self._poll_pool is not None:
|
|
109
|
+
self._poll_pool.shutdown(wait=True, cancel_futures=False)
|
|
110
|
+
self._poll_pool = None
|
|
111
|
+
self._container_map.clear()
|
|
112
|
+
|
|
113
|
+
def _submit_asset(
|
|
114
|
+
self,
|
|
115
|
+
asset: Asset,
|
|
116
|
+
partition_or_window: Partition | PartitionWindow | None,
|
|
117
|
+
) -> Future[Any]:
|
|
118
|
+
"""Launch an asset in a Docker container and return a polling Future.
|
|
119
|
+
|
|
120
|
+
Returns:
|
|
121
|
+
A Future that raises :class:`RunnerError` on container failure.
|
|
122
|
+
"""
|
|
123
|
+
if self._poll_pool is None:
|
|
124
|
+
raise RunnerError("Poll pool not initialized")
|
|
125
|
+
|
|
126
|
+
mini_dag = self.state.dag.mini_dag(asset.id)
|
|
127
|
+
dag_spec = mini_dag.to_spec().model_dump(mode="json")
|
|
128
|
+
|
|
129
|
+
cmd = self._build_command(dag_spec, partition_or_window, self.state.run_id)
|
|
130
|
+
name = self._build_name(asset)
|
|
131
|
+
env = self._build_env()
|
|
132
|
+
volumes = self._build_volumes()
|
|
133
|
+
|
|
134
|
+
# emit=False: the container emits ASSET_STARTED via stderr
|
|
135
|
+
self.state.mark_asset_running(asset, emit=False)
|
|
136
|
+
|
|
137
|
+
try:
|
|
138
|
+
container = self._docker.containers.run(
|
|
139
|
+
image=self.image,
|
|
140
|
+
name=name,
|
|
141
|
+
command=cmd,
|
|
142
|
+
environment=env,
|
|
143
|
+
volumes=volumes if volumes else None,
|
|
144
|
+
labels={"interloper.asset_id": asset.id},
|
|
145
|
+
remove=False,
|
|
146
|
+
detach=True,
|
|
147
|
+
stdout=True,
|
|
148
|
+
stderr=True,
|
|
149
|
+
)
|
|
150
|
+
except Exception as e:
|
|
151
|
+
# Container never started — emit from the host
|
|
152
|
+
self.state.mark_asset_failed(asset, str(e))
|
|
153
|
+
done: Future[None] = Future()
|
|
154
|
+
done.set_result(None)
|
|
155
|
+
return done
|
|
156
|
+
|
|
157
|
+
self._start_log_streaming(container, target_asset_id=asset.id)
|
|
158
|
+
|
|
159
|
+
future = self._poll_pool.submit(self._poll_container, container)
|
|
160
|
+
self._container_map[future] = container
|
|
161
|
+
return future
|
|
162
|
+
|
|
163
|
+
def _handle_completed(self, future: Future[Any], asset: Asset) -> None:
|
|
164
|
+
"""Process a completed container future and clean up.
|
|
165
|
+
|
|
166
|
+
Updates internal state silently (``emit=False``) because the
|
|
167
|
+
container already emitted the real events. Assets already in a
|
|
168
|
+
terminal state (e.g. marked failed during ``_submit_asset``) are
|
|
169
|
+
skipped.
|
|
170
|
+
"""
|
|
171
|
+
container = self._container_map.pop(future, None)
|
|
172
|
+
if container is not None:
|
|
173
|
+
self._stop_container_log_streaming(container)
|
|
174
|
+
|
|
175
|
+
info = self.state.asset_executions.get(asset.id)
|
|
176
|
+
if not (info and info.is_terminal):
|
|
177
|
+
try:
|
|
178
|
+
future.result()
|
|
179
|
+
except Exception as e:
|
|
180
|
+
self.state.mark_asset_failed(asset, str(e), emit=False)
|
|
181
|
+
if self.fail_fast or self.reraise:
|
|
182
|
+
raise
|
|
183
|
+
else:
|
|
184
|
+
self.state.mark_asset_completed(asset, emit=False)
|
|
185
|
+
|
|
186
|
+
if container is not None and self.auto_remove:
|
|
187
|
+
try:
|
|
188
|
+
container.remove()
|
|
189
|
+
except Exception: # noqa: BLE001, S110
|
|
190
|
+
pass
|
|
191
|
+
|
|
192
|
+
def _handle_flushed_future(self, future: Future[Any], asset: Asset) -> None:
|
|
193
|
+
"""Clean up container after flush."""
|
|
194
|
+
container = self._container_map.pop(future, None)
|
|
195
|
+
if container is not None:
|
|
196
|
+
self._stop_container_log_streaming(container)
|
|
197
|
+
|
|
198
|
+
info = self.state.asset_executions.get(asset.id)
|
|
199
|
+
if not (info and info.is_terminal):
|
|
200
|
+
try:
|
|
201
|
+
future.result()
|
|
202
|
+
except Exception as e: # noqa: BLE001
|
|
203
|
+
self.state.mark_asset_failed(asset, str(e), emit=False)
|
|
204
|
+
else:
|
|
205
|
+
self.state.mark_asset_completed(asset, emit=False)
|
|
206
|
+
|
|
207
|
+
if container is not None and self.auto_remove:
|
|
208
|
+
try:
|
|
209
|
+
container.remove()
|
|
210
|
+
except Exception: # noqa: BLE001, S110
|
|
211
|
+
pass
|
|
212
|
+
|
|
213
|
+
# ------------------------------------------------------------------
|
|
214
|
+
# Container polling
|
|
215
|
+
# ------------------------------------------------------------------
|
|
216
|
+
|
|
217
|
+
def _poll_container(self, container: Container) -> None:
|
|
218
|
+
"""Block until the container exits; raise on failure.
|
|
219
|
+
|
|
220
|
+
Raises:
|
|
221
|
+
RunnerError: If the container exits with a non-zero code.
|
|
222
|
+
"""
|
|
223
|
+
result = container.wait()
|
|
224
|
+
status_code = result.get("StatusCode", 1)
|
|
225
|
+
if status_code != 0:
|
|
226
|
+
cid = (container.id or "unknown")[:12]
|
|
227
|
+
raise RunnerError(f"Container {cid} exited with code {status_code}")
|
|
228
|
+
|
|
229
|
+
# ------------------------------------------------------------------
|
|
230
|
+
# Command and environment builders
|
|
231
|
+
# ------------------------------------------------------------------
|
|
232
|
+
|
|
233
|
+
def _build_command(
|
|
234
|
+
self,
|
|
235
|
+
dag_spec: dict[str, Any],
|
|
236
|
+
partition_or_window: Partition | PartitionWindow | None,
|
|
237
|
+
run_id: str,
|
|
238
|
+
) -> list[str]:
|
|
239
|
+
"""Build the CLI command for asset execution in a container."""
|
|
240
|
+
cmd = [
|
|
241
|
+
"interloper",
|
|
242
|
+
"run",
|
|
243
|
+
"--format",
|
|
244
|
+
"inline",
|
|
245
|
+
f"--run-id={run_id}",
|
|
246
|
+
json.dumps(dag_spec),
|
|
247
|
+
]
|
|
248
|
+
|
|
249
|
+
if isinstance(partition_or_window, TimePartition):
|
|
250
|
+
cmd.extend(["--date", partition_or_window.value.strftime("%Y-%m-%d")])
|
|
251
|
+
elif isinstance(partition_or_window, TimePartitionWindow):
|
|
252
|
+
cmd.extend(
|
|
253
|
+
[
|
|
254
|
+
"--start-date",
|
|
255
|
+
partition_or_window.start.strftime("%Y-%m-%d"),
|
|
256
|
+
"--end-date",
|
|
257
|
+
partition_or_window.end.strftime("%Y-%m-%d"),
|
|
258
|
+
]
|
|
259
|
+
)
|
|
260
|
+
|
|
261
|
+
return cmd
|
|
262
|
+
|
|
263
|
+
def _build_env(self) -> dict[str, str]:
|
|
264
|
+
"""Build the environment variables for the container."""
|
|
265
|
+
env = dict(self.env_vars)
|
|
266
|
+
env["INTERLOPER_EVENTS_TO_STDERR"] = "true"
|
|
267
|
+
return env
|
|
268
|
+
|
|
269
|
+
def _build_volumes(self) -> dict[str, dict[str, str]]:
|
|
270
|
+
"""Build the volume mounts for the container."""
|
|
271
|
+
volumes: dict[str, dict[str, str]] = {}
|
|
272
|
+
if isinstance(self.volumes, dict):
|
|
273
|
+
volumes.update(self.volumes)
|
|
274
|
+
elif isinstance(self.volumes, list):
|
|
275
|
+
for volume in self.volumes:
|
|
276
|
+
parts = volume.split(":")
|
|
277
|
+
volumes[parts[0]] = {"bind": parts[1], "mode": "rw"}
|
|
278
|
+
return volumes
|
|
279
|
+
|
|
280
|
+
def _build_name(self, asset: Asset) -> str:
|
|
281
|
+
"""Build the name for the container."""
|
|
282
|
+
return f"interloper_run_{self.state.run_id[:8]}_{asset.id[:8]}"
|
|
283
|
+
|
|
284
|
+
# ------------------------------------------------------------------
|
|
285
|
+
# Real-time event streaming (stderr)
|
|
286
|
+
# ------------------------------------------------------------------
|
|
287
|
+
|
|
288
|
+
def _start_log_streaming(self, container: Container, *, target_asset_id: str) -> None:
|
|
289
|
+
"""Stream events from the container's stderr to the host EventBus.
|
|
290
|
+
|
|
291
|
+
Only events belonging to the **target asset** are forwarded.
|
|
292
|
+
Events for non-materializable parent assets in the mini-DAG and
|
|
293
|
+
container-internal ``RUN_*`` events are dropped.
|
|
294
|
+
|
|
295
|
+
Args:
|
|
296
|
+
container: The Docker container to stream from.
|
|
297
|
+
target_asset_id: Only forward events with this ``asset_id``.
|
|
298
|
+
"""
|
|
299
|
+
cid = (container.id or "???")[:12]
|
|
300
|
+
|
|
301
|
+
def stream_logs() -> None:
|
|
302
|
+
buf = ""
|
|
303
|
+
try:
|
|
304
|
+
for chunk in container.logs(stream=True, follow=True, stdout=False, stderr=True):
|
|
305
|
+
if self._stop_log_streaming.is_set():
|
|
306
|
+
break
|
|
307
|
+
buf += chunk.decode("utf-8", errors="ignore")
|
|
308
|
+
while "\n" in buf:
|
|
309
|
+
line, buf = buf.split("\n", 1)
|
|
310
|
+
line = line.rstrip()
|
|
311
|
+
if not line:
|
|
312
|
+
continue
|
|
313
|
+
try:
|
|
314
|
+
event = parse_event_from_log_line(line)
|
|
315
|
+
if event is not None:
|
|
316
|
+
if event.type in _RUN_EVENTS:
|
|
317
|
+
continue
|
|
318
|
+
event_asset_id = event.metadata.get("asset_id")
|
|
319
|
+
if event_asset_id and event_asset_id != target_asset_id:
|
|
320
|
+
continue
|
|
321
|
+
EventBus.emit(event.type, metadata=event.metadata)
|
|
322
|
+
continue
|
|
323
|
+
except Exception: # noqa: BLE001, S110
|
|
324
|
+
pass
|
|
325
|
+
logger.debug("[container %s] %s", cid, line)
|
|
326
|
+
except Exception: # noqa: BLE001, S110
|
|
327
|
+
pass
|
|
328
|
+
buf = buf.rstrip()
|
|
329
|
+
if buf:
|
|
330
|
+
logger.debug("[container %s] %s", cid, buf)
|
|
331
|
+
|
|
332
|
+
thread = threading.Thread(target=stream_logs, daemon=True)
|
|
333
|
+
thread.start()
|
|
334
|
+
if container.id is not None:
|
|
335
|
+
self._log_threads[container.id] = thread
|
|
336
|
+
|
|
337
|
+
def _stop_container_log_streaming(self, container: Container) -> None:
|
|
338
|
+
"""Stop and clean up the log streaming thread for a container."""
|
|
339
|
+
if container.id is None:
|
|
340
|
+
return
|
|
341
|
+
thread = self._log_threads.pop(container.id, None)
|
|
342
|
+
if thread is not None:
|
|
343
|
+
thread.join(timeout=1.0)
|
|
@@ -1,336 +0,0 @@
|
|
|
1
|
-
"""Docker Backfiller implementation for Interloper.
|
|
2
|
-
|
|
3
|
-
This backfiller starts a Docker container and invokes the Interloper CLI inside it
|
|
4
|
-
using an inline JSON config. It runs the entire DAG in the container, delegating
|
|
5
|
-
asset scheduling to the configured backfiller in the inline config (typically in_process).
|
|
6
|
-
"""
|
|
7
|
-
|
|
8
|
-
from __future__ import annotations
|
|
9
|
-
|
|
10
|
-
import threading
|
|
11
|
-
from collections.abc import Callable
|
|
12
|
-
from time import sleep
|
|
13
|
-
|
|
14
|
-
import docker
|
|
15
|
-
from docker.errors import NotFound
|
|
16
|
-
from docker.models.containers import Container
|
|
17
|
-
from interloper.backfillers.base import Backfiller
|
|
18
|
-
from interloper.cli.config import Config
|
|
19
|
-
from interloper.dag.base import DAG
|
|
20
|
-
from interloper.errors import PartitionError
|
|
21
|
-
from interloper.events.base import Event, EventBus, parse_event_from_log_line
|
|
22
|
-
from interloper.partitioning.base import Partition, PartitionWindow
|
|
23
|
-
from interloper.partitioning.time import TimePartition, TimePartitionWindow
|
|
24
|
-
from interloper.runners.base import Runner
|
|
25
|
-
from interloper.runners.results import ExecutionStatus, RunResult
|
|
26
|
-
from interloper.serialization.backfiller import BackfillerSpec
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
class DockerBackfiller(Backfiller[Container]):
|
|
30
|
-
"""Run an Interloper DAG inside a Docker container via the Interloper CLI.
|
|
31
|
-
|
|
32
|
-
The image must contain the `interloper` package (CLI available on PATH).
|
|
33
|
-
"""
|
|
34
|
-
|
|
35
|
-
def __init__(
|
|
36
|
-
self,
|
|
37
|
-
image: str,
|
|
38
|
-
env_vars: dict[str, str] | None = None,
|
|
39
|
-
max_containers: int = 1,
|
|
40
|
-
runner: Runner | None = None,
|
|
41
|
-
volumes: dict[str, dict[str, str]] | list[str] | None = None,
|
|
42
|
-
dind: bool = False,
|
|
43
|
-
on_event: Callable[[Event], None] | None = None,
|
|
44
|
-
) -> None:
|
|
45
|
-
"""Initialize the DockerBackfiller.
|
|
46
|
-
|
|
47
|
-
Args:
|
|
48
|
-
image: Docker image to use
|
|
49
|
-
env_vars: Environment variables to pass to the container
|
|
50
|
-
max_containers: Maximum number of concurrent containers (default 1)
|
|
51
|
-
runner: Runner to use for running assets
|
|
52
|
-
volumes: Volume mounts for the container
|
|
53
|
-
dind: If True, mount the Docker socket to enable Docker-in-Docker
|
|
54
|
-
on_event: Optional event handler for lifecycle events
|
|
55
|
-
"""
|
|
56
|
-
super().__init__(runner=runner, on_event=on_event)
|
|
57
|
-
|
|
58
|
-
# Force the runner to re-raise exceptions to make sure the container's exit code is propagated.
|
|
59
|
-
self.runner._reraise = True
|
|
60
|
-
|
|
61
|
-
self._image = image
|
|
62
|
-
self._env_vars = env_vars or {}
|
|
63
|
-
self._max_containers = max_containers
|
|
64
|
-
self._volumes = volumes or {}
|
|
65
|
-
self._dind = dind
|
|
66
|
-
self._docker = docker.from_env()
|
|
67
|
-
|
|
68
|
-
# Track log streaming threads for cleanup
|
|
69
|
-
self._log_threads: dict[str, threading.Thread] = {}
|
|
70
|
-
self._stop_log_streaming = threading.Event()
|
|
71
|
-
|
|
72
|
-
@property
|
|
73
|
-
def _capacity(self) -> int:
|
|
74
|
-
"""Maximum number of concurrent containers."""
|
|
75
|
-
return self._max_containers
|
|
76
|
-
|
|
77
|
-
def _on_start(self) -> None:
|
|
78
|
-
self._stop_log_streaming.clear()
|
|
79
|
-
|
|
80
|
-
def _on_end(self) -> None:
|
|
81
|
-
# Signal all log streaming threads to stop
|
|
82
|
-
self._stop_log_streaming.set()
|
|
83
|
-
|
|
84
|
-
# Wait for threads to finish
|
|
85
|
-
for thread in self._log_threads.values():
|
|
86
|
-
thread.join(timeout=2.0)
|
|
87
|
-
self._log_threads.clear()
|
|
88
|
-
|
|
89
|
-
def _build_command(
|
|
90
|
-
self,
|
|
91
|
-
dag: DAG,
|
|
92
|
-
partition_or_window: Partition | PartitionWindow | None,
|
|
93
|
-
backfill_id: str,
|
|
94
|
-
) -> list[str]:
|
|
95
|
-
"""Build the CLI command for a partition.
|
|
96
|
-
|
|
97
|
-
Args:
|
|
98
|
-
dag: The DAG to execute
|
|
99
|
-
partition_or_window: The partition or window
|
|
100
|
-
backfill_id: The backfill ID
|
|
101
|
-
|
|
102
|
-
Returns:
|
|
103
|
-
Command list for the container
|
|
104
|
-
"""
|
|
105
|
-
config = Config(dag=dag, runner=self.runner)
|
|
106
|
-
|
|
107
|
-
cmd = [
|
|
108
|
-
"interloper",
|
|
109
|
-
"run",
|
|
110
|
-
"--format=inline",
|
|
111
|
-
f"--backfill-id={backfill_id}",
|
|
112
|
-
config.to_json(),
|
|
113
|
-
]
|
|
114
|
-
|
|
115
|
-
if partition_or_window is None:
|
|
116
|
-
return cmd
|
|
117
|
-
|
|
118
|
-
if isinstance(partition_or_window, TimePartition):
|
|
119
|
-
cmd.extend(["--date", partition_or_window.value.strftime("%Y-%m-%d")])
|
|
120
|
-
elif isinstance(partition_or_window, TimePartitionWindow):
|
|
121
|
-
cmd.extend(
|
|
122
|
-
[
|
|
123
|
-
"--start-date",
|
|
124
|
-
partition_or_window.start.strftime("%Y-%m-%d"),
|
|
125
|
-
"--end-date",
|
|
126
|
-
partition_or_window.end.strftime("%Y-%m-%d"),
|
|
127
|
-
]
|
|
128
|
-
)
|
|
129
|
-
else:
|
|
130
|
-
raise PartitionError("Unsupported partition or window type")
|
|
131
|
-
return cmd
|
|
132
|
-
|
|
133
|
-
def _build_env(self) -> dict[str, str]:
|
|
134
|
-
"""Build the environment variables for the container."""
|
|
135
|
-
env = dict(self._env_vars)
|
|
136
|
-
# Enable log-based event streaming
|
|
137
|
-
env["INTERLOPER_EVENTS_TO_STDERR"] = "true"
|
|
138
|
-
return env
|
|
139
|
-
|
|
140
|
-
def _build_volumes(self) -> dict[str, dict[str, str]]:
|
|
141
|
-
"""Build the volume mounts for the container."""
|
|
142
|
-
volumes = {}
|
|
143
|
-
if isinstance(self._volumes, dict):
|
|
144
|
-
volumes.update(self._volumes)
|
|
145
|
-
elif isinstance(self._volumes, list):
|
|
146
|
-
for volume in self._volumes:
|
|
147
|
-
volumes[volume.split(":")[0]] = {"bind": volume.split(":")[1], "mode": "rw"}
|
|
148
|
-
if self._dind:
|
|
149
|
-
volumes["/var/run/docker.sock"] = {"bind": "/var/run/docker.sock", "mode": "rw"}
|
|
150
|
-
return volumes
|
|
151
|
-
|
|
152
|
-
def _build_name(self, partition_or_window: Partition | PartitionWindow | None) -> str:
    """Derive the container name for a run.

    The name is ``interloper_backfill_`` followed by the first 8 characters
    of ``self.state.backfill_id`` and, when a partition or window is given,
    ``-`` plus its string form. Colons and underscores are then turned into
    dashes and the whole name is lower-cased.
    """
    suffix = "" if partition_or_window is None else f"-{partition_or_window}"
    raw_name = f"interloper_backfill_{self.state.backfill_id[:8]}{suffix}"
    dashed = raw_name.translate(str.maketrans({":": "-", "_": "-"}))
    return dashed.lower()
|
|
158
|
-
|
|
159
|
-
def _start_log_streaming(self, container: Container) -> None:
    """Start a background thread to stream logs and parse events from a container.

    The thread follows the container's combined stdout/stderr stream, decodes
    each chunk as UTF-8 (undecodable bytes are dropped), passes it to
    ``parse_event_from_log_line`` and emits any resulting event on the
    ``EventBus`` singleton. Streaming stops when the stream ends or when
    ``self._stop_log_streaming`` is set.

    Failures never propagate out of the thread: they are reported at debug
    level on this module's logger instead of being discarded silently.

    Args:
        container: The Docker container to stream logs from
    """
    # Local import keeps this method self-contained with respect to the
    # module's import block.
    import logging

    logger = logging.getLogger(__name__)

    def stream_logs() -> None:
        try:
            # Stream logs from the container (both stdout and stderr)
            for log_line in container.logs(stream=True, follow=True, stdout=True, stderr=True):
                if self._stop_log_streaming.is_set():
                    break

                try:
                    line = log_line.decode("utf-8", errors="ignore")
                    event = parse_event_from_log_line(line)
                    if event is not None:
                        EventBus.get_instance().emit(event)
                except Exception:
                    # One bad line must not end the stream; keep going.
                    logger.debug(
                        "Failed to process a log line from container %s",
                        container.id,
                        exc_info=True,
                    )
        except Exception:
            # Container may have been removed or stopped
            logger.debug(
                "Log streaming for container %s ended with an error",
                container.id,
                exc_info=True,
            )

    thread = threading.Thread(target=stream_logs, daemon=True)
    thread.start()
    # Only containers with an id can be looked up again later.
    if container.id is not None:
        self._log_threads[container.id] = thread
|
|
189
|
-
|
|
190
|
-
def _stop_container_log_streaming(self, container: Container) -> None:
    """Unregister the log thread of ``container`` and wait briefly for it.

    Nothing happens when the container has no id or no thread is registered
    under that id.

    Args:
        container: The Docker container to stop streaming for
    """
    container_id = container.id
    if container_id is not None:
        worker = self._log_threads.pop(container_id, None)
        if worker is not None:
            # The thread is expected to finish once the container exits;
            # the join is capped at one second so this call cannot hang.
            worker.join(timeout=1.0)
|
|
202
|
-
|
|
203
|
-
def _submit_run(
    self,
    dag: DAG,
    partition_or_window: Partition | PartitionWindow | None,
) -> Container:
    """Launch one run inside a detached Docker container.

    The run is marked as running on ``self.state`` before the container is
    created. The partition (or window) is attached to the returned container
    object so that it can be read back later, and log streaming is started
    for it.

    Args:
        dag: The DAG to execute
        partition_or_window: Either a Partition or PartitionWindow object

    Returns:
        The container as the handle
    """
    command = self._build_command(dag, partition_or_window, self.state.backfill_id)
    environment = self._build_env()
    mounts = self._build_volumes()
    container_name = self._build_name(partition_or_window)

    self.state.mark_run_running(partition_or_window)

    run_options = {
        "image": self._image,
        "name": container_name,
        "command": command,
        "environment": environment,
        "volumes": mounts or None,
        # The container is kept after exit and removed explicitly later.
        "remove": False,
        "detach": True,
        "stdout": True,
        "stderr": True,
    }
    container = self._docker.containers.run(**run_options)

    # Remember which partition this container belongs to.
    container._interloper_partition = partition_or_window

    # Events are collected from the container's log stream.
    self._start_log_streaming(container)

    return container
|
|
242
|
-
|
|
243
|
-
def _wait_any(self, handles: list[Container]) -> Container:
    """Wait for any container to complete by polling.

    Every container is reloaded in turn; the first one whose status is
    ``"exited"`` or ``"dead"`` is processed: its log streaming is stopped,
    its exit code is read, the run is marked completed (exit code 0) or
    failed on ``self.state``, and the container is removed. When no
    container has finished, the loop sleeps for one second and polls again.

    Args:
        handles: List of container objects to wait for

    Returns:
        The container that completed

    Raises:
        ValueError: If ``handles`` is empty. Without this check the loop
            would never find a finished container and never return.
    """
    if not handles:
        raise ValueError("_wait_any() requires at least one container to wait for")

    while True:
        for container in handles:
            container.reload()

            if container.status not in ("exited", "dead"):
                continue

            # Stop log streaming for this container
            self._stop_container_log_streaming(container)

            exit_info = container.wait()
            # A missing status code is treated as a failure.
            status_code = exit_info.get("StatusCode", 1)

            # Get partition from container object
            partition = getattr(container, "_interloper_partition", None)

            if status_code == 0:
                # TODO: This is not the true RunResult, we need to get it from the container?
                # Missing the asset_executions.
                run_result = RunResult(partition, ExecutionStatus.COMPLETED)
                self.state.mark_run_completed(partition, run_result)
            else:
                self.state.mark_run_failed(partition, f"Container exited with code {status_code}")

                # Dump the container output to help diagnose the failure.
                try:
                    logs = container.logs(stdout=True, stderr=True)
                    if logs:
                        print("=============== START OF RUN CONTAINER LOGS ==================")
                        print(logs.decode("utf-8", errors="ignore"))
                        print("================ END OF RUN CONTAINER LOGS ===================")
                except Exception as e:
                    # Best effort: the failure is already recorded in the state.
                    print(f"Error fetching logs of container {container.id}: {e}")

            # Remove the container after processing
            try:
                container.remove()
            except Exception as e:
                print(f"Error removing container {container.id}: {e}")

            return container

        sleep(1.0)
|
|
293
|
-
|
|
294
|
-
def _cancel_all(self, handles: list[Container]) -> None:
    """Best-effort cancellation of outstanding containers.

    For each container, log streaming is stopped and the container is asked
    to stop (5 second timeout). If stopping fails for a reason other than
    the container being gone, it is killed instead. The run is marked as
    cancelled on ``self.state`` whenever the container ends up stopped,
    killed or already removed; it is left untouched when both stop and kill
    fail.

    Args:
        handles: List of container objects to cancel
    """
    for container in handles:
        partition = getattr(container, "_interloper_partition", None)

        # Stop log streaming for this container
        self._stop_container_log_streaming(container)

        try:
            container.stop(timeout=5)
        except NotFound:
            # Container already removed, mark as cancelled
            if partition is not None:
                self.state.mark_run_cancelled(partition)
        except Exception:
            try:
                container.kill()
            except NotFound:
                # Container already removed
                # NOTE(review): unlike the other paths this one has no
                # `partition is not None` guard; kept as-is, confirm intent.
                self.state.mark_run_cancelled(partition)
            except Exception:
                # Neither stop nor kill worked; leave the run state alone.
                pass
            else:
                # The kill succeeded, so the run is cancelled. This case was
                # previously never recorded.
                if partition is not None:
                    self.state.mark_run_cancelled(partition)
        else:
            # Only mark as cancelled if we successfully stopped/killed
            if partition is not None:
                self.state.mark_run_cancelled(partition)
|
|
324
|
-
|
|
325
|
-
def to_spec(self) -> BackfillerSpec:
    """Describe this backfiller as a ``BackfillerSpec``.

    The spec carries ``self.path`` together with the keyword arguments
    (image, env_vars, volumes, max_containers, dind) held by this instance.
    """
    init_kwargs = {
        "image": self._image,
        "env_vars": self._env_vars,
        "volumes": self._volumes,
        "max_containers": self._max_containers,
        "dind": self._dind,
    }
    return BackfillerSpec(path=self.path, init=init_kwargs)
|
|
@@ -1,253 +0,0 @@
|
|
|
1
|
-
"""Docker-based runner that runs each asset in its own container.
|
|
2
|
-
|
|
3
|
-
Each submitted asset is executed inside a fresh container. To allow an asset
|
|
4
|
-
to resolve its upstream dependencies from IO without recomputing them, we pass
|
|
5
|
-
to the container a mini-DAG consisting of the target asset plus all its
|
|
6
|
-
upstream ancestors. The container runs the Interloper CLI with an inline
|
|
7
|
-
config, similar to the `DockerBackfiller`.
|
|
8
|
-
"""
|
|
9
|
-
|
|
10
|
-
from __future__ import annotations
|
|
11
|
-
|
|
12
|
-
from collections.abc import Callable
|
|
13
|
-
|
|
14
|
-
import docker
|
|
15
|
-
from docker.models.containers import Container
|
|
16
|
-
from interloper.assets.base import Asset
|
|
17
|
-
from interloper.cli.config import Config
|
|
18
|
-
from interloper.dag.base import DAG
|
|
19
|
-
from interloper.errors import PartitionError, RunnerError
|
|
20
|
-
from interloper.events.base import Event
|
|
21
|
-
from interloper.partitioning.base import Partition, PartitionWindow
|
|
22
|
-
from interloper.partitioning.time import TimePartition, TimePartitionWindow
|
|
23
|
-
from interloper.runners.base import Runner
|
|
24
|
-
from interloper.serialization.runner import RunnerSpec
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
class DockerRunner(Runner[Container]):
    """Execute assets as individual Docker containers.

    For each asset, constructs a mini-DAG comprising the asset and all its
    upstream ancestors. The mini-DAG is sent to the container via inline JSON.
    Inside the container, all non-target assets are marked as
    `materializable=False` prior to execution to avoid recomputation while
    still enabling IO-based dependency resolution.
    """

    def __init__(
        self,
        image: str,
        max_containers: int = 4,
        env_vars: dict[str, str] | None = None,
        volumes: dict[str, dict[str, str]] | list[str] | None = None,
        fail_fast: bool = False,
        reraise: bool = False,
        on_event: Callable[[Event], None] | None = None,
    ) -> None:
        """Initialize the DockerRunner.

        Args:
            image: Docker image to use for container execution.
            max_containers: Maximum number of concurrent containers.
            env_vars: Environment variables to pass to the container.
            volumes: Volume mounts for the container, either as a mapping
                ``{host: {"bind": container, "mode": mode}}`` or as a list of
                ``"host:container[:mode]"`` strings.
            fail_fast: Stop execution on first failure.
            reraise: Re-raise exceptions.
            on_event: Optional event handler for lifecycle events.
        """
        super().__init__(fail_fast=fail_fast, reraise=reraise, on_event=on_event)
        self._image = image
        self._max_containers = max_containers
        self._env_vars = env_vars or {}
        self._volumes = volumes or {}
        self._docker = docker.from_env()

    @property
    def _capacity(self) -> int:
        """Return the configured ``max_containers`` limit."""
        return self._max_containers

    def _build_command(
        self,
        dag: DAG,
        partition_or_window: Partition | PartitionWindow | None,
        run_id: str,
    ) -> list[str]:
        """Build the CLI command for asset execution in a container.

        Args:
            dag: The DAG to execute.
            partition_or_window: The partition or window, or ``None`` for an
                unpartitioned run (no date arguments are added in that case).
            run_id: The run ID.

        Returns:
            Command list for the container.

        Raises:
            PartitionError: If ``partition_or_window`` is neither ``None``, a
                ``TimePartition`` nor a ``TimePartitionWindow``.
        """
        config = Config(dag=dag)

        cmd = [
            "interloper",
            "run",
            "--format",
            "inline",
            f"--run-id={run_id}",
            config.to_json(),
        ]

        # The signature allows ``None``; previously it fell through to the
        # "unsupported type" error below.
        if partition_or_window is None:
            return cmd

        if isinstance(partition_or_window, TimePartition):
            cmd.extend(["--date", partition_or_window.value.strftime("%Y-%m-%d")])
        elif isinstance(partition_or_window, TimePartitionWindow):
            cmd.extend(
                [
                    "--start-date",
                    partition_or_window.start.strftime("%Y-%m-%d"),
                    "--end-date",
                    partition_or_window.end.strftime("%Y-%m-%d"),
                ]
            )
        else:
            raise PartitionError("Unsupported partition or window type")
        return cmd

    def _build_env(self) -> dict[str, str]:
        """Build the environment variables for the container (a copy of ``env_vars``)."""
        return dict(self._env_vars)

    def _build_volumes(self) -> dict[str, dict[str, str]]:
        """Build the volume mounts for the container.

        A dict value of ``self._volumes`` is copied as-is. A list value is
        parsed entry by entry as ``"host:container"`` or
        ``"host:container:mode"``; the mode defaults to ``"rw"`` when it is
        missing or empty.

        Raises:
            IndexError: If a list entry contains no ``":"`` separator.
        """
        volumes: dict[str, dict[str, str]] = {}
        if isinstance(self._volumes, dict):
            volumes.update(self._volumes)
        elif isinstance(self._volumes, list):
            for volume in self._volumes:
                # Honour an explicit third field (e.g. "ro") instead of
                # forcing every list entry to "rw".
                parts = volume.split(":")
                mode = parts[2] if len(parts) > 2 and parts[2] else "rw"
                volumes[parts[0]] = {"bind": parts[1], "mode": mode}
        return volumes

    def _build_name(self, asset: Asset) -> str:
        """Build the name for the container.

        The name combines the first 8 characters of the run ID with the
        asset's instance key; colons and underscores become dashes and the
        result is lower-cased.
        """
        name = f"interloper_run_{self.state.run_id[:8]}-{asset.instance_key}"
        return name.replace(":", "-").replace("_", "-").lower()

    def _submit_asset(
        self,
        asset: Asset,
        partition_or_window: Partition | PartitionWindow | None,
    ) -> Container:
        """Submit execution of an asset and return the container object for completion tracking.

        IMPORTANT: this method is not calling the `_execute_asset` method of the base class.
        Therefore, the state has to be updated manually here and in `_wait_any` below.

        Args:
            asset: The asset to execute
            partition_or_window: Either a Partition or PartitionWindow object

        Returns:
            The container object for the asset execution
        """
        # Build a mini-DAG: target asset + its parents (non-materializable)
        mini_dag = self.state.dag.mini_dag(asset.instance_key)

        cmd = self._build_command(mini_dag, partition_or_window, self.state.run_id)
        name = self._build_name(asset)
        env = self._build_env()
        volumes = self._build_volumes()

        self.state.mark_asset_running(asset)

        container = self._docker.containers.run(
            image=self._image,
            name=name,
            command=cmd,
            environment=env,
            volumes=volumes if volumes else None,
            # The label is how `_wait_any` and `_cancel_all` find the asset again.
            labels={"interloper.asset_key": asset.instance_key},
            remove=False,
            detach=True,
            stdout=True,
            stderr=True,
        )

        return container

    def _wait_any(self, handles: list[Container]) -> Container:
        """Wait for any container to finish by polling.

        IMPORTANT: the `_execute_asset` method of the base class is not called by `_submit_asset`.
        Therefore, the state has to be updated manually here and in `_submit_asset` above.

        Args:
            handles: List of container objects to wait for

        Returns:
            The container object that finished

        Raises:
            ValueError: If ``handles`` is empty (polling could never finish).
            RunnerError: If a finished container cannot be mapped back to an
                asset, or if it exited with a non-zero code while
                ``reraise`` or ``fail_fast`` is enabled.
        """
        # Imported here because the module does not import `time` at file level.
        from time import sleep

        if not handles:
            raise ValueError("_wait_any() requires at least one container to wait for")

        while True:
            for container in handles:
                container.reload()

                if container.status not in ("exited", "dead"):
                    continue

                exit_info = container.wait()
                # A missing status code is treated as a failure.
                status_code = exit_info.get("StatusCode", 1)

                # Map back to asset
                asset: Asset | None = None
                asset_key = container.labels.get("interloper.asset_key")
                if asset_key and asset_key in self.state.dag.asset_map:
                    asset = self.state.dag.asset_map[asset_key]
                if asset is None:
                    raise RunnerError("Failed to map container to asset")

                failure: RunnerError | None = None
                if status_code == 0:
                    self.state.mark_asset_completed(asset)
                else:
                    self.state.mark_asset_failed(asset, f"Container {container.id} exited with code {status_code}")

                    # Dump the container output to help diagnose the failure.
                    try:
                        logs = container.logs(stdout=True, stderr=True)
                        if logs:
                            print("=============== START OF ASSET CONTAINER LOGS ================")
                            print(logs.decode("utf-8", errors="ignore"))
                            print("================ END OF ASSET CONTAINER LOGS =================")
                    except Exception as e:
                        # Best effort: the failure is already recorded in the state.
                        print(f"Error fetching logs of container {container.id}: {e}")

                    if self._reraise or self._fail_fast:
                        # Raised only after cleanup so the container is not leaked.
                        failure = RunnerError(f"Container {container.id} exited with code {status_code}")

                # Remove the container after processing
                try:
                    container.remove()
                except Exception as e:
                    print(f"Error removing container {container.id}: {e}")

                if failure is not None:
                    raise failure

                return container

            # Pause between polling rounds instead of hammering the Docker API.
            sleep(1.0)

    def _cancel_all(self, handles: list[Container]) -> None:
        """Best-effort cancellation of outstanding containers.

        Each container is asked to stop (2 second timeout) and killed if that
        fails. Whatever the outcome, the asset named by the container's
        ``interloper.asset_key`` label is marked as cancelled; containers
        whose label is missing or unknown are skipped.

        Args:
            handles: List of container objects to cancel
        """
        for container in handles:
            try:
                container.stop(timeout=2)
            except Exception:
                try:
                    container.kill()
                except Exception:
                    # Nothing more can be done for this container.
                    pass
            finally:
                asset_key = container.labels.get("interloper.asset_key")
                # Guard the lookup: a KeyError raised here would abort the
                # cancellation of the remaining containers.
                if asset_key and asset_key in self.state.dag.asset_map:
                    self.state.mark_asset_cancelled(self.state.dag.asset_map[asset_key])

    def to_spec(self) -> RunnerSpec:
        """Convert to a serializable spec holding ``self.path`` and the init arguments."""
        return RunnerSpec(
            path=self.path,
            init=dict(
                image=self._image,
                max_containers=self._max_containers,
                env_vars=self._env_vars,
                volumes=self._volumes,
                fail_fast=self._fail_fast,
                reraise=self._reraise,
            ),
        )
|