flyte 0.2.0b1__py3-none-any.whl → 2.0.0b46__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- flyte/__init__.py +83 -30
- flyte/_bin/connect.py +61 -0
- flyte/_bin/debug.py +38 -0
- flyte/_bin/runtime.py +87 -19
- flyte/_bin/serve.py +351 -0
- flyte/_build.py +3 -2
- flyte/_cache/cache.py +6 -5
- flyte/_cache/local_cache.py +216 -0
- flyte/_code_bundle/_ignore.py +31 -5
- flyte/_code_bundle/_packaging.py +42 -11
- flyte/_code_bundle/_utils.py +57 -34
- flyte/_code_bundle/bundle.py +130 -27
- flyte/_constants.py +1 -0
- flyte/_context.py +21 -5
- flyte/_custom_context.py +73 -0
- flyte/_debug/constants.py +37 -0
- flyte/_debug/utils.py +17 -0
- flyte/_debug/vscode.py +315 -0
- flyte/_deploy.py +396 -75
- flyte/_deployer.py +109 -0
- flyte/_environment.py +94 -11
- flyte/_excepthook.py +37 -0
- flyte/_group.py +2 -1
- flyte/_hash.py +1 -16
- flyte/_image.py +544 -231
- flyte/_initialize.py +456 -316
- flyte/_interface.py +40 -5
- flyte/_internal/controllers/__init__.py +22 -8
- flyte/_internal/controllers/_local_controller.py +159 -35
- flyte/_internal/controllers/_trace.py +18 -10
- flyte/_internal/controllers/remote/__init__.py +38 -9
- flyte/_internal/controllers/remote/_action.py +82 -12
- flyte/_internal/controllers/remote/_client.py +6 -2
- flyte/_internal/controllers/remote/_controller.py +290 -64
- flyte/_internal/controllers/remote/_core.py +155 -95
- flyte/_internal/controllers/remote/_informer.py +40 -20
- flyte/_internal/controllers/remote/_service_protocol.py +2 -2
- flyte/_internal/imagebuild/__init__.py +2 -10
- flyte/_internal/imagebuild/docker_builder.py +391 -84
- flyte/_internal/imagebuild/image_builder.py +111 -55
- flyte/_internal/imagebuild/remote_builder.py +409 -0
- flyte/_internal/imagebuild/utils.py +79 -0
- flyte/_internal/resolvers/_app_env_module.py +92 -0
- flyte/_internal/resolvers/_task_module.py +5 -38
- flyte/_internal/resolvers/app_env.py +26 -0
- flyte/_internal/resolvers/common.py +8 -1
- flyte/_internal/resolvers/default.py +2 -2
- flyte/_internal/runtime/convert.py +319 -36
- flyte/_internal/runtime/entrypoints.py +106 -18
- flyte/_internal/runtime/io.py +71 -23
- flyte/_internal/runtime/resources_serde.py +21 -7
- flyte/_internal/runtime/reuse.py +125 -0
- flyte/_internal/runtime/rusty.py +196 -0
- flyte/_internal/runtime/task_serde.py +239 -66
- flyte/_internal/runtime/taskrunner.py +48 -8
- flyte/_internal/runtime/trigger_serde.py +162 -0
- flyte/_internal/runtime/types_serde.py +7 -16
- flyte/_keyring/file.py +115 -0
- flyte/_link.py +30 -0
- flyte/_logging.py +241 -42
- flyte/_map.py +312 -0
- flyte/_metrics.py +59 -0
- flyte/_module.py +74 -0
- flyte/_pod.py +30 -0
- flyte/_resources.py +296 -33
- flyte/_retry.py +1 -7
- flyte/_reusable_environment.py +72 -7
- flyte/_run.py +462 -132
- flyte/_secret.py +47 -11
- flyte/_serve.py +333 -0
- flyte/_task.py +245 -56
- flyte/_task_environment.py +219 -97
- flyte/_task_plugins.py +47 -0
- flyte/_tools.py +8 -8
- flyte/_trace.py +15 -24
- flyte/_trigger.py +1027 -0
- flyte/_utils/__init__.py +12 -1
- flyte/_utils/asyn.py +3 -1
- flyte/_utils/async_cache.py +139 -0
- flyte/_utils/coro_management.py +5 -4
- flyte/_utils/description_parser.py +19 -0
- flyte/_utils/docker_credentials.py +173 -0
- flyte/_utils/helpers.py +45 -19
- flyte/_utils/module_loader.py +123 -0
- flyte/_utils/org_discovery.py +57 -0
- flyte/_utils/uv_script_parser.py +8 -1
- flyte/_version.py +16 -3
- flyte/app/__init__.py +27 -0
- flyte/app/_app_environment.py +362 -0
- flyte/app/_connector_environment.py +40 -0
- flyte/app/_deploy.py +130 -0
- flyte/app/_parameter.py +343 -0
- flyte/app/_runtime/__init__.py +3 -0
- flyte/app/_runtime/app_serde.py +383 -0
- flyte/app/_types.py +113 -0
- flyte/app/extras/__init__.py +9 -0
- flyte/app/extras/_auth_middleware.py +217 -0
- flyte/app/extras/_fastapi.py +93 -0
- flyte/app/extras/_model_loader/__init__.py +3 -0
- flyte/app/extras/_model_loader/config.py +7 -0
- flyte/app/extras/_model_loader/loader.py +288 -0
- flyte/cli/__init__.py +12 -0
- flyte/cli/_abort.py +28 -0
- flyte/cli/_build.py +114 -0
- flyte/cli/_common.py +493 -0
- flyte/cli/_create.py +371 -0
- flyte/cli/_delete.py +45 -0
- flyte/cli/_deploy.py +401 -0
- flyte/cli/_gen.py +316 -0
- flyte/cli/_get.py +446 -0
- flyte/cli/_option.py +33 -0
- flyte/{_cli → cli}/_params.py +57 -17
- flyte/cli/_plugins.py +209 -0
- flyte/cli/_prefetch.py +292 -0
- flyte/cli/_run.py +690 -0
- flyte/cli/_serve.py +338 -0
- flyte/cli/_update.py +86 -0
- flyte/cli/_user.py +20 -0
- flyte/cli/main.py +246 -0
- flyte/config/__init__.py +2 -167
- flyte/config/_config.py +215 -163
- flyte/config/_internal.py +10 -1
- flyte/config/_reader.py +225 -0
- flyte/connectors/__init__.py +11 -0
- flyte/connectors/_connector.py +330 -0
- flyte/connectors/_server.py +194 -0
- flyte/connectors/utils.py +159 -0
- flyte/errors.py +134 -2
- flyte/extend.py +24 -0
- flyte/extras/_container.py +69 -56
- flyte/git/__init__.py +3 -0
- flyte/git/_config.py +279 -0
- flyte/io/__init__.py +8 -1
- flyte/io/{structured_dataset → _dataframe}/__init__.py +32 -30
- flyte/io/{structured_dataset → _dataframe}/basic_dfs.py +75 -68
- flyte/io/{structured_dataset/structured_dataset.py → _dataframe/dataframe.py} +207 -242
- flyte/io/_dir.py +575 -113
- flyte/io/_file.py +587 -141
- flyte/io/_hashing_io.py +342 -0
- flyte/io/extend.py +7 -0
- flyte/models.py +635 -0
- flyte/prefetch/__init__.py +22 -0
- flyte/prefetch/_hf_model.py +563 -0
- flyte/remote/__init__.py +14 -3
- flyte/remote/_action.py +879 -0
- flyte/remote/_app.py +346 -0
- flyte/remote/_auth_metadata.py +42 -0
- flyte/remote/_client/_protocols.py +62 -4
- flyte/remote/_client/auth/_auth_utils.py +19 -0
- flyte/remote/_client/auth/_authenticators/base.py +8 -2
- flyte/remote/_client/auth/_authenticators/device_code.py +4 -5
- flyte/remote/_client/auth/_authenticators/factory.py +4 -0
- flyte/remote/_client/auth/_authenticators/passthrough.py +79 -0
- flyte/remote/_client/auth/_authenticators/pkce.py +17 -18
- flyte/remote/_client/auth/_channel.py +47 -18
- flyte/remote/_client/auth/_client_config.py +5 -3
- flyte/remote/_client/auth/_keyring.py +15 -2
- flyte/remote/_client/auth/_token_client.py +3 -3
- flyte/remote/_client/controlplane.py +206 -18
- flyte/remote/_common.py +66 -0
- flyte/remote/_data.py +107 -22
- flyte/remote/_logs.py +116 -33
- flyte/remote/_project.py +21 -19
- flyte/remote/_run.py +164 -631
- flyte/remote/_secret.py +72 -29
- flyte/remote/_task.py +387 -46
- flyte/remote/_trigger.py +368 -0
- flyte/remote/_user.py +43 -0
- flyte/report/_report.py +10 -6
- flyte/storage/__init__.py +13 -1
- flyte/storage/_config.py +237 -0
- flyte/storage/_parallel_reader.py +289 -0
- flyte/storage/_storage.py +268 -59
- flyte/syncify/__init__.py +56 -0
- flyte/syncify/_api.py +414 -0
- flyte/types/__init__.py +39 -0
- flyte/types/_interface.py +22 -7
- flyte/{io/pickle/transformer.py → types/_pickle.py} +37 -9
- flyte/types/_string_literals.py +8 -9
- flyte/types/_type_engine.py +226 -126
- flyte/types/_utils.py +1 -1
- flyte-2.0.0b46.data/scripts/debug.py +38 -0
- flyte-2.0.0b46.data/scripts/runtime.py +194 -0
- flyte-2.0.0b46.dist-info/METADATA +352 -0
- flyte-2.0.0b46.dist-info/RECORD +221 -0
- flyte-2.0.0b46.dist-info/entry_points.txt +8 -0
- flyte-2.0.0b46.dist-info/licenses/LICENSE +201 -0
- flyte/_api_commons.py +0 -3
- flyte/_cli/_common.py +0 -299
- flyte/_cli/_create.py +0 -42
- flyte/_cli/_delete.py +0 -23
- flyte/_cli/_deploy.py +0 -140
- flyte/_cli/_get.py +0 -235
- flyte/_cli/_run.py +0 -174
- flyte/_cli/main.py +0 -98
- flyte/_datastructures.py +0 -342
- flyte/_internal/controllers/pbhash.py +0 -39
- flyte/_protos/common/authorization_pb2.py +0 -66
- flyte/_protos/common/authorization_pb2.pyi +0 -108
- flyte/_protos/common/authorization_pb2_grpc.py +0 -4
- flyte/_protos/common/identifier_pb2.py +0 -71
- flyte/_protos/common/identifier_pb2.pyi +0 -82
- flyte/_protos/common/identifier_pb2_grpc.py +0 -4
- flyte/_protos/common/identity_pb2.py +0 -48
- flyte/_protos/common/identity_pb2.pyi +0 -72
- flyte/_protos/common/identity_pb2_grpc.py +0 -4
- flyte/_protos/common/list_pb2.py +0 -36
- flyte/_protos/common/list_pb2.pyi +0 -69
- flyte/_protos/common/list_pb2_grpc.py +0 -4
- flyte/_protos/common/policy_pb2.py +0 -37
- flyte/_protos/common/policy_pb2.pyi +0 -27
- flyte/_protos/common/policy_pb2_grpc.py +0 -4
- flyte/_protos/common/role_pb2.py +0 -37
- flyte/_protos/common/role_pb2.pyi +0 -53
- flyte/_protos/common/role_pb2_grpc.py +0 -4
- flyte/_protos/common/runtime_version_pb2.py +0 -28
- flyte/_protos/common/runtime_version_pb2.pyi +0 -24
- flyte/_protos/common/runtime_version_pb2_grpc.py +0 -4
- flyte/_protos/logs/dataplane/payload_pb2.py +0 -96
- flyte/_protos/logs/dataplane/payload_pb2.pyi +0 -168
- flyte/_protos/logs/dataplane/payload_pb2_grpc.py +0 -4
- flyte/_protos/secret/definition_pb2.py +0 -49
- flyte/_protos/secret/definition_pb2.pyi +0 -93
- flyte/_protos/secret/definition_pb2_grpc.py +0 -4
- flyte/_protos/secret/payload_pb2.py +0 -62
- flyte/_protos/secret/payload_pb2.pyi +0 -94
- flyte/_protos/secret/payload_pb2_grpc.py +0 -4
- flyte/_protos/secret/secret_pb2.py +0 -38
- flyte/_protos/secret/secret_pb2.pyi +0 -6
- flyte/_protos/secret/secret_pb2_grpc.py +0 -198
- flyte/_protos/secret/secret_pb2_grpc_grpc.py +0 -198
- flyte/_protos/validate/validate/validate_pb2.py +0 -76
- flyte/_protos/workflow/node_execution_service_pb2.py +0 -26
- flyte/_protos/workflow/node_execution_service_pb2.pyi +0 -4
- flyte/_protos/workflow/node_execution_service_pb2_grpc.py +0 -32
- flyte/_protos/workflow/queue_service_pb2.py +0 -106
- flyte/_protos/workflow/queue_service_pb2.pyi +0 -141
- flyte/_protos/workflow/queue_service_pb2_grpc.py +0 -172
- flyte/_protos/workflow/run_definition_pb2.py +0 -128
- flyte/_protos/workflow/run_definition_pb2.pyi +0 -310
- flyte/_protos/workflow/run_definition_pb2_grpc.py +0 -4
- flyte/_protos/workflow/run_logs_service_pb2.py +0 -41
- flyte/_protos/workflow/run_logs_service_pb2.pyi +0 -28
- flyte/_protos/workflow/run_logs_service_pb2_grpc.py +0 -69
- flyte/_protos/workflow/run_service_pb2.py +0 -133
- flyte/_protos/workflow/run_service_pb2.pyi +0 -175
- flyte/_protos/workflow/run_service_pb2_grpc.py +0 -412
- flyte/_protos/workflow/state_service_pb2.py +0 -58
- flyte/_protos/workflow/state_service_pb2.pyi +0 -71
- flyte/_protos/workflow/state_service_pb2_grpc.py +0 -138
- flyte/_protos/workflow/task_definition_pb2.py +0 -72
- flyte/_protos/workflow/task_definition_pb2.pyi +0 -65
- flyte/_protos/workflow/task_definition_pb2_grpc.py +0 -4
- flyte/_protos/workflow/task_service_pb2.py +0 -44
- flyte/_protos/workflow/task_service_pb2.pyi +0 -31
- flyte/_protos/workflow/task_service_pb2_grpc.py +0 -104
- flyte/io/_dataframe.py +0 -0
- flyte/io/pickle/__init__.py +0 -0
- flyte/remote/_console.py +0 -18
- flyte-0.2.0b1.dist-info/METADATA +0 -179
- flyte-0.2.0b1.dist-info/RECORD +0 -204
- flyte-0.2.0b1.dist-info/entry_points.txt +0 -3
- /flyte/{_cli → _debug}/__init__.py +0 -0
- /flyte/{_protos → _keyring}/__init__.py +0 -0
- {flyte-0.2.0b1.dist-info → flyte-2.0.0b46.dist-info}/WHEEL +0 -0
- {flyte-0.2.0b1.dist-info → flyte-2.0.0b46.dist-info}/top_level.txt +0 -0
|
@@ -1,17 +1,21 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
import asyncio
|
|
4
|
+
import os
|
|
4
5
|
import sys
|
|
5
6
|
import threading
|
|
6
7
|
from asyncio import Event
|
|
7
8
|
from typing import Awaitable, Coroutine, Optional
|
|
8
9
|
|
|
9
10
|
import grpc.aio
|
|
11
|
+
from aiolimiter import AsyncLimiter
|
|
12
|
+
from flyteidl2.common import identifier_pb2
|
|
13
|
+
from flyteidl2.task import task_definition_pb2
|
|
14
|
+
from flyteidl2.workflow import queue_service_pb2, run_definition_pb2
|
|
15
|
+
from google.protobuf.wrappers_pb2 import StringValue
|
|
10
16
|
|
|
11
17
|
import flyte.errors
|
|
12
18
|
from flyte._logging import log, logger
|
|
13
|
-
from flyte._protos.workflow import queue_service_pb2, run_definition_pb2, task_definition_pb2
|
|
14
|
-
from flyte.errors import RuntimeSystemError
|
|
15
19
|
|
|
16
20
|
from ._action import Action
|
|
17
21
|
from ._informer import InformerCache
|
|
@@ -28,11 +32,11 @@ class Controller:
|
|
|
28
32
|
def __init__(
|
|
29
33
|
self,
|
|
30
34
|
client_coro: Awaitable[ClientSet],
|
|
31
|
-
workers: int =
|
|
32
|
-
max_system_retries: int =
|
|
35
|
+
workers: int = 20,
|
|
36
|
+
max_system_retries: int = 10,
|
|
33
37
|
resource_log_interval_sec: float = 10.0,
|
|
34
|
-
min_backoff_on_err_sec: float = 0.
|
|
35
|
-
thread_wait_timeout_sec: float =
|
|
38
|
+
min_backoff_on_err_sec: float = 0.5,
|
|
39
|
+
thread_wait_timeout_sec: float = 5.0,
|
|
36
40
|
enqueue_timeout_sec: float = 5.0,
|
|
37
41
|
):
|
|
38
42
|
"""
|
|
@@ -49,14 +53,17 @@ class Controller:
|
|
|
49
53
|
self._running = False
|
|
50
54
|
self._resource_log_task = None
|
|
51
55
|
self._workers = workers
|
|
52
|
-
self._max_retries = max_system_retries
|
|
56
|
+
self._max_retries = int(os.getenv("_F_MAX_RETRIES", max_system_retries))
|
|
53
57
|
self._resource_log_interval = resource_log_interval_sec
|
|
54
58
|
self._min_backoff_on_err = min_backoff_on_err_sec
|
|
59
|
+
self._max_backoff_on_err = float(os.getenv("_F_MAX_BFF_ON_ERR", "10.0"))
|
|
55
60
|
self._thread_wait_timeout = thread_wait_timeout_sec
|
|
56
61
|
self._client_coro = client_coro
|
|
57
62
|
self._failure_event: Event | None = None
|
|
58
63
|
self._enqueue_timeout = enqueue_timeout_sec
|
|
59
64
|
self._informer_start_wait_timeout = thread_wait_timeout_sec
|
|
65
|
+
max_qps = int(os.getenv("_F_MAX_QPS", "100"))
|
|
66
|
+
self._rate_limiter = AsyncLimiter(max_qps, 1.0)
|
|
60
67
|
|
|
61
68
|
# Thread management
|
|
62
69
|
self._thread = None
|
|
@@ -79,21 +86,19 @@ class Controller:
|
|
|
79
86
|
"""Public API to submit a resource and wait for completion"""
|
|
80
87
|
return await self._run_coroutine_in_controller_thread(self._bg_submit_action(action))
|
|
81
88
|
|
|
82
|
-
async def get_action(
|
|
83
|
-
self, action_id: run_definition_pb2.ActionIdentifier, parent_action_name: str
|
|
84
|
-
) -> Optional[Action]:
|
|
89
|
+
async def get_action(self, action_id: identifier_pb2.ActionIdentifier, parent_action_name: str) -> Optional[Action]:
|
|
85
90
|
"""Get the action from the informer"""
|
|
86
|
-
|
|
87
|
-
if informer:
|
|
88
|
-
return await informer.get(action_id.name)
|
|
89
|
-
return None
|
|
91
|
+
return await self._run_coroutine_in_controller_thread(self._bg_get_action(action_id, parent_action_name))
|
|
90
92
|
|
|
91
93
|
@log
|
|
92
94
|
async def cancel_action(self, action: Action):
|
|
93
95
|
return await self._run_coroutine_in_controller_thread(self._bg_cancel_action(action))
|
|
94
96
|
|
|
95
97
|
async def _finalize_parent_action(
|
|
96
|
-
self,
|
|
98
|
+
self,
|
|
99
|
+
run_id: identifier_pb2.RunIdentifier,
|
|
100
|
+
parent_action_name: str,
|
|
101
|
+
timeout: Optional[float] = None,
|
|
97
102
|
):
|
|
98
103
|
"""Finalize the parent run"""
|
|
99
104
|
await self._run_coroutine_in_controller_thread(
|
|
@@ -111,19 +116,21 @@ class Controller:
|
|
|
111
116
|
raise RuntimeError("Failure event not initialized")
|
|
112
117
|
self._failure_event.set()
|
|
113
118
|
except asyncio.CancelledError:
|
|
114
|
-
|
|
119
|
+
raise
|
|
115
120
|
|
|
116
121
|
async def _bg_watch_for_errors(self):
|
|
117
122
|
if self._failure_event is None:
|
|
118
123
|
raise RuntimeError("Failure event not initialized")
|
|
119
124
|
await self._failure_event.wait()
|
|
120
125
|
logger.warning(f"Failure event received: {self._failure_event}, cleaning up informers and exiting.")
|
|
126
|
+
self._running = False
|
|
121
127
|
|
|
122
128
|
async def watch_for_errors(self):
|
|
123
129
|
"""Watch for errors in the background thread"""
|
|
124
130
|
await self._run_coroutine_in_controller_thread(self._bg_watch_for_errors())
|
|
125
|
-
raise RuntimeSystemError(
|
|
126
|
-
code="InformerWatchFailure",
|
|
131
|
+
raise flyte.errors.RuntimeSystemError(
|
|
132
|
+
code="InformerWatchFailure",
|
|
133
|
+
message=f"Controller thread failed with exception: {self._get_exception()}",
|
|
127
134
|
)
|
|
128
135
|
|
|
129
136
|
@log
|
|
@@ -142,7 +149,6 @@ class Controller:
|
|
|
142
149
|
with self._thread_com_lock:
|
|
143
150
|
return self._thread_exception
|
|
144
151
|
|
|
145
|
-
@log
|
|
146
152
|
def _start(self):
|
|
147
153
|
"""Start the controller in a separate thread"""
|
|
148
154
|
if self._thread and self._thread.is_alive():
|
|
@@ -155,13 +161,14 @@ class Controller:
|
|
|
155
161
|
self._thread.start()
|
|
156
162
|
|
|
157
163
|
# Wait for the thread to be ready
|
|
158
|
-
logger.info("Waiting for controller thread to be ready...")
|
|
159
164
|
if not self._thread_ready.wait(timeout=self._thread_wait_timeout):
|
|
165
|
+
logger.warning("Controller thread did not finish within timeout")
|
|
160
166
|
raise TimeoutError("Controller thread failed to start in time")
|
|
161
167
|
|
|
162
168
|
if self._get_exception():
|
|
163
|
-
raise RuntimeSystemError(
|
|
164
|
-
type(self._get_exception()).__name__,
|
|
169
|
+
raise flyte.errors.RuntimeSystemError(
|
|
170
|
+
type(self._get_exception()).__name__,
|
|
171
|
+
f"Controller thread startup failed: {self._get_exception()}",
|
|
165
172
|
)
|
|
166
173
|
|
|
167
174
|
logger.info(f"Controller started in thread: {self._thread.name}")
|
|
@@ -190,15 +197,16 @@ class Controller:
|
|
|
190
197
|
# We will wait for this to signal that the thread is ready
|
|
191
198
|
# Signal the main thread that we're ready
|
|
192
199
|
logger.debug("Background thread initialization complete")
|
|
193
|
-
self._thread_ready.set()
|
|
194
200
|
if sys.version_info >= (3, 11):
|
|
195
201
|
async with asyncio.TaskGroup() as tg:
|
|
196
202
|
for i in range(self._workers):
|
|
197
|
-
tg.create_task(self._bg_run())
|
|
203
|
+
tg.create_task(self._bg_run(f"worker-{i}"))
|
|
204
|
+
self._thread_ready.set()
|
|
198
205
|
else:
|
|
199
206
|
tasks = []
|
|
200
207
|
for i in range(self._workers):
|
|
201
|
-
tasks.append(asyncio.create_task(self._bg_run()))
|
|
208
|
+
tasks.append(asyncio.create_task(self._bg_run(f"worker-{i}")))
|
|
209
|
+
self._thread_ready.set()
|
|
202
210
|
await asyncio.gather(*tasks)
|
|
203
211
|
|
|
204
212
|
def _bg_thread_target(self):
|
|
@@ -207,6 +215,7 @@ class Controller:
|
|
|
207
215
|
# Create a new event loop for this thread
|
|
208
216
|
self._loop = asyncio.new_event_loop()
|
|
209
217
|
asyncio.set_event_loop(self._loop)
|
|
218
|
+
self._loop.set_exception_handler(flyte.errors.silence_grpc_polling_error)
|
|
210
219
|
logger.debug(f"Controller thread started with new event loop: {threading.current_thread().name}")
|
|
211
220
|
|
|
212
221
|
# Create an event to signal the errors were observed in the thread's loop
|
|
@@ -216,13 +225,14 @@ class Controller:
|
|
|
216
225
|
except Exception as e:
|
|
217
226
|
logger.error(f"Controller thread encountered an exception: {e}")
|
|
218
227
|
self._set_exception(e)
|
|
228
|
+
self._failure_event.set()
|
|
219
229
|
finally:
|
|
220
230
|
if self._loop and self._loop.is_running():
|
|
221
231
|
self._loop.close()
|
|
222
232
|
logger.debug(f"Controller thread exiting: {threading.current_thread().name}")
|
|
223
233
|
|
|
224
234
|
async def _bg_get_action(
|
|
225
|
-
self, action_id:
|
|
235
|
+
self, action_id: identifier_pb2.ActionIdentifier, parent_action_name: str
|
|
226
236
|
) -> Optional[Action]:
|
|
227
237
|
"""Get the action from the informer"""
|
|
228
238
|
# Ensure the informer is created and wait for it to be ready
|
|
@@ -239,13 +249,15 @@ class Controller:
|
|
|
239
249
|
return None
|
|
240
250
|
|
|
241
251
|
async def _bg_finalize_informer(
|
|
242
|
-
self,
|
|
252
|
+
self,
|
|
253
|
+
run_id: identifier_pb2.RunIdentifier,
|
|
254
|
+
parent_action_name: str,
|
|
255
|
+
timeout: Optional[float] = None,
|
|
243
256
|
):
|
|
244
257
|
informer = await self._informers.remove(run_name=run_id.name, parent_action_name=parent_action_name)
|
|
245
258
|
if informer:
|
|
246
259
|
await informer.stop()
|
|
247
260
|
|
|
248
|
-
@log
|
|
249
261
|
async def _bg_submit_action(self, action: Action) -> Action:
|
|
250
262
|
"""Submit a resource and await its completion, returning the final state"""
|
|
251
263
|
logger.debug(f"{threading.current_thread().name} Submitting action {action.name}")
|
|
@@ -270,7 +282,7 @@ class Controller:
|
|
|
270
282
|
raise ValueError(f"Action {action.name} not found")
|
|
271
283
|
logger.debug(f"{threading.current_thread().name} Removed completion event for action {action.name}")
|
|
272
284
|
await informer.remove(action.name) # TODO we should not remove maybe, we should keep a record of completed?
|
|
273
|
-
logger.debug(f"{threading.current_thread().name} Removed action {action.name}
|
|
285
|
+
logger.debug(f"{threading.current_thread().name} Removed action {action.name}")
|
|
274
286
|
return final_resource
|
|
275
287
|
|
|
276
288
|
async def _bg_cancel_action(self, action: Action):
|
|
@@ -284,17 +296,21 @@ class Controller:
|
|
|
284
296
|
started = action.is_started()
|
|
285
297
|
action.mark_cancelled()
|
|
286
298
|
if started:
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
299
|
+
async with self._rate_limiter:
|
|
300
|
+
logger.info(f"Cancelling action: {action.name}")
|
|
301
|
+
try:
|
|
302
|
+
await self._queue_service.AbortQueuedAction(
|
|
303
|
+
queue_service_pb2.AbortQueuedActionRequest(action_id=action.action_id),
|
|
304
|
+
wait_for_ready=True,
|
|
305
|
+
)
|
|
306
|
+
logger.info(f"Successfully cancelled action: {action.name}")
|
|
307
|
+
except grpc.aio.AioRpcError as e:
|
|
308
|
+
if e.code() in [
|
|
309
|
+
grpc.StatusCode.NOT_FOUND,
|
|
310
|
+
grpc.StatusCode.FAILED_PRECONDITION,
|
|
311
|
+
]:
|
|
312
|
+
logger.info(f"Action {action.name} not found, assumed completed or cancelled.")
|
|
313
|
+
return
|
|
298
314
|
else:
|
|
299
315
|
# If the action is not started, we have to ensure it does not get launched
|
|
300
316
|
logger.info(f"Action {action.name} is not started, no need to cancel.")
|
|
@@ -307,41 +323,72 @@ class Controller:
|
|
|
307
323
|
"""
|
|
308
324
|
Attempt to launch an action.
|
|
309
325
|
"""
|
|
310
|
-
if not action.is_started()
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
+
if not action.is_started():
|
|
327
|
+
async with self._rate_limiter:
|
|
328
|
+
task: run_definition_pb2.TaskAction | None = None
|
|
329
|
+
trace: run_definition_pb2.TraceAction | None = None
|
|
330
|
+
if action.type == "task":
|
|
331
|
+
if action.task is None:
|
|
332
|
+
raise flyte.errors.RuntimeSystemError(
|
|
333
|
+
"NoTaskSpec", "Task Spec not found, cannot launch Task Action."
|
|
334
|
+
)
|
|
335
|
+
cache_key = None
|
|
336
|
+
logger.info(f"Action {action.name} has cache version {action.cache_key}")
|
|
337
|
+
if action.cache_key:
|
|
338
|
+
cache_key = StringValue(value=action.cache_key)
|
|
339
|
+
|
|
340
|
+
task = run_definition_pb2.TaskAction(
|
|
341
|
+
id=task_definition_pb2.TaskIdentifier(
|
|
342
|
+
version=action.task.task_template.id.version,
|
|
343
|
+
org=action.task.task_template.id.org,
|
|
344
|
+
project=action.task.task_template.id.project,
|
|
345
|
+
domain=action.task.task_template.id.domain,
|
|
346
|
+
name=action.task.task_template.id.name,
|
|
326
347
|
),
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
)
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
348
|
+
spec=action.task,
|
|
349
|
+
cache_key=cache_key,
|
|
350
|
+
cluster=action.queue,
|
|
351
|
+
)
|
|
352
|
+
elif action.type == "trace":
|
|
353
|
+
trace = action.trace
|
|
354
|
+
|
|
355
|
+
logger.debug(f"Attempting to launch action: {action.name}")
|
|
356
|
+
try:
|
|
357
|
+
await self._queue_service.EnqueueAction(
|
|
358
|
+
queue_service_pb2.EnqueueActionRequest(
|
|
359
|
+
action_id=action.action_id,
|
|
360
|
+
parent_action_name=action.parent_action_name,
|
|
361
|
+
task=task,
|
|
362
|
+
trace=trace,
|
|
363
|
+
input_uri=action.inputs_uri,
|
|
364
|
+
run_output_base=action.run_output_base,
|
|
365
|
+
group=action.group.name if action.group else None,
|
|
366
|
+
# Subject is not used in the current implementation
|
|
367
|
+
),
|
|
368
|
+
wait_for_ready=True,
|
|
369
|
+
timeout=self._enqueue_timeout,
|
|
370
|
+
)
|
|
371
|
+
logger.info(f"Successfully launched action: {action.name}")
|
|
372
|
+
except grpc.aio.AioRpcError as e:
|
|
373
|
+
if e.code() == grpc.StatusCode.ALREADY_EXISTS:
|
|
374
|
+
logger.info(f"Action {action.name} already exists, continuing to monitor.")
|
|
375
|
+
return
|
|
376
|
+
if e.code() in [
|
|
377
|
+
grpc.StatusCode.FAILED_PRECONDITION,
|
|
378
|
+
grpc.StatusCode.INVALID_ARGUMENT,
|
|
379
|
+
grpc.StatusCode.NOT_FOUND,
|
|
380
|
+
]:
|
|
381
|
+
raise flyte.errors.RuntimeSystemError(
|
|
382
|
+
e.code().name, f"Precondition failed: {e.details()}"
|
|
383
|
+
) from e
|
|
384
|
+
# For all other errors, we will retry with backoff
|
|
385
|
+
logger.exception(
|
|
386
|
+
f"Failed to launch action: {action.name}, Code: {e.code()}, "
|
|
387
|
+
f"Details {e.details()} backing off..."
|
|
388
|
+
)
|
|
389
|
+
logger.debug(f"Action details: {action}")
|
|
390
|
+
raise flyte.errors.SlowDownError(f"Failed to launch action: {e.details()}") from e
|
|
343
391
|
|
|
344
|
-
@log
|
|
345
392
|
async def _bg_process(self, action: Action):
|
|
346
393
|
"""Process resource updates"""
|
|
347
394
|
logger.debug(f"Processing action: name={action.name}, started={action.is_started()}")
|
|
@@ -358,39 +405,52 @@ class Controller:
|
|
|
358
405
|
async def _bg_log_stats(self):
|
|
359
406
|
"""Periodically log resource stats if debug is enabled"""
|
|
360
407
|
while self._running:
|
|
361
|
-
async for
|
|
408
|
+
async for (
|
|
409
|
+
started,
|
|
410
|
+
pending,
|
|
411
|
+
terminal,
|
|
412
|
+
) in self._informers.count_started_pending_terminal_actions():
|
|
362
413
|
logger.info(f"Resource stats: Started={started}, Pending={pending}, Terminal={terminal}")
|
|
363
414
|
await asyncio.sleep(self._resource_log_interval)
|
|
364
415
|
|
|
365
|
-
|
|
366
|
-
async def _bg_run(self):
|
|
416
|
+
async def _bg_run(self, worker_id: str):
|
|
367
417
|
"""Run loop with resource status logging"""
|
|
418
|
+
logger.info(f"Worker {worker_id} started")
|
|
368
419
|
while self._running:
|
|
369
420
|
logger.debug(f"{threading.current_thread().name} Waiting for resource")
|
|
370
421
|
action = await self._shared_queue.get()
|
|
371
422
|
logger.debug(f"{threading.current_thread().name} Got resource {action.name}")
|
|
372
423
|
try:
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
f" crossed threshold {self._max_retries}",
|
|
384
|
-
)
|
|
385
|
-
err.__cause__ = e
|
|
386
|
-
action.set_client_error(err)
|
|
387
|
-
informer = await self._informers.get(
|
|
388
|
-
run_name=action.run_name, parent_action_name=action.parent_action_name
|
|
424
|
+
try:
|
|
425
|
+
await self._bg_process(action)
|
|
426
|
+
except flyte.errors.SlowDownError as e:
|
|
427
|
+
action.retries += 1
|
|
428
|
+
if action.retries > self._max_retries:
|
|
429
|
+
raise
|
|
430
|
+
backoff = min(self._min_backoff_on_err * (2 ** (action.retries - 1)), self._max_backoff_on_err)
|
|
431
|
+
logger.warning(
|
|
432
|
+
f"[{worker_id}] Backing off for {backoff} [retry {action.retries}/{self._max_retries}] "
|
|
433
|
+
f"on action {action.name} due to error: {e}"
|
|
389
434
|
)
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
else:
|
|
435
|
+
await asyncio.sleep(backoff)
|
|
436
|
+
logger.warning(f"[{worker_id}] Retrying action {action.name} after backoff")
|
|
393
437
|
await self._shared_queue.put(action)
|
|
438
|
+
except Exception as e:
|
|
439
|
+
logger.error(f"[{worker_id}] Error in controller loop for {action.name}: {e}")
|
|
440
|
+
err = flyte.errors.RuntimeSystemError(
|
|
441
|
+
code=type(e).__name__,
|
|
442
|
+
message=f"Controller failed, system retries {action.retries} / {self._max_retries} "
|
|
443
|
+
f"crossed threshold, for action {action.name}: {e}",
|
|
444
|
+
worker=worker_id,
|
|
445
|
+
)
|
|
446
|
+
err.__cause__ = e
|
|
447
|
+
action.set_client_error(err)
|
|
448
|
+
informer = await self._informers.get(
|
|
449
|
+
run_name=action.run_name,
|
|
450
|
+
parent_action_name=action.parent_action_name,
|
|
451
|
+
)
|
|
452
|
+
if informer:
|
|
453
|
+
await informer.fire_completion_event(action.name)
|
|
394
454
|
finally:
|
|
395
455
|
self._shared_queue.task_done()
|
|
396
456
|
|
|
@@ -5,9 +5,10 @@ from asyncio import Queue
|
|
|
5
5
|
from typing import AsyncIterator, Callable, Dict, Optional, Tuple, cast
|
|
6
6
|
|
|
7
7
|
import grpc.aio
|
|
8
|
+
from flyteidl2.common import identifier_pb2, phase_pb2
|
|
9
|
+
from flyteidl2.workflow import state_service_pb2
|
|
8
10
|
|
|
9
11
|
from flyte._logging import log, logger
|
|
10
|
-
from flyte._protos.workflow import run_definition_pb2, state_service_pb2
|
|
11
12
|
|
|
12
13
|
from ._action import Action
|
|
13
14
|
from ._service_protocol import StateService
|
|
@@ -38,12 +39,14 @@ class ActionCache:
|
|
|
38
39
|
"""
|
|
39
40
|
Add an action to the cache if it doesn't exist. This is invoked by the watch.
|
|
40
41
|
"""
|
|
41
|
-
logger.
|
|
42
|
+
logger.debug(f"Observing phase {phase_pb2.ActionPhase.Name(state.phase)} for {state.action_id.name}")
|
|
42
43
|
if state.output_uri:
|
|
43
|
-
logger.
|
|
44
|
+
logger.debug(f"Output URI: {state.output_uri}")
|
|
44
45
|
else:
|
|
45
|
-
logger.
|
|
46
|
-
|
|
46
|
+
logger.warning(
|
|
47
|
+
f"{state.action_id.name} has no output URI, in phase {phase_pb2.ActionPhase.Name(state.phase)}"
|
|
48
|
+
)
|
|
49
|
+
if state.phase == phase_pb2.ACTION_PHASE_FAILED:
|
|
47
50
|
logger.error(
|
|
48
51
|
f"Action {state.action_id.name} failed with error (msg):"
|
|
49
52
|
f" [{state.error if state.HasField('error') else None}]"
|
|
@@ -125,12 +128,14 @@ class Informer:
|
|
|
125
128
|
|
|
126
129
|
def __init__(
|
|
127
130
|
self,
|
|
128
|
-
run_id:
|
|
131
|
+
run_id: identifier_pb2.RunIdentifier,
|
|
129
132
|
parent_action_name: str,
|
|
130
133
|
shared_queue: Queue,
|
|
131
134
|
client: Optional[StateService] = None,
|
|
132
|
-
|
|
135
|
+
min_watch_backoff: float = 1.0,
|
|
136
|
+
max_watch_backoff: float = 30.0,
|
|
133
137
|
watch_conn_timeout_sec: float = 5.0,
|
|
138
|
+
max_watch_retries: int = 10,
|
|
134
139
|
):
|
|
135
140
|
self.name = self.mkname(run_name=run_id.name, parent_action_name=parent_action_name)
|
|
136
141
|
self.parent_action_name = parent_action_name
|
|
@@ -141,8 +146,10 @@ class Informer:
|
|
|
141
146
|
self._running = False
|
|
142
147
|
self._watch_task: asyncio.Task | None = None
|
|
143
148
|
self._ready = asyncio.Event()
|
|
144
|
-
self.
|
|
149
|
+
self._min_watch_backoff = min_watch_backoff
|
|
150
|
+
self._max_watch_backoff = max_watch_backoff
|
|
145
151
|
self._watch_conn_timeout_sec = watch_conn_timeout_sec
|
|
152
|
+
self._max_watch_retries = max_watch_retries
|
|
146
153
|
|
|
147
154
|
@classmethod
|
|
148
155
|
def mkname(cls, *, run_name: str, parent_action_name: str) -> str:
|
|
@@ -208,16 +215,19 @@ class Informer:
|
|
|
208
215
|
"""
|
|
209
216
|
# sentinel = False
|
|
210
217
|
retries = 0
|
|
211
|
-
max_retries = 5
|
|
212
218
|
last_exc = None
|
|
213
219
|
while self._running:
|
|
214
|
-
if retries >=
|
|
215
|
-
logger.error(
|
|
220
|
+
if retries >= self._max_watch_retries:
|
|
221
|
+
logger.error(
|
|
222
|
+
f"Informer watch failure retries crossed threshold {retries}/{self._max_watch_retries}, exiting!"
|
|
223
|
+
)
|
|
216
224
|
raise last_exc
|
|
217
225
|
try:
|
|
226
|
+
if retries >= 1:
|
|
227
|
+
logger.warning(f"Informer watch retrying, attempt {retries}/{self._max_watch_retries}")
|
|
218
228
|
watcher = self._client.Watch(
|
|
219
229
|
state_service_pb2.WatchRequest(
|
|
220
|
-
parent_action_id=
|
|
230
|
+
parent_action_id=identifier_pb2.ActionIdentifier(
|
|
221
231
|
name=self.parent_action_name,
|
|
222
232
|
run=self._run_id,
|
|
223
233
|
),
|
|
@@ -235,7 +245,7 @@ class Informer:
|
|
|
235
245
|
await self._shared_queue.put(node)
|
|
236
246
|
# hack to work in the absence of sentinel
|
|
237
247
|
except asyncio.CancelledError:
|
|
238
|
-
logger.
|
|
248
|
+
logger.info(f"Watch cancelled: {self.name}")
|
|
239
249
|
return
|
|
240
250
|
except asyncio.TimeoutError as e:
|
|
241
251
|
logger.error(f"Watch timeout: {self.name}", exc_info=e)
|
|
@@ -249,7 +259,9 @@ class Informer:
|
|
|
249
259
|
logger.exception(f"Watch error: {self.name}", exc_info=e)
|
|
250
260
|
last_exc = e
|
|
251
261
|
retries += 1
|
|
252
|
-
|
|
262
|
+
backoff = min(self._min_watch_backoff * (2**retries), self._max_watch_backoff)
|
|
263
|
+
logger.warning(f"Watch for {self.name} failed, retrying in {backoff} seconds...")
|
|
264
|
+
await asyncio.sleep(backoff)
|
|
253
265
|
|
|
254
266
|
@log
|
|
255
267
|
async def start(self, timeout: Optional[float] = None) -> asyncio.Task:
|
|
@@ -258,7 +270,7 @@ class Informer:
|
|
|
258
270
|
logger.warning("Informer already running")
|
|
259
271
|
return cast(asyncio.Task, self._watch_task)
|
|
260
272
|
self._running = True
|
|
261
|
-
self._watch_task = asyncio.create_task(self.watch())
|
|
273
|
+
self._watch_task = asyncio.create_task(self.watch(), name=f"InformerWatch-{self.parent_action_name}")
|
|
262
274
|
await self.wait_for_cache_sync(timeout=timeout)
|
|
263
275
|
return self._watch_task
|
|
264
276
|
|
|
@@ -288,7 +300,7 @@ class InformerCache:
|
|
|
288
300
|
@log
|
|
289
301
|
async def get_or_create(
|
|
290
302
|
self,
|
|
291
|
-
run_id:
|
|
303
|
+
run_id: identifier_pb2.RunIdentifier,
|
|
292
304
|
parent_action_name: str,
|
|
293
305
|
shared_queue: Queue,
|
|
294
306
|
state_service: StateService,
|
|
@@ -330,20 +342,28 @@ class InformerCache:
|
|
|
330
342
|
async def get(self, *, run_name: str, parent_action_name: str) -> Informer | None:
|
|
331
343
|
"""Get an informer by name"""
|
|
332
344
|
async with self._lock:
|
|
333
|
-
return self._cache.get(
|
|
345
|
+
return self._cache.get(
|
|
346
|
+
Informer.mkname(run_name=run_name, parent_action_name=parent_action_name),
|
|
347
|
+
None,
|
|
348
|
+
)
|
|
334
349
|
|
|
335
350
|
@log
|
|
336
351
|
async def remove(self, *, run_name: str, parent_action_name: str) -> Informer | None:
|
|
337
352
|
"""Remove an informer from the cache"""
|
|
338
353
|
async with self._lock:
|
|
339
|
-
return self._cache.pop(
|
|
354
|
+
return self._cache.pop(
|
|
355
|
+
Informer.mkname(run_name=run_name, parent_action_name=parent_action_name),
|
|
356
|
+
None,
|
|
357
|
+
)
|
|
340
358
|
|
|
341
359
|
async def has(self, *, run_name: str, parent_action_name: str) -> bool:
|
|
342
360
|
"""Check if an informer exists in the cache"""
|
|
343
361
|
async with self._lock:
|
|
344
362
|
return Informer.mkname(run_name=run_name, parent_action_name=parent_action_name) in self._cache
|
|
345
363
|
|
|
346
|
-
async def count_started_pending_terminal_actions(
|
|
364
|
+
async def count_started_pending_terminal_actions(
|
|
365
|
+
self,
|
|
366
|
+
) -> AsyncIterator[Tuple[int, int, int]]:
|
|
347
367
|
"""Log resource stats"""
|
|
348
368
|
async with self._lock:
|
|
349
369
|
for informer in self._cache.values():
|
|
@@ -353,7 +373,7 @@ class InformerCache:
|
|
|
353
373
|
"""Stop all informers and remove them from the cache"""
|
|
354
374
|
async with self._lock:
|
|
355
375
|
while self._cache:
|
|
356
|
-
|
|
376
|
+
_name, informer = self._cache.popitem()
|
|
357
377
|
try:
|
|
358
378
|
await informer.stop()
|
|
359
379
|
except asyncio.CancelledError:
|
|
@@ -2,7 +2,7 @@ from __future__ import annotations
|
|
|
2
2
|
|
|
3
3
|
from typing import AsyncIterator, Protocol
|
|
4
4
|
|
|
5
|
-
from
|
|
5
|
+
from flyteidl2.workflow import queue_service_pb2, state_service_pb2
|
|
6
6
|
|
|
7
7
|
|
|
8
8
|
class StateService(Protocol):
|
|
@@ -33,7 +33,7 @@ class QueueService(Protocol):
|
|
|
33
33
|
req: queue_service_pb2.AbortQueuedActionRequest,
|
|
34
34
|
**kwargs,
|
|
35
35
|
) -> queue_service_pb2.AbortQueuedActionResponse:
|
|
36
|
-
"""
|
|
36
|
+
"""Cancel an enqueued task"""
|
|
37
37
|
|
|
38
38
|
|
|
39
39
|
class ClientSet(Protocol):
|
|
@@ -1,11 +1,3 @@
|
|
|
1
|
-
import
|
|
2
|
-
from typing import List
|
|
1
|
+
from flyte._internal.imagebuild.image_builder import ImageBuildEngine
|
|
3
2
|
|
|
4
|
-
|
|
5
|
-
from flyte._internal.imagebuild.docker_builder import DockerImageBuilder
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
async def build(images: List[Image]) -> List[str]:
|
|
9
|
-
builder = DockerImageBuilder()
|
|
10
|
-
ts = [asyncio.create_task(builder.build_image(image)) for image in images]
|
|
11
|
-
return list(await asyncio.gather(*ts))
|
|
3
|
+
__all__ = ["ImageBuildEngine"]
|