flyte 0.0.1b0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of flyte might be problematic. Click here for more details.
- flyte/__init__.py +62 -0
- flyte/_api_commons.py +3 -0
- flyte/_bin/__init__.py +0 -0
- flyte/_bin/runtime.py +126 -0
- flyte/_build.py +25 -0
- flyte/_cache/__init__.py +12 -0
- flyte/_cache/cache.py +146 -0
- flyte/_cache/defaults.py +9 -0
- flyte/_cache/policy_function_body.py +42 -0
- flyte/_cli/__init__.py +0 -0
- flyte/_cli/_common.py +287 -0
- flyte/_cli/_create.py +42 -0
- flyte/_cli/_delete.py +23 -0
- flyte/_cli/_deploy.py +140 -0
- flyte/_cli/_get.py +235 -0
- flyte/_cli/_run.py +152 -0
- flyte/_cli/main.py +72 -0
- flyte/_code_bundle/__init__.py +8 -0
- flyte/_code_bundle/_ignore.py +113 -0
- flyte/_code_bundle/_packaging.py +187 -0
- flyte/_code_bundle/_utils.py +339 -0
- flyte/_code_bundle/bundle.py +178 -0
- flyte/_context.py +146 -0
- flyte/_datastructures.py +342 -0
- flyte/_deploy.py +202 -0
- flyte/_doc.py +29 -0
- flyte/_docstring.py +32 -0
- flyte/_environment.py +43 -0
- flyte/_group.py +31 -0
- flyte/_hash.py +23 -0
- flyte/_image.py +760 -0
- flyte/_initialize.py +634 -0
- flyte/_interface.py +84 -0
- flyte/_internal/__init__.py +3 -0
- flyte/_internal/controllers/__init__.py +115 -0
- flyte/_internal/controllers/_local_controller.py +118 -0
- flyte/_internal/controllers/_trace.py +40 -0
- flyte/_internal/controllers/pbhash.py +39 -0
- flyte/_internal/controllers/remote/__init__.py +40 -0
- flyte/_internal/controllers/remote/_action.py +141 -0
- flyte/_internal/controllers/remote/_client.py +43 -0
- flyte/_internal/controllers/remote/_controller.py +361 -0
- flyte/_internal/controllers/remote/_core.py +402 -0
- flyte/_internal/controllers/remote/_informer.py +361 -0
- flyte/_internal/controllers/remote/_service_protocol.py +50 -0
- flyte/_internal/imagebuild/__init__.py +11 -0
- flyte/_internal/imagebuild/docker_builder.py +416 -0
- flyte/_internal/imagebuild/image_builder.py +241 -0
- flyte/_internal/imagebuild/remote_builder.py +0 -0
- flyte/_internal/resolvers/__init__.py +0 -0
- flyte/_internal/resolvers/_task_module.py +54 -0
- flyte/_internal/resolvers/common.py +31 -0
- flyte/_internal/resolvers/default.py +28 -0
- flyte/_internal/runtime/__init__.py +0 -0
- flyte/_internal/runtime/convert.py +199 -0
- flyte/_internal/runtime/entrypoints.py +135 -0
- flyte/_internal/runtime/io.py +136 -0
- flyte/_internal/runtime/resources_serde.py +138 -0
- flyte/_internal/runtime/task_serde.py +210 -0
- flyte/_internal/runtime/taskrunner.py +190 -0
- flyte/_internal/runtime/types_serde.py +54 -0
- flyte/_logging.py +124 -0
- flyte/_protos/__init__.py +0 -0
- flyte/_protos/common/authorization_pb2.py +66 -0
- flyte/_protos/common/authorization_pb2.pyi +108 -0
- flyte/_protos/common/authorization_pb2_grpc.py +4 -0
- flyte/_protos/common/identifier_pb2.py +71 -0
- flyte/_protos/common/identifier_pb2.pyi +82 -0
- flyte/_protos/common/identifier_pb2_grpc.py +4 -0
- flyte/_protos/common/identity_pb2.py +48 -0
- flyte/_protos/common/identity_pb2.pyi +72 -0
- flyte/_protos/common/identity_pb2_grpc.py +4 -0
- flyte/_protos/common/list_pb2.py +36 -0
- flyte/_protos/common/list_pb2.pyi +69 -0
- flyte/_protos/common/list_pb2_grpc.py +4 -0
- flyte/_protos/common/policy_pb2.py +37 -0
- flyte/_protos/common/policy_pb2.pyi +27 -0
- flyte/_protos/common/policy_pb2_grpc.py +4 -0
- flyte/_protos/common/role_pb2.py +37 -0
- flyte/_protos/common/role_pb2.pyi +53 -0
- flyte/_protos/common/role_pb2_grpc.py +4 -0
- flyte/_protos/common/runtime_version_pb2.py +28 -0
- flyte/_protos/common/runtime_version_pb2.pyi +24 -0
- flyte/_protos/common/runtime_version_pb2_grpc.py +4 -0
- flyte/_protos/logs/dataplane/payload_pb2.py +96 -0
- flyte/_protos/logs/dataplane/payload_pb2.pyi +168 -0
- flyte/_protos/logs/dataplane/payload_pb2_grpc.py +4 -0
- flyte/_protos/secret/definition_pb2.py +49 -0
- flyte/_protos/secret/definition_pb2.pyi +93 -0
- flyte/_protos/secret/definition_pb2_grpc.py +4 -0
- flyte/_protos/secret/payload_pb2.py +62 -0
- flyte/_protos/secret/payload_pb2.pyi +94 -0
- flyte/_protos/secret/payload_pb2_grpc.py +4 -0
- flyte/_protos/secret/secret_pb2.py +38 -0
- flyte/_protos/secret/secret_pb2.pyi +6 -0
- flyte/_protos/secret/secret_pb2_grpc.py +198 -0
- flyte/_protos/secret/secret_pb2_grpc_grpc.py +198 -0
- flyte/_protos/validate/validate/validate_pb2.py +76 -0
- flyte/_protos/workflow/node_execution_service_pb2.py +26 -0
- flyte/_protos/workflow/node_execution_service_pb2.pyi +4 -0
- flyte/_protos/workflow/node_execution_service_pb2_grpc.py +32 -0
- flyte/_protos/workflow/queue_service_pb2.py +106 -0
- flyte/_protos/workflow/queue_service_pb2.pyi +141 -0
- flyte/_protos/workflow/queue_service_pb2_grpc.py +172 -0
- flyte/_protos/workflow/run_definition_pb2.py +128 -0
- flyte/_protos/workflow/run_definition_pb2.pyi +310 -0
- flyte/_protos/workflow/run_definition_pb2_grpc.py +4 -0
- flyte/_protos/workflow/run_logs_service_pb2.py +41 -0
- flyte/_protos/workflow/run_logs_service_pb2.pyi +28 -0
- flyte/_protos/workflow/run_logs_service_pb2_grpc.py +69 -0
- flyte/_protos/workflow/run_service_pb2.py +133 -0
- flyte/_protos/workflow/run_service_pb2.pyi +175 -0
- flyte/_protos/workflow/run_service_pb2_grpc.py +412 -0
- flyte/_protos/workflow/state_service_pb2.py +58 -0
- flyte/_protos/workflow/state_service_pb2.pyi +71 -0
- flyte/_protos/workflow/state_service_pb2_grpc.py +138 -0
- flyte/_protos/workflow/task_definition_pb2.py +72 -0
- flyte/_protos/workflow/task_definition_pb2.pyi +65 -0
- flyte/_protos/workflow/task_definition_pb2_grpc.py +4 -0
- flyte/_protos/workflow/task_service_pb2.py +44 -0
- flyte/_protos/workflow/task_service_pb2.pyi +31 -0
- flyte/_protos/workflow/task_service_pb2_grpc.py +104 -0
- flyte/_resources.py +226 -0
- flyte/_retry.py +32 -0
- flyte/_reusable_environment.py +25 -0
- flyte/_run.py +411 -0
- flyte/_secret.py +61 -0
- flyte/_task.py +367 -0
- flyte/_task_environment.py +200 -0
- flyte/_timeout.py +47 -0
- flyte/_tools.py +27 -0
- flyte/_trace.py +128 -0
- flyte/_utils/__init__.py +20 -0
- flyte/_utils/asyn.py +119 -0
- flyte/_utils/coro_management.py +25 -0
- flyte/_utils/file_handling.py +72 -0
- flyte/_utils/helpers.py +108 -0
- flyte/_utils/lazy_module.py +54 -0
- flyte/_utils/uv_script_parser.py +49 -0
- flyte/_version.py +21 -0
- flyte/connectors/__init__.py +0 -0
- flyte/errors.py +143 -0
- flyte/extras/__init__.py +5 -0
- flyte/extras/_container.py +273 -0
- flyte/io/__init__.py +11 -0
- flyte/io/_dataframe.py +0 -0
- flyte/io/_dir.py +448 -0
- flyte/io/_file.py +468 -0
- flyte/io/pickle/__init__.py +0 -0
- flyte/io/pickle/transformer.py +117 -0
- flyte/io/structured_dataset/__init__.py +129 -0
- flyte/io/structured_dataset/basic_dfs.py +219 -0
- flyte/io/structured_dataset/structured_dataset.py +1061 -0
- flyte/py.typed +0 -0
- flyte/remote/__init__.py +25 -0
- flyte/remote/_client/__init__.py +0 -0
- flyte/remote/_client/_protocols.py +131 -0
- flyte/remote/_client/auth/__init__.py +12 -0
- flyte/remote/_client/auth/_authenticators/__init__.py +0 -0
- flyte/remote/_client/auth/_authenticators/base.py +397 -0
- flyte/remote/_client/auth/_authenticators/client_credentials.py +73 -0
- flyte/remote/_client/auth/_authenticators/device_code.py +118 -0
- flyte/remote/_client/auth/_authenticators/external_command.py +79 -0
- flyte/remote/_client/auth/_authenticators/factory.py +200 -0
- flyte/remote/_client/auth/_authenticators/pkce.py +516 -0
- flyte/remote/_client/auth/_channel.py +184 -0
- flyte/remote/_client/auth/_client_config.py +83 -0
- flyte/remote/_client/auth/_default_html.py +32 -0
- flyte/remote/_client/auth/_grpc_utils/__init__.py +0 -0
- flyte/remote/_client/auth/_grpc_utils/auth_interceptor.py +288 -0
- flyte/remote/_client/auth/_grpc_utils/default_metadata_interceptor.py +151 -0
- flyte/remote/_client/auth/_keyring.py +143 -0
- flyte/remote/_client/auth/_token_client.py +260 -0
- flyte/remote/_client/auth/errors.py +16 -0
- flyte/remote/_client/controlplane.py +95 -0
- flyte/remote/_console.py +18 -0
- flyte/remote/_data.py +155 -0
- flyte/remote/_logs.py +116 -0
- flyte/remote/_project.py +86 -0
- flyte/remote/_run.py +873 -0
- flyte/remote/_secret.py +132 -0
- flyte/remote/_task.py +227 -0
- flyte/report/__init__.py +3 -0
- flyte/report/_report.py +178 -0
- flyte/report/_template.html +124 -0
- flyte/storage/__init__.py +24 -0
- flyte/storage/_remote_fs.py +34 -0
- flyte/storage/_storage.py +251 -0
- flyte/storage/_utils.py +5 -0
- flyte/types/__init__.py +13 -0
- flyte/types/_interface.py +25 -0
- flyte/types/_renderer.py +162 -0
- flyte/types/_string_literals.py +120 -0
- flyte/types/_type_engine.py +2210 -0
- flyte/types/_utils.py +80 -0
- flyte-0.0.1b0.dist-info/METADATA +179 -0
- flyte-0.0.1b0.dist-info/RECORD +390 -0
- flyte-0.0.1b0.dist-info/WHEEL +5 -0
- flyte-0.0.1b0.dist-info/entry_points.txt +3 -0
- flyte-0.0.1b0.dist-info/top_level.txt +1 -0
- union/__init__.py +54 -0
- union/_api_commons.py +3 -0
- union/_bin/__init__.py +0 -0
- union/_bin/runtime.py +113 -0
- union/_build.py +25 -0
- union/_cache/__init__.py +12 -0
- union/_cache/cache.py +141 -0
- union/_cache/defaults.py +9 -0
- union/_cache/policy_function_body.py +42 -0
- union/_cli/__init__.py +0 -0
- union/_cli/_common.py +263 -0
- union/_cli/_create.py +40 -0
- union/_cli/_delete.py +23 -0
- union/_cli/_deploy.py +120 -0
- union/_cli/_get.py +162 -0
- union/_cli/_params.py +579 -0
- union/_cli/_run.py +150 -0
- union/_cli/main.py +72 -0
- union/_code_bundle/__init__.py +8 -0
- union/_code_bundle/_ignore.py +113 -0
- union/_code_bundle/_packaging.py +187 -0
- union/_code_bundle/_utils.py +342 -0
- union/_code_bundle/bundle.py +176 -0
- union/_context.py +146 -0
- union/_datastructures.py +295 -0
- union/_deploy.py +185 -0
- union/_doc.py +29 -0
- union/_docstring.py +26 -0
- union/_environment.py +43 -0
- union/_group.py +31 -0
- union/_hash.py +23 -0
- union/_image.py +760 -0
- union/_initialize.py +585 -0
- union/_interface.py +84 -0
- union/_internal/__init__.py +3 -0
- union/_internal/controllers/__init__.py +77 -0
- union/_internal/controllers/_local_controller.py +77 -0
- union/_internal/controllers/pbhash.py +39 -0
- union/_internal/controllers/remote/__init__.py +40 -0
- union/_internal/controllers/remote/_action.py +131 -0
- union/_internal/controllers/remote/_client.py +43 -0
- union/_internal/controllers/remote/_controller.py +169 -0
- union/_internal/controllers/remote/_core.py +341 -0
- union/_internal/controllers/remote/_informer.py +260 -0
- union/_internal/controllers/remote/_service_protocol.py +44 -0
- union/_internal/imagebuild/__init__.py +11 -0
- union/_internal/imagebuild/docker_builder.py +416 -0
- union/_internal/imagebuild/image_builder.py +243 -0
- union/_internal/imagebuild/remote_builder.py +0 -0
- union/_internal/resolvers/__init__.py +0 -0
- union/_internal/resolvers/_task_module.py +31 -0
- union/_internal/resolvers/common.py +24 -0
- union/_internal/resolvers/default.py +27 -0
- union/_internal/runtime/__init__.py +0 -0
- union/_internal/runtime/convert.py +163 -0
- union/_internal/runtime/entrypoints.py +121 -0
- union/_internal/runtime/io.py +136 -0
- union/_internal/runtime/resources_serde.py +134 -0
- union/_internal/runtime/task_serde.py +202 -0
- union/_internal/runtime/taskrunner.py +179 -0
- union/_internal/runtime/types_serde.py +53 -0
- union/_logging.py +124 -0
- union/_protos/__init__.py +0 -0
- union/_protos/common/authorization_pb2.py +66 -0
- union/_protos/common/authorization_pb2.pyi +106 -0
- union/_protos/common/authorization_pb2_grpc.py +4 -0
- union/_protos/common/identifier_pb2.py +71 -0
- union/_protos/common/identifier_pb2.pyi +82 -0
- union/_protos/common/identifier_pb2_grpc.py +4 -0
- union/_protos/common/identity_pb2.py +48 -0
- union/_protos/common/identity_pb2.pyi +72 -0
- union/_protos/common/identity_pb2_grpc.py +4 -0
- union/_protos/common/list_pb2.py +36 -0
- union/_protos/common/list_pb2.pyi +69 -0
- union/_protos/common/list_pb2_grpc.py +4 -0
- union/_protos/common/policy_pb2.py +37 -0
- union/_protos/common/policy_pb2.pyi +27 -0
- union/_protos/common/policy_pb2_grpc.py +4 -0
- union/_protos/common/role_pb2.py +37 -0
- union/_protos/common/role_pb2.pyi +51 -0
- union/_protos/common/role_pb2_grpc.py +4 -0
- union/_protos/common/runtime_version_pb2.py +28 -0
- union/_protos/common/runtime_version_pb2.pyi +24 -0
- union/_protos/common/runtime_version_pb2_grpc.py +4 -0
- union/_protos/logs/dataplane/payload_pb2.py +96 -0
- union/_protos/logs/dataplane/payload_pb2.pyi +168 -0
- union/_protos/logs/dataplane/payload_pb2_grpc.py +4 -0
- union/_protos/secret/definition_pb2.py +49 -0
- union/_protos/secret/definition_pb2.pyi +93 -0
- union/_protos/secret/definition_pb2_grpc.py +4 -0
- union/_protos/secret/payload_pb2.py +62 -0
- union/_protos/secret/payload_pb2.pyi +94 -0
- union/_protos/secret/payload_pb2_grpc.py +4 -0
- union/_protos/secret/secret_pb2.py +38 -0
- union/_protos/secret/secret_pb2.pyi +6 -0
- union/_protos/secret/secret_pb2_grpc.py +198 -0
- union/_protos/validate/validate/validate_pb2.py +76 -0
- union/_protos/workflow/node_execution_service_pb2.py +26 -0
- union/_protos/workflow/node_execution_service_pb2.pyi +4 -0
- union/_protos/workflow/node_execution_service_pb2_grpc.py +32 -0
- union/_protos/workflow/queue_service_pb2.py +75 -0
- union/_protos/workflow/queue_service_pb2.pyi +103 -0
- union/_protos/workflow/queue_service_pb2_grpc.py +172 -0
- union/_protos/workflow/run_definition_pb2.py +100 -0
- union/_protos/workflow/run_definition_pb2.pyi +256 -0
- union/_protos/workflow/run_definition_pb2_grpc.py +4 -0
- union/_protos/workflow/run_logs_service_pb2.py +41 -0
- union/_protos/workflow/run_logs_service_pb2.pyi +28 -0
- union/_protos/workflow/run_logs_service_pb2_grpc.py +69 -0
- union/_protos/workflow/run_service_pb2.py +133 -0
- union/_protos/workflow/run_service_pb2.pyi +173 -0
- union/_protos/workflow/run_service_pb2_grpc.py +412 -0
- union/_protos/workflow/state_service_pb2.py +58 -0
- union/_protos/workflow/state_service_pb2.pyi +69 -0
- union/_protos/workflow/state_service_pb2_grpc.py +138 -0
- union/_protos/workflow/task_definition_pb2.py +72 -0
- union/_protos/workflow/task_definition_pb2.pyi +65 -0
- union/_protos/workflow/task_definition_pb2_grpc.py +4 -0
- union/_protos/workflow/task_service_pb2.py +44 -0
- union/_protos/workflow/task_service_pb2.pyi +31 -0
- union/_protos/workflow/task_service_pb2_grpc.py +104 -0
- union/_resources.py +226 -0
- union/_retry.py +32 -0
- union/_reusable_environment.py +25 -0
- union/_run.py +374 -0
- union/_secret.py +61 -0
- union/_task.py +354 -0
- union/_task_environment.py +186 -0
- union/_timeout.py +47 -0
- union/_tools.py +27 -0
- union/_utils/__init__.py +11 -0
- union/_utils/asyn.py +119 -0
- union/_utils/file_handling.py +71 -0
- union/_utils/helpers.py +46 -0
- union/_utils/lazy_module.py +54 -0
- union/_utils/uv_script_parser.py +49 -0
- union/_version.py +21 -0
- union/connectors/__init__.py +0 -0
- union/errors.py +128 -0
- union/extras/__init__.py +5 -0
- union/extras/_container.py +263 -0
- union/io/__init__.py +11 -0
- union/io/_dataframe.py +0 -0
- union/io/_dir.py +425 -0
- union/io/_file.py +418 -0
- union/io/pickle/__init__.py +0 -0
- union/io/pickle/transformer.py +117 -0
- union/io/structured_dataset/__init__.py +122 -0
- union/io/structured_dataset/basic_dfs.py +219 -0
- union/io/structured_dataset/structured_dataset.py +1057 -0
- union/py.typed +0 -0
- union/remote/__init__.py +23 -0
- union/remote/_client/__init__.py +0 -0
- union/remote/_client/_protocols.py +129 -0
- union/remote/_client/auth/__init__.py +12 -0
- union/remote/_client/auth/_authenticators/__init__.py +0 -0
- union/remote/_client/auth/_authenticators/base.py +391 -0
- union/remote/_client/auth/_authenticators/client_credentials.py +73 -0
- union/remote/_client/auth/_authenticators/device_code.py +120 -0
- union/remote/_client/auth/_authenticators/external_command.py +77 -0
- union/remote/_client/auth/_authenticators/factory.py +200 -0
- union/remote/_client/auth/_authenticators/pkce.py +515 -0
- union/remote/_client/auth/_channel.py +184 -0
- union/remote/_client/auth/_client_config.py +83 -0
- union/remote/_client/auth/_default_html.py +32 -0
- union/remote/_client/auth/_grpc_utils/__init__.py +0 -0
- union/remote/_client/auth/_grpc_utils/auth_interceptor.py +204 -0
- union/remote/_client/auth/_grpc_utils/default_metadata_interceptor.py +144 -0
- union/remote/_client/auth/_keyring.py +154 -0
- union/remote/_client/auth/_token_client.py +258 -0
- union/remote/_client/auth/errors.py +16 -0
- union/remote/_client/controlplane.py +86 -0
- union/remote/_data.py +149 -0
- union/remote/_logs.py +74 -0
- union/remote/_project.py +86 -0
- union/remote/_run.py +820 -0
- union/remote/_secret.py +132 -0
- union/remote/_task.py +193 -0
- union/report/__init__.py +3 -0
- union/report/_report.py +178 -0
- union/report/_template.html +124 -0
- union/storage/__init__.py +24 -0
- union/storage/_remote_fs.py +34 -0
- union/storage/_storage.py +247 -0
- union/storage/_utils.py +5 -0
- union/types/__init__.py +11 -0
- union/types/_renderer.py +162 -0
- union/types/_string_literals.py +120 -0
- union/types/_type_engine.py +2131 -0
- union/types/_utils.py +80 -0
|
@@ -0,0 +1,402 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import asyncio
|
|
4
|
+
import sys
|
|
5
|
+
import threading
|
|
6
|
+
from asyncio import Event
|
|
7
|
+
from typing import Awaitable, Coroutine, Optional
|
|
8
|
+
|
|
9
|
+
import grpc.aio
|
|
10
|
+
|
|
11
|
+
import flyte.errors
|
|
12
|
+
from flyte._logging import log, logger
|
|
13
|
+
from flyte._protos.workflow import queue_service_pb2, run_definition_pb2, task_definition_pb2
|
|
14
|
+
from flyte.errors import RuntimeSystemError
|
|
15
|
+
|
|
16
|
+
from ._action import Action
|
|
17
|
+
from ._informer import InformerCache
|
|
18
|
+
from ._service_protocol import ClientSet, QueueService, StateService
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class Controller:
|
|
22
|
+
"""
|
|
23
|
+
Generic controller with high-level submit API running in a dedicated thread with its own event loop.
|
|
24
|
+
All methods that begin with _bg_ are run in the controller's event loop, and will need to use
|
|
25
|
+
_run_coroutine_in_controller_thread to run them in the controller's event loop.
|
|
26
|
+
"""
|
|
27
|
+
|
|
28
|
+
def __init__(
    self,
    client_coro: Awaitable[ClientSet],
    workers: int = 2,
    max_system_retries: int = 5,
    resource_log_interval_sec: float = 10.0,
    min_backoff_on_err_sec: float = 0.1,
    thread_wait_timeout_sec: float = 10.0,
    enqueue_timeout_sec: float = 5.0,
):
    """
    Build the controller and immediately start its background thread.

    :param client_coro: Awaitable that resolves to the ClientSet; it is awaited on the controller's event loop.
    :param workers: Number of worker tasks started on the controller's event loop.
    :param max_system_retries: Maximum number of system retries.
    :param resource_log_interval_sec: Interval for logging resource stats.
    :param min_backoff_on_err_sec: Minimum backoff time on error.
    :param thread_wait_timeout_sec: Timeout for waiting for the controller thread to start; the same value
        is also used as the informer start wait timeout.
    :param enqueue_timeout_sec: Timeout passed to the EnqueueAction call when launching an action.
    """
    # Shared infrastructure used by every informer / worker
    self._informers = InformerCache()
    self._shared_queue: asyncio.Queue[Action] = asyncio.Queue(maxsize=10000)

    # Tunables copied from the constructor arguments
    self._client_coro = client_coro
    self._workers = workers
    self._max_retries = max_system_retries
    self._resource_log_interval = resource_log_interval_sec
    self._min_backoff_on_err = min_backoff_on_err_sec
    self._enqueue_timeout = enqueue_timeout_sec
    self._thread_wait_timeout = thread_wait_timeout_sec
    self._informer_start_wait_timeout = thread_wait_timeout_sec

    # Runtime state, populated once the background thread is up
    self._running = False
    self._resource_log_task = None
    self._failure_event: Event | None = None

    # Thread management
    self._thread = None
    self._loop = None
    self._thread_ready = threading.Event()
    self._thread_exception: Optional[BaseException] = None
    self._thread_com_lock = threading.Lock()

    # Must stay last: the thread reads the attributes assigned above.
    self._start()
|
|
68
|
+
|
|
69
|
+
# ---------------- Public sync methods, we can add more sync methods if needed
|
|
70
|
+
@log
def submit_action_sync(self, action: Action) -> Action:
    """
    Synchronous version of submit: run the submission on the controller's event loop and block
    the calling thread until the action completes.

    :param action: The action to submit.
    :return: The final state of the action as returned by _bg_submit_action.
    :raises RuntimeError: If the controller thread is not running, or if this is called from the
        controller thread itself (blocking there would deadlock the loop).
    """
    with self._thread_com_lock:
        loop = self._loop
        thread = self._thread
    if loop is None or thread is None or not thread.is_alive():
        raise RuntimeError("Controller thread is not running")
    if thread is threading.current_thread():
        raise RuntimeError("Cannot run coroutine in the same thread")

    # Block on the concurrent.futures.Future returned by run_coroutine_threadsafe. The asyncio.Future
    # produced by _run_coroutine_in_controller_thread (asyncio.wrap_future) cannot be used here: its
    # result() never blocks and raises InvalidStateError while the coroutine is still pending.
    pending = asyncio.run_coroutine_threadsafe(self._bg_submit_action(action), loop)
    return pending.result()
|
|
75
|
+
|
|
76
|
+
# --------------- Public async methods
|
|
77
|
+
@log
async def submit_action(self, action: Action) -> Action:
    """Submit an action on the controller's event loop and await its final state."""
    pending = self._run_coroutine_in_controller_thread(self._bg_submit_action(action))
    return await pending
|
|
81
|
+
|
|
82
|
+
async def get_action(
    self, action_id: run_definition_pb2.ActionIdentifier, parent_action_name: str
) -> Optional[Action]:
    """
    Look up an action in the informer registered for its run and parent action.

    :param action_id: Identifier of the action; its run name selects the informer.
    :param parent_action_name: Name of the parent action the informer is keyed by.
    :return: The action held by the informer, or None when no informer exists for that key.
    """
    informer = await self._informers.get(run_name=action_id.run.name, parent_action_name=parent_action_name)
    if not informer:
        return None
    return await informer.get(action_id.name)
|
|
90
|
+
|
|
91
|
+
@log
async def cancel_action(self, action: Action):
    """Cancel an action by running _bg_cancel_action on the controller's event loop."""
    pending = self._run_coroutine_in_controller_thread(self._bg_cancel_action(action))
    return await pending
|
|
94
|
+
|
|
95
|
+
async def _finalize_parent_action(
|
|
96
|
+
self, run_id: run_definition_pb2.RunIdentifier, parent_action_name: str, timeout: Optional[float] = None
|
|
97
|
+
):
|
|
98
|
+
"""Finalize the parent run"""
|
|
99
|
+
await self._run_coroutine_in_controller_thread(
|
|
100
|
+
self._bg_finalize_informer(run_id=run_id, parent_action_name=parent_action_name, timeout=timeout)
|
|
101
|
+
)
|
|
102
|
+
|
|
103
|
+
def _bg_handle_informer_error(self, task: asyncio.Task):
    """
    Callback for a finished informer task: record its exception and raise the failure event.

    A cancelled task, or one that finished without an exception, is ignored.

    :param task: The informer task that finished.
    :raises RuntimeError: If the task failed but the failure event has not been created yet.
    """
    try:
        failure = task.exception()
    except asyncio.CancelledError:
        # task.exception() raises CancelledError for a cancelled task; nothing to report.
        return
    if not failure:
        return

    logger.error("Informer task failed with exception", exc_info=failure)
    self._set_exception(failure)
    if self._failure_event is None:
        raise RuntimeError("Failure event not initialized")
    self._failure_event.set()
|
|
115
|
+
|
|
116
|
+
async def _bg_watch_for_errors(self):
    """
    Wait until the failure event is set, then log a warning.

    :raises RuntimeError: If the failure event has not been created yet.
    """
    if self._failure_event is None:
        raise RuntimeError("Failure event not initialized")
    failure_signalled = self._failure_event.wait()
    await failure_signalled
    logger.warning(f"Failure event received: {self._failure_event}, cleaning up informers and exiting.")
|
|
121
|
+
|
|
122
|
+
async def watch_for_errors(self):
    """
    Wait for the background thread to signal a failure, then surface it to the caller.

    This coroutine never returns normally: once the failure event fires it raises.

    :raises RuntimeSystemError: With code "InformerWatchFailure" and the recorded exception in the message.
    """
    watcher = self._run_coroutine_in_controller_thread(self._bg_watch_for_errors())
    await watcher
    raise RuntimeSystemError(
        code="InformerWatchFailure", message=f"Controller thread failed with exception: {self._get_exception()}"
    )
|
|
128
|
+
|
|
129
|
+
@log
async def stop(self):
    """Stop the controller by running _bg_stop on the controller's event loop."""
    pending = self._run_coroutine_in_controller_thread(self._bg_stop())
    return await pending
|
|
133
|
+
|
|
134
|
+
# ------------- Background thread management methods
|
|
135
|
+
def _set_exception(self, exc: Optional[BaseException]):
|
|
136
|
+
"""Set exception in the thread lock"""
|
|
137
|
+
with self._thread_com_lock:
|
|
138
|
+
self._thread_exception = exc
|
|
139
|
+
|
|
140
|
+
def _get_exception(self) -> Optional[BaseException]:
|
|
141
|
+
"""Get exception in the thread lock"""
|
|
142
|
+
with self._thread_com_lock:
|
|
143
|
+
return self._thread_exception
|
|
144
|
+
|
|
145
|
+
@log
def _start(self):
    """
    Start the controller in a separate daemon thread and wait until it reports ready.

    :raises TimeoutError: If the thread does not signal readiness within the configured wait timeout.
    :raises RuntimeSystemError: If an exception was recorded by the thread by the time it is ready.
    """
    existing = self._thread
    if existing and existing.is_alive():
        logger.warning("Controller thread is already running")
        return

    # Reset the hand-off state before launching a fresh thread.
    self._thread_ready.clear()
    self._set_exception(None)
    self._thread = threading.Thread(target=self._bg_thread_target, daemon=True, name="ControllerThread")
    self._thread.start()

    # Wait for the thread to be ready
    logger.info("Waiting for controller thread to be ready...")
    became_ready = self._thread_ready.wait(timeout=self._thread_wait_timeout)
    if not became_ready:
        raise TimeoutError("Controller thread failed to start in time")

    if self._get_exception():
        raise RuntimeSystemError(
            type(self._get_exception()).__name__, f"Controller thread startup failed: {self._get_exception()}"
        )

    logger.info(f"Controller started in thread: {self._thread.name}")
|
|
168
|
+
|
|
169
|
+
def _run_coroutine_in_controller_thread(self, coro: Coroutine) -> asyncio.Future:
|
|
170
|
+
"""Run a coroutine in the controller's event loop and return the result"""
|
|
171
|
+
with self._thread_com_lock:
|
|
172
|
+
loop = self._loop
|
|
173
|
+
if not self._loop or not self._thread or not self._thread.is_alive():
|
|
174
|
+
raise RuntimeError("Controller thread is not running")
|
|
175
|
+
|
|
176
|
+
assert self._thread.name != threading.current_thread().name, "Cannot run coroutine in the same thread"
|
|
177
|
+
|
|
178
|
+
future = asyncio.run_coroutine_threadsafe(coro, loop)
|
|
179
|
+
return asyncio.wrap_future(future)
|
|
180
|
+
|
|
181
|
+
# ------------- Private methods that run on the background thread
|
|
182
|
+
async def _bg_worker_pool(self):
    """
    Initialise the service clients on the controller's loop, signal readiness, and run the workers.

    Awaits the client awaitable, stores the state and queue services, starts the stats-logging
    task, sets the thread-ready event, and then runs ``self._workers`` copies of ``_bg_run``
    until they finish.
    """
    logger.debug("Starting controller worker pool")
    self._running = True
    logger.debug("Waiting for Service Client to be ready")
    clients = await self._client_coro
    self._state_service: StateService = clients.state_service
    self._queue_service: QueueService = clients.queue_service
    self._resource_log_task = asyncio.create_task(self._bg_log_stats())
    # Signal the main thread (blocked in _start) that initialization is complete
    logger.debug("Background thread initialization complete")
    self._thread_ready.set()

    if sys.version_info < (3, 11):
        # asyncio.TaskGroup is unavailable before 3.11; fall back to gather.
        worker_tasks = [asyncio.create_task(self._bg_run()) for _ in range(self._workers)]
        await asyncio.gather(*worker_tasks)
    else:
        async with asyncio.TaskGroup() as group:
            for _ in range(self._workers):
                group.create_task(self._bg_run())
|
|
203
|
+
|
|
204
|
+
def _bg_thread_target(self):
    """
    Target function for the controller thread: creates its own event loop and runs the worker pool.

    Any exception escaping the worker pool is recorded via _set_exception. The loop is closed
    when the thread exits.
    """
    try:
        # Create a new event loop for this thread
        self._loop = asyncio.new_event_loop()
        asyncio.set_event_loop(self._loop)
        logger.debug(f"Controller thread started with new event loop: {threading.current_thread().name}")

        # Create an event to signal the errors were observed in the thread's loop
        self._failure_event = Event()

        self._loop.run_until_complete(self._bg_worker_pool())
    except Exception as e:
        logger.error(f"Controller thread encountered an exception: {e}")
        self._set_exception(e)
        # If startup failed before the worker pool signalled readiness, wake _start now so it
        # reports the recorded exception instead of waiting out the full timeout.
        self._thread_ready.set()
    finally:
        # run_until_complete has returned, so the loop is no longer running here; closing a
        # running loop would raise. Close it unless that already happened.
        if self._loop and not self._loop.is_closed():
            self._loop.close()
        logger.debug(f"Controller thread exiting: {threading.current_thread().name}")
|
|
223
|
+
|
|
224
|
+
async def _bg_get_action(
|
|
225
|
+
self, action_id: run_definition_pb2.ActionIdentifier, parent_action_name: str
|
|
226
|
+
) -> Optional[Action]:
|
|
227
|
+
"""Get the action from the informer"""
|
|
228
|
+
# Ensure the informer is created and wait for it to be ready
|
|
229
|
+
informer = await self._informers.get_or_create(
|
|
230
|
+
action_id.run,
|
|
231
|
+
parent_action_name,
|
|
232
|
+
self._shared_queue,
|
|
233
|
+
self._state_service,
|
|
234
|
+
fn=self._bg_handle_informer_error,
|
|
235
|
+
timeout=self._informer_start_wait_timeout,
|
|
236
|
+
)
|
|
237
|
+
if informer:
|
|
238
|
+
return await informer.get(action_id.name)
|
|
239
|
+
return None
|
|
240
|
+
|
|
241
|
+
async def _bg_finalize_informer(
|
|
242
|
+
self, run_id: run_definition_pb2.RunIdentifier, parent_action_name: str, timeout: Optional[float] = None
|
|
243
|
+
):
|
|
244
|
+
informer = await self._informers.remove(run_name=run_id.name, parent_action_name=parent_action_name)
|
|
245
|
+
if informer:
|
|
246
|
+
await informer.stop()
|
|
247
|
+
|
|
248
|
+
@log
async def _bg_submit_action(self, action: Action) -> Action:
    """
    Submit a resource and await its completion, returning the final state.

    Runs on the controller's event loop. The informer for the action's run and parent action
    is obtained (or created), the action is submitted to it, and this coroutine then waits for
    the informer to report completion before reading back and removing the action.

    :param action: The action to submit.
    :return: The action as held by the informer after completion.
    :raises ValueError: If the informer no longer holds the action after completion.
    """
    logger.debug(f"{threading.current_thread().name} Submitting action {action.name}")
    # Get or create the informer keyed by the action's run and parent action
    informer = await self._informers.get_or_create(
        action.action_id.run,
        action.parent_action_name,
        self._shared_queue,
        self._state_service,
        fn=self._bg_handle_informer_error,
        timeout=self._informer_start_wait_timeout,
    )
    await informer.submit(action)

    logger.debug(f"{threading.current_thread().name} Waiting for completion of {action.name}")
    # Wait for completion
    await informer.wait_for_action_completion(action.name)
    logger.info(f"{threading.current_thread().name} Action {action.name} completed")

    # Get final resource state and clean up
    final_resource = await informer.get(action.name)
    if final_resource is None:
        raise ValueError(f"Action {action.name} not found")
    # NOTE(review): this "Removed completion event" message is logged before informer.remove() runs below.
    logger.debug(f"{threading.current_thread().name} Removed completion event for action {action.name}")
    await informer.remove(action.name)  # TODO we should not remove maybe, we should keep a record of completed?
    logger.debug(f"{threading.current_thread().name} Removed action {action.name}, final={final_resource}")
    return final_resource
|
|
275
|
+
|
|
276
|
+
async def _bg_cancel_action(self, action: Action):
    """
    Cancel an action.

    A terminal action is left alone. Otherwise the action is marked cancelled; if it had already
    been started, an AbortQueuedAction request is sent to the queue service. Finally the
    completion event for the action is fired on its informer, if one exists.

    :param action: The action to cancel.
    """
    if action.is_terminal():
        logger.info(f"Action {action.name} is already terminal, no need to cancel.")
        return

    # Capture the started flag before marking the action cancelled.
    started = action.is_started()
    action.mark_cancelled()
    if started:
        logger.info(f"Cancelling action: {action.name}")
        try:
            await self._queue_service.AbortQueuedAction(
                queue_service_pb2.AbortQueuedActionRequest(action_id=action.action_id),
                wait_for_ready=True,
            )
            logger.info(f"Successfully cancelled action: {action.name}")
        except grpc.aio.AioRpcError as e:
            if e.code() in [grpc.StatusCode.NOT_FOUND, grpc.StatusCode.FAILED_PRECONDITION]:
                logger.info(f"Action {action.name} not found, assumed completed or cancelled.")
                return
            # Any other RPC failure used to be dropped silently. Keep the best-effort flow
            # (the completion event below is still fired) but make the failure visible.
            logger.error(f"Failed to cancel action: {action.name}", exc_info=e)
    else:
        # If the action is not started, we have to ensure it does not get launched
        logger.info(f"Action {action.name} is not started, no need to cancel.")

    informer = await self._informers.get(run_name=action.run_name, parent_action_name=action.parent_action_name)
    if informer:
        await informer.fire_completion_event(action.name)
|
|
305
|
+
|
|
306
|
+
async def _bg_launch(self, action: Action):
    """
    Enqueue an action with the queue service if it has not been started yet.

    Nothing is done when the action is already started or has no task attached.
    An ``ALREADY_EXISTS`` response from the service is logged and treated as a
    normal outcome; any other gRPC error is logged and re-raised.

    :param action: The action to launch.
    :raises grpc.aio.AioRpcError: If enqueueing fails with a status other than
        ``ALREADY_EXISTS``.
    """
    # Guard clause: only unstarted actions that carry a task are launched.
    if action.is_started() or action.task is None:
        return

    logger.debug(f"Attempting to launch action: {action.name}")
    try:
        # The task identifier is copied field by field from the task template id.
        template_id = action.task.task_template.id
        task_identifier = task_definition_pb2.TaskIdentifier(
            version=template_id.version,
            org=template_id.org,
            project=template_id.project,
            domain=template_id.domain,
            name=template_id.name,
        )
        group_name = action.group.name if action.group else None
        request = queue_service_pb2.EnqueueActionRequest(
            action_id=action.action_id,
            parent_action_name=action.parent_action_name,
            task=queue_service_pb2.TaskAction(
                id=task_identifier,
                spec=action.task,
            ),
            input_uri=action.inputs_uri,
            run_output_base=action.run_output_base,
            group=group_name,
            # Subject is not used in the current implementation
        )
        await self._queue_service.EnqueueAction(
            request,
            wait_for_ready=True,
            timeout=self._enqueue_timeout,
        )
        logger.info(f"Successfully launched action: {action.name}")
    except grpc.aio.AioRpcError as rpc_err:
        if rpc_err.code() != grpc.StatusCode.ALREADY_EXISTS:
            logger.exception(f"Failed to launch action: {action.name} backing off...")
            logger.debug(f"Action details: {action}")
            raise rpc_err
        logger.info(f"Action {action.name} already exists, continuing to monitor.")
|
|
343
|
+
|
|
344
|
+
@log
async def _bg_process(self, action: Action):
    """
    Handle a single action update.

    An action that is not started is handed to ``_bg_launch``. A started action
    that is terminal has its completion event fired on its informer (when one
    exists). A started action that is not terminal is only logged.

    :param action: The action to process.
    """
    logger.debug(f"Processing action: name={action.name}, started={action.is_started()}")

    if not action.is_started():
        await self._bg_launch(action)
        return

    if not action.is_terminal():
        logger.debug(f"Resource {action.name} still in progress...")
        return

    # Started and terminal: notify whoever is waiting on this action.
    informer = await self._informers.get(run_name=action.run_name, parent_action_name=action.parent_action_name)
    if informer:
        await informer.fire_completion_event(action.name)
|
|
357
|
+
|
|
358
|
+
async def _bg_log_stats(self):
    """
    Periodically log started / pending / terminal action counts.

    While the controller is running, every tuple yielded by
    ``self._informers.count_started_pending_terminal_actions()`` is logged at
    info level, followed by a sleep of ``self._resource_log_interval``. This
    coroutine itself performs no log-level check.
    """
    while self._running:
        action_counts = self._informers.count_started_pending_terminal_actions()
        async for n_started, n_pending, n_terminal in action_counts:
            logger.info(f"Resource stats: Started={n_started}, Pending={n_pending}, Terminal={n_terminal}")
        # Pause between reporting rounds.
        await asyncio.sleep(self._resource_log_interval)
|
|
364
|
+
|
|
365
|
+
@log
async def _bg_run(self):
    """
    Worker loop: take actions off the shared queue and process them.

    Runs while ``self._running`` is true. When processing an action raises, the
    worker sleeps for ``self._min_backoff_on_err``, bumps the action's retry
    counter and puts the action back on the queue. Once the retry counter
    exceeds ``self._max_retries`` the action is instead given a
    ``RuntimeSystemError`` as its client error and its completion event is
    fired on its informer (when one exists). ``task_done()`` is called for
    every item taken from the queue.
    """
    while self._running:
        logger.debug(f"{threading.current_thread().name} Waiting for resource")
        action = await self._shared_queue.get()
        logger.debug(f"{threading.current_thread().name} Got resource {action.name}")
        try:
            await self._bg_process(action)
        except Exception as exc:
            logger.error(f"Error in controller loop: {exc}")
            # TODO we need a better way of handling backoffs currently the entire worker coroutine backs off
            await asyncio.sleep(self._min_backoff_on_err)
            action.increment_retries()

            if action.retries <= self._max_retries:
                # Retries remain: requeue the action (the finally clause still runs).
                await self._shared_queue.put(action)
                continue

            # Retry budget exhausted: surface the failure on the action itself.
            failure = flyte.errors.RuntimeSystemError(
                code=type(exc).__name__,
                message=f"Controller failed, system retries {action.retries}"
                f" crossed threshold {self._max_retries}",
            )
            failure.__cause__ = exc
            action.set_client_error(failure)
            informer = await self._informers.get(
                run_name=action.run_name,
                parent_action_name=action.parent_action_name,
            )
            if informer:
                await informer.fire_completion_event(action.name)
        finally:
            self._shared_queue.task_done()
|
|
396
|
+
|
|
397
|
+
@log
async def _bg_stop(self):
    """
    Stop the controller.

    Clears the running flag, cancels the resource-stats logging task, then
    removes and stops all informers.
    """
    # Clear the flag first so loops guarded by ``self._running`` stop iterating.
    self._running = False

    stats_task = self._resource_log_task
    stats_task.cancel()

    await self._informers.remove_and_stop_all()
|