flyte 0.2.0b1__py3-none-any.whl → 2.0.0b46__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (266) hide show
  1. flyte/__init__.py +83 -30
  2. flyte/_bin/connect.py +61 -0
  3. flyte/_bin/debug.py +38 -0
  4. flyte/_bin/runtime.py +87 -19
  5. flyte/_bin/serve.py +351 -0
  6. flyte/_build.py +3 -2
  7. flyte/_cache/cache.py +6 -5
  8. flyte/_cache/local_cache.py +216 -0
  9. flyte/_code_bundle/_ignore.py +31 -5
  10. flyte/_code_bundle/_packaging.py +42 -11
  11. flyte/_code_bundle/_utils.py +57 -34
  12. flyte/_code_bundle/bundle.py +130 -27
  13. flyte/_constants.py +1 -0
  14. flyte/_context.py +21 -5
  15. flyte/_custom_context.py +73 -0
  16. flyte/_debug/constants.py +37 -0
  17. flyte/_debug/utils.py +17 -0
  18. flyte/_debug/vscode.py +315 -0
  19. flyte/_deploy.py +396 -75
  20. flyte/_deployer.py +109 -0
  21. flyte/_environment.py +94 -11
  22. flyte/_excepthook.py +37 -0
  23. flyte/_group.py +2 -1
  24. flyte/_hash.py +1 -16
  25. flyte/_image.py +544 -231
  26. flyte/_initialize.py +456 -316
  27. flyte/_interface.py +40 -5
  28. flyte/_internal/controllers/__init__.py +22 -8
  29. flyte/_internal/controllers/_local_controller.py +159 -35
  30. flyte/_internal/controllers/_trace.py +18 -10
  31. flyte/_internal/controllers/remote/__init__.py +38 -9
  32. flyte/_internal/controllers/remote/_action.py +82 -12
  33. flyte/_internal/controllers/remote/_client.py +6 -2
  34. flyte/_internal/controllers/remote/_controller.py +290 -64
  35. flyte/_internal/controllers/remote/_core.py +155 -95
  36. flyte/_internal/controllers/remote/_informer.py +40 -20
  37. flyte/_internal/controllers/remote/_service_protocol.py +2 -2
  38. flyte/_internal/imagebuild/__init__.py +2 -10
  39. flyte/_internal/imagebuild/docker_builder.py +391 -84
  40. flyte/_internal/imagebuild/image_builder.py +111 -55
  41. flyte/_internal/imagebuild/remote_builder.py +409 -0
  42. flyte/_internal/imagebuild/utils.py +79 -0
  43. flyte/_internal/resolvers/_app_env_module.py +92 -0
  44. flyte/_internal/resolvers/_task_module.py +5 -38
  45. flyte/_internal/resolvers/app_env.py +26 -0
  46. flyte/_internal/resolvers/common.py +8 -1
  47. flyte/_internal/resolvers/default.py +2 -2
  48. flyte/_internal/runtime/convert.py +319 -36
  49. flyte/_internal/runtime/entrypoints.py +106 -18
  50. flyte/_internal/runtime/io.py +71 -23
  51. flyte/_internal/runtime/resources_serde.py +21 -7
  52. flyte/_internal/runtime/reuse.py +125 -0
  53. flyte/_internal/runtime/rusty.py +196 -0
  54. flyte/_internal/runtime/task_serde.py +239 -66
  55. flyte/_internal/runtime/taskrunner.py +48 -8
  56. flyte/_internal/runtime/trigger_serde.py +162 -0
  57. flyte/_internal/runtime/types_serde.py +7 -16
  58. flyte/_keyring/file.py +115 -0
  59. flyte/_link.py +30 -0
  60. flyte/_logging.py +241 -42
  61. flyte/_map.py +312 -0
  62. flyte/_metrics.py +59 -0
  63. flyte/_module.py +74 -0
  64. flyte/_pod.py +30 -0
  65. flyte/_resources.py +296 -33
  66. flyte/_retry.py +1 -7
  67. flyte/_reusable_environment.py +72 -7
  68. flyte/_run.py +462 -132
  69. flyte/_secret.py +47 -11
  70. flyte/_serve.py +333 -0
  71. flyte/_task.py +245 -56
  72. flyte/_task_environment.py +219 -97
  73. flyte/_task_plugins.py +47 -0
  74. flyte/_tools.py +8 -8
  75. flyte/_trace.py +15 -24
  76. flyte/_trigger.py +1027 -0
  77. flyte/_utils/__init__.py +12 -1
  78. flyte/_utils/asyn.py +3 -1
  79. flyte/_utils/async_cache.py +139 -0
  80. flyte/_utils/coro_management.py +5 -4
  81. flyte/_utils/description_parser.py +19 -0
  82. flyte/_utils/docker_credentials.py +173 -0
  83. flyte/_utils/helpers.py +45 -19
  84. flyte/_utils/module_loader.py +123 -0
  85. flyte/_utils/org_discovery.py +57 -0
  86. flyte/_utils/uv_script_parser.py +8 -1
  87. flyte/_version.py +16 -3
  88. flyte/app/__init__.py +27 -0
  89. flyte/app/_app_environment.py +362 -0
  90. flyte/app/_connector_environment.py +40 -0
  91. flyte/app/_deploy.py +130 -0
  92. flyte/app/_parameter.py +343 -0
  93. flyte/app/_runtime/__init__.py +3 -0
  94. flyte/app/_runtime/app_serde.py +383 -0
  95. flyte/app/_types.py +113 -0
  96. flyte/app/extras/__init__.py +9 -0
  97. flyte/app/extras/_auth_middleware.py +217 -0
  98. flyte/app/extras/_fastapi.py +93 -0
  99. flyte/app/extras/_model_loader/__init__.py +3 -0
  100. flyte/app/extras/_model_loader/config.py +7 -0
  101. flyte/app/extras/_model_loader/loader.py +288 -0
  102. flyte/cli/__init__.py +12 -0
  103. flyte/cli/_abort.py +28 -0
  104. flyte/cli/_build.py +114 -0
  105. flyte/cli/_common.py +493 -0
  106. flyte/cli/_create.py +371 -0
  107. flyte/cli/_delete.py +45 -0
  108. flyte/cli/_deploy.py +401 -0
  109. flyte/cli/_gen.py +316 -0
  110. flyte/cli/_get.py +446 -0
  111. flyte/cli/_option.py +33 -0
  112. flyte/{_cli → cli}/_params.py +57 -17
  113. flyte/cli/_plugins.py +209 -0
  114. flyte/cli/_prefetch.py +292 -0
  115. flyte/cli/_run.py +690 -0
  116. flyte/cli/_serve.py +338 -0
  117. flyte/cli/_update.py +86 -0
  118. flyte/cli/_user.py +20 -0
  119. flyte/cli/main.py +246 -0
  120. flyte/config/__init__.py +2 -167
  121. flyte/config/_config.py +215 -163
  122. flyte/config/_internal.py +10 -1
  123. flyte/config/_reader.py +225 -0
  124. flyte/connectors/__init__.py +11 -0
  125. flyte/connectors/_connector.py +330 -0
  126. flyte/connectors/_server.py +194 -0
  127. flyte/connectors/utils.py +159 -0
  128. flyte/errors.py +134 -2
  129. flyte/extend.py +24 -0
  130. flyte/extras/_container.py +69 -56
  131. flyte/git/__init__.py +3 -0
  132. flyte/git/_config.py +279 -0
  133. flyte/io/__init__.py +8 -1
  134. flyte/io/{structured_dataset → _dataframe}/__init__.py +32 -30
  135. flyte/io/{structured_dataset → _dataframe}/basic_dfs.py +75 -68
  136. flyte/io/{structured_dataset/structured_dataset.py → _dataframe/dataframe.py} +207 -242
  137. flyte/io/_dir.py +575 -113
  138. flyte/io/_file.py +587 -141
  139. flyte/io/_hashing_io.py +342 -0
  140. flyte/io/extend.py +7 -0
  141. flyte/models.py +635 -0
  142. flyte/prefetch/__init__.py +22 -0
  143. flyte/prefetch/_hf_model.py +563 -0
  144. flyte/remote/__init__.py +14 -3
  145. flyte/remote/_action.py +879 -0
  146. flyte/remote/_app.py +346 -0
  147. flyte/remote/_auth_metadata.py +42 -0
  148. flyte/remote/_client/_protocols.py +62 -4
  149. flyte/remote/_client/auth/_auth_utils.py +19 -0
  150. flyte/remote/_client/auth/_authenticators/base.py +8 -2
  151. flyte/remote/_client/auth/_authenticators/device_code.py +4 -5
  152. flyte/remote/_client/auth/_authenticators/factory.py +4 -0
  153. flyte/remote/_client/auth/_authenticators/passthrough.py +79 -0
  154. flyte/remote/_client/auth/_authenticators/pkce.py +17 -18
  155. flyte/remote/_client/auth/_channel.py +47 -18
  156. flyte/remote/_client/auth/_client_config.py +5 -3
  157. flyte/remote/_client/auth/_keyring.py +15 -2
  158. flyte/remote/_client/auth/_token_client.py +3 -3
  159. flyte/remote/_client/controlplane.py +206 -18
  160. flyte/remote/_common.py +66 -0
  161. flyte/remote/_data.py +107 -22
  162. flyte/remote/_logs.py +116 -33
  163. flyte/remote/_project.py +21 -19
  164. flyte/remote/_run.py +164 -631
  165. flyte/remote/_secret.py +72 -29
  166. flyte/remote/_task.py +387 -46
  167. flyte/remote/_trigger.py +368 -0
  168. flyte/remote/_user.py +43 -0
  169. flyte/report/_report.py +10 -6
  170. flyte/storage/__init__.py +13 -1
  171. flyte/storage/_config.py +237 -0
  172. flyte/storage/_parallel_reader.py +289 -0
  173. flyte/storage/_storage.py +268 -59
  174. flyte/syncify/__init__.py +56 -0
  175. flyte/syncify/_api.py +414 -0
  176. flyte/types/__init__.py +39 -0
  177. flyte/types/_interface.py +22 -7
  178. flyte/{io/pickle/transformer.py → types/_pickle.py} +37 -9
  179. flyte/types/_string_literals.py +8 -9
  180. flyte/types/_type_engine.py +226 -126
  181. flyte/types/_utils.py +1 -1
  182. flyte-2.0.0b46.data/scripts/debug.py +38 -0
  183. flyte-2.0.0b46.data/scripts/runtime.py +194 -0
  184. flyte-2.0.0b46.dist-info/METADATA +352 -0
  185. flyte-2.0.0b46.dist-info/RECORD +221 -0
  186. flyte-2.0.0b46.dist-info/entry_points.txt +8 -0
  187. flyte-2.0.0b46.dist-info/licenses/LICENSE +201 -0
  188. flyte/_api_commons.py +0 -3
  189. flyte/_cli/_common.py +0 -299
  190. flyte/_cli/_create.py +0 -42
  191. flyte/_cli/_delete.py +0 -23
  192. flyte/_cli/_deploy.py +0 -140
  193. flyte/_cli/_get.py +0 -235
  194. flyte/_cli/_run.py +0 -174
  195. flyte/_cli/main.py +0 -98
  196. flyte/_datastructures.py +0 -342
  197. flyte/_internal/controllers/pbhash.py +0 -39
  198. flyte/_protos/common/authorization_pb2.py +0 -66
  199. flyte/_protos/common/authorization_pb2.pyi +0 -108
  200. flyte/_protos/common/authorization_pb2_grpc.py +0 -4
  201. flyte/_protos/common/identifier_pb2.py +0 -71
  202. flyte/_protos/common/identifier_pb2.pyi +0 -82
  203. flyte/_protos/common/identifier_pb2_grpc.py +0 -4
  204. flyte/_protos/common/identity_pb2.py +0 -48
  205. flyte/_protos/common/identity_pb2.pyi +0 -72
  206. flyte/_protos/common/identity_pb2_grpc.py +0 -4
  207. flyte/_protos/common/list_pb2.py +0 -36
  208. flyte/_protos/common/list_pb2.pyi +0 -69
  209. flyte/_protos/common/list_pb2_grpc.py +0 -4
  210. flyte/_protos/common/policy_pb2.py +0 -37
  211. flyte/_protos/common/policy_pb2.pyi +0 -27
  212. flyte/_protos/common/policy_pb2_grpc.py +0 -4
  213. flyte/_protos/common/role_pb2.py +0 -37
  214. flyte/_protos/common/role_pb2.pyi +0 -53
  215. flyte/_protos/common/role_pb2_grpc.py +0 -4
  216. flyte/_protos/common/runtime_version_pb2.py +0 -28
  217. flyte/_protos/common/runtime_version_pb2.pyi +0 -24
  218. flyte/_protos/common/runtime_version_pb2_grpc.py +0 -4
  219. flyte/_protos/logs/dataplane/payload_pb2.py +0 -96
  220. flyte/_protos/logs/dataplane/payload_pb2.pyi +0 -168
  221. flyte/_protos/logs/dataplane/payload_pb2_grpc.py +0 -4
  222. flyte/_protos/secret/definition_pb2.py +0 -49
  223. flyte/_protos/secret/definition_pb2.pyi +0 -93
  224. flyte/_protos/secret/definition_pb2_grpc.py +0 -4
  225. flyte/_protos/secret/payload_pb2.py +0 -62
  226. flyte/_protos/secret/payload_pb2.pyi +0 -94
  227. flyte/_protos/secret/payload_pb2_grpc.py +0 -4
  228. flyte/_protos/secret/secret_pb2.py +0 -38
  229. flyte/_protos/secret/secret_pb2.pyi +0 -6
  230. flyte/_protos/secret/secret_pb2_grpc.py +0 -198
  231. flyte/_protos/secret/secret_pb2_grpc_grpc.py +0 -198
  232. flyte/_protos/validate/validate/validate_pb2.py +0 -76
  233. flyte/_protos/workflow/node_execution_service_pb2.py +0 -26
  234. flyte/_protos/workflow/node_execution_service_pb2.pyi +0 -4
  235. flyte/_protos/workflow/node_execution_service_pb2_grpc.py +0 -32
  236. flyte/_protos/workflow/queue_service_pb2.py +0 -106
  237. flyte/_protos/workflow/queue_service_pb2.pyi +0 -141
  238. flyte/_protos/workflow/queue_service_pb2_grpc.py +0 -172
  239. flyte/_protos/workflow/run_definition_pb2.py +0 -128
  240. flyte/_protos/workflow/run_definition_pb2.pyi +0 -310
  241. flyte/_protos/workflow/run_definition_pb2_grpc.py +0 -4
  242. flyte/_protos/workflow/run_logs_service_pb2.py +0 -41
  243. flyte/_protos/workflow/run_logs_service_pb2.pyi +0 -28
  244. flyte/_protos/workflow/run_logs_service_pb2_grpc.py +0 -69
  245. flyte/_protos/workflow/run_service_pb2.py +0 -133
  246. flyte/_protos/workflow/run_service_pb2.pyi +0 -175
  247. flyte/_protos/workflow/run_service_pb2_grpc.py +0 -412
  248. flyte/_protos/workflow/state_service_pb2.py +0 -58
  249. flyte/_protos/workflow/state_service_pb2.pyi +0 -71
  250. flyte/_protos/workflow/state_service_pb2_grpc.py +0 -138
  251. flyte/_protos/workflow/task_definition_pb2.py +0 -72
  252. flyte/_protos/workflow/task_definition_pb2.pyi +0 -65
  253. flyte/_protos/workflow/task_definition_pb2_grpc.py +0 -4
  254. flyte/_protos/workflow/task_service_pb2.py +0 -44
  255. flyte/_protos/workflow/task_service_pb2.pyi +0 -31
  256. flyte/_protos/workflow/task_service_pb2_grpc.py +0 -104
  257. flyte/io/_dataframe.py +0 -0
  258. flyte/io/pickle/__init__.py +0 -0
  259. flyte/remote/_console.py +0 -18
  260. flyte-0.2.0b1.dist-info/METADATA +0 -179
  261. flyte-0.2.0b1.dist-info/RECORD +0 -204
  262. flyte-0.2.0b1.dist-info/entry_points.txt +0 -3
  263. /flyte/{_cli → _debug}/__init__.py +0 -0
  264. /flyte/{_protos → _keyring}/__init__.py +0 -0
  265. {flyte-0.2.0b1.dist-info → flyte-2.0.0b46.dist-info}/WHEEL +0 -0
  266. {flyte-0.2.0b1.dist-info → flyte-2.0.0b46.dist-info}/top_level.txt +0 -0
@@ -1,17 +1,21 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import asyncio
4
+ import os
4
5
  import sys
5
6
  import threading
6
7
  from asyncio import Event
7
8
  from typing import Awaitable, Coroutine, Optional
8
9
 
9
10
  import grpc.aio
11
+ from aiolimiter import AsyncLimiter
12
+ from flyteidl2.common import identifier_pb2
13
+ from flyteidl2.task import task_definition_pb2
14
+ from flyteidl2.workflow import queue_service_pb2, run_definition_pb2
15
+ from google.protobuf.wrappers_pb2 import StringValue
10
16
 
11
17
  import flyte.errors
12
18
  from flyte._logging import log, logger
13
- from flyte._protos.workflow import queue_service_pb2, run_definition_pb2, task_definition_pb2
14
- from flyte.errors import RuntimeSystemError
15
19
 
16
20
  from ._action import Action
17
21
  from ._informer import InformerCache
@@ -28,11 +32,11 @@ class Controller:
28
32
  def __init__(
29
33
  self,
30
34
  client_coro: Awaitable[ClientSet],
31
- workers: int = 2,
32
- max_system_retries: int = 5,
35
+ workers: int = 20,
36
+ max_system_retries: int = 10,
33
37
  resource_log_interval_sec: float = 10.0,
34
- min_backoff_on_err_sec: float = 0.1,
35
- thread_wait_timeout_sec: float = 10.0,
38
+ min_backoff_on_err_sec: float = 0.5,
39
+ thread_wait_timeout_sec: float = 5.0,
36
40
  enqueue_timeout_sec: float = 5.0,
37
41
  ):
38
42
  """
@@ -49,14 +53,17 @@ class Controller:
49
53
  self._running = False
50
54
  self._resource_log_task = None
51
55
  self._workers = workers
52
- self._max_retries = max_system_retries
56
+ self._max_retries = int(os.getenv("_F_MAX_RETRIES", max_system_retries))
53
57
  self._resource_log_interval = resource_log_interval_sec
54
58
  self._min_backoff_on_err = min_backoff_on_err_sec
59
+ self._max_backoff_on_err = float(os.getenv("_F_MAX_BFF_ON_ERR", "10.0"))
55
60
  self._thread_wait_timeout = thread_wait_timeout_sec
56
61
  self._client_coro = client_coro
57
62
  self._failure_event: Event | None = None
58
63
  self._enqueue_timeout = enqueue_timeout_sec
59
64
  self._informer_start_wait_timeout = thread_wait_timeout_sec
65
+ max_qps = int(os.getenv("_F_MAX_QPS", "100"))
66
+ self._rate_limiter = AsyncLimiter(max_qps, 1.0)
60
67
 
61
68
  # Thread management
62
69
  self._thread = None
@@ -79,21 +86,19 @@ class Controller:
79
86
  """Public API to submit a resource and wait for completion"""
80
87
  return await self._run_coroutine_in_controller_thread(self._bg_submit_action(action))
81
88
 
82
- async def get_action(
83
- self, action_id: run_definition_pb2.ActionIdentifier, parent_action_name: str
84
- ) -> Optional[Action]:
89
+ async def get_action(self, action_id: identifier_pb2.ActionIdentifier, parent_action_name: str) -> Optional[Action]:
85
90
  """Get the action from the informer"""
86
- informer = await self._informers.get(run_name=action_id.run.name, parent_action_name=parent_action_name)
87
- if informer:
88
- return await informer.get(action_id.name)
89
- return None
91
+ return await self._run_coroutine_in_controller_thread(self._bg_get_action(action_id, parent_action_name))
90
92
 
91
93
  @log
92
94
  async def cancel_action(self, action: Action):
93
95
  return await self._run_coroutine_in_controller_thread(self._bg_cancel_action(action))
94
96
 
95
97
  async def _finalize_parent_action(
96
- self, run_id: run_definition_pb2.RunIdentifier, parent_action_name: str, timeout: Optional[float] = None
98
+ self,
99
+ run_id: identifier_pb2.RunIdentifier,
100
+ parent_action_name: str,
101
+ timeout: Optional[float] = None,
97
102
  ):
98
103
  """Finalize the parent run"""
99
104
  await self._run_coroutine_in_controller_thread(
@@ -111,19 +116,21 @@ class Controller:
111
116
  raise RuntimeError("Failure event not initialized")
112
117
  self._failure_event.set()
113
118
  except asyncio.CancelledError:
114
- pass
119
+ raise
115
120
 
116
121
  async def _bg_watch_for_errors(self):
117
122
  if self._failure_event is None:
118
123
  raise RuntimeError("Failure event not initialized")
119
124
  await self._failure_event.wait()
120
125
  logger.warning(f"Failure event received: {self._failure_event}, cleaning up informers and exiting.")
126
+ self._running = False
121
127
 
122
128
  async def watch_for_errors(self):
123
129
  """Watch for errors in the background thread"""
124
130
  await self._run_coroutine_in_controller_thread(self._bg_watch_for_errors())
125
- raise RuntimeSystemError(
126
- code="InformerWatchFailure", message=f"Controller thread failed with exception: {self._get_exception()}"
131
+ raise flyte.errors.RuntimeSystemError(
132
+ code="InformerWatchFailure",
133
+ message=f"Controller thread failed with exception: {self._get_exception()}",
127
134
  )
128
135
 
129
136
  @log
@@ -142,7 +149,6 @@ class Controller:
142
149
  with self._thread_com_lock:
143
150
  return self._thread_exception
144
151
 
145
- @log
146
152
  def _start(self):
147
153
  """Start the controller in a separate thread"""
148
154
  if self._thread and self._thread.is_alive():
@@ -155,13 +161,14 @@ class Controller:
155
161
  self._thread.start()
156
162
 
157
163
  # Wait for the thread to be ready
158
- logger.info("Waiting for controller thread to be ready...")
159
164
  if not self._thread_ready.wait(timeout=self._thread_wait_timeout):
165
+ logger.warning("Controller thread did not finish within timeout")
160
166
  raise TimeoutError("Controller thread failed to start in time")
161
167
 
162
168
  if self._get_exception():
163
- raise RuntimeSystemError(
164
- type(self._get_exception()).__name__, f"Controller thread startup failed: {self._get_exception()}"
169
+ raise flyte.errors.RuntimeSystemError(
170
+ type(self._get_exception()).__name__,
171
+ f"Controller thread startup failed: {self._get_exception()}",
165
172
  )
166
173
 
167
174
  logger.info(f"Controller started in thread: {self._thread.name}")
@@ -190,15 +197,16 @@ class Controller:
190
197
  # We will wait for this to signal that the thread is ready
191
198
  # Signal the main thread that we're ready
192
199
  logger.debug("Background thread initialization complete")
193
- self._thread_ready.set()
194
200
  if sys.version_info >= (3, 11):
195
201
  async with asyncio.TaskGroup() as tg:
196
202
  for i in range(self._workers):
197
- tg.create_task(self._bg_run())
203
+ tg.create_task(self._bg_run(f"worker-{i}"))
204
+ self._thread_ready.set()
198
205
  else:
199
206
  tasks = []
200
207
  for i in range(self._workers):
201
- tasks.append(asyncio.create_task(self._bg_run()))
208
+ tasks.append(asyncio.create_task(self._bg_run(f"worker-{i}")))
209
+ self._thread_ready.set()
202
210
  await asyncio.gather(*tasks)
203
211
 
204
212
  def _bg_thread_target(self):
@@ -207,6 +215,7 @@ class Controller:
207
215
  # Create a new event loop for this thread
208
216
  self._loop = asyncio.new_event_loop()
209
217
  asyncio.set_event_loop(self._loop)
218
+ self._loop.set_exception_handler(flyte.errors.silence_grpc_polling_error)
210
219
  logger.debug(f"Controller thread started with new event loop: {threading.current_thread().name}")
211
220
 
212
221
  # Create an event to signal the errors were observed in the thread's loop
@@ -216,13 +225,14 @@ class Controller:
216
225
  except Exception as e:
217
226
  logger.error(f"Controller thread encountered an exception: {e}")
218
227
  self._set_exception(e)
228
+ self._failure_event.set()
219
229
  finally:
220
230
  if self._loop and self._loop.is_running():
221
231
  self._loop.close()
222
232
  logger.debug(f"Controller thread exiting: {threading.current_thread().name}")
223
233
 
224
234
  async def _bg_get_action(
225
- self, action_id: run_definition_pb2.ActionIdentifier, parent_action_name: str
235
+ self, action_id: identifier_pb2.ActionIdentifier, parent_action_name: str
226
236
  ) -> Optional[Action]:
227
237
  """Get the action from the informer"""
228
238
  # Ensure the informer is created and wait for it to be ready
@@ -239,13 +249,15 @@ class Controller:
239
249
  return None
240
250
 
241
251
  async def _bg_finalize_informer(
242
- self, run_id: run_definition_pb2.RunIdentifier, parent_action_name: str, timeout: Optional[float] = None
252
+ self,
253
+ run_id: identifier_pb2.RunIdentifier,
254
+ parent_action_name: str,
255
+ timeout: Optional[float] = None,
243
256
  ):
244
257
  informer = await self._informers.remove(run_name=run_id.name, parent_action_name=parent_action_name)
245
258
  if informer:
246
259
  await informer.stop()
247
260
 
248
- @log
249
261
  async def _bg_submit_action(self, action: Action) -> Action:
250
262
  """Submit a resource and await its completion, returning the final state"""
251
263
  logger.debug(f"{threading.current_thread().name} Submitting action {action.name}")
@@ -270,7 +282,7 @@ class Controller:
270
282
  raise ValueError(f"Action {action.name} not found")
271
283
  logger.debug(f"{threading.current_thread().name} Removed completion event for action {action.name}")
272
284
  await informer.remove(action.name) # TODO we should not remove maybe, we should keep a record of completed?
273
- logger.debug(f"{threading.current_thread().name} Removed action {action.name}, final={final_resource}")
285
+ logger.debug(f"{threading.current_thread().name} Removed action {action.name}")
274
286
  return final_resource
275
287
 
276
288
  async def _bg_cancel_action(self, action: Action):
@@ -284,17 +296,21 @@ class Controller:
284
296
  started = action.is_started()
285
297
  action.mark_cancelled()
286
298
  if started:
287
- logger.info(f"Cancelling action: {action.name}")
288
- try:
289
- await self._queue_service.AbortQueuedAction(
290
- queue_service_pb2.AbortQueuedActionRequest(action_id=action.action_id),
291
- wait_for_ready=True,
292
- )
293
- logger.info(f"Successfully cancelled action: {action.name}")
294
- except grpc.aio.AioRpcError as e:
295
- if e.code() in [grpc.StatusCode.NOT_FOUND, grpc.StatusCode.FAILED_PRECONDITION]:
296
- logger.info(f"Action {action.name} not found, assumed completed or cancelled.")
297
- return
299
+ async with self._rate_limiter:
300
+ logger.info(f"Cancelling action: {action.name}")
301
+ try:
302
+ await self._queue_service.AbortQueuedAction(
303
+ queue_service_pb2.AbortQueuedActionRequest(action_id=action.action_id),
304
+ wait_for_ready=True,
305
+ )
306
+ logger.info(f"Successfully cancelled action: {action.name}")
307
+ except grpc.aio.AioRpcError as e:
308
+ if e.code() in [
309
+ grpc.StatusCode.NOT_FOUND,
310
+ grpc.StatusCode.FAILED_PRECONDITION,
311
+ ]:
312
+ logger.info(f"Action {action.name} not found, assumed completed or cancelled.")
313
+ return
298
314
  else:
299
315
  # If the action is not started, we have to ensure it does not get launched
300
316
  logger.info(f"Action {action.name} is not started, no need to cancel.")
@@ -307,41 +323,72 @@ class Controller:
307
323
  """
308
324
  Attempt to launch an action.
309
325
  """
310
- if not action.is_started() and action.task is not None:
311
- logger.debug(f"Attempting to launch action: {action.name}")
312
- try:
313
- await self._queue_service.EnqueueAction(
314
- queue_service_pb2.EnqueueActionRequest(
315
- action_id=action.action_id,
316
- parent_action_name=action.parent_action_name,
317
- task=queue_service_pb2.TaskAction(
318
- id=task_definition_pb2.TaskIdentifier(
319
- version=action.task.task_template.id.version,
320
- org=action.task.task_template.id.org,
321
- project=action.task.task_template.id.project,
322
- domain=action.task.task_template.id.domain,
323
- name=action.task.task_template.id.name,
324
- ),
325
- spec=action.task,
326
+ if not action.is_started():
327
+ async with self._rate_limiter:
328
+ task: run_definition_pb2.TaskAction | None = None
329
+ trace: run_definition_pb2.TraceAction | None = None
330
+ if action.type == "task":
331
+ if action.task is None:
332
+ raise flyte.errors.RuntimeSystemError(
333
+ "NoTaskSpec", "Task Spec not found, cannot launch Task Action."
334
+ )
335
+ cache_key = None
336
+ logger.info(f"Action {action.name} has cache version {action.cache_key}")
337
+ if action.cache_key:
338
+ cache_key = StringValue(value=action.cache_key)
339
+
340
+ task = run_definition_pb2.TaskAction(
341
+ id=task_definition_pb2.TaskIdentifier(
342
+ version=action.task.task_template.id.version,
343
+ org=action.task.task_template.id.org,
344
+ project=action.task.task_template.id.project,
345
+ domain=action.task.task_template.id.domain,
346
+ name=action.task.task_template.id.name,
326
347
  ),
327
- input_uri=action.inputs_uri,
328
- run_output_base=action.run_output_base,
329
- group=action.group.name if action.group else None,
330
- # Subject is not used in the current implementation
331
- ),
332
- wait_for_ready=True,
333
- timeout=self._enqueue_timeout,
334
- )
335
- logger.info(f"Successfully launched action: {action.name}")
336
- except grpc.aio.AioRpcError as e:
337
- if e.code() == grpc.StatusCode.ALREADY_EXISTS:
338
- logger.info(f"Action {action.name} already exists, continuing to monitor.")
339
- return
340
- logger.exception(f"Failed to launch action: {action.name} backing off...")
341
- logger.debug(f"Action details: {action}")
342
- raise e
348
+ spec=action.task,
349
+ cache_key=cache_key,
350
+ cluster=action.queue,
351
+ )
352
+ elif action.type == "trace":
353
+ trace = action.trace
354
+
355
+ logger.debug(f"Attempting to launch action: {action.name}")
356
+ try:
357
+ await self._queue_service.EnqueueAction(
358
+ queue_service_pb2.EnqueueActionRequest(
359
+ action_id=action.action_id,
360
+ parent_action_name=action.parent_action_name,
361
+ task=task,
362
+ trace=trace,
363
+ input_uri=action.inputs_uri,
364
+ run_output_base=action.run_output_base,
365
+ group=action.group.name if action.group else None,
366
+ # Subject is not used in the current implementation
367
+ ),
368
+ wait_for_ready=True,
369
+ timeout=self._enqueue_timeout,
370
+ )
371
+ logger.info(f"Successfully launched action: {action.name}")
372
+ except grpc.aio.AioRpcError as e:
373
+ if e.code() == grpc.StatusCode.ALREADY_EXISTS:
374
+ logger.info(f"Action {action.name} already exists, continuing to monitor.")
375
+ return
376
+ if e.code() in [
377
+ grpc.StatusCode.FAILED_PRECONDITION,
378
+ grpc.StatusCode.INVALID_ARGUMENT,
379
+ grpc.StatusCode.NOT_FOUND,
380
+ ]:
381
+ raise flyte.errors.RuntimeSystemError(
382
+ e.code().name, f"Precondition failed: {e.details()}"
383
+ ) from e
384
+ # For all other errors, we will retry with backoff
385
+ logger.exception(
386
+ f"Failed to launch action: {action.name}, Code: {e.code()}, "
387
+ f"Details {e.details()} backing off..."
388
+ )
389
+ logger.debug(f"Action details: {action}")
390
+ raise flyte.errors.SlowDownError(f"Failed to launch action: {e.details()}") from e
343
391
 
344
- @log
345
392
  async def _bg_process(self, action: Action):
346
393
  """Process resource updates"""
347
394
  logger.debug(f"Processing action: name={action.name}, started={action.is_started()}")
@@ -358,39 +405,52 @@ class Controller:
358
405
  async def _bg_log_stats(self):
359
406
  """Periodically log resource stats if debug is enabled"""
360
407
  while self._running:
361
- async for started, pending, terminal in self._informers.count_started_pending_terminal_actions():
408
+ async for (
409
+ started,
410
+ pending,
411
+ terminal,
412
+ ) in self._informers.count_started_pending_terminal_actions():
362
413
  logger.info(f"Resource stats: Started={started}, Pending={pending}, Terminal={terminal}")
363
414
  await asyncio.sleep(self._resource_log_interval)
364
415
 
365
- @log
366
- async def _bg_run(self):
416
+ async def _bg_run(self, worker_id: str):
367
417
  """Run loop with resource status logging"""
418
+ logger.info(f"Worker {worker_id} started")
368
419
  while self._running:
369
420
  logger.debug(f"{threading.current_thread().name} Waiting for resource")
370
421
  action = await self._shared_queue.get()
371
422
  logger.debug(f"{threading.current_thread().name} Got resource {action.name}")
372
423
  try:
373
- await self._bg_process(action)
374
- except Exception as e:
375
- logger.error(f"Error in controller loop: {e}")
376
- # TODO we need a better way of handling backoffs currently the entire worker coroutine backs off
377
- await asyncio.sleep(self._min_backoff_on_err)
378
- action.increment_retries()
379
- if action.retries > self._max_retries:
380
- err = flyte.errors.RuntimeSystemError(
381
- code=type(e).__name__,
382
- message=f"Controller failed, system retries {action.retries}"
383
- f" crossed threshold {self._max_retries}",
384
- )
385
- err.__cause__ = e
386
- action.set_client_error(err)
387
- informer = await self._informers.get(
388
- run_name=action.run_name, parent_action_name=action.parent_action_name
424
+ try:
425
+ await self._bg_process(action)
426
+ except flyte.errors.SlowDownError as e:
427
+ action.retries += 1
428
+ if action.retries > self._max_retries:
429
+ raise
430
+ backoff = min(self._min_backoff_on_err * (2 ** (action.retries - 1)), self._max_backoff_on_err)
431
+ logger.warning(
432
+ f"[{worker_id}] Backing off for {backoff} [retry {action.retries}/{self._max_retries}] "
433
+ f"on action {action.name} due to error: {e}"
389
434
  )
390
- if informer:
391
- await informer.fire_completion_event(action.name)
392
- else:
435
+ await asyncio.sleep(backoff)
436
+ logger.warning(f"[{worker_id}] Retrying action {action.name} after backoff")
393
437
  await self._shared_queue.put(action)
438
+ except Exception as e:
439
+ logger.error(f"[{worker_id}] Error in controller loop for {action.name}: {e}")
440
+ err = flyte.errors.RuntimeSystemError(
441
+ code=type(e).__name__,
442
+ message=f"Controller failed, system retries {action.retries} / {self._max_retries} "
443
+ f"crossed threshold, for action {action.name}: {e}",
444
+ worker=worker_id,
445
+ )
446
+ err.__cause__ = e
447
+ action.set_client_error(err)
448
+ informer = await self._informers.get(
449
+ run_name=action.run_name,
450
+ parent_action_name=action.parent_action_name,
451
+ )
452
+ if informer:
453
+ await informer.fire_completion_event(action.name)
394
454
  finally:
395
455
  self._shared_queue.task_done()
396
456
 
@@ -5,9 +5,10 @@ from asyncio import Queue
5
5
  from typing import AsyncIterator, Callable, Dict, Optional, Tuple, cast
6
6
 
7
7
  import grpc.aio
8
+ from flyteidl2.common import identifier_pb2, phase_pb2
9
+ from flyteidl2.workflow import state_service_pb2
8
10
 
9
11
  from flyte._logging import log, logger
10
- from flyte._protos.workflow import run_definition_pb2, state_service_pb2
11
12
 
12
13
  from ._action import Action
13
14
  from ._service_protocol import StateService
@@ -38,12 +39,14 @@ class ActionCache:
38
39
  """
39
40
  Add an action to the cache if it doesn't exist. This is invoked by the watch.
40
41
  """
41
- logger.info(f"Observing phase {run_definition_pb2.Phase.Name(state.phase)} for {state.action_id.name}")
42
+ logger.debug(f"Observing phase {phase_pb2.ActionPhase.Name(state.phase)} for {state.action_id.name}")
42
43
  if state.output_uri:
43
- logger.info(f"Output URI: {state.output_uri}")
44
+ logger.debug(f"Output URI: {state.output_uri}")
44
45
  else:
45
- logger.info(f"{state.action_id.name} has no output URI")
46
- if state.phase == run_definition_pb2.Phase.PHASE_FAILED:
46
+ logger.warning(
47
+ f"{state.action_id.name} has no output URI, in phase {phase_pb2.ActionPhase.Name(state.phase)}"
48
+ )
49
+ if state.phase == phase_pb2.ACTION_PHASE_FAILED:
47
50
  logger.error(
48
51
  f"Action {state.action_id.name} failed with error (msg):"
49
52
  f" [{state.error if state.HasField('error') else None}]"
@@ -125,12 +128,14 @@ class Informer:
125
128
 
126
129
  def __init__(
127
130
  self,
128
- run_id: run_definition_pb2.RunIdentifier,
131
+ run_id: identifier_pb2.RunIdentifier,
129
132
  parent_action_name: str,
130
133
  shared_queue: Queue,
131
134
  client: Optional[StateService] = None,
132
- watch_backoff_interval_sec: float = 1.0,
135
+ min_watch_backoff: float = 1.0,
136
+ max_watch_backoff: float = 30.0,
133
137
  watch_conn_timeout_sec: float = 5.0,
138
+ max_watch_retries: int = 10,
134
139
  ):
135
140
  self.name = self.mkname(run_name=run_id.name, parent_action_name=parent_action_name)
136
141
  self.parent_action_name = parent_action_name
@@ -141,8 +146,10 @@ class Informer:
141
146
  self._running = False
142
147
  self._watch_task: asyncio.Task | None = None
143
148
  self._ready = asyncio.Event()
144
- self._watch_backoff_interval_sec = watch_backoff_interval_sec
149
+ self._min_watch_backoff = min_watch_backoff
150
+ self._max_watch_backoff = max_watch_backoff
145
151
  self._watch_conn_timeout_sec = watch_conn_timeout_sec
152
+ self._max_watch_retries = max_watch_retries
146
153
 
147
154
  @classmethod
148
155
  def mkname(cls, *, run_name: str, parent_action_name: str) -> str:
@@ -208,16 +215,19 @@ class Informer:
208
215
  """
209
216
  # sentinel = False
210
217
  retries = 0
211
- max_retries = 5
212
218
  last_exc = None
213
219
  while self._running:
214
- if retries >= max_retries:
215
- logger.error(f"Informer watch failure retries crossed threshold {retries}/{max_retries}, exiting!")
220
+ if retries >= self._max_watch_retries:
221
+ logger.error(
222
+ f"Informer watch failure retries crossed threshold {retries}/{self._max_watch_retries}, exiting!"
223
+ )
216
224
  raise last_exc
217
225
  try:
226
+ if retries >= 1:
227
+ logger.warning(f"Informer watch retrying, attempt {retries}/{self._max_watch_retries}")
218
228
  watcher = self._client.Watch(
219
229
  state_service_pb2.WatchRequest(
220
- parent_action_id=run_definition_pb2.ActionIdentifier(
230
+ parent_action_id=identifier_pb2.ActionIdentifier(
221
231
  name=self.parent_action_name,
222
232
  run=self._run_id,
223
233
  ),
@@ -235,7 +245,7 @@ class Informer:
235
245
  await self._shared_queue.put(node)
236
246
  # hack to work in the absence of sentinel
237
247
  except asyncio.CancelledError:
238
- logger.warning(f"Watch cancelled: {self.name}")
248
+ logger.info(f"Watch cancelled: {self.name}")
239
249
  return
240
250
  except asyncio.TimeoutError as e:
241
251
  logger.error(f"Watch timeout: {self.name}", exc_info=e)
@@ -249,7 +259,9 @@ class Informer:
249
259
  logger.exception(f"Watch error: {self.name}", exc_info=e)
250
260
  last_exc = e
251
261
  retries += 1
252
- await asyncio.sleep(self._watch_backoff_interval_sec)
262
+ backoff = min(self._min_watch_backoff * (2**retries), self._max_watch_backoff)
263
+ logger.warning(f"Watch for {self.name} failed, retrying in {backoff} seconds...")
264
+ await asyncio.sleep(backoff)
253
265
 
254
266
  @log
255
267
  async def start(self, timeout: Optional[float] = None) -> asyncio.Task:
@@ -258,7 +270,7 @@ class Informer:
258
270
  logger.warning("Informer already running")
259
271
  return cast(asyncio.Task, self._watch_task)
260
272
  self._running = True
261
- self._watch_task = asyncio.create_task(self.watch())
273
+ self._watch_task = asyncio.create_task(self.watch(), name=f"InformerWatch-{self.parent_action_name}")
262
274
  await self.wait_for_cache_sync(timeout=timeout)
263
275
  return self._watch_task
264
276
 
@@ -288,7 +300,7 @@ class InformerCache:
288
300
  @log
289
301
  async def get_or_create(
290
302
  self,
291
- run_id: run_definition_pb2.RunIdentifier,
303
+ run_id: identifier_pb2.RunIdentifier,
292
304
  parent_action_name: str,
293
305
  shared_queue: Queue,
294
306
  state_service: StateService,
@@ -330,20 +342,28 @@ class InformerCache:
330
342
  async def get(self, *, run_name: str, parent_action_name: str) -> Informer | None:
331
343
  """Get an informer by name"""
332
344
  async with self._lock:
333
- return self._cache.get(Informer.mkname(run_name=run_name, parent_action_name=parent_action_name), None)
345
+ return self._cache.get(
346
+ Informer.mkname(run_name=run_name, parent_action_name=parent_action_name),
347
+ None,
348
+ )
334
349
 
335
350
  @log
336
351
  async def remove(self, *, run_name: str, parent_action_name: str) -> Informer | None:
337
352
  """Remove an informer from the cache"""
338
353
  async with self._lock:
339
- return self._cache.pop(Informer.mkname(run_name=run_name, parent_action_name=parent_action_name), None)
354
+ return self._cache.pop(
355
+ Informer.mkname(run_name=run_name, parent_action_name=parent_action_name),
356
+ None,
357
+ )
340
358
 
341
359
  async def has(self, *, run_name: str, parent_action_name: str) -> bool:
342
360
  """Check if an informer exists in the cache"""
343
361
  async with self._lock:
344
362
  return Informer.mkname(run_name=run_name, parent_action_name=parent_action_name) in self._cache
345
363
 
346
- async def count_started_pending_terminal_actions(self) -> AsyncIterator[Tuple[int, int, int]]:
364
+ async def count_started_pending_terminal_actions(
365
+ self,
366
+ ) -> AsyncIterator[Tuple[int, int, int]]:
347
367
  """Log resource stats"""
348
368
  async with self._lock:
349
369
  for informer in self._cache.values():
@@ -353,7 +373,7 @@ class InformerCache:
353
373
  """Stop all informers and remove them from the cache"""
354
374
  async with self._lock:
355
375
  while self._cache:
356
- name, informer = self._cache.popitem()
376
+ _name, informer = self._cache.popitem()
357
377
  try:
358
378
  await informer.stop()
359
379
  except asyncio.CancelledError:
@@ -2,7 +2,7 @@ from __future__ import annotations
2
2
 
3
3
  from typing import AsyncIterator, Protocol
4
4
 
5
- from flyte._protos.workflow import queue_service_pb2, state_service_pb2
5
+ from flyteidl2.workflow import queue_service_pb2, state_service_pb2
6
6
 
7
7
 
8
8
  class StateService(Protocol):
@@ -33,7 +33,7 @@ class QueueService(Protocol):
33
33
  req: queue_service_pb2.AbortQueuedActionRequest,
34
34
  **kwargs,
35
35
  ) -> queue_service_pb2.AbortQueuedActionResponse:
36
- """Dequeue a task"""
36
+ """Cancel an enqueued task"""
37
37
 
38
38
 
39
39
  class ClientSet(Protocol):
@@ -1,11 +1,3 @@
1
- import asyncio
2
- from typing import List
1
+ from flyte._internal.imagebuild.image_builder import ImageBuildEngine
3
2
 
4
- from flyte._image import Image
5
- from flyte._internal.imagebuild.docker_builder import DockerImageBuilder
6
-
7
-
8
- async def build(images: List[Image]) -> List[str]:
9
- builder = DockerImageBuilder()
10
- ts = [asyncio.create_task(builder.build_image(image)) for image in images]
11
- return list(await asyncio.gather(*ts))
3
+ __all__ = ["ImageBuildEngine"]