indexify 0.3.18__py3-none-any.whl → 0.3.19__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- indexify/cli/cli.py +3 -17
- indexify/executor/api_objects.py +12 -0
- indexify/executor/downloader.py +4 -1
- indexify/executor/executor.py +51 -29
- indexify/executor/function_executor/function_executor.py +24 -11
- indexify/executor/function_executor/function_executor_state.py +9 -1
- indexify/executor/function_executor/function_executor_states_container.py +3 -1
- indexify/executor/function_executor/function_executor_status.py +2 -0
- indexify/executor/function_executor/server/subprocess_function_executor_server_factory.py +6 -0
- indexify/executor/function_executor/single_task_runner.py +15 -11
- indexify/executor/function_executor/task_output.py +35 -2
- indexify/executor/grpc/completed_tasks_container.py +26 -0
- indexify/executor/grpc/function_executor_controller.py +421 -0
- indexify/executor/grpc/state_reconciler.py +24 -34
- indexify/executor/grpc/state_reporter.py +35 -32
- indexify/executor/grpc/task_controller.py +449 -0
- indexify/executor/metrics/task_reporter.py +14 -0
- indexify/executor/task_reporter.py +95 -4
- indexify/executor/task_runner.py +1 -0
- indexify/proto/executor_api.proto +63 -5
- indexify/proto/executor_api_pb2.py +40 -30
- indexify/proto/executor_api_pb2.pyi +118 -3
- indexify/proto/executor_api_pb2_grpc.py +47 -0
- {indexify-0.3.18.dist-info → indexify-0.3.19.dist-info}/METADATA +1 -1
- {indexify-0.3.18.dist-info → indexify-0.3.19.dist-info}/RECORD +27 -24
- {indexify-0.3.18.dist-info → indexify-0.3.19.dist-info}/WHEEL +0 -0
- {indexify-0.3.18.dist-info → indexify-0.3.19.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,421 @@
|
|
1
|
+
import asyncio
|
2
|
+
from typing import Any, Optional
|
3
|
+
|
4
|
+
from tensorlake.function_executor.proto.function_executor_pb2 import (
|
5
|
+
InitializeRequest,
|
6
|
+
SerializedObject,
|
7
|
+
)
|
8
|
+
from tensorlake.function_executor.proto.message_validator import MessageValidator
|
9
|
+
|
10
|
+
from indexify.proto.executor_api_pb2 import (
|
11
|
+
FunctionExecutorDescription,
|
12
|
+
)
|
13
|
+
from indexify.proto.executor_api_pb2 import (
|
14
|
+
FunctionExecutorStatus as FunctionExecutorStatusProto,
|
15
|
+
)
|
16
|
+
|
17
|
+
from ..downloader import Downloader
|
18
|
+
from ..function_executor.function_executor import CustomerError, FunctionExecutor
|
19
|
+
from ..function_executor.function_executor_state import FunctionExecutorState
|
20
|
+
from ..function_executor.function_executor_status import FunctionExecutorStatus
|
21
|
+
from ..function_executor.health_checker import HealthCheckResult
|
22
|
+
from ..function_executor.server.function_executor_server_factory import (
|
23
|
+
FunctionExecutorServerConfiguration,
|
24
|
+
FunctionExecutorServerFactory,
|
25
|
+
)
|
26
|
+
|
27
|
+
|
28
|
+
class FunctionExecutorController:
    """Reconciles a single Function Executor (FE) with the status desired by Server.

    The desired status is supplied via set_desired_status(). A background asyncio
    task (the reconciliation loop) wakes up on every change of the desired status
    and calls _reconcile() with concurrency of 1.
    """

    def __init__(
        self,
        executor_id: str,
        function_executor_state: FunctionExecutorState,
        function_executor_description: FunctionExecutorDescription,
        function_executor_server_factory: FunctionExecutorServerFactory,
        downloader: Downloader,
        base_url: str,
        config_path: str,
        logger: Any,
    ):
        """Initializes the FunctionExecutorController.

        Must be called while an asyncio event loop is running because it schedules
        the reconciliation loop task.

        Raises ValueError if the supplied FunctionExecutorDescription is not valid.
        """
        _validate_function_executor_description(function_executor_description)
        self._executor_id: str = executor_id
        self._function_executor_state: FunctionExecutorState = function_executor_state
        self._function_executor_description: FunctionExecutorDescription = (
            function_executor_description
        )
        self._function_executor_server_factory: FunctionExecutorServerFactory = (
            function_executor_server_factory
        )
        self._downloader: Downloader = downloader
        self._base_url: str = base_url
        self._config_path: str = config_path
        self._logger: Any = logger.bind(
            module=__name__,
            function_executor_id=function_executor_description.id,
            namespace=function_executor_description.namespace,
            graph_name=function_executor_description.graph_name,
            graph_version=function_executor_description.graph_version,
            function_name=function_executor_description.function_name,
            image_uri=function_executor_description.image_uri,
        )
        # The lock protects the desired status.
        self._lock: asyncio.Lock = asyncio.Lock()
        # The same as the initial FE status.
        self._desired_status: FunctionExecutorStatusProto = (
            FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STOPPED
        )
        self._desired_status_change_notifier: asyncio.Condition = asyncio.Condition(
            lock=self._lock
        )
        # The loop task is created last so that everything the loop reads
        # (lock, desired status, notifier) already exists when it is scheduled.
        self._reconciliation_loop_task: asyncio.Task = asyncio.create_task(
            self._reconciliation_loop()
        )

    async def set_desired_status(
        self, desired_status: FunctionExecutorStatusProto
    ) -> None:
        """Updates the desired Function Executor status.

        Reconciliation is done asynchronously.
        """
        async with self._lock:
            if self._desired_status == desired_status:
                return
            self._desired_status = desired_status
            self._desired_status_change_notifier.notify_all()

    async def _reconciliation_loop(self) -> None:
        """Waits for desired status changes and reconciles the FE state on each change."""
        self._logger.info("function executor controller reconciliation loop started")
        # The same as the initial FE status.
        last_seen_desired_status: FunctionExecutorStatusProto = (
            FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STOPPED
        )
        # The loop is exited via loop async task cancellation on FE shutdown.
        while True:
            async with self._lock:
                while last_seen_desired_status == self._desired_status:
                    await self._desired_status_change_notifier.wait()

                last_seen_desired_status = self._desired_status
            # It's guaranteed that we don't run _reconcile concurrently multiple times.
            await self._reconcile(last_seen_desired_status)

    async def _reconcile(self, desired_status: FunctionExecutorStatusProto) -> None:
        """Moves the FE from its current status towards the desired status.

        Acquires the FE state lock for the duration of the call.
        """
        async with self._function_executor_state.lock:
            current_status: FunctionExecutorStatus = (
                self._function_executor_state.status
            )
            # We have to process all possible combination of current and desired statuses.
            if current_status == FunctionExecutorStatus.STARTUP_FAILED_CUSTOMER_ERROR:
                if (
                    desired_status
                    == FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STARTUP_FAILED_CUSTOMER_ERROR
                ):
                    return  # Same status, nothing to do.

                # All we can do from the current status is to destroy the FE to possibly recreate it later
                # if Server requests to do this. This is why we don't accept any other desired statuses.
                return await self._destroy_or_shutdown_fe_if_desired(desired_status)

            if current_status == FunctionExecutorStatus.STARTUP_FAILED_PLATFORM_ERROR:
                if (
                    desired_status
                    == FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STARTUP_FAILED_PLATFORM_ERROR
                ):
                    return  # Same status, nothing to do.

                # All we can do from the current status is to destroy the FE to possibly recreate it later
                # if Server requests to do this. This is why we don't accept any other desired statuses.
                return await self._destroy_or_shutdown_fe_if_desired(desired_status)

            if current_status == FunctionExecutorStatus.IDLE:
                if (
                    desired_status
                    == FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_IDLE
                ):
                    return  # Same status, nothing to do.

                # Server can only request FE destroy or shutdown when FE has IDLE status.
                # Transition from IDLE to RUNNING_TASK can only be done by Task controller.
                # Transition from IDLE to UNHEALTHY can only be done by FE controller.
                return await self._destroy_or_shutdown_fe_if_desired(desired_status)

            if current_status == FunctionExecutorStatus.RUNNING_TASK:
                if (
                    desired_status
                    == FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_RUNNING_TASK
                ):
                    return  # Same status, nothing to do.

                # Server can only request FE destroy or shutdown when FE has RUNNING_TASK status.
                # Transition from RUNNING_TASK to UNHEALTHY can only be done by Task controller.
                return await self._destroy_or_shutdown_fe_if_desired(desired_status)

            if current_status == FunctionExecutorStatus.UNHEALTHY:
                if (
                    desired_status
                    == FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_UNHEALTHY
                ):
                    return  # Same status, nothing to do.

                # Server can only request FE destroy or shutdown when FE has UNHEALTHY status.
                return await self._destroy_or_shutdown_fe_if_desired(desired_status)

            if current_status == FunctionExecutorStatus.DESTROYED:
                if (
                    desired_status
                    == FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STOPPED
                ):
                    return  # Same status, nothing to do.

                return await self._reconcile_from_destroyed(desired_status)

            # _reconcile() can't be called when current FE status is one of "long running" states
            # handled by FE controller like STARTING_UP and DESTROYING. This is because _reconcile()
            # is called with concurrency of 1 and _reconcile() waits until these long running states
            # (operations) are finished before returning.
            #
            # It's not possible to have SHUTDOWN current status because when FE controller transitions to SHUTDOWN
            # status, it cancels the reconciliation loop task.
            self._logger.error(
                "unexpected current function executor status, skipping state reconciliation",
                current_status=current_status.name,
                desired_status=FunctionExecutorStatusProto.Name(desired_status),
            )

    async def _destroy_or_shutdown_fe_if_desired(
        self, desired_status: FunctionExecutorStatusProto
    ) -> None:
        """Destroys the Function Executor if desired status asks for it.

        Otherwise logs an error because other actions are not allowed by the current status.
        Caller holds the FE state lock.
        """
        if desired_status not in [
            FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STOPPING,
            FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STOPPED,
            FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_SHUTDOWN,
        ]:
            self._logger.error(
                "unexpected desired function executor status received from server, skipping state reconciliation",
                current_status=self._function_executor_state.status.name,
                desired_status=FunctionExecutorStatusProto.Name(desired_status),
            )
            return

        await self._destroy_function_executor()
        # FE state status is now DESTROYED.
        if (
            desired_status
            == FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_SHUTDOWN
        ):
            await self._shutdown()
            # No code is executed after this point because reconciliation loop aio task is cancelled.

    async def _reconcile_from_destroyed(
        self, desired_status: FunctionExecutorStatusProto
    ) -> None:
        """Reconciles the FE state when it has DESTROYED status.

        Caller holds the FE state lock.
        """
        if desired_status not in [
            FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STARTING_UP,
            FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_IDLE,
            FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_RUNNING_TASK,
            FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_SHUTDOWN,
        ]:
            self._logger.error(
                "unexpected desired function executor status received from server, skipping state reconciliation",
                current_status=self._function_executor_state.status.name,
                desired_status=FunctionExecutorStatusProto.Name(desired_status),
            )
            return

        if (
            desired_status
            == FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_SHUTDOWN
        ):
            await self._shutdown()
            # No code is executed after this point because reconciliation loop aio task is cancelled.
            return

        # All the rest of the allowed desired statuses ask to create the FE.
        await self._function_executor_state.set_status(
            FunctionExecutorStatus.STARTING_UP
        )

        next_status: FunctionExecutorStatus = FunctionExecutorStatus.IDLE
        next_status_message: str = ""
        function_executor: Optional[FunctionExecutor] = None
        # The FE state lock is released while the FE is being created.
        async with _UnlockedLockContextManager(self._function_executor_state.lock):
            try:
                function_executor = await _create_function_executor(
                    function_executor_description=self._function_executor_description,
                    function_executor_server_factory=self._function_executor_server_factory,
                    downloader=self._downloader,
                    executor_id=self._executor_id,
                    base_url=self._base_url,
                    config_path=self._config_path,
                    logger=self._logger,
                )
            except CustomerError as e:
                next_status = FunctionExecutorStatus.STARTUP_FAILED_CUSTOMER_ERROR
                next_status_message = str(e)
            except Exception as e:
                next_status = FunctionExecutorStatus.STARTUP_FAILED_PLATFORM_ERROR
                # Log the failure so the platform error is not silently swallowed.
                self._logger.error(
                    "failed to create function executor due to platform error",
                    exc_info=e,
                )

        # FE state lock is acquired again at this point.
        await self._function_executor_state.set_status(next_status, next_status_message)

        if next_status == FunctionExecutorStatus.IDLE:
            # Task controllers will notice that this FE is IDLE and start running on it one by one.
            self._function_executor_state.function_executor = function_executor
            # Health checker starts after FE creation and gets automatically stopped on FE destroy.
            self._function_executor_state.function_executor.health_checker().start(
                self._health_check_failed_callback
            )

    async def _destroy_function_executor(self) -> None:
        """Destroys the Function Executor if it exists.

        The FE state always goes through DESTROYING and ends up in DESTROYED status,
        even when there is no Function Executor object to destroy (e.g. after a
        failed startup where no Function Executor was stored in the FE state).
        Caller holds the FE state lock.
        """
        await self._function_executor_state.set_status(
            FunctionExecutorStatus.DESTROYING
        )
        function_executor: Optional[FunctionExecutor] = (
            self._function_executor_state.function_executor
        )
        if function_executor is not None:
            # The FE state lock is released while the FE is being destroyed.
            async with _UnlockedLockContextManager(self._function_executor_state.lock):
                await function_executor.destroy()
        await self._function_executor_state.set_status(FunctionExecutorStatus.DESTROYED)
        self._function_executor_state.function_executor = None

    async def _shutdown(self) -> None:
        """Shuts down the controller.

        Caller holds the FE state lock.
        Raises asyncio.CancelledError on return when called from reconciliation loop.
        """
        self._logger.info("shutting down function executor controller")
        await self._function_executor_state.set_status(FunctionExecutorStatus.SHUTDOWN)
        self._reconciliation_loop_task.cancel()
        await self._reconciliation_loop_task

    async def _health_check_failed_callback(self, result: HealthCheckResult):
        """Marks the FE as UNHEALTHY if it is currently IDLE or RUNNING_TASK.

        Passed to the FE health checker when it is started. Acquires the FE state lock.
        """
        async with self._function_executor_state.lock:
            if self._function_executor_state.status == FunctionExecutorStatus.UNHEALTHY:
                return

            if self._function_executor_state.status in (
                FunctionExecutorStatus.IDLE,
                FunctionExecutorStatus.RUNNING_TASK,
            ):
                # There can be false positive health check failures when we're creating
                # or destroying FEs so we're not interested in them.
                #
                # Server should react to this transition into unhealthy state and ask to
                # destroy this FE.
                await self._function_executor_state.set_status(
                    FunctionExecutorStatus.UNHEALTHY
                )
|
323
|
+
|
324
|
+
|
325
|
+
async def _create_function_executor(
    function_executor_description: FunctionExecutorDescription,
    function_executor_server_factory: FunctionExecutorServerFactory,
    downloader: Downloader,
    executor_id: str,
    base_url: str,
    config_path: str,
    logger: Any,
) -> FunctionExecutor:
    """Downloads the graph, then creates and initializes a Function Executor for it.

    The Function Executor is destroyed before the error is re-raised if its
    initialization fails.

    Raises Exception in case of failure.
    Raises CustomerError if customer code failed during FE creation.
    """
    description = function_executor_description

    graph_blob: SerializedObject = await downloader.download_graph(
        namespace=description.namespace,
        graph_name=description.graph_name,
        graph_version=description.graph_version,
        logger=logger,
    )

    server_config: FunctionExecutorServerConfiguration = (
        FunctionExecutorServerConfiguration(
            executor_id=executor_id,
            function_executor_id=description.id,
            namespace=description.namespace,
            secret_names=list(description.secret_names),
        )
    )
    # image_uri is only copied over when it is explicitly set in the description.
    if description.HasField("image_uri"):
        server_config.image_uri = description.image_uri

    request: InitializeRequest = InitializeRequest(
        namespace=description.namespace,
        graph_name=description.graph_name,
        graph_version=description.graph_version,
        function_name=description.function_name,
        graph=graph_blob,
    )

    # TODO: Add integration tests with FE customer code initialization timeout
    # when end-to-end implementation is done.
    timeout_sec: Optional[float] = (
        description.customer_code_timeout_ms / 1000.0
        if description.HasField("customer_code_timeout_ms")
        else None
    )

    executor: FunctionExecutor = FunctionExecutor(
        server_factory=function_executor_server_factory, logger=logger
    )
    try:
        # Raises CustomerError if initialization failed in customer code or customer code timed out.
        await executor.initialize(
            config=server_config,
            initialize_request=request,
            base_url=base_url,
            config_path=config_path,
            customer_code_timeout_sec=timeout_sec,
        )
    except Exception:
        # Don't leave a half-initialized Function Executor behind.
        await executor.destroy()
        raise
    return executor
|
387
|
+
|
388
|
+
|
389
|
+
def _validate_function_executor_description(
    function_executor_description: FunctionExecutorDescription,
) -> None:
    """Checks that the FE description has every field the controller requires.

    Raises ValueError if the description is not valid.
    """
    validator = MessageValidator(function_executor_description)
    # Not checked: image_uri is optional, secret_names can be empty,
    # resource_limits is optional.
    for required_field_name in (
        "id",
        "namespace",
        "graph_name",
        "graph_version",
        "function_name",
    ):
        validator.required_field(required_field_name)
|
405
|
+
|
406
|
+
|
407
|
+
class _UnlockedLockContextManager:
    """Async context manager that temporarily gives up an already held lock.

    The lock is released when the scope is entered and acquired again when the
    scope is exited. Exceptions raised inside the scope are not suppressed.
    """

    def __init__(
        self,
        lock: asyncio.Lock,
    ):
        # The caller is expected to hold this lock when entering the scope.
        self._suspended_lock: asyncio.Lock = lock

    async def __aenter__(self) -> "_UnlockedLockContextManager":
        self._suspended_lock.release()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb) -> None:
        await self._suspended_lock.acquire()
|
@@ -30,17 +30,7 @@ from ..function_executor.server.function_executor_server_factory import (
|
|
30
30
|
from ..function_executor.task_input import TaskInput
|
31
31
|
from ..function_executor.task_output import TaskOutput
|
32
32
|
from ..metrics.executor import (
|
33
|
-
METRIC_TASKS_COMPLETED_OUTCOME_ALL,
|
34
|
-
METRIC_TASKS_COMPLETED_OUTCOME_ERROR_CUSTOMER_CODE,
|
35
|
-
METRIC_TASKS_COMPLETED_OUTCOME_ERROR_PLATFORM,
|
36
|
-
METRIC_TASKS_COMPLETED_OUTCOME_SUCCESS,
|
37
|
-
metric_task_completion_latency,
|
38
|
-
metric_task_outcome_report_latency,
|
39
|
-
metric_task_outcome_report_retries,
|
40
|
-
metric_task_outcome_reports,
|
41
|
-
metric_tasks_completed,
|
42
33
|
metric_tasks_fetched,
|
43
|
-
metric_tasks_reporting_outcome,
|
44
34
|
)
|
45
35
|
from ..task_reporter import TaskReporter
|
46
36
|
from .channel_manager import ChannelManager
|
@@ -86,29 +76,25 @@ class ExecutorStateReconciler:
|
|
86
76
|
Never raises any exceptions.
|
87
77
|
"""
|
88
78
|
while not self._is_shutdown:
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
exc_info=e,
|
109
|
-
)
|
110
|
-
await asyncio.sleep(_RECONCILE_STREAM_BACKOFF_INTERVAL_SEC)
|
111
|
-
break
|
79
|
+
stub = ExecutorAPIStub(await self._channel_manager.get_channel())
|
80
|
+
while not self._is_shutdown:
|
81
|
+
try:
|
82
|
+
# Report state once before starting the stream so Server
|
83
|
+
# doesn't use old state it knew about this Executor in the past.
|
84
|
+
await self._state_reporter.report_state(stub)
|
85
|
+
desired_states_stream: AsyncGenerator[
|
86
|
+
DesiredExecutorState, None
|
87
|
+
] = stub.get_desired_executor_states(
|
88
|
+
GetDesiredExecutorStatesRequest(executor_id=self._executor_id)
|
89
|
+
)
|
90
|
+
await self._process_desired_states_stream(desired_states_stream)
|
91
|
+
except Exception as e:
|
92
|
+
self._logger.error(
|
93
|
+
f"Failed processing desired states stream, reconnecting in {_RECONCILE_STREAM_BACKOFF_INTERVAL_SEC} sec.",
|
94
|
+
exc_info=e,
|
95
|
+
)
|
96
|
+
await asyncio.sleep(_RECONCILE_STREAM_BACKOFF_INTERVAL_SEC)
|
97
|
+
break
|
112
98
|
|
113
99
|
self._logger.info("State reconciler shutdown.")
|
114
100
|
|
@@ -128,6 +114,7 @@ class ExecutorStateReconciler:
|
|
128
114
|
await self._reconcile_state(new_state)
|
129
115
|
|
130
116
|
async def _reconcile_state(self, new_state: DesiredExecutorState):
|
117
|
+
# TODO: use completed_tasks_container to ignore tasks that were already completed.
|
131
118
|
await self._reconcile_function_executors(new_state)
|
132
119
|
# TODO
|
133
120
|
# await self._reconcile_task_allocations(new_state)
|
@@ -153,6 +140,7 @@ class ExecutorStateReconciler:
|
|
153
140
|
graph_version=desired_function_executor.graph_version,
|
154
141
|
function_name=desired_function_executor.function_name,
|
155
142
|
image_uri=desired_function_executor.image_uri,
|
143
|
+
secret_names=list(desired_function_executor.secret_names),
|
156
144
|
)
|
157
145
|
)
|
158
146
|
|
@@ -296,7 +284,9 @@ class ExecutorStateReconciler:
|
|
296
284
|
while True:
|
297
285
|
logger = logger.bind(retries=reporting_retries)
|
298
286
|
try:
|
299
|
-
await self._task_reporter.report(
|
287
|
+
await self._task_reporter.report(
|
288
|
+
data_payload=task_output, logger=logger
|
289
|
+
)
|
300
290
|
break
|
301
291
|
except Exception as e:
|
302
292
|
logger.error(
|
@@ -90,24 +90,22 @@ class ExecutorStateReporter:
|
|
90
90
|
Never raises any exceptions.
|
91
91
|
"""
|
92
92
|
while not self._is_shutdown:
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
await asyncio.sleep(_REPORT_BACKOFF_ON_ERROR_SEC)
|
110
|
-
break
|
93
|
+
stub = ExecutorAPIStub(await self._channel_manager.get_channel())
|
94
|
+
while not self._is_shutdown:
|
95
|
+
try:
|
96
|
+
# The periodic state reports serve as channel health monitoring requests
|
97
|
+
# (same as TCP keep-alive). Channel Manager returns the same healthy channel
|
98
|
+
# for all RPCs that we do from Executor to Server. So all the RPCs benefit
|
99
|
+
# from this channel health monitoring.
|
100
|
+
await self.report_state(stub)
|
101
|
+
await asyncio.sleep(_REPORTING_INTERVAL_SEC)
|
102
|
+
except Exception as e:
|
103
|
+
self._logger.error(
|
104
|
+
f"Failed to report state to the server, reconnecting in {_REPORT_BACKOFF_ON_ERROR_SEC} sec.",
|
105
|
+
exc_info=e,
|
106
|
+
)
|
107
|
+
await asyncio.sleep(_REPORT_BACKOFF_ON_ERROR_SEC)
|
108
|
+
break
|
111
109
|
|
112
110
|
self._logger.info("State reporter shutdown")
|
113
111
|
|
@@ -157,20 +155,25 @@ class ExecutorStateReporter:
|
|
157
155
|
|
158
156
|
async for function_executor_state in self._function_executor_states:
|
159
157
|
function_executor_state: FunctionExecutorState
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
|
169
|
-
|
170
|
-
|
171
|
-
|
172
|
-
|
158
|
+
function_executor_state_proto = FunctionExecutorStateProto(
|
159
|
+
description=FunctionExecutorDescription(
|
160
|
+
id=function_executor_state.id,
|
161
|
+
namespace=function_executor_state.namespace,
|
162
|
+
graph_name=function_executor_state.graph_name,
|
163
|
+
graph_version=function_executor_state.graph_version,
|
164
|
+
function_name=function_executor_state.function_name,
|
165
|
+
secret_names=function_executor_state.secret_names,
|
166
|
+
),
|
167
|
+
status=_to_grpc_function_executor_status(
|
168
|
+
function_executor_state.status, self._logger
|
169
|
+
),
|
170
|
+
status_message=function_executor_state.status_message,
|
173
171
|
)
|
172
|
+
if function_executor_state.image_uri:
|
173
|
+
function_executor_state_proto.description.image_uri = (
|
174
|
+
function_executor_state.image_uri
|
175
|
+
)
|
176
|
+
states.append(function_executor_state_proto)
|
174
177
|
|
175
178
|
return states
|
176
179
|
|
@@ -210,7 +213,7 @@ _STATUS_MAPPING: Dict[FunctionExecutorStatus, Any] = {
|
|
210
213
|
FunctionExecutorStatus.UNHEALTHY: FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_UNHEALTHY,
|
211
214
|
FunctionExecutorStatus.DESTROYING: FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STOPPING,
|
212
215
|
FunctionExecutorStatus.DESTROYED: FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STOPPED,
|
213
|
-
FunctionExecutorStatus.SHUTDOWN: FunctionExecutorStatusProto.
|
216
|
+
FunctionExecutorStatus.SHUTDOWN: FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_SHUTDOWN,
|
214
217
|
}
|
215
218
|
|
216
219
|
|