indexify 0.3.18-py3-none-any.whl → 0.3.19-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (27)
  1. indexify/cli/cli.py +3 -17
  2. indexify/executor/api_objects.py +12 -0
  3. indexify/executor/downloader.py +4 -1
  4. indexify/executor/executor.py +51 -29
  5. indexify/executor/function_executor/function_executor.py +24 -11
  6. indexify/executor/function_executor/function_executor_state.py +9 -1
  7. indexify/executor/function_executor/function_executor_states_container.py +3 -1
  8. indexify/executor/function_executor/function_executor_status.py +2 -0
  9. indexify/executor/function_executor/server/subprocess_function_executor_server_factory.py +6 -0
  10. indexify/executor/function_executor/single_task_runner.py +15 -11
  11. indexify/executor/function_executor/task_output.py +35 -2
  12. indexify/executor/grpc/completed_tasks_container.py +26 -0
  13. indexify/executor/grpc/function_executor_controller.py +421 -0
  14. indexify/executor/grpc/state_reconciler.py +24 -34
  15. indexify/executor/grpc/state_reporter.py +35 -32
  16. indexify/executor/grpc/task_controller.py +449 -0
  17. indexify/executor/metrics/task_reporter.py +14 -0
  18. indexify/executor/task_reporter.py +95 -4
  19. indexify/executor/task_runner.py +1 -0
  20. indexify/proto/executor_api.proto +63 -5
  21. indexify/proto/executor_api_pb2.py +40 -30
  22. indexify/proto/executor_api_pb2.pyi +118 -3
  23. indexify/proto/executor_api_pb2_grpc.py +47 -0
  24. {indexify-0.3.18.dist-info → indexify-0.3.19.dist-info}/METADATA +1 -1
  25. {indexify-0.3.18.dist-info → indexify-0.3.19.dist-info}/RECORD +27 -24
  26. {indexify-0.3.18.dist-info → indexify-0.3.19.dist-info}/WHEEL +0 -0
  27. {indexify-0.3.18.dist-info → indexify-0.3.19.dist-info}/entry_points.txt +0 -0
indexify/executor/grpc/function_executor_controller.py (new file)
@@ -0,0 +1,421 @@
+import asyncio
+from typing import Any, Optional
+
+from tensorlake.function_executor.proto.function_executor_pb2 import (
+    InitializeRequest,
+    SerializedObject,
+)
+from tensorlake.function_executor.proto.message_validator import MessageValidator
+
+from indexify.proto.executor_api_pb2 import (
+    FunctionExecutorDescription,
+)
+from indexify.proto.executor_api_pb2 import (
+    FunctionExecutorStatus as FunctionExecutorStatusProto,
+)
+
+from ..downloader import Downloader
+from ..function_executor.function_executor import CustomerError, FunctionExecutor
+from ..function_executor.function_executor_state import FunctionExecutorState
+from ..function_executor.function_executor_status import FunctionExecutorStatus
+from ..function_executor.health_checker import HealthCheckResult
+from ..function_executor.server.function_executor_server_factory import (
+    FunctionExecutorServerConfiguration,
+    FunctionExecutorServerFactory,
+)
+
+
+class FunctionExecutorController:
+    def __init__(
+        self,
+        executor_id: str,
+        function_executor_state: FunctionExecutorState,
+        function_executor_description: FunctionExecutorDescription,
+        function_executor_server_factory: FunctionExecutorServerFactory,
+        downloader: Downloader,
+        base_url: str,
+        config_path: str,
+        logger: Any,
+    ):
+        """Initializes the FunctionExecutorController.
+
+        Raises ValueError if the supplied FunctionExecutorDescription is not valid.
+        """
+        _validate_function_executor_description(function_executor_description)
+        self._executor_id: str = executor_id
+        self._function_executor_state: FunctionExecutorState = function_executor_state
+        self._function_executor_description: FunctionExecutorDescription = (
+            function_executor_description
+        )
+        self._function_executor_server_factory: FunctionExecutorServerFactory = (
+            function_executor_server_factory
+        )
+        self._downloader: Downloader = downloader
+        self._base_url: str = base_url
+        self._config_path: str = config_path
+        self._logger: Any = logger.bind(
+            module=__name__,
+            function_executor_id=function_executor_description.id,
+            namespace=function_executor_description.namespace,
+            graph_name=function_executor_description.graph_name,
+            graph_version=function_executor_description.graph_version,
+            function_name=function_executor_description.function_name,
+            image_uri=function_executor_description.image_uri,
+        )
+        self._reconciliation_loop_task: asyncio.Task = asyncio.create_task(
+            self._reconciliation_loop()
+        )
+        # The lock protects the desired status.
+        self._lock: asyncio.Lock = asyncio.Lock()
+        # The same as the initial FE status.
+        self._desired_status: FunctionExecutorStatusProto = (
+            FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STOPPED
+        )
+        self._desired_status_change_notifier: asyncio.Condition = asyncio.Condition(
+            lock=self._lock
+        )
+
+    async def set_desired_status(
+        self, desired_status: FunctionExecutorStatusProto
+    ) -> None:
+        """Updates the desired Function Executor status.
+
+        Reconciliation is done asynchronously.
+        """
+        async with self._lock:
+            if self._desired_status == desired_status:
+                return
+            self._desired_status = desired_status
+            self._desired_status_change_notifier.notify_all()
+
+    async def _reconciliation_loop(self) -> None:
+        self._logger.info("function executor controller reconciliation loop started")
+        # The same as the initial FE status.
+        last_seen_desired_status: FunctionExecutorStatusProto = (
+            FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STOPPED
+        )
+        # The loop is exited via loop async task cancellation on FE shutdown.
+        while True:
+            async with self._lock:
+                while last_seen_desired_status == self._desired_status:
+                    await self._desired_status_change_notifier.wait()
+
+                last_seen_desired_status = self._desired_status
+            # It's guaranteed that we don't run _reconcile concurrently multiple times.
+            await self._reconcile(last_seen_desired_status)
+
+    async def _reconcile(self, desired_status: FunctionExecutorStatusProto) -> None:
+        async with self._function_executor_state.lock:
+            current_status: FunctionExecutorStatus = (
+                self._function_executor_state.status
+            )
+            # We have to process all possible combinations of current and desired statuses.
+            if current_status == FunctionExecutorStatus.STARTUP_FAILED_CUSTOMER_ERROR:
+                if (
+                    desired_status
+                    == FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STARTUP_FAILED_CUSTOMER_ERROR
+                ):
+                    return  # Same status, nothing to do.
+
+                # All we can do from the current status is to destroy the FE to possibly recreate it later
+                # if Server requests to do this. This is why we don't accept any other desired statuses.
+                return await self._destroy_or_shutdown_fe_if_desired(desired_status)
+
+            if current_status == FunctionExecutorStatus.STARTUP_FAILED_PLATFORM_ERROR:
+                if (
+                    desired_status
+                    == FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STARTUP_FAILED_PLATFORM_ERROR
+                ):
+                    return  # Same status, nothing to do.
+
+                # All we can do from the current status is to destroy the FE to possibly recreate it later
+                # if Server requests to do this. This is why we don't accept any other desired statuses.
+                return await self._destroy_or_shutdown_fe_if_desired(desired_status)
+
+            if current_status == FunctionExecutorStatus.IDLE:
+                if (
+                    desired_status
+                    == FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_IDLE
+                ):
+                    return  # Same status, nothing to do.
+
+                # Server can only request FE destroy or shutdown when FE has IDLE status.
+                # Transition from IDLE to RUNNING_TASK can only be done by Task controller.
+                # Transition from IDLE to UNHEALTHY can only be done by FE controller.
+                return await self._destroy_or_shutdown_fe_if_desired(desired_status)
+
+            if current_status == FunctionExecutorStatus.RUNNING_TASK:
+                if (
+                    desired_status
+                    == FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_RUNNING_TASK
+                ):
+                    return  # Same status, nothing to do.
+
+                # Server can only request FE destroy or shutdown when FE has RUNNING_TASK status.
+                # Transition from RUNNING_TASK to UNHEALTHY can only be done by Task controller.
+                return await self._destroy_or_shutdown_fe_if_desired(desired_status)
+
+            if current_status == FunctionExecutorStatus.UNHEALTHY:
+                if (
+                    desired_status
+                    == FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_UNHEALTHY
+                ):
+                    return  # Same status, nothing to do.
+
+                # Server can only request FE destroy or shutdown when FE has UNHEALTHY status.
+                return await self._destroy_or_shutdown_fe_if_desired(desired_status)
+
+            if current_status == FunctionExecutorStatus.DESTROYED:
+                if (
+                    desired_status
+                    == FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STOPPED
+                ):
+                    return  # Same status, nothing to do.
+
+                return await self._reconcile_from_destroyed(desired_status)
+
+            # _reconcile() can't be called when current FE status is one of "long running" states
+            # handled by FE controller like STARTING_UP and DESTROYING. This is because _reconcile()
+            # is called with concurrency of 1 and _reconcile() waits until these long running states
+            # (operations) are finished before returning.
+            #
+            # It's not possible to have SHUTDOWN current status because when FE controller transitions to SHUTDOWN
+            # status, it cancels the reconciliation loop task.
+            self._logger.error(
+                "unexpected current function executor status, skipping state reconciliation",
+                current_status=current_status.name,
+                desired_status=FunctionExecutorStatusProto.Name(desired_status),
+            )
+
+    async def _destroy_or_shutdown_fe_if_desired(
+        self, desired_status: FunctionExecutorStatusProto
+    ) -> None:
+        """Destroys the Function Executor if desired status asks for it.
+
+        Otherwise logs an error because other actions are not allowed by the current status.
+        Caller holds the FE state lock.
+        """
+        if desired_status not in [
+            FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STOPPING,
+            FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STOPPED,
+            FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_SHUTDOWN,
+        ]:
+            self._logger.error(
+                "unexpected desired function executor status received from server, skipping state reconciliation",
+                current_status=self._function_executor_state.status.name,
+                desired_status=FunctionExecutorStatusProto.Name(desired_status),
+            )
+            return
+
+        await self._destroy_function_executor()
+        # FE state status is now DESTROYED.
+        if (
+            desired_status
+            == FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_SHUTDOWN
+        ):
+            await self._shutdown()
+            # No code is executed after this point because reconciliation loop aio task is cancelled.
+
+    async def _reconcile_from_destroyed(
+        self, desired_status: FunctionExecutorStatusProto
+    ) -> None:
+        """Reconciles the FE state when it has DESTROYED status.
+
+        Caller holds the FE state lock.
+        """
+        if desired_status not in [
+            FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STARTING_UP,
+            FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_IDLE,
+            FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_RUNNING_TASK,
+            FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_SHUTDOWN,
+        ]:
+            self._logger.error(
+                "unexpected desired function executor status received from server, skipping state reconciliation",
+                current_status=self._function_executor_state.status.name,
+                desired_status=FunctionExecutorStatusProto.Name(desired_status),
+            )
+            return
+
+        if (
+            desired_status
+            == FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_SHUTDOWN
+        ):
+            await self._shutdown()
+            # No code is executed after this point because reconciliation loop aio task is cancelled.
+            return
+
+        # All the rest of the allowed desired statuses ask to create the FE.
+        await self._function_executor_state.set_status(
+            FunctionExecutorStatus.STARTING_UP
+        )
+
+        next_status: FunctionExecutorStatus = FunctionExecutorStatus.IDLE
+        next_status_message: str = ""
+        async with _UnlockedLockContextManager(self._function_executor_state.lock):
+            try:
+                function_executor: FunctionExecutor = await _create_function_executor(
+                    function_executor_description=self._function_executor_description,
+                    function_executor_server_factory=self._function_executor_server_factory,
+                    downloader=self._downloader,
+                    executor_id=self._executor_id,
+                    base_url=self._base_url,
+                    config_path=self._config_path,
+                    logger=self._logger,
+                )
+            except CustomerError as e:
+                next_status = FunctionExecutorStatus.STARTUP_FAILED_CUSTOMER_ERROR
+                next_status_message = str(e)
+            except Exception as e:
+                next_status = FunctionExecutorStatus.STARTUP_FAILED_PLATFORM_ERROR
+
+        # FE state lock is acquired again at this point.
+        await self._function_executor_state.set_status(next_status, next_status_message)
+
+        if next_status == FunctionExecutorStatus.IDLE:
+            # Task controllers will notice that this FE is IDLE and start running on it one by one.
+            self._function_executor_state.function_executor = function_executor
+            # Health checker starts after FE creation and gets automatically stopped on FE destroy.
+            self._function_executor_state.function_executor.health_checker().start(
+                self._health_check_failed_callback
+            )
+
+    async def _destroy_function_executor(self) -> None:
+        """Destroys the Function Executor if it exists.
+
+        Caller holds the FE state lock.
+        """
+        await self._function_executor_state.set_status(
+            FunctionExecutorStatus.DESTROYING
+        )
+        async with _UnlockedLockContextManager(self._function_executor_state.lock):
+            await self._function_executor_state.function_executor.destroy()
+        await self._function_executor_state.set_status(FunctionExecutorStatus.DESTROYED)
+        self._function_executor_state.function_executor = None
+
+    async def _shutdown(self) -> None:
+        """Shuts down the controller.
+
+        Caller holds the FE state lock.
+        Raises asyncio.CancelledError on return when called from reconciliation loop.
+        """
+        self._logger.info("shutting down function executor controller")
+        await self._function_executor_state.set_status(FunctionExecutorStatus.SHUTDOWN)
+        self._reconciliation_loop_task.cancel()
+        await self._reconciliation_loop_task
+
+    async def _health_check_failed_callback(self, result: HealthCheckResult):
+        async with self._function_executor_state.lock:
+            if self._function_executor_state.status == FunctionExecutorStatus.UNHEALTHY:
+                return
+
+            if self._function_executor_state.status in (
+                FunctionExecutorStatus.IDLE,
+                FunctionExecutorStatus.RUNNING_TASK,
+            ):
+                # There can be false positive health check failures when we're creating
+                # or destroying FEs so we're not interested in them.
+                #
+                # Server should react to this transition into unhealthy state and ask to
+                # destroy this FE.
+                await self._function_executor_state.set_status(
+                    FunctionExecutorStatus.UNHEALTHY
+                )
+
+
+async def _create_function_executor(
+    function_executor_description: FunctionExecutorDescription,
+    function_executor_server_factory: FunctionExecutorServerFactory,
+    downloader: Downloader,
+    executor_id: str,
+    base_url: str,
+    config_path: str,
+    logger: Any,
+) -> FunctionExecutor:
+    """Creates a function executor.
+
+    Raises Exception in case of failure.
+    Raises CustomerError if customer code failed during FE creation.
+    """
+    graph: SerializedObject = await downloader.download_graph(
+        namespace=function_executor_description.namespace,
+        graph_name=function_executor_description.graph_name,
+        graph_version=function_executor_description.graph_version,
+        logger=logger,
+    )
+
+    config: FunctionExecutorServerConfiguration = FunctionExecutorServerConfiguration(
+        executor_id=executor_id,
+        function_executor_id=function_executor_description.id,
+        namespace=function_executor_description.namespace,
+        secret_names=list(function_executor_description.secret_names),
+    )
+    if function_executor_description.HasField("image_uri"):
+        config.image_uri = function_executor_description.image_uri
+
+    initialize_request: InitializeRequest = InitializeRequest(
+        namespace=function_executor_description.namespace,
+        graph_name=function_executor_description.graph_name,
+        graph_version=function_executor_description.graph_version,
+        function_name=function_executor_description.function_name,
+        graph=graph,
+    )
+    customer_code_timeout_sec: Optional[float] = None
+    if function_executor_description.HasField("customer_code_timeout_ms"):
+        # TODO: Add integration tests with FE customer code initialization timeout
+        # when end-to-end implementation is done.
+        customer_code_timeout_sec = (
+            function_executor_description.customer_code_timeout_ms / 1000.0
+        )
+
+    function_executor: FunctionExecutor = FunctionExecutor(
+        server_factory=function_executor_server_factory, logger=logger
+    )
+
+    try:
+        # Raises CustomerError if initialization failed in customer code or customer code timed out.
+        await function_executor.initialize(
+            config=config,
+            initialize_request=initialize_request,
+            base_url=base_url,
+            config_path=config_path,
+            customer_code_timeout_sec=customer_code_timeout_sec,
+        )
+        return function_executor
+    except Exception:
+        await function_executor.destroy()
+        raise
+
+
+def _validate_function_executor_description(
+    function_executor_description: FunctionExecutorDescription,
+) -> None:
+    """Validates the supplied FE description.
+
+    Raises ValueError if the description is not valid.
+    """
+    validator = MessageValidator(function_executor_description)
+    validator.required_field("id")
+    validator.required_field("namespace")
+    validator.required_field("graph_name")
+    validator.required_field("graph_version")
+    validator.required_field("function_name")
+    # image_uri is optional.
+    # secret_names can be empty.
+    # resource_limits is optional.
+
+
+class _UnlockedLockContextManager:
+    """Unlocks its lock on enter to the scope and locks it back on exit."""
+
+    def __init__(
+        self,
+        lock: asyncio.Lock,
+    ):
+        self._lock: asyncio.Lock = lock
+
+    async def __aenter__(self):
+        self._lock.release()
+        return self
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        await self._lock.acquire()
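The new controller hands the server's desired status to a single reconciliation task through an asyncio.Condition, and only the most recently observed value is reconciled. The standalone sketch below is not part of the diff; all names in it are illustrative and it only reproduces that handoff in isolation.

import asyncio

class DesiredStatusCell:
    # Illustrative stand-in for the desired-status handling in
    # FunctionExecutorController: one setter, one reconciliation loop.
    def __init__(self, initial: str):
        self._lock = asyncio.Lock()
        self._changed = asyncio.Condition(lock=self._lock)
        self._desired = initial

    async def set(self, value: str) -> None:
        async with self._lock:
            if self._desired == value:
                return  # No change, nothing to notify.
            self._desired = value
            self._changed.notify_all()

    async def wait_for_change(self, last_seen: str) -> str:
        async with self._lock:
            while self._desired == last_seen:
                await self._changed.wait()
            return self._desired  # Only the latest value matters.

async def main():
    cell = DesiredStatusCell("STOPPED")

    async def reconciliation_loop():
        last_seen = "STOPPED"
        while last_seen != "SHUTDOWN":
            last_seen = await cell.wait_for_change(last_seen)
            print("reconciling to", last_seen)

    loop_task = asyncio.create_task(reconciliation_loop())
    await cell.set("IDLE")
    await asyncio.sleep(0.01)  # Let the loop observe IDLE before the next change.
    await cell.set("SHUTDOWN")
    await loop_task

asyncio.run(main())

Because the loop re-reads the shared value after every wakeup, statuses that were overwritten before the loop woke up are simply skipped, which matches the controller's "reconcile the latest desired status" behavior.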
indexify/executor/grpc/state_reconciler.py
@@ -30,17 +30,7 @@ from ..function_executor.server.function_executor_server_factory import (
 from ..function_executor.task_input import TaskInput
 from ..function_executor.task_output import TaskOutput
 from ..metrics.executor import (
-    METRIC_TASKS_COMPLETED_OUTCOME_ALL,
-    METRIC_TASKS_COMPLETED_OUTCOME_ERROR_CUSTOMER_CODE,
-    METRIC_TASKS_COMPLETED_OUTCOME_ERROR_PLATFORM,
-    METRIC_TASKS_COMPLETED_OUTCOME_SUCCESS,
-    metric_task_completion_latency,
-    metric_task_outcome_report_latency,
-    metric_task_outcome_report_retries,
-    metric_task_outcome_reports,
-    metric_tasks_completed,
     metric_tasks_fetched,
-    metric_tasks_reporting_outcome,
 )
 from ..task_reporter import TaskReporter
 from .channel_manager import ChannelManager
@@ -86,29 +76,25 @@ class ExecutorStateReconciler:
         Never raises any exceptions.
         """
         while not self._is_shutdown:
-            async with await self._channel_manager.get_channel() as server_channel:
-                server_channel: grpc.aio.Channel
-                stub = ExecutorAPIStub(server_channel)
-                while not self._is_shutdown:
-                    try:
-                        # Report state once before starting the stream so Server
-                        # doesn't use old state it knew about this Executor in the past.
-                        await self._state_reporter.report_state(stub)
-                        desired_states_stream: AsyncGenerator[
-                            DesiredExecutorState, None
-                        ] = stub.get_desired_executor_states(
-                            GetDesiredExecutorStatesRequest(
-                                executor_id=self._executor_id
-                            )
-                        )
-                        await self._process_desired_states_stream(desired_states_stream)
-                    except Exception as e:
-                        self._logger.error(
-                            f"Failed processing desired states stream, reconnecting in {_RECONCILE_STREAM_BACKOFF_INTERVAL_SEC} sec.",
-                            exc_info=e,
-                        )
-                        await asyncio.sleep(_RECONCILE_STREAM_BACKOFF_INTERVAL_SEC)
-                        break
+            stub = ExecutorAPIStub(await self._channel_manager.get_channel())
+            while not self._is_shutdown:
+                try:
+                    # Report state once before starting the stream so Server
+                    # doesn't use old state it knew about this Executor in the past.
+                    await self._state_reporter.report_state(stub)
+                    desired_states_stream: AsyncGenerator[
+                        DesiredExecutorState, None
+                    ] = stub.get_desired_executor_states(
+                        GetDesiredExecutorStatesRequest(executor_id=self._executor_id)
+                    )
+                    await self._process_desired_states_stream(desired_states_stream)
+                except Exception as e:
+                    self._logger.error(
+                        f"Failed processing desired states stream, reconnecting in {_RECONCILE_STREAM_BACKOFF_INTERVAL_SEC} sec.",
+                        exc_info=e,
+                    )
+                    await asyncio.sleep(_RECONCILE_STREAM_BACKOFF_INTERVAL_SEC)
+                    break
 
         self._logger.info("State reconciler shutdown.")
 
@@ -128,6 +114,7 @@ class ExecutorStateReconciler:
             await self._reconcile_state(new_state)
 
     async def _reconcile_state(self, new_state: DesiredExecutorState):
+        # TODO: use completed_tasks_container to ignore tasks that were already completed.
         await self._reconcile_function_executors(new_state)
         # TODO
         # await self._reconcile_task_allocations(new_state)
@@ -153,6 +140,7 @@ class ExecutorStateReconciler:
                 graph_version=desired_function_executor.graph_version,
                 function_name=desired_function_executor.function_name,
                 image_uri=desired_function_executor.image_uri,
+                secret_names=list(desired_function_executor.secret_names),
             )
         )
 
@@ -296,7 +284,9 @@ class ExecutorStateReconciler:
         while True:
             logger = logger.bind(retries=reporting_retries)
             try:
-                await self._task_reporter.report(output=task_output, logger=logger)
+                await self._task_reporter.report(
+                    data_payload=task_output, logger=logger
+                )
                 break
             except Exception as e:
                 logger.error(
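In both ExecutorStateReconciler.run() above and ExecutorStateReporter.run() below, the gRPC channel is no longer wrapped in `async with`, so the long-lived channel owned by ChannelManager stays open when the inner loop breaks out to retry; only the stream or RPC attempt is restarted after a backoff. A rough sketch of that loop shape follows; the callables get_channel, open_stream, handle_item, and is_shutdown are placeholders, not the package's channel-manager or stub APIs.

import asyncio

_BACKOFF_SEC = 5.0

async def consume_stream_with_backoff(
    get_channel, open_stream, handle_item, is_shutdown, logger
):
    # Outer loop: fetch the shared channel from its manager; never close it here.
    while not is_shutdown():
        channel = await get_channel()
        # Inner loop: consume the stream, back off on any failure, then re-enter
        # the outer loop to pick up a possibly recreated channel.
        while not is_shutdown():
            try:
                async for item in open_stream(channel):
                    await handle_item(item)
            except Exception as e:
                logger.error("stream failed, retrying after backoff", exc_info=e)
                await asyncio.sleep(_BACKOFF_SEC)
                break

Since ChannelManager hands out the same channel for all executor-to-server RPCs (per the comment kept in the reporter hunk below), closing it on every retry, as the old `async with` did on exit, would also have torn down the connection used by those other RPCs; the new code leaves channel lifecycle entirely to the manager.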
indexify/executor/grpc/state_reporter.py
@@ -90,24 +90,22 @@ class ExecutorStateReporter:
         Never raises any exceptions.
         """
         while not self._is_shutdown:
-            async with await self._channel_manager.get_channel() as server_channel:
-                server_channel: grpc.aio.Channel
-                stub = ExecutorAPIStub(server_channel)
-                while not self._is_shutdown:
-                    try:
-                        # The periodic state reports serve as channel health monitoring requests
-                        # (same as TCP keep-alive). Channel Manager returns the same healthy channel
-                        # for all RPCs that we do from Executor to Server. So all the RPCs benefit
-                        # from this channel health monitoring.
-                        await self.report_state(stub)
-                        await asyncio.sleep(_REPORTING_INTERVAL_SEC)
-                    except Exception as e:
-                        self._logger.error(
-                            f"Failed to report state to the server, reconnecting in {_REPORT_BACKOFF_ON_ERROR_SEC} sec.",
-                            exc_info=e,
-                        )
-                        await asyncio.sleep(_REPORT_BACKOFF_ON_ERROR_SEC)
-                        break
+            stub = ExecutorAPIStub(await self._channel_manager.get_channel())
+            while not self._is_shutdown:
+                try:
+                    # The periodic state reports serve as channel health monitoring requests
+                    # (same as TCP keep-alive). Channel Manager returns the same healthy channel
+                    # for all RPCs that we do from Executor to Server. So all the RPCs benefit
+                    # from this channel health monitoring.
+                    await self.report_state(stub)
+                    await asyncio.sleep(_REPORTING_INTERVAL_SEC)
+                except Exception as e:
+                    self._logger.error(
+                        f"Failed to report state to the server, reconnecting in {_REPORT_BACKOFF_ON_ERROR_SEC} sec.",
+                        exc_info=e,
+                    )
+                    await asyncio.sleep(_REPORT_BACKOFF_ON_ERROR_SEC)
+                    break
 
         self._logger.info("State reporter shutdown")
 
@@ -157,20 +155,25 @@ class ExecutorStateReporter:
 
         async for function_executor_state in self._function_executor_states:
             function_executor_state: FunctionExecutorState
-            states.append(
-                FunctionExecutorStateProto(
-                    description=FunctionExecutorDescription(
-                        id=function_executor_state.id,
-                        namespace=function_executor_state.namespace,
-                        graph_name=function_executor_state.graph_name,
-                        graph_version=function_executor_state.graph_version,
-                        function_name=function_executor_state.function_name,
-                    ),
-                    status=_to_grpc_function_executor_status(
-                        function_executor_state.status, self._logger
-                    ),
-                )
+            function_executor_state_proto = FunctionExecutorStateProto(
+                description=FunctionExecutorDescription(
+                    id=function_executor_state.id,
+                    namespace=function_executor_state.namespace,
+                    graph_name=function_executor_state.graph_name,
+                    graph_version=function_executor_state.graph_version,
+                    function_name=function_executor_state.function_name,
+                    secret_names=function_executor_state.secret_names,
+                ),
+                status=_to_grpc_function_executor_status(
+                    function_executor_state.status, self._logger
+                ),
+                status_message=function_executor_state.status_message,
             )
+            if function_executor_state.image_uri:
+                function_executor_state_proto.description.image_uri = (
+                    function_executor_state.image_uri
+                )
+            states.append(function_executor_state_proto)
 
         return states
 
@@ -210,7 +213,7 @@ _STATUS_MAPPING: Dict[FunctionExecutorStatus, Any] = {
     FunctionExecutorStatus.UNHEALTHY: FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_UNHEALTHY,
     FunctionExecutorStatus.DESTROYING: FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STOPPING,
     FunctionExecutorStatus.DESTROYED: FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STOPPED,
-    FunctionExecutorStatus.SHUTDOWN: FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STOPPED,
+    FunctionExecutorStatus.SHUTDOWN: FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_SHUTDOWN,
 }
 
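One detail behind the reporter change above: image_uri is an explicit-presence optional field on FunctionExecutorDescription (the controller checks it with HasField elsewhere in this release), so the reporter assigns description.image_uri only when a value exists rather than passing None. A minimal sketch of that presence behavior, assuming the package's generated indexify.proto.executor_api_pb2 module is importable; the field values are made up.

from indexify.proto.executor_api_pb2 import FunctionExecutorDescription

# Optional scalar fields report "not set" until they are explicitly assigned.
description = FunctionExecutorDescription(id="fe-1", namespace="default")
print(description.HasField("image_uri"))   # False: field was never assigned

description.image_uri = "registry.example.com/app-image:latest"
print(description.HasField("image_uri"))   # True: presence is tracked once set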