indexify 0.3.19__py3-none-any.whl → 0.3.21__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33) hide show
  1. indexify/cli/cli.py +12 -0
  2. indexify/executor/api_objects.py +11 -6
  3. indexify/executor/blob_store/blob_store.py +69 -0
  4. indexify/executor/blob_store/local_fs_blob_store.py +48 -0
  5. indexify/executor/blob_store/metrics/blob_store.py +33 -0
  6. indexify/executor/blob_store/s3_blob_store.py +88 -0
  7. indexify/executor/downloader.py +192 -27
  8. indexify/executor/executor.py +29 -13
  9. indexify/executor/function_executor/function_executor.py +1 -1
  10. indexify/executor/function_executor/function_executor_states_container.py +5 -0
  11. indexify/executor/function_executor/function_executor_status.py +2 -0
  12. indexify/executor/function_executor/health_checker.py +7 -2
  13. indexify/executor/function_executor/invocation_state_client.py +4 -2
  14. indexify/executor/function_executor/single_task_runner.py +2 -0
  15. indexify/executor/function_executor/task_output.py +8 -1
  16. indexify/executor/grpc/channel_manager.py +4 -3
  17. indexify/executor/grpc/function_executor_controller.py +163 -193
  18. indexify/executor/grpc/metrics/state_reconciler.py +17 -0
  19. indexify/executor/grpc/metrics/task_controller.py +8 -0
  20. indexify/executor/grpc/state_reconciler.py +305 -188
  21. indexify/executor/grpc/state_reporter.py +18 -10
  22. indexify/executor/grpc/task_controller.py +247 -189
  23. indexify/executor/metrics/task_reporter.py +17 -0
  24. indexify/executor/task_reporter.py +217 -94
  25. indexify/executor/task_runner.py +1 -0
  26. indexify/proto/executor_api.proto +37 -11
  27. indexify/proto/executor_api_pb2.py +49 -47
  28. indexify/proto/executor_api_pb2.pyi +55 -15
  29. {indexify-0.3.19.dist-info → indexify-0.3.21.dist-info}/METADATA +2 -1
  30. {indexify-0.3.19.dist-info → indexify-0.3.21.dist-info}/RECORD +32 -27
  31. indexify/executor/grpc/completed_tasks_container.py +0 -26
  32. {indexify-0.3.19.dist-info → indexify-0.3.21.dist-info}/WHEEL +0 -0
  33. {indexify-0.3.19.dist-info → indexify-0.3.21.dist-info}/entry_points.txt +0 -0
@@ -1,42 +1,44 @@
1
1
  import asyncio
2
- from typing import Any, AsyncGenerator, List, Optional, Set
2
+ from typing import Any, AsyncGenerator, Dict, Iterable, List, Optional, Set
3
3
 
4
- import grpc
5
- from tensorlake.function_executor.proto.function_executor_pb2 import (
6
- InitializeRequest,
7
- SerializedObject,
8
- )
4
+ from tensorlake.function_executor.proto.message_validator import MessageValidator
9
5
 
10
6
  from indexify.proto.executor_api_pb2 import (
11
7
  DesiredExecutorState,
12
8
  FunctionExecutorDescription,
13
- FunctionExecutorStatus,
14
9
  GetDesiredExecutorStatesRequest,
10
+ TaskAllocation,
15
11
  )
16
12
  from indexify.proto.executor_api_pb2_grpc import (
17
13
  ExecutorAPIStub,
18
14
  )
19
15
 
20
16
  from ..downloader import Downloader
21
- from ..function_executor.function_executor import CustomerError, FunctionExecutor
22
17
  from ..function_executor.function_executor_state import FunctionExecutorState
23
18
  from ..function_executor.function_executor_states_container import (
24
19
  FunctionExecutorStatesContainer,
25
20
  )
21
+ from ..function_executor.function_executor_status import FunctionExecutorStatus
26
22
  from ..function_executor.server.function_executor_server_factory import (
27
- FunctionExecutorServerConfiguration,
28
23
  FunctionExecutorServerFactory,
29
24
  )
30
- from ..function_executor.task_input import TaskInput
31
- from ..function_executor.task_output import TaskOutput
32
- from ..metrics.executor import (
33
- metric_tasks_fetched,
34
- )
35
25
  from ..task_reporter import TaskReporter
36
26
  from .channel_manager import ChannelManager
27
+ from .function_executor_controller import (
28
+ FunctionExecutorController,
29
+ function_executor_logger,
30
+ validate_function_executor_description,
31
+ )
32
+ from .metrics.state_reconciler import (
33
+ metric_state_reconciliation_errors,
34
+ metric_state_reconciliation_latency,
35
+ metric_state_reconciliations,
36
+ )
37
37
  from .state_reporter import ExecutorStateReporter
38
+ from .task_controller import TaskController, task_logger, validate_task
38
39
 
39
40
  _RECONCILE_STREAM_BACKOFF_INTERVAL_SEC = 5
41
+ _RECONCILIATION_RETRIES = 3
40
42
 
41
43
 
42
44
  class ExecutorStateReconciler:
@@ -61,27 +63,45 @@ class ExecutorStateReconciler:
61
63
  self._config_path: Optional[str] = config_path
62
64
  self._downloader: Downloader = downloader
63
65
  self._task_reporter: TaskReporter = task_reporter
64
- self._function_executor_states: FunctionExecutorStatesContainer = (
65
- function_executor_states
66
- )
67
66
  self._channel_manager: ChannelManager = channel_manager
68
67
  self._state_reporter: ExecutorStateReporter = state_reporter
68
+ self._reconciliation_loop_task: Optional[asyncio.Task] = None
69
69
  self._logger: Any = logger.bind(module=__name__)
70
+
71
+ # Mutable state. Doesn't need lock because we access from async tasks running in the same thread.
70
72
  self._is_shutdown: bool = False
71
- self._server_last_clock: Optional[int] = None
73
+ self._function_executor_states: FunctionExecutorStatesContainer = (
74
+ function_executor_states
75
+ )
76
+ self._function_executor_controllers: Dict[str, FunctionExecutorController] = {}
77
+ self._task_controllers: Dict[str, TaskController] = {}
78
+ self._last_server_clock: Optional[int] = None
79
+
80
+ self._last_desired_state_lock = asyncio.Lock()
81
+ self._last_desired_state_change_notifier: asyncio.Condition = asyncio.Condition(
82
+ lock=self._last_desired_state_lock
83
+ )
84
+ self._last_desired_state: Optional[DesiredExecutorState] = None
72
85
 
73
86
  async def run(self):
74
87
  """Runs the state reconciler.
75
88
 
76
89
  Never raises any exceptions.
77
90
  """
91
+ self._reconciliation_loop_task = asyncio.create_task(
92
+ self._reconciliation_loop(),
93
+ name="state reconciler reconciliation loop",
94
+ )
95
+
96
+ # TODO: Move this into a new async task and cancel it in shutdown().
78
97
  while not self._is_shutdown:
79
98
  stub = ExecutorAPIStub(await self._channel_manager.get_channel())
80
99
  while not self._is_shutdown:
81
100
  try:
82
101
  # Report state once before starting the stream so Server
83
- # doesn't use old state it knew about this Executor in the past.
102
+ # doesn't use stale state it knew about this Executor in the past.
84
103
  await self._state_reporter.report_state(stub)
104
+
85
105
  desired_states_stream: AsyncGenerator[
86
106
  DesiredExecutorState, None
87
107
  ] = stub.get_desired_executor_states(
@@ -96,8 +116,6 @@ class ExecutorStateReconciler:
96
116
  await asyncio.sleep(_RECONCILE_STREAM_BACKOFF_INTERVAL_SEC)
97
117
  break
98
118
 
99
- self._logger.info("State reconciler shutdown.")
100
-
101
119
  async def _process_desired_states_stream(
102
120
  self, desired_states: AsyncGenerator[DesiredExecutorState, None]
103
121
  ):
@@ -106,18 +124,26 @@ class ExecutorStateReconciler:
106
124
  return
107
125
 
108
126
  new_state: DesiredExecutorState
109
- if self._server_last_clock is not None:
110
- if self._server_last_clock >= new_state.clock:
111
- continue # Duplicate or outdated message state sent by Server.
127
+ validator: MessageValidator = MessageValidator(new_state)
128
+ try:
129
+ validator.required_field("clock")
130
+ except ValueError as e:
131
+ self._logger.error(
132
+ "Received invalid DesiredExecutorState from Server. Ignoring.",
133
+ exc_info=e,
134
+ )
135
+ continue
112
136
 
113
- self._server_last_clock = new_state.clock
114
- await self._reconcile_state(new_state)
137
+ if self._last_server_clock is not None:
138
+ if self._last_server_clock >= new_state.clock:
139
+ continue # Duplicate or outdated message state sent by Server.
115
140
 
116
- async def _reconcile_state(self, new_state: DesiredExecutorState):
117
- # TODO: use completed_tasks_container to ignore tasks that were already completed.
118
- await self._reconcile_function_executors(new_state)
119
- # TODO
120
- # await self._reconcile_task_allocations(new_state)
141
+ self._last_server_clock = new_state.clock
142
+ # Always read the latest desired state value from the stream so
143
+ # we're never acting on stale desired states.
144
+ async with self._last_desired_state_lock:
145
+ self._last_desired_state = new_state
146
+ self._last_desired_state_change_notifier.notify_all()
121
147
 
122
148
  async def shutdown(self):
123
149
  """Shuts down the state reconciler.
@@ -125,188 +151,279 @@ class ExecutorStateReconciler:
125
151
  Never raises any exceptions.
126
152
  """
127
153
  self._is_shutdown = True
154
+ if self._reconciliation_loop_task is not None:
155
+ self._reconciliation_loop_task.cancel()
156
+ self._logger.info("Reconciliation loop shutdown.")
128
157
 
129
- async def _reconcile_function_executors(self, desired_state: DesiredExecutorState):
130
- desired_function_executor_ids: Set[str] = set()
131
- for desired_function_executor in desired_state.function_executors:
132
- desired_function_executor: FunctionExecutorDescription
133
- desired_function_executor_ids.add(desired_function_executor.id)
158
+ for controller in self._task_controllers.values():
159
+ await controller.destroy()
160
+ # FEs are destroyed in executor.py right now.
161
+ # TODO: Once HTTP loop is removed add all FE state and controllers
162
+ # shutdown logic here. This should allow us to get rid of hacky
163
+ # "cancel all tasks loop" in executor.py shutdown and make the shutdown
164
+ # much more controllable and clean. E.g. we would be able to remove logs
165
+ # suppression from shutdown logic. Also need to shutdown self._function_executor_controllers.
134
166
 
135
- function_executor_state: FunctionExecutorState = (
136
- self._function_executor_states.get_or_create_state(
137
- id=desired_function_executor.id,
138
- namespace=desired_function_executor.namespace,
139
- graph_name=desired_function_executor.graph_name,
140
- graph_version=desired_function_executor.graph_version,
141
- function_name=desired_function_executor.function_name,
142
- image_uri=desired_function_executor.image_uri,
143
- secret_names=list(desired_function_executor.secret_names),
144
- )
145
- )
167
+ async def _reconciliation_loop(self):
168
+ last_reconciled_state: Optional[DesiredExecutorState] = None
169
+ while not self._is_shutdown:
170
+ async with self._last_desired_state_lock:
171
+ # Comparing object identities (references) is enough here to not reconcile
172
+ # the same state twice.
173
+ while self._last_desired_state is last_reconciled_state:
174
+ await self._last_desired_state_change_notifier.wait()
175
+ last_reconciled_state = self._last_desired_state
146
176
 
147
- async with function_executor_state.lock:
148
- if (
149
- function_executor_state.status
150
- == FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_STOPPED
151
- ):
152
- function_executor_state.status = (
153
- FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_STARTING_UP
154
- )
155
- try:
156
- function_executor_state.function_executor = (
157
- await self._create_function_executor()
158
- )
159
- function_executor_state.status = (
160
- FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_IDLE
161
- )
162
- except CustomerError as e:
163
- function_executor_state.status = (
164
- FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_STARTUP_FAILED_CUSTOMER_ERROR
165
- )
166
- except Exception as e:
167
- function_executor_state.status = (
168
- FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_STARTUP_FAILED_PLATFORM_ERROR
169
- )
170
- self._logger.error(
171
- f"Failed to create Function Executor", exc_info=e
172
- )
173
-
174
- function_executor_state_ids_to_destroy: List[str] = []
175
- async for function_executor_state in self._function_executor_states:
176
- function_executor_state: FunctionExecutorState
177
- if function_executor_state.id not in desired_function_executor_ids:
178
- function_executor_state_ids_to_destroy.append(
179
- function_executor_state.id
177
+ with metric_state_reconciliation_latency.time():
178
+ metric_state_reconciliations.inc()
179
+ await self._reconcile_state(last_reconciled_state)
180
+ self._state_reporter.update_last_server_clock(
181
+ last_reconciled_state.clock
180
182
  )
181
183
 
182
- for function_executor_state_id in function_executor_state_ids_to_destroy:
183
- function_executor_state: FunctionExecutorState = (
184
- self._function_executor_states.pop_state(function_executor_state_id)
185
- )
186
- async with function_executor_state.lock:
187
- logger = self._function_executor_logger(
188
- id=function_executor_state.id,
189
- namespace=function_executor_state.namespace,
190
- graph_name=function_executor_state.graph_name,
191
- graph_version=function_executor_state.graph_version,
192
- function_name=function_executor_state.function_name,
193
- )
194
- if (
195
- function_executor_state.status
196
- == FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_RUNNING_TASK
197
- ):
198
- logger.warning(
199
- "Destroying Function Executor that is running a task. No task output will be reported as this is expected by the Server."
200
- )
201
- function_executor_state.status = (
202
- FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_STOPPING
184
+ async def _reconcile_state(self, desired_state: DesiredExecutorState):
185
+ """Reconciles the desired state with the current state.
186
+
187
+ Doesn't raise any exceptions. Logs all errors for future investigation because the gRPC protocol
188
+ doesn't allow us to return errors to the Server if it supplied invalid messages.
189
+ """
190
+ for attempt in range(_RECONCILIATION_RETRIES):
191
+ try:
192
+ # Reconcile FEs first because Tasks depend on them.
193
+ await self._reconcile_function_executors(
194
+ desired_state.function_executors
203
195
  )
204
- await function_executor_state.destroy_function_executor()
205
- function_executor_state.status = (
206
- FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_STOPPED
196
+ await self._reconcile_tasks(desired_state.task_allocations)
197
+ return
198
+ except Exception as e:
199
+ self._logger.error(
200
+ "Failed to reconcile desired state. Retrying in 5 secs.",
201
+ exc_info=e,
202
+ attempt=attempt,
203
+ attempts_left=_RECONCILIATION_RETRIES - attempt,
207
204
  )
205
+ await asyncio.sleep(5)
208
206
 
209
- async def _create_function_executor(
210
- self, description: FunctionExecutorDescription
211
- ) -> FunctionExecutor:
212
- logger = self._function_executor_logger(
213
- id=description.id,
214
- namespace=description.namespace,
215
- graph_name=description.graph_name,
216
- graph_version=description.graph_version,
217
- function_name=description.function_name,
218
- )
219
- graph: SerializedObject = await self._downloader.download_graph(
220
- namespace=description.namespace,
221
- graph_name=description.graph_name,
222
- graph_version=description.graph_version,
223
- logger=logger,
207
+ metric_state_reconciliation_errors.inc()
208
+ self._logger.error(
209
+ f"Failed to reconcile desired state after {_RECONCILIATION_RETRIES} attempts.",
224
210
  )
225
- function_executor: FunctionExecutor = FunctionExecutor(
226
- server_factory=self._function_executor_server_factory, logger=logger
211
+
212
+ async def _reconcile_function_executors(
213
+ self, function_executor_descriptions: Iterable[FunctionExecutorDescription]
214
+ ):
215
+ valid_fe_descriptions: List[FunctionExecutorDescription] = (
216
+ self._valid_function_executor_descriptions(function_executor_descriptions)
227
217
  )
228
- config: FunctionExecutorServerConfiguration = (
229
- FunctionExecutorServerConfiguration(
230
- executor_id=self._executor_id,
231
- function_executor_id=description.id,
232
- namespace=description.namespace,
233
- image_uri=description.image_uri,
234
- secret_names=list(description.secret_names),
218
+ for fe_description in valid_fe_descriptions:
219
+ await self._reconcile_function_executor(fe_description)
220
+
221
+ seen_fe_ids: Set[str] = set(map(lambda fe: fe.id, valid_fe_descriptions))
222
+ fe_ids_to_remove = set(self._function_executor_controllers.keys()) - seen_fe_ids
223
+ for function_executor_id in fe_ids_to_remove:
224
+ # Remove the controller before FE shutdown completes so we won't attempt to do it
225
+ # again on the next reconciliations.
226
+ await self._function_executor_controllers.pop(
227
+ function_executor_id
228
+ ).shutdown()
229
+ # Schedule removal of the FE state after shutdown. This is required for Server
230
+ # to know when exactly FE resources are freed so it can put a replacement FE if needed.
231
+ # Running in a separate asyncio task because this will block until the shutdown is complete.
232
+ asyncio.create_task(
233
+ self._remove_function_executor_after_shutdown(function_executor_id),
234
+ name="Remove Function Executor after shutdown",
235
+ )
236
+
237
+ def _valid_function_executor_descriptions(
238
+ self, function_executor_descriptions: Iterable[FunctionExecutorDescription]
239
+ ):
240
+ valid_function_executor_descriptions: List[FunctionExecutorDescription] = []
241
+ for function_executor_description in function_executor_descriptions:
242
+ function_executor_description: FunctionExecutorDescription
243
+ logger = function_executor_logger(
244
+ function_executor_description, self._logger
235
245
  )
236
- )
237
- initialize_request: InitializeRequest = InitializeRequest(
238
- namespace=description.namespace,
239
- graph_name=description.graph_name,
240
- graph_version=description.graph_version,
241
- function_name=description.function_name,
242
- graph=graph,
243
- )
244
246
 
247
+ try:
248
+ validate_function_executor_description(function_executor_description)
249
+ except ValueError as e:
250
+ logger.error(
251
+ "Received invalid FunctionExecutorDescription from Server. Dropping it from desired state.",
252
+ exc_info=e,
253
+ )
254
+ continue
255
+
256
+ valid_function_executor_descriptions.append(function_executor_description)
257
+
258
+ return valid_function_executor_descriptions
259
+
260
+ async def _reconcile_function_executor(
261
+ self, function_executor_description: FunctionExecutorDescription
262
+ ):
263
+ """Reconciles a single Function Executor with the desired state.
264
+
265
+ Doesn't block on any long running operations. Doesn't raise any exceptions.
266
+ """
267
+ if function_executor_description.id not in self._function_executor_controllers:
268
+ await self._create_function_executor(function_executor_description)
269
+
270
+ async def _create_function_executor(
271
+ self, function_executor_description: FunctionExecutorDescription
272
+ ) -> None:
273
+ """Creates Function Executor for the supplied description.
274
+
275
+ Doesn't block on any long running operations. Doesn't raise any exceptions.
276
+ """
277
+ logger = function_executor_logger(function_executor_description, self._logger)
245
278
  try:
246
- await function_executor.initialize(
247
- config=config,
248
- initialize_request=initialize_request,
279
+ # TODO: Store FE description in FE state object once we migrate to gRPC State Reconciler.
280
+ # Then most of these parameters will be removed. Also remove the container and use a simple
281
+ # Dict once FE shutdown logic is moved into reconciler.
282
+ function_executor_state: FunctionExecutorState = (
283
+ await self._function_executor_states.get_or_create_state(
284
+ id=function_executor_description.id,
285
+ namespace=function_executor_description.namespace,
286
+ graph_name=function_executor_description.graph_name,
287
+ graph_version=function_executor_description.graph_version,
288
+ function_name=function_executor_description.function_name,
289
+ image_uri=(
290
+ function_executor_description.image_uri
291
+ if function_executor_description.HasField("image_uri")
292
+ else None
293
+ ),
294
+ secret_names=list(function_executor_description.secret_names),
295
+ )
296
+ )
297
+ controller: FunctionExecutorController = FunctionExecutorController(
298
+ executor_id=self._executor_id,
299
+ function_executor_state=function_executor_state,
300
+ function_executor_description=function_executor_description,
301
+ function_executor_server_factory=self._function_executor_server_factory,
302
+ downloader=self._downloader,
249
303
  base_url=self._base_url,
250
304
  config_path=self._config_path,
305
+ logger=self._logger,
306
+ )
307
+ self._function_executor_controllers[function_executor_description.id] = (
308
+ controller
251
309
  )
252
- return function_executor
253
- except Exception:
254
- await function_executor.destroy()
255
- raise
310
+ # Ask the controller to create the new FE. Task controllers will notice that the FE is eventually
311
+ # IDLE and start running tasks on it. Server currently doesn't explicitly manage the desired FE status.
312
+ await controller.startup()
313
+ except Exception as e:
314
+ logger.error("Failed adding Function Executor", exc_info=e)
256
315
 
257
- async def _cancel_running_tasks(
258
- self, function_executor_state: FunctionExecutorState
259
- ):
260
- pass
316
+ async def _remove_function_executor_after_shutdown(
317
+ self, function_executor_id: str
318
+ ) -> None:
319
+ fe_state: FunctionExecutorState = await self._function_executor_states.get(
320
+ function_executor_id
321
+ )
322
+ async with fe_state.lock:
323
+ await fe_state.wait_status(allowlist=[FunctionExecutorStatus.SHUTDOWN])
324
+ # The whole reconciler could shutdown while we were waiting for the FE to shutdown.
325
+ if not self._is_shutdown:
326
+ await self._function_executor_states.pop(function_executor_id)
261
327
 
262
- def _function_executor_logger(
263
- self,
264
- id: str,
265
- namespace: str,
266
- graph_name: str,
267
- graph_version: str,
268
- function_name: str,
269
- ) -> Any:
270
- return self._logger.bind(
271
- id=id,
272
- namespace=namespace,
273
- graph=graph_name,
274
- graph_version=graph_version,
275
- function_name=function_name,
328
+ async def _reconcile_tasks(self, task_allocations: Iterable[TaskAllocation]):
329
+ valid_task_allocations: List[TaskAllocation] = self._valid_task_allocations(
330
+ task_allocations
331
+ )
332
+ for task_allocation in valid_task_allocations:
333
+ await self._reconcile_task(task_allocation)
334
+
335
+ seen_task_ids: Set[str] = set(
336
+ map(lambda task_allocation: task_allocation.task.id, valid_task_allocations)
276
337
  )
338
+ task_ids_to_remove = set(self._task_controllers.keys()) - seen_task_ids
339
+ for task_id in task_ids_to_remove:
340
+ await self._remove_task(task_id)
277
341
 
278
- async def _report_task_outcome(self, task_output: TaskOutput):
279
- """Reports the task with the given output to the server.
342
+ async def _reconcile_task(self, task_allocation: TaskAllocation):
343
+ """Reconciles a single TaskAllocation with the desired state.
280
344
 
281
- Doesn't raise any Exceptions. Runs till the reporting is successful."""
282
- reporting_retries: int = 0
345
+ Doesn't raise any exceptions.
346
+ """
347
+ if task_allocation.task.id in self._task_controllers:
348
+ # Nothing to do, task allocation already exists and it's immutable.
349
+ return
350
+
351
+ logger = self._task_allocation_logger(task_allocation)
352
+ try:
353
+ function_executor_state: FunctionExecutorState = (
354
+ await self._function_executor_states.get(
355
+ task_allocation.function_executor_id
356
+ )
357
+ )
358
+ self._task_controllers[task_allocation.task.id] = TaskController(
359
+ task=task_allocation.task,
360
+ downloader=self._downloader,
361
+ task_reporter=self._task_reporter,
362
+ function_executor_id=task_allocation.function_executor_id,
363
+ function_executor_state=function_executor_state,
364
+ logger=self._logger,
365
+ )
366
+ except Exception as e:
367
+ logger.error("Failed adding TaskController", exc_info=e)
368
+
369
+ async def _remove_task(self, task_id: str) -> None:
370
+ """Schedules removal of an existing task.
371
+
372
+ Doesn't block on any long running operations. Doesn't raise any exceptions.
373
+ """
374
+ await self._task_controllers.pop(task_id).destroy()
375
+
376
+ def _valid_task_allocations(self, task_allocations: Iterable[TaskAllocation]):
377
+ valid_task_allocations: List[TaskAllocation] = []
378
+ for task_allocation in task_allocations:
379
+ task_allocation: TaskAllocation
380
+ logger = self._task_allocation_logger(task_allocation)
283
381
 
284
- while True:
285
- logger = logger.bind(retries=reporting_retries)
286
382
  try:
287
- await self._task_reporter.report(
288
- data_payload=task_output, logger=logger
383
+ validate_task(task_allocation.task)
384
+ except ValueError as e:
385
+ # There's no way to report this error to Server so just log it.
386
+ logger.error(
387
+ "Received invalid TaskAllocation from Server. Dropping it from desired state.",
388
+ exc_info=e,
289
389
  )
290
- break
291
- except Exception as e:
390
+ continue
391
+
392
+ validator = MessageValidator(task_allocation)
393
+ try:
394
+ validator.required_field("function_executor_id")
395
+ except ValueError as e:
396
+ # There's no way to report this error to Server so just log it.
292
397
  logger.error(
293
- "failed to report task",
398
+ "Received invalid TaskAllocation from Server. Dropping it from desired state.",
294
399
  exc_info=e,
295
400
  )
296
- reporting_retries += 1
297
- metric_task_outcome_report_retries.inc()
298
- await asyncio.sleep(5)
401
+ continue
402
+
403
+ if (
404
+ task_allocation.function_executor_id
405
+ not in self._function_executor_controllers
406
+ ):
407
+ # Current policy: don't report task outcomes for tasks that didn't run.
408
+ # This is required to simplify the protocol so Server doesn't need to care about task states.
409
+ logger.error(
410
+ "Received TaskAllocation for a Function Executor that doesn't exist. Dropping it from desired state."
411
+ )
412
+ continue
299
413
 
300
- metric_tasks_completed.labels(outcome=METRIC_TASKS_COMPLETED_OUTCOME_ALL).inc()
301
- if task_output.is_internal_error:
302
- metric_tasks_completed.labels(
303
- outcome=METRIC_TASKS_COMPLETED_OUTCOME_ERROR_PLATFORM
304
- ).inc()
305
- elif task_output.success:
306
- metric_tasks_completed.labels(
307
- outcome=METRIC_TASKS_COMPLETED_OUTCOME_SUCCESS
308
- ).inc()
309
- else:
310
- metric_tasks_completed.labels(
311
- outcome=METRIC_TASKS_COMPLETED_OUTCOME_ERROR_CUSTOMER_CODE
312
- ).inc()
414
+ valid_task_allocations.append(task_allocation)
415
+
416
+ return valid_task_allocations
417
+
418
+ def _task_allocation_logger(self, task_allocation: TaskAllocation) -> Any:
419
+ """Returns a logger for the given TaskAllocation.
420
+
421
+ Doesn't assume that the supplied TaskAllocation is valid.
422
+ """
423
+ return task_logger(task_allocation.task, self._logger).bind(
424
+ function_executor_id=(
425
+ task_allocation.function_executor_id
426
+ if task_allocation.HasField("function_executor_id")
427
+ else None
428
+ )
429
+ )