indexify 0.3.18__py3-none-any.whl → 0.3.20__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35) hide show
  1. indexify/cli/cli.py +15 -17
  2. indexify/executor/api_objects.py +12 -0
  3. indexify/executor/blob_store/blob_store.py +69 -0
  4. indexify/executor/blob_store/local_fs_blob_store.py +48 -0
  5. indexify/executor/blob_store/metrics/blob_store.py +33 -0
  6. indexify/executor/blob_store/s3_blob_store.py +85 -0
  7. indexify/executor/downloader.py +149 -25
  8. indexify/executor/executor.py +77 -41
  9. indexify/executor/function_executor/function_executor.py +24 -11
  10. indexify/executor/function_executor/function_executor_state.py +9 -1
  11. indexify/executor/function_executor/function_executor_states_container.py +8 -1
  12. indexify/executor/function_executor/function_executor_status.py +4 -0
  13. indexify/executor/function_executor/health_checker.py +7 -2
  14. indexify/executor/function_executor/invocation_state_client.py +4 -2
  15. indexify/executor/function_executor/server/subprocess_function_executor_server_factory.py +6 -0
  16. indexify/executor/function_executor/single_task_runner.py +15 -11
  17. indexify/executor/function_executor/task_output.py +36 -2
  18. indexify/executor/grpc/channel_manager.py +4 -3
  19. indexify/executor/grpc/function_executor_controller.py +391 -0
  20. indexify/executor/grpc/metrics/state_reconciler.py +17 -0
  21. indexify/executor/grpc/metrics/task_controller.py +8 -0
  22. indexify/executor/grpc/state_reconciler.py +324 -217
  23. indexify/executor/grpc/state_reporter.py +52 -41
  24. indexify/executor/grpc/task_controller.py +492 -0
  25. indexify/executor/metrics/task_reporter.py +14 -0
  26. indexify/executor/task_reporter.py +115 -6
  27. indexify/executor/task_runner.py +1 -0
  28. indexify/proto/executor_api.proto +91 -7
  29. indexify/proto/executor_api_pb2.py +49 -37
  30. indexify/proto/executor_api_pb2.pyi +158 -3
  31. indexify/proto/executor_api_pb2_grpc.py +47 -0
  32. {indexify-0.3.18.dist-info → indexify-0.3.20.dist-info}/METADATA +2 -1
  33. {indexify-0.3.18.dist-info → indexify-0.3.20.dist-info}/RECORD +35 -27
  34. {indexify-0.3.18.dist-info → indexify-0.3.20.dist-info}/WHEEL +0 -0
  35. {indexify-0.3.18.dist-info → indexify-0.3.20.dist-info}/entry_points.txt +0 -0
@@ -1,52 +1,44 @@
1
1
  import asyncio
2
- from typing import Any, AsyncGenerator, List, Optional, Set
2
+ from typing import Any, AsyncGenerator, Dict, Iterable, List, Optional, Set
3
3
 
4
- import grpc
5
- from tensorlake.function_executor.proto.function_executor_pb2 import (
6
- InitializeRequest,
7
- SerializedObject,
8
- )
4
+ from tensorlake.function_executor.proto.message_validator import MessageValidator
9
5
 
10
6
  from indexify.proto.executor_api_pb2 import (
11
7
  DesiredExecutorState,
12
8
  FunctionExecutorDescription,
13
- FunctionExecutorStatus,
14
9
  GetDesiredExecutorStatesRequest,
10
+ TaskAllocation,
15
11
  )
16
12
  from indexify.proto.executor_api_pb2_grpc import (
17
13
  ExecutorAPIStub,
18
14
  )
19
15
 
20
16
  from ..downloader import Downloader
21
- from ..function_executor.function_executor import CustomerError, FunctionExecutor
22
17
  from ..function_executor.function_executor_state import FunctionExecutorState
23
18
  from ..function_executor.function_executor_states_container import (
24
19
  FunctionExecutorStatesContainer,
25
20
  )
21
+ from ..function_executor.function_executor_status import FunctionExecutorStatus
26
22
  from ..function_executor.server.function_executor_server_factory import (
27
- FunctionExecutorServerConfiguration,
28
23
  FunctionExecutorServerFactory,
29
24
  )
30
- from ..function_executor.task_input import TaskInput
31
- from ..function_executor.task_output import TaskOutput
32
- from ..metrics.executor import (
33
- METRIC_TASKS_COMPLETED_OUTCOME_ALL,
34
- METRIC_TASKS_COMPLETED_OUTCOME_ERROR_CUSTOMER_CODE,
35
- METRIC_TASKS_COMPLETED_OUTCOME_ERROR_PLATFORM,
36
- METRIC_TASKS_COMPLETED_OUTCOME_SUCCESS,
37
- metric_task_completion_latency,
38
- metric_task_outcome_report_latency,
39
- metric_task_outcome_report_retries,
40
- metric_task_outcome_reports,
41
- metric_tasks_completed,
42
- metric_tasks_fetched,
43
- metric_tasks_reporting_outcome,
44
- )
45
25
  from ..task_reporter import TaskReporter
46
26
  from .channel_manager import ChannelManager
27
+ from .function_executor_controller import (
28
+ FunctionExecutorController,
29
+ function_executor_logger,
30
+ validate_function_executor_description,
31
+ )
32
+ from .metrics.state_reconciler import (
33
+ metric_state_reconciliation_errors,
34
+ metric_state_reconciliation_latency,
35
+ metric_state_reconciliations,
36
+ )
47
37
  from .state_reporter import ExecutorStateReporter
38
+ from .task_controller import TaskController, task_logger, validate_task
48
39
 
49
40
  _RECONCILE_STREAM_BACKOFF_INTERVAL_SEC = 5
41
+ _RECONCILIATION_RETRIES = 3
50
42
 
51
43
 
52
44
  class ExecutorStateReconciler:
@@ -71,46 +63,58 @@ class ExecutorStateReconciler:
71
63
  self._config_path: Optional[str] = config_path
72
64
  self._downloader: Downloader = downloader
73
65
  self._task_reporter: TaskReporter = task_reporter
74
- self._function_executor_states: FunctionExecutorStatesContainer = (
75
- function_executor_states
76
- )
77
66
  self._channel_manager: ChannelManager = channel_manager
78
67
  self._state_reporter: ExecutorStateReporter = state_reporter
68
+ self._reconciliation_loop_task: Optional[asyncio.Task] = None
79
69
  self._logger: Any = logger.bind(module=__name__)
70
+
71
+ # Mutable state. Doesn't need lock because we access from async tasks running in the same thread.
80
72
  self._is_shutdown: bool = False
81
- self._server_last_clock: Optional[int] = None
73
+ self._function_executor_states: FunctionExecutorStatesContainer = (
74
+ function_executor_states
75
+ )
76
+ self._function_executor_controllers: Dict[str, FunctionExecutorController] = {}
77
+ self._task_controllers: Dict[str, TaskController] = {}
78
+ self._last_server_clock: Optional[int] = None
79
+
80
+ self._last_desired_state_lock = asyncio.Lock()
81
+ self._last_desired_state_change_notifier: asyncio.Condition = asyncio.Condition(
82
+ lock=self._last_desired_state_lock
83
+ )
84
+ self._last_desired_state: Optional[DesiredExecutorState] = None
82
85
 
83
86
  async def run(self):
84
87
  """Runs the state reconciler.
85
88
 
86
89
  Never raises any exceptions.
87
90
  """
91
+ self._reconciliation_loop_task = asyncio.create_task(
92
+ self._reconciliation_loop(),
93
+ name="state reconciler reconciliation loop",
94
+ )
95
+
96
+ # TODO: Move this into a new async task and cancel it in shutdown().
88
97
  while not self._is_shutdown:
89
- async with await self._channel_manager.get_channel() as server_channel:
90
- server_channel: grpc.aio.Channel
91
- stub = ExecutorAPIStub(server_channel)
92
- while not self._is_shutdown:
93
- try:
94
- # Report state once before starting the stream so Server
95
- # doesn't use old state it knew about this Executor in the past.
96
- await self._state_reporter.report_state(stub)
97
- desired_states_stream: AsyncGenerator[
98
- DesiredExecutorState, None
99
- ] = stub.get_desired_executor_states(
100
- GetDesiredExecutorStatesRequest(
101
- executor_id=self._executor_id
102
- )
103
- )
104
- await self._process_desired_states_stream(desired_states_stream)
105
- except Exception as e:
106
- self._logger.error(
107
- f"Failed processing desired states stream, reconnecting in {_RECONCILE_STREAM_BACKOFF_INTERVAL_SEC} sec.",
108
- exc_info=e,
109
- )
110
- await asyncio.sleep(_RECONCILE_STREAM_BACKOFF_INTERVAL_SEC)
111
- break
112
-
113
- self._logger.info("State reconciler shutdown.")
98
+ stub = ExecutorAPIStub(await self._channel_manager.get_channel())
99
+ while not self._is_shutdown:
100
+ try:
101
+ # Report state once before starting the stream so Server
102
+ # doesn't use stale state it knew about this Executor in the past.
103
+ await self._state_reporter.report_state(stub)
104
+
105
+ desired_states_stream: AsyncGenerator[
106
+ DesiredExecutorState, None
107
+ ] = stub.get_desired_executor_states(
108
+ GetDesiredExecutorStatesRequest(executor_id=self._executor_id)
109
+ )
110
+ await self._process_desired_states_stream(desired_states_stream)
111
+ except Exception as e:
112
+ self._logger.error(
113
+ f"Failed processing desired states stream, reconnecting in {_RECONCILE_STREAM_BACKOFF_INTERVAL_SEC} sec.",
114
+ exc_info=e,
115
+ )
116
+ await asyncio.sleep(_RECONCILE_STREAM_BACKOFF_INTERVAL_SEC)
117
+ break
114
118
 
115
119
  async def _process_desired_states_stream(
116
120
  self, desired_states: AsyncGenerator[DesiredExecutorState, None]
@@ -120,17 +124,26 @@ class ExecutorStateReconciler:
120
124
  return
121
125
 
122
126
  new_state: DesiredExecutorState
123
- if self._server_last_clock is not None:
124
- if self._server_last_clock >= new_state.clock:
125
- continue # Duplicate or outdated message state sent by Server.
127
+ validator: MessageValidator = MessageValidator(new_state)
128
+ try:
129
+ validator.required_field("clock")
130
+ except ValueError as e:
131
+ self._logger.error(
132
+ "Received invalid DesiredExecutorState from Server. Ignoring.",
133
+ exc_info=e,
134
+ )
135
+ continue
126
136
 
127
- self._server_last_clock = new_state.clock
128
- await self._reconcile_state(new_state)
137
+ if self._last_server_clock is not None:
138
+ if self._last_server_clock >= new_state.clock:
139
+ continue # Duplicate or outdated message state sent by Server.
129
140
 
130
- async def _reconcile_state(self, new_state: DesiredExecutorState):
131
- await self._reconcile_function_executors(new_state)
132
- # TODO
133
- # await self._reconcile_task_allocations(new_state)
141
+ self._last_server_clock = new_state.clock
142
+ # Always read the latest desired state value from the stream so
143
+ # we're never acting on stale desired states.
144
+ async with self._last_desired_state_lock:
145
+ self._last_desired_state = new_state
146
+ self._last_desired_state_change_notifier.notify_all()
134
147
 
135
148
  async def shutdown(self):
136
149
  """Shuts down the state reconciler.
@@ -138,185 +151,279 @@ class ExecutorStateReconciler:
138
151
  Never raises any exceptions.
139
152
  """
140
153
  self._is_shutdown = True
154
+ if self._reconciliation_loop_task is not None:
155
+ self._reconciliation_loop_task.cancel()
156
+ self._logger.info("Reconciliation loop shutdown.")
141
157
 
142
- async def _reconcile_function_executors(self, desired_state: DesiredExecutorState):
143
- desired_function_executor_ids: Set[str] = set()
144
- for desired_function_executor in desired_state.function_executors:
145
- desired_function_executor: FunctionExecutorDescription
146
- desired_function_executor_ids.add(desired_function_executor.id)
158
+ for controller in self._task_controllers.values():
159
+ await controller.destroy()
160
+ # FEs are destroyed in executor.py right now.
161
+ # TODO: Once HTTP loop is removed add all FE state and controllers
162
+ # shutdown logic here. This should allow us to get rid of hacky
163
+ # "cancel all tasks loop" in executor.py shutdown and make the shutdown
164
+ # much more controllable and clean. E.g. we would be able to remove logs
165
+ # suppression from shutdown logic. Also need to shutdown self._function_executor_controllers.
147
166
 
148
- function_executor_state: FunctionExecutorState = (
149
- self._function_executor_states.get_or_create_state(
150
- id=desired_function_executor.id,
151
- namespace=desired_function_executor.namespace,
152
- graph_name=desired_function_executor.graph_name,
153
- graph_version=desired_function_executor.graph_version,
154
- function_name=desired_function_executor.function_name,
155
- image_uri=desired_function_executor.image_uri,
156
- )
157
- )
167
+ async def _reconciliation_loop(self):
168
+ last_reconciled_state: Optional[DesiredExecutorState] = None
169
+ while not self._is_shutdown:
170
+ async with self._last_desired_state_lock:
171
+ # Comparing object identities (references) is enough here to not reconcile
172
+ # the same state twice.
173
+ while self._last_desired_state is last_reconciled_state:
174
+ await self._last_desired_state_change_notifier.wait()
175
+ last_reconciled_state = self._last_desired_state
158
176
 
159
- async with function_executor_state.lock:
160
- if (
161
- function_executor_state.status
162
- == FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_STOPPED
163
- ):
164
- function_executor_state.status = (
165
- FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_STARTING_UP
166
- )
167
- try:
168
- function_executor_state.function_executor = (
169
- await self._create_function_executor()
170
- )
171
- function_executor_state.status = (
172
- FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_IDLE
173
- )
174
- except CustomerError as e:
175
- function_executor_state.status = (
176
- FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_STARTUP_FAILED_CUSTOMER_ERROR
177
- )
178
- except Exception as e:
179
- function_executor_state.status = (
180
- FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_STARTUP_FAILED_PLATFORM_ERROR
181
- )
182
- self._logger.error(
183
- f"Failed to create Function Executor", exc_info=e
184
- )
185
-
186
- function_executor_state_ids_to_destroy: List[str] = []
187
- async for function_executor_state in self._function_executor_states:
188
- function_executor_state: FunctionExecutorState
189
- if function_executor_state.id not in desired_function_executor_ids:
190
- function_executor_state_ids_to_destroy.append(
191
- function_executor_state.id
177
+ with metric_state_reconciliation_latency.time():
178
+ metric_state_reconciliations.inc()
179
+ await self._reconcile_state(last_reconciled_state)
180
+ self._state_reporter.update_last_server_clock(
181
+ last_reconciled_state.clock
192
182
  )
193
183
 
194
- for function_executor_state_id in function_executor_state_ids_to_destroy:
195
- function_executor_state: FunctionExecutorState = (
196
- self._function_executor_states.pop_state(function_executor_state_id)
197
- )
198
- async with function_executor_state.lock:
199
- logger = self._function_executor_logger(
200
- id=function_executor_state.id,
201
- namespace=function_executor_state.namespace,
202
- graph_name=function_executor_state.graph_name,
203
- graph_version=function_executor_state.graph_version,
204
- function_name=function_executor_state.function_name,
205
- )
206
- if (
207
- function_executor_state.status
208
- == FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_RUNNING_TASK
209
- ):
210
- logger.warning(
211
- "Destroying Function Executor that is running a task. No task output will be reported as this is expected by the Server."
212
- )
213
- function_executor_state.status = (
214
- FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_STOPPING
184
+ async def _reconcile_state(self, desired_state: DesiredExecutorState):
185
+ """Reconciles the desired state with the current state.
186
+
187
+ Doesn't raise any exceptions. Logs all errors for future investigation because the gRPC protocol
188
+ doesn't allow us to return errors to the Server if it supplied invalid messages.
189
+ """
190
+ for attempt in range(_RECONCILIATION_RETRIES):
191
+ try:
192
+ # Reconcile FEs first because Tasks depend on them.
193
+ await self._reconcile_function_executors(
194
+ desired_state.function_executors
215
195
  )
216
- await function_executor_state.destroy_function_executor()
217
- function_executor_state.status = (
218
- FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_STOPPED
196
+ await self._reconcile_tasks(desired_state.task_allocations)
197
+ return
198
+ except Exception as e:
199
+ self._logger.error(
200
+ "Failed to reconcile desired state. Retrying in 5 secs.",
201
+ exc_info=e,
202
+ attempt=attempt,
203
+ attempts_left=_RECONCILIATION_RETRIES - attempt,
219
204
  )
205
+ await asyncio.sleep(5)
220
206
 
221
- async def _create_function_executor(
222
- self, description: FunctionExecutorDescription
223
- ) -> FunctionExecutor:
224
- logger = self._function_executor_logger(
225
- id=description.id,
226
- namespace=description.namespace,
227
- graph_name=description.graph_name,
228
- graph_version=description.graph_version,
229
- function_name=description.function_name,
230
- )
231
- graph: SerializedObject = await self._downloader.download_graph(
232
- namespace=description.namespace,
233
- graph_name=description.graph_name,
234
- graph_version=description.graph_version,
235
- logger=logger,
207
+ metric_state_reconciliation_errors.inc()
208
+ self._logger.error(
209
+ f"Failed to reconcile desired state after {_RECONCILIATION_RETRIES} attempts.",
236
210
  )
237
- function_executor: FunctionExecutor = FunctionExecutor(
238
- server_factory=self._function_executor_server_factory, logger=logger
211
+
212
+ async def _reconcile_function_executors(
213
+ self, function_executor_descriptions: Iterable[FunctionExecutorDescription]
214
+ ):
215
+ valid_fe_descriptions: List[FunctionExecutorDescription] = (
216
+ self._valid_function_executor_descriptions(function_executor_descriptions)
239
217
  )
240
- config: FunctionExecutorServerConfiguration = (
241
- FunctionExecutorServerConfiguration(
242
- executor_id=self._executor_id,
243
- function_executor_id=description.id,
244
- namespace=description.namespace,
245
- image_uri=description.image_uri,
246
- secret_names=list(description.secret_names),
218
+ for fe_description in valid_fe_descriptions:
219
+ await self._reconcile_function_executor(fe_description)
220
+
221
+ seen_fe_ids: Set[str] = set(map(lambda fe: fe.id, valid_fe_descriptions))
222
+ fe_ids_to_remove = set(self._function_executor_controllers.keys()) - seen_fe_ids
223
+ for function_executor_id in fe_ids_to_remove:
224
+ # Remove the controller before FE shutdown completes so we won't attempt to do it
225
+ # again on the next reconciliations.
226
+ await self._function_executor_controllers.pop(
227
+ function_executor_id
228
+ ).shutdown()
229
+ # Schedule removal of the FE state after shutdown. This is required for Server
230
+ # to know when exactly FE resources are freed so it can put a replacement FE if needed.
231
+ # Running in a separate asyncio task because this will block until the shutdown is complete.
232
+ asyncio.create_task(
233
+ self._remove_function_executor_after_shutdown(function_executor_id),
234
+ name="Remove Function Executor after shutdown",
235
+ )
236
+
237
+ def _valid_function_executor_descriptions(
238
+ self, function_executor_descriptions: Iterable[FunctionExecutorDescription]
239
+ ):
240
+ valid_function_executor_descriptions: List[FunctionExecutorDescription] = []
241
+ for function_executor_description in function_executor_descriptions:
242
+ function_executor_description: FunctionExecutorDescription
243
+ logger = function_executor_logger(
244
+ function_executor_description, self._logger
247
245
  )
248
- )
249
- initialize_request: InitializeRequest = InitializeRequest(
250
- namespace=description.namespace,
251
- graph_name=description.graph_name,
252
- graph_version=description.graph_version,
253
- function_name=description.function_name,
254
- graph=graph,
255
- )
256
246
 
247
+ try:
248
+ validate_function_executor_description(function_executor_description)
249
+ except ValueError as e:
250
+ logger.error(
251
+ "Received invalid FunctionExecutorDescription from Server. Dropping it from desired state.",
252
+ exc_info=e,
253
+ )
254
+ continue
255
+
256
+ valid_function_executor_descriptions.append(function_executor_description)
257
+
258
+ return valid_function_executor_descriptions
259
+
260
+ async def _reconcile_function_executor(
261
+ self, function_executor_description: FunctionExecutorDescription
262
+ ):
263
+ """Reconciles a single Function Executor with the desired state.
264
+
265
+ Doesn't block on any long running operations. Doesn't raise any exceptions.
266
+ """
267
+ if function_executor_description.id not in self._function_executor_controllers:
268
+ await self._create_function_executor(function_executor_description)
269
+
270
+ async def _create_function_executor(
271
+ self, function_executor_description: FunctionExecutorDescription
272
+ ) -> None:
273
+ """Creates Function Executor for the supplied description.
274
+
275
+ Doesn't block on any long running operations. Doesn't raise any exceptions.
276
+ """
277
+ logger = function_executor_logger(function_executor_description, self._logger)
257
278
  try:
258
- await function_executor.initialize(
259
- config=config,
260
- initialize_request=initialize_request,
279
+ # TODO: Store FE description in FE state object once we migrate to gRPC State Reconciler.
280
+ # Then most of these parameters will be removed. Also remove the container and use a simple
281
+ # Dict once FE shutdown logic is moved into reconciler.
282
+ function_executor_state: FunctionExecutorState = (
283
+ await self._function_executor_states.get_or_create_state(
284
+ id=function_executor_description.id,
285
+ namespace=function_executor_description.namespace,
286
+ graph_name=function_executor_description.graph_name,
287
+ graph_version=function_executor_description.graph_version,
288
+ function_name=function_executor_description.function_name,
289
+ image_uri=(
290
+ function_executor_description.image_uri
291
+ if function_executor_description.HasField("image_uri")
292
+ else None
293
+ ),
294
+ secret_names=list(function_executor_description.secret_names),
295
+ )
296
+ )
297
+ controller: FunctionExecutorController = FunctionExecutorController(
298
+ executor_id=self._executor_id,
299
+ function_executor_state=function_executor_state,
300
+ function_executor_description=function_executor_description,
301
+ function_executor_server_factory=self._function_executor_server_factory,
302
+ downloader=self._downloader,
261
303
  base_url=self._base_url,
262
304
  config_path=self._config_path,
305
+ logger=self._logger,
306
+ )
307
+ self._function_executor_controllers[function_executor_description.id] = (
308
+ controller
263
309
  )
264
- return function_executor
265
- except Exception:
266
- await function_executor.destroy()
267
- raise
310
+ # Ask the controller to create the new FE. Task controllers will notice that the FE is eventually
311
+ # IDLE and start running tasks on it. Server currently doesn't explicitly manage the desired FE status.
312
+ await controller.startup()
313
+ except Exception as e:
314
+ logger.error("Failed adding Function Executor", exc_info=e)
268
315
 
269
- async def _cancel_running_tasks(
270
- self, function_executor_state: FunctionExecutorState
271
- ):
272
- pass
316
+ async def _remove_function_executor_after_shutdown(
317
+ self, function_executor_id: str
318
+ ) -> None:
319
+ fe_state: FunctionExecutorState = await self._function_executor_states.get(
320
+ function_executor_id
321
+ )
322
+ async with fe_state.lock:
323
+ await fe_state.wait_status(allowlist=[FunctionExecutorStatus.SHUTDOWN])
324
+ # The whole reconciler could shutdown while we were waiting for the FE to shutdown.
325
+ if not self._is_shutdown:
326
+ await self._function_executor_states.pop(function_executor_id)
273
327
 
274
- def _function_executor_logger(
275
- self,
276
- id: str,
277
- namespace: str,
278
- graph_name: str,
279
- graph_version: str,
280
- function_name: str,
281
- ) -> Any:
282
- return self._logger.bind(
283
- id=id,
284
- namespace=namespace,
285
- graph=graph_name,
286
- graph_version=graph_version,
287
- function_name=function_name,
328
+ async def _reconcile_tasks(self, task_allocations: Iterable[TaskAllocation]):
329
+ valid_task_allocations: List[TaskAllocation] = self._valid_task_allocations(
330
+ task_allocations
331
+ )
332
+ for task_allocation in valid_task_allocations:
333
+ await self._reconcile_task(task_allocation)
334
+
335
+ seen_task_ids: Set[str] = set(
336
+ map(lambda task_allocation: task_allocation.task.id, valid_task_allocations)
288
337
  )
338
+ task_ids_to_remove = set(self._task_controllers.keys()) - seen_task_ids
339
+ for task_id in task_ids_to_remove:
340
+ await self._remove_task(task_id)
341
+
342
+ async def _reconcile_task(self, task_allocation: TaskAllocation):
343
+ """Reconciles a single TaskAllocation with the desired state.
344
+
345
+ Doesn't raise any exceptions.
346
+ """
347
+ if task_allocation.task.id in self._task_controllers:
348
+ # Nothing to do, task allocation already exists and it's immutable.
349
+ return
289
350
 
290
- async def _report_task_outcome(self, task_output: TaskOutput):
291
- """Reports the task with the given output to the server.
351
+ logger = self._task_allocation_logger(task_allocation)
352
+ try:
353
+ function_executor_state: FunctionExecutorState = (
354
+ await self._function_executor_states.get(
355
+ task_allocation.function_executor_id
356
+ )
357
+ )
358
+ self._task_controllers[task_allocation.task.id] = TaskController(
359
+ task=task_allocation.task,
360
+ downloader=self._downloader,
361
+ task_reporter=self._task_reporter,
362
+ function_executor_id=task_allocation.function_executor_id,
363
+ function_executor_state=function_executor_state,
364
+ logger=self._logger,
365
+ )
366
+ except Exception as e:
367
+ logger.error("Failed adding TaskController", exc_info=e)
292
368
 
293
- Doesn't raise any Exceptions. Runs till the reporting is successful."""
294
- reporting_retries: int = 0
369
+ async def _remove_task(self, task_id: str) -> None:
370
+ """Schedules removal of an existing task.
371
+
372
+ Doesn't block on any long running operations. Doesn't raise any exceptions.
373
+ """
374
+ await self._task_controllers.pop(task_id).destroy()
375
+
376
+ def _valid_task_allocations(self, task_allocations: Iterable[TaskAllocation]):
377
+ valid_task_allocations: List[TaskAllocation] = []
378
+ for task_allocation in task_allocations:
379
+ task_allocation: TaskAllocation
380
+ logger = self._task_allocation_logger(task_allocation)
295
381
 
296
- while True:
297
- logger = logger.bind(retries=reporting_retries)
298
382
  try:
299
- await self._task_reporter.report(output=task_output, logger=logger)
300
- break
301
- except Exception as e:
383
+ validate_task(task_allocation.task)
384
+ except ValueError as e:
385
+ # There's no way to report this error to Server so just log it.
302
386
  logger.error(
303
- "failed to report task",
387
+ "Received invalid TaskAllocation from Server. Dropping it from desired state.",
304
388
  exc_info=e,
305
389
  )
306
- reporting_retries += 1
307
- metric_task_outcome_report_retries.inc()
308
- await asyncio.sleep(5)
390
+ continue
309
391
 
310
- metric_tasks_completed.labels(outcome=METRIC_TASKS_COMPLETED_OUTCOME_ALL).inc()
311
- if task_output.is_internal_error:
312
- metric_tasks_completed.labels(
313
- outcome=METRIC_TASKS_COMPLETED_OUTCOME_ERROR_PLATFORM
314
- ).inc()
315
- elif task_output.success:
316
- metric_tasks_completed.labels(
317
- outcome=METRIC_TASKS_COMPLETED_OUTCOME_SUCCESS
318
- ).inc()
319
- else:
320
- metric_tasks_completed.labels(
321
- outcome=METRIC_TASKS_COMPLETED_OUTCOME_ERROR_CUSTOMER_CODE
322
- ).inc()
392
+ validator = MessageValidator(task_allocation)
393
+ try:
394
+ validator.required_field("function_executor_id")
395
+ except ValueError as e:
396
+ # There's no way to report this error to Server so just log it.
397
+ logger.error(
398
+ "Received invalid TaskAllocation from Server. Dropping it from desired state.",
399
+ exc_info=e,
400
+ )
401
+ continue
402
+
403
+ if (
404
+ task_allocation.function_executor_id
405
+ not in self._function_executor_controllers
406
+ ):
407
+ # Current policy: don't report task outcomes for tasks that didn't run.
408
+ # This is required to simplify the protocol so Server doesn't need to care about task states.
409
+ logger.error(
410
+ "Received TaskAllocation for a Function Executor that doesn't exist. Dropping it from desired state."
411
+ )
412
+ continue
413
+
414
+ valid_task_allocations.append(task_allocation)
415
+
416
+ return valid_task_allocations
417
+
418
+ def _task_allocation_logger(self, task_allocation: TaskAllocation) -> Any:
419
+ """Returns a logger for the given TaskAllocation.
420
+
421
+ Doesn't assume that the supplied TaskAllocation is valid.
422
+ """
423
+ return task_logger(task_allocation.task, self._logger).bind(
424
+ function_executor_id=(
425
+ task_allocation.function_executor_id
426
+ if task_allocation.HasField("function_executor_id")
427
+ else None
428
+ )
429
+ )