indexify 0.3.19__py3-none-any.whl → 0.3.21__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- indexify/cli/cli.py +12 -0
- indexify/executor/api_objects.py +11 -6
- indexify/executor/blob_store/blob_store.py +69 -0
- indexify/executor/blob_store/local_fs_blob_store.py +48 -0
- indexify/executor/blob_store/metrics/blob_store.py +33 -0
- indexify/executor/blob_store/s3_blob_store.py +88 -0
- indexify/executor/downloader.py +192 -27
- indexify/executor/executor.py +29 -13
- indexify/executor/function_executor/function_executor.py +1 -1
- indexify/executor/function_executor/function_executor_states_container.py +5 -0
- indexify/executor/function_executor/function_executor_status.py +2 -0
- indexify/executor/function_executor/health_checker.py +7 -2
- indexify/executor/function_executor/invocation_state_client.py +4 -2
- indexify/executor/function_executor/single_task_runner.py +2 -0
- indexify/executor/function_executor/task_output.py +8 -1
- indexify/executor/grpc/channel_manager.py +4 -3
- indexify/executor/grpc/function_executor_controller.py +163 -193
- indexify/executor/grpc/metrics/state_reconciler.py +17 -0
- indexify/executor/grpc/metrics/task_controller.py +8 -0
- indexify/executor/grpc/state_reconciler.py +305 -188
- indexify/executor/grpc/state_reporter.py +18 -10
- indexify/executor/grpc/task_controller.py +247 -189
- indexify/executor/metrics/task_reporter.py +17 -0
- indexify/executor/task_reporter.py +217 -94
- indexify/executor/task_runner.py +1 -0
- indexify/proto/executor_api.proto +37 -11
- indexify/proto/executor_api_pb2.py +49 -47
- indexify/proto/executor_api_pb2.pyi +55 -15
- {indexify-0.3.19.dist-info → indexify-0.3.21.dist-info}/METADATA +2 -1
- {indexify-0.3.19.dist-info → indexify-0.3.21.dist-info}/RECORD +32 -27
- indexify/executor/grpc/completed_tasks_container.py +0 -26
- {indexify-0.3.19.dist-info → indexify-0.3.21.dist-info}/WHEEL +0 -0
- {indexify-0.3.19.dist-info → indexify-0.3.21.dist-info}/entry_points.txt +0 -0
@@ -1,42 +1,44 @@
|
|
1
1
|
import asyncio
|
2
|
-
from typing import Any, AsyncGenerator, List, Optional, Set
|
2
|
+
from typing import Any, AsyncGenerator, Dict, Iterable, List, Optional, Set
|
3
3
|
|
4
|
-
import
|
5
|
-
from tensorlake.function_executor.proto.function_executor_pb2 import (
|
6
|
-
InitializeRequest,
|
7
|
-
SerializedObject,
|
8
|
-
)
|
4
|
+
from tensorlake.function_executor.proto.message_validator import MessageValidator
|
9
5
|
|
10
6
|
from indexify.proto.executor_api_pb2 import (
|
11
7
|
DesiredExecutorState,
|
12
8
|
FunctionExecutorDescription,
|
13
|
-
FunctionExecutorStatus,
|
14
9
|
GetDesiredExecutorStatesRequest,
|
10
|
+
TaskAllocation,
|
15
11
|
)
|
16
12
|
from indexify.proto.executor_api_pb2_grpc import (
|
17
13
|
ExecutorAPIStub,
|
18
14
|
)
|
19
15
|
|
20
16
|
from ..downloader import Downloader
|
21
|
-
from ..function_executor.function_executor import CustomerError, FunctionExecutor
|
22
17
|
from ..function_executor.function_executor_state import FunctionExecutorState
|
23
18
|
from ..function_executor.function_executor_states_container import (
|
24
19
|
FunctionExecutorStatesContainer,
|
25
20
|
)
|
21
|
+
from ..function_executor.function_executor_status import FunctionExecutorStatus
|
26
22
|
from ..function_executor.server.function_executor_server_factory import (
|
27
|
-
FunctionExecutorServerConfiguration,
|
28
23
|
FunctionExecutorServerFactory,
|
29
24
|
)
|
30
|
-
from ..function_executor.task_input import TaskInput
|
31
|
-
from ..function_executor.task_output import TaskOutput
|
32
|
-
from ..metrics.executor import (
|
33
|
-
metric_tasks_fetched,
|
34
|
-
)
|
35
25
|
from ..task_reporter import TaskReporter
|
36
26
|
from .channel_manager import ChannelManager
|
27
|
+
from .function_executor_controller import (
|
28
|
+
FunctionExecutorController,
|
29
|
+
function_executor_logger,
|
30
|
+
validate_function_executor_description,
|
31
|
+
)
|
32
|
+
from .metrics.state_reconciler import (
|
33
|
+
metric_state_reconciliation_errors,
|
34
|
+
metric_state_reconciliation_latency,
|
35
|
+
metric_state_reconciliations,
|
36
|
+
)
|
37
37
|
from .state_reporter import ExecutorStateReporter
|
38
|
+
from .task_controller import TaskController, task_logger, validate_task
|
38
39
|
|
39
40
|
_RECONCILE_STREAM_BACKOFF_INTERVAL_SEC = 5
|
41
|
+
_RECONCILIATION_RETRIES = 3
|
40
42
|
|
41
43
|
|
42
44
|
class ExecutorStateReconciler:
|
@@ -61,27 +63,45 @@ class ExecutorStateReconciler:
|
|
61
63
|
self._config_path: Optional[str] = config_path
|
62
64
|
self._downloader: Downloader = downloader
|
63
65
|
self._task_reporter: TaskReporter = task_reporter
|
64
|
-
self._function_executor_states: FunctionExecutorStatesContainer = (
|
65
|
-
function_executor_states
|
66
|
-
)
|
67
66
|
self._channel_manager: ChannelManager = channel_manager
|
68
67
|
self._state_reporter: ExecutorStateReporter = state_reporter
|
68
|
+
self._reconciliation_loop_task: Optional[asyncio.Task] = None
|
69
69
|
self._logger: Any = logger.bind(module=__name__)
|
70
|
+
|
71
|
+
# Mutable state. Doesn't need lock because we access from async tasks running in the same thread.
|
70
72
|
self._is_shutdown: bool = False
|
71
|
-
self.
|
73
|
+
self._function_executor_states: FunctionExecutorStatesContainer = (
|
74
|
+
function_executor_states
|
75
|
+
)
|
76
|
+
self._function_executor_controllers: Dict[str, FunctionExecutorController] = {}
|
77
|
+
self._task_controllers: Dict[str, TaskController] = {}
|
78
|
+
self._last_server_clock: Optional[int] = None
|
79
|
+
|
80
|
+
self._last_desired_state_lock = asyncio.Lock()
|
81
|
+
self._last_desired_state_change_notifier: asyncio.Condition = asyncio.Condition(
|
82
|
+
lock=self._last_desired_state_lock
|
83
|
+
)
|
84
|
+
self._last_desired_state: Optional[DesiredExecutorState] = None
|
72
85
|
|
73
86
|
async def run(self):
|
74
87
|
"""Runs the state reconciler.
|
75
88
|
|
76
89
|
Never raises any exceptions.
|
77
90
|
"""
|
91
|
+
self._reconciliation_loop_task = asyncio.create_task(
|
92
|
+
self._reconciliation_loop(),
|
93
|
+
name="state reconciler reconciliation loop",
|
94
|
+
)
|
95
|
+
|
96
|
+
# TODO: Move this into a new async task and cancel it in shutdown().
|
78
97
|
while not self._is_shutdown:
|
79
98
|
stub = ExecutorAPIStub(await self._channel_manager.get_channel())
|
80
99
|
while not self._is_shutdown:
|
81
100
|
try:
|
82
101
|
# Report state once before starting the stream so Server
|
83
|
-
# doesn't use
|
102
|
+
# doesn't use stale state it knew about this Executor in the past.
|
84
103
|
await self._state_reporter.report_state(stub)
|
104
|
+
|
85
105
|
desired_states_stream: AsyncGenerator[
|
86
106
|
DesiredExecutorState, None
|
87
107
|
] = stub.get_desired_executor_states(
|
@@ -96,8 +116,6 @@ class ExecutorStateReconciler:
|
|
96
116
|
await asyncio.sleep(_RECONCILE_STREAM_BACKOFF_INTERVAL_SEC)
|
97
117
|
break
|
98
118
|
|
99
|
-
self._logger.info("State reconciler shutdown.")
|
100
|
-
|
101
119
|
async def _process_desired_states_stream(
|
102
120
|
self, desired_states: AsyncGenerator[DesiredExecutorState, None]
|
103
121
|
):
|
@@ -106,18 +124,26 @@ class ExecutorStateReconciler:
|
|
106
124
|
return
|
107
125
|
|
108
126
|
new_state: DesiredExecutorState
|
109
|
-
|
110
|
-
|
111
|
-
|
127
|
+
validator: MessageValidator = MessageValidator(new_state)
|
128
|
+
try:
|
129
|
+
validator.required_field("clock")
|
130
|
+
except ValueError as e:
|
131
|
+
self._logger.error(
|
132
|
+
"Received invalid DesiredExecutorState from Server. Ignoring.",
|
133
|
+
exc_info=e,
|
134
|
+
)
|
135
|
+
continue
|
112
136
|
|
113
|
-
self.
|
114
|
-
|
137
|
+
if self._last_server_clock is not None:
|
138
|
+
if self._last_server_clock >= new_state.clock:
|
139
|
+
continue # Duplicate or outdated message state sent by Server.
|
115
140
|
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
141
|
+
self._last_server_clock = new_state.clock
|
142
|
+
# Always read the latest desired state value from the stream so
|
143
|
+
# we're never acting on stale desired states.
|
144
|
+
async with self._last_desired_state_lock:
|
145
|
+
self._last_desired_state = new_state
|
146
|
+
self._last_desired_state_change_notifier.notify_all()
|
121
147
|
|
122
148
|
async def shutdown(self):
|
123
149
|
"""Shuts down the state reconciler.
|
@@ -125,188 +151,279 @@ class ExecutorStateReconciler:
|
|
125
151
|
Never raises any exceptions.
|
126
152
|
"""
|
127
153
|
self._is_shutdown = True
|
154
|
+
if self._reconciliation_loop_task is not None:
|
155
|
+
self._reconciliation_loop_task.cancel()
|
156
|
+
self._logger.info("Reconciliation loop shutdown.")
|
128
157
|
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
158
|
+
for controller in self._task_controllers.values():
|
159
|
+
await controller.destroy()
|
160
|
+
# FEs are destroyed in executor.py right now.
|
161
|
+
# TODO: Once HTTP loop is removed add all FE state and controllers
|
162
|
+
# shutdown logic here. This should allow us to get rid of hacky
|
163
|
+
# "cancel all tasks loop" in executor.py shutdown and make the shutdown
|
164
|
+
# much more controllable and clean. E.g. we would be able to remove logs
|
165
|
+
# suppression from shutdown logic. Also need to shutdown self._function_executor_controllers.
|
134
166
|
|
135
|
-
|
136
|
-
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
)
|
145
|
-
)
|
167
|
+
async def _reconciliation_loop(self):
|
168
|
+
last_reconciled_state: Optional[DesiredExecutorState] = None
|
169
|
+
while not self._is_shutdown:
|
170
|
+
async with self._last_desired_state_lock:
|
171
|
+
# Comparing object identities (references) is enough here to not reconcile
|
172
|
+
# the same state twice.
|
173
|
+
while self._last_desired_state is last_reconciled_state:
|
174
|
+
await self._last_desired_state_change_notifier.wait()
|
175
|
+
last_reconciled_state = self._last_desired_state
|
146
176
|
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
function_executor_state.status = (
|
153
|
-
FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_STARTING_UP
|
154
|
-
)
|
155
|
-
try:
|
156
|
-
function_executor_state.function_executor = (
|
157
|
-
await self._create_function_executor()
|
158
|
-
)
|
159
|
-
function_executor_state.status = (
|
160
|
-
FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_IDLE
|
161
|
-
)
|
162
|
-
except CustomerError as e:
|
163
|
-
function_executor_state.status = (
|
164
|
-
FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_STARTUP_FAILED_CUSTOMER_ERROR
|
165
|
-
)
|
166
|
-
except Exception as e:
|
167
|
-
function_executor_state.status = (
|
168
|
-
FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_STARTUP_FAILED_PLATFORM_ERROR
|
169
|
-
)
|
170
|
-
self._logger.error(
|
171
|
-
f"Failed to create Function Executor", exc_info=e
|
172
|
-
)
|
173
|
-
|
174
|
-
function_executor_state_ids_to_destroy: List[str] = []
|
175
|
-
async for function_executor_state in self._function_executor_states:
|
176
|
-
function_executor_state: FunctionExecutorState
|
177
|
-
if function_executor_state.id not in desired_function_executor_ids:
|
178
|
-
function_executor_state_ids_to_destroy.append(
|
179
|
-
function_executor_state.id
|
177
|
+
with metric_state_reconciliation_latency.time():
|
178
|
+
metric_state_reconciliations.inc()
|
179
|
+
await self._reconcile_state(last_reconciled_state)
|
180
|
+
self._state_reporter.update_last_server_clock(
|
181
|
+
last_reconciled_state.clock
|
180
182
|
)
|
181
183
|
|
182
|
-
|
183
|
-
|
184
|
-
|
185
|
-
|
186
|
-
|
187
|
-
|
188
|
-
|
189
|
-
|
190
|
-
|
191
|
-
|
192
|
-
|
193
|
-
)
|
194
|
-
if (
|
195
|
-
function_executor_state.status
|
196
|
-
== FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_RUNNING_TASK
|
197
|
-
):
|
198
|
-
logger.warning(
|
199
|
-
"Destroying Function Executor that is running a task. No task output will be reported as this is expected by the Server."
|
200
|
-
)
|
201
|
-
function_executor_state.status = (
|
202
|
-
FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_STOPPING
|
184
|
+
async def _reconcile_state(self, desired_state: DesiredExecutorState):
|
185
|
+
"""Reconciles the desired state with the current state.
|
186
|
+
|
187
|
+
Doesn't raise any exceptions. Logs all errors for future investigation because the gRPC protocol
|
188
|
+
doesn't allow us to return errors to the Server if it supplied invalid messages.
|
189
|
+
"""
|
190
|
+
for attempt in range(_RECONCILIATION_RETRIES):
|
191
|
+
try:
|
192
|
+
# Reconcile FEs first because Tasks depend on them.
|
193
|
+
await self._reconcile_function_executors(
|
194
|
+
desired_state.function_executors
|
203
195
|
)
|
204
|
-
await
|
205
|
-
|
206
|
-
|
196
|
+
await self._reconcile_tasks(desired_state.task_allocations)
|
197
|
+
return
|
198
|
+
except Exception as e:
|
199
|
+
self._logger.error(
|
200
|
+
"Failed to reconcile desired state. Retrying in 5 secs.",
|
201
|
+
exc_info=e,
|
202
|
+
attempt=attempt,
|
203
|
+
attempts_left=_RECONCILIATION_RETRIES - attempt,
|
207
204
|
)
|
205
|
+
await asyncio.sleep(5)
|
208
206
|
|
209
|
-
|
210
|
-
self
|
211
|
-
|
212
|
-
logger = self._function_executor_logger(
|
213
|
-
id=description.id,
|
214
|
-
namespace=description.namespace,
|
215
|
-
graph_name=description.graph_name,
|
216
|
-
graph_version=description.graph_version,
|
217
|
-
function_name=description.function_name,
|
218
|
-
)
|
219
|
-
graph: SerializedObject = await self._downloader.download_graph(
|
220
|
-
namespace=description.namespace,
|
221
|
-
graph_name=description.graph_name,
|
222
|
-
graph_version=description.graph_version,
|
223
|
-
logger=logger,
|
207
|
+
metric_state_reconciliation_errors.inc()
|
208
|
+
self._logger.error(
|
209
|
+
f"Failed to reconcile desired state after {_RECONCILIATION_RETRIES} attempts.",
|
224
210
|
)
|
225
|
-
|
226
|
-
|
211
|
+
|
212
|
+
async def _reconcile_function_executors(
|
213
|
+
self, function_executor_descriptions: Iterable[FunctionExecutorDescription]
|
214
|
+
):
|
215
|
+
valid_fe_descriptions: List[FunctionExecutorDescription] = (
|
216
|
+
self._valid_function_executor_descriptions(function_executor_descriptions)
|
227
217
|
)
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
218
|
+
for fe_description in valid_fe_descriptions:
|
219
|
+
await self._reconcile_function_executor(fe_description)
|
220
|
+
|
221
|
+
seen_fe_ids: Set[str] = set(map(lambda fe: fe.id, valid_fe_descriptions))
|
222
|
+
fe_ids_to_remove = set(self._function_executor_controllers.keys()) - seen_fe_ids
|
223
|
+
for function_executor_id in fe_ids_to_remove:
|
224
|
+
# Remove the controller before FE shutdown completes so we won't attempt to do it
|
225
|
+
# again on the next reconciliations.
|
226
|
+
await self._function_executor_controllers.pop(
|
227
|
+
function_executor_id
|
228
|
+
).shutdown()
|
229
|
+
# Schedule removal of the FE state after shutdown. This is required for Server
|
230
|
+
# to know when exactly FE resources are freed so it can put a replacement FE if needed.
|
231
|
+
# Running in a separate asyncio task because this will block until the shutdown is complete.
|
232
|
+
asyncio.create_task(
|
233
|
+
self._remove_function_executor_after_shutdown(function_executor_id),
|
234
|
+
name="Remove Function Executor after shutdown",
|
235
|
+
)
|
236
|
+
|
237
|
+
def _valid_function_executor_descriptions(
|
238
|
+
self, function_executor_descriptions: Iterable[FunctionExecutorDescription]
|
239
|
+
):
|
240
|
+
valid_function_executor_descriptions: List[FunctionExecutorDescription] = []
|
241
|
+
for function_executor_description in function_executor_descriptions:
|
242
|
+
function_executor_description: FunctionExecutorDescription
|
243
|
+
logger = function_executor_logger(
|
244
|
+
function_executor_description, self._logger
|
235
245
|
)
|
236
|
-
)
|
237
|
-
initialize_request: InitializeRequest = InitializeRequest(
|
238
|
-
namespace=description.namespace,
|
239
|
-
graph_name=description.graph_name,
|
240
|
-
graph_version=description.graph_version,
|
241
|
-
function_name=description.function_name,
|
242
|
-
graph=graph,
|
243
|
-
)
|
244
246
|
|
247
|
+
try:
|
248
|
+
validate_function_executor_description(function_executor_description)
|
249
|
+
except ValueError as e:
|
250
|
+
logger.error(
|
251
|
+
"Received invalid FunctionExecutorDescription from Server. Dropping it from desired state.",
|
252
|
+
exc_info=e,
|
253
|
+
)
|
254
|
+
continue
|
255
|
+
|
256
|
+
valid_function_executor_descriptions.append(function_executor_description)
|
257
|
+
|
258
|
+
return valid_function_executor_descriptions
|
259
|
+
|
260
|
+
async def _reconcile_function_executor(
|
261
|
+
self, function_executor_description: FunctionExecutorDescription
|
262
|
+
):
|
263
|
+
"""Reconciles a single Function Executor with the desired state.
|
264
|
+
|
265
|
+
Doesn't block on any long running operations. Doesn't raise any exceptions.
|
266
|
+
"""
|
267
|
+
if function_executor_description.id not in self._function_executor_controllers:
|
268
|
+
await self._create_function_executor(function_executor_description)
|
269
|
+
|
270
|
+
async def _create_function_executor(
|
271
|
+
self, function_executor_description: FunctionExecutorDescription
|
272
|
+
) -> None:
|
273
|
+
"""Creates Function Executor for the supplied description.
|
274
|
+
|
275
|
+
Doesn't block on any long running operations. Doesn't raise any exceptions.
|
276
|
+
"""
|
277
|
+
logger = function_executor_logger(function_executor_description, self._logger)
|
245
278
|
try:
|
246
|
-
|
247
|
-
|
248
|
-
|
279
|
+
# TODO: Store FE description in FE state object once we migrate to gRPC State Reconciler.
|
280
|
+
# Then most of these parameters will be removed. Also remove the container and use a simple
|
281
|
+
# Dict once FE shutdown logic is moved into reconciler.
|
282
|
+
function_executor_state: FunctionExecutorState = (
|
283
|
+
await self._function_executor_states.get_or_create_state(
|
284
|
+
id=function_executor_description.id,
|
285
|
+
namespace=function_executor_description.namespace,
|
286
|
+
graph_name=function_executor_description.graph_name,
|
287
|
+
graph_version=function_executor_description.graph_version,
|
288
|
+
function_name=function_executor_description.function_name,
|
289
|
+
image_uri=(
|
290
|
+
function_executor_description.image_uri
|
291
|
+
if function_executor_description.HasField("image_uri")
|
292
|
+
else None
|
293
|
+
),
|
294
|
+
secret_names=list(function_executor_description.secret_names),
|
295
|
+
)
|
296
|
+
)
|
297
|
+
controller: FunctionExecutorController = FunctionExecutorController(
|
298
|
+
executor_id=self._executor_id,
|
299
|
+
function_executor_state=function_executor_state,
|
300
|
+
function_executor_description=function_executor_description,
|
301
|
+
function_executor_server_factory=self._function_executor_server_factory,
|
302
|
+
downloader=self._downloader,
|
249
303
|
base_url=self._base_url,
|
250
304
|
config_path=self._config_path,
|
305
|
+
logger=self._logger,
|
306
|
+
)
|
307
|
+
self._function_executor_controllers[function_executor_description.id] = (
|
308
|
+
controller
|
251
309
|
)
|
252
|
-
|
253
|
-
|
254
|
-
await
|
255
|
-
|
310
|
+
# Ask the controller to create the new FE. Task controllers will notice that the FE is eventually
|
311
|
+
# IDLE and start running tasks on it. Server currently doesn't explicitly manage the desired FE status.
|
312
|
+
await controller.startup()
|
313
|
+
except Exception as e:
|
314
|
+
logger.error("Failed adding Function Executor", exc_info=e)
|
256
315
|
|
257
|
-
async def
|
258
|
-
self,
|
259
|
-
):
|
260
|
-
|
316
|
+
async def _remove_function_executor_after_shutdown(
|
317
|
+
self, function_executor_id: str
|
318
|
+
) -> None:
|
319
|
+
fe_state: FunctionExecutorState = await self._function_executor_states.get(
|
320
|
+
function_executor_id
|
321
|
+
)
|
322
|
+
async with fe_state.lock:
|
323
|
+
await fe_state.wait_status(allowlist=[FunctionExecutorStatus.SHUTDOWN])
|
324
|
+
# The whole reconciler could shutdown while we were waiting for the FE to shutdown.
|
325
|
+
if not self._is_shutdown:
|
326
|
+
await self._function_executor_states.pop(function_executor_id)
|
261
327
|
|
262
|
-
def
|
263
|
-
self
|
264
|
-
|
265
|
-
|
266
|
-
|
267
|
-
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
id=id,
|
272
|
-
namespace=namespace,
|
273
|
-
graph=graph_name,
|
274
|
-
graph_version=graph_version,
|
275
|
-
function_name=function_name,
|
328
|
+
async def _reconcile_tasks(self, task_allocations: Iterable[TaskAllocation]):
|
329
|
+
valid_task_allocations: List[TaskAllocation] = self._valid_task_allocations(
|
330
|
+
task_allocations
|
331
|
+
)
|
332
|
+
for task_allocation in valid_task_allocations:
|
333
|
+
await self._reconcile_task(task_allocation)
|
334
|
+
|
335
|
+
seen_task_ids: Set[str] = set(
|
336
|
+
map(lambda task_allocation: task_allocation.task.id, valid_task_allocations)
|
276
337
|
)
|
338
|
+
task_ids_to_remove = set(self._task_controllers.keys()) - seen_task_ids
|
339
|
+
for task_id in task_ids_to_remove:
|
340
|
+
await self._remove_task(task_id)
|
277
341
|
|
278
|
-
async def
|
279
|
-
"""
|
342
|
+
async def _reconcile_task(self, task_allocation: TaskAllocation):
|
343
|
+
"""Reconciles a single TaskAllocation with the desired state.
|
280
344
|
|
281
|
-
Doesn't raise any
|
282
|
-
|
345
|
+
Doesn't raise any exceptions.
|
346
|
+
"""
|
347
|
+
if task_allocation.task.id in self._task_controllers:
|
348
|
+
# Nothing to do, task allocation already exists and it's immutable.
|
349
|
+
return
|
350
|
+
|
351
|
+
logger = self._task_allocation_logger(task_allocation)
|
352
|
+
try:
|
353
|
+
function_executor_state: FunctionExecutorState = (
|
354
|
+
await self._function_executor_states.get(
|
355
|
+
task_allocation.function_executor_id
|
356
|
+
)
|
357
|
+
)
|
358
|
+
self._task_controllers[task_allocation.task.id] = TaskController(
|
359
|
+
task=task_allocation.task,
|
360
|
+
downloader=self._downloader,
|
361
|
+
task_reporter=self._task_reporter,
|
362
|
+
function_executor_id=task_allocation.function_executor_id,
|
363
|
+
function_executor_state=function_executor_state,
|
364
|
+
logger=self._logger,
|
365
|
+
)
|
366
|
+
except Exception as e:
|
367
|
+
logger.error("Failed adding TaskController", exc_info=e)
|
368
|
+
|
369
|
+
async def _remove_task(self, task_id: str) -> None:
|
370
|
+
"""Schedules removal of an existing task.
|
371
|
+
|
372
|
+
Doesn't block on any long running operations. Doesn't raise any exceptions.
|
373
|
+
"""
|
374
|
+
await self._task_controllers.pop(task_id).destroy()
|
375
|
+
|
376
|
+
def _valid_task_allocations(self, task_allocations: Iterable[TaskAllocation]):
|
377
|
+
valid_task_allocations: List[TaskAllocation] = []
|
378
|
+
for task_allocation in task_allocations:
|
379
|
+
task_allocation: TaskAllocation
|
380
|
+
logger = self._task_allocation_logger(task_allocation)
|
283
381
|
|
284
|
-
while True:
|
285
|
-
logger = logger.bind(retries=reporting_retries)
|
286
382
|
try:
|
287
|
-
|
288
|
-
|
383
|
+
validate_task(task_allocation.task)
|
384
|
+
except ValueError as e:
|
385
|
+
# There's no way to report this error to Server so just log it.
|
386
|
+
logger.error(
|
387
|
+
"Received invalid TaskAllocation from Server. Dropping it from desired state.",
|
388
|
+
exc_info=e,
|
289
389
|
)
|
290
|
-
|
291
|
-
|
390
|
+
continue
|
391
|
+
|
392
|
+
validator = MessageValidator(task_allocation)
|
393
|
+
try:
|
394
|
+
validator.required_field("function_executor_id")
|
395
|
+
except ValueError as e:
|
396
|
+
# There's no way to report this error to Server so just log it.
|
292
397
|
logger.error(
|
293
|
-
"
|
398
|
+
"Received invalid TaskAllocation from Server. Dropping it from desired state.",
|
294
399
|
exc_info=e,
|
295
400
|
)
|
296
|
-
|
297
|
-
|
298
|
-
|
401
|
+
continue
|
402
|
+
|
403
|
+
if (
|
404
|
+
task_allocation.function_executor_id
|
405
|
+
not in self._function_executor_controllers
|
406
|
+
):
|
407
|
+
# Current policy: don't report task outcomes for tasks that didn't run.
|
408
|
+
# This is required to simplify the protocol so Server doesn't need to care about task states.
|
409
|
+
logger.error(
|
410
|
+
"Received TaskAllocation for a Function Executor that doesn't exist. Dropping it from desired state."
|
411
|
+
)
|
412
|
+
continue
|
299
413
|
|
300
|
-
|
301
|
-
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
|
308
|
-
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
|
414
|
+
valid_task_allocations.append(task_allocation)
|
415
|
+
|
416
|
+
return valid_task_allocations
|
417
|
+
|
418
|
+
def _task_allocation_logger(self, task_allocation: TaskAllocation) -> Any:
|
419
|
+
"""Returns a logger for the given TaskAllocation.
|
420
|
+
|
421
|
+
Doesn't assume that the supplied TaskAllocation is valid.
|
422
|
+
"""
|
423
|
+
return task_logger(task_allocation.task, self._logger).bind(
|
424
|
+
function_executor_id=(
|
425
|
+
task_allocation.function_executor_id
|
426
|
+
if task_allocation.HasField("function_executor_id")
|
427
|
+
else None
|
428
|
+
)
|
429
|
+
)
|