indexify 0.3.18__py3-none-any.whl → 0.3.20__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- indexify/cli/cli.py +15 -17
- indexify/executor/api_objects.py +12 -0
- indexify/executor/blob_store/blob_store.py +69 -0
- indexify/executor/blob_store/local_fs_blob_store.py +48 -0
- indexify/executor/blob_store/metrics/blob_store.py +33 -0
- indexify/executor/blob_store/s3_blob_store.py +85 -0
- indexify/executor/downloader.py +149 -25
- indexify/executor/executor.py +77 -41
- indexify/executor/function_executor/function_executor.py +24 -11
- indexify/executor/function_executor/function_executor_state.py +9 -1
- indexify/executor/function_executor/function_executor_states_container.py +8 -1
- indexify/executor/function_executor/function_executor_status.py +4 -0
- indexify/executor/function_executor/health_checker.py +7 -2
- indexify/executor/function_executor/invocation_state_client.py +4 -2
- indexify/executor/function_executor/server/subprocess_function_executor_server_factory.py +6 -0
- indexify/executor/function_executor/single_task_runner.py +15 -11
- indexify/executor/function_executor/task_output.py +36 -2
- indexify/executor/grpc/channel_manager.py +4 -3
- indexify/executor/grpc/function_executor_controller.py +391 -0
- indexify/executor/grpc/metrics/state_reconciler.py +17 -0
- indexify/executor/grpc/metrics/task_controller.py +8 -0
- indexify/executor/grpc/state_reconciler.py +324 -217
- indexify/executor/grpc/state_reporter.py +52 -41
- indexify/executor/grpc/task_controller.py +492 -0
- indexify/executor/metrics/task_reporter.py +14 -0
- indexify/executor/task_reporter.py +115 -6
- indexify/executor/task_runner.py +1 -0
- indexify/proto/executor_api.proto +91 -7
- indexify/proto/executor_api_pb2.py +49 -37
- indexify/proto/executor_api_pb2.pyi +158 -3
- indexify/proto/executor_api_pb2_grpc.py +47 -0
- {indexify-0.3.18.dist-info → indexify-0.3.20.dist-info}/METADATA +2 -1
- {indexify-0.3.18.dist-info → indexify-0.3.20.dist-info}/RECORD +35 -27
- {indexify-0.3.18.dist-info → indexify-0.3.20.dist-info}/WHEEL +0 -0
- {indexify-0.3.18.dist-info → indexify-0.3.20.dist-info}/entry_points.txt +0 -0
@@ -1,52 +1,44 @@
|
|
1
1
|
import asyncio
|
2
|
-
from typing import Any, AsyncGenerator, List, Optional, Set
|
2
|
+
from typing import Any, AsyncGenerator, Dict, Iterable, List, Optional, Set
|
3
3
|
|
4
|
-
import
|
5
|
-
from tensorlake.function_executor.proto.function_executor_pb2 import (
|
6
|
-
InitializeRequest,
|
7
|
-
SerializedObject,
|
8
|
-
)
|
4
|
+
from tensorlake.function_executor.proto.message_validator import MessageValidator
|
9
5
|
|
10
6
|
from indexify.proto.executor_api_pb2 import (
|
11
7
|
DesiredExecutorState,
|
12
8
|
FunctionExecutorDescription,
|
13
|
-
FunctionExecutorStatus,
|
14
9
|
GetDesiredExecutorStatesRequest,
|
10
|
+
TaskAllocation,
|
15
11
|
)
|
16
12
|
from indexify.proto.executor_api_pb2_grpc import (
|
17
13
|
ExecutorAPIStub,
|
18
14
|
)
|
19
15
|
|
20
16
|
from ..downloader import Downloader
|
21
|
-
from ..function_executor.function_executor import CustomerError, FunctionExecutor
|
22
17
|
from ..function_executor.function_executor_state import FunctionExecutorState
|
23
18
|
from ..function_executor.function_executor_states_container import (
|
24
19
|
FunctionExecutorStatesContainer,
|
25
20
|
)
|
21
|
+
from ..function_executor.function_executor_status import FunctionExecutorStatus
|
26
22
|
from ..function_executor.server.function_executor_server_factory import (
|
27
|
-
FunctionExecutorServerConfiguration,
|
28
23
|
FunctionExecutorServerFactory,
|
29
24
|
)
|
30
|
-
from ..function_executor.task_input import TaskInput
|
31
|
-
from ..function_executor.task_output import TaskOutput
|
32
|
-
from ..metrics.executor import (
|
33
|
-
METRIC_TASKS_COMPLETED_OUTCOME_ALL,
|
34
|
-
METRIC_TASKS_COMPLETED_OUTCOME_ERROR_CUSTOMER_CODE,
|
35
|
-
METRIC_TASKS_COMPLETED_OUTCOME_ERROR_PLATFORM,
|
36
|
-
METRIC_TASKS_COMPLETED_OUTCOME_SUCCESS,
|
37
|
-
metric_task_completion_latency,
|
38
|
-
metric_task_outcome_report_latency,
|
39
|
-
metric_task_outcome_report_retries,
|
40
|
-
metric_task_outcome_reports,
|
41
|
-
metric_tasks_completed,
|
42
|
-
metric_tasks_fetched,
|
43
|
-
metric_tasks_reporting_outcome,
|
44
|
-
)
|
45
25
|
from ..task_reporter import TaskReporter
|
46
26
|
from .channel_manager import ChannelManager
|
27
|
+
from .function_executor_controller import (
|
28
|
+
FunctionExecutorController,
|
29
|
+
function_executor_logger,
|
30
|
+
validate_function_executor_description,
|
31
|
+
)
|
32
|
+
from .metrics.state_reconciler import (
|
33
|
+
metric_state_reconciliation_errors,
|
34
|
+
metric_state_reconciliation_latency,
|
35
|
+
metric_state_reconciliations,
|
36
|
+
)
|
47
37
|
from .state_reporter import ExecutorStateReporter
|
38
|
+
from .task_controller import TaskController, task_logger, validate_task
|
48
39
|
|
49
40
|
_RECONCILE_STREAM_BACKOFF_INTERVAL_SEC = 5
|
41
|
+
_RECONCILIATION_RETRIES = 3
|
50
42
|
|
51
43
|
|
52
44
|
class ExecutorStateReconciler:
|
@@ -71,46 +63,58 @@ class ExecutorStateReconciler:
|
|
71
63
|
self._config_path: Optional[str] = config_path
|
72
64
|
self._downloader: Downloader = downloader
|
73
65
|
self._task_reporter: TaskReporter = task_reporter
|
74
|
-
self._function_executor_states: FunctionExecutorStatesContainer = (
|
75
|
-
function_executor_states
|
76
|
-
)
|
77
66
|
self._channel_manager: ChannelManager = channel_manager
|
78
67
|
self._state_reporter: ExecutorStateReporter = state_reporter
|
68
|
+
self._reconciliation_loop_task: Optional[asyncio.Task] = None
|
79
69
|
self._logger: Any = logger.bind(module=__name__)
|
70
|
+
|
71
|
+
# Mutable state. Doesn't need lock because we access from async tasks running in the same thread.
|
80
72
|
self._is_shutdown: bool = False
|
81
|
-
self.
|
73
|
+
self._function_executor_states: FunctionExecutorStatesContainer = (
|
74
|
+
function_executor_states
|
75
|
+
)
|
76
|
+
self._function_executor_controllers: Dict[str, FunctionExecutorController] = {}
|
77
|
+
self._task_controllers: Dict[str, TaskController] = {}
|
78
|
+
self._last_server_clock: Optional[int] = None
|
79
|
+
|
80
|
+
self._last_desired_state_lock = asyncio.Lock()
|
81
|
+
self._last_desired_state_change_notifier: asyncio.Condition = asyncio.Condition(
|
82
|
+
lock=self._last_desired_state_lock
|
83
|
+
)
|
84
|
+
self._last_desired_state: Optional[DesiredExecutorState] = None
|
82
85
|
|
83
86
|
async def run(self):
|
84
87
|
"""Runs the state reconciler.
|
85
88
|
|
86
89
|
Never raises any exceptions.
|
87
90
|
"""
|
91
|
+
self._reconciliation_loop_task = asyncio.create_task(
|
92
|
+
self._reconciliation_loop(),
|
93
|
+
name="state reconciler reconciliation loop",
|
94
|
+
)
|
95
|
+
|
96
|
+
# TODO: Move this into a new async task and cancel it in shutdown().
|
88
97
|
while not self._is_shutdown:
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
98
|
-
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
)
|
110
|
-
await asyncio.sleep(_RECONCILE_STREAM_BACKOFF_INTERVAL_SEC)
|
111
|
-
break
|
112
|
-
|
113
|
-
self._logger.info("State reconciler shutdown.")
|
98
|
+
stub = ExecutorAPIStub(await self._channel_manager.get_channel())
|
99
|
+
while not self._is_shutdown:
|
100
|
+
try:
|
101
|
+
# Report state once before starting the stream so Server
|
102
|
+
# doesn't use stale state it knew about this Executor in the past.
|
103
|
+
await self._state_reporter.report_state(stub)
|
104
|
+
|
105
|
+
desired_states_stream: AsyncGenerator[
|
106
|
+
DesiredExecutorState, None
|
107
|
+
] = stub.get_desired_executor_states(
|
108
|
+
GetDesiredExecutorStatesRequest(executor_id=self._executor_id)
|
109
|
+
)
|
110
|
+
await self._process_desired_states_stream(desired_states_stream)
|
111
|
+
except Exception as e:
|
112
|
+
self._logger.error(
|
113
|
+
f"Failed processing desired states stream, reconnecting in {_RECONCILE_STREAM_BACKOFF_INTERVAL_SEC} sec.",
|
114
|
+
exc_info=e,
|
115
|
+
)
|
116
|
+
await asyncio.sleep(_RECONCILE_STREAM_BACKOFF_INTERVAL_SEC)
|
117
|
+
break
|
114
118
|
|
115
119
|
async def _process_desired_states_stream(
|
116
120
|
self, desired_states: AsyncGenerator[DesiredExecutorState, None]
|
@@ -120,17 +124,26 @@ class ExecutorStateReconciler:
|
|
120
124
|
return
|
121
125
|
|
122
126
|
new_state: DesiredExecutorState
|
123
|
-
|
124
|
-
|
125
|
-
|
127
|
+
validator: MessageValidator = MessageValidator(new_state)
|
128
|
+
try:
|
129
|
+
validator.required_field("clock")
|
130
|
+
except ValueError as e:
|
131
|
+
self._logger.error(
|
132
|
+
"Received invalid DesiredExecutorState from Server. Ignoring.",
|
133
|
+
exc_info=e,
|
134
|
+
)
|
135
|
+
continue
|
126
136
|
|
127
|
-
self.
|
128
|
-
|
137
|
+
if self._last_server_clock is not None:
|
138
|
+
if self._last_server_clock >= new_state.clock:
|
139
|
+
continue # Duplicate or outdated message state sent by Server.
|
129
140
|
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
141
|
+
self._last_server_clock = new_state.clock
|
142
|
+
# Always read the latest desired state value from the stream so
|
143
|
+
# we're never acting on stale desired states.
|
144
|
+
async with self._last_desired_state_lock:
|
145
|
+
self._last_desired_state = new_state
|
146
|
+
self._last_desired_state_change_notifier.notify_all()
|
134
147
|
|
135
148
|
async def shutdown(self):
|
136
149
|
"""Shuts down the state reconciler.
|
@@ -138,185 +151,279 @@ class ExecutorStateReconciler:
|
|
138
151
|
Never raises any exceptions.
|
139
152
|
"""
|
140
153
|
self._is_shutdown = True
|
154
|
+
if self._reconciliation_loop_task is not None:
|
155
|
+
self._reconciliation_loop_task.cancel()
|
156
|
+
self._logger.info("Reconciliation loop shutdown.")
|
141
157
|
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
158
|
+
for controller in self._task_controllers.values():
|
159
|
+
await controller.destroy()
|
160
|
+
# FEs are destroyed in executor.py right now.
|
161
|
+
# TODO: Once HTTP loop is removed add all FE state and controllers
|
162
|
+
# shutdown logic here. This should allow us to get rid of hacky
|
163
|
+
# "cancel all tasks loop" in executor.py shutdown and make the shutdown
|
164
|
+
# much more controllable and clean. E.g. we would be able to remove logs
|
165
|
+
# suppression from shutdown logic. Also need to shutdown self._function_executor_controllers.
|
147
166
|
|
148
|
-
|
149
|
-
|
150
|
-
|
151
|
-
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
|
156
|
-
|
157
|
-
)
|
167
|
+
async def _reconciliation_loop(self):
|
168
|
+
last_reconciled_state: Optional[DesiredExecutorState] = None
|
169
|
+
while not self._is_shutdown:
|
170
|
+
async with self._last_desired_state_lock:
|
171
|
+
# Comparing object identities (references) is enough here to not reconcile
|
172
|
+
# the same state twice.
|
173
|
+
while self._last_desired_state is last_reconciled_state:
|
174
|
+
await self._last_desired_state_change_notifier.wait()
|
175
|
+
last_reconciled_state = self._last_desired_state
|
158
176
|
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
|
163
|
-
|
164
|
-
function_executor_state.status = (
|
165
|
-
FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_STARTING_UP
|
166
|
-
)
|
167
|
-
try:
|
168
|
-
function_executor_state.function_executor = (
|
169
|
-
await self._create_function_executor()
|
170
|
-
)
|
171
|
-
function_executor_state.status = (
|
172
|
-
FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_IDLE
|
173
|
-
)
|
174
|
-
except CustomerError as e:
|
175
|
-
function_executor_state.status = (
|
176
|
-
FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_STARTUP_FAILED_CUSTOMER_ERROR
|
177
|
-
)
|
178
|
-
except Exception as e:
|
179
|
-
function_executor_state.status = (
|
180
|
-
FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_STARTUP_FAILED_PLATFORM_ERROR
|
181
|
-
)
|
182
|
-
self._logger.error(
|
183
|
-
f"Failed to create Function Executor", exc_info=e
|
184
|
-
)
|
185
|
-
|
186
|
-
function_executor_state_ids_to_destroy: List[str] = []
|
187
|
-
async for function_executor_state in self._function_executor_states:
|
188
|
-
function_executor_state: FunctionExecutorState
|
189
|
-
if function_executor_state.id not in desired_function_executor_ids:
|
190
|
-
function_executor_state_ids_to_destroy.append(
|
191
|
-
function_executor_state.id
|
177
|
+
with metric_state_reconciliation_latency.time():
|
178
|
+
metric_state_reconciliations.inc()
|
179
|
+
await self._reconcile_state(last_reconciled_state)
|
180
|
+
self._state_reporter.update_last_server_clock(
|
181
|
+
last_reconciled_state.clock
|
192
182
|
)
|
193
183
|
|
194
|
-
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
|
205
|
-
)
|
206
|
-
if (
|
207
|
-
function_executor_state.status
|
208
|
-
== FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_RUNNING_TASK
|
209
|
-
):
|
210
|
-
logger.warning(
|
211
|
-
"Destroying Function Executor that is running a task. No task output will be reported as this is expected by the Server."
|
212
|
-
)
|
213
|
-
function_executor_state.status = (
|
214
|
-
FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_STOPPING
|
184
|
+
async def _reconcile_state(self, desired_state: DesiredExecutorState):
|
185
|
+
"""Reconciles the desired state with the current state.
|
186
|
+
|
187
|
+
Doesn't raise any exceptions. Logs all errors for future investigation because the gRPC protocol
|
188
|
+
doesn't allow us to return errors to the Server if it supplied invalid messages.
|
189
|
+
"""
|
190
|
+
for attempt in range(_RECONCILIATION_RETRIES):
|
191
|
+
try:
|
192
|
+
# Reconcile FEs first because Tasks depend on them.
|
193
|
+
await self._reconcile_function_executors(
|
194
|
+
desired_state.function_executors
|
215
195
|
)
|
216
|
-
await
|
217
|
-
|
218
|
-
|
196
|
+
await self._reconcile_tasks(desired_state.task_allocations)
|
197
|
+
return
|
198
|
+
except Exception as e:
|
199
|
+
self._logger.error(
|
200
|
+
"Failed to reconcile desired state. Retrying in 5 secs.",
|
201
|
+
exc_info=e,
|
202
|
+
attempt=attempt,
|
203
|
+
attempts_left=_RECONCILIATION_RETRIES - attempt,
|
219
204
|
)
|
205
|
+
await asyncio.sleep(5)
|
220
206
|
|
221
|
-
|
222
|
-
self
|
223
|
-
|
224
|
-
logger = self._function_executor_logger(
|
225
|
-
id=description.id,
|
226
|
-
namespace=description.namespace,
|
227
|
-
graph_name=description.graph_name,
|
228
|
-
graph_version=description.graph_version,
|
229
|
-
function_name=description.function_name,
|
230
|
-
)
|
231
|
-
graph: SerializedObject = await self._downloader.download_graph(
|
232
|
-
namespace=description.namespace,
|
233
|
-
graph_name=description.graph_name,
|
234
|
-
graph_version=description.graph_version,
|
235
|
-
logger=logger,
|
207
|
+
metric_state_reconciliation_errors.inc()
|
208
|
+
self._logger.error(
|
209
|
+
f"Failed to reconcile desired state after {_RECONCILIATION_RETRIES} attempts.",
|
236
210
|
)
|
237
|
-
|
238
|
-
|
211
|
+
|
212
|
+
async def _reconcile_function_executors(
|
213
|
+
self, function_executor_descriptions: Iterable[FunctionExecutorDescription]
|
214
|
+
):
|
215
|
+
valid_fe_descriptions: List[FunctionExecutorDescription] = (
|
216
|
+
self._valid_function_executor_descriptions(function_executor_descriptions)
|
239
217
|
)
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
218
|
+
for fe_description in valid_fe_descriptions:
|
219
|
+
await self._reconcile_function_executor(fe_description)
|
220
|
+
|
221
|
+
seen_fe_ids: Set[str] = set(map(lambda fe: fe.id, valid_fe_descriptions))
|
222
|
+
fe_ids_to_remove = set(self._function_executor_controllers.keys()) - seen_fe_ids
|
223
|
+
for function_executor_id in fe_ids_to_remove:
|
224
|
+
# Remove the controller before FE shutdown completes so we won't attempt to do it
|
225
|
+
# again on the next reconciliations.
|
226
|
+
await self._function_executor_controllers.pop(
|
227
|
+
function_executor_id
|
228
|
+
).shutdown()
|
229
|
+
# Schedule removal of the FE state after shutdown. This is required for Server
|
230
|
+
# to know exactly when FE resources are freed so it can put a replacement FE if needed.
|
231
|
+
# Running in a separate asyncio task because this will block until the shutdown is complete.
|
232
|
+
asyncio.create_task(
|
233
|
+
self._remove_function_executor_after_shutdown(function_executor_id),
|
234
|
+
name="Remove Function Executor after shutdown",
|
235
|
+
)
|
236
|
+
|
237
|
+
def _valid_function_executor_descriptions(
|
238
|
+
self, function_executor_descriptions: Iterable[FunctionExecutorDescription]
|
239
|
+
):
|
240
|
+
valid_function_executor_descriptions: List[FunctionExecutorDescription] = []
|
241
|
+
for function_executor_description in function_executor_descriptions:
|
242
|
+
function_executor_description: FunctionExecutorDescription
|
243
|
+
logger = function_executor_logger(
|
244
|
+
function_executor_description, self._logger
|
247
245
|
)
|
248
|
-
)
|
249
|
-
initialize_request: InitializeRequest = InitializeRequest(
|
250
|
-
namespace=description.namespace,
|
251
|
-
graph_name=description.graph_name,
|
252
|
-
graph_version=description.graph_version,
|
253
|
-
function_name=description.function_name,
|
254
|
-
graph=graph,
|
255
|
-
)
|
256
246
|
|
247
|
+
try:
|
248
|
+
validate_function_executor_description(function_executor_description)
|
249
|
+
except ValueError as e:
|
250
|
+
logger.error(
|
251
|
+
"Received invalid FunctionExecutorDescription from Server. Dropping it from desired state.",
|
252
|
+
exc_info=e,
|
253
|
+
)
|
254
|
+
continue
|
255
|
+
|
256
|
+
valid_function_executor_descriptions.append(function_executor_description)
|
257
|
+
|
258
|
+
return valid_function_executor_descriptions
|
259
|
+
|
260
|
+
async def _reconcile_function_executor(
|
261
|
+
self, function_executor_description: FunctionExecutorDescription
|
262
|
+
):
|
263
|
+
"""Reconciles a single Function Executor with the desired state.
|
264
|
+
|
265
|
+
Doesn't block on any long running operations. Doesn't raise any exceptions.
|
266
|
+
"""
|
267
|
+
if function_executor_description.id not in self._function_executor_controllers:
|
268
|
+
await self._create_function_executor(function_executor_description)
|
269
|
+
|
270
|
+
async def _create_function_executor(
|
271
|
+
self, function_executor_description: FunctionExecutorDescription
|
272
|
+
) -> None:
|
273
|
+
"""Creates Function Executor for the supplied description.
|
274
|
+
|
275
|
+
Doesn't block on any long running operations. Doesn't raise any exceptions.
|
276
|
+
"""
|
277
|
+
logger = function_executor_logger(function_executor_description, self._logger)
|
257
278
|
try:
|
258
|
-
|
259
|
-
|
260
|
-
|
279
|
+
# TODO: Store FE description in FE state object once we migrate to gRPC State Reconciler.
|
280
|
+
# Then most of these parameters will be removed. Also remove the container and use a simple
|
281
|
+
# Dict once FE shutdown logic is moved into reconciler.
|
282
|
+
function_executor_state: FunctionExecutorState = (
|
283
|
+
await self._function_executor_states.get_or_create_state(
|
284
|
+
id=function_executor_description.id,
|
285
|
+
namespace=function_executor_description.namespace,
|
286
|
+
graph_name=function_executor_description.graph_name,
|
287
|
+
graph_version=function_executor_description.graph_version,
|
288
|
+
function_name=function_executor_description.function_name,
|
289
|
+
image_uri=(
|
290
|
+
function_executor_description.image_uri
|
291
|
+
if function_executor_description.HasField("image_uri")
|
292
|
+
else None
|
293
|
+
),
|
294
|
+
secret_names=list(function_executor_description.secret_names),
|
295
|
+
)
|
296
|
+
)
|
297
|
+
controller: FunctionExecutorController = FunctionExecutorController(
|
298
|
+
executor_id=self._executor_id,
|
299
|
+
function_executor_state=function_executor_state,
|
300
|
+
function_executor_description=function_executor_description,
|
301
|
+
function_executor_server_factory=self._function_executor_server_factory,
|
302
|
+
downloader=self._downloader,
|
261
303
|
base_url=self._base_url,
|
262
304
|
config_path=self._config_path,
|
305
|
+
logger=self._logger,
|
306
|
+
)
|
307
|
+
self._function_executor_controllers[function_executor_description.id] = (
|
308
|
+
controller
|
263
309
|
)
|
264
|
-
|
265
|
-
|
266
|
-
await
|
267
|
-
|
310
|
+
# Ask the controller to create the new FE. Task controllers will notice that the FE is eventually
|
311
|
+
# IDLE and start running tasks on it. Server currently doesn't explicitly manage the desired FE status.
|
312
|
+
await controller.startup()
|
313
|
+
except Exception as e:
|
314
|
+
logger.error("Failed adding Function Executor", exc_info=e)
|
268
315
|
|
269
|
-
async def
|
270
|
-
self,
|
271
|
-
):
|
272
|
-
|
316
|
+
async def _remove_function_executor_after_shutdown(
|
317
|
+
self, function_executor_id: str
|
318
|
+
) -> None:
|
319
|
+
fe_state: FunctionExecutorState = await self._function_executor_states.get(
|
320
|
+
function_executor_id
|
321
|
+
)
|
322
|
+
async with fe_state.lock:
|
323
|
+
await fe_state.wait_status(allowlist=[FunctionExecutorStatus.SHUTDOWN])
|
324
|
+
# The whole reconciler could shutdown while we were waiting for the FE to shutdown.
|
325
|
+
if not self._is_shutdown:
|
326
|
+
await self._function_executor_states.pop(function_executor_id)
|
273
327
|
|
274
|
-
def
|
275
|
-
self
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
|
283
|
-
id=id,
|
284
|
-
namespace=namespace,
|
285
|
-
graph=graph_name,
|
286
|
-
graph_version=graph_version,
|
287
|
-
function_name=function_name,
|
328
|
+
async def _reconcile_tasks(self, task_allocations: Iterable[TaskAllocation]):
|
329
|
+
valid_task_allocations: List[TaskAllocation] = self._valid_task_allocations(
|
330
|
+
task_allocations
|
331
|
+
)
|
332
|
+
for task_allocation in valid_task_allocations:
|
333
|
+
await self._reconcile_task(task_allocation)
|
334
|
+
|
335
|
+
seen_task_ids: Set[str] = set(
|
336
|
+
map(lambda task_allocation: task_allocation.task.id, valid_task_allocations)
|
288
337
|
)
|
338
|
+
task_ids_to_remove = set(self._task_controllers.keys()) - seen_task_ids
|
339
|
+
for task_id in task_ids_to_remove:
|
340
|
+
await self._remove_task(task_id)
|
341
|
+
|
342
|
+
async def _reconcile_task(self, task_allocation: TaskAllocation):
|
343
|
+
"""Reconciles a single TaskAllocation with the desired state.
|
344
|
+
|
345
|
+
Doesn't raise any exceptions.
|
346
|
+
"""
|
347
|
+
if task_allocation.task.id in self._task_controllers:
|
348
|
+
# Nothing to do, task allocation already exists and it's immutable.
|
349
|
+
return
|
289
350
|
|
290
|
-
|
291
|
-
|
351
|
+
logger = self._task_allocation_logger(task_allocation)
|
352
|
+
try:
|
353
|
+
function_executor_state: FunctionExecutorState = (
|
354
|
+
await self._function_executor_states.get(
|
355
|
+
task_allocation.function_executor_id
|
356
|
+
)
|
357
|
+
)
|
358
|
+
self._task_controllers[task_allocation.task.id] = TaskController(
|
359
|
+
task=task_allocation.task,
|
360
|
+
downloader=self._downloader,
|
361
|
+
task_reporter=self._task_reporter,
|
362
|
+
function_executor_id=task_allocation.function_executor_id,
|
363
|
+
function_executor_state=function_executor_state,
|
364
|
+
logger=self._logger,
|
365
|
+
)
|
366
|
+
except Exception as e:
|
367
|
+
logger.error("Failed adding TaskController", exc_info=e)
|
292
368
|
|
293
|
-
|
294
|
-
|
369
|
+
async def _remove_task(self, task_id: str) -> None:
|
370
|
+
"""Schedules removal of an existing task.
|
371
|
+
|
372
|
+
Doesn't block on any long running operations. Doesn't raise any exceptions.
|
373
|
+
"""
|
374
|
+
await self._task_controllers.pop(task_id).destroy()
|
375
|
+
|
376
|
+
def _valid_task_allocations(self, task_allocations: Iterable[TaskAllocation]):
|
377
|
+
valid_task_allocations: List[TaskAllocation] = []
|
378
|
+
for task_allocation in task_allocations:
|
379
|
+
task_allocation: TaskAllocation
|
380
|
+
logger = self._task_allocation_logger(task_allocation)
|
295
381
|
|
296
|
-
while True:
|
297
|
-
logger = logger.bind(retries=reporting_retries)
|
298
382
|
try:
|
299
|
-
|
300
|
-
|
301
|
-
|
383
|
+
validate_task(task_allocation.task)
|
384
|
+
except ValueError as e:
|
385
|
+
# There's no way to report this error to Server so just log it.
|
302
386
|
logger.error(
|
303
|
-
"
|
387
|
+
"Received invalid TaskAllocation from Server. Dropping it from desired state.",
|
304
388
|
exc_info=e,
|
305
389
|
)
|
306
|
-
|
307
|
-
metric_task_outcome_report_retries.inc()
|
308
|
-
await asyncio.sleep(5)
|
390
|
+
continue
|
309
391
|
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
|
317
|
-
|
318
|
-
|
319
|
-
|
320
|
-
|
321
|
-
|
322
|
-
|
392
|
+
validator = MessageValidator(task_allocation)
|
393
|
+
try:
|
394
|
+
validator.required_field("function_executor_id")
|
395
|
+
except ValueError as e:
|
396
|
+
# There's no way to report this error to Server so just log it.
|
397
|
+
logger.error(
|
398
|
+
"Received invalid TaskAllocation from Server. Dropping it from desired state.",
|
399
|
+
exc_info=e,
|
400
|
+
)
|
401
|
+
continue
|
402
|
+
|
403
|
+
if (
|
404
|
+
task_allocation.function_executor_id
|
405
|
+
not in self._function_executor_controllers
|
406
|
+
):
|
407
|
+
# Current policy: don't report task outcomes for tasks that didn't run.
|
408
|
+
# This is required to simplify the protocol so Server doesn't need to care about task states.
|
409
|
+
logger.error(
|
410
|
+
"Received TaskAllocation for a Function Executor that doesn't exist. Dropping it from desired state."
|
411
|
+
)
|
412
|
+
continue
|
413
|
+
|
414
|
+
valid_task_allocations.append(task_allocation)
|
415
|
+
|
416
|
+
return valid_task_allocations
|
417
|
+
|
418
|
+
def _task_allocation_logger(self, task_allocation: TaskAllocation) -> Any:
|
419
|
+
"""Returns a logger for the given TaskAllocation.
|
420
|
+
|
421
|
+
Doesn't assume that the supplied TaskAllocation is valid.
|
422
|
+
"""
|
423
|
+
return task_logger(task_allocation.task, self._logger).bind(
|
424
|
+
function_executor_id=(
|
425
|
+
task_allocation.function_executor_id
|
426
|
+
if task_allocation.HasField("function_executor_id")
|
427
|
+
else None
|
428
|
+
)
|
429
|
+
)
|