indexify 0.3.19__py3-none-any.whl → 0.3.20__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29)
  1. indexify/cli/cli.py +12 -0
  2. indexify/executor/blob_store/blob_store.py +69 -0
  3. indexify/executor/blob_store/local_fs_blob_store.py +48 -0
  4. indexify/executor/blob_store/metrics/blob_store.py +33 -0
  5. indexify/executor/blob_store/s3_blob_store.py +85 -0
  6. indexify/executor/downloader.py +145 -24
  7. indexify/executor/executor.py +26 -12
  8. indexify/executor/function_executor/function_executor.py +1 -1
  9. indexify/executor/function_executor/function_executor_states_container.py +5 -0
  10. indexify/executor/function_executor/function_executor_status.py +2 -0
  11. indexify/executor/function_executor/health_checker.py +7 -2
  12. indexify/executor/function_executor/invocation_state_client.py +4 -2
  13. indexify/executor/function_executor/task_output.py +2 -1
  14. indexify/executor/grpc/channel_manager.py +4 -3
  15. indexify/executor/grpc/function_executor_controller.py +163 -193
  16. indexify/executor/grpc/metrics/state_reconciler.py +17 -0
  17. indexify/executor/grpc/metrics/task_controller.py +8 -0
  18. indexify/executor/grpc/state_reconciler.py +305 -188
  19. indexify/executor/grpc/state_reporter.py +18 -10
  20. indexify/executor/grpc/task_controller.py +232 -189
  21. indexify/executor/task_reporter.py +23 -5
  22. indexify/proto/executor_api.proto +37 -11
  23. indexify/proto/executor_api_pb2.py +49 -47
  24. indexify/proto/executor_api_pb2.pyi +55 -15
  25. {indexify-0.3.19.dist-info → indexify-0.3.20.dist-info}/METADATA +2 -1
  26. {indexify-0.3.19.dist-info → indexify-0.3.20.dist-info}/RECORD +28 -23
  27. indexify/executor/grpc/completed_tasks_container.py +0 -26
  28. {indexify-0.3.19.dist-info → indexify-0.3.20.dist-info}/WHEEL +0 -0
  29. {indexify-0.3.19.dist-info → indexify-0.3.20.dist-info}/entry_points.txt +0 -0
@@ -318,6 +318,6 @@ async def _initialize_server(
318
318
  except grpc.aio.AioRpcError as e:
319
319
  if e.code() == grpc.StatusCode.DEADLINE_EXCEEDED:
320
320
  raise CustomerError(
321
- f"Customer code timeout {customer_code_timeout_sec} sec expired"
321
+ f"Customer code timeout of {customer_code_timeout_sec:.3f} sec expired"
322
322
  ) from e
323
323
  raise
@@ -54,6 +54,11 @@ class FunctionExecutorStatesContainer:
54
54
 
55
55
  return self._states[id]
56
56
 
57
+ async def get(self, id: str) -> FunctionExecutorState:
58
+ """Get the state with the given ID. Raises Exception if the state does not exist."""
59
+ async with self._lock:
60
+ return self._states[id]
61
+
57
62
  async def __aiter__(self) -> AsyncGenerator[FunctionExecutorState, None]:
58
63
  async with self._lock:
59
64
  for state in self._states.values():
@@ -34,6 +34,8 @@ class FunctionExecutorStatus(Enum):
34
34
  SHUTDOWN = "Shutdown" # Permanent stop state
35
35
 
36
36
 
37
+ # TODO: After removing HTTP code simplify state transitions by not allowing to
38
+ # startup an FE after it was destroyed. gRPC protocol treats FEs as ephemeral and never revives them.
37
39
  def is_status_change_allowed(
38
40
  current_status: FunctionExecutorStatus, new_status: FunctionExecutorStatus
39
41
  ) -> bool:
@@ -107,7 +107,9 @@ class HealthChecker:
107
107
  return
108
108
 
109
109
  self._health_check_failed_callback = callback
110
- self._health_check_loop_task = asyncio.create_task(self._health_check_loop())
110
+ self._health_check_loop_task = asyncio.create_task(
111
+ self._health_check_loop(), name="function executor health checker loop"
112
+ )
111
113
 
112
114
  def stop(self) -> None:
113
115
  """Stops the periodic health checks.
@@ -126,7 +128,10 @@ class HealthChecker:
126
128
  break
127
129
  await asyncio.sleep(HEALTH_CHECK_POLL_PERIOD_SEC)
128
130
 
129
- asyncio.create_task(self._health_check_failed_callback(result))
131
+ asyncio.create_task(
132
+ self._health_check_failed_callback(result),
133
+ name="function executor health check failure callback",
134
+ )
130
135
  self._health_check_loop_task = None
131
136
 
132
137
 
@@ -67,7 +67,8 @@ class InvocationStateClient:
67
67
  self._response_generator()
68
68
  )
69
69
  self._request_loop_task = asyncio.create_task(
70
- self._request_loop(server_requests)
70
+ self._request_loop(server_requests),
71
+ name="graph invocation state client request processing loop",
71
72
  )
72
73
 
73
74
  def add_task_to_invocation_id_entry(self, task_id: str, invocation_id: str) -> None:
@@ -100,7 +101,8 @@ class InvocationStateClient:
100
101
  pass
101
102
  except asyncio.CancelledError:
102
103
  # This async task was cancelled by destroy(). Normal situation too.
103
- pass
104
+ # This exception should not be suppressed, see Python asyncio docs.
105
+ raise
104
106
  except Exception as e:
105
107
  metric_request_read_errors.inc()
106
108
  self._logger.error(
@@ -83,6 +83,7 @@ class TaskOutput:
83
83
  function_name: str,
84
84
  graph_version: str,
85
85
  graph_invocation_id: str,
86
+ timeout_sec: float,
86
87
  ) -> "TaskOutput":
87
88
  """Creates a TaskOutput for an function timeout error."""
88
89
  # Task stdout, stderr is not available.
@@ -93,6 +94,6 @@ class TaskOutput:
93
94
  function_name=function_name,
94
95
  graph_version=graph_version,
95
96
  graph_invocation_id=graph_invocation_id,
96
- stderr="Function execution timed out.",
97
+ stderr=f"Function exceeded its configured timeout of {timeout_sec:.3f} sec.",
97
98
  is_internal_error=False,
98
99
  )
@@ -69,6 +69,10 @@ class ChannelManager:
69
69
  certificate_chain=certificate_chain,
70
70
  )
71
71
 
72
+ async def destroy(self):
73
+ if self._channel is not None:
74
+ await self._destroy_locked_channel()
75
+
72
76
  async def get_channel(self) -> grpc.aio.Channel:
73
77
  """Returns a channel to the gRPC server.
74
78
 
@@ -155,6 +159,3 @@ class ChannelManager:
155
159
  except Exception as e:
156
160
  self._logger.error("failed closing channel", exc_info=e)
157
161
  self._channel = None
158
-
159
- async def shutdown(self):
160
- pass
@@ -25,6 +25,60 @@ from ..function_executor.server.function_executor_server_factory import (
25
25
  )
26
26
 
27
27
 
28
+ def validate_function_executor_description(
29
+ function_executor_description: FunctionExecutorDescription,
30
+ ) -> None:
31
+ """Validates the supplied FE description.
32
+
33
+ Raises ValueError if the description is not valid.
34
+ """
35
+ validator = MessageValidator(function_executor_description)
36
+ validator.required_field("id")
37
+ validator.required_field("namespace")
38
+ validator.required_field("graph_name")
39
+ validator.required_field("graph_version")
40
+ validator.required_field("function_name")
41
+ # TODO: Make graph required after we migrate to direct S3 downloads.
42
+ # image_uri is optional.
43
+ # secret_names can be empty.
44
+ # resource_limits is optional.
45
+
46
+
47
+ def function_executor_logger(
48
+ function_executor_description: FunctionExecutorDescription, logger: Any
49
+ ) -> Any:
50
+ """Returns a logger bound with the FE's metadata.
51
+
52
+ The function assumes that the FE might be invalid."""
53
+ return logger.bind(
54
+ function_executor_id=(
55
+ function_executor_description.id
56
+ if function_executor_description.HasField("id")
57
+ else None
58
+ ),
59
+ namespace=(
60
+ function_executor_description.namespace
61
+ if function_executor_description.HasField("namespace")
62
+ else None
63
+ ),
64
+ graph_name=(
65
+ function_executor_description.graph_name
66
+ if function_executor_description.HasField("graph_name")
67
+ else None
68
+ ),
69
+ graph_version=(
70
+ function_executor_description.graph_version
71
+ if function_executor_description.HasField("graph_version")
72
+ else None
73
+ ),
74
+ function_name=(
75
+ function_executor_description.function_name
76
+ if function_executor_description.HasField("function_name")
77
+ else None
78
+ ),
79
+ )
80
+
81
+
28
82
  class FunctionExecutorController:
29
83
  def __init__(
30
84
  self,
@@ -39,9 +93,9 @@ class FunctionExecutorController:
39
93
  ):
40
94
  """Initializes the FunctionExecutorController.
41
95
 
42
- Raises ValueError if the supplied FunctionExecutorDescription is not valid.
96
+ The supplied FunctionExecutorDescription must be already validated by the caller
97
+ using validate_function_executor_description().
43
98
  """
44
- _validate_function_executor_description(function_executor_description)
45
99
  self._executor_id: str = executor_id
46
100
  self._function_executor_state: FunctionExecutorState = function_executor_state
47
101
  self._function_executor_description: FunctionExecutorDescription = (
@@ -53,17 +107,10 @@ class FunctionExecutorController:
53
107
  self._downloader: Downloader = downloader
54
108
  self._base_url: str = base_url
55
109
  self._config_path: str = config_path
56
- self._logger: Any = logger.bind(
110
+ self._logger: Any = function_executor_logger(
111
+ function_executor_description, logger
112
+ ).bind(
57
113
  module=__name__,
58
- function_executor_id=function_executor_description.id,
59
- namespace=function_executor_description.namespace,
60
- graph_name=function_executor_description.graph_name,
61
- graph_version=function_executor_description.graph_version,
62
- function_name=function_executor_description.function_name,
63
- image_uri=function_executor_description.image_uri,
64
- )
65
- self._reconciliation_loop_task: asyncio.Task = asyncio.create_task(
66
- self._reconciliation_loop()
67
114
  )
68
115
  # The lock protects the desired status.
69
116
  self._lock: asyncio.Lock = asyncio.Lock()
@@ -74,13 +121,31 @@ class FunctionExecutorController:
74
121
  self._desired_status_change_notifier: asyncio.Condition = asyncio.Condition(
75
122
  lock=self._lock
76
123
  )
124
+ # Automatically start the controller on creation.
125
+ self._reconciliation_loop_task: asyncio.Task = asyncio.create_task(
126
+ self._reconciliation_loop(),
127
+ name="function executor controller reconciliation loop",
128
+ )
129
+
130
+ def function_executor_description(self) -> FunctionExecutorDescription:
131
+ return self._function_executor_description
132
+
133
+ async def startup(self) -> None:
134
+ await self._set_desired_status(
135
+ FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_IDLE
136
+ )
77
137
 
78
- async def set_desired_status(
138
+ async def shutdown(self) -> None:
139
+ await self._set_desired_status(
140
+ FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_SHUTDOWN
141
+ )
142
+
143
+ async def _set_desired_status(
79
144
  self, desired_status: FunctionExecutorStatusProto
80
145
  ) -> None:
81
146
  """Updates the desired Function Executor status.
82
147
 
83
- Reconciliation is done asynchronously.
148
+ Reconciliation is done asynchronously. Doesn't raise any exceptions.
84
149
  """
85
150
  async with self._lock:
86
151
  if self._desired_status == desired_status:
@@ -105,146 +170,73 @@ class FunctionExecutorController:
105
170
  await self._reconcile(last_seen_desired_status)
106
171
 
107
172
  async def _reconcile(self, desired_status: FunctionExecutorStatusProto) -> None:
108
- async with self._function_executor_state.lock:
109
- current_status: FunctionExecutorStatus = (
110
- self._function_executor_state.status
111
- )
112
- # We have to process all possible combination of current and desired statuses.
113
- if current_status == FunctionExecutorStatus.STARTUP_FAILED_CUSTOMER_ERROR:
114
- if (
115
- desired_status
116
- == FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STARTUP_FAILED_CUSTOMER_ERROR
117
- ):
118
- return # Same status, nothing to do.
119
-
120
- # All we can do from the current status is to destroy the FE to possibly recreate it later
121
- # if Server requests to do this. This is why we don't accept any other desired statuses.
122
- return await self._destroy_or_shutdown_fe_if_desired(desired_status)
123
-
124
- if current_status == FunctionExecutorStatus.STARTUP_FAILED_PLATFORM_ERROR:
125
- if (
126
- desired_status
127
- == FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STARTUP_FAILED_PLATFORM_ERROR
128
- ):
129
- return # Same status, nothing to do.
130
-
131
- # All we can do from the current status is to destroy the FE to possibly recreate it later
132
- # if Server requests to do this. This is why we don't accept any other desired statuses.
133
- return await self._destroy_or_shutdown_fe_if_desired(desired_status)
134
-
135
- if current_status == FunctionExecutorStatus.IDLE:
136
- if (
137
- desired_status
138
- == FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_IDLE
139
- ):
140
- return # Same status, nothing to do.
141
-
142
- # Server can only request FE destroy or shutdown when FE has IDLE status.
143
- # Transition from IDLE to RUNNING_TASK can only be done by Task controller.
144
- # Transition from IDLE to UNHEALTHY can only be done by FE controller.
145
- return await self._destroy_or_shutdown_fe_if_desired(desired_status)
146
-
147
- if current_status == FunctionExecutorStatus.RUNNING_TASK:
148
- if (
149
- desired_status
150
- == FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_RUNNING_TASK
151
- ):
152
- return # Same status, nothing to do.
173
+ """Reconciles the FE status with the desired status.
153
174
 
154
- # Server can only request FE destroy or shutdown when FE has RUNNING_TASK status.
155
- # Transition from RUNNING_TASK to UNHEALTHY can only be done by Task controller.
156
- return await self._destroy_or_shutdown_fe_if_desired(desired_status)
157
-
158
- if current_status == FunctionExecutorStatus.UNHEALTHY:
159
- if (
160
- desired_status
161
- == FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_UNHEALTHY
162
- ):
163
- return # Same status, nothing to do.
164
-
165
- # Server can only request FE destroy or shutdown when FE has RUNNING_TASK status.
166
- return await self._destroy_or_shutdown_fe_if_desired(desired_status)
167
-
168
- if current_status == FunctionExecutorStatus.DESTROYED:
169
- if (
170
- desired_status
171
- == FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STOPPED
172
- ):
173
- return # Same status, nothing to do.
174
-
175
- return await self._reconcile_from_destroyed(desired_status)
176
-
177
- # _reconcile() can't be called when current FE status is one of "long running" states
178
- # handled by FE controller like STARTING_UP and DESTROYING. This is because _reconcile()
179
- # is called with concurrency of 1 and _reconcile() waits until these long running states
180
- # (operations) are finished before returning.
181
- #
182
- # It's not possible to have SHUTDOWN current status because when FE controller transitions to SHUTDOWN
183
- # status, it cancels the reconciliation loop task.
184
- self._logger.error(
185
- "unexpected current function executor status, skipping state reconciliation",
186
- current_status=current_status.name,
187
- desired_status=FunctionExecutorStatusProto.Name(desired_status),
188
- )
175
+ Doesn't raise any exceptions."""
176
+ async with self._function_executor_state.lock:
177
+ if (
178
+ desired_status
179
+ == FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_IDLE
180
+ ):
181
+ return await self._startup()
182
+ elif (
183
+ desired_status
184
+ == FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_SHUTDOWN
185
+ ):
186
+ # Shutdown can be requested with any current status.
187
+ return await self._shutdown()
188
+ else:
189
+ self._logger.error(
190
+ "unexpected desired function executor status received from server, skipping state reconciliation",
191
+ current_status=self._function_executor_state.status.name,
192
+ desired_status=FunctionExecutorStatusProto.Name(desired_status),
193
+ )
189
194
 
190
- async def _destroy_or_shutdown_fe_if_desired(
191
- self, desired_status: FunctionExecutorStatusProto
192
- ) -> None:
193
- """Destroys the Function Executor if desired status asks for it.
195
+ async def _shutdown(self) -> None:
196
+ """Shutsdown the Function Executor and frees all of its resources.
194
197
 
195
- Otherwise logs an error because other actions are not allowed by the current status.
196
- Caller holds the FE state lock.
198
+ Caller holds the FE state lock. Doesn't raise any exceptions.
197
199
  """
198
- if desired_status not in [
199
- FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STOPPING,
200
- FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STOPPED,
201
- FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_SHUTDOWN,
200
+ # Run destroy sequence if current FE status requires it (see allowed FE status transitions).
201
+ # We won't see DESTROYING and STARTING_UP statuses here because FE reconciliation is done
202
+ # with concurrency of 1.
203
+ if self._function_executor_state.status in [
204
+ FunctionExecutorStatus.STARTUP_FAILED_PLATFORM_ERROR,
205
+ FunctionExecutorStatus.STARTUP_FAILED_CUSTOMER_ERROR,
206
+ FunctionExecutorStatus.IDLE,
207
+ FunctionExecutorStatus.RUNNING_TASK,
208
+ FunctionExecutorStatus.UNHEALTHY,
202
209
  ]:
203
- self._logger.error(
204
- "unexpected desired function executor status received from server, skipping state reconciliation",
205
- current_status=self._function_executor_state.status.name,
206
- desired_status=FunctionExecutorStatusProto.Name(desired_status),
210
+ await self._function_executor_state.set_status(
211
+ FunctionExecutorStatus.DESTROYING
207
212
  )
208
- return
213
+ if self._function_executor_state.function_executor is not None:
214
+ async with _UnlockedLockContextManager(
215
+ self._function_executor_state.lock
216
+ ):
217
+ await self._function_executor_state.function_executor.destroy()
218
+ await self._function_executor_state.set_status(
219
+ FunctionExecutorStatus.DESTROYED
220
+ )
221
+ self._function_executor_state.function_executor = None
209
222
 
210
- await self._destroy_function_executor()
211
- # FE state status is now DESTROYED.
212
- if (
213
- desired_status
214
- == FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_SHUTDOWN
215
- ):
216
- await self._shutdown()
217
- # No code is executed after this point because reconciliation loop aio task is cancelled.
223
+ self._logger.info("shutting down function executor controller")
224
+ await self._function_executor_state.set_status(FunctionExecutorStatus.SHUTDOWN)
225
+ self._reconciliation_loop_task.cancel()
226
+ # No code is executed after this point because reconciliation loop aio task is cancelled.
218
227
 
219
- async def _reconcile_from_destroyed(
220
- self, desired_status: FunctionExecutorStatusProto
221
- ) -> None:
222
- """Reconciles the FE state when it has DESTROYED status.
228
+ async def _startup(self) -> None:
229
+ """Startups the FE if possible.
223
230
 
224
- Caller holds the FE state lock.
231
+ Caller holds the FE state lock. Doesn't raise any exceptions.
225
232
  """
226
- if desired_status not in [
227
- FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STARTING_UP,
228
- FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_IDLE,
229
- FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_RUNNING_TASK,
230
- FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_SHUTDOWN,
231
- ]:
233
+ if self._function_executor_state.status != FunctionExecutorStatus.DESTROYED:
232
234
  self._logger.error(
233
- "unexpected desired function executor status received from server, skipping state reconciliation",
235
+ "Can't startup Function Executor from its current state, skipping startup",
234
236
  current_status=self._function_executor_state.status.name,
235
- desired_status=FunctionExecutorStatusProto.Name(desired_status),
236
237
  )
237
238
  return
238
239
 
239
- if (
240
- desired_status
241
- == FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_SHUTDOWN
242
- ):
243
- await self._shutdown()
244
- # No code is executed after this point because reconciliation loop aio task is cancelled.
245
- return
246
-
247
- # All the rest of the allowed desired statuses ask to create the FE.
248
240
  await self._function_executor_state.set_status(
249
241
  FunctionExecutorStatus.STARTING_UP
250
242
  )
@@ -267,6 +259,7 @@ class FunctionExecutorController:
267
259
  next_status_message = str(e)
268
260
  except Exception as e:
269
261
  next_status = FunctionExecutorStatus.STARTUP_FAILED_PLATFORM_ERROR
262
+ self._logger.error("failed to create function executor", exc_info=e)
270
263
 
271
264
  # FE state lock is acquired again at this point.
272
265
  await self._function_executor_state.set_status(next_status, next_status_message)
@@ -279,47 +272,35 @@ class FunctionExecutorController:
279
272
  self._health_check_failed_callback
280
273
  )
281
274
 
282
- async def _destroy_function_executor(self) -> None:
283
- """Destroys the Function Executor if it exists.
284
-
285
- Caller holds the FE state lock.
286
- """
287
- await self._function_executor_state.set_status(
288
- FunctionExecutorStatus.DESTROYING
289
- )
290
- async with _UnlockedLockContextManager(self._function_executor_state.lock):
291
- await self._function_executor_state.function_executor.destroy()
292
- await self._function_executor_state.set_status(FunctionExecutorStatus.DESTROYED)
293
- self._function_executor_state.function_executor = None
294
-
295
- async def _shutdown(self) -> None:
296
- """Shuts down the controller.
297
-
298
- Caller holds the FE state lock.
299
- Raises asyncio.CancelledError on return when called from reconciliation loop.
300
- """
301
- self._logger.info("shutting down function executor controller")
302
- await self._function_executor_state.set_status(FunctionExecutorStatus.SHUTDOWN)
303
- self._reconciliation_loop_task.cancel()
304
- await self._reconciliation_loop_task
305
-
306
275
  async def _health_check_failed_callback(self, result: HealthCheckResult):
307
276
  async with self._function_executor_state.lock:
308
277
  if self._function_executor_state.status == FunctionExecutorStatus.UNHEALTHY:
309
278
  return
310
279
 
311
- if self._function_executor_state.status in (
280
+ # There can be false positive health check failures when we're creating
281
+ # or destroying FEs so we only react to health check failures when we expect
282
+ # the FE to be healthy.
283
+ if self._function_executor_state.status not in (
312
284
  FunctionExecutorStatus.IDLE,
313
285
  FunctionExecutorStatus.RUNNING_TASK,
314
286
  ):
315
- # There can be false positive health check failures when we're creating
316
- # or destroying FEs so we're not interested in them.
317
- #
318
- # Server should react to this transition into unhealthy state and ask to
319
- # destroy this FE.
320
- await self._function_executor_state.set_status(
321
- FunctionExecutorStatus.UNHEALTHY
322
- )
287
+ return
288
+
289
+ await self._function_executor_state.set_status(
290
+ FunctionExecutorStatus.UNHEALTHY
291
+ )
292
+ function_executor: FunctionExecutor = (
293
+ self._function_executor_state.function_executor
294
+ )
295
+ self._function_executor_state.function_executor = None
296
+
297
+ self._logger.error(
298
+ "Function Executor health check failed, destroying Function Executor",
299
+ health_check_fail_reason=result.reason,
300
+ )
301
+ # Destroy the unhealthy FE asap so it doesn't consume resources.
302
+ # Do it with unlocked state lock to not stop other work on this FE state.
303
+ await function_executor.destroy()
323
304
 
324
305
 
325
306
  async def _create_function_executor(
@@ -341,12 +322,18 @@ async def _create_function_executor(
341
322
  graph_name=function_executor_description.graph_name,
342
323
  graph_version=function_executor_description.graph_version,
343
324
  logger=logger,
325
+ data_payload=(
326
+ function_executor_description.graph
327
+ if function_executor_description.HasField("graph")
328
+ else None
329
+ ),
344
330
  )
345
331
 
346
332
  config: FunctionExecutorServerConfiguration = FunctionExecutorServerConfiguration(
347
333
  executor_id=executor_id,
348
334
  function_executor_id=function_executor_description.id,
349
335
  namespace=function_executor_description.namespace,
336
+ image_uri=None,
350
337
  secret_names=list(function_executor_description.secret_names),
351
338
  )
352
339
  if function_executor_description.HasField("image_uri"):
@@ -361,8 +348,6 @@ async def _create_function_executor(
361
348
  )
362
349
  customer_code_timeout_sec: Optional[float] = None
363
350
  if function_executor_description.HasField("customer_code_timeout_ms"):
364
- # TODO: Add integration tests with FE customer code initialization timeout
365
- # when end-to-end implementation is done.
366
351
  customer_code_timeout_sec = (
367
352
  function_executor_description.customer_code_timeout_ms / 1000.0
368
353
  )
@@ -381,29 +366,14 @@ async def _create_function_executor(
381
366
  customer_code_timeout_sec=customer_code_timeout_sec,
382
367
  )
383
368
  return function_executor
384
- except Exception:
369
+ except (Exception, asyncio.CancelledError):
370
+ # Destroy the failed to startup FE asap so it doesn't consume resources.
371
+ # Destroy the FE also if the FE initialization got cancelled to not leak
372
+ # allocated resources.
385
373
  await function_executor.destroy()
386
374
  raise
387
375
 
388
376
 
389
- def _validate_function_executor_description(
390
- function_executor_description: FunctionExecutorDescription,
391
- ) -> None:
392
- """Validates the supplied FE description.
393
-
394
- Raises ValueError if the description is not valid.
395
- """
396
- validator = MessageValidator(function_executor_description)
397
- validator.required_field("id")
398
- validator.required_field("namespace")
399
- validator.required_field("graph_name")
400
- validator.required_field("graph_version")
401
- validator.required_field("function_name")
402
- # image_uri is optional.
403
- # secret_names can be empty.
404
- # resource_limits is optional.
405
-
406
-
407
377
  class _UnlockedLockContextManager:
408
378
  """Unlocks its lock on enter to the scope and locks it back on exit."""
409
379
 
@@ -0,0 +1,17 @@
1
+ import prometheus_client
2
+
3
+ from ...monitoring.metrics import latency_metric_for_fast_operation
4
+
5
+ metric_state_reconciliations = prometheus_client.Counter(
6
+ "state_reconciliations",
7
+ "Number of Executor state reconciliations",
8
+ )
9
+ metric_state_reconciliation_errors = prometheus_client.Counter(
10
+ "state_reconciliation_errors",
11
+ "Number of Executor state reconciliation errors after all retries",
12
+ )
13
+ metric_state_reconciliation_latency: prometheus_client.Histogram = (
14
+ latency_metric_for_fast_operation(
15
+ "state_reconciliation", "Executor state reconciliation"
16
+ )
17
+ )
@@ -0,0 +1,8 @@
1
+ import prometheus_client
2
+
3
+ from ...monitoring.metrics import latency_metric_for_fast_operation
4
+
5
+ metric_task_cancellations = prometheus_client.Counter(
6
+ "task_cancellations",
7
+ "Number of times a task was cancelled",
8
+ )