indexify 0.3.19__py3-none-any.whl → 0.3.21__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. indexify/cli/cli.py +12 -0
  2. indexify/executor/api_objects.py +11 -6
  3. indexify/executor/blob_store/blob_store.py +69 -0
  4. indexify/executor/blob_store/local_fs_blob_store.py +48 -0
  5. indexify/executor/blob_store/metrics/blob_store.py +33 -0
  6. indexify/executor/blob_store/s3_blob_store.py +88 -0
  7. indexify/executor/downloader.py +192 -27
  8. indexify/executor/executor.py +29 -13
  9. indexify/executor/function_executor/function_executor.py +1 -1
  10. indexify/executor/function_executor/function_executor_states_container.py +5 -0
  11. indexify/executor/function_executor/function_executor_status.py +2 -0
  12. indexify/executor/function_executor/health_checker.py +7 -2
  13. indexify/executor/function_executor/invocation_state_client.py +4 -2
  14. indexify/executor/function_executor/single_task_runner.py +2 -0
  15. indexify/executor/function_executor/task_output.py +8 -1
  16. indexify/executor/grpc/channel_manager.py +4 -3
  17. indexify/executor/grpc/function_executor_controller.py +163 -193
  18. indexify/executor/grpc/metrics/state_reconciler.py +17 -0
  19. indexify/executor/grpc/metrics/task_controller.py +8 -0
  20. indexify/executor/grpc/state_reconciler.py +305 -188
  21. indexify/executor/grpc/state_reporter.py +18 -10
  22. indexify/executor/grpc/task_controller.py +247 -189
  23. indexify/executor/metrics/task_reporter.py +17 -0
  24. indexify/executor/task_reporter.py +217 -94
  25. indexify/executor/task_runner.py +1 -0
  26. indexify/proto/executor_api.proto +37 -11
  27. indexify/proto/executor_api_pb2.py +49 -47
  28. indexify/proto/executor_api_pb2.pyi +55 -15
  29. {indexify-0.3.19.dist-info → indexify-0.3.21.dist-info}/METADATA +2 -1
  30. {indexify-0.3.19.dist-info → indexify-0.3.21.dist-info}/RECORD +32 -27
  31. indexify/executor/grpc/completed_tasks_container.py +0 -26
  32. {indexify-0.3.19.dist-info → indexify-0.3.21.dist-info}/WHEEL +0 -0
  33. {indexify-0.3.19.dist-info → indexify-0.3.21.dist-info}/entry_points.txt +0 -0
@@ -12,6 +12,7 @@ from tensorlake.utils.logging import suppress as suppress_logging
12
12
  from indexify.proto.executor_api_pb2 import ExecutorStatus
13
13
 
14
14
  from .api_objects import FunctionURI, Task
15
+ from .blob_store.blob_store import BLOBStore
15
16
  from .downloader import Downloader
16
17
  from .executor_flavor import ExecutorFlavor
17
18
  from .function_executor.function_executor_states_container import (
@@ -69,6 +70,7 @@ class Executor:
69
70
  monitoring_server_host: str,
70
71
  monitoring_server_port: int,
71
72
  enable_grpc_state_reconciler: bool,
73
+ blob_store: BLOBStore,
72
74
  ):
73
75
  self._logger = structlog.get_logger(module=__name__)
74
76
  self._is_shutdown: bool = False
@@ -95,7 +97,10 @@ class Executor:
95
97
  self._function_executor_states
96
98
  )
97
99
  self._downloader = Downloader(
98
- code_path=code_path, base_url=self._base_url, config_path=config_path
100
+ code_path=code_path,
101
+ base_url=self._base_url,
102
+ blob_store=blob_store,
103
+ config_path=config_path,
99
104
  )
100
105
  self._function_allowlist: Optional[List[FunctionURI]] = function_allowlist
101
106
  self._function_executor_server_factory = function_executor_server_factory
@@ -123,6 +128,7 @@ class Executor:
123
128
  executor_id=id,
124
129
  config_path=config_path,
125
130
  channel_manager=self._channel_manager,
131
+ blob_store=blob_store,
126
132
  )
127
133
 
128
134
  # HTTP mode task runner
@@ -189,12 +195,15 @@ class Executor:
189
195
  signum, self.shutdown, asyncio.get_event_loop()
190
196
  )
191
197
 
192
- asyncio.get_event_loop().create_task(self._monitoring_server.run())
193
- if self._state_reporter is not None:
194
- self._state_reporter.update_executor_status(
195
- ExecutorStatus.EXECUTOR_STATUS_RUNNING
196
- )
197
- asyncio.get_event_loop().create_task(self._state_reporter.run())
198
+ asyncio.get_event_loop().create_task(
199
+ self._monitoring_server.run(), name="monitoring server runner"
200
+ )
201
+ self._state_reporter.update_executor_status(
202
+ ExecutorStatus.EXECUTOR_STATUS_RUNNING
203
+ )
204
+ asyncio.get_event_loop().create_task(
205
+ self._state_reporter.run(), name="state reporter runner"
206
+ )
198
207
 
199
208
  metric_executor_state.state("running")
200
209
  self._startup_probe_handler.set_ready()
@@ -215,7 +224,6 @@ class Executor:
215
224
  """Runs the gRPC state reconciler and state reporter.
216
225
 
217
226
  Never raises any exceptions."""
218
- asyncio.create_task(self._state_reporter.run())
219
227
  await self._state_reconciler.run()
220
228
 
221
229
  async def _http_task_runner_loop(self):
@@ -224,11 +232,15 @@ class Executor:
224
232
  async for task in self._task_fetcher.run():
225
233
  metric_tasks_fetched.inc()
226
234
  if not self._is_shutdown:
227
- asyncio.create_task(self._run_task(task))
235
+ asyncio.create_task(
236
+ self._run_task(task), name="task runner (http mode)"
237
+ )
238
+ self._logger.info("fetching tasks finished, reconnecting in 5 seconds")
228
239
  except Exception as e:
229
240
  self._logger.error(
230
241
  "failed fetching tasks, retrying in 5 seconds", exc_info=e
231
242
  )
243
+ if not self._is_shutdown:
232
244
  await asyncio.sleep(5)
233
245
 
234
246
  async def _run_task(self, task: Task) -> None:
@@ -250,6 +262,7 @@ class Executor:
250
262
  function_name=task.compute_fn,
251
263
  graph_version=task.graph_version,
252
264
  graph_invocation_id=task.invocation_id,
265
+ output_payload_uri_prefix=task.output_payload_uri_prefix,
253
266
  )
254
267
  logger.error("task execution failed", exc_info=e)
255
268
 
@@ -293,17 +306,19 @@ class Executor:
293
306
  graph_name=task.compute_graph,
294
307
  graph_version=task.graph_version,
295
308
  logger=logger,
309
+ data_payload=task.graph_payload,
296
310
  )
297
311
  input: SerializedObject = await self._downloader.download_input(
298
312
  namespace=task.namespace,
299
313
  graph_name=task.compute_graph,
300
314
  graph_invocation_id=task.invocation_id,
301
315
  input_key=task.input_key,
316
+ data_payload=task.input_payload,
302
317
  logger=logger,
303
318
  )
304
319
  init_value: Optional[SerializedObject] = (
305
320
  None
306
- if task.reducer_output_id is None
321
+ if task.reducer_output_id is None and task.reducer_input_payload is None
307
322
  else (
308
323
  await self._downloader.download_init_value(
309
324
  namespace=task.namespace,
@@ -311,6 +326,7 @@ class Executor:
311
326
  function_name=task.compute_fn,
312
327
  graph_invocation_id=task.invocation_id,
313
328
  reducer_output_key=task.reducer_output_id,
329
+ data_payload=task.reducer_input_payload,
314
330
  logger=logger,
315
331
  )
316
332
  )
@@ -380,12 +396,12 @@ class Executor:
380
396
  if self._task_runner is not None:
381
397
  await self._task_runner.shutdown()
382
398
 
383
- if self._channel_manager is not None:
384
- await self._channel_manager.shutdown()
385
399
  if self._state_reporter is not None:
386
400
  await self._state_reporter.shutdown()
387
401
  if self._state_reconciler is not None:
388
402
  await self._state_reconciler.shutdown()
403
+ if self._channel_manager is not None:
404
+ await self._channel_manager.destroy()
389
405
 
390
406
  # We need to shutdown all users of FE states first,
391
407
  # otherwise states might disappear unexpectedly and we might
@@ -397,7 +413,7 @@ class Executor:
397
413
  # The current task is cancelled, the code after this line will not run.
398
414
 
399
415
  def shutdown(self, loop):
400
- loop.create_task(self._shutdown(loop))
416
+ loop.create_task(self._shutdown(loop), name="executor shutdown")
401
417
 
402
418
  def _task_logger(self, task: Task) -> Any:
403
419
  return self._logger.bind(
@@ -318,6 +318,6 @@ async def _initialize_server(
318
318
  except grpc.aio.AioRpcError as e:
319
319
  if e.code() == grpc.StatusCode.DEADLINE_EXCEEDED:
320
320
  raise CustomerError(
321
- f"Customer code timeout {customer_code_timeout_sec} sec expired"
321
+ f"Customer code timeout of {customer_code_timeout_sec:.3f} sec expired"
322
322
  ) from e
323
323
  raise
@@ -54,6 +54,11 @@ class FunctionExecutorStatesContainer:
54
54
 
55
55
  return self._states[id]
56
56
 
57
+ async def get(self, id: str) -> FunctionExecutorState:
58
+ """Get the state with the given ID. Raises Exception if the state does not exist."""
59
+ async with self._lock:
60
+ return self._states[id]
61
+
57
62
  async def __aiter__(self) -> AsyncGenerator[FunctionExecutorState, None]:
58
63
  async with self._lock:
59
64
  for state in self._states.values():
@@ -34,6 +34,8 @@ class FunctionExecutorStatus(Enum):
34
34
  SHUTDOWN = "Shutdown" # Permanent stop state
35
35
 
36
36
 
37
+ # TODO: After removing the HTTP code, simplify state transitions by not allowing
38
+ # an FE to start up after it was destroyed. The gRPC protocol treats FEs as ephemeral and never revives them.
37
39
  def is_status_change_allowed(
38
40
  current_status: FunctionExecutorStatus, new_status: FunctionExecutorStatus
39
41
  ) -> bool:
@@ -107,7 +107,9 @@ class HealthChecker:
107
107
  return
108
108
 
109
109
  self._health_check_failed_callback = callback
110
- self._health_check_loop_task = asyncio.create_task(self._health_check_loop())
110
+ self._health_check_loop_task = asyncio.create_task(
111
+ self._health_check_loop(), name="function executor health checker loop"
112
+ )
111
113
 
112
114
  def stop(self) -> None:
113
115
  """Stops the periodic health checks.
@@ -126,7 +128,10 @@ class HealthChecker:
126
128
  break
127
129
  await asyncio.sleep(HEALTH_CHECK_POLL_PERIOD_SEC)
128
130
 
129
- asyncio.create_task(self._health_check_failed_callback(result))
131
+ asyncio.create_task(
132
+ self._health_check_failed_callback(result),
133
+ name="function executor health check failure callback",
134
+ )
130
135
  self._health_check_loop_task = None
131
136
 
132
137
 
@@ -67,7 +67,8 @@ class InvocationStateClient:
67
67
  self._response_generator()
68
68
  )
69
69
  self._request_loop_task = asyncio.create_task(
70
- self._request_loop(server_requests)
70
+ self._request_loop(server_requests),
71
+ name="graph invocation state client request processing loop",
71
72
  )
72
73
 
73
74
  def add_task_to_invocation_id_entry(self, task_id: str, invocation_id: str) -> None:
@@ -100,7 +101,8 @@ class InvocationStateClient:
100
101
  pass
101
102
  except asyncio.CancelledError:
102
103
  # This async task was cancelled by destroy(). Normal situation too.
103
- pass
104
+ # This exception should not be suppressed, see Python asyncio docs.
105
+ raise
104
106
  except Exception as e:
105
107
  metric_request_read_errors.inc()
106
108
  self._logger.error(
@@ -96,6 +96,7 @@ class SingleTaskRunner:
96
96
  graph_invocation_id=self._task_input.task.invocation_id,
97
97
  stderr=str(e),
98
98
  success=False,
99
+ output_payload_uri_prefix=self._task_input.task.output_payload_uri_prefix,
99
100
  )
100
101
 
101
102
  try:
@@ -311,6 +312,7 @@ def _task_output(task: Task, response: RunTaskResponse) -> TaskOutput:
311
312
  reducer=response.is_reducer,
312
313
  success=response.success,
313
314
  metrics=metrics,
315
+ output_payload_uri_prefix=task.output_payload_uri_prefix,
314
316
  )
315
317
 
316
318
  if response.HasField("function_output"):
@@ -25,6 +25,7 @@ class TaskOutput:
25
25
  function_name: str,
26
26
  graph_version: str,
27
27
  graph_invocation_id: str,
28
+ output_payload_uri_prefix: Optional[str],
28
29
  output_encoding: Optional[str] = None,
29
30
  function_output: Optional[FunctionOutput] = None,
30
31
  router_output: Optional[RouterOutput] = None,
@@ -50,6 +51,7 @@ class TaskOutput:
50
51
  self.is_internal_error = is_internal_error
51
52
  self.metrics = metrics
52
53
  self.output_encoding = output_encoding
54
+ self.output_payload_uri_prefix = output_payload_uri_prefix
53
55
 
54
56
  @classmethod
55
57
  def internal_error(
@@ -60,6 +62,7 @@ class TaskOutput:
60
62
  function_name: str,
61
63
  graph_version: str,
62
64
  graph_invocation_id: str,
65
+ output_payload_uri_prefix: Optional[str],
63
66
  ) -> "TaskOutput":
64
67
  """Creates a TaskOutput for an internal error."""
65
68
  # We are not sharing internal error messages with the customer.
@@ -72,6 +75,7 @@ class TaskOutput:
72
75
  graph_invocation_id=graph_invocation_id,
73
76
  stderr="Platform failed to execute the function.",
74
77
  is_internal_error=True,
78
+ output_payload_uri_prefix=output_payload_uri_prefix,
75
79
  )
76
80
 
77
81
  @classmethod
@@ -83,6 +87,8 @@ class TaskOutput:
83
87
  function_name: str,
84
88
  graph_version: str,
85
89
  graph_invocation_id: str,
90
+ timeout_sec: float,
91
+ output_payload_uri_prefix: Optional[str],
86
92
  ) -> "TaskOutput":
87
93
  """Creates a TaskOutput for a function timeout error."""
88
94
  # Task stdout, stderr is not available.
@@ -93,6 +99,7 @@ class TaskOutput:
93
99
  function_name=function_name,
94
100
  graph_version=graph_version,
95
101
  graph_invocation_id=graph_invocation_id,
96
- stderr="Function execution timed out.",
102
+ stderr=f"Function exceeded its configured timeout of {timeout_sec:.3f} sec.",
97
103
  is_internal_error=False,
104
+ output_payload_uri_prefix=output_payload_uri_prefix,
98
105
  )
@@ -69,6 +69,10 @@ class ChannelManager:
69
69
  certificate_chain=certificate_chain,
70
70
  )
71
71
 
72
+ async def destroy(self):
73
+ if self._channel is not None:
74
+ await self._destroy_locked_channel()
75
+
72
76
  async def get_channel(self) -> grpc.aio.Channel:
73
77
  """Returns a channel to the gRPC server.
74
78
 
@@ -155,6 +159,3 @@ class ChannelManager:
155
159
  except Exception as e:
156
160
  self._logger.error("failed closing channel", exc_info=e)
157
161
  self._channel = None
158
-
159
- async def shutdown(self):
160
- pass