indexify 0.4.12__tar.gz → 0.4.14__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (68) hide show
  1. {indexify-0.4.12 → indexify-0.4.14}/PKG-INFO +2 -2
  2. {indexify-0.4.12 → indexify-0.4.14}/pyproject.toml +2 -2
  3. {indexify-0.4.12 → indexify-0.4.14}/src/indexify/executor/function_executor_controller/create_function_executor.py +10 -15
  4. {indexify-0.4.12 → indexify-0.4.14}/src/indexify/executor/function_executor_controller/function_executor_controller.py +25 -11
  5. {indexify-0.4.12 → indexify-0.4.14}/src/indexify/executor/function_executor_controller/task_output.py +53 -1
  6. {indexify-0.4.12 → indexify-0.4.14}/src/indexify/executor/state_reconciler.py +26 -6
  7. {indexify-0.4.12 → indexify-0.4.14}/README.md +0 -0
  8. {indexify-0.4.12 → indexify-0.4.14}/src/indexify/cli/__init__.py +0 -0
  9. {indexify-0.4.12 → indexify-0.4.14}/src/indexify/cli/build_image.py +0 -0
  10. {indexify-0.4.12 → indexify-0.4.14}/src/indexify/cli/deploy.py +0 -0
  11. {indexify-0.4.12 → indexify-0.4.14}/src/indexify/cli/executor.py +0 -0
  12. {indexify-0.4.12 → indexify-0.4.14}/src/indexify/executor/README.md +0 -0
  13. {indexify-0.4.12 → indexify-0.4.14}/src/indexify/executor/blob_store/blob_store.py +0 -0
  14. {indexify-0.4.12 → indexify-0.4.14}/src/indexify/executor/blob_store/local_fs_blob_store.py +0 -0
  15. {indexify-0.4.12 → indexify-0.4.14}/src/indexify/executor/blob_store/metrics/blob_store.py +0 -0
  16. {indexify-0.4.12 → indexify-0.4.14}/src/indexify/executor/blob_store/s3_blob_store.py +0 -0
  17. {indexify-0.4.12 → indexify-0.4.14}/src/indexify/executor/channel_manager.py +0 -0
  18. {indexify-0.4.12 → indexify-0.4.14}/src/indexify/executor/executor.py +0 -0
  19. {indexify-0.4.12 → indexify-0.4.14}/src/indexify/executor/function_allowlist.py +0 -0
  20. {indexify-0.4.12 → indexify-0.4.14}/src/indexify/executor/function_executor/function_executor.py +0 -0
  21. {indexify-0.4.12 → indexify-0.4.14}/src/indexify/executor/function_executor/health_checker.py +0 -0
  22. {indexify-0.4.12 → indexify-0.4.14}/src/indexify/executor/function_executor/invocation_state_client.py +0 -0
  23. {indexify-0.4.12 → indexify-0.4.14}/src/indexify/executor/function_executor/metrics/function_executor.py +0 -0
  24. {indexify-0.4.12 → indexify-0.4.14}/src/indexify/executor/function_executor/metrics/health_checker.py +0 -0
  25. {indexify-0.4.12 → indexify-0.4.14}/src/indexify/executor/function_executor/metrics/invocation_state_client.py +0 -0
  26. {indexify-0.4.12 → indexify-0.4.14}/src/indexify/executor/function_executor/server/client_configuration.py +0 -0
  27. {indexify-0.4.12 → indexify-0.4.14}/src/indexify/executor/function_executor/server/function_executor_server.py +0 -0
  28. {indexify-0.4.12 → indexify-0.4.14}/src/indexify/executor/function_executor/server/function_executor_server_factory.py +0 -0
  29. {indexify-0.4.12 → indexify-0.4.14}/src/indexify/executor/function_executor/server/subprocess_function_executor_server.py +0 -0
  30. {indexify-0.4.12 → indexify-0.4.14}/src/indexify/executor/function_executor/server/subprocess_function_executor_server_factory.py +0 -0
  31. {indexify-0.4.12 → indexify-0.4.14}/src/indexify/executor/function_executor_controller/__init__.py +0 -0
  32. {indexify-0.4.12 → indexify-0.4.14}/src/indexify/executor/function_executor_controller/completed_task_metrics.py +0 -0
  33. {indexify-0.4.12 → indexify-0.4.14}/src/indexify/executor/function_executor_controller/debug_event_loop.py +0 -0
  34. {indexify-0.4.12 → indexify-0.4.14}/src/indexify/executor/function_executor_controller/downloads.py +0 -0
  35. {indexify-0.4.12 → indexify-0.4.14}/src/indexify/executor/function_executor_controller/events.py +0 -0
  36. {indexify-0.4.12 → indexify-0.4.14}/src/indexify/executor/function_executor_controller/function_executor_startup_output.py +0 -0
  37. {indexify-0.4.12 → indexify-0.4.14}/src/indexify/executor/function_executor_controller/loggers.py +0 -0
  38. {indexify-0.4.12 → indexify-0.4.14}/src/indexify/executor/function_executor_controller/message_validators.py +0 -0
  39. {indexify-0.4.12 → indexify-0.4.14}/src/indexify/executor/function_executor_controller/metrics/completed_task_metrics.py +0 -0
  40. {indexify-0.4.12 → indexify-0.4.14}/src/indexify/executor/function_executor_controller/metrics/downloads.py +0 -0
  41. {indexify-0.4.12 → indexify-0.4.14}/src/indexify/executor/function_executor_controller/metrics/function_executor_controller.py +0 -0
  42. {indexify-0.4.12 → indexify-0.4.14}/src/indexify/executor/function_executor_controller/metrics/run_task.py +0 -0
  43. {indexify-0.4.12 → indexify-0.4.14}/src/indexify/executor/function_executor_controller/metrics/upload_task_output.py +0 -0
  44. {indexify-0.4.12 → indexify-0.4.14}/src/indexify/executor/function_executor_controller/prepare_task.py +0 -0
  45. {indexify-0.4.12 → indexify-0.4.14}/src/indexify/executor/function_executor_controller/run_task.py +0 -0
  46. {indexify-0.4.12 → indexify-0.4.14}/src/indexify/executor/function_executor_controller/task_info.py +0 -0
  47. {indexify-0.4.12 → indexify-0.4.14}/src/indexify/executor/function_executor_controller/terminate_function_executor.py +0 -0
  48. {indexify-0.4.12 → indexify-0.4.14}/src/indexify/executor/function_executor_controller/upload_task_output.py +0 -0
  49. {indexify-0.4.12 → indexify-0.4.14}/src/indexify/executor/host_resources/host_resources.py +0 -0
  50. {indexify-0.4.12 → indexify-0.4.14}/src/indexify/executor/host_resources/nvidia_gpu.py +0 -0
  51. {indexify-0.4.12 → indexify-0.4.14}/src/indexify/executor/host_resources/nvidia_gpu_allocator.py +0 -0
  52. {indexify-0.4.12 → indexify-0.4.14}/src/indexify/executor/metrics/channel_manager.py +0 -0
  53. {indexify-0.4.12 → indexify-0.4.14}/src/indexify/executor/metrics/executor.py +0 -0
  54. {indexify-0.4.12 → indexify-0.4.14}/src/indexify/executor/metrics/state_reconciler.py +0 -0
  55. {indexify-0.4.12 → indexify-0.4.14}/src/indexify/executor/metrics/state_reporter.py +0 -0
  56. {indexify-0.4.12 → indexify-0.4.14}/src/indexify/executor/monitoring/handler.py +0 -0
  57. {indexify-0.4.12 → indexify-0.4.14}/src/indexify/executor/monitoring/health_check_handler.py +0 -0
  58. {indexify-0.4.12 → indexify-0.4.14}/src/indexify/executor/monitoring/health_checker/generic_health_checker.py +0 -0
  59. {indexify-0.4.12 → indexify-0.4.14}/src/indexify/executor/monitoring/health_checker/health_checker.py +0 -0
  60. {indexify-0.4.12 → indexify-0.4.14}/src/indexify/executor/monitoring/metrics.py +0 -0
  61. {indexify-0.4.12 → indexify-0.4.14}/src/indexify/executor/monitoring/prometheus_metrics_handler.py +0 -0
  62. {indexify-0.4.12 → indexify-0.4.14}/src/indexify/executor/monitoring/server.py +0 -0
  63. {indexify-0.4.12 → indexify-0.4.14}/src/indexify/executor/monitoring/startup_probe_handler.py +0 -0
  64. {indexify-0.4.12 → indexify-0.4.14}/src/indexify/executor/state_reporter.py +0 -0
  65. {indexify-0.4.12 → indexify-0.4.14}/src/indexify/proto/executor_api.proto +0 -0
  66. {indexify-0.4.12 → indexify-0.4.14}/src/indexify/proto/executor_api_pb2.py +0 -0
  67. {indexify-0.4.12 → indexify-0.4.14}/src/indexify/proto/executor_api_pb2.pyi +0 -0
  68. {indexify-0.4.12 → indexify-0.4.14}/src/indexify/proto/executor_api_pb2_grpc.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: indexify
3
- Version: 0.4.12
3
+ Version: 0.4.14
4
4
  Summary: Open Source Indexify components and helper tools
5
5
  Home-page: https://github.com/tensorlakeai/indexify
6
6
  License: Apache 2.0
@@ -17,7 +17,7 @@ Requires-Dist: aiohttp (>=3.11.0,<4.0.0)
17
17
  Requires-Dist: boto3 (>=1.37.30,<2.0.0)
18
18
  Requires-Dist: prometheus-client (>=0.21.1,<0.22.0)
19
19
  Requires-Dist: psutil (>=7.0.0,<8.0.0)
20
- Requires-Dist: tensorlake (==0.2.8)
20
+ Requires-Dist: tensorlake (==0.2.15)
21
21
  Project-URL: Repository, https://github.com/tensorlakeai/indexify
22
22
  Description-Content-Type: text/markdown
23
23
 
@@ -1,7 +1,7 @@
1
1
  [tool.poetry]
2
2
  name = "indexify"
3
3
  # Incremented if any of the components provided in this package are updated.
4
- version = "0.4.12"
4
+ version = "0.4.14"
5
5
  description = "Open Source Indexify components and helper tools"
6
6
  authors = ["Tensorlake Inc. <support@tensorlake.ai>"]
7
7
  license = "Apache 2.0"
@@ -25,7 +25,7 @@ prometheus-client = "^0.21.1"
25
25
  psutil = "^7.0.0"
26
26
  # Adds function-executor binary, utils lib, sdk used in indexify-cli commands.
27
27
  # We need to specify the tensorlake version exactly because pip install doesn't respect poetry.lock files.
28
- tensorlake = "0.2.8"
28
+ tensorlake = "0.2.15"
29
29
  # Uncomment the next line to use local tensorlake package (only for development!)
30
30
  # tensorlake = { path = "../tensorlake", develop = true }
31
31
  # pydantic is provided by tensorlake
@@ -5,7 +5,6 @@ from typing import Any, Optional, Tuple
5
5
  from tensorlake.function_executor.proto.function_executor_pb2 import (
6
6
  InitializeRequest,
7
7
  SerializedObject,
8
- SerializedObjectEncoding,
9
8
  )
10
9
 
11
10
  from indexify.executor.blob_store.blob_store import BLOBStore
@@ -73,25 +72,21 @@ async def create_function_executor(
73
72
  except BaseException as e:
74
73
  if isinstance(e, asyncio.CancelledError):
75
74
  logger.info("function executor startup was cancelled")
76
- return FunctionExecutorCreated(
77
- function_executor=None,
78
- output=FunctionExecutorStartupOutput(
79
- function_executor_description=function_executor_description,
80
- termination_reason=FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_REMOVED_FROM_DESIRED_STATE,
81
- ),
82
- )
83
75
  else:
84
76
  logger.error(
85
77
  "failed to create function executor due to platform error",
86
78
  exc_info=e,
87
79
  )
88
- return FunctionExecutorCreated(
89
- function_executor=None,
90
- output=FunctionExecutorStartupOutput(
91
- function_executor_description=function_executor_description,
92
- termination_reason=FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_STARTUP_FAILED_INTERNAL_ERROR,
93
- ),
94
- )
80
+
81
+ # Cancelled FE startup means that Server removed it from desired state so it doesn't matter what termination_reason we return
82
+ # in this case because this FE will be removed from Executor reported state.
83
+ return FunctionExecutorCreated(
84
+ function_executor=None,
85
+ output=FunctionExecutorStartupOutput(
86
+ function_executor_description=function_executor_description,
87
+ termination_reason=FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_STARTUP_FAILED_INTERNAL_ERROR,
88
+ ),
89
+ )
95
90
 
96
91
 
97
92
  async def _initialization_result_to_fe_creation_output(
@@ -252,7 +252,7 @@ class FunctionExecutorController:
252
252
  """Shuts down the Function Executor and frees all of its resources.
253
253
 
254
254
  No task outcomes and outputs are getting reported to Server after this call.
255
- Doesn't raise any exceptions. Blocks until the shutdown is complete.
255
+ Doesn't raise any exceptions. Blocks until the shutdown is complete. Idempotent.
256
256
  """
257
257
  self._add_event(ShutdownInitiated(), source="shutdown")
258
258
  try:
@@ -261,10 +261,9 @@ class FunctionExecutorController:
261
261
  pass # Expected exception on shutdown
262
262
  except Exception as e:
263
263
  self._logger.error(
264
- "function executor controller control loop raised unexpected exception",
264
+ "function executor controller control loop task raised unexpected exception",
265
265
  exc_info=e,
266
266
  )
267
- self._logger.info("function executor controller shutdown finished")
268
267
 
269
268
  def _update_internal_state(self, new_state: _FE_CONTROLLER_STATE) -> None:
270
269
  """Updates the internal state of the Function Executor Controller.
@@ -461,12 +460,25 @@ class FunctionExecutorController:
461
460
  if event.function_executor is None:
462
461
  # Server needs to increment attempts counter for all the tasks that were pending while FE was starting up.
463
462
  # This prevents infinite retries if FEs consistently fail to start up.
463
+ # The allocations we mark here also must not use the FE terminated failure reason in their outputs
464
+ # because FE terminated means that the allocation wasn't the cause of the FE termination.
465
+ allocation_ids_caused_termination: List[str] = []
466
+ for task_info in self._tasks.values():
467
+ task_logger = task_allocation_logger(task_info.allocation, self._logger)
468
+ task_logger.info(
469
+ "marking allocation failed on function executor startup failure"
470
+ )
471
+ allocation_ids_caused_termination.append(
472
+ task_info.allocation.allocation_id
473
+ )
474
+ task_info.output = TaskOutput.function_executor_startup_failed(
475
+ allocation=task_info.allocation,
476
+ fe_startup_output=event.output,
477
+ logger=task_logger,
478
+ )
464
479
  self._start_termination(
465
480
  fe_termination_reason=event.output.termination_reason,
466
- allocation_ids_caused_termination=[
467
- task_info.allocation.allocation_id
468
- for task_info in self._tasks.values()
469
- ],
481
+ allocation_ids_caused_termination=allocation_ids_caused_termination,
470
482
  )
471
483
  return
472
484
 
@@ -593,9 +605,11 @@ class FunctionExecutorController:
593
605
  _FE_CONTROLLER_STATE.TERMINATING,
594
606
  _FE_CONTROLLER_STATE.TERMINATED,
595
607
  ]:
596
- task_info.output = TaskOutput.function_executor_terminated(
597
- task_info.allocation
598
- )
608
+ if task_info.output is None:
609
+ # The output can be set already by FE startup failure handler.
610
+ task_info.output = TaskOutput.function_executor_terminated(
611
+ task_info.allocation
612
+ )
599
613
  self._start_task_output_upload(task_info)
600
614
  elif self._internal_state == _FE_CONTROLLER_STATE.RUNNING:
601
615
  self._running_task = task_info
@@ -787,7 +801,7 @@ class FunctionExecutorController:
787
801
  self._state_reporter.remove_function_executor_state(self.function_executor_id())
788
802
  self._state_reporter.schedule_state_report()
789
803
 
790
- self._logger.info("function executor controller control loop finished")
804
+ self._logger.info("function executor controller shutdown finished")
791
805
  debug_print_events(events=self._events, logger=self._logger)
792
806
 
793
807
 
@@ -1,4 +1,4 @@
1
- from typing import Dict, List, Optional
1
+ from typing import Any, Dict, List, Optional
2
2
 
3
3
  from tensorlake.function_executor.proto.function_executor_pb2 import (
4
4
  SerializedObject,
@@ -6,11 +6,14 @@ from tensorlake.function_executor.proto.function_executor_pb2 import (
6
6
 
7
7
  from indexify.proto.executor_api_pb2 import (
8
8
  DataPayload,
9
+ FunctionExecutorTerminationReason,
9
10
  TaskAllocation,
10
11
  TaskFailureReason,
11
12
  TaskOutcomeCode,
12
13
  )
13
14
 
15
+ from .function_executor_startup_output import FunctionExecutorStartupOutput
16
+
14
17
 
15
18
  class TaskMetrics:
16
19
  """Metrics for a task."""
@@ -105,3 +108,52 @@ class TaskOutput:
105
108
  outcome_code=TaskOutcomeCode.TASK_OUTCOME_CODE_FAILURE,
106
109
  failure_reason=TaskFailureReason.TASK_FAILURE_REASON_FUNCTION_EXECUTOR_TERMINATED,
107
110
  )
111
+
112
+ @classmethod
113
+ def function_executor_startup_failed(
114
+ cls,
115
+ allocation: TaskAllocation,
116
+ fe_startup_output: FunctionExecutorStartupOutput,
117
+ logger: Any,
118
+ ) -> "TaskOutput":
119
+ """Creates a TaskOutput for the case when we fail a task because its FE startup failed."""
120
+ output = TaskOutput(
121
+ allocation=allocation,
122
+ outcome_code=TaskOutcomeCode.TASK_OUTCOME_CODE_FAILURE,
123
+ failure_reason=_fe_startup_failure_reason_to_task_failure_reason(
124
+ fe_startup_output.termination_reason, logger
125
+ ),
126
+ )
127
+ # Use FE startup stdout, stderr for allocations that we failed because FE startup failed.
128
+ output.uploaded_stdout = fe_startup_output.stdout
129
+ output.uploaded_stderr = fe_startup_output.stderr
130
+ return output
131
+
132
+
133
+ def _fe_startup_failure_reason_to_task_failure_reason(
134
+ fe_termination_reason: FunctionExecutorTerminationReason, logger: Any
135
+ ) -> TaskFailureReason:
136
+ # Only need to check FE termination reasons happening on FE startup.
137
+ if (
138
+ fe_termination_reason
139
+ == FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_STARTUP_FAILED_FUNCTION_ERROR
140
+ ):
141
+ return TaskFailureReason.TASK_FAILURE_REASON_FUNCTION_ERROR
142
+ elif (
143
+ fe_termination_reason
144
+ == FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_STARTUP_FAILED_FUNCTION_TIMEOUT
145
+ ):
146
+ return TaskFailureReason.TASK_FAILURE_REASON_FUNCTION_TIMEOUT
147
+ elif (
148
+ fe_termination_reason
149
+ == FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_STARTUP_FAILED_INTERNAL_ERROR
150
+ ):
151
+ return TaskFailureReason.TASK_FAILURE_REASON_INTERNAL_ERROR
152
+ else:
153
+ logger.error(
154
+ "unexpected function executor startup failure reason",
155
+ fe_termination_reason=FunctionExecutorTerminationReason.Name(
156
+ fe_termination_reason
157
+ ),
158
+ )
159
+ return TaskFailureReason.TASK_FAILURE_REASON_UNKNOWN
@@ -66,6 +66,7 @@ class ExecutorStateReconciler:
66
66
  self._desired_states_reader_task: Optional[asyncio.Task] = None
67
67
  self._reconciliation_loop_task: Optional[asyncio.Task] = None
68
68
  self._function_executor_controllers: Dict[str, FunctionExecutorController] = {}
69
+ self._shutting_down_fe_ids: Set[str] = set()
69
70
  self._last_server_clock: Optional[int] = None
70
71
 
71
72
  self._last_desired_state_lock = asyncio.Lock()
@@ -320,14 +321,35 @@ class ExecutorStateReconciler:
320
321
  logger.error("failed adding Function Executor", exc_info=e)
321
322
 
322
323
  def _remove_function_executor_controller(self, function_executor_id: str) -> None:
323
- fe_controller: FunctionExecutorController = (
324
- self._function_executor_controllers.pop(function_executor_id)
325
- )
324
+ # Don't remove the FE controller from self._function_executor_controllers until
325
+ # its shutdown is complete. Otherwise, if Server re-adds the FE to desired state
326
+ # before FE shutdown completes then we'll have two FE controllers for the same
327
+ # FE ID which results in many bugs.
328
+ if function_executor_id in self._shutting_down_fe_ids:
329
+ return
330
+
331
+ self._shutting_down_fe_ids.add(function_executor_id)
326
332
  asyncio.create_task(
327
- fe_controller.shutdown(),
333
+ self._shutdown_function_executor_controller(function_executor_id),
328
334
  name=f"Shutdown Function Executor {function_executor_id}",
329
335
  )
330
336
 
337
+ async def _shutdown_function_executor_controller(
338
+ self, function_executor_id: str
339
+ ) -> None:
340
+ # We are not cancelling this aio task in self.shutdown(). Because of this the code here should
341
+ # not fail if the FE controller is not found in internal data structures. It can be removed
342
+ # by self.shutdown() at any time while we're running this aio task.
343
+ fe_controller: Optional[FunctionExecutorController] = (
344
+ self._function_executor_controllers.get(function_executor_id)
345
+ )
346
+ if fe_controller is None:
347
+ return
348
+
349
+ await fe_controller.shutdown()
350
+ self._function_executor_controllers.pop(function_executor_id, None)
351
+ self._shutting_down_fe_ids.discard(function_executor_id)
352
+
331
353
  def _reconcile_tasks(self, task_allocations: Iterable[TaskAllocation]):
332
354
  valid_task_allocations: List[TaskAllocation] = self._valid_task_allocations(
333
355
  task_allocations
@@ -393,8 +415,6 @@ class ExecutorStateReconciler:
393
415
  task_allocation.function_executor_id
394
416
  not in self._function_executor_controllers
395
417
  ):
396
- # Current policy: don't report task outcomes for tasks that didn't run.
397
- # This is required to simplify the protocol so Server doesn't need to care about task states.
398
418
  logger.error(
399
419
  "received TaskAllocation for a Function Executor that doesn't exist, dropping it from desired state"
400
420
  )
File without changes