indexify 0.3.27__tar.gz → 0.3.29__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69)
  1. {indexify-0.3.27 → indexify-0.3.29}/PKG-INFO +2 -1
  2. {indexify-0.3.27 → indexify-0.3.29}/pyproject.toml +2 -1
  3. {indexify-0.3.27 → indexify-0.3.29}/src/indexify/cli/cli.py +23 -12
  4. {indexify-0.3.27 → indexify-0.3.29}/src/indexify/executor/function_executor/function_executor_state.py +1 -5
  5. {indexify-0.3.27 → indexify-0.3.29}/src/indexify/executor/function_executor/function_executor_states_container.py +4 -0
  6. {indexify-0.3.27 → indexify-0.3.29}/src/indexify/executor/function_executor/health_checker.py +3 -1
  7. {indexify-0.3.27 → indexify-0.3.29}/src/indexify/executor/function_executor/task_output.py +1 -1
  8. {indexify-0.3.27 → indexify-0.3.29}/src/indexify/executor/grpc/function_executor_controller.py +10 -4
  9. {indexify-0.3.27 → indexify-0.3.29}/src/indexify/executor/grpc/state_reconciler.py +12 -9
  10. {indexify-0.3.27 → indexify-0.3.29}/src/indexify/executor/grpc/state_reporter.py +36 -14
  11. {indexify-0.3.27 → indexify-0.3.29}/src/indexify/executor/grpc/task_controller.py +2 -1
  12. indexify-0.3.29/src/indexify/executor/host_resources/host_resources.py +104 -0
  13. {indexify-0.3.27 → indexify-0.3.29}/src/indexify/executor/host_resources/nvidia_gpu.py +4 -0
  14. {indexify-0.3.27 → indexify-0.3.29}/src/indexify/executor/task_reporter.py +1 -4
  15. {indexify-0.3.27 → indexify-0.3.29}/src/indexify/proto/executor_api.proto +6 -7
  16. indexify-0.3.29/src/indexify/proto/executor_api_pb2.py +86 -0
  17. {indexify-0.3.27 → indexify-0.3.29}/src/indexify/proto/executor_api_pb2.pyi +9 -8
  18. indexify-0.3.27/src/indexify/executor/host_resources/host_resources.py +0 -50
  19. indexify-0.3.27/src/indexify/proto/executor_api_pb2.py +0 -86
  20. {indexify-0.3.27 → indexify-0.3.29}/README.md +0 -0
  21. {indexify-0.3.27 → indexify-0.3.29}/src/indexify/executor/README.md +0 -0
  22. {indexify-0.3.27 → indexify-0.3.29}/src/indexify/executor/api_objects.py +0 -0
  23. {indexify-0.3.27 → indexify-0.3.29}/src/indexify/executor/blob_store/blob_store.py +0 -0
  24. {indexify-0.3.27 → indexify-0.3.29}/src/indexify/executor/blob_store/local_fs_blob_store.py +0 -0
  25. {indexify-0.3.27 → indexify-0.3.29}/src/indexify/executor/blob_store/metrics/blob_store.py +0 -0
  26. {indexify-0.3.27 → indexify-0.3.29}/src/indexify/executor/blob_store/s3_blob_store.py +0 -0
  27. {indexify-0.3.27 → indexify-0.3.29}/src/indexify/executor/downloader.py +0 -0
  28. {indexify-0.3.27 → indexify-0.3.29}/src/indexify/executor/executor.py +0 -0
  29. {indexify-0.3.27 → indexify-0.3.29}/src/indexify/executor/executor_flavor.py +0 -0
  30. {indexify-0.3.27 → indexify-0.3.29}/src/indexify/executor/function_executor/function_executor.py +0 -0
  31. {indexify-0.3.27 → indexify-0.3.29}/src/indexify/executor/function_executor/function_executor_status.py +0 -0
  32. {indexify-0.3.27 → indexify-0.3.29}/src/indexify/executor/function_executor/invocation_state_client.py +0 -0
  33. {indexify-0.3.27 → indexify-0.3.29}/src/indexify/executor/function_executor/metrics/function_executor.py +0 -0
  34. {indexify-0.3.27 → indexify-0.3.29}/src/indexify/executor/function_executor/metrics/function_executor_state.py +0 -0
  35. {indexify-0.3.27 → indexify-0.3.29}/src/indexify/executor/function_executor/metrics/function_executor_state_container.py +0 -0
  36. {indexify-0.3.27 → indexify-0.3.29}/src/indexify/executor/function_executor/metrics/health_checker.py +0 -0
  37. {indexify-0.3.27 → indexify-0.3.29}/src/indexify/executor/function_executor/metrics/invocation_state_client.py +0 -0
  38. {indexify-0.3.27 → indexify-0.3.29}/src/indexify/executor/function_executor/metrics/single_task_runner.py +0 -0
  39. {indexify-0.3.27 → indexify-0.3.29}/src/indexify/executor/function_executor/server/client_configuration.py +0 -0
  40. {indexify-0.3.27 → indexify-0.3.29}/src/indexify/executor/function_executor/server/function_executor_server.py +0 -0
  41. {indexify-0.3.27 → indexify-0.3.29}/src/indexify/executor/function_executor/server/function_executor_server_factory.py +0 -0
  42. {indexify-0.3.27 → indexify-0.3.29}/src/indexify/executor/function_executor/server/subprocess_function_executor_server.py +0 -0
  43. {indexify-0.3.27 → indexify-0.3.29}/src/indexify/executor/function_executor/server/subprocess_function_executor_server_factory.py +0 -0
  44. {indexify-0.3.27 → indexify-0.3.29}/src/indexify/executor/function_executor/single_task_runner.py +0 -0
  45. {indexify-0.3.27 → indexify-0.3.29}/src/indexify/executor/function_executor/task_input.py +0 -0
  46. {indexify-0.3.27 → indexify-0.3.29}/src/indexify/executor/grpc/channel_manager.py +0 -0
  47. {indexify-0.3.27 → indexify-0.3.29}/src/indexify/executor/grpc/metrics/channel_manager.py +0 -0
  48. {indexify-0.3.27 → indexify-0.3.29}/src/indexify/executor/grpc/metrics/state_reconciler.py +0 -0
  49. {indexify-0.3.27 → indexify-0.3.29}/src/indexify/executor/grpc/metrics/state_reporter.py +0 -0
  50. {indexify-0.3.27 → indexify-0.3.29}/src/indexify/executor/grpc/metrics/task_controller.py +0 -0
  51. {indexify-0.3.27 → indexify-0.3.29}/src/indexify/executor/host_resources/nvidia_gpu_allocator.py +0 -0
  52. {indexify-0.3.27 → indexify-0.3.29}/src/indexify/executor/metrics/downloader.py +0 -0
  53. {indexify-0.3.27 → indexify-0.3.29}/src/indexify/executor/metrics/executor.py +0 -0
  54. {indexify-0.3.27 → indexify-0.3.29}/src/indexify/executor/metrics/task_fetcher.py +0 -0
  55. {indexify-0.3.27 → indexify-0.3.29}/src/indexify/executor/metrics/task_reporter.py +0 -0
  56. {indexify-0.3.27 → indexify-0.3.29}/src/indexify/executor/metrics/task_runner.py +0 -0
  57. {indexify-0.3.27 → indexify-0.3.29}/src/indexify/executor/monitoring/function_allowlist.py +0 -0
  58. {indexify-0.3.27 → indexify-0.3.29}/src/indexify/executor/monitoring/handler.py +0 -0
  59. {indexify-0.3.27 → indexify-0.3.29}/src/indexify/executor/monitoring/health_check_handler.py +0 -0
  60. {indexify-0.3.27 → indexify-0.3.29}/src/indexify/executor/monitoring/health_checker/generic_health_checker.py +0 -0
  61. {indexify-0.3.27 → indexify-0.3.29}/src/indexify/executor/monitoring/health_checker/health_checker.py +0 -0
  62. {indexify-0.3.27 → indexify-0.3.29}/src/indexify/executor/monitoring/metrics.py +0 -0
  63. {indexify-0.3.27 → indexify-0.3.29}/src/indexify/executor/monitoring/prometheus_metrics_handler.py +0 -0
  64. {indexify-0.3.27 → indexify-0.3.29}/src/indexify/executor/monitoring/server.py +0 -0
  65. {indexify-0.3.27 → indexify-0.3.29}/src/indexify/executor/monitoring/startup_probe_handler.py +0 -0
  66. {indexify-0.3.27 → indexify-0.3.29}/src/indexify/executor/runtime_probes.py +0 -0
  67. {indexify-0.3.27 → indexify-0.3.29}/src/indexify/executor/task_fetcher.py +0 -0
  68. {indexify-0.3.27 → indexify-0.3.29}/src/indexify/executor/task_runner.py +0 -0
  69. {indexify-0.3.27 → indexify-0.3.29}/src/indexify/proto/executor_api_pb2_grpc.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: indexify
3
- Version: 0.3.27
3
+ Version: 0.3.29
4
4
  Summary: Open Source Indexify components and helper tools
5
5
  Home-page: https://github.com/tensorlakeai/indexify
6
6
  License: Apache 2.0
@@ -17,6 +17,7 @@ Classifier: Programming Language :: Python :: 3.13
17
17
  Requires-Dist: aiohttp (>=3.11.0,<4.0.0)
18
18
  Requires-Dist: boto3 (>=1.37.30,<2.0.0)
19
19
  Requires-Dist: prometheus-client (>=0.21.1,<0.22.0)
20
+ Requires-Dist: psutil (>=7.0.0,<8.0.0)
20
21
  Requires-Dist: rich (>=13.9.2,<14.0.0)
21
22
  Requires-Dist: tensorlake (>=0.1)
22
23
  Requires-Dist: typer (>=0.12,<0.13)
@@ -1,7 +1,7 @@
1
1
  [tool.poetry]
2
2
  name = "indexify"
3
3
  # Incremented if any of the components provided in this packages are updated.
4
- version = "0.3.27"
4
+ version = "0.3.29"
5
5
  description = "Open Source Indexify components and helper tools"
6
6
  authors = ["Tensorlake Inc. <support@tensorlake.ai>"]
7
7
  license = "Apache 2.0"
@@ -22,6 +22,7 @@ python = "^3.9"
22
22
  # Executor only
23
23
  aiohttp = "^3.11.0"
24
24
  prometheus-client = "^0.21.1"
25
+ psutil = "^7.0.0"
25
26
  # Adds function-executor binary and utils lib.
26
27
  tensorlake = ">=0.1"
27
28
  # Uncomment the next line to use local tensorlake package (only for development!)
@@ -69,7 +69,7 @@ def build_image(
69
69
  exec(open(workflow_file_path).read(), globals_dict)
70
70
  except FileNotFoundError as e:
71
71
  raise Exception(
72
- f"Could not find workflow file to execute at: " f"`{workflow_file_path}`"
72
+ f"Could not find workflow file to execute at: `{workflow_file_path}`"
73
73
  )
74
74
  for _, obj in globals_dict.items():
75
75
  if type(obj) and isinstance(obj, Image):
@@ -122,6 +122,15 @@ def executor(
122
122
  help="Port where to run Executor Monitoring server",
123
123
  ),
124
124
  ] = 7000,
125
+ labels: Annotated[
126
+ List[str],
127
+ typer.Option(
128
+ "--label",
129
+ "-l",
130
+ help="Executor key-value label to be sent to the Server. "
131
+ "Specified as <key>=<value>",
132
+ ),
133
+ ] = [],
125
134
  enable_grpc_state_reconciler: Annotated[
126
135
  bool,
127
136
  typer.Option(
@@ -132,18 +141,10 @@ def executor(
132
141
  ),
133
142
  ),
134
143
  ] = False,
135
- labels: Annotated[
136
- List[str],
137
- typer.Option(
138
- "--label",
139
- "-l",
140
- help="Executor key-value label to be sent to the Server. "
141
- "Specified as <key>=<value>",
142
- ),
143
- ] = [],
144
144
  ):
145
145
  if dev:
146
- configure_development_mode_logging()
146
+ compact_tracebacks: bool = os.getenv("INDEXIFY_COMPACT_TRACEBACKS", "1") == "1"
147
+ configure_development_mode_logging(compact_tracebacks=compact_tracebacks)
147
148
  else:
148
149
  configure_production_mode_logging()
149
150
  if function_uris is None:
@@ -200,6 +201,16 @@ def executor(
200
201
  s3=S3BLOBStore(),
201
202
  )
202
203
 
204
+ host_resources_provider: HostResourcesProvider = HostResourcesProvider(
205
+ gpu_allocator=NvidiaGPUAllocator(logger),
206
+ # Assuming a simple setup in OSS where Executor container has a single file system
207
+ # used by all Function Executors and all the container resources are available to all Function Executors.
208
+ function_executors_ephimeral_disks_path="/",
209
+ host_overhead_cpus=0,
210
+ host_overhead_memory_gb=0,
211
+ host_overhead_function_executors_ephimeral_disks_gb=0,
212
+ )
213
+
203
214
  prometheus_client.Info("cli", "CLI information").info(
204
215
  {
205
216
  "package": "indexify",
@@ -226,7 +237,7 @@ def executor(
226
237
  monitoring_server_port=monitoring_server_port,
227
238
  enable_grpc_state_reconciler=enable_grpc_state_reconciler,
228
239
  blob_store=blob_store,
229
- host_resources_provider=HostResourcesProvider(NvidiaGPUAllocator(logger)),
240
+ host_resources_provider=host_resources_provider,
230
241
  ).run()
231
242
 
232
243
 
@@ -49,7 +49,6 @@ class FunctionExecutorState:
49
49
  # TODO: Move graph_version to immutable fields once we migrate to gRPC State Reconciler.
50
50
  self.graph_version: str = graph_version
51
51
  self.status: FunctionExecutorStatus = FunctionExecutorStatus.DESTROYED
52
- self.status_message: str = ""
53
52
  self.status_change_notifier: asyncio.Condition = asyncio.Condition(
54
53
  lock=self.lock
55
54
  )
@@ -65,9 +64,7 @@ class FunctionExecutorState:
65
64
  while self.status not in allowlist:
66
65
  await self.status_change_notifier.wait()
67
66
 
68
- async def set_status(
69
- self, new_status: FunctionExecutorStatus, status_message: str = ""
70
- ) -> None:
67
+ async def set_status(self, new_status: FunctionExecutorStatus) -> None:
71
68
  """Sets the status of the Function Executor.
72
69
 
73
70
  The caller must hold the lock.
@@ -84,7 +81,6 @@ class FunctionExecutorState:
84
81
  metric_function_executors_with_status.labels(status=self.status.name).dec()
85
82
  metric_function_executors_with_status.labels(status=new_status.name).inc()
86
83
  self.status = new_status
87
- self.status_message = status_message
88
84
  self.status_change_notifier.notify_all()
89
85
  else:
90
86
  raise ValueError(
@@ -71,6 +71,10 @@ class FunctionExecutorStatesContainer:
71
71
  metric_function_executor_states_count.set(len(self._states))
72
72
  return state
73
73
 
74
+ def exists(self, id: str) -> bool:
75
+ """Check if the state with the given ID exists."""
76
+ return id in self._states
77
+
74
78
  async def shutdown(self):
75
79
  # Function Executors are outside the Executor process
76
80
  # so they need to get cleaned up explicitly and reliably.
@@ -19,7 +19,9 @@ from .metrics.health_checker import (
19
19
  )
20
20
  from .server.client_configuration import HEALTH_CHECK_TIMEOUT_SEC
21
21
 
22
- HEALTH_CHECK_POLL_PERIOD_SEC = 10
22
+ # Use lowest feasible value for now to detect FE crashes quickly because
23
+ # we're only doing periodic health checks now.
24
+ HEALTH_CHECK_POLL_PERIOD_SEC = 5
23
25
 
24
26
 
25
27
  class HealthCheckResult:
@@ -99,7 +99,7 @@ class TaskOutput:
99
99
  function_name=function_name,
100
100
  graph_version=graph_version,
101
101
  graph_invocation_id=graph_invocation_id,
102
- stderr=f"Function exceeded its configured timeout of {timeout_sec:.3f} sec.",
102
+ stderr=f"Function or router exceeded its configured timeout of {timeout_sec:.3f} sec.",
103
103
  is_internal_error=False,
104
104
  output_payload_uri_prefix=output_payload_uri_prefix,
105
105
  )
@@ -250,7 +250,6 @@ class FunctionExecutorController:
250
250
  )
251
251
 
252
252
  next_status: FunctionExecutorStatus = FunctionExecutorStatus.IDLE
253
- next_status_message: str = ""
254
253
  async with _UnlockedLockContextManager(self._function_executor_state.lock):
255
254
  try:
256
255
  function_executor: FunctionExecutor = await _create_function_executor(
@@ -264,13 +263,20 @@ class FunctionExecutorController:
264
263
  )
265
264
  except CustomerError as e:
266
265
  next_status = FunctionExecutorStatus.STARTUP_FAILED_CUSTOMER_ERROR
267
- next_status_message = str(e)
266
+ # TODO: Save stdout and stderr of customer code that ran during FE creation into BLOBs and uncomment the corresponding tests.
267
+ self._logger.error(
268
+ "failed to create function executor due to error in customer code",
269
+ exc_info=e,
270
+ )
268
271
  except Exception as e:
269
272
  next_status = FunctionExecutorStatus.STARTUP_FAILED_PLATFORM_ERROR
270
- self._logger.error("failed to create function executor", exc_info=e)
273
+ self._logger.error(
274
+ "failed to create function executor due to platform error",
275
+ exc_info=e,
276
+ )
271
277
 
272
278
  # FE state lock is acquired again at this point.
273
- await self._function_executor_state.set_status(next_status, next_status_message)
279
+ await self._function_executor_state.set_status(next_status)
274
280
 
275
281
  if next_status == FunctionExecutorStatus.IDLE:
276
282
  # Task controllers will notice that this FE is IDLE and start running on it one by one.
@@ -137,14 +137,16 @@ class ExecutorStateReconciler:
137
137
  )
138
138
  continue
139
139
 
140
- if self._last_server_clock is not None:
141
- if self._last_server_clock >= new_state.clock:
142
- self._logger.warning(
143
- "received outdated DesiredExecutorState from Server, ignoring",
144
- current_clock=self._last_server_clock,
145
- ignored_clock=new_state.clock,
146
- )
147
- continue # Duplicate or outdated message state sent by Server.
140
+ # TODO: The clock is only incremented when function executors have actionable changes and not on new allocations.
141
+ # Therefore the clock cannot currently be used as an idempotency token.
142
+ # if self._last_server_clock is not None:
143
+ # if self._last_server_clock >= new_state.clock:
144
+ # self._logger.warning(
145
+ # "received outdated DesiredExecutorState from Server, ignoring",
146
+ # current_clock=self._last_server_clock,
147
+ # ignored_clock=new_state.clock,
148
+ # )
149
+ # continue # Duplicate or outdated message state sent by Server.
148
150
 
149
151
  self._last_server_clock = new_state.clock
150
152
  # Always read the latest desired state value from the stream so
@@ -272,7 +274,8 @@ class ExecutorStateReconciler:
272
274
 
273
275
  Doesn't block on any long running operations. Doesn't raise any exceptions.
274
276
  """
275
- if function_executor_description.id not in self._function_executor_controllers:
277
+
278
+ if not self._function_executor_states.exists(function_executor_description.id):
276
279
  await self._create_function_executor(function_executor_description)
277
280
 
278
281
  async def _create_function_executor(
@@ -73,11 +73,11 @@ class ExecutorStateReporter:
73
73
  function_executor_states
74
74
  )
75
75
  self._channel_manager = channel_manager
76
+ self._host_resources_provider: HostResourcesProvider = host_resources_provider
76
77
  self._logger: Any = logger.bind(module=__name__)
77
78
  self._reporting_interval_sec: int = reporting_interval_sec
78
- self._total_host_resources: HostResourcesProto = _host_resources_to_proto(
79
- host_resources_provider.total_resources(logger)
80
- )
79
+ self._total_host_resources: Optional[HostResourcesProto] = None
80
+ self._total_function_executor_resources: Optional[HostResourcesProto] = None
81
81
 
82
82
  self._is_shutdown: bool = False
83
83
  self._executor_status: ExecutorStatus = ExecutorStatus.EXECUTOR_STATUS_UNKNOWN
@@ -85,7 +85,9 @@ class ExecutorStateReporter:
85
85
  function_allowlist
86
86
  )
87
87
  self._labels.update(_label_values_to_strings(RuntimeProbes().probe().labels))
88
- self._last_server_clock: Optional[int] = None
88
+ self._last_server_clock: int = (
89
+ 0 # Server expects initial value to be 0 until it is set by Server.
90
+ )
89
91
 
90
92
  def update_executor_status(self, value: ExecutorStatus):
91
93
  self._executor_status = value
@@ -98,7 +100,7 @@ class ExecutorStateReporter:
98
100
 
99
101
  Never raises any exceptions.
100
102
  """
101
- # TODO: Move this into a new async task and cancel it in shutdown().
103
+ # TODO: Move this method into a new async task and cancel it in shutdown().
102
104
  while not self._is_shutdown:
103
105
  stub = ExecutorAPIStub(await self._channel_manager.get_channel())
104
106
  while not self._is_shutdown:
@@ -111,19 +113,39 @@ class ExecutorStateReporter:
111
113
  await asyncio.sleep(self._reporting_interval_sec)
112
114
  except Exception as e:
113
115
  self._logger.error(
114
- f"Failed to report state to the server, reconnecting in {_REPORT_BACKOFF_ON_ERROR_SEC} sec.",
116
+ f"failed to report state to the server, reconnecting in {_REPORT_BACKOFF_ON_ERROR_SEC} sec.",
115
117
  exc_info=e,
116
118
  )
117
119
  await asyncio.sleep(_REPORT_BACKOFF_ON_ERROR_SEC)
118
120
  break
119
121
 
120
- self._logger.info("State reporter shutdown")
122
+ self._logger.info("state reporter shutdown")
121
123
 
122
124
  async def report_state(self, stub: ExecutorAPIStub):
123
125
  """Reports the current state to the server represented by the supplied stub.
124
126
 
125
127
  Raises exceptions on failure.
126
128
  """
129
+ if self._total_host_resources is None:
130
+ # We need to fetch total resources only once, because they are not changing.
131
+ total_host_resources: HostResources = (
132
+ await self._host_resources_provider.total_host_resources(self._logger)
133
+ )
134
+ total_function_executor_resources: HostResources = (
135
+ await self._host_resources_provider.total_function_executor_resources(
136
+ self._logger
137
+ )
138
+ )
139
+ self._logger.info(
140
+ "detected host resources",
141
+ total_host_resources=total_host_resources,
142
+ total_function_executor_resources=total_function_executor_resources,
143
+ )
144
+ self._total_host_resources = _host_resources_to_proto(total_host_resources)
145
+ self._total_function_executor_resources = _host_resources_to_proto(
146
+ total_function_executor_resources
147
+ )
148
+
127
149
  with (
128
150
  metric_state_report_errors.count_exceptions(),
129
151
  metric_state_report_latency.time(),
@@ -136,16 +158,15 @@ class ExecutorStateReporter:
136
158
  flavor=_to_grpc_executor_flavor(self._flavor, self._logger),
137
159
  version=self._version,
138
160
  status=self._executor_status,
139
- # Server requires free_resources to be set but ignores its value for now.
140
- free_resources=self._total_host_resources,
161
+ total_function_executor_resources=self._total_function_executor_resources,
141
162
  total_resources=self._total_host_resources,
142
163
  allowed_functions=self._allowed_functions,
143
164
  function_executor_states=await self._fetch_function_executor_states(),
144
165
  labels=self._labels,
145
166
  )
146
167
  state.state_hash = _state_hash(state)
147
- if self._last_server_clock is not None:
148
- state.server_clock = self._last_server_clock
168
+ # Set fields not included in the state hash.
169
+ state.server_clock = self._last_server_clock
149
170
 
150
171
  await stub.report_executor_state(
151
172
  ReportExecutorStateRequest(executor_state=state),
@@ -176,7 +197,6 @@ class ExecutorStateReporter:
176
197
  status=_to_grpc_function_executor_status(
177
198
  function_executor_state.status, self._logger
178
199
  ),
179
- status_message=function_executor_state.status_message,
180
200
  )
181
201
  if function_executor_state.image_uri:
182
202
  function_executor_state_proto.description.image_uri = (
@@ -227,7 +247,7 @@ def _to_grpc_function_executor_status(
227
247
  )
228
248
 
229
249
  if result == FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_UNKNOWN:
230
- logger.error("Unexpected Function Executor status", status=status)
250
+ logger.error("unexpected Function Executor status", status=status)
231
251
 
232
252
  return result
233
253
 
@@ -246,7 +266,7 @@ def _to_grpc_executor_flavor(
246
266
  )
247
267
 
248
268
  if result == ExecutorFlavorProto.EXECUTOR_FLAVOR_UNKNOWN:
249
- logger.error("Unexpected Executor flavor", flavor=flavor)
269
+ logger.error("unexpected Executor flavor", flavor=flavor)
250
270
 
251
271
  return result
252
272
 
@@ -287,5 +307,7 @@ def _gpu_model_to_proto(gpu_model: NVIDIA_GPU_MODEL) -> GPUModelProto:
287
307
  return GPUModelProto.GPU_MODEL_NVIDIA_A100_80GB
288
308
  elif gpu_model == NVIDIA_GPU_MODEL.H100_80GB:
289
309
  return GPUModelProto.GPU_MODEL_NVIDIA_H100_80GB
310
+ elif gpu_model == NVIDIA_GPU_MODEL.TESLA_T4:
311
+ return GPUModelProto.GPU_MODEL_NVIDIA_TESLA_T4
290
312
  else:
291
313
  return GPUModelProto.GPU_MODEL_UNKNOWN
@@ -203,7 +203,7 @@ class TaskController:
203
203
  reducer_output_key=(
204
204
  self._task.reducer_output_key
205
205
  if self._task.HasField("reducer_output_key")
206
- else ""
206
+ else None
207
207
  ),
208
208
  data_payload=(
209
209
  self._task.reducer_input
@@ -233,6 +233,7 @@ class TaskController:
233
233
  # and no other tasks run on this FE because it'd result in undefined behavior.
234
234
  if self._is_timed_out:
235
235
  next_status = FunctionExecutorStatus.UNHEALTHY
236
+ # TODO: When task controller is removed do FE health check here to stop scheduling tasks on unhealthy FE asap.
236
237
  await self._release_function_executor(next_status=next_status)
237
238
 
238
239
  async def _acquire_function_executor(self) -> None:
@@ -0,0 +1,104 @@
1
+ import asyncio
2
+ from typing import Any, List, Optional
3
+
4
+ import psutil
5
+ from pydantic import BaseModel
6
+
7
+ from .nvidia_gpu import NvidiaGPUInfo
8
+ from .nvidia_gpu_allocator import NvidiaGPUAllocator
9
+
10
+
11
+ class HostResources(BaseModel):
12
+ cpu_count: int
13
+ memory_mb: int
14
+ disk_mb: int
15
+ gpus: List[NvidiaGPUInfo]
16
+
17
+
18
+ class HostResourcesProvider:
19
+ """
20
+ HostResourcesProvider is a class that provides information about the host resources.
21
+ """
22
+
23
+ def __init__(
24
+ self,
25
+ gpu_allocator: NvidiaGPUAllocator,
26
+ function_executors_ephimeral_disks_path: str,
27
+ host_overhead_cpus: int,
28
+ host_overhead_memory_gb: int,
29
+ host_overhead_function_executors_ephimeral_disks_gb: int,
30
+ ):
31
+ """Creates a HostResourcesProvider.
32
+
33
+ Args:
34
+ gpu_allocator: The GPU allocator to use for GPU information.
35
+ function_executors_ephimeral_disks_path: The path to file system used as ephimeral disk space by Function Executors.
36
+ host_overhead_cpus: The number of CPUs reserved for use by host (can't be used by Function Executors).
37
+ host_overhead_memory_gb: The amount of memory reserved for use by host (can't be used by Function Executors).
38
+ host_overhead_function_executors_ephimeral_disks_gb: The amount of ephimeral disk space reserved for use by host (can't be used by Function Executors).
39
+ """
40
+ self._gpu_allocator: NvidiaGPUAllocator = gpu_allocator
41
+ self._function_executors_ephimeral_disks_path: str = (
42
+ function_executors_ephimeral_disks_path
43
+ )
44
+ self._host_overhead_cpus: int = host_overhead_cpus
45
+ self._host_overhead_memory_gb: int = host_overhead_memory_gb
46
+ self._host_overhead_function_executors_ephimeral_disks_gb: int = (
47
+ host_overhead_function_executors_ephimeral_disks_gb
48
+ )
49
+
50
+ async def total_host_resources(self, logger: Any) -> HostResources:
51
+ """Returns all hardware resources that exist at the host.
52
+
53
+ Raises Exception on error.
54
+ """
55
+ # Run psutil library calls in a separate thread to not block the event loop.
56
+ return await asyncio.to_thread(self._total_host_resources, logger=logger)
57
+
58
+ async def total_function_executor_resources(self, logger: Any) -> HostResources:
59
+ """Returns all hardware resources on the host that are usable by Function Executors.
60
+
61
+ Raises Exception on error.
62
+ """
63
+ total_resources: HostResources = await self.total_host_resources(logger=logger)
64
+ return HostResources(
65
+ cpu_count=max(0, total_resources.cpu_count - self._host_overhead_cpus),
66
+ memory_mb=max(
67
+ 0, total_resources.memory_mb - self._host_overhead_memory_gb * 1024
68
+ ),
69
+ disk_mb=max(
70
+ 0,
71
+ total_resources.disk_mb
72
+ - self._host_overhead_function_executors_ephimeral_disks_gb * 1024,
73
+ ),
74
+ gpus=total_resources.gpus,
75
+ )
76
+
77
+ def _total_host_resources(self, logger: Any) -> HostResources:
78
+ logger = logger.bind(module=__name__)
79
+
80
+ # If users disable Hyper-Threading in OS then we'd only see physical cores here.
81
+ # This allows users to control if logical or physical cores are used for resource
82
+ # reporting and for running the functions.
83
+ cpu_count: Optional[int] = psutil.cpu_count(logical=True)
84
+ if cpu_count is None:
85
+ logger.warning(
86
+ "Unable to determine CPU count. Defaulting to 0.",
87
+ cpu_count=cpu_count,
88
+ )
89
+ cpu_count = 0
90
+
91
+ memory_mb: int = int(psutil.virtual_memory().total / 1024 / 1024)
92
+ disk_mb = int(
93
+ psutil.disk_usage(self._function_executors_ephimeral_disks_path).total
94
+ / 1024
95
+ / 1024
96
+ )
97
+ all_gpus: List[NvidiaGPUInfo] = self._gpu_allocator.list_all()
98
+
99
+ return HostResources(
100
+ cpu_count=cpu_count,
101
+ memory_mb=memory_mb,
102
+ disk_mb=disk_mb,
103
+ gpus=all_gpus,
104
+ )
@@ -11,6 +11,7 @@ class NVIDIA_GPU_MODEL(str, Enum):
11
11
  A100_40GB = "A100-40GB"
12
12
  A100_80GB = "A100-80GB"
13
13
  H100_80GB = "H100"
14
+ TESLA_T4 = "T4"
14
15
 
15
16
 
16
17
  class NvidiaGPUInfo(BaseModel):
@@ -52,6 +53,7 @@ def fetch_nvidia_gpu_infos(logger: Any) -> List[NvidiaGPUInfo]:
52
53
  # 0, NVIDIA A100-SXM4-80GB, GPU-89fdc1e1-18b2-f499-c12b-82bcb9bfb3fa
53
54
  # 1, NVIDIA A100-PCIE-40GB, GPU-e9c9aa65-bff3-405a-ab7c-dc879cc88169
54
55
  # 2, NVIDIA H100 80GB HBM3, GPU-8c35f4c9-4dff-c9a2-866f-afb5d82e1dd7
56
+ # 3, Tesla T4, GPU-2a7fadae-a692-1c44-2c57-6645a0d117e4
55
57
  parts = line.split(",")
56
58
  index = parts[0].strip()
57
59
  product_name = parts[1].strip()
@@ -64,6 +66,8 @@ def fetch_nvidia_gpu_infos(logger: Any) -> List[NvidiaGPUInfo]:
64
66
  model = NVIDIA_GPU_MODEL.A100_40GB
65
67
  elif product_name.startswith("NVIDIA H100"):
66
68
  model = NVIDIA_GPU_MODEL.H100_80GB
69
+ elif product_name.startswith("Tesla T4"):
70
+ model = NVIDIA_GPU_MODEL.TESLA_T4
67
71
  else:
68
72
  logger.warning(
69
73
  "Unknown GPU model was detected, ignoring", nvidia_smi_output=line
@@ -323,15 +323,12 @@ class TaskReporter:
323
323
  invocation_id=output.graph_invocation_id,
324
324
  executor_id=self._executor_id,
325
325
  task_id=output.task_id,
326
+ reducer=output.reducer,
326
327
  )
327
328
  output_files: List[Any] = []
328
- if output is None:
329
- return task_result, output_files
330
-
331
329
  task_result.outcome = (
332
330
  TASK_OUTCOME_SUCCESS if output.success else TASK_OUTCOME_FAILURE
333
331
  )
334
- task_result.reducer = output.reducer
335
332
 
336
333
  _process_function_output(
337
334
  function_output=output.function_output, output_files=output_files
@@ -32,6 +32,7 @@ enum GPUModel {
32
32
  GPU_MODEL_NVIDIA_A100_40GB = 1;
33
33
  GPU_MODEL_NVIDIA_A100_80GB = 2;
34
34
  GPU_MODEL_NVIDIA_H100_80GB = 3;
35
+ GPU_MODEL_NVIDIA_TESLA_T4 = 4;
35
36
  }
36
37
 
37
38
  // Free GPUs available at the Executor.
@@ -102,10 +103,7 @@ message FunctionExecutorDescription {
102
103
  message FunctionExecutorState {
103
104
  optional FunctionExecutorDescription description = 1;
104
105
  optional FunctionExecutorStatus status = 2;
105
- // Human readable message clarifying the status.
106
- // Currently it contains error message from customer code
107
- // if status is FUNCTION_EXECUTOR_STATUS_STARTUP_FAILED_CUSTOMER_ERROR.
108
- optional string status_message = 3;
106
+ reserved 3;
109
107
  }
110
108
 
111
109
  enum ExecutorStatus {
@@ -130,10 +128,10 @@ message ExecutorState {
130
128
  optional ExecutorFlavor flavor = 4;
131
129
  optional string version = 5;
132
130
  optional ExecutorStatus status = 6;
133
- // Total resources available at the Executor.
131
+ // Total resources at the Executor.
134
132
  optional HostResources total_resources = 13;
135
- // Free resources available at the Executor.
136
- optional HostResources free_resources = 7; // Not used right now.
133
+ // Total resources usable by Function Executors.
134
+ optional HostResources total_function_executor_resources = 7;
137
135
  // Empty allowed_functions list means that any function can run on the Executor.
138
136
  repeated AllowedFunction allowed_functions = 8;
139
137
  repeated FunctionExecutorState function_executor_states = 9;
@@ -141,6 +139,7 @@ message ExecutorState {
141
139
  optional string state_hash = 11;
142
140
  // Server supplied clock value of the latest desired executor state that was
143
141
  // reconciled by Executor. Not included into state_hash.
142
+ // Initial value on Executor startup is 0.
144
143
  optional uint64 server_clock = 12;
145
144
  }
146
145
 
@@ -0,0 +1,86 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Generated by the protocol buffer compiler. DO NOT EDIT!
3
+ # NO CHECKED-IN PROTOBUF GENCODE
4
+ # source: indexify/proto/executor_api.proto
5
+ # Protobuf Python Version: 5.29.0
6
+ """Generated protocol buffer code."""
7
+ from google.protobuf import descriptor as _descriptor
8
+ from google.protobuf import descriptor_pool as _descriptor_pool
9
+ from google.protobuf import runtime_version as _runtime_version
10
+ from google.protobuf import symbol_database as _symbol_database
11
+ from google.protobuf.internal import builder as _builder
12
+
13
+ _runtime_version.ValidateProtobufRuntimeVersion(
14
+ _runtime_version.Domain.PUBLIC, 5, 29, 0, "", "indexify/proto/executor_api.proto"
15
+ )
16
+ # @@protoc_insertion_point(imports)
17
+
18
+ _sym_db = _symbol_database.Default()
19
+
20
+
21
+ DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(
22
+ b'\n!indexify/proto/executor_api.proto\x12\x0f\x65xecutor_api_pb"\x87\x02\n\x0b\x44\x61taPayload\x12\x11\n\x04path\x18\x01 \x01(\tH\x00\x88\x01\x01\x12\x11\n\x04size\x18\x02 \x01(\x04H\x01\x88\x01\x01\x12\x18\n\x0bsha256_hash\x18\x03 \x01(\tH\x02\x88\x01\x01\x12\x10\n\x03uri\x18\x04 \x01(\tH\x03\x88\x01\x01\x12;\n\x08\x65ncoding\x18\x05 \x01(\x0e\x32$.executor_api_pb.DataPayloadEncodingH\x04\x88\x01\x01\x12\x1d\n\x10\x65ncoding_version\x18\x06 \x01(\x04H\x05\x88\x01\x01\x42\x07\n\x05_pathB\x07\n\x05_sizeB\x0e\n\x0c_sha256_hashB\x06\n\x04_uriB\x0b\n\t_encodingB\x13\n\x11_encoding_version"k\n\x0cGPUResources\x12\x12\n\x05\x63ount\x18\x01 \x01(\rH\x00\x88\x01\x01\x12-\n\x05model\x18\x02 \x01(\x0e\x32\x19.executor_api_pb.GPUModelH\x01\x88\x01\x01\x42\x08\n\x06_countB\x08\n\x06_modelJ\x04\x08\x03\x10\x04"\xc2\x01\n\rHostResources\x12\x16\n\tcpu_count\x18\x01 \x01(\rH\x00\x88\x01\x01\x12\x19\n\x0cmemory_bytes\x18\x02 \x01(\x04H\x01\x88\x01\x01\x12\x17\n\ndisk_bytes\x18\x03 \x01(\x04H\x02\x88\x01\x01\x12/\n\x03gpu\x18\x04 \x01(\x0b\x32\x1d.executor_api_pb.GPUResourcesH\x03\x88\x01\x01\x42\x0c\n\n_cpu_countB\x0f\n\r_memory_bytesB\r\n\x0b_disk_bytesB\x06\n\x04_gpu"\xbb\x01\n\x0f\x41llowedFunction\x12\x16\n\tnamespace\x18\x01 \x01(\tH\x00\x88\x01\x01\x12\x17\n\ngraph_name\x18\x02 \x01(\tH\x01\x88\x01\x01\x12\x1a\n\rfunction_name\x18\x03 \x01(\tH\x02\x88\x01\x01\x12\x1a\n\rgraph_version\x18\x04 \x01(\tH\x03\x88\x01\x01\x42\x0c\n\n_namespaceB\r\n\x0b_graph_nameB\x10\n\x0e_function_nameB\x10\n\x0e_graph_version"\xc5\x01\n\x19\x46unctionExecutorResources\x12\x1b\n\x0e\x63pu_ms_per_sec\x18\x01 \x01(\rH\x00\x88\x01\x01\x12\x19\n\x0cmemory_bytes\x18\x02 \x01(\x04H\x01\x88\x01\x01\x12\x17\n\ndisk_bytes\x18\x03 \x01(\x04H\x02\x88\x01\x01\x12\x16\n\tgpu_count\x18\x04 \x01(\rH\x03\x88\x01\x01\x42\x11\n\x0f_cpu_ms_per_secB\x0f\n\r_memory_bytesB\r\n\x0b_disk_bytesB\x0c\n\n_gpu_count"\xbf\x04\n\x1b\x46unctionExecutorDescription\x12\x0f\n\x02id\x18\x01 
\x01(\tH\x00\x88\x01\x01\x12\x16\n\tnamespace\x18\x02 \x01(\tH\x01\x88\x01\x01\x12\x17\n\ngraph_name\x18\x03 \x01(\tH\x02\x88\x01\x01\x12\x1a\n\rgraph_version\x18\x04 \x01(\tH\x03\x88\x01\x01\x12\x1a\n\rfunction_name\x18\x05 \x01(\tH\x04\x88\x01\x01\x12\x16\n\timage_uri\x18\x06 \x01(\tH\x05\x88\x01\x01\x12\x14\n\x0csecret_names\x18\x07 \x03(\t\x12<\n\x0fresource_limits\x18\x08 \x01(\x0b\x32\x1e.executor_api_pb.HostResourcesH\x06\x88\x01\x01\x12%\n\x18\x63ustomer_code_timeout_ms\x18\t \x01(\rH\x07\x88\x01\x01\x12\x30\n\x05graph\x18\n \x01(\x0b\x32\x1c.executor_api_pb.DataPayloadH\x08\x88\x01\x01\x12\x42\n\tresources\x18\x0b \x01(\x0b\x32*.executor_api_pb.FunctionExecutorResourcesH\t\x88\x01\x01\x42\x05\n\x03_idB\x0c\n\n_namespaceB\r\n\x0b_graph_nameB\x10\n\x0e_graph_versionB\x10\n\x0e_function_nameB\x0c\n\n_image_uriB\x12\n\x10_resource_limitsB\x1b\n\x19_customer_code_timeout_msB\x08\n\x06_graphB\x0c\n\n_resources"\xbe\x01\n\x15\x46unctionExecutorState\x12\x46\n\x0b\x64\x65scription\x18\x01 \x01(\x0b\x32,.executor_api_pb.FunctionExecutorDescriptionH\x00\x88\x01\x01\x12<\n\x06status\x18\x02 \x01(\x0e\x32\'.executor_api_pb.FunctionExecutorStatusH\x01\x88\x01\x01\x42\x0e\n\x0c_descriptionB\t\n\x07_statusJ\x04\x08\x03\x10\x04"\xc3\x06\n\rExecutorState\x12\x18\n\x0b\x65xecutor_id\x18\x01 \x01(\tH\x00\x88\x01\x01\x12\x1d\n\x10\x64\x65velopment_mode\x18\x02 \x01(\x08H\x01\x88\x01\x01\x12\x15\n\x08hostname\x18\x03 \x01(\tH\x02\x88\x01\x01\x12\x34\n\x06\x66lavor\x18\x04 \x01(\x0e\x32\x1f.executor_api_pb.ExecutorFlavorH\x03\x88\x01\x01\x12\x14\n\x07version\x18\x05 \x01(\tH\x04\x88\x01\x01\x12\x34\n\x06status\x18\x06 \x01(\x0e\x32\x1f.executor_api_pb.ExecutorStatusH\x05\x88\x01\x01\x12<\n\x0ftotal_resources\x18\r \x01(\x0b\x32\x1e.executor_api_pb.HostResourcesH\x06\x88\x01\x01\x12N\n!total_function_executor_resources\x18\x07 \x01(\x0b\x32\x1e.executor_api_pb.HostResourcesH\x07\x88\x01\x01\x12;\n\x11\x61llowed_functions\x18\x08 \x03(\x0b\x32 
.executor_api_pb.AllowedFunction\x12H\n\x18\x66unction_executor_states\x18\t \x03(\x0b\x32&.executor_api_pb.FunctionExecutorState\x12:\n\x06labels\x18\n \x03(\x0b\x32*.executor_api_pb.ExecutorState.LabelsEntry\x12\x17\n\nstate_hash\x18\x0b \x01(\tH\x08\x88\x01\x01\x12\x19\n\x0cserver_clock\x18\x0c \x01(\x04H\t\x88\x01\x01\x1a-\n\x0bLabelsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\x42\x0e\n\x0c_executor_idB\x13\n\x11_development_modeB\x0b\n\t_hostnameB\t\n\x07_flavorB\n\n\x08_versionB\t\n\x07_statusB\x12\n\x10_total_resourcesB$\n"_total_function_executor_resourcesB\r\n\x0b_state_hashB\x0f\n\r_server_clock"l\n\x1aReportExecutorStateRequest\x12;\n\x0e\x65xecutor_state\x18\x01 \x01(\x0b\x32\x1e.executor_api_pb.ExecutorStateH\x00\x88\x01\x01\x42\x11\n\x0f_executor_state"\x1d\n\x1bReportExecutorStateResponse"\xcf\x01\n\x0fTaskRetryPolicy\x12\x18\n\x0bmax_retries\x18\x01 \x01(\rH\x00\x88\x01\x01\x12\x1d\n\x10initial_delay_ms\x18\x02 \x01(\rH\x01\x88\x01\x01\x12\x19\n\x0cmax_delay_ms\x18\x03 \x01(\rH\x02\x88\x01\x01\x12\x1d\n\x10\x64\x65lay_multiplier\x18\x04 \x01(\rH\x03\x88\x01\x01\x42\x0e\n\x0c_max_retriesB\x13\n\x11_initial_delay_msB\x0f\n\r_max_delay_msB\x13\n\x11_delay_multiplier"\xa4\x05\n\x04Task\x12\x0f\n\x02id\x18\x01 \x01(\tH\x00\x88\x01\x01\x12\x16\n\tnamespace\x18\x02 \x01(\tH\x01\x88\x01\x01\x12\x17\n\ngraph_name\x18\x03 \x01(\tH\x02\x88\x01\x01\x12\x1a\n\rgraph_version\x18\x04 \x01(\tH\x03\x88\x01\x01\x12\x1a\n\rfunction_name\x18\x05 \x01(\tH\x04\x88\x01\x01\x12 \n\x13graph_invocation_id\x18\x06 \x01(\tH\x05\x88\x01\x01\x12\x16\n\tinput_key\x18\x08 \x01(\tH\x06\x88\x01\x01\x12\x1f\n\x12reducer_output_key\x18\t \x01(\tH\x07\x88\x01\x01\x12\x17\n\ntimeout_ms\x18\n \x01(\rH\x08\x88\x01\x01\x12\x30\n\x05input\x18\x0b \x01(\x0b\x32\x1c.executor_api_pb.DataPayloadH\t\x88\x01\x01\x12\x38\n\rreducer_input\x18\x0c \x01(\x0b\x32\x1c.executor_api_pb.DataPayloadH\n\x88\x01\x01\x12&\n\x19output_payload_uri_prefix\x18\r 
\x01(\tH\x0b\x88\x01\x01\x12;\n\x0cretry_policy\x18\x0e \x01(\x0b\x32 .executor_api_pb.TaskRetryPolicyH\x0c\x88\x01\x01\x42\x05\n\x03_idB\x0c\n\n_namespaceB\r\n\x0b_graph_nameB\x10\n\x0e_graph_versionB\x10\n\x0e_function_nameB\x16\n\x14_graph_invocation_idB\x0c\n\n_input_keyB\x15\n\x13_reducer_output_keyB\r\n\x0b_timeout_msB\x08\n\x06_inputB\x10\n\x0e_reducer_inputB\x1c\n\x1a_output_payload_uri_prefixB\x0f\n\r_retry_policy"\x7f\n\x0eTaskAllocation\x12!\n\x14\x66unction_executor_id\x18\x01 \x01(\tH\x00\x88\x01\x01\x12(\n\x04task\x18\x02 \x01(\x0b\x32\x15.executor_api_pb.TaskH\x01\x88\x01\x01\x42\x17\n\x15_function_executor_idB\x07\n\x05_task"K\n\x1fGetDesiredExecutorStatesRequest\x12\x18\n\x0b\x65xecutor_id\x18\x01 \x01(\tH\x00\x88\x01\x01\x42\x0e\n\x0c_executor_id"\xb9\x01\n\x14\x44\x65siredExecutorState\x12H\n\x12\x66unction_executors\x18\x01 \x03(\x0b\x32,.executor_api_pb.FunctionExecutorDescription\x12\x39\n\x10task_allocations\x18\x02 \x03(\x0b\x32\x1f.executor_api_pb.TaskAllocation\x12\x12\n\x05\x63lock\x18\x03 \x01(\x04H\x00\x88\x01\x01\x42\x08\n\x06_clock"\x87\x06\n\x18ReportTaskOutcomeRequest\x12\x14\n\x07task_id\x18\x01 \x01(\tH\x00\x88\x01\x01\x12\x16\n\tnamespace\x18\x02 \x01(\tH\x01\x88\x01\x01\x12\x17\n\ngraph_name\x18\x03 \x01(\tH\x02\x88\x01\x01\x12\x1a\n\rfunction_name\x18\x04 \x01(\tH\x03\x88\x01\x01\x12 \n\x13graph_invocation_id\x18\x06 \x01(\tH\x04\x88\x01\x01\x12\x32\n\x07outcome\x18\x07 \x01(\x0e\x32\x1c.executor_api_pb.TaskOutcomeH\x05\x88\x01\x01\x12\x1a\n\rinvocation_id\x18\x08 \x01(\tH\x06\x88\x01\x01\x12\x18\n\x0b\x65xecutor_id\x18\t \x01(\tH\x07\x88\x01\x01\x12\x14\n\x07reducer\x18\n \x01(\x08H\x08\x88\x01\x01\x12\x16\n\x0enext_functions\x18\x0b \x03(\t\x12\x30\n\nfn_outputs\x18\x0c \x03(\x0b\x32\x1c.executor_api_pb.DataPayload\x12\x31\n\x06stdout\x18\x0e \x01(\x0b\x32\x1c.executor_api_pb.DataPayloadH\t\x88\x01\x01\x12\x31\n\x06stderr\x18\x0f \x01(\x0b\x32\x1c.executor_api_pb.DataPayloadH\n\x88\x01\x01\x12=\n\x0foutput_encoding\x18\r 
\x01(\x0e\x32\x1f.executor_api_pb.OutputEncodingH\x0b\x88\x01\x01\x12$\n\x17output_encoding_version\x18\x05 \x01(\x04H\x0c\x88\x01\x01\x42\n\n\x08_task_idB\x0c\n\n_namespaceB\r\n\x0b_graph_nameB\x10\n\x0e_function_nameB\x16\n\x14_graph_invocation_idB\n\n\x08_outcomeB\x10\n\x0e_invocation_idB\x0e\n\x0c_executor_idB\n\n\x08_reducerB\t\n\x07_stdoutB\t\n\x07_stderrB\x12\n\x10_output_encodingB\x1a\n\x18_output_encoding_version"\x1b\n\x19ReportTaskOutcomeResponse*\xab\x01\n\x13\x44\x61taPayloadEncoding\x12!\n\x1d\x44\x41TA_PAYLOAD_ENCODING_UNKNOWN\x10\x00\x12#\n\x1f\x44\x41TA_PAYLOAD_ENCODING_UTF8_JSON\x10\x01\x12#\n\x1f\x44\x41TA_PAYLOAD_ENCODING_UTF8_TEXT\x10\x02\x12\'\n#DATA_PAYLOAD_ENCODING_BINARY_PICKLE\x10\x03*\xa0\x01\n\x08GPUModel\x12\x15\n\x11GPU_MODEL_UNKNOWN\x10\x00\x12\x1e\n\x1aGPU_MODEL_NVIDIA_A100_40GB\x10\x01\x12\x1e\n\x1aGPU_MODEL_NVIDIA_A100_80GB\x10\x02\x12\x1e\n\x1aGPU_MODEL_NVIDIA_H100_80GB\x10\x03\x12\x1d\n\x19GPU_MODEL_NVIDIA_TESLA_T4\x10\x04*\xca\x03\n\x16\x46unctionExecutorStatus\x12$\n FUNCTION_EXECUTOR_STATUS_UNKNOWN\x10\x00\x12(\n$FUNCTION_EXECUTOR_STATUS_STARTING_UP\x10\x01\x12:\n6FUNCTION_EXECUTOR_STATUS_STARTUP_FAILED_CUSTOMER_ERROR\x10\x02\x12:\n6FUNCTION_EXECUTOR_STATUS_STARTUP_FAILED_PLATFORM_ERROR\x10\x03\x12!\n\x1d\x46UNCTION_EXECUTOR_STATUS_IDLE\x10\x04\x12)\n%FUNCTION_EXECUTOR_STATUS_RUNNING_TASK\x10\x05\x12&\n"FUNCTION_EXECUTOR_STATUS_UNHEALTHY\x10\x06\x12%\n!FUNCTION_EXECUTOR_STATUS_STOPPING\x10\x07\x12$\n 
FUNCTION_EXECUTOR_STATUS_STOPPED\x10\x08\x12%\n!FUNCTION_EXECUTOR_STATUS_SHUTDOWN\x10\t*\xc3\x01\n\x0e\x45xecutorStatus\x12\x1b\n\x17\x45XECUTOR_STATUS_UNKNOWN\x10\x00\x12\x1f\n\x1b\x45XECUTOR_STATUS_STARTING_UP\x10\x01\x12\x1b\n\x17\x45XECUTOR_STATUS_RUNNING\x10\x02\x12\x1b\n\x17\x45XECUTOR_STATUS_DRAINED\x10\x03\x12\x1c\n\x18\x45XECUTOR_STATUS_STOPPING\x10\x04\x12\x1b\n\x17\x45XECUTOR_STATUS_STOPPED\x10\x05*d\n\x0e\x45xecutorFlavor\x12\x1b\n\x17\x45XECUTOR_FLAVOR_UNKNOWN\x10\x00\x12\x17\n\x13\x45XECUTOR_FLAVOR_OSS\x10\x01\x12\x1c\n\x18\x45XECUTOR_FLAVOR_PLATFORM\x10\x02*[\n\x0bTaskOutcome\x12\x18\n\x14TASK_OUTCOME_UNKNOWN\x10\x00\x12\x18\n\x14TASK_OUTCOME_SUCCESS\x10\x01\x12\x18\n\x14TASK_OUTCOME_FAILURE\x10\x02*\x7f\n\x0eOutputEncoding\x12\x1b\n\x17OUTPUT_ENCODING_UNKNOWN\x10\x00\x12\x18\n\x14OUTPUT_ENCODING_JSON\x10\x01\x12\x1a\n\x16OUTPUT_ENCODING_PICKLE\x10\x02\x12\x1a\n\x16OUTPUT_ENCODING_BINARY\x10\x03\x32\xef\x02\n\x0b\x45xecutorAPI\x12t\n\x15report_executor_state\x12+.executor_api_pb.ReportExecutorStateRequest\x1a,.executor_api_pb.ReportExecutorStateResponse"\x00\x12z\n\x1bget_desired_executor_states\x12\x30.executor_api_pb.GetDesiredExecutorStatesRequest\x1a%.executor_api_pb.DesiredExecutorState"\x00\x30\x01\x12n\n\x13report_task_outcome\x12).executor_api_pb.ReportTaskOutcomeRequest\x1a*.executor_api_pb.ReportTaskOutcomeResponse"\x00\x62\x06proto3'
23
+ )
24
+
25
+ _globals = globals()
26
+ _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
27
+ _builder.BuildTopDescriptorsAndMessages(
28
+ DESCRIPTOR, "indexify.proto.executor_api_pb2", _globals
29
+ )
30
+ if not _descriptor._USE_C_DESCRIPTORS:
31
+ DESCRIPTOR._loaded_options = None
32
+ _globals["_EXECUTORSTATE_LABELSENTRY"]._loaded_options = None
33
+ _globals["_EXECUTORSTATE_LABELSENTRY"]._serialized_options = b"8\001"
34
+ _globals["_DATAPAYLOADENCODING"]._serialized_start = 4857
35
+ _globals["_DATAPAYLOADENCODING"]._serialized_end = 5028
36
+ _globals["_GPUMODEL"]._serialized_start = 5031
37
+ _globals["_GPUMODEL"]._serialized_end = 5191
38
+ _globals["_FUNCTIONEXECUTORSTATUS"]._serialized_start = 5194
39
+ _globals["_FUNCTIONEXECUTORSTATUS"]._serialized_end = 5652
40
+ _globals["_EXECUTORSTATUS"]._serialized_start = 5655
41
+ _globals["_EXECUTORSTATUS"]._serialized_end = 5850
42
+ _globals["_EXECUTORFLAVOR"]._serialized_start = 5852
43
+ _globals["_EXECUTORFLAVOR"]._serialized_end = 5952
44
+ _globals["_TASKOUTCOME"]._serialized_start = 5954
45
+ _globals["_TASKOUTCOME"]._serialized_end = 6045
46
+ _globals["_OUTPUTENCODING"]._serialized_start = 6047
47
+ _globals["_OUTPUTENCODING"]._serialized_end = 6174
48
+ _globals["_DATAPAYLOAD"]._serialized_start = 55
49
+ _globals["_DATAPAYLOAD"]._serialized_end = 318
50
+ _globals["_GPURESOURCES"]._serialized_start = 320
51
+ _globals["_GPURESOURCES"]._serialized_end = 427
52
+ _globals["_HOSTRESOURCES"]._serialized_start = 430
53
+ _globals["_HOSTRESOURCES"]._serialized_end = 624
54
+ _globals["_ALLOWEDFUNCTION"]._serialized_start = 627
55
+ _globals["_ALLOWEDFUNCTION"]._serialized_end = 814
56
+ _globals["_FUNCTIONEXECUTORRESOURCES"]._serialized_start = 817
57
+ _globals["_FUNCTIONEXECUTORRESOURCES"]._serialized_end = 1014
58
+ _globals["_FUNCTIONEXECUTORDESCRIPTION"]._serialized_start = 1017
59
+ _globals["_FUNCTIONEXECUTORDESCRIPTION"]._serialized_end = 1592
60
+ _globals["_FUNCTIONEXECUTORSTATE"]._serialized_start = 1595
61
+ _globals["_FUNCTIONEXECUTORSTATE"]._serialized_end = 1785
62
+ _globals["_EXECUTORSTATE"]._serialized_start = 1788
63
+ _globals["_EXECUTORSTATE"]._serialized_end = 2623
64
+ _globals["_EXECUTORSTATE_LABELSENTRY"]._serialized_start = 2404
65
+ _globals["_EXECUTORSTATE_LABELSENTRY"]._serialized_end = 2449
66
+ _globals["_REPORTEXECUTORSTATEREQUEST"]._serialized_start = 2625
67
+ _globals["_REPORTEXECUTORSTATEREQUEST"]._serialized_end = 2733
68
+ _globals["_REPORTEXECUTORSTATERESPONSE"]._serialized_start = 2735
69
+ _globals["_REPORTEXECUTORSTATERESPONSE"]._serialized_end = 2764
70
+ _globals["_TASKRETRYPOLICY"]._serialized_start = 2767
71
+ _globals["_TASKRETRYPOLICY"]._serialized_end = 2974
72
+ _globals["_TASK"]._serialized_start = 2977
73
+ _globals["_TASK"]._serialized_end = 3653
74
+ _globals["_TASKALLOCATION"]._serialized_start = 3655
75
+ _globals["_TASKALLOCATION"]._serialized_end = 3782
76
+ _globals["_GETDESIREDEXECUTORSTATESREQUEST"]._serialized_start = 3784
77
+ _globals["_GETDESIREDEXECUTORSTATESREQUEST"]._serialized_end = 3859
78
+ _globals["_DESIREDEXECUTORSTATE"]._serialized_start = 3862
79
+ _globals["_DESIREDEXECUTORSTATE"]._serialized_end = 4047
80
+ _globals["_REPORTTASKOUTCOMEREQUEST"]._serialized_start = 4050
81
+ _globals["_REPORTTASKOUTCOMEREQUEST"]._serialized_end = 4825
82
+ _globals["_REPORTTASKOUTCOMERESPONSE"]._serialized_start = 4827
83
+ _globals["_REPORTTASKOUTCOMERESPONSE"]._serialized_end = 4854
84
+ _globals["_EXECUTORAPI"]._serialized_start = 6177
85
+ _globals["_EXECUTORAPI"]._serialized_end = 6544
86
+ # @@protoc_insertion_point(module_scope)
@@ -24,6 +24,7 @@ class GPUModel(int, metaclass=_enum_type_wrapper.EnumTypeWrapper):
24
24
  GPU_MODEL_NVIDIA_A100_40GB: _ClassVar[GPUModel]
25
25
  GPU_MODEL_NVIDIA_A100_80GB: _ClassVar[GPUModel]
26
26
  GPU_MODEL_NVIDIA_H100_80GB: _ClassVar[GPUModel]
27
+ GPU_MODEL_NVIDIA_TESLA_T4: _ClassVar[GPUModel]
27
28
 
28
29
  class FunctionExecutorStatus(int, metaclass=_enum_type_wrapper.EnumTypeWrapper):
29
30
  __slots__ = ()
@@ -78,6 +79,7 @@ GPU_MODEL_UNKNOWN: GPUModel
78
79
  GPU_MODEL_NVIDIA_A100_40GB: GPUModel
79
80
  GPU_MODEL_NVIDIA_A100_80GB: GPUModel
80
81
  GPU_MODEL_NVIDIA_H100_80GB: GPUModel
82
+ GPU_MODEL_NVIDIA_TESLA_T4: GPUModel
81
83
  FUNCTION_EXECUTOR_STATUS_UNKNOWN: FunctionExecutorStatus
82
84
  FUNCTION_EXECUTOR_STATUS_STARTING_UP: FunctionExecutorStatus
83
85
  FUNCTION_EXECUTOR_STATUS_STARTUP_FAILED_CUSTOMER_ERROR: FunctionExecutorStatus
@@ -245,18 +247,15 @@ class FunctionExecutorDescription(_message.Message):
245
247
  ) -> None: ...
246
248
 
247
249
  class FunctionExecutorState(_message.Message):
248
- __slots__ = ("description", "status", "status_message")
250
+ __slots__ = ("description", "status")
249
251
  DESCRIPTION_FIELD_NUMBER: _ClassVar[int]
250
252
  STATUS_FIELD_NUMBER: _ClassVar[int]
251
- STATUS_MESSAGE_FIELD_NUMBER: _ClassVar[int]
252
253
  description: FunctionExecutorDescription
253
254
  status: FunctionExecutorStatus
254
- status_message: str
255
255
  def __init__(
256
256
  self,
257
257
  description: _Optional[_Union[FunctionExecutorDescription, _Mapping]] = ...,
258
258
  status: _Optional[_Union[FunctionExecutorStatus, str]] = ...,
259
- status_message: _Optional[str] = ...,
260
259
  ) -> None: ...
261
260
 
262
261
  class ExecutorState(_message.Message):
@@ -268,7 +267,7 @@ class ExecutorState(_message.Message):
268
267
  "version",
269
268
  "status",
270
269
  "total_resources",
271
- "free_resources",
270
+ "total_function_executor_resources",
272
271
  "allowed_functions",
273
272
  "function_executor_states",
274
273
  "labels",
@@ -293,7 +292,7 @@ class ExecutorState(_message.Message):
293
292
  VERSION_FIELD_NUMBER: _ClassVar[int]
294
293
  STATUS_FIELD_NUMBER: _ClassVar[int]
295
294
  TOTAL_RESOURCES_FIELD_NUMBER: _ClassVar[int]
296
- FREE_RESOURCES_FIELD_NUMBER: _ClassVar[int]
295
+ TOTAL_FUNCTION_EXECUTOR_RESOURCES_FIELD_NUMBER: _ClassVar[int]
297
296
  ALLOWED_FUNCTIONS_FIELD_NUMBER: _ClassVar[int]
298
297
  FUNCTION_EXECUTOR_STATES_FIELD_NUMBER: _ClassVar[int]
299
298
  LABELS_FIELD_NUMBER: _ClassVar[int]
@@ -306,7 +305,7 @@ class ExecutorState(_message.Message):
306
305
  version: str
307
306
  status: ExecutorStatus
308
307
  total_resources: HostResources
309
- free_resources: HostResources
308
+ total_function_executor_resources: HostResources
310
309
  allowed_functions: _containers.RepeatedCompositeFieldContainer[AllowedFunction]
311
310
  function_executor_states: _containers.RepeatedCompositeFieldContainer[
312
311
  FunctionExecutorState
@@ -323,7 +322,9 @@ class ExecutorState(_message.Message):
323
322
  version: _Optional[str] = ...,
324
323
  status: _Optional[_Union[ExecutorStatus, str]] = ...,
325
324
  total_resources: _Optional[_Union[HostResources, _Mapping]] = ...,
326
- free_resources: _Optional[_Union[HostResources, _Mapping]] = ...,
325
+ total_function_executor_resources: _Optional[
326
+ _Union[HostResources, _Mapping]
327
+ ] = ...,
327
328
  allowed_functions: _Optional[
328
329
  _Iterable[_Union[AllowedFunction, _Mapping]]
329
330
  ] = ...,
@@ -1,50 +0,0 @@
1
- from typing import List
2
-
3
- from pydantic import BaseModel
4
-
5
- from .nvidia_gpu import NvidiaGPUInfo
6
- from .nvidia_gpu_allocator import NvidiaGPUAllocator
7
-
8
-
9
- class HostResources(BaseModel):
10
- cpu_count: int
11
- memory_mb: int
12
- disk_mb: int
13
- gpus: List[NvidiaGPUInfo]
14
-
15
-
16
- class HostResourcesProvider:
17
- """
18
- HostResourcesProvider is a class that provides information about the host resources.
19
- """
20
-
21
- def __init__(self, gpu_allocator: NvidiaGPUAllocator):
22
- self._gpu_allocator: NvidiaGPUAllocator = gpu_allocator
23
-
24
- def total_resources(self, logger) -> HostResources:
25
- """Returns all hardware resources that exist at the host.
26
-
27
- Raises Exception on error.
28
- """
29
- logger = logger.bind(module=__name__)
30
-
31
- return HostResources(
32
- cpu_count=0, # TODO: Implement for Linux and MacOS hosts
33
- memory_mb=0, # TODO: Implement for Linux and MacOS hosts
34
- disk_mb=0, # TODO: Implement for Linux and MacOS hosts
35
- gpus=self._gpu_allocator.list_all(),
36
- )
37
-
38
- def free_resources(self, logger) -> HostResources:
39
- """Returns all hardware resources that are free at the host.
40
-
41
- Raises Exception on error.
42
- """
43
- logger = logger.bind(module=__name__)
44
-
45
- return HostResources(
46
- cpu_count=0, # TODO: Implement for Linux and MacOS hosts
47
- memory_mb=0, # TODO: Implement for Linux and MacOS hosts
48
- disk_mb=0, # TODO: Implement for Linux and MacOS hosts
49
- gpus=self._gpu_allocator.list_free(),
50
- )
@@ -1,86 +0,0 @@
1
- # -*- coding: utf-8 -*-
2
- # Generated by the protocol buffer compiler. DO NOT EDIT!
3
- # NO CHECKED-IN PROTOBUF GENCODE
4
- # source: indexify/proto/executor_api.proto
5
- # Protobuf Python Version: 5.29.0
6
- """Generated protocol buffer code."""
7
- from google.protobuf import descriptor as _descriptor
8
- from google.protobuf import descriptor_pool as _descriptor_pool
9
- from google.protobuf import runtime_version as _runtime_version
10
- from google.protobuf import symbol_database as _symbol_database
11
- from google.protobuf.internal import builder as _builder
12
-
13
- _runtime_version.ValidateProtobufRuntimeVersion(
14
- _runtime_version.Domain.PUBLIC, 5, 29, 0, "", "indexify/proto/executor_api.proto"
15
- )
16
- # @@protoc_insertion_point(imports)
17
-
18
- _sym_db = _symbol_database.Default()
19
-
20
-
21
- DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(
22
- b'\n!indexify/proto/executor_api.proto\x12\x0f\x65xecutor_api_pb"\x87\x02\n\x0b\x44\x61taPayload\x12\x11\n\x04path\x18\x01 \x01(\tH\x00\x88\x01\x01\x12\x11\n\x04size\x18\x02 \x01(\x04H\x01\x88\x01\x01\x12\x18\n\x0bsha256_hash\x18\x03 \x01(\tH\x02\x88\x01\x01\x12\x10\n\x03uri\x18\x04 \x01(\tH\x03\x88\x01\x01\x12;\n\x08\x65ncoding\x18\x05 \x01(\x0e\x32$.executor_api_pb.DataPayloadEncodingH\x04\x88\x01\x01\x12\x1d\n\x10\x65ncoding_version\x18\x06 \x01(\x04H\x05\x88\x01\x01\x42\x07\n\x05_pathB\x07\n\x05_sizeB\x0e\n\x0c_sha256_hashB\x06\n\x04_uriB\x0b\n\t_encodingB\x13\n\x11_encoding_version"k\n\x0cGPUResources\x12\x12\n\x05\x63ount\x18\x01 \x01(\rH\x00\x88\x01\x01\x12-\n\x05model\x18\x02 \x01(\x0e\x32\x19.executor_api_pb.GPUModelH\x01\x88\x01\x01\x42\x08\n\x06_countB\x08\n\x06_modelJ\x04\x08\x03\x10\x04"\xc2\x01\n\rHostResources\x12\x16\n\tcpu_count\x18\x01 \x01(\rH\x00\x88\x01\x01\x12\x19\n\x0cmemory_bytes\x18\x02 \x01(\x04H\x01\x88\x01\x01\x12\x17\n\ndisk_bytes\x18\x03 \x01(\x04H\x02\x88\x01\x01\x12/\n\x03gpu\x18\x04 \x01(\x0b\x32\x1d.executor_api_pb.GPUResourcesH\x03\x88\x01\x01\x42\x0c\n\n_cpu_countB\x0f\n\r_memory_bytesB\r\n\x0b_disk_bytesB\x06\n\x04_gpu"\xbb\x01\n\x0f\x41llowedFunction\x12\x16\n\tnamespace\x18\x01 \x01(\tH\x00\x88\x01\x01\x12\x17\n\ngraph_name\x18\x02 \x01(\tH\x01\x88\x01\x01\x12\x1a\n\rfunction_name\x18\x03 \x01(\tH\x02\x88\x01\x01\x12\x1a\n\rgraph_version\x18\x04 \x01(\tH\x03\x88\x01\x01\x42\x0c\n\n_namespaceB\r\n\x0b_graph_nameB\x10\n\x0e_function_nameB\x10\n\x0e_graph_version"\xc5\x01\n\x19\x46unctionExecutorResources\x12\x1b\n\x0e\x63pu_ms_per_sec\x18\x01 \x01(\rH\x00\x88\x01\x01\x12\x19\n\x0cmemory_bytes\x18\x02 \x01(\x04H\x01\x88\x01\x01\x12\x17\n\ndisk_bytes\x18\x03 \x01(\x04H\x02\x88\x01\x01\x12\x16\n\tgpu_count\x18\x04 \x01(\rH\x03\x88\x01\x01\x42\x11\n\x0f_cpu_ms_per_secB\x0f\n\r_memory_bytesB\r\n\x0b_disk_bytesB\x0c\n\n_gpu_count"\xbf\x04\n\x1b\x46unctionExecutorDescription\x12\x0f\n\x02id\x18\x01 
\x01(\tH\x00\x88\x01\x01\x12\x16\n\tnamespace\x18\x02 \x01(\tH\x01\x88\x01\x01\x12\x17\n\ngraph_name\x18\x03 \x01(\tH\x02\x88\x01\x01\x12\x1a\n\rgraph_version\x18\x04 \x01(\tH\x03\x88\x01\x01\x12\x1a\n\rfunction_name\x18\x05 \x01(\tH\x04\x88\x01\x01\x12\x16\n\timage_uri\x18\x06 \x01(\tH\x05\x88\x01\x01\x12\x14\n\x0csecret_names\x18\x07 \x03(\t\x12<\n\x0fresource_limits\x18\x08 \x01(\x0b\x32\x1e.executor_api_pb.HostResourcesH\x06\x88\x01\x01\x12%\n\x18\x63ustomer_code_timeout_ms\x18\t \x01(\rH\x07\x88\x01\x01\x12\x30\n\x05graph\x18\n \x01(\x0b\x32\x1c.executor_api_pb.DataPayloadH\x08\x88\x01\x01\x12\x42\n\tresources\x18\x0b \x01(\x0b\x32*.executor_api_pb.FunctionExecutorResourcesH\t\x88\x01\x01\x42\x05\n\x03_idB\x0c\n\n_namespaceB\r\n\x0b_graph_nameB\x10\n\x0e_graph_versionB\x10\n\x0e_function_nameB\x0c\n\n_image_uriB\x12\n\x10_resource_limitsB\x1b\n\x19_customer_code_timeout_msB\x08\n\x06_graphB\x0c\n\n_resources"\xe8\x01\n\x15\x46unctionExecutorState\x12\x46\n\x0b\x64\x65scription\x18\x01 \x01(\x0b\x32,.executor_api_pb.FunctionExecutorDescriptionH\x00\x88\x01\x01\x12<\n\x06status\x18\x02 \x01(\x0e\x32\'.executor_api_pb.FunctionExecutorStatusH\x01\x88\x01\x01\x12\x1b\n\x0estatus_message\x18\x03 \x01(\tH\x02\x88\x01\x01\x42\x0e\n\x0c_descriptionB\t\n\x07_statusB\x11\n\x0f_status_message"\x9d\x06\n\rExecutorState\x12\x18\n\x0b\x65xecutor_id\x18\x01 \x01(\tH\x00\x88\x01\x01\x12\x1d\n\x10\x64\x65velopment_mode\x18\x02 \x01(\x08H\x01\x88\x01\x01\x12\x15\n\x08hostname\x18\x03 \x01(\tH\x02\x88\x01\x01\x12\x34\n\x06\x66lavor\x18\x04 \x01(\x0e\x32\x1f.executor_api_pb.ExecutorFlavorH\x03\x88\x01\x01\x12\x14\n\x07version\x18\x05 \x01(\tH\x04\x88\x01\x01\x12\x34\n\x06status\x18\x06 \x01(\x0e\x32\x1f.executor_api_pb.ExecutorStatusH\x05\x88\x01\x01\x12<\n\x0ftotal_resources\x18\r \x01(\x0b\x32\x1e.executor_api_pb.HostResourcesH\x06\x88\x01\x01\x12;\n\x0e\x66ree_resources\x18\x07 
\x01(\x0b\x32\x1e.executor_api_pb.HostResourcesH\x07\x88\x01\x01\x12;\n\x11\x61llowed_functions\x18\x08 \x03(\x0b\x32 .executor_api_pb.AllowedFunction\x12H\n\x18\x66unction_executor_states\x18\t \x03(\x0b\x32&.executor_api_pb.FunctionExecutorState\x12:\n\x06labels\x18\n \x03(\x0b\x32*.executor_api_pb.ExecutorState.LabelsEntry\x12\x17\n\nstate_hash\x18\x0b \x01(\tH\x08\x88\x01\x01\x12\x19\n\x0cserver_clock\x18\x0c \x01(\x04H\t\x88\x01\x01\x1a-\n\x0bLabelsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\x42\x0e\n\x0c_executor_idB\x13\n\x11_development_modeB\x0b\n\t_hostnameB\t\n\x07_flavorB\n\n\x08_versionB\t\n\x07_statusB\x12\n\x10_total_resourcesB\x11\n\x0f_free_resourcesB\r\n\x0b_state_hashB\x0f\n\r_server_clock"l\n\x1aReportExecutorStateRequest\x12;\n\x0e\x65xecutor_state\x18\x01 \x01(\x0b\x32\x1e.executor_api_pb.ExecutorStateH\x00\x88\x01\x01\x42\x11\n\x0f_executor_state"\x1d\n\x1bReportExecutorStateResponse"\xcf\x01\n\x0fTaskRetryPolicy\x12\x18\n\x0bmax_retries\x18\x01 \x01(\rH\x00\x88\x01\x01\x12\x1d\n\x10initial_delay_ms\x18\x02 \x01(\rH\x01\x88\x01\x01\x12\x19\n\x0cmax_delay_ms\x18\x03 \x01(\rH\x02\x88\x01\x01\x12\x1d\n\x10\x64\x65lay_multiplier\x18\x04 \x01(\rH\x03\x88\x01\x01\x42\x0e\n\x0c_max_retriesB\x13\n\x11_initial_delay_msB\x0f\n\r_max_delay_msB\x13\n\x11_delay_multiplier"\xa4\x05\n\x04Task\x12\x0f\n\x02id\x18\x01 \x01(\tH\x00\x88\x01\x01\x12\x16\n\tnamespace\x18\x02 \x01(\tH\x01\x88\x01\x01\x12\x17\n\ngraph_name\x18\x03 \x01(\tH\x02\x88\x01\x01\x12\x1a\n\rgraph_version\x18\x04 \x01(\tH\x03\x88\x01\x01\x12\x1a\n\rfunction_name\x18\x05 \x01(\tH\x04\x88\x01\x01\x12 \n\x13graph_invocation_id\x18\x06 \x01(\tH\x05\x88\x01\x01\x12\x16\n\tinput_key\x18\x08 \x01(\tH\x06\x88\x01\x01\x12\x1f\n\x12reducer_output_key\x18\t \x01(\tH\x07\x88\x01\x01\x12\x17\n\ntimeout_ms\x18\n \x01(\rH\x08\x88\x01\x01\x12\x30\n\x05input\x18\x0b \x01(\x0b\x32\x1c.executor_api_pb.DataPayloadH\t\x88\x01\x01\x12\x38\n\rreducer_input\x18\x0c 
\x01(\x0b\x32\x1c.executor_api_pb.DataPayloadH\n\x88\x01\x01\x12&\n\x19output_payload_uri_prefix\x18\r \x01(\tH\x0b\x88\x01\x01\x12;\n\x0cretry_policy\x18\x0e \x01(\x0b\x32 .executor_api_pb.TaskRetryPolicyH\x0c\x88\x01\x01\x42\x05\n\x03_idB\x0c\n\n_namespaceB\r\n\x0b_graph_nameB\x10\n\x0e_graph_versionB\x10\n\x0e_function_nameB\x16\n\x14_graph_invocation_idB\x0c\n\n_input_keyB\x15\n\x13_reducer_output_keyB\r\n\x0b_timeout_msB\x08\n\x06_inputB\x10\n\x0e_reducer_inputB\x1c\n\x1a_output_payload_uri_prefixB\x0f\n\r_retry_policy"\x7f\n\x0eTaskAllocation\x12!\n\x14\x66unction_executor_id\x18\x01 \x01(\tH\x00\x88\x01\x01\x12(\n\x04task\x18\x02 \x01(\x0b\x32\x15.executor_api_pb.TaskH\x01\x88\x01\x01\x42\x17\n\x15_function_executor_idB\x07\n\x05_task"K\n\x1fGetDesiredExecutorStatesRequest\x12\x18\n\x0b\x65xecutor_id\x18\x01 \x01(\tH\x00\x88\x01\x01\x42\x0e\n\x0c_executor_id"\xb9\x01\n\x14\x44\x65siredExecutorState\x12H\n\x12\x66unction_executors\x18\x01 \x03(\x0b\x32,.executor_api_pb.FunctionExecutorDescription\x12\x39\n\x10task_allocations\x18\x02 \x03(\x0b\x32\x1f.executor_api_pb.TaskAllocation\x12\x12\n\x05\x63lock\x18\x03 \x01(\x04H\x00\x88\x01\x01\x42\x08\n\x06_clock"\x87\x06\n\x18ReportTaskOutcomeRequest\x12\x14\n\x07task_id\x18\x01 \x01(\tH\x00\x88\x01\x01\x12\x16\n\tnamespace\x18\x02 \x01(\tH\x01\x88\x01\x01\x12\x17\n\ngraph_name\x18\x03 \x01(\tH\x02\x88\x01\x01\x12\x1a\n\rfunction_name\x18\x04 \x01(\tH\x03\x88\x01\x01\x12 \n\x13graph_invocation_id\x18\x06 \x01(\tH\x04\x88\x01\x01\x12\x32\n\x07outcome\x18\x07 \x01(\x0e\x32\x1c.executor_api_pb.TaskOutcomeH\x05\x88\x01\x01\x12\x1a\n\rinvocation_id\x18\x08 \x01(\tH\x06\x88\x01\x01\x12\x18\n\x0b\x65xecutor_id\x18\t \x01(\tH\x07\x88\x01\x01\x12\x14\n\x07reducer\x18\n \x01(\x08H\x08\x88\x01\x01\x12\x16\n\x0enext_functions\x18\x0b \x03(\t\x12\x30\n\nfn_outputs\x18\x0c \x03(\x0b\x32\x1c.executor_api_pb.DataPayload\x12\x31\n\x06stdout\x18\x0e 
\x01(\x0b\x32\x1c.executor_api_pb.DataPayloadH\t\x88\x01\x01\x12\x31\n\x06stderr\x18\x0f \x01(\x0b\x32\x1c.executor_api_pb.DataPayloadH\n\x88\x01\x01\x12=\n\x0foutput_encoding\x18\r \x01(\x0e\x32\x1f.executor_api_pb.OutputEncodingH\x0b\x88\x01\x01\x12$\n\x17output_encoding_version\x18\x05 \x01(\x04H\x0c\x88\x01\x01\x42\n\n\x08_task_idB\x0c\n\n_namespaceB\r\n\x0b_graph_nameB\x10\n\x0e_function_nameB\x16\n\x14_graph_invocation_idB\n\n\x08_outcomeB\x10\n\x0e_invocation_idB\x0e\n\x0c_executor_idB\n\n\x08_reducerB\t\n\x07_stdoutB\t\n\x07_stderrB\x12\n\x10_output_encodingB\x1a\n\x18_output_encoding_version"\x1b\n\x19ReportTaskOutcomeResponse*\xab\x01\n\x13\x44\x61taPayloadEncoding\x12!\n\x1d\x44\x41TA_PAYLOAD_ENCODING_UNKNOWN\x10\x00\x12#\n\x1f\x44\x41TA_PAYLOAD_ENCODING_UTF8_JSON\x10\x01\x12#\n\x1f\x44\x41TA_PAYLOAD_ENCODING_UTF8_TEXT\x10\x02\x12\'\n#DATA_PAYLOAD_ENCODING_BINARY_PICKLE\x10\x03*\x81\x01\n\x08GPUModel\x12\x15\n\x11GPU_MODEL_UNKNOWN\x10\x00\x12\x1e\n\x1aGPU_MODEL_NVIDIA_A100_40GB\x10\x01\x12\x1e\n\x1aGPU_MODEL_NVIDIA_A100_80GB\x10\x02\x12\x1e\n\x1aGPU_MODEL_NVIDIA_H100_80GB\x10\x03*\xca\x03\n\x16\x46unctionExecutorStatus\x12$\n FUNCTION_EXECUTOR_STATUS_UNKNOWN\x10\x00\x12(\n$FUNCTION_EXECUTOR_STATUS_STARTING_UP\x10\x01\x12:\n6FUNCTION_EXECUTOR_STATUS_STARTUP_FAILED_CUSTOMER_ERROR\x10\x02\x12:\n6FUNCTION_EXECUTOR_STATUS_STARTUP_FAILED_PLATFORM_ERROR\x10\x03\x12!\n\x1d\x46UNCTION_EXECUTOR_STATUS_IDLE\x10\x04\x12)\n%FUNCTION_EXECUTOR_STATUS_RUNNING_TASK\x10\x05\x12&\n"FUNCTION_EXECUTOR_STATUS_UNHEALTHY\x10\x06\x12%\n!FUNCTION_EXECUTOR_STATUS_STOPPING\x10\x07\x12$\n 
FUNCTION_EXECUTOR_STATUS_STOPPED\x10\x08\x12%\n!FUNCTION_EXECUTOR_STATUS_SHUTDOWN\x10\t*\xc3\x01\n\x0e\x45xecutorStatus\x12\x1b\n\x17\x45XECUTOR_STATUS_UNKNOWN\x10\x00\x12\x1f\n\x1b\x45XECUTOR_STATUS_STARTING_UP\x10\x01\x12\x1b\n\x17\x45XECUTOR_STATUS_RUNNING\x10\x02\x12\x1b\n\x17\x45XECUTOR_STATUS_DRAINED\x10\x03\x12\x1c\n\x18\x45XECUTOR_STATUS_STOPPING\x10\x04\x12\x1b\n\x17\x45XECUTOR_STATUS_STOPPED\x10\x05*d\n\x0e\x45xecutorFlavor\x12\x1b\n\x17\x45XECUTOR_FLAVOR_UNKNOWN\x10\x00\x12\x17\n\x13\x45XECUTOR_FLAVOR_OSS\x10\x01\x12\x1c\n\x18\x45XECUTOR_FLAVOR_PLATFORM\x10\x02*[\n\x0bTaskOutcome\x12\x18\n\x14TASK_OUTCOME_UNKNOWN\x10\x00\x12\x18\n\x14TASK_OUTCOME_SUCCESS\x10\x01\x12\x18\n\x14TASK_OUTCOME_FAILURE\x10\x02*\x7f\n\x0eOutputEncoding\x12\x1b\n\x17OUTPUT_ENCODING_UNKNOWN\x10\x00\x12\x18\n\x14OUTPUT_ENCODING_JSON\x10\x01\x12\x1a\n\x16OUTPUT_ENCODING_PICKLE\x10\x02\x12\x1a\n\x16OUTPUT_ENCODING_BINARY\x10\x03\x32\xef\x02\n\x0b\x45xecutorAPI\x12t\n\x15report_executor_state\x12+.executor_api_pb.ReportExecutorStateRequest\x1a,.executor_api_pb.ReportExecutorStateResponse"\x00\x12z\n\x1bget_desired_executor_states\x12\x30.executor_api_pb.GetDesiredExecutorStatesRequest\x1a%.executor_api_pb.DesiredExecutorState"\x00\x30\x01\x12n\n\x13report_task_outcome\x12).executor_api_pb.ReportTaskOutcomeRequest\x1a*.executor_api_pb.ReportTaskOutcomeResponse"\x00\x62\x06proto3'
23
- )
24
-
25
- _globals = globals()
26
- _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
27
- _builder.BuildTopDescriptorsAndMessages(
28
- DESCRIPTOR, "indexify.proto.executor_api_pb2", _globals
29
- )
30
- if not _descriptor._USE_C_DESCRIPTORS:
31
- DESCRIPTOR._loaded_options = None
32
- _globals["_EXECUTORSTATE_LABELSENTRY"]._loaded_options = None
33
- _globals["_EXECUTORSTATE_LABELSENTRY"]._serialized_options = b"8\001"
34
- _globals["_DATAPAYLOADENCODING"]._serialized_start = 4861
35
- _globals["_DATAPAYLOADENCODING"]._serialized_end = 5032
36
- _globals["_GPUMODEL"]._serialized_start = 5035
37
- _globals["_GPUMODEL"]._serialized_end = 5164
38
- _globals["_FUNCTIONEXECUTORSTATUS"]._serialized_start = 5167
39
- _globals["_FUNCTIONEXECUTORSTATUS"]._serialized_end = 5625
40
- _globals["_EXECUTORSTATUS"]._serialized_start = 5628
41
- _globals["_EXECUTORSTATUS"]._serialized_end = 5823
42
- _globals["_EXECUTORFLAVOR"]._serialized_start = 5825
43
- _globals["_EXECUTORFLAVOR"]._serialized_end = 5925
44
- _globals["_TASKOUTCOME"]._serialized_start = 5927
45
- _globals["_TASKOUTCOME"]._serialized_end = 6018
46
- _globals["_OUTPUTENCODING"]._serialized_start = 6020
47
- _globals["_OUTPUTENCODING"]._serialized_end = 6147
48
- _globals["_DATAPAYLOAD"]._serialized_start = 55
49
- _globals["_DATAPAYLOAD"]._serialized_end = 318
50
- _globals["_GPURESOURCES"]._serialized_start = 320
51
- _globals["_GPURESOURCES"]._serialized_end = 427
52
- _globals["_HOSTRESOURCES"]._serialized_start = 430
53
- _globals["_HOSTRESOURCES"]._serialized_end = 624
54
- _globals["_ALLOWEDFUNCTION"]._serialized_start = 627
55
- _globals["_ALLOWEDFUNCTION"]._serialized_end = 814
56
- _globals["_FUNCTIONEXECUTORRESOURCES"]._serialized_start = 817
57
- _globals["_FUNCTIONEXECUTORRESOURCES"]._serialized_end = 1014
58
- _globals["_FUNCTIONEXECUTORDESCRIPTION"]._serialized_start = 1017
59
- _globals["_FUNCTIONEXECUTORDESCRIPTION"]._serialized_end = 1592
60
- _globals["_FUNCTIONEXECUTORSTATE"]._serialized_start = 1595
61
- _globals["_FUNCTIONEXECUTORSTATE"]._serialized_end = 1827
62
- _globals["_EXECUTORSTATE"]._serialized_start = 1830
63
- _globals["_EXECUTORSTATE"]._serialized_end = 2627
64
- _globals["_EXECUTORSTATE_LABELSENTRY"]._serialized_start = 2427
65
- _globals["_EXECUTORSTATE_LABELSENTRY"]._serialized_end = 2472
66
- _globals["_REPORTEXECUTORSTATEREQUEST"]._serialized_start = 2629
67
- _globals["_REPORTEXECUTORSTATEREQUEST"]._serialized_end = 2737
68
- _globals["_REPORTEXECUTORSTATERESPONSE"]._serialized_start = 2739
69
- _globals["_REPORTEXECUTORSTATERESPONSE"]._serialized_end = 2768
70
- _globals["_TASKRETRYPOLICY"]._serialized_start = 2771
71
- _globals["_TASKRETRYPOLICY"]._serialized_end = 2978
72
- _globals["_TASK"]._serialized_start = 2981
73
- _globals["_TASK"]._serialized_end = 3657
74
- _globals["_TASKALLOCATION"]._serialized_start = 3659
75
- _globals["_TASKALLOCATION"]._serialized_end = 3786
76
- _globals["_GETDESIREDEXECUTORSTATESREQUEST"]._serialized_start = 3788
77
- _globals["_GETDESIREDEXECUTORSTATESREQUEST"]._serialized_end = 3863
78
- _globals["_DESIREDEXECUTORSTATE"]._serialized_start = 3866
79
- _globals["_DESIREDEXECUTORSTATE"]._serialized_end = 4051
80
- _globals["_REPORTTASKOUTCOMEREQUEST"]._serialized_start = 4054
81
- _globals["_REPORTTASKOUTCOMEREQUEST"]._serialized_end = 4829
82
- _globals["_REPORTTASKOUTCOMERESPONSE"]._serialized_start = 4831
83
- _globals["_REPORTTASKOUTCOMERESPONSE"]._serialized_end = 4858
84
- _globals["_EXECUTORAPI"]._serialized_start = 6150
85
- _globals["_EXECUTORAPI"]._serialized_end = 6517
86
- # @@protoc_insertion_point(module_scope)
File without changes