indexify 0.3.31__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. indexify/cli/__init__.py +18 -0
  2. indexify/cli/build_image.py +51 -0
  3. indexify/cli/deploy.py +57 -0
  4. indexify/cli/executor.py +205 -0
  5. indexify/executor/{grpc/channel_manager.py → channel_manager.py} +17 -11
  6. indexify/executor/executor.py +57 -313
  7. indexify/executor/function_allowlist.py +59 -0
  8. indexify/executor/function_executor/function_executor.py +12 -6
  9. indexify/executor/function_executor/invocation_state_client.py +25 -3
  10. indexify/executor/function_executor/server/function_executor_server_factory.py +3 -3
  11. indexify/executor/function_executor/server/subprocess_function_executor_server_factory.py +22 -11
  12. indexify/executor/function_executor_controller/__init__.py +13 -0
  13. indexify/executor/function_executor_controller/completed_task_metrics.py +82 -0
  14. indexify/executor/function_executor_controller/create_function_executor.py +158 -0
  15. indexify/executor/function_executor_controller/debug_event_loop.py +37 -0
  16. indexify/executor/function_executor_controller/destroy_function_executor.py +28 -0
  17. indexify/executor/function_executor_controller/downloads.py +199 -0
  18. indexify/executor/function_executor_controller/events.py +172 -0
  19. indexify/executor/function_executor_controller/function_executor_controller.py +759 -0
  20. indexify/executor/function_executor_controller/loggers.py +57 -0
  21. indexify/executor/function_executor_controller/message_validators.py +69 -0
  22. indexify/executor/function_executor_controller/metrics/completed_task_metrics.py +68 -0
  23. indexify/executor/{metrics/downloader.py → function_executor_controller/metrics/downloads.py} +1 -3
  24. indexify/executor/function_executor_controller/metrics/function_executor_controller.py +60 -0
  25. indexify/executor/{function_executor/metrics/single_task_runner.py → function_executor_controller/metrics/run_task.py} +9 -3
  26. indexify/executor/function_executor_controller/metrics/upload_task_output.py +39 -0
  27. indexify/executor/function_executor_controller/prepare_task.py +38 -0
  28. indexify/executor/function_executor_controller/run_task.py +201 -0
  29. indexify/executor/function_executor_controller/task_info.py +33 -0
  30. indexify/executor/function_executor_controller/task_output.py +122 -0
  31. indexify/executor/function_executor_controller/upload_task_output.py +234 -0
  32. indexify/executor/host_resources/host_resources.py +20 -25
  33. indexify/executor/host_resources/nvidia_gpu_allocator.py +8 -1
  34. indexify/executor/{grpc/metrics → metrics}/channel_manager.py +1 -1
  35. indexify/executor/metrics/executor.py +0 -47
  36. indexify/executor/{grpc/metrics → metrics}/state_reconciler.py +1 -1
  37. indexify/executor/{grpc/metrics → metrics}/state_reporter.py +1 -1
  38. indexify/executor/monitoring/health_checker/generic_health_checker.py +6 -59
  39. indexify/executor/monitoring/health_checker/health_checker.py +0 -11
  40. indexify/executor/{grpc/state_reconciler.py → state_reconciler.py} +139 -141
  41. indexify/executor/state_reporter.py +364 -0
  42. indexify/proto/executor_api.proto +68 -60
  43. indexify/proto/executor_api_pb2.py +52 -52
  44. indexify/proto/executor_api_pb2.pyi +129 -108
  45. indexify/proto/executor_api_pb2_grpc.py +0 -47
  46. {indexify-0.3.31.dist-info → indexify-0.4.3.dist-info}/METADATA +2 -5
  47. indexify-0.4.3.dist-info/RECORD +68 -0
  48. indexify-0.4.3.dist-info/entry_points.txt +3 -0
  49. indexify/cli/cli.py +0 -268
  50. indexify/executor/api_objects.py +0 -92
  51. indexify/executor/downloader.py +0 -417
  52. indexify/executor/executor_flavor.py +0 -7
  53. indexify/executor/function_executor/function_executor_state.py +0 -107
  54. indexify/executor/function_executor/function_executor_states_container.py +0 -93
  55. indexify/executor/function_executor/function_executor_status.py +0 -95
  56. indexify/executor/function_executor/metrics/function_executor_state.py +0 -46
  57. indexify/executor/function_executor/metrics/function_executor_state_container.py +0 -10
  58. indexify/executor/function_executor/single_task_runner.py +0 -345
  59. indexify/executor/function_executor/task_input.py +0 -21
  60. indexify/executor/function_executor/task_output.py +0 -105
  61. indexify/executor/grpc/function_executor_controller.py +0 -418
  62. indexify/executor/grpc/metrics/task_controller.py +0 -8
  63. indexify/executor/grpc/state_reporter.py +0 -317
  64. indexify/executor/grpc/task_controller.py +0 -508
  65. indexify/executor/metrics/task_fetcher.py +0 -21
  66. indexify/executor/metrics/task_reporter.py +0 -53
  67. indexify/executor/metrics/task_runner.py +0 -52
  68. indexify/executor/monitoring/function_allowlist.py +0 -25
  69. indexify/executor/runtime_probes.py +0 -68
  70. indexify/executor/task_fetcher.py +0 -96
  71. indexify/executor/task_reporter.py +0 -459
  72. indexify/executor/task_runner.py +0 -177
  73. indexify-0.3.31.dist-info/RECORD +0 -68
  74. indexify-0.3.31.dist-info/entry_points.txt +0 -3
  75. {indexify-0.3.31.dist-info → indexify-0.4.3.dist-info}/WHEEL +0 -0
@@ -1,317 +0,0 @@
1
- import asyncio
2
- import hashlib
3
- from socket import gethostname
4
- from typing import Any, Dict, List, Optional
5
-
6
- from indexify.proto.executor_api_pb2 import (
7
- AllowedFunction,
8
- )
9
- from indexify.proto.executor_api_pb2 import ExecutorFlavor as ExecutorFlavorProto
10
- from indexify.proto.executor_api_pb2 import (
11
- ExecutorState,
12
- ExecutorStatus,
13
- FunctionExecutorDescription,
14
- )
15
- from indexify.proto.executor_api_pb2 import (
16
- FunctionExecutorState as FunctionExecutorStateProto,
17
- )
18
- from indexify.proto.executor_api_pb2 import (
19
- FunctionExecutorStatus as FunctionExecutorStatusProto,
20
- )
21
- from indexify.proto.executor_api_pb2 import GPUModel as GPUModelProto
22
- from indexify.proto.executor_api_pb2 import GPUResources as GPUResourcesProto
23
- from indexify.proto.executor_api_pb2 import HostResources as HostResourcesProto
24
- from indexify.proto.executor_api_pb2 import (
25
- ReportExecutorStateRequest,
26
- )
27
- from indexify.proto.executor_api_pb2_grpc import ExecutorAPIStub
28
-
29
- from ..api_objects import FunctionURI
30
- from ..executor_flavor import ExecutorFlavor
31
- from ..function_executor.function_executor_state import FunctionExecutorState
32
- from ..function_executor.function_executor_states_container import (
33
- FunctionExecutorStatesContainer,
34
- )
35
- from ..function_executor.function_executor_status import FunctionExecutorStatus
36
- from ..host_resources.host_resources import HostResources, HostResourcesProvider
37
- from ..host_resources.nvidia_gpu import NVIDIA_GPU_MODEL
38
- from ..runtime_probes import RuntimeProbes
39
- from .channel_manager import ChannelManager
40
- from .metrics.state_reporter import (
41
- metric_state_report_errors,
42
- metric_state_report_latency,
43
- metric_state_report_rpcs,
44
- )
45
-
46
# Default pause, in seconds, between two consecutive successful state reports.
_REPORTING_INTERVAL_SEC: int = 5
# Timeout, in seconds, passed to every report_executor_state RPC.
_REPORT_RPC_TIMEOUT_SEC: int = 5
# Pause, in seconds, after a failed report before a channel is requested again.
_REPORT_BACKOFF_ON_ERROR_SEC: int = 5
49
-
50
-
51
class ExecutorStateReporter:
    """Periodically reports the Executor state to the server over gRPC.

    The report contains static Executor attributes (id, hostname, flavor,
    version, labels, allowed functions, total resources) plus the current
    Executor status and the states of all known Function Executors.
    """

    def __init__(
        self,
        executor_id: str,
        development_mode: bool,
        flavor: ExecutorFlavor,
        version: str,
        labels: Dict[str, str],
        function_allowlist: Optional[List[FunctionURI]],
        function_executor_states: FunctionExecutorStatesContainer,
        channel_manager: ChannelManager,
        host_resources_provider: HostResourcesProvider,
        logger: Any,
        reporting_interval_sec: int = _REPORTING_INTERVAL_SEC,
    ):
        """Stores the configuration and precomputes the static report fields.

        No RPC is made here; reporting only happens in run() / report_state().
        """
        self._executor_id: str = executor_id
        self._development_mode: bool = development_mode
        self._flavor: ExecutorFlavor = flavor
        self._version: str = version
        # Work on a copy because runtime probe labels are merged into it below.
        self._labels: Dict[str, str] = labels.copy()
        self._hostname: str = gethostname()
        self._function_executor_states: FunctionExecutorStatesContainer = (
            function_executor_states
        )
        self._channel_manager = channel_manager
        self._host_resources_provider: HostResourcesProvider = host_resources_provider
        self._logger: Any = logger.bind(module=__name__)
        self._reporting_interval_sec: int = reporting_interval_sec
        # Both totals are filled lazily by the first report_state() call.
        self._total_host_resources: Optional[HostResourcesProto] = None
        self._total_function_executor_resources: Optional[HostResourcesProto] = None

        self._is_shutdown: bool = False
        self._executor_status: ExecutorStatus = ExecutorStatus.EXECUTOR_STATUS_UNKNOWN
        self._allowed_functions: List[AllowedFunction] = _to_grpc_allowed_functions(
            function_allowlist
        )
        probed_labels = RuntimeProbes().probe().labels
        self._labels.update(_label_values_to_strings(probed_labels))
        # Server expects initial value to be 0 until it is set by Server.
        self._last_server_clock: int = 0

    def update_executor_status(self, value: ExecutorStatus):
        """Sets the Executor status that is sent with subsequent reports."""
        self._executor_status = value

    def update_last_server_clock(self, value: int):
        """Sets the server clock value that is sent with subsequent reports."""
        self._last_server_clock = value

    async def run(self):
        """Runs the state reporter.

        Never raises any exceptions.
        """
        # TODO: Move this method into a new async task and cancel it in shutdown().
        while not self._is_shutdown:
            # A channel is requested again after every reporting failure.
            channel = await self._channel_manager.get_channel()
            await self._report_until_error(ExecutorAPIStub(channel))

        self._logger.info("state reporter shutdown")

    async def _report_until_error(self, stub: ExecutorAPIStub):
        """Reports state through `stub` until shutdown or the first failure.

        On failure the error is logged and the method returns after the
        backoff delay so the caller can obtain a channel again.
        """
        while not self._is_shutdown:
            try:
                # The periodic state reports serve as channel health monitoring requests
                # (same as TCP keep-alive). Channel Manager returns the same healthy channel
                # for all RPCs that we do from Executor to Server. So all the RPCs benefit
                # from this channel health monitoring.
                await self.report_state(stub)
                await asyncio.sleep(self._reporting_interval_sec)
            except Exception as e:
                self._logger.error(
                    f"failed to report state to the server, reconnecting in {_REPORT_BACKOFF_ON_ERROR_SEC} sec.",
                    exc_info=e,
                )
                await asyncio.sleep(_REPORT_BACKOFF_ON_ERROR_SEC)
                return

    async def report_state(self, stub: ExecutorAPIStub):
        """Reports the current state to the server represented by the supplied stub.

        Raises exceptions on failure.
        """
        await self._ensure_total_resources()

        with metric_state_report_errors.count_exceptions(), metric_state_report_latency.time():
            metric_state_report_rpcs.inc()
            state = ExecutorState(
                executor_id=self._executor_id,
                development_mode=self._development_mode,
                hostname=self._hostname,
                flavor=_to_grpc_executor_flavor(self._flavor, self._logger),
                version=self._version,
                status=self._executor_status,
                total_function_executor_resources=self._total_function_executor_resources,
                total_resources=self._total_host_resources,
                allowed_functions=self._allowed_functions,
                function_executor_states=await self._fetch_function_executor_states(),
                labels=self._labels,
            )
            state.state_hash = _state_hash(state)
            # Set fields not included in the state hash.
            state.server_clock = self._last_server_clock

            request = ReportExecutorStateRequest(executor_state=state)
            await stub.report_executor_state(
                request,
                timeout=_REPORT_RPC_TIMEOUT_SEC,
            )

    async def _ensure_total_resources(self):
        """Detects total resources once and caches their proto representation."""
        if self._total_host_resources is not None:
            # We need to fetch total resources only once, because they are not changing.
            return

        host_total: HostResources = (
            await self._host_resources_provider.total_host_resources(self._logger)
        )
        function_executors_total: HostResources = (
            await self._host_resources_provider.total_function_executor_resources(
                self._logger
            )
        )
        self._logger.info(
            "detected host resources",
            total_host_resources=host_total,
            total_function_executor_resources=function_executors_total,
        )
        self._total_host_resources = _host_resources_to_proto(host_total)
        self._total_function_executor_resources = _host_resources_to_proto(
            function_executors_total
        )

    async def shutdown(self):
        """Shuts down the state reporter.

        Never raises any exceptions.
        """
        self._is_shutdown = True

    async def _fetch_function_executor_states(self) -> List[FunctionExecutorStateProto]:
        """Converts every known Function Executor state into its proto message."""
        protos = []

        async for fe_state in self._function_executor_states:
            description = FunctionExecutorDescription(
                id=fe_state.id,
                namespace=fe_state.namespace,
                graph_name=fe_state.graph_name,
                graph_version=fe_state.graph_version,
                function_name=fe_state.function_name,
                secret_names=fe_state.secret_names,
            )
            # image_uri is only set when the state carries a non-empty value.
            if fe_state.image_uri:
                description.image_uri = fe_state.image_uri
            protos.append(
                FunctionExecutorStateProto(
                    description=description,
                    status=_to_grpc_function_executor_status(
                        fe_state.status, self._logger
                    ),
                )
            )

        return protos
208
-
209
-
210
def _to_grpc_allowed_functions(function_allowlist: Optional[List[FunctionURI]]):
    """Converts the function allowlist into AllowedFunction proto messages.

    Returns an empty list when no allowlist is supplied (None). The
    graph_version field is only set for URIs that carry a version.
    """
    if function_allowlist is None:
        return []

    converted: List[AllowedFunction] = []
    for uri in function_allowlist:
        fields = {
            "namespace": uri.namespace,
            "graph_name": uri.compute_graph,
            "function_name": uri.compute_fn,
        }
        if uri.version is not None:
            fields["graph_version"] = uri.version
        converted.append(AllowedFunction(**fields))

    return converted
227
-
228
-
229
# Internal Function Executor status -> status value reported over gRPC.
# DESTROYING and DESTROYED are reported as STOPPING and STOPPED respectively.
_STATUS_MAPPING: Dict[FunctionExecutorStatus, Any] = {
    FunctionExecutorStatus.STARTING_UP: FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STARTING_UP,
    FunctionExecutorStatus.STARTUP_FAILED_CUSTOMER_ERROR: FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STARTUP_FAILED_CUSTOMER_ERROR,
    FunctionExecutorStatus.STARTUP_FAILED_PLATFORM_ERROR: FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STARTUP_FAILED_PLATFORM_ERROR,
    FunctionExecutorStatus.IDLE: FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_IDLE,
    FunctionExecutorStatus.RUNNING_TASK: FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_RUNNING_TASK,
    FunctionExecutorStatus.UNHEALTHY: FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_UNHEALTHY,
    FunctionExecutorStatus.DESTROYING: FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STOPPING,
    FunctionExecutorStatus.DESTROYED: FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STOPPED,
    FunctionExecutorStatus.SHUTDOWN: FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_SHUTDOWN,
}


def _to_grpc_function_executor_status(
    status: FunctionExecutorStatus, logger: Any
) -> FunctionExecutorStatusProto:
    """Maps an internal Function Executor status to its proto counterpart.

    A status missing from _STATUS_MAPPING is logged as an error and reported
    as FUNCTION_EXECUTOR_STATUS_UNKNOWN.
    """
    proto_status = _STATUS_MAPPING.get(status)
    if proto_status is None:
        logger.error("unexpected Function Executor status", status=status)
        return FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_UNKNOWN

    return proto_status
253
-
254
-
255
# Internal Executor flavor -> flavor value reported over gRPC.
_FLAVOR_MAPPING = {
    ExecutorFlavor.OSS: ExecutorFlavorProto.EXECUTOR_FLAVOR_OSS,
    ExecutorFlavor.PLATFORM: ExecutorFlavorProto.EXECUTOR_FLAVOR_PLATFORM,
}


def _to_grpc_executor_flavor(
    flavor: ExecutorFlavor, logger: Any
) -> ExecutorFlavorProto:
    """Maps an internal Executor flavor to its proto counterpart.

    A flavor missing from _FLAVOR_MAPPING is logged as an error and reported
    as EXECUTOR_FLAVOR_UNKNOWN.
    """
    proto_flavor = _FLAVOR_MAPPING.get(flavor)
    if proto_flavor is None:
        logger.error("unexpected Executor flavor", flavor=flavor)
        return ExecutorFlavorProto.EXECUTOR_FLAVOR_UNKNOWN

    return proto_flavor
272
-
273
-
274
def _label_values_to_strings(labels: Dict[str, Any]) -> Dict[str, str]:
    """Returns a new dict with the same keys and every value passed through str()."""
    stringified: Dict[str, str] = {}
    for name, value in labels.items():
        stringified[name] = str(value)
    return stringified
276
-
277
-
278
def _state_hash(state: ExecutorState) -> str:
    """Returns the hex SHA-256 digest of the deterministically serialized state.

    The digest is used as a content fingerprint, not for security, hence
    usedforsecurity=False.
    """
    payload: bytes = state.SerializeToString(deterministic=True)
    return hashlib.sha256(payload, usedforsecurity=False).hexdigest()
283
-
284
-
285
def _host_resources_to_proto(host_resources: HostResources) -> HostResourcesProto:
    """Converts HostResources into its proto message.

    Memory and disk sizes are converted from megabytes to bytes. The gpu
    field is only populated when at least one GPU is present.
    """
    bytes_per_mb = 1024 * 1024
    proto = HostResourcesProto(
        cpu_count=host_resources.cpu_count,
        memory_bytes=host_resources.memory_mb * bytes_per_mb,
        disk_bytes=host_resources.disk_mb * bytes_per_mb,
    )

    gpu_count = len(host_resources.gpus)
    if gpu_count == 0:
        return proto

    gpu_proto = GPUResourcesProto(
        count=gpu_count,
        # All GPUs have the same model
        model=_gpu_model_to_proto(host_resources.gpus[0].model),
    )
    proto.gpu.CopyFrom(gpu_proto)
    return proto
301
-
302
-
303
def _gpu_model_to_proto(gpu_model: NVIDIA_GPU_MODEL) -> GPUModelProto:
    """Maps an NVIDIA GPU model to its proto enum value.

    Models without a known counterpart map to GPU_MODEL_UNKNOWN.
    """
    # Scanned in order and compared with ==, exactly like an if/elif chain.
    known_models = (
        (NVIDIA_GPU_MODEL.A100_40GB, GPUModelProto.GPU_MODEL_NVIDIA_A100_40GB),
        (NVIDIA_GPU_MODEL.A100_80GB, GPUModelProto.GPU_MODEL_NVIDIA_A100_80GB),
        (NVIDIA_GPU_MODEL.H100_80GB, GPUModelProto.GPU_MODEL_NVIDIA_H100_80GB),
        (NVIDIA_GPU_MODEL.TESLA_T4, GPUModelProto.GPU_MODEL_NVIDIA_TESLA_T4),
        (NVIDIA_GPU_MODEL.A6000, GPUModelProto.GPU_MODEL_NVIDIA_A6000),
        (NVIDIA_GPU_MODEL.A10, GPUModelProto.GPU_MODEL_NVIDIA_A10),
    )
    for candidate, proto_model in known_models:
        if gpu_model == candidate:
            return proto_model
    return GPUModelProto.GPU_MODEL_UNKNOWN