indexify 0.3.30__py3-none-any.whl → 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. indexify/cli/__init__.py +18 -0
  2. indexify/cli/build_image.py +51 -0
  3. indexify/cli/deploy.py +57 -0
  4. indexify/cli/executor.py +205 -0
  5. indexify/executor/{grpc/channel_manager.py → channel_manager.py} +17 -11
  6. indexify/executor/executor.py +57 -311
  7. indexify/executor/function_allowlist.py +59 -0
  8. indexify/executor/function_executor/function_executor.py +12 -6
  9. indexify/executor/function_executor/invocation_state_client.py +25 -3
  10. indexify/executor/function_executor/server/function_executor_server_factory.py +3 -3
  11. indexify/executor/function_executor/server/subprocess_function_executor_server_factory.py +22 -11
  12. indexify/executor/function_executor_controller/__init__.py +13 -0
  13. indexify/executor/function_executor_controller/completed_task_metrics.py +82 -0
  14. indexify/executor/function_executor_controller/create_function_executor.py +154 -0
  15. indexify/executor/function_executor_controller/debug_event_loop.py +37 -0
  16. indexify/executor/function_executor_controller/destroy_function_executor.py +28 -0
  17. indexify/executor/function_executor_controller/downloads.py +199 -0
  18. indexify/executor/function_executor_controller/events.py +172 -0
  19. indexify/executor/function_executor_controller/function_executor_controller.py +759 -0
  20. indexify/executor/function_executor_controller/loggers.py +57 -0
  21. indexify/executor/function_executor_controller/message_validators.py +65 -0
  22. indexify/executor/function_executor_controller/metrics/completed_task_metrics.py +68 -0
  23. indexify/executor/{metrics/downloader.py → function_executor_controller/metrics/downloads.py} +1 -3
  24. indexify/executor/function_executor_controller/metrics/function_executor_controller.py +60 -0
  25. indexify/executor/{function_executor/metrics/single_task_runner.py → function_executor_controller/metrics/run_task.py} +9 -3
  26. indexify/executor/function_executor_controller/metrics/upload_task_output.py +39 -0
  27. indexify/executor/function_executor_controller/prepare_task.py +38 -0
  28. indexify/executor/function_executor_controller/run_task.py +201 -0
  29. indexify/executor/function_executor_controller/task_info.py +33 -0
  30. indexify/executor/function_executor_controller/task_output.py +122 -0
  31. indexify/executor/function_executor_controller/upload_task_output.py +234 -0
  32. indexify/executor/host_resources/host_resources.py +20 -25
  33. indexify/executor/{grpc/metrics → metrics}/channel_manager.py +1 -1
  34. indexify/executor/metrics/executor.py +0 -47
  35. indexify/executor/{grpc/metrics → metrics}/state_reconciler.py +1 -1
  36. indexify/executor/{grpc/metrics → metrics}/state_reporter.py +1 -1
  37. indexify/executor/monitoring/health_checker/generic_health_checker.py +6 -59
  38. indexify/executor/monitoring/health_checker/health_checker.py +0 -11
  39. indexify/executor/{grpc/state_reconciler.py → state_reconciler.py} +139 -141
  40. indexify/executor/state_reporter.py +364 -0
  41. indexify/proto/executor_api.proto +67 -59
  42. indexify/proto/executor_api_pb2.py +52 -52
  43. indexify/proto/executor_api_pb2.pyi +125 -104
  44. indexify/proto/executor_api_pb2_grpc.py +0 -47
  45. {indexify-0.3.30.dist-info → indexify-0.4.2.dist-info}/METADATA +1 -3
  46. indexify-0.4.2.dist-info/RECORD +68 -0
  47. indexify-0.4.2.dist-info/entry_points.txt +3 -0
  48. indexify/cli/cli.py +0 -267
  49. indexify/executor/api_objects.py +0 -92
  50. indexify/executor/downloader.py +0 -417
  51. indexify/executor/executor_flavor.py +0 -7
  52. indexify/executor/function_executor/function_executor_state.py +0 -107
  53. indexify/executor/function_executor/function_executor_states_container.py +0 -93
  54. indexify/executor/function_executor/function_executor_status.py +0 -95
  55. indexify/executor/function_executor/metrics/function_executor_state.py +0 -46
  56. indexify/executor/function_executor/metrics/function_executor_state_container.py +0 -10
  57. indexify/executor/function_executor/single_task_runner.py +0 -345
  58. indexify/executor/function_executor/task_input.py +0 -21
  59. indexify/executor/function_executor/task_output.py +0 -105
  60. indexify/executor/grpc/function_executor_controller.py +0 -418
  61. indexify/executor/grpc/metrics/task_controller.py +0 -8
  62. indexify/executor/grpc/state_reporter.py +0 -314
  63. indexify/executor/grpc/task_controller.py +0 -508
  64. indexify/executor/metrics/task_fetcher.py +0 -21
  65. indexify/executor/metrics/task_reporter.py +0 -53
  66. indexify/executor/metrics/task_runner.py +0 -52
  67. indexify/executor/monitoring/function_allowlist.py +0 -25
  68. indexify/executor/runtime_probes.py +0 -68
  69. indexify/executor/task_fetcher.py +0 -96
  70. indexify/executor/task_reporter.py +0 -459
  71. indexify/executor/task_runner.py +0 -177
  72. indexify-0.3.30.dist-info/RECORD +0 -68
  73. indexify-0.3.30.dist-info/entry_points.txt +0 -3
  74. {indexify-0.3.30.dist-info → indexify-0.4.2.dist-info}/WHEEL +0 -0
@@ -1,314 +0,0 @@
- import asyncio
- import hashlib
- from socket import gethostname
- from typing import Any, Dict, List, Optional
-
- from indexify.proto.executor_api_pb2 import (
-     AllowedFunction,
- )
- from indexify.proto.executor_api_pb2 import ExecutorFlavor as ExecutorFlavorProto
- from indexify.proto.executor_api_pb2 import (
-     ExecutorState,
-     ExecutorStatus,
-     FunctionExecutorDescription,
- )
- from indexify.proto.executor_api_pb2 import (
-     FunctionExecutorState as FunctionExecutorStateProto,
- )
- from indexify.proto.executor_api_pb2 import (
-     FunctionExecutorStatus as FunctionExecutorStatusProto,
- )
- from indexify.proto.executor_api_pb2 import GPUModel as GPUModelProto
- from indexify.proto.executor_api_pb2 import GPUResources as GPUResourcesProto
- from indexify.proto.executor_api_pb2 import HostResources as HostResourcesProto
- from indexify.proto.executor_api_pb2 import (
-     ReportExecutorStateRequest,
- )
- from indexify.proto.executor_api_pb2_grpc import ExecutorAPIStub
-
- from ..api_objects import FunctionURI
- from ..executor_flavor import ExecutorFlavor
- from ..function_executor.function_executor_state import FunctionExecutorState
- from ..function_executor.function_executor_states_container import (
-     FunctionExecutorStatesContainer,
- )
- from ..function_executor.function_executor_status import FunctionExecutorStatus
- from ..host_resources.host_resources import HostResources, HostResourcesProvider
- from ..host_resources.nvidia_gpu import NVIDIA_GPU_MODEL
- from ..runtime_probes import RuntimeProbes
- from .channel_manager import ChannelManager
- from .metrics.state_reporter import (
-     metric_state_report_errors,
-     metric_state_report_latency,
-     metric_state_report_rpcs,
- )
-
- _REPORTING_INTERVAL_SEC = 5
- _REPORT_RPC_TIMEOUT_SEC = 5
- _REPORT_BACKOFF_ON_ERROR_SEC = 5
-
-
- class ExecutorStateReporter:
-     def __init__(
-         self,
-         executor_id: str,
-         flavor: ExecutorFlavor,
-         version: str,
-         labels: Dict[str, str],
-         function_allowlist: Optional[List[FunctionURI]],
-         function_executor_states: FunctionExecutorStatesContainer,
-         channel_manager: ChannelManager,
-         host_resources_provider: HostResourcesProvider,
-         logger: Any,
-         reporting_interval_sec: int = _REPORTING_INTERVAL_SEC,
-     ):
-         self._executor_id: str = executor_id
-         self._flavor: ExecutorFlavor = flavor
-         self._version: str = version
-         self._labels: Dict[str, str] = labels.copy()
-         self._hostname: str = gethostname()
-         self._function_executor_states: FunctionExecutorStatesContainer = (
-             function_executor_states
-         )
-         self._channel_manager = channel_manager
-         self._host_resources_provider: HostResourcesProvider = host_resources_provider
-         self._logger: Any = logger.bind(module=__name__)
-         self._reporting_interval_sec: int = reporting_interval_sec
-         self._total_host_resources: Optional[HostResourcesProto] = None
-         self._total_function_executor_resources: Optional[HostResourcesProto] = None
-
-         self._is_shutdown: bool = False
-         self._executor_status: ExecutorStatus = ExecutorStatus.EXECUTOR_STATUS_UNKNOWN
-         self._allowed_functions: List[AllowedFunction] = _to_grpc_allowed_functions(
-             function_allowlist
-         )
-         self._labels.update(_label_values_to_strings(RuntimeProbes().probe().labels))
-         self._last_server_clock: int = (
-             0  # Server expects initial value to be 0 until it is set by Server.
-         )
-
-     def update_executor_status(self, value: ExecutorStatus):
-         self._executor_status = value
-
-     def update_last_server_clock(self, value: int):
-         self._last_server_clock = value
-
-     async def run(self):
-         """Runs the state reporter.
-
-         Never raises any exceptions.
-         """
-         # TODO: Move this method into a new async task and cancel it in shutdown().
-         while not self._is_shutdown:
-             stub = ExecutorAPIStub(await self._channel_manager.get_channel())
-             while not self._is_shutdown:
-                 try:
-                     # The periodic state reports serve as channel health monitoring requests
-                     # (same as TCP keep-alive). Channel Manager returns the same healthy channel
-                     # for all RPCs that we do from Executor to Server. So all the RPCs benefit
-                     # from this channel health monitoring.
-                     await self.report_state(stub)
-                     await asyncio.sleep(self._reporting_interval_sec)
-                 except Exception as e:
-                     self._logger.error(
-                         f"failed to report state to the server, reconnecting in {_REPORT_BACKOFF_ON_ERROR_SEC} sec.",
-                         exc_info=e,
-                     )
-                     await asyncio.sleep(_REPORT_BACKOFF_ON_ERROR_SEC)
-                     break
-
-         self._logger.info("state reporter shutdown")
-
-     async def report_state(self, stub: ExecutorAPIStub):
-         """Reports the current state to the server represented by the supplied stub.
-
-         Raises exceptions on failure.
-         """
-         if self._total_host_resources is None:
-             # We need to fetch total resources only once, because they are not changing.
-             total_host_resources: HostResources = (
-                 await self._host_resources_provider.total_host_resources(self._logger)
-             )
-             total_function_executor_resources: HostResources = (
-                 await self._host_resources_provider.total_function_executor_resources(
-                     self._logger
-                 )
-             )
-             self._logger.info(
-                 "detected host resources",
-                 total_host_resources=total_host_resources,
-                 total_function_executor_resources=total_function_executor_resources,
-             )
-             self._total_host_resources = _host_resources_to_proto(total_host_resources)
-             self._total_function_executor_resources = _host_resources_to_proto(
-                 total_function_executor_resources
-             )
-
-         with (
-             metric_state_report_errors.count_exceptions(),
-             metric_state_report_latency.time(),
-         ):
-             metric_state_report_rpcs.inc()
-             state = ExecutorState(
-                 executor_id=self._executor_id,
-                 hostname=self._hostname,
-                 flavor=_to_grpc_executor_flavor(self._flavor, self._logger),
-                 version=self._version,
-                 status=self._executor_status,
-                 total_function_executor_resources=self._total_function_executor_resources,
-                 total_resources=self._total_host_resources,
-                 allowed_functions=self._allowed_functions,
-                 function_executor_states=await self._fetch_function_executor_states(),
-                 labels=self._labels,
-             )
-             state.state_hash = _state_hash(state)
-             # Set fields not included in the state hash.
-             state.server_clock = self._last_server_clock
-
-             await stub.report_executor_state(
-                 ReportExecutorStateRequest(executor_state=state),
-                 timeout=_REPORT_RPC_TIMEOUT_SEC,
-             )
-
-     async def shutdown(self):
-         """Shuts down the state reporter.
-
-         Never raises any exceptions.
-         """
-         self._is_shutdown = True
-
-     async def _fetch_function_executor_states(self) -> List[FunctionExecutorStateProto]:
-         states = []
-
-         async for function_executor_state in self._function_executor_states:
-             function_executor_state: FunctionExecutorState
-             function_executor_state_proto = FunctionExecutorStateProto(
-                 description=FunctionExecutorDescription(
-                     id=function_executor_state.id,
-                     namespace=function_executor_state.namespace,
-                     graph_name=function_executor_state.graph_name,
-                     graph_version=function_executor_state.graph_version,
-                     function_name=function_executor_state.function_name,
-                     secret_names=function_executor_state.secret_names,
-                 ),
-                 status=_to_grpc_function_executor_status(
-                     function_executor_state.status, self._logger
-                 ),
-             )
-             if function_executor_state.image_uri:
-                 function_executor_state_proto.description.image_uri = (
-                     function_executor_state.image_uri
-                 )
-             states.append(function_executor_state_proto)
-
-         return states
-
-
- def _to_grpc_allowed_functions(function_allowlist: Optional[List[FunctionURI]]):
-     if function_allowlist is None:
-         return []
-
-     allowed_functions: List[AllowedFunction] = []
-     for function_uri in function_allowlist:
-         function_uri: FunctionURI
-         allowed_function = AllowedFunction(
-             namespace=function_uri.namespace,
-             graph_name=function_uri.compute_graph,
-             function_name=function_uri.compute_fn,
-         )
-         if function_uri.version is not None:
-             allowed_function.graph_version = function_uri.version
-         allowed_functions.append(allowed_function)
-
-     return allowed_functions
-
-
- _STATUS_MAPPING: Dict[FunctionExecutorStatus, Any] = {
-     FunctionExecutorStatus.STARTING_UP: FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STARTING_UP,
-     FunctionExecutorStatus.STARTUP_FAILED_CUSTOMER_ERROR: FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STARTUP_FAILED_CUSTOMER_ERROR,
-     FunctionExecutorStatus.STARTUP_FAILED_PLATFORM_ERROR: FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STARTUP_FAILED_PLATFORM_ERROR,
-     FunctionExecutorStatus.IDLE: FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_IDLE,
-     FunctionExecutorStatus.RUNNING_TASK: FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_RUNNING_TASK,
-     FunctionExecutorStatus.UNHEALTHY: FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_UNHEALTHY,
-     FunctionExecutorStatus.DESTROYING: FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STOPPING,
-     FunctionExecutorStatus.DESTROYED: FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STOPPED,
-     FunctionExecutorStatus.SHUTDOWN: FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_SHUTDOWN,
- }
-
-
- def _to_grpc_function_executor_status(
-     status: FunctionExecutorStatus, logger: Any
- ) -> FunctionExecutorStatusProto:
-     result: FunctionExecutorStatusProto = _STATUS_MAPPING.get(
-         status, FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_UNKNOWN
-     )
-
-     if result == FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_UNKNOWN:
-         logger.error("unexpected Function Executor status", status=status)
-
-     return result
-
-
- _FLAVOR_MAPPING = {
-     ExecutorFlavor.OSS: ExecutorFlavorProto.EXECUTOR_FLAVOR_OSS,
-     ExecutorFlavor.PLATFORM: ExecutorFlavorProto.EXECUTOR_FLAVOR_PLATFORM,
- }
-
-
- def _to_grpc_executor_flavor(
-     flavor: ExecutorFlavor, logger: Any
- ) -> ExecutorFlavorProto:
-     result: ExecutorFlavorProto = _FLAVOR_MAPPING.get(
-         flavor, ExecutorFlavorProto.EXECUTOR_FLAVOR_UNKNOWN
-     )
-
-     if result == ExecutorFlavorProto.EXECUTOR_FLAVOR_UNKNOWN:
-         logger.error("unexpected Executor flavor", flavor=flavor)
-
-     return result
-
-
- def _label_values_to_strings(labels: Dict[str, Any]) -> Dict[str, str]:
-     return {k: str(v) for k, v in labels.items()}
-
-
- def _state_hash(state: ExecutorState) -> str:
-     serialized_state: bytes = state.SerializeToString(deterministic=True)
-     hasher = hashlib.sha256(usedforsecurity=False)
-     hasher.update(serialized_state)
-     return hasher.hexdigest()
-
-
- def _host_resources_to_proto(host_resources: HostResources) -> HostResourcesProto:
-     proto = HostResourcesProto(
-         cpu_count=host_resources.cpu_count,
-         memory_bytes=host_resources.memory_mb * 1024 * 1024,
-         disk_bytes=host_resources.disk_mb * 1024 * 1024,
-     )
-     if len(host_resources.gpus) > 0:
-         proto.gpu.CopyFrom(
-             GPUResourcesProto(
-                 count=len(host_resources.gpus),
-                 model=_gpu_model_to_proto(
-                     host_resources.gpus[0].model
-                 ),  # All GPUs have the same model
-             )
-         )
-     return proto
-
-
- def _gpu_model_to_proto(gpu_model: NVIDIA_GPU_MODEL) -> GPUModelProto:
-     if gpu_model == NVIDIA_GPU_MODEL.A100_40GB:
-         return GPUModelProto.GPU_MODEL_NVIDIA_A100_40GB
-     elif gpu_model == NVIDIA_GPU_MODEL.A100_80GB:
-         return GPUModelProto.GPU_MODEL_NVIDIA_A100_80GB
-     elif gpu_model == NVIDIA_GPU_MODEL.H100_80GB:
-         return GPUModelProto.GPU_MODEL_NVIDIA_H100_80GB
-     elif gpu_model == NVIDIA_GPU_MODEL.TESLA_T4:
-         return GPUModelProto.GPU_MODEL_NVIDIA_TESLA_T4
-     elif gpu_model == NVIDIA_GPU_MODEL.A6000:
-         return GPUModelProto.GPU_MODEL_NVIDIA_A6000
-     elif gpu_model == NVIDIA_GPU_MODEL.A10:
-         return GPUModelProto.GPU_MODEL_NVIDIA_A10
-     else:
-         return GPUModelProto.GPU_MODEL_UNKNOWN
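
The _state_hash helper in the removed module hashes a deterministically serialized ExecutorState so the server can cheaply detect whether an executor's reported state changed between periodic reports. Below is a minimal sketch of that pattern, assuming a plain dict serialized as sorted-key JSON as a stand-in for the protobuf message (the removed code instead calls SerializeToString(deterministic=True) on the proto); the state_hash name and the example fields are illustrative only, not part of the package.

import hashlib
import json
from typing import Any, Dict


def state_hash(state: Dict[str, Any]) -> str:
    # Deterministic serialization: sorted keys and fixed separators produce
    # the same byte string for the same logical state.
    serialized: bytes = json.dumps(
        state, sort_keys=True, separators=(",", ":")
    ).encode("utf-8")
    # usedforsecurity=False mirrors the removed helper: the digest is a
    # change detector, not a security primitive.
    hasher = hashlib.sha256(usedforsecurity=False)
    hasher.update(serialized)
    return hasher.hexdigest()


if __name__ == "__main__":
    # Hypothetical executor state; field names are for illustration only.
    example = {"executor_id": "executor-1", "version": "0.4.2", "status": "RUNNING"}
    print(state_hash(example))

Fields that change on every report (such as server_clock in the removed code) are set after the hash is computed, so they do not defeat the change detection.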