indexify 0.3.30__py3-none-any.whl → 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74) hide show
  1. indexify/cli/__init__.py +18 -0
  2. indexify/cli/build_image.py +51 -0
  3. indexify/cli/deploy.py +57 -0
  4. indexify/cli/executor.py +205 -0
  5. indexify/executor/{grpc/channel_manager.py → channel_manager.py} +17 -11
  6. indexify/executor/executor.py +57 -311
  7. indexify/executor/function_allowlist.py +59 -0
  8. indexify/executor/function_executor/function_executor.py +12 -6
  9. indexify/executor/function_executor/invocation_state_client.py +25 -3
  10. indexify/executor/function_executor/server/function_executor_server_factory.py +3 -3
  11. indexify/executor/function_executor/server/subprocess_function_executor_server_factory.py +22 -11
  12. indexify/executor/function_executor_controller/__init__.py +13 -0
  13. indexify/executor/function_executor_controller/completed_task_metrics.py +82 -0
  14. indexify/executor/function_executor_controller/create_function_executor.py +154 -0
  15. indexify/executor/function_executor_controller/debug_event_loop.py +37 -0
  16. indexify/executor/function_executor_controller/destroy_function_executor.py +28 -0
  17. indexify/executor/function_executor_controller/downloads.py +199 -0
  18. indexify/executor/function_executor_controller/events.py +172 -0
  19. indexify/executor/function_executor_controller/function_executor_controller.py +759 -0
  20. indexify/executor/function_executor_controller/loggers.py +57 -0
  21. indexify/executor/function_executor_controller/message_validators.py +65 -0
  22. indexify/executor/function_executor_controller/metrics/completed_task_metrics.py +68 -0
  23. indexify/executor/{metrics/downloader.py → function_executor_controller/metrics/downloads.py} +1 -3
  24. indexify/executor/function_executor_controller/metrics/function_executor_controller.py +60 -0
  25. indexify/executor/{function_executor/metrics/single_task_runner.py → function_executor_controller/metrics/run_task.py} +9 -3
  26. indexify/executor/function_executor_controller/metrics/upload_task_output.py +39 -0
  27. indexify/executor/function_executor_controller/prepare_task.py +38 -0
  28. indexify/executor/function_executor_controller/run_task.py +201 -0
  29. indexify/executor/function_executor_controller/task_info.py +33 -0
  30. indexify/executor/function_executor_controller/task_output.py +122 -0
  31. indexify/executor/function_executor_controller/upload_task_output.py +234 -0
  32. indexify/executor/host_resources/host_resources.py +20 -25
  33. indexify/executor/{grpc/metrics → metrics}/channel_manager.py +1 -1
  34. indexify/executor/metrics/executor.py +0 -47
  35. indexify/executor/{grpc/metrics → metrics}/state_reconciler.py +1 -1
  36. indexify/executor/{grpc/metrics → metrics}/state_reporter.py +1 -1
  37. indexify/executor/monitoring/health_checker/generic_health_checker.py +6 -59
  38. indexify/executor/monitoring/health_checker/health_checker.py +0 -11
  39. indexify/executor/{grpc/state_reconciler.py → state_reconciler.py} +139 -141
  40. indexify/executor/state_reporter.py +364 -0
  41. indexify/proto/executor_api.proto +67 -59
  42. indexify/proto/executor_api_pb2.py +52 -52
  43. indexify/proto/executor_api_pb2.pyi +125 -104
  44. indexify/proto/executor_api_pb2_grpc.py +0 -47
  45. {indexify-0.3.30.dist-info → indexify-0.4.2.dist-info}/METADATA +1 -3
  46. indexify-0.4.2.dist-info/RECORD +68 -0
  47. indexify-0.4.2.dist-info/entry_points.txt +3 -0
  48. indexify/cli/cli.py +0 -267
  49. indexify/executor/api_objects.py +0 -92
  50. indexify/executor/downloader.py +0 -417
  51. indexify/executor/executor_flavor.py +0 -7
  52. indexify/executor/function_executor/function_executor_state.py +0 -107
  53. indexify/executor/function_executor/function_executor_states_container.py +0 -93
  54. indexify/executor/function_executor/function_executor_status.py +0 -95
  55. indexify/executor/function_executor/metrics/function_executor_state.py +0 -46
  56. indexify/executor/function_executor/metrics/function_executor_state_container.py +0 -10
  57. indexify/executor/function_executor/single_task_runner.py +0 -345
  58. indexify/executor/function_executor/task_input.py +0 -21
  59. indexify/executor/function_executor/task_output.py +0 -105
  60. indexify/executor/grpc/function_executor_controller.py +0 -418
  61. indexify/executor/grpc/metrics/task_controller.py +0 -8
  62. indexify/executor/grpc/state_reporter.py +0 -314
  63. indexify/executor/grpc/task_controller.py +0 -508
  64. indexify/executor/metrics/task_fetcher.py +0 -21
  65. indexify/executor/metrics/task_reporter.py +0 -53
  66. indexify/executor/metrics/task_runner.py +0 -52
  67. indexify/executor/monitoring/function_allowlist.py +0 -25
  68. indexify/executor/runtime_probes.py +0 -68
  69. indexify/executor/task_fetcher.py +0 -96
  70. indexify/executor/task_reporter.py +0 -459
  71. indexify/executor/task_runner.py +0 -177
  72. indexify-0.3.30.dist-info/RECORD +0 -68
  73. indexify-0.3.30.dist-info/entry_points.txt +0 -3
  74. {indexify-0.3.30.dist-info → indexify-0.4.2.dist-info}/WHEEL +0 -0
@@ -1,418 +0,0 @@
1
- import asyncio
2
- from typing import Any, Optional
3
-
4
- from tensorlake.function_executor.proto.function_executor_pb2 import (
5
- InitializeRequest,
6
- SerializedObject,
7
- )
8
- from tensorlake.function_executor.proto.message_validator import MessageValidator
9
-
10
- from indexify.proto.executor_api_pb2 import (
11
- FunctionExecutorDescription,
12
- FunctionExecutorResources,
13
- )
14
- from indexify.proto.executor_api_pb2 import (
15
- FunctionExecutorStatus as FunctionExecutorStatusProto,
16
- )
17
-
18
- from ..downloader import Downloader
19
- from ..function_executor.function_executor import CustomerError, FunctionExecutor
20
- from ..function_executor.function_executor_state import FunctionExecutorState
21
- from ..function_executor.function_executor_status import FunctionExecutorStatus
22
- from ..function_executor.health_checker import HealthCheckResult
23
- from ..function_executor.server.function_executor_server_factory import (
24
- FunctionExecutorServerConfiguration,
25
- FunctionExecutorServerFactory,
26
- )
27
-
28
-
29
def validate_function_executor_description(
    function_executor_description: FunctionExecutorDescription,
) -> None:
    """Checks that the supplied FE description has every mandatory field set.

    Raises ValueError if the description is not valid.
    """
    # Fields that are deliberately not enforced here:
    #   * graph - TODO: make it required after we migrate to direct S3 downloads.
    #   * image_uri - optional.
    #   * secret_names - can be empty.
    #   * resource_limits - optional.
    #   * resources (cpu_ms_per_sec, memory_bytes, disk_bytes, gpu_count) -
    #     TODO: make them required after we migrate Server to them.
    mandatory_fields = (
        "id",
        "namespace",
        "graph_name",
        "graph_version",
        "function_name",
    )
    validator = MessageValidator(function_executor_description)
    for field_name in mandatory_fields:
        validator.required_field(field_name)
53
-
54
-
55
def function_executor_logger(
    function_executor_description: FunctionExecutorDescription, logger: Any
) -> Any:
    """Binds the FE's identifying metadata to the supplied logger.

    The description is not assumed to be valid: every identifying field that
    is not set on it is bound as None instead of being read.
    """
    bound_fields = {}
    for field_name in (
        "id",
        "namespace",
        "graph_name",
        "graph_version",
        "function_name",
    ):
        # Only the FE id is logged under a different key than its field name.
        log_key = "function_executor_id" if field_name == "id" else field_name
        if function_executor_description.HasField(field_name):
            bound_fields[log_key] = getattr(function_executor_description, field_name)
        else:
            bound_fields[log_key] = None
    return logger.bind(**bound_fields)
88
-
89
-
90
- class FunctionExecutorController:
91
- def __init__(
92
- self,
93
- executor_id: str,
94
- function_executor_state: FunctionExecutorState,
95
- function_executor_description: FunctionExecutorDescription,
96
- function_executor_server_factory: FunctionExecutorServerFactory,
97
- downloader: Downloader,
98
- base_url: str,
99
- config_path: str,
100
- logger: Any,
101
- ):
102
- """Initializes the FunctionExecutorController and starts its reconciliation loop task.
103
-
104
- The supplied FunctionExecutorDescription must be already validated by the caller
105
- using validate_function_executor_description().
106
- """
107
- self._executor_id: str = executor_id
108
- self._function_executor_state: FunctionExecutorState = function_executor_state
109
- self._function_executor_description: FunctionExecutorDescription = (
110
- function_executor_description
111
- )
112
- self._function_executor_server_factory: FunctionExecutorServerFactory = (
113
- function_executor_server_factory
114
- )
115
- self._downloader: Downloader = downloader
116
- self._base_url: str = base_url
117
- self._config_path: str = config_path
118
- self._logger: Any = function_executor_logger(
119
- function_executor_description, logger
120
- ).bind(
121
- module=__name__,
122
- )
123
- # The lock protects the desired status.
124
- self._lock: asyncio.Lock = asyncio.Lock()
125
- # The same as the initial FE status.
126
- self._desired_status: FunctionExecutorStatusProto = (
127
- FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STOPPED
128
- )
129
- self._desired_status_change_notifier: asyncio.Condition = asyncio.Condition(
130
- lock=self._lock
131
- )
132
- # Automatically start the controller on creation.
133
- self._reconciliation_loop_task: asyncio.Task = asyncio.create_task(
134
- self._reconciliation_loop(),
135
- name="function executor controller reconciliation loop",
136
- )
137
-
138
- def function_executor_description(self) -> FunctionExecutorDescription:
139
- return self._function_executor_description
140
-
141
- async def startup(self) -> None:
142
- await self._set_desired_status(
143
- FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_IDLE
144
- )
145
-
146
- async def shutdown(self) -> None:
147
- await self._set_desired_status(
148
- FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_SHUTDOWN
149
- )
150
-
151
- async def _set_desired_status(
152
- self, desired_status: FunctionExecutorStatusProto
153
- ) -> None:
154
- """Updates the desired Function Executor status.
155
-
156
- Reconciliation is done asynchronously. Doesn't raise any exceptions.
157
- """
158
- async with self._lock:
159
- if self._desired_status == desired_status:
160
- return
161
- self._desired_status = desired_status
162
- self._desired_status_change_notifier.notify_all()
163
-
164
- async def _reconciliation_loop(self) -> None:
165
- self._logger.info("function executor controller reconciliation loop started")
166
- # The same as the initial FE status.
167
- last_seen_desired_status: FunctionExecutorStatusProto = (
168
- FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STOPPED
169
- )
170
- # The loop is exited via loop async task cancellation on FE shutdown.
171
- while True:
172
- async with self._lock:
173
- while last_seen_desired_status == self._desired_status:
174
- await self._desired_status_change_notifier.wait()
175
-
176
- last_seen_desired_status = self._desired_status
177
- # _reconcile never runs concurrently with itself: this loop is its only caller and awaits each call.
178
- await self._reconcile(last_seen_desired_status)
179
-
180
- async def _reconcile(self, desired_status: FunctionExecutorStatusProto) -> None:
181
- """Reconciles the FE status with the desired status.
182
-
183
- Doesn't raise any exceptions."""
184
- async with self._function_executor_state.lock:
185
- if (
186
- desired_status
187
- == FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_IDLE
188
- ):
189
- return await self._startup()
190
- elif (
191
- desired_status
192
- == FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_SHUTDOWN
193
- ):
194
- # Shutdown can be requested with any current status.
195
- return await self._shutdown()
196
- else:
197
- self._logger.error(
198
- "unexpected desired function executor status received from server, skipping state reconciliation",
199
- current_status=self._function_executor_state.status.name,
200
- desired_status=FunctionExecutorStatusProto.Name(desired_status),
201
- )
202
-
203
- async def _shutdown(self) -> None:
204
- """Shuts down the Function Executor and frees all of its resources.
205
-
206
- Caller holds the FE state lock. Doesn't raise any exceptions.
207
- """
208
- # Run destroy sequence if current FE status requires it (see allowed FE status transitions).
209
- # We won't see DESTROYING and STARTING_UP statuses here because FE reconciliation is done
210
- # with concurrency of 1.
211
- if self._function_executor_state.status in [
212
- FunctionExecutorStatus.STARTUP_FAILED_PLATFORM_ERROR,
213
- FunctionExecutorStatus.STARTUP_FAILED_CUSTOMER_ERROR,
214
- FunctionExecutorStatus.IDLE,
215
- FunctionExecutorStatus.RUNNING_TASK,
216
- FunctionExecutorStatus.UNHEALTHY,
217
- ]:
218
- await self._function_executor_state.set_status(
219
- FunctionExecutorStatus.DESTROYING
220
- )
221
- if self._function_executor_state.function_executor is not None:
222
- async with _UnlockedLockContextManager(
223
- self._function_executor_state.lock
224
- ):
225
- await self._function_executor_state.function_executor.destroy()
226
- await self._function_executor_state.set_status(
227
- FunctionExecutorStatus.DESTROYED
228
- )
229
- self._function_executor_state.function_executor = None
230
-
231
- self._logger.info("shutting down function executor controller")
232
- await self._function_executor_state.set_status(FunctionExecutorStatus.SHUTDOWN)
233
- self._reconciliation_loop_task.cancel()
234
- # No code is executed after this point because reconciliation loop aio task is cancelled.
235
-
236
- async def _startup(self) -> None:
237
- """Starts up the FE if possible.
238
-
239
- Caller holds the FE state lock. Doesn't raise any exceptions.
240
- """
241
- if self._function_executor_state.status != FunctionExecutorStatus.DESTROYED:
242
- self._logger.error(
243
- "Can't startup Function Executor from its current state, skipping startup",
244
- current_status=self._function_executor_state.status.name,
245
- )
246
- return
247
-
248
- await self._function_executor_state.set_status(
249
- FunctionExecutorStatus.STARTING_UP
250
- )
251
-
252
- next_status: FunctionExecutorStatus = FunctionExecutorStatus.IDLE
253
- async with _UnlockedLockContextManager(self._function_executor_state.lock):
254
- try:
255
- function_executor: FunctionExecutor = await _create_function_executor(
256
- function_executor_description=self._function_executor_description,
257
- function_executor_server_factory=self._function_executor_server_factory,
258
- downloader=self._downloader,
259
- executor_id=self._executor_id,
260
- base_url=self._base_url,
261
- config_path=self._config_path,
262
- logger=self._logger,
263
- )
264
- except CustomerError as e:
265
- next_status = FunctionExecutorStatus.STARTUP_FAILED_CUSTOMER_ERROR
266
- # TODO: Save stdout and stderr of customer code that ran during FE creation into BLOBs and uncomment the corresponding tests.
267
- self._logger.error(
268
- "failed to create function executor due to error in customer code",
269
- exc_info=e,
270
- )
271
- except Exception as e:
272
- next_status = FunctionExecutorStatus.STARTUP_FAILED_PLATFORM_ERROR
273
- self._logger.error(
274
- "failed to create function executor due to platform error",
275
- exc_info=e,
276
- )
277
-
278
- # FE state lock is acquired again at this point.
279
- await self._function_executor_state.set_status(next_status)
280
-
281
- if next_status == FunctionExecutorStatus.IDLE:
282
- # Task controllers will notice that this FE is IDLE and start running on it one by one.
283
- self._function_executor_state.function_executor = function_executor
284
- # Health checker starts after FE creation and gets automatically stopped on FE destroy.
285
- self._function_executor_state.function_executor.health_checker().start(
286
- self._health_check_failed_callback
287
- )
288
-
289
- async def _health_check_failed_callback(self, result: HealthCheckResult):
290
- async with self._function_executor_state.lock:
291
- if self._function_executor_state.status == FunctionExecutorStatus.UNHEALTHY:
292
- return
293
-
294
- # There can be false positive health check failures when we're creating
295
- # or destroying FEs so we only react to health check failures when we expect
296
- # the FE to be healthy.
297
- if self._function_executor_state.status not in (
298
- FunctionExecutorStatus.IDLE,
299
- FunctionExecutorStatus.RUNNING_TASK,
300
- ):
301
- return
302
-
303
- await self._function_executor_state.set_status(
304
- FunctionExecutorStatus.UNHEALTHY
305
- )
306
- function_executor: FunctionExecutor = (
307
- self._function_executor_state.function_executor
308
- )
309
- self._function_executor_state.function_executor = None
310
-
311
- self._logger.error(
312
- "Function Executor health check failed, destroying Function Executor",
313
- health_check_fail_reason=result.reason,
314
- )
315
- # Destroy the unhealthy FE asap so it doesn't consume resources.
316
- # Do it with unlocked state lock to not stop other work on this FE state.
317
- await function_executor.destroy()
318
-
319
-
320
async def _create_function_executor(
    function_executor_description: FunctionExecutorDescription,
    function_executor_server_factory: FunctionExecutorServerFactory,
    downloader: Downloader,
    executor_id: str,
    base_url: str,
    config_path: str,
    logger: Any,
) -> FunctionExecutor:
    """Downloads the graph, then creates and initializes a Function Executor.

    Raises Exception in case of failure.
    Raises CustomerError if customer code failed during FE creation.
    """
    description = function_executor_description

    # The description's own graph payload is passed along only if it is set.
    graph_payload = description.graph if description.HasField("graph") else None
    graph: SerializedObject = await downloader.download_graph(
        namespace=description.namespace,
        graph_name=description.graph_name,
        graph_version=description.graph_version,
        logger=logger,
        data_payload=graph_payload,
    )

    # Start from the "nothing requested" configuration and then overwrite it
    # with whatever optional settings the description actually carries.
    server_config: FunctionExecutorServerConfiguration = (
        FunctionExecutorServerConfiguration(
            executor_id=executor_id,
            function_executor_id=description.id,
            namespace=description.namespace,
            graph_name=description.graph_name,
            graph_version=description.graph_version,
            function_name=description.function_name,
            image_uri=None,
            secret_names=list(description.secret_names),
            cpu_ms_per_sec=None,
            memory_bytes=None,
            disk_bytes=None,
            gpu_count=0,
        )
    )
    if description.HasField("image_uri"):
        server_config.image_uri = description.image_uri
    if description.HasField("resources"):
        requested: FunctionExecutorResources = description.resources
        for attribute in ("cpu_ms_per_sec", "memory_bytes", "disk_bytes", "gpu_count"):
            setattr(server_config, attribute, getattr(requested, attribute))

    init_request: InitializeRequest = InitializeRequest(
        namespace=description.namespace,
        graph_name=description.graph_name,
        graph_version=description.graph_version,
        function_name=description.function_name,
        graph=graph,
    )
    # The description stores the timeout in milliseconds, initialize() takes seconds.
    timeout_sec: Optional[float] = (
        description.customer_code_timeout_ms / 1000.0
        if description.HasField("customer_code_timeout_ms")
        else None
    )

    new_executor: FunctionExecutor = FunctionExecutor(
        server_factory=function_executor_server_factory, logger=logger
    )
    try:
        # Raises CustomerError if initialization failed in customer code or customer code timed out.
        await new_executor.initialize(
            config=server_config,
            initialize_request=init_request,
            base_url=base_url,
            config_path=config_path,
            customer_code_timeout_sec=timeout_sec,
        )
    except (Exception, asyncio.CancelledError):
        # Destroy the FE asap both when its startup failed and when its
        # initialization got cancelled, so allocated resources are not leaked.
        await new_executor.destroy()
        raise
    return new_executor
402
-
403
-
404
- class _UnlockedLockContextManager:
405
- """Unlocks its lock on enter to the scope and locks it back on exit."""
406
-
407
- def __init__(
408
- self,
409
- lock: asyncio.Lock,
410
- ):
411
- self._lock: asyncio.Lock = lock
412
-
413
- async def __aenter__(self):
414
- self._lock.release()
415
- return self
416
-
417
- async def __aexit__(self, exc_type, exc_val, exc_tb):
418
- await self._lock.acquire()
@@ -1,8 +0,0 @@
1
- import prometheus_client
2
-
3
- from ...monitoring.metrics import latency_metric_for_fast_operation
4
-
5
# Counter of task cancellations.
metric_task_cancellations = prometheus_client.Counter(
    name="task_cancellations",
    documentation="Number of times a task was cancelled",
)