indexify-0.3.18-py3-none-any.whl → indexify-0.3.20-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. indexify/cli/cli.py +15 -17
  2. indexify/executor/api_objects.py +12 -0
  3. indexify/executor/blob_store/blob_store.py +69 -0
  4. indexify/executor/blob_store/local_fs_blob_store.py +48 -0
  5. indexify/executor/blob_store/metrics/blob_store.py +33 -0
  6. indexify/executor/blob_store/s3_blob_store.py +85 -0
  7. indexify/executor/downloader.py +149 -25
  8. indexify/executor/executor.py +77 -41
  9. indexify/executor/function_executor/function_executor.py +24 -11
  10. indexify/executor/function_executor/function_executor_state.py +9 -1
  11. indexify/executor/function_executor/function_executor_states_container.py +8 -1
  12. indexify/executor/function_executor/function_executor_status.py +4 -0
  13. indexify/executor/function_executor/health_checker.py +7 -2
  14. indexify/executor/function_executor/invocation_state_client.py +4 -2
  15. indexify/executor/function_executor/server/subprocess_function_executor_server_factory.py +6 -0
  16. indexify/executor/function_executor/single_task_runner.py +15 -11
  17. indexify/executor/function_executor/task_output.py +36 -2
  18. indexify/executor/grpc/channel_manager.py +4 -3
  19. indexify/executor/grpc/function_executor_controller.py +391 -0
  20. indexify/executor/grpc/metrics/state_reconciler.py +17 -0
  21. indexify/executor/grpc/metrics/task_controller.py +8 -0
  22. indexify/executor/grpc/state_reconciler.py +324 -217
  23. indexify/executor/grpc/state_reporter.py +52 -41
  24. indexify/executor/grpc/task_controller.py +492 -0
  25. indexify/executor/metrics/task_reporter.py +14 -0
  26. indexify/executor/task_reporter.py +115 -6
  27. indexify/executor/task_runner.py +1 -0
  28. indexify/proto/executor_api.proto +91 -7
  29. indexify/proto/executor_api_pb2.py +49 -37
  30. indexify/proto/executor_api_pb2.pyi +158 -3
  31. indexify/proto/executor_api_pb2_grpc.py +47 -0
  32. {indexify-0.3.18.dist-info → indexify-0.3.20.dist-info}/METADATA +2 -1
  33. {indexify-0.3.18.dist-info → indexify-0.3.20.dist-info}/RECORD +35 -27
  34. {indexify-0.3.18.dist-info → indexify-0.3.20.dist-info}/WHEEL +0 -0
  35. {indexify-0.3.18.dist-info → indexify-0.3.20.dist-info}/entry_points.txt +0 -0
indexify/executor/grpc/function_executor_controller.py (new file)
@@ -0,0 +1,391 @@
+import asyncio
+from typing import Any, Optional
+
+from tensorlake.function_executor.proto.function_executor_pb2 import (
+    InitializeRequest,
+    SerializedObject,
+)
+from tensorlake.function_executor.proto.message_validator import MessageValidator
+
+from indexify.proto.executor_api_pb2 import (
+    FunctionExecutorDescription,
+)
+from indexify.proto.executor_api_pb2 import (
+    FunctionExecutorStatus as FunctionExecutorStatusProto,
+)
+
+from ..downloader import Downloader
+from ..function_executor.function_executor import CustomerError, FunctionExecutor
+from ..function_executor.function_executor_state import FunctionExecutorState
+from ..function_executor.function_executor_status import FunctionExecutorStatus
+from ..function_executor.health_checker import HealthCheckResult
+from ..function_executor.server.function_executor_server_factory import (
+    FunctionExecutorServerConfiguration,
+    FunctionExecutorServerFactory,
+)
+
+
+def validate_function_executor_description(
+    function_executor_description: FunctionExecutorDescription,
+) -> None:
+    """Validates the supplied FE description.
+
+    Raises ValueError if the description is not valid.
+    """
+    validator = MessageValidator(function_executor_description)
+    validator.required_field("id")
+    validator.required_field("namespace")
+    validator.required_field("graph_name")
+    validator.required_field("graph_version")
+    validator.required_field("function_name")
+    # TODO: Make graph required after we migrate to direct S3 downloads.
+    # image_uri is optional.
+    # secret_names can be empty.
+    # resource_limits is optional.
+
+
+def function_executor_logger(
+    function_executor_description: FunctionExecutorDescription, logger: Any
+) -> Any:
+    """Returns a logger bound with the FE's metadata.
+
+    The function assumes that the FE might be invalid."""
+    return logger.bind(
+        function_executor_id=(
+            function_executor_description.id
+            if function_executor_description.HasField("id")
+            else None
+        ),
+        namespace=(
+            function_executor_description.namespace
+            if function_executor_description.HasField("namespace")
+            else None
+        ),
+        graph_name=(
+            function_executor_description.graph_name
+            if function_executor_description.HasField("graph_name")
+            else None
+        ),
+        graph_version=(
+            function_executor_description.graph_version
+            if function_executor_description.HasField("graph_version")
+            else None
+        ),
+        function_name=(
+            function_executor_description.function_name
+            if function_executor_description.HasField("function_name")
+            else None
+        ),
+    )
+
+
+class FunctionExecutorController:
+    def __init__(
+        self,
+        executor_id: str,
+        function_executor_state: FunctionExecutorState,
+        function_executor_description: FunctionExecutorDescription,
+        function_executor_server_factory: FunctionExecutorServerFactory,
+        downloader: Downloader,
+        base_url: str,
+        config_path: str,
+        logger: Any,
+    ):
+        """Initializes the FunctionExecutorController.
+
+        The supplied FunctionExecutorDescription must already be validated by the caller
+        using validate_function_executor_description().
+        """
+        self._executor_id: str = executor_id
+        self._function_executor_state: FunctionExecutorState = function_executor_state
+        self._function_executor_description: FunctionExecutorDescription = (
+            function_executor_description
+        )
+        self._function_executor_server_factory: FunctionExecutorServerFactory = (
+            function_executor_server_factory
+        )
+        self._downloader: Downloader = downloader
+        self._base_url: str = base_url
+        self._config_path: str = config_path
+        self._logger: Any = function_executor_logger(
+            function_executor_description, logger
+        ).bind(
+            module=__name__,
+        )
+        # The lock protects the desired status.
+        self._lock: asyncio.Lock = asyncio.Lock()
+        # The same as the initial FE status.
+        self._desired_status: FunctionExecutorStatusProto = (
+            FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STOPPED
+        )
+        self._desired_status_change_notifier: asyncio.Condition = asyncio.Condition(
+            lock=self._lock
+        )
+        # Automatically start the controller on creation.
+        self._reconciliation_loop_task: asyncio.Task = asyncio.create_task(
+            self._reconciliation_loop(),
+            name="function executor controller reconciliation loop",
+        )
+
+    def function_executor_description(self) -> FunctionExecutorDescription:
+        return self._function_executor_description
+
+    async def startup(self) -> None:
+        await self._set_desired_status(
+            FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_IDLE
+        )
+
+    async def shutdown(self) -> None:
+        await self._set_desired_status(
+            FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_SHUTDOWN
+        )
+
+    async def _set_desired_status(
+        self, desired_status: FunctionExecutorStatusProto
+    ) -> None:
+        """Updates the desired Function Executor status.
+
+        Reconciliation is done asynchronously. Doesn't raise any exceptions.
+        """
+        async with self._lock:
+            if self._desired_status == desired_status:
+                return
+            self._desired_status = desired_status
+            self._desired_status_change_notifier.notify_all()
+
+    async def _reconciliation_loop(self) -> None:
+        self._logger.info("function executor controller reconciliation loop started")
+        # The same as the initial FE status.
+        last_seen_desired_status: FunctionExecutorStatusProto = (
+            FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STOPPED
+        )
+        # The loop is exited via loop async task cancellation on FE shutdown.
+        while True:
+            async with self._lock:
+                while last_seen_desired_status == self._desired_status:
+                    await self._desired_status_change_notifier.wait()
+
+                last_seen_desired_status = self._desired_status
+            # It's guaranteed that we don't run _reconcile concurrently multiple times.
+            await self._reconcile(last_seen_desired_status)
+
+    async def _reconcile(self, desired_status: FunctionExecutorStatusProto) -> None:
+        """Reconciles the FE status with the desired status.
+
+        Doesn't raise any exceptions."""
+        async with self._function_executor_state.lock:
+            if (
+                desired_status
+                == FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_IDLE
+            ):
+                return await self._startup()
+            elif (
+                desired_status
+                == FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_SHUTDOWN
+            ):
+                # Shutdown can be requested with any current status.
+                return await self._shutdown()
+            else:
+                self._logger.error(
+                    "unexpected desired function executor status received from server, skipping state reconciliation",
+                    current_status=self._function_executor_state.status.name,
+                    desired_status=FunctionExecutorStatusProto.Name(desired_status),
+                )
+
+    async def _shutdown(self) -> None:
+        """Shuts down the Function Executor and frees all of its resources.
+
+        Caller holds the FE state lock. Doesn't raise any exceptions.
+        """
+        # Run the destroy sequence if the current FE status requires it (see allowed FE status transitions).
+        # We won't see DESTROYING and STARTING_UP statuses here because FE reconciliation is done
+        # with concurrency of 1.
+        if self._function_executor_state.status in [
+            FunctionExecutorStatus.STARTUP_FAILED_PLATFORM_ERROR,
+            FunctionExecutorStatus.STARTUP_FAILED_CUSTOMER_ERROR,
+            FunctionExecutorStatus.IDLE,
+            FunctionExecutorStatus.RUNNING_TASK,
+            FunctionExecutorStatus.UNHEALTHY,
+        ]:
+            await self._function_executor_state.set_status(
+                FunctionExecutorStatus.DESTROYING
+            )
+            if self._function_executor_state.function_executor is not None:
+                async with _UnlockedLockContextManager(
+                    self._function_executor_state.lock
+                ):
+                    await self._function_executor_state.function_executor.destroy()
+            await self._function_executor_state.set_status(
+                FunctionExecutorStatus.DESTROYED
+            )
+            self._function_executor_state.function_executor = None
+
+        self._logger.info("shutting down function executor controller")
+        await self._function_executor_state.set_status(FunctionExecutorStatus.SHUTDOWN)
+        self._reconciliation_loop_task.cancel()
+        # No code is executed after this point because the reconciliation loop aio task is cancelled.
+
+    async def _startup(self) -> None:
+        """Starts up the FE if possible.
+
+        Caller holds the FE state lock. Doesn't raise any exceptions.
+        """
+        if self._function_executor_state.status != FunctionExecutorStatus.DESTROYED:
+            self._logger.error(
+                "Can't startup Function Executor from its current state, skipping startup",
+                current_status=self._function_executor_state.status.name,
+            )
+            return
+
+        await self._function_executor_state.set_status(
+            FunctionExecutorStatus.STARTING_UP
+        )
+
+        next_status: FunctionExecutorStatus = FunctionExecutorStatus.IDLE
+        next_status_message: str = ""
+        async with _UnlockedLockContextManager(self._function_executor_state.lock):
+            try:
+                function_executor: FunctionExecutor = await _create_function_executor(
+                    function_executor_description=self._function_executor_description,
+                    function_executor_server_factory=self._function_executor_server_factory,
+                    downloader=self._downloader,
+                    executor_id=self._executor_id,
+                    base_url=self._base_url,
+                    config_path=self._config_path,
+                    logger=self._logger,
+                )
+            except CustomerError as e:
+                next_status = FunctionExecutorStatus.STARTUP_FAILED_CUSTOMER_ERROR
+                next_status_message = str(e)
+            except Exception as e:
+                next_status = FunctionExecutorStatus.STARTUP_FAILED_PLATFORM_ERROR
+                self._logger.error("failed to create function executor", exc_info=e)
+
+        # FE state lock is acquired again at this point.
+        await self._function_executor_state.set_status(next_status, next_status_message)
+
+        if next_status == FunctionExecutorStatus.IDLE:
+            # Task controllers will notice that this FE is IDLE and start running on it one by one.
+            self._function_executor_state.function_executor = function_executor
+            # Health checker starts after FE creation and gets automatically stopped on FE destroy.
+            self._function_executor_state.function_executor.health_checker().start(
+                self._health_check_failed_callback
+            )
+
+    async def _health_check_failed_callback(self, result: HealthCheckResult):
+        async with self._function_executor_state.lock:
+            if self._function_executor_state.status == FunctionExecutorStatus.UNHEALTHY:
+                return
+
+            # There can be false positive health check failures when we're creating
+            # or destroying FEs so we only react to health check failures when we expect
+            # the FE to be healthy.
+            if self._function_executor_state.status not in (
+                FunctionExecutorStatus.IDLE,
+                FunctionExecutorStatus.RUNNING_TASK,
+            ):
+                return
+
+            await self._function_executor_state.set_status(
+                FunctionExecutorStatus.UNHEALTHY
+            )
+            function_executor: FunctionExecutor = (
+                self._function_executor_state.function_executor
+            )
+            self._function_executor_state.function_executor = None
+
+        self._logger.error(
+            "Function Executor health check failed, destroying Function Executor",
+            health_check_fail_reason=result.reason,
+        )
+        # Destroy the unhealthy FE asap so it doesn't consume resources.
+        # Do it with the state lock unlocked to not stop other work on this FE state.
+        await function_executor.destroy()
+
+
+async def _create_function_executor(
+    function_executor_description: FunctionExecutorDescription,
+    function_executor_server_factory: FunctionExecutorServerFactory,
+    downloader: Downloader,
+    executor_id: str,
+    base_url: str,
+    config_path: str,
+    logger: Any,
+) -> FunctionExecutor:
+    """Creates a function executor.
+
+    Raises Exception in case of failure.
+    Raises CustomerError if customer code failed during FE creation.
+    """
+    graph: SerializedObject = await downloader.download_graph(
+        namespace=function_executor_description.namespace,
+        graph_name=function_executor_description.graph_name,
+        graph_version=function_executor_description.graph_version,
+        logger=logger,
+        data_payload=(
+            function_executor_description.graph
+            if function_executor_description.HasField("graph")
+            else None
+        ),
+    )
+
+    config: FunctionExecutorServerConfiguration = FunctionExecutorServerConfiguration(
+        executor_id=executor_id,
+        function_executor_id=function_executor_description.id,
+        namespace=function_executor_description.namespace,
+        image_uri=None,
+        secret_names=list(function_executor_description.secret_names),
+    )
+    if function_executor_description.HasField("image_uri"):
+        config.image_uri = function_executor_description.image_uri
+
+    initialize_request: InitializeRequest = InitializeRequest(
+        namespace=function_executor_description.namespace,
+        graph_name=function_executor_description.graph_name,
+        graph_version=function_executor_description.graph_version,
+        function_name=function_executor_description.function_name,
+        graph=graph,
+    )
+    customer_code_timeout_sec: Optional[float] = None
+    if function_executor_description.HasField("customer_code_timeout_ms"):
+        customer_code_timeout_sec = (
+            function_executor_description.customer_code_timeout_ms / 1000.0
+        )
+
+    function_executor: FunctionExecutor = FunctionExecutor(
+        server_factory=function_executor_server_factory, logger=logger
+    )
+
+    try:
+        # Raises CustomerError if initialization failed in customer code or customer code timed out.
+        await function_executor.initialize(
+            config=config,
+            initialize_request=initialize_request,
+            base_url=base_url,
+            config_path=config_path,
+            customer_code_timeout_sec=customer_code_timeout_sec,
+        )
+        return function_executor
+    except (Exception, asyncio.CancelledError):
+        # Destroy the FE that failed to start up asap so it doesn't consume resources.
+        # Also destroy the FE if its initialization got cancelled, to not leak
+        # allocated resources.
+        await function_executor.destroy()
+        raise
+
+
+class _UnlockedLockContextManager:
+    """Unlocks its lock on entering the scope and locks it back on exit."""
+
+    def __init__(
+        self,
+        lock: asyncio.Lock,
+    ):
+        self._lock: asyncio.Lock = lock
+
+    async def __aenter__(self):
+        self._lock.release()
+        return self
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        await self._lock.acquire()
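
The controller's public surface is small: construct it (which immediately starts the background reconciliation loop), call startup() to drive the desired status to IDLE, and shutdown() to tear everything down. A minimal driver sketch follows; it is illustrative only and not part of this diff. The executor_id, base_url, and config_path values are placeholders, and the description, state, factory, downloader, and logger objects are assumed to be built elsewhere in the Executor.

async def run_function_executor(description, state, factory, downloader, logger):
    # Descriptions must be validated before the controller is constructed;
    # validate_function_executor_description() raises ValueError on a bad one.
    validate_function_executor_description(description)
    controller = FunctionExecutorController(
        executor_id="executor-1",          # placeholder value
        function_executor_state=state,
        function_executor_description=description,
        function_executor_server_factory=factory,
        downloader=downloader,
        base_url="http://localhost:8900",  # placeholder value
        config_path="",                    # placeholder value
        logger=logger,
    )
    await controller.startup()   # desired status -> IDLE, reconciled asynchronously
    # ... task controllers run tasks while the FE is IDLE / RUNNING_TASK ...
    await controller.shutdown()  # desired status -> SHUTDOWN: FE destroyed, loop task cancelled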
indexify/executor/grpc/metrics/state_reconciler.py (new file)
@@ -0,0 +1,17 @@
+import prometheus_client
+
+from ...monitoring.metrics import latency_metric_for_fast_operation
+
+metric_state_reconciliations = prometheus_client.Counter(
+    "state_reconciliations",
+    "Number of Executor state reconciliations",
+)
+metric_state_reconciliation_errors = prometheus_client.Counter(
+    "state_reconciliation_errors",
+    "Number of Executor state reconciliation errors after all retries",
+)
+metric_state_reconciliation_latency: prometheus_client.Histogram = (
+    latency_metric_for_fast_operation(
+        "state_reconciliation", "Executor state reconciliation"
+    )
+)
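
These three metrics follow the usual attempt-counter / failure-counter / latency-histogram pattern. Below is a hedged sketch of how a reconciliation pass would typically record them; the reconcile() callable and the wrapper itself are placeholders rather than code from this package, while Counter.inc() and Histogram.time() are standard prometheus_client APIs. The import path assumes the caller sits next to the metrics package, as state_reconciler.py does.

from .metrics.state_reconciler import (
    metric_state_reconciliation_errors,
    metric_state_reconciliation_latency,
    metric_state_reconciliations,
)

async def reconcile_once(reconcile) -> None:
    # Count every attempt and time it; count an error only if the attempt fails.
    metric_state_reconciliations.inc()
    with metric_state_reconciliation_latency.time():
        try:
            await reconcile()
        except Exception:
            # In the real reconciler this is incremented only after all retries are exhausted.
            metric_state_reconciliation_errors.inc()
            raise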
indexify/executor/grpc/metrics/task_controller.py (new file)
@@ -0,0 +1,8 @@
+import prometheus_client
+
+from ...monitoring.metrics import latency_metric_for_fast_operation
+
+metric_task_cancellations = prometheus_client.Counter(
+    "task_cancellations",
+    "Number of times a task was cancelled",
+)
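
For context, a counter like this is typically incremented on the task controller's cancellation path. The wrapper below is purely illustrative and not taken from task_controller.py; the import path is an assumption.

import asyncio

from .metrics.task_controller import metric_task_cancellations

async def await_task(task_coro):
    try:
        return await task_coro
    except asyncio.CancelledError:
        # Record the cancellation before letting it propagate to the caller.
        metric_task_cancellations.inc()
        raise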