indexify 0.4.21__py3-none-any.whl → 0.4.23__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. indexify/cli/executor.py +2 -9
  2. indexify/executor/blob_store/blob_store.py +110 -26
  3. indexify/executor/blob_store/local_fs_blob_store.py +41 -1
  4. indexify/executor/blob_store/metrics/blob_store.py +87 -15
  5. indexify/executor/blob_store/s3_blob_store.py +112 -1
  6. indexify/executor/function_executor/function_executor.py +32 -56
  7. indexify/executor/function_executor/invocation_state_client.py +10 -3
  8. indexify/executor/function_executor/server/function_executor_server_factory.py +0 -1
  9. indexify/executor/function_executor_controller/create_function_executor.py +129 -116
  10. indexify/executor/function_executor_controller/downloads.py +34 -86
  11. indexify/executor/function_executor_controller/events.py +13 -7
  12. indexify/executor/function_executor_controller/finalize_task.py +184 -0
  13. indexify/executor/function_executor_controller/function_executor_controller.py +121 -78
  14. indexify/executor/function_executor_controller/message_validators.py +10 -3
  15. indexify/executor/function_executor_controller/metrics/downloads.py +8 -52
  16. indexify/executor/function_executor_controller/metrics/finalize_task.py +20 -0
  17. indexify/executor/function_executor_controller/metrics/prepare_task.py +18 -0
  18. indexify/executor/function_executor_controller/metrics/run_task.py +5 -4
  19. indexify/executor/function_executor_controller/prepare_task.py +232 -14
  20. indexify/executor/function_executor_controller/run_task.py +189 -81
  21. indexify/executor/function_executor_controller/task_info.py +4 -7
  22. indexify/executor/function_executor_controller/task_input.py +21 -0
  23. indexify/executor/function_executor_controller/task_output.py +41 -33
  24. indexify/executor/function_executor_controller/terminate_function_executor.py +6 -1
  25. indexify/executor/logging.py +69 -0
  26. indexify/executor/monitoring/metrics.py +22 -0
  27. indexify/proto/executor_api.proto +11 -3
  28. indexify/proto/executor_api_pb2.py +54 -54
  29. indexify/proto/executor_api_pb2.pyi +8 -1
  30. {indexify-0.4.21.dist-info → indexify-0.4.23.dist-info}/METADATA +6 -7
  31. {indexify-0.4.21.dist-info → indexify-0.4.23.dist-info}/RECORD +33 -31
  32. indexify/executor/function_executor_controller/function_executor_startup_output.py +0 -21
  33. indexify/executor/function_executor_controller/metrics/upload_task_output.py +0 -39
  34. indexify/executor/function_executor_controller/upload_task_output.py +0 -274
  35. {indexify-0.4.21.dist-info → indexify-0.4.23.dist-info}/WHEEL +0 -0
  36. {indexify-0.4.21.dist-info → indexify-0.4.23.dist-info}/entry_points.txt +0 -0
indexify/executor/function_executor_controller/metrics/finalize_task.py
@@ -0,0 +1,20 @@
+import prometheus_client
+
+from indexify.executor.monitoring.metrics import latency_metric_for_fast_operation
+
+# Task finalization metrics.
+metric_task_finalizations: prometheus_client.Counter = prometheus_client.Counter(
+    "task_finalizations",
+    "Number of task finalizations",
+)
+metric_task_finalization_errors: prometheus_client.Counter = prometheus_client.Counter(
+    "task_finalization_errors",
+    "Number of task finalization errors",
+)
+metric_tasks_finalizing: prometheus_client.Gauge = prometheus_client.Gauge(
+    "tasks_finalizing",
+    "Number of tasks currently finalizing",
+)
+metric_task_finalization_latency: prometheus_client.Histogram = (
+    latency_metric_for_fast_operation("task_finalization", "task finalization")
+)
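
The new finalization metrics follow the same four-part pattern used elsewhere in the executor: a total counter, an error counter, an in-progress gauge, and a latency histogram. A minimal sketch of how finalize_task.py presumably wires them together, copied from the usage the prepare_task.py hunk further down shows for the preparation metrics; the body of finalize_task.py itself (+184 lines) is not included in this diff, and the function and variable names below are stand-ins:

import prometheus_client

# Stand-ins matching the metric definitions in the hunk above.
finalizations = prometheus_client.Counter(
    "task_finalizations", "Number of task finalizations"
)
finalization_errors = prometheus_client.Counter(
    "task_finalization_errors", "Number of task finalization errors"
)
tasks_finalizing = prometheus_client.Gauge(
    "tasks_finalizing", "Number of tasks currently finalizing"
)
finalization_latency = prometheus_client.Histogram(
    "task_finalization_latency", "Latency of task finalization"
)


def finalize_task() -> None:
    # count_exceptions() bumps the error counter if the block raises,
    # track_inprogress() holds the gauge up for the duration,
    # time() records wall-clock duration into the histogram.
    with (
        finalization_errors.count_exceptions(),
        tasks_finalizing.track_inprogress(),
        finalization_latency.time(),
    ):
        finalizations.inc()
        ...  # the actual finalization work goes here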
indexify/executor/function_executor_controller/metrics/prepare_task.py
@@ -0,0 +1,18 @@
+import prometheus_client
+
+from indexify.executor.monitoring.metrics import latency_metric_for_fast_operation
+
+metric_task_preparations: prometheus_client.Counter = prometheus_client.Counter(
+    "task_preparations", "Number of task preparations for execution"
+)
+metric_task_preparation_errors: prometheus_client.Counter = prometheus_client.Counter(
+    "task_preparation_errors", "Number of task preparation errors"
+)
+metric_task_preparation_latency: prometheus_client.Histogram = (
+    latency_metric_for_fast_operation(
+        "task_preparation", "task preparation for execution"
+    )
+)
+metric_tasks_getting_prepared: prometheus_client.Gauge = prometheus_client.Gauge(
+    "tasks_getting_prepared", "Number of tasks currently getting prepared for execution"
+)
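
Both new metrics modules import latency_metric_for_fast_operation from indexify/executor/monitoring/metrics.py, which gains 22 lines in this release but whose body is not shown here. A hypothetical reconstruction, assuming it is a thin Histogram factory with buckets biased toward sub-second operations; the real metric name suffix and bucket boundaries may differ:

import prometheus_client


def latency_metric_for_fast_operation(
    name: str, description: str
) -> prometheus_client.Histogram:
    # Hypothetical sketch: bucket boundaries tuned for operations
    # expected to complete in well under a second.
    return prometheus_client.Histogram(
        f"{name}_latency_seconds",
        f"Latency of {description} in seconds",
        buckets=(0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0),
    )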
indexify/executor/function_executor_controller/metrics/run_task.py
@@ -6,23 +6,24 @@ from indexify.executor.monitoring.metrics import (
 
 metric_function_executor_run_task_rpcs: prometheus_client.Counter = (
     prometheus_client.Counter(
-        "function_executor_run_task_rpcs", "Number of Function Executor run task RPCs"
+        "function_executor_run_task_rpcs",
+        "Number of Function Executor run task lifecycle RPC sequences",
     )
 )
 metric_function_executor_run_task_rpc_errors: prometheus_client.Counter = (
     prometheus_client.Counter(
         "function_executor_run_task_rpc_errors",
-        "Number of Function Executor run task RPC errors",
+        "Number of Function Executor run task lifecycle RPC errors",
     )
 )
 metric_function_executor_run_task_rpc_latency: prometheus_client.Histogram = (
     latency_metric_for_customer_controlled_operation(
-        "function_executor_run_task_rpc", "Function Executor run task RPC"
+        "function_executor_run_task_rpc", "Function Executor run task lifecycle RPC"
     )
 )
 metric_function_executor_run_task_rpcs_in_progress: prometheus_client.Gauge = (
     prometheus_client.Gauge(
         "function_executor_run_task_rpcs_in_progress",
-        "Number of Function Executor run task RPCs in progress",
+        "Number of Function Executor run task lifecycle RPCs in progress",
     )
 )
indexify/executor/function_executor_controller/prepare_task.py
@@ -1,38 +1,256 @@
-from typing import Any
+import asyncio
+import time
+from typing import Any, List, Optional
+
+from tensorlake.function_executor.proto.function_executor_pb2 import (
+    BLOB,
+    BLOBChunk,
+    FunctionInputs,
+    SerializedObjectInsideBLOB,
+)
 
 from indexify.executor.blob_store.blob_store import BLOBStore
+from indexify.proto.executor_api_pb2 import DataPayload, Task
 
-from .downloads import download_init_value, download_input
+from .downloads import serialized_object_manifest_from_data_payload_proto
 from .events import TaskPreparationFinished
+from .metrics.prepare_task import (
+    metric_task_preparation_errors,
+    metric_task_preparation_latency,
+    metric_task_preparations,
+    metric_tasks_getting_prepared,
+)
 from .task_info import TaskInfo
+from .task_input import TaskInput
+
+# The following constants are subject to S3 limits,
+# see https://docs.aws.amazon.com/AmazonS3/latest/userguide/qfacts.html.
+#
+# 7 days - max presigned URL validity duration and limit on max function duration
+_MAX_PRESIGNED_URI_EXPIRATION_SEC: int = 7 * 24 * 60 * 60
+# This chunk size gives the best performance with S3. Based on our benchmarking.
+_BLOB_OPTIMAL_CHUNK_SIZE_BYTES: int = 100 * 1024 * 1024  # 100 MB
+# Max output size with optimal chunks is 100 * 100 MB = 10 GB.
+# Each chunk requires a separate S3 presign operation, so we limit the number of optimal chunks to 100.
+# S3 presign operations are local, it typically takes 30 ms per 100 URLs.
+_OUTPUT_BLOB_OPTIMAL_CHUNKS_COUNT: int = 100
+# This chunk size gives ~20% slower performance with S3 compared to optimal. Based on our benchmarking.
+_OUTPUT_BLOB_SLOWER_CHUNK_SIZE_BYTES: int = 1 * 1024 * 1024 * 1024  # 1 GB
+# Max output size with slower chunks is 100 * 1 GB = 100 GB.
+_OUTPUT_BLOB_SLOWER_CHUNKS_COUNT: int = 100
+# Invocation error output is using a single chunk.
+_INVOCATION_ERROR_MAX_SIZE_BYTES: int = 10 * 1024 * 1024  # 10 MB
 
 
 async def prepare_task(
     task_info: TaskInfo, blob_store: BLOBStore, logger: Any
 ) -> TaskPreparationFinished:
-    """Prepares the task by downloading the input and init value if available.
+    """Prepares the task for execution.
 
+    If successful then the task is runnable.
     Doesn't raise any exceptions.
     """
     logger = logger.bind(module=__name__)
+    start_time = time.monotonic()
     try:
-        task_info.input = await download_input(
-            data_payload=task_info.allocation.task.input,
+        with (
+            metric_task_preparation_errors.count_exceptions(),
+            metric_tasks_getting_prepared.track_inprogress(),
+            metric_task_preparation_latency.time(),
+        ):
+            metric_task_preparations.inc()
+            task_info.input = await _prepare_task_input(
+                task_info=task_info,
+                blob_store=blob_store,
+                logger=logger,
+            )
+            logger.info(
+                "Task was prepared for execution",
+                duration=time.monotonic() - start_time,
+            )
+            return TaskPreparationFinished(
+                task_info=task_info,
+                is_success=True,
+            )
+    except asyncio.CancelledError:
+        return TaskPreparationFinished(task_info=task_info, is_success=False)
+    except BaseException as e:
+        logger.error(
+            "Failed to prepare task for execution",
+            exc_info=e,
+            duration=time.monotonic() - start_time,
+        )
+        return TaskPreparationFinished(task_info=task_info, is_success=False)
+
+
+async def _prepare_task_input(
+    task_info: TaskInfo, blob_store: BLOBStore, logger: Any
+) -> TaskInput:
+    """Prepares the task for execution.
+
+    Raises an exception on error.
+    """
+    task: Task = task_info.allocation.task
+    function_init_value_blob: Optional[BLOB] = None
+    function_init_value: Optional[SerializedObjectInsideBLOB] = None
+    if task.HasField("reducer_input"):
+        function_init_value_blob = await _presign_function_input_blob(
+            data_payload=task.reducer_input,
             blob_store=blob_store,
             logger=logger,
         )
+        function_init_value = _to_serialized_object_inside_blob(task.reducer_input)
+
+    function_outputs_blob_uri: str = (
+        f"{task.output_payload_uri_prefix}.{task_info.allocation.allocation_id}.output"
+    )
+    invocation_error_blob_uri: str = (
+        f"{task.invocation_error_payload_uri_prefix}.{task.graph_invocation_id}.inverr"
+    )
+
+    # The uploads are completed when finalizing the task.
+    function_outputs_blob_upload_id: Optional[str] = None
+    invocation_error_blob_upload_id: Optional[str] = None
+
+    try:
+        function_outputs_blob_upload_id = await blob_store.create_multipart_upload(
+            uri=function_outputs_blob_uri,
+            logger=logger,
+        )
+        invocation_error_blob_upload_id = await blob_store.create_multipart_upload(
+            uri=invocation_error_blob_uri,
+            logger=logger,
+        )
+    except BaseException:
+        if function_outputs_blob_upload_id is not None:
+            await blob_store.abort_multipart_upload(
+                uri=function_outputs_blob_uri,
+                upload_id=function_outputs_blob_upload_id,
+                logger=logger,
+            )
+        if invocation_error_blob_upload_id is not None:
+            await blob_store.abort_multipart_upload(
+                uri=invocation_error_blob_uri,
+                upload_id=invocation_error_blob_upload_id,
+                logger=logger,
+            )
+        raise
 
-    if task_info.allocation.task.HasField("reducer_input"):
-        task_info.init_value = await download_init_value(
-            data_payload=task_info.allocation.task.reducer_input,
+    return TaskInput(
+        function_inputs=FunctionInputs(
+            function_input_blob=await _presign_function_input_blob(
+                data_payload=task.input,
+                blob_store=blob_store,
+                logger=logger,
+            ),
+            function_input=_to_serialized_object_inside_blob(task.input),
+            function_init_value_blob=function_init_value_blob,
+            function_init_value=function_init_value,
+            function_outputs_blob=await _presign_function_outputs_blob(
+                uri=function_outputs_blob_uri,
+                upload_id=function_outputs_blob_upload_id,
                 blob_store=blob_store,
                 logger=logger,
+            ),
+            invocation_error_blob=await _presign_invocation_error_blob(
+                uri=invocation_error_blob_uri,
+                upload_id=invocation_error_blob_upload_id,
+                blob_store=blob_store,
+                logger=logger,
+            ),
+        ),
+        function_outputs_blob_uri=function_outputs_blob_uri,
+        function_outputs_blob_upload_id=function_outputs_blob_upload_id,
+        invocation_error_blob_uri=invocation_error_blob_uri,
+        invocation_error_blob_upload_id=invocation_error_blob_upload_id,
+    )
+
+
+async def _presign_function_input_blob(
+    data_payload: DataPayload, blob_store: BLOBStore, logger: Any
+) -> BLOB:
+    get_blob_uri: str = await blob_store.presign_get_uri(
+        uri=data_payload.uri,
+        expires_in_sec=_MAX_PRESIGNED_URI_EXPIRATION_SEC,
+        logger=logger,
+    )
+    chunks: List[BLOBChunk] = []
+
+    while len(chunks) * _BLOB_OPTIMAL_CHUNK_SIZE_BYTES < data_payload.size:
+        chunks.append(
+            BLOBChunk(
+                uri=get_blob_uri,  # The URI allows to read any byte range in the BLOB.
+                size=_BLOB_OPTIMAL_CHUNK_SIZE_BYTES,
+                # ETag is only set by FE when returning BLOBs to us
             )
+        )
 
-        return TaskPreparationFinished(task_info=task_info, is_success=True)
-    except Exception as e:
-        logger.error(
-            "Failed to prepare task",
-            exc_info=e,
+    return BLOB(
+        chunks=chunks,
+    )
+
+
+async def _presign_function_outputs_blob(
+    uri: str, upload_id: str, blob_store: BLOBStore, logger: Any
+) -> BLOB:
+    """Presigns the output blob for the task."""
+    chunks: List[BLOBChunk] = []
+
+    while len(chunks) != (
+        _OUTPUT_BLOB_OPTIMAL_CHUNKS_COUNT + _OUTPUT_BLOB_SLOWER_CHUNKS_COUNT
+    ):
+        upload_chunk_uri: str = await blob_store.presign_upload_part_uri(
+            uri=uri,
+            part_number=len(chunks) + 1,
+            upload_id=upload_id,
+            expires_in_sec=_MAX_PRESIGNED_URI_EXPIRATION_SEC,
+            logger=logger,
         )
-        return TaskPreparationFinished(task_info=task_info, is_success=False)
+
+        chunk_size: int = (
+            _BLOB_OPTIMAL_CHUNK_SIZE_BYTES
+            if len(chunks) < _OUTPUT_BLOB_OPTIMAL_CHUNKS_COUNT
+            else _OUTPUT_BLOB_SLOWER_CHUNK_SIZE_BYTES
+        )
+        chunks.append(
+            BLOBChunk(
+                uri=upload_chunk_uri,
+                size=chunk_size,
+                # ETag is only set by FE when returning BLOBs to us
+            )
+        )
+
+    return BLOB(
+        chunks=chunks,
+    )
+
+
+async def _presign_invocation_error_blob(
+    uri: str, upload_id: str, blob_store: BLOBStore, logger: Any
+) -> BLOB:
+    """Presigns the output blob for the invocation error."""
+    upload_chunk_uri: str = await blob_store.presign_upload_part_uri(
+        uri=uri,
+        part_number=1,
+        upload_id=upload_id,
+        expires_in_sec=_MAX_PRESIGNED_URI_EXPIRATION_SEC,
+        logger=logger,
+    )
+    return BLOB(
+        chunks=[
+            BLOBChunk(
+                uri=upload_chunk_uri,
+                size=_INVOCATION_ERROR_MAX_SIZE_BYTES,
+                # ETag is only set by FE when returning BLOBs to us
+            )
+        ]
+    )
+
+
+def _to_serialized_object_inside_blob(
+    data_payload: DataPayload,
+) -> SerializedObjectInsideBLOB:
+    return SerializedObjectInsideBLOB(
+        manifest=serialized_object_manifest_from_data_payload_proto(data_payload),
+        offset=data_payload.offset,
+    )
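
The chunk constants above encode a two-tier schedule: _presign_function_outputs_blob presigns all 200 part URLs during task preparation, so the Function Executor can upload outputs straight to the blob store without a round trip back to the executor. Parts 1-100 use the 100 MB chunk size that benchmarked fastest; parts 101-200 fall back to 1 GB chunks, trading roughly 20% throughput for a much higher size ceiling. A small self-contained sketch of the same schedule and its totals (names like chunk_size_for_part are illustrative; the constants are copied from the diff):

# Mirrors the chunk-size schedule in _presign_function_outputs_blob.
OPTIMAL_CHUNK_BYTES = 100 * 1024 * 1024      # _BLOB_OPTIMAL_CHUNK_SIZE_BYTES
OPTIMAL_CHUNKS = 100                         # _OUTPUT_BLOB_OPTIMAL_CHUNKS_COUNT
SLOWER_CHUNK_BYTES = 1 * 1024 * 1024 * 1024  # _OUTPUT_BLOB_SLOWER_CHUNK_SIZE_BYTES
SLOWER_CHUNKS = 100                          # _OUTPUT_BLOB_SLOWER_CHUNKS_COUNT


def chunk_size_for_part(part_number: int) -> int:
    # Same ternary as the diff: optimal size first, slower size after.
    if part_number <= OPTIMAL_CHUNKS:
        return OPTIMAL_CHUNK_BYTES
    return SLOWER_CHUNK_BYTES


schedule = [
    chunk_size_for_part(n) for n in range(1, OPTIMAL_CHUNKS + SLOWER_CHUNKS + 1)
]
fast_tier = sum(schedule[:OPTIMAL_CHUNKS])  # ~10 GB at the fastest chunk size
ceiling = sum(schedule)                     # ~110 GB max output per task
print(f"{fast_tier / 1024**3:.1f} GiB fast tier, {ceiling / 1024**3:.1f} GiB ceiling")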