indexify 0.4.22__py3-none-any.whl → 0.4.24__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. indexify/cli/executor.py +2 -9
  2. indexify/executor/blob_store/blob_store.py +110 -26
  3. indexify/executor/blob_store/local_fs_blob_store.py +41 -1
  4. indexify/executor/blob_store/metrics/blob_store.py +87 -15
  5. indexify/executor/blob_store/s3_blob_store.py +112 -1
  6. indexify/executor/function_executor/function_executor.py +32 -56
  7. indexify/executor/function_executor/invocation_state_client.py +10 -3
  8. indexify/executor/function_executor/server/function_executor_server_factory.py +0 -1
  9. indexify/executor/function_executor_controller/create_function_executor.py +129 -116
  10. indexify/executor/function_executor_controller/downloads.py +34 -86
  11. indexify/executor/function_executor_controller/events.py +13 -7
  12. indexify/executor/function_executor_controller/finalize_task.py +184 -0
  13. indexify/executor/function_executor_controller/function_executor_controller.py +121 -78
  14. indexify/executor/function_executor_controller/message_validators.py +10 -3
  15. indexify/executor/function_executor_controller/metrics/downloads.py +8 -52
  16. indexify/executor/function_executor_controller/metrics/finalize_task.py +20 -0
  17. indexify/executor/function_executor_controller/metrics/prepare_task.py +18 -0
  18. indexify/executor/function_executor_controller/prepare_task.py +232 -14
  19. indexify/executor/function_executor_controller/run_task.py +77 -61
  20. indexify/executor/function_executor_controller/task_info.py +4 -7
  21. indexify/executor/function_executor_controller/task_input.py +21 -0
  22. indexify/executor/function_executor_controller/task_output.py +26 -35
  23. indexify/executor/function_executor_controller/terminate_function_executor.py +6 -1
  24. indexify/executor/logging.py +69 -0
  25. indexify/executor/monitoring/metrics.py +22 -0
  26. indexify/proto/executor_api.proto +11 -3
  27. indexify/proto/executor_api_pb2.py +54 -54
  28. indexify/proto/executor_api_pb2.pyi +8 -1
  29. {indexify-0.4.22.dist-info → indexify-0.4.24.dist-info}/METADATA +6 -6
  30. {indexify-0.4.22.dist-info → indexify-0.4.24.dist-info}/RECORD +32 -30
  31. indexify/executor/function_executor_controller/function_executor_startup_output.py +0 -21
  32. indexify/executor/function_executor_controller/metrics/upload_task_output.py +0 -39
  33. indexify/executor/function_executor_controller/upload_task_output.py +0 -274
  34. {indexify-0.4.22.dist-info → indexify-0.4.24.dist-info}/WHEEL +0 -0
  35. {indexify-0.4.22.dist-info → indexify-0.4.24.dist-info}/entry_points.txt +0 -0
indexify/executor/function_executor_controller/metrics/finalize_task.py

@@ -0,0 +1,20 @@
+ import prometheus_client
+
+ from indexify.executor.monitoring.metrics import latency_metric_for_fast_operation
+
+ # Task finalization metrics.
+ metric_task_finalizations: prometheus_client.Counter = prometheus_client.Counter(
+     "task_finalizations",
+     "Number of task finalizations",
+ )
+ metric_task_finalization_errors: prometheus_client.Counter = prometheus_client.Counter(
+     "task_finalization_errors",
+     "Number of task finalization errors",
+ )
+ metric_tasks_finalizing: prometheus_client.Gauge = prometheus_client.Gauge(
+     "tasks_finalizing",
+     "Number of tasks currently finalizing",
+ )
+ metric_task_finalization_latency: prometheus_client.Histogram = (
+     latency_metric_for_fast_operation("task_finalization", "task finalization")
+ )
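This module only declares the metric objects; the new finalize_task.py (added in this release but not shown in this diff view) is what drives them. As a rough, non-authoritative sketch of the usage pattern: these prometheus_client counters, gauges, and histograms are combined as context managers around the operation they measure, the same way prepare_task.py does further below. The finalization body here is a stand-in, not the package's real logic:

import asyncio

from indexify.executor.function_executor_controller.metrics.finalize_task import (
    metric_task_finalization_errors,
    metric_task_finalization_latency,
    metric_task_finalizations,
    metric_tasks_finalizing,
)


async def finalize_task_sketch() -> None:
    # Count every attempt, then let the context managers record exceptions,
    # the in-flight gauge, and wall-clock latency for the wrapped block.
    metric_task_finalizations.inc()
    with (
        metric_task_finalization_errors.count_exceptions(),
        metric_tasks_finalizing.track_inprogress(),
        metric_task_finalization_latency.time(),
    ):
        await asyncio.sleep(0)  # placeholder for the real finalization work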
indexify/executor/function_executor_controller/metrics/prepare_task.py

@@ -0,0 +1,18 @@
+ import prometheus_client
+
+ from indexify.executor.monitoring.metrics import latency_metric_for_fast_operation
+
+ metric_task_preparations: prometheus_client.Counter = prometheus_client.Counter(
+     "task_preparations", "Number of task preparations for execution"
+ )
+ metric_task_preparation_errors: prometheus_client.Counter = prometheus_client.Counter(
+     "task_preparation_errors", "Number of task preparation errors"
+ )
+ metric_task_preparation_latency: prometheus_client.Histogram = (
+     latency_metric_for_fast_operation(
+         "task_preparation", "task preparation for execution"
+     )
+ )
+ metric_tasks_getting_prepared: prometheus_client.Gauge = prometheus_client.Gauge(
+     "tasks_getting_prepared", "Number of tasks currently getting prepared for execution"
+ )
indexify/executor/function_executor_controller/prepare_task.py

@@ -1,38 +1,256 @@
- from typing import Any
+ import asyncio
+ import time
+ from typing import Any, List, Optional
+
+ from tensorlake.function_executor.proto.function_executor_pb2 import (
+     BLOB,
+     BLOBChunk,
+     FunctionInputs,
+     SerializedObjectInsideBLOB,
+ )

  from indexify.executor.blob_store.blob_store import BLOBStore
+ from indexify.proto.executor_api_pb2 import DataPayload, Task

- from .downloads import download_init_value, download_input
+ from .downloads import serialized_object_manifest_from_data_payload_proto
  from .events import TaskPreparationFinished
+ from .metrics.prepare_task import (
+     metric_task_preparation_errors,
+     metric_task_preparation_latency,
+     metric_task_preparations,
+     metric_tasks_getting_prepared,
+ )
  from .task_info import TaskInfo
+ from .task_input import TaskInput
+
+ # The following constants are subject to S3 limits,
+ # see https://docs.aws.amazon.com/AmazonS3/latest/userguide/qfacts.html.
+ #
+ # 7 days - max presigned URL validity duration and limit on max function duration
+ _MAX_PRESIGNED_URI_EXPIRATION_SEC: int = 7 * 24 * 60 * 60
+ # This chunk size gives the best performance with S3. Based on our benchmarking.
+ _BLOB_OPTIMAL_CHUNK_SIZE_BYTES: int = 100 * 1024 * 1024  # 100 MB
+ # Max output size with optimal chunks is 100 * 100 MB = 10 GB.
+ # Each chunk requires a separate S3 presign operation, so we limit the number of optimal chunks to 100.
+ # S3 presign operations are local, it typically takes 30 ms per 100 URLs.
+ _OUTPUT_BLOB_OPTIMAL_CHUNKS_COUNT: int = 100
+ # This chunk size gives ~20% slower performance with S3 compared to optimal. Based on our benchmarking.
+ _OUTPUT_BLOB_SLOWER_CHUNK_SIZE_BYTES: int = 1 * 1024 * 1024 * 1024  # 1 GB
+ # Max output size with slower chunks is 100 * 1 GB = 100 GB.
+ _OUTPUT_BLOB_SLOWER_CHUNKS_COUNT: int = 100
+ # Invocation error output is using a single chunk.
+ _INVOCATION_ERROR_MAX_SIZE_BYTES: int = 10 * 1024 * 1024  # 10 MB


  async def prepare_task(
      task_info: TaskInfo, blob_store: BLOBStore, logger: Any
  ) -> TaskPreparationFinished:
-     """Prepares the task by downloading the input and init value if available.
+     """Prepares the task for execution.

+     If successful then the task is runnable.
      Doesn't raise any exceptions.
      """
      logger = logger.bind(module=__name__)
+     start_time = time.monotonic()
      try:
-         task_info.input = await download_input(
-             data_payload=task_info.allocation.task.input,
+         with (
+             metric_task_preparation_errors.count_exceptions(),
+             metric_tasks_getting_prepared.track_inprogress(),
+             metric_task_preparation_latency.time(),
+         ):
+             metric_task_preparations.inc()
+             task_info.input = await _prepare_task_input(
+                 task_info=task_info,
+                 blob_store=blob_store,
+                 logger=logger,
+             )
+             logger.info(
+                 "Task was prepared for execution",
+                 duration=time.monotonic() - start_time,
+             )
+             return TaskPreparationFinished(
+                 task_info=task_info,
+                 is_success=True,
+             )
+     except asyncio.CancelledError:
+         return TaskPreparationFinished(task_info=task_info, is_success=False)
+     except BaseException as e:
+         logger.error(
+             "Failed to prepare task for execution",
+             exc_info=e,
+             duration=time.monotonic() - start_time,
+         )
+         return TaskPreparationFinished(task_info=task_info, is_success=False)
+
+
+ async def _prepare_task_input(
+     task_info: TaskInfo, blob_store: BLOBStore, logger: Any
+ ) -> TaskInput:
+     """Prepares the task for execution.
+
+     Raises an exception on error.
+     """
+     task: Task = task_info.allocation.task
+     function_init_value_blob: Optional[BLOB] = None
+     function_init_value: Optional[SerializedObjectInsideBLOB] = None
+     if task.HasField("reducer_input"):
+         function_init_value_blob = await _presign_function_input_blob(
+             data_payload=task.reducer_input,
              blob_store=blob_store,
              logger=logger,
          )
+         function_init_value = _to_serialized_object_inside_blob(task.reducer_input)
+
+     function_outputs_blob_uri: str = (
+         f"{task.output_payload_uri_prefix}.{task_info.allocation.allocation_id}.output"
+     )
+     invocation_error_blob_uri: str = (
+         f"{task.invocation_error_payload_uri_prefix}.{task.graph_invocation_id}.inverr"
+     )
+
+     # The uploads are completed when finalizing the task.
+     function_outputs_blob_upload_id: Optional[str] = None
+     invocation_error_blob_upload_id: Optional[str] = None
+
+     try:
+         function_outputs_blob_upload_id = await blob_store.create_multipart_upload(
+             uri=function_outputs_blob_uri,
+             logger=logger,
+         )
+         invocation_error_blob_upload_id = await blob_store.create_multipart_upload(
+             uri=invocation_error_blob_uri,
+             logger=logger,
+         )
+     except BaseException:
+         if function_outputs_blob_upload_id is not None:
+             await blob_store.abort_multipart_upload(
+                 uri=function_outputs_blob_uri,
+                 upload_id=function_outputs_blob_upload_id,
+                 logger=logger,
+             )
+         if invocation_error_blob_upload_id is not None:
+             await blob_store.abort_multipart_upload(
+                 uri=invocation_error_blob_uri,
+                 upload_id=invocation_error_blob_upload_id,
+                 logger=logger,
+             )
+         raise

-     if task_info.allocation.task.HasField("reducer_input"):
-         task_info.init_value = await download_init_value(
-             data_payload=task_info.allocation.task.reducer_input,
+     return TaskInput(
+         function_inputs=FunctionInputs(
+             function_input_blob=await _presign_function_input_blob(
+                 data_payload=task.input,
+                 blob_store=blob_store,
+                 logger=logger,
+             ),
+             function_input=_to_serialized_object_inside_blob(task.input),
+             function_init_value_blob=function_init_value_blob,
+             function_init_value=function_init_value,
+             function_outputs_blob=await _presign_function_outputs_blob(
+                 uri=function_outputs_blob_uri,
+                 upload_id=function_outputs_blob_upload_id,
                  blob_store=blob_store,
                  logger=logger,
+             ),
+             invocation_error_blob=await _presign_invocation_error_blob(
+                 uri=invocation_error_blob_uri,
+                 upload_id=invocation_error_blob_upload_id,
+                 blob_store=blob_store,
+                 logger=logger,
+             ),
+         ),
+         function_outputs_blob_uri=function_outputs_blob_uri,
+         function_outputs_blob_upload_id=function_outputs_blob_upload_id,
+         invocation_error_blob_uri=invocation_error_blob_uri,
+         invocation_error_blob_upload_id=invocation_error_blob_upload_id,
+     )
+
+
+ async def _presign_function_input_blob(
+     data_payload: DataPayload, blob_store: BLOBStore, logger: Any
+ ) -> BLOB:
+     get_blob_uri: str = await blob_store.presign_get_uri(
+         uri=data_payload.uri,
+         expires_in_sec=_MAX_PRESIGNED_URI_EXPIRATION_SEC,
+         logger=logger,
+     )
+     chunks: List[BLOBChunk] = []
+
+     while len(chunks) * _BLOB_OPTIMAL_CHUNK_SIZE_BYTES < data_payload.size:
+         chunks.append(
+             BLOBChunk(
+                 uri=get_blob_uri,  # The URI allows to read any byte range in the BLOB.
+                 size=_BLOB_OPTIMAL_CHUNK_SIZE_BYTES,
+                 # ETag is only set by FE when returning BLOBs to us
              )
+         )

-         return TaskPreparationFinished(task_info=task_info, is_success=True)
-     except Exception as e:
-         logger.error(
-             "Failed to prepare task",
-             exc_info=e,
+     return BLOB(
+         chunks=chunks,
+     )
+
+
+ async def _presign_function_outputs_blob(
+     uri: str, upload_id: str, blob_store: BLOBStore, logger: Any
+ ) -> BLOB:
+     """Presigns the output blob for the task."""
+     chunks: List[BLOBChunk] = []
+
+     while len(chunks) != (
+         _OUTPUT_BLOB_OPTIMAL_CHUNKS_COUNT + _OUTPUT_BLOB_SLOWER_CHUNKS_COUNT
+     ):
+         upload_chunk_uri: str = await blob_store.presign_upload_part_uri(
+             uri=uri,
+             part_number=len(chunks) + 1,
+             upload_id=upload_id,
+             expires_in_sec=_MAX_PRESIGNED_URI_EXPIRATION_SEC,
+             logger=logger,
          )
-         return TaskPreparationFinished(task_info=task_info, is_success=False)
+
+         chunk_size: int = (
+             _BLOB_OPTIMAL_CHUNK_SIZE_BYTES
+             if len(chunks) < _OUTPUT_BLOB_OPTIMAL_CHUNKS_COUNT
+             else _OUTPUT_BLOB_SLOWER_CHUNK_SIZE_BYTES
+         )
+         chunks.append(
+             BLOBChunk(
+                 uri=upload_chunk_uri,
+                 size=chunk_size,
+                 # ETag is only set by FE when returning BLOBs to us
+             )
+         )
+
+     return BLOB(
+         chunks=chunks,
+     )
+
+
+ async def _presign_invocation_error_blob(
+     uri: str, upload_id: str, blob_store: BLOBStore, logger: Any
+ ) -> BLOB:
+     """Presigns the output blob for the invocation error."""
+     upload_chunk_uri: str = await blob_store.presign_upload_part_uri(
+         uri=uri,
+         part_number=1,
+         upload_id=upload_id,
+         expires_in_sec=_MAX_PRESIGNED_URI_EXPIRATION_SEC,
+         logger=logger,
+     )
+     return BLOB(
+         chunks=[
+             BLOBChunk(
+                 uri=upload_chunk_uri,
+                 size=_INVOCATION_ERROR_MAX_SIZE_BYTES,
+                 # ETag is only set by FE when returning BLOBs to us
+             )
+         ]
+     )
+
+
+ def _to_serialized_object_inside_blob(
+     data_payload: DataPayload,
+ ) -> SerializedObjectInsideBLOB:
+     return SerializedObjectInsideBLOB(
+         manifest=serialized_object_manifest_from_data_payload_proto(data_payload),
+         offset=data_payload.offset,
+     )
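Taken together, the constants above define the shape of the presigned output BLOB: 100 parts of 100 MB followed by 100 parts of 1 GB, i.e. 200 presigned part URLs covering roughly 110 GB of maximum output per task allocation, well under S3's 10,000-part ceiling for a multipart upload, with every URL capped by the 7-day presign limit. A small standalone sketch of that arithmetic (plain Python, not code from the package):

# Reproduces the chunk plan implied by the constants in prepare_task.py.
OPTIMAL_CHUNK_BYTES = 100 * 1024 * 1024          # 100 MB
SLOWER_CHUNK_BYTES = 1 * 1024 * 1024 * 1024      # 1 GB
OPTIMAL_CHUNKS = 100
SLOWER_CHUNKS = 100

chunk_sizes = [OPTIMAL_CHUNK_BYTES] * OPTIMAL_CHUNKS + [SLOWER_CHUNK_BYTES] * SLOWER_CHUNKS

print(len(chunk_sizes))                          # 200 presigned part URLs
print(sum(chunk_sizes) / 1024**3)                # ~109.77 GiB max output per allocation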
indexify/executor/function_executor_controller/run_task.py

@@ -1,18 +1,17 @@
  import asyncio
- import os
- import random
  import time
  from typing import Any, Optional

  import grpc
  from tensorlake.function_executor.proto.function_executor_pb2 import (
+     BLOB,
      AwaitTaskProgress,
      AwaitTaskRequest,
      CreateTaskRequest,
      DeleteTaskRequest,
-     FunctionInputs,
-     SerializedObject,
+     SerializedObjectInsideBLOB,
      Task,
+     TaskDiagnostics,
  )
  from tensorlake.function_executor.proto.function_executor_pb2 import (
      TaskFailureReason as FETaskFailureReason,
@@ -47,10 +46,6 @@ from .metrics.run_task import (
  from .task_info import TaskInfo
  from .task_output import TaskMetrics, TaskOutput

- _ENABLE_INJECT_TASK_CANCELLATIONS = (
-     os.getenv("INDEXIFY_INJECT_TASK_CANCELLATIONS", "0") == "1"
- )
-
  _CREATE_TASK_TIMEOUT_SECS = 5
  _DELETE_TASK_TIMEOUT_SECS = 5

@@ -63,23 +58,31 @@ async def run_task_on_function_executor(
      Doesn't raise any exceptions.
      """
      logger = logger.bind(module=__name__)
+
+     if task_info.input is None:
+         logger.error(
+             "task input is None, this should never happen",
+         )
+         task_info.output = TaskOutput.internal_error(
+             allocation=task_info.allocation,
+             execution_start_time=None,
+             execution_end_time=None,
+         )
+         return TaskExecutionFinished(
+             task_info=task_info,
+             function_executor_termination_reason=None,
+         )
+
      task = Task(
-         task_id=task_info.allocation.task.id,
          namespace=task_info.allocation.task.namespace,
          graph_name=task_info.allocation.task.graph_name,
          graph_version=task_info.allocation.task.graph_version,
          function_name=task_info.allocation.task.function_name,
          graph_invocation_id=task_info.allocation.task.graph_invocation_id,
+         task_id=task_info.allocation.task.id,
          allocation_id=task_info.allocation.allocation_id,
-         request=FunctionInputs(function_input=task_info.input),
+         request=task_info.input.function_inputs,
      )
-     # Don't keep the input in memory after we started running the task.
-     task_info.input = None
-
-     if task_info.init_value is not None:
-         task.request.function_init_value.CopyFrom(task_info.init_value)
-         # Don't keep the init value in memory after we started running the task.
-         task_info.init_value = None

      function_executor.invocation_state_client().add_task_to_invocation_id_entry(
          task_id=task_info.allocation.task.id,
@@ -104,10 +107,13 @@
      # If this RPC failed due to customer code crashing the server we won't be
      # able to detect this. We'll treat this as our own error for now and thus
      # let the AioRpcError to be raised here.
-     timeout_sec = task_info.allocation.task.timeout_ms / 1000.0
+     timeout_sec: float = task_info.allocation.task.timeout_ms / 1000.0
      try:
+         # This aio task can only be cancelled during this await call.
          task_result = await _run_task_rpcs(task, function_executor, timeout_sec)

+         _process_task_diagnostics(task_result.diagnostics, logger)
+
          task_info.output = _task_output_from_function_executor_result(
              allocation=task_info.allocation,
              result=task_result,
@@ -123,7 +129,6 @@
          )
          task_info.output = TaskOutput.function_timeout(
              allocation=task_info.allocation,
-             timeout_sec=timeout_sec,
              execution_start_time=execution_start_time,
              execution_end_time=time.monotonic(),
          )
@@ -156,8 +161,8 @@
              execution_start_time=execution_start_time,
              execution_end_time=time.monotonic(),
          )
-
      except asyncio.CancelledError:
+         # Handle aio task cancellation during `await _run_task_rpcs`.
          # The task is still running in FE, we only cancelled the client-side RPC.
          function_executor_termination_reason = (
              FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_FUNCTION_CANCELLED
@@ -192,16 +197,21 @@
          task_info.output.outcome_code == TaskOutcomeCode.TASK_OUTCOME_CODE_FAILURE
          and function_executor_termination_reason is None
      ):
-         # Check if the task failed because the FE is unhealthy to prevent more tasks failing.
-         result: HealthCheckResult = await function_executor.health_checker().check()
-         if not result.is_healthy:
-             function_executor_termination_reason = (
-                 FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_UNHEALTHY
-             )
-             logger.error(
-                 "Function Executor health check failed after running task, shutting down Function Executor",
-                 health_check_fail_reason=result.reason,
-             )
+         try:
+             # Check if the task failed because the FE is unhealthy to prevent more tasks failing.
+             result: HealthCheckResult = await function_executor.health_checker().check()
+             if not result.is_healthy:
+                 function_executor_termination_reason = (
+                     FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_UNHEALTHY
+                 )
+                 logger.error(
+                     "Function Executor health check failed after running task, shutting down Function Executor",
+                     health_check_fail_reason=result.reason,
+                 )
+         except asyncio.CancelledError:
+             # The aio task was cancelled during the health check await.
+             # We can't conclude anything about the health of the FE here.
+             pass

      _log_task_execution_finished(output=task_info.output, logger=logger)

@@ -215,8 +225,7 @@ async def _run_task_rpcs(
      task: Task, function_executor: FunctionExecutor, timeout_sec: float
  ) -> TaskResult:
      """Runs the task, returning the result, reporting errors via exceptions."""
-
-     response: AwaitTaskProgress
+     task_result: Optional[TaskResult] = None
      channel: grpc.aio.Channel = function_executor.channel()
      fe_stub = FunctionExecutorStub(channel)

@@ -231,24 +240,19 @@
      try:
          while True:
              # Wait for next response with fresh timeout each time
-             response = await asyncio.wait_for(await_rpc.read(), timeout=timeout_sec)
-             if response.WhichOneof("response") == "task_result":
-                 # We're done waiting.
+             response: AwaitTaskProgress = await asyncio.wait_for(
+                 await_rpc.read(), timeout=timeout_sec
+             )
+
+             if response == grpc.aio.EOF:
+                 break
+             elif response.WhichOneof("response") == "task_result":
+                 task_result = response.task_result
                  break

              # NB: We don't actually check for other message types
              # here; any message from the FE is treated as an
              # indication that it's making forward progress.
-
-             if response == grpc.aio.EOF:
-                 # Protocol error: we should get a task_result before
-                 # we see the RPC complete.
-                 raise grpc.aio.AioRpcError(
-                     grpc.StatusCode.CANCELLED,
-                     None,
-                     None,
-                     "Function Executor didn't return function/task alloc response",
-                 )
      finally:
          # Cancel the outstanding RPC to ensure any resources in use
          # are cleaned up; note that this is idempotent (in case the
@@ -260,7 +264,15 @@
              DeleteTaskRequest(task_id=task.task_id), timeout=_DELETE_TASK_TIMEOUT_SECS
          )

-     return response.task_result
+     if task_result is None:
+         raise grpc.aio.AioRpcError(
+             grpc.StatusCode.CANCELLED,
+             None,
+             None,
+             "Function Executor didn't return function/task alloc result",
+         )
+
+     return task_result


  def _task_output_from_function_executor_result(
@@ -271,8 +283,6 @@
      logger: Any,
  ) -> TaskOutput:
      response_validator = MessageValidator(result)
-     response_validator.required_field("stdout")
-     response_validator.required_field("stderr")
      response_validator.required_field("outcome_code")

      metrics = TaskMetrics(counters={}, timers={})
@@ -285,7 +295,8 @@
          result.outcome_code, logger=logger
      )
      failure_reason: Optional[TaskFailureReason] = None
-     invocation_error_output: Optional[SerializedObject] = None
+     invocation_error_output: Optional[SerializedObjectInsideBLOB] = None
+     uploaded_invocation_error_blob: Optional[BLOB] = None

      if outcome_code == TaskOutcomeCode.TASK_OUTCOME_CODE_FAILURE:
          response_validator.required_field("failure_reason")
@@ -294,25 +305,22 @@
          )
          if failure_reason == TaskFailureReason.TASK_FAILURE_REASON_INVOCATION_ERROR:
              response_validator.required_field("invocation_error_output")
+             response_validator.required_field("uploaded_invocation_error_blob")
              invocation_error_output = result.invocation_error_output
-
-     if _ENABLE_INJECT_TASK_CANCELLATIONS:
-         logger.warning("injecting cancellation failure for the task allocation")
-         if (
-             random.random() < 0.5
-         ): # 50% chance to get stable reproduction in manual testing
-             outcome_code = TaskOutcomeCode.TASK_OUTCOME_CODE_FAILURE
-             failure_reason = TaskFailureReason.TASK_FAILURE_REASON_TASK_CANCELLED
+             uploaded_invocation_error_blob = result.uploaded_invocation_error_blob
+     elif outcome_code == TaskOutcomeCode.TASK_OUTCOME_CODE_SUCCESS:
+         # function_outputs can have no items, this happens when the function returns None.
+         response_validator.required_field("uploaded_function_outputs_blob")

      return TaskOutput(
          allocation=allocation,
          outcome_code=outcome_code,
          failure_reason=failure_reason,
+         function_outputs=list(result.function_outputs),
+         uploaded_function_outputs_blob=result.uploaded_function_outputs_blob,
          invocation_error_output=invocation_error_output,
-         function_outputs=result.function_outputs,
-         next_functions=result.next_functions,
-         stdout=result.stdout,
-         stderr=result.stderr,
+         uploaded_invocation_error_blob=uploaded_invocation_error_blob,
+         next_functions=list(result.next_functions),
          metrics=metrics,
          execution_start_time=execution_start_time,
          execution_end_time=execution_end_time,
@@ -332,6 +340,14 @@ def _log_task_execution_finished(output: TaskOutput, logger: Any) -> None:
      )


+ def _process_task_diagnostics(task_diagnostics: TaskDiagnostics, logger: Any) -> None:
+     MessageValidator(task_diagnostics).required_field("function_executor_log")
+     # Uncomment these lines once we stop printing FE logs to stdout/stderr.
+     # Print FE logs directly to Executor logs so operators can see them.
+     # logger.info("Function Executor logs during task execution:")
+     # print(task_diagnostics.function_executor_log)
+
+
  def _to_task_outcome_code(
      fe_task_outcome_code: FETaskOutcomeCode, logger
  ) -> TaskOutcomeCode:
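The rewritten _run_task_rpcs loop above applies the task timeout per message rather than to the whole stream: each await_rpc.read() is wrapped in asyncio.wait_for, any progress message restarts the clock, and either grpc.aio.EOF or a task_result message ends the loop (a missing result is turned into an AioRpcError after cleanup). A stripped-down, non-authoritative sketch of that read pattern against a generic grpc.aio streaming call; stream_call and the oneof field names are illustrative rather than taken from the package:

import asyncio

import grpc


async def read_result_with_progress_timeout(stream_call, timeout_sec: float):
    """Sketch: consume a grpc.aio server-streaming call where any message counts
    as forward progress and silence longer than timeout_sec is a timeout."""
    result = None
    while True:
        # Fresh timeout per message; raises asyncio.TimeoutError if the peer stalls.
        response = await asyncio.wait_for(stream_call.read(), timeout=timeout_sec)
        if response == grpc.aio.EOF:
            break  # stream ended without a final result
        if response.WhichOneof("response") == "task_result":
            result = response.task_result
            break
        # Any other message type just shows the peer is alive; keep waiting.
    return result  # None signals the caller to treat this as a protocol error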
indexify/executor/function_executor_controller/task_info.py

@@ -2,10 +2,9 @@ import asyncio
  from dataclasses import dataclass
  from typing import Optional

- from tensorlake.function_executor.proto.function_executor_pb2 import SerializedObject
-
  from indexify.proto.executor_api_pb2 import TaskAllocation

+ from .task_input import TaskInput
  from .task_output import TaskOutput


@@ -22,11 +21,9 @@ class TaskInfo:
      is_cancelled: bool = False
      # aio task that is currently executing a lifecycle step of this task.
      aio_task: Optional[asyncio.Task] = None
-     # Downloaded input if function was prepared successfully.
-     input: Optional[SerializedObject] = None
-     # Downloaded init value if function was prepared successfully and is a reducer.
-     init_value: Optional[SerializedObject] = None
-     # Output of the task.
+     # Input if function was prepared successfully.
+     input: Optional[TaskInput] = None
+     # Output of the task, always set when the task is completed.
      output: Optional[TaskOutput] = None
      # True if the task is fully completed and was added to state reporter.
      is_completed: bool = False
indexify/executor/function_executor_controller/task_input.py

@@ -0,0 +1,21 @@
+ from tensorlake.function_executor.proto.function_executor_pb2 import FunctionInputs
+
+
+ class TaskInput:
+     """Represents the input for a task in the function executor controller."""
+
+     def __init__(
+         self,
+         function_inputs: FunctionInputs,
+         function_outputs_blob_uri: str,
+         function_outputs_blob_upload_id: str,
+         invocation_error_blob_uri: str,
+         invocation_error_blob_upload_id: str,
+     ):
+         # Actual input object sent to FE.
+         self.function_inputs = function_inputs
+         # Executor side function input related bookkeeping.
+         self.function_outputs_blob_uri = function_outputs_blob_uri
+         self.function_outputs_blob_upload_id = function_outputs_blob_upload_id
+         self.invocation_error_blob_uri = invocation_error_blob_uri
+         self.invocation_error_blob_upload_id = invocation_error_blob_upload_id
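TaskInput deliberately carries two kinds of state side by side: the FunctionInputs proto that is shipped to the Function Executor, and the blob URIs plus multipart upload IDs that stay on the Executor so task finalization can later complete or abort those uploads. A hypothetical construction with placeholder values, only to show the shape (real instances are built by _prepare_task_input in prepare_task.py):

from tensorlake.function_executor.proto.function_executor_pb2 import FunctionInputs

from indexify.executor.function_executor_controller.task_input import TaskInput

# Placeholder URIs and upload IDs; in practice these come from the blob store's
# create_multipart_upload calls made during task preparation.
task_input = TaskInput(
    function_inputs=FunctionInputs(),  # populated with presigned BLOBs in practice
    function_outputs_blob_uri="s3://example-bucket/graph/fn.alloc-123.output",
    function_outputs_blob_upload_id="example-outputs-upload-id",
    invocation_error_blob_uri="s3://example-bucket/graph/invocation-456.inverr",
    invocation_error_blob_upload_id="example-inverr-upload-id",
)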