indexify 0.3.20__py3-none-any.whl → 0.3.21__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,6 +3,13 @@ from typing import Any, Dict, List, Optional
 from pydantic import BaseModel


+class DataPayload(BaseModel):
+    path: str
+    size: int
+    sha256_hash: str
+    content_type: Optional[str] = None
+
+
 class Task(BaseModel):
     id: str
     namespace: str
@@ -16,6 +23,10 @@ class Task(BaseModel):
     "image_uri defines the URI of the image of this task. Optional since some executors do not require it."
     secret_names: Optional[List[str]] = None
     "secret_names defines the names of the secrets to set on function executor. Optional for backward compatibility."
+    graph_payload: Optional[DataPayload] = None
+    input_payload: Optional[DataPayload] = None
+    reducer_input_payload: Optional[DataPayload] = None
+    output_payload_uri_prefix: Optional[str] = None


 class FunctionURI(BaseModel):
@@ -49,12 +60,6 @@ class TaskResult(BaseModel):
     reducer: bool = False


-class DataPayload(BaseModel):
-    path: str
-    size: int
-    sha256_hash: str
-
-
 class IngestFnOutputsResponse(BaseModel):
     data_payloads: List[DataPayload]
     stdout: Optional[DataPayload] = None
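Note: the hunks above move DataPayload to the top of the executor's api_objects module (adding an optional content_type field) and give Task four new optional fields for blob-store-addressed payloads. A minimal, self-contained sketch of how such a model can be populated follows; the URI, size, and hash values are illustrative, not taken from the package.

from typing import Optional

from pydantic import BaseModel


class DataPayload(BaseModel):
    path: str
    size: int
    sha256_hash: str
    content_type: Optional[str] = None


# Hypothetical payload pointing at graph code already stored in a blob store.
graph_payload = DataPayload(
    path="s3://example-bucket/example-namespace/graph-code",
    size=2048,
    sha256_hash="0" * 64,  # placeholder; a real value is the sha256 of the blob
    content_type="application/octet-stream",
)
print(graph_payload.model_dump_json(exclude_none=True))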
@@ -27,7 +27,7 @@ class LocalFSBLOBStore:

     def _sync_get(self, path: str) -> bytes:
         if not os.path.isabs(path):
-            raise ValueError(f"Path {path} is not absolute")
+            raise ValueError(f"Path {path} must be absolute")

         if os.path.exists(path):
             with open(path, mode="rb") as blob_file:
@@ -37,7 +37,7 @@ class LocalFSBLOBStore:

     def _sync_put(self, path: str, value: bytes) -> None:
         if not os.path.isabs(path):
-            raise ValueError(f"Path {path} is not absolute")
+            raise ValueError(f"Path {path} must be absolute")

         os.makedirs(os.path.dirname(path), exist_ok=True)
         with open(path, mode="wb") as blob_file:
@@ -77,8 +77,11 @@ class S3BLOBStore:


 def _bucket_name_and_object_key_from_uri(uri: str) -> tuple[str, str]:
+    # Example S3 object URI:
+    # s3://test-indexify-server-blob-store-eugene-20250411/225b83f4-2aed-40a7-adee-b7a681f817f2
     if not uri.startswith("s3://"):
         raise ValueError(f"S3 URI '{uri}' is missing 's3://' prefix")
+
     parts = uri[5:].split("/", 1)
     if len(parts) != 2:
         raise ValueError(f"Failed parsing bucket name from S3 URI '{uri}'")
@@ -1,6 +1,6 @@
 import asyncio
 import os
-from typing import Any, Optional
+from typing import Any, Optional, Union

 import httpx
 import nanoid
@@ -8,11 +8,12 @@ from tensorlake.function_executor.proto.function_executor_pb2 import SerializedO
 from tensorlake.function_executor.proto.message_validator import MessageValidator
 from tensorlake.utils.http_client import get_httpx_client

+from indexify.proto.executor_api_pb2 import DataPayload as DataPayloadProto
 from indexify.proto.executor_api_pb2 import (
-    DataPayload,
     DataPayloadEncoding,
 )

+from .api_objects import DataPayload
 from .blob_store.blob_store import BLOBStore
 from .metrics.downloader import (
     metric_graph_download_errors,
@@ -49,7 +50,7 @@ class Downloader:
         namespace: str,
         graph_name: str,
         graph_version: str,
-        data_payload: Optional[DataPayload],
+        data_payload: Optional[Union[DataPayload, DataPayloadProto]],
         logger: Any,
     ) -> SerializedObject:
         logger = logger.bind(module=__name__)
@@ -99,7 +100,7 @@ class Downloader:
         function_name: str,
         graph_invocation_id: str,
         reducer_output_key: str,
-        data_payload: Optional[DataPayload],
+        data_payload: Optional[Union[DataPayload, DataPayloadProto]],
         logger: Any,
     ) -> SerializedObject:
         logger = logger.bind(module=__name__)
@@ -124,7 +125,7 @@ class Downloader:
         namespace: str,
         graph_name: str,
         graph_version: str,
-        data_payload: Optional[DataPayload],
+        data_payload: Optional[Union[DataPayload, DataPayloadProto]],
         logger: Any,
     ) -> SerializedObject:
         # Cache graph to reduce load on the server.
@@ -151,7 +152,7 @@ class Downloader:
                 graph_version=graph_version,
                 logger=logger,
             )
-        else:
+        elif isinstance(data_payload, DataPayloadProto):
             (
                 MessageValidator(data_payload)
                 .required_field("uri")
@@ -160,7 +161,15 @@
             data: bytes = await self._blob_store.get(
                 uri=data_payload.uri, logger=logger
             )
-            return _data_payload_to_serialized_object(
+            return _serialized_object_from_data_payload_proto(
+                data_payload=data_payload,
+                data=data,
+            )
+        elif isinstance(data_payload, DataPayload):
+            data: bytes = await self._blob_store.get(
+                uri=data_payload.path, logger=logger
+            )
+            return _serialized_object_from_data_payload(
                 data_payload=data_payload,
                 data=data,
             )
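The bare else branch becomes an explicit isinstance dispatch: gRPC DataPayloadProto messages are addressed by their uri field, while the HTTP-API pydantic DataPayload is addressed by its path field. A reduced sketch of that dispatch with hypothetical stand-in classes (no blob-store plumbing):

from typing import Union


class ApiDataPayload:  # stand-in for .api_objects.DataPayload
    def __init__(self, path: str):
        self.path = path


class ProtoDataPayload:  # stand-in for the executor_api_pb2 DataPayload message
    def __init__(self, uri: str):
        self.uri = uri


def blob_uri(data_payload: Union[ApiDataPayload, ProtoDataPayload]) -> str:
    # Mirrors the branch structure in the hunk: proto payloads carry `uri`,
    # HTTP API payloads carry `path`.
    if isinstance(data_payload, ProtoDataPayload):
        return data_payload.uri
    elif isinstance(data_payload, ApiDataPayload):
        return data_payload.path
    raise ValueError(f"unexpected payload type: {type(data_payload)}")


print(blob_uri(ProtoDataPayload(uri="s3://bucket/graph")))  # s3://bucket/graph
print(blob_uri(ApiDataPayload(path="s3://bucket/input")))   # s3://bucket/input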
@@ -204,7 +213,7 @@ class Downloader:
         graph_name: str,
         graph_invocation_id: str,
         input_key: str,
-        data_payload: Optional[DataPayload],
+        data_payload: Optional[Union[DataPayload, DataPayloadProto]],
         logger: Any,
     ) -> SerializedObject:
         if data_payload is None:
@@ -221,7 +230,7 @@ class Downloader:
             return await self._fetch_function_input_from_server(
                 input_key=input_key, logger=logger
             )
-        else:
+        elif isinstance(data_payload, DataPayloadProto):
             (
                 MessageValidator(data_payload)
                 .required_field("uri")
@@ -230,7 +239,15 @@
             data: bytes = await self._blob_store.get(
                 uri=data_payload.uri, logger=logger
             )
-            return _data_payload_to_serialized_object(
+            return _serialized_object_from_data_payload_proto(
+                data_payload=data_payload,
+                data=data,
+            )
+        elif isinstance(data_payload, DataPayload):
+            data: bytes = await self._blob_store.get(
+                uri=data_payload.path, logger=logger
+            )
+            return _serialized_object_from_data_payload(
                 data_payload=data_payload,
                 data=data,
             )
@@ -242,7 +259,7 @@ class Downloader:
         function_name: str,
         graph_invocation_id: str,
         reducer_output_key: str,
-        data_payload: Optional[DataPayload],
+        data_payload: Optional[Union[DataPayload, DataPayloadProto]],
         logger: Any,
     ) -> SerializedObject:
         if data_payload is None:
@@ -254,7 +271,7 @@ class Downloader:
                 reducer_output_key=reducer_output_key,
                 logger=logger,
             )
-        else:
+        elif isinstance(data_payload, DataPayloadProto):
             (
                 MessageValidator(data_payload)
                 .required_field("uri")
@@ -263,7 +280,15 @@
             data: bytes = await self._blob_store.get(
                 uri=data_payload.uri, logger=logger
             )
-            return _data_payload_to_serialized_object(
+            return _serialized_object_from_data_payload_proto(
+                data_payload=data_payload,
+                data=data,
+            )
+        elif isinstance(data_payload, DataPayload):
+            data: bytes = await self._blob_store.get(
+                uri=data_payload.path, logger=logger
+            )
+            return _serialized_object_from_data_payload(
                 data_payload=data_payload,
                 data=data,
             )
@@ -315,7 +340,11 @@ class Downloader:
     async def _fetch_url(
         self, url: str, resource_description: str, logger: Any
     ) -> SerializedObject:
-        logger.info(f"fetching {resource_description}", url=url)
+        logger.warning(
+            f"downloading resource from Server",
+            url=url,
+            resource_description=resource_description,
+        )
         response: httpx.Response = await self._client.get(url)
         try:
             response.raise_for_status()
@@ -346,8 +375,23 @@ def serialized_object_from_http_response(response: httpx.Response) -> Serialized
     )


-def _data_payload_to_serialized_object(
+def _serialized_object_from_data_payload(
     data_payload: DataPayload, data: bytes
+) -> SerializedObject:
+    """Converts the given data payload and its data into SerializedObject accepted by Function Executor."""
+    if data_payload.content_type in [
+        "application/octet-stream",
+        "application/pickle",
+    ]:
+        return SerializedObject(bytes=data, content_type=data_payload.content_type)
+    else:
+        return SerializedObject(
+            string=data.decode("utf-8"), content_type=data_payload.content_type
+        )
+
+
+def _serialized_object_from_data_payload_proto(
+    data_payload: DataPayloadProto, data: bytes
 ) -> SerializedObject:
     """Converts the given data payload and its data into SerializedObject accepted by Function Executor.

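The new _serialized_object_from_data_payload picks the bytes or string field of SerializedObject based on the payload's content type. A hedged sketch of the same decision using a plain dataclass in place of the tensorlake protobuf type:

from dataclasses import dataclass
from typing import Optional


@dataclass
class FakeSerializedObject:  # hypothetical stand-in for function_executor_pb2.SerializedObject
    content_type: str
    bytes: Optional[bytes] = None
    string: Optional[str] = None


def to_serialized(content_type: str, data: bytes) -> FakeSerializedObject:
    # Binary encodings keep the raw bytes; anything else is decoded as UTF-8 text,
    # mirroring the branch in the hunk above.
    if content_type in ("application/octet-stream", "application/pickle"):
        return FakeSerializedObject(content_type=content_type, bytes=data)
    return FakeSerializedObject(content_type=content_type, string=data.decode("utf-8"))


print(to_serialized("application/json", b'{"ok": true}'))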
@@ -128,6 +128,7 @@ class Executor:
            executor_id=id,
            config_path=config_path,
            channel_manager=self._channel_manager,
+           blob_store=blob_store,
        )

        # HTTP mode task runner
@@ -261,6 +262,7 @@ class Executor:
                function_name=task.compute_fn,
                graph_version=task.graph_version,
                graph_invocation_id=task.invocation_id,
+               output_payload_uri_prefix=task.output_payload_uri_prefix,
            )
            logger.error("task execution failed", exc_info=e)

@@ -304,19 +306,19 @@ class Executor:
            graph_name=task.compute_graph,
            graph_version=task.graph_version,
            logger=logger,
-           data_payload=None,
+           data_payload=task.graph_payload,
        )
        input: SerializedObject = await self._downloader.download_input(
            namespace=task.namespace,
            graph_name=task.compute_graph,
            graph_invocation_id=task.invocation_id,
            input_key=task.input_key,
-           data_payload=None,
+           data_payload=task.input_payload,
            logger=logger,
        )
        init_value: Optional[SerializedObject] = (
            None
-           if task.reducer_output_id is None
+           if task.reducer_output_id is None and task.reducer_input_payload is None
            else (
                await self._downloader.download_init_value(
                    namespace=task.namespace,
@@ -324,7 +326,7 @@ class Executor:
                    function_name=task.compute_fn,
                    graph_invocation_id=task.invocation_id,
                    reducer_output_key=task.reducer_output_id,
-                   data_payload=None,
+                   data_payload=task.reducer_input_payload,
                    logger=logger,
                )
            )
@@ -96,6 +96,7 @@ class SingleTaskRunner:
                graph_invocation_id=self._task_input.task.invocation_id,
                stderr=str(e),
                success=False,
+               output_payload_uri_prefix=self._task_input.task.output_payload_uri_prefix,
            )

        try:
@@ -311,6 +312,7 @@ def _task_output(task: Task, response: RunTaskResponse) -> TaskOutput:
        reducer=response.is_reducer,
        success=response.success,
        metrics=metrics,
+       output_payload_uri_prefix=task.output_payload_uri_prefix,
    )

    if response.HasField("function_output"):
@@ -25,6 +25,7 @@ class TaskOutput:
         function_name: str,
         graph_version: str,
         graph_invocation_id: str,
+        output_payload_uri_prefix: Optional[str],
         output_encoding: Optional[str] = None,
         function_output: Optional[FunctionOutput] = None,
         router_output: Optional[RouterOutput] = None,
@@ -50,6 +51,7 @@ class TaskOutput:
         self.is_internal_error = is_internal_error
         self.metrics = metrics
         self.output_encoding = output_encoding
+        self.output_payload_uri_prefix = output_payload_uri_prefix

     @classmethod
     def internal_error(
@@ -60,6 +62,7 @@ class TaskOutput:
         function_name: str,
         graph_version: str,
         graph_invocation_id: str,
+        output_payload_uri_prefix: Optional[str],
     ) -> "TaskOutput":
         """Creates a TaskOutput for an internal error."""
         # We are not sharing internal error messages with the customer.
@@ -72,6 +75,7 @@ class TaskOutput:
             graph_invocation_id=graph_invocation_id,
             stderr="Platform failed to execute the function.",
             is_internal_error=True,
+            output_payload_uri_prefix=output_payload_uri_prefix,
         )

     @classmethod
@@ -84,6 +88,7 @@ class TaskOutput:
         graph_version: str,
         graph_invocation_id: str,
         timeout_sec: float,
+        output_payload_uri_prefix: Optional[str],
     ) -> "TaskOutput":
         """Creates a TaskOutput for an function timeout error."""
         # Task stdout, stderr is not available.
@@ -96,4 +101,5 @@ class TaskOutput:
             graph_invocation_id=graph_invocation_id,
             stderr=f"Function exceeded its configured timeout of {timeout_sec:.3f} sec.",
             is_internal_error=False,
+            output_payload_uri_prefix=output_payload_uri_prefix,
         )
@@ -396,6 +396,11 @@ class TaskController:
            function_name=self._task.function_name,
            graph_version=self._task.graph_version,
            graph_invocation_id=self._task.graph_invocation_id,
+           output_payload_uri_prefix=(
+               self._task.output_payload_uri_prefix
+               if self._task.HasField("output_payload_uri_prefix")
+               else None
+           ),
        )

    def _function_timeout_output(self, timeout_sec: float) -> TaskOutput:
@@ -407,6 +412,11 @@ class TaskController:
            graph_version=self._task.graph_version,
            graph_invocation_id=self._task.graph_invocation_id,
            timeout_sec=timeout_sec,
+           output_payload_uri_prefix=(
+               self._task.output_payload_uri_prefix
+               if self._task.HasField("output_payload_uri_prefix")
+               else None
+           ),
        )


@@ -437,6 +447,11 @@ def _task_output_from_function_executor_response(
        reducer=response.is_reducer,
        success=response.success,
        metrics=metrics,
+       output_payload_uri_prefix=(
+           task.output_payload_uri_prefix
+           if task.HasField("output_payload_uri_prefix")
+           else None
+       ),
    )

    if response.HasField("function_output"):
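The controller reads output_payload_uri_prefix off the gRPC Task message with HasField, which suggests the field is declared optional in the executor API proto: an unset field has to be passed on as None rather than as the empty-string proto default. A small stand-in sketch of that presence check (FakeTaskMessage is hypothetical):

from typing import Optional


class FakeTaskMessage:
    # Just enough of a protobuf-like surface to show the HasField-based check.
    def __init__(self, output_payload_uri_prefix: Optional[str] = None):
        self._prefix = output_payload_uri_prefix

    def HasField(self, name: str) -> bool:
        return name == "output_payload_uri_prefix" and self._prefix is not None

    @property
    def output_payload_uri_prefix(self) -> str:
        return self._prefix or ""  # protobuf returns "" when the field is unset


def uri_prefix_or_none(task: FakeTaskMessage) -> Optional[str]:
    return (
        task.output_payload_uri_prefix
        if task.HasField("output_payload_uri_prefix")
        else None
    )


print(uri_prefix_or_none(FakeTaskMessage("s3://bucket/outputs/fn")))  # s3://bucket/outputs/fn
print(uri_prefix_or_none(FakeTaskMessage()))                          # None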
@@ -21,6 +21,23 @@ metric_server_ingest_files_latency: prometheus_client.Histogram = (
     )
 )

+metric_task_output_blob_store_uploads: prometheus_client.Counter = (
+    prometheus_client.Counter(
+        "task_output_blob_store_uploads", "Number of task output uploads to blob store"
+    )
+)
+metric_task_output_blob_store_upload_errors: prometheus_client.Counter = (
+    prometheus_client.Counter(
+        "task_output_blob_store_upload_errors",
+        "Number of failed task output uploads to blob store",
+    )
+)
+metric_task_output_blob_store_upload_latency: prometheus_client.Histogram = (
+    latency_metric_for_fast_operation(
+        "task_output_blob_store_upload", "Upload task output to blob store"
+    )
+)
+
 metric_report_task_outcome_rpcs = prometheus_client.Counter(
     "report_task_outcome_rpcs",
     "Number of report task outcome RPCs to Server",
@@ -1,4 +1,5 @@
 import asyncio
+import hashlib
 import time
 from typing import Any, List, Optional, Tuple

@@ -7,8 +8,8 @@ from httpx import Timeout
 from tensorlake.function_executor.proto.function_executor_pb2 import FunctionOutput
 from tensorlake.utils.http_client import get_httpx_client

+from indexify.proto.executor_api_pb2 import DataPayload as DataPayloadProto
 from indexify.proto.executor_api_pb2 import (
-    DataPayload,
     DataPayloadEncoding,
     OutputEncoding,
     ReportTaskOutcomeRequest,
@@ -19,10 +20,12 @@ from indexify.proto.executor_api_pb2_grpc import ExecutorAPIStub
 from .api_objects import (
     TASK_OUTCOME_FAILURE,
     TASK_OUTCOME_SUCCESS,
+    DataPayload,
     IngestFnOutputsResponse,
     RouterOutput,
     TaskResult,
 )
+from .blob_store.blob_store import BLOBStore
 from .function_executor.task_output import TaskOutput
 from .grpc.channel_manager import ChannelManager
 from .metrics.task_reporter import (
@@ -32,6 +35,9 @@ from .metrics.task_reporter import (
     metric_server_ingest_files_errors,
     metric_server_ingest_files_latency,
     metric_server_ingest_files_requests,
+    metric_task_output_blob_store_upload_errors,
+    metric_task_output_blob_store_upload_latency,
+    metric_task_output_blob_store_uploads,
 )


@@ -63,6 +69,7 @@ class TaskReporter:
         base_url: str,
         executor_id: str,
         channel_manager: ChannelManager,
+        blob_store: BLOBStore,
         config_path: Optional[str] = None,
     ):
         self._base_url = base_url
@@ -75,6 +82,7 @@ class TaskReporter:
         # results in not reusing established TCP connections to server.
         self._client = get_httpx_client(config_path, make_async=False)
         self._channel_manager = channel_manager
+        self._blob_store = blob_store

     async def shutdown(self) -> None:
         """Shuts down the task reporter.
@@ -95,9 +103,13 @@
            )
            return

-       task_result, output_files, output_summary = self._process_task_output(output)
-       task_result_data = task_result.model_dump_json(exclude_none=True)
+       # TODO: If the files are uploaded successfully,
+       # we should record that so that if we fail to report
+       # the task outcome, we don't retry the upload.
+       # This will save us some time and resources.
+       # It's good to do this once we delete all the legacy code paths.

+       output_summary: TaskOutputSummary = _task_output_summary(output)
        logger.info(
            "reporting task outcome",
            total_bytes=output_summary.total_bytes,
@@ -111,56 +123,15 @@
            stderr_bytes=output_summary.stderr_total_bytes,
        )

-       kwargs = {
-           "data": {"task_result": task_result_data},
-           # Use httpx default timeout of 5s for all timeout types.
-           # For read timeouts, use 5 minutes to allow for large file uploads.
-           "timeout": Timeout(
-               5.0,
-               read=5.0 * 60,
-           ),
-           "files": output_files if len(output_files) > 0 else FORCE_MULTIPART,
-       }
-
-       # TODO: Instead of uploading the files to server, upload them to S3.
-       start_time = time.time()
-       with metric_server_ingest_files_latency.time():
-           metric_server_ingest_files_requests.inc()
-           # Run in a separate thread to not block the main event loop.
-           response = await asyncio.to_thread(
-               self._client.post,
-               url=f"{self._base_url}/internal/ingest_fn_outputs",
-               **kwargs,
-           )
-       end_time = time.time()
-       logger.info(
-           "files uploaded",
-           response_time=end_time - start_time,
-           response_code=response.status_code,
-       )
-
-       try:
-           response.raise_for_status()
-       except Exception as e:
-           metric_server_ingest_files_errors.inc()
-           # Caller catches and logs the exception.
-           raise Exception(
-               "failed to upload files. "
-               f"Response code: {response.status_code}. "
-               f"Response text: '{response.text}'."
-           ) from e
-
-       # TODO: If the files are uploaded successfully,
-       # we should record that so that if we fail to report
-       # the task outcome, we don't retry the upload.
-       # This will save us some time and resources.
+       if output.output_payload_uri_prefix is None:
+           ingested_files = await self._ingest_files_at_server(output, logger)
+       else:
+           ingested_files = await self._ingest_files_at_blob_store(output, logger)

-       ingested_files_response = response.json()
-       ingested_files = IngestFnOutputsResponse.model_validate(ingested_files_response)
        fn_outputs = []
        for data_payload in ingested_files.data_payloads:
            fn_outputs.append(
-               DataPayload(
+               DataPayloadProto(
                    path=data_payload.path,  # TODO: stop using this deprecated field once Server side migration is done.
                    uri=data_payload.path,
                    size=data_payload.size,
@@ -170,8 +141,8 @@
                )
            )
        stdout, stderr = None, None
-       if ingested_files.stdout:
-           stdout = DataPayload(
+       if ingested_files.stdout is not None:
+           stdout = DataPayloadProto(
                path=ingested_files.stdout.path,  # TODO: stop using this deprecated field once Server side migration is done.
                uri=ingested_files.stdout.path,
                size=ingested_files.stdout.size,
@@ -179,8 +150,8 @@
                encoding=DataPayloadEncoding.DATA_PAYLOAD_ENCODING_UTF8_TEXT,
                encoding_version=0,
            )
-       if ingested_files.stderr:
-           stderr = DataPayload(
+       if ingested_files.stderr is not None:
+           stderr = DataPayloadProto(
                path=ingested_files.stderr.path,  # TODO: stop using this deprecated field once Server side migration is done.
                uri=ingested_files.stderr.path,
                size=ingested_files.stderr.size,
@@ -218,9 +189,132 @@
            logger.error("failed to report task outcome", error=e)
            raise e

-   def _process_task_output(
-       self, output: TaskOutput
-   ) -> Tuple[TaskResult, List[Any], TaskOutputSummary]:
+   async def _ingest_files_at_server(
+       self, output: TaskOutput, logger: Any
+   ) -> IngestFnOutputsResponse:
+       logger.warning("uploading task output files to server (deprecated mode)")
+
+       task_result, output_files = self._process_task_output(output)
+       task_result_data = task_result.model_dump_json(exclude_none=True)
+
+       kwargs = {
+           "data": {"task_result": task_result_data},
+           # Use httpx default timeout of 5s for all timeout types.
+           # For read timeouts, use 5 minutes to allow for large file uploads.
+           "timeout": Timeout(
+               5.0,
+               read=5.0 * 60,
+           ),
+           "files": output_files if len(output_files) > 0 else FORCE_MULTIPART,
+       }
+
+       start_time = time.time()
+       with metric_server_ingest_files_latency.time():
+           metric_server_ingest_files_requests.inc()
+           # Run in a separate thread to not block the main event loop.
+           response = await asyncio.to_thread(
+               self._client.post,
+               url=f"{self._base_url}/internal/ingest_fn_outputs",
+               **kwargs,
+           )
+       end_time = time.time()
+       logger.info(
+           "files uploaded to server",
+           response_time=end_time - start_time,
+           response_code=response.status_code,
+       )
+
+       try:
+           response.raise_for_status()
+       except Exception as e:
+           metric_server_ingest_files_errors.inc()
+           # Caller catches and logs the exception.
+           raise Exception(
+               "failed to upload files. "
+               f"Response code: {response.status_code}. "
+               f"Response text: '{response.text}'."
+           ) from e
+
+       ingested_files_response = response.json()
+       return IngestFnOutputsResponse.model_validate(ingested_files_response)
+
+   async def _ingest_files_at_blob_store(
+       self, output: TaskOutput, logger: Any
+   ) -> IngestFnOutputsResponse:
+       start_time = time.time()
+       with (
+           metric_task_output_blob_store_upload_latency.time(),
+           metric_task_output_blob_store_upload_errors.count_exceptions(),
+       ):
+           metric_task_output_blob_store_uploads.inc()
+           response = await self._upload_output_to_blob_store(output, logger)
+
+       logger.info(
+           "files uploaded to blob store",
+           duration=time.time() - start_time,
+       )
+       return response
+
+   async def _upload_output_to_blob_store(
+       self, output: TaskOutput, logger: Any
+   ) -> IngestFnOutputsResponse:
+       data_payloads: List[DataPayload] = []
+       stdout: Optional[DataPayload] = None
+       stderr: Optional[DataPayload] = None
+
+       if output.stdout is not None:
+           stdout_url = f"{output.output_payload_uri_prefix}.{output.task_id}.stdout"
+           stdout_bytes: bytes = output.stdout.encode()
+           await self._blob_store.put(stdout_url, stdout_bytes, logger)
+           stdout = DataPayload(
+               path=stdout_url,
+               size=len(stdout_bytes),
+               sha256_hash=_compute_hash(stdout_bytes),
+           )
+
+       if output.stderr is not None:
+           stderr_url = f"{output.output_payload_uri_prefix}.{output.task_id}.stderr"
+           stderr_bytes: bytes = output.stderr.encode()
+           await self._blob_store.put(stderr_url, stderr_bytes, logger)
+           stderr = DataPayload(
+               path=stderr_url,
+               size=len(stderr_bytes),
+               sha256_hash=_compute_hash(stderr_bytes),
+           )
+
+       if output.function_output is not None:
+           for func_output_item in output.function_output.outputs:
+               node_output_sequence = len(data_payloads)
+               if output.reducer:
+                   # Reducer tasks have to write their results into the same blob.
+                   output_url = (
+                       f"{output.output_payload_uri_prefix}.{node_output_sequence}"
+                   )
+               else:
+                   # Regular tasks write their results into different blobs made unique using task ids.
+                   output_url = f"{output.output_payload_uri_prefix}.{output.task_id}.{node_output_sequence}"
+
+               output_bytes: bytes = (
+                   func_output_item.bytes
+                   if func_output_item.HasField("bytes")
+                   else func_output_item.string.encode()
+               )
+               await self._blob_store.put(output_url, output_bytes, logger)
+               data_payloads.append(
+                   DataPayload(
+                       path=output_url,
+                       size=len(output_bytes),
+                       sha256_hash=_compute_hash(output_bytes),
+                   )
+               )
+
+       return IngestFnOutputsResponse(
+           data_payloads=data_payloads,
+           stdout=stdout,
+           stderr=stderr,
+       )
+
+   def _process_task_output(self, output: TaskOutput) -> Tuple[TaskResult, List[Any]]:
        task_result = TaskResult(
            outcome="failure",
            namespace=output.namespace,
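The new _upload_output_to_blob_store derives object names from the task's output_payload_uri_prefix: stdout and stderr get ".{task_id}.stdout" / ".{task_id}.stderr" suffixes, regular function outputs get ".{task_id}.{sequence}", and reducer outputs drop the task id so every reducer task writes into the same blob. A tiny sketch of that naming scheme (the prefix and task id below are illustrative):

def output_object_names(prefix: str, task_id: str, n_outputs: int, reducer: bool) -> list:
    # Mirrors the URL construction in _upload_output_to_blob_store.
    if reducer:
        return [f"{prefix}.{seq}" for seq in range(n_outputs)]
    return [f"{prefix}.{task_id}.{seq}" for seq in range(n_outputs)]


prefix = "s3://bucket/namespace/graph/invocation/fn"
print(output_object_names(prefix, task_id="task-123", n_outputs=2, reducer=False))
# ['s3://bucket/namespace/graph/invocation/fn.task-123.0', 's3://bucket/namespace/graph/invocation/fn.task-123.1']
print(output_object_names(prefix, task_id="task-123", n_outputs=1, reducer=True))
# ['s3://bucket/namespace/graph/invocation/fn.0']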
@@ -231,9 +325,8 @@ class TaskReporter:
            task_id=output.task_id,
        )
        output_files: List[Any] = []
-       summary: TaskOutputSummary = TaskOutputSummary()
        if output is None:
-           return task_result, output_files, summary
+           return task_result, output_files

        task_result.outcome = (
            TASK_OUTCOME_SUCCESS if output.success else TASK_OUTCOME_FAILURE
@@ -241,33 +334,19 @@
        task_result.reducer = output.reducer

        _process_function_output(
-           function_output=output.function_output,
-           output_files=output_files,
-           summary=summary,
+           function_output=output.function_output, output_files=output_files
        )
        _process_router_output(
-           router_output=output.router_output, task_result=task_result, summary=summary
-       )
-       _process_stdout(
-           stdout=output.stdout, output_files=output_files, summary=summary
-       )
-       _process_stderr(
-           stderr=output.stderr, output_files=output_files, summary=summary
+           router_output=output.router_output, task_result=task_result
        )
+       _process_stdout(stdout=output.stdout, output_files=output_files)
+       _process_stderr(stderr=output.stderr, output_files=output_files)

-       summary.total_bytes = (
-           summary.output_total_bytes
-           + summary.stdout_total_bytes
-           + summary.stderr_total_bytes
-       )
-
-       return task_result, output_files, summary
+       return task_result, output_files


 def _process_function_output(
-    function_output: Optional[FunctionOutput],
-    output_files: List[Any],
-    summary: TaskOutputSummary,
+    function_output: Optional[FunctionOutput], output_files: List[Any]
 ) -> None:
     if function_output is None:
         return
@@ -280,25 +359,19 @@ def _process_function_output(
                (nanoid.generate(), payload, output.content_type),
            )
        )
-       summary.output_count += 1
-       summary.output_total_bytes += len(payload)


 def _process_router_output(
     router_output: Optional[RouterOutput],
     task_result: TaskResult,
-    summary: TaskOutputSummary,
 ) -> None:
     if router_output is None:
         return

     task_result.router_output = RouterOutput(edges=router_output.edges)
-    summary.router_output_count += 1


-def _process_stdout(
-    stdout: Optional[str], output_files: List[Any], summary: TaskOutputSummary
-) -> None:
+def _process_stdout(stdout: Optional[str], output_files: List[Any]) -> None:
     if stdout is None:
         return

@@ -312,13 +385,9 @@ def _process_stdout(
            ),
        )
    )
-   summary.stdout_count += 1
-   summary.stdout_total_bytes += len(stdout)


-def _process_stderr(
-    stderr: Optional[str], output_files: List[Any], summary: TaskOutputSummary
-) -> None:
+def _process_stderr(stderr: Optional[str], output_files: List[Any]) -> None:
     if stderr is None:
         return

@@ -332,8 +401,38 @@ def _process_stderr(
            ),
        )
    )
-   summary.stderr_count += 1
-   summary.stderr_total_bytes += len(stderr)
+
+
+def _task_output_summary(output: TaskOutput) -> TaskOutputSummary:
+    summary: TaskOutputSummary = TaskOutputSummary()
+
+    if output.stdout is not None:
+        summary.stdout_count += 1
+        summary.stdout_total_bytes += len(output.stdout)
+
+    if output.stderr is not None:
+        summary.stderr_count += 1
+        summary.stderr_total_bytes += len(output.stderr)
+
+    if output.function_output is not None:
+        for func_output_item in output.function_output.outputs:
+            output_len: bytes = len(
+                func_output_item.bytes
+                if func_output_item.HasField("bytes")
+                else func_output_item.string
+            )
+            summary.output_count += 1
+            summary.output_total_bytes += output_len
+
+    if output.router_output is not None:
+        summary.router_output_count += 1
+
+    summary.total_bytes = (
+        summary.output_total_bytes
+        + summary.stdout_total_bytes
+        + summary.stderr_total_bytes
+    )
+    return summary


 def _to_grpc_task_outcome(task_output: TaskOutput) -> TaskOutcome:
@@ -355,3 +454,9 @@ def _to_grpc_data_payload_encoding(task_output: TaskOutput) -> DataPayloadEncodi
         return DataPayloadEncoding.DATA_PAYLOAD_ENCODING_UTF8_JSON
     else:
         return DataPayloadEncoding.DATA_PAYLOAD_ENCODING_BINARY_PICKLE
+
+
+def _compute_hash(data: bytes) -> str:
+    hasher = hashlib.sha256(usedforsecurity=False)
+    hasher.update(data)
+    return hasher.hexdigest()
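_compute_hash produces the sha256_hash recorded on each DataPayload; usedforsecurity=False marks the digest as a checksum rather than a security primitive (the keyword is accepted by hashlib on Python 3.9+). Equivalent standalone usage:

import hashlib

data = b"example function output"
digest = hashlib.sha256(data).hexdigest()  # same value _compute_hash returns for this data
print(len(digest), digest)                 # 64 hex characters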
@@ -85,6 +85,7 @@ class TaskRunner:
                function_name=task_input.task.compute_fn,
                graph_version=task_input.task.graph_version,
                graph_invocation_id=task_input.task.invocation_id,
+               output_payload_uri_prefix=task_input.task.output_payload_uri_prefix,
            )
        finally:
            if state is not None:
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: indexify
-Version: 0.3.20
+Version: 0.3.21
 Summary: Open Source Indexify components and helper tools
 Home-page: https://github.com/tensorlakeai/indexify
 License: Apache 2.0
@@ -1,12 +1,12 @@
 indexify/cli/cli.py,sha256=L-QVyRO-nGazfrdIbhxYvxMwhEnNjJ4H32Lw9pz464I,9345
 indexify/executor/README.md,sha256=ozC6_hMkhQQNVCMEpBxwiUALz6lwErPQxNxQfQDqnG4,2029
-indexify/executor/api_objects.py,sha256=qKQMEjr18xIq7yjpnsLPnWXP4KsVQ9O76EsbYoaL_qQ,1507
+indexify/executor/api_objects.py,sha256=UEMpD_kKr6BNfu2JdYzq1CPNrEmVZYhwlZRKf3hB7KI,1750
 indexify/executor/blob_store/blob_store.py,sha256=XViw_KRfFSNqwcFYwMZixZF-EYCjXK2AQHdt0xh4UVo,2368
-indexify/executor/blob_store/local_fs_blob_store.py,sha256=hDZXrcjKmYoH7ob0AAZ-7Zz1aoksxD8ArQEzmUSjkY4,1807
+indexify/executor/blob_store/local_fs_blob_store.py,sha256=6LexqMBGXp8f6Ka95R6xMIUyDutrZJABOMNcp-ssa98,1809
 indexify/executor/blob_store/metrics/blob_store.py,sha256=5_xiPREeHWFtxFh1NupDsF8zP4pmUPgLNNn-UE9Uzvc,1008
-indexify/executor/blob_store/s3_blob_store.py,sha256=4Hn1r5XpiNdfa-buRge0hR36j8j3xmuQC-7noOoOFo0,3319
-indexify/executor/downloader.py,sha256=0tRQY8jT03-R36bBo5MsoLjxkoehWCY9-61fsoW3Gng,13507
-indexify/executor/executor.py,sha256=uZiERFDKBQLmfPgun6bxEN7JE-BEEqmdGwawLjTAjf0,16499
+indexify/executor/blob_store/s3_blob_store.py,sha256=G3B_V3gUE7XbUY42lDtBczUKuA7q8S7MD43tx1aHrJo,3445
+indexify/executor/downloader.py,sha256=7wMuOHNX0w3HM89RZaYFix_VpNfSmZl_O52Y7QB-ivU,15483
+indexify/executor/executor.py,sha256=K_xNJDImvIxgpE_ypw1ERyrijqOSe0EZIyVRVCqztVw,16697
 indexify/executor/executor_flavor.py,sha256=uilzDQVVYlQGR1MVnrUC4NevUActDWHdnJkr38M6kTk,118
 indexify/executor/function_executor/function_executor.py,sha256=agfUxzSQ-2TqkpMhW3OvOSMF_EhpemetaL3_dYp29Ro,11888
 indexify/executor/function_executor/function_executor_state.py,sha256=ljPm1IrRMJ8hFklwvFp7Xax2HMpUIOHm0DwOxxMcy7U,4336
@@ -25,9 +25,9 @@ indexify/executor/function_executor/server/function_executor_server.py,sha256=_D
 indexify/executor/function_executor/server/function_executor_server_factory.py,sha256=cP93a3t1AfGx8qwageNLVdTwG52UOwzYNbbyrPq2TUQ,1692
 indexify/executor/function_executor/server/subprocess_function_executor_server.py,sha256=JekDOqF7oFD4J6zcN3xB0Dxd1cgpEXMOsb_rKZOeBlI,668
 indexify/executor/function_executor/server/subprocess_function_executor_server_factory.py,sha256=g1AUbhOoPsdhp_50Ayahdyv1Ix5-nEBE8orOQfkATpM,4470
-indexify/executor/function_executor/single_task_runner.py,sha256=OY3a2znwtuv0_Pqn9r5ScGhekNn1OquznKmaGnQL71k,13979
+indexify/executor/function_executor/single_task_runner.py,sha256=wHllYigXy5R9ZiJREDf1EazqxUrhgxEJaSYGQwPUnZU,14140
 indexify/executor/function_executor/task_input.py,sha256=wSrHR4m0juiGClQyeVdhRC37QzDt6Rrjq-ZXJkfBi9k,584
-indexify/executor/function_executor/task_output.py,sha256=snh25Ynj7uss1SUIeivpDYWdRkODoDpdPrQrGvnjyIs,3077
+indexify/executor/function_executor/task_output.py,sha256=vi0W75T8qbxjtAtI-812HksCv986lH_ijEqc3q0CVww,3424
 indexify/executor/grpc/channel_manager.py,sha256=ihDkLoiGBLfSmoA2szbntjCfL3E_NDf5LABRXE7YRec,6330
 indexify/executor/grpc/function_executor_controller.py,sha256=b_xqiXsA79yIMCHtRJv0i0px6QMToIj9nCTc_km9fHk,16070
 indexify/executor/grpc/metrics/channel_manager.py,sha256=k-WArgklmP5WhjcmFmrgRblB7yc3XlaOXO8owRyV-mw,649
@@ -36,11 +36,11 @@ indexify/executor/grpc/metrics/state_reporter.py,sha256=GggBEjMzQUYIG95LtTS4fUg1
 indexify/executor/grpc/metrics/task_controller.py,sha256=9Nm86nGxL2rZ3rAORB0_CBdO--Fe4MBrewVW4CqGyOU,222
 indexify/executor/grpc/state_reconciler.py,sha256=Yx_r1oUBGWjHWqUSUhap2ALZpz_o1Ue-SdL7BOgWQQU,19273
 indexify/executor/grpc/state_reporter.py,sha256=iixvl1nzJ_SfQhTUJpBEfiVlFPvOxQxvZyqT5Szk43Q,10503
-indexify/executor/grpc/task_controller.py,sha256=IQkxKYfj8TLhmoLi4xDVjEKBZlK6ttP1w2a9kOlX58A,20277
+indexify/executor/grpc/task_controller.py,sha256=JrRUkVa8pGoYWBg_RVfn1ThvDhYCgJSMACFPUww7Lys,20851
 indexify/executor/metrics/downloader.py,sha256=lctPh8xjkXeLEFJnl1hNrD1yEhLhIl5sggsR4Yoe_Zc,2746
 indexify/executor/metrics/executor.py,sha256=ua-Vv_k1CB4juJdF7tEBQbBMksqWAA3iXKKMKXZUCLk,2369
 indexify/executor/metrics/task_fetcher.py,sha256=iJEwCLzYr2cuz7hRvNiqaa2nvQP4OrA0hm0iJY0YKG0,736
-indexify/executor/metrics/task_reporter.py,sha256=44VT_c7njXlTtPl4xjlsYrIHpaiVAnvhhQdj56RPU6o,1215
+indexify/executor/metrics/task_reporter.py,sha256=DKieoeNvyyP5R0V62OZns_dDjr_UkFcJ4eeuiy4kvkM,1837
 indexify/executor/metrics/task_runner.py,sha256=ZGFrl7zzfUdgPZnklxRIbnv9wVcHIQRhOGNqn9V2hSk,2047
 indexify/executor/monitoring/function_allowlist.py,sha256=wUGeiv3aAGWMlQXzHXq9O6MVHby6Tu-zY4U0MyWiQu0,683
 indexify/executor/monitoring/handler.py,sha256=Cj1cu_LcsAP0tdviqNhoEtGm4h0OJAxxzW9C2YdNXYU,240
@@ -53,13 +53,13 @@ indexify/executor/monitoring/server.py,sha256=yzdYhcxnmY6uTQUMt3vatF5jilN52ZtfFs
 indexify/executor/monitoring/startup_probe_handler.py,sha256=zXXsBU15SMlBx1bSFpxWDfed1VHtKKnwvLQ8-frpG98,425
 indexify/executor/runtime_probes.py,sha256=bo6Dq6AGZpJH099j0DHtVSDEH80tv3j9MXf3VXSx_p8,2182
 indexify/executor/task_fetcher.py,sha256=p3iEsWyGi0ZMPAv0183smzOUD1KycQ_dXsyd9mpB9IU,3529
-indexify/executor/task_reporter.py,sha256=AbestQmKxpJo0pfJWnB_9ziOnAdFmMScvWzFTeyG6X4,12382
-indexify/executor/task_runner.py,sha256=2-f6Uupu_aVPg34G27yuiPTC6eE4XnT-qizpzPryzDI,7134
+indexify/executor/task_reporter.py,sha256=7X-IdLdwNBIfFbazG_4rtfR1A0ZFt03JGYpVJQUTKpE,16704
+indexify/executor/task_runner.py,sha256=UupZbGxU9BN4i1t6M8tH-5k3s4eUPEhMhar1YI0Aztk,7219
 indexify/proto/executor_api.proto,sha256=K1lwFmk042GA1tp8s633FZJVg6Fi8f8LtAuFj8Gz7XU,9930
 indexify/proto/executor_api_pb2.py,sha256=5y570_FIgc6WFhHVAKWFieMuUhyKBA7rPJJ4DJ5hcCM,14054
 indexify/proto/executor_api_pb2.pyi,sha256=5eJJJjPNdTMSttNUOtzGwADbASsCh7138de_Y3l8uq4,18612
 indexify/proto/executor_api_pb2_grpc.py,sha256=GGiDtyQlA2382E_ZyKUBYcWNEJHH_RlulieStKfkJXI,9514
-indexify-0.3.20.dist-info/METADATA,sha256=rg0FBC-z-e_Ft09YgQ5ldhaS3u5XKtmqxNFFMa9074A,1198
-indexify-0.3.20.dist-info/WHEEL,sha256=RaoafKOydTQ7I_I3JTrPCg6kUmTgtm4BornzOqyEfJ8,88
-indexify-0.3.20.dist-info/entry_points.txt,sha256=GU9wmsgvN7nQw3N2X0PMYn1RSvF6CrhH9RuC2D8d3Gk,53
-indexify-0.3.20.dist-info/RECORD,,
+indexify-0.3.21.dist-info/METADATA,sha256=FnyHAJo968nOsbZyCvFSL0rW-6bsaCAilmCZVOTL9lg,1198
+indexify-0.3.21.dist-info/WHEEL,sha256=RaoafKOydTQ7I_I3JTrPCg6kUmTgtm4BornzOqyEfJ8,88
+indexify-0.3.21.dist-info/entry_points.txt,sha256=GU9wmsgvN7nQw3N2X0PMYn1RSvF6CrhH9RuC2D8d3Gk,53
+indexify-0.3.21.dist-info/RECORD,,