indexify 0.3.19__py3-none-any.whl → 0.3.20__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (29) hide show
  1. indexify/cli/cli.py +12 -0
  2. indexify/executor/blob_store/blob_store.py +69 -0
  3. indexify/executor/blob_store/local_fs_blob_store.py +48 -0
  4. indexify/executor/blob_store/metrics/blob_store.py +33 -0
  5. indexify/executor/blob_store/s3_blob_store.py +85 -0
  6. indexify/executor/downloader.py +145 -24
  7. indexify/executor/executor.py +26 -12
  8. indexify/executor/function_executor/function_executor.py +1 -1
  9. indexify/executor/function_executor/function_executor_states_container.py +5 -0
  10. indexify/executor/function_executor/function_executor_status.py +2 -0
  11. indexify/executor/function_executor/health_checker.py +7 -2
  12. indexify/executor/function_executor/invocation_state_client.py +4 -2
  13. indexify/executor/function_executor/task_output.py +2 -1
  14. indexify/executor/grpc/channel_manager.py +4 -3
  15. indexify/executor/grpc/function_executor_controller.py +163 -193
  16. indexify/executor/grpc/metrics/state_reconciler.py +17 -0
  17. indexify/executor/grpc/metrics/task_controller.py +8 -0
  18. indexify/executor/grpc/state_reconciler.py +305 -188
  19. indexify/executor/grpc/state_reporter.py +18 -10
  20. indexify/executor/grpc/task_controller.py +232 -189
  21. indexify/executor/task_reporter.py +23 -5
  22. indexify/proto/executor_api.proto +37 -11
  23. indexify/proto/executor_api_pb2.py +49 -47
  24. indexify/proto/executor_api_pb2.pyi +55 -15
  25. {indexify-0.3.19.dist-info → indexify-0.3.20.dist-info}/METADATA +2 -1
  26. {indexify-0.3.19.dist-info → indexify-0.3.20.dist-info}/RECORD +28 -23
  27. indexify/executor/grpc/completed_tasks_container.py +0 -26
  28. {indexify-0.3.19.dist-info → indexify-0.3.20.dist-info}/WHEEL +0 -0
  29. {indexify-0.3.19.dist-info → indexify-0.3.20.dist-info}/entry_points.txt +0 -0
indexify/cli/cli.py CHANGED
@@ -25,6 +25,9 @@ from rich.theme import Theme
25
25
  from tensorlake.functions_sdk.image import Image
26
26
 
27
27
  from indexify.executor.api_objects import FunctionURI
28
+ from indexify.executor.blob_store.blob_store import BLOBStore
29
+ from indexify.executor.blob_store.local_fs_blob_store import LocalFSBLOBStore
30
+ from indexify.executor.blob_store.s3_blob_store import S3BLOBStore
28
31
  from indexify.executor.executor import Executor
29
32
  from indexify.executor.executor_flavor import ExecutorFlavor
30
33
  from indexify.executor.function_executor.server.subprocess_function_executor_server_factory import (
@@ -197,6 +200,14 @@ def executor(
197
200
  )
198
201
  exit(1)
199
202
 
203
+ # Enable all available blob stores in OSS because we don't know which one is going to be used.
204
+ blob_store: BLOBStore = BLOBStore(
205
+ # Local FS mode is used in tests and in cases when user wants to store data on NFS.
206
+ local=LocalFSBLOBStore(),
207
+ # S3 is initialized lazily so it's okay to create it even if the user is not going to use it.
208
+ s3=S3BLOBStore(),
209
+ )
210
+
200
211
  prometheus_client.Info("cli", "CLI information").info(
201
212
  {
202
213
  "package": "indexify",
@@ -222,6 +233,7 @@ def executor(
222
233
  monitoring_server_host=monitoring_server_host,
223
234
  monitoring_server_port=monitoring_server_port,
224
235
  enable_grpc_state_reconciler=enable_grpc_state_reconciler,
236
+ blob_store=blob_store,
225
237
  ).run()
226
238
 
227
239
 
@@ -0,0 +1,69 @@
1
+ from typing import Any, Optional
2
+
3
+ from .local_fs_blob_store import LocalFSBLOBStore
4
+ from .metrics.blob_store import (
5
+ metric_get_blob_errors,
6
+ metric_get_blob_latency,
7
+ metric_get_blob_requests,
8
+ metric_put_blob_errors,
9
+ metric_put_blob_latency,
10
+ metric_put_blob_requests,
11
+ )
12
+ from .s3_blob_store import S3BLOBStore
13
+
14
+
15
class BLOBStore:
    """Facade that routes generic BLOB store calls to the backend matching the URI."""

    def __init__(
        self, local: Optional[LocalFSBLOBStore] = None, s3: Optional[S3BLOBStore] = None
    ):
        """Wraps the supplied backends.

        A backend left as None is treated as unavailable: any call that would be
        routed to it raises RuntimeError.
        """
        self._local: Optional[LocalFSBLOBStore] = local
        self._s3: Optional[S3BLOBStore] = s3

    async def get(self, uri: str, logger: Any) -> bytes:
        """Reads and returns the binary value of the BLOB at the supplied URI.

        URIs starting with "file://" go to the local FS backend, everything else
        goes to the S3 backend.
        Raises Exception on error. Raises KeyError if the BLOB doesn't exist.
        """
        with metric_get_blob_errors.count_exceptions():
            with metric_get_blob_latency.time():
                metric_get_blob_requests.inc()
                if not _is_file_uri(uri):
                    self._check_s3_is_available()
                    return await self._s3.get(uri, logger)

                self._check_local_is_available()
                return await self._local.get(uri, logger)

    async def put(self, uri: str, value: bytes, logger: Any) -> None:
        """Writes the supplied binary value into the BLOB at the supplied URI.

        URIs starting with "file://" go to the local FS backend, everything else
        goes to the S3 backend.
        Overwrites existing BLOB. Raises Exception on error.
        """
        with metric_put_blob_errors.count_exceptions():
            with metric_put_blob_latency.time():
                metric_put_blob_requests.inc()
                if not _is_file_uri(uri):
                    self._check_s3_is_available()
                    await self._s3.put(uri, value, logger)
                    return

                self._check_local_is_available()
                await self._local.put(uri, value, logger)

    def _check_local_is_available(self):
        """Raises RuntimeError if no local FS backend was supplied."""
        if self._local is not None:
            return
        raise RuntimeError("Local file system BLOB store is not available")

    def _check_s3_is_available(self):
        """Raises RuntimeError if no S3 backend was supplied."""
        if self._s3 is not None:
            return
        raise RuntimeError("S3 BLOB store is not available")
66
+
67
+
68
+ def _is_file_uri(uri: str) -> bool:
69
+ return uri.startswith("file://")
@@ -0,0 +1,48 @@
1
+ import asyncio
2
+ import os
3
+ import os.path
4
+ from typing import Any
5
+
6
+
7
class LocalFSBLOBStore:
    """BLOB store that stores BLOBs in local file system."""

    async def get(self, uri: str, logger: Any) -> bytes:
        """Returns binary value stored in file at the supplied URI.

        The URI must be a file URI (starts with "file://"). The path must be absolute.
        Raises Exception on error. Raises KeyError if the file doesn't exist.
        """
        # Run synchronous code in a thread to not block the event loop.
        return await asyncio.to_thread(self._sync_get, _path_from_file_uri(uri))

    async def put(self, uri: str, value: bytes, logger: Any) -> None:
        """Stores the supplied binary value in a file at the supplied URI.

        The URI must be a file URI (starts with "file://"). The path must be absolute.
        Overwrites existing file. Raises Exception on error.
        """
        # Run synchronous code in a thread to not block the event loop.
        return await asyncio.to_thread(self._sync_put, _path_from_file_uri(uri), value)

    def _sync_get(self, path: str) -> bytes:
        """Reads the whole file at the absolute path.

        Raises ValueError if the path is not absolute.
        Raises KeyError if the file doesn't exist.
        """
        if not os.path.isabs(path):
            raise ValueError(f"Path {path} is not absolute")

        # Open first and translate the failure instead of checking existence
        # beforehand: a file removed between the check and the open would
        # otherwise surface as FileNotFoundError rather than the documented KeyError.
        try:
            with open(path, mode="rb") as blob_file:
                return blob_file.read()
        except FileNotFoundError as e:
            raise KeyError(f"File at {path} does not exist") from e

    def _sync_put(self, path: str, value: bytes) -> None:
        """Writes the value into the file at the absolute path, creating parent directories.

        Raises ValueError if the path is not absolute.
        """
        if not os.path.isabs(path):
            raise ValueError(f"Path {path} is not absolute")

        os.makedirs(os.path.dirname(path), exist_ok=True)
        with open(path, mode="wb") as blob_file:
            blob_file.write(value)
45
+
46
+
47
+ def _path_from_file_uri(uri: str) -> str:
48
+ return uri[7:] # strip "file://" prefix
@@ -0,0 +1,33 @@
1
+ import prometheus_client
2
+
3
+ from ...monitoring.metrics import latency_metric_for_fast_operation
4
+
5
# Metrics for BLOB reads.
metric_get_blob_requests: prometheus_client.Counter = prometheus_client.Counter(
    name="get_blob_requests",
    documentation="Number of get blob requests",
)
metric_get_blob_errors: prometheus_client.Counter = prometheus_client.Counter(
    name="get_blob_request_errors",
    documentation="Number of get blob request errors",
)
metric_get_blob_latency: prometheus_client.Histogram = latency_metric_for_fast_operation(
    "get_blob_request",
    "get blob request",
)

# Metrics for BLOB writes.
metric_put_blob_requests: prometheus_client.Counter = prometheus_client.Counter(
    name="put_blob_requests",
    documentation="Number of put blob requests",
)
metric_put_blob_errors: prometheus_client.Counter = prometheus_client.Counter(
    name="put_blob_request_errors",
    documentation="Number of put blob request errors",
)
metric_put_blob_latency: prometheus_client.Histogram = latency_metric_for_fast_operation(
    "put_blob_request",
    "put blob request",
)
@@ -0,0 +1,85 @@
1
+ import asyncio
2
+ from typing import Any, Optional
3
+
4
+ import boto3
5
+ from botocore.config import Config as BotoConfig
6
+ from botocore.exceptions import ClientError as BotoClientError
7
+
8
+ _MAX_RETRIES = 3
9
+
10
+
11
class S3BLOBStore:
    """BLOB store that stores BLOBs as S3 objects."""

    def __init__(self):
        # Created on first use, see _lazy_create_client().
        self._s3_client: Optional[Any] = None

    def _lazy_create_client(self):
        """Creates S3 client if it doesn't exist.

        We create the client lazily only if S3 is used.
        This is because S3 BLOB store is always created by Executor
        and the creation will fail if the user didn't configure S3 credentials, etc.
        """
        if self._s3_client is not None:
            return

        # The credentials, region, etc. are fetched by boto3 library automatically following
        # https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html#configuring-credentials
        # This provides a lot of flexibility for the user and follows a well-known and documented logic.
        self._s3_client = boto3.client(
            "s3",
            config=BotoConfig(
                # https://boto3.amazonaws.com/v1/documentation/api/latest/guide/retries.html#standard-retry-mode
                retries={
                    "max_attempts": _MAX_RETRIES,
                    "mode": "standard",
                }
            ),
        )

    async def get(self, uri: str, logger: Any) -> bytes:
        """Returns binary value stored in S3 object at the supplied URI.

        The URI must be S3 URI (starts with "s3://").
        Raises Exception on error. Raises KeyError if the object doesn't exist.
        """
        try:
            self._lazy_create_client()
            bucket_name, key = _bucket_name_and_object_key_from_uri(uri)
            # Both the request and the body download are blocking calls,
            # so run them together in a thread to not block the event loop.
            return await asyncio.to_thread(self._sync_get, bucket_name, key)
        except BotoClientError as e:
            logger.error("failed to get S3 object", uri=uri, exc_info=e)

            if e.response["Error"]["Code"] == "NoSuchKey":
                raise KeyError(
                    f"Object {key} does not exist in bucket {bucket_name}"
                ) from e
            raise
        except Exception as e:
            logger.error("failed to get S3 object", uri=uri, exc_info=e)
            raise

    async def put(self, uri: str, value: bytes, logger: Any) -> None:
        """Stores the supplied binary value in a S3 object at the supplied URI.

        The URI must be S3 URI (starts with "s3://").
        Overwrites existing object. Raises Exception on error.
        """
        try:
            self._lazy_create_client()
            bucket_name, key = _bucket_name_and_object_key_from_uri(uri)
            # Run the blocking upload in a thread to not block the event loop.
            await asyncio.to_thread(
                self._s3_client.put_object, Bucket=bucket_name, Key=key, Body=value
            )
        except Exception as e:
            logger.error("failed to set S3 object", uri=uri, exc_info=e)
            raise

    def _sync_get(self, bucket_name: str, key: str) -> bytes:
        """Fetches the object and reads its whole body. Blocking, call from a thread.

        The response "Body" is a stream that is read from the network on read(),
        so it has to be consumed here and not on the event loop thread.
        """
        response = self._s3_client.get_object(Bucket=bucket_name, Key=key)
        body = response["Body"]
        try:
            return body.read()
        finally:
            # Release the underlying connection back to the client.
            body.close()
77
+
78
+
79
+ def _bucket_name_and_object_key_from_uri(uri: str) -> tuple[str, str]:
80
+ if not uri.startswith("s3://"):
81
+ raise ValueError(f"S3 URI '{uri}' is missing 's3://' prefix")
82
+ parts = uri[5:].split("/", 1)
83
+ if len(parts) != 2:
84
+ raise ValueError(f"Failed parsing bucket name from S3 URI '{uri}'")
85
+ return parts[0], parts[1] # bucket_name, key
@@ -5,9 +5,15 @@ from typing import Any, Optional
5
5
  import httpx
6
6
  import nanoid
7
7
  from tensorlake.function_executor.proto.function_executor_pb2 import SerializedObject
8
+ from tensorlake.function_executor.proto.message_validator import MessageValidator
8
9
  from tensorlake.utils.http_client import get_httpx_client
9
10
 
10
- from .api_objects import Task
11
+ from indexify.proto.executor_api_pb2 import (
12
+ DataPayload,
13
+ DataPayloadEncoding,
14
+ )
15
+
16
+ from .blob_store.blob_store import BLOBStore
11
17
  from .metrics.downloader import (
12
18
  metric_graph_download_errors,
13
19
  metric_graph_download_latency,
@@ -27,14 +33,24 @@ from .metrics.downloader import (
27
33
 
28
34
  class Downloader:
29
35
  def __init__(
30
- self, code_path: str, base_url: str, config_path: Optional[str] = None
36
+ self,
37
+ code_path: str,
38
+ base_url: str,
39
+ blob_store: BLOBStore,
40
+ config_path: Optional[str] = None,
31
41
  ):
32
- self.code_path = code_path
42
+ self._code_path = code_path
33
43
  self._base_url = base_url
34
44
  self._client = get_httpx_client(config_path, make_async=True)
45
+ self._blob_store: BLOBStore = blob_store
35
46
 
36
47
  async def download_graph(
37
- self, namespace: str, graph_name: str, graph_version: str, logger: Any
48
+ self,
49
+ namespace: str,
50
+ graph_name: str,
51
+ graph_version: str,
52
+ data_payload: Optional[DataPayload],
53
+ logger: Any,
38
54
  ) -> SerializedObject:
39
55
  logger = logger.bind(module=__name__)
40
56
  with (
@@ -47,6 +63,7 @@ class Downloader:
47
63
  namespace=namespace,
48
64
  graph_name=graph_name,
49
65
  graph_version=graph_version,
66
+ data_payload=data_payload,
50
67
  logger=logger,
51
68
  )
52
69
 
@@ -56,6 +73,7 @@ class Downloader:
56
73
  graph_name: str,
57
74
  graph_invocation_id: str,
58
75
  input_key: str,
76
+ data_payload: Optional[DataPayload],
59
77
  logger: Any,
60
78
  ) -> SerializedObject:
61
79
  logger = logger.bind(module=__name__)
@@ -70,6 +88,7 @@ class Downloader:
70
88
  graph_name=graph_name,
71
89
  graph_invocation_id=graph_invocation_id,
72
90
  input_key=input_key,
91
+ data_payload=data_payload,
73
92
  logger=logger,
74
93
  )
75
94
 
@@ -80,6 +99,7 @@ class Downloader:
80
99
  function_name: str,
81
100
  graph_invocation_id: str,
82
101
  reducer_output_key: str,
102
+ data_payload: Optional[DataPayload],
83
103
  logger: Any,
84
104
  ) -> SerializedObject:
85
105
  logger = logger.bind(module=__name__)
@@ -89,21 +109,27 @@ class Downloader:
89
109
  metric_reducer_init_value_download_latency.time(),
90
110
  ):
91
111
  metric_reducer_init_value_downloads.inc()
92
- return await self._fetch_function_init_value(
112
+ return await self._download_init_value(
93
113
  namespace=namespace,
94
114
  graph_name=graph_name,
95
115
  function_name=function_name,
96
116
  graph_invocation_id=graph_invocation_id,
97
117
  reducer_output_key=reducer_output_key,
118
+ data_payload=data_payload,
98
119
  logger=logger,
99
120
  )
100
121
 
101
122
  async def _download_graph(
102
- self, namespace: str, graph_name: str, graph_version: str, logger: Any
123
+ self,
124
+ namespace: str,
125
+ graph_name: str,
126
+ graph_version: str,
127
+ data_payload: Optional[DataPayload],
128
+ logger: Any,
103
129
  ) -> SerializedObject:
104
130
  # Cache graph to reduce load on the server.
105
131
  graph_path = os.path.join(
106
- self.code_path,
132
+ self._code_path,
107
133
  "graph_cache",
108
134
  namespace,
109
135
  graph_name,
@@ -118,17 +144,33 @@ class Downloader:
118
144
  metric_graphs_from_cache.inc()
119
145
  return graph
120
146
 
121
- graph: SerializedObject = await self._fetch_graph(
122
- namespace=namespace,
123
- graph_name=graph_name,
124
- graph_version=graph_version,
125
- logger=logger,
126
- )
147
+ if data_payload is None:
148
+ graph: SerializedObject = await self._fetch_graph_from_server(
149
+ namespace=namespace,
150
+ graph_name=graph_name,
151
+ graph_version=graph_version,
152
+ logger=logger,
153
+ )
154
+ else:
155
+ (
156
+ MessageValidator(data_payload)
157
+ .required_field("uri")
158
+ .required_field("encoding")
159
+ )
160
+ data: bytes = await self._blob_store.get(
161
+ uri=data_payload.uri, logger=logger
162
+ )
163
+ return _data_payload_to_serialized_object(
164
+ data_payload=data_payload,
165
+ data=data,
166
+ )
167
+
127
168
  # Filesystem operations are synchronous.
128
169
  # Run in a separate thread to not block the main event loop.
129
170
  # We don't need to wait for the write completion so we use create_task.
130
171
  asyncio.create_task(
131
- asyncio.to_thread(self._write_cached_graph, graph_path, graph)
172
+ asyncio.to_thread(self._write_cached_graph, graph_path, graph),
173
+ name="graph cache write",
132
174
  )
133
175
 
134
176
  return graph
@@ -145,7 +187,7 @@ class Downloader:
145
187
  # Another task already cached the graph.
146
188
  return None
147
189
 
148
- tmp_path = os.path.join(self.code_path, "task_graph_cache", nanoid.generate())
190
+ tmp_path = os.path.join(self._code_path, "task_graph_cache", nanoid.generate())
149
191
  os.makedirs(os.path.dirname(tmp_path), exist_ok=True)
150
192
  with open(tmp_path, "wb") as f:
151
193
  f.write(graph.SerializeToString())
@@ -162,21 +204,71 @@ class Downloader:
162
204
  graph_name: str,
163
205
  graph_invocation_id: str,
164
206
  input_key: str,
207
+ data_payload: Optional[DataPayload],
208
+ logger: Any,
209
+ ) -> SerializedObject:
210
+ if data_payload is None:
211
+ first_function_in_graph = graph_invocation_id == input_key.split("|")[-1]
212
+ if first_function_in_graph:
213
+ # The first function in Graph gets its input from graph invocation payload.
214
+ return await self._fetch_graph_invocation_payload_from_server(
215
+ namespace=namespace,
216
+ graph_name=graph_name,
217
+ graph_invocation_id=graph_invocation_id,
218
+ logger=logger,
219
+ )
220
+ else:
221
+ return await self._fetch_function_input_from_server(
222
+ input_key=input_key, logger=logger
223
+ )
224
+ else:
225
+ (
226
+ MessageValidator(data_payload)
227
+ .required_field("uri")
228
+ .required_field("encoding")
229
+ )
230
+ data: bytes = await self._blob_store.get(
231
+ uri=data_payload.uri, logger=logger
232
+ )
233
+ return _data_payload_to_serialized_object(
234
+ data_payload=data_payload,
235
+ data=data,
236
+ )
237
+
238
+ async def _download_init_value(
239
+ self,
240
+ namespace: str,
241
+ graph_name: str,
242
+ function_name: str,
243
+ graph_invocation_id: str,
244
+ reducer_output_key: str,
245
+ data_payload: Optional[DataPayload],
165
246
  logger: Any,
166
247
  ) -> SerializedObject:
167
- first_function_in_graph = graph_invocation_id == input_key.split("|")[-1]
168
- if first_function_in_graph:
169
- # The first function in Graph gets its input from graph invocation payload.
170
- return await self._fetch_graph_invocation_payload(
248
+ if data_payload is None:
249
+ return await self._fetch_function_init_value_from_server(
171
250
  namespace=namespace,
172
251
  graph_name=graph_name,
252
+ function_name=function_name,
173
253
  graph_invocation_id=graph_invocation_id,
254
+ reducer_output_key=reducer_output_key,
174
255
  logger=logger,
175
256
  )
176
257
  else:
177
- return await self._fetch_function_input(input_key=input_key, logger=logger)
258
+ (
259
+ MessageValidator(data_payload)
260
+ .required_field("uri")
261
+ .required_field("encoding")
262
+ )
263
+ data: bytes = await self._blob_store.get(
264
+ uri=data_payload.uri, logger=logger
265
+ )
266
+ return _data_payload_to_serialized_object(
267
+ data_payload=data_payload,
268
+ data=data,
269
+ )
178
270
 
179
- async def _fetch_graph(
271
+ async def _fetch_graph_from_server(
180
272
  self, namespace: str, graph_name: str, graph_version: str, logger: Any
181
273
  ) -> SerializedObject:
182
274
  """Downloads the compute graph for the task and returns it."""
@@ -186,7 +278,7 @@ class Downloader:
186
278
  logger=logger,
187
279
  )
188
280
 
189
- async def _fetch_graph_invocation_payload(
281
+ async def _fetch_graph_invocation_payload_from_server(
190
282
  self, namespace: str, graph_name: str, graph_invocation_id: str, logger: Any
191
283
  ) -> SerializedObject:
192
284
  return await self._fetch_url(
@@ -195,7 +287,7 @@ class Downloader:
195
287
  logger=logger,
196
288
  )
197
289
 
198
- async def _fetch_function_input(
290
+ async def _fetch_function_input_from_server(
199
291
  self, input_key: str, logger: Any
200
292
  ) -> SerializedObject:
201
293
  return await self._fetch_url(
@@ -204,7 +296,7 @@ class Downloader:
204
296
  logger=logger,
205
297
  )
206
298
 
207
- async def _fetch_function_init_value(
299
+ async def _fetch_function_init_value_from_server(
208
300
  self,
209
301
  namespace: str,
210
302
  graph_name: str,
@@ -252,3 +344,32 @@ def serialized_object_from_http_response(response: httpx.Response) -> Serialized
252
344
  return SerializedObject(
253
345
  string=response.text, content_type=response.headers["content-type"]
254
346
  )
347
+
348
+
349
def _data_payload_to_serialized_object(
    data_payload: DataPayload, data: bytes
) -> SerializedObject:
    """Builds the SerializedObject passed to Function Executor from a data payload and its raw bytes.

    Binary pickle payloads are passed through as bytes; UTF-8 text and UTF-8 JSON
    payloads are decoded into strings. The content type is derived from the payload encoding.

    Raises ValueError if the supplied data payload can't be converted into serialized object.
    """
    encoding = data_payload.encoding

    if encoding == DataPayloadEncoding.DATA_PAYLOAD_ENCODING_BINARY_PICKLE:
        return SerializedObject(bytes=data, content_type="application/octet-stream")

    # Encodings whose data is UTF-8 text, mapped to their content type.
    text_content_types = {
        DataPayloadEncoding.DATA_PAYLOAD_ENCODING_UTF8_TEXT: "text/plain",
        DataPayloadEncoding.DATA_PAYLOAD_ENCODING_UTF8_JSON: "application/json",
    }
    content_type = text_content_types.get(encoding)
    if content_type is None:
        raise ValueError(
            f"Can't convert data payload {data_payload} into serialized object"
        )

    return SerializedObject(string=data.decode("utf-8"), content_type=content_type)
@@ -12,6 +12,7 @@ from tensorlake.utils.logging import suppress as suppress_logging
12
12
  from indexify.proto.executor_api_pb2 import ExecutorStatus
13
13
 
14
14
  from .api_objects import FunctionURI, Task
15
+ from .blob_store.blob_store import BLOBStore
15
16
  from .downloader import Downloader
16
17
  from .executor_flavor import ExecutorFlavor
17
18
  from .function_executor.function_executor_states_container import (
@@ -69,6 +70,7 @@ class Executor:
69
70
  monitoring_server_host: str,
70
71
  monitoring_server_port: int,
71
72
  enable_grpc_state_reconciler: bool,
73
+ blob_store: BLOBStore,
72
74
  ):
73
75
  self._logger = structlog.get_logger(module=__name__)
74
76
  self._is_shutdown: bool = False
@@ -95,7 +97,10 @@ class Executor:
95
97
  self._function_executor_states
96
98
  )
97
99
  self._downloader = Downloader(
98
- code_path=code_path, base_url=self._base_url, config_path=config_path
100
+ code_path=code_path,
101
+ base_url=self._base_url,
102
+ blob_store=blob_store,
103
+ config_path=config_path,
99
104
  )
100
105
  self._function_allowlist: Optional[List[FunctionURI]] = function_allowlist
101
106
  self._function_executor_server_factory = function_executor_server_factory
@@ -189,12 +194,15 @@ class Executor:
189
194
  signum, self.shutdown, asyncio.get_event_loop()
190
195
  )
191
196
 
192
- asyncio.get_event_loop().create_task(self._monitoring_server.run())
193
- if self._state_reporter is not None:
194
- self._state_reporter.update_executor_status(
195
- ExecutorStatus.EXECUTOR_STATUS_RUNNING
196
- )
197
- asyncio.get_event_loop().create_task(self._state_reporter.run())
197
+ asyncio.get_event_loop().create_task(
198
+ self._monitoring_server.run(), name="monitoring server runner"
199
+ )
200
+ self._state_reporter.update_executor_status(
201
+ ExecutorStatus.EXECUTOR_STATUS_RUNNING
202
+ )
203
+ asyncio.get_event_loop().create_task(
204
+ self._state_reporter.run(), name="state reporter runner"
205
+ )
198
206
 
199
207
  metric_executor_state.state("running")
200
208
  self._startup_probe_handler.set_ready()
@@ -215,7 +223,6 @@ class Executor:
215
223
  """Runs the gRPC state reconciler and state reporter.
216
224
 
217
225
  Never raises any exceptions."""
218
- asyncio.create_task(self._state_reporter.run())
219
226
  await self._state_reconciler.run()
220
227
 
221
228
  async def _http_task_runner_loop(self):
@@ -224,11 +231,15 @@ class Executor:
224
231
  async for task in self._task_fetcher.run():
225
232
  metric_tasks_fetched.inc()
226
233
  if not self._is_shutdown:
227
- asyncio.create_task(self._run_task(task))
234
+ asyncio.create_task(
235
+ self._run_task(task), name="task runner (http mode)"
236
+ )
237
+ self._logger.info("fetching tasks finished, reconnecting in 5 seconds")
228
238
  except Exception as e:
229
239
  self._logger.error(
230
240
  "failed fetching tasks, retrying in 5 seconds", exc_info=e
231
241
  )
242
+ if not self._is_shutdown:
232
243
  await asyncio.sleep(5)
233
244
 
234
245
  async def _run_task(self, task: Task) -> None:
@@ -293,12 +304,14 @@ class Executor:
293
304
  graph_name=task.compute_graph,
294
305
  graph_version=task.graph_version,
295
306
  logger=logger,
307
+ data_payload=None,
296
308
  )
297
309
  input: SerializedObject = await self._downloader.download_input(
298
310
  namespace=task.namespace,
299
311
  graph_name=task.compute_graph,
300
312
  graph_invocation_id=task.invocation_id,
301
313
  input_key=task.input_key,
314
+ data_payload=None,
302
315
  logger=logger,
303
316
  )
304
317
  init_value: Optional[SerializedObject] = (
@@ -311,6 +324,7 @@ class Executor:
311
324
  function_name=task.compute_fn,
312
325
  graph_invocation_id=task.invocation_id,
313
326
  reducer_output_key=task.reducer_output_id,
327
+ data_payload=None,
314
328
  logger=logger,
315
329
  )
316
330
  )
@@ -380,12 +394,12 @@ class Executor:
380
394
  if self._task_runner is not None:
381
395
  await self._task_runner.shutdown()
382
396
 
383
- if self._channel_manager is not None:
384
- await self._channel_manager.shutdown()
385
397
  if self._state_reporter is not None:
386
398
  await self._state_reporter.shutdown()
387
399
  if self._state_reconciler is not None:
388
400
  await self._state_reconciler.shutdown()
401
+ if self._channel_manager is not None:
402
+ await self._channel_manager.destroy()
389
403
 
390
404
  # We need to shutdown all users of FE states first,
391
405
  # otherwise states might disappear unexpectedly and we might
@@ -397,7 +411,7 @@ class Executor:
397
411
  # The current task is cancelled, the code after this line will not run.
398
412
 
399
413
  def shutdown(self, loop):
400
- loop.create_task(self._shutdown(loop))
414
+ loop.create_task(self._shutdown(loop), name="executor shutdown")
401
415
 
402
416
  def _task_logger(self, task: Task) -> Any:
403
417
  return self._logger.bind(