indexify 0.4.22__py3-none-any.whl → 0.4.23__py3-none-any.whl

This diff shows the changes between publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (35)
  1. indexify/cli/executor.py +2 -9
  2. indexify/executor/blob_store/blob_store.py +110 -26
  3. indexify/executor/blob_store/local_fs_blob_store.py +41 -1
  4. indexify/executor/blob_store/metrics/blob_store.py +87 -15
  5. indexify/executor/blob_store/s3_blob_store.py +112 -1
  6. indexify/executor/function_executor/function_executor.py +32 -56
  7. indexify/executor/function_executor/invocation_state_client.py +10 -3
  8. indexify/executor/function_executor/server/function_executor_server_factory.py +0 -1
  9. indexify/executor/function_executor_controller/create_function_executor.py +129 -116
  10. indexify/executor/function_executor_controller/downloads.py +34 -86
  11. indexify/executor/function_executor_controller/events.py +13 -7
  12. indexify/executor/function_executor_controller/finalize_task.py +184 -0
  13. indexify/executor/function_executor_controller/function_executor_controller.py +121 -78
  14. indexify/executor/function_executor_controller/message_validators.py +10 -3
  15. indexify/executor/function_executor_controller/metrics/downloads.py +8 -52
  16. indexify/executor/function_executor_controller/metrics/finalize_task.py +20 -0
  17. indexify/executor/function_executor_controller/metrics/prepare_task.py +18 -0
  18. indexify/executor/function_executor_controller/prepare_task.py +232 -14
  19. indexify/executor/function_executor_controller/run_task.py +77 -61
  20. indexify/executor/function_executor_controller/task_info.py +4 -7
  21. indexify/executor/function_executor_controller/task_input.py +21 -0
  22. indexify/executor/function_executor_controller/task_output.py +26 -35
  23. indexify/executor/function_executor_controller/terminate_function_executor.py +6 -1
  24. indexify/executor/logging.py +69 -0
  25. indexify/executor/monitoring/metrics.py +22 -0
  26. indexify/proto/executor_api.proto +11 -3
  27. indexify/proto/executor_api_pb2.py +54 -54
  28. indexify/proto/executor_api_pb2.pyi +8 -1
  29. {indexify-0.4.22.dist-info → indexify-0.4.23.dist-info}/METADATA +6 -7
  30. {indexify-0.4.22.dist-info → indexify-0.4.23.dist-info}/RECORD +32 -30
  31. indexify/executor/function_executor_controller/function_executor_startup_output.py +0 -21
  32. indexify/executor/function_executor_controller/metrics/upload_task_output.py +0 -39
  33. indexify/executor/function_executor_controller/upload_task_output.py +0 -274
  34. {indexify-0.4.22.dist-info → indexify-0.4.23.dist-info}/WHEEL +0 -0
  35. {indexify-0.4.22.dist-info → indexify-0.4.23.dist-info}/entry_points.txt +0 -0
indexify/cli/executor.py CHANGED
@@ -1,4 +1,4 @@
-from tensorlake.utils.logging import (
+from indexify.executor.logging import (
     configure_development_mode_logging,
     configure_logging_early,
     configure_production_mode_logging,
@@ -162,13 +162,6 @@ def executor(
         shutil.rmtree(str(executor_cache_path))
     executor_cache_path.mkdir(parents=True, exist_ok=True)
 
-    blob_store: BLOBStore = BLOBStore(
-        # Local FS mode is used in tests and in cases when user wants to store data on NFS.
-        local=LocalFSBLOBStore(),
-        # S3 is initiliazed lazily so it's okay to create it even if the user is not going to use it.
-        s3=S3BLOBStore(),
-    )
-
     host_resources_provider: HostResourcesProvider = HostResourcesProvider(
         gpu_allocator=NvidiaGPUAllocator(logger),
         # Assuming a simple setup in OSS where Executor container has a single file system
@@ -200,6 +193,6 @@ def executor(
         config_path=config_path,
         monitoring_server_host=monitoring_server_host,
         monitoring_server_port=monitoring_server_port,
-        blob_store=blob_store,
+        blob_store=BLOBStore(),
         host_resources_provider=host_resources_provider,
     ).run()
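
Net effect of this file: the CLI no longer wires individual backends into BLOBStore; the store now constructs LocalFSBLOBStore and S3BLOBStore itself and dispatches on the URI scheme. A minimal sketch of the resulting call pattern, assuming a stand-in logger (the executor actually passes a structlog-style logger whose error() accepts keyword fields):

```python
import asyncio
import logging

from indexify.executor.blob_store.blob_store import BLOBStore


async def main() -> None:
    store = BLOBStore()  # builds both backends internally; the S3 client is created lazily
    logger = logging.getLogger("example")  # stand-in for the executor's structured logger

    # "file://" URIs go to the local FS backend, anything else to S3.
    await store.upload("file:///tmp/blob-example", b"hello", logger)
    assert await store.get("file:///tmp/blob-example", logger) == b"hello"


asyncio.run(main())
```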
indexify/executor/blob_store/blob_store.py CHANGED
@@ -1,13 +1,25 @@
-from typing import Any, Optional
+from typing import Any
 
 from .local_fs_blob_store import LocalFSBLOBStore
 from .metrics.blob_store import (
+    metric_abort_multipart_upload_errors,
+    metric_abort_multipart_upload_latency,
+    metric_abort_multipart_upload_requests,
+    metric_complete_multipart_upload_errors,
+    metric_complete_multipart_upload_latency,
+    metric_complete_multipart_upload_requests,
+    metric_create_multipart_upload_errors,
+    metric_create_multipart_upload_latency,
+    metric_create_multipart_upload_requests,
     metric_get_blob_errors,
     metric_get_blob_latency,
     metric_get_blob_requests,
-    metric_put_blob_errors,
-    metric_put_blob_latency,
-    metric_put_blob_requests,
+    metric_presign_uri_errors,
+    metric_presign_uri_latency,
+    metric_presign_uri_requests,
+    metric_upload_blob_errors,
+    metric_upload_blob_latency,
+    metric_upload_blob_requests,
 )
 from .s3_blob_store import S3BLOBStore
 
@@ -15,12 +27,9 @@ from .s3_blob_store import S3BLOBStore
 class BLOBStore:
     """Dispatches generic BLOB store calls to their real backends."""
 
-    def __init__(
-        self, local: Optional[LocalFSBLOBStore] = None, s3: Optional[S3BLOBStore] = None
-    ):
-        """Creates a BLOB store that uses the supplied BLOB stores."""
-        self._local: Optional[LocalFSBLOBStore] = local
-        self._s3: Optional[S3BLOBStore] = s3
+    def __init__(self):
+        self._local: LocalFSBLOBStore = LocalFSBLOBStore()
+        self._s3: S3BLOBStore = S3BLOBStore()
 
     async def get(self, uri: str, logger: Any) -> bytes:
         """Returns binary value stored in BLOB with the supplied URI.
@@ -33,36 +42,111 @@ class BLOBStore:
         ):
             metric_get_blob_requests.inc()
             if _is_file_uri(uri):
-                self._check_local_is_available()
                 return await self._local.get(uri, logger)
             else:
-                self._check_s3_is_available()
                 return await self._s3.get(uri, logger)
 
-    async def put(self, uri: str, value: bytes, logger: Any) -> None:
+    async def presign_get_uri(self, uri: str, expires_in_sec: int, logger: Any) -> str:
+        """Returns a presigned URI for getting the BLOB with the supplied URI.
+
+        The URI allows to read any byte range in the BLOB."""
+        with (
+            metric_presign_uri_errors.count_exceptions(),
+            metric_presign_uri_latency.time(),
+        ):
+            metric_presign_uri_requests.inc()
+            if _is_file_uri(uri):
+                return await self._local.presign_get_uri(uri, expires_in_sec, logger)
+            else:
+                return await self._s3.presign_get_uri(uri, expires_in_sec, logger)
+
+    async def upload(self, uri: str, value: bytes, logger: Any) -> None:
         """Stores the supplied binary value in a BLOB with the supplied URI.
 
         Overwrites existing BLOB. Raises Exception on error.
         """
         with (
-            metric_put_blob_errors.count_exceptions(),
-            metric_put_blob_latency.time(),
+            metric_upload_blob_errors.count_exceptions(),
+            metric_upload_blob_latency.time(),
+        ):
+            metric_upload_blob_requests.inc()
+            if _is_file_uri(uri):
+                await self._local.upload(uri, value, logger)
+            else:
+                await self._s3.upload(uri, value, logger)
+
+    async def create_multipart_upload(self, uri: str, logger: Any) -> str:
+        """Creates a multipart upload for BLOB with the supplied URI and returns the upload ID."""
+        with (
+            metric_create_multipart_upload_errors.count_exceptions(),
+            metric_create_multipart_upload_latency.time(),
         ):
-            metric_put_blob_requests.inc()
+            metric_create_multipart_upload_requests.inc()
             if _is_file_uri(uri):
-                self._check_local_is_available()
-                await self._local.put(uri, value, logger)
+                return await self._local.create_multipart_upload(uri, logger)
             else:
-                self._check_s3_is_available()
-                await self._s3.put(uri, value, logger)
+                return await self._s3.create_multipart_upload(uri, logger)
 
-    def _check_local_is_available(self):
-        if self._local is None:
-            raise RuntimeError("Local file system BLOB store is not available")
+    async def complete_multipart_upload(
+        self, uri: str, upload_id: str, parts_etags: list[str], logger: Any
+    ) -> None:
+        """Completes a multipart upload for BLOB with the supplied URI.
 
-    def _check_s3_is_available(self):
-        if self._s3 is None:
-            raise RuntimeError("S3 BLOB store is not available")
+        parts_etags is a list of ETags for the parts that were uploaded.
+        The list is ordered by part number starting from 1.
+        """
+        with (
+            metric_complete_multipart_upload_errors.count_exceptions(),
+            metric_complete_multipart_upload_latency.time(),
+        ):
+            metric_complete_multipart_upload_requests.inc()
+            if _is_file_uri(uri):
+                await self._local.complete_multipart_upload(
+                    uri, upload_id, parts_etags, logger
+                )
+            else:
+                await self._s3.complete_multipart_upload(
+                    uri, upload_id, parts_etags, logger
+                )
+
+    async def abort_multipart_upload(
+        self, uri: str, upload_id: str, logger: Any
+    ) -> None:
+        """Aborts a multipart upload for BLOB with the supplied URI."""
+        with (
+            metric_abort_multipart_upload_errors.count_exceptions(),
+            metric_abort_multipart_upload_latency.time(),
+        ):
+            metric_abort_multipart_upload_requests.inc()
+            if _is_file_uri(uri):
+                await self._local.abort_multipart_upload(uri, upload_id, logger)
+            else:
+                await self._s3.abort_multipart_upload(uri, upload_id, logger)
+
+    async def presign_upload_part_uri(
+        self,
+        uri: str,
+        part_number: int,
+        upload_id: str,
+        expires_in_sec: int,
+        logger: Any,
+    ) -> str:
+        """Returns a presigned URI for uploading a part in a multipart upload.
+
+        part_number starts from 1."""
+        with (
+            metric_presign_uri_errors.count_exceptions(),
+            metric_presign_uri_latency.time(),
+        ):
+            metric_presign_uri_requests.inc()
+            if _is_file_uri(uri):
+                return await self._local.presign_upload_part_uri(
                    uri, part_number, upload_id, expires_in_sec, logger
+                )
+            else:
+                return await self._s3.presign_upload_part_uri(
+                    uri, part_number, upload_id, expires_in_sec, logger
+                )
 
 
 def _is_file_uri(uri: str) -> bool:
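
The new surface adds up to a scheme-agnostic multipart upload lifecycle: create, presign each part, complete with the collected ETags, abort on failure. A hedged driver sketch, under the assumption that part bodies are uploaded out of band via the presigned URIs (put_part is a hypothetical async helper that PUTs one part and returns its ETag; the 600-second expiry is arbitrary):

```python
import asyncio
from typing import Any, Awaitable, Callable


async def upload_in_parts(
    store: Any,  # a BLOBStore instance
    uri: str,
    parts: list[bytes],
    put_part: Callable[[str, bytes], Awaitable[str]],  # hypothetical: PUTs one part, returns its ETag
    logger: Any,
) -> None:
    # Hypothetical driver for the lifecycle: create, presign and upload each
    # part, then complete with the collected ETags; abort on any failure.
    upload_id = await store.create_multipart_upload(uri, logger)
    try:
        etags: list[str] = []
        for part_number, part in enumerate(parts, start=1):  # part numbers start at 1
            part_uri = await store.presign_upload_part_uri(
                uri, part_number, upload_id, 600, logger
            )
            etags.append(await put_part(part_uri, part))
        # parts_etags must be ordered by part number starting from 1.
        await store.complete_multipart_upload(uri, upload_id, etags, logger)
    except Exception:
        await store.abort_multipart_upload(uri, upload_id, logger)
        raise
```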
indexify/executor/blob_store/local_fs_blob_store.py CHANGED
@@ -16,7 +16,14 @@ class LocalFSBLOBStore:
         # Run synchronous code in a thread to not block the event loop.
         return await asyncio.to_thread(self._sync_get, _path_from_file_uri(uri))
 
-    async def put(self, uri: str, value: bytes, logger: Any) -> None:
+    async def presign_get_uri(self, uri: str, expires_in_sec: int, logger: Any) -> str:
+        """Returns a presigned URI for getting the file at the supplied URI.
+
+        For local files, just returns the file URI itself.
+        """
+        return uri
+
+    async def upload(self, uri: str, value: bytes, logger: Any) -> None:
         """Stores the supplied binary value in a file at the supplied URI.
 
         The URI must be a file URI (starts with "file://"). The path must be absolute.
@@ -25,6 +32,39 @@
         # Run synchronous code in a thread to not block the event loop.
         return await asyncio.to_thread(self._sync_put, _path_from_file_uri(uri), value)
 
+    async def create_multipart_upload(self, uri: str, logger: Any) -> str:
+        """Creates a multipart upload for local file and returns a dummy upload ID."""
+        # Local files do not require multipart upload, return a dummy ID
+        return "local-multipart-upload-id"
+
+    async def complete_multipart_upload(
+        self, uri: str, upload_id: str, parts_etags: list[str], logger: Any
+    ) -> None:
+        """Completes a multipart upload for local file. No-op for local files."""
+        # No action needed for local files
+        return None
+
+    async def abort_multipart_upload(
+        self, uri: str, upload_id: str, logger: Any
+    ) -> None:
+        """Aborts a multipart upload for local file. No-op for local files."""
+        # No action needed for local files
+        return None
+
+    async def presign_upload_part_uri(
+        self,
+        uri: str,
+        part_number: int,
+        upload_id: str,
+        expires_in_sec: int,
+        logger: Any,
+    ) -> str:
+        """Returns a presigned URI for uploading a part in a multipart upload for local file.
+
+        For local files, just returns the file URI itself.
+        """
+        return uri
+
     def _sync_get(self, path: str) -> bytes:
         if not os.path.isabs(path):
             raise ValueError(f"Path {path} must be absolute")
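
Because the local backend "presigns" by returning the file:// URI unchanged (and the S3 backend, per s3_blob_store.py below, rewrites its presigned https:// URL with an s3:// prefix), consumers of presigned get URIs have to dispatch on the scheme themselves. A hedged sketch of such a consumer:

```python
import urllib.parse
import urllib.request


def read_presigned(uri: str) -> bytes:
    # Hypothetical consumer of presign_get_uri results. The local backend hands
    # back the file:// URI unchanged, while the S3 backend rewrites its presigned
    # https:// URL with an s3:// prefix, so callers dispatch on the scheme.
    if uri.startswith("file://"):
        with open(urllib.parse.urlparse(uri).path, "rb") as f:
            return f.read()
    # Undo the s3:// rewrite to recover the signed https:// URL.
    request = urllib.request.Request(uri.replace("s3://", "https://", 1), method="GET")
    with urllib.request.urlopen(request) as response:
        return response.read()
```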
indexify/executor/blob_store/metrics/blob_store.py CHANGED
@@ -3,31 +3,103 @@ import prometheus_client
 from ...monitoring.metrics import latency_metric_for_fast_operation
 
 metric_get_blob_requests: prometheus_client.Counter = prometheus_client.Counter(
-    "get_blob_requests",
-    "Number of get blob requests",
+    "blob_store_get_requests",
+    "Number of get BLOB requests in BLOB store",
 )
 metric_get_blob_errors: prometheus_client.Counter = prometheus_client.Counter(
-    "get_blob_request_errors",
-    "Number of get blob request errors",
+    "blob_store_get_request_errors",
+    "Number of get BLOB request errors in BLOB store",
 )
 metric_get_blob_latency: prometheus_client.Histogram = (
     latency_metric_for_fast_operation(
-        "get_blob_request",
-        "get blob request",
+        "blob_store_get",
+        "BLOB store get BLOB request",
     )
 )
 
-metric_put_blob_requests: prometheus_client.Counter = prometheus_client.Counter(
-    "put_blob_requests",
-    "Number of put blob requests",
+metric_presign_uri_requests: prometheus_client.Counter = prometheus_client.Counter(
+    "blob_store_presign_uri_requests",
+    "Number of presign URI requests in BLOB store",
 )
-metric_put_blob_errors: prometheus_client.Counter = prometheus_client.Counter(
-    "put_blob_request_errors",
-    "Number of put blob request errors",
+metric_presign_uri_errors: prometheus_client.Counter = prometheus_client.Counter(
+    "blob_store_presign_uri_request_errors",
+    "Number of presign URI request errors in BLOB store",
 )
-metric_put_blob_latency: prometheus_client.Histogram = (
+metric_presign_uri_latency: prometheus_client.Histogram = (
     latency_metric_for_fast_operation(
-        "put_blob_request",
-        "put blob request",
+        "blob_store_presign_uri",
+        "BLOB store presign URI request",
+    )
+)
+
+metric_upload_blob_requests: prometheus_client.Counter = prometheus_client.Counter(
+    "blob_store_upload_requests",
+    "Number of upload BLOB requests in BLOB store",
+)
+metric_upload_blob_errors: prometheus_client.Counter = prometheus_client.Counter(
+    "blob_store_upload_request_errors",
+    "Number of upload BLOB request errors in BLOB store",
+)
+metric_upload_blob_latency: prometheus_client.Histogram = (
+    latency_metric_for_fast_operation(
+        "blob_store_upload",
+        "BLOB store upload BLOB request",
+    )
+)
+
+metric_create_multipart_upload_requests: prometheus_client.Counter = (
+    prometheus_client.Counter(
+        "blob_store_create_multipart_upload_requests",
+        "Number of create multipart upload requests in BLOB store",
+    )
+)
+metric_create_multipart_upload_errors: prometheus_client.Counter = (
+    prometheus_client.Counter(
+        "blob_store_create_multipart_upload_request_errors",
+        "Number of create multipart upload request errors in BLOB store",
+    )
+)
+metric_create_multipart_upload_latency: prometheus_client.Histogram = (
+    latency_metric_for_fast_operation(
+        "blob_store_create_multipart_upload_request",
+        "create multipart upload request in BLOB store",
+    )
+)
+
+metric_complete_multipart_upload_requests: prometheus_client.Counter = (
+    prometheus_client.Counter(
+        "blob_store_complete_multipart_upload_requests",
+        "Number of complete multipart upload requests in BLOB store",
+    )
+)
+metric_complete_multipart_upload_errors: prometheus_client.Counter = (
+    prometheus_client.Counter(
+        "blob_store_complete_multipart_upload_request_errors",
+        "Number of complete multipart upload request errors in BLOB store",
+    )
+)
+metric_complete_multipart_upload_latency: prometheus_client.Histogram = (
+    latency_metric_for_fast_operation(
+        "blob_store_complete_multipart_upload_request",
+        "complete multipart upload request in BLOB store",
+    )
+)
+
+metric_abort_multipart_upload_requests: prometheus_client.Counter = (
+    prometheus_client.Counter(
+        "blob_store_abort_multipart_upload_requests",
+        "Number of abort multipart upload requests in BLOB store",
+    )
+)
+metric_abort_multipart_upload_errors: prometheus_client.Counter = (
+    prometheus_client.Counter(
+        "blob_store_abort_multipart_upload_request_errors",
+        "Number of abort multipart upload request errors in BLOB store",
+    )
+)
+metric_abort_multipart_upload_latency: prometheus_client.Histogram = (
+    latency_metric_for_fast_operation(
+        "blob_store_abort_multipart_upload_request",
+        "abort multipart upload request in BLOB store",
     )
 )
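
The renamed metrics all follow the same requests/errors/latency triple, consumed through the count_exceptions()/time() context managers seen in blob_store.py above. A minimal self-contained sketch of that pattern with plain prometheus_client (the metric names are illustrative, and a default-bucket Histogram stands in for latency_metric_for_fast_operation):

```python
import prometheus_client

# Illustrative metric triple in the style used by the BLOB store module.
example_requests = prometheus_client.Counter(
    "example_requests", "Number of example requests"
)
example_errors = prometheus_client.Counter(
    "example_request_errors", "Number of example request errors"
)
example_latency = prometheus_client.Histogram(
    "example_request_latency_seconds", "Example request latency"
)


def do_request() -> None:
    # count_exceptions() increments the error counter if the body raises;
    # time() observes the wall-clock duration into the histogram.
    with example_errors.count_exceptions(), example_latency.time():
        example_requests.inc()
        ...  # the actual operation goes here
```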
indexify/executor/blob_store/s3_blob_store.py CHANGED
@@ -59,7 +59,28 @@ class S3BLOBStore:
             logger.error("failed to get S3 object", uri=uri, exc_info=e)
             raise
 
-    async def put(self, uri: str, value: bytes, logger: Any) -> None:
+    async def presign_get_uri(self, uri: str, expires_in_sec: int, logger: Any) -> str:
+        """Returns a presigned URI for getting the S3 object at the supplied URI."""
+        self._lazy_create_client()
+        bucket_name, key = _bucket_name_and_object_key_from_uri(uri)
+        try:
+            s3_uri: str = await asyncio.to_thread(
+                self._s3_client.generate_presigned_url,
+                ClientMethod="get_object",
+                Params={"Bucket": bucket_name, "Key": key},
+                ExpiresIn=expires_in_sec,
+            )
+            return s3_uri.replace("https://", "s3://", 1)
+        except Exception as e:
+            logger.error(
+                "failed to presign URI for get_object operation",
+                uri=uri,
+                exc_info=e,
+                expires_in_sec=expires_in_sec,
+            )
+            raise
+
+    async def upload(self, uri: str, value: bytes, logger: Any) -> None:
         """Stores the supplied binary value in a S3 object at the supplied URI.
 
         The URI must be S3 URI (starts with "s3://").
@@ -75,6 +96,96 @@
             logger.error("failed to set S3 object", uri=uri, exc_info=e)
             raise
 
+    async def create_multipart_upload(self, uri: str, logger: Any) -> str:
+        """Creates a multipart upload for S3 object and returns the upload ID."""
+        self._lazy_create_client()
+        bucket_name, key = _bucket_name_and_object_key_from_uri(uri)
+        try:
+            response = await asyncio.to_thread(
+                self._s3_client.create_multipart_upload,
+                Bucket=bucket_name,
+                Key=key,
+            )
+            return response["UploadId"]
+        except Exception as e:
+            logger.error("failed to create multipart upload", uri=uri, exc_info=e)
+            raise
+
+    async def complete_multipart_upload(
+        self, uri: str, upload_id: str, parts_etags: list[str], logger: Any
+    ) -> None:
+        """Completes a multipart upload for S3 object."""
+        self._lazy_create_client()
+        bucket_name, key = _bucket_name_and_object_key_from_uri(uri)
+        try:
+            await asyncio.to_thread(
+                self._s3_client.complete_multipart_upload,
+                Bucket=bucket_name,
+                Key=key,
+                UploadId=upload_id,
+                MultipartUpload={
+                    "Parts": [
+                        {"ETag": etag, "PartNumber": i + 1}
+                        for i, etag in enumerate(parts_etags)
+                    ]
+                },
+            )
+        except Exception as e:
+            logger.error("failed to complete multipart upload", uri=uri, exc_info=e)
+            raise
+
+    async def abort_multipart_upload(
+        self, uri: str, upload_id: str, logger: Any
+    ) -> None:
+        """Aborts a multipart upload for S3 object."""
+        self._lazy_create_client()
+        bucket_name, key = _bucket_name_and_object_key_from_uri(uri)
+        try:
+            await asyncio.to_thread(
+                self._s3_client.abort_multipart_upload,
+                Bucket=bucket_name,
+                Key=key,
+                UploadId=upload_id,
+            )
+        except Exception as e:
+            logger.error("failed to abort multipart upload", uri=uri, exc_info=e)
+            raise
+
+    async def presign_upload_part_uri(
+        self,
+        uri: str,
+        part_number: int,
+        upload_id: str,
+        expires_in_sec: int,
+        logger: Any,
+    ) -> str:
+        """Returns a presigned URI for uploading a part in a multipart upload for S3 object."""
+        self._lazy_create_client()
+        bucket_name, key = _bucket_name_and_object_key_from_uri(uri)
+        try:
+            response = await asyncio.to_thread(
+                self._s3_client.generate_presigned_url,
+                ClientMethod="upload_part",
+                Params={
+                    "Bucket": bucket_name,
+                    "Key": key,
+                    "UploadId": upload_id,
+                    "PartNumber": part_number,
+                },
+                ExpiresIn=expires_in_sec,
+            )
+            return response
+        except Exception as e:
+            logger.error(
+                "failed to presign URI for upload_part operation",
+                uri=uri,
+                exc_info=e,
+                part_number=part_number,
+                upload_id=upload_id,
+                expires_in_sec=expires_in_sec,
+            )
+            raise
+
 
 def _bucket_name_and_object_key_from_uri(uri: str) -> tuple[str, str]:
     # Example S3 object URI:
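
The diff shows only the presigning side; the counterpart is a plain HTTP PUT of the part body to the presigned URL, with S3 returning the part's ETag in a response header that complete_multipart_upload later needs. A hedged sketch of that client step, assuming the URL came from presign_upload_part_uri (which, unlike presign_get_uri, returns the https:// URL unmodified):

```python
import urllib.request


def upload_part_via_presigned_url(url: str, data: bytes) -> str:
    # PUT the part body to the presigned URL; no AWS credentials are needed
    # because the signature is embedded in the URL's query string.
    request = urllib.request.Request(url, data=data, method="PUT")
    with urllib.request.urlopen(request) as response:
        # S3 reports the part's ETag in a response header; collect it for
        # complete_multipart_upload, ordered by part number.
        return response.headers["ETag"]
```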
indexify/executor/function_executor/function_executor.py CHANGED
@@ -1,14 +1,11 @@
 import asyncio
 from dataclasses import dataclass
-from enum import Enum
 from typing import Any, Optional
 
 import grpc
 from tensorlake.function_executor.proto.function_executor_pb2 import (
     InfoRequest,
     InfoResponse,
-    InitializationFailureReason,
-    InitializationOutcomeCode,
     InitializeRequest,
     InitializeResponse,
 )
@@ -18,6 +15,8 @@ from tensorlake.function_executor.proto.function_executor_pb2_grpc import (
 from tensorlake.function_executor.proto.message_validator import MessageValidator
 from tensorlake.utils.http_client import get_httpx_client
 
+from indexify.executor.monitoring.metrics import IdempotentCounterChanger
+
 from .health_checker import HealthChecker
 from .invocation_state_client import InvocationStateClient
 from .metrics.function_executor import (
@@ -60,19 +59,14 @@ from .server.function_executor_server_factory import (
 )
 
 
-class FunctionExecutorInitializationError(Enum):
-    FUNCTION_TIMEOUT = 1
-    FUNCTION_ERROR = 2
-
-
 @dataclass
 class FunctionExecutorInitializationResult:
     """Result of FunctionExecutor initialization."""
 
-    # None error means success.
-    error: Optional[FunctionExecutorInitializationError] = None
-    stdout: Optional[str] = None
-    stderr: Optional[str] = None
+    # If True, timed out waiting for the Function Executor to initialize.
+    is_timeout: bool
+    # FE is unresponsive if response is None.
+    response: Optional[InitializeResponse]
 
 
 class FunctionExecutor:
@@ -89,12 +83,17 @@ class FunctionExecutor:
 
     def __init__(self, server_factory: FunctionExecutorServerFactory, logger: Any):
         self._server_factory: FunctionExecutorServerFactory = server_factory
-        self._logger = logger.bind(module=__name__)
+        self._logger: Any = logger.bind(module=__name__)
         self._server: Optional[FunctionExecutorServer] = None
         self._channel: Optional[grpc.aio.Channel] = None
         self._invocation_state_client: Optional[InvocationStateClient] = None
         self._health_checker: Optional[HealthChecker] = None
-        metric_function_executors_count.inc()
+        self._function_executors_counter_changer: IdempotentCounterChanger = (
+            IdempotentCounterChanger(
+                metric_function_executors_count,
+            )
+        )
+        self._function_executors_counter_changer.inc()
 
     async def initialize(
         self,
@@ -102,11 +101,11 @@ class FunctionExecutor:
         initialize_request: InitializeRequest,
         base_url: str,
         config_path: Optional[str],
-        customer_code_timeout_sec: Optional[float] = None,
+        customer_code_timeout_sec: float,
     ) -> FunctionExecutorInitializationResult:
         """Creates and initializes a FunctionExecutorServer and all resources associated with it.
 
-        Raises an Exception if an internal error occured."""
+        Raises an Exception if an Executor side internal error occured."""
         try:
             with (
                 metric_create_errors.count_exceptions(),
@@ -126,7 +125,7 @@ class FunctionExecutor:
                 await self._create_health_checker(self._channel, stub)
 
                 return await _initialize_server(
-                    stub, initialize_request, customer_code_timeout_sec
+                    stub, initialize_request, customer_code_timeout_sec, self._logger
                 )
         except Exception:
             await self.destroy()
@@ -152,7 +151,7 @@ class FunctionExecutor:
             metric_destroy_errors.count_exceptions(),
             metric_destroy_latency.time(),
         ):
-            metric_function_executors_count.dec()
+            self._function_executors_counter_changer.dec()
            metric_destroys.inc()
             await self._destroy_health_checker()
             await self._destroy_invocation_state_client()
@@ -306,7 +305,8 @@ async def _collect_server_info(stub: FunctionExecutorStub) -> None:
 async def _initialize_server(
     stub: FunctionExecutorStub,
     initialize_request: InitializeRequest,
-    customer_code_timeout_sec: Optional[float],
+    customer_code_timeout_sec: float,
+    logger: Any,
 ) -> FunctionExecutorInitializationResult:
     with (
         metric_initialize_rpc_errors.count_exceptions(),
@@ -317,46 +317,22 @@ async def _initialize_server(
                 initialize_request,
                 timeout=customer_code_timeout_sec,
             )
-
-            if (
-                initialize_response.outcome_code
-                == InitializationOutcomeCode.INITIALIZE_OUTCOME_CODE_SUCCESS
-            ):
+            return FunctionExecutorInitializationResult(
+                is_timeout=False,
+                response=initialize_response,
+            )
+        except grpc.aio.AioRpcError as e:
+            if e.code() == grpc.StatusCode.DEADLINE_EXCEEDED:
                 return FunctionExecutorInitializationResult(
-                    stdout=initialize_response.stdout, stderr=initialize_response.stderr
+                    is_timeout=True,
+                    response=None,
                 )
-            elif (
-                initialize_response.outcome_code
-                == InitializationOutcomeCode.INITIALIZE_OUTCOME_CODE_FAILURE
-            ):
-                if (
-                    initialize_response.failure_reason
-                    == InitializationFailureReason.INITIALIZATION_FAILURE_REASON_FUNCTION_ERROR
-                ):
-                    return FunctionExecutorInitializationResult(
-                        error=FunctionExecutorInitializationError.FUNCTION_ERROR,
-                        stdout=initialize_response.stdout,
-                        stderr=initialize_response.stderr,
-                    )
-                elif (
-                    initialize_response.failure_reason
-                    == InitializationFailureReason.INITIALIZATION_FAILURE_REASON_INTERNAL_ERROR
-                ):
-                    # Don't add stdout/stderr because this is customer data.
-                    raise RuntimeError("initialize RPC failed with internal error")
-                else:
-                    raise ValueError(
-                        f"unexpected failure reason {InitializationFailureReason.Name(initialize_response.failure_reason)} in initialize RPC response"
-                    )
             else:
-                raise ValueError(
-                    f"unexpected outcome code {InitializationOutcomeCode.Name(initialize_response.outcome_code)} in initialize RPC response"
+                logger.error(
+                    "Function Executor initialize RPC failed",
+                    exc_info=e,
                 )
-
-        except grpc.aio.AioRpcError as e:
-            if e.code() == grpc.StatusCode.DEADLINE_EXCEEDED:
                 return FunctionExecutorInitializationResult(
-                    error=FunctionExecutorInitializationError.FUNCTION_TIMEOUT,
-                    stderr=f"Function initialization exceeded its configured timeout of {customer_code_timeout_sec:.3f} sec.",
+                    is_timeout=False,
+                    response=None,
                 )
-            raise
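
With FunctionExecutorInitializationError gone, initialize() no longer classifies outcomes itself; callers branch on is_timeout and response, and interpretation of outcome_code/stdout/stderr moves up the stack (per this diff, into create_function_executor.py). A hedged sketch of consuming the new result type (the branch bodies are illustrative placeholders):

```python
from typing import Any


def handle_initialization_result(result: Any) -> None:
    # result is a FunctionExecutorInitializationResult from FunctionExecutor.initialize().
    if result.is_timeout:
        ...  # customer code exceeded customer_code_timeout_sec during initialization
    elif result.response is None:
        ...  # initialize RPC failed for another reason; the FE is unresponsive
    else:
        # Success path: result.response is the raw InitializeResponse; its
        # outcome_code, stdout and stderr are now interpreted by the caller.
        ...
```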