indexify 0.3.18__tar.gz → 0.3.19__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {indexify-0.3.18 → indexify-0.3.19}/PKG-INFO +1 -1
- {indexify-0.3.18 → indexify-0.3.19}/pyproject.toml +3 -1
- {indexify-0.3.18 → indexify-0.3.19}/src/indexify/cli/cli.py +3 -17
- {indexify-0.3.18 → indexify-0.3.19}/src/indexify/executor/api_objects.py +12 -0
- {indexify-0.3.18 → indexify-0.3.19}/src/indexify/executor/downloader.py +4 -1
- {indexify-0.3.18 → indexify-0.3.19}/src/indexify/executor/executor.py +51 -29
- {indexify-0.3.18 → indexify-0.3.19}/src/indexify/executor/function_executor/function_executor.py +24 -11
- {indexify-0.3.18 → indexify-0.3.19}/src/indexify/executor/function_executor/function_executor_state.py +9 -1
- {indexify-0.3.18 → indexify-0.3.19}/src/indexify/executor/function_executor/function_executor_states_container.py +3 -1
- {indexify-0.3.18 → indexify-0.3.19}/src/indexify/executor/function_executor/function_executor_status.py +2 -0
- {indexify-0.3.18 → indexify-0.3.19}/src/indexify/executor/function_executor/server/subprocess_function_executor_server_factory.py +6 -0
- {indexify-0.3.18 → indexify-0.3.19}/src/indexify/executor/function_executor/single_task_runner.py +15 -11
- {indexify-0.3.18 → indexify-0.3.19}/src/indexify/executor/function_executor/task_output.py +35 -2
- indexify-0.3.19/src/indexify/executor/grpc/completed_tasks_container.py +26 -0
- indexify-0.3.19/src/indexify/executor/grpc/function_executor_controller.py +421 -0
- {indexify-0.3.18 → indexify-0.3.19}/src/indexify/executor/grpc/state_reconciler.py +24 -34
- {indexify-0.3.18 → indexify-0.3.19}/src/indexify/executor/grpc/state_reporter.py +35 -32
- indexify-0.3.19/src/indexify/executor/grpc/task_controller.py +449 -0
- {indexify-0.3.18 → indexify-0.3.19}/src/indexify/executor/metrics/task_reporter.py +14 -0
- {indexify-0.3.18 → indexify-0.3.19}/src/indexify/executor/task_reporter.py +95 -4
- {indexify-0.3.18 → indexify-0.3.19}/src/indexify/executor/task_runner.py +1 -0
- {indexify-0.3.18 → indexify-0.3.19}/src/indexify/proto/executor_api.proto +63 -5
- indexify-0.3.19/src/indexify/proto/executor_api_pb2.py +80 -0
- {indexify-0.3.18 → indexify-0.3.19}/src/indexify/proto/executor_api_pb2.pyi +118 -3
- {indexify-0.3.18 → indexify-0.3.19}/src/indexify/proto/executor_api_pb2_grpc.py +47 -0
- indexify-0.3.18/src/indexify/proto/executor_api_pb2.py +0 -70
- {indexify-0.3.18 → indexify-0.3.19}/README.md +0 -0
- {indexify-0.3.18 → indexify-0.3.19}/src/indexify/executor/README.md +0 -0
- {indexify-0.3.18 → indexify-0.3.19}/src/indexify/executor/executor_flavor.py +0 -0
- {indexify-0.3.18 → indexify-0.3.19}/src/indexify/executor/function_executor/health_checker.py +0 -0
- {indexify-0.3.18 → indexify-0.3.19}/src/indexify/executor/function_executor/invocation_state_client.py +0 -0
- {indexify-0.3.18 → indexify-0.3.19}/src/indexify/executor/function_executor/metrics/function_executor.py +0 -0
- {indexify-0.3.18 → indexify-0.3.19}/src/indexify/executor/function_executor/metrics/function_executor_state.py +0 -0
- {indexify-0.3.18 → indexify-0.3.19}/src/indexify/executor/function_executor/metrics/function_executor_state_container.py +0 -0
- {indexify-0.3.18 → indexify-0.3.19}/src/indexify/executor/function_executor/metrics/health_checker.py +0 -0
- {indexify-0.3.18 → indexify-0.3.19}/src/indexify/executor/function_executor/metrics/invocation_state_client.py +0 -0
- {indexify-0.3.18 → indexify-0.3.19}/src/indexify/executor/function_executor/metrics/single_task_runner.py +0 -0
- {indexify-0.3.18 → indexify-0.3.19}/src/indexify/executor/function_executor/server/client_configuration.py +0 -0
- {indexify-0.3.18 → indexify-0.3.19}/src/indexify/executor/function_executor/server/function_executor_server.py +0 -0
- {indexify-0.3.18 → indexify-0.3.19}/src/indexify/executor/function_executor/server/function_executor_server_factory.py +0 -0
- {indexify-0.3.18 → indexify-0.3.19}/src/indexify/executor/function_executor/server/subprocess_function_executor_server.py +0 -0
- {indexify-0.3.18 → indexify-0.3.19}/src/indexify/executor/function_executor/task_input.py +0 -0
- {indexify-0.3.18 → indexify-0.3.19}/src/indexify/executor/grpc/channel_manager.py +0 -0
- {indexify-0.3.18 → indexify-0.3.19}/src/indexify/executor/grpc/metrics/channel_manager.py +0 -0
- {indexify-0.3.18 → indexify-0.3.19}/src/indexify/executor/grpc/metrics/state_reporter.py +0 -0
- {indexify-0.3.18 → indexify-0.3.19}/src/indexify/executor/metrics/downloader.py +0 -0
- {indexify-0.3.18 → indexify-0.3.19}/src/indexify/executor/metrics/executor.py +0 -0
- {indexify-0.3.18 → indexify-0.3.19}/src/indexify/executor/metrics/task_fetcher.py +0 -0
- {indexify-0.3.18 → indexify-0.3.19}/src/indexify/executor/metrics/task_runner.py +0 -0
- {indexify-0.3.18 → indexify-0.3.19}/src/indexify/executor/monitoring/function_allowlist.py +0 -0
- {indexify-0.3.18 → indexify-0.3.19}/src/indexify/executor/monitoring/handler.py +0 -0
- {indexify-0.3.18 → indexify-0.3.19}/src/indexify/executor/monitoring/health_check_handler.py +0 -0
- {indexify-0.3.18 → indexify-0.3.19}/src/indexify/executor/monitoring/health_checker/generic_health_checker.py +0 -0
- {indexify-0.3.18 → indexify-0.3.19}/src/indexify/executor/monitoring/health_checker/health_checker.py +0 -0
- {indexify-0.3.18 → indexify-0.3.19}/src/indexify/executor/monitoring/metrics.py +0 -0
- {indexify-0.3.18 → indexify-0.3.19}/src/indexify/executor/monitoring/prometheus_metrics_handler.py +0 -0
- {indexify-0.3.18 → indexify-0.3.19}/src/indexify/executor/monitoring/server.py +0 -0
- {indexify-0.3.18 → indexify-0.3.19}/src/indexify/executor/monitoring/startup_probe_handler.py +0 -0
- {indexify-0.3.18 → indexify-0.3.19}/src/indexify/executor/runtime_probes.py +0 -0
- {indexify-0.3.18 → indexify-0.3.19}/src/indexify/executor/task_fetcher.py +0 -0
@@ -1,7 +1,7 @@
|
|
1
1
|
[tool.poetry]
|
2
2
|
name = "indexify"
|
3
3
|
# Incremented if any of the components provided in this packages are updated.
|
4
|
-
version = "0.3.
|
4
|
+
version = "0.3.19"
|
5
5
|
description = "Open Source Indexify components and helper tools"
|
6
6
|
authors = ["Tensorlake Inc. <support@tensorlake.ai>"]
|
7
7
|
license = "Apache 2.0"
|
@@ -24,6 +24,8 @@ aiohttp = "^3.11.0"
|
|
24
24
|
prometheus-client = "^0.21.1"
|
25
25
|
# Adds function-executor binary and utils lib.
|
26
26
|
tensorlake = ">=0.1"
|
27
|
+
# Uncomment the next line to use local tensorlake package (only for development!)
|
28
|
+
# tensorlake = { path = "../tensorlake", develop = true }
|
27
29
|
# pydantic is provided by tensorlake
|
28
30
|
# httpx-sse is provided by tensorlake
|
29
31
|
# grpcio is provided by tensorlake
|
@@ -78,6 +78,7 @@ def build_image(
|
|
78
78
|
)
|
79
79
|
def executor(
|
80
80
|
server_addr: str = "localhost:8900",
|
81
|
+
grpc_server_addr: str = "localhost:8901",
|
81
82
|
dev: Annotated[
|
82
83
|
bool, typer.Option("--dev", "-d", help="Run the executor in development mode")
|
83
84
|
] = False,
|
@@ -120,16 +121,6 @@ def executor(
|
|
120
121
|
help="Port where to run Executor Monitoring server",
|
121
122
|
),
|
122
123
|
] = 7000,
|
123
|
-
grpc_server_addr: Annotated[
|
124
|
-
Optional[str],
|
125
|
-
typer.Option(
|
126
|
-
"--grpc-server-addr",
|
127
|
-
help=(
|
128
|
-
"(exprimental) Address of server gRPC API to connect to, e.g. 'localhost:8901'.\n"
|
129
|
-
"Enables gRPC state reporter that will periodically report the state of the Function Executors to Server\n"
|
130
|
-
),
|
131
|
-
),
|
132
|
-
] = None,
|
133
124
|
enable_grpc_state_reconciler: Annotated[
|
134
125
|
bool,
|
135
126
|
typer.Option(
|
@@ -166,11 +157,6 @@ def executor(
|
|
166
157
|
"--executor-id should be at least 10 characters long and only include characters _-[0-9][a-z][A-Z]"
|
167
158
|
)
|
168
159
|
|
169
|
-
if enable_grpc_state_reconciler and grpc_server_addr is None:
|
170
|
-
raise typer.BadParameter(
|
171
|
-
"--grpc-server-addr must be set when --enable-grpc-state-reconciler is set"
|
172
|
-
)
|
173
|
-
|
174
160
|
kv_labels: Dict[str, str] = {}
|
175
161
|
for label in labels:
|
176
162
|
key, value = label.split("=")
|
@@ -183,6 +169,7 @@ def executor(
|
|
183
169
|
"starting executor",
|
184
170
|
hostname=gethostname(),
|
185
171
|
server_addr=server_addr,
|
172
|
+
grpc_server_addr=grpc_server_addr,
|
186
173
|
config_path=config_path,
|
187
174
|
executor_version=executor_version,
|
188
175
|
labels=kv_labels,
|
@@ -192,7 +179,6 @@ def executor(
|
|
192
179
|
dev_mode=dev,
|
193
180
|
monitoring_server_host=monitoring_server_host,
|
194
181
|
monitoring_server_port=monitoring_server_port,
|
195
|
-
grpc_server_addr=grpc_server_addr,
|
196
182
|
enable_grpc_state_reconciler=enable_grpc_state_reconciler,
|
197
183
|
)
|
198
184
|
|
@@ -231,10 +217,10 @@ def executor(
|
|
231
217
|
server_ports=range(ports[0], ports[1]),
|
232
218
|
),
|
233
219
|
server_addr=server_addr,
|
220
|
+
grpc_server_addr=grpc_server_addr,
|
234
221
|
config_path=config_path,
|
235
222
|
monitoring_server_host=monitoring_server_host,
|
236
223
|
monitoring_server_port=monitoring_server_port,
|
237
|
-
grpc_server_addr=grpc_server_addr,
|
238
224
|
enable_grpc_state_reconciler=enable_grpc_state_reconciler,
|
239
225
|
).run()
|
240
226
|
|
@@ -49,5 +49,17 @@ class TaskResult(BaseModel):
|
|
49
49
|
reducer: bool = False
|
50
50
|
|
51
51
|
|
52
|
+
class DataPayload(BaseModel):
|
53
|
+
path: str
|
54
|
+
size: int
|
55
|
+
sha256_hash: str
|
56
|
+
|
57
|
+
|
58
|
+
class IngestFnOutputsResponse(BaseModel):
|
59
|
+
data_payloads: List[DataPayload]
|
60
|
+
stdout: Optional[DataPayload] = None
|
61
|
+
stderr: Optional[DataPayload] = None
|
62
|
+
|
63
|
+
|
52
64
|
TASK_OUTCOME_SUCCESS = "success"
|
53
65
|
TASK_OUTCOME_FAILURE = "failure"
|
@@ -241,7 +241,10 @@ class Downloader:
|
|
241
241
|
def serialized_object_from_http_response(response: httpx.Response) -> SerializedObject:
|
242
242
|
# We're hardcoding the content type currently used by Python SDK. It might change in the future.
|
243
243
|
# There's no other way for now to determine if the response is a bytes or string.
|
244
|
-
if response.headers["content-type"]
|
244
|
+
if response.headers["content-type"] in [
|
245
|
+
"application/octet-stream",
|
246
|
+
"application/pickle",
|
247
|
+
]:
|
245
248
|
return SerializedObject(
|
246
249
|
bytes=response.content, content_type=response.headers["content-type"]
|
247
250
|
)
|
@@ -64,10 +64,10 @@ class Executor:
|
|
64
64
|
function_allowlist: Optional[List[FunctionURI]],
|
65
65
|
function_executor_server_factory: FunctionExecutorServerFactory,
|
66
66
|
server_addr: str,
|
67
|
+
grpc_server_addr: str,
|
67
68
|
config_path: Optional[str],
|
68
69
|
monitoring_server_host: str,
|
69
70
|
monitoring_server_port: int,
|
70
|
-
grpc_server_addr: Optional[str],
|
71
71
|
enable_grpc_state_reconciler: bool,
|
72
72
|
):
|
73
73
|
self._logger = structlog.get_logger(module=__name__)
|
@@ -97,43 +97,40 @@ class Executor:
|
|
97
97
|
self._downloader = Downloader(
|
98
98
|
code_path=code_path, base_url=self._base_url, config_path=config_path
|
99
99
|
)
|
100
|
+
self._function_allowlist: Optional[List[FunctionURI]] = function_allowlist
|
101
|
+
self._function_executor_server_factory = function_executor_server_factory
|
102
|
+
self._channel_manager = ChannelManager(
|
103
|
+
server_address=grpc_server_addr,
|
104
|
+
config_path=config_path,
|
105
|
+
logger=self._logger,
|
106
|
+
)
|
107
|
+
self._state_reporter = ExecutorStateReporter(
|
108
|
+
executor_id=id,
|
109
|
+
flavor=flavor,
|
110
|
+
version=version,
|
111
|
+
labels=labels,
|
112
|
+
development_mode=development_mode,
|
113
|
+
function_allowlist=self._function_allowlist,
|
114
|
+
function_executor_states=self._function_executor_states,
|
115
|
+
channel_manager=self._channel_manager,
|
116
|
+
logger=self._logger,
|
117
|
+
)
|
118
|
+
self._state_reporter.update_executor_status(
|
119
|
+
ExecutorStatus.EXECUTOR_STATUS_STARTING_UP
|
120
|
+
)
|
100
121
|
self._task_reporter = TaskReporter(
|
101
122
|
base_url=self._base_url,
|
102
123
|
executor_id=id,
|
103
124
|
config_path=config_path,
|
125
|
+
channel_manager=self._channel_manager,
|
104
126
|
)
|
105
|
-
self._function_allowlist: Optional[List[FunctionURI]] = function_allowlist
|
106
|
-
self._function_executor_server_factory = function_executor_server_factory
|
107
127
|
|
108
|
-
# HTTP mode
|
128
|
+
# HTTP mode task runner
|
109
129
|
self._task_runner: Optional[TaskRunner] = None
|
110
130
|
self._task_fetcher: Optional[TaskFetcher] = None
|
111
|
-
# gRPC mode
|
112
|
-
self._channel_manager: Optional[ChannelManager] = None
|
113
|
-
self._state_reporter: Optional[ExecutorStateReporter] = None
|
131
|
+
# gRPC mode state reconciler that runs tasks
|
114
132
|
self._state_reconciler: Optional[ExecutorStateReconciler] = None
|
115
133
|
|
116
|
-
if grpc_server_addr is not None:
|
117
|
-
self._channel_manager = ChannelManager(
|
118
|
-
server_address=grpc_server_addr,
|
119
|
-
config_path=config_path,
|
120
|
-
logger=self._logger,
|
121
|
-
)
|
122
|
-
self._state_reporter = ExecutorStateReporter(
|
123
|
-
executor_id=id,
|
124
|
-
flavor=flavor,
|
125
|
-
version=version,
|
126
|
-
labels=labels,
|
127
|
-
development_mode=development_mode,
|
128
|
-
function_allowlist=self._function_allowlist,
|
129
|
-
function_executor_states=self._function_executor_states,
|
130
|
-
channel_manager=self._channel_manager,
|
131
|
-
logger=self._logger,
|
132
|
-
)
|
133
|
-
self._state_reporter.update_executor_status(
|
134
|
-
ExecutorStatus.EXECUTOR_STATUS_STARTING_UP
|
135
|
-
)
|
136
|
-
|
137
134
|
if enable_grpc_state_reconciler:
|
138
135
|
self._state_reconciler = ExecutorStateReconciler(
|
139
136
|
executor_id=id,
|
@@ -171,8 +168,8 @@ class Executor:
|
|
171
168
|
"version": version,
|
172
169
|
"code_path": str(code_path),
|
173
170
|
"server_addr": server_addr,
|
174
|
-
"config_path": str(config_path),
|
175
171
|
"grpc_server_addr": str(grpc_server_addr),
|
172
|
+
"config_path": str(config_path),
|
176
173
|
"enable_grpc_state_reconciler": str(enable_grpc_state_reconciler),
|
177
174
|
"hostname": gethostname(),
|
178
175
|
}
|
@@ -256,6 +253,9 @@ class Executor:
|
|
256
253
|
)
|
257
254
|
logger.error("task execution failed", exc_info=e)
|
258
255
|
|
256
|
+
if output.metrics is not None:
|
257
|
+
self.log_function_metrics(output)
|
258
|
+
|
259
259
|
with (
|
260
260
|
metric_tasks_reporting_outcome.track_inprogress(),
|
261
261
|
metric_task_outcome_report_latency.time(),
|
@@ -265,6 +265,28 @@ class Executor:
|
|
265
265
|
|
266
266
|
metric_task_completion_latency.observe(time.monotonic() - start_time)
|
267
267
|
|
268
|
+
def log_function_metrics(self, output: TaskOutput):
|
269
|
+
for counter_name, counter_value in output.metrics.counters.items():
|
270
|
+
self._logger.info(
|
271
|
+
f"function_metric",
|
272
|
+
counter_name=counter_name,
|
273
|
+
counter_value=counter_value,
|
274
|
+
invocation_id=output.graph_invocation_id,
|
275
|
+
function_name=output.function_name,
|
276
|
+
graph_name=output.graph_name,
|
277
|
+
namespace=output.namespace,
|
278
|
+
)
|
279
|
+
for timer_name, timer_value in output.metrics.timers.items():
|
280
|
+
self._logger.info(
|
281
|
+
f"function_metric",
|
282
|
+
timer_name=timer_name,
|
283
|
+
timer_value=timer_value,
|
284
|
+
invocation_id=output.graph_invocation_id,
|
285
|
+
function_name=output.function_name,
|
286
|
+
graph_name=output.graph_name,
|
287
|
+
namespace=output.namespace,
|
288
|
+
)
|
289
|
+
|
268
290
|
async def _run_task_and_get_output(self, task: Task, logger: Any) -> TaskOutput:
|
269
291
|
graph: SerializedObject = await self._downloader.download_graph(
|
270
292
|
namespace=task.namespace,
|
{indexify-0.3.18 → indexify-0.3.19}/src/indexify/executor/function_executor/function_executor.py
RENAMED
@@ -88,6 +88,7 @@ class FunctionExecutor:
|
|
88
88
|
initialize_request: InitializeRequest,
|
89
89
|
base_url: str,
|
90
90
|
config_path: Optional[str],
|
91
|
+
customer_code_timeout_sec: Optional[float] = None,
|
91
92
|
):
|
92
93
|
"""Creates and initializes a FunctionExecutorServer and all resources associated with it.
|
93
94
|
|
@@ -103,7 +104,9 @@ class FunctionExecutor:
|
|
103
104
|
await self._establish_channel()
|
104
105
|
stub: FunctionExecutorStub = FunctionExecutorStub(self._channel)
|
105
106
|
await _collect_server_info(stub)
|
106
|
-
await _initialize_server(
|
107
|
+
await _initialize_server(
|
108
|
+
stub, initialize_request, customer_code_timeout_sec
|
109
|
+
)
|
107
110
|
await self._create_invocation_state_client(
|
108
111
|
stub=stub,
|
109
112
|
base_url=base_url,
|
@@ -293,18 +296,28 @@ async def _collect_server_info(stub: FunctionExecutorStub) -> None:
|
|
293
296
|
|
294
297
|
|
295
298
|
async def _initialize_server(
|
296
|
-
stub: FunctionExecutorStub,
|
299
|
+
stub: FunctionExecutorStub,
|
300
|
+
initialize_request: InitializeRequest,
|
301
|
+
customer_code_timeout_sec: Optional[float],
|
297
302
|
) -> None:
|
298
303
|
with (
|
299
304
|
metric_initialize_rpc_errors.count_exceptions(),
|
300
305
|
metric_initialize_rpc_latency.time(),
|
301
306
|
):
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
|
308
|
-
|
309
|
-
|
310
|
-
|
307
|
+
try:
|
308
|
+
initialize_response: InitializeResponse = await stub.initialize(
|
309
|
+
initialize_request,
|
310
|
+
timeout=customer_code_timeout_sec,
|
311
|
+
)
|
312
|
+
if initialize_response.success:
|
313
|
+
return
|
314
|
+
if initialize_response.HasField("customer_error"):
|
315
|
+
raise CustomerError(initialize_response.customer_error)
|
316
|
+
else:
|
317
|
+
raise Exception("initialize RPC failed at function executor server")
|
318
|
+
except grpc.aio.AioRpcError as e:
|
319
|
+
if e.code() == grpc.StatusCode.DEADLINE_EXCEEDED:
|
320
|
+
raise CustomerError(
|
321
|
+
f"Customer code timeout {customer_code_timeout_sec} sec expired"
|
322
|
+
) from e
|
323
|
+
raise
|
@@ -25,6 +25,7 @@ class FunctionExecutorState:
|
|
25
25
|
graph_version: str,
|
26
26
|
function_name: str,
|
27
27
|
image_uri: Optional[str],
|
28
|
+
secret_names: List[str],
|
28
29
|
logger: Any,
|
29
30
|
):
|
30
31
|
# Read only fields.
|
@@ -33,6 +34,7 @@ class FunctionExecutorState:
|
|
33
34
|
self.graph_name: str = graph_name
|
34
35
|
self.function_name: str = function_name
|
35
36
|
self.image_uri: Optional[str] = image_uri
|
37
|
+
self.secret_names: List[str] = secret_names
|
36
38
|
self._logger: Any = logger.bind(
|
37
39
|
module=__name__,
|
38
40
|
function_executor_id=id,
|
@@ -47,6 +49,7 @@ class FunctionExecutorState:
|
|
47
49
|
# TODO: Move graph_version to immutable fields once we migrate to gRPC State Reconciler.
|
48
50
|
self.graph_version: str = graph_version
|
49
51
|
self.status: FunctionExecutorStatus = FunctionExecutorStatus.DESTROYED
|
52
|
+
self.status_message: str = ""
|
50
53
|
self.status_change_notifier: asyncio.Condition = asyncio.Condition(
|
51
54
|
lock=self.lock
|
52
55
|
)
|
@@ -62,7 +65,9 @@ class FunctionExecutorState:
|
|
62
65
|
while self.status not in allowlist:
|
63
66
|
await self.status_change_notifier.wait()
|
64
67
|
|
65
|
-
async def set_status(
|
68
|
+
async def set_status(
|
69
|
+
self, new_status: FunctionExecutorStatus, status_message: str = ""
|
70
|
+
) -> None:
|
66
71
|
"""Sets the status of the Function Executor.
|
67
72
|
|
68
73
|
The caller must hold the lock.
|
@@ -70,6 +75,7 @@ class FunctionExecutorState:
|
|
70
75
|
"""
|
71
76
|
self.check_locked()
|
72
77
|
if is_status_change_allowed(self.status, new_status):
|
78
|
+
# If status didn't change then still log it for visibility.
|
73
79
|
self._logger.info(
|
74
80
|
"function executor status changed",
|
75
81
|
old_status=self.status.name,
|
@@ -78,12 +84,14 @@ class FunctionExecutorState:
|
|
78
84
|
metric_function_executors_with_status.labels(status=self.status.name).dec()
|
79
85
|
metric_function_executors_with_status.labels(status=new_status.name).inc()
|
80
86
|
self.status = new_status
|
87
|
+
self.status_message = status_message
|
81
88
|
self.status_change_notifier.notify_all()
|
82
89
|
else:
|
83
90
|
raise ValueError(
|
84
91
|
f"Invalid status change from {self.status} to {new_status}"
|
85
92
|
)
|
86
93
|
|
94
|
+
# TODO: Delete this method once HTTP protocol is removed as it's used only there.
|
87
95
|
async def destroy_function_executor(self) -> None:
|
88
96
|
"""Destroys the Function Executor if it exists.
|
89
97
|
|
@@ -1,5 +1,5 @@
|
|
1
1
|
import asyncio
|
2
|
-
from typing import Any, AsyncGenerator, Dict, Optional
|
2
|
+
from typing import Any, AsyncGenerator, Dict, List, Optional
|
3
3
|
|
4
4
|
from .function_executor_state import FunctionExecutorState
|
5
5
|
from .function_executor_status import FunctionExecutorStatus
|
@@ -26,6 +26,7 @@ class FunctionExecutorStatesContainer:
|
|
26
26
|
graph_version: str,
|
27
27
|
function_name: str,
|
28
28
|
image_uri: Optional[str],
|
29
|
+
secret_names: List[str],
|
29
30
|
) -> FunctionExecutorState:
|
30
31
|
"""Get or create a function executor state with the given ID.
|
31
32
|
|
@@ -45,6 +46,7 @@ class FunctionExecutorStatesContainer:
|
|
45
46
|
graph_version=graph_version,
|
46
47
|
function_name=function_name,
|
47
48
|
image_uri=image_uri,
|
49
|
+
secret_names=secret_names,
|
48
50
|
logger=self._logger,
|
49
51
|
)
|
50
52
|
self._states[id] = state
|
@@ -23,6 +23,7 @@ class FunctionExecutorStatus(Enum):
|
|
23
23
|
UNHEALTHY = "Unhealthy"
|
24
24
|
# STARTUP_FAILED_CUSTOMER_ERROR -> DESTROYING
|
25
25
|
# STARTUP_FAILED_PLATFORM_ERROR -> DESTROYING
|
26
|
+
# RUNNING_TASK -> DESTROYING
|
26
27
|
# UNHEALTHY -> DESTROYING
|
27
28
|
# IDLE -> DESTROYING
|
28
29
|
DESTROYING = "Destroying"
|
@@ -69,6 +70,7 @@ def is_status_change_allowed(
|
|
69
70
|
],
|
70
71
|
FunctionExecutorStatus.RUNNING_TASK: [
|
71
72
|
FunctionExecutorStatus.RUNNING_TASK,
|
73
|
+
FunctionExecutorStatus.DESTROYING,
|
72
74
|
FunctionExecutorStatus.IDLE,
|
73
75
|
FunctionExecutorStatus.UNHEALTHY,
|
74
76
|
FunctionExecutorStatus.SHUTDOWN,
|
@@ -25,6 +25,12 @@ class SubprocessFunctionExecutorServerFactory(FunctionExecutorServerFactory):
|
|
25
25
|
logger = logger.bind(module=__name__)
|
26
26
|
port: Optional[int] = None
|
27
27
|
|
28
|
+
if len(config.secret_names) > 0:
|
29
|
+
logger.warning(
|
30
|
+
"Subprocess Function Executor does not support secrets. Please supply secrets as environment variables.",
|
31
|
+
secret_names=config.secret_names,
|
32
|
+
)
|
33
|
+
|
28
34
|
try:
|
29
35
|
port = self._allocate_port()
|
30
36
|
args = [
|
{indexify-0.3.18 → indexify-0.3.19}/src/indexify/executor/function_executor/single_task_runner.py
RENAMED
@@ -10,6 +10,7 @@ from tensorlake.function_executor.proto.function_executor_pb2 import (
|
|
10
10
|
from tensorlake.function_executor.proto.function_executor_pb2_grpc import (
|
11
11
|
FunctionExecutorStub,
|
12
12
|
)
|
13
|
+
from tensorlake.function_executor.proto.message_validator import MessageValidator
|
13
14
|
|
14
15
|
from ..api_objects import Task
|
15
16
|
from .function_executor import CustomerError, FunctionExecutor
|
@@ -26,7 +27,7 @@ from .server.function_executor_server_factory import (
|
|
26
27
|
FunctionExecutorServerFactory,
|
27
28
|
)
|
28
29
|
from .task_input import TaskInput
|
29
|
-
from .task_output import TaskOutput
|
30
|
+
from .task_output import TaskMetrics, TaskOutput
|
30
31
|
|
31
32
|
|
32
33
|
class SingleTaskRunner:
|
@@ -286,16 +287,17 @@ class _RunningTaskContextManager:
|
|
286
287
|
|
287
288
|
|
288
289
|
def _task_output(task: Task, response: RunTaskResponse) -> TaskOutput:
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
|
297
|
-
if
|
298
|
-
|
290
|
+
response_validator = MessageValidator(response)
|
291
|
+
response_validator.required_field("stdout")
|
292
|
+
response_validator.required_field("stderr")
|
293
|
+
response_validator.required_field("is_reducer")
|
294
|
+
response_validator.required_field("success")
|
295
|
+
|
296
|
+
metrics = TaskMetrics(counters={}, timers={})
|
297
|
+
if response.HasField("metrics"):
|
298
|
+
# Can be None if e.g. function failed.
|
299
|
+
metrics.counters = dict(response.metrics.counters)
|
300
|
+
metrics.timers = dict(response.metrics.timers)
|
299
301
|
|
300
302
|
output = TaskOutput(
|
301
303
|
task_id=task.id,
|
@@ -308,10 +310,12 @@ def _task_output(task: Task, response: RunTaskResponse) -> TaskOutput:
|
|
308
310
|
stderr=response.stderr,
|
309
311
|
reducer=response.is_reducer,
|
310
312
|
success=response.success,
|
313
|
+
metrics=metrics,
|
311
314
|
)
|
312
315
|
|
313
316
|
if response.HasField("function_output"):
|
314
317
|
output.function_output = response.function_output
|
318
|
+
output.output_encoding = response.function_output.output_encoding
|
315
319
|
if response.HasField("router_output"):
|
316
320
|
output.router_output = response.router_output
|
317
321
|
|
@@ -1,11 +1,17 @@
|
|
1
|
-
from typing import Optional
|
1
|
+
from typing import Dict, Optional
|
2
2
|
|
3
3
|
from tensorlake.function_executor.proto.function_executor_pb2 import (
|
4
4
|
FunctionOutput,
|
5
5
|
RouterOutput,
|
6
6
|
)
|
7
7
|
|
8
|
-
|
8
|
+
|
9
|
+
class TaskMetrics:
|
10
|
+
"""Metrics for a task."""
|
11
|
+
|
12
|
+
def __init__(self, counters: Dict[str, int], timers: Dict[str, float]):
|
13
|
+
self.counters = counters
|
14
|
+
self.timers = timers
|
9
15
|
|
10
16
|
|
11
17
|
class TaskOutput:
|
@@ -19,6 +25,7 @@ class TaskOutput:
|
|
19
25
|
function_name: str,
|
20
26
|
graph_version: str,
|
21
27
|
graph_invocation_id: str,
|
28
|
+
output_encoding: Optional[str] = None,
|
22
29
|
function_output: Optional[FunctionOutput] = None,
|
23
30
|
router_output: Optional[RouterOutput] = None,
|
24
31
|
stdout: Optional[str] = None,
|
@@ -26,6 +33,7 @@ class TaskOutput:
|
|
26
33
|
reducer: bool = False,
|
27
34
|
success: bool = False,
|
28
35
|
is_internal_error: bool = False,
|
36
|
+
metrics: Optional[TaskMetrics] = None,
|
29
37
|
):
|
30
38
|
self.task_id = task_id
|
31
39
|
self.namespace = namespace
|
@@ -40,6 +48,8 @@ class TaskOutput:
|
|
40
48
|
self.reducer = reducer
|
41
49
|
self.success = success
|
42
50
|
self.is_internal_error = is_internal_error
|
51
|
+
self.metrics = metrics
|
52
|
+
self.output_encoding = output_encoding
|
43
53
|
|
44
54
|
@classmethod
|
45
55
|
def internal_error(
|
@@ -63,3 +73,26 @@ class TaskOutput:
|
|
63
73
|
stderr="Platform failed to execute the function.",
|
64
74
|
is_internal_error=True,
|
65
75
|
)
|
76
|
+
|
77
|
+
@classmethod
|
78
|
+
def function_timeout(
|
79
|
+
cls,
|
80
|
+
task_id: str,
|
81
|
+
namespace: str,
|
82
|
+
graph_name: str,
|
83
|
+
function_name: str,
|
84
|
+
graph_version: str,
|
85
|
+
graph_invocation_id: str,
|
86
|
+
) -> "TaskOutput":
|
87
|
+
"""Creates a TaskOutput for an function timeout error."""
|
88
|
+
# Task stdout, stderr is not available.
|
89
|
+
return TaskOutput(
|
90
|
+
task_id=task_id,
|
91
|
+
namespace=namespace,
|
92
|
+
graph_name=graph_name,
|
93
|
+
function_name=function_name,
|
94
|
+
graph_version=graph_version,
|
95
|
+
graph_invocation_id=graph_invocation_id,
|
96
|
+
stderr="Function execution timed out.",
|
97
|
+
is_internal_error=False,
|
98
|
+
)
|
@@ -0,0 +1,26 @@
|
|
1
|
+
import asyncio
|
2
|
+
from typing import List, Set
|
3
|
+
|
4
|
+
|
5
|
+
class CompletedTasksContainer:
|
6
|
+
"""An asyncio concurrent container for the completed task IDs."""
|
7
|
+
|
8
|
+
def __init__(self):
|
9
|
+
# The fields below are protected by the lock.
|
10
|
+
self._lock: asyncio.Lock = asyncio.Lock()
|
11
|
+
self._completed_task_ids: Set[str] = set()
|
12
|
+
|
13
|
+
async def add(self, task_id: str) -> None:
|
14
|
+
"""Add a task to the container."""
|
15
|
+
async with self._lock:
|
16
|
+
self._completed_task_ids.add(task_id)
|
17
|
+
|
18
|
+
async def contains(self, task_id: str) -> bool:
|
19
|
+
"""Check if the task is in the container."""
|
20
|
+
async with self._lock:
|
21
|
+
return task_id in self._completed_task_ids
|
22
|
+
|
23
|
+
async def replace(self, task_ids: List[str]) -> None:
|
24
|
+
"""Replaces the task IDs with the supplied task IDs."""
|
25
|
+
async with self._lock:
|
26
|
+
self._completed_task_ids = set(task_ids)
|