indexify 0.3.30-py3-none-any.whl → 0.4.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- indexify/cli/__init__.py +18 -0
- indexify/cli/build_image.py +51 -0
- indexify/cli/deploy.py +57 -0
- indexify/cli/executor.py +205 -0
- indexify/executor/{grpc/channel_manager.py → channel_manager.py} +17 -11
- indexify/executor/executor.py +57 -311
- indexify/executor/function_allowlist.py +59 -0
- indexify/executor/function_executor/function_executor.py +12 -6
- indexify/executor/function_executor/invocation_state_client.py +25 -3
- indexify/executor/function_executor/server/function_executor_server_factory.py +3 -3
- indexify/executor/function_executor/server/subprocess_function_executor_server_factory.py +22 -11
- indexify/executor/function_executor_controller/__init__.py +13 -0
- indexify/executor/function_executor_controller/completed_task_metrics.py +82 -0
- indexify/executor/function_executor_controller/create_function_executor.py +154 -0
- indexify/executor/function_executor_controller/debug_event_loop.py +37 -0
- indexify/executor/function_executor_controller/destroy_function_executor.py +28 -0
- indexify/executor/function_executor_controller/downloads.py +199 -0
- indexify/executor/function_executor_controller/events.py +172 -0
- indexify/executor/function_executor_controller/function_executor_controller.py +759 -0
- indexify/executor/function_executor_controller/loggers.py +57 -0
- indexify/executor/function_executor_controller/message_validators.py +65 -0
- indexify/executor/function_executor_controller/metrics/completed_task_metrics.py +68 -0
- indexify/executor/{metrics/downloader.py → function_executor_controller/metrics/downloads.py} +1 -3
- indexify/executor/function_executor_controller/metrics/function_executor_controller.py +60 -0
- indexify/executor/{function_executor/metrics/single_task_runner.py → function_executor_controller/metrics/run_task.py} +9 -3
- indexify/executor/function_executor_controller/metrics/upload_task_output.py +39 -0
- indexify/executor/function_executor_controller/prepare_task.py +38 -0
- indexify/executor/function_executor_controller/run_task.py +201 -0
- indexify/executor/function_executor_controller/task_info.py +33 -0
- indexify/executor/function_executor_controller/task_output.py +122 -0
- indexify/executor/function_executor_controller/upload_task_output.py +234 -0
- indexify/executor/host_resources/host_resources.py +20 -25
- indexify/executor/{grpc/metrics → metrics}/channel_manager.py +1 -1
- indexify/executor/metrics/executor.py +0 -47
- indexify/executor/{grpc/metrics → metrics}/state_reconciler.py +1 -1
- indexify/executor/{grpc/metrics → metrics}/state_reporter.py +1 -1
- indexify/executor/monitoring/health_checker/generic_health_checker.py +6 -59
- indexify/executor/monitoring/health_checker/health_checker.py +0 -11
- indexify/executor/{grpc/state_reconciler.py → state_reconciler.py} +139 -141
- indexify/executor/state_reporter.py +364 -0
- indexify/proto/executor_api.proto +67 -59
- indexify/proto/executor_api_pb2.py +52 -52
- indexify/proto/executor_api_pb2.pyi +125 -104
- indexify/proto/executor_api_pb2_grpc.py +0 -47
- {indexify-0.3.30.dist-info → indexify-0.4.2.dist-info}/METADATA +1 -3
- indexify-0.4.2.dist-info/RECORD +68 -0
- indexify-0.4.2.dist-info/entry_points.txt +3 -0
- indexify/cli/cli.py +0 -267
- indexify/executor/api_objects.py +0 -92
- indexify/executor/downloader.py +0 -417
- indexify/executor/executor_flavor.py +0 -7
- indexify/executor/function_executor/function_executor_state.py +0 -107
- indexify/executor/function_executor/function_executor_states_container.py +0 -93
- indexify/executor/function_executor/function_executor_status.py +0 -95
- indexify/executor/function_executor/metrics/function_executor_state.py +0 -46
- indexify/executor/function_executor/metrics/function_executor_state_container.py +0 -10
- indexify/executor/function_executor/single_task_runner.py +0 -345
- indexify/executor/function_executor/task_input.py +0 -21
- indexify/executor/function_executor/task_output.py +0 -105
- indexify/executor/grpc/function_executor_controller.py +0 -418
- indexify/executor/grpc/metrics/task_controller.py +0 -8
- indexify/executor/grpc/state_reporter.py +0 -314
- indexify/executor/grpc/task_controller.py +0 -508
- indexify/executor/metrics/task_fetcher.py +0 -21
- indexify/executor/metrics/task_reporter.py +0 -53
- indexify/executor/metrics/task_runner.py +0 -52
- indexify/executor/monitoring/function_allowlist.py +0 -25
- indexify/executor/runtime_probes.py +0 -68
- indexify/executor/task_fetcher.py +0 -96
- indexify/executor/task_reporter.py +0 -459
- indexify/executor/task_runner.py +0 -177
- indexify-0.3.30.dist-info/RECORD +0 -68
- indexify-0.3.30.dist-info/entry_points.txt +0 -3
- {indexify-0.3.30.dist-info → indexify-0.4.2.dist-info}/WHEEL +0 -0
indexify/executor/function_executor_controller/loggers.py
ADDED
@@ -0,0 +1,57 @@
+from typing import Any
+
+from indexify.proto.executor_api_pb2 import (
+    FunctionExecutorDescription,
+    Task,
+)
+
+
+def function_executor_logger(
+    function_executor_description: FunctionExecutorDescription, logger: Any
+) -> Any:
+    """Returns a logger bound with the FE's metadata.
+
+    The function assumes that the FE might be invalid."""
+    return logger.bind(
+        function_executor_id=(
+            function_executor_description.id
+            if function_executor_description.HasField("id")
+            else None
+        ),
+        namespace=(
+            function_executor_description.namespace
+            if function_executor_description.HasField("namespace")
+            else None
+        ),
+        graph_name=(
+            function_executor_description.graph_name
+            if function_executor_description.HasField("graph_name")
+            else None
+        ),
+        graph_version=(
+            function_executor_description.graph_version
+            if function_executor_description.HasField("graph_version")
+            else None
+        ),
+        function_name=(
+            function_executor_description.function_name
+            if function_executor_description.HasField("function_name")
+            else None
+        ),
+    )
+
+
+def task_logger(task: Task, logger: Any) -> Any:
+    """Returns a logger bound with the task's metadata.
+
+    The function assumes that the task might be invalid."""
+    return logger.bind(
+        task_id=task.id if task.HasField("id") else None,
+        namespace=task.namespace if task.HasField("namespace") else None,
+        graph_name=task.graph_name if task.HasField("graph_name") else None,
+        graph_version=task.graph_version if task.HasField("graph_version") else None,
+        function_name=task.function_name if task.HasField("function_name") else None,
+        graph_invocation_id=(
+            task.graph_invocation_id if task.HasField("graph_invocation_id") else None
+        ),
+    )
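The helpers above only assume a logger that exposes a structlog-style bind() API. A minimal usage sketch (the structlog import and the wrapper function are illustrative, not part of the package):

    import structlog

    from indexify.executor.function_executor_controller.loggers import task_logger

    def log_task_received(task) -> None:
        # Bind once; every later record carries task_id, namespace,
        # graph_name, graph_version, function_name, graph_invocation_id.
        logger = task_logger(task, structlog.get_logger())
        logger.info("task received")

Because each field access is guarded with HasField, a partially populated message still yields a usable record with None placeholders instead of raising.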
indexify/executor/function_executor_controller/message_validators.py
ADDED
@@ -0,0 +1,65 @@
+from tensorlake.function_executor.proto.message_validator import MessageValidator
+
+from indexify.proto.executor_api_pb2 import (
+    DataPayload,
+    FunctionExecutorDescription,
+    Task,
+)
+
+
+def validate_function_executor_description(
+    function_executor_description: FunctionExecutorDescription,
+) -> None:
+    """Validates the supplied FE description.
+
+    Raises ValueError if the description is not valid.
+    """
+    validator = MessageValidator(function_executor_description)
+    validator.required_field("id")
+    validator.required_field("namespace")
+    validator.required_field("graph_name")
+    validator.required_field("graph_version")
+    validator.required_field("function_name")
+    # image_uri is optional.
+    # secret_names can be empty.
+    validator.required_field("customer_code_timeout_ms")
+    validator.required_field("graph")
+    validator.required_field("resources")
+
+    _validate_data_payload(function_executor_description.graph)
+
+    validator = MessageValidator(function_executor_description.resources)
+    validator.required_field("cpu_ms_per_sec")
+    validator.required_field("memory_bytes")
+    validator.required_field("disk_bytes")
+    validator.required_field("gpu_count")
+
+
+def validate_task(task: Task) -> None:
+    """Validates the supplied Task.
+
+    Raises ValueError if the Task is not valid.
+    """
+    validator = MessageValidator(task)
+    validator.required_field("id")
+    validator.required_field("namespace")
+    validator.required_field("graph_name")
+    validator.required_field("graph_version")
+    validator.required_field("function_name")
+    validator.required_field("graph_invocation_id")
+    validator.required_field("timeout_ms")
+    validator.required_field("input")
+    validator.required_field("output_payload_uri_prefix")
+    validator.required_field("retry_policy")
+
+    _validate_data_payload(task.input)
+    if task.HasField("reducer_input"):
+        _validate_data_payload(task.reducer_input)
+
+
+def _validate_data_payload(data_payload: DataPayload) -> None:
+    """Validates the supplied DataPayload.
+
+    Raises ValueError if the DataPayload is not valid.
+    """
+    (MessageValidator(data_payload).required_field("uri").required_field("encoding"))
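A sketch of how a caller might gate scheduling on these validators. The accept_task wrapper is hypothetical; the raise-on-missing-field contract comes from the docstrings above:

    from indexify.proto.executor_api_pb2 import Task

    from indexify.executor.function_executor_controller.message_validators import (
        validate_task,
    )

    def accept_task(task: Task) -> bool:
        """Returns True only if the task carries every field required to run it."""
        try:
            validate_task(task)
            return True
        except ValueError:
            # A malformed task is rejected instead of being scheduled.
            return False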
indexify/executor/function_executor_controller/metrics/completed_task_metrics.py
ADDED
@@ -0,0 +1,68 @@
+import prometheus_client
+
+from indexify.executor.monitoring.metrics import (
+    latency_metric_for_customer_controlled_operation,
+)
+
+metric_tasks_completed: prometheus_client.Counter = prometheus_client.Counter(
+    "tasks_completed",
+    "Number of tasks that were completed",
+    ["outcome_code", "failure_reason"],
+)
+METRIC_TASKS_COMPLETED_OUTCOME_CODE_ALL = "all"
+METRIC_TASKS_COMPLETED_OUTCOME_CODE_SUCCESS = "success"
+METRIC_TASKS_COMPLETED_OUTCOME_CODE_FAILURE = "failure"
+
+METRIC_TASKS_COMPLETED_FAILURE_REASON_ALL = "all"
+# Used when the task is successful.
+METRIC_TASKS_COMPLETED_FAILURE_REASON_NONE = "none"
+# Matches TASK_FAILURE_REASON_UNKNOWN
+METRIC_TASKS_COMPLETED_FAILURE_REASON_UNKNOWN = "unknown"
+# Includes all function errors including timeouts to reduce cardinality.
+METRIC_TASKS_COMPLETED_FAILURE_REASON_FUNCTION_ERROR = "function_error"
+# Includes all internal errors to reduce cardinality.
+METRIC_TASKS_COMPLETED_FAILURE_REASON_INTERNAL_ERROR = "internal_error"
+# Matches TASK_FAILURE_REASON_TASK_CANCELLED
+METRIC_TASKS_COMPLETED_FAILURE_REASON_TASK_CANCELLED = "task_cancelled"
+# Matches TASK_FAILURE_REASON_FUNCTION_EXECUTOR_TERMINATED
+METRIC_TASKS_COMPLETED_FAILURE_REASON_FUNCTION_EXECUTOR_TERMINATED = (
+    "function_executor_terminated"
+)
+
+# Valid combinations of the labels:
+metric_tasks_completed.labels(
+    outcome_code=METRIC_TASKS_COMPLETED_OUTCOME_CODE_ALL,
+    failure_reason=METRIC_TASKS_COMPLETED_FAILURE_REASON_ALL,
+)
+metric_tasks_completed.labels(
+    outcome_code=METRIC_TASKS_COMPLETED_OUTCOME_CODE_SUCCESS,
+    failure_reason=METRIC_TASKS_COMPLETED_FAILURE_REASON_NONE,
+)
+
+metric_tasks_completed.labels(
+    outcome_code=METRIC_TASKS_COMPLETED_OUTCOME_CODE_FAILURE,
+    failure_reason=METRIC_TASKS_COMPLETED_FAILURE_REASON_UNKNOWN,
+)
+metric_tasks_completed.labels(
+    outcome_code=METRIC_TASKS_COMPLETED_OUTCOME_CODE_FAILURE,
+    failure_reason=METRIC_TASKS_COMPLETED_FAILURE_REASON_FUNCTION_ERROR,
+)
+metric_tasks_completed.labels(
+    outcome_code=METRIC_TASKS_COMPLETED_OUTCOME_CODE_FAILURE,
+    failure_reason=METRIC_TASKS_COMPLETED_FAILURE_REASON_INTERNAL_ERROR,
+)
+metric_tasks_completed.labels(
+    outcome_code=METRIC_TASKS_COMPLETED_OUTCOME_CODE_FAILURE,
+    failure_reason=METRIC_TASKS_COMPLETED_FAILURE_REASON_TASK_CANCELLED,
+)
+metric_tasks_completed.labels(
+    outcome_code=METRIC_TASKS_COMPLETED_OUTCOME_CODE_FAILURE,
+    failure_reason=METRIC_TASKS_COMPLETED_FAILURE_REASON_FUNCTION_EXECUTOR_TERMINATED,
+)
+
+metric_task_completion_latency: prometheus_client.Histogram = (
+    latency_metric_for_customer_controlled_operation(
+        "task_completion",
+        "task completion from the moment it got fetched until its output got uploaded to blob store",
+    )
+)
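Pre-registering every valid label pair above makes the time series exist from process start, so dashboards show explicit zeros rather than gaps. Recording a completion is then one inc() per matching pair; a sketch for the success path (the helper function is illustrative):

    def record_successful_completion() -> None:
        # "all"/"all" aggregates across outcomes; "success"/"none" is the
        # specific pair pre-registered above.
        metric_tasks_completed.labels(
            outcome_code=METRIC_TASKS_COMPLETED_OUTCOME_CODE_ALL,
            failure_reason=METRIC_TASKS_COMPLETED_FAILURE_REASON_ALL,
        ).inc()
        metric_tasks_completed.labels(
            outcome_code=METRIC_TASKS_COMPLETED_OUTCOME_CODE_SUCCESS,
            failure_reason=METRIC_TASKS_COMPLETED_FAILURE_REASON_NONE,
        ).inc()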
indexify/executor/{metrics/downloader.py → function_executor_controller/metrics/downloads.py}
RENAMED
@@ -1,8 +1,6 @@
 import prometheus_client
 
-from
-
-# This file contains all metrics used by Downloader.
+from indexify.executor.monitoring.metrics import latency_metric_for_fast_operation
 
 # Graph download metrics
 metric_graph_downloads: prometheus_client.Counter = prometheus_client.Counter(
indexify/executor/function_executor_controller/metrics/function_executor_controller.py
ADDED
@@ -0,0 +1,60 @@
+import prometheus_client
+
+from indexify.executor.monitoring.metrics import (
+    latency_metric_for_customer_controlled_operation,
+    latency_metric_for_fast_operation,
+)
+
+metric_control_loop_handle_event_latency: prometheus_client.Histogram = (
+    latency_metric_for_fast_operation(
+        "handle_function_executor_control_loop_event",
+        "Handle Function Executor control loop event",
+    )
+)
+
+metric_tasks_fetched: prometheus_client.Counter = prometheus_client.Counter(
+    "tasks_fetched", "Number of tasks that were fetched from Server"
+)
+
+metric_schedule_task_latency: prometheus_client.Histogram = (
+    latency_metric_for_customer_controlled_operation(
+        "schedule_task",
+        "Schedule a task for execution after it got ready for execution",
+    )
+)
+metric_runnable_tasks: prometheus_client.Gauge = prometheus_client.Gauge(
+    "runnable_tasks",
+    "Number of tasks that are ready for execution but are waiting to get scheduled to run on Function Executor (typically waiting for a free Function Executor)",
+)
+metric_runnable_tasks_per_function_name: prometheus_client.Gauge = (
+    prometheus_client.Gauge(
+        "runnable_tasks_per_function_name",
+        "Number of tasks that are ready for execution but are waiting to get scheduled to run on Function Executor (typically waiting for a free Function Executor)",
+        ["function_name"],
+    )
+)
+
+metric_function_executors_with_status: prometheus_client.Gauge = (
+    prometheus_client.Gauge(
+        "function_executors_with_status",
+        "Number of Function Executors with a particular status",
+        ["status"],
+    )
+)
+METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_UNKNOWN = "unknown"
+METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_PENDING = "pending"
+METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_RUNNING = "running"
+METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_TERMINATED = "terminated"
+
+metric_function_executors_with_status.labels(
+    status=METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_UNKNOWN
+)
+metric_function_executors_with_status.labels(
+    status=METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_PENDING
+)
+metric_function_executors_with_status.labels(
+    status=METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_RUNNING
+)
+metric_function_executors_with_status.labels(
+    status=METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_TERMINATED
+)
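A sketch of how the status gauge might be refreshed during a reconciliation pass. The count_by_status mapping is hypothetical; the labels are the ones pre-registered above:

    from typing import Dict

    def report_function_executor_statuses(count_by_status: Dict[str, int]) -> None:
        for status in (
            METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_UNKNOWN,
            METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_PENDING,
            METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_RUNNING,
            METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_TERMINATED,
        ):
            # Gauges are set, not incremented, so stale counts cannot accumulate.
            metric_function_executors_with_status.labels(status=status).set(
                count_by_status.get(status, 0)
            )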
indexify/executor/{function_executor/metrics/single_task_runner.py → function_executor_controller/metrics/run_task.py}
RENAMED
@@ -1,8 +1,8 @@
 import prometheus_client
 
-from
-
-
+from indexify.executor.monitoring.metrics import (
+    latency_metric_for_customer_controlled_operation,
+)
 
 metric_function_executor_run_task_rpcs: prometheus_client.Counter = (
     prometheus_client.Counter(
@@ -20,3 +20,9 @@ metric_function_executor_run_task_rpc_latency: prometheus_client.Histogram = (
         "function_executor_run_task_rpc", "Function Executor run task RPC"
     )
 )
+metric_function_executor_run_task_rpcs_in_progress: prometheus_client.Gauge = (
+    prometheus_client.Gauge(
+        "function_executor_run_task_rpcs_in_progress",
+        "Number of Function Executor run task RPCs in progress",
+    )
+)
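run_task.py (further below) drives these metrics manually with inc()/dec()/observe(); the same bookkeeping can also be written with prometheus_client's context managers. A sketch, with do_rpc standing in for the actual RPC call:

    def instrumented_rpc(do_rpc):
        metric_function_executor_run_task_rpcs.inc()
        # track_inprogress() pairs the inc/dec; time() pairs start/observe.
        with metric_function_executor_run_task_rpcs_in_progress.track_inprogress():
            with metric_function_executor_run_task_rpc_latency.time():
                return do_rpc()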
indexify/executor/function_executor_controller/metrics/upload_task_output.py
ADDED
@@ -0,0 +1,39 @@
+import prometheus_client
+
+from indexify.executor.monitoring.metrics import latency_metric_for_fast_operation
+
+# Task output upload metrics.
+metric_task_output_uploads: prometheus_client.Counter = prometheus_client.Counter(
+    "task_output_uploads",
+    "Number of task output uploads",
+)
+metric_tasks_uploading_outputs: prometheus_client.Gauge = prometheus_client.Gauge(
+    "tasks_uploading_output",
+    "Number of tasks currently uploading their outputs",
+)
+metric_task_output_upload_latency: prometheus_client.Histogram = (
+    latency_metric_for_fast_operation("task_output_upload", "task output upload")
+)
+metric_task_output_upload_retries: prometheus_client.Counter = (
+    prometheus_client.Counter(
+        "tasks_output_upload_retries", "Number of task output upload retries"
+    )
+)
+
+# Metrics for individual blob store operations.
+metric_task_output_blob_store_uploads: prometheus_client.Counter = (
+    prometheus_client.Counter(
+        "task_output_blob_store_uploads", "Number of task output uploads to blob store"
+    )
+)
+metric_task_output_blob_store_upload_errors: prometheus_client.Counter = (
+    prometheus_client.Counter(
+        "task_output_blob_store_upload_errors",
+        "Number of failed task output uploads to blob store",
+    )
+)
+metric_task_output_blob_store_upload_latency: prometheus_client.Histogram = (
+    latency_metric_for_fast_operation(
+        "task_output_blob_store_upload", "Upload task output to blob store"
+    )
+)
indexify/executor/function_executor_controller/prepare_task.py
ADDED
@@ -0,0 +1,38 @@
+from typing import Any
+
+from indexify.executor.blob_store.blob_store import BLOBStore
+
+from .downloads import download_init_value, download_input
+from .events import TaskPreparationFinished
+from .task_info import TaskInfo
+
+
+async def prepare_task(
+    task_info: TaskInfo, blob_store: BLOBStore, logger: Any
+) -> TaskPreparationFinished:
+    """Prepares the task by downloading the input and init value if available.
+
+    Doesn't raise any exceptions.
+    """
+    logger = logger.bind(module=__name__)
+    try:
+        task_info.input = await download_input(
+            data_payload=task_info.task.input,
+            blob_store=blob_store,
+            logger=logger,
+        )
+
+        if task_info.task.HasField("reducer_input"):
+            task_info.init_value = await download_init_value(
+                data_payload=task_info.task.reducer_input,
+                blob_store=blob_store,
+                logger=logger,
+            )
+
+        return TaskPreparationFinished(task_info=task_info, is_success=True)
+    except Exception as e:
+        logger.error(
+            "Failed to prepare task",
+            exc_info=e,
+        )
+        return TaskPreparationFinished(task_info=task_info, is_success=False)
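prepare_task never raises; success or failure travels in the returned TaskPreparationFinished event. A hypothetical controller-side consumer might branch like this:

    async def on_task_ready(task_info, blob_store, logger) -> None:
        event = await prepare_task(task_info, blob_store, logger)
        if event.is_success:
            ...  # inputs are attached to task_info; hand it to the scheduler
        else:
            ...  # the error was already logged; report the task as failed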
indexify/executor/function_executor_controller/run_task.py
ADDED
@@ -0,0 +1,201 @@
+import asyncio
+import time
+from typing import Any, Optional
+
+import grpc
+from tensorlake.function_executor.proto.function_executor_pb2 import (
+    RunTaskRequest,
+    RunTaskResponse,
+)
+from tensorlake.function_executor.proto.function_executor_pb2_grpc import (
+    FunctionExecutorStub,
+)
+from tensorlake.function_executor.proto.message_validator import MessageValidator
+
+from indexify.executor.function_executor.function_executor import FunctionExecutor
+from indexify.executor.function_executor.health_checker import HealthCheckResult
+from indexify.proto.executor_api_pb2 import (
+    FunctionExecutorTerminationReason,
+    Task,
+    TaskFailureReason,
+    TaskOutcomeCode,
+)
+
+from .events import TaskExecutionFinished
+from .metrics.run_task import (
+    metric_function_executor_run_task_rpc_errors,
+    metric_function_executor_run_task_rpc_latency,
+    metric_function_executor_run_task_rpcs,
+    metric_function_executor_run_task_rpcs_in_progress,
+)
+from .task_info import TaskInfo
+from .task_output import TaskMetrics, TaskOutput
+
+
+async def run_task_on_function_executor(
+    task_info: TaskInfo, function_executor: FunctionExecutor, logger: Any
+) -> TaskExecutionFinished:
+    """Runs the task on the Function Executor and sets task_info.output with the result.
+
+    Doesn't raise any exceptions.
+    """
+    logger = logger.bind(module=__name__)
+    request: RunTaskRequest = RunTaskRequest(
+        namespace=task_info.task.namespace,
+        graph_name=task_info.task.graph_name,
+        graph_version=task_info.task.graph_version,
+        function_name=task_info.task.function_name,
+        graph_invocation_id=task_info.task.graph_invocation_id,
+        task_id=task_info.task.id,
+        function_input=task_info.input,
+    )
+    # Don't keep the input in memory after we started running the task.
+    task_info.input = None
+
+    if task_info.init_value is not None:
+        request.function_init_value.CopyFrom(task_info.init_value)
+    # Don't keep the init value in memory after we started running the task.
+    task_info.init_value = None
+
+    function_executor.invocation_state_client().add_task_to_invocation_id_entry(
+        task_id=task_info.task.id,
+        invocation_id=task_info.task.graph_invocation_id,
+    )
+
+    metric_function_executor_run_task_rpcs.inc()
+    metric_function_executor_run_task_rpcs_in_progress.inc()
+    start_time = time.monotonic()
+    # Not None if the Function Executor should be terminated after running the task.
+    function_executor_termination_reason: Optional[
+        FunctionExecutorTerminationReason
+    ] = None
+
+    # If this RPC failed due to customer code crashing the server we won't be
+    # able to detect this. We'll treat this as our own error for now and thus
+    # let the AioRpcError be raised here.
+    timeout_sec = task_info.task.timeout_ms / 1000.0
+    try:
+        channel: grpc.aio.Channel = function_executor.channel()
+        response: RunTaskResponse = await FunctionExecutorStub(channel).run_task(
+            request, timeout=timeout_sec
+        )
+        task_info.output = _task_output_from_function_executor_response(
+            task=task_info.task,
+            response=response,
+            allocation_id=task_info.allocation_id,
+        )
+    except grpc.aio.AioRpcError as e:
+        if e.code() == grpc.StatusCode.DEADLINE_EXCEEDED:
+            # The task is still running in FE, we only cancelled the client-side RPC.
+            function_executor_termination_reason = (
+                FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_FUNCTION_TIMEOUT
+            )
+            task_info.output = TaskOutput.function_timeout(
+                task=task_info.task,
+                allocation_id=task_info.allocation_id,
+                timeout_sec=timeout_sec,
+            )
+        else:
+            metric_function_executor_run_task_rpc_errors.inc()
+            logger.error("task execution failed", exc_info=e)
+            task_info.output = TaskOutput.internal_error(
+                task=task_info.task, allocation_id=task_info.allocation_id
+            )
+    except asyncio.CancelledError:
+        # The task is still running in FE, we only cancelled the client-side RPC.
+        function_executor_termination_reason = (
+            FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_FUNCTION_CANCELLED
+        )
+        task_info.output = TaskOutput.task_cancelled(
+            task=task_info.task, allocation_id=task_info.allocation_id
+        )
+    except Exception as e:
+        metric_function_executor_run_task_rpc_errors.inc()
+        logger.error("task execution failed", exc_info=e)
+        task_info.output = TaskOutput.internal_error(
+            task=task_info.task, allocation_id=task_info.allocation_id
+        )
+
+    metric_function_executor_run_task_rpc_latency.observe(time.monotonic() - start_time)
+    metric_function_executor_run_task_rpcs_in_progress.dec()
+
+    function_executor.invocation_state_client().remove_task_to_invocation_id_entry(
+        task_id=task_info.task.id,
+    )
+
+    if (
+        task_info.output.outcome_code == TaskOutcomeCode.TASK_OUTCOME_CODE_FAILURE
+        and function_executor_termination_reason is None
+    ):
+        # Check if the task failed because the FE is unhealthy to prevent more tasks failing.
+        result: HealthCheckResult = await function_executor.health_checker().check()
+        if not result.is_healthy:
+            function_executor_termination_reason = (
+                FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_UNHEALTHY
+            )
+            logger.error(
+                "Function Executor health check failed after running task, shutting down Function Executor",
+                health_check_fail_reason=result.reason,
+            )
+
+    _log_task_execution_finished(output=task_info.output, logger=logger)
+
+    return TaskExecutionFinished(
+        task_info=task_info,
+        function_executor_termination_reason=function_executor_termination_reason,
+    )
+
+
+def _task_output_from_function_executor_response(
+    task: Task, response: RunTaskResponse, allocation_id: str
+) -> TaskOutput:
+    response_validator = MessageValidator(response)
+    response_validator.required_field("stdout")
+    response_validator.required_field("stderr")
+    response_validator.required_field("is_reducer")
+    response_validator.required_field("success")
+
+    metrics = TaskMetrics(counters={}, timers={})
+    if response.HasField("metrics"):
+        # Can be None if e.g. function failed.
+        metrics.counters = dict(response.metrics.counters)
+        metrics.timers = dict(response.metrics.timers)
+
+    output = TaskOutput(
+        task=task,
+        allocation_id=allocation_id,
+        outcome_code=(
+            TaskOutcomeCode.TASK_OUTCOME_CODE_SUCCESS
+            if response.success
+            else TaskOutcomeCode.TASK_OUTCOME_CODE_FAILURE
+        ),
+        failure_reason=(
+            None
+            if response.success
+            else TaskFailureReason.TASK_FAILURE_REASON_FUNCTION_ERROR
+        ),
+        stdout=response.stdout,
+        stderr=response.stderr,
+        reducer=response.is_reducer,
+        metrics=metrics,
+    )
+
+    if response.HasField("function_output"):
+        output.function_output = response.function_output
+    if response.HasField("router_output"):
+        output.router_output = response.router_output
+
+    return output
+
+
+def _log_task_execution_finished(output: TaskOutput, logger: Any) -> None:
+    logger.info(
+        "finished running task",
+        success=output.outcome_code == TaskOutcomeCode.TASK_OUTCOME_CODE_SUCCESS,
+        outcome_code=TaskOutcomeCode.Name(output.outcome_code),
+        failure_reason=(
+            TaskFailureReason.Name(output.failure_reason)
+            if output.failure_reason is not None
+            else None
+        ),
+    )
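Like prepare_task, run_task_on_function_executor reports everything through its return value. A hypothetical caller inspects the termination reason to decide whether the Function Executor can be reused:

    async def execute(task_info, function_executor, logger) -> None:
        finished = await run_task_on_function_executor(
            task_info, function_executor, logger
        )
        if finished.function_executor_termination_reason is not None:
            # Timeout, cancellation, or a failed health check: destroy this
            # Function Executor instead of giving it another task.
            ...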
indexify/executor/function_executor_controller/task_info.py
ADDED
@@ -0,0 +1,33 @@
+import asyncio
+from dataclasses import dataclass
+from typing import Optional
+
+from tensorlake.function_executor.proto.function_executor_pb2 import SerializedObject
+
+from indexify.proto.executor_api_pb2 import Task
+
+from .task_output import TaskOutput
+
+
+@dataclass
+class TaskInfo:
+    """Object used to track a task during its full lifecycle in the FunctionExecutorController."""
+
+    task: Task
+    allocation_id: str
+    # time.monotonic() timestamp
+    start_time: float
+    # time.monotonic() timestamp when the task was prepared for execution
+    prepared_time: float = 0.0
+    # True if the task was cancelled.
+    is_cancelled: bool = False
+    # aio task that is currently executing a lifecycle step of this task.
+    aio_task: Optional[asyncio.Task] = None
+    # Downloaded input if function was prepared successfully.
+    input: Optional[SerializedObject] = None
+    # Downloaded init value if function was prepared successfully and is a reducer.
+    init_value: Optional[SerializedObject] = None
+    # Output of the task.
+    output: Optional[TaskOutput] = None
+    # True if the task is fully completed and was added to state reporter.
+    is_completed: bool = False
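A sketch of constructing a TaskInfo when an allocation arrives: only the three required fields are set up front; the rest is lifecycle state the controller fills in later. The uuid-based allocation id is illustrative (it may actually be assigned by the server):

    import time
    import uuid

    def new_task_info(task) -> TaskInfo:
        return TaskInfo(
            task=task,
            allocation_id=str(uuid.uuid4()),  # illustrative source of the id
            start_time=time.monotonic(),
        )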