indexify 0.4.29__py3-none-any.whl → 0.4.30__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- indexify/executor/function_executor_controller/__init__.py +2 -2
- indexify/executor/function_executor_controller/completed_task_allocation_metrics.py +87 -0
- indexify/executor/function_executor_controller/events.py +29 -33
- indexify/executor/function_executor_controller/{finalize_task.py → finalize_task_allocation.py} +45 -37
- indexify/executor/function_executor_controller/function_executor_controller.py +194 -180
- indexify/executor/function_executor_controller/loggers.py +15 -17
- indexify/executor/function_executor_controller/message_validators.py +4 -12
- indexify/executor/function_executor_controller/metrics/completed_task_allocation_metrics.py +70 -0
- indexify/executor/function_executor_controller/metrics/finalize_task_allocation.py +26 -0
- indexify/executor/function_executor_controller/metrics/function_executor_controller.py +12 -11
- indexify/executor/function_executor_controller/metrics/prepare_task_allocation.py +27 -0
- indexify/executor/function_executor_controller/{prepare_task.py → prepare_task_allocation.py} +33 -29
- indexify/executor/function_executor_controller/{run_task.py → run_task_allocation.py} +54 -51
- indexify/executor/function_executor_controller/{task_info.py → task_allocation_info.py} +6 -6
- indexify/executor/function_executor_controller/{task_input.py → task_allocation_input.py} +2 -2
- indexify/executor/function_executor_controller/{task_output.py → task_allocation_output.py} +24 -24
- indexify/executor/state_reconciler.py +23 -19
- {indexify-0.4.29.dist-info → indexify-0.4.30.dist-info}/METADATA +2 -2
- {indexify-0.4.29.dist-info → indexify-0.4.30.dist-info}/RECORD +22 -22
- indexify/executor/function_executor_controller/completed_task_metrics.py +0 -83
- indexify/executor/function_executor_controller/metrics/completed_task_metrics.py +0 -68
- indexify/executor/function_executor_controller/metrics/finalize_task.py +0 -20
- indexify/executor/function_executor_controller/metrics/prepare_task.py +0 -18
- /indexify/executor/function_executor_controller/metrics/{run_task.py → run_task_allocation.py} +0 -0
- {indexify-0.4.29.dist-info → indexify-0.4.30.dist-info}/WHEEL +0 -0
- {indexify-0.4.29.dist-info → indexify-0.4.30.dist-info}/entry_points.txt +0 -0
@@ -49,7 +49,21 @@ def task_allocation_logger(task_allocation: TaskAllocation, logger: Any) -> Any:
     Doesn't assume that the supplied TaskAllocation is valid.
     """
     if task_allocation.HasField("task"):
-
+        task: Task = task_allocation.task
+        logger = logger.bind(
+            task_id=task.id if task.HasField("id") else None,
+            namespace=task.namespace if task.HasField("namespace") else None,
+            graph=task.graph_name if task.HasField("graph_name") else None,
+            graph_version=(
+                task.graph_version if task.HasField("graph_version") else None
+            ),
+            fn=task.function_name if task.HasField("function_name") else None,
+            invocation_id=(
+                task.graph_invocation_id
+                if task.HasField("graph_invocation_id")
+                else None
+            ),
+        )
     return logger.bind(
         allocation_id=(
             task_allocation.allocation_id
@@ -87,19 +101,3 @@ def task_result_logger(task_result: TaskResult, logger: Any) -> Any:
             else None
         ),
     )
-
-
-def _task_logger(task: Task, logger: Any) -> Any:
-    """Returns a logger bound with the task's metadata.
-
-    The function assumes that the task might be invalid."""
-    return logger.bind(
-        task_id=task.id if task.HasField("id") else None,
-        namespace=task.namespace if task.HasField("namespace") else None,
-        graph=task.graph_name if task.HasField("graph_name") else None,
-        graph_version=task.graph_version if task.HasField("graph_version") else None,
-        fn=task.function_name if task.HasField("function_name") else None,
-        invocation_id=(
-            task.graph_invocation_id if task.HasField("graph_invocation_id") else None
-        ),
-    )
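The private `_task_logger` helper is removed above and its field binding is inlined into `task_allocation_logger`. As a reference only, here is a minimal standalone sketch of how that chained binding behaves, assuming a structlog-style logger (which is what `logger.bind(...)` suggests); the field values are hypothetical stand-ins for data read from a `TaskAllocation` proto, not values from the package:

import structlog

logger = structlog.get_logger()

# Bind task-level context first, mirroring the inlined block above.
bound = logger.bind(
    task_id="task-123",
    namespace="default",
    graph="example_graph",
    fn="example_function",
    invocation_id="inv-1",
)
# Then bind allocation-level context, mirroring the trailing return logger.bind(...).
bound = bound.bind(allocation_id="alloc-1")

# Every event logged through `bound` now carries both sets of fields.
bound.info("task allocation was prepared for execution")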
@@ -3,7 +3,6 @@ from tensorlake.function_executor.proto.message_validator import MessageValidator
 from indexify.proto.executor_api_pb2 import (
     DataPayload,
     FunctionExecutorDescription,
-    Task,
     TaskAllocation,
 )
 
@@ -50,15 +49,8 @@ def validate_task_allocation(task_allocation: TaskAllocation) -> None:
     validator.required_field("allocation_id")
     if not task_allocation.HasField("task"):
         raise ValueError("TaskAllocation must have a 'task' field.")
-    _validate_task(task_allocation.task)
 
-
-def _validate_task(task: Task) -> None:
-    """Validates the supplied Task.
-
-    Raises ValueError if the Task is not valid.
-    """
-    validator = MessageValidator(task)
+    validator = MessageValidator(task_allocation.task)
     validator.required_field("id")
     validator.required_field("namespace")
     validator.required_field("graph_name")
@@ -70,9 +62,9 @@ def _validate_task(task: Task) -> None:
     validator.required_field("output_payload_uri_prefix")
     validator.required_field("retry_policy")
 
-    _validate_data_payload(task.input)
-    if task.HasField("reducer_input"):
-        _validate_data_payload(task.reducer_input)
+    _validate_data_payload(task_allocation.task.input)
+    if task_allocation.task.HasField("reducer_input"):
+        _validate_data_payload(task_allocation.task.reducer_input)
 
 
 def _validate_data_payload(data_payload: DataPayload) -> None:
@@ -0,0 +1,70 @@
+import prometheus_client
+
+from indexify.executor.monitoring.metrics import (
+    latency_metric_for_customer_controlled_operation,
+)
+
+metric_task_allocations_completed: prometheus_client.Counter = (
+    prometheus_client.Counter(
+        "task_allocations_completed",
+        "Number of task allocations that were completed",
+        ["outcome_code", "failure_reason"],
+    )
+)
+METRIC_TASK_ALLOCATIONS_COMPLETED_OUTCOME_CODE_ALL = "all"
+METRIC_TASK_ALLOCATIONS_COMPLETED_OUTCOME_CODE_SUCCESS = "success"
+METRIC_TASK_ALLOCATIONS_COMPLETED_OUTCOME_CODE_FAILURE = "failure"
+
+METRIC_TASK_ALLOCATIONS_COMPLETED_FAILURE_REASON_ALL = "all"
+# Used when the task is successfull.
+METRIC_TASK_ALLOCATIONS_COMPLETED_FAILURE_REASON_NONE = "none"
+# Matches TASK_FAILURE_REASON_UNKNOWN
+METRIC_TASK_ALLOCATIONS_COMPLETED_FAILURE_REASON_UNKNOWN = "unknown"
+# Includes all function errors including timeouts to reduce cardinality.
+METRIC_TASK_ALLOCATIONS_COMPLETED_FAILURE_REASON_FUNCTION_ERROR = "function_error"
+# Includes all internal errors to reduce cardinality.
+METRIC_TASK_ALLOCATIONS_COMPLETED_FAILURE_REASON_INTERNAL_ERROR = "internal_error"
+# Matches TASK_FAILURE_REASON_TASK_CANCELLED
+METRIC_TASK_ALLOCATIONS_COMPLETED_FAILURE_REASON_TASK_CANCELLED = "task_cancelled"
+# Matches TASK_FAILURE_REASON_FUNCTION_EXECUTOR_TERMINATED
+METRIC_TASK_ALLOCATIONS_COMPLETED_FAILURE_REASON_FUNCTION_EXECUTOR_TERMINATED = (
+    "function_executor_terminated"
+)
+
+# Valid combinations of the labels:
+metric_task_allocations_completed.labels(
+    outcome_code=METRIC_TASK_ALLOCATIONS_COMPLETED_OUTCOME_CODE_ALL,
+    failure_reason=METRIC_TASK_ALLOCATIONS_COMPLETED_FAILURE_REASON_ALL,
+)
+metric_task_allocations_completed.labels(
+    outcome_code=METRIC_TASK_ALLOCATIONS_COMPLETED_OUTCOME_CODE_SUCCESS,
+    failure_reason=METRIC_TASK_ALLOCATIONS_COMPLETED_FAILURE_REASON_NONE,
+)
+
+metric_task_allocations_completed.labels(
+    outcome_code=METRIC_TASK_ALLOCATIONS_COMPLETED_OUTCOME_CODE_FAILURE,
+    failure_reason=METRIC_TASK_ALLOCATIONS_COMPLETED_FAILURE_REASON_UNKNOWN,
+)
+metric_task_allocations_completed.labels(
+    outcome_code=METRIC_TASK_ALLOCATIONS_COMPLETED_OUTCOME_CODE_FAILURE,
+    failure_reason=METRIC_TASK_ALLOCATIONS_COMPLETED_FAILURE_REASON_FUNCTION_ERROR,
+)
+metric_task_allocations_completed.labels(
+    outcome_code=METRIC_TASK_ALLOCATIONS_COMPLETED_OUTCOME_CODE_FAILURE,
+    failure_reason=METRIC_TASK_ALLOCATIONS_COMPLETED_FAILURE_REASON_INTERNAL_ERROR,
+)
+metric_task_allocations_completed.labels(
+    outcome_code=METRIC_TASK_ALLOCATIONS_COMPLETED_OUTCOME_CODE_FAILURE,
+    failure_reason=METRIC_TASK_ALLOCATIONS_COMPLETED_FAILURE_REASON_TASK_CANCELLED,
+)
+metric_task_allocations_completed.labels(
+    outcome_code=METRIC_TASK_ALLOCATIONS_COMPLETED_OUTCOME_CODE_FAILURE,
+    failure_reason=METRIC_TASK_ALLOCATIONS_COMPLETED_FAILURE_REASON_FUNCTION_EXECUTOR_TERMINATED,
+)
+
+metric_task_allocation_completion_latency: prometheus_client.Histogram = (
+    latency_metric_for_customer_controlled_operation(
+        "task_allocation_completion",
+        "task allocation completion from the moment it got fetched until its output got uploaded to blob store",
+    )
+)
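The new `metrics/completed_task_allocation_metrics.py` pre-registers every allowed (outcome_code, failure_reason) pair on the counter. Below is a minimal standalone sketch of that `prometheus_client` pattern with illustrative metric names; the code that actually increments these series lives elsewhere in the executor and is not part of this diff. Calling `.labels(...)` up front makes each series appear in the exposition with a value of 0, and a completion then bumps both the aggregate "all"/"all" pair and the specific pair:

import prometheus_client

completed = prometheus_client.Counter(
    "example_task_allocations_completed",
    "Number of task allocations that were completed",
    ["outcome_code", "failure_reason"],
)

# Pre-register the valid combinations so they are exported as 0 before first use.
completed.labels(outcome_code="all", failure_reason="all")
completed.labels(outcome_code="success", failure_reason="none")
completed.labels(outcome_code="failure", failure_reason="function_error")


def record_completion(outcome_code: str, failure_reason: str) -> None:
    # Bump the aggregate series and the specific (outcome_code, failure_reason) pair.
    completed.labels(outcome_code="all", failure_reason="all").inc()
    completed.labels(outcome_code=outcome_code, failure_reason=failure_reason).inc()


record_completion("failure", "function_error")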
@@ -0,0 +1,26 @@
+import prometheus_client
+
+from indexify.executor.monitoring.metrics import latency_metric_for_fast_operation
+
+# Task allocation finalization metrics.
+metric_task_allocation_finalizations: prometheus_client.Counter = (
+    prometheus_client.Counter(
+        "task_allocation_finalizations",
+        "Number of task allocation finalizations",
+    )
+)
+metric_task_allocation_finalization_errors: prometheus_client.Counter = (
+    prometheus_client.Counter(
+        "task_allocation_finalization_errors",
+        "Number of task allocation finalization errors",
+    )
+)
+metric_task_allocations_finalizing: prometheus_client.Gauge = prometheus_client.Gauge(
+    "task_allocations_finalizing",
+    "Number of task allocations currently finalizing",
+)
+metric_task_allocation_finalization_latency: prometheus_client.Histogram = (
+    latency_metric_for_fast_operation(
+        "task_allocation_finalization", "task allocation finalization"
+    )
+)
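The call site that drives these finalization metrics is not part of this diff, but the counter, error counter, in-progress gauge, and latency histogram follow the same usage pattern that `prepare_task_allocation.py` shows further down. A hedged sketch of that pattern with standalone illustrative names rather than the package's:

import prometheus_client

finalizations = prometheus_client.Counter(
    "example_finalizations", "Number of finalizations"
)
finalization_errors = prometheus_client.Counter(
    "example_finalization_errors", "Number of finalization errors"
)
finalizing = prometheus_client.Gauge(
    "example_finalizations_in_progress", "Number of finalizations in progress"
)
finalization_latency = prometheus_client.Histogram(
    "example_finalization_latency_seconds", "Finalization latency in seconds"
)


def finalize() -> None:
    with (
        finalization_errors.count_exceptions(),  # increments if the block raises
        finalizing.track_inprogress(),  # +1 on enter, -1 on exit
        finalization_latency.time(),  # observes elapsed seconds
    ):
        finalizations.inc()
        # ... the actual finalization work would go here ...


finalize()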
@@ -12,24 +12,25 @@ metric_control_loop_handle_event_latency: prometheus_client.Histogram = (
     )
 )
 
-
-    "
+metric_task_allocations_fetched: prometheus_client.Counter = prometheus_client.Counter(
+    "task_allocations_fetched",
+    "Number of task allocations that were fetched from Server",
 )
 
-
+metric_schedule_task_allocation_latency: prometheus_client.Histogram = (
     latency_metric_for_customer_controlled_operation(
-        "
-        "Schedule a task for execution after it got ready for execution",
+        "schedule_task_allocation",
+        "Schedule a task allocation for execution after it got ready for execution",
     )
 )
-
-    "
-    "Number of
+metric_runnable_task_allocations: prometheus_client.Gauge = prometheus_client.Gauge(
+    "runnable_task_allocations",
+    "Number of task allocations that are ready for execution but are waiting to get scheduled to run on Function Executor (typically waiting for a free Function Executor)",
 )
-
+metric_runnable_task_allocations_per_function_name: prometheus_client.Gauge = (
     prometheus_client.Gauge(
-        "
-        "Number of
+        "runnable_task_allocations_per_function_name",
+        "Number of task allocations that are ready for execution but are waiting to get scheduled to run on Function Executor (typically waiting for a free Function Executor)",
         ["function_name"],
     )
 )
@@ -0,0 +1,27 @@
+import prometheus_client
+
+from indexify.executor.monitoring.metrics import latency_metric_for_fast_operation
+
+metric_task_allocation_preparations: prometheus_client.Counter = (
+    prometheus_client.Counter(
+        "task_allocation_preparations",
+        "Number of task allocation preparations for execution",
+    )
+)
+metric_task_allocation_preparation_errors: prometheus_client.Counter = (
+    prometheus_client.Counter(
+        "task_allocation_preparation_errors",
+        "Number of task allocation preparation errors",
+    )
+)
+metric_task_allocation_preparation_latency: prometheus_client.Histogram = (
+    latency_metric_for_fast_operation(
+        "task_allocation_preparation", "task allocation preparation for execution"
+    )
+)
+metric_task_allocations_getting_prepared: prometheus_client.Gauge = (
+    prometheus_client.Gauge(
+        "task_allocations_getting_prepared",
+        "Number of task allocations currently getting prepared for execution",
+    )
+)
indexify/executor/function_executor_controller/{prepare_task.py → prepare_task_allocation.py}
RENAMED
@@ -13,15 +13,15 @@ from indexify.executor.blob_store.blob_store import BLOBStore
 from indexify.proto.executor_api_pb2 import DataPayload, Task
 
 from .downloads import serialized_object_manifest_from_data_payload_proto
-from .events import
-from .metrics.
-
-
-
-
+from .events import TaskAllocationPreparationFinished
+from .metrics.prepare_task_allocation import (
+    metric_task_allocation_preparation_errors,
+    metric_task_allocation_preparation_latency,
+    metric_task_allocation_preparations,
+    metric_task_allocations_getting_prepared,
 )
-from .
-from .
+from .task_allocation_info import TaskAllocationInfo
+from .task_allocation_input import TaskAllocationInput
 
 # The following constants are subject to S3 limits,
 # see https://docs.aws.amazon.com/AmazonS3/latest/userguide/qfacts.html.
@@ -42,9 +42,9 @@ _OUTPUT_BLOB_SLOWER_CHUNKS_COUNT: int = 100
 _INVOCATION_ERROR_MAX_SIZE_BYTES: int = 10 * 1024 * 1024  # 10 MB
 
 
-async def
-
-) ->
+async def prepare_task_allocation(
+    alloc_info: TaskAllocationInfo, blob_store: BLOBStore, logger: Any
+) -> TaskAllocationPreparationFinished:
     """Prepares the task for execution.
 
     If successful then the task is runnable.
@@ -54,43 +54,47 @@ async def prepare_task(
     start_time = time.monotonic()
     try:
         with (
-
-
-
+            metric_task_allocation_preparation_errors.count_exceptions(),
+            metric_task_allocations_getting_prepared.track_inprogress(),
+            metric_task_allocation_preparation_latency.time(),
         ):
-
-
-
+            metric_task_allocation_preparations.inc()
+            alloc_info.input = await _prepare_task_alloc_input(
+                alloc_info=alloc_info,
                 blob_store=blob_store,
                 logger=logger,
             )
         logger.info(
-            "task was prepared for execution",
+            "task allocation was prepared for execution",
            duration=time.monotonic() - start_time,
         )
-        return
-
+        return TaskAllocationPreparationFinished(
+            alloc_info=alloc_info,
             is_success=True,
         )
     except asyncio.CancelledError:
-        return
+        return TaskAllocationPreparationFinished(
+            alloc_info=alloc_info, is_success=False
+        )
     except BaseException as e:
         logger.error(
-            "failed to prepare task for execution",
+            "failed to prepare task allocation for execution",
             exc_info=e,
             duration=time.monotonic() - start_time,
         )
-        return
+        return TaskAllocationPreparationFinished(
+            alloc_info=alloc_info, is_success=False
+        )
 
 
-async def
-
-) ->
+async def _prepare_task_alloc_input(
+    alloc_info: TaskAllocationInfo, blob_store: BLOBStore, logger: Any
+) -> TaskAllocationInput:
     """Prepares the task for execution.
 
     Raises an exception on error.
     """
-    task: Task =
+    task: Task = alloc_info.allocation.task
     function_init_value_blob: Optional[BLOB] = None
     function_init_value: Optional[SerializedObjectInsideBLOB] = None
     if task.HasField("reducer_input"):
@@ -102,7 +106,7 @@ async def _prepare_task_input(
         function_init_value = _to_serialized_object_inside_blob(task.reducer_input)
 
     function_outputs_blob_uri: str = (
-        f"{task.output_payload_uri_prefix}.{
+        f"{task.output_payload_uri_prefix}.{alloc_info.allocation.allocation_id}.output"
     )
     invocation_error_blob_uri: str = (
         f"{task.invocation_error_payload_uri_prefix}.{task.graph_invocation_id}.inverr"
@@ -136,7 +140,7 @@ async def _prepare_task_input(
         )
         raise
 
-    return
+    return TaskAllocationInput(
         function_inputs=FunctionInputs(
             function_input_blob=await _presign_function_input_blob(
                 data_payload=task.input,
@@ -36,57 +36,57 @@ from indexify.proto.executor_api_pb2 import (
     TaskOutcomeCode,
 )
 
-from .events import
-from .metrics.
+from .events import TaskAllocationExecutionFinished
+from .metrics.run_task_allocation import (
     metric_function_executor_run_task_rpc_errors,
     metric_function_executor_run_task_rpc_latency,
     metric_function_executor_run_task_rpcs,
     metric_function_executor_run_task_rpcs_in_progress,
 )
-from .
-from .
+from .task_allocation_info import TaskAllocationInfo
+from .task_allocation_output import TaskAllocationMetrics, TaskAllocationOutput
 
 _CREATE_TASK_TIMEOUT_SECS = 5
 _DELETE_TASK_TIMEOUT_SECS = 5
 
 
-async def
-
-) ->
-    """Runs the task on the Function Executor and sets
+async def run_task_allocation_on_function_executor(
+    alloc_info: TaskAllocationInfo, function_executor: FunctionExecutor, logger: Any
+) -> TaskAllocationExecutionFinished:
+    """Runs the task on the Function Executor and sets alloc_info.output with the result.
 
     Doesn't raise any exceptions.
     """
     logger = logger.bind(module=__name__)
 
-    if
+    if alloc_info.input is None:
         logger.error(
-            "task input is None, this should never happen",
+            "task allocation input is None, this should never happen",
         )
-
-            allocation=
+        alloc_info.output = TaskAllocationOutput.internal_error(
+            allocation=alloc_info.allocation,
             execution_start_time=None,
             execution_end_time=None,
         )
-        return
-
+        return TaskAllocationExecutionFinished(
+            alloc_info=alloc_info,
             function_executor_termination_reason=None,
         )
 
     task = Task(
-        namespace=
-        graph_name=
-        graph_version=
-        function_name=
-        graph_invocation_id=
-        task_id=
-        allocation_id=
-        request=
+        namespace=alloc_info.allocation.task.namespace,
+        graph_name=alloc_info.allocation.task.graph_name,
+        graph_version=alloc_info.allocation.task.graph_version,
+        function_name=alloc_info.allocation.task.function_name,
+        graph_invocation_id=alloc_info.allocation.task.graph_invocation_id,
+        task_id=alloc_info.allocation.task.id,
+        allocation_id=alloc_info.allocation.allocation_id,
+        request=alloc_info.input.function_inputs,
     )
 
     function_executor.invocation_state_client().add_task_to_invocation_id_entry(
-        task_id=
-        invocation_id=
+        task_id=alloc_info.allocation.task.id,
+        invocation_id=alloc_info.allocation.task.graph_invocation_id,
     )
 
     metric_function_executor_run_task_rpcs.inc()
@@ -107,15 +107,15 @@ async def run_task_on_function_executor(
     # If this RPC failed due to customer code crashing the server we won't be
     # able to detect this. We'll treat this as our own error for now and thus
     # let the AioRpcError to be raised here.
-    timeout_sec: float =
+    timeout_sec: float = alloc_info.allocation.task.timeout_ms / 1000.0
     try:
         # This aio task can only be cancelled during this await call.
         task_result = await _run_task_rpcs(task, function_executor, timeout_sec)
 
         _process_task_diagnostics(task_result.diagnostics, logger)
 
-
-            allocation=
+        alloc_info.output = _task_alloc_output_from_fe_result(
+            allocation=alloc_info.allocation,
             result=task_result,
             execution_start_time=execution_start_time,
             execution_end_time=time.monotonic(),
@@ -127,8 +127,8 @@ async def run_task_on_function_executor(
         function_executor_termination_reason = (
             FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_FUNCTION_TIMEOUT
         )
-
-            allocation=
+        alloc_info.output = TaskAllocationOutput.function_timeout(
+            allocation=alloc_info.allocation,
             execution_start_time=execution_start_time,
             execution_end_time=time.monotonic(),
         )
@@ -150,14 +150,16 @@ async def run_task_on_function_executor(
             # This is either a create_task() RPC timeout or a
             # delete_task() RPC timeout; either suggests that the FE
             # is unhealthy.
-            logger.error(
+            logger.error(
+                "task allocationmanagement RPC execution deadline exceeded", exc_info=e
+            )
         else:
             # This is a status from an unsuccessful RPC; this
             # shouldn't happen, but we handle it.
-            logger.error("task management RPC failed", exc_info=e)
+            logger.error("task allocation management RPC failed", exc_info=e)
 
-
-            allocation=
+        alloc_info.output = TaskAllocationOutput.function_executor_unresponsive(
+            allocation=alloc_info.allocation,
             execution_start_time=execution_start_time,
             execution_end_time=time.monotonic(),
         )
@@ -167,8 +169,8 @@ async def run_task_on_function_executor(
         function_executor_termination_reason = (
             FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_FUNCTION_CANCELLED
         )
-
-            allocation=
+        alloc_info.output = TaskAllocationOutput.task_allocation_cancelled(
+            allocation=alloc_info.allocation,
             execution_start_time=execution_start_time,
             execution_end_time=time.monotonic(),
         )
@@ -176,10 +178,11 @@ async def run_task_on_function_executor(
         # This is an unexpected exception; we believe that this
         # indicates an internal error.
         logger.error(
-            "unexpected internal error during task lifecycle RPC sequence",
+            "unexpected internal error during task allocation lifecycle RPC sequence",
+            exc_info=e,
         )
-
-            allocation=
+        alloc_info.output = TaskAllocationOutput.internal_error(
+            allocation=alloc_info.allocation,
             execution_start_time=execution_start_time,
             execution_end_time=time.monotonic(),
         )
@@ -190,11 +193,11 @@ async def run_task_on_function_executor(
         metric_function_executor_run_task_rpcs_in_progress.dec()
 
         function_executor.invocation_state_client().remove_task_to_invocation_id_entry(
-            task_id=
+            task_id=alloc_info.allocation.task.id,
         )
 
     if (
-
+        alloc_info.output.outcome_code == TaskOutcomeCode.TASK_OUTCOME_CODE_FAILURE
         and function_executor_termination_reason is None
     ):
         try:
@@ -205,7 +208,7 @@ async def run_task_on_function_executor(
                 FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_UNHEALTHY
             )
             logger.error(
-                "Function Executor health check failed after running task, shutting down Function Executor",
+                "Function Executor health check failed after running task allocation, shutting down Function Executor",
                 health_check_fail_reason=result.reason,
             )
         except asyncio.CancelledError:
@@ -213,10 +216,10 @@ async def run_task_on_function_executor(
             # We can't conclude anything about the health of the FE here.
             pass
 
-    _log_task_execution_finished(output=
+    _log_task_execution_finished(output=alloc_info.output, logger=logger)
 
-    return
-
+    return TaskAllocationExecutionFinished(
+        alloc_info=alloc_info,
         function_executor_termination_reason=function_executor_termination_reason,
     )
 
@@ -275,17 +278,17 @@ async def _run_task_rpcs(
     return task_result
 
 
-def
+def _task_alloc_output_from_fe_result(
     allocation: TaskAllocation,
     result: TaskResult,
     execution_start_time: Optional[float],
     execution_end_time: Optional[float],
     logger: Any,
-) ->
+) -> TaskAllocationOutput:
     response_validator = MessageValidator(result)
     response_validator.required_field("outcome_code")
 
-    metrics =
+    metrics = TaskAllocationMetrics(counters={}, timers={})
     if result.HasField("metrics"):
         # Can be None if e.g. function failed.
         metrics.counters = dict(result.metrics.counters)
@@ -312,7 +315,7 @@ def _task_output_from_function_executor_result(
     # function_outputs can have no items, this happens when the function returns None.
     response_validator.required_field("uploaded_function_outputs_blob")
 
-    return
+    return TaskAllocationOutput(
         allocation=allocation,
         outcome_code=outcome_code,
         failure_reason=failure_reason,
@@ -327,9 +330,9 @@ def _task_output_from_function_executor_result(
     )
 
 
-def _log_task_execution_finished(output:
+def _log_task_execution_finished(output: TaskAllocationOutput, logger: Any) -> None:
     logger.info(
-        "finished running task",
+        "finished running task allocation",
         success=output.outcome_code == TaskOutcomeCode.TASK_OUTCOME_CODE_SUCCESS,
         outcome_code=TaskOutcomeCode.Name(output.outcome_code),
         failure_reason=(
@@ -344,7 +347,7 @@ def _process_task_diagnostics(task_diagnostics: TaskDiagnostics, logger: Any) ->
     MessageValidator(task_diagnostics).required_field("function_executor_log")
     # Uncomment these lines once we stop printing FE logs to stdout/stderr.
     # Print FE logs directly to Executor logs so operators can see them.
-    # logger.info("Function Executor logs during task execution:")
+    # logger.info("Function Executor logs during task allocation execution:")
     # print(task_diagnostics.function_executor_log)
 
 
@@ -4,13 +4,13 @@ from typing import Optional
 
 from indexify.proto.executor_api_pb2 import TaskAllocation
 
-from .
-from .
+from .task_allocation_input import TaskAllocationInput
+from .task_allocation_output import TaskAllocationOutput
 
 
 @dataclass
-class
-    """Object used to track a task during its full lifecycle in the FunctionExecutorController."""
+class TaskAllocationInfo:
+    """Object used to track a task allocation during its full lifecycle in the FunctionExecutorController."""
 
     allocation: TaskAllocation
     # time.monotonic() timestamp
@@ -22,8 +22,8 @@ class TaskInfo:
     # aio task that is currently executing a lifecycle step of this task.
     aio_task: Optional[asyncio.Task] = None
     # Input if function was prepared successfully.
-    input: Optional[
+    input: Optional[TaskAllocationInput] = None
     # Output of the task, always set when the task is completed.
-    output: Optional[
+    output: Optional[TaskAllocationOutput] = None
     # True if the task is fully completed and was added to state reporter.
     is_completed: bool = False
@@ -1,8 +1,8 @@
 from tensorlake.function_executor.proto.function_executor_pb2 import FunctionInputs
 
 
-class
-    """Represents the input for a task in the function executor controller."""
+class TaskAllocationInput:
+    """Represents the input for a task allocation in the function executor controller."""
 
     def __init__(
         self,