indexify 0.4.29__py3-none-any.whl → 0.4.30__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- indexify/executor/function_executor_controller/__init__.py +2 -2
- indexify/executor/function_executor_controller/completed_task_allocation_metrics.py +87 -0
- indexify/executor/function_executor_controller/events.py +29 -33
- indexify/executor/function_executor_controller/{finalize_task.py → finalize_task_allocation.py} +45 -37
- indexify/executor/function_executor_controller/function_executor_controller.py +194 -180
- indexify/executor/function_executor_controller/loggers.py +15 -17
- indexify/executor/function_executor_controller/message_validators.py +4 -12
- indexify/executor/function_executor_controller/metrics/completed_task_allocation_metrics.py +70 -0
- indexify/executor/function_executor_controller/metrics/finalize_task_allocation.py +26 -0
- indexify/executor/function_executor_controller/metrics/function_executor_controller.py +12 -11
- indexify/executor/function_executor_controller/metrics/prepare_task_allocation.py +27 -0
- indexify/executor/function_executor_controller/{prepare_task.py → prepare_task_allocation.py} +33 -29
- indexify/executor/function_executor_controller/{run_task.py → run_task_allocation.py} +54 -51
- indexify/executor/function_executor_controller/{task_info.py → task_allocation_info.py} +6 -6
- indexify/executor/function_executor_controller/{task_input.py → task_allocation_input.py} +2 -2
- indexify/executor/function_executor_controller/{task_output.py → task_allocation_output.py} +24 -24
- indexify/executor/state_reconciler.py +23 -19
- {indexify-0.4.29.dist-info → indexify-0.4.30.dist-info}/METADATA +2 -2
- {indexify-0.4.29.dist-info → indexify-0.4.30.dist-info}/RECORD +22 -22
- indexify/executor/function_executor_controller/completed_task_metrics.py +0 -83
- indexify/executor/function_executor_controller/metrics/completed_task_metrics.py +0 -68
- indexify/executor/function_executor_controller/metrics/finalize_task.py +0 -20
- indexify/executor/function_executor_controller/metrics/prepare_task.py +0 -18
- indexify/executor/function_executor_controller/metrics/{run_task.py → run_task_allocation.py} +0 -0
- {indexify-0.4.29.dist-info → indexify-0.4.30.dist-info}/WHEEL +0 -0
- {indexify-0.4.29.dist-info → indexify-0.4.30.dist-info}/entry_points.txt +0 -0
@@ -4,7 +4,7 @@ from .message_validators import (
|
|
4
4
|
validate_function_executor_description,
|
5
5
|
validate_task_allocation,
|
6
6
|
)
|
7
|
-
from .
|
7
|
+
from .task_allocation_output import TaskAllocationOutput
|
8
8
|
|
9
9
|
__all__ = [
|
10
10
|
"function_executor_logger",
|
@@ -12,5 +12,5 @@ __all__ = [
|
|
12
12
|
"validate_function_executor_description",
|
13
13
|
"validate_task_allocation",
|
14
14
|
"FunctionExecutorController",
|
15
|
-
"
|
15
|
+
"TaskAllocationOutput",
|
16
16
|
]
|
@@ -0,0 +1,87 @@
|
|
1
|
+
import time
|
2
|
+
from typing import Any
|
3
|
+
|
4
|
+
from indexify.proto.executor_api_pb2 import (
|
5
|
+
TaskFailureReason,
|
6
|
+
TaskOutcomeCode,
|
7
|
+
)
|
8
|
+
|
9
|
+
from .metrics.completed_task_allocation_metrics import (
|
10
|
+
METRIC_TASK_ALLOCATIONS_COMPLETED_FAILURE_REASON_ALL,
|
11
|
+
METRIC_TASK_ALLOCATIONS_COMPLETED_FAILURE_REASON_FUNCTION_ERROR,
|
12
|
+
METRIC_TASK_ALLOCATIONS_COMPLETED_FAILURE_REASON_FUNCTION_EXECUTOR_TERMINATED,
|
13
|
+
METRIC_TASK_ALLOCATIONS_COMPLETED_FAILURE_REASON_INTERNAL_ERROR,
|
14
|
+
METRIC_TASK_ALLOCATIONS_COMPLETED_FAILURE_REASON_NONE,
|
15
|
+
METRIC_TASK_ALLOCATIONS_COMPLETED_FAILURE_REASON_TASK_CANCELLED,
|
16
|
+
METRIC_TASK_ALLOCATIONS_COMPLETED_FAILURE_REASON_UNKNOWN,
|
17
|
+
METRIC_TASK_ALLOCATIONS_COMPLETED_OUTCOME_CODE_ALL,
|
18
|
+
METRIC_TASK_ALLOCATIONS_COMPLETED_OUTCOME_CODE_FAILURE,
|
19
|
+
METRIC_TASK_ALLOCATIONS_COMPLETED_OUTCOME_CODE_SUCCESS,
|
20
|
+
metric_task_allocation_completion_latency,
|
21
|
+
metric_task_allocations_completed,
|
22
|
+
)
|
23
|
+
from .task_allocation_info import TaskAllocationInfo
|
24
|
+
|
25
|
+
|
26
|
+
def emit_completed_task_allocation_metrics(
|
27
|
+
alloc_info: TaskAllocationInfo, logger: Any
|
28
|
+
) -> None:
|
29
|
+
"""Emits Prometheus metrics for a completed task allocation.
|
30
|
+
|
31
|
+
Doesn't raise any exceptions.
|
32
|
+
"""
|
33
|
+
logger = logger.bind(module=__name__)
|
34
|
+
metric_task_allocation_completion_latency.observe(
|
35
|
+
time.monotonic() - alloc_info.start_time
|
36
|
+
)
|
37
|
+
|
38
|
+
task_outcome_code: TaskOutcomeCode = alloc_info.output.outcome_code
|
39
|
+
task_failure_reason: TaskFailureReason = alloc_info.output.failure_reason
|
40
|
+
metric_task_allocations_completed.labels(
|
41
|
+
outcome_code=METRIC_TASK_ALLOCATIONS_COMPLETED_OUTCOME_CODE_ALL,
|
42
|
+
failure_reason=METRIC_TASK_ALLOCATIONS_COMPLETED_FAILURE_REASON_ALL,
|
43
|
+
).inc()
|
44
|
+
if task_outcome_code == TaskOutcomeCode.TASK_OUTCOME_CODE_SUCCESS:
|
45
|
+
metric_task_allocations_completed.labels(
|
46
|
+
outcome_code=METRIC_TASK_ALLOCATIONS_COMPLETED_OUTCOME_CODE_SUCCESS,
|
47
|
+
failure_reason=METRIC_TASK_ALLOCATIONS_COMPLETED_FAILURE_REASON_NONE,
|
48
|
+
).inc()
|
49
|
+
elif task_outcome_code == TaskOutcomeCode.TASK_OUTCOME_CODE_FAILURE:
|
50
|
+
if task_failure_reason == TaskFailureReason.TASK_FAILURE_REASON_INTERNAL_ERROR:
|
51
|
+
metric_task_allocations_completed.labels(
|
52
|
+
outcome_code=METRIC_TASK_ALLOCATIONS_COMPLETED_OUTCOME_CODE_FAILURE,
|
53
|
+
failure_reason=METRIC_TASK_ALLOCATIONS_COMPLETED_FAILURE_REASON_INTERNAL_ERROR,
|
54
|
+
).inc()
|
55
|
+
elif (
|
56
|
+
task_failure_reason
|
57
|
+
== TaskFailureReason.TASK_FAILURE_REASON_FUNCTION_EXECUTOR_TERMINATED
|
58
|
+
):
|
59
|
+
metric_task_allocations_completed.labels(
|
60
|
+
outcome_code=METRIC_TASK_ALLOCATIONS_COMPLETED_OUTCOME_CODE_FAILURE,
|
61
|
+
failure_reason=METRIC_TASK_ALLOCATIONS_COMPLETED_FAILURE_REASON_FUNCTION_EXECUTOR_TERMINATED,
|
62
|
+
).inc()
|
63
|
+
elif (
|
64
|
+
task_failure_reason == TaskFailureReason.TASK_FAILURE_REASON_TASK_CANCELLED
|
65
|
+
):
|
66
|
+
metric_task_allocations_completed.labels(
|
67
|
+
outcome_code=METRIC_TASK_ALLOCATIONS_COMPLETED_OUTCOME_CODE_FAILURE,
|
68
|
+
failure_reason=METRIC_TASK_ALLOCATIONS_COMPLETED_FAILURE_REASON_TASK_CANCELLED,
|
69
|
+
).inc()
|
70
|
+
elif task_failure_reason in [
|
71
|
+
TaskFailureReason.TASK_FAILURE_REASON_FUNCTION_ERROR,
|
72
|
+
TaskFailureReason.TASK_FAILURE_REASON_FUNCTION_TIMEOUT,
|
73
|
+
TaskFailureReason.TASK_FAILURE_REASON_INVOCATION_ERROR,
|
74
|
+
]:
|
75
|
+
metric_task_allocations_completed.labels(
|
76
|
+
outcome_code=METRIC_TASK_ALLOCATIONS_COMPLETED_OUTCOME_CODE_FAILURE,
|
77
|
+
failure_reason=METRIC_TASK_ALLOCATIONS_COMPLETED_FAILURE_REASON_FUNCTION_ERROR,
|
78
|
+
).inc()
|
79
|
+
else:
|
80
|
+
metric_task_allocations_completed.labels(
|
81
|
+
outcome_code=METRIC_TASK_ALLOCATIONS_COMPLETED_OUTCOME_CODE_FAILURE,
|
82
|
+
failure_reason=METRIC_TASK_ALLOCATIONS_COMPLETED_FAILURE_REASON_UNKNOWN,
|
83
|
+
).inc()
|
84
|
+
logger.warning(
|
85
|
+
"unexpected task allocation failure reason",
|
86
|
+
failure_reason=TaskFailureReason.Name(task_failure_reason),
|
87
|
+
)
|
@@ -1,26 +1,22 @@
|
|
1
1
|
from enum import Enum
|
2
2
|
from typing import List, Optional
|
3
3
|
|
4
|
-
from tensorlake.function_executor.proto.function_executor_pb2 import (
|
5
|
-
FunctionInputs,
|
6
|
-
)
|
7
|
-
|
8
4
|
from indexify.executor.function_executor.function_executor import (
|
9
5
|
FunctionExecutor,
|
10
6
|
)
|
11
7
|
from indexify.proto.executor_api_pb2 import FunctionExecutorTerminationReason
|
12
8
|
|
13
|
-
from .
|
9
|
+
from .task_allocation_info import TaskAllocationInfo
|
14
10
|
|
15
11
|
|
16
12
|
class EventType(Enum):
|
17
13
|
FUNCTION_EXECUTOR_CREATED = 1
|
18
14
|
FUNCTION_EXECUTOR_TERMINATED = 2
|
19
15
|
SHUTDOWN_INITIATED = 3
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
16
|
+
TASK_ALLOCATION_PREPARATION_FINISHED = 4
|
17
|
+
SCHEDULE_TASK_ALLOCATION_EXECUTION = 5
|
18
|
+
TASK_ALLOCATION_EXECUTION_FINISHED = 6
|
19
|
+
TASK_ALLOCATION_FINALIZATION_FINISHED = 7
|
24
20
|
|
25
21
|
|
26
22
|
class BaseEvent:
|
@@ -94,50 +90,50 @@ class ShutdownInitiated(BaseEvent):
|
|
94
90
|
super().__init__(EventType.SHUTDOWN_INITIATED)
|
95
91
|
|
96
92
|
|
97
|
-
class
|
93
|
+
class TaskAllocationPreparationFinished(BaseEvent):
|
98
94
|
"""
|
99
|
-
Event indicating that a task has been prepared for execution or failed to do that.
|
95
|
+
Event indicating that a task allocation has been prepared for execution or failed to do that.
|
100
96
|
"""
|
101
97
|
|
102
98
|
def __init__(
|
103
99
|
self,
|
104
|
-
|
100
|
+
alloc_info: TaskAllocationInfo,
|
105
101
|
is_success: bool,
|
106
102
|
):
|
107
|
-
super().__init__(EventType.
|
108
|
-
self.
|
103
|
+
super().__init__(EventType.TASK_ALLOCATION_PREPARATION_FINISHED)
|
104
|
+
self.alloc_info: TaskAllocationInfo = alloc_info
|
109
105
|
self.is_success: bool = is_success
|
110
106
|
|
111
107
|
def __str__(self) -> str:
|
112
108
|
return (
|
113
109
|
f"Event(type={self.event_type.name}, "
|
114
|
-
f"task_id={self.
|
115
|
-
f"allocation_id={self.
|
110
|
+
f"task_id={self.alloc_info.allocation.task.id}, "
|
111
|
+
f"allocation_id={self.alloc_info.allocation.allocation_id}), "
|
116
112
|
f"is_success={self.is_success}"
|
117
113
|
)
|
118
114
|
|
119
115
|
|
120
|
-
class
|
116
|
+
class ScheduleTaskAllocationExecution(BaseEvent):
|
121
117
|
"""
|
122
|
-
Event indicating that a task
|
118
|
+
Event indicating that a task allocation has been scheduled.
|
123
119
|
"""
|
124
120
|
|
125
121
|
def __init__(self):
|
126
|
-
super().__init__(EventType.
|
122
|
+
super().__init__(EventType.SCHEDULE_TASK_ALLOCATION_EXECUTION)
|
127
123
|
|
128
124
|
|
129
|
-
class
|
125
|
+
class TaskAllocationExecutionFinished(BaseEvent):
|
130
126
|
"""
|
131
|
-
Event indicating that a task execution has been finished on Function Executor.
|
127
|
+
Event indicating that a task allocation execution has been finished on Function Executor.
|
132
128
|
"""
|
133
129
|
|
134
130
|
def __init__(
|
135
131
|
self,
|
136
|
-
|
132
|
+
alloc_info: TaskAllocationInfo,
|
137
133
|
function_executor_termination_reason: FunctionExecutorTerminationReason, # type: Optional[FunctionExecutorTerminationReason]
|
138
134
|
):
|
139
|
-
super().__init__(EventType.
|
140
|
-
self.
|
135
|
+
super().__init__(EventType.TASK_ALLOCATION_EXECUTION_FINISHED)
|
136
|
+
self.alloc_info: TaskAllocationInfo = alloc_info
|
141
137
|
# Not None if the FE needs to get destroyed after running the task.
|
142
138
|
self.function_executor_termination_reason = function_executor_termination_reason
|
143
139
|
|
@@ -151,26 +147,26 @@ class TaskExecutionFinished(BaseEvent):
|
|
151
147
|
)
|
152
148
|
return (
|
153
149
|
f"Event(type={self.event_type.name}, "
|
154
|
-
f"task_id={self.
|
155
|
-
f"allocation_id={self.
|
150
|
+
f"task_id={self.alloc_info.allocation.task.id}, "
|
151
|
+
f"allocation_id={self.alloc_info.allocation.allocation_id}), "
|
156
152
|
f"function_executor_termination_reason={function_executor_termination_reason_str}"
|
157
153
|
)
|
158
154
|
|
159
155
|
|
160
|
-
class
|
156
|
+
class TaskAllocationFinalizationFinished(BaseEvent):
|
161
157
|
"""
|
162
|
-
Event indicating that a task finalization is finished.
|
158
|
+
Event indicating that a task allocation finalization is finished.
|
163
159
|
"""
|
164
160
|
|
165
|
-
def __init__(self,
|
166
|
-
super().__init__(EventType.
|
167
|
-
self.
|
161
|
+
def __init__(self, alloc_info: TaskAllocationInfo, is_success: bool):
|
162
|
+
super().__init__(EventType.TASK_ALLOCATION_FINALIZATION_FINISHED)
|
163
|
+
self.alloc_info: TaskAllocationInfo = alloc_info
|
168
164
|
self.is_success: bool = is_success
|
169
165
|
|
170
166
|
def __str__(self) -> str:
|
171
167
|
return (
|
172
168
|
f"Event(type={self.event_type.name}, "
|
173
|
-
f"task_id={self.
|
174
|
-
f"allocation_id={self.
|
169
|
+
f"task_id={self.alloc_info.allocation.task.id}, "
|
170
|
+
f"allocation_id={self.alloc_info.allocation.allocation_id}), "
|
175
171
|
f"is_success={self.is_success}"
|
176
172
|
)
|
indexify/executor/function_executor_controller/{finalize_task.py → finalize_task_allocation.py}
RENAMED
@@ -8,21 +8,21 @@ from indexify.proto.executor_api_pb2 import (
|
|
8
8
|
TaskOutcomeCode,
|
9
9
|
)
|
10
10
|
|
11
|
-
from .events import
|
12
|
-
from .metrics.
|
13
|
-
|
14
|
-
|
15
|
-
|
16
|
-
|
11
|
+
from .events import TaskAllocationFinalizationFinished
|
12
|
+
from .metrics.finalize_task_allocation import (
|
13
|
+
metric_task_allocation_finalization_errors,
|
14
|
+
metric_task_allocation_finalization_latency,
|
15
|
+
metric_task_allocation_finalizations,
|
16
|
+
metric_task_allocations_finalizing,
|
17
17
|
)
|
18
|
-
from .
|
19
|
-
from .
|
20
|
-
from .
|
18
|
+
from .task_allocation_info import TaskAllocationInfo
|
19
|
+
from .task_allocation_input import TaskAllocationInput
|
20
|
+
from .task_allocation_output import TaskAllocationOutput
|
21
21
|
|
22
22
|
|
23
|
-
async def
|
24
|
-
|
25
|
-
) ->
|
23
|
+
async def finalize_task_allocation(
|
24
|
+
task_alloc: TaskAllocationInfo, blob_store: BLOBStore, logger: Any
|
25
|
+
) -> TaskAllocationFinalizationFinished:
|
26
26
|
"""Prepares the task output for getting it reported to Server.
|
27
27
|
|
28
28
|
The task output is either coming from a failed task or from its finished execution on the Function Executor.
|
@@ -32,34 +32,40 @@ async def finalize_task(
|
|
32
32
|
start_time = time.monotonic()
|
33
33
|
|
34
34
|
with (
|
35
|
-
|
36
|
-
|
37
|
-
|
35
|
+
metric_task_allocations_finalizing.track_inprogress(),
|
36
|
+
metric_task_allocation_finalization_latency.time(),
|
37
|
+
metric_task_allocation_finalization_errors.count_exceptions(),
|
38
38
|
):
|
39
|
-
|
39
|
+
metric_task_allocation_finalizations.inc()
|
40
40
|
try:
|
41
|
-
await
|
42
|
-
|
41
|
+
await _finalize_task_alloc_output(
|
42
|
+
alloc_info=task_alloc,
|
43
43
|
blob_store=blob_store,
|
44
44
|
logger=logger,
|
45
45
|
)
|
46
46
|
logger.info(
|
47
|
-
"task finalized",
|
47
|
+
"task allocation finalized",
|
48
48
|
duration=time.monotonic() - start_time,
|
49
49
|
)
|
50
|
-
return
|
50
|
+
return TaskAllocationFinalizationFinished(
|
51
|
+
alloc_info=task_alloc, is_success=True
|
52
|
+
)
|
51
53
|
except asyncio.CancelledError:
|
52
|
-
return
|
54
|
+
return TaskAllocationFinalizationFinished(
|
55
|
+
alloc_info=task_alloc, is_success=False
|
56
|
+
)
|
53
57
|
except BaseException as e:
|
54
58
|
logger.error(
|
55
|
-
"failed to finalize task",
|
59
|
+
"failed to finalize task allocation",
|
56
60
|
exc_info=e,
|
57
61
|
duration=time.monotonic() - start_time,
|
58
62
|
)
|
59
|
-
return
|
63
|
+
return TaskAllocationFinalizationFinished(
|
64
|
+
alloc_info=task_alloc, is_success=False
|
65
|
+
)
|
60
66
|
|
61
67
|
|
62
|
-
class
|
68
|
+
class _TaskAllocationOutputSummary:
|
63
69
|
def __init__(self):
|
64
70
|
self.output_count: int = 0
|
65
71
|
self.output_bytes: int = 0
|
@@ -68,27 +74,27 @@ class _TaskOutputSummary:
|
|
68
74
|
self.next_functions_count: int = 0
|
69
75
|
|
70
76
|
|
71
|
-
async def
|
72
|
-
|
77
|
+
async def _finalize_task_alloc_output(
|
78
|
+
alloc_info: TaskAllocationInfo, blob_store: BLOBStore, logger: Any
|
73
79
|
) -> None:
|
74
80
|
"""Finalizes the task output.
|
75
81
|
|
76
82
|
Raises exception on error."""
|
77
|
-
if
|
83
|
+
if alloc_info.input is None:
|
78
84
|
raise Exception(
|
79
|
-
"task input is None, this should never happen",
|
85
|
+
"task allocation input is None, this should never happen",
|
80
86
|
)
|
81
|
-
if
|
87
|
+
if alloc_info.output is None:
|
82
88
|
raise Exception(
|
83
|
-
"task output is None, this should never happen",
|
89
|
+
"task allocation output is None, this should never happen",
|
84
90
|
)
|
85
91
|
|
86
|
-
input:
|
87
|
-
output:
|
92
|
+
input: TaskAllocationInput = alloc_info.input
|
93
|
+
output: TaskAllocationOutput = alloc_info.output
|
88
94
|
|
89
|
-
output_summary:
|
95
|
+
output_summary: _TaskAllocationOutputSummary = _task_output_summary(output)
|
90
96
|
logger.info(
|
91
|
-
"task output summary",
|
97
|
+
"task allocation output summary",
|
92
98
|
output_count=output_summary.output_count,
|
93
99
|
output_bytes=output_summary.output_bytes,
|
94
100
|
invocation_error_output_count=output_summary.invocation_error_output_count,
|
@@ -152,8 +158,10 @@ async def _finalize_task_output(
|
|
152
158
|
)
|
153
159
|
|
154
160
|
|
155
|
-
def _task_output_summary(
|
156
|
-
|
161
|
+
def _task_output_summary(
|
162
|
+
task_output: TaskAllocationOutput,
|
163
|
+
) -> _TaskAllocationOutputSummary:
|
164
|
+
summary: _TaskAllocationOutputSummary = _TaskAllocationOutputSummary()
|
157
165
|
|
158
166
|
for output in task_output.function_outputs:
|
159
167
|
summary.output_count += 1
|
@@ -172,7 +180,7 @@ def _task_output_summary(task_output: TaskOutput) -> _TaskOutputSummary:
|
|
172
180
|
|
173
181
|
# Temporary workaround is logging customer metrics until we store them somewhere
|
174
182
|
# for future retrieval and processing.
|
175
|
-
def _log_function_metrics(output:
|
183
|
+
def _log_function_metrics(output: TaskAllocationOutput, logger: Any):
|
176
184
|
if output.metrics is None:
|
177
185
|
return
|
178
186
|
|