indexify 0.3.13__py3-none-any.whl → 0.3.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- indexify/cli/cli.py +11 -7
- indexify/executor/downloader.py +99 -50
- indexify/executor/executor.py +149 -28
- indexify/executor/function_executor/function_executor_state.py +23 -4
- indexify/executor/function_executor/function_executor_states_container.py +28 -16
- indexify/executor/function_executor/health_checker.py +26 -11
- indexify/executor/function_executor/server/function_executor_server_factory.py +4 -1
- indexify/executor/function_executor/single_task_runner.py +28 -8
- indexify/executor/function_executor/task_output.py +27 -4
- indexify/executor/state_reconciler.py +288 -0
- indexify/executor/state_reporter.py +127 -0
- indexify/executor/task_reporter.py +6 -6
- indexify/executor/task_runner.py +20 -12
- indexify/task_scheduler/proto/task_scheduler.proto +147 -0
- indexify/task_scheduler/proto/task_scheduler_pb2.py +69 -0
- indexify/task_scheduler/proto/task_scheduler_pb2.pyi +286 -0
- indexify/task_scheduler/proto/task_scheduler_pb2_grpc.py +170 -0
- {indexify-0.3.13.dist-info → indexify-0.3.14.dist-info}/METADATA +1 -1
- {indexify-0.3.13.dist-info → indexify-0.3.14.dist-info}/RECORD +21 -15
- {indexify-0.3.13.dist-info → indexify-0.3.14.dist-info}/WHEEL +0 -0
- {indexify-0.3.13.dist-info → indexify-0.3.14.dist-info}/entry_points.txt +0 -0
@@ -20,16 +20,22 @@ from .server.client_configuration import HEALTH_CHECK_TIMEOUT_SEC
|
|
20
20
|
HEALTH_CHECK_POLL_PERIOD_SEC = 10
|
21
21
|
|
22
22
|
|
23
|
+
class HealthCheckResult:
|
24
|
+
def __init__(self, is_healthy: bool, reason: str):
|
25
|
+
self.is_healthy: bool = is_healthy
|
26
|
+
self.reason: str = reason
|
27
|
+
|
28
|
+
|
23
29
|
class HealthChecker:
|
24
30
|
def __init__(self, stub: FunctionExecutorStub, logger: Any):
|
25
31
|
self._stub: FunctionExecutorStub = stub
|
26
32
|
self._logger: Any = logger.bind(module=__name__)
|
27
33
|
self._health_check_loop_task: Optional[asyncio.Task] = None
|
28
|
-
self._health_check_failed_callback: Optional[
|
29
|
-
None
|
30
|
-
|
34
|
+
self._health_check_failed_callback: Optional[
|
35
|
+
Callable[[HealthCheckResult], Awaitable[None]]
|
36
|
+
] = None
|
31
37
|
|
32
|
-
async def check(self) ->
|
38
|
+
async def check(self) -> HealthCheckResult:
|
33
39
|
"""Runs the health check once and returns the result.
|
34
40
|
|
35
41
|
Does not raise any exceptions."""
|
@@ -40,17 +46,25 @@ class HealthChecker:
|
|
40
46
|
)
|
41
47
|
if not response.healthy:
|
42
48
|
metric_failed_health_checks.inc()
|
43
|
-
return
|
44
|
-
|
49
|
+
return HealthCheckResult(
|
50
|
+
is_healthy=response.healthy, reason=response.status_message
|
51
|
+
)
|
52
|
+
except AioRpcError as e:
|
45
53
|
metric_failed_health_checks.inc()
|
46
54
|
# Expected exception when there are problems with communication because e.g. the server is unhealthy.
|
47
|
-
return
|
55
|
+
return HealthCheckResult(
|
56
|
+
is_healthy=False,
|
57
|
+
reason=f"Executor side RPC channel error: {str(e)}",
|
58
|
+
)
|
48
59
|
except Exception as e:
|
49
60
|
metric_failed_health_checks.inc()
|
50
61
|
self._logger.warning("Got unexpected exception, ignoring", exc_info=e)
|
51
|
-
return
|
62
|
+
return HealthCheckResult(
|
63
|
+
is_healthy=False,
|
64
|
+
reason=f"Unexpected exception in Executor: {str(e)}",
|
65
|
+
)
|
52
66
|
|
53
|
-
def start(self, callback: Callable[[], Awaitable[None]]) -> None:
|
67
|
+
def start(self, callback: Callable[[HealthCheckResult], Awaitable[None]]) -> None:
|
54
68
|
"""Starts periodic health checks.
|
55
69
|
|
56
70
|
The supplied callback is an async function called in the calling thread's
|
@@ -81,9 +95,10 @@ class HealthChecker:
|
|
81
95
|
|
82
96
|
async def _health_check_loop(self) -> None:
|
83
97
|
while True:
|
84
|
-
|
98
|
+
result: HealthCheckResult = await self.check()
|
99
|
+
if not result.is_healthy:
|
85
100
|
break
|
86
101
|
await asyncio.sleep(HEALTH_CHECK_POLL_PERIOD_SEC)
|
87
102
|
|
88
|
-
asyncio.create_task(self._health_check_failed_callback())
|
103
|
+
asyncio.create_task(self._health_check_failed_callback(result))
|
89
104
|
self._health_check_loop_task = None
|
@@ -14,8 +14,11 @@ class FunctionExecutorServerConfiguration:
|
|
14
14
|
configuration parameters or raise an exception if it can't implement
|
15
15
|
them."""
|
16
16
|
|
17
|
-
def __init__(
|
17
|
+
def __init__(
|
18
|
+
self, executor_id: str, function_executor_id: str, image_uri: Optional[str]
|
19
|
+
):
|
18
20
|
self.executor_id: str = executor_id
|
21
|
+
self.function_executor_id: str = function_executor_id
|
19
22
|
# Container image URI of the Function Executor Server.
|
20
23
|
self.image_uri: Optional[str] = image_uri
|
21
24
|
|
@@ -14,6 +14,7 @@ from tensorlake.function_executor.proto.function_executor_pb2_grpc import (
|
|
14
14
|
from ..api_objects import Task
|
15
15
|
from .function_executor import CustomerError, FunctionExecutor
|
16
16
|
from .function_executor_state import FunctionExecutorState
|
17
|
+
from .health_checker import HealthChecker, HealthCheckResult
|
17
18
|
from .metrics.single_task_runner import (
|
18
19
|
metric_function_executor_run_task_rpc_errors,
|
19
20
|
metric_function_executor_run_task_rpc_latency,
|
@@ -69,7 +70,12 @@ class SingleTaskRunner:
|
|
69
70
|
await self._create_function_executor()
|
70
71
|
except CustomerError as e:
|
71
72
|
return TaskOutput(
|
72
|
-
|
73
|
+
task_id=self._task_input.task.id,
|
74
|
+
namespace=self._task_input.task.namespace,
|
75
|
+
graph_name=self._task_input.task.compute_graph,
|
76
|
+
function_name=self._task_input.task.compute_fn,
|
77
|
+
graph_version=self._task_input.task.graph_version,
|
78
|
+
graph_invocation_id=self._task_input.task.invocation_id,
|
73
79
|
stderr=str(e),
|
74
80
|
success=False,
|
75
81
|
)
|
@@ -88,6 +94,7 @@ class SingleTaskRunner:
|
|
88
94
|
config: FunctionExecutorServerConfiguration = (
|
89
95
|
FunctionExecutorServerConfiguration(
|
90
96
|
executor_id=self._executor_id,
|
97
|
+
function_executor_id=self._state.id,
|
91
98
|
image_uri=self._task_input.task.image_uri,
|
92
99
|
)
|
93
100
|
)
|
@@ -144,24 +151,32 @@ class SingleTaskRunner:
|
|
144
151
|
).run_task(request)
|
145
152
|
return _task_output(task=self._task_input.task, response=response)
|
146
153
|
|
147
|
-
async def _health_check_failed_callback(self):
|
154
|
+
async def _health_check_failed_callback(self, result: HealthCheckResult):
|
148
155
|
# Function Executor destroy due to the periodic health check failure ensures that
|
149
156
|
# a running task RPC stuck in unhealthy Function Executor fails immediately.
|
150
157
|
async with self._state.lock:
|
151
158
|
if self._state.function_executor is not None:
|
152
|
-
await self._destroy_function_executor_on_failed_health_check(
|
159
|
+
await self._destroy_function_executor_on_failed_health_check(
|
160
|
+
result.reason
|
161
|
+
)
|
153
162
|
|
154
163
|
async def _destroy_existing_function_executor_if_unhealthy(self):
|
155
164
|
self._state.check_locked()
|
156
165
|
if self._state.function_executor is None:
|
157
166
|
return
|
158
|
-
|
167
|
+
result: HealthCheckResult = (
|
168
|
+
await self._state.function_executor.health_checker().check()
|
169
|
+
)
|
170
|
+
if result.is_healthy:
|
159
171
|
return
|
160
|
-
await self._destroy_function_executor_on_failed_health_check()
|
172
|
+
await self._destroy_function_executor_on_failed_health_check(result.reason)
|
161
173
|
|
162
|
-
async def _destroy_function_executor_on_failed_health_check(self):
|
174
|
+
async def _destroy_function_executor_on_failed_health_check(self, reason: str):
|
163
175
|
self._state.check_locked()
|
164
|
-
self._logger.error(
|
176
|
+
self._logger.error(
|
177
|
+
"Function Executor health check failed, destroying Function Executor",
|
178
|
+
health_check_fail_reason=reason,
|
179
|
+
)
|
165
180
|
self._state.health_check_failed = True
|
166
181
|
await self._state.destroy_function_executor()
|
167
182
|
|
@@ -220,7 +235,12 @@ def _task_output(task: Task, response: RunTaskResponse) -> TaskOutput:
|
|
220
235
|
raise ValueError(f"Response is missing required field: {field}")
|
221
236
|
|
222
237
|
output = TaskOutput(
|
223
|
-
|
238
|
+
task_id=task.id,
|
239
|
+
namespace=task.namespace,
|
240
|
+
graph_name=task.compute_graph,
|
241
|
+
function_name=task.compute_fn,
|
242
|
+
graph_version=task.graph_version,
|
243
|
+
graph_invocation_id=task.invocation_id,
|
224
244
|
stdout=response.stdout,
|
225
245
|
stderr=response.stderr,
|
226
246
|
reducer=response.is_reducer,
|
@@ -13,7 +13,12 @@ class TaskOutput:
|
|
13
13
|
|
14
14
|
def __init__(
|
15
15
|
self,
|
16
|
-
|
16
|
+
task_id: str,
|
17
|
+
namespace: str,
|
18
|
+
graph_name: str,
|
19
|
+
function_name: str,
|
20
|
+
graph_version: str,
|
21
|
+
graph_invocation_id: str,
|
17
22
|
function_output: Optional[FunctionOutput] = None,
|
18
23
|
router_output: Optional[RouterOutput] = None,
|
19
24
|
stdout: Optional[str] = None,
|
@@ -22,7 +27,12 @@ class TaskOutput:
|
|
22
27
|
success: bool = False,
|
23
28
|
is_internal_error: bool = False,
|
24
29
|
):
|
25
|
-
self.
|
30
|
+
self.task_id = task_id
|
31
|
+
self.namespace = namespace
|
32
|
+
self.graph_name = graph_name
|
33
|
+
self.function_name = function_name
|
34
|
+
self.graph_version = graph_version
|
35
|
+
self.graph_invocation_id = graph_invocation_id
|
26
36
|
self.function_output = function_output
|
27
37
|
self.router_output = router_output
|
28
38
|
self.stdout = stdout
|
@@ -32,11 +42,24 @@ class TaskOutput:
|
|
32
42
|
self.is_internal_error = is_internal_error
|
33
43
|
|
34
44
|
@classmethod
|
35
|
-
def internal_error(
|
45
|
+
def internal_error(
|
46
|
+
cls,
|
47
|
+
task_id: str,
|
48
|
+
namespace: str,
|
49
|
+
graph_name: str,
|
50
|
+
function_name: str,
|
51
|
+
graph_version: str,
|
52
|
+
graph_invocation_id: str,
|
53
|
+
) -> "TaskOutput":
|
36
54
|
"""Creates a TaskOutput for an internal error."""
|
37
55
|
# We are not sharing internal error messages with the customer.
|
38
56
|
return TaskOutput(
|
39
|
-
|
57
|
+
task_id=task_id,
|
58
|
+
namespace=namespace,
|
59
|
+
graph_name=graph_name,
|
60
|
+
function_name=function_name,
|
61
|
+
graph_version=graph_version,
|
62
|
+
graph_invocation_id=graph_invocation_id,
|
40
63
|
stderr="Platform failed to execute the function.",
|
41
64
|
is_internal_error=True,
|
42
65
|
)
|
@@ -0,0 +1,288 @@
|
|
1
|
+
import asyncio
|
2
|
+
from typing import Any, AsyncGenerator, List, Optional, Set
|
3
|
+
|
4
|
+
import grpc
|
5
|
+
from tensorlake.function_executor.proto.function_executor_pb2 import (
|
6
|
+
InitializeRequest,
|
7
|
+
SerializedObject,
|
8
|
+
)
|
9
|
+
|
10
|
+
from indexify.task_scheduler.proto.task_scheduler_pb2 import (
|
11
|
+
DesiredExecutorState,
|
12
|
+
FunctionExecutorDescription,
|
13
|
+
FunctionExecutorStatus,
|
14
|
+
GetDesiredExecutorStatesRequest,
|
15
|
+
)
|
16
|
+
from indexify.task_scheduler.proto.task_scheduler_pb2_grpc import (
|
17
|
+
TaskSchedulerServiceStub,
|
18
|
+
)
|
19
|
+
|
20
|
+
from .downloader import Downloader
|
21
|
+
from .function_executor.function_executor import CustomerError, FunctionExecutor
|
22
|
+
from .function_executor.function_executor_state import FunctionExecutorState
|
23
|
+
from .function_executor.function_executor_states_container import (
|
24
|
+
FunctionExecutorStatesContainer,
|
25
|
+
)
|
26
|
+
from .function_executor.server.function_executor_server_factory import (
|
27
|
+
FunctionExecutorServerConfiguration,
|
28
|
+
FunctionExecutorServerFactory,
|
29
|
+
)
|
30
|
+
from .function_executor.task_input import TaskInput
|
31
|
+
from .function_executor.task_output import TaskOutput
|
32
|
+
from .metrics.executor import (
|
33
|
+
METRIC_TASKS_COMPLETED_OUTCOME_ALL,
|
34
|
+
METRIC_TASKS_COMPLETED_OUTCOME_ERROR_CUSTOMER_CODE,
|
35
|
+
METRIC_TASKS_COMPLETED_OUTCOME_ERROR_PLATFORM,
|
36
|
+
METRIC_TASKS_COMPLETED_OUTCOME_SUCCESS,
|
37
|
+
metric_task_completion_latency,
|
38
|
+
metric_task_outcome_report_latency,
|
39
|
+
metric_task_outcome_report_retries,
|
40
|
+
metric_task_outcome_reports,
|
41
|
+
metric_tasks_completed,
|
42
|
+
metric_tasks_fetched,
|
43
|
+
metric_tasks_reporting_outcome,
|
44
|
+
)
|
45
|
+
from .task_reporter import TaskReporter
|
46
|
+
|
47
|
+
|
48
|
+
class ExecutorStateReconciler:
|
49
|
+
def __init__(
|
50
|
+
self,
|
51
|
+
executor_id: str,
|
52
|
+
function_executor_server_factory: FunctionExecutorServerFactory,
|
53
|
+
base_url: str,
|
54
|
+
function_executor_states: FunctionExecutorStatesContainer,
|
55
|
+
config_path: Optional[str],
|
56
|
+
downloader: Downloader,
|
57
|
+
task_reporter: TaskReporter,
|
58
|
+
server_channel: grpc.aio.Channel,
|
59
|
+
logger: Any,
|
60
|
+
):
|
61
|
+
self._executor_id: str = executor_id
|
62
|
+
self._factory: FunctionExecutorServerFactory = function_executor_server_factory
|
63
|
+
self._base_url: str = base_url
|
64
|
+
self._config_path: Optional[str] = config_path
|
65
|
+
self._downloader: Downloader = downloader
|
66
|
+
self._task_reporter: TaskReporter = task_reporter
|
67
|
+
self._function_executor_states: FunctionExecutorStatesContainer = (
|
68
|
+
function_executor_states
|
69
|
+
)
|
70
|
+
self._stub: TaskSchedulerServiceStub = TaskSchedulerServiceStub(server_channel)
|
71
|
+
self._logger: Any = logger.bind(module=__name__)
|
72
|
+
self._is_shutdown: bool = False
|
73
|
+
self._reconciliation_lock: asyncio.Lock = asyncio.Lock()
|
74
|
+
self._server_last_clock: Optional[int] = None
|
75
|
+
|
76
|
+
async def run(self):
|
77
|
+
desired_states: AsyncGenerator[DesiredExecutorState, None] = (
|
78
|
+
self._stub.get_desired_executor_states(
|
79
|
+
GetDesiredExecutorStatesRequest(executor_id=self._executor_id)
|
80
|
+
)
|
81
|
+
)
|
82
|
+
async for new_state in desired_states:
|
83
|
+
if self._is_shutdown:
|
84
|
+
return
|
85
|
+
new_state: DesiredExecutorState
|
86
|
+
if self._server_last_clock is not None:
|
87
|
+
if self._server_last_clock >= new_state.clock:
|
88
|
+
continue # Duplicate or outdated message state sent by Server.
|
89
|
+
|
90
|
+
self._server_last_clock = new_state.clock
|
91
|
+
asyncio.create_task(self._reconcile_state(new_state))
|
92
|
+
|
93
|
+
async def _reconcile_state(self, new_state: DesiredExecutorState):
|
94
|
+
if self._is_shutdown:
|
95
|
+
return
|
96
|
+
|
97
|
+
# Simple non concurrent implementation for now for the PoC.
|
98
|
+
# Obtain this lock to force only a single coroutine doing the reconciliation.
|
99
|
+
async with self._reconciliation_lock:
|
100
|
+
await self._reconcile_function_executors(new_state)
|
101
|
+
# TODO
|
102
|
+
# await self._reconcile_task_allocations(new_state)
|
103
|
+
|
104
|
+
async def shutdown(self):
|
105
|
+
"""Shuts down the state reconciler.
|
106
|
+
|
107
|
+
Never raises any exceptions.
|
108
|
+
"""
|
109
|
+
self._is_shutdown = True
|
110
|
+
|
111
|
+
async def _reconcile_function_executors(self, desired_state: DesiredExecutorState):
|
112
|
+
desired_function_executor_ids: Set[str] = set()
|
113
|
+
for desired_function_executor in desired_state.function_executors:
|
114
|
+
desired_function_executor: FunctionExecutorDescription
|
115
|
+
desired_function_executor_ids.add(desired_function_executor.id)
|
116
|
+
|
117
|
+
function_executor_state: FunctionExecutorState = (
|
118
|
+
self._function_executor_states.get_or_create_state(
|
119
|
+
id=desired_function_executor.id,
|
120
|
+
namespace=desired_function_executor.namespace,
|
121
|
+
graph_name=desired_function_executor.graph_name,
|
122
|
+
graph_version=desired_function_executor.graph_version,
|
123
|
+
function_name=desired_function_executor.function_name,
|
124
|
+
)
|
125
|
+
)
|
126
|
+
|
127
|
+
async with function_executor_state.lock:
|
128
|
+
if (
|
129
|
+
function_executor_state.status
|
130
|
+
== FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_STOPPED
|
131
|
+
):
|
132
|
+
function_executor_state.status = (
|
133
|
+
FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_STARTING_UP
|
134
|
+
)
|
135
|
+
try:
|
136
|
+
function_executor_state.function_executor = (
|
137
|
+
await self._create_function_executor()
|
138
|
+
)
|
139
|
+
function_executor_state.status = (
|
140
|
+
FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_IDLE
|
141
|
+
)
|
142
|
+
except CustomerError as e:
|
143
|
+
function_executor_state.status = (
|
144
|
+
FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_STARTUP_FAILED_CUSTOMER_ERROR
|
145
|
+
)
|
146
|
+
except Exception as e:
|
147
|
+
function_executor_state.status = (
|
148
|
+
FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_STARTUP_FAILED_PLATFORM_ERROR
|
149
|
+
)
|
150
|
+
self._logger.error(
|
151
|
+
f"Failed to create Function Executor", exc_info=e
|
152
|
+
)
|
153
|
+
|
154
|
+
function_executor_state_ids_to_destroy: List[str] = []
|
155
|
+
async for function_executor_state in self._function_executor_states:
|
156
|
+
function_executor_state: FunctionExecutorState
|
157
|
+
if function_executor_state.id not in desired_function_executor_ids:
|
158
|
+
function_executor_state_ids_to_destroy.append(
|
159
|
+
function_executor_state.id
|
160
|
+
)
|
161
|
+
|
162
|
+
for function_executor_state_id in function_executor_state_ids_to_destroy:
|
163
|
+
function_executor_state: FunctionExecutorState = (
|
164
|
+
self._function_executor_states.pop_state(function_executor_state_id)
|
165
|
+
)
|
166
|
+
async with function_executor_state.lock:
|
167
|
+
logger = self._function_executor_logger(
|
168
|
+
id=function_executor_state.id,
|
169
|
+
namespace=function_executor_state.namespace,
|
170
|
+
graph_name=function_executor_state.graph_name,
|
171
|
+
graph_version=function_executor_state.graph_version,
|
172
|
+
function_name=function_executor_state.function_name,
|
173
|
+
)
|
174
|
+
if (
|
175
|
+
function_executor_state.status
|
176
|
+
== FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_RUNNING_TASK
|
177
|
+
):
|
178
|
+
logger.warning(
|
179
|
+
"Destroying Function Executor that is running a task. No task output will be reported as this is expected by the Server."
|
180
|
+
)
|
181
|
+
function_executor_state.status = (
|
182
|
+
FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_STOPPING
|
183
|
+
)
|
184
|
+
await function_executor_state.destroy_function_executor()
|
185
|
+
function_executor_state.status = (
|
186
|
+
FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_STOPPED
|
187
|
+
)
|
188
|
+
|
189
|
+
async def _create_function_executor(
|
190
|
+
self, description: FunctionExecutorDescription
|
191
|
+
) -> FunctionExecutor:
|
192
|
+
logger = self._function_executor_logger(
|
193
|
+
id=description.id,
|
194
|
+
namespace=description.namespace,
|
195
|
+
graph_name=description.graph_name,
|
196
|
+
graph_version=description.graph_version,
|
197
|
+
function_name=description.function_name,
|
198
|
+
)
|
199
|
+
graph: SerializedObject = await self._downloader.download_graph(
|
200
|
+
namespace=description.namespace,
|
201
|
+
graph_name=description.graph_name,
|
202
|
+
graph_version=description.graph_version,
|
203
|
+
logger=logger,
|
204
|
+
)
|
205
|
+
function_executor: FunctionExecutor = FunctionExecutor(
|
206
|
+
server_factory=self._factory, logger=logger
|
207
|
+
)
|
208
|
+
config: FunctionExecutorServerConfiguration = (
|
209
|
+
FunctionExecutorServerConfiguration(
|
210
|
+
executor_id=self._executor_id,
|
211
|
+
function_executor_id=description.id,
|
212
|
+
image_uri=description.image_uri,
|
213
|
+
)
|
214
|
+
)
|
215
|
+
initialize_request: InitializeRequest = InitializeRequest(
|
216
|
+
namespace=description.namespace,
|
217
|
+
graph_name=description.graph_name,
|
218
|
+
graph_version=description.graph_version,
|
219
|
+
function_name=description.function_name,
|
220
|
+
graph=graph,
|
221
|
+
)
|
222
|
+
|
223
|
+
try:
|
224
|
+
await function_executor.initialize(
|
225
|
+
config=config,
|
226
|
+
initialize_request=initialize_request,
|
227
|
+
base_url=self._base_url,
|
228
|
+
config_path=self._config_path,
|
229
|
+
)
|
230
|
+
return function_executor
|
231
|
+
except Exception:
|
232
|
+
await function_executor.destroy()
|
233
|
+
raise
|
234
|
+
|
235
|
+
async def _cancel_running_tasks(
|
236
|
+
self, function_executor_state: FunctionExecutorState
|
237
|
+
):
|
238
|
+
pass
|
239
|
+
|
240
|
+
def _function_executor_logger(
|
241
|
+
self,
|
242
|
+
id: str,
|
243
|
+
namespace: str,
|
244
|
+
graph_name: str,
|
245
|
+
graph_version: str,
|
246
|
+
function_name: str,
|
247
|
+
) -> Any:
|
248
|
+
return self._logger.bind(
|
249
|
+
id=id,
|
250
|
+
namespace=namespace,
|
251
|
+
graph=graph_name,
|
252
|
+
graph_version=graph_version,
|
253
|
+
function_name=function_name,
|
254
|
+
)
|
255
|
+
|
256
|
+
async def _report_task_outcome(self, task_output: TaskOutput):
|
257
|
+
"""Reports the task with the given output to the server.
|
258
|
+
|
259
|
+
Doesn't raise any Exceptions. Runs till the reporting is successful."""
|
260
|
+
reporting_retries: int = 0
|
261
|
+
|
262
|
+
while True:
|
263
|
+
logger = logger.bind(retries=reporting_retries)
|
264
|
+
try:
|
265
|
+
await self._task_reporter.report(output=task_output, logger=logger)
|
266
|
+
break
|
267
|
+
except Exception as e:
|
268
|
+
logger.error(
|
269
|
+
"failed to report task",
|
270
|
+
exc_info=e,
|
271
|
+
)
|
272
|
+
reporting_retries += 1
|
273
|
+
metric_task_outcome_report_retries.inc()
|
274
|
+
await asyncio.sleep(5)
|
275
|
+
|
276
|
+
metric_tasks_completed.labels(outcome=METRIC_TASKS_COMPLETED_OUTCOME_ALL).inc()
|
277
|
+
if task_output.is_internal_error:
|
278
|
+
metric_tasks_completed.labels(
|
279
|
+
outcome=METRIC_TASKS_COMPLETED_OUTCOME_ERROR_PLATFORM
|
280
|
+
).inc()
|
281
|
+
elif task_output.success:
|
282
|
+
metric_tasks_completed.labels(
|
283
|
+
outcome=METRIC_TASKS_COMPLETED_OUTCOME_SUCCESS
|
284
|
+
).inc()
|
285
|
+
else:
|
286
|
+
metric_tasks_completed.labels(
|
287
|
+
outcome=METRIC_TASKS_COMPLETED_OUTCOME_ERROR_CUSTOMER_CODE
|
288
|
+
).inc()
|
@@ -0,0 +1,127 @@
|
|
1
|
+
import asyncio
|
2
|
+
from typing import Any, List, Optional
|
3
|
+
|
4
|
+
import grpc
|
5
|
+
|
6
|
+
from indexify.task_scheduler.proto.task_scheduler_pb2 import (
|
7
|
+
AllowedFunction,
|
8
|
+
ExecutorState,
|
9
|
+
ExecutorStatus,
|
10
|
+
FunctionExecutorDescription,
|
11
|
+
)
|
12
|
+
from indexify.task_scheduler.proto.task_scheduler_pb2 import (
|
13
|
+
FunctionExecutorState as FunctionExecutorStateProto,
|
14
|
+
)
|
15
|
+
from indexify.task_scheduler.proto.task_scheduler_pb2 import (
|
16
|
+
GPUResources,
|
17
|
+
HostResources,
|
18
|
+
ReportExecutorStateRequest,
|
19
|
+
)
|
20
|
+
from indexify.task_scheduler.proto.task_scheduler_pb2_grpc import (
|
21
|
+
TaskSchedulerServiceStub,
|
22
|
+
)
|
23
|
+
|
24
|
+
from .api_objects import FunctionURI
|
25
|
+
from .function_executor.function_executor_state import FunctionExecutorState
|
26
|
+
from .function_executor.function_executor_states_container import (
|
27
|
+
FunctionExecutorStatesContainer,
|
28
|
+
)
|
29
|
+
|
30
|
+
_REPORTING_INTERVAL_SEC = 5
|
31
|
+
_REPORT_RPC_TIMEOUT_SEC = 5
|
32
|
+
|
33
|
+
|
34
|
+
class ExecutorStateReporter:
|
35
|
+
def __init__(
|
36
|
+
self,
|
37
|
+
executor_id: str,
|
38
|
+
function_allowlist: Optional[List[FunctionURI]],
|
39
|
+
function_executor_states: FunctionExecutorStatesContainer,
|
40
|
+
server_channel: grpc.aio.Channel,
|
41
|
+
logger: Any,
|
42
|
+
):
|
43
|
+
self._executor_id: str = executor_id
|
44
|
+
self._function_executor_states: FunctionExecutorStatesContainer = (
|
45
|
+
function_executor_states
|
46
|
+
)
|
47
|
+
self._stub: TaskSchedulerServiceStub = TaskSchedulerServiceStub(server_channel)
|
48
|
+
self._logger: Any = logger.bind(module=__name__)
|
49
|
+
self._is_shutdown: bool = False
|
50
|
+
self._executor_status: ExecutorStatus = ExecutorStatus.EXECUTOR_STATUS_UNKNOWN
|
51
|
+
self._allowed_functions: List[AllowedFunction] = []
|
52
|
+
|
53
|
+
for function_uri in (
|
54
|
+
function_allowlist if function_allowlist is not None else []
|
55
|
+
):
|
56
|
+
allowed_function = AllowedFunction(
|
57
|
+
namespace=function_uri.namespace,
|
58
|
+
graph_name=function_uri.compute_graph,
|
59
|
+
function_name=function_uri.compute_fn,
|
60
|
+
)
|
61
|
+
if function_uri.version is not None:
|
62
|
+
allowed_function.graph_version = function_uri.version
|
63
|
+
self._allowed_functions.append(allowed_function)
|
64
|
+
|
65
|
+
# TODO: Update Executor to call status updates.
|
66
|
+
def update_status(self, value: ExecutorStatus):
|
67
|
+
self._executor_status = value
|
68
|
+
|
69
|
+
async def run(self):
|
70
|
+
while not self._is_shutdown:
|
71
|
+
await self._report_state()
|
72
|
+
await asyncio.sleep(_REPORTING_INTERVAL_SEC)
|
73
|
+
|
74
|
+
async def _report_state(self):
|
75
|
+
state = ExecutorState(
|
76
|
+
executor_id=self._executor_id,
|
77
|
+
executor_status=self._executor_status,
|
78
|
+
host_resources=await self._fetch_host_resources(),
|
79
|
+
allowed_functions=self._allowed_functions,
|
80
|
+
function_executor_states=await self._fetch_function_executor_states(),
|
81
|
+
)
|
82
|
+
|
83
|
+
await self._stub.report_executor_state(
|
84
|
+
ReportExecutorStateRequest(executor_state=state),
|
85
|
+
timeout=_REPORT_RPC_TIMEOUT_SEC,
|
86
|
+
)
|
87
|
+
|
88
|
+
async def _fetch_host_resources(self) -> HostResources:
|
89
|
+
# We're only supporting Executors with non empty function allowlist right now.
|
90
|
+
# In this mode Server should ignore available host resources.
|
91
|
+
# This is why it's okay to report zeros right now.
|
92
|
+
return HostResources(
|
93
|
+
cpu_count=0,
|
94
|
+
memory_bytes=0,
|
95
|
+
disk_bytes=0,
|
96
|
+
gpu=GPUResources(
|
97
|
+
count=0,
|
98
|
+
model="",
|
99
|
+
),
|
100
|
+
)
|
101
|
+
|
102
|
+
async def _fetch_function_executor_states(self) -> List[FunctionExecutorStateProto]:
|
103
|
+
states = []
|
104
|
+
|
105
|
+
async for function_executor_state in self._function_executor_states:
|
106
|
+
function_executor_state: FunctionExecutorState
|
107
|
+
states.append(
|
108
|
+
FunctionExecutorStateProto(
|
109
|
+
description=FunctionExecutorDescription(
|
110
|
+
id=function_executor_state.id,
|
111
|
+
namespace=function_executor_state.namespace,
|
112
|
+
graph_name=function_executor_state.graph_name,
|
113
|
+
graph_version=function_executor_state.graph_version,
|
114
|
+
function_name=function_executor_state.function_name,
|
115
|
+
),
|
116
|
+
status=function_executor_state.status,
|
117
|
+
)
|
118
|
+
)
|
119
|
+
|
120
|
+
return states
|
121
|
+
|
122
|
+
async def shutdown(self):
|
123
|
+
"""Shuts down the state reporter.
|
124
|
+
|
125
|
+
Never raises any exceptions.
|
126
|
+
"""
|
127
|
+
self._is_shutdown = True
|
@@ -13,12 +13,12 @@ from .api_objects import (
|
|
13
13
|
RouterOutput,
|
14
14
|
TaskResult,
|
15
15
|
)
|
16
|
+
from .function_executor.task_output import TaskOutput
|
16
17
|
from .metrics.task_reporter import (
|
17
18
|
metric_server_ingest_files_errors,
|
18
19
|
metric_server_ingest_files_latency,
|
19
20
|
metric_server_ingest_files_requests,
|
20
21
|
)
|
21
|
-
from .task_runner import TaskOutput
|
22
22
|
|
23
23
|
|
24
24
|
# https://github.com/psf/requests/issues/1081#issuecomment-428504128
|
@@ -118,12 +118,12 @@ class TaskReporter:
|
|
118
118
|
) -> Tuple[TaskResult, List[Any], TaskOutputSummary]:
|
119
119
|
task_result = TaskResult(
|
120
120
|
outcome="failure",
|
121
|
-
namespace=output.
|
122
|
-
compute_graph=output.
|
123
|
-
compute_fn=output.
|
124
|
-
invocation_id=output.
|
121
|
+
namespace=output.namespace,
|
122
|
+
compute_graph=output.graph_name,
|
123
|
+
compute_fn=output.function_name,
|
124
|
+
invocation_id=output.graph_invocation_id,
|
125
125
|
executor_id=self._executor_id,
|
126
|
-
task_id=output.
|
126
|
+
task_id=output.task_id,
|
127
127
|
)
|
128
128
|
output_files: List[Any] = []
|
129
129
|
summary: TaskOutputSummary = TaskOutputSummary()
|