indexify 0.3.12__py3-none-any.whl → 0.3.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- indexify/cli/cli.py +11 -7
- indexify/executor/downloader.py +99 -50
- indexify/executor/executor.py +149 -28
- indexify/executor/function_executor/function_executor.py +28 -1
- indexify/executor/function_executor/function_executor_state.py +23 -4
- indexify/executor/function_executor/function_executor_states_container.py +28 -16
- indexify/executor/function_executor/health_checker.py +26 -11
- indexify/executor/function_executor/metrics/function_executor.py +16 -0
- indexify/executor/function_executor/server/function_executor_server_factory.py +4 -1
- indexify/executor/function_executor/single_task_runner.py +28 -8
- indexify/executor/function_executor/task_output.py +27 -4
- indexify/executor/state_reconciler.py +288 -0
- indexify/executor/state_reporter.py +127 -0
- indexify/executor/task_reporter.py +6 -6
- indexify/executor/task_runner.py +20 -12
- indexify/task_scheduler/proto/task_scheduler.proto +147 -0
- indexify/task_scheduler/proto/task_scheduler_pb2.py +69 -0
- indexify/task_scheduler/proto/task_scheduler_pb2.pyi +286 -0
- indexify/task_scheduler/proto/task_scheduler_pb2_grpc.py +170 -0
- {indexify-0.3.12.dist-info → indexify-0.3.14.dist-info}/METADATA +1 -1
- {indexify-0.3.12.dist-info → indexify-0.3.14.dist-info}/RECORD +23 -17
- {indexify-0.3.12.dist-info → indexify-0.3.14.dist-info}/WHEEL +0 -0
- {indexify-0.3.12.dist-info → indexify-0.3.14.dist-info}/entry_points.txt +0 -0
@@ -1,7 +1,6 @@
|
|
1
1
|
import asyncio
|
2
|
-
from typing import AsyncGenerator, Dict
|
2
|
+
from typing import AsyncGenerator, Dict, Optional
|
3
3
|
|
4
|
-
from ..api_objects import Task
|
5
4
|
from .function_executor_state import FunctionExecutorState
|
6
5
|
from .metrics.function_executor_state_container import (
|
7
6
|
metric_function_executor_states_count,
|
@@ -17,19 +16,33 @@ class FunctionExecutorStatesContainer:
|
|
17
16
|
self._states: Dict[str, FunctionExecutorState] = {}
|
18
17
|
self._is_shutdown: bool = False
|
19
18
|
|
20
|
-
async def get_or_create_state(
|
21
|
-
|
19
|
+
async def get_or_create_state(
|
20
|
+
self,
|
21
|
+
id: str,
|
22
|
+
namespace: str,
|
23
|
+
graph_name: str,
|
24
|
+
graph_version: str,
|
25
|
+
function_name: str,
|
26
|
+
image_uri: Optional[str],
|
27
|
+
) -> FunctionExecutorState:
|
28
|
+
"""Get or create a function executor state with the given ID.
|
22
29
|
|
30
|
+
If the state already exists, it is returned. Otherwise, a new state is created from the supplied parameters.
|
23
31
|
Raises Exception if it's not possible to create a new state at this time."""
|
24
32
|
async with self._lock:
|
25
33
|
if self._is_shutdown:
|
26
|
-
raise RuntimeError(
|
34
|
+
raise RuntimeError(
|
35
|
+
"Function Executor states container is shutting down."
|
36
|
+
)
|
27
37
|
|
28
|
-
id = function_id_without_version(task)
|
29
38
|
if id not in self._states:
|
30
39
|
state = FunctionExecutorState(
|
31
|
-
|
32
|
-
|
40
|
+
id=id,
|
41
|
+
namespace=namespace,
|
42
|
+
graph_name=graph_name,
|
43
|
+
graph_version=graph_version,
|
44
|
+
function_name=function_name,
|
45
|
+
image_uri=image_uri,
|
33
46
|
)
|
34
47
|
self._states[id] = state
|
35
48
|
metric_function_executor_states_count.set(len(self._states))
|
@@ -41,6 +54,13 @@ class FunctionExecutorStatesContainer:
|
|
41
54
|
for state in self._states.values():
|
42
55
|
yield state
|
43
56
|
|
57
|
+
async def pop(self, id: str) -> FunctionExecutorState:
    """Detaches the state registered under the given ID from the container.

    The container lock is held while the state is removed and the states
    count gauge is refreshed. The removed state is returned to the caller.

    Raises KeyError if there is no state with this ID, because the
    underlying dict.pop() is called without a default value."""
    async with self._lock:
        removed_state: FunctionExecutorState = self._states.pop(id)
        remaining_states_count: int = len(self._states)
        metric_function_executor_states_count.set(remaining_states_count)
    return removed_state
|
63
|
+
|
44
64
|
async def shutdown(self):
|
45
65
|
# Function Executors are outside the Executor process
|
46
66
|
# so they need to get cleaned up explicitly and reliably.
|
@@ -54,11 +74,3 @@ class FunctionExecutorStatesContainer:
|
|
54
74
|
async with state.lock:
|
55
75
|
await state.shutdown()
|
56
76
|
# The task running inside the Function Executor will fail because it's destroyed.
|
57
|
-
|
58
|
-
|
59
|
-
def function_id_with_version(task: Task) -> str:
|
60
|
-
return f"versioned/{task.namespace}/{task.compute_graph}/{task.graph_version}/{task.compute_fn}"
|
61
|
-
|
62
|
-
|
63
|
-
def function_id_without_version(task: Task) -> str:
|
64
|
-
return f"not_versioned/{task.namespace}/{task.compute_graph}/{task.compute_fn}"
|
@@ -20,16 +20,22 @@ from .server.client_configuration import HEALTH_CHECK_TIMEOUT_SEC
|
|
20
20
|
HEALTH_CHECK_POLL_PERIOD_SEC = 10
|
21
21
|
|
22
22
|
|
23
|
+
class HealthCheckResult:
    """Outcome of a single Function Executor health check.

    Plain value holder: it performs no validation and has no behavior."""

    def __init__(self, is_healthy: bool, reason: str):
        # Human readable explanation of the outcome, e.g. the status message
        # returned by the health check RPC or a description of the error.
        self.reason: str = reason
        # True if the health check passed, False otherwise.
        self.is_healthy: bool = is_healthy
|
27
|
+
|
28
|
+
|
23
29
|
class HealthChecker:
|
24
30
|
def __init__(self, stub: FunctionExecutorStub, logger: Any):
|
25
31
|
self._stub: FunctionExecutorStub = stub
|
26
32
|
self._logger: Any = logger.bind(module=__name__)
|
27
33
|
self._health_check_loop_task: Optional[asyncio.Task] = None
|
28
|
-
self._health_check_failed_callback: Optional[
|
29
|
-
None
|
30
|
-
|
34
|
+
self._health_check_failed_callback: Optional[
|
35
|
+
Callable[[HealthCheckResult], Awaitable[None]]
|
36
|
+
] = None
|
31
37
|
|
32
|
-
async def check(self) ->
|
38
|
+
async def check(self) -> HealthCheckResult:
|
33
39
|
"""Runs the health check once and returns the result.
|
34
40
|
|
35
41
|
Does not raise any exceptions."""
|
@@ -40,17 +46,25 @@ class HealthChecker:
|
|
40
46
|
)
|
41
47
|
if not response.healthy:
|
42
48
|
metric_failed_health_checks.inc()
|
43
|
-
return
|
44
|
-
|
49
|
+
return HealthCheckResult(
|
50
|
+
is_healthy=response.healthy, reason=response.status_message
|
51
|
+
)
|
52
|
+
except AioRpcError as e:
|
45
53
|
metric_failed_health_checks.inc()
|
46
54
|
# Expected exception when there are problems with communication because e.g. the server is unhealthy.
|
47
|
-
return
|
55
|
+
return HealthCheckResult(
|
56
|
+
is_healthy=False,
|
57
|
+
reason=f"Executor side RPC channel error: {str(e)}",
|
58
|
+
)
|
48
59
|
except Exception as e:
|
49
60
|
metric_failed_health_checks.inc()
|
50
61
|
self._logger.warning("Got unexpected exception, ignoring", exc_info=e)
|
51
|
-
return
|
62
|
+
return HealthCheckResult(
|
63
|
+
is_healthy=False,
|
64
|
+
reason=f"Unexpected exception in Executor: {str(e)}",
|
65
|
+
)
|
52
66
|
|
53
|
-
def start(self, callback: Callable[[], Awaitable[None]]) -> None:
|
67
|
+
def start(self, callback: Callable[[HealthCheckResult], Awaitable[None]]) -> None:
|
54
68
|
"""Starts periodic health checks.
|
55
69
|
|
56
70
|
The supplied callback is an async function called in the calling thread's
|
@@ -81,9 +95,10 @@ class HealthChecker:
|
|
81
95
|
|
82
96
|
async def _health_check_loop(self) -> None:
|
83
97
|
while True:
|
84
|
-
|
98
|
+
result: HealthCheckResult = await self.check()
|
99
|
+
if not result.is_healthy:
|
85
100
|
break
|
86
101
|
await asyncio.sleep(HEALTH_CHECK_POLL_PERIOD_SEC)
|
87
102
|
|
88
|
-
asyncio.create_task(self._health_check_failed_callback())
|
103
|
+
asyncio.create_task(self._health_check_failed_callback(result))
|
89
104
|
self._health_check_loop_task = None
|
@@ -78,6 +78,22 @@ metric_destroy_channel_errors: prometheus_client.Counter = prometheus_client.Cou
|
|
78
78
|
"Number of Function Executor channel destruction errors",
|
79
79
|
)
|
80
80
|
|
81
|
+
# FE get_info RPC metrics.
|
82
|
+
metric_get_info_rpc_latency: prometheus_client.Histogram = (
|
83
|
+
latency_metric_for_fast_operation(
|
84
|
+
"function_executor_get_info_rpc", "Function Executor get_info RPC"
|
85
|
+
)
|
86
|
+
)
|
87
|
+
metric_get_info_rpc_errors: prometheus_client.Counter = prometheus_client.Counter(
|
88
|
+
"function_executor_get_info_rpc_errors",
|
89
|
+
"Number of Function Executor get_info RPC errors",
|
90
|
+
)
|
91
|
+
metric_function_executor_infos: prometheus_client.Counter = prometheus_client.Counter(
|
92
|
+
"function_executor_infos",
|
93
|
+
"Number of Function Executors with particular info",
|
94
|
+
["version", "sdk_version", "sdk_language", "sdk_language_version"],
|
95
|
+
)
|
96
|
+
|
81
97
|
# FE initialization RPC metrics.
|
82
98
|
metric_initialize_rpc_latency: prometheus_client.Histogram = (
|
83
99
|
latency_metric_for_customer_controlled_operation(
|
@@ -14,8 +14,11 @@ class FunctionExecutorServerConfiguration:
|
|
14
14
|
configuration parameters or raise an exception if it can't implement
|
15
15
|
them."""
|
16
16
|
|
17
|
-
def __init__(
|
17
|
+
def __init__(
|
18
|
+
self, executor_id: str, function_executor_id: str, image_uri: Optional[str]
|
19
|
+
):
|
18
20
|
self.executor_id: str = executor_id
|
21
|
+
self.function_executor_id: str = function_executor_id
|
19
22
|
# Container image URI of the Function Executor Server.
|
20
23
|
self.image_uri: Optional[str] = image_uri
|
21
24
|
|
@@ -14,6 +14,7 @@ from tensorlake.function_executor.proto.function_executor_pb2_grpc import (
|
|
14
14
|
from ..api_objects import Task
|
15
15
|
from .function_executor import CustomerError, FunctionExecutor
|
16
16
|
from .function_executor_state import FunctionExecutorState
|
17
|
+
from .health_checker import HealthChecker, HealthCheckResult
|
17
18
|
from .metrics.single_task_runner import (
|
18
19
|
metric_function_executor_run_task_rpc_errors,
|
19
20
|
metric_function_executor_run_task_rpc_latency,
|
@@ -69,7 +70,12 @@ class SingleTaskRunner:
|
|
69
70
|
await self._create_function_executor()
|
70
71
|
except CustomerError as e:
|
71
72
|
return TaskOutput(
|
72
|
-
|
73
|
+
task_id=self._task_input.task.id,
|
74
|
+
namespace=self._task_input.task.namespace,
|
75
|
+
graph_name=self._task_input.task.compute_graph,
|
76
|
+
function_name=self._task_input.task.compute_fn,
|
77
|
+
graph_version=self._task_input.task.graph_version,
|
78
|
+
graph_invocation_id=self._task_input.task.invocation_id,
|
73
79
|
stderr=str(e),
|
74
80
|
success=False,
|
75
81
|
)
|
@@ -88,6 +94,7 @@ class SingleTaskRunner:
|
|
88
94
|
config: FunctionExecutorServerConfiguration = (
|
89
95
|
FunctionExecutorServerConfiguration(
|
90
96
|
executor_id=self._executor_id,
|
97
|
+
function_executor_id=self._state.id,
|
91
98
|
image_uri=self._task_input.task.image_uri,
|
92
99
|
)
|
93
100
|
)
|
@@ -144,24 +151,32 @@ class SingleTaskRunner:
|
|
144
151
|
).run_task(request)
|
145
152
|
return _task_output(task=self._task_input.task, response=response)
|
146
153
|
|
147
|
-
async def _health_check_failed_callback(self):
|
154
|
+
async def _health_check_failed_callback(self, result: HealthCheckResult):
|
148
155
|
# Function Executor destroy due to the periodic health check failure ensures that
|
149
156
|
# a running task RPC stuck in an unhealthy Function Executor fails immediately.
|
150
157
|
async with self._state.lock:
|
151
158
|
if self._state.function_executor is not None:
|
152
|
-
await self._destroy_function_executor_on_failed_health_check(
|
159
|
+
await self._destroy_function_executor_on_failed_health_check(
|
160
|
+
result.reason
|
161
|
+
)
|
153
162
|
|
154
163
|
async def _destroy_existing_function_executor_if_unhealthy(self):
|
155
164
|
self._state.check_locked()
|
156
165
|
if self._state.function_executor is None:
|
157
166
|
return
|
158
|
-
|
167
|
+
result: HealthCheckResult = (
|
168
|
+
await self._state.function_executor.health_checker().check()
|
169
|
+
)
|
170
|
+
if result.is_healthy:
|
159
171
|
return
|
160
|
-
await self._destroy_function_executor_on_failed_health_check()
|
172
|
+
await self._destroy_function_executor_on_failed_health_check(result.reason)
|
161
173
|
|
162
|
-
async def _destroy_function_executor_on_failed_health_check(self):
|
174
|
+
async def _destroy_function_executor_on_failed_health_check(self, reason: str):
|
163
175
|
self._state.check_locked()
|
164
|
-
self._logger.error(
|
176
|
+
self._logger.error(
|
177
|
+
"Function Executor health check failed, destroying Function Executor",
|
178
|
+
health_check_fail_reason=reason,
|
179
|
+
)
|
165
180
|
self._state.health_check_failed = True
|
166
181
|
await self._state.destroy_function_executor()
|
167
182
|
|
@@ -220,7 +235,12 @@ def _task_output(task: Task, response: RunTaskResponse) -> TaskOutput:
|
|
220
235
|
raise ValueError(f"Response is missing required field: {field}")
|
221
236
|
|
222
237
|
output = TaskOutput(
|
223
|
-
|
238
|
+
task_id=task.id,
|
239
|
+
namespace=task.namespace,
|
240
|
+
graph_name=task.compute_graph,
|
241
|
+
function_name=task.compute_fn,
|
242
|
+
graph_version=task.graph_version,
|
243
|
+
graph_invocation_id=task.invocation_id,
|
224
244
|
stdout=response.stdout,
|
225
245
|
stderr=response.stderr,
|
226
246
|
reducer=response.is_reducer,
|
@@ -13,7 +13,12 @@ class TaskOutput:
|
|
13
13
|
|
14
14
|
def __init__(
|
15
15
|
self,
|
16
|
-
|
16
|
+
task_id: str,
|
17
|
+
namespace: str,
|
18
|
+
graph_name: str,
|
19
|
+
function_name: str,
|
20
|
+
graph_version: str,
|
21
|
+
graph_invocation_id: str,
|
17
22
|
function_output: Optional[FunctionOutput] = None,
|
18
23
|
router_output: Optional[RouterOutput] = None,
|
19
24
|
stdout: Optional[str] = None,
|
@@ -22,7 +27,12 @@ class TaskOutput:
|
|
22
27
|
success: bool = False,
|
23
28
|
is_internal_error: bool = False,
|
24
29
|
):
|
25
|
-
self.
|
30
|
+
self.task_id = task_id
|
31
|
+
self.namespace = namespace
|
32
|
+
self.graph_name = graph_name
|
33
|
+
self.function_name = function_name
|
34
|
+
self.graph_version = graph_version
|
35
|
+
self.graph_invocation_id = graph_invocation_id
|
26
36
|
self.function_output = function_output
|
27
37
|
self.router_output = router_output
|
28
38
|
self.stdout = stdout
|
@@ -32,11 +42,24 @@ class TaskOutput:
|
|
32
42
|
self.is_internal_error = is_internal_error
|
33
43
|
|
34
44
|
@classmethod
|
35
|
-
def internal_error(
|
45
|
+
def internal_error(
|
46
|
+
cls,
|
47
|
+
task_id: str,
|
48
|
+
namespace: str,
|
49
|
+
graph_name: str,
|
50
|
+
function_name: str,
|
51
|
+
graph_version: str,
|
52
|
+
graph_invocation_id: str,
|
53
|
+
) -> "TaskOutput":
|
36
54
|
"""Creates a TaskOutput for an internal error."""
|
37
55
|
# We are not sharing internal error messages with the customer.
|
38
56
|
return TaskOutput(
|
39
|
-
|
57
|
+
task_id=task_id,
|
58
|
+
namespace=namespace,
|
59
|
+
graph_name=graph_name,
|
60
|
+
function_name=function_name,
|
61
|
+
graph_version=graph_version,
|
62
|
+
graph_invocation_id=graph_invocation_id,
|
40
63
|
stderr="Platform failed to execute the function.",
|
41
64
|
is_internal_error=True,
|
42
65
|
)
|
@@ -0,0 +1,288 @@
|
|
1
|
+
import asyncio
|
2
|
+
from typing import Any, AsyncGenerator, List, Optional, Set
|
3
|
+
|
4
|
+
import grpc
|
5
|
+
from tensorlake.function_executor.proto.function_executor_pb2 import (
|
6
|
+
InitializeRequest,
|
7
|
+
SerializedObject,
|
8
|
+
)
|
9
|
+
|
10
|
+
from indexify.task_scheduler.proto.task_scheduler_pb2 import (
|
11
|
+
DesiredExecutorState,
|
12
|
+
FunctionExecutorDescription,
|
13
|
+
FunctionExecutorStatus,
|
14
|
+
GetDesiredExecutorStatesRequest,
|
15
|
+
)
|
16
|
+
from indexify.task_scheduler.proto.task_scheduler_pb2_grpc import (
|
17
|
+
TaskSchedulerServiceStub,
|
18
|
+
)
|
19
|
+
|
20
|
+
from .downloader import Downloader
|
21
|
+
from .function_executor.function_executor import CustomerError, FunctionExecutor
|
22
|
+
from .function_executor.function_executor_state import FunctionExecutorState
|
23
|
+
from .function_executor.function_executor_states_container import (
|
24
|
+
FunctionExecutorStatesContainer,
|
25
|
+
)
|
26
|
+
from .function_executor.server.function_executor_server_factory import (
|
27
|
+
FunctionExecutorServerConfiguration,
|
28
|
+
FunctionExecutorServerFactory,
|
29
|
+
)
|
30
|
+
from .function_executor.task_input import TaskInput
|
31
|
+
from .function_executor.task_output import TaskOutput
|
32
|
+
from .metrics.executor import (
|
33
|
+
METRIC_TASKS_COMPLETED_OUTCOME_ALL,
|
34
|
+
METRIC_TASKS_COMPLETED_OUTCOME_ERROR_CUSTOMER_CODE,
|
35
|
+
METRIC_TASKS_COMPLETED_OUTCOME_ERROR_PLATFORM,
|
36
|
+
METRIC_TASKS_COMPLETED_OUTCOME_SUCCESS,
|
37
|
+
metric_task_completion_latency,
|
38
|
+
metric_task_outcome_report_latency,
|
39
|
+
metric_task_outcome_report_retries,
|
40
|
+
metric_task_outcome_reports,
|
41
|
+
metric_tasks_completed,
|
42
|
+
metric_tasks_fetched,
|
43
|
+
metric_tasks_reporting_outcome,
|
44
|
+
)
|
45
|
+
from .task_reporter import TaskReporter
|
46
|
+
|
47
|
+
|
48
|
+
class ExecutorStateReconciler:
    """Drives the local Function Executors towards the state desired by the Server.

    The Server streams DesiredExecutorState messages. For each new message the
    reconciler creates the Function Executors that are desired but currently
    stopped, and destroys the Function Executors that are no longer desired.
    Reconciliation of task allocations is not implemented yet.
    """

    def __init__(
        self,
        executor_id: str,
        function_executor_server_factory: FunctionExecutorServerFactory,
        base_url: str,
        function_executor_states: FunctionExecutorStatesContainer,
        config_path: Optional[str],
        downloader: Downloader,
        task_reporter: TaskReporter,
        server_channel: grpc.aio.Channel,
        logger: Any,
    ):
        self._executor_id: str = executor_id
        self._factory: FunctionExecutorServerFactory = function_executor_server_factory
        self._base_url: str = base_url
        self._config_path: Optional[str] = config_path
        self._downloader: Downloader = downloader
        self._task_reporter: TaskReporter = task_reporter
        self._function_executor_states: FunctionExecutorStatesContainer = (
            function_executor_states
        )
        self._stub: TaskSchedulerServiceStub = TaskSchedulerServiceStub(server_channel)
        self._logger: Any = logger.bind(module=__name__)
        self._is_shutdown: bool = False
        # Serializes reconciliations so only one coroutine mutates the states at a time.
        self._reconciliation_lock: asyncio.Lock = asyncio.Lock()
        # Clock value of the last desired state accepted from the Server.
        self._server_last_clock: Optional[int] = None
        # The event loop only keeps weak references to tasks, so in-flight
        # reconciliation tasks are kept here to prevent them from being
        # garbage collected before they finish.
        self._reconciliation_tasks: Set[asyncio.Task] = set()

    async def run(self):
        """Consumes the stream of desired states and schedules a reconciliation for each new one.

        Returns when the stream ends or once the reconciler is shut down.
        Messages whose clock is not greater than the last accepted clock are skipped.
        """
        desired_states: AsyncGenerator[DesiredExecutorState, None] = (
            self._stub.get_desired_executor_states(
                GetDesiredExecutorStatesRequest(executor_id=self._executor_id)
            )
        )
        async for new_state in desired_states:
            if self._is_shutdown:
                return
            new_state: DesiredExecutorState
            if (
                self._server_last_clock is not None
                and self._server_last_clock >= new_state.clock
            ):
                continue  # Duplicate or outdated state message sent by Server.

            self._server_last_clock = new_state.clock
            reconciliation_task: asyncio.Task = asyncio.create_task(
                self._reconcile_state(new_state)
            )
            self._reconciliation_tasks.add(reconciliation_task)
            reconciliation_task.add_done_callback(self._on_reconciliation_task_done)

    def _on_reconciliation_task_done(self, task: asyncio.Task) -> None:
        """Forgets the finished reconciliation task and logs its failure if any."""
        self._reconciliation_tasks.discard(task)
        if task.cancelled():
            return
        error: Optional[BaseException] = task.exception()
        if error is not None:
            self._logger.error("Failed to reconcile Executor state", exc_info=error)

    async def _reconcile_state(self, new_state: DesiredExecutorState):
        """Applies the desired state unless the reconciler is shut down."""
        if self._is_shutdown:
            return

        # Simple non concurrent implementation for now for the PoC.
        # Obtain this lock to force only a single coroutine doing the reconciliation.
        async with self._reconciliation_lock:
            await self._reconcile_function_executors(new_state)
            # TODO
            # await self._reconcile_task_allocations(new_state)

    async def shutdown(self):
        """Shuts down the state reconciler.

        Only sets the shutdown flag; reconciliations that already hold the
        reconciliation lock are not interrupted.

        Never raises any exceptions.
        """
        self._is_shutdown = True

    async def _reconcile_function_executors(self, desired_state: DesiredExecutorState):
        """Starts desired Function Executors that are stopped and destroys the undesired ones."""
        desired_function_executor_ids: Set[str] = set()
        for desired_function_executor in desired_state.function_executors:
            desired_function_executor: FunctionExecutorDescription
            desired_function_executor_ids.add(desired_function_executor.id)

            # get_or_create_state is a coroutine function and must be awaited,
            # otherwise we'd hold a coroutine object instead of the state.
            # NOTE(review): image_uri is passed the same way _create_function_executor
            # reads it; confirm how an unset proto field should map to Optional[str].
            function_executor_state: FunctionExecutorState = (
                await self._function_executor_states.get_or_create_state(
                    id=desired_function_executor.id,
                    namespace=desired_function_executor.namespace,
                    graph_name=desired_function_executor.graph_name,
                    graph_version=desired_function_executor.graph_version,
                    function_name=desired_function_executor.function_name,
                    image_uri=desired_function_executor.image_uri,
                )
            )

            async with function_executor_state.lock:
                if (
                    function_executor_state.status
                    != FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_STOPPED
                ):
                    continue  # Only stopped Function Executors get started here.

                function_executor_state.status = (
                    FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_STARTING_UP
                )
                try:
                    function_executor_state.function_executor = (
                        await self._create_function_executor(desired_function_executor)
                    )
                    function_executor_state.status = (
                        FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_IDLE
                    )
                except CustomerError:
                    function_executor_state.status = (
                        FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_STARTUP_FAILED_CUSTOMER_ERROR
                    )
                except Exception as e:
                    function_executor_state.status = (
                        FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_STARTUP_FAILED_PLATFORM_ERROR
                    )
                    self._logger.error(
                        "Failed to create Function Executor", exc_info=e
                    )

        # Collect the IDs first and remove the states afterwards so the
        # container is not modified while it is being iterated.
        function_executor_state_ids_to_destroy: List[str] = []
        async for function_executor_state in self._function_executor_states:
            function_executor_state: FunctionExecutorState
            if function_executor_state.id not in desired_function_executor_ids:
                function_executor_state_ids_to_destroy.append(
                    function_executor_state.id
                )

        for function_executor_state_id in function_executor_state_ids_to_destroy:
            # The container method is the coroutine function `pop`.
            function_executor_state: FunctionExecutorState = (
                await self._function_executor_states.pop(function_executor_state_id)
            )
            async with function_executor_state.lock:
                logger = self._function_executor_logger(
                    id=function_executor_state.id,
                    namespace=function_executor_state.namespace,
                    graph_name=function_executor_state.graph_name,
                    graph_version=function_executor_state.graph_version,
                    function_name=function_executor_state.function_name,
                )
                if (
                    function_executor_state.status
                    == FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_RUNNING_TASK
                ):
                    logger.warning(
                        "Destroying Function Executor that is running a task. No task output will be reported as this is expected by the Server."
                    )
                function_executor_state.status = (
                    FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_STOPPING
                )
                await function_executor_state.destroy_function_executor()
                function_executor_state.status = (
                    FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_STOPPED
                )

    async def _create_function_executor(
        self, description: FunctionExecutorDescription
    ) -> FunctionExecutor:
        """Downloads the graph and creates an initialized Function Executor for the description.

        If initialization fails the Function Executor is destroyed and the
        original exception is re-raised.
        """
        logger = self._function_executor_logger(
            id=description.id,
            namespace=description.namespace,
            graph_name=description.graph_name,
            graph_version=description.graph_version,
            function_name=description.function_name,
        )
        graph: SerializedObject = await self._downloader.download_graph(
            namespace=description.namespace,
            graph_name=description.graph_name,
            graph_version=description.graph_version,
            logger=logger,
        )
        function_executor: FunctionExecutor = FunctionExecutor(
            server_factory=self._factory, logger=logger
        )
        config: FunctionExecutorServerConfiguration = (
            FunctionExecutorServerConfiguration(
                executor_id=self._executor_id,
                function_executor_id=description.id,
                image_uri=description.image_uri,
            )
        )
        initialize_request: InitializeRequest = InitializeRequest(
            namespace=description.namespace,
            graph_name=description.graph_name,
            graph_version=description.graph_version,
            function_name=description.function_name,
            graph=graph,
        )

        try:
            await function_executor.initialize(
                config=config,
                initialize_request=initialize_request,
                base_url=self._base_url,
                config_path=self._config_path,
            )
            return function_executor
        except Exception:
            # Don't leave a half-initialized Function Executor behind.
            await function_executor.destroy()
            raise

    async def _cancel_running_tasks(
        self, function_executor_state: FunctionExecutorState
    ):
        """Placeholder: cancellation of running tasks is not implemented yet."""
        pass

    def _function_executor_logger(
        self,
        id: str,
        namespace: str,
        graph_name: str,
        graph_version: str,
        function_name: str,
    ) -> Any:
        """Returns the reconciler logger bound to the identity of a Function Executor.

        Note that the graph name is bound under the "graph" key.
        """
        return self._logger.bind(
            id=id,
            namespace=namespace,
            graph=graph_name,
            graph_version=graph_version,
            function_name=function_name,
        )

    async def _report_task_outcome(self, task_output: TaskOutput):
        """Reports the task with the given output to the server.

        Doesn't raise any Exceptions. Runs till the reporting is successful."""
        reporting_retries: int = 0

        while True:
            # Bind from the instance logger: there is no local logger to rebind,
            # rebinding an unassigned local would raise UnboundLocalError.
            logger = self._logger.bind(retries=reporting_retries)
            try:
                await self._task_reporter.report(output=task_output, logger=logger)
                break
            except Exception as e:
                logger.error(
                    "failed to report task",
                    exc_info=e,
                )
                reporting_retries += 1
                metric_task_outcome_report_retries.inc()
                await asyncio.sleep(5)

        metric_tasks_completed.labels(outcome=METRIC_TASKS_COMPLETED_OUTCOME_ALL).inc()
        if task_output.is_internal_error:
            metric_tasks_completed.labels(
                outcome=METRIC_TASKS_COMPLETED_OUTCOME_ERROR_PLATFORM
            ).inc()
        elif task_output.success:
            metric_tasks_completed.labels(
                outcome=METRIC_TASKS_COMPLETED_OUTCOME_SUCCESS
            ).inc()
        else:
            metric_tasks_completed.labels(
                outcome=METRIC_TASKS_COMPLETED_OUTCOME_ERROR_CUSTOMER_CODE
            ).inc()
|