indexify 0.3.8-py3-none-any.whl → 0.3.10-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- indexify/cli/cli.py +38 -78
- indexify/executor/api_objects.py +4 -0
- indexify/executor/downloader.py +45 -5
- indexify/executor/executor.py +103 -16
- indexify/executor/function_executor/function_executor.py +174 -55
- indexify/executor/function_executor/function_executor_state.py +6 -0
- indexify/executor/function_executor/function_executor_states_container.py +64 -0
- indexify/executor/function_executor/health_checker.py +20 -10
- indexify/executor/function_executor/invocation_state_client.py +31 -6
- indexify/executor/function_executor/metrics/function_executor.py +142 -0
- indexify/executor/function_executor/metrics/function_executor_state.py +10 -0
- indexify/executor/function_executor/metrics/function_executor_state_container.py +10 -0
- indexify/executor/function_executor/metrics/health_checker.py +14 -0
- indexify/executor/function_executor/metrics/invocation_state_client.py +45 -0
- indexify/executor/function_executor/metrics/single_task_runner.py +22 -0
- indexify/executor/function_executor/single_task_runner.py +44 -15
- indexify/executor/function_executor/task_output.py +7 -1
- indexify/executor/metrics/downloader.py +69 -0
- indexify/executor/metrics/executor.py +51 -0
- indexify/executor/metrics/task_fetcher.py +21 -0
- indexify/executor/metrics/task_reporter.py +22 -0
- indexify/executor/metrics/task_runner.py +45 -0
- indexify/executor/monitoring/function_allowlist.py +25 -0
- indexify/executor/monitoring/handler.py +8 -0
- indexify/executor/monitoring/health_check_handler.py +20 -0
- indexify/executor/monitoring/health_checker/generic_health_checker.py +58 -0
- indexify/executor/monitoring/health_checker/health_checker.py +23 -0
- indexify/executor/monitoring/metrics.py +245 -0
- indexify/executor/monitoring/prometheus_metrics_handler.py +18 -0
- indexify/executor/monitoring/server.py +41 -0
- indexify/executor/monitoring/startup_probe_handler.py +17 -0
- indexify/executor/task_fetcher.py +15 -1
- indexify/executor/task_reporter.py +24 -7
- indexify/executor/task_runner.py +64 -46
- {indexify-0.3.8.dist-info → indexify-0.3.10.dist-info}/METADATA +4 -2
- indexify-0.3.10.dist-info/RECORD +46 -0
- indexify-0.3.8.dist-info/RECORD +0 -25
- {indexify-0.3.8.dist-info → indexify-0.3.10.dist-info}/WHEEL +0 -0
- {indexify-0.3.8.dist-info → indexify-0.3.10.dist-info}/entry_points.txt +0 -0

indexify/executor/function_executor/metrics/invocation_state_client.py
@@ -0,0 +1,45 @@
+import prometheus_client
+
+from ...monitoring.metrics import latency_metric_for_fast_operation
+
+# This file contains all metrics used by InvocationStateClient.
+
+# General metrics.
+metric_request_read_errors: prometheus_client.Counter = prometheus_client.Counter(
+    "function_executor_invocation_state_client_request_read_errors",
+    "Number of failed request reads in Function Executor Invocation State client resulting in its early termination",
+)
+
+# Get invocation state key-value Server API metrics.
+metric_server_get_state_requests: prometheus_client.Counter = prometheus_client.Counter(
+    "server_get_invocation_state_requests",
+    "Number of get invocation state requests sent to the Server on behalf of Function Executor",
+)
+metric_server_get_state_request_errors: prometheus_client.Counter = (
+    prometheus_client.Counter(
+        "server_get_invocation_state_request_errors",
+        "Server get invocation state request errors",
+    )
+)
+metric_server_get_state_request_latency: prometheus_client.Histogram = (
+    latency_metric_for_fast_operation(
+        "server_get_invocation_state_request", "Server get invocation state request"
+    )
+)
+
+# Set invocation state key-value Server API metrics.
+metric_server_set_state_requests: prometheus_client.Counter = prometheus_client.Counter(
+    "server_set_invocation_state_requests",
+    "Number of set invocation state requests sent to the Server on behalf of Function Executor",
+)
+metric_server_set_state_request_errors: prometheus_client.Counter = (
+    prometheus_client.Counter(
+        "server_set_invocation_state_request_errors",
+        "Server set invocation state request errors",
+    )
+)
+metric_server_set_state_request_latency: prometheus_client.Histogram = (
+    latency_metric_for_fast_operation(
+        "server_set_invocation_state_request", "Server set invocation state request"
+    )
+)
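
`latency_metric_for_fast_operation` is imported from the new `indexify/executor/monitoring/metrics.py` (+245 lines, not included in this excerpt). As a rough idea of the pattern these metric modules rely on, the helper is presumably a small factory around `prometheus_client.Histogram` that enforces a shared naming convention and a bucket layout tuned for sub-second operations. A hypothetical sketch; the name format and buckets are illustrative, not the shipped implementation:

# Editor's illustrative sketch, not part of the diff; the real helper lives in
# indexify/executor/monitoring/metrics.py, which this excerpt does not show.
import prometheus_client

# Bucket boundaries chosen for illustration only.
_FAST_OPERATION_BUCKETS = (0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0)


def latency_metric_for_fast_operation(
    operation_name: str, operation_description: str
) -> prometheus_client.Histogram:
    # Mirrors the per-operation naming used by the metric modules in this diff.
    return prometheus_client.Histogram(
        f"{operation_name}_latency_seconds",
        f"Latency of {operation_description} in seconds",
        buckets=_FAST_OPERATION_BUCKETS,
    )
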

indexify/executor/function_executor/metrics/single_task_runner.py
@@ -0,0 +1,22 @@
+import prometheus_client
+
+from ...monitoring.metrics import latency_metric_for_customer_controlled_operation
+
+# This file contains all metrics used by SingleTaskRunner.
+
+metric_function_executor_run_task_rpcs: prometheus_client.Counter = (
+    prometheus_client.Counter(
+        "function_executor_run_task_rpcs", "Number of Function Executor run task RPCs"
+    )
+)
+metric_function_executor_run_task_rpc_errors: prometheus_client.Counter = (
+    prometheus_client.Counter(
+        "function_executor_run_task_rpc_errors",
+        "Number of Function Executor run task RPC errors",
+    )
+)
+metric_function_executor_run_task_rpc_latency: prometheus_client.Histogram = (
+    latency_metric_for_customer_controlled_operation(
+        "function_executor_run_task_rpc", "Function Executor run task RPC"
+    )
+)
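
`latency_metric_for_customer_controlled_operation` is the counterpart helper for operations whose duration is driven by customer code, such as the run task RPC, so it presumably uses much wider histogram buckets than the fast-operation variant. Another hypothetical sketch under the same assumptions as above:

# Editor's illustrative sketch, not part of the diff.
import prometheus_client

# Wide buckets because customer functions may legitimately run for minutes; values are illustrative.
_CUSTOMER_CONTROLLED_BUCKETS = (0.1, 0.5, 1, 5, 15, 30, 60, 120, 300, 600, 1800, 3600)


def latency_metric_for_customer_controlled_operation(
    operation_name: str, operation_description: str
) -> prometheus_client.Histogram:
    return prometheus_client.Histogram(
        f"{operation_name}_latency_seconds",
        f"Latency of {operation_description} in seconds",
        buckets=_CUSTOMER_CONTROLLED_BUCKETS,
    )
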

indexify/executor/function_executor/single_task_runner.py
@@ -14,6 +14,11 @@ from tensorlake.function_executor.proto.function_executor_pb2_grpc import (
 from ..api_objects import Task
 from .function_executor import CustomerError, FunctionExecutor
 from .function_executor_state import FunctionExecutorState
+from .metrics.single_task_runner import (
+    metric_function_executor_run_task_rpc_errors,
+    metric_function_executor_run_task_rpc_latency,
+    metric_function_executor_run_task_rpcs,
+)
 from .server.function_executor_server_factory import (
     FunctionExecutorServerConfiguration,
     FunctionExecutorServerFactory,
@@ -54,12 +59,11 @@ class SingleTaskRunner:
         if self._state.is_shutdown:
             raise RuntimeError("Function Executor state is shutting down.")
 
-        # If Function Executor
-
-
-            self._logger.error("Health check failed, destroying FunctionExecutor.")
-            await self._state.destroy_function_executor()
+        # If Function Executor became unhealthy while was idle then destroy it.
+        # It'll be recreated below.
+        await self._destroy_existing_function_executor_if_unhealthy()
 
+        # Create Function Executor if it doesn't exist yet.
         if self._state.function_executor is None:
             try:
                 await self._create_function_executor()
@@ -70,7 +74,12 @@ class SingleTaskRunner:
                     success=False,
                 )
 
-
+        try:
+            return await self._run()
+        finally:
+            # If Function Executor became unhealthy while running the task then destroy it.
+            # The periodic health checker might not notice this as it does only periodic checks.
+            await self._destroy_existing_function_executor_if_unhealthy()
 
     async def _create_function_executor(self) -> FunctionExecutor:
         function_executor: FunctionExecutor = FunctionExecutor(
@@ -122,19 +131,39 @@
             health_check_failed_callback=self._health_check_failed_callback,
             function_executor_state=self._state,
         ):
-
-
-
-
-
-
+            with (
+                metric_function_executor_run_task_rpc_errors.count_exceptions(),
+                metric_function_executor_run_task_rpc_latency.time(),
+            ):
+                metric_function_executor_run_task_rpcs.inc()
+                # If this RPC failed due to customer code crashing the server we won't be
+                # able to detect this. We'll treat this as our own error for now and thus
+                # let the AioRpcError to be raised here.
+                response: RunTaskResponse = await FunctionExecutorStub(
+                    channel
+                ).run_task(request)
             return _task_output(task=self._task_input.task, response=response)
 
     async def _health_check_failed_callback(self):
-        #
-
+        # Function Executor destroy due to the periodic health check failure ensures that
+        # a running task RPC stuck in unhealthy Function Executor fails immidiately.
         async with self._state.lock:
-
+            if self._state.function_executor is not None:
+                await self._destroy_function_executor_on_failed_health_check()
+
+    async def _destroy_existing_function_executor_if_unhealthy(self):
+        self._state.check_locked()
+        if self._state.function_executor is None:
+            return
+        if await self._state.function_executor.health_checker().check():
+            return
+        await self._destroy_function_executor_on_failed_health_check()
+
+    async def _destroy_function_executor_on_failed_health_check(self):
+        self._state.check_locked()
+        self._logger.error("Health check failed, destroying FunctionExecutor.")
+        self._state.health_check_failed = True
+        await self._state.destroy_function_executor()
 
 
 class _RunningTaskContextManager:
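
The `with (metric_...count_exceptions(), metric_...time()):` wrapper added around the run task RPC uses standard `prometheus_client` context managers: `Counter.count_exceptions()` increments the counter only if the wrapped block raises, and `Histogram.time()` records the elapsed wall time either way. A self-contained illustration of the same pattern; the metric names here are made up for the example:

# Editor's illustrative sketch, not part of the diff.
import time

import prometheus_client

example_rpc_errors = prometheus_client.Counter(
    "example_rpc_errors", "Errors raised by the example RPC"
)
example_rpc_latency = prometheus_client.Histogram(
    "example_rpc_latency_seconds", "Latency of the example RPC in seconds"
)


def call_example_rpc() -> None:
    # count_exceptions() only increments when the block raises;
    # time() observes the elapsed duration in all cases.
    with example_rpc_errors.count_exceptions(), example_rpc_latency.time():
        time.sleep(0.01)  # stand-in for the real RPC call
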

indexify/executor/function_executor/task_output.py
@@ -20,6 +20,7 @@ class TaskOutput:
         stderr: Optional[str] = None,
         reducer: bool = False,
         success: bool = False,
+        is_internal_error: bool = False,
     ):
         self.task = task
         self.function_output = function_output
@@ -28,9 +29,14 @@
         self.stderr = stderr
         self.reducer = reducer
         self.success = success
+        self.is_internal_error = is_internal_error
 
     @classmethod
     def internal_error(cls, task: Task) -> "TaskOutput":
         """Creates a TaskOutput for an internal error."""
         # We are not sharing internal error messages with the customer.
-        return TaskOutput(
+        return TaskOutput(
+            task=task,
+            stderr="Platform failed to execute the function.",
+            is_internal_error=True,
+        )
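
The new `is_internal_error` flag lets callers separate platform failures from customer-code failures without inspecting stderr. A hypothetical consumer (the real reporting path is in `task_reporter.py` and `executor.py`, only partially shown in this diff) might map it onto the outcome labels defined in `metrics/executor.py` below:

# Editor's illustrative sketch, not part of the diff.
def classify_outcome(output) -> str:
    # `output` is a TaskOutput instance as modified above.
    if output.success:
        return "success"
    if output.is_internal_error:
        return "error_platform"
    return "error_customer_code"
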

indexify/executor/metrics/downloader.py
@@ -0,0 +1,69 @@
+import prometheus_client
+
+from ..monitoring.metrics import latency_metric_for_fast_operation
+
+# This file contains all metrics used by Downloader.
+
+# Graph download metrics
+metric_graph_downloads: prometheus_client.Counter = prometheus_client.Counter(
+    "task_graph_downloads",
+    "Number of task graph downloads, including downloads served from local cache",
+)
+metric_graph_download_errors: prometheus_client.Counter = prometheus_client.Counter(
+    "task_graph_download_errors",
+    "Number of task download errors, including downloads served from local cache",
+)
+metric_graphs_from_cache: prometheus_client.Counter = prometheus_client.Counter(
+    "task_graph_downloads_from_cache",
+    "Number of task graph downloads served from local cache",
+)
+metric_graph_download_latency: prometheus_client.Histogram = (
+    latency_metric_for_fast_operation(
+        "task_graph_download",
+        "task graph download, including downloads served from local cache",
+    )
+)
+metric_tasks_downloading_graphs: prometheus_client.Gauge = prometheus_client.Gauge(
+    "tasks_downloading_graphs",
+    "Number of tasks currently downloading their graphs, including local cache lookups",
+)
+
+# Task input download metrics
+metric_task_input_downloads: prometheus_client.Counter = prometheus_client.Counter(
+    "task_input_downloads", "Number of task input downloads"
+)
+metric_task_input_download_errors: prometheus_client.Counter = (
+    prometheus_client.Counter(
+        "task_input_download_errors", "Number of task input download errors"
+    )
+)
+metric_task_input_download_latency: prometheus_client.Histogram = (
+    latency_metric_for_fast_operation("task_input_download", "task input download")
+)
+metric_tasks_downloading_inputs: prometheus_client.Gauge = prometheus_client.Gauge(
+    "tasks_downloading_inputs", "Number of tasks currently downloading their inputs"
+)
+
+# Reducer init value download metrics
+metric_reducer_init_value_downloads: prometheus_client.Counter = (
+    prometheus_client.Counter(
+        "task_reducer_init_value_downloads", "Number of reducer init value downloads"
+    )
+)
+metric_reducer_init_value_download_errors: prometheus_client.Counter = (
+    prometheus_client.Counter(
+        "task_reducer_init_value_download_errors",
+        "Number of reducer init value download errors",
+    )
+)
+metric_reducer_init_value_download_latency: prometheus_client.Histogram = (
+    latency_metric_for_fast_operation(
+        "task_reducer_init_value_download", "Task reducer init value download"
+    )
+)
+metric_tasks_downloading_reducer_init_value: prometheus_client.Gauge = (
+    prometheus_client.Gauge(
+        "tasks_downloading_reducer_init_value",
+        "Number of tasks currently downloading their reducer init values",
+    )
+)

indexify/executor/metrics/executor.py
@@ -0,0 +1,51 @@
+import prometheus_client
+
+from ..monitoring.metrics import latency_metric_for_fast_operation
+
+# This file contains all metrics used by Executor.
+
+# Executor overview metrics.
+metric_executor_info: prometheus_client.Info = prometheus_client.Info(
+    "executor", "Executor information"
+)
+metric_executor_state: prometheus_client.Enum = prometheus_client.Enum(
+    "executor_state",
+    "Current Executor state",
+    states=["starting", "running", "shutting_down"],
+)
+
+# Task statistics metrics.
+metric_tasks_fetched: prometheus_client.Counter = prometheus_client.Counter(
+    "tasks_fetched", "Number of tasks that were fetched from Server"
+)
+metric_tasks_completed: prometheus_client.Counter = prometheus_client.Counter(
+    "tasks_completed", "Number of tasks that were completed", ["outcome"]
+)
+METRIC_TASKS_COMPLETED_OUTCOME_ALL = "all"
+METRIC_TASKS_COMPLETED_OUTCOME_SUCCESS = "success"
+METRIC_TASKS_COMPLETED_OUTCOME_ERROR_CUSTOMER_CODE = "error_customer_code"
+METRIC_TASKS_COMPLETED_OUTCOME_ERROR_PLATFORM = "error_platform"
+metric_tasks_completed.labels(outcome=METRIC_TASKS_COMPLETED_OUTCOME_ALL)
+metric_tasks_completed.labels(outcome=METRIC_TASKS_COMPLETED_OUTCOME_SUCCESS)
+metric_tasks_completed.labels(
+    outcome=METRIC_TASKS_COMPLETED_OUTCOME_ERROR_CUSTOMER_CODE
+)
+metric_tasks_completed.labels(outcome=METRIC_TASKS_COMPLETED_OUTCOME_ERROR_PLATFORM)
+
+# Task outcome reporting metrics.
+metric_task_outcome_reports: prometheus_client.Counter = prometheus_client.Counter(
+    "task_outcome_reports",
+    "Number of task outcome reports",
+)
+metric_tasks_reporting_outcome: prometheus_client.Gauge = prometheus_client.Gauge(
+    "tasks_reporting_outcome",
+    "Number of tasks currently reporting their outcomes",
+)
+metric_task_outcome_report_latency: prometheus_client.Histogram = (
+    latency_metric_for_fast_operation("task_outcome_report", "task outcome report")
+)
+metric_task_outcome_report_retries: prometheus_client.Counter = (
+    prometheus_client.Counter(
+        "tasks_outcome_report_retries", "Number of task outcome report retries"
+    )
+)
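
Calling `metric_tasks_completed.labels(outcome=...)` at import time is a common `prometheus_client` idiom: it pre-creates each labeled child so every outcome series is exported with a value of 0 before the first task completes, instead of appearing only after its first increment. At completion time the counter is then bumped per outcome, roughly as follows (the call sites are assumed; they live in `executor.py`, which is not shown here):

# Editor's illustrative sketch of assumed usage, not part of the diff.
from indexify.executor.metrics.executor import (
    METRIC_TASKS_COMPLETED_OUTCOME_ALL,
    METRIC_TASKS_COMPLETED_OUTCOME_SUCCESS,
    metric_tasks_completed,
)


def record_successful_task() -> None:
    # Every completion increments the "all" series plus its specific outcome series.
    metric_tasks_completed.labels(outcome=METRIC_TASKS_COMPLETED_OUTCOME_ALL).inc()
    metric_tasks_completed.labels(outcome=METRIC_TASKS_COMPLETED_OUTCOME_SUCCESS).inc()
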

indexify/executor/metrics/task_fetcher.py
@@ -0,0 +1,21 @@
+import prometheus_client
+
+from ..monitoring.metrics import latency_metric_for_fast_operation
+
+# This file contains all metrics used by TaskFetcher.
+
+metric_server_registrations: prometheus_client.Counter = prometheus_client.Counter(
+    "server_registration_requests",
+    "Number of Executor registrations requests sent to the Server",
+)
+metric_server_registration_errors: prometheus_client.Counter = (
+    prometheus_client.Counter(
+        "server_registration_request_errors",
+        "Number of failed Executor registration requests",
+    )
+)
+metric_server_registration_latency: prometheus_client.Histogram = (
+    latency_metric_for_fast_operation(
+        "server_registration_request", "Register Executor at the Server"
+    )
+)

indexify/executor/metrics/task_reporter.py
@@ -0,0 +1,22 @@
+import prometheus_client
+
+from ..monitoring.metrics import latency_metric_for_fast_operation
+
+# This file contains all metrics used by TaskReporter.
+
+metric_server_ingest_files_requests: prometheus_client.Counter = (
+    prometheus_client.Counter(
+        "server_ingest_files_requests", "Number of Server ingest files requests"
+    )
+)
+metric_server_ingest_files_errors: prometheus_client.Counter = (
+    prometheus_client.Counter(
+        "server_ingest_files_request_errors",
+        "Number of Server ingest files request errors",
+    )
+)
+metric_server_ingest_files_latency: prometheus_client.Histogram = (
+    latency_metric_for_fast_operation(
+        "server_ingest_files_request", "Ingest files request to Server"
+    )
+)

indexify/executor/metrics/task_runner.py
@@ -0,0 +1,45 @@
+import prometheus_client
+
+from ..monitoring.metrics import latency_metric_for_customer_controlled_operation
+
+# This file contains all metrics used by TaskRunner.
+
+# Metrics for the stage when task is blocked by the current policy.
+metric_task_policy_runs: prometheus_client.Counter = prometheus_client.Counter(
+    "task_policy_runs",
+    "Number of task execution policy runs",
+)
+metric_task_policy_errors: prometheus_client.Counter = prometheus_client.Counter(
+    "task_policy_errors",
+    "Number of errors while running task execution policy",
+)
+metric_task_policy_latency: prometheus_client.Histogram = (
+    latency_metric_for_customer_controlled_operation(
+        "task_policy",
+        "Task execution blocked by the policy",
+    )
+)
+metric_tasks_blocked_by_policy: prometheus_client.Gauge = prometheus_client.Gauge(
+    "tasks_blocked_by_policy",
+    "Number of tasks that are ready for execution but are blocked according to the current policy (typically waiting for a free Function Executor)",
+)
+
+# Metrics for the stage when task is running.
+metric_task_runs: prometheus_client.Counter = prometheus_client.Counter(
+    "task_runs",
+    "Number of task runs",
+)
+metric_task_run_platform_errors: prometheus_client.Counter = prometheus_client.Counter(
+    "task_run_platform_errors",
+    "Number of platform errors while running task",
+)
+metric_task_run_latency: prometheus_client.Histogram = (
+    latency_metric_for_customer_controlled_operation(
+        "task_run",
+        "run task from the moment it is unblocked by the policy until it finishes",
+    )
+)
+metric_tasks_running: prometheus_client.Gauge = prometheus_client.Gauge(
+    "tasks_running",
+    "Number of running tasks",
+)
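
The blocked/running gauges pair naturally with `prometheus_client`'s `Gauge.track_inprogress()` context manager, which increments on entry and decrements on exit. A hedged sketch of how a task runner could combine them with the latency histograms above (the actual `task_runner.py` changes are not shown in this excerpt, so the wiring is an assumption):

# Editor's illustrative sketch, not part of the diff; the shipped task_runner.py may differ.
from indexify.executor.metrics.task_runner import (
    metric_task_run_latency,
    metric_tasks_blocked_by_policy,
    metric_tasks_running,
)


async def run_when_allowed(acquire_slot, run_task) -> None:
    # Waiting for a free Function Executor counts as "blocked by policy".
    with metric_tasks_blocked_by_policy.track_inprogress():
        await acquire_slot()
    # The run itself is tracked both as an in-progress gauge and a latency sample.
    with metric_tasks_running.track_inprogress(), metric_task_run_latency.time():
        await run_task()
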

indexify/executor/monitoring/function_allowlist.py
@@ -0,0 +1,25 @@
+from typing import Dict, List, Optional
+
+from ..api_objects import FunctionURI
+
+
+def function_allowlist_to_info_dict(
+    function_allowlist: Optional[List[FunctionURI]],
+) -> Dict[str, str]:
+    if function_allowlist is None:
+        return {"function_allowlist": "None"}
+
+    info = {}
+    counter = 0
+    for function_uri in function_allowlist:
+        function_uri: FunctionURI
+        info[f"function_allowlist_{counter}"] = ":".join(
+            [
+                function_uri.namespace,
+                function_uri.compute_graph,
+                function_uri.compute_fn,
+                str(function_uri.version),
+            ]
+        )
+        counter += 1
+    return info
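
`function_allowlist_to_info_dict` flattens the allowlist into the string-to-string mapping that `prometheus_client.Info` expects, so it presumably feeds `metric_executor_info` from `metrics/executor.py`. A minimal sketch of that assumed wiring:

# Editor's illustrative sketch of assumed usage, not part of the diff.
from indexify.executor.metrics.executor import metric_executor_info
from indexify.executor.monitoring.function_allowlist import (
    function_allowlist_to_info_dict,
)


def publish_executor_info(version: str, function_allowlist) -> None:
    # Info metrics export their values as labels on a single "executor_info" sample.
    metric_executor_info.info(
        {"version": version, **function_allowlist_to_info_dict(function_allowlist)}
    )
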

indexify/executor/monitoring/health_check_handler.py
@@ -0,0 +1,20 @@
+from aiohttp import web
+
+from .handler import Handler
+from .health_checker.health_checker import HealthChecker, HealthCheckResult
+
+
+class HealthCheckHandler(Handler):
+    def __init__(self, health_checker: HealthChecker):
+        self._health_checker = health_checker
+
+    async def handle(self, request: web.Request) -> web.Response:
+        result: HealthCheckResult = await self._health_checker.check()
+        return web.json_response(
+            {
+                "status": "ok" if result.is_success else "nok",
+                "message": result.status_message,
+                "checker": result.checker_name,
+            },
+            status=200 if result.is_success else 503,
+        )
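
`HealthCheckHandler` returns HTTP 200 with a small JSON body when the checker passes and 503 otherwise, which is the shape most orchestrators expect from a liveness or readiness probe. The route registration itself lives in `monitoring/server.py` (+41 lines, not shown); an assumed aiohttp wiring could look like this, with the URL path being an assumption:

# Editor's illustrative sketch of assumed wiring, not part of the diff.
from aiohttp import web

from indexify.executor.monitoring.health_check_handler import HealthCheckHandler
from indexify.executor.monitoring.health_checker.generic_health_checker import (
    GenericHealthChecker,
)


def build_monitoring_app() -> web.Application:
    app = web.Application()
    handler = HealthCheckHandler(GenericHealthChecker())
    # Path chosen for the example only; the real route is defined in server.py.
    app.router.add_get("/monitoring/health", handler.handle)
    return app
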

indexify/executor/monitoring/health_checker/generic_health_checker.py
@@ -0,0 +1,58 @@
+from typing import Optional
+
+from ...function_executor.function_executor_states_container import (
+    FunctionExecutorStatesContainer,
+)
+from .health_checker import HealthChecker, HealthCheckResult
+
+HEALTH_CHECKER_NAME = "GenericHealthChecker"
+
+
+class GenericHealthChecker(HealthChecker):
+    """A generic health checker that doesn't depend on machine type and other features of the environment.
+
+    The health checker uses software signals available in all environments like Function Executor failure rates.
+    """
+
+    def __init__(self):
+        self._function_executor_states: Optional[FunctionExecutorStatesContainer] = None
+
+    def set_function_executor_states_container(
+        self, states: FunctionExecutorStatesContainer
+    ):
+        self._function_executor_states = states
+
+    async def check(self) -> HealthCheckResult:
+        if self._function_executor_states is None:
+            return HealthCheckResult(
+                is_success=False,
+                status_message="Function Executor states container was not provided yet",
+                checker_name=HEALTH_CHECKER_NAME,
+            )
+
+        # Current health check policy and reasoning:
+        # * A Function Executor health check failure is a strong signal that something is wrong
+        #   either with:
+        #   - The Function Code (a criticial software bug).
+        #   - The Executor machine/container/VM (a software bug or malfunctioning local hardware).
+        # * Critical Function Code bugs tend to get fixed eventually by users. What doesn't get fixed eventually
+        #   is rare but recurring local Executor issues like hardware errors and software bugs in middleware like
+        #   drivers.
+        # * Such issues tend to get mitigated by automatically recreating the Executor machine/VM/container.
+        # * So we fail whole Executor health check if a Function Executor health check ever failed to hint the users
+        #   that we probably need to recreate the Executor machine/VM/container (unless there's a bug in Function
+        #   code that user can investigate themself).
+        async for state in self._function_executor_states:
+            # No need to async lock the state to read a single value.
+            if state.health_check_failed:
+                return HealthCheckResult(
+                    is_success=False,
+                    status_message="A Function Executor health check failed",
+                    checker_name=HEALTH_CHECKER_NAME,
+                )
+
+        return HealthCheckResult(
+            is_success=True,
+            status_message="All Function Executors pass health checks",
+            checker_name=HEALTH_CHECKER_NAME,
+        )

indexify/executor/monitoring/health_checker/health_checker.py
@@ -0,0 +1,23 @@
+from ...function_executor.function_executor_states_container import (
+    FunctionExecutorStatesContainer,
+)
+
+
+class HealthCheckResult:
+    def __init__(self, checker_name: str, is_success: bool, status_message: str):
+        self.checker_name = checker_name
+        self.is_success = is_success
+        self.status_message = status_message
+
+
+class HealthChecker:
+    """Abstract base class for health checkers."""
+
+    def set_function_executor_states_container(
+        self, states: FunctionExecutorStatesContainer
+    ):
+        """Provides function executor states to this health checker so it can use them in the health checks."""
+        raise NotImplementedError("Subclasses must implement this method.")
+
+    async def check(self) -> HealthCheckResult:
+        raise NotImplementedError("Subclasses must implement this method.")