indexify 0.3.9__py3-none-any.whl → 0.3.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. indexify/cli/cli.py +35 -6
  2. indexify/executor/api_objects.py +4 -0
  3. indexify/executor/downloader.py +45 -5
  4. indexify/executor/executor.py +103 -16
  5. indexify/executor/function_executor/function_executor.py +174 -55
  6. indexify/executor/function_executor/function_executor_state.py +6 -0
  7. indexify/executor/function_executor/function_executor_states_container.py +64 -0
  8. indexify/executor/function_executor/health_checker.py +20 -10
  9. indexify/executor/function_executor/invocation_state_client.py +31 -6
  10. indexify/executor/function_executor/metrics/function_executor.py +142 -0
  11. indexify/executor/function_executor/metrics/function_executor_state.py +10 -0
  12. indexify/executor/function_executor/metrics/function_executor_state_container.py +10 -0
  13. indexify/executor/function_executor/metrics/health_checker.py +14 -0
  14. indexify/executor/function_executor/metrics/invocation_state_client.py +45 -0
  15. indexify/executor/function_executor/metrics/single_task_runner.py +22 -0
  16. indexify/executor/function_executor/single_task_runner.py +44 -15
  17. indexify/executor/function_executor/task_output.py +7 -1
  18. indexify/executor/metrics/downloader.py +69 -0
  19. indexify/executor/metrics/executor.py +51 -0
  20. indexify/executor/metrics/task_fetcher.py +21 -0
  21. indexify/executor/metrics/task_reporter.py +22 -0
  22. indexify/executor/metrics/task_runner.py +45 -0
  23. indexify/executor/monitoring/function_allowlist.py +25 -0
  24. indexify/executor/monitoring/handler.py +8 -0
  25. indexify/executor/monitoring/health_check_handler.py +20 -0
  26. indexify/executor/monitoring/health_checker/generic_health_checker.py +58 -0
  27. indexify/executor/monitoring/health_checker/health_checker.py +23 -0
  28. indexify/executor/monitoring/metrics.py +245 -0
  29. indexify/executor/monitoring/prometheus_metrics_handler.py +18 -0
  30. indexify/executor/monitoring/server.py +41 -0
  31. indexify/executor/monitoring/startup_probe_handler.py +17 -0
  32. indexify/executor/task_fetcher.py +15 -1
  33. indexify/executor/task_reporter.py +24 -7
  34. indexify/executor/task_runner.py +64 -46
  35. {indexify-0.3.9.dist-info → indexify-0.3.10.dist-info}/METADATA +4 -2
  36. indexify-0.3.10.dist-info/RECORD +46 -0
  37. indexify-0.3.9.dist-info/RECORD +0 -25
  38. {indexify-0.3.9.dist-info → indexify-0.3.10.dist-info}/WHEEL +0 -0
  39. {indexify-0.3.9.dist-info → indexify-0.3.10.dist-info}/entry_points.txt +0 -0
indexify/executor/function_executor/metrics/invocation_state_client.py
@@ -0,0 +1,45 @@
+ import prometheus_client
+
+ from ...monitoring.metrics import latency_metric_for_fast_operation
+
+ # This file contains all metrics used by InvocationStateClient.
+
+ # General metrics.
+ metric_request_read_errors: prometheus_client.Counter = prometheus_client.Counter(
+     "function_executor_invocation_state_client_request_read_errors",
+     "Number of failed request reads in Function Executor Invocation State client resulting in its early termination",
+ )
+
+ # Get invocation state key-value Server API metrics.
+ metric_server_get_state_requests: prometheus_client.Counter = prometheus_client.Counter(
+     "server_get_invocation_state_requests",
+     "Number of get invocation state requests sent to the Server on behalf of Function Executor",
+ )
+ metric_server_get_state_request_errors: prometheus_client.Counter = (
+     prometheus_client.Counter(
+         "server_get_invocation_state_request_errors",
+         "Server get invocation state request errors",
+     )
+ )
+ metric_server_get_state_request_latency: prometheus_client.Histogram = (
+     latency_metric_for_fast_operation(
+         "server_get_invocation_state_request", "Server get invocation state request"
+     )
+ )
+
+ # Set invocation state key-value Server API metrics.
+ metric_server_set_state_requests: prometheus_client.Counter = prometheus_client.Counter(
+     "server_set_invocation_state_requests",
+     "Number of set invocation state requests sent to the Server on behalf of Function Executor",
+ )
+ metric_server_set_state_request_errors: prometheus_client.Counter = (
+     prometheus_client.Counter(
+         "server_set_invocation_state_request_errors",
+         "Server set invocation state request errors",
+     )
+ )
+ metric_server_set_state_request_latency: prometheus_client.Histogram = (
+     latency_metric_for_fast_operation(
+         "server_set_invocation_state_request", "Server set invocation state request"
+     )
+ )
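Note: the latency_metric_for_fast_operation and latency_metric_for_customer_controlled_operation helpers used throughout these new metric modules come from indexify/executor/monitoring/metrics.py (file 28 above, +245 lines, not shown in this excerpt). For rough orientation only, such a helper could be a small Histogram factory along these lines; the real implementation and its bucket boundaries may differ:

```python
import prometheus_client


# Hypothetical sketch of a latency histogram factory; not the actual code
# from indexify/executor/monitoring/metrics.py.
def latency_metric_for_fast_operation(
    operation_name: str, operation_description: str
) -> prometheus_client.Histogram:
    # Buckets assumed to target operations expected to finish within seconds.
    return prometheus_client.Histogram(
        f"{operation_name}_latency_seconds",
        f"Latency of {operation_description} in seconds",
        buckets=(0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0),
    )
```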
indexify/executor/function_executor/metrics/single_task_runner.py
@@ -0,0 +1,22 @@
+ import prometheus_client
+
+ from ...monitoring.metrics import latency_metric_for_customer_controlled_operation
+
+ # This file contains all metrics used by SingleTaskRunner.
+
+ metric_function_executor_run_task_rpcs: prometheus_client.Counter = (
+     prometheus_client.Counter(
+         "function_executor_run_task_rpcs", "Number of Function Executor run task RPCs"
+     )
+ )
+ metric_function_executor_run_task_rpc_errors: prometheus_client.Counter = (
+     prometheus_client.Counter(
+         "function_executor_run_task_rpc_errors",
+         "Number of Function Executor run task RPC errors",
+     )
+ )
+ metric_function_executor_run_task_rpc_latency: prometheus_client.Histogram = (
+     latency_metric_for_customer_controlled_operation(
+         "function_executor_run_task_rpc", "Function Executor run task RPC"
+     )
+ )
indexify/executor/function_executor/single_task_runner.py
@@ -14,6 +14,11 @@ from tensorlake.function_executor.proto.function_executor_pb2_grpc import (
  from ..api_objects import Task
  from .function_executor import CustomerError, FunctionExecutor
  from .function_executor_state import FunctionExecutorState
+ from .metrics.single_task_runner import (
+     metric_function_executor_run_task_rpc_errors,
+     metric_function_executor_run_task_rpc_latency,
+     metric_function_executor_run_task_rpcs,
+ )
  from .server.function_executor_server_factory import (
      FunctionExecutorServerConfiguration,
      FunctionExecutorServerFactory,
@@ -54,12 +59,11 @@ class SingleTaskRunner:
          if self._state.is_shutdown:
              raise RuntimeError("Function Executor state is shutting down.")

-         # If Function Executor is not healthy then recreate it.
-         if self._state.function_executor is not None:
-             if not await self._state.function_executor.health_checker().check():
-                 self._logger.error("Health check failed, destroying FunctionExecutor.")
-                 await self._state.destroy_function_executor()
+         # If Function Executor became unhealthy while it was idle then destroy it.
+         # It'll be recreated below.
+         await self._destroy_existing_function_executor_if_unhealthy()

+         # Create Function Executor if it doesn't exist yet.
          if self._state.function_executor is None:
              try:
                  await self._create_function_executor()
@@ -70,7 +74,12 @@
                      success=False,
                  )

-         return await self._run()
+         try:
+             return await self._run()
+         finally:
+             # If Function Executor became unhealthy while running the task then destroy it.
+             # The periodic health checker might not notice this because it only checks periodically.
+             await self._destroy_existing_function_executor_if_unhealthy()

      async def _create_function_executor(self) -> FunctionExecutor:
          function_executor: FunctionExecutor = FunctionExecutor(
@@ -122,19 +131,39 @@
              health_check_failed_callback=self._health_check_failed_callback,
              function_executor_state=self._state,
          ):
-             # If this RPC failed due to customer code crashing the server we won't be
-             # able to detect this. We'll treat this as our own error for now and thus
-             # let the AioRpcError to be raised here.
-             response: RunTaskResponse = await FunctionExecutorStub(channel).run_task(
-                 request
-             )
+             with (
+                 metric_function_executor_run_task_rpc_errors.count_exceptions(),
+                 metric_function_executor_run_task_rpc_latency.time(),
+             ):
+                 metric_function_executor_run_task_rpcs.inc()
+                 # If this RPC failed due to customer code crashing the server we won't be
+                 # able to detect this. We'll treat this as our own error for now and thus
+                 # let the AioRpcError be raised here.
+                 response: RunTaskResponse = await FunctionExecutorStub(
+                     channel
+                 ).run_task(request)
              return _task_output(task=self._task_input.task, response=response)

      async def _health_check_failed_callback(self):
-         # The Function Executor needs to get recreated on next task run.
-         self._logger.error("Health check failed, destroying FunctionExecutor.")
+         # Destroying the Function Executor on a periodic health check failure ensures that
+         # a running task RPC stuck in an unhealthy Function Executor fails immediately.
          async with self._state.lock:
-             await self._state.destroy_function_executor()
+             if self._state.function_executor is not None:
+                 await self._destroy_function_executor_on_failed_health_check()
+
+     async def _destroy_existing_function_executor_if_unhealthy(self):
+         self._state.check_locked()
+         if self._state.function_executor is None:
+             return
+         if await self._state.function_executor.health_checker().check():
+             return
+         await self._destroy_function_executor_on_failed_health_check()
+
+     async def _destroy_function_executor_on_failed_health_check(self):
+         self._state.check_locked()
+         self._logger.error("Health check failed, destroying FunctionExecutor.")
+         self._state.health_check_failed = True
+         await self._state.destroy_function_executor()


  class _RunningTaskContextManager:
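The with block added to the run task RPC above combines two standard prometheus_client context managers: Counter.count_exceptions() increments the error counter when the body raises, and Histogram.time() observes the body's wall-clock duration. In isolation the pattern looks like this (illustrative names only, not the SingleTaskRunner code):

```python
import prometheus_client

rpcs = prometheus_client.Counter("example_rpcs", "Example RPC count")
rpc_errors = prometheus_client.Counter("example_rpc_errors", "Example RPC errors")
rpc_latency = prometheus_client.Histogram("example_rpc_latency_seconds", "Example RPC latency")


def call_rpc():
    # count_exceptions() increments rpc_errors if the body raises;
    # time() records how long the body took in rpc_latency.
    with rpc_errors.count_exceptions(), rpc_latency.time():
        rpcs.inc()
        ...  # the actual RPC call would go here
```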
indexify/executor/function_executor/task_output.py
@@ -20,6 +20,7 @@ class TaskOutput:
          stderr: Optional[str] = None,
          reducer: bool = False,
          success: bool = False,
+         is_internal_error: bool = False,
      ):
          self.task = task
          self.function_output = function_output
@@ -28,9 +29,14 @@
          self.stderr = stderr
          self.reducer = reducer
          self.success = success
+         self.is_internal_error = is_internal_error

      @classmethod
      def internal_error(cls, task: Task) -> "TaskOutput":
          """Creates a TaskOutput for an internal error."""
          # We are not sharing internal error messages with the customer.
-         return TaskOutput(task=task, stderr="Platform failed to execute the function.")
+         return TaskOutput(
+             task=task,
+             stderr="Platform failed to execute the function.",
+             is_internal_error=True,
+         )
indexify/executor/metrics/downloader.py
@@ -0,0 +1,69 @@
+ import prometheus_client
+
+ from ..monitoring.metrics import latency_metric_for_fast_operation
+
+ # This file contains all metrics used by Downloader.
+
+ # Graph download metrics
+ metric_graph_downloads: prometheus_client.Counter = prometheus_client.Counter(
+     "task_graph_downloads",
+     "Number of task graph downloads, including downloads served from local cache",
+ )
+ metric_graph_download_errors: prometheus_client.Counter = prometheus_client.Counter(
+     "task_graph_download_errors",
+     "Number of task download errors, including downloads served from local cache",
+ )
+ metric_graphs_from_cache: prometheus_client.Counter = prometheus_client.Counter(
+     "task_graph_downloads_from_cache",
+     "Number of task graph downloads served from local cache",
+ )
+ metric_graph_download_latency: prometheus_client.Histogram = (
+     latency_metric_for_fast_operation(
+         "task_graph_download",
+         "task graph download, including downloads served from local cache",
+     )
+ )
+ metric_tasks_downloading_graphs: prometheus_client.Gauge = prometheus_client.Gauge(
+     "tasks_downloading_graphs",
+     "Number of tasks currently downloading their graphs, including local cache lookups",
+ )
+
+ # Task input download metrics
+ metric_task_input_downloads: prometheus_client.Counter = prometheus_client.Counter(
+     "task_input_downloads", "Number of task input downloads"
+ )
+ metric_task_input_download_errors: prometheus_client.Counter = (
+     prometheus_client.Counter(
+         "task_input_download_errors", "Number of task input download errors"
+     )
+ )
+ metric_task_input_download_latency: prometheus_client.Histogram = (
+     latency_metric_for_fast_operation("task_input_download", "task input download")
+ )
+ metric_tasks_downloading_inputs: prometheus_client.Gauge = prometheus_client.Gauge(
+     "tasks_downloading_inputs", "Number of tasks currently downloading their inputs"
+ )
+
+ # Reducer init value download metrics
+ metric_reducer_init_value_downloads: prometheus_client.Counter = (
+     prometheus_client.Counter(
+         "task_reducer_init_value_downloads", "Number of reducer init value downloads"
+     )
+ )
+ metric_reducer_init_value_download_errors: prometheus_client.Counter = (
+     prometheus_client.Counter(
+         "task_reducer_init_value_download_errors",
+         "Number of reducer init value download errors",
+     )
+ )
+ metric_reducer_init_value_download_latency: prometheus_client.Histogram = (
+     latency_metric_for_fast_operation(
+         "task_reducer_init_value_download", "Task reducer init value download"
+     )
+ )
+ metric_tasks_downloading_reducer_init_value: prometheus_client.Gauge = (
+     prometheus_client.Gauge(
+         "tasks_downloading_reducer_init_value",
+         "Number of tasks currently downloading their reducer init values",
+     )
+ )
indexify/executor/metrics/executor.py
@@ -0,0 +1,51 @@
+ import prometheus_client
+
+ from ..monitoring.metrics import latency_metric_for_fast_operation
+
+ # This file contains all metrics used by Executor.
+
+ # Executor overview metrics.
+ metric_executor_info: prometheus_client.Info = prometheus_client.Info(
+     "executor", "Executor information"
+ )
+ metric_executor_state: prometheus_client.Enum = prometheus_client.Enum(
+     "executor_state",
+     "Current Executor state",
+     states=["starting", "running", "shutting_down"],
+ )
+
+ # Task statistics metrics.
+ metric_tasks_fetched: prometheus_client.Counter = prometheus_client.Counter(
+     "tasks_fetched", "Number of tasks that were fetched from Server"
+ )
+ metric_tasks_completed: prometheus_client.Counter = prometheus_client.Counter(
+     "tasks_completed", "Number of tasks that were completed", ["outcome"]
+ )
+ METRIC_TASKS_COMPLETED_OUTCOME_ALL = "all"
+ METRIC_TASKS_COMPLETED_OUTCOME_SUCCESS = "success"
+ METRIC_TASKS_COMPLETED_OUTCOME_ERROR_CUSTOMER_CODE = "error_customer_code"
+ METRIC_TASKS_COMPLETED_OUTCOME_ERROR_PLATFORM = "error_platform"
+ metric_tasks_completed.labels(outcome=METRIC_TASKS_COMPLETED_OUTCOME_ALL)
+ metric_tasks_completed.labels(outcome=METRIC_TASKS_COMPLETED_OUTCOME_SUCCESS)
+ metric_tasks_completed.labels(
+     outcome=METRIC_TASKS_COMPLETED_OUTCOME_ERROR_CUSTOMER_CODE
+ )
+ metric_tasks_completed.labels(outcome=METRIC_TASKS_COMPLETED_OUTCOME_ERROR_PLATFORM)
+
+ # Task outcome reporting metrics.
+ metric_task_outcome_reports: prometheus_client.Counter = prometheus_client.Counter(
+     "task_outcome_reports",
+     "Number of task outcome reports",
+ )
+ metric_tasks_reporting_outcome: prometheus_client.Gauge = prometheus_client.Gauge(
+     "tasks_reporting_outcome",
+     "Number of tasks currently reporting their outcomes",
+ )
+ metric_task_outcome_report_latency: prometheus_client.Histogram = (
+     latency_metric_for_fast_operation("task_outcome_report", "task outcome report")
+ )
+ metric_task_outcome_report_retries: prometheus_client.Counter = (
+     prometheus_client.Counter(
+         "tasks_outcome_report_retries", "Number of task outcome report retries"
+     )
+ )
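The labels(...) calls above pre-create every outcome series so each one is exported with a value of 0 from Executor startup. When a task finishes, the reporting code can then increment both the aggregate and the specific outcome, roughly like this (illustrative usage, not the exact code from executor.py):

```python
from indexify.executor.metrics.executor import (
    METRIC_TASKS_COMPLETED_OUTCOME_ALL,
    METRIC_TASKS_COMPLETED_OUTCOME_SUCCESS,
    metric_tasks_completed,
)

# Record one successfully completed task.
metric_tasks_completed.labels(outcome=METRIC_TASKS_COMPLETED_OUTCOME_ALL).inc()
metric_tasks_completed.labels(outcome=METRIC_TASKS_COMPLETED_OUTCOME_SUCCESS).inc()
```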
indexify/executor/metrics/task_fetcher.py
@@ -0,0 +1,21 @@
+ import prometheus_client
+
+ from ..monitoring.metrics import latency_metric_for_fast_operation
+
+ # This file contains all metrics used by TaskFetcher.
+
+ metric_server_registrations: prometheus_client.Counter = prometheus_client.Counter(
+     "server_registration_requests",
+     "Number of Executor registration requests sent to the Server",
+ )
+ metric_server_registration_errors: prometheus_client.Counter = (
+     prometheus_client.Counter(
+         "server_registration_request_errors",
+         "Number of failed Executor registration requests",
+     )
+ )
+ metric_server_registration_latency: prometheus_client.Histogram = (
+     latency_metric_for_fast_operation(
+         "server_registration_request", "Register Executor at the Server"
+     )
+ )
indexify/executor/metrics/task_reporter.py
@@ -0,0 +1,22 @@
+ import prometheus_client
+
+ from ..monitoring.metrics import latency_metric_for_fast_operation
+
+ # This file contains all metrics used by TaskReporter.
+
+ metric_server_ingest_files_requests: prometheus_client.Counter = (
+     prometheus_client.Counter(
+         "server_ingest_files_requests", "Number of Server ingest files requests"
+     )
+ )
+ metric_server_ingest_files_errors: prometheus_client.Counter = (
+     prometheus_client.Counter(
+         "server_ingest_files_request_errors",
+         "Number of Server ingest files request errors",
+     )
+ )
+ metric_server_ingest_files_latency: prometheus_client.Histogram = (
+     latency_metric_for_fast_operation(
+         "server_ingest_files_request", "Ingest files request to Server"
+     )
+ )
indexify/executor/metrics/task_runner.py
@@ -0,0 +1,45 @@
+ import prometheus_client
+
+ from ..monitoring.metrics import latency_metric_for_customer_controlled_operation
+
+ # This file contains all metrics used by TaskRunner.
+
+ # Metrics for the stage when task is blocked by the current policy.
+ metric_task_policy_runs: prometheus_client.Counter = prometheus_client.Counter(
+     "task_policy_runs",
+     "Number of task execution policy runs",
+ )
+ metric_task_policy_errors: prometheus_client.Counter = prometheus_client.Counter(
+     "task_policy_errors",
+     "Number of errors while running task execution policy",
+ )
+ metric_task_policy_latency: prometheus_client.Histogram = (
+     latency_metric_for_customer_controlled_operation(
+         "task_policy",
+         "Task execution blocked by the policy",
+     )
+ )
+ metric_tasks_blocked_by_policy: prometheus_client.Gauge = prometheus_client.Gauge(
+     "tasks_blocked_by_policy",
+     "Number of tasks that are ready for execution but are blocked according to the current policy (typically waiting for a free Function Executor)",
+ )
+
+ # Metrics for the stage when task is running.
+ metric_task_runs: prometheus_client.Counter = prometheus_client.Counter(
+     "task_runs",
+     "Number of task runs",
+ )
+ metric_task_run_platform_errors: prometheus_client.Counter = prometheus_client.Counter(
+     "task_run_platform_errors",
+     "Number of platform errors while running task",
+ )
+ metric_task_run_latency: prometheus_client.Histogram = (
+     latency_metric_for_customer_controlled_operation(
+         "task_run",
+         "run task from the moment it is unblocked by the policy until it finishes",
+     )
+ )
+ metric_tasks_running: prometheus_client.Gauge = prometheus_client.Gauge(
+     "tasks_running",
+     "Number of running tasks",
+ )
indexify/executor/monitoring/function_allowlist.py
@@ -0,0 +1,25 @@
+ from typing import Dict, List, Optional
+
+ from ..api_objects import FunctionURI
+
+
+ def function_allowlist_to_info_dict(
+     function_allowlist: Optional[List[FunctionURI]],
+ ) -> Dict[str, str]:
+     if function_allowlist is None:
+         return {"function_allowlist": "None"}
+
+     info = {}
+     counter = 0
+     for function_uri in function_allowlist:
+         function_uri: FunctionURI
+         info[f"function_allowlist_{counter}"] = ":".join(
+             [
+                 function_uri.namespace,
+                 function_uri.compute_graph,
+                 function_uri.compute_fn,
+                 str(function_uri.version),
+             ]
+         )
+         counter += 1
+     return info
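For illustration, the helper flattens each allowlist entry into one "namespace:graph:function:version" string keyed by its position. The snippet below uses a hypothetical stand-in object with the same four attributes, since constructing the real FunctionURI model from api_objects.py is not shown in this diff:

```python
from dataclasses import dataclass

from indexify.executor.monitoring.function_allowlist import function_allowlist_to_info_dict


# Hypothetical stand-in exposing the four attributes the helper reads;
# the real FunctionURI model is defined in indexify/executor/api_objects.py.
@dataclass
class FakeFunctionURI:
    namespace: str = "default"
    compute_graph: str = "my_graph"
    compute_fn: str = "my_fn"
    version: str = "1"


print(function_allowlist_to_info_dict([FakeFunctionURI()]))
# {'function_allowlist_0': 'default:my_graph:my_fn:1'}
print(function_allowlist_to_info_dict(None))
# {'function_allowlist': 'None'}
```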
indexify/executor/monitoring/handler.py
@@ -0,0 +1,8 @@
+ from aiohttp import web
+
+
+ class Handler:
+     """Abstract base class for all request handlers."""
+
+     async def handle(self, request: web.Request) -> web.Response:
+         raise NotImplementedError("Subclasses must implement this method.")
indexify/executor/monitoring/health_check_handler.py
@@ -0,0 +1,20 @@
+ from aiohttp import web
+
+ from .handler import Handler
+ from .health_checker.health_checker import HealthChecker, HealthCheckResult
+
+
+ class HealthCheckHandler(Handler):
+     def __init__(self, health_checker: HealthChecker):
+         self._health_checker = health_checker
+
+     async def handle(self, request: web.Request) -> web.Response:
+         result: HealthCheckResult = await self._health_checker.check()
+         return web.json_response(
+             {
+                 "status": "ok" if result.is_success else "nok",
+                 "message": result.status_message,
+                 "checker": result.checker_name,
+             },
+             status=200 if result.is_success else 503,
+         )
indexify/executor/monitoring/health_checker/generic_health_checker.py
@@ -0,0 +1,58 @@
+ from typing import Optional
+
+ from ...function_executor.function_executor_states_container import (
+     FunctionExecutorStatesContainer,
+ )
+ from .health_checker import HealthChecker, HealthCheckResult
+
+ HEALTH_CHECKER_NAME = "GenericHealthChecker"
+
+
+ class GenericHealthChecker(HealthChecker):
+     """A generic health checker that doesn't depend on machine type or other features of the environment.
+
+     The health checker uses software signals available in all environments, like Function Executor failure rates.
+     """
+
+     def __init__(self):
+         self._function_executor_states: Optional[FunctionExecutorStatesContainer] = None
+
+     def set_function_executor_states_container(
+         self, states: FunctionExecutorStatesContainer
+     ):
+         self._function_executor_states = states
+
+     async def check(self) -> HealthCheckResult:
+         if self._function_executor_states is None:
+             return HealthCheckResult(
+                 is_success=False,
+                 status_message="Function Executor states container was not provided yet",
+                 checker_name=HEALTH_CHECKER_NAME,
+             )
+
+         # Current health check policy and reasoning:
+         # * A Function Executor health check failure is a strong signal that something is wrong
+         #   with either:
+         #   - The Function Code (a critical software bug).
+         #   - The Executor machine/container/VM (a software bug or malfunctioning local hardware).
+         # * Critical Function Code bugs tend to get fixed eventually by users. What doesn't get fixed eventually
+         #   are rare but recurring local Executor issues like hardware errors and software bugs in middleware like
+         #   drivers.
+         # * Such issues tend to get mitigated by automatically recreating the Executor machine/VM/container.
+         # * So we fail the whole Executor health check if a Function Executor health check ever failed, to hint to users
+         #   that the Executor machine/VM/container probably needs to be recreated (unless there's a bug in Function
+         #   code that the user can investigate themselves).
+         async for state in self._function_executor_states:
+             # No need to async lock the state to read a single value.
+             if state.health_check_failed:
+                 return HealthCheckResult(
+                     is_success=False,
+                     status_message="A Function Executor health check failed",
+                     checker_name=HEALTH_CHECKER_NAME,
+                 )
+
+         return HealthCheckResult(
+             is_success=True,
+             status_message="All Function Executors pass health checks",
+             checker_name=HEALTH_CHECKER_NAME,
+         )
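HealthCheckHandler and GenericHealthChecker are wired together by the aiohttp-based monitoring server added in indexify/executor/monitoring/server.py (file 30 above, not shown in this excerpt). A minimal sketch of how they could be mounted; the route path and port here are assumptions, not necessarily what the Executor actually uses:

```python
from aiohttp import web

from indexify.executor.monitoring.health_check_handler import HealthCheckHandler
from indexify.executor.monitoring.health_checker.generic_health_checker import (
    GenericHealthChecker,
)

app = web.Application()
handler = HealthCheckHandler(GenericHealthChecker())
# "/monitoring/health" is a hypothetical path used only for this sketch.
app.add_routes([web.get("/monitoring/health", handler.handle)])

if __name__ == "__main__":
    web.run_app(app, port=7000)  # port chosen arbitrarily for this sketch
```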
indexify/executor/monitoring/health_checker/health_checker.py
@@ -0,0 +1,23 @@
+ from ...function_executor.function_executor_states_container import (
+     FunctionExecutorStatesContainer,
+ )
+
+
+ class HealthCheckResult:
+     def __init__(self, checker_name: str, is_success: bool, status_message: str):
+         self.checker_name = checker_name
+         self.is_success = is_success
+         self.status_message = status_message
+
+
+ class HealthChecker:
+     """Abstract base class for health checkers."""
+
+     def set_function_executor_states_container(
+         self, states: FunctionExecutorStatesContainer
+     ):
+         """Provides function executor states to this health checker so it can use them in the health checks."""
+         raise NotImplementedError("Subclasses must implement this method.")
+
+     async def check(self) -> HealthCheckResult:
+         raise NotImplementedError("Subclasses must implement this method.")