indexify 0.3.31__py3-none-any.whl → 0.4.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. indexify/cli/__init__.py +18 -0
  2. indexify/cli/build_image.py +51 -0
  3. indexify/cli/deploy.py +57 -0
  4. indexify/cli/executor.py +205 -0
  5. indexify/executor/{grpc/channel_manager.py → channel_manager.py} +17 -11
  6. indexify/executor/executor.py +57 -313
  7. indexify/executor/function_allowlist.py +59 -0
  8. indexify/executor/function_executor/function_executor.py +12 -6
  9. indexify/executor/function_executor/invocation_state_client.py +25 -3
  10. indexify/executor/function_executor/server/function_executor_server_factory.py +3 -3
  11. indexify/executor/function_executor/server/subprocess_function_executor_server_factory.py +22 -11
  12. indexify/executor/function_executor_controller/__init__.py +13 -0
  13. indexify/executor/function_executor_controller/completed_task_metrics.py +82 -0
  14. indexify/executor/function_executor_controller/create_function_executor.py +158 -0
  15. indexify/executor/function_executor_controller/debug_event_loop.py +37 -0
  16. indexify/executor/function_executor_controller/destroy_function_executor.py +28 -0
  17. indexify/executor/function_executor_controller/downloads.py +199 -0
  18. indexify/executor/function_executor_controller/events.py +172 -0
  19. indexify/executor/function_executor_controller/function_executor_controller.py +759 -0
  20. indexify/executor/function_executor_controller/loggers.py +57 -0
  21. indexify/executor/function_executor_controller/message_validators.py +69 -0
  22. indexify/executor/function_executor_controller/metrics/completed_task_metrics.py +68 -0
  23. indexify/executor/{metrics/downloader.py → function_executor_controller/metrics/downloads.py} +1 -3
  24. indexify/executor/function_executor_controller/metrics/function_executor_controller.py +60 -0
  25. indexify/executor/{function_executor/metrics/single_task_runner.py → function_executor_controller/metrics/run_task.py} +9 -3
  26. indexify/executor/function_executor_controller/metrics/upload_task_output.py +39 -0
  27. indexify/executor/function_executor_controller/prepare_task.py +38 -0
  28. indexify/executor/function_executor_controller/run_task.py +201 -0
  29. indexify/executor/function_executor_controller/task_info.py +33 -0
  30. indexify/executor/function_executor_controller/task_output.py +122 -0
  31. indexify/executor/function_executor_controller/upload_task_output.py +234 -0
  32. indexify/executor/host_resources/host_resources.py +20 -25
  33. indexify/executor/host_resources/nvidia_gpu_allocator.py +8 -1
  34. indexify/executor/{grpc/metrics → metrics}/channel_manager.py +1 -1
  35. indexify/executor/metrics/executor.py +0 -47
  36. indexify/executor/{grpc/metrics → metrics}/state_reconciler.py +1 -1
  37. indexify/executor/{grpc/metrics → metrics}/state_reporter.py +1 -1
  38. indexify/executor/monitoring/health_checker/generic_health_checker.py +6 -59
  39. indexify/executor/monitoring/health_checker/health_checker.py +0 -11
  40. indexify/executor/{grpc/state_reconciler.py → state_reconciler.py} +139 -141
  41. indexify/executor/state_reporter.py +364 -0
  42. indexify/proto/executor_api.proto +68 -60
  43. indexify/proto/executor_api_pb2.py +52 -52
  44. indexify/proto/executor_api_pb2.pyi +129 -108
  45. indexify/proto/executor_api_pb2_grpc.py +0 -47
  46. {indexify-0.3.31.dist-info → indexify-0.4.3.dist-info}/METADATA +2 -5
  47. indexify-0.4.3.dist-info/RECORD +68 -0
  48. indexify-0.4.3.dist-info/entry_points.txt +3 -0
  49. indexify/cli/cli.py +0 -268
  50. indexify/executor/api_objects.py +0 -92
  51. indexify/executor/downloader.py +0 -417
  52. indexify/executor/executor_flavor.py +0 -7
  53. indexify/executor/function_executor/function_executor_state.py +0 -107
  54. indexify/executor/function_executor/function_executor_states_container.py +0 -93
  55. indexify/executor/function_executor/function_executor_status.py +0 -95
  56. indexify/executor/function_executor/metrics/function_executor_state.py +0 -46
  57. indexify/executor/function_executor/metrics/function_executor_state_container.py +0 -10
  58. indexify/executor/function_executor/single_task_runner.py +0 -345
  59. indexify/executor/function_executor/task_input.py +0 -21
  60. indexify/executor/function_executor/task_output.py +0 -105
  61. indexify/executor/grpc/function_executor_controller.py +0 -418
  62. indexify/executor/grpc/metrics/task_controller.py +0 -8
  63. indexify/executor/grpc/state_reporter.py +0 -317
  64. indexify/executor/grpc/task_controller.py +0 -508
  65. indexify/executor/metrics/task_fetcher.py +0 -21
  66. indexify/executor/metrics/task_reporter.py +0 -53
  67. indexify/executor/metrics/task_runner.py +0 -52
  68. indexify/executor/monitoring/function_allowlist.py +0 -25
  69. indexify/executor/runtime_probes.py +0 -68
  70. indexify/executor/task_fetcher.py +0 -96
  71. indexify/executor/task_reporter.py +0 -459
  72. indexify/executor/task_runner.py +0 -177
  73. indexify-0.3.31.dist-info/RECORD +0 -68
  74. indexify-0.3.31.dist-info/entry_points.txt +0 -3
  75. {indexify-0.3.31.dist-info → indexify-0.4.3.dist-info}/WHEEL +0 -0
@@ -0,0 +1,122 @@
1
+ from typing import Dict, List, Optional
2
+
3
+ from tensorlake.function_executor.proto.function_executor_pb2 import (
4
+ FunctionOutput,
5
+ RouterOutput,
6
+ )
7
+
8
+ from indexify.proto.executor_api_pb2 import (
9
+ DataPayload,
10
+ Task,
11
+ TaskFailureReason,
12
+ TaskOutcomeCode,
13
+ )
14
+
15
+
16
+ class TaskMetrics:
17
+ """Metrics for a task."""
18
+
19
+ def __init__(self, counters: Dict[str, int], timers: Dict[str, float]):
20
+ self.counters = counters
21
+ self.timers = timers
22
+
23
+
24
+ class TaskOutput:
25
+ """Result of running a task."""
26
+
27
+ def __init__(
28
+ self,
29
+ task: Task,
30
+ allocation_id: str,
31
+ outcome_code: TaskOutcomeCode,
32
+ # Optional[TaskFailureReason] is not supported in python 3.9
33
+ failure_reason: TaskFailureReason = None,
34
+ output_encoding: Optional[str] = None,
35
+ function_output: Optional[FunctionOutput] = None,
36
+ router_output: Optional[RouterOutput] = None,
37
+ stdout: Optional[str] = None,
38
+ stderr: Optional[str] = None,
39
+ reducer: bool = False,
40
+ metrics: Optional[TaskMetrics] = None,
41
+ uploaded_data_payloads: List[DataPayload] = [],
42
+ uploaded_stdout: Optional[DataPayload] = None,
43
+ uploaded_stderr: Optional[DataPayload] = None,
44
+ ):
45
+ self.task = task
46
+ self.allocation_id = allocation_id
47
+ self.function_output = function_output
48
+ self.router_output = router_output
49
+ self.stdout = stdout
50
+ self.stderr = stderr
51
+ self.reducer = reducer
52
+ self.outcome_code = outcome_code
53
+ self.failure_reason = failure_reason
54
+ self.metrics = metrics
55
+ self.output_encoding = output_encoding
56
+ self.uploaded_data_payloads = uploaded_data_payloads
57
+ self.uploaded_stdout = uploaded_stdout
58
+ self.uploaded_stderr = uploaded_stderr
59
+
60
+ @classmethod
61
+ def internal_error(
62
+ cls,
63
+ task: Task,
64
+ allocation_id: str,
65
+ ) -> "TaskOutput":
66
+ """Creates a TaskOutput for an internal error."""
67
+ # We are not sharing internal error messages with the customer.
68
+ return TaskOutput(
69
+ task=task,
70
+ allocation_id=allocation_id,
71
+ outcome_code=TaskOutcomeCode.TASK_OUTCOME_CODE_FAILURE,
72
+ failure_reason=TaskFailureReason.TASK_FAILURE_REASON_INTERNAL_ERROR,
73
+ stderr="Platform failed to execute the function.",
74
+ )
75
+
76
+ @classmethod
77
+ def function_timeout(
78
+ cls,
79
+ task: Task,
80
+ allocation_id: str,
81
+ timeout_sec: float,
82
+ ) -> "TaskOutput":
83
+ """Creates a TaskOutput for a function timeout error."""
84
+ # Task stdout, stderr is not available.
85
+ return TaskOutput(
86
+ task=task,
87
+ allocation_id=allocation_id,
88
+ outcome_code=TaskOutcomeCode.TASK_OUTCOME_CODE_FAILURE,
89
+ failure_reason=TaskFailureReason.TASK_FAILURE_REASON_FUNCTION_TIMEOUT,
90
+ stderr=f"Function exceeded its configured timeout of {timeout_sec:.3f} sec.",
91
+ )
92
+
93
+ @classmethod
94
+ def task_cancelled(
95
+ cls,
96
+ task: Task,
97
+ allocation_id: str,
98
+ ) -> "TaskOutput":
99
+ """Creates a TaskOutput for the case when task didn't finish because its allocation was removed by Server."""
100
+ return TaskOutput(
101
+ task=task,
102
+ allocation_id=allocation_id,
103
+ outcome_code=TaskOutcomeCode.TASK_OUTCOME_CODE_FAILURE,
104
+ failure_reason=TaskFailureReason.TASK_FAILURE_REASON_TASK_CANCELLED,
105
+ )
106
+
107
+ @classmethod
108
+ def function_executor_terminated(
109
+ cls,
110
+ task: Task,
111
+ allocation_id: str,
112
+ ) -> "TaskOutput":
113
+ """Creates a TaskOutput for the case when task didn't run because its FE terminated."""
114
+ return TaskOutput(
115
+ task=task,
116
+ allocation_id=allocation_id,
117
+ outcome_code=TaskOutcomeCode.TASK_OUTCOME_CODE_FAILURE,
118
+ failure_reason=TaskFailureReason.TASK_FAILURE_REASON_FUNCTION_EXECUTOR_TERMINATED,
119
+ # TODO: add FE startup stdout, stderr to the task output if FE failed to startup.
120
+ stdout="",
121
+ stderr="Can't execute the function because its Function Executor terminated.",
122
+ )
@@ -0,0 +1,234 @@
1
+ import asyncio
2
+ import hashlib
3
+ import time
4
+ from typing import Any
5
+
6
+ from indexify.executor.blob_store.blob_store import BLOBStore
7
+ from indexify.proto.executor_api_pb2 import (
8
+ DataPayload,
9
+ DataPayloadEncoding,
10
+ )
11
+
12
+ from .events import TaskOutputUploadFinished
13
+ from .metrics.upload_task_output import (
14
+ metric_task_output_blob_store_upload_errors,
15
+ metric_task_output_blob_store_upload_latency,
16
+ metric_task_output_blob_store_uploads,
17
+ metric_task_output_upload_latency,
18
+ metric_task_output_upload_retries,
19
+ metric_task_output_uploads,
20
+ metric_tasks_uploading_outputs,
21
+ )
22
+ from .task_info import TaskInfo
23
+ from .task_output import TaskOutput
24
+
25
+ _TASK_OUTPUT_UPLOAD_BACKOFF_SEC = 5.0
26
+
27
+
28
+ async def upload_task_output(
29
+ task_info: TaskInfo, blob_store: BLOBStore, logger: Any
30
+ ) -> TaskOutputUploadFinished:
31
+ """Uploads the task output to blob store.
32
+
33
+ Doesn't raise any Exceptions. Runs till the reporting is successful.
34
+ """
35
+ logger = logger.bind(module=__name__)
36
+
37
+ with (
38
+ metric_tasks_uploading_outputs.track_inprogress(),
39
+ metric_task_output_upload_latency.time(),
40
+ ):
41
+ metric_task_output_uploads.inc()
42
+ await _upload_task_output_until_successful(
43
+ output=task_info.output,
44
+ blob_store=blob_store,
45
+ logger=logger,
46
+ )
47
+ _log_function_metrics(output=task_info.output, logger=logger)
48
+ return TaskOutputUploadFinished(task_info=task_info, is_success=True)
49
+
50
+
51
+ async def _upload_task_output_until_successful(
52
+ output: TaskOutput, blob_store: BLOBStore, logger: Any
53
+ ) -> None:
54
+ upload_retries: int = 0
55
+
56
+ while True:
57
+ logger = logger.bind(retries=upload_retries)
58
+ try:
59
+ await _upload_task_output_once(
60
+ output=output, blob_store=blob_store, logger=logger
61
+ )
62
+ return
63
+ except Exception as e:
64
+ logger.error(
65
+ "failed to upload task output",
66
+ exc_info=e,
67
+ )
68
+ upload_retries += 1
69
+ metric_task_output_upload_retries.inc()
70
+ await asyncio.sleep(_TASK_OUTPUT_UPLOAD_BACKOFF_SEC)
71
+
72
+
73
+ class _TaskOutputSummary:
74
+ def __init__(self):
75
+ self.output_count: int = 0
76
+ self.output_total_bytes: int = 0
77
+ self.router_output_count: int = 0
78
+ self.stdout_count: int = 0
79
+ self.stdout_total_bytes: int = 0
80
+ self.stderr_count: int = 0
81
+ self.stderr_total_bytes: int = 0
82
+ self.total_bytes: int = 0
83
+
84
+
85
+ async def _upload_task_output_once(
86
+ output: TaskOutput, blob_store: BLOBStore, logger: Any
87
+ ) -> None:
88
+ """Uploads the supplied task output to blob store.
89
+
90
+ Raises an Exception if the upload fails.
91
+ """
92
+ output_summary: _TaskOutputSummary = _task_output_summary(output)
93
+ logger.info(
94
+ "uploading task output to blob store",
95
+ total_bytes=output_summary.total_bytes,
96
+ total_files=output_summary.output_count
97
+ + output_summary.stdout_count
98
+ + output_summary.stderr_count,
99
+ output_files=output_summary.output_count,
100
+ output_bytes=output_summary.total_bytes,
101
+ router_output_count=output_summary.router_output_count,
102
+ stdout_bytes=output_summary.stdout_total_bytes,
103
+ stderr_bytes=output_summary.stderr_total_bytes,
104
+ )
105
+
106
+ start_time = time.time()
107
+ with (
108
+ metric_task_output_blob_store_upload_latency.time(),
109
+ metric_task_output_blob_store_upload_errors.count_exceptions(),
110
+ ):
111
+ metric_task_output_blob_store_uploads.inc()
112
+ await _upload_to_blob_store(output=output, blob_store=blob_store, logger=logger)
113
+
114
+ logger.info(
115
+ "files uploaded to blob store",
116
+ duration=time.time() - start_time,
117
+ )
118
+
119
+
120
+ async def _upload_to_blob_store(
121
+ output: TaskOutput, blob_store: BLOBStore, logger: Any
122
+ ) -> None:
123
+ if output.stdout is not None:
124
+ stdout_url = f"{output.task.output_payload_uri_prefix}.{output.task.id}.stdout"
125
+ stdout_bytes: bytes = output.stdout.encode()
126
+ await blob_store.put(stdout_url, stdout_bytes, logger)
127
+ output.uploaded_stdout = DataPayload(
128
+ uri=stdout_url,
129
+ size=len(stdout_bytes),
130
+ sha256_hash=_compute_hash(stdout_bytes),
131
+ encoding=DataPayloadEncoding.DATA_PAYLOAD_ENCODING_UTF8_TEXT,
132
+ encoding_version=0,
133
+ )
134
+ # stdout is uploaded, free the memory used for it.
135
+ output.stdout = None
136
+
137
+ if output.stderr is not None:
138
+ stderr_url = f"{output.task.output_payload_uri_prefix}.{output.task.id}.stderr"
139
+ stderr_bytes: bytes = output.stderr.encode()
140
+ await blob_store.put(stderr_url, stderr_bytes, logger)
141
+ output.uploaded_stderr = DataPayload(
142
+ uri=stderr_url,
143
+ size=len(stderr_bytes),
144
+ sha256_hash=_compute_hash(stderr_bytes),
145
+ encoding=DataPayloadEncoding.DATA_PAYLOAD_ENCODING_UTF8_TEXT,
146
+ encoding_version=0,
147
+ )
148
+ # stderr is uploaded, free the memory used for it.
149
+ output.stderr = None
150
+
151
+ if output.function_output is not None:
152
+ # We can't use the default empty list output.uploaded_data_payloads because it's a singleton.
153
+ uploaded_data_payloads = []
154
+ for func_output_item in output.function_output.outputs:
155
+ node_output_sequence = len(uploaded_data_payloads)
156
+ output_url = f"{output.task.output_payload_uri_prefix}.{output.task.id}.{node_output_sequence}"
157
+ output_bytes: bytes = (
158
+ func_output_item.bytes
159
+ if func_output_item.HasField("bytes")
160
+ else func_output_item.string.encode()
161
+ )
162
+ await blob_store.put(output_url, output_bytes, logger)
163
+ uploaded_data_payloads.append(
164
+ DataPayload(
165
+ uri=output_url,
166
+ size=len(output_bytes),
167
+ sha256_hash=_compute_hash(output_bytes),
168
+ encoding=_to_grpc_data_payload_encoding(output),
169
+ encoding_version=0,
170
+ )
171
+ )
172
+
173
+ output.uploaded_data_payloads = uploaded_data_payloads
174
+ # The output is uploaded, free the memory used for it.
175
+ output.function_output = None
176
+
177
+
178
+ def _task_output_summary(output: TaskOutput) -> _TaskOutputSummary:
179
+ summary: _TaskOutputSummary = _TaskOutputSummary()
180
+
181
+ if output.stdout is not None:
182
+ summary.stdout_count += 1
183
+ summary.stdout_total_bytes += len(output.stdout)
184
+
185
+ if output.stderr is not None:
186
+ summary.stderr_count += 1
187
+ summary.stderr_total_bytes += len(output.stderr)
188
+
189
+ if output.function_output is not None:
190
+ for func_output_item in output.function_output.outputs:
191
+ output_len: bytes = len(
192
+ func_output_item.bytes
193
+ if func_output_item.HasField("bytes")
194
+ else func_output_item.string
195
+ )
196
+ summary.output_count += 1
197
+ summary.output_total_bytes += output_len
198
+
199
+ if output.router_output is not None:
200
+ summary.router_output_count += 1
201
+
202
+ summary.total_bytes = (
203
+ summary.output_total_bytes
204
+ + summary.stdout_total_bytes
205
+ + summary.stderr_total_bytes
206
+ )
207
+ return summary
208
+
209
+
210
+ def _to_grpc_data_payload_encoding(task_output: TaskOutput) -> DataPayloadEncoding:
211
+ if task_output.output_encoding == "json":
212
+ return DataPayloadEncoding.DATA_PAYLOAD_ENCODING_UTF8_JSON
213
+ else:
214
+ return DataPayloadEncoding.DATA_PAYLOAD_ENCODING_BINARY_PICKLE
215
+
216
+
217
+ def _compute_hash(data: bytes) -> str:
218
+ hasher = hashlib.sha256(usedforsecurity=False)
219
+ hasher.update(data)
220
+ return hasher.hexdigest()
221
+
222
+
223
+ # Temporary workaround is logging customer metrics until we store them somewhere
224
+ # for future retrieval and processing.
225
+ def _log_function_metrics(output: TaskOutput, logger: Any):
226
+ if output.metrics is None:
227
+ return
228
+
229
+ for counter_name, counter_value in output.metrics.counters.items():
230
+ logger.info(
231
+ "function_metric", counter_name=counter_name, counter_value=counter_value
232
+ )
233
+ for timer_name, timer_value in output.metrics.timers.items():
234
+ logger.info("function_metric", timer_name=timer_name, timer_value=timer_value)
@@ -1,4 +1,3 @@
1
- import asyncio
2
1
  from typing import Any, List, Optional
3
2
 
4
3
  import psutil
@@ -47,34 +46,11 @@ class HostResourcesProvider:
47
46
  host_overhead_function_executors_ephimeral_disks_gb
48
47
  )
49
48
 
50
- async def total_host_resources(self, logger: Any) -> HostResources:
49
+ def total_host_resources(self, logger: Any) -> HostResources:
51
50
  """Returns all hardware resources that exist at the host.
52
51
 
53
52
  Raises Exception on error.
54
53
  """
55
- # Run psutil library calls in a separate thread to not block the event loop.
56
- return await asyncio.to_thread(self._total_host_resources, logger=logger)
57
-
58
- async def total_function_executor_resources(self, logger: Any) -> HostResources:
59
- """Returns all hardware resources on the host that are usable by Function Executors.
60
-
61
- Raises Exception on error.
62
- """
63
- total_resources: HostResources = await self.total_host_resources(logger=logger)
64
- return HostResources(
65
- cpu_count=max(0, total_resources.cpu_count - self._host_overhead_cpus),
66
- memory_mb=max(
67
- 0, total_resources.memory_mb - self._host_overhead_memory_gb * 1024
68
- ),
69
- disk_mb=max(
70
- 0,
71
- total_resources.disk_mb
72
- - self._host_overhead_function_executors_ephimeral_disks_gb * 1024,
73
- ),
74
- gpus=total_resources.gpus,
75
- )
76
-
77
- def _total_host_resources(self, logger: Any) -> HostResources:
78
54
  logger = logger.bind(module=__name__)
79
55
 
80
56
  # If users disable Hyper-Threading in OS then we'd only see physical cores here.
@@ -102,3 +78,22 @@ class HostResourcesProvider:
102
78
  disk_mb=disk_mb,
103
79
  gpus=all_gpus,
104
80
  )
81
+
82
+ def total_function_executor_resources(self, logger: Any) -> HostResources:
83
+ """Returns all hardware resources on the host that are usable by Function Executors.
84
+
85
+ Raises Exception on error.
86
+ """
87
+ total_resources: HostResources = self.total_host_resources(logger=logger)
88
+ return HostResources(
89
+ cpu_count=max(0, total_resources.cpu_count - self._host_overhead_cpus),
90
+ memory_mb=max(
91
+ 0, total_resources.memory_mb - self._host_overhead_memory_gb * 1024
92
+ ),
93
+ disk_mb=max(
94
+ 0,
95
+ total_resources.disk_mb
96
+ - self._host_overhead_function_executors_ephimeral_disks_gb * 1024,
97
+ ),
98
+ gpus=total_resources.gpus,
99
+ )
@@ -40,11 +40,18 @@ class NvidiaGPUAllocator:
40
40
  allocated_gpus: List[NvidiaGPUInfo] = []
41
41
  for _ in range(count):
42
42
  allocated_gpus.append(self._free_gpus.pop())
43
+
44
+ if len(allocated_gpus) > 0:
45
+ logger.bind(module=__name__).info("allocated GPUs:", gpus=allocated_gpus)
46
+
43
47
  return allocated_gpus
44
48
 
45
- def deallocate(self, gpus: List[NvidiaGPUInfo]) -> None:
49
+ def deallocate(self, gpus: List[NvidiaGPUInfo], logger: Any) -> None:
46
50
  self._free_gpus.extend(gpus)
47
51
 
52
+ if len(gpus) > 0:
53
+ logger.bind(module=__name__).info("deallocated GPUs:", gpus=gpus)
54
+
48
55
  def list_all(self) -> List[NvidiaGPUInfo]:
49
56
  return list(self._all_gpus) # Return a copy to avoid external modification
50
57
 
@@ -1,6 +1,6 @@
1
1
  import prometheus_client
2
2
 
3
- from ...monitoring.metrics import latency_metric_for_fast_operation
3
+ from ..monitoring.metrics import latency_metric_for_fast_operation
4
4
 
5
5
  metric_grpc_server_channel_creations = prometheus_client.Counter(
6
6
  "grpc_server_channel_creations",
@@ -1,10 +1,5 @@
1
1
  import prometheus_client
2
2
 
3
- from ..monitoring.metrics import (
4
- latency_metric_for_customer_controlled_operation,
5
- latency_metric_for_fast_operation,
6
- )
7
-
8
3
  # This file contains all metrics used by Executor.
9
4
 
10
5
  # Executor overview metrics.
@@ -16,45 +11,3 @@ metric_executor_state: prometheus_client.Enum = prometheus_client.Enum(
16
11
  "Current Executor state",
17
12
  states=["starting", "running", "shutting_down"],
18
13
  )
19
-
20
- # Task statistics metrics.
21
- metric_tasks_fetched: prometheus_client.Counter = prometheus_client.Counter(
22
- "tasks_fetched", "Number of tasks that were fetched from Server"
23
- )
24
- metric_tasks_completed: prometheus_client.Counter = prometheus_client.Counter(
25
- "tasks_completed", "Number of tasks that were completed", ["outcome"]
26
- )
27
- METRIC_TASKS_COMPLETED_OUTCOME_ALL = "all"
28
- METRIC_TASKS_COMPLETED_OUTCOME_SUCCESS = "success"
29
- METRIC_TASKS_COMPLETED_OUTCOME_ERROR_CUSTOMER_CODE = "error_customer_code"
30
- METRIC_TASKS_COMPLETED_OUTCOME_ERROR_PLATFORM = "error_platform"
31
- metric_tasks_completed.labels(outcome=METRIC_TASKS_COMPLETED_OUTCOME_ALL)
32
- metric_tasks_completed.labels(outcome=METRIC_TASKS_COMPLETED_OUTCOME_SUCCESS)
33
- metric_tasks_completed.labels(
34
- outcome=METRIC_TASKS_COMPLETED_OUTCOME_ERROR_CUSTOMER_CODE
35
- )
36
- metric_tasks_completed.labels(outcome=METRIC_TASKS_COMPLETED_OUTCOME_ERROR_PLATFORM)
37
- metric_task_completion_latency: prometheus_client.Histogram = (
38
- latency_metric_for_customer_controlled_operation(
39
- "task_completion",
40
- "task completion from the moment it got fetched until its outcome got reported",
41
- )
42
- )
43
-
44
- # Task outcome reporting metrics.
45
- metric_task_outcome_reports: prometheus_client.Counter = prometheus_client.Counter(
46
- "task_outcome_reports",
47
- "Number of task outcome reports",
48
- )
49
- metric_tasks_reporting_outcome: prometheus_client.Gauge = prometheus_client.Gauge(
50
- "tasks_reporting_outcome",
51
- "Number of tasks currently reporting their outcomes",
52
- )
53
- metric_task_outcome_report_latency: prometheus_client.Histogram = (
54
- latency_metric_for_fast_operation("task_outcome_report", "task outcome report")
55
- )
56
- metric_task_outcome_report_retries: prometheus_client.Counter = (
57
- prometheus_client.Counter(
58
- "tasks_outcome_report_retries", "Number of task outcome report retries"
59
- )
60
- )
@@ -1,6 +1,6 @@
1
1
  import prometheus_client
2
2
 
3
- from ...monitoring.metrics import latency_metric_for_fast_operation
3
+ from ..monitoring.metrics import latency_metric_for_fast_operation
4
4
 
5
5
  metric_state_reconciliations = prometheus_client.Counter(
6
6
  "state_reconciliations",
@@ -1,6 +1,6 @@
1
1
  import prometheus_client
2
2
 
3
- from ...monitoring.metrics import latency_metric_for_fast_operation
3
+ from ..monitoring.metrics import latency_metric_for_fast_operation
4
4
 
5
5
  metric_state_report_rpcs = prometheus_client.Counter(
6
6
  "state_report_rpcs",
@@ -1,9 +1,3 @@
1
- from typing import Optional
2
-
3
- from ...function_executor.function_executor_states_container import (
4
- FunctionExecutorStatesContainer,
5
- )
6
- from ...function_executor.function_executor_status import FunctionExecutorStatus
7
1
  from .health_checker import HealthChecker, HealthCheckResult
8
2
 
9
3
  HEALTH_CHECKER_NAME = "GenericHealthChecker"
@@ -16,58 +10,11 @@ class GenericHealthChecker(HealthChecker):
16
10
  """
17
11
 
18
12
  def __init__(self):
19
- self._function_executor_states: Optional[FunctionExecutorStatesContainer] = None
20
- self._function_executor_health_check_ever_failed = False
21
-
22
- def set_function_executor_states_container(
23
- self, states: FunctionExecutorStatesContainer
24
- ):
25
- self._function_executor_states = states
13
+ pass
26
14
 
27
15
  async def check(self) -> HealthCheckResult:
28
- if self._function_executor_states is None:
29
- return HealthCheckResult(
30
- is_success=False,
31
- status_message="Function Executor states container was not provided yet",
32
- checker_name=HEALTH_CHECKER_NAME,
33
- )
34
-
35
- # Current health check policy and reasoning:
36
- # * A Function Executor health check failure is a strong signal that something is wrong
37
- # either with:
38
- # - The Function Code (a criticial software bug).
39
- # - The Executor machine/container/VM (a software bug or malfunctioning local hardware).
40
- # * Critical Function Code bugs tend to get fixed eventually by users. What doesn't get fixed eventually
41
- # is rare but recurring local Executor issues like hardware errors and software bugs in middleware like
42
- # drivers.
43
- # * Such issues tend to get mitigated by automatically recreating the Executor machine/VM/container.
44
- # * So we fail whole Executor health check if a Function Executor health check ever failed to hint the users
45
- # that we probably need to recreate the Executor machine/VM/container (unless there's a bug in Function
46
- # code that user can investigate themself).
47
- await self._check_function_executors()
48
- if self._function_executor_health_check_ever_failed:
49
- return HealthCheckResult(
50
- is_success=False,
51
- status_message="A Function Executor health check failed",
52
- checker_name=HEALTH_CHECKER_NAME,
53
- )
54
- else:
55
- return HealthCheckResult(
56
- is_success=True,
57
- status_message="All Function Executors pass health checks",
58
- checker_name=HEALTH_CHECKER_NAME,
59
- )
60
-
61
- async def _check_function_executors(self):
62
- if self._function_executor_health_check_ever_failed:
63
- return
64
-
65
- async for state in self._function_executor_states:
66
- # No need to async lock the state to read a single value.
67
- if state.status in [
68
- FunctionExecutorStatus.UNHEALTHY,
69
- FunctionExecutorStatus.STARTUP_FAILED_CUSTOMER_ERROR,
70
- FunctionExecutorStatus.STARTUP_FAILED_PLATFORM_ERROR,
71
- ]:
72
- self._function_executor_health_check_ever_failed = True
73
- return
16
+ return HealthCheckResult(
17
+ is_success=True,
18
+ status_message="The health check is always successful",
19
+ checker_name=HEALTH_CHECKER_NAME,
20
+ )
@@ -1,8 +1,3 @@
1
- from ...function_executor.function_executor_states_container import (
2
- FunctionExecutorStatesContainer,
3
- )
4
-
5
-
6
1
  class HealthCheckResult:
7
2
  def __init__(self, checker_name: str, is_success: bool, status_message: str):
8
3
  self.checker_name = checker_name
@@ -13,11 +8,5 @@ class HealthCheckResult:
13
8
  class HealthChecker:
14
9
  """Abstract base class for health checkers."""
15
10
 
16
- def set_function_executor_states_container(
17
- self, states: FunctionExecutorStatesContainer
18
- ):
19
- """Provides function executor states to this health checker so it can use them in the health checks."""
20
- raise NotImplementedError("Subclasses must implement this method.")
21
-
22
11
  async def check(self) -> HealthCheckResult:
23
12
  raise NotImplementedError("Subclasses must implement this method.")