indexify 0.3.30__py3-none-any.whl → 0.4.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- indexify/cli/__init__.py +18 -0
- indexify/cli/build_image.py +51 -0
- indexify/cli/deploy.py +57 -0
- indexify/cli/executor.py +205 -0
- indexify/executor/{grpc/channel_manager.py → channel_manager.py} +17 -11
- indexify/executor/executor.py +57 -311
- indexify/executor/function_allowlist.py +59 -0
- indexify/executor/function_executor/function_executor.py +12 -6
- indexify/executor/function_executor/invocation_state_client.py +25 -3
- indexify/executor/function_executor/server/function_executor_server_factory.py +3 -3
- indexify/executor/function_executor/server/subprocess_function_executor_server_factory.py +22 -11
- indexify/executor/function_executor_controller/__init__.py +13 -0
- indexify/executor/function_executor_controller/completed_task_metrics.py +82 -0
- indexify/executor/function_executor_controller/create_function_executor.py +154 -0
- indexify/executor/function_executor_controller/debug_event_loop.py +37 -0
- indexify/executor/function_executor_controller/destroy_function_executor.py +28 -0
- indexify/executor/function_executor_controller/downloads.py +199 -0
- indexify/executor/function_executor_controller/events.py +172 -0
- indexify/executor/function_executor_controller/function_executor_controller.py +759 -0
- indexify/executor/function_executor_controller/loggers.py +57 -0
- indexify/executor/function_executor_controller/message_validators.py +65 -0
- indexify/executor/function_executor_controller/metrics/completed_task_metrics.py +68 -0
- indexify/executor/{metrics/downloader.py → function_executor_controller/metrics/downloads.py} +1 -3
- indexify/executor/function_executor_controller/metrics/function_executor_controller.py +60 -0
- indexify/executor/{function_executor/metrics/single_task_runner.py → function_executor_controller/metrics/run_task.py} +9 -3
- indexify/executor/function_executor_controller/metrics/upload_task_output.py +39 -0
- indexify/executor/function_executor_controller/prepare_task.py +38 -0
- indexify/executor/function_executor_controller/run_task.py +201 -0
- indexify/executor/function_executor_controller/task_info.py +33 -0
- indexify/executor/function_executor_controller/task_output.py +122 -0
- indexify/executor/function_executor_controller/upload_task_output.py +234 -0
- indexify/executor/host_resources/host_resources.py +20 -25
- indexify/executor/{grpc/metrics → metrics}/channel_manager.py +1 -1
- indexify/executor/metrics/executor.py +0 -47
- indexify/executor/{grpc/metrics → metrics}/state_reconciler.py +1 -1
- indexify/executor/{grpc/metrics → metrics}/state_reporter.py +1 -1
- indexify/executor/monitoring/health_checker/generic_health_checker.py +6 -59
- indexify/executor/monitoring/health_checker/health_checker.py +0 -11
- indexify/executor/{grpc/state_reconciler.py → state_reconciler.py} +139 -141
- indexify/executor/state_reporter.py +364 -0
- indexify/proto/executor_api.proto +67 -59
- indexify/proto/executor_api_pb2.py +52 -52
- indexify/proto/executor_api_pb2.pyi +125 -104
- indexify/proto/executor_api_pb2_grpc.py +0 -47
- {indexify-0.3.30.dist-info → indexify-0.4.2.dist-info}/METADATA +1 -3
- indexify-0.4.2.dist-info/RECORD +68 -0
- indexify-0.4.2.dist-info/entry_points.txt +3 -0
- indexify/cli/cli.py +0 -267
- indexify/executor/api_objects.py +0 -92
- indexify/executor/downloader.py +0 -417
- indexify/executor/executor_flavor.py +0 -7
- indexify/executor/function_executor/function_executor_state.py +0 -107
- indexify/executor/function_executor/function_executor_states_container.py +0 -93
- indexify/executor/function_executor/function_executor_status.py +0 -95
- indexify/executor/function_executor/metrics/function_executor_state.py +0 -46
- indexify/executor/function_executor/metrics/function_executor_state_container.py +0 -10
- indexify/executor/function_executor/single_task_runner.py +0 -345
- indexify/executor/function_executor/task_input.py +0 -21
- indexify/executor/function_executor/task_output.py +0 -105
- indexify/executor/grpc/function_executor_controller.py +0 -418
- indexify/executor/grpc/metrics/task_controller.py +0 -8
- indexify/executor/grpc/state_reporter.py +0 -314
- indexify/executor/grpc/task_controller.py +0 -508
- indexify/executor/metrics/task_fetcher.py +0 -21
- indexify/executor/metrics/task_reporter.py +0 -53
- indexify/executor/metrics/task_runner.py +0 -52
- indexify/executor/monitoring/function_allowlist.py +0 -25
- indexify/executor/runtime_probes.py +0 -68
- indexify/executor/task_fetcher.py +0 -96
- indexify/executor/task_reporter.py +0 -459
- indexify/executor/task_runner.py +0 -177
- indexify-0.3.30.dist-info/RECORD +0 -68
- indexify-0.3.30.dist-info/entry_points.txt +0 -3
- {indexify-0.3.30.dist-info → indexify-0.4.2.dist-info}/WHEEL +0 -0
indexify/executor/function_executor_controller/task_output.py (new file)
@@ -0,0 +1,122 @@
+from typing import Dict, List, Optional
+
+from tensorlake.function_executor.proto.function_executor_pb2 import (
+    FunctionOutput,
+    RouterOutput,
+)
+
+from indexify.proto.executor_api_pb2 import (
+    DataPayload,
+    Task,
+    TaskFailureReason,
+    TaskOutcomeCode,
+)
+
+
+class TaskMetrics:
+    """Metrics for a task."""
+
+    def __init__(self, counters: Dict[str, int], timers: Dict[str, float]):
+        self.counters = counters
+        self.timers = timers
+
+
+class TaskOutput:
+    """Result of running a task."""
+
+    def __init__(
+        self,
+        task: Task,
+        allocation_id: str,
+        outcome_code: TaskOutcomeCode,
+        # Optional[TaskFailureReason] is not supported in python 3.9
+        failure_reason: TaskFailureReason = None,
+        output_encoding: Optional[str] = None,
+        function_output: Optional[FunctionOutput] = None,
+        router_output: Optional[RouterOutput] = None,
+        stdout: Optional[str] = None,
+        stderr: Optional[str] = None,
+        reducer: bool = False,
+        metrics: Optional[TaskMetrics] = None,
+        uploaded_data_payloads: List[DataPayload] = [],
+        uploaded_stdout: Optional[DataPayload] = None,
+        uploaded_stderr: Optional[DataPayload] = None,
+    ):
+        self.task = task
+        self.allocation_id = allocation_id
+        self.function_output = function_output
+        self.router_output = router_output
+        self.stdout = stdout
+        self.stderr = stderr
+        self.reducer = reducer
+        self.outcome_code = outcome_code
+        self.failure_reason = failure_reason
+        self.metrics = metrics
+        self.output_encoding = output_encoding
+        self.uploaded_data_payloads = uploaded_data_payloads
+        self.uploaded_stdout = uploaded_stdout
+        self.uploaded_stderr = uploaded_stderr
+
+    @classmethod
+    def internal_error(
+        cls,
+        task: Task,
+        allocation_id: str,
+    ) -> "TaskOutput":
+        """Creates a TaskOutput for an internal error."""
+        # We are not sharing internal error messages with the customer.
+        return TaskOutput(
+            task=task,
+            allocation_id=allocation_id,
+            outcome_code=TaskOutcomeCode.TASK_OUTCOME_CODE_FAILURE,
+            failure_reason=TaskFailureReason.TASK_FAILURE_REASON_INTERNAL_ERROR,
+            stderr="Platform failed to execute the function.",
+        )
+
+    @classmethod
+    def function_timeout(
+        cls,
+        task: Task,
+        allocation_id: str,
+        timeout_sec: float,
+    ) -> "TaskOutput":
+        """Creates a TaskOutput for a function timeout error."""
+        # Task stdout, stderr is not available.
+        return TaskOutput(
+            task=task,
+            allocation_id=allocation_id,
+            outcome_code=TaskOutcomeCode.TASK_OUTCOME_CODE_FAILURE,
+            failure_reason=TaskFailureReason.TASK_FAILURE_REASON_FUNCTION_TIMEOUT,
+            stderr=f"Function exceeded its configured timeout of {timeout_sec:.3f} sec.",
+        )
+
+    @classmethod
+    def task_cancelled(
+        cls,
+        task: Task,
+        allocation_id: str,
+    ) -> "TaskOutput":
+        """Creates a TaskOutput for the case when the task didn't finish because its allocation was removed by Server."""
+        return TaskOutput(
+            task=task,
+            allocation_id=allocation_id,
+            outcome_code=TaskOutcomeCode.TASK_OUTCOME_CODE_FAILURE,
+            failure_reason=TaskFailureReason.TASK_FAILURE_REASON_TASK_CANCELLED,
+        )
+
+    @classmethod
+    def function_executor_terminated(
+        cls,
+        task: Task,
+        allocation_id: str,
+    ) -> "TaskOutput":
+        """Creates a TaskOutput for the case when the task didn't run because its FE terminated."""
+        return TaskOutput(
+            task=task,
+            allocation_id=allocation_id,
+            outcome_code=TaskOutcomeCode.TASK_OUTCOME_CODE_FAILURE,
+            failure_reason=TaskFailureReason.TASK_FAILURE_REASON_FUNCTION_EXECUTOR_TERMINATED,
+            # TODO: add FE startup stdout, stderr to the task output if FE failed to startup.
+            stdout="",
+            stderr="Can't execute the function because its Function Executor terminated.",
+        )
indexify/executor/function_executor_controller/upload_task_output.py (new file)
@@ -0,0 +1,234 @@
+import asyncio
+import hashlib
+import time
+from typing import Any
+
+from indexify.executor.blob_store.blob_store import BLOBStore
+from indexify.proto.executor_api_pb2 import (
+    DataPayload,
+    DataPayloadEncoding,
+)
+
+from .events import TaskOutputUploadFinished
+from .metrics.upload_task_output import (
+    metric_task_output_blob_store_upload_errors,
+    metric_task_output_blob_store_upload_latency,
+    metric_task_output_blob_store_uploads,
+    metric_task_output_upload_latency,
+    metric_task_output_upload_retries,
+    metric_task_output_uploads,
+    metric_tasks_uploading_outputs,
+)
+from .task_info import TaskInfo
+from .task_output import TaskOutput
+
+_TASK_OUTPUT_UPLOAD_BACKOFF_SEC = 5.0
+
+
+async def upload_task_output(
+    task_info: TaskInfo, blob_store: BLOBStore, logger: Any
+) -> TaskOutputUploadFinished:
+    """Uploads the task output to blob store.
+
+    Doesn't raise any exceptions. Runs until the reporting is successful.
+    """
+    logger = logger.bind(module=__name__)
+
+    with (
+        metric_tasks_uploading_outputs.track_inprogress(),
+        metric_task_output_upload_latency.time(),
+    ):
+        metric_task_output_uploads.inc()
+        await _upload_task_output_until_successful(
+            output=task_info.output,
+            blob_store=blob_store,
+            logger=logger,
+        )
+        _log_function_metrics(output=task_info.output, logger=logger)
+        return TaskOutputUploadFinished(task_info=task_info, is_success=True)
+
+
+async def _upload_task_output_until_successful(
+    output: TaskOutput, blob_store: BLOBStore, logger: Any
+) -> None:
+    upload_retries: int = 0
+
+    while True:
+        logger = logger.bind(retries=upload_retries)
+        try:
+            await _upload_task_output_once(
+                output=output, blob_store=blob_store, logger=logger
+            )
+            return
+        except Exception as e:
+            logger.error(
+                "failed to upload task output",
+                exc_info=e,
+            )
+            upload_retries += 1
+            metric_task_output_upload_retries.inc()
+            await asyncio.sleep(_TASK_OUTPUT_UPLOAD_BACKOFF_SEC)
+
+
+class _TaskOutputSummary:
+    def __init__(self):
+        self.output_count: int = 0
+        self.output_total_bytes: int = 0
+        self.router_output_count: int = 0
+        self.stdout_count: int = 0
+        self.stdout_total_bytes: int = 0
+        self.stderr_count: int = 0
+        self.stderr_total_bytes: int = 0
+        self.total_bytes: int = 0
+
+
+async def _upload_task_output_once(
+    output: TaskOutput, blob_store: BLOBStore, logger: Any
+) -> None:
+    """Uploads the supplied task output to blob store.
+
+    Raises an Exception if the upload fails.
+    """
+    output_summary: _TaskOutputSummary = _task_output_summary(output)
+    logger.info(
+        "uploading task output to blob store",
+        total_bytes=output_summary.total_bytes,
+        total_files=output_summary.output_count
+        + output_summary.stdout_count
+        + output_summary.stderr_count,
+        output_files=output_summary.output_count,
+        output_bytes=output_summary.total_bytes,
+        router_output_count=output_summary.router_output_count,
+        stdout_bytes=output_summary.stdout_total_bytes,
+        stderr_bytes=output_summary.stderr_total_bytes,
+    )
+
+    start_time = time.time()
+    with (
+        metric_task_output_blob_store_upload_latency.time(),
+        metric_task_output_blob_store_upload_errors.count_exceptions(),
+    ):
+        metric_task_output_blob_store_uploads.inc()
+        await _upload_to_blob_store(output=output, blob_store=blob_store, logger=logger)
+
+    logger.info(
+        "files uploaded to blob store",
+        duration=time.time() - start_time,
+    )
+
+
+async def _upload_to_blob_store(
+    output: TaskOutput, blob_store: BLOBStore, logger: Any
+) -> None:
+    if output.stdout is not None:
+        stdout_url = f"{output.task.output_payload_uri_prefix}.{output.task.id}.stdout"
+        stdout_bytes: bytes = output.stdout.encode()
+        await blob_store.put(stdout_url, stdout_bytes, logger)
+        output.uploaded_stdout = DataPayload(
+            uri=stdout_url,
+            size=len(stdout_bytes),
+            sha256_hash=_compute_hash(stdout_bytes),
+            encoding=DataPayloadEncoding.DATA_PAYLOAD_ENCODING_UTF8_TEXT,
+            encoding_version=0,
+        )
+        # stdout is uploaded, free the memory used for it.
+        output.stdout = None
+
+    if output.stderr is not None:
+        stderr_url = f"{output.task.output_payload_uri_prefix}.{output.task.id}.stderr"
+        stderr_bytes: bytes = output.stderr.encode()
+        await blob_store.put(stderr_url, stderr_bytes, logger)
+        output.uploaded_stderr = DataPayload(
+            uri=stderr_url,
+            size=len(stderr_bytes),
+            sha256_hash=_compute_hash(stderr_bytes),
+            encoding=DataPayloadEncoding.DATA_PAYLOAD_ENCODING_UTF8_TEXT,
+            encoding_version=0,
+        )
+        # stderr is uploaded, free the memory used for it.
+        output.stderr = None
+
+    if output.function_output is not None:
+        # We can't use the default empty list output.uploaded_data_payloads because it's a singleton.
+        uploaded_data_payloads = []
+        for func_output_item in output.function_output.outputs:
+            node_output_sequence = len(uploaded_data_payloads)
+            output_url = f"{output.task.output_payload_uri_prefix}.{output.task.id}.{node_output_sequence}"
+            output_bytes: bytes = (
+                func_output_item.bytes
+                if func_output_item.HasField("bytes")
+                else func_output_item.string.encode()
+            )
+            await blob_store.put(output_url, output_bytes, logger)
+            uploaded_data_payloads.append(
+                DataPayload(
+                    uri=output_url,
+                    size=len(output_bytes),
+                    sha256_hash=_compute_hash(output_bytes),
+                    encoding=_to_grpc_data_payload_encoding(output),
+                    encoding_version=0,
+                )
+            )
+
+        output.uploaded_data_payloads = uploaded_data_payloads
+        # The output is uploaded, free the memory used for it.
+        output.function_output = None
+
+
+def _task_output_summary(output: TaskOutput) -> _TaskOutputSummary:
+    summary: _TaskOutputSummary = _TaskOutputSummary()
+
+    if output.stdout is not None:
+        summary.stdout_count += 1
+        summary.stdout_total_bytes += len(output.stdout)
+
+    if output.stderr is not None:
+        summary.stderr_count += 1
+        summary.stderr_total_bytes += len(output.stderr)
+
+    if output.function_output is not None:
+        for func_output_item in output.function_output.outputs:
+            output_len: int = len(
+                func_output_item.bytes
+                if func_output_item.HasField("bytes")
+                else func_output_item.string
+            )
+            summary.output_count += 1
+            summary.output_total_bytes += output_len
+
+    if output.router_output is not None:
+        summary.router_output_count += 1
+
+    summary.total_bytes = (
+        summary.output_total_bytes
+        + summary.stdout_total_bytes
+        + summary.stderr_total_bytes
+    )
+    return summary
+
+
+def _to_grpc_data_payload_encoding(task_output: TaskOutput) -> DataPayloadEncoding:
+    if task_output.output_encoding == "json":
+        return DataPayloadEncoding.DATA_PAYLOAD_ENCODING_UTF8_JSON
+    else:
+        return DataPayloadEncoding.DATA_PAYLOAD_ENCODING_BINARY_PICKLE
+
+
+def _compute_hash(data: bytes) -> str:
+    hasher = hashlib.sha256(usedforsecurity=False)
+    hasher.update(data)
+    return hasher.hexdigest()
+
+
+# Temporary workaround: we log customer metrics until we store them somewhere
+# for future retrieval and processing.
+def _log_function_metrics(output: TaskOutput, logger: Any):
+    if output.metrics is None:
+        return
+
+    for counter_name, counter_value in output.metrics.counters.items():
+        logger.info(
+            "function_metric", counter_name=counter_name, counter_value=counter_value
+        )
+    for timer_name, timer_value in output.metrics.timers.items():
+        logger.info("function_metric", timer_name=timer_name, timer_value=timer_value)
indexify/executor/host_resources/host_resources.py
@@ -1,4 +1,3 @@
-import asyncio
 from typing import Any, List, Optional
 
 import psutil
@@ -47,34 +46,11 @@ class HostResourcesProvider:
             host_overhead_function_executors_ephimeral_disks_gb
         )
 
-    async def total_host_resources(self, logger: Any) -> HostResources:
+    def total_host_resources(self, logger: Any) -> HostResources:
        """Returns all hardware resources that exist at the host.
 
        Raises Exception on error.
        """
-        # Run psutil library calls in a separate thread to not block the event loop.
-        return await asyncio.to_thread(self._total_host_resources, logger=logger)
-
-    async def total_function_executor_resources(self, logger: Any) -> HostResources:
-        """Returns all hardware resources on the host that are usable by Function Executors.
-
-        Raises Exception on error.
-        """
-        total_resources: HostResources = await self.total_host_resources(logger=logger)
-        return HostResources(
-            cpu_count=max(0, total_resources.cpu_count - self._host_overhead_cpus),
-            memory_mb=max(
-                0, total_resources.memory_mb - self._host_overhead_memory_gb * 1024
-            ),
-            disk_mb=max(
-                0,
-                total_resources.disk_mb
-                - self._host_overhead_function_executors_ephimeral_disks_gb * 1024,
-            ),
-            gpus=total_resources.gpus,
-        )
-
-    def _total_host_resources(self, logger: Any) -> HostResources:
         logger = logger.bind(module=__name__)
 
         # If users disable Hyper-Threading in OS then we'd only see physical cores here.
@@ -102,3 +78,22 @@ class HostResourcesProvider:
             disk_mb=disk_mb,
             gpus=all_gpus,
         )
+
+    def total_function_executor_resources(self, logger: Any) -> HostResources:
+        """Returns all hardware resources on the host that are usable by Function Executors.
+
+        Raises Exception on error.
+        """
+        total_resources: HostResources = self.total_host_resources(logger=logger)
+        return HostResources(
+            cpu_count=max(0, total_resources.cpu_count - self._host_overhead_cpus),
+            memory_mb=max(
+                0, total_resources.memory_mb - self._host_overhead_memory_gb * 1024
+            ),
+            disk_mb=max(
+                0,
+                total_resources.disk_mb
+                - self._host_overhead_function_executors_ephimeral_disks_gb * 1024,
+            ),
+            gpus=total_resources.gpus,
+        )
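Both resource methods are now plain synchronous calls, and total_function_executor_resources simply subtracts the configured host overheads from the totals, converting GB-denominated overheads to MB and clamping at zero. A worked example of that arithmetic with hypothetical numbers:

    # Hypothetical values, mirroring the subtraction in total_function_executor_resources.
    host_overhead_cpus = 2
    host_overhead_memory_gb = 4
    host_overhead_fe_disks_gb = 10

    total_cpu_count = 16
    total_memory_mb = 65536   # 64 GB
    total_disk_mb = 204800    # 200 GB

    fe_cpus = max(0, total_cpu_count - host_overhead_cpus)                        # 14
    fe_memory_mb = max(0, total_memory_mb - host_overhead_memory_gb * 1024)       # 61440
    fe_disk_mb = max(0, total_disk_mb - host_overhead_fe_disks_gb * 1024)         # 194560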
indexify/executor/{grpc/metrics → metrics}/channel_manager.py
@@ -1,6 +1,6 @@
 import prometheus_client
 
-from ...monitoring.metrics import latency_metric_for_fast_operation
+from ..monitoring.metrics import latency_metric_for_fast_operation
 
 metric_grpc_server_channel_creations = prometheus_client.Counter(
     "grpc_server_channel_creations",
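The only substantive change here is the relative import depth: the file moved from indexify/executor/grpc/metrics/ to indexify/executor/metrics/ (see the file list above), so reaching indexify/executor/monitoring/ now takes two levels (..) instead of three (...).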
indexify/executor/metrics/executor.py
@@ -1,10 +1,5 @@
 import prometheus_client
 
-from ..monitoring.metrics import (
-    latency_metric_for_customer_controlled_operation,
-    latency_metric_for_fast_operation,
-)
-
 # This file contains all metrics used by Executor.
 
 # Executor overview metrics.
@@ -16,45 +11,3 @@ metric_executor_state: prometheus_client.Enum = prometheus_client.Enum(
     "Current Executor state",
     states=["starting", "running", "shutting_down"],
 )
-
-# Task statistics metrics.
-metric_tasks_fetched: prometheus_client.Counter = prometheus_client.Counter(
-    "tasks_fetched", "Number of tasks that were fetched from Server"
-)
-metric_tasks_completed: prometheus_client.Counter = prometheus_client.Counter(
-    "tasks_completed", "Number of tasks that were completed", ["outcome"]
-)
-METRIC_TASKS_COMPLETED_OUTCOME_ALL = "all"
-METRIC_TASKS_COMPLETED_OUTCOME_SUCCESS = "success"
-METRIC_TASKS_COMPLETED_OUTCOME_ERROR_CUSTOMER_CODE = "error_customer_code"
-METRIC_TASKS_COMPLETED_OUTCOME_ERROR_PLATFORM = "error_platform"
-metric_tasks_completed.labels(outcome=METRIC_TASKS_COMPLETED_OUTCOME_ALL)
-metric_tasks_completed.labels(outcome=METRIC_TASKS_COMPLETED_OUTCOME_SUCCESS)
-metric_tasks_completed.labels(
-    outcome=METRIC_TASKS_COMPLETED_OUTCOME_ERROR_CUSTOMER_CODE
-)
-metric_tasks_completed.labels(outcome=METRIC_TASKS_COMPLETED_OUTCOME_ERROR_PLATFORM)
-metric_task_completion_latency: prometheus_client.Histogram = (
-    latency_metric_for_customer_controlled_operation(
-        "task_completion",
-        "task completion from the moment it got fetched until its outcome got reported",
-    )
-)
-
-# Task outcome reporting metrics.
-metric_task_outcome_reports: prometheus_client.Counter = prometheus_client.Counter(
-    "task_outcome_reports",
-    "Number of task outcome reports",
-)
-metric_tasks_reporting_outcome: prometheus_client.Gauge = prometheus_client.Gauge(
-    "tasks_reporting_outcome",
-    "Number of tasks currently reporting their outcomes",
-)
-metric_task_outcome_report_latency: prometheus_client.Histogram = (
-    latency_metric_for_fast_operation("task_outcome_report", "task outcome report")
-)
-metric_task_outcome_report_retries: prometheus_client.Counter = (
-    prometheus_client.Counter(
-        "tasks_outcome_report_retries", "Number of task outcome report retries"
-    )
-)
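The removed task statistics appear to have per-controller successors under function_executor_controller/metrics/ (see completed_task_metrics.py in the file list above). For reference, the removed definitions follow the stock prometheus_client pattern of pre-registering each label combination so every series is exported before its first increment; a minimal, self-contained illustration of that pattern:

    import prometheus_client

    # A labeled counter; calling .labels() up front pre-registers each
    # combination so it is exported even before the first increment.
    tasks_completed = prometheus_client.Counter(
        "tasks_completed", "Number of tasks that were completed", ["outcome"]
    )
    for outcome in ("all", "success", "error_customer_code", "error_platform"):
        tasks_completed.labels(outcome=outcome)

    tasks_completed.labels(outcome="success").inc()  # record one successful task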
indexify/executor/monitoring/health_checker/generic_health_checker.py
@@ -1,9 +1,3 @@
-from typing import Optional
-
-from ...function_executor.function_executor_states_container import (
-    FunctionExecutorStatesContainer,
-)
-from ...function_executor.function_executor_status import FunctionExecutorStatus
 from .health_checker import HealthChecker, HealthCheckResult
 
 HEALTH_CHECKER_NAME = "GenericHealthChecker"
@@ -16,58 +10,11 @@ class GenericHealthChecker(HealthChecker):
     """
 
     def __init__(self):
-
-        self._function_executor_health_check_ever_failed = False
-
-    def set_function_executor_states_container(
-        self, states: FunctionExecutorStatesContainer
-    ):
-        self._function_executor_states = states
+        pass
 
     async def check(self) -> HealthCheckResult:
-
-
-
-
-
-        )
-
-        # Current health check policy and reasoning:
-        # * A Function Executor health check failure is a strong signal that something is wrong
-        #   either with:
-        #   - The Function Code (a critical software bug).
-        #   - The Executor machine/container/VM (a software bug or malfunctioning local hardware).
-        # * Critical Function Code bugs tend to get fixed eventually by users. What doesn't get fixed eventually
-        #   is rare but recurring local Executor issues like hardware errors and software bugs in middleware like
-        #   drivers.
-        # * Such issues tend to get mitigated by automatically recreating the Executor machine/VM/container.
-        # * So we fail the whole Executor health check if a Function Executor health check ever failed to hint the users
-        #   that we probably need to recreate the Executor machine/VM/container (unless there's a bug in Function
-        #   code that user can investigate themselves).
-        await self._check_function_executors()
-        if self._function_executor_health_check_ever_failed:
-            return HealthCheckResult(
-                is_success=False,
-                status_message="A Function Executor health check failed",
-                checker_name=HEALTH_CHECKER_NAME,
-            )
-        else:
-            return HealthCheckResult(
-                is_success=True,
-                status_message="All Function Executors pass health checks",
-                checker_name=HEALTH_CHECKER_NAME,
-            )
-
-    async def _check_function_executors(self):
-        if self._function_executor_health_check_ever_failed:
-            return
-
-        async for state in self._function_executor_states:
-            # No need to async lock the state to read a single value.
-            if state.status in [
-                FunctionExecutorStatus.UNHEALTHY,
-                FunctionExecutorStatus.STARTUP_FAILED_CUSTOMER_ERROR,
-                FunctionExecutorStatus.STARTUP_FAILED_PLATFORM_ERROR,
-            ]:
-                self._function_executor_health_check_ever_failed = True
-                return
+        return HealthCheckResult(
+            is_success=True,
+            status_message="The health check is always successful",
+            checker_name=HEALTH_CHECKER_NAME,
+        )
indexify/executor/monitoring/health_checker/health_checker.py
@@ -1,8 +1,3 @@
-from ...function_executor.function_executor_states_container import (
-    FunctionExecutorStatesContainer,
-)
-
-
 class HealthCheckResult:
     def __init__(self, checker_name: str, is_success: bool, status_message: str):
         self.checker_name = checker_name
@@ -13,11 +8,5 @@ class HealthCheckResult:
 class HealthChecker:
     """Abstract base class for health checkers."""
 
-    def set_function_executor_states_container(
-        self, states: FunctionExecutorStatesContainer
-    ):
-        """Provides function executor states to this health checker so it can use them in the health checks."""
-        raise NotImplementedError("Subclasses must implement this method.")
-
     async def check(self) -> HealthCheckResult:
         raise NotImplementedError("Subclasses must implement this method.")