indexify 0.3.9__py3-none-any.whl → 0.3.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- indexify/cli/cli.py +35 -6
- indexify/executor/api_objects.py +4 -0
- indexify/executor/downloader.py +45 -5
- indexify/executor/executor.py +103 -16
- indexify/executor/function_executor/function_executor.py +174 -55
- indexify/executor/function_executor/function_executor_state.py +6 -0
- indexify/executor/function_executor/function_executor_states_container.py +64 -0
- indexify/executor/function_executor/health_checker.py +20 -10
- indexify/executor/function_executor/invocation_state_client.py +31 -6
- indexify/executor/function_executor/metrics/function_executor.py +142 -0
- indexify/executor/function_executor/metrics/function_executor_state.py +10 -0
- indexify/executor/function_executor/metrics/function_executor_state_container.py +10 -0
- indexify/executor/function_executor/metrics/health_checker.py +14 -0
- indexify/executor/function_executor/metrics/invocation_state_client.py +45 -0
- indexify/executor/function_executor/metrics/single_task_runner.py +22 -0
- indexify/executor/function_executor/single_task_runner.py +44 -15
- indexify/executor/function_executor/task_output.py +7 -1
- indexify/executor/metrics/downloader.py +69 -0
- indexify/executor/metrics/executor.py +51 -0
- indexify/executor/metrics/task_fetcher.py +21 -0
- indexify/executor/metrics/task_reporter.py +22 -0
- indexify/executor/metrics/task_runner.py +45 -0
- indexify/executor/monitoring/function_allowlist.py +25 -0
- indexify/executor/monitoring/handler.py +8 -0
- indexify/executor/monitoring/health_check_handler.py +20 -0
- indexify/executor/monitoring/health_checker/generic_health_checker.py +58 -0
- indexify/executor/monitoring/health_checker/health_checker.py +23 -0
- indexify/executor/monitoring/metrics.py +245 -0
- indexify/executor/monitoring/prometheus_metrics_handler.py +18 -0
- indexify/executor/monitoring/server.py +41 -0
- indexify/executor/monitoring/startup_probe_handler.py +17 -0
- indexify/executor/task_fetcher.py +15 -1
- indexify/executor/task_reporter.py +24 -7
- indexify/executor/task_runner.py +64 -46
- {indexify-0.3.9.dist-info → indexify-0.3.10.dist-info}/METADATA +4 -2
- indexify-0.3.10.dist-info/RECORD +46 -0
- indexify-0.3.9.dist-info/RECORD +0 -25
- {indexify-0.3.9.dist-info → indexify-0.3.10.dist-info}/WHEEL +0 -0
- {indexify-0.3.9.dist-info → indexify-0.3.10.dist-info}/entry_points.txt +0 -0
indexify/executor/function_executor/function_executor.py

@@ -13,6 +13,33 @@ from tensorlake.utils.http_client import get_httpx_client
 
 from .health_checker import HealthChecker
 from .invocation_state_client import InvocationStateClient
+from .metrics.function_executor import (
+    metric_create_errors,
+    metric_create_health_checker_errors,
+    metric_create_health_checker_latency,
+    metric_create_invocation_state_client_errors,
+    metric_create_invocation_state_client_latency,
+    metric_create_latency,
+    metric_create_server_errors,
+    metric_create_server_latency,
+    metric_creations,
+    metric_destroy_channel_errors,
+    metric_destroy_channel_latency,
+    metric_destroy_errors,
+    metric_destroy_health_checker_errors,
+    metric_destroy_health_checker_latency,
+    metric_destroy_invocation_state_client_errors,
+    metric_destroy_invocation_state_client_latency,
+    metric_destroy_latency,
+    metric_destroy_server_errors,
+    metric_destroy_server_latency,
+    metric_destroys,
+    metric_establish_channel_errors,
+    metric_establish_channel_latency,
+    metric_function_executors_count,
+    metric_initialize_rpc_errors,
+    metric_initialize_rpc_latency,
+)
 from .server.function_executor_server import (
     FUNCTION_EXECUTOR_SERVER_READY_TIMEOUT_SEC,
     FunctionExecutorServer,
@@ -47,6 +74,7 @@ class FunctionExecutor:
         self._invocation_state_client: Optional[InvocationStateClient] = None
         self._health_checker: Optional[HealthChecker] = None
         self._initialized = False
+        metric_function_executors_count.inc()
 
     async def initialize(
         self,
@@ -60,31 +88,23 @@ class FunctionExecutor:
         Raises CustomerError if the server failed to initialize due to an error in customer owned code or data.
         Raises an Exception if an internal error occured."""
         try:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            await self._invocation_state_client.start()
-
-            self._health_checker = HealthChecker(
-                stub=stub,
-                logger=self._logger,
-            )
-
-            self._initialized = True
+            with (
+                metric_create_errors.count_exceptions(),
+                metric_create_latency.time(),
+            ):
+                metric_creations.inc()
+                await self._create_server(config)
+                await self._establish_channel()
+                stub: FunctionExecutorStub = FunctionExecutorStub(self._channel)
+                await _initialize_server(stub, initialize_request)
+                await self._create_invocation_state_client(
+                    stub=stub,
+                    base_url=base_url,
+                    config_path=config_path,
+                    initialize_request=initialize_request,
+                )
+                await self._create_health_checker(stub)
+                self._initialized = True
         except Exception:
             await self.destroy()
             raise
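The rewritten initialize() above leans on two prometheus_client context managers that recur throughout this release: `Counter.count_exceptions()` increments a counter only when the wrapped block raises, and `Histogram.time()` observes how long the block took. A minimal, self-contained sketch of that pattern (the metric names below are illustrative, not the ones indexify registers):

```python
import time

import prometheus_client

# Illustrative metrics; indexify's own live in metrics/function_executor.py.
demo_errors = prometheus_client.Counter("demo_step_errors", "Errors in the demo step")
demo_latency = prometheus_client.Histogram("demo_step_latency_seconds", "Demo step latency")


def demo_step(fail: bool) -> None:
    # count_exceptions() bumps demo_errors only if the body raises;
    # time() records the elapsed wall-clock time either way.
    with demo_errors.count_exceptions(), demo_latency.time():
        time.sleep(0.01)
        if fail:
            raise RuntimeError("simulated failure")


demo_step(fail=False)
try:
    demo_step(fail=True)
except RuntimeError:
    pass

# One failure was counted, two durations were observed.
print(prometheus_client.REGISTRY.get_sample_value("demo_step_errors_total"))  # 1.0
print(prometheus_client.REGISTRY.get_sample_value("demo_step_latency_seconds_count"))  # 2.0
```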
@@ -106,56 +126,155 @@ class FunctionExecutor:
 
         Never raises any exceptions but logs them."""
         try:
-
-
-
+            with (
+                metric_destroy_errors.count_exceptions(),
+                metric_destroy_latency.time(),
+            ):
+                metric_function_executors_count.dec()
+                metric_destroys.inc()
+                await self._destroy_health_checker()
+                await self._destroy_invocation_state_client()
+                await self._destroy_channel()
+                await self._destroy_server()
         except Exception as e:
-            self._logger.error(
+            self._logger.error(
+                "exception from a Function Executor destroy step, some destroy steps are not executed, this is a resource leak",
+                exc_info=e,
+            )
+
+    def _check_initialized(self) -> None:
+        if not self._initialized:
+            raise RuntimeError("FunctionExecutor is not initialized")
+
+    async def _create_server(self, config: FunctionExecutorServerConfiguration) -> None:
+        with (
+            metric_create_server_errors.count_exceptions(),
+            metric_create_server_latency.time(),
+        ):
+            self._server = await self._server_factory.create(
+                config=config, logger=self._logger
+            )
+
+    async def _destroy_server(self) -> None:
+        if self._server is None:
+            return
 
         try:
-
-
-
+            with (
+                metric_destroy_server_errors.count_exceptions(),
+                metric_destroy_server_latency.time(),
+            ):
+                await self._server_factory.destroy(self._server, self._logger)
         except Exception as e:
-            self._logger.error(
-
+            self._logger.error("failed to destroy FunctionExecutorServer", exc_info=e)
+        finally:
+            self._server = None
+
+    async def _establish_channel(self) -> None:
+        with (
+            metric_establish_channel_errors.count_exceptions(),
+            metric_establish_channel_latency.time(),
+        ):
+            self._channel = await self._server.create_channel(self._logger)
+            await asyncio.wait_for(
+                self._channel.channel_ready(),
+                timeout=FUNCTION_EXECUTOR_SERVER_READY_TIMEOUT_SEC,
             )
 
+    async def _destroy_channel(self) -> None:
+        if self._channel is None:
+            return
+
         try:
-
+            with (
+                metric_destroy_channel_errors.count_exceptions(),
+                metric_destroy_channel_latency.time(),
+            ):
                 await self._channel.close()
-            self._channel = None
         except Exception as e:
             self._logger.error(
                 "failed to close FunctionExecutorServer channel", exc_info=e
             )
+        finally:
+            self._channel = None
+
+    async def _create_invocation_state_client(
+        self,
+        stub: FunctionExecutorStub,
+        base_url: str,
+        config_path: Optional[str],
+        initialize_request: InitializeRequest,
+    ) -> None:
+        with (
+            metric_create_invocation_state_client_errors.count_exceptions(),
+            metric_create_invocation_state_client_latency.time(),
+        ):
+            self._invocation_state_client = InvocationStateClient(
+                stub=stub,
+                base_url=base_url,
+                http_client=get_httpx_client(config_path=config_path, make_async=True),
+                graph=initialize_request.graph_name,
+                namespace=initialize_request.namespace,
+                logger=self._logger,
+            )
+            await self._invocation_state_client.start()
+
+    async def _destroy_invocation_state_client(self) -> None:
+        if self._invocation_state_client is None:
+            return
 
         try:
-
-
-
+            with (
+                metric_destroy_invocation_state_client_errors.count_exceptions(),
+                metric_destroy_invocation_state_client_latency.time(),
+            ):
+                await self._invocation_state_client.destroy()
         except Exception as e:
-            self._logger.error(
+            self._logger.error(
+                "failed to destroy FunctionExecutor invocation state client", exc_info=e
+            )
+        finally:
+            self._invocation_state_client = None
 
-    def
-
-
+    async def _create_health_checker(self, stub: FunctionExecutorStub) -> None:
+        with (
+            metric_create_health_checker_errors.count_exceptions(),
+            metric_create_health_checker_latency.time(),
+        ):
+            self._health_checker = HealthChecker(
+                stub=stub,
+                logger=self._logger,
+            )
 
+    async def _destroy_health_checker(self) -> None:
+        if self._health_checker is None:
+            return
 
-
-
-
-
-
+        try:
+            with (
+                metric_destroy_health_checker_errors.count_exceptions(),
+                metric_destroy_health_checker_latency.time(),
+            ):
+                self._health_checker.stop()
+        except Exception as e:
+            self._logger.error("failed to stop HealthChecker", exc_info=e)
+        finally:
+            self._health_checker = None
 
 
 async def _initialize_server(
     stub: FunctionExecutorStub, initialize_request: InitializeRequest
 ):
-
-
-
-
-
-
-
+    with (
+        metric_initialize_rpc_errors.count_exceptions(),
+        metric_initialize_rpc_latency.time(),
+    ):
+        initialize_response: InitializeResponse = await stub.initialize(
+            initialize_request
+        )
+        if initialize_response.success:
+            return
+        if initialize_response.HasField("customer_error"):
+            raise CustomerError(initialize_response.customer_error)
+        else:
+            raise Exception("initialize RPC failed at function executor server")
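destroy() and the `_destroy_*` helpers above all follow the same best-effort teardown shape: return early if the resource was never created, log failures instead of propagating them, and clear the reference in `finally` so the method stays safe to call again (destroy() is also invoked from initialize()'s error path). A condensed, stand-alone sketch of that shape, using hypothetical Resource/Holder classes rather than indexify's:

```python
import asyncio
import logging
from typing import Optional

logger = logging.getLogger("teardown-demo")


class Resource:
    async def close(self) -> None:
        print("resource closed")


class Holder:
    def __init__(self) -> None:
        self._resource: Optional[Resource] = None

    async def create(self) -> None:
        self._resource = Resource()

    async def destroy(self) -> None:
        # Safe to call repeatedly, and even if create() never ran or failed midway.
        if self._resource is None:
            return
        try:
            await self._resource.close()
        except Exception as e:
            # Never raise from teardown; just record the (potential) leak.
            logger.error("failed to close resource", exc_info=e)
        finally:
            self._resource = None


async def main() -> None:
    holder = Holder()
    await holder.create()
    await holder.destroy()  # closes the resource
    await holder.destroy()  # no-op on the second call


asyncio.run(main())
```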
indexify/executor/function_executor/function_executor_state.py

@@ -2,6 +2,9 @@ import asyncio
 from typing import Optional
 
 from .function_executor import FunctionExecutor
+from .metrics.function_executor_state import (
+    metric_function_executor_state_not_locked_errors,
+)
 
 
 class FunctionExecutorState:
@@ -18,6 +21,8 @@ class FunctionExecutorState:
         # All the fields below are protected by the lock.
         self.lock: asyncio.Lock = asyncio.Lock()
         self.is_shutdown: bool = False
+        # Set to True if a Function Executor health check ever failed.
+        self.health_check_failed: bool = False
         self.function_executor: Optional[FunctionExecutor] = None
         self.running_tasks: int = 0
         self.running_tasks_change_notifier: asyncio.Condition = asyncio.Condition(
@@ -75,4 +80,5 @@ class FunctionExecutorState:
     def check_locked(self) -> None:
         """Raises an exception if the lock is not held."""
         if not self.lock.locked():
+            metric_function_executor_state_not_locked_errors.inc()
             raise RuntimeError("The FunctionExecutorState lock must be held.")
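check_locked() now counts violations before raising. It relies on asyncio.Lock.locked(), which only verifies that someone currently holds the lock, so it is a best-effort guard rather than an ownership check. A small sketch of the intended calling pattern (the Guarded class is hypothetical, standing in for FunctionExecutorState):

```python
import asyncio


class Guarded:
    def __init__(self) -> None:
        self.lock = asyncio.Lock()
        self.value = 0

    def check_locked(self) -> None:
        """Raises if self.lock is not currently held."""
        if not self.lock.locked():
            raise RuntimeError("The lock must be held.")

    def bump(self) -> None:
        # Callers are expected to hold self.lock while mutating protected fields.
        self.check_locked()
        self.value += 1


async def main() -> None:
    g = Guarded()
    async with g.lock:
        g.bump()  # fine: the lock is held
    try:
        g.bump()  # raises: the lock is not held
    except RuntimeError as e:
        print(e)


asyncio.run(main())
```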
|
@@ -0,0 +1,64 @@
|
|
1
|
+
import asyncio
|
2
|
+
from typing import AsyncGenerator, Dict
|
3
|
+
|
4
|
+
from ..api_objects import Task
|
5
|
+
from .function_executor_state import FunctionExecutorState
|
6
|
+
from .metrics.function_executor_state_container import (
|
7
|
+
metric_function_executor_states_count,
|
8
|
+
)
|
9
|
+
|
10
|
+
|
11
|
+
class FunctionExecutorStatesContainer:
|
12
|
+
"""An asyncio concurrent container for the function executor states."""
|
13
|
+
|
14
|
+
def __init__(self):
|
15
|
+
# The fields below are protected by the lock.
|
16
|
+
self._lock: asyncio.Lock = asyncio.Lock()
|
17
|
+
self._states: Dict[str, FunctionExecutorState] = {}
|
18
|
+
self._is_shutdown: bool = False
|
19
|
+
|
20
|
+
async def get_or_create_state(self, task: Task) -> FunctionExecutorState:
|
21
|
+
"""Get or create a function executor state for the given task.
|
22
|
+
|
23
|
+
Raises Exception if it's not possible to create a new state at this time."""
|
24
|
+
async with self._lock:
|
25
|
+
if self._is_shutdown:
|
26
|
+
raise RuntimeError("Task runner is shutting down.")
|
27
|
+
|
28
|
+
id = function_id_without_version(task)
|
29
|
+
if id not in self._states:
|
30
|
+
state = FunctionExecutorState(
|
31
|
+
function_id_with_version=function_id_with_version(task),
|
32
|
+
function_id_without_version=id,
|
33
|
+
)
|
34
|
+
self._states[id] = state
|
35
|
+
metric_function_executor_states_count.set(len(self._states))
|
36
|
+
|
37
|
+
return self._states[id]
|
38
|
+
|
39
|
+
async def __aiter__(self) -> AsyncGenerator[FunctionExecutorState, None]:
|
40
|
+
async with self._lock:
|
41
|
+
for state in self._states.values():
|
42
|
+
yield state
|
43
|
+
|
44
|
+
async def shutdown(self):
|
45
|
+
# Function Executors are outside the Executor process
|
46
|
+
# so they need to get cleaned up explicitly and reliably.
|
47
|
+
async with self._lock:
|
48
|
+
self._is_shutdown = True # No new Function Executor States can be created.
|
49
|
+
while self._states:
|
50
|
+
id, state = self._states.popitem()
|
51
|
+
metric_function_executor_states_count.set(len(self._states))
|
52
|
+
# Only ongoing tasks who have a reference to the state already can see it.
|
53
|
+
# The state is unlocked while a task is running inside Function Executor.
|
54
|
+
async with state.lock:
|
55
|
+
await state.shutdown()
|
56
|
+
# The task running inside the Function Executor will fail because it's destroyed.
|
57
|
+
|
58
|
+
|
59
|
+
def function_id_with_version(task: Task) -> str:
|
60
|
+
return f"versioned/{task.namespace}/{task.compute_graph}/{task.graph_version}/{task.compute_fn}"
|
61
|
+
|
62
|
+
|
63
|
+
def function_id_without_version(task: Task) -> str:
|
64
|
+
return f"not_versioned/{task.namespace}/{task.compute_graph}/{task.compute_fn}"
|
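The container keys its entries by function_id_without_version(), so tasks targeting different versions of the same graph share one FunctionExecutorState slot, while function_id_with_version() still records the exact version. A runnable illustration using a stripped-down stand-in for api_objects.Task (the real model has more fields):

```python
from dataclasses import dataclass


# Stand-in for indexify.executor.api_objects.Task with only the fields the
# key helpers read.
@dataclass
class Task:
    namespace: str
    compute_graph: str
    graph_version: str
    compute_fn: str


def function_id_with_version(task: Task) -> str:
    return f"versioned/{task.namespace}/{task.compute_graph}/{task.graph_version}/{task.compute_fn}"


def function_id_without_version(task: Task) -> str:
    return f"not_versioned/{task.namespace}/{task.compute_graph}/{task.compute_fn}"


t1 = Task("acme", "ingest", "1", "extract")
t2 = Task("acme", "ingest", "2", "extract")

# Same container key for both graph versions...
print(function_id_without_version(t1) == function_id_without_version(t2))  # True
# ...but the versioned id still tells them apart.
print(function_id_with_version(t1) == function_id_with_version(t2))  # False
```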
indexify/executor/function_executor/health_checker.py

@@ -11,6 +11,10 @@ from tensorlake.function_executor.proto.function_executor_pb2_grpc import (
     FunctionExecutorStub,
 )
 
+from .metrics.health_checker import (
+    metric_failed_health_checks,
+    metric_health_check_latency,
+)
 from .server.client_configuration import HEALTH_CHECK_TIMEOUT_SEC
 
 HEALTH_CHECK_POLL_PERIOD_SEC = 10
@@ -29,16 +33,22 @@ class HealthChecker:
         """Runs the health check once and returns the result.
 
         Does not raise any exceptions."""
-
-
-
-
-
-
-
-
-
-
+        with metric_health_check_latency.time():
+            try:
+                response: HealthCheckResponse = await self._stub.check_health(
+                    HealthCheckRequest(), timeout=HEALTH_CHECK_TIMEOUT_SEC
+                )
+                if not response.healthy:
+                    metric_failed_health_checks.inc()
+                return response.healthy
+            except AioRpcError:
+                metric_failed_health_checks.inc()
+                # Expected exception when there are problems with communication because e.g. the server is unhealthy.
+                return False
+            except Exception as e:
+                metric_failed_health_checks.inc()
+                self._logger.warning("Got unexpected exception, ignoring", exc_info=e)
+                return False
 
     def start(self, callback: Callable[[], Awaitable[None]]) -> None:
         """Starts periodic health checks.
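start() takes an async callback that fires when the Function Executor becomes unhealthy; the executor presumably uses it to set the new health_check_failed flag on the corresponding state. A simplified, stand-alone version of such a poll loop (not indexify's implementation, and with a much shorter period than the real HEALTH_CHECK_POLL_PERIOD_SEC of 10 seconds):

```python
import asyncio
import itertools
from typing import Awaitable, Callable

POLL_PERIOD_SEC = 0.1  # the real poller waits HEALTH_CHECK_POLL_PERIOD_SEC = 10

# Fake health results: healthy once, then unhealthy forever.
_results = itertools.chain([True], itertools.repeat(False))


async def check_health() -> bool:
    # Stand-in for the check_health gRPC used by HealthChecker.
    return next(_results)


async def poll(callback: Callable[[], Awaitable[None]]) -> None:
    # Keep polling until a health check fails, then notify once and stop.
    while await check_health():
        await asyncio.sleep(POLL_PERIOD_SEC)
    await callback()


async def on_unhealthy() -> None:
    # In the executor this callback is where the task runner reacts, e.g. by
    # marking the FunctionExecutorState's health_check_failed flag.
    print("function executor became unhealthy")


asyncio.run(poll(on_unhealthy))
```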
indexify/executor/function_executor/invocation_state_client.py

@@ -16,6 +16,15 @@ from tensorlake.function_executor.proto.function_executor_pb2_grpc import (
 from tensorlake.function_executor.proto.message_validator import MessageValidator
 
 from ..downloader import serialized_object_from_http_response
+from .metrics.invocation_state_client import (
+    metric_request_read_errors,
+    metric_server_get_state_request_errors,
+    metric_server_get_state_request_latency,
+    metric_server_get_state_requests,
+    metric_server_set_state_request_errors,
+    metric_server_set_state_request_latency,
+    metric_server_set_state_requests,
+)
 
 
 class InvocationStateClient:
@@ -92,6 +101,12 @@ class InvocationStateClient:
         except asyncio.CancelledError:
             # This async task was cancelled by destroy(). Normal situation too.
             pass
+        except Exception as e:
+            metric_request_read_errors.inc()
+            self._logger.error(
+                "failed to read request from server, shutting down invocation state client",
+                exc_info=e,
+            )
 
     async def _process_request_no_raise(self, request: InvocationStateRequest) -> None:
         try:
@@ -122,9 +137,14 @@ class InvocationStateClient:
         # a privelege escalation attempt.
         invocation_id: str = self._task_id_to_invocation_id[request.task_id]
         if request.HasField("get"):
-
-
-
+            with (
+                metric_server_get_state_request_errors.count_exceptions(),
+                metric_server_get_state_request_latency.time(),
+            ):
+                metric_server_get_state_requests.inc()
+                value: Optional[SerializedObject] = await self._get_server_state(
+                    invocation_id, request.get.key
+                )
             await self._client_response_queue.put(
                 InvocationStateResponse(
                     request_id=request.request_id,
@@ -136,9 +156,14 @@
                 )
             )
         elif request.HasField("set"):
-
-
-
+            with (
+                metric_server_set_state_request_errors.count_exceptions(),
+                metric_server_set_state_request_latency.time(),
+            ):
+                metric_server_set_state_requests.inc()
+                await self._set_server_state(
+                    invocation_id, request.set.key, request.set.value
+                )
             await self._client_response_queue.put(
                 InvocationStateResponse(
                     request_id=request.request_id,
indexify/executor/function_executor/metrics/function_executor.py (new file)

@@ -0,0 +1,142 @@
+import prometheus_client
+
+from ...monitoring.metrics import (
+    latency_metric_for_customer_controlled_operation,
+    latency_metric_for_fast_operation,
+    latency_metric_for_slow_operation,
+)
+
+# This file contains all metrics used by FunctionExecutor.
+
+metric_function_executors_count = prometheus_client.Gauge(
+    "function_executors_count", "Number of existing Function Executors"
+)
+
+# Metrics about whole FE creation workflow.
+metric_creations: prometheus_client.Counter = prometheus_client.Counter(
+    "function_executor_creates",
+    "Number of Function Executor creations",
+)
+metric_create_latency: prometheus_client.Histogram = (
+    latency_metric_for_customer_controlled_operation(
+        "function_executor_create", "Function Executor creation (aka cold start)"
+    )
+)
+metric_create_errors: prometheus_client.Counter = prometheus_client.Counter(
+    "function_executor_create_errors", "Number of Function Executor creation errors"
+)
+
+# Metrics about whole FE destroy workflow.
+metric_destroys: prometheus_client.Counter = prometheus_client.Counter(
+    "function_executor_destroys", "Number of Function Executor destructions"
+)
+metric_destroy_latency: prometheus_client.Histogram = latency_metric_for_slow_operation(
+    "function_executor_destroy", "Function Executor destruction"
+)
+metric_destroy_errors: prometheus_client.Counter = prometheus_client.Counter(
+    "function_executor_destroy_errors",
+    "Number of Function Executor destruction errors, results in a resource leak",
+)
+
+# FE server create and destruction metrics.
+metric_create_server_latency: prometheus_client.Histogram = (
+    latency_metric_for_slow_operation(
+        "function_executor_create_server", "Function Executor server creation"
+    )
+)
+metric_create_server_errors: prometheus_client.Counter = prometheus_client.Counter(
+    "function_executor_create_server_errors",
+    "Number of Function Executor server creation errors",
+)
+metric_destroy_server_latency: prometheus_client.Histogram = (
+    latency_metric_for_slow_operation(
+        "function_executor_destroy_server", "Function Executor server destruction"
+    )
+)
+metric_destroy_server_errors: prometheus_client.Counter = prometheus_client.Counter(
+    "function_executor_destroy_server_errors",
+    "Number of Function Executor server destruction errors",
+)
+
+# FE channel creation and destruction metrics.
+metric_establish_channel_latency: prometheus_client.Histogram = (
+    latency_metric_for_fast_operation(
+        "function_executor_establish_channel", "Function Executor channel establishment"
+    )
+)
+metric_establish_channel_errors: prometheus_client.Counter = prometheus_client.Counter(
+    "function_executor_establish_channel_errors",
+    "Number of Function Executor channel establishment errors",
+)
+metric_destroy_channel_latency: prometheus_client.Histogram = (
+    latency_metric_for_fast_operation(
+        "function_executor_destroy_channel", "Function Executor channel destruction"
+    )
+)
+metric_destroy_channel_errors: prometheus_client.Counter = prometheus_client.Counter(
+    "function_executor_destroy_channel_errors",
+    "Number of Function Executor channel destruction errors",
+)
+
+# FE initialization RPC metrics.
+metric_initialize_rpc_latency: prometheus_client.Histogram = (
+    latency_metric_for_customer_controlled_operation(
+        "function_executor_initialize_rpc", "Function Executor initialize RPC"
+    )
+)
+metric_initialize_rpc_errors: prometheus_client.Counter = prometheus_client.Counter(
+    "function_executor_initialize_rpc_errors",
+    "Number of Function Executor initialize RPC errors",
+)
+
+# FE invocation state client creation and destruction metrics.
+metric_create_invocation_state_client_latency: prometheus_client.Histogram = (
+    latency_metric_for_fast_operation(
+        "function_executor_create_invocation_state_client",
+        "Function Executor invocation state client creation",
+    )
+)
+metric_create_invocation_state_client_errors: prometheus_client.Counter = (
+    prometheus_client.Counter(
+        "function_executor_create_invocation_state_client_errors",
+        "Number of Function Executor invocation state client creation errors",
+    )
+)
+metric_destroy_invocation_state_client_latency: prometheus_client.Histogram = (
+    latency_metric_for_fast_operation(
+        "function_executor_destroy_invocation_state_client",
+        "Function Executor invocation state client destruction",
+    )
+)
+metric_destroy_invocation_state_client_errors: prometheus_client.Counter = (
+    prometheus_client.Counter(
+        "function_executor_destroy_invocation_state_client_errors",
+        "Number of Function Executor invocation state client destruction errors",
+    )
+)
+
+# FE health checker creation and destruction metrics.
+metric_create_health_checker_latency: prometheus_client.Histogram = (
+    latency_metric_for_fast_operation(
+        "function_executor_create_health_checker",
+        "Function Executor health checker creation",
+    )
+)
+metric_create_health_checker_errors: prometheus_client.Counter = (
+    prometheus_client.Counter(
+        "function_executor_create_health_checker_errors",
+        "Number of Function Executor health checker creation errors",
+    )
+)
+metric_destroy_health_checker_latency: prometheus_client.Histogram = (
+    latency_metric_for_fast_operation(
+        "function_executor_destroy_health_checker",
+        "Function Executor health checker destruction",
+    )
+)
+metric_destroy_health_checker_errors: prometheus_client.Counter = (
+    prometheus_client.Counter(
+        "function_executor_destroy_health_checker_errors",
+        "Number of Function Executor health checker destruction errors",
+    )
+)
indexify/executor/function_executor/metrics/function_executor_state.py (new file)

@@ -0,0 +1,10 @@
+import prometheus_client
+
+# This file contains all metrics used by FunctionExecutorState.
+
+metric_function_executor_state_not_locked_errors: prometheus_client.Counter = (
+    prometheus_client.Counter(
+        "function_executor_state_not_locked_errors",
+        "Number of times a Function Executor state was used without acquiring its lock",
+    )
+)
indexify/executor/function_executor/metrics/function_executor_state_container.py (new file)

@@ -0,0 +1,10 @@
+import prometheus_client
+
+# This file contains all metrics used by FunctionExecutorStatesContainer.
+
+metric_function_executor_states_count: prometheus_client.Gauge = (
+    prometheus_client.Gauge(
+        "function_executor_states_count",
+        "Number of existing Function Executor states",
+    )
+)
indexify/executor/function_executor/metrics/health_checker.py (new file)

@@ -0,0 +1,14 @@
+import prometheus_client
+
+from ...monitoring.metrics import latency_metric_for_fast_operation
+
+# This file contains all metrics used by HealthChecker.
+
+metric_failed_health_checks = prometheus_client.Counter(
+    "function_executor_failed_health_checks",
+    "Number of health checks that were not successful",
+)
+metric_health_check_latency = latency_metric_for_fast_operation(
+    "function_executor_health_check",
+    "Function Executor health check",
+)