indexify 0.3.8__py3-none-any.whl → 0.3.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. indexify/cli/cli.py +38 -78
  2. indexify/executor/api_objects.py +4 -0
  3. indexify/executor/downloader.py +45 -5
  4. indexify/executor/executor.py +103 -16
  5. indexify/executor/function_executor/function_executor.py +174 -55
  6. indexify/executor/function_executor/function_executor_state.py +6 -0
  7. indexify/executor/function_executor/function_executor_states_container.py +64 -0
  8. indexify/executor/function_executor/health_checker.py +20 -10
  9. indexify/executor/function_executor/invocation_state_client.py +31 -6
  10. indexify/executor/function_executor/metrics/function_executor.py +142 -0
  11. indexify/executor/function_executor/metrics/function_executor_state.py +10 -0
  12. indexify/executor/function_executor/metrics/function_executor_state_container.py +10 -0
  13. indexify/executor/function_executor/metrics/health_checker.py +14 -0
  14. indexify/executor/function_executor/metrics/invocation_state_client.py +45 -0
  15. indexify/executor/function_executor/metrics/single_task_runner.py +22 -0
  16. indexify/executor/function_executor/single_task_runner.py +44 -15
  17. indexify/executor/function_executor/task_output.py +7 -1
  18. indexify/executor/metrics/downloader.py +69 -0
  19. indexify/executor/metrics/executor.py +51 -0
  20. indexify/executor/metrics/task_fetcher.py +21 -0
  21. indexify/executor/metrics/task_reporter.py +22 -0
  22. indexify/executor/metrics/task_runner.py +45 -0
  23. indexify/executor/monitoring/function_allowlist.py +25 -0
  24. indexify/executor/monitoring/handler.py +8 -0
  25. indexify/executor/monitoring/health_check_handler.py +20 -0
  26. indexify/executor/monitoring/health_checker/generic_health_checker.py +58 -0
  27. indexify/executor/monitoring/health_checker/health_checker.py +23 -0
  28. indexify/executor/monitoring/metrics.py +245 -0
  29. indexify/executor/monitoring/prometheus_metrics_handler.py +18 -0
  30. indexify/executor/monitoring/server.py +41 -0
  31. indexify/executor/monitoring/startup_probe_handler.py +17 -0
  32. indexify/executor/task_fetcher.py +15 -1
  33. indexify/executor/task_reporter.py +24 -7
  34. indexify/executor/task_runner.py +64 -46
  35. {indexify-0.3.8.dist-info → indexify-0.3.10.dist-info}/METADATA +4 -2
  36. indexify-0.3.10.dist-info/RECORD +46 -0
  37. indexify-0.3.8.dist-info/RECORD +0 -25
  38. {indexify-0.3.8.dist-info → indexify-0.3.10.dist-info}/WHEEL +0 -0
  39. {indexify-0.3.8.dist-info → indexify-0.3.10.dist-info}/entry_points.txt +0 -0
@@ -13,6 +13,33 @@ from tensorlake.utils.http_client import get_httpx_client
13
13
 
14
14
  from .health_checker import HealthChecker
15
15
  from .invocation_state_client import InvocationStateClient
16
+ from .metrics.function_executor import (
17
+ metric_create_errors,
18
+ metric_create_health_checker_errors,
19
+ metric_create_health_checker_latency,
20
+ metric_create_invocation_state_client_errors,
21
+ metric_create_invocation_state_client_latency,
22
+ metric_create_latency,
23
+ metric_create_server_errors,
24
+ metric_create_server_latency,
25
+ metric_creations,
26
+ metric_destroy_channel_errors,
27
+ metric_destroy_channel_latency,
28
+ metric_destroy_errors,
29
+ metric_destroy_health_checker_errors,
30
+ metric_destroy_health_checker_latency,
31
+ metric_destroy_invocation_state_client_errors,
32
+ metric_destroy_invocation_state_client_latency,
33
+ metric_destroy_latency,
34
+ metric_destroy_server_errors,
35
+ metric_destroy_server_latency,
36
+ metric_destroys,
37
+ metric_establish_channel_errors,
38
+ metric_establish_channel_latency,
39
+ metric_function_executors_count,
40
+ metric_initialize_rpc_errors,
41
+ metric_initialize_rpc_latency,
42
+ )
16
43
  from .server.function_executor_server import (
17
44
  FUNCTION_EXECUTOR_SERVER_READY_TIMEOUT_SEC,
18
45
  FunctionExecutorServer,
@@ -47,6 +74,7 @@ class FunctionExecutor:
47
74
  self._invocation_state_client: Optional[InvocationStateClient] = None
48
75
  self._health_checker: Optional[HealthChecker] = None
49
76
  self._initialized = False
77
+ metric_function_executors_count.inc()
50
78
 
51
79
  async def initialize(
52
80
  self,
@@ -60,31 +88,23 @@ class FunctionExecutor:
60
88
  Raises CustomerError if the server failed to initialize due to an error in customer owned code or data.
61
89
  Raises an Exception if an internal error occurred."""
62
90
  try:
63
- self._server = await self._server_factory.create(
64
- config=config, logger=self._logger
65
- )
66
- self._channel = await self._server.create_channel(self._logger)
67
- await _channel_ready(self._channel)
68
-
69
- stub: FunctionExecutorStub = FunctionExecutorStub(self._channel)
70
- await _initialize_server(stub, initialize_request)
71
-
72
- self._invocation_state_client = InvocationStateClient(
73
- stub=stub,
74
- base_url=base_url,
75
- http_client=get_httpx_client(config_path=config_path, make_async=True),
76
- graph=initialize_request.graph_name,
77
- namespace=initialize_request.namespace,
78
- logger=self._logger,
79
- )
80
- await self._invocation_state_client.start()
81
-
82
- self._health_checker = HealthChecker(
83
- stub=stub,
84
- logger=self._logger,
85
- )
86
-
87
- self._initialized = True
91
+ with (
92
+ metric_create_errors.count_exceptions(),
93
+ metric_create_latency.time(),
94
+ ):
95
+ metric_creations.inc()
96
+ await self._create_server(config)
97
+ await self._establish_channel()
98
+ stub: FunctionExecutorStub = FunctionExecutorStub(self._channel)
99
+ await _initialize_server(stub, initialize_request)
100
+ await self._create_invocation_state_client(
101
+ stub=stub,
102
+ base_url=base_url,
103
+ config_path=config_path,
104
+ initialize_request=initialize_request,
105
+ )
106
+ await self._create_health_checker(stub)
107
+ self._initialized = True
88
108
  except Exception:
89
109
  await self.destroy()
90
110
  raise
@@ -106,56 +126,155 @@ class FunctionExecutor:
106
126
 
107
127
  Never raises any exceptions but logs them."""
108
128
  try:
109
- if self._health_checker is not None:
110
- self._health_checker.stop()
111
- self._health_checker = None
129
+ with (
130
+ metric_destroy_errors.count_exceptions(),
131
+ metric_destroy_latency.time(),
132
+ ):
133
+ metric_function_executors_count.dec()
134
+ metric_destroys.inc()
135
+ await self._destroy_health_checker()
136
+ await self._destroy_invocation_state_client()
137
+ await self._destroy_channel()
138
+ await self._destroy_server()
112
139
  except Exception as e:
113
- self._logger.error("failed to stop HealthChecker", exc_info=e)
140
+ self._logger.error(
141
+ "exception from a Function Executor destroy step, some destroy steps are not executed, this is a resource leak",
142
+ exc_info=e,
143
+ )
144
+
145
+ def _check_initialized(self) -> None:
146
+ if not self._initialized:
147
+ raise RuntimeError("FunctionExecutor is not initialized")
148
+
149
+ async def _create_server(self, config: FunctionExecutorServerConfiguration) -> None:
150
+ with (
151
+ metric_create_server_errors.count_exceptions(),
152
+ metric_create_server_latency.time(),
153
+ ):
154
+ self._server = await self._server_factory.create(
155
+ config=config, logger=self._logger
156
+ )
157
+
158
+ async def _destroy_server(self) -> None:
159
+ if self._server is None:
160
+ return
114
161
 
115
162
  try:
116
- if self._invocation_state_client is not None:
117
- await self._invocation_state_client.destroy()
118
- self._invocation_state_client = None
163
+ with (
164
+ metric_destroy_server_errors.count_exceptions(),
165
+ metric_destroy_server_latency.time(),
166
+ ):
167
+ await self._server_factory.destroy(self._server, self._logger)
119
168
  except Exception as e:
120
- self._logger.error(
121
- "failed to destroy FunctionExecutor invocation state client", exc_info=e
169
+ self._logger.error("failed to destroy FunctionExecutorServer", exc_info=e)
170
+ finally:
171
+ self._server = None
172
+
173
+ async def _establish_channel(self) -> None:
174
+ with (
175
+ metric_establish_channel_errors.count_exceptions(),
176
+ metric_establish_channel_latency.time(),
177
+ ):
178
+ self._channel = await self._server.create_channel(self._logger)
179
+ await asyncio.wait_for(
180
+ self._channel.channel_ready(),
181
+ timeout=FUNCTION_EXECUTOR_SERVER_READY_TIMEOUT_SEC,
122
182
  )
123
183
 
184
+ async def _destroy_channel(self) -> None:
185
+ if self._channel is None:
186
+ return
187
+
124
188
  try:
125
- if self._channel is not None:
189
+ with (
190
+ metric_destroy_channel_errors.count_exceptions(),
191
+ metric_destroy_channel_latency.time(),
192
+ ):
126
193
  await self._channel.close()
127
- self._channel = None
128
194
  except Exception as e:
129
195
  self._logger.error(
130
196
  "failed to close FunctionExecutorServer channel", exc_info=e
131
197
  )
198
+ finally:
199
+ self._channel = None
200
+
201
+ async def _create_invocation_state_client(
202
+ self,
203
+ stub: FunctionExecutorStub,
204
+ base_url: str,
205
+ config_path: Optional[str],
206
+ initialize_request: InitializeRequest,
207
+ ) -> None:
208
+ with (
209
+ metric_create_invocation_state_client_errors.count_exceptions(),
210
+ metric_create_invocation_state_client_latency.time(),
211
+ ):
212
+ self._invocation_state_client = InvocationStateClient(
213
+ stub=stub,
214
+ base_url=base_url,
215
+ http_client=get_httpx_client(config_path=config_path, make_async=True),
216
+ graph=initialize_request.graph_name,
217
+ namespace=initialize_request.namespace,
218
+ logger=self._logger,
219
+ )
220
+ await self._invocation_state_client.start()
221
+
222
+ async def _destroy_invocation_state_client(self) -> None:
223
+ if self._invocation_state_client is None:
224
+ return
132
225
 
133
226
  try:
134
- if self._server is not None:
135
- await self._server_factory.destroy(self._server, self._logger)
136
- self._server = None
227
+ with (
228
+ metric_destroy_invocation_state_client_errors.count_exceptions(),
229
+ metric_destroy_invocation_state_client_latency.time(),
230
+ ):
231
+ await self._invocation_state_client.destroy()
137
232
  except Exception as e:
138
- self._logger.error("failed to destroy FunctionExecutorServer", exc_info=e)
233
+ self._logger.error(
234
+ "failed to destroy FunctionExecutor invocation state client", exc_info=e
235
+ )
236
+ finally:
237
+ self._invocation_state_client = None
139
238
 
140
- def _check_initialized(self):
141
- if not self._initialized:
142
- raise RuntimeError("FunctionExecutor is not initialized")
239
+ async def _create_health_checker(self, stub: FunctionExecutorStub) -> None:
240
+ with (
241
+ metric_create_health_checker_errors.count_exceptions(),
242
+ metric_create_health_checker_latency.time(),
243
+ ):
244
+ self._health_checker = HealthChecker(
245
+ stub=stub,
246
+ logger=self._logger,
247
+ )
143
248
 
249
+ async def _destroy_health_checker(self) -> None:
250
+ if self._health_checker is None:
251
+ return
144
252
 
145
- async def _channel_ready(channel: grpc.aio.Channel):
146
- await asyncio.wait_for(
147
- channel.channel_ready(),
148
- timeout=FUNCTION_EXECUTOR_SERVER_READY_TIMEOUT_SEC,
149
- )
253
+ try:
254
+ with (
255
+ metric_destroy_health_checker_errors.count_exceptions(),
256
+ metric_destroy_health_checker_latency.time(),
257
+ ):
258
+ self._health_checker.stop()
259
+ except Exception as e:
260
+ self._logger.error("failed to stop HealthChecker", exc_info=e)
261
+ finally:
262
+ self._health_checker = None
150
263
 
151
264
 
152
265
  async def _initialize_server(
153
266
  stub: FunctionExecutorStub, initialize_request: InitializeRequest
154
267
  ):
155
- initialize_response: InitializeResponse = await stub.initialize(initialize_request)
156
- if initialize_response.success:
157
- return
158
- if initialize_response.HasField("customer_error"):
159
- raise CustomerError(initialize_response.customer_error)
160
- else:
161
- raise Exception("initialize RPC failed at function executor server")
268
+ with (
269
+ metric_initialize_rpc_errors.count_exceptions(),
270
+ metric_initialize_rpc_latency.time(),
271
+ ):
272
+ initialize_response: InitializeResponse = await stub.initialize(
273
+ initialize_request
274
+ )
275
+ if initialize_response.success:
276
+ return
277
+ if initialize_response.HasField("customer_error"):
278
+ raise CustomerError(initialize_response.customer_error)
279
+ else:
280
+ raise Exception("initialize RPC failed at function executor server")
@@ -2,6 +2,9 @@ import asyncio
2
2
  from typing import Optional
3
3
 
4
4
  from .function_executor import FunctionExecutor
5
+ from .metrics.function_executor_state import (
6
+ metric_function_executor_state_not_locked_errors,
7
+ )
5
8
 
6
9
 
7
10
  class FunctionExecutorState:
@@ -18,6 +21,8 @@ class FunctionExecutorState:
18
21
  # All the fields below are protected by the lock.
19
22
  self.lock: asyncio.Lock = asyncio.Lock()
20
23
  self.is_shutdown: bool = False
24
+ # Set to True if a Function Executor health check ever failed.
25
+ self.health_check_failed: bool = False
21
26
  self.function_executor: Optional[FunctionExecutor] = None
22
27
  self.running_tasks: int = 0
23
28
  self.running_tasks_change_notifier: asyncio.Condition = asyncio.Condition(
@@ -75,4 +80,5 @@ class FunctionExecutorState:
75
80
  def check_locked(self) -> None:
76
81
  """Raises an exception if the lock is not held."""
77
82
  if not self.lock.locked():
83
+ metric_function_executor_state_not_locked_errors.inc()
78
84
  raise RuntimeError("The FunctionExecutorState lock must be held.")
@@ -0,0 +1,64 @@
1
+ import asyncio
2
+ from typing import AsyncGenerator, Dict
3
+
4
+ from ..api_objects import Task
5
+ from .function_executor_state import FunctionExecutorState
6
+ from .metrics.function_executor_state_container import (
7
+ metric_function_executor_states_count,
8
+ )
9
+
10
+
11
+ class FunctionExecutorStatesContainer:
12
+ """An asyncio concurrent container for the function executor states."""
13
+
14
+ def __init__(self):
15
+ # The fields below are protected by the lock.
16
+ self._lock: asyncio.Lock = asyncio.Lock()
17
+ self._states: Dict[str, FunctionExecutorState] = {}
18
+ self._is_shutdown: bool = False
19
+
20
+ async def get_or_create_state(self, task: Task) -> FunctionExecutorState:
21
+ """Get or create a function executor state for the given task.
22
+
23
+ Raises Exception if it's not possible to create a new state at this time."""
24
+ async with self._lock:
25
+ if self._is_shutdown:
26
+ raise RuntimeError("Task runner is shutting down.")
27
+
28
+ id = function_id_without_version(task)
29
+ if id not in self._states:
30
+ state = FunctionExecutorState(
31
+ function_id_with_version=function_id_with_version(task),
32
+ function_id_without_version=id,
33
+ )
34
+ self._states[id] = state
35
+ metric_function_executor_states_count.set(len(self._states))
36
+
37
+ return self._states[id]
38
+
39
+ async def __aiter__(self) -> AsyncGenerator[FunctionExecutorState, None]:
40
+ async with self._lock:
41
+ for state in self._states.values():
42
+ yield state
43
+
44
+ async def shutdown(self):
45
+ # Function Executors are outside the Executor process
46
+ # so they need to get cleaned up explicitly and reliably.
47
+ async with self._lock:
48
+ self._is_shutdown = True # No new Function Executor States can be created.
49
+ while self._states:
50
+ id, state = self._states.popitem()
51
+ metric_function_executor_states_count.set(len(self._states))
52
 + # Only ongoing tasks that have a reference to the state already can see it.
53
+ # The state is unlocked while a task is running inside Function Executor.
54
+ async with state.lock:
55
+ await state.shutdown()
56
+ # The task running inside the Function Executor will fail because it's destroyed.
57
+
58
+
59
+ def function_id_with_version(task: Task) -> str:
60
+ return f"versioned/{task.namespace}/{task.compute_graph}/{task.graph_version}/{task.compute_fn}"
61
+
62
+
63
+ def function_id_without_version(task: Task) -> str:
64
+ return f"not_versioned/{task.namespace}/{task.compute_graph}/{task.compute_fn}"
@@ -11,6 +11,10 @@ from tensorlake.function_executor.proto.function_executor_pb2_grpc import (
11
11
  FunctionExecutorStub,
12
12
  )
13
13
 
14
+ from .metrics.health_checker import (
15
+ metric_failed_health_checks,
16
+ metric_health_check_latency,
17
+ )
14
18
  from .server.client_configuration import HEALTH_CHECK_TIMEOUT_SEC
15
19
 
16
20
  HEALTH_CHECK_POLL_PERIOD_SEC = 10
@@ -29,16 +33,22 @@ class HealthChecker:
29
33
  """Runs the health check once and returns the result.
30
34
 
31
35
  Does not raise any exceptions."""
32
- try:
33
- response: HealthCheckResponse = await self._stub.check_health(
34
- HealthCheckRequest(), timeout=HEALTH_CHECK_TIMEOUT_SEC
35
- )
36
- return response.healthy
37
- except AioRpcError:
38
- return False
39
- except Exception as e:
40
- self._logger.warning("Got unexpected exception, ignoring", exc_info=e)
41
- return False
36
+ with metric_health_check_latency.time():
37
+ try:
38
+ response: HealthCheckResponse = await self._stub.check_health(
39
+ HealthCheckRequest(), timeout=HEALTH_CHECK_TIMEOUT_SEC
40
+ )
41
+ if not response.healthy:
42
+ metric_failed_health_checks.inc()
43
+ return response.healthy
44
+ except AioRpcError:
45
+ metric_failed_health_checks.inc()
46
+ # Expected exception when there are problems with communication because e.g. the server is unhealthy.
47
+ return False
48
+ except Exception as e:
49
+ metric_failed_health_checks.inc()
50
+ self._logger.warning("Got unexpected exception, ignoring", exc_info=e)
51
+ return False
42
52
 
43
53
  def start(self, callback: Callable[[], Awaitable[None]]) -> None:
44
54
  """Starts periodic health checks.
@@ -16,6 +16,15 @@ from tensorlake.function_executor.proto.function_executor_pb2_grpc import (
16
16
  from tensorlake.function_executor.proto.message_validator import MessageValidator
17
17
 
18
18
  from ..downloader import serialized_object_from_http_response
19
+ from .metrics.invocation_state_client import (
20
+ metric_request_read_errors,
21
+ metric_server_get_state_request_errors,
22
+ metric_server_get_state_request_latency,
23
+ metric_server_get_state_requests,
24
+ metric_server_set_state_request_errors,
25
+ metric_server_set_state_request_latency,
26
+ metric_server_set_state_requests,
27
+ )
19
28
 
20
29
 
21
30
  class InvocationStateClient:
@@ -92,6 +101,12 @@ class InvocationStateClient:
92
101
  except asyncio.CancelledError:
93
102
  # This async task was cancelled by destroy(). Normal situation too.
94
103
  pass
104
+ except Exception as e:
105
+ metric_request_read_errors.inc()
106
+ self._logger.error(
107
+ "failed to read request from server, shutting down invocation state client",
108
+ exc_info=e,
109
+ )
95
110
 
96
111
  async def _process_request_no_raise(self, request: InvocationStateRequest) -> None:
97
112
  try:
@@ -122,9 +137,14 @@ class InvocationStateClient:
122
137
  # a privilege escalation attempt.
123
138
  invocation_id: str = self._task_id_to_invocation_id[request.task_id]
124
139
  if request.HasField("get"):
125
- value: Optional[SerializedObject] = await self._get_server_state(
126
- invocation_id, request.get.key
127
- )
140
+ with (
141
+ metric_server_get_state_request_errors.count_exceptions(),
142
+ metric_server_get_state_request_latency.time(),
143
+ ):
144
+ metric_server_get_state_requests.inc()
145
+ value: Optional[SerializedObject] = await self._get_server_state(
146
+ invocation_id, request.get.key
147
+ )
128
148
  await self._client_response_queue.put(
129
149
  InvocationStateResponse(
130
150
  request_id=request.request_id,
@@ -136,9 +156,14 @@ class InvocationStateClient:
136
156
  )
137
157
  )
138
158
  elif request.HasField("set"):
139
- await self._set_server_state(
140
- invocation_id, request.set.key, request.set.value
141
- )
159
+ with (
160
+ metric_server_set_state_request_errors.count_exceptions(),
161
+ metric_server_set_state_request_latency.time(),
162
+ ):
163
+ metric_server_set_state_requests.inc()
164
+ await self._set_server_state(
165
+ invocation_id, request.set.key, request.set.value
166
+ )
142
167
  await self._client_response_queue.put(
143
168
  InvocationStateResponse(
144
169
  request_id=request.request_id,
@@ -0,0 +1,142 @@
1
+ import prometheus_client
2
+
3
+ from ...monitoring.metrics import (
4
+ latency_metric_for_customer_controlled_operation,
5
+ latency_metric_for_fast_operation,
6
+ latency_metric_for_slow_operation,
7
+ )
8
+
9
+ # This file contains all metrics used by FunctionExecutor.
10
+
11
+ metric_function_executors_count = prometheus_client.Gauge(
12
+ "function_executors_count", "Number of existing Function Executors"
13
+ )
14
+
15
+ # Metrics about whole FE creation workflow.
16
+ metric_creations: prometheus_client.Counter = prometheus_client.Counter(
17
+ "function_executor_creates",
18
+ "Number of Function Executor creations",
19
+ )
20
+ metric_create_latency: prometheus_client.Histogram = (
21
+ latency_metric_for_customer_controlled_operation(
22
+ "function_executor_create", "Function Executor creation (aka cold start)"
23
+ )
24
+ )
25
+ metric_create_errors: prometheus_client.Counter = prometheus_client.Counter(
26
+ "function_executor_create_errors", "Number of Function Executor creation errors"
27
+ )
28
+
29
+ # Metrics about whole FE destroy workflow.
30
+ metric_destroys: prometheus_client.Counter = prometheus_client.Counter(
31
+ "function_executor_destroys", "Number of Function Executor destructions"
32
+ )
33
+ metric_destroy_latency: prometheus_client.Histogram = latency_metric_for_slow_operation(
34
+ "function_executor_destroy", "Function Executor destruction"
35
+ )
36
+ metric_destroy_errors: prometheus_client.Counter = prometheus_client.Counter(
37
+ "function_executor_destroy_errors",
38
+ "Number of Function Executor destruction errors, results in a resource leak",
39
+ )
40
+
41
+ # FE server create and destruction metrics.
42
+ metric_create_server_latency: prometheus_client.Histogram = (
43
+ latency_metric_for_slow_operation(
44
+ "function_executor_create_server", "Function Executor server creation"
45
+ )
46
+ )
47
+ metric_create_server_errors: prometheus_client.Counter = prometheus_client.Counter(
48
+ "function_executor_create_server_errors",
49
+ "Number of Function Executor server creation errors",
50
+ )
51
+ metric_destroy_server_latency: prometheus_client.Histogram = (
52
+ latency_metric_for_slow_operation(
53
+ "function_executor_destroy_server", "Function Executor server destruction"
54
+ )
55
+ )
56
+ metric_destroy_server_errors: prometheus_client.Counter = prometheus_client.Counter(
57
+ "function_executor_destroy_server_errors",
58
+ "Number of Function Executor server destruction errors",
59
+ )
60
+
61
+ # FE channel creation and destruction metrics.
62
+ metric_establish_channel_latency: prometheus_client.Histogram = (
63
+ latency_metric_for_fast_operation(
64
+ "function_executor_establish_channel", "Function Executor channel establishment"
65
+ )
66
+ )
67
+ metric_establish_channel_errors: prometheus_client.Counter = prometheus_client.Counter(
68
+ "function_executor_establish_channel_errors",
69
+ "Number of Function Executor channel establishment errors",
70
+ )
71
+ metric_destroy_channel_latency: prometheus_client.Histogram = (
72
+ latency_metric_for_fast_operation(
73
+ "function_executor_destroy_channel", "Function Executor channel destruction"
74
+ )
75
+ )
76
+ metric_destroy_channel_errors: prometheus_client.Counter = prometheus_client.Counter(
77
+ "function_executor_destroy_channel_errors",
78
+ "Number of Function Executor channel destruction errors",
79
+ )
80
+
81
+ # FE initialization RPC metrics.
82
+ metric_initialize_rpc_latency: prometheus_client.Histogram = (
83
+ latency_metric_for_customer_controlled_operation(
84
+ "function_executor_initialize_rpc", "Function Executor initialize RPC"
85
+ )
86
+ )
87
+ metric_initialize_rpc_errors: prometheus_client.Counter = prometheus_client.Counter(
88
+ "function_executor_initialize_rpc_errors",
89
+ "Number of Function Executor initialize RPC errors",
90
+ )
91
+
92
+ # FE invocation state client creation and destruction metrics.
93
+ metric_create_invocation_state_client_latency: prometheus_client.Histogram = (
94
+ latency_metric_for_fast_operation(
95
+ "function_executor_create_invocation_state_client",
96
+ "Function Executor invocation state client creation",
97
+ )
98
+ )
99
+ metric_create_invocation_state_client_errors: prometheus_client.Counter = (
100
+ prometheus_client.Counter(
101
+ "function_executor_create_invocation_state_client_errors",
102
+ "Number of Function Executor invocation state client creation errors",
103
+ )
104
+ )
105
+ metric_destroy_invocation_state_client_latency: prometheus_client.Histogram = (
106
+ latency_metric_for_fast_operation(
107
+ "function_executor_destroy_invocation_state_client",
108
+ "Function Executor invocation state client destruction",
109
+ )
110
+ )
111
+ metric_destroy_invocation_state_client_errors: prometheus_client.Counter = (
112
+ prometheus_client.Counter(
113
+ "function_executor_destroy_invocation_state_client_errors",
114
+ "Number of Function Executor invocation state client destruction errors",
115
+ )
116
+ )
117
+
118
+ # FE health checker creation and destruction metrics.
119
+ metric_create_health_checker_latency: prometheus_client.Histogram = (
120
+ latency_metric_for_fast_operation(
121
+ "function_executor_create_health_checker",
122
+ "Function Executor health checker creation",
123
+ )
124
+ )
125
+ metric_create_health_checker_errors: prometheus_client.Counter = (
126
+ prometheus_client.Counter(
127
+ "function_executor_create_health_checker_errors",
128
+ "Number of Function Executor health checker creation errors",
129
+ )
130
+ )
131
+ metric_destroy_health_checker_latency: prometheus_client.Histogram = (
132
+ latency_metric_for_fast_operation(
133
+ "function_executor_destroy_health_checker",
134
+ "Function Executor health checker destruction",
135
+ )
136
+ )
137
+ metric_destroy_health_checker_errors: prometheus_client.Counter = (
138
+ prometheus_client.Counter(
139
+ "function_executor_destroy_health_checker_errors",
140
+ "Number of Function Executor health checker destruction errors",
141
+ )
142
+ )
@@ -0,0 +1,10 @@
1
+ import prometheus_client
2
+
3
+ # This file contains all metrics used by FunctionExecutorState.
4
+
5
+ metric_function_executor_state_not_locked_errors: prometheus_client.Counter = (
6
+ prometheus_client.Counter(
7
+ "function_executor_state_not_locked_errors",
8
+ "Number of times a Function Executor state was used without acquiring its lock",
9
+ )
10
+ )
@@ -0,0 +1,10 @@
1
+ import prometheus_client
2
+
3
+ # This file contains all metrics used by FunctionExecutorStatesContainer.
4
+
5
+ metric_function_executor_states_count: prometheus_client.Gauge = (
6
+ prometheus_client.Gauge(
7
+ "function_executor_states_count",
8
+ "Number of existing Function Executor states",
9
+ )
10
+ )
@@ -0,0 +1,14 @@
1
+ import prometheus_client
2
+
3
+ from ...monitoring.metrics import latency_metric_for_fast_operation
4
+
5
+ # This file contains all metrics used by HealthChecker.
6
+
7
+ metric_failed_health_checks = prometheus_client.Counter(
8
+ "function_executor_failed_health_checks",
9
+ "Number of health checks that were not successful",
10
+ )
11
+ metric_health_check_latency = latency_metric_for_fast_operation(
12
+ "function_executor_health_check",
13
+ "Function Executor health check",
14
+ )