indexify 0.3.31__py3-none-any.whl → 0.4.3__py3-none-any.whl
This diff shows the content changes between two publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.
- indexify/cli/__init__.py +18 -0
- indexify/cli/build_image.py +51 -0
- indexify/cli/deploy.py +57 -0
- indexify/cli/executor.py +205 -0
- indexify/executor/{grpc/channel_manager.py → channel_manager.py} +17 -11
- indexify/executor/executor.py +57 -313
- indexify/executor/function_allowlist.py +59 -0
- indexify/executor/function_executor/function_executor.py +12 -6
- indexify/executor/function_executor/invocation_state_client.py +25 -3
- indexify/executor/function_executor/server/function_executor_server_factory.py +3 -3
- indexify/executor/function_executor/server/subprocess_function_executor_server_factory.py +22 -11
- indexify/executor/function_executor_controller/__init__.py +13 -0
- indexify/executor/function_executor_controller/completed_task_metrics.py +82 -0
- indexify/executor/function_executor_controller/create_function_executor.py +158 -0
- indexify/executor/function_executor_controller/debug_event_loop.py +37 -0
- indexify/executor/function_executor_controller/destroy_function_executor.py +28 -0
- indexify/executor/function_executor_controller/downloads.py +199 -0
- indexify/executor/function_executor_controller/events.py +172 -0
- indexify/executor/function_executor_controller/function_executor_controller.py +759 -0
- indexify/executor/function_executor_controller/loggers.py +57 -0
- indexify/executor/function_executor_controller/message_validators.py +69 -0
- indexify/executor/function_executor_controller/metrics/completed_task_metrics.py +68 -0
- indexify/executor/{metrics/downloader.py → function_executor_controller/metrics/downloads.py} +1 -3
- indexify/executor/function_executor_controller/metrics/function_executor_controller.py +60 -0
- indexify/executor/{function_executor/metrics/single_task_runner.py → function_executor_controller/metrics/run_task.py} +9 -3
- indexify/executor/function_executor_controller/metrics/upload_task_output.py +39 -0
- indexify/executor/function_executor_controller/prepare_task.py +38 -0
- indexify/executor/function_executor_controller/run_task.py +201 -0
- indexify/executor/function_executor_controller/task_info.py +33 -0
- indexify/executor/function_executor_controller/task_output.py +122 -0
- indexify/executor/function_executor_controller/upload_task_output.py +234 -0
- indexify/executor/host_resources/host_resources.py +20 -25
- indexify/executor/host_resources/nvidia_gpu_allocator.py +8 -1
- indexify/executor/{grpc/metrics → metrics}/channel_manager.py +1 -1
- indexify/executor/metrics/executor.py +0 -47
- indexify/executor/{grpc/metrics → metrics}/state_reconciler.py +1 -1
- indexify/executor/{grpc/metrics → metrics}/state_reporter.py +1 -1
- indexify/executor/monitoring/health_checker/generic_health_checker.py +6 -59
- indexify/executor/monitoring/health_checker/health_checker.py +0 -11
- indexify/executor/{grpc/state_reconciler.py → state_reconciler.py} +139 -141
- indexify/executor/state_reporter.py +364 -0
- indexify/proto/executor_api.proto +68 -60
- indexify/proto/executor_api_pb2.py +52 -52
- indexify/proto/executor_api_pb2.pyi +129 -108
- indexify/proto/executor_api_pb2_grpc.py +0 -47
- {indexify-0.3.31.dist-info → indexify-0.4.3.dist-info}/METADATA +2 -5
- indexify-0.4.3.dist-info/RECORD +68 -0
- indexify-0.4.3.dist-info/entry_points.txt +3 -0
- indexify/cli/cli.py +0 -268
- indexify/executor/api_objects.py +0 -92
- indexify/executor/downloader.py +0 -417
- indexify/executor/executor_flavor.py +0 -7
- indexify/executor/function_executor/function_executor_state.py +0 -107
- indexify/executor/function_executor/function_executor_states_container.py +0 -93
- indexify/executor/function_executor/function_executor_status.py +0 -95
- indexify/executor/function_executor/metrics/function_executor_state.py +0 -46
- indexify/executor/function_executor/metrics/function_executor_state_container.py +0 -10
- indexify/executor/function_executor/single_task_runner.py +0 -345
- indexify/executor/function_executor/task_input.py +0 -21
- indexify/executor/function_executor/task_output.py +0 -105
- indexify/executor/grpc/function_executor_controller.py +0 -418
- indexify/executor/grpc/metrics/task_controller.py +0 -8
- indexify/executor/grpc/state_reporter.py +0 -317
- indexify/executor/grpc/task_controller.py +0 -508
- indexify/executor/metrics/task_fetcher.py +0 -21
- indexify/executor/metrics/task_reporter.py +0 -53
- indexify/executor/metrics/task_runner.py +0 -52
- indexify/executor/monitoring/function_allowlist.py +0 -25
- indexify/executor/runtime_probes.py +0 -68
- indexify/executor/task_fetcher.py +0 -96
- indexify/executor/task_reporter.py +0 -459
- indexify/executor/task_runner.py +0 -177
- indexify-0.3.31.dist-info/RECORD +0 -68
- indexify-0.3.31.dist-info/entry_points.txt +0 -3
- {indexify-0.3.31.dist-info → indexify-0.4.3.dist-info}/WHEEL +0 -0
--- a/indexify/executor/grpc/task_controller.py
+++ /dev/null
@@ -1,508 +0,0 @@
-import asyncio
-from typing import Any, Optional
-
-import grpc
-from tensorlake.function_executor.proto.function_executor_pb2 import (
-    RunTaskRequest,
-    RunTaskResponse,
-    SerializedObject,
-)
-from tensorlake.function_executor.proto.function_executor_pb2_grpc import (
-    FunctionExecutorStub,
-)
-from tensorlake.function_executor.proto.message_validator import MessageValidator
-
-from indexify.proto.executor_api_pb2 import Task
-
-from ..downloader import Downloader
-from ..function_executor.function_executor import FunctionExecutor
-from ..function_executor.function_executor_state import FunctionExecutorState
-from ..function_executor.function_executor_status import FunctionExecutorStatus
-from ..function_executor.metrics.single_task_runner import (
-    metric_function_executor_run_task_rpc_errors,
-    metric_function_executor_run_task_rpc_latency,
-    metric_function_executor_run_task_rpcs,
-)
-from ..function_executor.task_output import TaskMetrics, TaskOutput
-
-# TODO: combine these metrics into a single python file once gRPC migration is over and old code is removed.
-from ..metrics.executor import (
-    METRIC_TASKS_COMPLETED_OUTCOME_ALL,
-    METRIC_TASKS_COMPLETED_OUTCOME_ERROR_CUSTOMER_CODE,
-    METRIC_TASKS_COMPLETED_OUTCOME_ERROR_PLATFORM,
-    METRIC_TASKS_COMPLETED_OUTCOME_SUCCESS,
-    metric_task_completion_latency,
-    metric_task_outcome_report_latency,
-    metric_task_outcome_report_retries,
-    metric_task_outcome_reports,
-    metric_tasks_completed,
-    metric_tasks_fetched,
-    metric_tasks_reporting_outcome,
-)
-from ..metrics.task_runner import (
-    metric_task_policy_latency,
-    metric_task_policy_runs,
-    metric_task_run_latency,
-    metric_task_run_platform_errors,
-    metric_task_runs,
-    metric_tasks_blocked_by_policy,
-    metric_tasks_blocked_by_policy_per_function_name,
-    metric_tasks_running,
-)
-from ..task_reporter import TaskReporter
-from .metrics.task_controller import metric_task_cancellations
-
-_TASK_OUTCOME_REPORT_BACKOFF_SEC = 5.0
-
-
-def validate_task(task: Task) -> None:
-    """Validates the supplied Task.
-
-    Raises ValueError if the Task is not valid.
-    """
-    validator = MessageValidator(task)
-    validator.required_field("id")
-    validator.required_field("namespace")
-    validator.required_field("graph_name")
-    validator.required_field("graph_version")
-    validator.required_field("function_name")
-    validator.required_field("graph_invocation_id")
-    if not (task.HasField("input_key") or task.HasField("input")):
-        raise ValueError(
-            "Task must have either input_key or input field set. " f"Got task: {task}"
-        )
-
-
-def task_logger(task: Task, logger: Any) -> Any:
-    """Returns a logger bound with the task's metadata.
-
-    The function assumes that the task might be invalid."""
-    return logger.bind(
-        task_id=task.id if task.HasField("id") else None,
-        namespace=task.namespace if task.HasField("namespace") else None,
-        graph_name=task.graph_name if task.HasField("graph_name") else None,
-        graph_version=task.graph_version if task.HasField("graph_version") else None,
-        function_name=task.function_name if task.HasField("function_name") else None,
-        graph_invocation_id=(
-            task.graph_invocation_id if task.HasField("graph_invocation_id") else None
-        ),
-    )
-
-
-class TaskController:
-    def __init__(
-        self,
-        task: Task,
-        downloader: Downloader,
-        task_reporter: TaskReporter,
-        function_executor_id: str,
-        function_executor_state: FunctionExecutorState,
-        logger: Any,
-    ):
-        """Creates a new TaskController instance.
-
-        The supplied Task must be already validated by the caller using validate_task().
-        """
-        self._task: Task = task
-        self._downloader: Downloader = downloader
-        self._task_reporter: TaskReporter = task_reporter
-        self._function_executor_id: str = function_executor_id
-        self._function_executor_state: FunctionExecutorState = function_executor_state
-        self._logger: Any = task_logger(task, logger).bind(
-            function_executor_id=function_executor_id,
-            module=__name__,
-        )
-
-        self._input: Optional[SerializedObject] = None
-        self._init_value: Optional[SerializedObject] = None
-        self._is_timed_out: bool = False
-        # Automatically start the controller on creation.
-        self._task_runner: asyncio.Task = asyncio.create_task(
-            self._run(), name="task controller task runner"
-        )
-
-    def function_executor_id(self) -> str:
-        return self._function_executor_id
-
-    def task(self) -> Task:
-        return self._task
-
-    async def destroy(self) -> None:
-        """Destroys the controller and cancells the task if it didn't finish yet.
-
-        A running task is cancelled by destroying its Function Executor.
-        Doesn't raise any exceptions.
-        """
-        if self._task_runner.done():
-            return  # Nothin to do, the task is finished already.
-
-        # The task runner code handles asyncio.CancelledError properly.
-        self._task_runner.cancel()
-        # Don't await the cancelled task to not block the caller unnecessary.
-
-    async def _run(self) -> None:
-        metric_tasks_fetched.inc()
-        with metric_task_completion_latency.time():
-            await self._run_task()
-
-    async def _run_task(self) -> None:
-        """Runs the supplied task and does full managemenet of its lifecycle.
-
-        Doesn't raise any exceptions."""
-        output: Optional[TaskOutput] = None
-
-        try:
-            await self._download_inputs()
-            output = await self._run_task_when_function_executor_is_available()
-            self._logger.info("task execution finished", success=output.success)
-            _log_function_metrics(output, self._logger)
-        except Exception as e:
-            metric_task_run_platform_errors.inc(),
-            output = self._internal_error_output()
-            self._logger.error("task execution failed", exc_info=e)
-        except asyncio.CancelledError:
-            metric_task_cancellations.inc()
-            self._logger.info("task execution cancelled")
-            # Don't report task outcome according to the current policy.
-            # asyncio.CancelledError can't be suppressed, see Python docs.
-            raise
-
-        # Current task outcome reporting policy:
-        # Don't report task outcomes for tasks that didn't fail with internal or customer error.
-        # This is required to simplify the protocol so Server doesn't need to care about task states
-        # and cancel each tasks carefully to not get its outcome as failed.
-        with (
-            metric_tasks_reporting_outcome.track_inprogress(),
-            metric_task_outcome_report_latency.time(),
-        ):
-            metric_task_outcome_reports.inc()
-            await self._report_task_outcome(output)
-
-    async def _download_inputs(self) -> None:
-        """Downloads the task inputs and init value.
-
-        Raises an Exception if the inputs failed to download.
-        """
-        self._input = await self._downloader.download_input(
-            namespace=self._task.namespace,
-            graph_name=self._task.graph_name,
-            graph_invocation_id=self._task.graph_invocation_id,
-            input_key=self._task.input_key,
-            data_payload=self._task.input if self._task.HasField("input") else None,
-            logger=self._logger,
-        )
-
-        if self._task.HasField("reducer_output_key") or self._task.HasField(
-            "reducer_input"
-        ):
-            self._init_value = await self._downloader.download_init_value(
-                namespace=self._task.namespace,
-                graph_name=self._task.graph_name,
-                function_name=self._task.function_name,
-                graph_invocation_id=self._task.graph_invocation_id,
-                reducer_output_key=(
-                    self._task.reducer_output_key
-                    if self._task.HasField("reducer_output_key")
-                    else None
-                ),
-                data_payload=(
-                    self._task.reducer_input
-                    if self._task.HasField("reducer_input")
-                    else None
-                ),
-                logger=self._logger,
-            )
-
-    async def _run_task_when_function_executor_is_available(self) -> TaskOutput:
-        """Runs the task on the Function Executor when it's available.
-
-        Raises an Exception if task failed due to an internal error."""
-        await self._acquire_function_executor()
-
-        next_status: FunctionExecutorStatus = FunctionExecutorStatus.IDLE
-        try:
-            return await self._run_task_on_acquired_function_executor()
-        except asyncio.CancelledError:
-            # This one is raised here when destroy() was called while we were running the task on this FE.
-            next_status = FunctionExecutorStatus.UNHEALTHY
-            # asyncio.CancelledError can't be suppressed, see Python docs.
-            raise
-        finally:
-            # If the task finished running on FE then put it into IDLE state so other tasks can run on it.
-            # Otherwise, mark the FE as unhealthy to force its destruction so the task stops running on it eventually
-            # and no other tasks run on this FE because it'd result in undefined behavior.
-            if self._is_timed_out:
-                next_status = FunctionExecutorStatus.UNHEALTHY
-            # TODO: When task controller is removed do FE health check here to stop scheduling tasks on unhealthy FE asap.
-            await self._release_function_executor(next_status=next_status)
-
-    async def _acquire_function_executor(self) -> None:
-        """Waits until the Function Executor is in IDLE state and then locks it so the task can run on it.
-
-        Doesn't raise any exceptions.
-        """
-        with (
-            metric_tasks_blocked_by_policy.track_inprogress(),
-            metric_tasks_blocked_by_policy_per_function_name.labels(
-                function_name=self._task.function_name
-            ).track_inprogress(),
-            metric_task_policy_latency.time(),
-        ):
-            metric_task_policy_runs.inc()
-            self._logger.info(
-                "task is blocked by policy: waiting for idle function executor"
-            )
-            async with self._function_executor_state.lock:
-                await self._function_executor_state.wait_status(
-                    allowlist=[FunctionExecutorStatus.IDLE]
-                )
-                await self._function_executor_state.set_status(
-                    FunctionExecutorStatus.RUNNING_TASK
-                )
-
-        # At this point the Function Executor belongs to this task controller due to RUNNING_TASK status.
-        # We can now unlock the FE state. We have to update the FE status once the task succeeds or fails.
-
-    async def _release_function_executor(
-        self, next_status: FunctionExecutorStatus
-    ) -> None:
-        # Release the Function Executor so others can run tasks on it if FE status didn't change.
-        # If FE status changed, then it means that we're off normal task execution path, e.g.
-        # Server decided to do something with FE.
-        async with self._function_executor_state.lock:
-            if (
-                self._function_executor_state.status
-                == FunctionExecutorStatus.RUNNING_TASK
-            ):
-                await self._function_executor_state.set_status(next_status)
-                if next_status == FunctionExecutorStatus.UNHEALTHY:
-                    # Destroy the unhealthy FE asap so it doesn't consume resources.
-                    # Don't do it under the state lock to not add unnecessary delays.
-                    asyncio.create_task(
-                        self._function_executor_state.function_executor.destroy()
-                    )
-                    self._function_executor_state.function_executor = None
-            else:
-                self._logger.warning(
-                    "skipping releasing Function Executor after running the task due to unexpected Function Executor status",
-                    status=self._function_executor_state.status.name,
-                    next_status=next_status.name,
-                )
-
-    async def _run_task_on_acquired_function_executor(self) -> TaskOutput:
-        """Runs the task on the Function Executor acquired by this task already and returns the output.
-
-        Raises an Exception if the task failed to run due to an internal error."""
-        with metric_tasks_running.track_inprogress(), metric_task_run_latency.time():
-            metric_task_runs.inc()
-            return await self._run_task_rpc_on_function_executor()
-
-    async def _run_task_rpc_on_function_executor(self) -> TaskOutput:
-        """Runs the task on the Function Executor and returns the output.
-
-        Raises an Exception if the task failed to run due to an internal error.
-        """
-        request: RunTaskRequest = RunTaskRequest(
-            namespace=self._task.namespace,
-            graph_name=self._task.graph_name,
-            graph_version=self._task.graph_version,
-            function_name=self._task.function_name,
-            graph_invocation_id=self._task.graph_invocation_id,
-            task_id=self._task.id,
-            function_input=self._input,
-        )
-        # Don't keep the input in memory after we started running the task.
-        self._input = None
-
-        if self._init_value is not None:
-            request.function_init_value.CopyFrom(self._init_value)
-            # Don't keep the init value in memory after we started running the task.
-            self._init_value = None
-
-        channel: grpc.aio.Channel = (
-            self._function_executor_state.function_executor.channel()
-        )
-
-        timeout_sec: Optional[float] = None
-        if self._task.HasField("timeout_ms"):
-            # TODO: Add integration tests with function timeout when end-to-end implementation is done.
-            timeout_sec = self._task.timeout_ms / 1000.0
-
-        async with _RunningTaskContextManager(
-            task=self._task,
-            function_executor=self._function_executor_state.function_executor,
-        ):
-            with (
-                metric_function_executor_run_task_rpc_errors.count_exceptions(),
-                metric_function_executor_run_task_rpc_latency.time(),
-            ):
-                metric_function_executor_run_task_rpcs.inc()
-                # If this RPC failed due to customer code crashing the server we won't be
-                # able to detect this. We'll treat this as our own error for now and thus
-                # let the AioRpcError to be raised here.
-                try:
-                    response: RunTaskResponse = await FunctionExecutorStub(
-                        channel
-                    ).run_task(request, timeout=timeout_sec)
-                except grpc.aio.AioRpcError as e:
-                    if e.code() == grpc.StatusCode.DEADLINE_EXCEEDED:
-                        # Not logging customer error.
-                        self._is_timed_out = True
-                        return self._function_timeout_output(timeout_sec=timeout_sec)
-                    raise
-
-        return _task_output_from_function_executor_response(
-            task=self._task, response=response
-        )
-
-    async def _report_task_outcome(self, output: TaskOutput) -> None:
-        """Reports the task with the given output to the server.
-
-        Doesn't raise any Exceptions. Runs till the reporting is successful."""
-        reporting_retries: int = 0
-
-        while True:
-            logger = self._logger.bind(retries=reporting_retries)
-            try:
-                await self._task_reporter.report(output=output, logger=logger)
-                break
-            except Exception as e:
-                logger.error(
-                    "failed to report task",
-                    exc_info=e,
-                )
-                reporting_retries += 1
-                metric_task_outcome_report_retries.inc()
-                await asyncio.sleep(_TASK_OUTCOME_REPORT_BACKOFF_SEC)
-
-        metric_tasks_completed.labels(outcome=METRIC_TASKS_COMPLETED_OUTCOME_ALL).inc()
-        if output.is_internal_error:
-            metric_tasks_completed.labels(
-                outcome=METRIC_TASKS_COMPLETED_OUTCOME_ERROR_PLATFORM
-            ).inc()
-        elif output.success:
-            metric_tasks_completed.labels(
-                outcome=METRIC_TASKS_COMPLETED_OUTCOME_SUCCESS
-            ).inc()
-        else:
-            metric_tasks_completed.labels(
-                outcome=METRIC_TASKS_COMPLETED_OUTCOME_ERROR_CUSTOMER_CODE
-            ).inc()
-
-    def _internal_error_output(self) -> TaskOutput:
-        return TaskOutput.internal_error(
-            task_id=self._task.id,
-            namespace=self._task.namespace,
-            graph_name=self._task.graph_name,
-            function_name=self._task.function_name,
-            graph_version=self._task.graph_version,
-            graph_invocation_id=self._task.graph_invocation_id,
-            output_payload_uri_prefix=(
-                self._task.output_payload_uri_prefix
-                if self._task.HasField("output_payload_uri_prefix")
-                else None
-            ),
-        )
-
-    def _function_timeout_output(self, timeout_sec: float) -> TaskOutput:
-        return TaskOutput.function_timeout(
-            task_id=self._task.id,
-            namespace=self._task.namespace,
-            graph_name=self._task.graph_name,
-            function_name=self._task.function_name,
-            graph_version=self._task.graph_version,
-            graph_invocation_id=self._task.graph_invocation_id,
-            timeout_sec=timeout_sec,
-            output_payload_uri_prefix=(
-                self._task.output_payload_uri_prefix
-                if self._task.HasField("output_payload_uri_prefix")
-                else None
-            ),
-        )
-
-
-def _task_output_from_function_executor_response(
-    task: Task, response: RunTaskResponse
-) -> TaskOutput:
-    response_validator = MessageValidator(response)
-    response_validator.required_field("stdout")
-    response_validator.required_field("stderr")
-    response_validator.required_field("is_reducer")
-    response_validator.required_field("success")
-
-    metrics = TaskMetrics(counters={}, timers={})
-    if response.HasField("metrics"):
-        # Can be None if e.g. function failed.
-        metrics.counters = dict(response.metrics.counters)
-        metrics.timers = dict(response.metrics.timers)
-
-    output = TaskOutput(
-        task_id=task.id,
-        namespace=task.namespace,
-        graph_name=task.graph_name,
-        function_name=task.function_name,
-        graph_version=task.graph_version,
-        graph_invocation_id=task.graph_invocation_id,
-        stdout=response.stdout,
-        stderr=response.stderr,
-        reducer=response.is_reducer,
-        success=response.success,
-        metrics=metrics,
-        output_payload_uri_prefix=(
-            task.output_payload_uri_prefix
-            if task.HasField("output_payload_uri_prefix")
-            else None
-        ),
-    )
-
-    if response.HasField("function_output"):
-        output.function_output = response.function_output
-    if response.HasField("router_output"):
-        output.router_output = response.router_output
-
-    return output
-
-
-# Temporary workaround is logging customer metrics until we store them somewhere
-# for future retrieval and processing.
-def _log_function_metrics(output: TaskOutput, logger: Any):
-    if output.metrics is None:
-        return
-
-    logger = logger.bind(
-        invocation_id=output.graph_invocation_id,
-        function_name=output.function_name,
-        graph_name=output.graph_name,
-        namespace=output.namespace,
-    )
-
-    for counter_name, counter_value in output.metrics.counters.items():
-        logger.info(
-            "function_metric", counter_name=counter_name, counter_value=counter_value
-        )
-    for timer_name, timer_value in output.metrics.timers.items():
-        logger.info("function_metric", timer_name=timer_name, timer_value=timer_value)
-
-
-class _RunningTaskContextManager:
-    """Performs all the actions required before and after running a task."""
-
-    def __init__(
-        self,
-        task: Task,
-        function_executor: FunctionExecutor,
-    ):
-        self._task = task
-        self._function_executor: FunctionExecutor = function_executor
-
-    async def __aenter__(self):
-        self._function_executor.invocation_state_client().add_task_to_invocation_id_entry(
-            task_id=self._task.id,
-            invocation_id=self._task.graph_invocation_id,
-        )
-        return self
-
-    async def __aexit__(self, exc_type, exc_val, exc_tb):
-        self._function_executor.invocation_state_client().remove_task_to_invocation_id_entry(
-            task_id=self._task.id,
-        )
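For context, the deleted TaskController owned the full lifecycle of one task: it expected callers to run validate_task() first, scheduled its own asyncio task in __init__, and was cancelled via destroy(). A minimal sketch of how a caller drove it, assuming executor-owned downloader, task_reporter, and function_executor_state objects (the wiring names here are illustrative, taken only from the constructor signature above):

# Sketch only; the surrounding executor wiring is assumed, not shown in this diff.
validate_task(task)  # raises ValueError if required Task fields are missing
controller = TaskController(
    task=task,
    downloader=downloader,              # executor-owned Downloader (assumed)
    task_reporter=task_reporter,        # executor-owned TaskReporter (assumed)
    function_executor_id=function_executor_id,
    function_executor_state=fe_state,   # shared per-FE state with lock/status
    logger=logger,
)
# __init__ already called asyncio.create_task(self._run()), so the task starts
# immediately; destroy() cancels it, and a mid-run cancellation marks the
# Function Executor UNHEALTHY so the FE gets torn down.
await controller.destroy()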
--- a/indexify/executor/metrics/task_fetcher.py
+++ /dev/null
@@ -1,21 +0,0 @@
-import prometheus_client
-
-from ..monitoring.metrics import latency_metric_for_fast_operation
-
-# This file contains all metrics used by TaskFetcher.
-
-metric_server_registrations: prometheus_client.Counter = prometheus_client.Counter(
-    "server_registration_requests",
-    "Number of Executor registrations requests sent to the Server",
-)
-metric_server_registration_errors: prometheus_client.Counter = (
-    prometheus_client.Counter(
-        "server_registration_request_errors",
-        "Number of failed Executor registration requests",
-    )
-)
-metric_server_registration_latency: prometheus_client.Histogram = (
-    latency_metric_for_fast_operation(
-        "server_registration_request", "Register Executor at the Server"
-    )
-)
--- a/indexify/executor/metrics/task_reporter.py
+++ /dev/null
@@ -1,53 +0,0 @@
-import prometheus_client
-
-from ..monitoring.metrics import latency_metric_for_fast_operation
-
-# This file contains all metrics used by TaskReporter.
-
-metric_server_ingest_files_requests: prometheus_client.Counter = (
-    prometheus_client.Counter(
-        "server_ingest_files_requests", "Number of Server ingest files requests"
-    )
-)
-metric_server_ingest_files_errors: prometheus_client.Counter = (
-    prometheus_client.Counter(
-        "server_ingest_files_request_errors",
-        "Number of Server ingest files request errors",
-    )
-)
-metric_server_ingest_files_latency: prometheus_client.Histogram = (
-    latency_metric_for_fast_operation(
-        "server_ingest_files_request", "Ingest files request to Server"
-    )
-)
-
-metric_task_output_blob_store_uploads: prometheus_client.Counter = (
-    prometheus_client.Counter(
-        "task_output_blob_store_uploads", "Number of task output uploads to blob store"
-    )
-)
-metric_task_output_blob_store_upload_errors: prometheus_client.Counter = (
-    prometheus_client.Counter(
-        "task_output_blob_store_upload_errors",
-        "Number of failed task output uploads to blob store",
-    )
-)
-metric_task_output_blob_store_upload_latency: prometheus_client.Histogram = (
-    latency_metric_for_fast_operation(
-        "task_output_blob_store_upload", "Upload task output to blob store"
-    )
-)
-
-metric_report_task_outcome_rpcs = prometheus_client.Counter(
-    "report_task_outcome_rpcs",
-    "Number of report task outcome RPCs to Server",
-)
-metric_report_task_outcome_errors = prometheus_client.Counter(
-    "report_task_outcome_rpc_errors",
-    "Number of report task outcome RPC errors",
-)
-metric_report_task_outcome_latency: prometheus_client.Histogram = (
-    latency_metric_for_fast_operation(
-        "report_task_outcome_rpc", "Report task outcome RPC to Server"
-    )
-)
--- a/indexify/executor/metrics/task_runner.py
+++ /dev/null
@@ -1,52 +0,0 @@
-import prometheus_client
-
-from ..monitoring.metrics import latency_metric_for_customer_controlled_operation
-
-# This file contains all metrics used by TaskRunner.
-
-# Metrics for the stage when task is blocked by the current policy.
-metric_task_policy_runs: prometheus_client.Counter = prometheus_client.Counter(
-    "task_policy_runs",
-    "Number of task execution policy runs",
-)
-metric_task_policy_errors: prometheus_client.Counter = prometheus_client.Counter(
-    "task_policy_errors",
-    "Number of errors while running task execution policy",
-)
-metric_task_policy_latency: prometheus_client.Histogram = (
-    latency_metric_for_customer_controlled_operation(
-        "task_policy",
-        "Task execution blocked by the policy",
-    )
-)
-metric_tasks_blocked_by_policy: prometheus_client.Gauge = prometheus_client.Gauge(
-    "tasks_blocked_by_policy",
-    "Number of tasks that are ready for execution but are blocked according to the current policy (typically waiting for a free Function Executor)",
-)
-metric_tasks_blocked_by_policy_per_function_name: prometheus_client.Gauge = (
-    prometheus_client.Gauge(
-        "tasks_blocked_by_policy_per_function_name",
-        "Number of tasks that are ready for execution but are blocked according to the current policy (typically waiting for a free Function Executor)",
-        ["function_name"],
-    )
-)
-
-# Metrics for the stage when task is running.
-metric_task_runs: prometheus_client.Counter = prometheus_client.Counter(
-    "task_runs",
-    "Number of task runs",
-)
-metric_task_run_platform_errors: prometheus_client.Counter = prometheus_client.Counter(
-    "task_run_platform_errors",
-    "Number of platform errors while running task",
-)
-metric_task_run_latency: prometheus_client.Histogram = (
-    latency_metric_for_customer_controlled_operation(
-        "task_run",
-        "run task from the moment it is unblocked by the policy until it finishes",
-    )
-)
-metric_tasks_running: prometheus_client.Gauge = prometheus_client.Gauge(
-    "tasks_running",
-    "Number of running tasks",
-)
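These counters, histograms, and gauges were consumed by the TaskController shown in the first hunk above; the usage is plain prometheus_client. A condensed example of the pattern from _acquire_function_executor (the label value is illustrative):

# Usage pattern as it appeared in the deleted TaskController code.
with (
    metric_tasks_blocked_by_policy.track_inprogress(),
    metric_tasks_blocked_by_policy_per_function_name.labels(
        function_name="extract_text"  # illustrative label value
    ).track_inprogress(),
    metric_task_policy_latency.time(),  # observes elapsed time on exit
):
    metric_task_policy_runs.inc()
    # ... wait for an IDLE Function Executor ...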
--- a/indexify/executor/monitoring/function_allowlist.py
+++ /dev/null
@@ -1,25 +0,0 @@
-from typing import Dict, List, Optional
-
-from ..api_objects import FunctionURI
-
-
-def function_allowlist_to_info_dict(
-    function_allowlist: Optional[List[FunctionURI]],
-) -> Dict[str, str]:
-    if function_allowlist is None:
-        return {"function_allowlist": "None"}
-
-    info = {}
-    counter = 0
-    for function_uri in function_allowlist:
-        function_uri: FunctionURI
-        info[f"function_allowlist_{counter}"] = ":".join(
-            [
-                function_uri.namespace,
-                function_uri.compute_graph,
-                function_uri.compute_fn,
-                str(function_uri.version),
-            ]
-        )
-        counter += 1
-    return info