indexify 0.3.31__py3-none-any.whl → 0.4.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (74)
  1. indexify/cli/__init__.py +18 -0
  2. indexify/cli/build_image.py +51 -0
  3. indexify/cli/deploy.py +57 -0
  4. indexify/cli/executor.py +205 -0
  5. indexify/executor/{grpc/channel_manager.py → channel_manager.py} +17 -11
  6. indexify/executor/executor.py +57 -313
  7. indexify/executor/function_allowlist.py +59 -0
  8. indexify/executor/function_executor/function_executor.py +12 -6
  9. indexify/executor/function_executor/invocation_state_client.py +25 -3
  10. indexify/executor/function_executor/server/function_executor_server_factory.py +3 -3
  11. indexify/executor/function_executor/server/subprocess_function_executor_server_factory.py +22 -11
  12. indexify/executor/function_executor_controller/__init__.py +13 -0
  13. indexify/executor/function_executor_controller/completed_task_metrics.py +82 -0
  14. indexify/executor/function_executor_controller/create_function_executor.py +154 -0
  15. indexify/executor/function_executor_controller/debug_event_loop.py +37 -0
  16. indexify/executor/function_executor_controller/destroy_function_executor.py +28 -0
  17. indexify/executor/function_executor_controller/downloads.py +199 -0
  18. indexify/executor/function_executor_controller/events.py +172 -0
  19. indexify/executor/function_executor_controller/function_executor_controller.py +759 -0
  20. indexify/executor/function_executor_controller/loggers.py +57 -0
  21. indexify/executor/function_executor_controller/message_validators.py +65 -0
  22. indexify/executor/function_executor_controller/metrics/completed_task_metrics.py +68 -0
  23. indexify/executor/{metrics/downloader.py → function_executor_controller/metrics/downloads.py} +1 -3
  24. indexify/executor/function_executor_controller/metrics/function_executor_controller.py +60 -0
  25. indexify/executor/{function_executor/metrics/single_task_runner.py → function_executor_controller/metrics/run_task.py} +9 -3
  26. indexify/executor/function_executor_controller/metrics/upload_task_output.py +39 -0
  27. indexify/executor/function_executor_controller/prepare_task.py +38 -0
  28. indexify/executor/function_executor_controller/run_task.py +201 -0
  29. indexify/executor/function_executor_controller/task_info.py +33 -0
  30. indexify/executor/function_executor_controller/task_output.py +122 -0
  31. indexify/executor/function_executor_controller/upload_task_output.py +234 -0
  32. indexify/executor/host_resources/host_resources.py +20 -25
  33. indexify/executor/{grpc/metrics → metrics}/channel_manager.py +1 -1
  34. indexify/executor/metrics/executor.py +0 -47
  35. indexify/executor/{grpc/metrics → metrics}/state_reconciler.py +1 -1
  36. indexify/executor/{grpc/metrics → metrics}/state_reporter.py +1 -1
  37. indexify/executor/monitoring/health_checker/generic_health_checker.py +6 -59
  38. indexify/executor/monitoring/health_checker/health_checker.py +0 -11
  39. indexify/executor/{grpc/state_reconciler.py → state_reconciler.py} +139 -141
  40. indexify/executor/state_reporter.py +364 -0
  41. indexify/proto/executor_api.proto +67 -59
  42. indexify/proto/executor_api_pb2.py +52 -52
  43. indexify/proto/executor_api_pb2.pyi +125 -104
  44. indexify/proto/executor_api_pb2_grpc.py +0 -47
  45. {indexify-0.3.31.dist-info → indexify-0.4.2.dist-info}/METADATA +1 -3
  46. indexify-0.4.2.dist-info/RECORD +68 -0
  47. indexify-0.4.2.dist-info/entry_points.txt +3 -0
  48. indexify/cli/cli.py +0 -268
  49. indexify/executor/api_objects.py +0 -92
  50. indexify/executor/downloader.py +0 -417
  51. indexify/executor/executor_flavor.py +0 -7
  52. indexify/executor/function_executor/function_executor_state.py +0 -107
  53. indexify/executor/function_executor/function_executor_states_container.py +0 -93
  54. indexify/executor/function_executor/function_executor_status.py +0 -95
  55. indexify/executor/function_executor/metrics/function_executor_state.py +0 -46
  56. indexify/executor/function_executor/metrics/function_executor_state_container.py +0 -10
  57. indexify/executor/function_executor/single_task_runner.py +0 -345
  58. indexify/executor/function_executor/task_input.py +0 -21
  59. indexify/executor/function_executor/task_output.py +0 -105
  60. indexify/executor/grpc/function_executor_controller.py +0 -418
  61. indexify/executor/grpc/metrics/task_controller.py +0 -8
  62. indexify/executor/grpc/state_reporter.py +0 -317
  63. indexify/executor/grpc/task_controller.py +0 -508
  64. indexify/executor/metrics/task_fetcher.py +0 -21
  65. indexify/executor/metrics/task_reporter.py +0 -53
  66. indexify/executor/metrics/task_runner.py +0 -52
  67. indexify/executor/monitoring/function_allowlist.py +0 -25
  68. indexify/executor/runtime_probes.py +0 -68
  69. indexify/executor/task_fetcher.py +0 -96
  70. indexify/executor/task_reporter.py +0 -459
  71. indexify/executor/task_runner.py +0 -177
  72. indexify-0.3.31.dist-info/RECORD +0 -68
  73. indexify-0.3.31.dist-info/entry_points.txt +0 -3
  74. {indexify-0.3.31.dist-info → indexify-0.4.2.dist-info}/WHEEL +0 -0
@@ -1,508 +0,0 @@
1
- import asyncio
2
- from typing import Any, Optional
3
-
4
- import grpc
5
- from tensorlake.function_executor.proto.function_executor_pb2 import (
6
- RunTaskRequest,
7
- RunTaskResponse,
8
- SerializedObject,
9
- )
10
- from tensorlake.function_executor.proto.function_executor_pb2_grpc import (
11
- FunctionExecutorStub,
12
- )
13
- from tensorlake.function_executor.proto.message_validator import MessageValidator
14
-
15
- from indexify.proto.executor_api_pb2 import Task
16
-
17
- from ..downloader import Downloader
18
- from ..function_executor.function_executor import FunctionExecutor
19
- from ..function_executor.function_executor_state import FunctionExecutorState
20
- from ..function_executor.function_executor_status import FunctionExecutorStatus
21
- from ..function_executor.metrics.single_task_runner import (
22
- metric_function_executor_run_task_rpc_errors,
23
- metric_function_executor_run_task_rpc_latency,
24
- metric_function_executor_run_task_rpcs,
25
- )
26
- from ..function_executor.task_output import TaskMetrics, TaskOutput
27
-
28
- # TODO: combine these metrics into a single python file once gRPC migration is over and old code is removed.
29
- from ..metrics.executor import (
30
- METRIC_TASKS_COMPLETED_OUTCOME_ALL,
31
- METRIC_TASKS_COMPLETED_OUTCOME_ERROR_CUSTOMER_CODE,
32
- METRIC_TASKS_COMPLETED_OUTCOME_ERROR_PLATFORM,
33
- METRIC_TASKS_COMPLETED_OUTCOME_SUCCESS,
34
- metric_task_completion_latency,
35
- metric_task_outcome_report_latency,
36
- metric_task_outcome_report_retries,
37
- metric_task_outcome_reports,
38
- metric_tasks_completed,
39
- metric_tasks_fetched,
40
- metric_tasks_reporting_outcome,
41
- )
42
- from ..metrics.task_runner import (
43
- metric_task_policy_latency,
44
- metric_task_policy_runs,
45
- metric_task_run_latency,
46
- metric_task_run_platform_errors,
47
- metric_task_runs,
48
- metric_tasks_blocked_by_policy,
49
- metric_tasks_blocked_by_policy_per_function_name,
50
- metric_tasks_running,
51
- )
52
- from ..task_reporter import TaskReporter
53
- from .metrics.task_controller import metric_task_cancellations
54
-
55
- _TASK_OUTCOME_REPORT_BACKOFF_SEC = 5.0
56
-
57
-
58
- def validate_task(task: Task) -> None:
59
- """Validates the supplied Task.
60
-
61
- Raises ValueError if the Task is not valid.
62
- """
63
- validator = MessageValidator(task)
64
- validator.required_field("id")
65
- validator.required_field("namespace")
66
- validator.required_field("graph_name")
67
- validator.required_field("graph_version")
68
- validator.required_field("function_name")
69
- validator.required_field("graph_invocation_id")
70
- if not (task.HasField("input_key") or task.HasField("input")):
71
- raise ValueError(
72
- "Task must have either input_key or input field set. " f"Got task: {task}"
73
- )
74
-
75
-
76
- def task_logger(task: Task, logger: Any) -> Any:
77
- """Returns a logger bound with the task's metadata.
78
-
79
- The function assumes that the task might be invalid."""
80
- return logger.bind(
81
- task_id=task.id if task.HasField("id") else None,
82
- namespace=task.namespace if task.HasField("namespace") else None,
83
- graph_name=task.graph_name if task.HasField("graph_name") else None,
84
- graph_version=task.graph_version if task.HasField("graph_version") else None,
85
- function_name=task.function_name if task.HasField("function_name") else None,
86
- graph_invocation_id=(
87
- task.graph_invocation_id if task.HasField("graph_invocation_id") else None
88
- ),
89
- )
90
-
91
-
92
- class TaskController:
93
- def __init__(
94
- self,
95
- task: Task,
96
- downloader: Downloader,
97
- task_reporter: TaskReporter,
98
- function_executor_id: str,
99
- function_executor_state: FunctionExecutorState,
100
- logger: Any,
101
- ):
102
- """Creates a new TaskController instance.
103
-
104
- The supplied Task must be already validated by the caller using validate_task().
105
- """
106
- self._task: Task = task
107
- self._downloader: Downloader = downloader
108
- self._task_reporter: TaskReporter = task_reporter
109
- self._function_executor_id: str = function_executor_id
110
- self._function_executor_state: FunctionExecutorState = function_executor_state
111
- self._logger: Any = task_logger(task, logger).bind(
112
- function_executor_id=function_executor_id,
113
- module=__name__,
114
- )
115
-
116
- self._input: Optional[SerializedObject] = None
117
- self._init_value: Optional[SerializedObject] = None
118
- self._is_timed_out: bool = False
119
- # Automatically start the controller on creation.
120
- self._task_runner: asyncio.Task = asyncio.create_task(
121
- self._run(), name="task controller task runner"
122
- )
123
-
124
- def function_executor_id(self) -> str:
125
- return self._function_executor_id
126
-
127
- def task(self) -> Task:
128
- return self._task
129
-
130
- async def destroy(self) -> None:
131
- """Destroys the controller and cancels the task if it didn't finish yet.
132
-
133
- A running task is cancelled by destroying its Function Executor.
134
- Doesn't raise any exceptions.
135
- """
136
- if self._task_runner.done():
137
- return # Nothing to do, the task is finished already.
138
-
139
- # The task runner code handles asyncio.CancelledError properly.
140
- self._task_runner.cancel()
141
- # Don't await the cancelled task to not block the caller unnecessarily.
142
-
143
- async def _run(self) -> None:
144
- metric_tasks_fetched.inc()
145
- with metric_task_completion_latency.time():
146
- await self._run_task()
147
-
148
- async def _run_task(self) -> None:
149
- """Runs the supplied task and does full management of its lifecycle.
150
-
151
- Doesn't raise any exceptions."""
152
- output: Optional[TaskOutput] = None
153
-
154
- try:
155
- await self._download_inputs()
156
- output = await self._run_task_when_function_executor_is_available()
157
- self._logger.info("task execution finished", success=output.success)
158
- _log_function_metrics(output, self._logger)
159
- except Exception as e:
160
- metric_task_run_platform_errors.inc(),
161
- output = self._internal_error_output()
162
- self._logger.error("task execution failed", exc_info=e)
163
- except asyncio.CancelledError:
164
- metric_task_cancellations.inc()
165
- self._logger.info("task execution cancelled")
166
- # Don't report task outcome according to the current policy.
167
- # asyncio.CancelledError can't be suppressed, see Python docs.
168
- raise
169
-
170
- # Current task outcome reporting policy:
171
- # Don't report task outcomes for tasks that didn't fail with internal or customer error.
172
- # This is required to simplify the protocol so Server doesn't need to care about task states
173
- # and cancel each tasks carefully to not get its outcome as failed.
174
- with (
175
- metric_tasks_reporting_outcome.track_inprogress(),
176
- metric_task_outcome_report_latency.time(),
177
- ):
178
- metric_task_outcome_reports.inc()
179
- await self._report_task_outcome(output)
180
-
181
- async def _download_inputs(self) -> None:
182
- """Downloads the task inputs and init value.
183
-
184
- Raises an Exception if the inputs failed to download.
185
- """
186
- self._input = await self._downloader.download_input(
187
- namespace=self._task.namespace,
188
- graph_name=self._task.graph_name,
189
- graph_invocation_id=self._task.graph_invocation_id,
190
- input_key=self._task.input_key,
191
- data_payload=self._task.input if self._task.HasField("input") else None,
192
- logger=self._logger,
193
- )
194
-
195
- if self._task.HasField("reducer_output_key") or self._task.HasField(
196
- "reducer_input"
197
- ):
198
- self._init_value = await self._downloader.download_init_value(
199
- namespace=self._task.namespace,
200
- graph_name=self._task.graph_name,
201
- function_name=self._task.function_name,
202
- graph_invocation_id=self._task.graph_invocation_id,
203
- reducer_output_key=(
204
- self._task.reducer_output_key
205
- if self._task.HasField("reducer_output_key")
206
- else None
207
- ),
208
- data_payload=(
209
- self._task.reducer_input
210
- if self._task.HasField("reducer_input")
211
- else None
212
- ),
213
- logger=self._logger,
214
- )
215
-
216
- async def _run_task_when_function_executor_is_available(self) -> TaskOutput:
217
- """Runs the task on the Function Executor when it's available.
218
-
219
- Raises an Exception if task failed due to an internal error."""
220
- await self._acquire_function_executor()
221
-
222
- next_status: FunctionExecutorStatus = FunctionExecutorStatus.IDLE
223
- try:
224
- return await self._run_task_on_acquired_function_executor()
225
- except asyncio.CancelledError:
226
- # This one is raised here when destroy() was called while we were running the task on this FE.
227
- next_status = FunctionExecutorStatus.UNHEALTHY
228
- # asyncio.CancelledError can't be suppressed, see Python docs.
229
- raise
230
- finally:
231
- # If the task finished running on FE then put it into IDLE state so other tasks can run on it.
232
- # Otherwise, mark the FE as unhealthy to force its destruction so the task stops running on it eventually
233
- # and no other tasks run on this FE because it'd result in undefined behavior.
234
- if self._is_timed_out:
235
- next_status = FunctionExecutorStatus.UNHEALTHY
236
- # TODO: When task controller is removed do FE health check here to stop scheduling tasks on unhealthy FE asap.
237
- await self._release_function_executor(next_status=next_status)
238
-
239
- async def _acquire_function_executor(self) -> None:
240
- """Waits until the Function Executor is in IDLE state and then locks it so the task can run on it.
241
-
242
- Doesn't raise any exceptions.
243
- """
244
- with (
245
- metric_tasks_blocked_by_policy.track_inprogress(),
246
- metric_tasks_blocked_by_policy_per_function_name.labels(
247
- function_name=self._task.function_name
248
- ).track_inprogress(),
249
- metric_task_policy_latency.time(),
250
- ):
251
- metric_task_policy_runs.inc()
252
- self._logger.info(
253
- "task is blocked by policy: waiting for idle function executor"
254
- )
255
- async with self._function_executor_state.lock:
256
- await self._function_executor_state.wait_status(
257
- allowlist=[FunctionExecutorStatus.IDLE]
258
- )
259
- await self._function_executor_state.set_status(
260
- FunctionExecutorStatus.RUNNING_TASK
261
- )
262
-
263
- # At this point the Function Executor belongs to this task controller due to RUNNING_TASK status.
264
- # We can now unlock the FE state. We have to update the FE status once the task succeeds or fails.
265
-
266
- async def _release_function_executor(
267
- self, next_status: FunctionExecutorStatus
268
- ) -> None:
269
- # Release the Function Executor so others can run tasks on it if FE status didn't change.
270
- # If FE status changed, then it means that we're off normal task execution path, e.g.
271
- # Server decided to do something with FE.
272
- async with self._function_executor_state.lock:
273
- if (
274
- self._function_executor_state.status
275
- == FunctionExecutorStatus.RUNNING_TASK
276
- ):
277
- await self._function_executor_state.set_status(next_status)
278
- if next_status == FunctionExecutorStatus.UNHEALTHY:
279
- # Destroy the unhealthy FE asap so it doesn't consume resources.
280
- # Don't do it under the state lock to not add unnecessary delays.
281
- asyncio.create_task(
282
- self._function_executor_state.function_executor.destroy()
283
- )
284
- self._function_executor_state.function_executor = None
285
- else:
286
- self._logger.warning(
287
- "skipping releasing Function Executor after running the task due to unexpected Function Executor status",
288
- status=self._function_executor_state.status.name,
289
- next_status=next_status.name,
290
- )
291
-
292
- async def _run_task_on_acquired_function_executor(self) -> TaskOutput:
293
- """Runs the task on the Function Executor acquired by this task already and returns the output.
294
-
295
- Raises an Exception if the task failed to run due to an internal error."""
296
- with metric_tasks_running.track_inprogress(), metric_task_run_latency.time():
297
- metric_task_runs.inc()
298
- return await self._run_task_rpc_on_function_executor()
299
-
300
- async def _run_task_rpc_on_function_executor(self) -> TaskOutput:
301
- """Runs the task on the Function Executor and returns the output.
302
-
303
- Raises an Exception if the task failed to run due to an internal error.
304
- """
305
- request: RunTaskRequest = RunTaskRequest(
306
- namespace=self._task.namespace,
307
- graph_name=self._task.graph_name,
308
- graph_version=self._task.graph_version,
309
- function_name=self._task.function_name,
310
- graph_invocation_id=self._task.graph_invocation_id,
311
- task_id=self._task.id,
312
- function_input=self._input,
313
- )
314
- # Don't keep the input in memory after we started running the task.
315
- self._input = None
316
-
317
- if self._init_value is not None:
318
- request.function_init_value.CopyFrom(self._init_value)
319
- # Don't keep the init value in memory after we started running the task.
320
- self._init_value = None
321
-
322
- channel: grpc.aio.Channel = (
323
- self._function_executor_state.function_executor.channel()
324
- )
325
-
326
- timeout_sec: Optional[float] = None
327
- if self._task.HasField("timeout_ms"):
328
- # TODO: Add integration tests with function timeout when end-to-end implementation is done.
329
- timeout_sec = self._task.timeout_ms / 1000.0
330
-
331
- async with _RunningTaskContextManager(
332
- task=self._task,
333
- function_executor=self._function_executor_state.function_executor,
334
- ):
335
- with (
336
- metric_function_executor_run_task_rpc_errors.count_exceptions(),
337
- metric_function_executor_run_task_rpc_latency.time(),
338
- ):
339
- metric_function_executor_run_task_rpcs.inc()
340
- # If this RPC failed due to customer code crashing the server we won't be
341
- # able to detect this. We'll treat this as our own error for now and thus
342
- # let the AioRpcError to be raised here.
343
- try:
344
- response: RunTaskResponse = await FunctionExecutorStub(
345
- channel
346
- ).run_task(request, timeout=timeout_sec)
347
- except grpc.aio.AioRpcError as e:
348
- if e.code() == grpc.StatusCode.DEADLINE_EXCEEDED:
349
- # Not logging customer error.
350
- self._is_timed_out = True
351
- return self._function_timeout_output(timeout_sec=timeout_sec)
352
- raise
353
-
354
- return _task_output_from_function_executor_response(
355
- task=self._task, response=response
356
- )
357
-
358
- async def _report_task_outcome(self, output: TaskOutput) -> None:
359
- """Reports the task with the given output to the server.
360
-
361
- Doesn't raise any Exceptions. Runs till the reporting is successful."""
362
- reporting_retries: int = 0
363
-
364
- while True:
365
- logger = self._logger.bind(retries=reporting_retries)
366
- try:
367
- await self._task_reporter.report(output=output, logger=logger)
368
- break
369
- except Exception as e:
370
- logger.error(
371
- "failed to report task",
372
- exc_info=e,
373
- )
374
- reporting_retries += 1
375
- metric_task_outcome_report_retries.inc()
376
- await asyncio.sleep(_TASK_OUTCOME_REPORT_BACKOFF_SEC)
377
-
378
- metric_tasks_completed.labels(outcome=METRIC_TASKS_COMPLETED_OUTCOME_ALL).inc()
379
- if output.is_internal_error:
380
- metric_tasks_completed.labels(
381
- outcome=METRIC_TASKS_COMPLETED_OUTCOME_ERROR_PLATFORM
382
- ).inc()
383
- elif output.success:
384
- metric_tasks_completed.labels(
385
- outcome=METRIC_TASKS_COMPLETED_OUTCOME_SUCCESS
386
- ).inc()
387
- else:
388
- metric_tasks_completed.labels(
389
- outcome=METRIC_TASKS_COMPLETED_OUTCOME_ERROR_CUSTOMER_CODE
390
- ).inc()
391
-
392
- def _internal_error_output(self) -> TaskOutput:
393
- return TaskOutput.internal_error(
394
- task_id=self._task.id,
395
- namespace=self._task.namespace,
396
- graph_name=self._task.graph_name,
397
- function_name=self._task.function_name,
398
- graph_version=self._task.graph_version,
399
- graph_invocation_id=self._task.graph_invocation_id,
400
- output_payload_uri_prefix=(
401
- self._task.output_payload_uri_prefix
402
- if self._task.HasField("output_payload_uri_prefix")
403
- else None
404
- ),
405
- )
406
-
407
- def _function_timeout_output(self, timeout_sec: float) -> TaskOutput:
408
- return TaskOutput.function_timeout(
409
- task_id=self._task.id,
410
- namespace=self._task.namespace,
411
- graph_name=self._task.graph_name,
412
- function_name=self._task.function_name,
413
- graph_version=self._task.graph_version,
414
- graph_invocation_id=self._task.graph_invocation_id,
415
- timeout_sec=timeout_sec,
416
- output_payload_uri_prefix=(
417
- self._task.output_payload_uri_prefix
418
- if self._task.HasField("output_payload_uri_prefix")
419
- else None
420
- ),
421
- )
422
-
423
-
424
- def _task_output_from_function_executor_response(
425
- task: Task, response: RunTaskResponse
426
- ) -> TaskOutput:
427
- response_validator = MessageValidator(response)
428
- response_validator.required_field("stdout")
429
- response_validator.required_field("stderr")
430
- response_validator.required_field("is_reducer")
431
- response_validator.required_field("success")
432
-
433
- metrics = TaskMetrics(counters={}, timers={})
434
- if response.HasField("metrics"):
435
- # Can be None if e.g. function failed.
436
- metrics.counters = dict(response.metrics.counters)
437
- metrics.timers = dict(response.metrics.timers)
438
-
439
- output = TaskOutput(
440
- task_id=task.id,
441
- namespace=task.namespace,
442
- graph_name=task.graph_name,
443
- function_name=task.function_name,
444
- graph_version=task.graph_version,
445
- graph_invocation_id=task.graph_invocation_id,
446
- stdout=response.stdout,
447
- stderr=response.stderr,
448
- reducer=response.is_reducer,
449
- success=response.success,
450
- metrics=metrics,
451
- output_payload_uri_prefix=(
452
- task.output_payload_uri_prefix
453
- if task.HasField("output_payload_uri_prefix")
454
- else None
455
- ),
456
- )
457
-
458
- if response.HasField("function_output"):
459
- output.function_output = response.function_output
460
- if response.HasField("router_output"):
461
- output.router_output = response.router_output
462
-
463
- return output
464
-
465
-
466
- # Temporary workaround is logging customer metrics until we store them somewhere
467
- # for future retrieval and processing.
468
- def _log_function_metrics(output: TaskOutput, logger: Any):
469
- if output.metrics is None:
470
- return
471
-
472
- logger = logger.bind(
473
- invocation_id=output.graph_invocation_id,
474
- function_name=output.function_name,
475
- graph_name=output.graph_name,
476
- namespace=output.namespace,
477
- )
478
-
479
- for counter_name, counter_value in output.metrics.counters.items():
480
- logger.info(
481
- "function_metric", counter_name=counter_name, counter_value=counter_value
482
- )
483
- for timer_name, timer_value in output.metrics.timers.items():
484
- logger.info("function_metric", timer_name=timer_name, timer_value=timer_value)
485
-
486
-
487
- class _RunningTaskContextManager:
488
- """Performs all the actions required before and after running a task."""
489
-
490
- def __init__(
491
- self,
492
- task: Task,
493
- function_executor: FunctionExecutor,
494
- ):
495
- self._task = task
496
- self._function_executor: FunctionExecutor = function_executor
497
-
498
- async def __aenter__(self):
499
- self._function_executor.invocation_state_client().add_task_to_invocation_id_entry(
500
- task_id=self._task.id,
501
- invocation_id=self._task.graph_invocation_id,
502
- )
503
- return self
504
-
505
- async def __aexit__(self, exc_type, exc_val, exc_tb):
506
- self._function_executor.invocation_state_client().remove_task_to_invocation_id_entry(
507
- task_id=self._task.id,
508
- )
@@ -1,21 +0,0 @@
1
- import prometheus_client
2
-
3
- from ..monitoring.metrics import latency_metric_for_fast_operation
4
-
5
- # This file contains all metrics used by TaskFetcher.
6
-
7
- metric_server_registrations: prometheus_client.Counter = prometheus_client.Counter(
8
- "server_registration_requests",
9
- "Number of Executor registrations requests sent to the Server",
10
- )
11
- metric_server_registration_errors: prometheus_client.Counter = (
12
- prometheus_client.Counter(
13
- "server_registration_request_errors",
14
- "Number of failed Executor registration requests",
15
- )
16
- )
17
- metric_server_registration_latency: prometheus_client.Histogram = (
18
- latency_metric_for_fast_operation(
19
- "server_registration_request", "Register Executor at the Server"
20
- )
21
- )
@@ -1,53 +0,0 @@
1
- import prometheus_client
2
-
3
- from ..monitoring.metrics import latency_metric_for_fast_operation
4
-
5
- # This file contains all metrics used by TaskReporter.
6
-
7
- metric_server_ingest_files_requests: prometheus_client.Counter = (
8
- prometheus_client.Counter(
9
- "server_ingest_files_requests", "Number of Server ingest files requests"
10
- )
11
- )
12
- metric_server_ingest_files_errors: prometheus_client.Counter = (
13
- prometheus_client.Counter(
14
- "server_ingest_files_request_errors",
15
- "Number of Server ingest files request errors",
16
- )
17
- )
18
- metric_server_ingest_files_latency: prometheus_client.Histogram = (
19
- latency_metric_for_fast_operation(
20
- "server_ingest_files_request", "Ingest files request to Server"
21
- )
22
- )
23
-
24
- metric_task_output_blob_store_uploads: prometheus_client.Counter = (
25
- prometheus_client.Counter(
26
- "task_output_blob_store_uploads", "Number of task output uploads to blob store"
27
- )
28
- )
29
- metric_task_output_blob_store_upload_errors: prometheus_client.Counter = (
30
- prometheus_client.Counter(
31
- "task_output_blob_store_upload_errors",
32
- "Number of failed task output uploads to blob store",
33
- )
34
- )
35
- metric_task_output_blob_store_upload_latency: prometheus_client.Histogram = (
36
- latency_metric_for_fast_operation(
37
- "task_output_blob_store_upload", "Upload task output to blob store"
38
- )
39
- )
40
-
41
- metric_report_task_outcome_rpcs = prometheus_client.Counter(
42
- "report_task_outcome_rpcs",
43
- "Number of report task outcome RPCs to Server",
44
- )
45
- metric_report_task_outcome_errors = prometheus_client.Counter(
46
- "report_task_outcome_rpc_errors",
47
- "Number of report task outcome RPC errors",
48
- )
49
- metric_report_task_outcome_latency: prometheus_client.Histogram = (
50
- latency_metric_for_fast_operation(
51
- "report_task_outcome_rpc", "Report task outcome RPC to Server"
52
- )
53
- )
@@ -1,52 +0,0 @@
1
- import prometheus_client
2
-
3
- from ..monitoring.metrics import latency_metric_for_customer_controlled_operation
4
-
5
- # This file contains all metrics used by TaskRunner.
6
-
7
- # Metrics for the stage when task is blocked by the current policy.
8
- metric_task_policy_runs: prometheus_client.Counter = prometheus_client.Counter(
9
- "task_policy_runs",
10
- "Number of task execution policy runs",
11
- )
12
- metric_task_policy_errors: prometheus_client.Counter = prometheus_client.Counter(
13
- "task_policy_errors",
14
- "Number of errors while running task execution policy",
15
- )
16
- metric_task_policy_latency: prometheus_client.Histogram = (
17
- latency_metric_for_customer_controlled_operation(
18
- "task_policy",
19
- "Task execution blocked by the policy",
20
- )
21
- )
22
- metric_tasks_blocked_by_policy: prometheus_client.Gauge = prometheus_client.Gauge(
23
- "tasks_blocked_by_policy",
24
- "Number of tasks that are ready for execution but are blocked according to the current policy (typically waiting for a free Function Executor)",
25
- )
26
- metric_tasks_blocked_by_policy_per_function_name: prometheus_client.Gauge = (
27
- prometheus_client.Gauge(
28
- "tasks_blocked_by_policy_per_function_name",
29
- "Number of tasks that are ready for execution but are blocked according to the current policy (typically waiting for a free Function Executor)",
30
- ["function_name"],
31
- )
32
- )
33
-
34
- # Metrics for the stage when task is running.
35
- metric_task_runs: prometheus_client.Counter = prometheus_client.Counter(
36
- "task_runs",
37
- "Number of task runs",
38
- )
39
- metric_task_run_platform_errors: prometheus_client.Counter = prometheus_client.Counter(
40
- "task_run_platform_errors",
41
- "Number of platform errors while running task",
42
- )
43
- metric_task_run_latency: prometheus_client.Histogram = (
44
- latency_metric_for_customer_controlled_operation(
45
- "task_run",
46
- "run task from the moment it is unblocked by the policy until it finishes",
47
- )
48
- )
49
- metric_tasks_running: prometheus_client.Gauge = prometheus_client.Gauge(
50
- "tasks_running",
51
- "Number of running tasks",
52
- )
@@ -1,25 +0,0 @@
1
- from typing import Dict, List, Optional
2
-
3
- from ..api_objects import FunctionURI
4
-
5
-
6
- def function_allowlist_to_info_dict(
7
- function_allowlist: Optional[List[FunctionURI]],
8
- ) -> Dict[str, str]:
9
- if function_allowlist is None:
10
- return {"function_allowlist": "None"}
11
-
12
- info = {}
13
- counter = 0
14
- for function_uri in function_allowlist:
15
- function_uri: FunctionURI
16
- info[f"function_allowlist_{counter}"] = ":".join(
17
- [
18
- function_uri.namespace,
19
- function_uri.compute_graph,
20
- function_uri.compute_fn,
21
- str(function_uri.version),
22
- ]
23
- )
24
- counter += 1
25
- return info