indexify 0.3.31 (py3-none-any.whl) → 0.4.3 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75)
  1. indexify/cli/__init__.py +18 -0
  2. indexify/cli/build_image.py +51 -0
  3. indexify/cli/deploy.py +57 -0
  4. indexify/cli/executor.py +205 -0
  5. indexify/executor/{grpc/channel_manager.py → channel_manager.py} +17 -11
  6. indexify/executor/executor.py +57 -313
  7. indexify/executor/function_allowlist.py +59 -0
  8. indexify/executor/function_executor/function_executor.py +12 -6
  9. indexify/executor/function_executor/invocation_state_client.py +25 -3
  10. indexify/executor/function_executor/server/function_executor_server_factory.py +3 -3
  11. indexify/executor/function_executor/server/subprocess_function_executor_server_factory.py +22 -11
  12. indexify/executor/function_executor_controller/__init__.py +13 -0
  13. indexify/executor/function_executor_controller/completed_task_metrics.py +82 -0
  14. indexify/executor/function_executor_controller/create_function_executor.py +158 -0
  15. indexify/executor/function_executor_controller/debug_event_loop.py +37 -0
  16. indexify/executor/function_executor_controller/destroy_function_executor.py +28 -0
  17. indexify/executor/function_executor_controller/downloads.py +199 -0
  18. indexify/executor/function_executor_controller/events.py +172 -0
  19. indexify/executor/function_executor_controller/function_executor_controller.py +759 -0
  20. indexify/executor/function_executor_controller/loggers.py +57 -0
  21. indexify/executor/function_executor_controller/message_validators.py +69 -0
  22. indexify/executor/function_executor_controller/metrics/completed_task_metrics.py +68 -0
  23. indexify/executor/{metrics/downloader.py → function_executor_controller/metrics/downloads.py} +1 -3
  24. indexify/executor/function_executor_controller/metrics/function_executor_controller.py +60 -0
  25. indexify/executor/{function_executor/metrics/single_task_runner.py → function_executor_controller/metrics/run_task.py} +9 -3
  26. indexify/executor/function_executor_controller/metrics/upload_task_output.py +39 -0
  27. indexify/executor/function_executor_controller/prepare_task.py +38 -0
  28. indexify/executor/function_executor_controller/run_task.py +201 -0
  29. indexify/executor/function_executor_controller/task_info.py +33 -0
  30. indexify/executor/function_executor_controller/task_output.py +122 -0
  31. indexify/executor/function_executor_controller/upload_task_output.py +234 -0
  32. indexify/executor/host_resources/host_resources.py +20 -25
  33. indexify/executor/host_resources/nvidia_gpu_allocator.py +8 -1
  34. indexify/executor/{grpc/metrics → metrics}/channel_manager.py +1 -1
  35. indexify/executor/metrics/executor.py +0 -47
  36. indexify/executor/{grpc/metrics → metrics}/state_reconciler.py +1 -1
  37. indexify/executor/{grpc/metrics → metrics}/state_reporter.py +1 -1
  38. indexify/executor/monitoring/health_checker/generic_health_checker.py +6 -59
  39. indexify/executor/monitoring/health_checker/health_checker.py +0 -11
  40. indexify/executor/{grpc/state_reconciler.py → state_reconciler.py} +139 -141
  41. indexify/executor/state_reporter.py +364 -0
  42. indexify/proto/executor_api.proto +68 -60
  43. indexify/proto/executor_api_pb2.py +52 -52
  44. indexify/proto/executor_api_pb2.pyi +129 -108
  45. indexify/proto/executor_api_pb2_grpc.py +0 -47
  46. {indexify-0.3.31.dist-info → indexify-0.4.3.dist-info}/METADATA +2 -5
  47. indexify-0.4.3.dist-info/RECORD +68 -0
  48. indexify-0.4.3.dist-info/entry_points.txt +3 -0
  49. indexify/cli/cli.py +0 -268
  50. indexify/executor/api_objects.py +0 -92
  51. indexify/executor/downloader.py +0 -417
  52. indexify/executor/executor_flavor.py +0 -7
  53. indexify/executor/function_executor/function_executor_state.py +0 -107
  54. indexify/executor/function_executor/function_executor_states_container.py +0 -93
  55. indexify/executor/function_executor/function_executor_status.py +0 -95
  56. indexify/executor/function_executor/metrics/function_executor_state.py +0 -46
  57. indexify/executor/function_executor/metrics/function_executor_state_container.py +0 -10
  58. indexify/executor/function_executor/single_task_runner.py +0 -345
  59. indexify/executor/function_executor/task_input.py +0 -21
  60. indexify/executor/function_executor/task_output.py +0 -105
  61. indexify/executor/grpc/function_executor_controller.py +0 -418
  62. indexify/executor/grpc/metrics/task_controller.py +0 -8
  63. indexify/executor/grpc/state_reporter.py +0 -317
  64. indexify/executor/grpc/task_controller.py +0 -508
  65. indexify/executor/metrics/task_fetcher.py +0 -21
  66. indexify/executor/metrics/task_reporter.py +0 -53
  67. indexify/executor/metrics/task_runner.py +0 -52
  68. indexify/executor/monitoring/function_allowlist.py +0 -25
  69. indexify/executor/runtime_probes.py +0 -68
  70. indexify/executor/task_fetcher.py +0 -96
  71. indexify/executor/task_reporter.py +0 -459
  72. indexify/executor/task_runner.py +0 -177
  73. indexify-0.3.31.dist-info/RECORD +0 -68
  74. indexify-0.3.31.dist-info/entry_points.txt +0 -3
  75. {indexify-0.3.31.dist-info → indexify-0.4.3.dist-info}/WHEEL +0 -0
indexify/executor/function_executor_controller/loggers.py (new file, +57)
@@ -0,0 +1,57 @@
+ from typing import Any
+
+ from indexify.proto.executor_api_pb2 import (
+     FunctionExecutorDescription,
+     Task,
+ )
+
+
+ def function_executor_logger(
+     function_executor_description: FunctionExecutorDescription, logger: Any
+ ) -> Any:
+     """Returns a logger bound with the FE's metadata.
+
+     The function assumes that the FE might be invalid."""
+     return logger.bind(
+         function_executor_id=(
+             function_executor_description.id
+             if function_executor_description.HasField("id")
+             else None
+         ),
+         namespace=(
+             function_executor_description.namespace
+             if function_executor_description.HasField("namespace")
+             else None
+         ),
+         graph_name=(
+             function_executor_description.graph_name
+             if function_executor_description.HasField("graph_name")
+             else None
+         ),
+         graph_version=(
+             function_executor_description.graph_version
+             if function_executor_description.HasField("graph_version")
+             else None
+         ),
+         function_name=(
+             function_executor_description.function_name
+             if function_executor_description.HasField("function_name")
+             else None
+         ),
+     )
+
+
+ def task_logger(task: Task, logger: Any) -> Any:
+     """Returns a logger bound with the task's metadata.
+
+     The function assumes that the task might be invalid."""
+     return logger.bind(
+         task_id=task.id if task.HasField("id") else None,
+         namespace=task.namespace if task.HasField("namespace") else None,
+         graph_name=task.graph_name if task.HasField("graph_name") else None,
+         graph_version=task.graph_version if task.HasField("graph_version") else None,
+         function_name=task.function_name if task.HasField("function_name") else None,
+         graph_invocation_id=(
+             task.graph_invocation_id if task.HasField("graph_invocation_id") else None
+         ),
+     )
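For reference, a minimal usage sketch for task_logger, assuming a structlog-style logger (the Any signature only requires an object exposing bind()); illustrative, not part of the diff:

import structlog

from indexify.executor.function_executor_controller.loggers import task_logger
from indexify.proto.executor_api_pb2 import Task

# A partially populated Task: missing fields are bound as None instead of
# raising, because the Task may not have been validated yet.
task = Task(id="task-1", namespace="default", graph_name="my-graph")
logger = task_logger(task, structlog.get_logger())
logger.info("task received")  # the log line carries task_id, namespace, etc.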
indexify/executor/function_executor_controller/message_validators.py (new file, +69)
@@ -0,0 +1,69 @@
+ from tensorlake.function_executor.proto.message_validator import MessageValidator
+
+ from indexify.proto.executor_api_pb2 import (
+     DataPayload,
+     FunctionExecutorDescription,
+     Task,
+ )
+
+
+ def validate_function_executor_description(
+     function_executor_description: FunctionExecutorDescription,
+ ) -> None:
+     """Validates the supplied FE description.
+
+     Raises ValueError if the description is not valid.
+     """
+     validator = MessageValidator(function_executor_description)
+     validator.required_field("id")
+     validator.required_field("namespace")
+     validator.required_field("graph_name")
+     validator.required_field("graph_version")
+     validator.required_field("function_name")
+     # image_uri is optional.
+     # secret_names can be empty.
+     validator.required_field("customer_code_timeout_ms")
+     validator.required_field("graph")
+     validator.required_field("resources")
+
+     _validate_data_payload(function_executor_description.graph)
+
+     validator = MessageValidator(function_executor_description.resources)
+     validator.required_field("cpu_ms_per_sec")
+     validator.required_field("memory_bytes")
+     validator.required_field("disk_bytes")
+
+     if function_executor_description.resources.HasField("gpu"):
+         validator = MessageValidator(function_executor_description.resources.gpu)
+         validator.required_field("count")
+         validator.required_field("model")
+
+
+ def validate_task(task: Task) -> None:
+     """Validates the supplied Task.
+
+     Raises ValueError if the Task is not valid.
+     """
+     validator = MessageValidator(task)
+     validator.required_field("id")
+     validator.required_field("namespace")
+     validator.required_field("graph_name")
+     validator.required_field("graph_version")
+     validator.required_field("function_name")
+     validator.required_field("graph_invocation_id")
+     validator.required_field("timeout_ms")
+     validator.required_field("input")
+     validator.required_field("output_payload_uri_prefix")
+     validator.required_field("retry_policy")
+
+     _validate_data_payload(task.input)
+     if task.HasField("reducer_input"):
+         _validate_data_payload(task.reducer_input)
+
+
+ def _validate_data_payload(data_payload: DataPayload) -> None:
+     """Validates the supplied DataPayload.
+
+     Raises ValueError if the DataPayload is not valid.
+     """
+     (MessageValidator(data_payload).required_field("uri").required_field("encoding"))
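These validators let downstream code read proto fields without re-checking presence. A sketch of the intended call pattern (illustrative, not from the package):

from indexify.executor.function_executor_controller.message_validators import (
    validate_task,
)
from indexify.proto.executor_api_pb2 import Task

task = Task(id="task-1")  # deliberately incomplete
try:
    validate_task(task)
except ValueError as e:
    # Per the docstrings, missing required fields surface as ValueError, so
    # the caller can reject the task up front instead of failing mid-execution.
    print(f"rejecting task: {e}")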
indexify/executor/function_executor_controller/metrics/completed_task_metrics.py (new file, +68)
@@ -0,0 +1,68 @@
+ import prometheus_client
+
+ from indexify.executor.monitoring.metrics import (
+     latency_metric_for_customer_controlled_operation,
+ )
+
+ metric_tasks_completed: prometheus_client.Counter = prometheus_client.Counter(
+     "tasks_completed",
+     "Number of tasks that were completed",
+     ["outcome_code", "failure_reason"],
+ )
+ METRIC_TASKS_COMPLETED_OUTCOME_CODE_ALL = "all"
+ METRIC_TASKS_COMPLETED_OUTCOME_CODE_SUCCESS = "success"
+ METRIC_TASKS_COMPLETED_OUTCOME_CODE_FAILURE = "failure"
+
+ METRIC_TASKS_COMPLETED_FAILURE_REASON_ALL = "all"
+ # Used when the task is successful.
+ METRIC_TASKS_COMPLETED_FAILURE_REASON_NONE = "none"
+ # Matches TASK_FAILURE_REASON_UNKNOWN
+ METRIC_TASKS_COMPLETED_FAILURE_REASON_UNKNOWN = "unknown"
+ # Includes all function errors including timeouts to reduce cardinality.
+ METRIC_TASKS_COMPLETED_FAILURE_REASON_FUNCTION_ERROR = "function_error"
+ # Includes all internal errors to reduce cardinality.
+ METRIC_TASKS_COMPLETED_FAILURE_REASON_INTERNAL_ERROR = "internal_error"
+ # Matches TASK_FAILURE_REASON_TASK_CANCELLED
+ METRIC_TASKS_COMPLETED_FAILURE_REASON_TASK_CANCELLED = "task_cancelled"
+ # Matches TASK_FAILURE_REASON_FUNCTION_EXECUTOR_TERMINATED
+ METRIC_TASKS_COMPLETED_FAILURE_REASON_FUNCTION_EXECUTOR_TERMINATED = (
+     "function_executor_terminated"
+ )
+
+ # Valid combinations of the labels:
+ metric_tasks_completed.labels(
+     outcome_code=METRIC_TASKS_COMPLETED_OUTCOME_CODE_ALL,
+     failure_reason=METRIC_TASKS_COMPLETED_FAILURE_REASON_ALL,
+ )
+ metric_tasks_completed.labels(
+     outcome_code=METRIC_TASKS_COMPLETED_OUTCOME_CODE_SUCCESS,
+     failure_reason=METRIC_TASKS_COMPLETED_FAILURE_REASON_NONE,
+ )
+
+ metric_tasks_completed.labels(
+     outcome_code=METRIC_TASKS_COMPLETED_OUTCOME_CODE_FAILURE,
+     failure_reason=METRIC_TASKS_COMPLETED_FAILURE_REASON_UNKNOWN,
+ )
+ metric_tasks_completed.labels(
+     outcome_code=METRIC_TASKS_COMPLETED_OUTCOME_CODE_FAILURE,
+     failure_reason=METRIC_TASKS_COMPLETED_FAILURE_REASON_FUNCTION_ERROR,
+ )
+ metric_tasks_completed.labels(
+     outcome_code=METRIC_TASKS_COMPLETED_OUTCOME_CODE_FAILURE,
+     failure_reason=METRIC_TASKS_COMPLETED_FAILURE_REASON_INTERNAL_ERROR,
+ )
+ metric_tasks_completed.labels(
+     outcome_code=METRIC_TASKS_COMPLETED_OUTCOME_CODE_FAILURE,
+     failure_reason=METRIC_TASKS_COMPLETED_FAILURE_REASON_TASK_CANCELLED,
+ )
+ metric_tasks_completed.labels(
+     outcome_code=METRIC_TASKS_COMPLETED_OUTCOME_CODE_FAILURE,
+     failure_reason=METRIC_TASKS_COMPLETED_FAILURE_REASON_FUNCTION_EXECUTOR_TERMINATED,
+ )
+
+ metric_task_completion_latency: prometheus_client.Histogram = (
+     latency_metric_for_customer_controlled_operation(
+         "task_completion",
+         "task completion from the moment it got fetched until its output got uploaded to blob store",
+     )
+ )
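Touching .labels(...) at import time pre-registers every valid label combination, so all series are exported with a 0 value before the first task completes. A sketch of how a caller would presumably record one failed task under this scheme (both the aggregate "all"/"all" series and the specific series are incremented, so dashboards can read the aggregate without summing combinations):

from indexify.executor.function_executor_controller.metrics.completed_task_metrics import (
    METRIC_TASKS_COMPLETED_FAILURE_REASON_ALL,
    METRIC_TASKS_COMPLETED_FAILURE_REASON_FUNCTION_ERROR,
    METRIC_TASKS_COMPLETED_OUTCOME_CODE_ALL,
    METRIC_TASKS_COMPLETED_OUTCOME_CODE_FAILURE,
    metric_tasks_completed,
)

# One task failed with a function error: bump the aggregate series...
metric_tasks_completed.labels(
    outcome_code=METRIC_TASKS_COMPLETED_OUTCOME_CODE_ALL,
    failure_reason=METRIC_TASKS_COMPLETED_FAILURE_REASON_ALL,
).inc()
# ...and the specific series.
metric_tasks_completed.labels(
    outcome_code=METRIC_TASKS_COMPLETED_OUTCOME_CODE_FAILURE,
    failure_reason=METRIC_TASKS_COMPLETED_FAILURE_REASON_FUNCTION_ERROR,
).inc()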
indexify/executor/{metrics/downloader.py → function_executor_controller/metrics/downloads.py} (renamed, +1 -3)
@@ -1,8 +1,6 @@
  import prometheus_client
 
- from ..monitoring.metrics import latency_metric_for_fast_operation
-
- # This file contains all metrics used by Downloader.
+ from indexify.executor.monitoring.metrics import latency_metric_for_fast_operation
 
  # Graph download metrics
  metric_graph_downloads: prometheus_client.Counter = prometheus_client.Counter(
indexify/executor/function_executor_controller/metrics/function_executor_controller.py (new file, +60)
@@ -0,0 +1,60 @@
+ import prometheus_client
+
+ from indexify.executor.monitoring.metrics import (
+     latency_metric_for_customer_controlled_operation,
+     latency_metric_for_fast_operation,
+ )
+
+ metric_control_loop_handle_event_latency: prometheus_client.Histogram = (
+     latency_metric_for_fast_operation(
+         "handle_function_executor_control_loop_event",
+         "Handle Function Executor control loop event",
+     )
+ )
+
+ metric_tasks_fetched: prometheus_client.Counter = prometheus_client.Counter(
+     "tasks_fetched", "Number of tasks that were fetched from Server"
+ )
+
+ metric_schedule_task_latency: prometheus_client.Histogram = (
+     latency_metric_for_customer_controlled_operation(
+         "schedule_task",
+         "Schedule a task for execution after it got ready for execution",
+     )
+ )
+ metric_runnable_tasks: prometheus_client.Gauge = prometheus_client.Gauge(
+     "runnable_tasks",
+     "Number of tasks that are ready for execution but are waiting to get scheduled to run on Function Executor (typically waiting for a free Function Executor)",
+ )
+ metric_runnable_tasks_per_function_name: prometheus_client.Gauge = (
+     prometheus_client.Gauge(
+         "runnable_tasks_per_function_name",
+         "Number of tasks that are ready for execution but are waiting to get scheduled to run on Function Executor (typically waiting for a free Function Executor)",
+         ["function_name"],
+     )
+ )
+
+ metric_function_executors_with_status: prometheus_client.Gauge = (
+     prometheus_client.Gauge(
+         "function_executors_with_status",
+         "Number of Function Executors with a particular status",
+         ["status"],
+     )
+ )
+ METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_UNKNOWN = "unknown"
+ METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_PENDING = "pending"
+ METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_RUNNING = "running"
+ METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_TERMINATED = "terminated"
+
+ metric_function_executors_with_status.labels(
+     status=METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_UNKNOWN
+ )
+ metric_function_executors_with_status.labels(
+     status=METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_PENDING
+ )
+ metric_function_executors_with_status.labels(
+     status=METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_RUNNING
+ )
+ metric_function_executors_with_status.labels(
+     status=METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_TERMINATED
+ )
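The status gauge follows the same pre-registration pattern. A sketch (an assumption about the controller's bookkeeping, not code from the diff) of keeping the per-status series consistent when a Function Executor transitions from pending to running:

from indexify.executor.function_executor_controller.metrics.function_executor_controller import (
    METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_PENDING,
    METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_RUNNING,
    metric_function_executors_with_status,
)

# The FE leaves "pending"...
metric_function_executors_with_status.labels(
    status=METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_PENDING
).dec()
# ...and enters "running", so the sum across statuses stays constant.
metric_function_executors_with_status.labels(
    status=METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_RUNNING
).inc()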
indexify/executor/{function_executor/metrics/single_task_runner.py → function_executor_controller/metrics/run_task.py} (renamed, +9 -3)
@@ -1,8 +1,8 @@
  import prometheus_client
 
- from ...monitoring.metrics import latency_metric_for_customer_controlled_operation
-
- # This file contains all metrics used by SingleTaskRunner.
+ from indexify.executor.monitoring.metrics import (
+     latency_metric_for_customer_controlled_operation,
+ )
 
  metric_function_executor_run_task_rpcs: prometheus_client.Counter = (
      prometheus_client.Counter(
@@ -20,3 +20,9 @@ metric_function_executor_run_task_rpc_latency: prometheus_client.Histogram = (
          "function_executor_run_task_rpc", "Function Executor run task RPC"
      )
  )
+ metric_function_executor_run_task_rpcs_in_progress: prometheus_client.Gauge = (
+     prometheus_client.Gauge(
+         "function_executor_run_task_rpcs_in_progress",
+         "Number of Function Executor run task RPCs in progress",
+     )
+ )
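The new gauge is incremented just before the run task RPC starts and decremented when it finishes (see run_task.py below). For comparison, prometheus_client ships an equivalent context-manager form; a standalone sketch with a hypothetical gauge:

import prometheus_client

# Hypothetical example gauge, not one of the metrics above.
example_rpcs_in_progress = prometheus_client.Gauge(
    "example_rpcs_in_progress", "Example in-progress gauge"
)

with example_rpcs_in_progress.track_inprogress():
    pass  # the gauge reads +1 while this block runs, then drops back to 0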
indexify/executor/function_executor_controller/metrics/upload_task_output.py (new file, +39)
@@ -0,0 +1,39 @@
+ import prometheus_client
+
+ from indexify.executor.monitoring.metrics import latency_metric_for_fast_operation
+
+ # Task output upload metrics.
+ metric_task_output_uploads: prometheus_client.Counter = prometheus_client.Counter(
+     "task_output_uploads",
+     "Number of task output uploads",
+ )
+ metric_tasks_uploading_outputs: prometheus_client.Gauge = prometheus_client.Gauge(
+     "tasks_uploading_output",
+     "Number of tasks currently uploading their outputs",
+ )
+ metric_task_output_upload_latency: prometheus_client.Histogram = (
+     latency_metric_for_fast_operation("task_output_upload", "task output upload")
+ )
+ metric_task_output_upload_retries: prometheus_client.Counter = (
+     prometheus_client.Counter(
+         "tasks_output_upload_retries", "Number of task output upload retries"
+     )
+ )
+
+ # Metrics for individual blob store operations.
+ metric_task_output_blob_store_uploads: prometheus_client.Counter = (
+     prometheus_client.Counter(
+         "task_output_blob_store_uploads", "Number of task output uploads to blob store"
+     )
+ )
+ metric_task_output_blob_store_upload_errors: prometheus_client.Counter = (
+     prometheus_client.Counter(
+         "task_output_blob_store_upload_errors",
+         "Number of failed task output uploads to blob store",
+     )
+ )
+ metric_task_output_blob_store_upload_latency: prometheus_client.Histogram = (
+     latency_metric_for_fast_operation(
+         "task_output_blob_store_upload", "Upload task output to blob store"
+     )
+ )
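A sketch of how upload code could plausibly drive these counters and histograms; upload_blob and the backoff policy are hypothetical stand-ins, since upload_task_output.py itself is not shown in this excerpt:

import time

from indexify.executor.function_executor_controller.metrics.upload_task_output import (
    metric_task_output_blob_store_upload_errors,
    metric_task_output_blob_store_upload_latency,
    metric_task_output_blob_store_uploads,
    metric_task_output_upload_retries,
)

def upload_with_retries(upload_blob, max_attempts: int = 3) -> None:
    # upload_blob is a hypothetical zero-argument callable that raises on failure.
    for attempt in range(max_attempts):
        metric_task_output_blob_store_uploads.inc()
        start = time.monotonic()
        try:
            upload_blob()
            metric_task_output_blob_store_upload_latency.observe(
                time.monotonic() - start
            )
            return
        except Exception:
            metric_task_output_blob_store_upload_errors.inc()
            if attempt + 1 == max_attempts:
                raise
            metric_task_output_upload_retries.inc()
            time.sleep(2**attempt)  # hypothetical exponential backoff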
indexify/executor/function_executor_controller/prepare_task.py (new file, +38)
@@ -0,0 +1,38 @@
+ from typing import Any
+
+ from indexify.executor.blob_store.blob_store import BLOBStore
+
+ from .downloads import download_init_value, download_input
+ from .events import TaskPreparationFinished
+ from .task_info import TaskInfo
+
+
+ async def prepare_task(
+     task_info: TaskInfo, blob_store: BLOBStore, logger: Any
+ ) -> TaskPreparationFinished:
+     """Prepares the task by downloading the input and init value if available.
+
+     Doesn't raise any exceptions.
+     """
+     logger = logger.bind(module=__name__)
+     try:
+         task_info.input = await download_input(
+             data_payload=task_info.task.input,
+             blob_store=blob_store,
+             logger=logger,
+         )
+
+         if task_info.task.HasField("reducer_input"):
+             task_info.init_value = await download_init_value(
+                 data_payload=task_info.task.reducer_input,
+                 blob_store=blob_store,
+                 logger=logger,
+             )
+
+         return TaskPreparationFinished(task_info=task_info, is_success=True)
+     except Exception as e:
+         logger.error(
+             "Failed to prepare task",
+             exc_info=e,
+         )
+         return TaskPreparationFinished(task_info=task_info, is_success=False)
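A sketch of driving prepare_task from the controller's event loop; the TaskInfo construction and the wiring of task, blob_store, and logger are illustrative assumptions:

import time

from indexify.executor.function_executor_controller.prepare_task import prepare_task
from indexify.executor.function_executor_controller.task_info import TaskInfo

async def example(task, blob_store, logger) -> None:
    task_info = TaskInfo(
        task=task, allocation_id="alloc-1", start_time=time.monotonic()
    )
    # prepare_task never raises; success or failure is reported via the event.
    event = await prepare_task(task_info, blob_store, logger)
    if event.is_success:
        ...  # schedule the task to run on a Function Executor
    else:
        ...  # report the task as failed without running it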
indexify/executor/function_executor_controller/run_task.py (new file, +201)
@@ -0,0 +1,201 @@
+ import asyncio
+ import time
+ from typing import Any, Optional
+
+ import grpc
+ from tensorlake.function_executor.proto.function_executor_pb2 import (
+     RunTaskRequest,
+     RunTaskResponse,
+ )
+ from tensorlake.function_executor.proto.function_executor_pb2_grpc import (
+     FunctionExecutorStub,
+ )
+ from tensorlake.function_executor.proto.message_validator import MessageValidator
+
+ from indexify.executor.function_executor.function_executor import FunctionExecutor
+ from indexify.executor.function_executor.health_checker import HealthCheckResult
+ from indexify.proto.executor_api_pb2 import (
+     FunctionExecutorTerminationReason,
+     Task,
+     TaskFailureReason,
+     TaskOutcomeCode,
+ )
+
+ from .events import TaskExecutionFinished
+ from .metrics.run_task import (
+     metric_function_executor_run_task_rpc_errors,
+     metric_function_executor_run_task_rpc_latency,
+     metric_function_executor_run_task_rpcs,
+     metric_function_executor_run_task_rpcs_in_progress,
+ )
+ from .task_info import TaskInfo
+ from .task_output import TaskMetrics, TaskOutput
+
+
+ async def run_task_on_function_executor(
+     task_info: TaskInfo, function_executor: FunctionExecutor, logger: Any
+ ) -> TaskExecutionFinished:
+     """Runs the task on the Function Executor and sets task_info.output with the result.
+
+     Doesn't raise any exceptions.
+     """
+     logger = logger.bind(module=__name__)
+     request: RunTaskRequest = RunTaskRequest(
+         namespace=task_info.task.namespace,
+         graph_name=task_info.task.graph_name,
+         graph_version=task_info.task.graph_version,
+         function_name=task_info.task.function_name,
+         graph_invocation_id=task_info.task.graph_invocation_id,
+         task_id=task_info.task.id,
+         function_input=task_info.input,
+     )
+     # Don't keep the input in memory after we started running the task.
+     task_info.input = None
+
+     if task_info.init_value is not None:
+         request.function_init_value.CopyFrom(task_info.init_value)
+         # Don't keep the init value in memory after we started running the task.
+         task_info.init_value = None
+
+     function_executor.invocation_state_client().add_task_to_invocation_id_entry(
+         task_id=task_info.task.id,
+         invocation_id=task_info.task.graph_invocation_id,
+     )
+
+     metric_function_executor_run_task_rpcs.inc()
+     metric_function_executor_run_task_rpcs_in_progress.inc()
+     start_time = time.monotonic()
+     # Not None if the Function Executor should be terminated after running the task.
+     function_executor_termination_reason: Optional[
+         FunctionExecutorTerminationReason
+     ] = None
+
+     # If this RPC failed due to customer code crashing the server we won't be
+     # able to detect this. We'll treat this as our own error for now and thus
+     # let the AioRpcError be raised here.
+     timeout_sec = task_info.task.timeout_ms / 1000.0
+     try:
+         channel: grpc.aio.Channel = function_executor.channel()
+         response: RunTaskResponse = await FunctionExecutorStub(channel).run_task(
+             request, timeout=timeout_sec
+         )
+         task_info.output = _task_output_from_function_executor_response(
+             task=task_info.task,
+             response=response,
+             allocation_id=task_info.allocation_id,
+         )
+     except grpc.aio.AioRpcError as e:
+         if e.code() == grpc.StatusCode.DEADLINE_EXCEEDED:
+             # The task is still running in FE, we only cancelled the client-side RPC.
+             function_executor_termination_reason = (
+                 FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_FUNCTION_TIMEOUT
+             )
+             task_info.output = TaskOutput.function_timeout(
+                 task=task_info.task,
+                 allocation_id=task_info.allocation_id,
+                 timeout_sec=timeout_sec,
+             )
+         else:
+             metric_function_executor_run_task_rpc_errors.inc()
+             logger.error("task execution failed", exc_info=e)
+             task_info.output = TaskOutput.internal_error(
+                 task=task_info.task, allocation_id=task_info.allocation_id
+             )
+     except asyncio.CancelledError:
+         # The task is still running in FE, we only cancelled the client-side RPC.
+         function_executor_termination_reason = (
+             FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_FUNCTION_CANCELLED
+         )
+         task_info.output = TaskOutput.task_cancelled(
+             task=task_info.task, allocation_id=task_info.allocation_id
+         )
+     except Exception as e:
+         metric_function_executor_run_task_rpc_errors.inc()
+         logger.error("task execution failed", exc_info=e)
+         task_info.output = TaskOutput.internal_error(
+             task=task_info.task, allocation_id=task_info.allocation_id
+         )
+
+     metric_function_executor_run_task_rpc_latency.observe(time.monotonic() - start_time)
+     metric_function_executor_run_task_rpcs_in_progress.dec()
+
+     function_executor.invocation_state_client().remove_task_to_invocation_id_entry(
+         task_id=task_info.task.id,
+     )
+
+     if (
+         task_info.output.outcome_code == TaskOutcomeCode.TASK_OUTCOME_CODE_FAILURE
+         and function_executor_termination_reason is None
+     ):
+         # Check if the task failed because the FE is unhealthy to prevent more tasks failing.
+         result: HealthCheckResult = await function_executor.health_checker().check()
+         if not result.is_healthy:
+             function_executor_termination_reason = (
+                 FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_UNHEALTHY
+             )
+             logger.error(
+                 "Function Executor health check failed after running task, shutting down Function Executor",
+                 health_check_fail_reason=result.reason,
+             )
+
+     _log_task_execution_finished(output=task_info.output, logger=logger)
+
+     return TaskExecutionFinished(
+         task_info=task_info,
+         function_executor_termination_reason=function_executor_termination_reason,
+     )
+
+
+ def _task_output_from_function_executor_response(
+     task: Task, response: RunTaskResponse, allocation_id: str
+ ) -> TaskOutput:
+     response_validator = MessageValidator(response)
+     response_validator.required_field("stdout")
+     response_validator.required_field("stderr")
+     response_validator.required_field("is_reducer")
+     response_validator.required_field("success")
+
+     metrics = TaskMetrics(counters={}, timers={})
+     if response.HasField("metrics"):
+         # Can be None if e.g. function failed.
+         metrics.counters = dict(response.metrics.counters)
+         metrics.timers = dict(response.metrics.timers)
+
+     output = TaskOutput(
+         task=task,
+         allocation_id=allocation_id,
+         outcome_code=(
+             TaskOutcomeCode.TASK_OUTCOME_CODE_SUCCESS
+             if response.success
+             else TaskOutcomeCode.TASK_OUTCOME_CODE_FAILURE
+         ),
+         failure_reason=(
+             None
+             if response.success
+             else TaskFailureReason.TASK_FAILURE_REASON_FUNCTION_ERROR
+         ),
+         stdout=response.stdout,
+         stderr=response.stderr,
+         reducer=response.is_reducer,
+         metrics=metrics,
+     )
+
+     if response.HasField("function_output"):
+         output.function_output = response.function_output
+     if response.HasField("router_output"):
+         output.router_output = response.router_output
+
+     return output
+
+
+ def _log_task_execution_finished(output: TaskOutput, logger: Any) -> None:
+     logger.info(
+         "finished running task",
+         success=output.outcome_code == TaskOutcomeCode.TASK_OUTCOME_CODE_SUCCESS,
+         outcome_code=TaskOutcomeCode.Name(output.outcome_code),
+         failure_reason=(
+             TaskFailureReason.Name(output.failure_reason)
+             if output.failure_reason is not None
+             else None
+         ),
+     )
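The timeout handling above relies on gRPC deadline semantics: a client-side deadline only cancels the RPC, while the function may keep running inside the Function Executor, which is why a timeout also yields a Function Executor termination reason. A standalone sketch of that mapping (illustrative, not from the package):

import grpc

async def call_with_deadline(stub, request, timeout_ms: int):
    # stub is assumed to be a FunctionExecutorStub-like async stub.
    try:
        return await stub.run_task(request, timeout=timeout_ms / 1000.0)
    except grpc.aio.AioRpcError as e:
        if e.code() == grpc.StatusCode.DEADLINE_EXCEEDED:
            return None  # treat as a function timeout; server side may still run
        raise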
indexify/executor/function_executor_controller/task_info.py (new file, +33)
@@ -0,0 +1,33 @@
+ import asyncio
+ from dataclasses import dataclass
+ from typing import Optional
+
+ from tensorlake.function_executor.proto.function_executor_pb2 import SerializedObject
+
+ from indexify.proto.executor_api_pb2 import Task
+
+ from .task_output import TaskOutput
+
+
+ @dataclass
+ class TaskInfo:
+     """Object used to track a task during its full lifecycle in the FunctionExecutorController."""
+
+     task: Task
+     allocation_id: str
+     # time.monotonic() timestamp
+     start_time: float
+     # time.monotonic() timestamp when the task was prepared for execution
+     prepared_time: float = 0.0
+     # True if the task was cancelled.
+     is_cancelled: bool = False
+     # aio task that is currently executing a lifecycle step of this task.
+     aio_task: Optional[asyncio.Task] = None
+     # Downloaded input if function was prepared successfully.
+     input: Optional[SerializedObject] = None
+     # Downloaded init value if function was prepared successfully and is a reducer.
+     init_value: Optional[SerializedObject] = None
+     # Output of the task.
+     output: Optional[TaskOutput] = None
+     # True if the task is fully completed and was added to state reporter.
+     is_completed: bool = False
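A sketch of TaskInfo at creation time; only the first three fields are required, and the rest are filled in as the task moves through prepare, run, and upload (the construction values here are illustrative):

import time

from indexify.executor.function_executor_controller.task_info import TaskInfo
from indexify.proto.executor_api_pb2 import Task

info = TaskInfo(
    task=Task(id="task-1"),
    allocation_id="alloc-1",
    start_time=time.monotonic(),
)
# Lifecycle fields start empty and are populated by the controller.
assert info.output is None and not info.is_completed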