indexify 0.4.28__py3-none-any.whl → 0.4.30__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (31) hide show
  1. indexify/executor/executor.py +11 -7
  2. indexify/executor/function_executor_controller/__init__.py +2 -2
  3. indexify/executor/function_executor_controller/completed_task_allocation_metrics.py +87 -0
  4. indexify/executor/function_executor_controller/events.py +29 -33
  5. indexify/executor/function_executor_controller/{finalize_task.py → finalize_task_allocation.py} +45 -37
  6. indexify/executor/function_executor_controller/function_executor_controller.py +194 -180
  7. indexify/executor/function_executor_controller/loggers.py +15 -17
  8. indexify/executor/function_executor_controller/message_validators.py +4 -12
  9. indexify/executor/function_executor_controller/metrics/completed_task_allocation_metrics.py +70 -0
  10. indexify/executor/function_executor_controller/metrics/finalize_task_allocation.py +26 -0
  11. indexify/executor/function_executor_controller/metrics/function_executor_controller.py +12 -11
  12. indexify/executor/function_executor_controller/metrics/prepare_task_allocation.py +27 -0
  13. indexify/executor/function_executor_controller/{prepare_task.py → prepare_task_allocation.py} +33 -29
  14. indexify/executor/function_executor_controller/{run_task.py → run_task_allocation.py} +54 -51
  15. indexify/executor/function_executor_controller/{task_info.py → task_allocation_info.py} +6 -6
  16. indexify/executor/function_executor_controller/{task_input.py → task_allocation_input.py} +2 -2
  17. indexify/executor/function_executor_controller/{task_output.py → task_allocation_output.py} +24 -24
  18. indexify/executor/monitoring/desired_state_handler.py +24 -0
  19. indexify/executor/monitoring/reported_state_handler.py +22 -0
  20. indexify/executor/monitoring/server.py +4 -0
  21. indexify/executor/state_reconciler.py +26 -19
  22. indexify/executor/state_reporter.py +9 -4
  23. {indexify-0.4.28.dist-info → indexify-0.4.30.dist-info}/METADATA +2 -2
  24. {indexify-0.4.28.dist-info → indexify-0.4.30.dist-info}/RECORD +27 -25
  25. indexify/executor/function_executor_controller/completed_task_metrics.py +0 -83
  26. indexify/executor/function_executor_controller/metrics/completed_task_metrics.py +0 -68
  27. indexify/executor/function_executor_controller/metrics/finalize_task.py +0 -20
  28. indexify/executor/function_executor_controller/metrics/prepare_task.py +0 -18
  29. /indexify/executor/function_executor_controller/metrics/{run_task.py → run_task_allocation.py} +0 -0
  30. {indexify-0.4.28.dist-info → indexify-0.4.30.dist-info}/WHEEL +0 -0
  31. {indexify-0.4.28.dist-info → indexify-0.4.30.dist-info}/entry_points.txt +0 -0
@@ -49,7 +49,21 @@ def task_allocation_logger(task_allocation: TaskAllocation, logger: Any) -> Any:
49
49
  Doesn't assume that the supplied TaskAllocation is valid.
50
50
  """
51
51
  if task_allocation.HasField("task"):
52
- logger = _task_logger(task_allocation.task, logger)
52
+ task: Task = task_allocation.task
53
+ logger = logger.bind(
54
+ task_id=task.id if task.HasField("id") else None,
55
+ namespace=task.namespace if task.HasField("namespace") else None,
56
+ graph=task.graph_name if task.HasField("graph_name") else None,
57
+ graph_version=(
58
+ task.graph_version if task.HasField("graph_version") else None
59
+ ),
60
+ fn=task.function_name if task.HasField("function_name") else None,
61
+ invocation_id=(
62
+ task.graph_invocation_id
63
+ if task.HasField("graph_invocation_id")
64
+ else None
65
+ ),
66
+ )
53
67
  return logger.bind(
54
68
  allocation_id=(
55
69
  task_allocation.allocation_id
@@ -87,19 +101,3 @@ def task_result_logger(task_result: TaskResult, logger: Any) -> Any:
87
101
  else None
88
102
  ),
89
103
  )
90
-
91
-
92
- def _task_logger(task: Task, logger: Any) -> Any:
93
- """Returns a logger bound with the task's metadata.
94
-
95
- The function assumes that the task might be invalid."""
96
- return logger.bind(
97
- task_id=task.id if task.HasField("id") else None,
98
- namespace=task.namespace if task.HasField("namespace") else None,
99
- graph=task.graph_name if task.HasField("graph_name") else None,
100
- graph_version=task.graph_version if task.HasField("graph_version") else None,
101
- fn=task.function_name if task.HasField("function_name") else None,
102
- invocation_id=(
103
- task.graph_invocation_id if task.HasField("graph_invocation_id") else None
104
- ),
105
- )
@@ -3,7 +3,6 @@ from tensorlake.function_executor.proto.message_validator import MessageValidato
3
3
  from indexify.proto.executor_api_pb2 import (
4
4
  DataPayload,
5
5
  FunctionExecutorDescription,
6
- Task,
7
6
  TaskAllocation,
8
7
  )
9
8
 
@@ -50,15 +49,8 @@ def validate_task_allocation(task_allocation: TaskAllocation) -> None:
50
49
  validator.required_field("allocation_id")
51
50
  if not task_allocation.HasField("task"):
52
51
  raise ValueError("TaskAllocation must have a 'task' field.")
53
- _validate_task(task_allocation.task)
54
52
 
55
-
56
- def _validate_task(task: Task) -> None:
57
- """Validates the supplied Task.
58
-
59
- Raises ValueError if the Task is not valid.
60
- """
61
- validator = MessageValidator(task)
53
+ validator = MessageValidator(task_allocation.task)
62
54
  validator.required_field("id")
63
55
  validator.required_field("namespace")
64
56
  validator.required_field("graph_name")
@@ -70,9 +62,9 @@ def _validate_task(task: Task) -> None:
70
62
  validator.required_field("output_payload_uri_prefix")
71
63
  validator.required_field("retry_policy")
72
64
 
73
- _validate_data_payload(task.input)
74
- if task.HasField("reducer_input"):
75
- _validate_data_payload(task.reducer_input)
65
+ _validate_data_payload(task_allocation.task.input)
66
+ if task_allocation.task.HasField("reducer_input"):
67
+ _validate_data_payload(task_allocation.task.reducer_input)
76
68
 
77
69
 
78
70
  def _validate_data_payload(data_payload: DataPayload) -> None:
@@ -0,0 +1,70 @@
1
+ import prometheus_client
2
+
3
+ from indexify.executor.monitoring.metrics import (
4
+ latency_metric_for_customer_controlled_operation,
5
+ )
6
+
7
+ metric_task_allocations_completed: prometheus_client.Counter = (
8
+ prometheus_client.Counter(
9
+ "task_allocations_completed",
10
+ "Number of task allocations that were completed",
11
+ ["outcome_code", "failure_reason"],
12
+ )
13
+ )
14
+ METRIC_TASK_ALLOCATIONS_COMPLETED_OUTCOME_CODE_ALL = "all"
15
+ METRIC_TASK_ALLOCATIONS_COMPLETED_OUTCOME_CODE_SUCCESS = "success"
16
+ METRIC_TASK_ALLOCATIONS_COMPLETED_OUTCOME_CODE_FAILURE = "failure"
17
+
18
+ METRIC_TASK_ALLOCATIONS_COMPLETED_FAILURE_REASON_ALL = "all"
19
+ # Used when the task is successful.
20
+ METRIC_TASK_ALLOCATIONS_COMPLETED_FAILURE_REASON_NONE = "none"
21
+ # Matches TASK_FAILURE_REASON_UNKNOWN
22
+ METRIC_TASK_ALLOCATIONS_COMPLETED_FAILURE_REASON_UNKNOWN = "unknown"
23
+ # Includes all function errors including timeouts to reduce cardinality.
24
+ METRIC_TASK_ALLOCATIONS_COMPLETED_FAILURE_REASON_FUNCTION_ERROR = "function_error"
25
+ # Includes all internal errors to reduce cardinality.
26
+ METRIC_TASK_ALLOCATIONS_COMPLETED_FAILURE_REASON_INTERNAL_ERROR = "internal_error"
27
+ # Matches TASK_FAILURE_REASON_TASK_CANCELLED
28
+ METRIC_TASK_ALLOCATIONS_COMPLETED_FAILURE_REASON_TASK_CANCELLED = "task_cancelled"
29
+ # Matches TASK_FAILURE_REASON_FUNCTION_EXECUTOR_TERMINATED
30
+ METRIC_TASK_ALLOCATIONS_COMPLETED_FAILURE_REASON_FUNCTION_EXECUTOR_TERMINATED = (
31
+ "function_executor_terminated"
32
+ )
33
+
34
+ # Valid combinations of the labels:
35
+ metric_task_allocations_completed.labels(
36
+ outcome_code=METRIC_TASK_ALLOCATIONS_COMPLETED_OUTCOME_CODE_ALL,
37
+ failure_reason=METRIC_TASK_ALLOCATIONS_COMPLETED_FAILURE_REASON_ALL,
38
+ )
39
+ metric_task_allocations_completed.labels(
40
+ outcome_code=METRIC_TASK_ALLOCATIONS_COMPLETED_OUTCOME_CODE_SUCCESS,
41
+ failure_reason=METRIC_TASK_ALLOCATIONS_COMPLETED_FAILURE_REASON_NONE,
42
+ )
43
+
44
+ metric_task_allocations_completed.labels(
45
+ outcome_code=METRIC_TASK_ALLOCATIONS_COMPLETED_OUTCOME_CODE_FAILURE,
46
+ failure_reason=METRIC_TASK_ALLOCATIONS_COMPLETED_FAILURE_REASON_UNKNOWN,
47
+ )
48
+ metric_task_allocations_completed.labels(
49
+ outcome_code=METRIC_TASK_ALLOCATIONS_COMPLETED_OUTCOME_CODE_FAILURE,
50
+ failure_reason=METRIC_TASK_ALLOCATIONS_COMPLETED_FAILURE_REASON_FUNCTION_ERROR,
51
+ )
52
+ metric_task_allocations_completed.labels(
53
+ outcome_code=METRIC_TASK_ALLOCATIONS_COMPLETED_OUTCOME_CODE_FAILURE,
54
+ failure_reason=METRIC_TASK_ALLOCATIONS_COMPLETED_FAILURE_REASON_INTERNAL_ERROR,
55
+ )
56
+ metric_task_allocations_completed.labels(
57
+ outcome_code=METRIC_TASK_ALLOCATIONS_COMPLETED_OUTCOME_CODE_FAILURE,
58
+ failure_reason=METRIC_TASK_ALLOCATIONS_COMPLETED_FAILURE_REASON_TASK_CANCELLED,
59
+ )
60
+ metric_task_allocations_completed.labels(
61
+ outcome_code=METRIC_TASK_ALLOCATIONS_COMPLETED_OUTCOME_CODE_FAILURE,
62
+ failure_reason=METRIC_TASK_ALLOCATIONS_COMPLETED_FAILURE_REASON_FUNCTION_EXECUTOR_TERMINATED,
63
+ )
64
+
65
+ metric_task_allocation_completion_latency: prometheus_client.Histogram = (
66
+ latency_metric_for_customer_controlled_operation(
67
+ "task_allocation_completion",
68
+ "task allocation completion from the moment it got fetched until its output got uploaded to blob store",
69
+ )
70
+ )
@@ -0,0 +1,26 @@
1
+ import prometheus_client
2
+
3
+ from indexify.executor.monitoring.metrics import latency_metric_for_fast_operation
4
+
5
+ # Task allocation finalization metrics.
6
+ metric_task_allocation_finalizations: prometheus_client.Counter = (
7
+ prometheus_client.Counter(
8
+ "task_allocation_finalizations",
9
+ "Number of task allocation finalizations",
10
+ )
11
+ )
12
+ metric_task_allocation_finalization_errors: prometheus_client.Counter = (
13
+ prometheus_client.Counter(
14
+ "task_allocation_finalization_errors",
15
+ "Number of task allocation finalization errors",
16
+ )
17
+ )
18
+ metric_task_allocations_finalizing: prometheus_client.Gauge = prometheus_client.Gauge(
19
+ "task_allocations_finalizing",
20
+ "Number of task allocations currently finalizing",
21
+ )
22
+ metric_task_allocation_finalization_latency: prometheus_client.Histogram = (
23
+ latency_metric_for_fast_operation(
24
+ "task_allocation_finalization", "task allocation finalization"
25
+ )
26
+ )
@@ -12,24 +12,25 @@ metric_control_loop_handle_event_latency: prometheus_client.Histogram = (
12
12
  )
13
13
  )
14
14
 
15
- metric_tasks_fetched: prometheus_client.Counter = prometheus_client.Counter(
16
- "tasks_fetched", "Number of tasks that were fetched from Server"
15
+ metric_task_allocations_fetched: prometheus_client.Counter = prometheus_client.Counter(
16
+ "task_allocations_fetched",
17
+ "Number of task allocations that were fetched from Server",
17
18
  )
18
19
 
19
- metric_schedule_task_latency: prometheus_client.Histogram = (
20
+ metric_schedule_task_allocation_latency: prometheus_client.Histogram = (
20
21
  latency_metric_for_customer_controlled_operation(
21
- "schedule_task",
22
- "Schedule a task for execution after it got ready for execution",
22
+ "schedule_task_allocation",
23
+ "Schedule a task allocation for execution after it got ready for execution",
23
24
  )
24
25
  )
25
- metric_runnable_tasks: prometheus_client.Gauge = prometheus_client.Gauge(
26
- "runnable_tasks",
27
- "Number of tasks that are ready for execution but are waiting to get scheduled to run on Function Executor (typically waiting for a free Function Executor)",
26
+ metric_runnable_task_allocations: prometheus_client.Gauge = prometheus_client.Gauge(
27
+ "runnable_task_allocations",
28
+ "Number of task allocations that are ready for execution but are waiting to get scheduled to run on Function Executor (typically waiting for a free Function Executor)",
28
29
  )
29
- metric_runnable_tasks_per_function_name: prometheus_client.Gauge = (
30
+ metric_runnable_task_allocations_per_function_name: prometheus_client.Gauge = (
30
31
  prometheus_client.Gauge(
31
- "runnable_tasks_per_function_name",
32
- "Number of tasks that are ready for execution but are waiting to get scheduled to run on Function Executor (typically waiting for a free Function Executor)",
32
+ "runnable_task_allocations_per_function_name",
33
+ "Number of task allocations that are ready for execution but are waiting to get scheduled to run on Function Executor (typically waiting for a free Function Executor)",
33
34
  ["function_name"],
34
35
  )
35
36
  )
@@ -0,0 +1,27 @@
1
+ import prometheus_client
2
+
3
+ from indexify.executor.monitoring.metrics import latency_metric_for_fast_operation
4
+
5
+ metric_task_allocation_preparations: prometheus_client.Counter = (
6
+ prometheus_client.Counter(
7
+ "task_allocation_preparations",
8
+ "Number of task allocation preparations for execution",
9
+ )
10
+ )
11
+ metric_task_allocation_preparation_errors: prometheus_client.Counter = (
12
+ prometheus_client.Counter(
13
+ "task_allocation_preparation_errors",
14
+ "Number of task allocation preparation errors",
15
+ )
16
+ )
17
+ metric_task_allocation_preparation_latency: prometheus_client.Histogram = (
18
+ latency_metric_for_fast_operation(
19
+ "task_allocation_preparation", "task allocation preparation for execution"
20
+ )
21
+ )
22
+ metric_task_allocations_getting_prepared: prometheus_client.Gauge = (
23
+ prometheus_client.Gauge(
24
+ "task_allocations_getting_prepared",
25
+ "Number of task allocations currently getting prepared for execution",
26
+ )
27
+ )
@@ -13,15 +13,15 @@ from indexify.executor.blob_store.blob_store import BLOBStore
13
13
  from indexify.proto.executor_api_pb2 import DataPayload, Task
14
14
 
15
15
  from .downloads import serialized_object_manifest_from_data_payload_proto
16
- from .events import TaskPreparationFinished
17
- from .metrics.prepare_task import (
18
- metric_task_preparation_errors,
19
- metric_task_preparation_latency,
20
- metric_task_preparations,
21
- metric_tasks_getting_prepared,
16
+ from .events import TaskAllocationPreparationFinished
17
+ from .metrics.prepare_task_allocation import (
18
+ metric_task_allocation_preparation_errors,
19
+ metric_task_allocation_preparation_latency,
20
+ metric_task_allocation_preparations,
21
+ metric_task_allocations_getting_prepared,
22
22
  )
23
- from .task_info import TaskInfo
24
- from .task_input import TaskInput
23
+ from .task_allocation_info import TaskAllocationInfo
24
+ from .task_allocation_input import TaskAllocationInput
25
25
 
26
26
  # The following constants are subject to S3 limits,
27
27
  # see https://docs.aws.amazon.com/AmazonS3/latest/userguide/qfacts.html.
@@ -42,9 +42,9 @@ _OUTPUT_BLOB_SLOWER_CHUNKS_COUNT: int = 100
42
42
  _INVOCATION_ERROR_MAX_SIZE_BYTES: int = 10 * 1024 * 1024 # 10 MB
43
43
 
44
44
 
45
- async def prepare_task(
46
- task_info: TaskInfo, blob_store: BLOBStore, logger: Any
47
- ) -> TaskPreparationFinished:
45
+ async def prepare_task_allocation(
46
+ alloc_info: TaskAllocationInfo, blob_store: BLOBStore, logger: Any
47
+ ) -> TaskAllocationPreparationFinished:
48
48
  """Prepares the task for execution.
49
49
 
50
50
  If successful then the task is runnable.
@@ -54,43 +54,47 @@ async def prepare_task(
54
54
  start_time = time.monotonic()
55
55
  try:
56
56
  with (
57
- metric_task_preparation_errors.count_exceptions(),
58
- metric_tasks_getting_prepared.track_inprogress(),
59
- metric_task_preparation_latency.time(),
57
+ metric_task_allocation_preparation_errors.count_exceptions(),
58
+ metric_task_allocations_getting_prepared.track_inprogress(),
59
+ metric_task_allocation_preparation_latency.time(),
60
60
  ):
61
- metric_task_preparations.inc()
62
- task_info.input = await _prepare_task_input(
63
- task_info=task_info,
61
+ metric_task_allocation_preparations.inc()
62
+ alloc_info.input = await _prepare_task_alloc_input(
63
+ alloc_info=alloc_info,
64
64
  blob_store=blob_store,
65
65
  logger=logger,
66
66
  )
67
67
  logger.info(
68
- "task was prepared for execution",
68
+ "task allocation was prepared for execution",
69
69
  duration=time.monotonic() - start_time,
70
70
  )
71
- return TaskPreparationFinished(
72
- task_info=task_info,
71
+ return TaskAllocationPreparationFinished(
72
+ alloc_info=alloc_info,
73
73
  is_success=True,
74
74
  )
75
75
  except asyncio.CancelledError:
76
- return TaskPreparationFinished(task_info=task_info, is_success=False)
76
+ return TaskAllocationPreparationFinished(
77
+ alloc_info=alloc_info, is_success=False
78
+ )
77
79
  except BaseException as e:
78
80
  logger.error(
79
- "failed to prepare task for execution",
81
+ "failed to prepare task allocation for execution",
80
82
  exc_info=e,
81
83
  duration=time.monotonic() - start_time,
82
84
  )
83
- return TaskPreparationFinished(task_info=task_info, is_success=False)
85
+ return TaskAllocationPreparationFinished(
86
+ alloc_info=alloc_info, is_success=False
87
+ )
84
88
 
85
89
 
86
- async def _prepare_task_input(
87
- task_info: TaskInfo, blob_store: BLOBStore, logger: Any
88
- ) -> TaskInput:
90
+ async def _prepare_task_alloc_input(
91
+ alloc_info: TaskAllocationInfo, blob_store: BLOBStore, logger: Any
92
+ ) -> TaskAllocationInput:
89
93
  """Prepares the task for execution.
90
94
 
91
95
  Raises an exception on error.
92
96
  """
93
- task: Task = task_info.allocation.task
97
+ task: Task = alloc_info.allocation.task
94
98
  function_init_value_blob: Optional[BLOB] = None
95
99
  function_init_value: Optional[SerializedObjectInsideBLOB] = None
96
100
  if task.HasField("reducer_input"):
@@ -102,7 +106,7 @@ async def _prepare_task_input(
102
106
  function_init_value = _to_serialized_object_inside_blob(task.reducer_input)
103
107
 
104
108
  function_outputs_blob_uri: str = (
105
- f"{task.output_payload_uri_prefix}.{task_info.allocation.allocation_id}.output"
109
+ f"{task.output_payload_uri_prefix}.{alloc_info.allocation.allocation_id}.output"
106
110
  )
107
111
  invocation_error_blob_uri: str = (
108
112
  f"{task.invocation_error_payload_uri_prefix}.{task.graph_invocation_id}.inverr"
@@ -136,7 +140,7 @@ async def _prepare_task_input(
136
140
  )
137
141
  raise
138
142
 
139
- return TaskInput(
143
+ return TaskAllocationInput(
140
144
  function_inputs=FunctionInputs(
141
145
  function_input_blob=await _presign_function_input_blob(
142
146
  data_payload=task.input,
@@ -36,57 +36,57 @@ from indexify.proto.executor_api_pb2 import (
36
36
  TaskOutcomeCode,
37
37
  )
38
38
 
39
- from .events import TaskExecutionFinished
40
- from .metrics.run_task import (
39
+ from .events import TaskAllocationExecutionFinished
40
+ from .metrics.run_task_allocation import (
41
41
  metric_function_executor_run_task_rpc_errors,
42
42
  metric_function_executor_run_task_rpc_latency,
43
43
  metric_function_executor_run_task_rpcs,
44
44
  metric_function_executor_run_task_rpcs_in_progress,
45
45
  )
46
- from .task_info import TaskInfo
47
- from .task_output import TaskMetrics, TaskOutput
46
+ from .task_allocation_info import TaskAllocationInfo
47
+ from .task_allocation_output import TaskAllocationMetrics, TaskAllocationOutput
48
48
 
49
49
  _CREATE_TASK_TIMEOUT_SECS = 5
50
50
  _DELETE_TASK_TIMEOUT_SECS = 5
51
51
 
52
52
 
53
- async def run_task_on_function_executor(
54
- task_info: TaskInfo, function_executor: FunctionExecutor, logger: Any
55
- ) -> TaskExecutionFinished:
56
- """Runs the task on the Function Executor and sets task_info.output with the result.
53
+ async def run_task_allocation_on_function_executor(
54
+ alloc_info: TaskAllocationInfo, function_executor: FunctionExecutor, logger: Any
55
+ ) -> TaskAllocationExecutionFinished:
56
+ """Runs the task on the Function Executor and sets alloc_info.output with the result.
57
57
 
58
58
  Doesn't raise any exceptions.
59
59
  """
60
60
  logger = logger.bind(module=__name__)
61
61
 
62
- if task_info.input is None:
62
+ if alloc_info.input is None:
63
63
  logger.error(
64
- "task input is None, this should never happen",
64
+ "task allocation input is None, this should never happen",
65
65
  )
66
- task_info.output = TaskOutput.internal_error(
67
- allocation=task_info.allocation,
66
+ alloc_info.output = TaskAllocationOutput.internal_error(
67
+ allocation=alloc_info.allocation,
68
68
  execution_start_time=None,
69
69
  execution_end_time=None,
70
70
  )
71
- return TaskExecutionFinished(
72
- task_info=task_info,
71
+ return TaskAllocationExecutionFinished(
72
+ alloc_info=alloc_info,
73
73
  function_executor_termination_reason=None,
74
74
  )
75
75
 
76
76
  task = Task(
77
- namespace=task_info.allocation.task.namespace,
78
- graph_name=task_info.allocation.task.graph_name,
79
- graph_version=task_info.allocation.task.graph_version,
80
- function_name=task_info.allocation.task.function_name,
81
- graph_invocation_id=task_info.allocation.task.graph_invocation_id,
82
- task_id=task_info.allocation.task.id,
83
- allocation_id=task_info.allocation.allocation_id,
84
- request=task_info.input.function_inputs,
77
+ namespace=alloc_info.allocation.task.namespace,
78
+ graph_name=alloc_info.allocation.task.graph_name,
79
+ graph_version=alloc_info.allocation.task.graph_version,
80
+ function_name=alloc_info.allocation.task.function_name,
81
+ graph_invocation_id=alloc_info.allocation.task.graph_invocation_id,
82
+ task_id=alloc_info.allocation.task.id,
83
+ allocation_id=alloc_info.allocation.allocation_id,
84
+ request=alloc_info.input.function_inputs,
85
85
  )
86
86
 
87
87
  function_executor.invocation_state_client().add_task_to_invocation_id_entry(
88
- task_id=task_info.allocation.task.id,
89
- invocation_id=task_info.allocation.task.graph_invocation_id,
88
+ task_id=alloc_info.allocation.task.id,
89
+ invocation_id=alloc_info.allocation.task.graph_invocation_id,
90
90
  )
91
91
 
92
92
  metric_function_executor_run_task_rpcs.inc()
@@ -107,15 +107,15 @@ async def run_task_on_function_executor(
107
107
  # If this RPC failed due to customer code crashing the server we won't be
108
108
  # able to detect this. We'll treat this as our own error for now and thus
109
109
  # let the AioRpcError to be raised here.
110
- timeout_sec: float = task_info.allocation.task.timeout_ms / 1000.0
110
+ timeout_sec: float = alloc_info.allocation.task.timeout_ms / 1000.0
111
111
  try:
112
112
  # This aio task can only be cancelled during this await call.
113
113
  task_result = await _run_task_rpcs(task, function_executor, timeout_sec)
114
114
 
115
115
  _process_task_diagnostics(task_result.diagnostics, logger)
116
116
 
117
- task_info.output = _task_output_from_function_executor_result(
118
- allocation=task_info.allocation,
117
+ alloc_info.output = _task_alloc_output_from_fe_result(
118
+ allocation=alloc_info.allocation,
119
119
  result=task_result,
120
120
  execution_start_time=execution_start_time,
121
121
  execution_end_time=time.monotonic(),
@@ -127,8 +127,8 @@ async def run_task_on_function_executor(
127
127
  function_executor_termination_reason = (
128
128
  FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_FUNCTION_TIMEOUT
129
129
  )
130
- task_info.output = TaskOutput.function_timeout(
131
- allocation=task_info.allocation,
130
+ alloc_info.output = TaskAllocationOutput.function_timeout(
131
+ allocation=alloc_info.allocation,
132
132
  execution_start_time=execution_start_time,
133
133
  execution_end_time=time.monotonic(),
134
134
  )
@@ -150,14 +150,16 @@ async def run_task_on_function_executor(
150
150
  # This is either a create_task() RPC timeout or a
151
151
  # delete_task() RPC timeout; either suggests that the FE
152
152
  # is unhealthy.
153
- logger.error("task management RPC execution deadline exceeded", exc_info=e)
153
+ logger.error(
154
+ "task allocation management RPC execution deadline exceeded", exc_info=e
155
+ )
154
156
  else:
155
157
  # This is a status from an unsuccessful RPC; this
156
158
  # shouldn't happen, but we handle it.
157
- logger.error("task management RPC failed", exc_info=e)
159
+ logger.error("task allocation management RPC failed", exc_info=e)
158
160
 
159
- task_info.output = TaskOutput.function_executor_unresponsive(
160
- allocation=task_info.allocation,
161
+ alloc_info.output = TaskAllocationOutput.function_executor_unresponsive(
162
+ allocation=alloc_info.allocation,
161
163
  execution_start_time=execution_start_time,
162
164
  execution_end_time=time.monotonic(),
163
165
  )
@@ -167,8 +169,8 @@ async def run_task_on_function_executor(
167
169
  function_executor_termination_reason = (
168
170
  FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_FUNCTION_CANCELLED
169
171
  )
170
- task_info.output = TaskOutput.task_cancelled(
171
- allocation=task_info.allocation,
172
+ alloc_info.output = TaskAllocationOutput.task_allocation_cancelled(
173
+ allocation=alloc_info.allocation,
172
174
  execution_start_time=execution_start_time,
173
175
  execution_end_time=time.monotonic(),
174
176
  )
@@ -176,10 +178,11 @@ async def run_task_on_function_executor(
176
178
  # This is an unexpected exception; we believe that this
177
179
  # indicates an internal error.
178
180
  logger.error(
179
- "unexpected internal error during task lifecycle RPC sequence", exc_info=e
181
+ "unexpected internal error during task allocation lifecycle RPC sequence",
182
+ exc_info=e,
180
183
  )
181
- task_info.output = TaskOutput.internal_error(
182
- allocation=task_info.allocation,
184
+ alloc_info.output = TaskAllocationOutput.internal_error(
185
+ allocation=alloc_info.allocation,
183
186
  execution_start_time=execution_start_time,
184
187
  execution_end_time=time.monotonic(),
185
188
  )
@@ -190,11 +193,11 @@ async def run_task_on_function_executor(
190
193
  metric_function_executor_run_task_rpcs_in_progress.dec()
191
194
 
192
195
  function_executor.invocation_state_client().remove_task_to_invocation_id_entry(
193
- task_id=task_info.allocation.task.id,
196
+ task_id=alloc_info.allocation.task.id,
194
197
  )
195
198
 
196
199
  if (
197
- task_info.output.outcome_code == TaskOutcomeCode.TASK_OUTCOME_CODE_FAILURE
200
+ alloc_info.output.outcome_code == TaskOutcomeCode.TASK_OUTCOME_CODE_FAILURE
198
201
  and function_executor_termination_reason is None
199
202
  ):
200
203
  try:
@@ -205,7 +208,7 @@ async def run_task_on_function_executor(
205
208
  FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_UNHEALTHY
206
209
  )
207
210
  logger.error(
208
- "Function Executor health check failed after running task, shutting down Function Executor",
211
+ "Function Executor health check failed after running task allocation, shutting down Function Executor",
209
212
  health_check_fail_reason=result.reason,
210
213
  )
211
214
  except asyncio.CancelledError:
@@ -213,10 +216,10 @@ async def run_task_on_function_executor(
213
216
  # We can't conclude anything about the health of the FE here.
214
217
  pass
215
218
 
216
- _log_task_execution_finished(output=task_info.output, logger=logger)
219
+ _log_task_execution_finished(output=alloc_info.output, logger=logger)
217
220
 
218
- return TaskExecutionFinished(
219
- task_info=task_info,
221
+ return TaskAllocationExecutionFinished(
222
+ alloc_info=alloc_info,
220
223
  function_executor_termination_reason=function_executor_termination_reason,
221
224
  )
222
225
 
@@ -275,17 +278,17 @@ async def _run_task_rpcs(
275
278
  return task_result
276
279
 
277
280
 
278
- def _task_output_from_function_executor_result(
281
+ def _task_alloc_output_from_fe_result(
279
282
  allocation: TaskAllocation,
280
283
  result: TaskResult,
281
284
  execution_start_time: Optional[float],
282
285
  execution_end_time: Optional[float],
283
286
  logger: Any,
284
- ) -> TaskOutput:
287
+ ) -> TaskAllocationOutput:
285
288
  response_validator = MessageValidator(result)
286
289
  response_validator.required_field("outcome_code")
287
290
 
288
- metrics = TaskMetrics(counters={}, timers={})
291
+ metrics = TaskAllocationMetrics(counters={}, timers={})
289
292
  if result.HasField("metrics"):
290
293
  # Can be None if e.g. function failed.
291
294
  metrics.counters = dict(result.metrics.counters)
@@ -312,7 +315,7 @@ def _task_output_from_function_executor_result(
312
315
  # function_outputs can have no items, this happens when the function returns None.
313
316
  response_validator.required_field("uploaded_function_outputs_blob")
314
317
 
315
- return TaskOutput(
318
+ return TaskAllocationOutput(
316
319
  allocation=allocation,
317
320
  outcome_code=outcome_code,
318
321
  failure_reason=failure_reason,
@@ -327,9 +330,9 @@ def _task_output_from_function_executor_result(
327
330
  )
328
331
 
329
332
 
330
- def _log_task_execution_finished(output: TaskOutput, logger: Any) -> None:
333
+ def _log_task_execution_finished(output: TaskAllocationOutput, logger: Any) -> None:
331
334
  logger.info(
332
- "finished running task",
335
+ "finished running task allocation",
333
336
  success=output.outcome_code == TaskOutcomeCode.TASK_OUTCOME_CODE_SUCCESS,
334
337
  outcome_code=TaskOutcomeCode.Name(output.outcome_code),
335
338
  failure_reason=(
@@ -344,7 +347,7 @@ def _process_task_diagnostics(task_diagnostics: TaskDiagnostics, logger: Any) ->
344
347
  MessageValidator(task_diagnostics).required_field("function_executor_log")
345
348
  # Uncomment these lines once we stop printing FE logs to stdout/stderr.
346
349
  # Print FE logs directly to Executor logs so operators can see them.
347
- # logger.info("Function Executor logs during task execution:")
350
+ # logger.info("Function Executor logs during task allocation execution:")
348
351
  # print(task_diagnostics.function_executor_log)
349
352
 
350
353
 
@@ -4,13 +4,13 @@ from typing import Optional
4
4
 
5
5
  from indexify.proto.executor_api_pb2 import TaskAllocation
6
6
 
7
- from .task_input import TaskInput
8
- from .task_output import TaskOutput
7
+ from .task_allocation_input import TaskAllocationInput
8
+ from .task_allocation_output import TaskAllocationOutput
9
9
 
10
10
 
11
11
  @dataclass
12
- class TaskInfo:
13
- """Object used to track a task during its full lifecycle in the FunctionExecutorController."""
12
+ class TaskAllocationInfo:
13
+ """Object used to track a task allocation during its full lifecycle in the FunctionExecutorController."""
14
14
 
15
15
  allocation: TaskAllocation
16
16
  # time.monotonic() timestamp
@@ -22,8 +22,8 @@ class TaskInfo:
22
22
  # aio task that is currently executing a lifecycle step of this task.
23
23
  aio_task: Optional[asyncio.Task] = None
24
24
  # Input if function was prepared successfully.
25
- input: Optional[TaskInput] = None
25
+ input: Optional[TaskAllocationInput] = None
26
26
  # Output of the task, always set when the task is completed.
27
- output: Optional[TaskOutput] = None
27
+ output: Optional[TaskAllocationOutput] = None
28
28
  # True if the task is fully completed and was added to state reporter.
29
29
  is_completed: bool = False
@@ -1,8 +1,8 @@
1
1
  from tensorlake.function_executor.proto.function_executor_pb2 import FunctionInputs
2
2
 
3
3
 
4
- class TaskInput:
5
- """Represents the input for a task in the function executor controller."""
4
+ class TaskAllocationInput:
5
+ """Represents the input for a task allocation in the function executor controller."""
6
6
 
7
7
  def __init__(
8
8
  self,