indexify 0.4.5__py3-none-any.whl → 0.4.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- indexify/executor/function_executor/function_executor.py +30 -25
- indexify/executor/function_executor_controller/__init__.py +7 -4
- indexify/executor/function_executor_controller/create_function_executor.py +125 -27
- indexify/executor/function_executor_controller/destroy_function_executor.py +1 -1
- indexify/executor/function_executor_controller/events.py +10 -14
- indexify/executor/function_executor_controller/function_executor_controller.py +108 -66
- indexify/executor/function_executor_controller/function_executor_startup_output.py +21 -0
- indexify/executor/function_executor_controller/loggers.py +57 -7
- indexify/executor/function_executor_controller/message_validators.py +16 -1
- indexify/executor/function_executor_controller/prepare_task.py +3 -3
- indexify/executor/function_executor_controller/run_task.py +19 -27
- indexify/executor/function_executor_controller/task_info.py +2 -3
- indexify/executor/function_executor_controller/task_output.py +12 -24
- indexify/executor/function_executor_controller/upload_task_output.py +7 -7
- indexify/executor/state_reconciler.py +5 -33
- indexify/executor/state_reporter.py +46 -56
- indexify/proto/executor_api.proto +34 -17
- indexify/proto/executor_api_pb2.py +46 -42
- indexify/proto/executor_api_pb2.pyi +50 -8
- {indexify-0.4.5.dist-info → indexify-0.4.7.dist-info}/METADATA +2 -2
- {indexify-0.4.5.dist-info → indexify-0.4.7.dist-info}/RECORD +23 -22
- {indexify-0.4.5.dist-info → indexify-0.4.7.dist-info}/WHEEL +0 -0
- {indexify-0.4.5.dist-info → indexify-0.4.7.dist-info}/entry_points.txt +0 -0
@@ -16,7 +16,7 @@ from indexify.executor.function_executor.function_executor import FunctionExecut
|
|
16
16
|
from indexify.executor.function_executor.health_checker import HealthCheckResult
|
17
17
|
from indexify.proto.executor_api_pb2 import (
|
18
18
|
FunctionExecutorTerminationReason,
|
19
|
-
|
19
|
+
TaskAllocation,
|
20
20
|
TaskFailureReason,
|
21
21
|
TaskOutcomeCode,
|
22
22
|
)
|
@@ -41,12 +41,13 @@ async def run_task_on_function_executor(
|
|
41
41
|
"""
|
42
42
|
logger = logger.bind(module=__name__)
|
43
43
|
request: RunTaskRequest = RunTaskRequest(
|
44
|
-
namespace=task_info.task.namespace,
|
45
|
-
graph_name=task_info.task.graph_name,
|
46
|
-
graph_version=task_info.task.graph_version,
|
47
|
-
function_name=task_info.task.function_name,
|
48
|
-
graph_invocation_id=task_info.task.graph_invocation_id,
|
49
|
-
task_id=task_info.task.id,
|
44
|
+
namespace=task_info.allocation.task.namespace,
|
45
|
+
graph_name=task_info.allocation.task.graph_name,
|
46
|
+
graph_version=task_info.allocation.task.graph_version,
|
47
|
+
function_name=task_info.allocation.task.function_name,
|
48
|
+
graph_invocation_id=task_info.allocation.task.graph_invocation_id,
|
49
|
+
task_id=task_info.allocation.task.id,
|
50
|
+
allocation_id=task_info.allocation.allocation_id,
|
50
51
|
function_input=task_info.input,
|
51
52
|
)
|
52
53
|
# Don't keep the input in memory after we started running the task.
|
@@ -58,8 +59,8 @@ async def run_task_on_function_executor(
|
|
58
59
|
task_info.init_value = None
|
59
60
|
|
60
61
|
function_executor.invocation_state_client().add_task_to_invocation_id_entry(
|
61
|
-
task_id=task_info.task.id,
|
62
|
-
invocation_id=task_info.task.graph_invocation_id,
|
62
|
+
task_id=task_info.allocation.task.id,
|
63
|
+
invocation_id=task_info.allocation.task.graph_invocation_id,
|
63
64
|
)
|
64
65
|
|
65
66
|
metric_function_executor_run_task_rpcs.inc()
|
@@ -73,16 +74,15 @@ async def run_task_on_function_executor(
|
|
73
74
|
# If this RPC failed due to customer code crashing the server we won't be
|
74
75
|
# able to detect this. We'll treat this as our own error for now and thus
|
75
76
|
# let the AioRpcError be raised here.
|
76
|
-
timeout_sec = task_info.task.timeout_ms / 1000.0
|
77
|
+
timeout_sec = task_info.allocation.task.timeout_ms / 1000.0
|
77
78
|
try:
|
78
79
|
channel: grpc.aio.Channel = function_executor.channel()
|
79
80
|
response: RunTaskResponse = await FunctionExecutorStub(channel).run_task(
|
80
81
|
request, timeout=timeout_sec
|
81
82
|
)
|
82
83
|
task_info.output = _task_output_from_function_executor_response(
|
83
|
-
|
84
|
+
allocation=task_info.allocation,
|
84
85
|
response=response,
|
85
|
-
allocation_id=task_info.allocation_id,
|
86
86
|
)
|
87
87
|
except grpc.aio.AioRpcError as e:
|
88
88
|
if e.code() == grpc.StatusCode.DEADLINE_EXCEEDED:
|
@@ -91,36 +91,29 @@ async def run_task_on_function_executor(
|
|
91
91
|
FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_FUNCTION_TIMEOUT
|
92
92
|
)
|
93
93
|
task_info.output = TaskOutput.function_timeout(
|
94
|
-
|
95
|
-
allocation_id=task_info.allocation_id,
|
94
|
+
allocation=task_info.allocation,
|
96
95
|
timeout_sec=timeout_sec,
|
97
96
|
)
|
98
97
|
else:
|
99
98
|
metric_function_executor_run_task_rpc_errors.inc()
|
100
99
|
logger.error("task execution failed", exc_info=e)
|
101
|
-
task_info.output = TaskOutput.internal_error(
|
102
|
-
task=task_info.task, allocation_id=task_info.allocation_id
|
103
|
-
)
|
100
|
+
task_info.output = TaskOutput.internal_error(task_info.allocation)
|
104
101
|
except asyncio.CancelledError:
|
105
102
|
# The task is still running in FE, we only cancelled the client-side RPC.
|
106
103
|
function_executor_termination_reason = (
|
107
104
|
FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_FUNCTION_CANCELLED
|
108
105
|
)
|
109
|
-
task_info.output = TaskOutput.task_cancelled(
|
110
|
-
task=task_info.task, allocation_id=task_info.allocation_id
|
111
|
-
)
|
106
|
+
task_info.output = TaskOutput.task_cancelled(task_info.allocation)
|
112
107
|
except Exception as e:
|
113
108
|
metric_function_executor_run_task_rpc_errors.inc()
|
114
109
|
logger.error("task execution failed", exc_info=e)
|
115
|
-
task_info.output = TaskOutput.internal_error(
|
116
|
-
task=task_info.task, allocation_id=task_info.allocation_id
|
117
|
-
)
|
110
|
+
task_info.output = TaskOutput.internal_error(task_info.allocation)
|
118
111
|
|
119
112
|
metric_function_executor_run_task_rpc_latency.observe(time.monotonic() - start_time)
|
120
113
|
metric_function_executor_run_task_rpcs_in_progress.dec()
|
121
114
|
|
122
115
|
function_executor.invocation_state_client().remove_task_to_invocation_id_entry(
|
123
|
-
task_id=task_info.task.id,
|
116
|
+
task_id=task_info.allocation.task.id,
|
124
117
|
)
|
125
118
|
|
126
119
|
if (
|
@@ -147,7 +140,7 @@ async def run_task_on_function_executor(
|
|
147
140
|
|
148
141
|
|
149
142
|
def _task_output_from_function_executor_response(
|
150
|
-
|
143
|
+
allocation: TaskAllocation, response: RunTaskResponse
|
151
144
|
) -> TaskOutput:
|
152
145
|
response_validator = MessageValidator(response)
|
153
146
|
response_validator.required_field("stdout")
|
@@ -162,8 +155,7 @@ def _task_output_from_function_executor_response(
|
|
162
155
|
metrics.timers = dict(response.metrics.timers)
|
163
156
|
|
164
157
|
output = TaskOutput(
|
165
|
-
|
166
|
-
allocation_id=allocation_id,
|
158
|
+
allocation=allocation,
|
167
159
|
outcome_code=(
|
168
160
|
TaskOutcomeCode.TASK_OUTCOME_CODE_SUCCESS
|
169
161
|
if response.success
|
@@ -4,7 +4,7 @@ from typing import Optional
|
|
4
4
|
|
5
5
|
from tensorlake.function_executor.proto.function_executor_pb2 import SerializedObject
|
6
6
|
|
7
|
-
from indexify.proto.executor_api_pb2 import
|
7
|
+
from indexify.proto.executor_api_pb2 import TaskAllocation
|
8
8
|
|
9
9
|
from .task_output import TaskOutput
|
10
10
|
|
@@ -13,8 +13,7 @@ from .task_output import TaskOutput
|
|
13
13
|
class TaskInfo:
|
14
14
|
"""Object used to track a task during its full lifecycle in the FunctionExecutorController."""
|
15
15
|
|
16
|
-
|
17
|
-
allocation_id: str
|
16
|
+
allocation: TaskAllocation
|
18
17
|
# time.monotonic() timestamp
|
19
18
|
start_time: float
|
20
19
|
# time.monotonic() timestamp when the task was prepared for execution
|
@@ -7,7 +7,7 @@ from tensorlake.function_executor.proto.function_executor_pb2 import (
|
|
7
7
|
|
8
8
|
from indexify.proto.executor_api_pb2 import (
|
9
9
|
DataPayload,
|
10
|
-
|
10
|
+
TaskAllocation,
|
11
11
|
TaskFailureReason,
|
12
12
|
TaskOutcomeCode,
|
13
13
|
)
|
@@ -26,8 +26,7 @@ class TaskOutput:
|
|
26
26
|
|
27
27
|
def __init__(
|
28
28
|
self,
|
29
|
-
|
30
|
-
allocation_id: str,
|
29
|
+
allocation: TaskAllocation,
|
31
30
|
outcome_code: TaskOutcomeCode,
|
32
31
|
# Optional[TaskFailureReason] is not supported in python 3.9
|
33
32
|
failure_reason: TaskFailureReason = None,
|
@@ -42,8 +41,8 @@ class TaskOutput:
|
|
42
41
|
uploaded_stdout: Optional[DataPayload] = None,
|
43
42
|
uploaded_stderr: Optional[DataPayload] = None,
|
44
43
|
):
|
45
|
-
self.task = task
|
46
|
-
self.
|
44
|
+
self.task = allocation.task
|
45
|
+
self.allocation = allocation
|
47
46
|
self.function_output = function_output
|
48
47
|
self.router_output = router_output
|
49
48
|
self.stdout = stdout
|
@@ -60,14 +59,12 @@ class TaskOutput:
|
|
60
59
|
@classmethod
|
61
60
|
def internal_error(
|
62
61
|
cls,
|
63
|
-
|
64
|
-
allocation_id: str,
|
62
|
+
allocation: TaskAllocation,
|
65
63
|
) -> "TaskOutput":
|
66
64
|
"""Creates a TaskOutput for an internal error."""
|
67
65
|
# We are not sharing internal error messages with the customer.
|
68
66
|
return TaskOutput(
|
69
|
-
|
70
|
-
allocation_id=allocation_id,
|
67
|
+
allocation=allocation,
|
71
68
|
outcome_code=TaskOutcomeCode.TASK_OUTCOME_CODE_FAILURE,
|
72
69
|
failure_reason=TaskFailureReason.TASK_FAILURE_REASON_INTERNAL_ERROR,
|
73
70
|
stderr="Platform failed to execute the function.",
|
@@ -76,15 +73,13 @@ class TaskOutput:
|
|
76
73
|
@classmethod
|
77
74
|
def function_timeout(
|
78
75
|
cls,
|
79
|
-
|
80
|
-
allocation_id: str,
|
76
|
+
allocation: TaskAllocation,
|
81
77
|
timeout_sec: float,
|
82
78
|
) -> "TaskOutput":
|
83
79
|
"""Creates a TaskOutput for a function timeout error."""
|
84
80
|
# Task stdout, stderr is not available.
|
85
81
|
return TaskOutput(
|
86
|
-
|
87
|
-
allocation_id=allocation_id,
|
82
|
+
allocation=allocation,
|
88
83
|
outcome_code=TaskOutcomeCode.TASK_OUTCOME_CODE_FAILURE,
|
89
84
|
failure_reason=TaskFailureReason.TASK_FAILURE_REASON_FUNCTION_TIMEOUT,
|
90
85
|
stderr=f"Function exceeded its configured timeout of {timeout_sec:.3f} sec.",
|
@@ -93,13 +88,11 @@ class TaskOutput:
|
|
93
88
|
@classmethod
|
94
89
|
def task_cancelled(
|
95
90
|
cls,
|
96
|
-
|
97
|
-
allocation_id: str,
|
91
|
+
allocation: TaskAllocation,
|
98
92
|
) -> "TaskOutput":
|
99
93
|
"""Creates a TaskOutput for the case when task didn't finish because its allocation was removed by Server."""
|
100
94
|
return TaskOutput(
|
101
|
-
|
102
|
-
allocation_id=allocation_id,
|
95
|
+
allocation=allocation,
|
103
96
|
outcome_code=TaskOutcomeCode.TASK_OUTCOME_CODE_FAILURE,
|
104
97
|
failure_reason=TaskFailureReason.TASK_FAILURE_REASON_TASK_CANCELLED,
|
105
98
|
)
|
@@ -107,16 +100,11 @@ class TaskOutput:
|
|
107
100
|
@classmethod
|
108
101
|
def function_executor_terminated(
|
109
102
|
cls,
|
110
|
-
|
111
|
-
allocation_id: str,
|
103
|
+
allocation: TaskAllocation,
|
112
104
|
) -> "TaskOutput":
|
113
105
|
"""Creates a TaskOutput for the case when task didn't run because its FE terminated."""
|
114
106
|
return TaskOutput(
|
115
|
-
|
116
|
-
allocation_id=allocation_id,
|
107
|
+
allocation=allocation,
|
117
108
|
outcome_code=TaskOutcomeCode.TASK_OUTCOME_CODE_FAILURE,
|
118
109
|
failure_reason=TaskFailureReason.TASK_FAILURE_REASON_FUNCTION_EXECUTOR_TERMINATED,
|
119
|
-
# TODO: add FE startup stdout, stderr to the task output if FE failed to startup.
|
120
|
-
stdout="",
|
121
|
-
stderr="Can't execute the function because its Function Executor terminated.",
|
122
110
|
)
|
@@ -121,13 +121,13 @@ async def _upload_to_blob_store(
|
|
121
121
|
output: TaskOutput, blob_store: BLOBStore, logger: Any
|
122
122
|
) -> None:
|
123
123
|
if output.stdout is not None:
|
124
|
-
stdout_url = f"{output.task.output_payload_uri_prefix}.{output.task.id}.stdout"
|
124
|
+
stdout_url = f"{output.allocation.task.output_payload_uri_prefix}.{output.allocation.task.id}.stdout"
|
125
125
|
stdout_bytes: bytes = output.stdout.encode()
|
126
126
|
await blob_store.put(stdout_url, stdout_bytes, logger)
|
127
127
|
output.uploaded_stdout = DataPayload(
|
128
128
|
uri=stdout_url,
|
129
129
|
size=len(stdout_bytes),
|
130
|
-
sha256_hash=
|
130
|
+
sha256_hash=compute_hash(stdout_bytes),
|
131
131
|
encoding=DataPayloadEncoding.DATA_PAYLOAD_ENCODING_UTF8_TEXT,
|
132
132
|
encoding_version=0,
|
133
133
|
)
|
@@ -135,13 +135,13 @@ async def _upload_to_blob_store(
|
|
135
135
|
output.stdout = None
|
136
136
|
|
137
137
|
if output.stderr is not None:
|
138
|
-
stderr_url = f"{output.task.output_payload_uri_prefix}.{output.task.id}.stderr"
|
138
|
+
stderr_url = f"{output.allocation.task.output_payload_uri_prefix}.{output.allocation.task.id}.stderr"
|
139
139
|
stderr_bytes: bytes = output.stderr.encode()
|
140
140
|
await blob_store.put(stderr_url, stderr_bytes, logger)
|
141
141
|
output.uploaded_stderr = DataPayload(
|
142
142
|
uri=stderr_url,
|
143
143
|
size=len(stderr_bytes),
|
144
|
-
sha256_hash=
|
144
|
+
sha256_hash=compute_hash(stderr_bytes),
|
145
145
|
encoding=DataPayloadEncoding.DATA_PAYLOAD_ENCODING_UTF8_TEXT,
|
146
146
|
encoding_version=0,
|
147
147
|
)
|
@@ -153,7 +153,7 @@ async def _upload_to_blob_store(
|
|
153
153
|
uploaded_data_payloads = []
|
154
154
|
for func_output_item in output.function_output.outputs:
|
155
155
|
node_output_sequence = len(uploaded_data_payloads)
|
156
|
-
output_url = f"{output.task.output_payload_uri_prefix}.{output.task.id}.{node_output_sequence}"
|
156
|
+
output_url = f"{output.allocation.task.output_payload_uri_prefix}.{output.allocation.task.id}.{node_output_sequence}"
|
157
157
|
output_bytes: bytes = (
|
158
158
|
func_output_item.bytes
|
159
159
|
if func_output_item.HasField("bytes")
|
@@ -164,7 +164,7 @@ async def _upload_to_blob_store(
|
|
164
164
|
DataPayload(
|
165
165
|
uri=output_url,
|
166
166
|
size=len(output_bytes),
|
167
|
-
sha256_hash=
|
167
|
+
sha256_hash=compute_hash(output_bytes),
|
168
168
|
encoding=_to_grpc_data_payload_encoding(output),
|
169
169
|
encoding_version=0,
|
170
170
|
)
|
@@ -214,7 +214,7 @@ def _to_grpc_data_payload_encoding(task_output: TaskOutput) -> DataPayloadEncodi
|
|
214
214
|
return DataPayloadEncoding.DATA_PAYLOAD_ENCODING_BINARY_PICKLE
|
215
215
|
|
216
216
|
|
217
|
-
def
|
217
|
+
def compute_hash(data: bytes) -> str:
|
218
218
|
hasher = hashlib.sha256(usedforsecurity=False)
|
219
219
|
hasher.update(data)
|
220
220
|
return hasher.hexdigest()
|
@@ -21,9 +21,9 @@ from .function_executor.server.function_executor_server_factory import (
|
|
21
21
|
from .function_executor_controller import (
|
22
22
|
FunctionExecutorController,
|
23
23
|
function_executor_logger,
|
24
|
-
|
24
|
+
task_allocation_logger,
|
25
25
|
validate_function_executor_description,
|
26
|
-
|
26
|
+
validate_task_allocation,
|
27
27
|
)
|
28
28
|
from .metrics.state_reconciler import (
|
29
29
|
metric_state_reconciliation_errors,
|
@@ -376,31 +376,16 @@ class ExecutorStateReconciler:
|
|
376
376
|
# Nothing to do, task already exists and it's immutable.
|
377
377
|
return
|
378
378
|
|
379
|
-
function_executor_controller.
|
380
|
-
task=task_allocation.task,
|
381
|
-
allocation_id=task_allocation.allocation_id,
|
382
|
-
)
|
379
|
+
function_executor_controller.add_task_allocation(task_allocation)
|
383
380
|
|
384
381
|
def _valid_task_allocations(self, task_allocations: Iterable[TaskAllocation]):
|
385
382
|
valid_task_allocations: List[TaskAllocation] = []
|
386
383
|
for task_allocation in task_allocations:
|
387
384
|
task_allocation: TaskAllocation
|
388
|
-
logger = self.
|
389
|
-
|
390
|
-
try:
|
391
|
-
validate_task(task_allocation.task)
|
392
|
-
except ValueError as e:
|
393
|
-
# There's no way to report this error to Server so just log it.
|
394
|
-
logger.error(
|
395
|
-
"received invalid TaskAllocation from Server, dropping it from desired state",
|
396
|
-
exc_info=e,
|
397
|
-
)
|
398
|
-
continue
|
385
|
+
logger = task_allocation_logger(task_allocation, self._logger)
|
399
386
|
|
400
|
-
validator = MessageValidator(task_allocation)
|
401
387
|
try:
|
402
|
-
|
403
|
-
validator.required_field("allocation_id")
|
388
|
+
validate_task_allocation(task_allocation)
|
404
389
|
except ValueError as e:
|
405
390
|
# There's no way to report this error to Server so just log it.
|
406
391
|
logger.error(
|
@@ -423,16 +408,3 @@ class ExecutorStateReconciler:
|
|
423
408
|
valid_task_allocations.append(task_allocation)
|
424
409
|
|
425
410
|
return valid_task_allocations
|
426
|
-
|
427
|
-
def _task_allocation_logger(self, task_allocation: TaskAllocation) -> Any:
|
428
|
-
"""Returns a logger for the given TaskAllocation.
|
429
|
-
|
430
|
-
Doesn't assume that the supplied TaskAllocation is valid.
|
431
|
-
"""
|
432
|
-
return task_logger(task_allocation.task, self._logger).bind(
|
433
|
-
function_executor_id=(
|
434
|
-
task_allocation.function_executor_id
|
435
|
-
if task_allocation.HasField("function_executor_id")
|
436
|
-
else None
|
437
|
-
)
|
438
|
-
)
|
@@ -9,7 +9,9 @@ from indexify.proto.executor_api_pb2 import (
|
|
9
9
|
AllowedFunction,
|
10
10
|
ExecutorState,
|
11
11
|
ExecutorStatus,
|
12
|
+
ExecutorUpdate,
|
12
13
|
FunctionExecutorState,
|
14
|
+
FunctionExecutorUpdate,
|
13
15
|
GPUModel,
|
14
16
|
GPUResources,
|
15
17
|
)
|
@@ -24,8 +26,7 @@ from indexify.proto.executor_api_pb2_grpc import ExecutorAPIStub
|
|
24
26
|
|
25
27
|
from .channel_manager import ChannelManager
|
26
28
|
from .function_allowlist import FunctionURI
|
27
|
-
from .function_executor_controller.loggers import
|
28
|
-
from .function_executor_controller.task_output import TaskOutput
|
29
|
+
from .function_executor_controller.loggers import task_result_logger
|
29
30
|
from .host_resources.host_resources import HostResources, HostResourcesProvider
|
30
31
|
from .host_resources.nvidia_gpu import NVIDIA_GPU_MODEL
|
31
32
|
from .metrics.state_reporter import (
|
@@ -83,7 +84,8 @@ class ExecutorStateReporter:
|
|
83
84
|
self._last_server_clock: int = (
|
84
85
|
0 # Server expects initial value to be 0 until it is set by Server.
|
85
86
|
)
|
86
|
-
self.
|
87
|
+
self._pending_task_results: List[TaskResult] = []
|
88
|
+
self._pending_fe_updates: List[FunctionExecutorUpdate] = []
|
87
89
|
self._function_executor_states: Dict[str, FunctionExecutorState] = {}
|
88
90
|
|
89
91
|
def update_executor_status(self, value: ExecutorStatus) -> None:
|
@@ -98,7 +100,7 @@ class ExecutorStateReporter:
|
|
98
100
|
) -> None:
|
99
101
|
self._function_executor_states[state.description.id] = state
|
100
102
|
|
101
|
-
def
|
103
|
+
def remove_function_executor_state(self, function_executor_id: str) -> None:
|
102
104
|
if function_executor_id not in self._function_executor_states:
|
103
105
|
self._logger.warning(
|
104
106
|
"attempted to remove non-existing function executor state",
|
@@ -108,8 +110,12 @@ class ExecutorStateReporter:
|
|
108
110
|
|
109
111
|
self._function_executor_states.pop(function_executor_id)
|
110
112
|
|
111
|
-
def
|
112
|
-
self.
|
113
|
+
def add_completed_task_result(self, task_result: TaskResult) -> None:
|
114
|
+
self._pending_task_results.append(task_result)
|
115
|
+
|
116
|
+
def add_function_executor_update(self, update: FunctionExecutorUpdate) -> None:
|
117
|
+
"""Adds a function executor update to the list of updates to be reported."""
|
118
|
+
self._pending_fe_updates.append(update)
|
113
119
|
|
114
120
|
def schedule_state_report(self) -> None:
|
115
121
|
"""Schedules a state report to be sent to the server asap.
|
@@ -212,19 +218,28 @@ class ExecutorStateReporter:
|
|
212
218
|
):
|
213
219
|
metric_state_report_rpcs.inc()
|
214
220
|
state: ExecutorState = self._current_executor_state()
|
215
|
-
|
216
|
-
|
221
|
+
update: ExecutorUpdate = self._remove_pending_update()
|
222
|
+
|
223
|
+
for task_result in update.task_results:
|
224
|
+
task_result_logger(task_result, self._logger).info(
|
225
|
+
"reporting task outcome",
|
226
|
+
outcome_code=TaskOutcomeCode.Name(task_result.outcome_code),
|
227
|
+
failure_reason=(
|
228
|
+
TaskFailureReason.Name(task_result.failure_reason)
|
229
|
+
if task_result.HasField("failure_reason")
|
230
|
+
else "None"
|
231
|
+
),
|
232
|
+
)
|
217
233
|
|
218
234
|
try:
|
219
235
|
await stub.report_executor_state(
|
220
236
|
ReportExecutorStateRequest(
|
221
|
-
executor_state=state,
|
237
|
+
executor_state=state, executor_update=update
|
222
238
|
),
|
223
239
|
timeout=_REPORT_RPC_TIMEOUT_SEC,
|
224
240
|
)
|
225
241
|
except Exception as e:
|
226
|
-
|
227
|
-
self.add_completed_task_output(task_output)
|
242
|
+
self._add_to_pending_update(update)
|
228
243
|
raise
|
229
244
|
|
230
245
|
def _current_executor_state(self) -> ExecutorState:
|
@@ -247,21 +262,26 @@ class ExecutorStateReporter:
|
|
247
262
|
state.server_clock = self._last_server_clock
|
248
263
|
return state
|
249
264
|
|
250
|
-
def
|
251
|
-
|
252
|
-
|
253
|
-
|
254
|
-
|
255
|
-
|
256
|
-
|
257
|
-
|
258
|
-
|
259
|
-
|
260
|
-
|
261
|
-
|
262
|
-
|
263
|
-
|
264
|
-
|
265
|
+
def _remove_pending_update(self) -> ExecutorUpdate:
|
266
|
+
"""Removes all pending executor updates and returns them."""
|
267
|
+
# No races here because we don't await.
|
268
|
+
task_results: List[TaskResult] = self._pending_task_results
|
269
|
+
self._pending_task_results = []
|
270
|
+
|
271
|
+
fe_updates: List[FunctionExecutorUpdate] = self._pending_fe_updates
|
272
|
+
self._pending_fe_updates = []
|
273
|
+
|
274
|
+
return ExecutorUpdate(
|
275
|
+
executor_id=self._executor_id,
|
276
|
+
task_results=task_results,
|
277
|
+
function_executor_updates=fe_updates,
|
278
|
+
)
|
279
|
+
|
280
|
+
def _add_to_pending_update(self, update: ExecutorUpdate) -> None:
|
281
|
+
for task_result in update.task_results:
|
282
|
+
self.add_completed_task_result(task_result)
|
283
|
+
for function_executor_update in update.function_executor_updates:
|
284
|
+
self.add_function_executor_update(function_executor_update)
|
265
285
|
|
266
286
|
|
267
287
|
def _to_allowed_function_protos(
|
@@ -324,36 +344,6 @@ def _to_gpu_model_proto(nvidia_gpu_model: NVIDIA_GPU_MODEL) -> GPUModel:
|
|
324
344
|
return GPUModel.GPU_MODEL_UNKNOWN
|
325
345
|
|
326
346
|
|
327
|
-
def _to_task_result_protos(task_outputs: List[TaskOutput]) -> List[TaskResult]:
|
328
|
-
task_results: List[TaskResult] = []
|
329
|
-
|
330
|
-
for output in task_outputs:
|
331
|
-
task_result = TaskResult(
|
332
|
-
task_id=output.task.id,
|
333
|
-
allocation_id=output.allocation_id,
|
334
|
-
namespace=output.task.namespace,
|
335
|
-
graph_name=output.task.graph_name,
|
336
|
-
function_name=output.task.function_name,
|
337
|
-
graph_invocation_id=output.task.graph_invocation_id,
|
338
|
-
reducer=output.reducer,
|
339
|
-
outcome_code=output.outcome_code,
|
340
|
-
next_functions=(output.router_output.edges if output.router_output else []),
|
341
|
-
function_outputs=output.uploaded_data_payloads,
|
342
|
-
)
|
343
|
-
if output.failure_reason is not None:
|
344
|
-
task_result.failure_reason = output.failure_reason
|
345
|
-
if output.uploaded_stdout is not None:
|
346
|
-
task_result.stdout.CopyFrom(output.uploaded_stdout)
|
347
|
-
if output.uploaded_stderr is not None:
|
348
|
-
task_result.stderr.CopyFrom(output.uploaded_stderr)
|
349
|
-
if output.router_output is not None:
|
350
|
-
task_result.routing.next_functions[:] = output.router_output.edges
|
351
|
-
|
352
|
-
task_results.append(task_result)
|
353
|
-
|
354
|
-
return task_results
|
355
|
-
|
356
|
-
|
357
347
|
def _executor_labels() -> Dict[str, str]:
|
358
348
|
"""Returns standard executor labels always added to user supplied labels."""
|
359
349
|
return {
|
@@ -109,6 +109,10 @@ message FunctionExecutorDescription {
|
|
109
109
|
optional uint32 customer_code_timeout_ms = 9;
|
110
110
|
optional DataPayload graph = 10;
|
111
111
|
optional FunctionExecutorResources resources = 11;
|
112
|
+
// URI prefix for the startup output payloads.
|
113
|
+
// S3 URI if the data is stored in S3.
|
114
|
+
// Starts with "file://" prefix followed by an absolute directory path if the data is stored on a local file system.
|
115
|
+
optional string output_payload_uri_prefix = 12;
|
112
116
|
}
|
113
117
|
|
114
118
|
message FunctionExecutorState {
|
@@ -117,6 +121,12 @@ message FunctionExecutorState {
|
|
117
121
|
optional FunctionExecutorTerminationReason termination_reason = 3;
|
118
122
|
}
|
119
123
|
|
124
|
+
message FunctionExecutorUpdate {
|
125
|
+
optional FunctionExecutorDescription description = 1;
|
126
|
+
optional DataPayload startup_stdout = 2;
|
127
|
+
optional DataPayload startup_stderr = 3;
|
128
|
+
}
|
129
|
+
|
120
130
|
enum ExecutorStatus {
|
121
131
|
EXECUTOR_STATUS_UNKNOWN = 0;
|
122
132
|
EXECUTOR_STATUS_STARTING_UP = 1;
|
@@ -145,13 +155,20 @@ message ExecutorState {
|
|
145
155
|
optional uint64 server_clock = 12;
|
146
156
|
}
|
147
157
|
|
148
|
-
//
|
158
|
+
// Updates that Executor wants to report to Server. If report_executor_state RPC is successful
|
159
|
+
// then the updates from it won't be included in the next RPC.
|
160
|
+
message ExecutorUpdate {
|
161
|
+
optional string executor_id = 1;
|
162
|
+
repeated TaskResult task_results = 2;
|
163
|
+
repeated FunctionExecutorUpdate function_executor_updates = 3;
|
164
|
+
}
|
165
|
+
|
149
166
|
message ReportExecutorStateRequest {
|
150
167
|
optional ExecutorState executor_state = 1;
|
151
|
-
|
168
|
+
optional ExecutorUpdate executor_update = 2;
|
152
169
|
}
|
153
170
|
|
154
|
-
// A message sent by Server to Executor to acknowledge the receipt of
|
171
|
+
// A message sent by Server to Executor to acknowledge the receipt of ReportExecutorStateRequest.
|
155
172
|
message ReportExecutorStateResponse {
|
156
173
|
}
|
157
174
|
|
@@ -226,24 +243,24 @@ message ResultRouting {
|
|
226
243
|
|
227
244
|
message TaskResult {
|
228
245
|
optional string task_id = 1;
|
229
|
-
optional string
|
230
|
-
optional string
|
231
|
-
optional string
|
232
|
-
optional string
|
233
|
-
optional
|
234
|
-
optional
|
235
|
-
optional
|
246
|
+
optional string allocation_id = 2;
|
247
|
+
optional string namespace = 3;
|
248
|
+
optional string graph_name = 4;
|
249
|
+
optional string graph_version = 5;
|
250
|
+
optional string function_name = 6;
|
251
|
+
optional string graph_invocation_id = 7;
|
252
|
+
optional bool reducer = 8;
|
253
|
+
optional TaskOutcomeCode outcome_code = 9;
|
254
|
+
optional TaskFailureReason failure_reason = 10;
|
236
255
|
// Edges that the function wants the invocation to be routed to.
|
237
256
|
// Previously called router_edges.
|
238
257
|
// NB: An empty list indicates that the graph's route definitions should be used,
|
239
258
|
// unless this field is overridden by the presence of the `routing` field.
|
240
|
-
repeated string next_functions =
|
241
|
-
repeated DataPayload function_outputs =
|
259
|
+
repeated string next_functions = 11;
|
260
|
+
repeated DataPayload function_outputs = 12;
|
242
261
|
// Standard output and error streams of the function.
|
243
|
-
optional DataPayload stdout =
|
244
|
-
optional DataPayload stderr =
|
245
|
-
|
246
|
-
optional string allocation_id = 13;
|
262
|
+
optional DataPayload stdout = 13;
|
263
|
+
optional DataPayload stderr = 14;
|
247
264
|
|
248
265
|
// Indicates how the results should be routed.
|
249
266
|
// If this is present, it replaces `next_functions`.
|
@@ -253,7 +270,7 @@ message TaskResult {
|
|
253
270
|
// graph's routing. The long-term goal is to deprecate
|
254
271
|
// `next_functions`, so that if `routing` is not present, the
|
255
272
|
// graph's routing definitions will always be used.
|
256
|
-
ResultRouting routing =
|
273
|
+
ResultRouting routing = 15;
|
257
274
|
}
|
258
275
|
|
259
276
|
// Internal API for scheduling and running tasks on Executors. Executors are acting as clients of this API.
|