indexify-0.4.21-py3-none-any.whl → indexify-0.4.23-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- indexify/cli/executor.py +2 -9
- indexify/executor/blob_store/blob_store.py +110 -26
- indexify/executor/blob_store/local_fs_blob_store.py +41 -1
- indexify/executor/blob_store/metrics/blob_store.py +87 -15
- indexify/executor/blob_store/s3_blob_store.py +112 -1
- indexify/executor/function_executor/function_executor.py +32 -56
- indexify/executor/function_executor/invocation_state_client.py +10 -3
- indexify/executor/function_executor/server/function_executor_server_factory.py +0 -1
- indexify/executor/function_executor_controller/create_function_executor.py +129 -116
- indexify/executor/function_executor_controller/downloads.py +34 -86
- indexify/executor/function_executor_controller/events.py +13 -7
- indexify/executor/function_executor_controller/finalize_task.py +184 -0
- indexify/executor/function_executor_controller/function_executor_controller.py +121 -78
- indexify/executor/function_executor_controller/message_validators.py +10 -3
- indexify/executor/function_executor_controller/metrics/downloads.py +8 -52
- indexify/executor/function_executor_controller/metrics/finalize_task.py +20 -0
- indexify/executor/function_executor_controller/metrics/prepare_task.py +18 -0
- indexify/executor/function_executor_controller/metrics/run_task.py +5 -4
- indexify/executor/function_executor_controller/prepare_task.py +232 -14
- indexify/executor/function_executor_controller/run_task.py +189 -81
- indexify/executor/function_executor_controller/task_info.py +4 -7
- indexify/executor/function_executor_controller/task_input.py +21 -0
- indexify/executor/function_executor_controller/task_output.py +41 -33
- indexify/executor/function_executor_controller/terminate_function_executor.py +6 -1
- indexify/executor/logging.py +69 -0
- indexify/executor/monitoring/metrics.py +22 -0
- indexify/proto/executor_api.proto +11 -3
- indexify/proto/executor_api_pb2.py +54 -54
- indexify/proto/executor_api_pb2.pyi +8 -1
- {indexify-0.4.21.dist-info → indexify-0.4.23.dist-info}/METADATA +6 -7
- {indexify-0.4.21.dist-info → indexify-0.4.23.dist-info}/RECORD +33 -31
- indexify/executor/function_executor_controller/function_executor_startup_output.py +0 -21
- indexify/executor/function_executor_controller/metrics/upload_task_output.py +0 -39
- indexify/executor/function_executor_controller/upload_task_output.py +0 -274
- {indexify-0.4.21.dist-info → indexify-0.4.23.dist-info}/WHEEL +0 -0
- {indexify-0.4.21.dist-info → indexify-0.4.23.dist-info}/entry_points.txt +0 -0
indexify/executor/function_executor_controller/finalize_task.py (new file)
@@ -0,0 +1,184 @@
+import asyncio
+import time
+from typing import Any
+
+from indexify.executor.blob_store.blob_store import BLOBStore
+from indexify.proto.executor_api_pb2 import (
+    TaskFailureReason,
+    TaskOutcomeCode,
+)
+
+from .events import TaskFinalizationFinished
+from .metrics.finalize_task import (
+    metric_task_finalization_errors,
+    metric_task_finalization_latency,
+    metric_task_finalizations,
+    metric_tasks_finalizing,
+)
+from .task_info import TaskInfo
+from .task_input import TaskInput
+from .task_output import TaskOutput
+
+
+async def finalize_task(
+    task_info: TaskInfo, blob_store: BLOBStore, logger: Any
+) -> TaskFinalizationFinished:
+    """Prepares the task output for getting it reported to Server.
+
+    The task output is either coming from a failed task or from its finished execution on the Function Executor.
+    Doesn't raise any Exceptions.
+    """
+    logger = logger.bind(module=__name__)
+    start_time = time.monotonic()
+
+    with (
+        metric_tasks_finalizing.track_inprogress(),
+        metric_task_finalization_latency.time(),
+        metric_task_finalization_errors.count_exceptions(),
+    ):
+        metric_task_finalizations.inc()
+        try:
+            await _finalize_task_output(
+                task_info=task_info,
+                blob_store=blob_store,
+                logger=logger,
+            )
+            logger.info(
+                "task finalized",
+                duration=time.monotonic() - start_time,
+            )
+            return TaskFinalizationFinished(task_info=task_info, is_success=True)
+        except asyncio.CancelledError:
+            return TaskFinalizationFinished(task_info=task_info, is_success=False)
+        except BaseException as e:
+            logger.error(
+                "failed to finalize task",
+                exc_info=e,
+                duration=time.monotonic() - start_time,
+            )
+            return TaskFinalizationFinished(task_info=task_info, is_success=False)
+
+
+class _TaskOutputSummary:
+    def __init__(self):
+        self.output_count: int = 0
+        self.output_bytes: int = 0
+        self.invocation_error_output_count: int = 0
+        self.invocation_error_output_bytes: int = 0
+        self.next_functions_count: int = 0
+
+
+async def _finalize_task_output(
+    task_info: TaskInfo, blob_store: BLOBStore, logger: Any
+) -> None:
+    """Finalizes the task output.
+
+    Raises exception on error."""
+    if task_info.input is None:
+        raise Exception(
+            "task input is None, this should never happen",
+        )
+    if task_info.output is None:
+        raise Exception(
+            "task output is None, this should never happen",
+        )
+
+    input: TaskInput = task_info.input
+    output: TaskOutput = task_info.output
+
+    output_summary: _TaskOutputSummary = _task_output_summary(output)
+    logger.info(
+        "task output summary",
+        output_count=output_summary.output_count,
+        output_bytes=output_summary.output_bytes,
+        invocation_error_output_count=output_summary.invocation_error_output_count,
+        invocation_error_output_bytes=output_summary.invocation_error_output_bytes,
+        next_functions_count=output_summary.next_functions_count,
+    )
+
+    _log_function_metrics(output, logger)
+
+    if output.outcome_code == TaskOutcomeCode.TASK_OUTCOME_CODE_SUCCESS:
+        if len(output.uploaded_function_outputs_blob.chunks) == 0:
+            # No output from function, usually means it returns None.
+            await blob_store.abort_multipart_upload(
+                uri=input.function_outputs_blob_uri,
+                upload_id=input.function_outputs_blob_upload_id,
+                logger=logger,
+            )
+        else:
+            await blob_store.complete_multipart_upload(
+                uri=input.function_outputs_blob_uri,
+                upload_id=input.function_outputs_blob_upload_id,
+                parts_etags=[
+                    blob_chunk.etag
+                    for blob_chunk in output.uploaded_function_outputs_blob.chunks
+                ],
+                logger=logger,
+            )
+        await blob_store.abort_multipart_upload(
+            uri=input.invocation_error_blob_uri,
+            upload_id=input.invocation_error_blob_upload_id,
+            logger=logger,
+        )
+    elif output.outcome_code == TaskOutcomeCode.TASK_OUTCOME_CODE_FAILURE:
+        await blob_store.abort_multipart_upload(
+            uri=input.function_outputs_blob_uri,
+            upload_id=input.function_outputs_blob_upload_id,
+            logger=logger,
+        )
+        if (
+            output.failure_reason
+            == TaskFailureReason.TASK_FAILURE_REASON_INVOCATION_ERROR
+        ) and len(output.uploaded_invocation_error_blob.chunks) != 0:
+            await blob_store.complete_multipart_upload(
+                uri=input.invocation_error_blob_uri,
+                upload_id=input.invocation_error_blob_upload_id,
+                parts_etags=[
+                    blob_chunk.etag
+                    for blob_chunk in output.uploaded_invocation_error_blob.chunks
+                ],
+                logger=logger,
+            )
+        else:
+            await blob_store.abort_multipart_upload(
+                uri=input.invocation_error_blob_uri,
+                upload_id=input.invocation_error_blob_upload_id,
+                logger=logger,
+            )
+    else:
+        raise ValueError(
+            f"Unexpected outcome code: {TaskOutcomeCode.Name(output.outcome_code)}"
+        )
+
+
+def _task_output_summary(task_output: TaskOutput) -> _TaskOutputSummary:
+    summary: _TaskOutputSummary = _TaskOutputSummary()
+
+    for output in task_output.function_outputs:
+        summary.output_count += 1
+        summary.output_bytes += output.manifest.size
+
+    if task_output.invocation_error_output is not None:
+        summary.invocation_error_output_count = 1
+        summary.invocation_error_output_bytes = (
+            task_output.invocation_error_output.manifest.size
+        )
+
+    summary.next_functions_count = len(task_output.next_functions)
+
+    return summary
+
+
+# Temporary workaround is logging customer metrics until we store them somewhere
+# for future retrieval and processing.
+def _log_function_metrics(output: TaskOutput, logger: Any):
+    if output.metrics is None:
+        return
+
+    for counter_name, counter_value in output.metrics.counters.items():
+        logger.info(
+            "function_metric", counter_name=counter_name, counter_value=counter_value
+        )
+    for timer_name, timer_value in output.metrics.timers.items():
+        logger.info("function_metric", timer_name=timer_name, timer_value=timer_value)
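
Note: `finalize_task` above ends every multipart upload by either completing it with the parts' ETags or aborting it. For readers unfamiliar with the semantics these `BLOBStore` methods wrap (the S3 backend, `s3_blob_store.py`, also changed in this release), here is a minimal standalone sketch of the underlying S3 calls using plain boto3; the bucket, key, and body values are placeholders, not anything from this package:

import boto3

s3 = boto3.client("s3")
bucket, key = "example-bucket", "outputs/task-1"  # placeholder names

# Start a multipart upload and upload one part.
upload_id = s3.create_multipart_upload(Bucket=bucket, Key=key)["UploadId"]
part = s3.upload_part(
    Bucket=bucket, Key=key, UploadId=upload_id, PartNumber=1, Body=b"output bytes"
)

def finish(success: bool) -> None:
    if success:
        # Completing requires the ETag of every uploaded part, in order.
        s3.complete_multipart_upload(
            Bucket=bucket,
            Key=key,
            UploadId=upload_id,
            MultipartUpload={"Parts": [{"ETag": part["ETag"], "PartNumber": 1}]},
        )
    else:
        # Aborting frees the storage held by already-uploaded parts.
        s3.abort_multipart_upload(Bucket=bucket, Key=key, UploadId=upload_id)

Every upload must take one of these two paths; parts of an upload that is never completed or aborted keep consuming storage.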
indexify/executor/function_executor_controller/function_executor_controller.py
@@ -6,6 +6,11 @@ from enum import Enum
 from pathlib import Path
 from typing import Any, Dict, List, Optional
 
+from tensorlake.function_executor.proto.function_executor_pb2 import (
+    SerializedObjectEncoding,
+    SerializedObjectInsideBLOB,
+)
+
 from indexify.executor.blob_store.blob_store import BLOBStore
 from indexify.executor.function_executor.function_executor import FunctionExecutor
 from indexify.executor.function_executor.health_checker import HealthCheckResult
@@ -14,11 +19,12 @@ from indexify.executor.function_executor.server.function_executor_server_factory
 )
 from indexify.executor.state_reporter import ExecutorStateReporter
 from indexify.proto.executor_api_pb2 import (
+    DataPayload,
+    DataPayloadEncoding,
     FunctionExecutorDescription,
     FunctionExecutorState,
     FunctionExecutorStatus,
     FunctionExecutorTerminationReason,
-    FunctionExecutorUpdate,
     TaskAllocation,
     TaskResult,
 )
@@ -38,10 +44,10 @@ from .events import (
     ScheduleTaskExecution,
     ShutdownInitiated,
     TaskExecutionFinished,
-
+    TaskFinalizationFinished,
     TaskPreparationFinished,
 )
-from .
+from .finalize_task import finalize_task
 from .loggers import function_executor_logger, task_allocation_logger
 from .metrics.function_executor_controller import (
     METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_NOT_STARTED,
@@ -60,9 +66,9 @@ from .metrics.function_executor_controller import (
 from .prepare_task import prepare_task
 from .run_task import run_task_on_function_executor
 from .task_info import TaskInfo
+from .task_input import TaskInput
 from .task_output import TaskOutput
 from .terminate_function_executor import terminate_function_executor
-from .upload_task_output import upload_task_output
 
 
 # Actual FE controller states, they are a bit different from statuses reported to the Server.
@@ -242,10 +248,7 @@ class FunctionExecutorController:
             aio=next_aio,
             on_exception=FunctionExecutorCreated(
                 function_executor=None,
-
-                function_executor_description=self._fe_description,
-                termination_reason=FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_INTERNAL_ERROR,
-            ),
+                fe_termination_reason=FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_INTERNAL_ERROR,
             ),
         )
 
@@ -326,7 +329,7 @@ class FunctionExecutorController:
 
        try:
            if event.event_type == EventType.SHUTDOWN_INITIATED:
-                return await self._shutdown_no_exceptions(event)
+                return await self._shutdown(event)
 
            with metric_control_loop_handle_event_latency.time():
                self._handle_event(event)
@@ -338,6 +341,8 @@ class FunctionExecutorController:
                 exc_info=e,
                 event_type=event.event_type.name,
             )
+            if event.event_type == EventType.SHUTDOWN_INITIATED:
+                return  # Unexpected exception during shutdown, should return anyway.
 
     def _handle_event(self, event: BaseEvent) -> None:
         """Handles the event.
@@ -355,7 +360,7 @@ class FunctionExecutorController:
         elif event.event_type == EventType.TASK_EXECUTION_FINISHED:
             return self._handle_event_task_execution_finished(event)
         elif event.event_type == EventType.TASK_OUTPUT_UPLOAD_FINISHED:
-            return self.
+            return self._handle_event_task_finalization_finished(event)
 
         self._logger.warning(
             "unexpected event type received", event_type=event.event_type.name
@@ -402,7 +407,7 @@ class FunctionExecutorController:
         """Spawns an aio task for the supplied coroutine.
 
         The coroutine should return an event that will be added to the FE controller events.
-        The coroutine should not raise any exceptions.
+        The coroutine should not raise any exceptions including BaseException and asyncio.CancelledError.
         on_exception event will be added to the FE controller events if the aio task raises an unexpected exception.
         on_exception is required to not silently stall the task processing due to an unexpected exception.
         If task_info is not None, the aio task will be associated with the task_info while the aio task is running.
@@ -417,8 +422,6 @@ class FunctionExecutorController:
         async def coroutine_wrapper() -> None:
             try:
                 self._add_event(await aio, source=aio_task_name)
-            except asyncio.CancelledError:
-                pass  # Expected exception on aio task cancellation.
             except BaseException as e:
                 logger.error(
                     "unexpected exception in aio task",
@@ -449,15 +452,6 @@ class FunctionExecutorController:
 
         Doesn't raise any exceptions. Doesn't block.
         """
-        self._state_reporter.add_function_executor_update(
-            FunctionExecutorUpdate(
-                description=self._fe_description,
-                startup_stdout=event.output.stdout,
-                startup_stderr=event.output.stderr,
-            )
-        )
-        self._state_reporter.schedule_state_report()
-
         if event.function_executor is None:
             # Server needs to increment attempts counter for all the tasks that were pending while FE was starting up.
             # This prevents infinite retries if FEs consistently fail to start up.
@@ -474,11 +468,11 @@ class FunctionExecutorController:
             )
             task_info.output = TaskOutput.function_executor_startup_failed(
                 allocation=task_info.allocation,
-
+                fe_termination_reason=event.fe_termination_reason,
                 logger=task_logger,
             )
             self._start_termination(
-                fe_termination_reason=event.
+                fe_termination_reason=event.fe_termination_reason,
                 allocation_ids_caused_termination=allocation_ids_caused_termination,
             )
             return
@@ -559,16 +553,18 @@ class FunctionExecutorController:
                 execution_start_time=None,
                 execution_end_time=None,
             )
-            self.
+            self._start_task_finalization(task_info)
             return
+
         if not event.is_success:
+            # Failed to prepare the task inputs.
             task_info.output = TaskOutput.internal_error(
                 allocation=task_info.allocation,
                 # Task was prepared but never executed
                 execution_start_time=None,
                 execution_end_time=None,
             )
-            self.
+            self._start_task_finalization(task_info)
             return
 
         task_info.prepared_time = time.monotonic()
@@ -616,7 +612,7 @@ class FunctionExecutorController:
                 execution_start_time=None,
                 execution_end_time=None,
             )
-            self.
+            self._start_task_finalization(task_info)
         elif self._internal_state in [
             _FE_CONTROLLER_STATE.TERMINATING,
             _FE_CONTROLLER_STATE.TERMINATED,
@@ -626,7 +622,7 @@ class FunctionExecutorController:
             task_info.output = TaskOutput.function_executor_terminated(
                 task_info.allocation
             )
-            self.
+            self._start_task_finalization(task_info)
         elif self._internal_state == _FE_CONTROLLER_STATE.RUNNING:
             self._running_task = task_info
             next_aio = run_task_on_function_executor(
@@ -677,17 +673,26 @@ class FunctionExecutorController:
             ],
         )
 
-
-
-
-
+        task_info: TaskInfo = event.task_info
+        if task_info.output is None:
+            # `run_task_on_function_executor` guarantees that the output is set in
+            # all cases including task cancellations. If this didn't happen then some
+            # internal error occurred in our code.
+            task_info.output = TaskOutput.internal_error(
+                allocation=task_info.allocation,
+                execution_start_time=None,
+                execution_end_time=None,
+            )
 
-
-
+        self._start_task_finalization(task_info)
+
+    def _start_task_finalization(self, task_info: TaskInfo) -> None:
+        """Starts finalization for the given task.
 
         Doesn't raise any exceptions. Doesn't block.
+        task_info.output should not be None.
         """
-        next_aio =
+        next_aio = finalize_task(
            task_info=task_info,
            blob_store=self._blob_store,
            logger=task_allocation_logger(task_info.allocation, self._logger),
@@ -695,43 +700,37 @@ class FunctionExecutorController:
         self._spawn_aio_for_task(
             aio=next_aio,
             task_info=task_info,
-            on_exception=
+            on_exception=TaskFinalizationFinished(
                 task_info=task_info, is_success=False
             ),
         )
 
-    def
-        self, event:
+    def _handle_event_task_finalization_finished(
+        self, event: TaskFinalizationFinished
     ) -> None:
-        """Handles the task
+        """Handles the task finalization finished event.
 
         Doesn't raise any exceptions. Doesn't block.
         """
         task_info: TaskInfo = event.task_info
         if not event.is_success:
-
+            original_task_output: TaskOutput = task_info.output  # Never None here
             task_info.output = TaskOutput.internal_error(
                 allocation=task_info.allocation,
-                execution_start_time=
-                execution_end_time=
+                execution_start_time=original_task_output.execution_start_time,
+                execution_end_time=original_task_output.execution_end_time,
             )
 
-
-
-
-    def _complete_task(self, task_info: TaskInfo) -> None:
-        """Marks the task as completed and reports it to the Server.
-
-        Doesn't raise any exceptions. Doesn't block.
-        """
+        logger: Any = task_allocation_logger(task_info.allocation, self._logger)
+        # Ignore task cancellation as it's technically finished at this point.
         task_info.is_completed = True
         emit_completed_task_metrics(
             task_info=task_info,
-            logger=
+            logger=logger,
         )
         # Reconciler will call .remove_task() once Server signals that it processed this update.
         self._state_reporter.add_completed_task_result(
-            _to_task_result_proto(task_info
+            _to_task_result_proto(task_info, logger)
         )
         self._state_reporter.schedule_state_report()
 
@@ -769,16 +768,6 @@ class FunctionExecutorController:
             ),
         )
 
-    async def _shutdown_no_exceptions(self, event: ShutdownInitiated) -> None:
-        try:
-            await self._shutdown(event)
-        except BaseException as e:
-            # This would result in resource leaks.
-            self._logger.error(
-                "unexpected exception in function executor controller shutdown, this should never happen",
-                exc_info=e,
-            )
-
     async def _shutdown(self, event: ShutdownInitiated) -> None:
         """Shuts down the Function Executor and frees all its resources.
 
@@ -865,7 +854,13 @@ def _termination_reason_to_short_name(value: FunctionExecutorTerminationReason)
     return _termination_reason_to_short_name_map.get(value, "UNEXPECTED")
 
 
-def _to_task_result_proto(output: TaskOutput) -> TaskResult:
+def _to_task_result_proto(task_info: TaskInfo, logger: Any) -> TaskResult:
+    allocation: TaskAllocation = task_info.allocation
+    # Might be None if the task wasn't prepared successfully.
+    input: Optional[TaskInput] = task_info.input
+    # Never None here as we're completing the task here.
+    output: Optional[TaskOutput] = task_info.output
+
     execution_duration_ms: Optional[int] = None
     if (
         output.execution_start_time is not None
@@ -876,24 +871,72 @@ def _to_task_result_proto(output: TaskOutput) -> TaskResult:
             (output.execution_end_time - output.execution_start_time) * 1000
         )
 
-
-
-
-
-
-
-
-
+    invocation_error_output: Optional[DataPayload] = None
+    if output.invocation_error_output is not None:
+        # input can't be None if invocation_error_output is set because the task ran already.
+        invocation_error_output = _to_data_payload_proto(
+            so=output.invocation_error_output,
+            blob_uri=input.invocation_error_blob_uri,
+            logger=logger,
+        )
+
+    function_outputs: List[DataPayload] = []
+    for function_output in output.function_outputs:
+        # input can't be None if invocation_function_outputs is set because the task ran already.
+        function_output: SerializedObjectInsideBLOB
+        function_outputs.append(
+            _to_data_payload_proto(
+                so=function_output,
+                blob_uri=input.function_outputs_blob_uri,
+                logger=logger,
+            )
+        )
+
+    return TaskResult(
+        task_id=allocation.task.id,
+        allocation_id=allocation.allocation_id,
+        namespace=allocation.task.namespace,
+        graph_name=allocation.task.graph_name,
+        graph_version=allocation.task.graph_version,
+        function_name=allocation.task.function_name,
+        graph_invocation_id=allocation.task.graph_invocation_id,
         outcome_code=output.outcome_code,
         failure_reason=output.failure_reason,
         next_functions=output.next_functions,
-        function_outputs=
-        invocation_error_output=
+        function_outputs=function_outputs,
+        invocation_error_output=invocation_error_output,
         execution_duration_ms=execution_duration_ms,
     )
-    if output.uploaded_stdout is not None:
-        task_result.stdout.CopyFrom(output.uploaded_stdout)
-    if output.uploaded_stderr is not None:
-        task_result.stderr.CopyFrom(output.uploaded_stderr)
 
-
+
+def _to_data_payload_proto(
+    so: SerializedObjectInsideBLOB,
+    blob_uri: str,
+    logger: Any,
+) -> DataPayload:
+    """Converts a serialized object inside BLOB to into a DataPayload."""
+    return DataPayload(
+        size=so.manifest.size,
+        sha256_hash=so.manifest.sha256_hash,
+        uri=blob_uri,
+        encoding=_to_data_payload_encoding(so.manifest.encoding, logger),
+        encoding_version=so.manifest.encoding_version,
+        offset=so.offset,
+    )
+
+
+def _to_data_payload_encoding(
+    encoding: SerializedObjectEncoding, logger: Any
+) -> DataPayloadEncoding:
+    if encoding == SerializedObjectEncoding.SERIALIZED_OBJECT_ENCODING_BINARY_PICKLE:
+        return DataPayloadEncoding.DATA_PAYLOAD_ENCODING_BINARY_PICKLE
+    elif encoding == SerializedObjectEncoding.SERIALIZED_OBJECT_ENCODING_UTF8_JSON:
+        return DataPayloadEncoding.DATA_PAYLOAD_ENCODING_UTF8_JSON
+    elif encoding == SerializedObjectEncoding.SERIALIZED_OBJECT_ENCODING_UTF8_TEXT:
+        return DataPayloadEncoding.DATA_PAYLOAD_ENCODING_UTF8_TEXT
+    else:
+        logger.error(
+            "Unexpected encoding for SerializedObject",
+            encoding=SerializedObjectEncoding.Name(encoding),
+        )
        return DataPayloadEncoding.DATA_PAYLOAD_ENCODING_UNKNOWN
indexify/executor/function_executor_controller/message_validators.py
@@ -21,12 +21,10 @@ def validate_function_executor_description(
     validator.required_field("graph_name")
     validator.required_field("graph_version")
     validator.required_field("function_name")
-    # image_uri is optional.
     # secret_names can be empty.
     validator.required_field("customer_code_timeout_ms")
     validator.required_field("graph")
     validator.required_field("resources")
-    validator.required_field("output_payload_uri_prefix")
 
     _validate_data_payload(function_executor_description.graph)
 
@@ -81,4 +79,13 @@ def _validate_data_payload(data_payload: DataPayload) -> None:
 
     Raises ValueError if the DataPayload is not valid.
     """
-    (
+    (
+        MessageValidator(data_payload)
+        .required_field("size")
+        .required_field("sha256_hash")
+        .required_field("uri")
+        .required_field("encoding")
+        # Ignored by Server right now and not set.
+        # .required_field("encoding_version")
+        .required_field("offset")
+    )
indexify/executor/function_executor_controller/metrics/downloads.py
@@ -4,64 +4,20 @@ from indexify.executor.monitoring.metrics import latency_metric_for_fast_operation
 
 # Graph download metrics
 metric_graph_downloads: prometheus_client.Counter = prometheus_client.Counter(
-    "
-    "Number of
+    "graph_downloads",
+    "Number of graph downloads, including downloads served from local cache",
 )
 metric_graph_download_errors: prometheus_client.Counter = prometheus_client.Counter(
-    "
-    "Number of
+    "graph_download_errors",
+    "Number of download errors, including downloads served from local cache",
 )
 metric_graphs_from_cache: prometheus_client.Counter = prometheus_client.Counter(
-    "
-    "Number of
+    "graph_downloads_from_cache",
+    "Number of graph downloads served from local cache",
 )
 metric_graph_download_latency: prometheus_client.Histogram = (
     latency_metric_for_fast_operation(
-        "
-        "
-    )
-)
-metric_tasks_downloading_graphs: prometheus_client.Gauge = prometheus_client.Gauge(
-    "tasks_downloading_graphs",
-    "Number of tasks currently downloading their graphs, including local cache lookups",
-)
-
-# Task input download metrics
-metric_task_input_downloads: prometheus_client.Counter = prometheus_client.Counter(
-    "task_input_downloads", "Number of task input downloads"
-)
-metric_task_input_download_errors: prometheus_client.Counter = (
-    prometheus_client.Counter(
-        "task_input_download_errors", "Number of task input download errors"
-    )
-)
-metric_task_input_download_latency: prometheus_client.Histogram = (
-    latency_metric_for_fast_operation("task_input_download", "task input download")
-)
-metric_tasks_downloading_inputs: prometheus_client.Gauge = prometheus_client.Gauge(
-    "tasks_downloading_inputs", "Number of tasks currently downloading their inputs"
-)
-
-# Reducer init value download metrics
-metric_reducer_init_value_downloads: prometheus_client.Counter = (
-    prometheus_client.Counter(
-        "task_reducer_init_value_downloads", "Number of reducer init value downloads"
-    )
-)
-metric_reducer_init_value_download_errors: prometheus_client.Counter = (
-    prometheus_client.Counter(
-        "task_reducer_init_value_download_errors",
-        "Number of reducer init value download errors",
-    )
-)
-metric_reducer_init_value_download_latency: prometheus_client.Histogram = (
-    latency_metric_for_fast_operation(
-        "task_reducer_init_value_download", "Task reducer init value download"
-    )
-)
-metric_tasks_downloading_reducer_init_value: prometheus_client.Gauge = (
-    prometheus_client.Gauge(
-        "tasks_downloading_reducer_init_value",
-        "Number of tasks currently downloading their reducer init values",
+        "graph_download",
+        "Graph download, including downloads served from local cache",
     )
 )