indexify 0.3.31__py3-none-any.whl → 0.4.3__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as published to their public registry. It is provided for informational purposes only.
- indexify/cli/__init__.py +18 -0
- indexify/cli/build_image.py +51 -0
- indexify/cli/deploy.py +57 -0
- indexify/cli/executor.py +205 -0
- indexify/executor/{grpc/channel_manager.py → channel_manager.py} +17 -11
- indexify/executor/executor.py +57 -313
- indexify/executor/function_allowlist.py +59 -0
- indexify/executor/function_executor/function_executor.py +12 -6
- indexify/executor/function_executor/invocation_state_client.py +25 -3
- indexify/executor/function_executor/server/function_executor_server_factory.py +3 -3
- indexify/executor/function_executor/server/subprocess_function_executor_server_factory.py +22 -11
- indexify/executor/function_executor_controller/__init__.py +13 -0
- indexify/executor/function_executor_controller/completed_task_metrics.py +82 -0
- indexify/executor/function_executor_controller/create_function_executor.py +158 -0
- indexify/executor/function_executor_controller/debug_event_loop.py +37 -0
- indexify/executor/function_executor_controller/destroy_function_executor.py +28 -0
- indexify/executor/function_executor_controller/downloads.py +199 -0
- indexify/executor/function_executor_controller/events.py +172 -0
- indexify/executor/function_executor_controller/function_executor_controller.py +759 -0
- indexify/executor/function_executor_controller/loggers.py +57 -0
- indexify/executor/function_executor_controller/message_validators.py +69 -0
- indexify/executor/function_executor_controller/metrics/completed_task_metrics.py +68 -0
- indexify/executor/{metrics/downloader.py → function_executor_controller/metrics/downloads.py} +1 -3
- indexify/executor/function_executor_controller/metrics/function_executor_controller.py +60 -0
- indexify/executor/{function_executor/metrics/single_task_runner.py → function_executor_controller/metrics/run_task.py} +9 -3
- indexify/executor/function_executor_controller/metrics/upload_task_output.py +39 -0
- indexify/executor/function_executor_controller/prepare_task.py +38 -0
- indexify/executor/function_executor_controller/run_task.py +201 -0
- indexify/executor/function_executor_controller/task_info.py +33 -0
- indexify/executor/function_executor_controller/task_output.py +122 -0
- indexify/executor/function_executor_controller/upload_task_output.py +234 -0
- indexify/executor/host_resources/host_resources.py +20 -25
- indexify/executor/host_resources/nvidia_gpu_allocator.py +8 -1
- indexify/executor/{grpc/metrics → metrics}/channel_manager.py +1 -1
- indexify/executor/metrics/executor.py +0 -47
- indexify/executor/{grpc/metrics → metrics}/state_reconciler.py +1 -1
- indexify/executor/{grpc/metrics → metrics}/state_reporter.py +1 -1
- indexify/executor/monitoring/health_checker/generic_health_checker.py +6 -59
- indexify/executor/monitoring/health_checker/health_checker.py +0 -11
- indexify/executor/{grpc/state_reconciler.py → state_reconciler.py} +139 -141
- indexify/executor/state_reporter.py +364 -0
- indexify/proto/executor_api.proto +68 -60
- indexify/proto/executor_api_pb2.py +52 -52
- indexify/proto/executor_api_pb2.pyi +129 -108
- indexify/proto/executor_api_pb2_grpc.py +0 -47
- {indexify-0.3.31.dist-info → indexify-0.4.3.dist-info}/METADATA +2 -5
- indexify-0.4.3.dist-info/RECORD +68 -0
- indexify-0.4.3.dist-info/entry_points.txt +3 -0
- indexify/cli/cli.py +0 -268
- indexify/executor/api_objects.py +0 -92
- indexify/executor/downloader.py +0 -417
- indexify/executor/executor_flavor.py +0 -7
- indexify/executor/function_executor/function_executor_state.py +0 -107
- indexify/executor/function_executor/function_executor_states_container.py +0 -93
- indexify/executor/function_executor/function_executor_status.py +0 -95
- indexify/executor/function_executor/metrics/function_executor_state.py +0 -46
- indexify/executor/function_executor/metrics/function_executor_state_container.py +0 -10
- indexify/executor/function_executor/single_task_runner.py +0 -345
- indexify/executor/function_executor/task_input.py +0 -21
- indexify/executor/function_executor/task_output.py +0 -105
- indexify/executor/grpc/function_executor_controller.py +0 -418
- indexify/executor/grpc/metrics/task_controller.py +0 -8
- indexify/executor/grpc/state_reporter.py +0 -317
- indexify/executor/grpc/task_controller.py +0 -508
- indexify/executor/metrics/task_fetcher.py +0 -21
- indexify/executor/metrics/task_reporter.py +0 -53
- indexify/executor/metrics/task_runner.py +0 -52
- indexify/executor/monitoring/function_allowlist.py +0 -25
- indexify/executor/runtime_probes.py +0 -68
- indexify/executor/task_fetcher.py +0 -96
- indexify/executor/task_reporter.py +0 -459
- indexify/executor/task_runner.py +0 -177
- indexify-0.3.31.dist-info/RECORD +0 -68
- indexify-0.3.31.dist-info/entry_points.txt +0 -3
- {indexify-0.3.31.dist-info → indexify-0.4.3.dist-info}/WHEEL +0 -0
```diff
--- a/indexify/executor/function_executor/server/subprocess_function_executor_server_factory.py
+++ b/indexify/executor/function_executor/server/subprocess_function_executor_server_factory.py
@@ -11,15 +11,11 @@ from .function_executor_server_factory import (
 from .subprocess_function_executor_server import SubprocessFunctionExecutorServer
 
 
-def get_free_tcp_port(iface_name="localhost") -> int:
-    tcp = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
-    tcp.bind((iface_name, 0))
-    _, port = tcp.getsockname()
-    tcp.close()
-    return port
-
-
 class SubprocessFunctionExecutorServerFactory(FunctionExecutorServerFactory):
+    def __init__(self, verbose_logs: bool) -> None:
+        super().__init__()
+        self._verbose_logs = verbose_logs
+
     async def create(
         self, config: FunctionExecutorServerConfiguration, logger: Any
     ) -> SubprocessFunctionExecutorServer:
@@ -33,13 +29,15 @@ class SubprocessFunctionExecutorServerFactory(FunctionExecutorServerFactory):
         )
 
         try:
-            port =
-            logger.info("allocated function executor port", port=port)
+            port = _find_free_localhost_tcp_port()
             args = [
                 f"--executor-id={config.executor_id}",  # use = as executor_id can start with -
+                f"--function-executor-id={config.function_executor_id}",
                 "--address",
                 _server_address(port),
             ]
+            if self._verbose_logs:
+                args.append("--dev")
             # Run the process with our stdout, stderr. We want to see process logs and exceptions in our process output.
             # This is useful for dubugging. Customer function stdout and stderr is captured and returned in the response
             # so we won't see it in our process outputs. This is the right behavior as customer function stdout and stderr
@@ -91,10 +89,23 @@ class SubprocessFunctionExecutorServerFactory(FunctionExecutorServerFactory):
         )
 
 
+# Function Executors are only listening on localhost so external connections to them are not possible.
+# This is a security measure. Also Executor <-> Function Executor communication is always local and
+# don't support Function Executors running on a different host.
+_FUNCTION_EXECUTOR_SERVER_HOSTNAME = "localhost"
+
+
 def _server_address(port: int) -> str:
-    return f"
+    return f"{_FUNCTION_EXECUTOR_SERVER_HOSTNAME}:{port}"
 
 
 def _new_process_group() -> None:
     """Creates a new process group with ID equal to the current process PID. POSIX only."""
     os.setpgid(0, 0)
+
+
+def _find_free_localhost_tcp_port() -> int:
+    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
+        sock.bind((_FUNCTION_EXECUTOR_SERVER_HOSTNAME, 0))
+        _, port = sock.getsockname()
+        return port
```
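The replacement port helper keeps the old bind-to-port-0 trick but scopes the socket with a context manager and pins the interface to the shared `_FUNCTION_EXECUTOR_SERVER_HOSTNAME` constant. A minimal standalone sketch of the same technique (the race caveat is our gloss, not a claim from the diff):

```python
import socket

def find_free_localhost_tcp_port() -> int:
    # Binding to port 0 makes the OS pick an unused ephemeral port.
    # The context manager closes the socket so the child process can
    # bind the port; a small reuse race remains between close and
    # rebind, which is tolerable for localhost-only servers.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
        sock.bind(("localhost", 0))
        _, port = sock.getsockname()
        return port

if __name__ == "__main__":
    print(find_free_localhost_tcp_port())  # e.g. 54321
```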
```diff
--- /dev/null
+++ b/indexify/executor/function_executor_controller/__init__.py
@@ -0,0 +1,13 @@
+from .function_executor_controller import FunctionExecutorController
+from .loggers import function_executor_logger, task_logger
+from .message_validators import validate_function_executor_description, validate_task
+from .task_output import TaskOutput
+
+__all__ = [
+    "function_executor_logger",
+    "task_logger",
+    "validate_function_executor_description",
+    "validate_task",
+    "FunctionExecutorController",
+    "TaskOutput",
+]
```
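With these re-exports, callers can import the controller API from the package root instead of reaching into submodules. A hypothetical usage, assuming the wheel is installed:

```python
# Hypothetical consumer; import paths follow the __init__.py above.
from indexify.executor.function_executor_controller import (
    FunctionExecutorController,
    TaskOutput,
    task_logger,
)
```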
```diff
--- /dev/null
+++ b/indexify/executor/function_executor_controller/completed_task_metrics.py
@@ -0,0 +1,82 @@
+import time
+from typing import Any
+
+from indexify.proto.executor_api_pb2 import (
+    TaskFailureReason,
+    TaskOutcomeCode,
+)
+
+from .metrics.completed_task_metrics import (
+    METRIC_TASKS_COMPLETED_FAILURE_REASON_ALL,
+    METRIC_TASKS_COMPLETED_FAILURE_REASON_FUNCTION_ERROR,
+    METRIC_TASKS_COMPLETED_FAILURE_REASON_FUNCTION_EXECUTOR_TERMINATED,
+    METRIC_TASKS_COMPLETED_FAILURE_REASON_INTERNAL_ERROR,
+    METRIC_TASKS_COMPLETED_FAILURE_REASON_NONE,
+    METRIC_TASKS_COMPLETED_FAILURE_REASON_TASK_CANCELLED,
+    METRIC_TASKS_COMPLETED_FAILURE_REASON_UNKNOWN,
+    METRIC_TASKS_COMPLETED_OUTCOME_CODE_ALL,
+    METRIC_TASKS_COMPLETED_OUTCOME_CODE_FAILURE,
+    METRIC_TASKS_COMPLETED_OUTCOME_CODE_SUCCESS,
+    metric_task_completion_latency,
+    metric_tasks_completed,
+)
+from .task_info import TaskInfo
+
+
+def emit_completed_task_metrics(task_info: TaskInfo, logger: Any) -> None:
+    """Emits Prometheus metrics for a completed task.
+
+    Doesn't raise any exceptions.
+    """
+    logger = logger.bind(module=__name__)
+    metric_task_completion_latency.observe(time.monotonic() - task_info.start_time)
+
+    task_outcome_code: TaskOutcomeCode = task_info.output.outcome_code
+    task_failure_reason: TaskFailureReason = task_info.output.failure_reason
+    metric_tasks_completed.labels(
+        outcome_code=METRIC_TASKS_COMPLETED_OUTCOME_CODE_ALL,
+        failure_reason=METRIC_TASKS_COMPLETED_FAILURE_REASON_ALL,
+    ).inc()
+    if task_outcome_code == TaskOutcomeCode.TASK_OUTCOME_CODE_SUCCESS:
+        metric_tasks_completed.labels(
+            outcome_code=METRIC_TASKS_COMPLETED_OUTCOME_CODE_SUCCESS,
+            failure_reason=METRIC_TASKS_COMPLETED_FAILURE_REASON_NONE,
+        ).inc()
+    elif task_outcome_code == TaskOutcomeCode.TASK_OUTCOME_CODE_FAILURE:
+        if task_failure_reason == TaskFailureReason.TASK_FAILURE_REASON_INTERNAL_ERROR:
+            metric_tasks_completed.labels(
+                outcome_code=METRIC_TASKS_COMPLETED_OUTCOME_CODE_FAILURE,
+                failure_reason=METRIC_TASKS_COMPLETED_FAILURE_REASON_INTERNAL_ERROR,
+            ).inc()
+        elif (
+            task_failure_reason
+            == TaskFailureReason.TASK_FAILURE_REASON_FUNCTION_EXECUTOR_TERMINATED
+        ):
+            metric_tasks_completed.labels(
+                outcome_code=METRIC_TASKS_COMPLETED_OUTCOME_CODE_FAILURE,
+                failure_reason=METRIC_TASKS_COMPLETED_FAILURE_REASON_FUNCTION_EXECUTOR_TERMINATED,
+            ).inc()
+        elif (
+            task_failure_reason == TaskFailureReason.TASK_FAILURE_REASON_TASK_CANCELLED
+        ):
+            metric_tasks_completed.labels(
+                outcome_code=METRIC_TASKS_COMPLETED_OUTCOME_CODE_FAILURE,
+                failure_reason=METRIC_TASKS_COMPLETED_FAILURE_REASON_TASK_CANCELLED,
+            ).inc()
+        elif task_failure_reason in [
+            TaskFailureReason.TASK_FAILURE_REASON_FUNCTION_ERROR,
+            TaskFailureReason.TASK_FAILURE_REASON_FUNCTION_TIMEOUT,
+        ]:
+            metric_tasks_completed.labels(
+                outcome_code=METRIC_TASKS_COMPLETED_OUTCOME_CODE_FAILURE,
+                failure_reason=METRIC_TASKS_COMPLETED_FAILURE_REASON_FUNCTION_ERROR,
+            ).inc()
+        else:
+            metric_tasks_completed.labels(
+                outcome_code=METRIC_TASKS_COMPLETED_OUTCOME_CODE_FAILURE,
+                failure_reason=METRIC_TASKS_COMPLETED_FAILURE_REASON_UNKNOWN,
+            ).inc()
+            logger.warning(
+                "unexpected task failure reason",
+                failure_reason=TaskFailureReason.Name(task_failure_reason),
+            )
```
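The labeled counter and latency histogram are imported from `metrics/completed_task_metrics.py`, which this diff lists but does not show. A sketch of how such metrics are plausibly declared with `prometheus_client`; the metric names, label values, and help strings here are assumptions, not the package's actual definitions:

```python
from prometheus_client import Counter, Histogram

# Assumed label values; the real constants live in
# metrics/completed_task_metrics.py (not shown in this diff).
METRIC_TASKS_COMPLETED_OUTCOME_CODE_ALL = "all"
METRIC_TASKS_COMPLETED_FAILURE_REASON_ALL = "all"

metric_tasks_completed = Counter(
    "tasks_completed",  # assumed metric name
    "Number of tasks that finished on this executor",
    ["outcome_code", "failure_reason"],
)
metric_task_completion_latency = Histogram(
    "task_completion_latency_seconds",  # assumed metric name
    "Seconds from task start to completion",
)

# Mirrors the emit function above: bump the catch-all series,
# then observe how long the task took.
metric_tasks_completed.labels(
    outcome_code=METRIC_TASKS_COMPLETED_OUTCOME_CODE_ALL,
    failure_reason=METRIC_TASKS_COMPLETED_FAILURE_REASON_ALL,
).inc()
metric_task_completion_latency.observe(1.25)
```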
```diff
--- /dev/null
+++ b/indexify/executor/function_executor_controller/create_function_executor.py
@@ -0,0 +1,158 @@
+import asyncio
+from pathlib import Path
+from typing import Any, Optional
+
+from tensorlake.function_executor.proto.function_executor_pb2 import (
+    InitializeRequest,
+    SerializedObject,
+)
+
+from indexify.executor.blob_store.blob_store import BLOBStore
+from indexify.executor.function_executor.function_executor import (
+    FunctionError,
+    FunctionExecutor,
+    FunctionTimeoutError,
+)
+from indexify.executor.function_executor.server.function_executor_server_factory import (
+    FunctionExecutorServerConfiguration,
+    FunctionExecutorServerFactory,
+)
+from indexify.proto.executor_api_pb2 import (
+    FunctionExecutorDescription,
+    FunctionExecutorTerminationReason,
+)
+
+from .downloads import download_graph
+from .events import FunctionExecutorCreated
+
+
+async def create_function_executor(
+    function_executor_description: FunctionExecutorDescription,
+    function_executor_server_factory: FunctionExecutorServerFactory,
+    blob_store: BLOBStore,
+    executor_id: str,
+    base_url: str,
+    config_path: str,
+    cache_path: Path,
+    logger: Any,
+) -> FunctionExecutorCreated:
+    """Creates a function executor.
+
+    Doesn't raise any exceptions.
+    """
+    logger = logger.bind(module=__name__)
+    try:
+        function_executor: FunctionExecutor = await _create_function_executor(
+            function_executor_description=function_executor_description,
+            function_executor_server_factory=function_executor_server_factory,
+            blob_store=blob_store,
+            executor_id=executor_id,
+            base_url=base_url,
+            config_path=config_path,
+            cache_path=cache_path,
+            logger=logger,
+        )
+        return FunctionExecutorCreated(function_executor)
+    except FunctionTimeoutError as e:
+        return FunctionExecutorCreated(
+            function_executor=None,
+            function_error=e,
+            termination_reason=FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_STARTUP_FAILED_FUNCTION_TIMEOUT,
+        )
+    except FunctionError as e:
+        return FunctionExecutorCreated(
+            function_executor=None,
+            function_error=e,
+            termination_reason=FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_STARTUP_FAILED_FUNCTION_ERROR,
+        )
+    except BaseException as e:
+        if isinstance(e, asyncio.CancelledError):
+            logger.info("function executor startup was cancelled")
+            return FunctionExecutorCreated(
+                function_executor=None,
+                termination_reason=FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_REMOVED_FROM_DESIRED_STATE,
+            )
+        else:
+            logger.error(
+                "failed to create function executor due to platform error",
+                exc_info=e,
+            )
+            return FunctionExecutorCreated(
+                function_executor=None,
+                termination_reason=FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_STARTUP_FAILED_INTERNAL_ERROR,
+            )
+
+
+async def _create_function_executor(
+    function_executor_description: FunctionExecutorDescription,
+    function_executor_server_factory: FunctionExecutorServerFactory,
+    blob_store: BLOBStore,
+    executor_id: str,
+    base_url: str,
+    config_path: str,
+    cache_path: Path,
+    logger: Any,
+) -> FunctionExecutor:
+    """Creates a function executor.
+
+    Raises Exception on platform error.
+    Raises FunctionError if customer code failed during FE creation.
+    """
+    graph: SerializedObject = await download_graph(
+        function_executor_description=function_executor_description,
+        cache_path=cache_path,
+        blob_store=blob_store,
+        logger=logger,
+    )
+
+    gpu_count: int = 0
+    if function_executor_description.resources.HasField("gpu"):
+        gpu_count = function_executor_description.resources.gpu.count
+
+    config: FunctionExecutorServerConfiguration = FunctionExecutorServerConfiguration(
+        executor_id=executor_id,
+        function_executor_id=function_executor_description.id,
+        namespace=function_executor_description.namespace,
+        graph_name=function_executor_description.graph_name,
+        graph_version=function_executor_description.graph_version,
+        function_name=function_executor_description.function_name,
+        image_uri=None,
+        secret_names=list(function_executor_description.secret_names),
+        cpu_ms_per_sec=function_executor_description.resources.cpu_ms_per_sec,
+        memory_bytes=function_executor_description.resources.memory_bytes,
+        disk_bytes=function_executor_description.resources.disk_bytes,
+        gpu_count=gpu_count,
+    )
+    if function_executor_description.HasField("image_uri"):
+        config.image_uri = function_executor_description.image_uri
+
+    initialize_request: InitializeRequest = InitializeRequest(
+        namespace=function_executor_description.namespace,
+        graph_name=function_executor_description.graph_name,
+        graph_version=function_executor_description.graph_version,
+        function_name=function_executor_description.function_name,
+        graph=graph,
+    )
+    customer_code_timeout_sec: Optional[float] = None
+    if function_executor_description.HasField("customer_code_timeout_ms"):
+        customer_code_timeout_sec = (
+            function_executor_description.customer_code_timeout_ms / 1000.0
+        )
+
+    function_executor: FunctionExecutor = FunctionExecutor(
+        server_factory=function_executor_server_factory, logger=logger
+    )
+
+    try:
+        # Raises FunctionError if initialization failed in customer code or customer code timed out.
+        await function_executor.initialize(
+            config=config,
+            initialize_request=initialize_request,
+            base_url=base_url,
+            config_path=config_path,
+            customer_code_timeout_sec=customer_code_timeout_sec,
+        )
+        return function_executor
+    except BaseException:  # includes asyncio.CancelledError and anything else
+        await function_executor.destroy()
+        raise
```
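`asyncio.CancelledError` has derived from `BaseException` rather than `Exception` since Python 3.8, so the `except BaseException` arm plus the `isinstance` check is what lets startup cancellation map to the `..._REMOVED_FROM_DESIRED_STATE` termination reason while other errors map to internal failure. A small self-contained demonstration of that ordering:

```python
import asyncio

async def slow_startup() -> None:
    await asyncio.sleep(10)

async def main() -> None:
    task = asyncio.create_task(slow_startup())
    await asyncio.sleep(0)  # let the task start running
    task.cancel()
    try:
        await task
    except Exception:
        print("unreachable: CancelledError is not an Exception subclass")
    except BaseException as e:
        # Same pattern as create_function_executor above: detect
        # cancellation explicitly instead of relying on `except Exception`.
        print("cancelled:", isinstance(e, asyncio.CancelledError))

asyncio.run(main())  # prints: cancelled: True
```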
```diff
--- /dev/null
+++ b/indexify/executor/function_executor_controller/debug_event_loop.py
@@ -0,0 +1,37 @@
+import os
+from typing import Any, List
+
+from .events import BaseEvent
+
+_DEBUG_EVENT_LOOP: bool = (
+    os.getenv("INDEXIFY_FUNCTION_EXECUTOR_CONTROLLER_DEBUG_EVENT_LOOP", "0")
+) == "1"
+
+
+def debug_print_processing_event(event: BaseEvent, logger: Any) -> None:
+    if _DEBUG_EVENT_LOOP:
+        logger.debug(
+            "processing event in control loop",
+            fe_event=str(event),
+        )
+
+
+def debug_print_adding_event(event: BaseEvent, source: str, logger: Any) -> None:
+    if _DEBUG_EVENT_LOOP:
+        logger.debug(
+            "adding event to control loop",
+            source=source,
+            fe_event=str(event),
+        )
+
+
+def debug_print_events(events: List[BaseEvent], logger: Any) -> None:
+    if _DEBUG_EVENT_LOOP:
+        if len(events) == 0:
+            logger.debug("no events n control loop")
+        else:
+            logger.debug(
+                "events in control loop",
+                count=len(events),
+                fe_events=[str(event) for event in events],
+            )
```
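`_DEBUG_EVENT_LOOP` is evaluated once at import time, so the flag has to be in the environment before the controller package is imported; setting it afterwards has no effect. For example:

```python
import os

# Must run before any function_executor_controller module is imported,
# since _DEBUG_EVENT_LOOP is computed at module load.
os.environ["INDEXIFY_FUNCTION_EXECUTOR_CONTROLLER_DEBUG_EVENT_LOOP"] = "1"
```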
```diff
--- /dev/null
+++ b/indexify/executor/function_executor_controller/destroy_function_executor.py
@@ -0,0 +1,28 @@
+from typing import Any, Optional
+
+from indexify.executor.function_executor.function_executor import FunctionExecutor
+from indexify.proto.executor_api_pb2 import FunctionExecutorTerminationReason
+
+from .events import FunctionExecutorDestroyed
+
+
+async def destroy_function_executor(
+    function_executor: Optional[FunctionExecutor],
+    termination_reason: FunctionExecutorTerminationReason,
+    logger: Any,
+) -> FunctionExecutorDestroyed:
+    """Destroys a function executor.
+
+    Doesn't raise any exceptions.
+    """
+    logger = logger.bind(module=__name__)
+
+    if function_executor is not None:
+        logger.info(
+            "destroying function executor",
+        )
+        await function_executor.destroy()
+
+    return FunctionExecutorDestroyed(
+        is_success=True, termination_reason=termination_reason
+    )
```
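Like `create_function_executor` above, this helper never raises and always resolves to an event, which keeps the controller's event loop free of per-operation try/except plumbing. A generic sketch of that convention, with hypothetical names (`OperationFinished` and `run_operation` are not from the diff):

```python
import asyncio
from dataclasses import dataclass

@dataclass
class OperationFinished:  # hypothetical event type
    is_success: bool

async def run_operation() -> OperationFinished:
    """Never raises; always resolves to an event."""
    try:
        await asyncio.sleep(0.01)  # stand-in for real work
        return OperationFinished(is_success=True)
    except BaseException:
        return OperationFinished(is_success=False)

async def control_loop(events: asyncio.Queue) -> None:
    event = await events.get()
    print("control loop saw:", event)

async def main() -> None:
    events: asyncio.Queue = asyncio.Queue()
    # The controller spawns operations as tasks and funnels their
    # resulting events into a single queue consumed by the loop.
    asyncio.create_task(run_operation()).add_done_callback(
        lambda t: events.put_nowait(t.result())
    )
    await control_loop(events)

asyncio.run(main())
```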
```diff
--- /dev/null
+++ b/indexify/executor/function_executor_controller/downloads.py
@@ -0,0 +1,199 @@
+import asyncio
+import os
+from pathlib import Path
+from typing import Any, Optional
+
+import nanoid
+from tensorlake.function_executor.proto.function_executor_pb2 import SerializedObject
+
+from indexify.executor.blob_store.blob_store import BLOBStore
+from indexify.proto.executor_api_pb2 import (
+    DataPayload,
+    DataPayloadEncoding,
+    FunctionExecutorDescription,
+)
+
+from .metrics.downloads import (
+    metric_graph_download_errors,
+    metric_graph_download_latency,
+    metric_graph_downloads,
+    metric_graphs_from_cache,
+    metric_reducer_init_value_download_errors,
+    metric_reducer_init_value_download_latency,
+    metric_reducer_init_value_downloads,
+    metric_task_input_download_errors,
+    metric_task_input_download_latency,
+    metric_task_input_downloads,
+    metric_tasks_downloading_graphs,
+    metric_tasks_downloading_inputs,
+    metric_tasks_downloading_reducer_init_value,
+)
+
+
+async def download_graph(
+    function_executor_description: FunctionExecutorDescription,
+    cache_path: Path,
+    blob_store: BLOBStore,
+    logger: Any,
+) -> SerializedObject:
+    logger = logger.bind(module=__name__)
+    with (
+        metric_graph_download_errors.count_exceptions(),
+        metric_tasks_downloading_graphs.track_inprogress(),
+        metric_graph_download_latency.time(),
+    ):
+        metric_graph_downloads.inc()
+        return await _download_graph(
+            function_executor_description=function_executor_description,
+            cache_path=cache_path,
+            blob_store=blob_store,
+            logger=logger,
+        )
+
+
+async def download_input(
+    data_payload: DataPayload,
+    blob_store: BLOBStore,
+    logger: Any,
+) -> SerializedObject:
+    logger = logger.bind(module=__name__)
+    with (
+        metric_task_input_download_errors.count_exceptions(),
+        metric_tasks_downloading_inputs.track_inprogress(),
+        metric_task_input_download_latency.time(),
+    ):
+        metric_task_input_downloads.inc()
+        return await _download_input(
+            data_payload=data_payload,
+            blob_store=blob_store,
+            logger=logger,
+        )
+
+
+async def download_init_value(
+    data_payload: DataPayload,
+    blob_store: BLOBStore,
+    logger: Any,
+) -> SerializedObject:
+    logger = logger.bind(module=__name__)
+    with (
+        metric_reducer_init_value_download_errors.count_exceptions(),
+        metric_tasks_downloading_reducer_init_value.track_inprogress(),
+        metric_reducer_init_value_download_latency.time(),
+    ):
+        metric_reducer_init_value_downloads.inc()
+        return await _download_input(
+            data_payload=data_payload,
+            blob_store=blob_store,
+            logger=logger,
+        )
+
+
+async def _download_input(
+    data_payload: DataPayload,
+    blob_store: BLOBStore,
+    logger: Any,
+) -> SerializedObject:
+    data: bytes = await blob_store.get(uri=data_payload.uri, logger=logger)
+    return _serialized_object_from_data_payload_proto(
+        data_payload=data_payload,
+        data=data,
+    )
+
+
+async def _download_graph(
+    function_executor_description: FunctionExecutorDescription,
+    cache_path: Path,
+    blob_store: BLOBStore,
+    logger: Any,
+) -> SerializedObject:
+    # Cache graph to reduce load on the server.
+    graph_path = os.path.join(
+        str(cache_path),
+        "graph_cache",
+        function_executor_description.namespace,
+        function_executor_description.graph_name,
+        function_executor_description.graph_version,
+    )
+    # Filesystem operations are synchronous.
+    # Run in a separate thread to not block the main event loop.
+    graph: Optional[SerializedObject] = await asyncio.to_thread(
+        _read_cached_graph, path=graph_path
+    )
+    if graph is not None:
+        metric_graphs_from_cache.inc()
+        return graph
+
+    data: bytes = await blob_store.get(
+        uri=function_executor_description.graph.uri, logger=logger
+    )
+    graph = _serialized_object_from_data_payload_proto(
+        data_payload=function_executor_description.graph,
+        data=data,
+    )
+
+    # Filesystem operations are synchronous.
+    # Run in a separate thread to not block the main event loop.
+    # We don't need to wait for the write completion so we use create_task.
+    asyncio.create_task(
+        asyncio.to_thread(
+            _write_cached_graph, path=graph_path, graph=graph, cache_path=cache_path
+        ),
+        name="graph cache write",
+    )
+
+    return graph
+
+
+def _read_cached_graph(path: str) -> Optional[SerializedObject]:
+    if not os.path.exists(path):
+        return None
+
+    with open(path, "rb") as f:
+        return SerializedObject.FromString(f.read())
+
+
+def _write_cached_graph(path: str, graph: SerializedObject, cache_path: Path) -> None:
+    if os.path.exists(path):
+        # Another task already cached the graph.
+        return None
+
+    tmp_path = os.path.join(str(cache_path), "task_graph_cache", nanoid.generate())
+    os.makedirs(os.path.dirname(tmp_path), exist_ok=True)
+    with open(tmp_path, "wb") as f:
+        f.write(graph.SerializeToString())
+    os.makedirs(os.path.dirname(path), exist_ok=True)
+    # Atomically rename the fully written file at tmp path.
+    # This allows us to not use any locking because file link/unlink
+    # are atomic operations at filesystem level.
+    # This also allows to share the same cache between multiple Executors.
+    os.replace(tmp_path, path)
+
+
+def _serialized_object_from_data_payload_proto(
+    data_payload: DataPayload, data: bytes
+) -> SerializedObject:
+    """Converts the given data payload and its data into SerializedObject accepted by Function Executor.
+
+    Raises ValueError if the supplied data payload can't be converted into serialized object.
+    """
+    if data_payload.encoding == DataPayloadEncoding.DATA_PAYLOAD_ENCODING_BINARY_PICKLE:
+        return SerializedObject(
+            bytes=data,
+            content_type="application/octet-stream",
+        )
+    elif data_payload.encoding == DataPayloadEncoding.DATA_PAYLOAD_ENCODING_UTF8_TEXT:
+        return SerializedObject(
+            content_type="text/plain",
+            string=data.decode("utf-8"),
+        )
+    elif data_payload.encoding == DataPayloadEncoding.DATA_PAYLOAD_ENCODING_UTF8_JSON:
+        result = SerializedObject(
+            content_type="application/json",
+            string=data.decode("utf-8"),
+        )
+        return result
+
+    raise ValueError(
+        f"Can't convert data payload {data_payload} into serialized object"
+    )
```
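`_write_cached_graph` relies on write-to-temp-then-`os.replace`, which is atomic within a filesystem, so concurrent executors sharing the cache directory never observe a partially written graph. A standalone sketch of the pattern (the demo path and payload are hypothetical):

```python
import os
import tempfile

def atomic_publish(path: str, data: bytes) -> None:
    os.makedirs(os.path.dirname(path), exist_ok=True)
    # Write the full payload to a unique temp file on the same
    # filesystem, then atomically rename it into place: readers see
    # either the old file, no file, or the complete new file,
    # never a partial write, and no locking is needed.
    fd, tmp_path = tempfile.mkstemp(dir=os.path.dirname(path))
    with os.fdopen(fd, "wb") as f:
        f.write(data)
    os.replace(tmp_path, path)

atomic_publish("/tmp/graph_cache_demo/graph.bin", b"serialized graph bytes")
```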