indexify 0.3.31-py3-none-any.whl → 0.4.2-py3-none-any.whl
This diff shows the changes between two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- indexify/cli/__init__.py +18 -0
- indexify/cli/build_image.py +51 -0
- indexify/cli/deploy.py +57 -0
- indexify/cli/executor.py +205 -0
- indexify/executor/{grpc/channel_manager.py → channel_manager.py} +17 -11
- indexify/executor/executor.py +57 -313
- indexify/executor/function_allowlist.py +59 -0
- indexify/executor/function_executor/function_executor.py +12 -6
- indexify/executor/function_executor/invocation_state_client.py +25 -3
- indexify/executor/function_executor/server/function_executor_server_factory.py +3 -3
- indexify/executor/function_executor/server/subprocess_function_executor_server_factory.py +22 -11
- indexify/executor/function_executor_controller/__init__.py +13 -0
- indexify/executor/function_executor_controller/completed_task_metrics.py +82 -0
- indexify/executor/function_executor_controller/create_function_executor.py +154 -0
- indexify/executor/function_executor_controller/debug_event_loop.py +37 -0
- indexify/executor/function_executor_controller/destroy_function_executor.py +28 -0
- indexify/executor/function_executor_controller/downloads.py +199 -0
- indexify/executor/function_executor_controller/events.py +172 -0
- indexify/executor/function_executor_controller/function_executor_controller.py +759 -0
- indexify/executor/function_executor_controller/loggers.py +57 -0
- indexify/executor/function_executor_controller/message_validators.py +65 -0
- indexify/executor/function_executor_controller/metrics/completed_task_metrics.py +68 -0
- indexify/executor/{metrics/downloader.py → function_executor_controller/metrics/downloads.py} +1 -3
- indexify/executor/function_executor_controller/metrics/function_executor_controller.py +60 -0
- indexify/executor/{function_executor/metrics/single_task_runner.py → function_executor_controller/metrics/run_task.py} +9 -3
- indexify/executor/function_executor_controller/metrics/upload_task_output.py +39 -0
- indexify/executor/function_executor_controller/prepare_task.py +38 -0
- indexify/executor/function_executor_controller/run_task.py +201 -0
- indexify/executor/function_executor_controller/task_info.py +33 -0
- indexify/executor/function_executor_controller/task_output.py +122 -0
- indexify/executor/function_executor_controller/upload_task_output.py +234 -0
- indexify/executor/host_resources/host_resources.py +20 -25
- indexify/executor/{grpc/metrics → metrics}/channel_manager.py +1 -1
- indexify/executor/metrics/executor.py +0 -47
- indexify/executor/{grpc/metrics → metrics}/state_reconciler.py +1 -1
- indexify/executor/{grpc/metrics → metrics}/state_reporter.py +1 -1
- indexify/executor/monitoring/health_checker/generic_health_checker.py +6 -59
- indexify/executor/monitoring/health_checker/health_checker.py +0 -11
- indexify/executor/{grpc/state_reconciler.py → state_reconciler.py} +139 -141
- indexify/executor/state_reporter.py +364 -0
- indexify/proto/executor_api.proto +67 -59
- indexify/proto/executor_api_pb2.py +52 -52
- indexify/proto/executor_api_pb2.pyi +125 -104
- indexify/proto/executor_api_pb2_grpc.py +0 -47
- {indexify-0.3.31.dist-info → indexify-0.4.2.dist-info}/METADATA +1 -3
- indexify-0.4.2.dist-info/RECORD +68 -0
- indexify-0.4.2.dist-info/entry_points.txt +3 -0
- indexify/cli/cli.py +0 -268
- indexify/executor/api_objects.py +0 -92
- indexify/executor/downloader.py +0 -417
- indexify/executor/executor_flavor.py +0 -7
- indexify/executor/function_executor/function_executor_state.py +0 -107
- indexify/executor/function_executor/function_executor_states_container.py +0 -93
- indexify/executor/function_executor/function_executor_status.py +0 -95
- indexify/executor/function_executor/metrics/function_executor_state.py +0 -46
- indexify/executor/function_executor/metrics/function_executor_state_container.py +0 -10
- indexify/executor/function_executor/single_task_runner.py +0 -345
- indexify/executor/function_executor/task_input.py +0 -21
- indexify/executor/function_executor/task_output.py +0 -105
- indexify/executor/grpc/function_executor_controller.py +0 -418
- indexify/executor/grpc/metrics/task_controller.py +0 -8
- indexify/executor/grpc/state_reporter.py +0 -317
- indexify/executor/grpc/task_controller.py +0 -508
- indexify/executor/metrics/task_fetcher.py +0 -21
- indexify/executor/metrics/task_reporter.py +0 -53
- indexify/executor/metrics/task_runner.py +0 -52
- indexify/executor/monitoring/function_allowlist.py +0 -25
- indexify/executor/runtime_probes.py +0 -68
- indexify/executor/task_fetcher.py +0 -96
- indexify/executor/task_reporter.py +0 -459
- indexify/executor/task_runner.py +0 -177
- indexify-0.3.31.dist-info/RECORD +0 -68
- indexify-0.3.31.dist-info/entry_points.txt +0 -3
- {indexify-0.3.31.dist-info → indexify-0.4.2.dist-info}/WHEEL +0 -0
indexify/cli/__init__.py
ADDED
@@ -0,0 +1,18 @@
+import click
+
+from . import build_image, deploy, executor
+
+
+@click.group()
+@click.version_option(package_name="indexify", prog_name="indexify-cli")
+@click.pass_context
+def cli(ctx: click.Context):
+    """
+    Indexify CLI to manage and deploy workflows to Indexify Server and run Indexify Executors.
+    """
+    pass
+
+
+cli.add_command(build_image.build_image)
+cli.add_command(deploy.deploy)
+cli.add_command(executor.executor)
indexify/cli/build_image.py
ADDED
@@ -0,0 +1,51 @@
+import click
+from tensorlake.functions_sdk.image import Image
+from tensorlake.functions_sdk.workflow_module import (
+    WorkflowModuleInfo,
+    load_workflow_module_info,
+)
+
+
+@click.command(
+    short_help="Build images for graphs/workflows defined in the workflow file"
+)
+# Path to the file where the graphs/workflows are defined as global variables
+@click.argument(
+    "workflow-file-path",
+    type=click.Path(exists=True, file_okay=True, dir_okay=False),
+)
+@click.option(
+    "-i",
+    "--image-names",
+    multiple=True,
+    help="Names of images to build. Can be specified multiple times. If not provided, all images will be built.",
+)
+def build_image(
+    workflow_file_path: str,
+    image_names: tuple[str, ...] = None,
+):
+    try:
+        workflow_module_info: WorkflowModuleInfo = load_workflow_module_info(
+            workflow_file_path
+        )
+    except Exception as e:
+        click.secho(
+            f"Failed loading workflow file, please check the error message: {e}",
+            fg="red",
+        )
+        raise click.Abort
+
+    for image in workflow_module_info.images.keys():
+        image: Image
+        if image_names is not None and image.image_name not in image_names:
+            click.echo(
+                f"Skipping image `{image.image_name}` as it is not in the provided image names."
+            )
+            continue
+
+        click.echo(f"Building image `{image.image_name}`")
+        built_image, generator = image.build()
+        for output in generator:
+            click.secho(output)
+
+        click.secho(f"built image: {built_image.tags[0]}", fg="green")
indexify/cli/deploy.py
ADDED
@@ -0,0 +1,57 @@
+import click
+from tensorlake import Graph
+from tensorlake.functions_sdk.graph_serialization import graph_code_dir_path
+from tensorlake.functions_sdk.workflow_module import (
+    WorkflowModuleInfo,
+    load_workflow_module_info,
+)
+from tensorlake.remote_graph import RemoteGraph
+
+
+@click.command(
+    short_help="Deploy all graphs/workflows defined in the workflow file to Indexify"
+)
+# Path to the file where the graphs/workflows are defined as global variables
+@click.argument(
+    "workflow-file-path",
+    type=click.Path(exists=True, file_okay=True, dir_okay=False),
+)
+@click.option(
+    "-u",
+    "--upgrade-queued-requests",
+    is_flag=True,
+    default=False,
+    help="Upgrade invocations that are already queued or running to use the deployed version of the graphs/workflows",
+)
+def deploy(
+    workflow_file_path: str,
+    upgrade_queued_invocations: bool,
+):
+    click.echo(f"Preparing deployment for {workflow_file_path}")
+    try:
+        workflow_module_info: WorkflowModuleInfo = load_workflow_module_info(
+            workflow_file_path
+        )
+    except Exception as e:
+        click.secho(
+            f"Failed loading workflow file, please check the error message: {e}",
+            fg="red",
+        )
+        raise click.Abort
+
+    for graph in workflow_module_info.graphs:
+        graph: Graph
+        try:
+            RemoteGraph.deploy(
+                graph,
+                code_dir_path=graph_code_dir_path(workflow_file_path),
+                upgrade_tasks_to_latest_version=upgrade_queued_invocations,
+            )
+        except Exception as e:
+            click.secho(
+                f"Graph {graph.name} could not be deployed, please check the error message: {e}",
+                fg="red",
+            )
+            raise click.Abort
+
+        click.secho(f"Deployed {graph.name}", fg="green")
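Both `build_image` and `deploy` take the workflow file path as a positional argument and load it with tensorlake's `load_workflow_module_info`. A hedged invocation sketch using Click's test runner; `workflow.py` below is a stub placeholder, so this only demonstrates the command-line shape, not a successful deployment:

# Illustrative only: the stub file satisfies click.Path(exists=True) but is not a
# real tensorlake workflow module, so no graph actually gets deployed.
from click.testing import CliRunner

from indexify.cli import cli

runner = CliRunner()
with runner.isolated_filesystem():
    with open("workflow.py", "w") as f:
        f.write("# graphs/workflows would be defined here as module globals\n")
    result = runner.invoke(cli, ["deploy", "workflow.py", "--upgrade-queued-requests"])
    print(result.exit_code, result.output)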
indexify/cli/executor.py
ADDED
@@ -0,0 +1,205 @@
+from tensorlake.utils.logging import (
+    configure_development_mode_logging,
+    configure_logging_early,
+    configure_production_mode_logging,
+)
+
+configure_logging_early()
+
+import shutil
+from importlib.metadata import version
+from pathlib import Path
+from socket import gethostname
+from typing import Dict, List, Optional
+
+import click
+import nanoid
+import prometheus_client
+import structlog
+
+from indexify.executor.blob_store.blob_store import BLOBStore
+from indexify.executor.blob_store.local_fs_blob_store import LocalFSBLOBStore
+from indexify.executor.blob_store.s3_blob_store import S3BLOBStore
+from indexify.executor.executor import Executor
+from indexify.executor.function_executor.server.subprocess_function_executor_server_factory import (
+    SubprocessFunctionExecutorServerFactory,
+)
+from indexify.executor.host_resources.host_resources import HostResourcesProvider
+from indexify.executor.host_resources.nvidia_gpu_allocator import NvidiaGPUAllocator
+from indexify.executor.monitoring.health_checker.generic_health_checker import (
+    GenericHealthChecker,
+)
+
+
+@click.command(
+    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
+    short_help="Runs Executor that connects to the Indexify server and starts running its tasks",
+)
+@click.option(
+    "--server-addr",
+    "server_address",
+    default="localhost:8900",
+    help="Address of Indexify HTTP Server to connect to",
+)
+@click.option(
+    "--grpc-server-addr",
+    "grpc_server_address",
+    default="localhost:8901",
+    help="Address of Indexify gRPC Server to connect to",
+)
+@click.option(
+    "-v",
+    "--verbose",
+    is_flag=True,
+    default=False,
+    help="Verbose logging",
+)
+@click.option(
+    "-vv",
+    "--very-verbose",
+    is_flag=True,
+    default=False,
+    help="Very verbose logging",
+)
+@click.option(
+    "-f",
+    "--function",
+    "function_uris",
+    default=[],
+    multiple=True,
+    help="Functions that the executor will run "
+    "specified as <namespace>:<workflow>:<function>:<version>"
+    "version is optional, not specifying it will make the server send any version"
+    "of the function. Any number of --function arguments can be passed.",
+)
+@click.option(
+    "--config-path",
+    type=click.Path(exists=True, dir_okay=False, file_okay=True, readable=True),
+    default=None,
+    help="Path to the TLS configuration file",
+)
+@click.option(
+    "--executor-cache-path",
+    type=click.Path(exists=False, dir_okay=True, readable=True, writable=True),
+    default="~/.indexify/executor_cache",
+    help="Path to the executor cache directory",
+)
+@click.option(
+    "--monitoring-server-host",
+    default="localhost",
+    help="IP address or hostname where to run Executor Monitoring server",
+)
+@click.option(
+    "--monitoring-server-port",
+    default=7000,
+    type=int,
+    help="Port where to run Executor Monitoring server",
+)
+@click.option(
+    "-l",
+    "--label",
+    "labels",
+    default=[],
+    multiple=True,
+    help="Executor key-value label to be sent to the Server. "
+    "Specified as <key>=<value>",
+)
+@click.pass_context
+def executor(
+    ctx: click.Context,
+    server_address: str,
+    grpc_server_address: str,
+    verbose: bool,
+    very_verbose: bool,
+    function_uris: List[str],
+    config_path: Optional[str],
+    executor_cache_path: str,
+    monitoring_server_host: str,
+    monitoring_server_port: int,
+    labels: List[str],
+):
+    if verbose or very_verbose:
+        configure_development_mode_logging(compact_tracebacks=not very_verbose)
+    else:
+        configure_production_mode_logging()
+
+    kv_labels: Dict[str, str] = {}
+    for label in labels:
+        key, value = label.split("=")
+        kv_labels[key] = value
+
+    executor_id: str = nanoid.generate()
+    executor_version = version("indexify")
+    logger = structlog.get_logger(module=__name__, executor_id=executor_id)
+
+    logger.info(
+        "starting executor",
+        hostname=gethostname(),
+        server_address=server_address,
+        grpc_server_address=grpc_server_address,
+        config_path=config_path,
+        executor_version=executor_version,
+        labels=kv_labels,
+        executor_cache_path=executor_cache_path,
+        functions=function_uris,
+        verbose=verbose,
+        very_verbose=very_verbose,
+        monitoring_server_host=monitoring_server_host,
+        monitoring_server_port=monitoring_server_port,
+    )
+    if ctx.args:
+        logger.warning(
+            "Unknown arguments passed to the executor",
+            unknown_args=ctx.args,
+        )
+    if len(function_uris) == 0:
+        logger.warning(
+            "No --function arguments were passed. Executor will run all functions. This scenario is only supported for testing purposes.",
+        )
+
+    executor_cache_path: Path = Path(executor_cache_path).expanduser().absolute()
+    if executor_cache_path.exists():
+        shutil.rmtree(str(executor_cache_path))
+    executor_cache_path.mkdir(parents=True, exist_ok=True)
+
+    blob_store: BLOBStore = BLOBStore(
+        # Local FS mode is used in tests and in cases when user wants to store data on NFS.
+        local=LocalFSBLOBStore(),
+        # S3 is initiliazed lazily so it's okay to create it even if the user is not going to use it.
+        s3=S3BLOBStore(),
+    )
+
+    host_resources_provider: HostResourcesProvider = HostResourcesProvider(
+        gpu_allocator=NvidiaGPUAllocator(logger),
+        # Assuming a simple setup in OSS where Executor container has a single file system
+        # used by all Function Executors and all the container resources are available to all Function Executors.
+        function_executors_ephimeral_disks_path="/",
+        host_overhead_cpus=0,
+        host_overhead_memory_gb=0,
+        host_overhead_function_executors_ephimeral_disks_gb=0,
+    )
+
+    prometheus_client.Info("cli", "CLI information").info(
+        {
+            "package": "indexify",
+        }
+    )
+
+    Executor(
+        id=executor_id,
+        version=executor_version,
+        labels=kv_labels,
+        health_checker=GenericHealthChecker(),
+        cache_path=executor_cache_path,
+        function_uris=function_uris,
+        function_executor_server_factory=SubprocessFunctionExecutorServerFactory(
+            verbose_logs=verbose or very_verbose
+        ),
+        server_addr=server_address,
+        grpc_server_addr=grpc_server_address,
+        config_path=config_path,
+        monitoring_server_host=monitoring_server_host,
+        monitoring_server_port=monitoring_server_port,
+        blob_store=blob_store,
+        host_resources_provider=host_resources_provider,
+    ).run()
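One detail worth noting in the command body above: labels are parsed with a bare `label.split("=")`, so a label without `=`, or one whose value itself contains `=`, raises `ValueError` during unpacking. A small standalone sketch of a more tolerant parse, purely illustrative and not part of the package:

# Illustrative helper, not part of indexify: parses "<key>=<value>" labels the
# way the executor command expects them, but tolerates "=" inside the value.
from typing import Dict, List


def parse_labels(labels: List[str]) -> Dict[str, str]:
    kv: Dict[str, str] = {}
    for label in labels:
        # Split only on the first "=" so values may contain the character.
        key, sep, value = label.partition("=")
        if not sep or not key:
            raise ValueError(f"label must be specified as <key>=<value>, got: {label!r}")
        kv[key] = value
    return kv


print(parse_labels(["team=search", "zone=us-east-1b"]))
# {'team': 'search', 'zone': 'us-east-1b'}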
indexify/executor/{grpc/channel_manager.py → channel_manager.py}
RENAMED
@@ -84,15 +84,28 @@ class ChannelManager:
         # Use the lock to ensure that we only create one channel without race conditions.
         async with self._lock:
             if self._channel is None:
-                self._channel = await self.
+                self._channel = await self._create_ready_channel()
             elif not await self._locked_channel_is_healthy():
                 self._logger.info("grpc channel to server is unhealthy")
                 await self._destroy_locked_channel()
-                self._channel = await self.
+                self._channel = await self._create_ready_channel()
 
         return self._channel
 
-
+    def create_channel(self) -> grpc.aio.Channel:
+        """Creates a new channel to the gRPC server.
+
+        The channel is not be ready to use. Raises an exception on failure.
+        """
+        if self._channel_credentials is None:
+            return grpc.aio.insecure_channel(target=self._server_address)
+        else:
+            return grpc.aio.secure_channel(
+                target=self._server_address,
+                credentials=self._channel_credentials,
+            )
+
+    async def _create_ready_channel(self) -> grpc.aio.Channel:
         """Creates a new channel to the gRPC server.
 
         Returns a ready to use channel. Blocks until the channel
@@ -104,14 +117,7 @@ class ChannelManager:
         metric_grpc_server_channel_creations.inc()
         while True:
             try:
-                if self._channel_credentials is None:
-                    channel = grpc.aio.insecure_channel(target=self._server_address)
-                else:
-                    channel = grpc.aio.secure_channel(
-                        target=self._server_address,
-                        credentials=self._channel_credentials,
-                    )
-
+                channel = self.create_channel()
                 await asyncio.wait_for(
                     channel.channel_ready(),
                     timeout=_CONNECT_TIMEOUT_SEC,
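The net effect of this refactor: channel construction moves into a public `create_channel()` that never blocks (presumably so other components can create their own channels), while the retry-until-ready loop stays in `_create_ready_channel()`. A standalone sketch of the same construct-then-wait pattern with `grpc.aio`; the address and timeout below are placeholders, not values taken from the package:

# Standalone sketch of the construct-then-wait pattern used above; the target
# address and timeout are placeholders, not values taken from the package.
import asyncio

import grpc


def create_channel(server_address: str, credentials=None) -> grpc.aio.Channel:
    # Construction never blocks; grpc.aio channels connect lazily.
    if credentials is None:
        return grpc.aio.insecure_channel(target=server_address)
    return grpc.aio.secure_channel(target=server_address, credentials=credentials)


async def create_ready_channel(server_address: str, connect_timeout_sec: float = 5.0) -> grpc.aio.Channel:
    # Retry until the channel reports readiness, closing failed attempts.
    while True:
        channel = create_channel(server_address)
        try:
            await asyncio.wait_for(channel.channel_ready(), timeout=connect_timeout_sec)
            return channel
        except Exception:
            await channel.close()


# Example (requires a reachable gRPC server):
# asyncio.run(create_ready_channel("localhost:8901"))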
|