indexify 0.3.8__tar.gz → 0.3.10__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {indexify-0.3.8 → indexify-0.3.10}/PKG-INFO +4 -2
- {indexify-0.3.8 → indexify-0.3.10}/pyproject.toml +4 -2
- {indexify-0.3.8 → indexify-0.3.10}/src/indexify/cli/cli.py +38 -78
- {indexify-0.3.8 → indexify-0.3.10}/src/indexify/executor/api_objects.py +4 -0
- {indexify-0.3.8 → indexify-0.3.10}/src/indexify/executor/downloader.py +45 -5
- {indexify-0.3.8 → indexify-0.3.10}/src/indexify/executor/executor.py +103 -16
- indexify-0.3.10/src/indexify/executor/function_executor/function_executor.py +280 -0
- {indexify-0.3.8 → indexify-0.3.10}/src/indexify/executor/function_executor/function_executor_state.py +6 -0
- indexify-0.3.10/src/indexify/executor/function_executor/function_executor_states_container.py +64 -0
- {indexify-0.3.8 → indexify-0.3.10}/src/indexify/executor/function_executor/health_checker.py +20 -10
- {indexify-0.3.8 → indexify-0.3.10}/src/indexify/executor/function_executor/invocation_state_client.py +31 -6
- indexify-0.3.10/src/indexify/executor/function_executor/metrics/function_executor.py +142 -0
- indexify-0.3.10/src/indexify/executor/function_executor/metrics/function_executor_state.py +10 -0
- indexify-0.3.10/src/indexify/executor/function_executor/metrics/function_executor_state_container.py +10 -0
- indexify-0.3.10/src/indexify/executor/function_executor/metrics/health_checker.py +14 -0
- indexify-0.3.10/src/indexify/executor/function_executor/metrics/invocation_state_client.py +45 -0
- indexify-0.3.10/src/indexify/executor/function_executor/metrics/single_task_runner.py +22 -0
- {indexify-0.3.8 → indexify-0.3.10}/src/indexify/executor/function_executor/single_task_runner.py +44 -15
- {indexify-0.3.8 → indexify-0.3.10}/src/indexify/executor/function_executor/task_output.py +7 -1
- indexify-0.3.10/src/indexify/executor/metrics/downloader.py +69 -0
- indexify-0.3.10/src/indexify/executor/metrics/executor.py +51 -0
- indexify-0.3.10/src/indexify/executor/metrics/task_fetcher.py +21 -0
- indexify-0.3.10/src/indexify/executor/metrics/task_reporter.py +22 -0
- indexify-0.3.10/src/indexify/executor/metrics/task_runner.py +45 -0
- indexify-0.3.10/src/indexify/executor/monitoring/function_allowlist.py +25 -0
- indexify-0.3.10/src/indexify/executor/monitoring/handler.py +8 -0
- indexify-0.3.10/src/indexify/executor/monitoring/health_check_handler.py +20 -0
- indexify-0.3.10/src/indexify/executor/monitoring/health_checker/generic_health_checker.py +58 -0
- indexify-0.3.10/src/indexify/executor/monitoring/health_checker/health_checker.py +23 -0
- indexify-0.3.10/src/indexify/executor/monitoring/metrics.py +245 -0
- indexify-0.3.10/src/indexify/executor/monitoring/prometheus_metrics_handler.py +18 -0
- indexify-0.3.10/src/indexify/executor/monitoring/server.py +41 -0
- indexify-0.3.10/src/indexify/executor/monitoring/startup_probe_handler.py +17 -0
- {indexify-0.3.8 → indexify-0.3.10}/src/indexify/executor/task_fetcher.py +15 -1
- {indexify-0.3.8 → indexify-0.3.10}/src/indexify/executor/task_reporter.py +24 -7
- {indexify-0.3.8 → indexify-0.3.10}/src/indexify/executor/task_runner.py +64 -46
- indexify-0.3.8/src/indexify/executor/function_executor/function_executor.py +0 -161
- {indexify-0.3.8 → indexify-0.3.10}/README.md +0 -0
- {indexify-0.3.8 → indexify-0.3.10}/src/indexify/executor/README.md +0 -0
- {indexify-0.3.8 → indexify-0.3.10}/src/indexify/executor/function_executor/server/client_configuration.py +0 -0
- {indexify-0.3.8 → indexify-0.3.10}/src/indexify/executor/function_executor/server/function_executor_server.py +0 -0
- {indexify-0.3.8 → indexify-0.3.10}/src/indexify/executor/function_executor/server/function_executor_server_factory.py +0 -0
- {indexify-0.3.8 → indexify-0.3.10}/src/indexify/executor/function_executor/server/subprocess_function_executor_server.py +0 -0
- {indexify-0.3.8 → indexify-0.3.10}/src/indexify/executor/function_executor/server/subprocess_function_executor_server_factory.py +0 -0
- {indexify-0.3.8 → indexify-0.3.10}/src/indexify/executor/function_executor/task_input.py +0 -0
- {indexify-0.3.8 → indexify-0.3.10}/src/indexify/executor/runtime_probes.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.3
|
2
2
|
Name: indexify
|
3
|
-
Version: 0.3.8
|
3
|
+
Version: 0.3.10
|
4
4
|
Summary: Open Source Indexify components and helper tools
|
5
5
|
Home-page: https://github.com/tensorlakeai/indexify
|
6
6
|
License: Apache 2.0
|
@@ -14,15 +14,17 @@ Classifier: Programming Language :: Python :: 3.10
|
|
14
14
|
Classifier: Programming Language :: Python :: 3.11
|
15
15
|
Classifier: Programming Language :: Python :: 3.12
|
16
16
|
Classifier: Programming Language :: Python :: 3.13
|
17
|
+
Requires-Dist: aiohttp (>=3.11.0,<4.0.0)
|
17
18
|
Requires-Dist: grpcio (==1.70.0)
|
18
19
|
Requires-Dist: httpx-sse (>=0.4.0,<0.5.0)
|
19
20
|
Requires-Dist: httpx[http2] (>=0.27,<0.28)
|
20
21
|
Requires-Dist: nanoid (>=2.0.0,<3.0.0)
|
22
|
+
Requires-Dist: prometheus-client (>=0.21.1,<0.22.0)
|
21
23
|
Requires-Dist: pydantic (==2.10.4)
|
22
24
|
Requires-Dist: pyyaml (>=6,<7)
|
23
25
|
Requires-Dist: rich (>=13.9.2,<14.0.0)
|
24
26
|
Requires-Dist: structlog (>=24.4.0,<25.0.0)
|
25
|
-
Requires-Dist: tensorlake (>=0.1.
|
27
|
+
Requires-Dist: tensorlake (>=0.1.20)
|
26
28
|
Requires-Dist: typer (>=0.12,<0.13)
|
27
29
|
Project-URL: Repository, https://github.com/tensorlakeai/indexify
|
28
30
|
Description-Content-Type: text/markdown
|
@@ -1,7 +1,7 @@
|
|
1
1
|
[tool.poetry]
|
2
2
|
name = "indexify"
|
3
3
|
# Incremented if any of the components provided in this package are updated.
|
4
|
-
version = "0.3.8"
|
4
|
+
version = "0.3.10"
|
5
5
|
description = "Open Source Indexify components and helper tools"
|
6
6
|
authors = ["Tensorlake Inc. <support@tensorlake.ai>"]
|
7
7
|
license = "Apache 2.0"
|
@@ -23,8 +23,10 @@ grpcio = "1.70.0"
|
|
23
23
|
# Executor only
|
24
24
|
pydantic = "2.10.4"
|
25
25
|
httpx-sse = "^0.4.0"
|
26
|
+
aiohttp = "^3.11.0"
|
27
|
+
prometheus-client = "^0.21.1"
|
26
28
|
# Adds function-executor binary and utils lib.
|
27
|
-
tensorlake = ">=0.1.
|
29
|
+
tensorlake = ">=0.1.20"
|
28
30
|
|
29
31
|
# CLI only
|
30
32
|
rich = "^13.9.2"
|
@@ -6,8 +6,6 @@ from tensorlake.utils.logging import (
|
|
6
6
|
|
7
7
|
configure_logging_early()
|
8
8
|
|
9
|
-
import importlib.metadata
|
10
|
-
import json
|
11
9
|
import os
|
12
10
|
import shutil
|
13
11
|
import signal
|
@@ -17,10 +15,11 @@ import threading
|
|
17
15
|
import time
|
18
16
|
from importlib.metadata import version
|
19
17
|
from pathlib import Path
|
18
|
+
from socket import gethostname
|
20
19
|
from typing import Annotated, List, Optional, Tuple
|
21
20
|
|
22
|
-
import docker
|
23
21
|
import nanoid
|
22
|
+
import prometheus_client
|
24
23
|
import structlog
|
25
24
|
import typer
|
26
25
|
from rich.console import Console
|
@@ -33,6 +32,9 @@ from indexify.executor.executor import Executor
|
|
33
32
|
from indexify.executor.function_executor.server.subprocess_function_executor_server_factory import (
|
34
33
|
SubprocessFunctionExecutorServerFactory,
|
35
34
|
)
|
35
|
+
from indexify.executor.monitoring.health_checker.generic_health_checker import (
|
36
|
+
GenericHealthChecker,
|
37
|
+
)
|
36
38
|
|
37
39
|
custom_theme = Theme(
|
38
40
|
{
|
@@ -158,25 +160,6 @@ def build_image(
|
|
158
160
|
_create_image(obj, python_sdk_path)
|
159
161
|
|
160
162
|
|
161
|
-
@app.command(help="Build default image for indexify")
|
162
|
-
def build_default_image(
|
163
|
-
python_version: Optional[str] = typer.Option(
|
164
|
-
f"{sys.version_info.major}.{sys.version_info.minor}",
|
165
|
-
help="Python version to use in the base image",
|
166
|
-
)
|
167
|
-
):
|
168
|
-
image = GetDefaultPythonImage(python_version)
|
169
|
-
|
170
|
-
_build_image(image=image)
|
171
|
-
|
172
|
-
console.print(
|
173
|
-
Text(f"Built default indexify image with hash {image.hash()}\n", style="cyan"),
|
174
|
-
Text(
|
175
|
-
f"Don't forget to update your executors to run this image!", style="yellow"
|
176
|
-
),
|
177
|
-
)
|
178
|
-
|
179
|
-
|
180
163
|
@app.command(
|
181
164
|
help="Runs Executor that connects to the Indexify server and starts running its tasks"
|
182
165
|
)
|
@@ -204,8 +187,23 @@ def executor(
|
|
204
187
|
),
|
205
188
|
# Registered ports range ends at 49151.
|
206
189
|
ports: Tuple[int, int] = typer.Option(
|
207
|
-
(50000, 51000),
|
190
|
+
(50000, 51000),
|
191
|
+
help="Range of localhost TCP ports to be used by Function Executors",
|
208
192
|
),
|
193
|
+
monitoring_server_host: Annotated[
|
194
|
+
str,
|
195
|
+
typer.Option(
|
196
|
+
"--monitoring-server-host",
|
197
|
+
help="IP address or hostname where to run Executor Monitoring server",
|
198
|
+
),
|
199
|
+
] = "localhost",
|
200
|
+
monitoring_server_port: Annotated[
|
201
|
+
int,
|
202
|
+
typer.Option(
|
203
|
+
"--monitoring-server-port",
|
204
|
+
help="Port where to run Executor Monitoring server",
|
205
|
+
),
|
206
|
+
] = 7000,
|
209
207
|
disable_automatic_function_executor_management: Annotated[
|
210
208
|
bool,
|
211
209
|
typer.Option(
|
@@ -229,6 +227,7 @@ def executor(
|
|
229
227
|
|
230
228
|
logger.info(
|
231
229
|
"starting executor",
|
230
|
+
hostname=gethostname(),
|
232
231
|
server_addr=server_addr,
|
233
232
|
config_path=config_path,
|
234
233
|
executor_version=executor_version,
|
@@ -236,6 +235,8 @@ def executor(
|
|
236
235
|
ports=ports,
|
237
236
|
functions=function_uris,
|
238
237
|
dev_mode=dev,
|
238
|
+
monitoring_server_host=monitoring_server_host,
|
239
|
+
monitoring_server_port=monitoring_server_port,
|
239
240
|
disable_automatic_function_executor_management=disable_automatic_function_executor_management,
|
240
241
|
)
|
241
242
|
|
@@ -254,17 +255,26 @@ def executor(
|
|
254
255
|
)
|
255
256
|
exit(1)
|
256
257
|
|
258
|
+
prometheus_client.Info("cli", "CLI information").info(
|
259
|
+
{
|
260
|
+
"package": "indexify",
|
261
|
+
}
|
262
|
+
)
|
263
|
+
|
257
264
|
Executor(
|
258
265
|
id=id,
|
259
266
|
version=executor_version,
|
260
|
-
|
261
|
-
config_path=config_path,
|
267
|
+
health_checker=GenericHealthChecker(),
|
262
268
|
code_path=executor_cache,
|
263
269
|
function_allowlist=_parse_function_uris(function_uris),
|
264
270
|
function_executor_server_factory=SubprocessFunctionExecutorServerFactory(
|
265
271
|
development_mode=dev,
|
266
272
|
server_ports=range(ports[0], ports[1]),
|
267
273
|
),
|
274
|
+
server_addr=server_addr,
|
275
|
+
config_path=config_path,
|
276
|
+
monitoring_server_host=monitoring_server_host,
|
277
|
+
monitoring_server_port=monitoring_server_port,
|
268
278
|
disable_automatic_function_executor_management=disable_automatic_function_executor_management,
|
269
279
|
).run()
|
270
280
|
|
@@ -307,57 +317,7 @@ def _create_image(image: Image, python_sdk_path):
|
|
307
317
|
|
308
318
|
|
309
319
|
def _build_image(image: Image, python_sdk_path: Optional[str] = None):
|
310
|
-
|
311
|
-
image_name = f"{image._image_name}:{image._tag}"
|
312
|
-
|
313
|
-
# low_level_client = docker.APIClient(base_url=docker_client.api.base_url)
|
314
|
-
docker_host = os.getenv("DOCKER_HOST", "unix:///var/run/docker.sock")
|
315
|
-
low_level_client = docker.APIClient(base_url=docker_host)
|
316
|
-
docker.api.build.process_dockerfile = lambda dockerfile, path: (
|
317
|
-
"Dockerfile",
|
318
|
-
dockerfile,
|
319
|
-
)
|
320
|
-
generator = low_level_client.build(
|
321
|
-
dockerfile=docker_file,
|
322
|
-
rm=True,
|
323
|
-
path=".",
|
324
|
-
tag=image_name,
|
325
|
-
)
|
326
|
-
|
320
|
+
built_image, generator = image.build(python_sdk_path=python_sdk_path)
|
327
321
|
for output in generator:
|
328
|
-
|
329
|
-
|
330
|
-
if "stream" in json_line:
|
331
|
-
print(json_line["stream"], end="")
|
332
|
-
|
333
|
-
elif "errorDetail" in json_line:
|
334
|
-
print(json_line["errorDetail"]["message"])
|
335
|
-
|
336
|
-
|
337
|
-
def _generate_dockerfile(image, python_sdk_path: Optional[str] = None):
|
338
|
-
docker_contents = [
|
339
|
-
f"FROM {image._base_image}",
|
340
|
-
"RUN mkdir -p ~/.indexify",
|
341
|
-
f"RUN echo {image._image_name} > ~/.indexify/image_name", # TODO: Do we still use this in executors?
|
342
|
-
f"RUN echo {image.hash()} > ~/.indexify/image_hash", # TODO: Do we still use this in executors?
|
343
|
-
"WORKDIR /app",
|
344
|
-
]
|
345
|
-
|
346
|
-
for build_op in image._build_ops:
|
347
|
-
docker_contents.append(build_op.render())
|
348
|
-
|
349
|
-
if python_sdk_path is not None:
|
350
|
-
print(f"Building image {image._image_name} with local version of the SDK")
|
351
|
-
|
352
|
-
if not os.path.exists(python_sdk_path):
|
353
|
-
print(f"error: {python_sdk_path} does not exist")
|
354
|
-
os.exit(1)
|
355
|
-
docker_contents.append(f"COPY {python_sdk_path} /app/python-sdk")
|
356
|
-
docker_contents.append("RUN (cd /app/python-sdk && pip install .)")
|
357
|
-
else:
|
358
|
-
docker_contents.append(
|
359
|
-
f"RUN pip install indexify=={importlib.metadata.version('indexify')}"
|
360
|
-
)
|
361
|
-
|
362
|
-
docker_file = "\n".join(docker_contents)
|
363
|
-
return docker_file
|
322
|
+
print(output)
|
323
|
+
print(f"built image: {built_image.tags[0]}")
|
@@ -8,6 +8,21 @@ from tensorlake.function_executor.proto.function_executor_pb2 import SerializedO
|
|
8
8
|
from tensorlake.utils.http_client import get_httpx_client
|
9
9
|
|
10
10
|
from .api_objects import Task
|
11
|
+
from .metrics.downloader import (
|
12
|
+
metric_graph_download_errors,
|
13
|
+
metric_graph_download_latency,
|
14
|
+
metric_graph_downloads,
|
15
|
+
metric_graphs_from_cache,
|
16
|
+
metric_reducer_init_value_download_errors,
|
17
|
+
metric_reducer_init_value_download_latency,
|
18
|
+
metric_reducer_init_value_downloads,
|
19
|
+
metric_task_input_download_errors,
|
20
|
+
metric_task_input_download_latency,
|
21
|
+
metric_task_input_downloads,
|
22
|
+
metric_tasks_downloading_graphs,
|
23
|
+
metric_tasks_downloading_inputs,
|
24
|
+
metric_tasks_downloading_reducer_init_value,
|
25
|
+
)
|
11
26
|
|
12
27
|
|
13
28
|
class Downloader:
|
@@ -19,6 +34,33 @@ class Downloader:
|
|
19
34
|
self._client = get_httpx_client(config_path, make_async=True)
|
20
35
|
|
21
36
|
async def download_graph(self, task: Task) -> SerializedObject:
|
37
|
+
with (
|
38
|
+
metric_graph_download_errors.count_exceptions(),
|
39
|
+
metric_tasks_downloading_graphs.track_inprogress(),
|
40
|
+
metric_graph_download_latency.time(),
|
41
|
+
):
|
42
|
+
metric_graph_downloads.inc()
|
43
|
+
return await self._download_graph(task)
|
44
|
+
|
45
|
+
async def download_input(self, task: Task) -> SerializedObject:
|
46
|
+
with (
|
47
|
+
metric_task_input_download_errors.count_exceptions(),
|
48
|
+
metric_tasks_downloading_inputs.track_inprogress(),
|
49
|
+
metric_task_input_download_latency.time(),
|
50
|
+
):
|
51
|
+
metric_task_input_downloads.inc()
|
52
|
+
return await self._download_input(task)
|
53
|
+
|
54
|
+
async def download_init_value(self, task: Task) -> SerializedObject:
|
55
|
+
with (
|
56
|
+
metric_reducer_init_value_download_errors.count_exceptions(),
|
57
|
+
metric_tasks_downloading_reducer_init_value.track_inprogress(),
|
58
|
+
metric_reducer_init_value_download_latency.time(),
|
59
|
+
):
|
60
|
+
metric_reducer_init_value_downloads.inc()
|
61
|
+
return await self._download_init_value(task)
|
62
|
+
|
63
|
+
async def _download_graph(self, task: Task) -> SerializedObject:
|
22
64
|
# Cache graph to reduce load on the server.
|
23
65
|
graph_path = os.path.join(
|
24
66
|
self.code_path,
|
@@ -33,6 +75,7 @@ class Downloader:
|
|
33
75
|
self._read_cached_graph, graph_path
|
34
76
|
)
|
35
77
|
if graph is not None:
|
78
|
+
metric_graphs_from_cache.inc()
|
36
79
|
return graph
|
37
80
|
|
38
81
|
logger = self._task_logger(task)
|
@@ -71,7 +114,7 @@ class Downloader:
|
|
71
114
|
# This also allows to share the same cache between multiple Executors.
|
72
115
|
os.replace(tmp_path, path)
|
73
116
|
|
74
|
-
async def download_input(self, task: Task) -> SerializedObject:
|
117
|
+
async def _download_input(self, task: Task) -> SerializedObject:
|
75
118
|
logger = self._task_logger(task)
|
76
119
|
|
77
120
|
first_function_in_graph = task.invocation_id == task.input_key.split("|")[-1]
|
@@ -81,10 +124,7 @@ class Downloader:
|
|
81
124
|
else:
|
82
125
|
return await self._fetch_function_input(task, logger)
|
83
126
|
|
84
|
-
async def
|
85
|
-
if task.reducer_output_id is None:
|
86
|
-
return None
|
87
|
-
|
127
|
+
async def _download_init_value(self, task: Task) -> SerializedObject:
|
88
128
|
logger = self._task_logger(task)
|
89
129
|
return await self._fetch_function_init_value(task, logger)
|
90
130
|
|
@@ -1,7 +1,8 @@
|
|
1
1
|
import asyncio
|
2
2
|
import signal
|
3
3
|
from pathlib import Path
|
4
|
-
from
|
4
|
+
from socket import gethostname
|
5
|
+
from typing import Any, Dict, List, Optional
|
5
6
|
|
6
7
|
import structlog
|
7
8
|
from tensorlake.function_executor.proto.function_executor_pb2 import SerializedObject
|
@@ -9,13 +10,38 @@ from tensorlake.utils.logging import suppress as suppress_logging
|
|
9
10
|
|
10
11
|
from .api_objects import FunctionURI, Task
|
11
12
|
from .downloader import Downloader
|
13
|
+
from .function_executor.function_executor_states_container import (
|
14
|
+
FunctionExecutorStatesContainer,
|
15
|
+
)
|
12
16
|
from .function_executor.server.function_executor_server_factory import (
|
13
17
|
FunctionExecutorServerFactory,
|
14
18
|
)
|
19
|
+
from .metrics.executor import (
|
20
|
+
METRIC_TASKS_COMPLETED_OUTCOME_ALL,
|
21
|
+
METRIC_TASKS_COMPLETED_OUTCOME_ERROR_CUSTOMER_CODE,
|
22
|
+
METRIC_TASKS_COMPLETED_OUTCOME_ERROR_PLATFORM,
|
23
|
+
METRIC_TASKS_COMPLETED_OUTCOME_SUCCESS,
|
24
|
+
metric_executor_info,
|
25
|
+
metric_executor_state,
|
26
|
+
metric_task_outcome_report_latency,
|
27
|
+
metric_task_outcome_report_retries,
|
28
|
+
metric_task_outcome_reports,
|
29
|
+
metric_tasks_completed,
|
30
|
+
metric_tasks_fetched,
|
31
|
+
metric_tasks_reporting_outcome,
|
32
|
+
)
|
33
|
+
from .monitoring.function_allowlist import function_allowlist_to_info_dict
|
34
|
+
from .monitoring.health_check_handler import HealthCheckHandler
|
35
|
+
from .monitoring.health_checker.health_checker import HealthChecker
|
36
|
+
from .monitoring.prometheus_metrics_handler import PrometheusMetricsHandler
|
37
|
+
from .monitoring.server import MonitoringServer
|
38
|
+
from .monitoring.startup_probe_handler import StartupProbeHandler
|
15
39
|
from .task_fetcher import TaskFetcher
|
16
40
|
from .task_reporter import TaskReporter
|
17
41
|
from .task_runner import TaskInput, TaskOutput, TaskRunner
|
18
42
|
|
43
|
+
metric_executor_state.state("starting")
|
44
|
+
|
19
45
|
|
20
46
|
class Executor:
|
21
47
|
def __init__(
|
@@ -23,11 +49,14 @@ class Executor:
|
|
23
49
|
id: str,
|
24
50
|
version: str,
|
25
51
|
code_path: Path,
|
52
|
+
health_checker: HealthChecker,
|
26
53
|
function_allowlist: Optional[List[FunctionURI]],
|
27
54
|
function_executor_server_factory: FunctionExecutorServerFactory,
|
28
|
-
server_addr: str
|
29
|
-
config_path: Optional[str]
|
30
|
-
|
55
|
+
server_addr: str,
|
56
|
+
config_path: Optional[str],
|
57
|
+
monitoring_server_host: str,
|
58
|
+
monitoring_server_port: int,
|
59
|
+
disable_automatic_function_executor_management: bool,
|
31
60
|
):
|
32
61
|
self._logger = structlog.get_logger(module=__name__)
|
33
62
|
self._is_shutdown: bool = False
|
@@ -40,12 +69,25 @@ class Executor:
|
|
40
69
|
self._server_addr = server_addr
|
41
70
|
self._base_url = f"{protocol}://{self._server_addr}"
|
42
71
|
self._code_path = code_path
|
72
|
+
self._startup_probe_handler = StartupProbeHandler()
|
73
|
+
self._monitoring_server = MonitoringServer(
|
74
|
+
host=monitoring_server_host,
|
75
|
+
port=monitoring_server_port,
|
76
|
+
startup_probe_handler=self._startup_probe_handler,
|
77
|
+
health_probe_handler=HealthCheckHandler(health_checker),
|
78
|
+
metrics_handler=PrometheusMetricsHandler(),
|
79
|
+
)
|
80
|
+
self._function_executor_states = FunctionExecutorStatesContainer()
|
81
|
+
health_checker.set_function_executor_states_container(
|
82
|
+
self._function_executor_states
|
83
|
+
)
|
43
84
|
self._task_runner = TaskRunner(
|
44
85
|
executor_id=id,
|
45
86
|
function_executor_server_factory=function_executor_server_factory,
|
46
87
|
base_url=self._base_url,
|
47
|
-
config_path=config_path,
|
48
88
|
disable_automatic_function_executor_management=disable_automatic_function_executor_management,
|
89
|
+
function_executor_states=self._function_executor_states,
|
90
|
+
config_path=config_path,
|
49
91
|
)
|
50
92
|
self._downloader = Downloader(
|
51
93
|
code_path=code_path, base_url=self._base_url, config_path=config_path
|
@@ -63,8 +105,22 @@ class Executor:
|
|
63
105
|
executor_id=id,
|
64
106
|
config_path=self._config_path,
|
65
107
|
)
|
108
|
+
executor_info: Dict[str, str] = {
|
109
|
+
"id": id,
|
110
|
+
"version": version,
|
111
|
+
"code_path": str(code_path),
|
112
|
+
"server_addr": server_addr,
|
113
|
+
"config_path": str(config_path),
|
114
|
+
"disable_automatic_function_executor_management": str(
|
115
|
+
disable_automatic_function_executor_management
|
116
|
+
),
|
117
|
+
"hostname": gethostname(),
|
118
|
+
}
|
119
|
+
executor_info.update(function_allowlist_to_info_dict(function_allowlist))
|
120
|
+
metric_executor_info.info(executor_info)
|
66
121
|
|
67
122
|
def run(self):
|
123
|
+
asyncio.new_event_loop()
|
68
124
|
for signum in [
|
69
125
|
signal.SIGABRT,
|
70
126
|
signal.SIGINT,
|
@@ -76,15 +132,20 @@ class Executor:
|
|
76
132
|
signum, self.shutdown, asyncio.get_event_loop()
|
77
133
|
)
|
78
134
|
|
135
|
+
asyncio.get_event_loop().create_task(self._monitoring_server.run())
|
136
|
+
|
79
137
|
try:
|
80
|
-
asyncio.get_event_loop().run_until_complete(self.
|
138
|
+
asyncio.get_event_loop().run_until_complete(self._run_tasks_loop())
|
81
139
|
except asyncio.CancelledError:
|
82
140
|
pass # Suppress this expected exception and return without error (normally).
|
83
141
|
|
84
|
-
async def
|
142
|
+
async def _run_tasks_loop(self):
|
143
|
+
metric_executor_state.state("running")
|
144
|
+
self._startup_probe_handler.set_ready()
|
85
145
|
while not self._is_shutdown:
|
86
146
|
try:
|
87
147
|
async for task in self._task_fetcher.run():
|
148
|
+
metric_tasks_fetched.inc()
|
88
149
|
asyncio.create_task(self._run_task(task))
|
89
150
|
except Exception as e:
|
90
151
|
self._logger.error(
|
@@ -103,9 +164,10 @@ class Executor:
|
|
103
164
|
graph: SerializedObject = await self._downloader.download_graph(task)
|
104
165
|
input: SerializedObject = await self._downloader.download_input(task)
|
105
166
|
init_value: Optional[SerializedObject] = (
|
106
|
-
|
167
|
+
None
|
168
|
+
if task.reducer_output_id is None
|
169
|
+
else (await self._downloader.download_init_value(task))
|
107
170
|
)
|
108
|
-
logger.info("task_execution_started")
|
109
171
|
output: TaskOutput = await self._task_runner.run(
|
110
172
|
TaskInput(
|
111
173
|
task=task,
|
@@ -115,15 +177,22 @@ class Executor:
|
|
115
177
|
),
|
116
178
|
logger=logger,
|
117
179
|
)
|
118
|
-
logger.info("
|
180
|
+
logger.info("task execution finished", success=output.success)
|
119
181
|
except Exception as e:
|
120
182
|
output = TaskOutput.internal_error(task)
|
121
|
-
logger.error("
|
183
|
+
logger.error("task execution failed", exc_info=e)
|
122
184
|
|
123
|
-
|
185
|
+
with (
|
186
|
+
metric_tasks_reporting_outcome.track_inprogress(),
|
187
|
+
metric_task_outcome_report_latency.time(),
|
188
|
+
):
|
189
|
+
metric_task_outcome_reports.inc()
|
190
|
+
await self._report_task_outcome(output=output, logger=logger)
|
124
191
|
|
125
192
|
async def _report_task_outcome(self, output: TaskOutput, logger: Any) -> None:
|
126
|
-
"""Reports the task with the given output to the server.
|
193
|
+
"""Reports the task with the given output to the server.
|
194
|
+
|
195
|
+
Doesn't raise any Exceptions. Runs till the reporting is successful."""
|
127
196
|
reporting_retries: int = 0
|
128
197
|
|
129
198
|
while True:
|
@@ -133,22 +202,40 @@ class Executor:
|
|
133
202
|
break
|
134
203
|
except Exception as e:
|
135
204
|
logger.error(
|
136
|
-
"
|
205
|
+
"failed to report task",
|
137
206
|
exc_info=e,
|
138
207
|
)
|
139
208
|
reporting_retries += 1
|
209
|
+
metric_task_outcome_report_retries.inc()
|
140
210
|
await asyncio.sleep(5)
|
141
211
|
|
212
|
+
metric_tasks_completed.labels(outcome=METRIC_TASKS_COMPLETED_OUTCOME_ALL).inc()
|
213
|
+
if output.is_internal_error:
|
214
|
+
metric_tasks_completed.labels(
|
215
|
+
outcome=METRIC_TASKS_COMPLETED_OUTCOME_ERROR_PLATFORM
|
216
|
+
).inc()
|
217
|
+
elif output.success:
|
218
|
+
metric_tasks_completed.labels(
|
219
|
+
outcome=METRIC_TASKS_COMPLETED_OUTCOME_SUCCESS
|
220
|
+
).inc()
|
221
|
+
else:
|
222
|
+
metric_tasks_completed.labels(
|
223
|
+
outcome=METRIC_TASKS_COMPLETED_OUTCOME_ERROR_CUSTOMER_CODE
|
224
|
+
).inc()
|
225
|
+
|
142
226
|
async def _shutdown(self, loop):
|
143
|
-
self._logger.info("
|
227
|
+
self._logger.info("shutting down")
|
228
|
+
metric_executor_state.state("shutting_down")
|
144
229
|
# There will be lots of task cancellation exceptions and "X is shutting down"
|
145
230
|
# exceptions logged during Executor shutdown. Suppress their logs as they are
|
146
231
|
# expected and are confusing for users.
|
147
232
|
suppress_logging()
|
148
233
|
|
149
234
|
self._is_shutdown = True
|
235
|
+
await self._monitoring_server.shutdown()
|
150
236
|
await self._task_runner.shutdown()
|
151
|
-
|
237
|
+
await self._function_executor_states.shutdown()
|
238
|
+
# We mainly need to cancel the task that runs _run_tasks_loop().
|
152
239
|
for task in asyncio.all_tasks(loop):
|
153
240
|
task.cancel()
|
154
241
|
|