indexify 0.3.9__tar.gz → 0.3.11__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {indexify-0.3.9 → indexify-0.3.11}/PKG-INFO +4 -2
- {indexify-0.3.9 → indexify-0.3.11}/pyproject.toml +4 -2
- {indexify-0.3.9 → indexify-0.3.11}/src/indexify/cli/cli.py +36 -7
- {indexify-0.3.9 → indexify-0.3.11}/src/indexify/executor/api_objects.py +4 -0
- {indexify-0.3.9 → indexify-0.3.11}/src/indexify/executor/downloader.py +45 -5
- {indexify-0.3.9 → indexify-0.3.11}/src/indexify/executor/executor.py +103 -16
- indexify-0.3.11/src/indexify/executor/function_executor/function_executor.py +280 -0
- {indexify-0.3.9 → indexify-0.3.11}/src/indexify/executor/function_executor/function_executor_state.py +6 -0
- indexify-0.3.11/src/indexify/executor/function_executor/function_executor_states_container.py +64 -0
- {indexify-0.3.9 → indexify-0.3.11}/src/indexify/executor/function_executor/health_checker.py +20 -10
- {indexify-0.3.9 → indexify-0.3.11}/src/indexify/executor/function_executor/invocation_state_client.py +31 -6
- indexify-0.3.11/src/indexify/executor/function_executor/metrics/function_executor.py +142 -0
- indexify-0.3.11/src/indexify/executor/function_executor/metrics/function_executor_state.py +10 -0
- indexify-0.3.11/src/indexify/executor/function_executor/metrics/function_executor_state_container.py +10 -0
- indexify-0.3.11/src/indexify/executor/function_executor/metrics/health_checker.py +14 -0
- indexify-0.3.11/src/indexify/executor/function_executor/metrics/invocation_state_client.py +45 -0
- indexify-0.3.11/src/indexify/executor/function_executor/metrics/single_task_runner.py +22 -0
- {indexify-0.3.9 → indexify-0.3.11}/src/indexify/executor/function_executor/server/subprocess_function_executor_server_factory.py +1 -2
- {indexify-0.3.9 → indexify-0.3.11}/src/indexify/executor/function_executor/single_task_runner.py +44 -15
- {indexify-0.3.9 → indexify-0.3.11}/src/indexify/executor/function_executor/task_output.py +7 -1
- indexify-0.3.11/src/indexify/executor/metrics/downloader.py +69 -0
- indexify-0.3.11/src/indexify/executor/metrics/executor.py +51 -0
- indexify-0.3.11/src/indexify/executor/metrics/task_fetcher.py +21 -0
- indexify-0.3.11/src/indexify/executor/metrics/task_reporter.py +22 -0
- indexify-0.3.11/src/indexify/executor/metrics/task_runner.py +45 -0
- indexify-0.3.11/src/indexify/executor/monitoring/function_allowlist.py +25 -0
- indexify-0.3.11/src/indexify/executor/monitoring/handler.py +8 -0
- indexify-0.3.11/src/indexify/executor/monitoring/health_check_handler.py +20 -0
- indexify-0.3.11/src/indexify/executor/monitoring/health_checker/generic_health_checker.py +58 -0
- indexify-0.3.11/src/indexify/executor/monitoring/health_checker/health_checker.py +23 -0
- indexify-0.3.11/src/indexify/executor/monitoring/metrics.py +245 -0
- indexify-0.3.11/src/indexify/executor/monitoring/prometheus_metrics_handler.py +18 -0
- indexify-0.3.11/src/indexify/executor/monitoring/server.py +41 -0
- indexify-0.3.11/src/indexify/executor/monitoring/startup_probe_handler.py +17 -0
- {indexify-0.3.9 → indexify-0.3.11}/src/indexify/executor/task_fetcher.py +15 -1
- {indexify-0.3.9 → indexify-0.3.11}/src/indexify/executor/task_reporter.py +24 -7
- {indexify-0.3.9 → indexify-0.3.11}/src/indexify/executor/task_runner.py +64 -46
- indexify-0.3.9/src/indexify/executor/function_executor/function_executor.py +0 -161
- {indexify-0.3.9 → indexify-0.3.11}/README.md +0 -0
- {indexify-0.3.9 → indexify-0.3.11}/src/indexify/executor/README.md +0 -0
- {indexify-0.3.9 → indexify-0.3.11}/src/indexify/executor/function_executor/server/client_configuration.py +0 -0
- {indexify-0.3.9 → indexify-0.3.11}/src/indexify/executor/function_executor/server/function_executor_server.py +0 -0
- {indexify-0.3.9 → indexify-0.3.11}/src/indexify/executor/function_executor/server/function_executor_server_factory.py +0 -0
- {indexify-0.3.9 → indexify-0.3.11}/src/indexify/executor/function_executor/server/subprocess_function_executor_server.py +0 -0
- {indexify-0.3.9 → indexify-0.3.11}/src/indexify/executor/function_executor/task_input.py +0 -0
- {indexify-0.3.9 → indexify-0.3.11}/src/indexify/executor/runtime_probes.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.3
|
2
2
|
Name: indexify
|
3
|
-
Version: 0.3.9
|
3
|
+
Version: 0.3.11
|
4
4
|
Summary: Open Source Indexify components and helper tools
|
5
5
|
Home-page: https://github.com/tensorlakeai/indexify
|
6
6
|
License: Apache 2.0
|
@@ -14,15 +14,17 @@ Classifier: Programming Language :: Python :: 3.10
|
|
14
14
|
Classifier: Programming Language :: Python :: 3.11
|
15
15
|
Classifier: Programming Language :: Python :: 3.12
|
16
16
|
Classifier: Programming Language :: Python :: 3.13
|
17
|
+
Requires-Dist: aiohttp (>=3.11.0,<4.0.0)
|
17
18
|
Requires-Dist: grpcio (==1.70.0)
|
18
19
|
Requires-Dist: httpx-sse (>=0.4.0,<0.5.0)
|
19
20
|
Requires-Dist: httpx[http2] (>=0.27,<0.28)
|
20
21
|
Requires-Dist: nanoid (>=2.0.0,<3.0.0)
|
22
|
+
Requires-Dist: prometheus-client (>=0.21.1,<0.22.0)
|
21
23
|
Requires-Dist: pydantic (==2.10.4)
|
22
24
|
Requires-Dist: pyyaml (>=6,<7)
|
23
25
|
Requires-Dist: rich (>=13.9.2,<14.0.0)
|
24
26
|
Requires-Dist: structlog (>=24.4.0,<25.0.0)
|
25
|
-
Requires-Dist: tensorlake (>=0.1.
|
27
|
+
Requires-Dist: tensorlake (>=0.1.20)
|
26
28
|
Requires-Dist: typer (>=0.12,<0.13)
|
27
29
|
Project-URL: Repository, https://github.com/tensorlakeai/indexify
|
28
30
|
Description-Content-Type: text/markdown
|
@@ -1,7 +1,7 @@
|
|
1
1
|
[tool.poetry]
|
2
2
|
name = "indexify"
|
3
3
|
# Incremented if any of the components provided in this packages are updated.
|
4
|
-
version = "0.3.9"
|
4
|
+
version = "0.3.11"
|
5
5
|
description = "Open Source Indexify components and helper tools"
|
6
6
|
authors = ["Tensorlake Inc. <support@tensorlake.ai>"]
|
7
7
|
license = "Apache 2.0"
|
@@ -23,8 +23,10 @@ grpcio = "1.70.0"
|
|
23
23
|
# Executor only
|
24
24
|
pydantic = "2.10.4"
|
25
25
|
httpx-sse = "^0.4.0"
|
26
|
+
aiohttp = "^3.11.0"
|
27
|
+
prometheus-client = "^0.21.1"
|
26
28
|
# Adds function-executor binary and utils lib.
|
27
|
-
tensorlake = ">=0.1.
|
29
|
+
tensorlake = ">=0.1.20"
|
28
30
|
|
29
31
|
# CLI only
|
30
32
|
rich = "^13.9.2"
|
@@ -6,8 +6,6 @@ from tensorlake.utils.logging import (
|
|
6
6
|
|
7
7
|
configure_logging_early()
|
8
8
|
|
9
|
-
import importlib.metadata
|
10
|
-
import json
|
11
9
|
import os
|
12
10
|
import shutil
|
13
11
|
import signal
|
@@ -17,22 +15,26 @@ import threading
|
|
17
15
|
import time
|
18
16
|
from importlib.metadata import version
|
19
17
|
from pathlib import Path
|
18
|
+
from socket import gethostname
|
20
19
|
from typing import Annotated, List, Optional, Tuple
|
21
20
|
|
22
|
-
import docker
|
23
21
|
import nanoid
|
22
|
+
import prometheus_client
|
24
23
|
import structlog
|
25
24
|
import typer
|
26
25
|
from rich.console import Console
|
27
26
|
from rich.text import Text
|
28
27
|
from rich.theme import Theme
|
29
|
-
from tensorlake.functions_sdk.image import
|
28
|
+
from tensorlake.functions_sdk.image import Image
|
30
29
|
|
31
30
|
from indexify.executor.api_objects import FunctionURI
|
32
31
|
from indexify.executor.executor import Executor
|
33
32
|
from indexify.executor.function_executor.server.subprocess_function_executor_server_factory import (
|
34
33
|
SubprocessFunctionExecutorServerFactory,
|
35
34
|
)
|
35
|
+
from indexify.executor.monitoring.health_checker.generic_health_checker import (
|
36
|
+
GenericHealthChecker,
|
37
|
+
)
|
36
38
|
|
37
39
|
custom_theme = Theme(
|
38
40
|
{
|
@@ -185,8 +187,23 @@ def executor(
|
|
185
187
|
),
|
186
188
|
# Registred ports range ends at 49151.
|
187
189
|
ports: Tuple[int, int] = typer.Option(
|
188
|
-
(50000, 51000),
|
190
|
+
(50000, 51000),
|
191
|
+
help="Range of localhost TCP ports to be used by Function Executors",
|
189
192
|
),
|
193
|
+
monitoring_server_host: Annotated[
|
194
|
+
str,
|
195
|
+
typer.Option(
|
196
|
+
"--monitoring-server-host",
|
197
|
+
help="IP address or hostname where to run Executor Monitoring server",
|
198
|
+
),
|
199
|
+
] = "localhost",
|
200
|
+
monitoring_server_port: Annotated[
|
201
|
+
int,
|
202
|
+
typer.Option(
|
203
|
+
"--monitoring-server-port",
|
204
|
+
help="Port where to run Executor Monitoring server",
|
205
|
+
),
|
206
|
+
] = 7000,
|
190
207
|
disable_automatic_function_executor_management: Annotated[
|
191
208
|
bool,
|
192
209
|
typer.Option(
|
@@ -210,6 +227,7 @@ def executor(
|
|
210
227
|
|
211
228
|
logger.info(
|
212
229
|
"starting executor",
|
230
|
+
hostname=gethostname(),
|
213
231
|
server_addr=server_addr,
|
214
232
|
config_path=config_path,
|
215
233
|
executor_version=executor_version,
|
@@ -217,6 +235,8 @@ def executor(
|
|
217
235
|
ports=ports,
|
218
236
|
functions=function_uris,
|
219
237
|
dev_mode=dev,
|
238
|
+
monitoring_server_host=monitoring_server_host,
|
239
|
+
monitoring_server_port=monitoring_server_port,
|
220
240
|
disable_automatic_function_executor_management=disable_automatic_function_executor_management,
|
221
241
|
)
|
222
242
|
|
@@ -235,17 +255,26 @@ def executor(
|
|
235
255
|
)
|
236
256
|
exit(1)
|
237
257
|
|
258
|
+
prometheus_client.Info("cli", "CLI information").info(
|
259
|
+
{
|
260
|
+
"package": "indexify",
|
261
|
+
}
|
262
|
+
)
|
263
|
+
|
238
264
|
Executor(
|
239
265
|
id=id,
|
240
266
|
version=executor_version,
|
241
|
-
|
242
|
-
config_path=config_path,
|
267
|
+
health_checker=GenericHealthChecker(),
|
243
268
|
code_path=executor_cache,
|
244
269
|
function_allowlist=_parse_function_uris(function_uris),
|
245
270
|
function_executor_server_factory=SubprocessFunctionExecutorServerFactory(
|
246
271
|
development_mode=dev,
|
247
272
|
server_ports=range(ports[0], ports[1]),
|
248
273
|
),
|
274
|
+
server_addr=server_addr,
|
275
|
+
config_path=config_path,
|
276
|
+
monitoring_server_host=monitoring_server_host,
|
277
|
+
monitoring_server_port=monitoring_server_port,
|
249
278
|
disable_automatic_function_executor_management=disable_automatic_function_executor_management,
|
250
279
|
).run()
|
251
280
|
|
@@ -8,6 +8,21 @@ from tensorlake.function_executor.proto.function_executor_pb2 import SerializedO
|
|
8
8
|
from tensorlake.utils.http_client import get_httpx_client
|
9
9
|
|
10
10
|
from .api_objects import Task
|
11
|
+
from .metrics.downloader import (
|
12
|
+
metric_graph_download_errors,
|
13
|
+
metric_graph_download_latency,
|
14
|
+
metric_graph_downloads,
|
15
|
+
metric_graphs_from_cache,
|
16
|
+
metric_reducer_init_value_download_errors,
|
17
|
+
metric_reducer_init_value_download_latency,
|
18
|
+
metric_reducer_init_value_downloads,
|
19
|
+
metric_task_input_download_errors,
|
20
|
+
metric_task_input_download_latency,
|
21
|
+
metric_task_input_downloads,
|
22
|
+
metric_tasks_downloading_graphs,
|
23
|
+
metric_tasks_downloading_inputs,
|
24
|
+
metric_tasks_downloading_reducer_init_value,
|
25
|
+
)
|
11
26
|
|
12
27
|
|
13
28
|
class Downloader:
|
@@ -19,6 +34,33 @@ class Downloader:
|
|
19
34
|
self._client = get_httpx_client(config_path, make_async=True)
|
20
35
|
|
21
36
|
async def download_graph(self, task: Task) -> SerializedObject:
|
37
|
+
with (
|
38
|
+
metric_graph_download_errors.count_exceptions(),
|
39
|
+
metric_tasks_downloading_graphs.track_inprogress(),
|
40
|
+
metric_graph_download_latency.time(),
|
41
|
+
):
|
42
|
+
metric_graph_downloads.inc()
|
43
|
+
return await self._download_graph(task)
|
44
|
+
|
45
|
+
async def download_input(self, task: Task) -> SerializedObject:
|
46
|
+
with (
|
47
|
+
metric_task_input_download_errors.count_exceptions(),
|
48
|
+
metric_tasks_downloading_inputs.track_inprogress(),
|
49
|
+
metric_task_input_download_latency.time(),
|
50
|
+
):
|
51
|
+
metric_task_input_downloads.inc()
|
52
|
+
return await self._download_input(task)
|
53
|
+
|
54
|
+
async def download_init_value(self, task: Task) -> SerializedObject:
|
55
|
+
with (
|
56
|
+
metric_reducer_init_value_download_errors.count_exceptions(),
|
57
|
+
metric_tasks_downloading_reducer_init_value.track_inprogress(),
|
58
|
+
metric_reducer_init_value_download_latency.time(),
|
59
|
+
):
|
60
|
+
metric_reducer_init_value_downloads.inc()
|
61
|
+
return await self._download_init_value(task)
|
62
|
+
|
63
|
+
async def _download_graph(self, task: Task) -> SerializedObject:
|
22
64
|
# Cache graph to reduce load on the server.
|
23
65
|
graph_path = os.path.join(
|
24
66
|
self.code_path,
|
@@ -33,6 +75,7 @@ class Downloader:
|
|
33
75
|
self._read_cached_graph, graph_path
|
34
76
|
)
|
35
77
|
if graph is not None:
|
78
|
+
metric_graphs_from_cache.inc()
|
36
79
|
return graph
|
37
80
|
|
38
81
|
logger = self._task_logger(task)
|
@@ -71,7 +114,7 @@ class Downloader:
|
|
71
114
|
# This also allows to share the same cache between multiple Executors.
|
72
115
|
os.replace(tmp_path, path)
|
73
116
|
|
74
|
-
async def
|
117
|
+
async def _download_input(self, task: Task) -> SerializedObject:
|
75
118
|
logger = self._task_logger(task)
|
76
119
|
|
77
120
|
first_function_in_graph = task.invocation_id == task.input_key.split("|")[-1]
|
@@ -81,10 +124,7 @@ class Downloader:
|
|
81
124
|
else:
|
82
125
|
return await self._fetch_function_input(task, logger)
|
83
126
|
|
84
|
-
async def
|
85
|
-
if task.reducer_output_id is None:
|
86
|
-
return None
|
87
|
-
|
127
|
+
async def _download_init_value(self, task: Task) -> SerializedObject:
|
88
128
|
logger = self._task_logger(task)
|
89
129
|
return await self._fetch_function_init_value(task, logger)
|
90
130
|
|
@@ -1,7 +1,8 @@
|
|
1
1
|
import asyncio
|
2
2
|
import signal
|
3
3
|
from pathlib import Path
|
4
|
-
from
|
4
|
+
from socket import gethostname
|
5
|
+
from typing import Any, Dict, List, Optional
|
5
6
|
|
6
7
|
import structlog
|
7
8
|
from tensorlake.function_executor.proto.function_executor_pb2 import SerializedObject
|
@@ -9,13 +10,38 @@ from tensorlake.utils.logging import suppress as suppress_logging
|
|
9
10
|
|
10
11
|
from .api_objects import FunctionURI, Task
|
11
12
|
from .downloader import Downloader
|
13
|
+
from .function_executor.function_executor_states_container import (
|
14
|
+
FunctionExecutorStatesContainer,
|
15
|
+
)
|
12
16
|
from .function_executor.server.function_executor_server_factory import (
|
13
17
|
FunctionExecutorServerFactory,
|
14
18
|
)
|
19
|
+
from .metrics.executor import (
|
20
|
+
METRIC_TASKS_COMPLETED_OUTCOME_ALL,
|
21
|
+
METRIC_TASKS_COMPLETED_OUTCOME_ERROR_CUSTOMER_CODE,
|
22
|
+
METRIC_TASKS_COMPLETED_OUTCOME_ERROR_PLATFORM,
|
23
|
+
METRIC_TASKS_COMPLETED_OUTCOME_SUCCESS,
|
24
|
+
metric_executor_info,
|
25
|
+
metric_executor_state,
|
26
|
+
metric_task_outcome_report_latency,
|
27
|
+
metric_task_outcome_report_retries,
|
28
|
+
metric_task_outcome_reports,
|
29
|
+
metric_tasks_completed,
|
30
|
+
metric_tasks_fetched,
|
31
|
+
metric_tasks_reporting_outcome,
|
32
|
+
)
|
33
|
+
from .monitoring.function_allowlist import function_allowlist_to_info_dict
|
34
|
+
from .monitoring.health_check_handler import HealthCheckHandler
|
35
|
+
from .monitoring.health_checker.health_checker import HealthChecker
|
36
|
+
from .monitoring.prometheus_metrics_handler import PrometheusMetricsHandler
|
37
|
+
from .monitoring.server import MonitoringServer
|
38
|
+
from .monitoring.startup_probe_handler import StartupProbeHandler
|
15
39
|
from .task_fetcher import TaskFetcher
|
16
40
|
from .task_reporter import TaskReporter
|
17
41
|
from .task_runner import TaskInput, TaskOutput, TaskRunner
|
18
42
|
|
43
|
+
metric_executor_state.state("starting")
|
44
|
+
|
19
45
|
|
20
46
|
class Executor:
|
21
47
|
def __init__(
|
@@ -23,11 +49,14 @@ class Executor:
|
|
23
49
|
id: str,
|
24
50
|
version: str,
|
25
51
|
code_path: Path,
|
52
|
+
health_checker: HealthChecker,
|
26
53
|
function_allowlist: Optional[List[FunctionURI]],
|
27
54
|
function_executor_server_factory: FunctionExecutorServerFactory,
|
28
|
-
server_addr: str
|
29
|
-
config_path: Optional[str]
|
30
|
-
|
55
|
+
server_addr: str,
|
56
|
+
config_path: Optional[str],
|
57
|
+
monitoring_server_host: str,
|
58
|
+
monitoring_server_port: int,
|
59
|
+
disable_automatic_function_executor_management: bool,
|
31
60
|
):
|
32
61
|
self._logger = structlog.get_logger(module=__name__)
|
33
62
|
self._is_shutdown: bool = False
|
@@ -40,12 +69,25 @@ class Executor:
|
|
40
69
|
self._server_addr = server_addr
|
41
70
|
self._base_url = f"{protocol}://{self._server_addr}"
|
42
71
|
self._code_path = code_path
|
72
|
+
self._startup_probe_handler = StartupProbeHandler()
|
73
|
+
self._monitoring_server = MonitoringServer(
|
74
|
+
host=monitoring_server_host,
|
75
|
+
port=monitoring_server_port,
|
76
|
+
startup_probe_handler=self._startup_probe_handler,
|
77
|
+
health_probe_handler=HealthCheckHandler(health_checker),
|
78
|
+
metrics_handler=PrometheusMetricsHandler(),
|
79
|
+
)
|
80
|
+
self._function_executor_states = FunctionExecutorStatesContainer()
|
81
|
+
health_checker.set_function_executor_states_container(
|
82
|
+
self._function_executor_states
|
83
|
+
)
|
43
84
|
self._task_runner = TaskRunner(
|
44
85
|
executor_id=id,
|
45
86
|
function_executor_server_factory=function_executor_server_factory,
|
46
87
|
base_url=self._base_url,
|
47
|
-
config_path=config_path,
|
48
88
|
disable_automatic_function_executor_management=disable_automatic_function_executor_management,
|
89
|
+
function_executor_states=self._function_executor_states,
|
90
|
+
config_path=config_path,
|
49
91
|
)
|
50
92
|
self._downloader = Downloader(
|
51
93
|
code_path=code_path, base_url=self._base_url, config_path=config_path
|
@@ -63,8 +105,22 @@ class Executor:
|
|
63
105
|
executor_id=id,
|
64
106
|
config_path=self._config_path,
|
65
107
|
)
|
108
|
+
executor_info: Dict[str, str] = {
|
109
|
+
"id": id,
|
110
|
+
"version": version,
|
111
|
+
"code_path": str(code_path),
|
112
|
+
"server_addr": server_addr,
|
113
|
+
"config_path": str(config_path),
|
114
|
+
"disable_automatic_function_executor_management": str(
|
115
|
+
disable_automatic_function_executor_management
|
116
|
+
),
|
117
|
+
"hostname": gethostname(),
|
118
|
+
}
|
119
|
+
executor_info.update(function_allowlist_to_info_dict(function_allowlist))
|
120
|
+
metric_executor_info.info(executor_info)
|
66
121
|
|
67
122
|
def run(self):
|
123
|
+
asyncio.new_event_loop()
|
68
124
|
for signum in [
|
69
125
|
signal.SIGABRT,
|
70
126
|
signal.SIGINT,
|
@@ -76,15 +132,20 @@ class Executor:
|
|
76
132
|
signum, self.shutdown, asyncio.get_event_loop()
|
77
133
|
)
|
78
134
|
|
135
|
+
asyncio.get_event_loop().create_task(self._monitoring_server.run())
|
136
|
+
|
79
137
|
try:
|
80
|
-
asyncio.get_event_loop().run_until_complete(self.
|
138
|
+
asyncio.get_event_loop().run_until_complete(self._run_tasks_loop())
|
81
139
|
except asyncio.CancelledError:
|
82
140
|
pass # Suppress this expected exception and return without error (normally).
|
83
141
|
|
84
|
-
async def
|
142
|
+
async def _run_tasks_loop(self):
|
143
|
+
metric_executor_state.state("running")
|
144
|
+
self._startup_probe_handler.set_ready()
|
85
145
|
while not self._is_shutdown:
|
86
146
|
try:
|
87
147
|
async for task in self._task_fetcher.run():
|
148
|
+
metric_tasks_fetched.inc()
|
88
149
|
asyncio.create_task(self._run_task(task))
|
89
150
|
except Exception as e:
|
90
151
|
self._logger.error(
|
@@ -103,9 +164,10 @@ class Executor:
|
|
103
164
|
graph: SerializedObject = await self._downloader.download_graph(task)
|
104
165
|
input: SerializedObject = await self._downloader.download_input(task)
|
105
166
|
init_value: Optional[SerializedObject] = (
|
106
|
-
|
167
|
+
None
|
168
|
+
if task.reducer_output_id is None
|
169
|
+
else (await self._downloader.download_init_value(task))
|
107
170
|
)
|
108
|
-
logger.info("task_execution_started")
|
109
171
|
output: TaskOutput = await self._task_runner.run(
|
110
172
|
TaskInput(
|
111
173
|
task=task,
|
@@ -115,15 +177,22 @@ class Executor:
|
|
115
177
|
),
|
116
178
|
logger=logger,
|
117
179
|
)
|
118
|
-
logger.info("
|
180
|
+
logger.info("task execution finished", success=output.success)
|
119
181
|
except Exception as e:
|
120
182
|
output = TaskOutput.internal_error(task)
|
121
|
-
logger.error("
|
183
|
+
logger.error("task execution failed", exc_info=e)
|
122
184
|
|
123
|
-
|
185
|
+
with (
|
186
|
+
metric_tasks_reporting_outcome.track_inprogress(),
|
187
|
+
metric_task_outcome_report_latency.time(),
|
188
|
+
):
|
189
|
+
metric_task_outcome_reports.inc()
|
190
|
+
await self._report_task_outcome(output=output, logger=logger)
|
124
191
|
|
125
192
|
async def _report_task_outcome(self, output: TaskOutput, logger: Any) -> None:
|
126
|
-
"""Reports the task with the given output to the server.
|
193
|
+
"""Reports the task with the given output to the server.
|
194
|
+
|
195
|
+
Doesn't raise any Exceptions. Runs till the reporting is successful."""
|
127
196
|
reporting_retries: int = 0
|
128
197
|
|
129
198
|
while True:
|
@@ -133,22 +202,40 @@ class Executor:
|
|
133
202
|
break
|
134
203
|
except Exception as e:
|
135
204
|
logger.error(
|
136
|
-
"
|
205
|
+
"failed to report task",
|
137
206
|
exc_info=e,
|
138
207
|
)
|
139
208
|
reporting_retries += 1
|
209
|
+
metric_task_outcome_report_retries.inc()
|
140
210
|
await asyncio.sleep(5)
|
141
211
|
|
212
|
+
metric_tasks_completed.labels(outcome=METRIC_TASKS_COMPLETED_OUTCOME_ALL).inc()
|
213
|
+
if output.is_internal_error:
|
214
|
+
metric_tasks_completed.labels(
|
215
|
+
outcome=METRIC_TASKS_COMPLETED_OUTCOME_ERROR_PLATFORM
|
216
|
+
).inc()
|
217
|
+
elif output.success:
|
218
|
+
metric_tasks_completed.labels(
|
219
|
+
outcome=METRIC_TASKS_COMPLETED_OUTCOME_SUCCESS
|
220
|
+
).inc()
|
221
|
+
else:
|
222
|
+
metric_tasks_completed.labels(
|
223
|
+
outcome=METRIC_TASKS_COMPLETED_OUTCOME_ERROR_CUSTOMER_CODE
|
224
|
+
).inc()
|
225
|
+
|
142
226
|
async def _shutdown(self, loop):
|
143
|
-
self._logger.info("
|
227
|
+
self._logger.info("shutting down")
|
228
|
+
metric_executor_state.state("shutting_down")
|
144
229
|
# There will be lots of task cancellation exceptions and "X is shutting down"
|
145
230
|
# exceptions logged during Executor shutdown. Suppress their logs as they are
|
146
231
|
# expected and are confusing for users.
|
147
232
|
suppress_logging()
|
148
233
|
|
149
234
|
self._is_shutdown = True
|
235
|
+
await self._monitoring_server.shutdown()
|
150
236
|
await self._task_runner.shutdown()
|
151
|
-
|
237
|
+
await self._function_executor_states.shutdown()
|
238
|
+
# We mainly need to cancel the task that runs _run_tasks_loop().
|
152
239
|
for task in asyncio.all_tasks(loop):
|
153
240
|
task.cancel()
|
154
241
|
|