indexify 0.3.9__py3-none-any.whl → 0.3.10__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- indexify/cli/cli.py +35 -6
- indexify/executor/api_objects.py +4 -0
- indexify/executor/downloader.py +45 -5
- indexify/executor/executor.py +103 -16
- indexify/executor/function_executor/function_executor.py +174 -55
- indexify/executor/function_executor/function_executor_state.py +6 -0
- indexify/executor/function_executor/function_executor_states_container.py +64 -0
- indexify/executor/function_executor/health_checker.py +20 -10
- indexify/executor/function_executor/invocation_state_client.py +31 -6
- indexify/executor/function_executor/metrics/function_executor.py +142 -0
- indexify/executor/function_executor/metrics/function_executor_state.py +10 -0
- indexify/executor/function_executor/metrics/function_executor_state_container.py +10 -0
- indexify/executor/function_executor/metrics/health_checker.py +14 -0
- indexify/executor/function_executor/metrics/invocation_state_client.py +45 -0
- indexify/executor/function_executor/metrics/single_task_runner.py +22 -0
- indexify/executor/function_executor/single_task_runner.py +44 -15
- indexify/executor/function_executor/task_output.py +7 -1
- indexify/executor/metrics/downloader.py +69 -0
- indexify/executor/metrics/executor.py +51 -0
- indexify/executor/metrics/task_fetcher.py +21 -0
- indexify/executor/metrics/task_reporter.py +22 -0
- indexify/executor/metrics/task_runner.py +45 -0
- indexify/executor/monitoring/function_allowlist.py +25 -0
- indexify/executor/monitoring/handler.py +8 -0
- indexify/executor/monitoring/health_check_handler.py +20 -0
- indexify/executor/monitoring/health_checker/generic_health_checker.py +58 -0
- indexify/executor/monitoring/health_checker/health_checker.py +23 -0
- indexify/executor/monitoring/metrics.py +245 -0
- indexify/executor/monitoring/prometheus_metrics_handler.py +18 -0
- indexify/executor/monitoring/server.py +41 -0
- indexify/executor/monitoring/startup_probe_handler.py +17 -0
- indexify/executor/task_fetcher.py +15 -1
- indexify/executor/task_reporter.py +24 -7
- indexify/executor/task_runner.py +64 -46
- {indexify-0.3.9.dist-info → indexify-0.3.10.dist-info}/METADATA +4 -2
- indexify-0.3.10.dist-info/RECORD +46 -0
- indexify-0.3.9.dist-info/RECORD +0 -25
- {indexify-0.3.9.dist-info → indexify-0.3.10.dist-info}/WHEEL +0 -0
- {indexify-0.3.9.dist-info → indexify-0.3.10.dist-info}/entry_points.txt +0 -0
@@ -0,0 +1,245 @@
|
|
1
|
+
import prometheus_client
|
2
|
+
|
3
|
+
_INF = float("inf")
|
4
|
+
|
5
|
+
|
6
|
+
def latency_metric_for_fast_operation(
|
7
|
+
operation_name: str, operation_description: str
|
8
|
+
) -> prometheus_client.Histogram:
|
9
|
+
"""Creates a histogram metric for latency of a fast operation.
|
10
|
+
|
11
|
+
A fast operation is typically expected to complete in a few tens of milliseconds."""
|
12
|
+
return prometheus_client.Histogram(
|
13
|
+
f"{operation_name}_latency_seconds",
|
14
|
+
f"Latency of {operation_description}",
|
15
|
+
# Buckets are in seconds
|
16
|
+
buckets=[
|
17
|
+
_ms_to_sec(1),
|
18
|
+
_ms_to_sec(2),
|
19
|
+
_ms_to_sec(3),
|
20
|
+
_ms_to_sec(4),
|
21
|
+
_ms_to_sec(5),
|
22
|
+
_ms_to_sec(6),
|
23
|
+
_ms_to_sec(7),
|
24
|
+
_ms_to_sec(8),
|
25
|
+
_ms_to_sec(9),
|
26
|
+
_ms_to_sec(10),
|
27
|
+
_ms_to_sec(15),
|
28
|
+
_ms_to_sec(20),
|
29
|
+
_ms_to_sec(25),
|
30
|
+
_ms_to_sec(30),
|
31
|
+
_ms_to_sec(35),
|
32
|
+
_ms_to_sec(40),
|
33
|
+
_ms_to_sec(45),
|
34
|
+
_ms_to_sec(50),
|
35
|
+
_ms_to_sec(55),
|
36
|
+
_ms_to_sec(60),
|
37
|
+
_ms_to_sec(65),
|
38
|
+
_ms_to_sec(70),
|
39
|
+
_ms_to_sec(75),
|
40
|
+
_ms_to_sec(80),
|
41
|
+
_ms_to_sec(85),
|
42
|
+
_ms_to_sec(90),
|
43
|
+
_ms_to_sec(95),
|
44
|
+
_ms_to_sec(100),
|
45
|
+
_ms_to_sec(150),
|
46
|
+
_ms_to_sec(200),
|
47
|
+
_ms_to_sec(250),
|
48
|
+
_ms_to_sec(300),
|
49
|
+
_ms_to_sec(350),
|
50
|
+
_ms_to_sec(400),
|
51
|
+
_ms_to_sec(450),
|
52
|
+
_ms_to_sec(500),
|
53
|
+
_ms_to_sec(550),
|
54
|
+
_ms_to_sec(600),
|
55
|
+
_ms_to_sec(650),
|
56
|
+
_ms_to_sec(700),
|
57
|
+
_ms_to_sec(750),
|
58
|
+
_ms_to_sec(800),
|
59
|
+
_ms_to_sec(850),
|
60
|
+
_ms_to_sec(900),
|
61
|
+
_ms_to_sec(950),
|
62
|
+
1,
|
63
|
+
_ms_to_sec(1050),
|
64
|
+
_ms_to_sec(1100),
|
65
|
+
_ms_to_sec(1150),
|
66
|
+
_ms_to_sec(1200),
|
67
|
+
_ms_to_sec(1250),
|
68
|
+
_ms_to_sec(1300),
|
69
|
+
_ms_to_sec(1350),
|
70
|
+
_ms_to_sec(1400),
|
71
|
+
_ms_to_sec(1450),
|
72
|
+
_ms_to_sec(1500),
|
73
|
+
_ms_to_sec(1550),
|
74
|
+
_ms_to_sec(1600),
|
75
|
+
_ms_to_sec(1650),
|
76
|
+
_ms_to_sec(1700),
|
77
|
+
_ms_to_sec(1750),
|
78
|
+
_ms_to_sec(1800),
|
79
|
+
_ms_to_sec(1850),
|
80
|
+
_ms_to_sec(1900),
|
81
|
+
_ms_to_sec(1950),
|
82
|
+
2,
|
83
|
+
_ms_to_sec(2100),
|
84
|
+
_ms_to_sec(2200),
|
85
|
+
_ms_to_sec(2300),
|
86
|
+
_ms_to_sec(2400),
|
87
|
+
_ms_to_sec(2500),
|
88
|
+
_ms_to_sec(2600),
|
89
|
+
_ms_to_sec(2700),
|
90
|
+
_ms_to_sec(2800),
|
91
|
+
_ms_to_sec(2900),
|
92
|
+
3,
|
93
|
+
_ms_to_sec(3500),
|
94
|
+
4,
|
95
|
+
_ms_to_sec(4500),
|
96
|
+
5,
|
97
|
+
10,
|
98
|
+
60,
|
99
|
+
_INF,
|
100
|
+
],
|
101
|
+
)
|
102
|
+
|
103
|
+
|
104
|
+
def latency_metric_for_slow_operation(
|
105
|
+
operation_name: str, operation_description: str
|
106
|
+
) -> prometheus_client.Histogram:
|
107
|
+
"""Creates a histogram metric for latency of a slow operation.
|
108
|
+
|
109
|
+
A slow operation is typically expected to complete within a few tens of seconds."""
|
110
|
+
return prometheus_client.Histogram(
|
111
|
+
f"{operation_name}_latency_seconds",
|
112
|
+
f"Latency of {operation_description}",
|
113
|
+
# Buckets are in seconds
|
114
|
+
buckets=[
|
115
|
+
_ms_to_sec(10),
|
116
|
+
_ms_to_sec(20),
|
117
|
+
_ms_to_sec(30),
|
118
|
+
_ms_to_sec(40),
|
119
|
+
_ms_to_sec(50),
|
120
|
+
_ms_to_sec(60),
|
121
|
+
_ms_to_sec(70),
|
122
|
+
_ms_to_sec(80),
|
123
|
+
_ms_to_sec(90),
|
124
|
+
_ms_to_sec(100),
|
125
|
+
_ms_to_sec(200),
|
126
|
+
_ms_to_sec(300),
|
127
|
+
_ms_to_sec(400),
|
128
|
+
_ms_to_sec(500),
|
129
|
+
_ms_to_sec(600),
|
130
|
+
_ms_to_sec(700),
|
131
|
+
_ms_to_sec(800),
|
132
|
+
_ms_to_sec(900),
|
133
|
+
1,
|
134
|
+
2,
|
135
|
+
3,
|
136
|
+
4,
|
137
|
+
5,
|
138
|
+
6,
|
139
|
+
7,
|
140
|
+
8,
|
141
|
+
9,
|
142
|
+
10,
|
143
|
+
15,
|
144
|
+
20,
|
145
|
+
25,
|
146
|
+
30,
|
147
|
+
35,
|
148
|
+
40,
|
149
|
+
45,
|
150
|
+
50,
|
151
|
+
_minutes_to_sec(1),
|
152
|
+
_minutes_to_sec(2),
|
153
|
+
_minutes_to_sec(3),
|
154
|
+
_minutes_to_sec(4),
|
155
|
+
_minutes_to_sec(5),
|
156
|
+
_minutes_to_sec(6),
|
157
|
+
_minutes_to_sec(7),
|
158
|
+
_minutes_to_sec(8),
|
159
|
+
_minutes_to_sec(9),
|
160
|
+
_minutes_to_sec(10),
|
161
|
+
_minutes_to_sec(20),
|
162
|
+
_INF,
|
163
|
+
],
|
164
|
+
)
|
165
|
+
|
166
|
+
|
167
|
+
def latency_metric_for_customer_controlled_operation(
|
168
|
+
operation_name: str, operation_description: str
|
169
|
+
) -> prometheus_client.Histogram:
|
170
|
+
"""Creates a histogram metric for latency of a customer controlled operation.
|
171
|
+
|
172
|
+
Example of a customer controlled operation is executing customer code.
|
173
|
+
The buckets in this histogram tend to be quite large because we don't
|
174
|
+
know how long customer code will take to execute and because we don't need
|
175
|
+
a precise understanding of its duration."""
|
176
|
+
return prometheus_client.Histogram(
|
177
|
+
f"{operation_name}_latency_seconds",
|
178
|
+
f"Latency of {operation_description}",
|
179
|
+
# Buckets are in seconds
|
180
|
+
buckets=[
|
181
|
+
_ms_to_sec(1),
|
182
|
+
_ms_to_sec(5),
|
183
|
+
_ms_to_sec(50),
|
184
|
+
_ms_to_sec(100),
|
185
|
+
_ms_to_sec(200),
|
186
|
+
_ms_to_sec(300),
|
187
|
+
_ms_to_sec(400),
|
188
|
+
_ms_to_sec(500),
|
189
|
+
1,
|
190
|
+
2,
|
191
|
+
3,
|
192
|
+
4,
|
193
|
+
5,
|
194
|
+
6,
|
195
|
+
7,
|
196
|
+
8,
|
197
|
+
9,
|
198
|
+
10,
|
199
|
+
15,
|
200
|
+
20,
|
201
|
+
30,
|
202
|
+
40,
|
203
|
+
50,
|
204
|
+
_minutes_to_sec(1),
|
205
|
+
_minutes_to_sec(2),
|
206
|
+
_minutes_to_sec(3),
|
207
|
+
_minutes_to_sec(4),
|
208
|
+
_minutes_to_sec(5),
|
209
|
+
_minutes_to_sec(6),
|
210
|
+
_minutes_to_sec(7),
|
211
|
+
_minutes_to_sec(8),
|
212
|
+
_minutes_to_sec(9),
|
213
|
+
_minutes_to_sec(10),
|
214
|
+
_minutes_to_sec(20),
|
215
|
+
_minutes_to_sec(30),
|
216
|
+
_minutes_to_sec(40),
|
217
|
+
_minutes_to_sec(50),
|
218
|
+
_hours_to_sec(1),
|
219
|
+
_hours_to_sec(2),
|
220
|
+
_hours_to_sec(3),
|
221
|
+
_hours_to_sec(4),
|
222
|
+
_hours_to_sec(5),
|
223
|
+
_hours_to_sec(10),
|
224
|
+
_hours_to_sec(20),
|
225
|
+
_days_to_sec(1),
|
226
|
+
_days_to_sec(2),
|
227
|
+
_INF,
|
228
|
+
],
|
229
|
+
)
|
230
|
+
|
231
|
+
|
232
|
+
def _ms_to_sec(ms: int) -> float:
|
233
|
+
return ms / 1000.0
|
234
|
+
|
235
|
+
|
236
|
+
def _minutes_to_sec(minutes: int) -> float:
|
237
|
+
return minutes * 60.0
|
238
|
+
|
239
|
+
|
240
|
+
def _hours_to_sec(hours: int) -> float:
|
241
|
+
return _minutes_to_sec(hours * 60)
|
242
|
+
|
243
|
+
|
244
|
+
def _days_to_sec(days: int) -> float:
|
245
|
+
return _hours_to_sec(days * 24)
|
@@ -0,0 +1,18 @@
|
|
1
|
+
import asyncio
|
2
|
+
|
3
|
+
from aiohttp import web
|
4
|
+
from prometheus_client import CONTENT_TYPE_LATEST, generate_latest
|
5
|
+
|
6
|
+
from .handler import Handler
|
7
|
+
|
8
|
+
|
9
|
+
class PrometheusMetricsHandler(Handler):
|
10
|
+
async def handle(self, request: web.Request) -> web.Response:
|
11
|
+
# Run the synchronous metrics generation code in a ThreadPool thread
|
12
|
+
# to not block the main asyncio loop.
|
13
|
+
return await asyncio.to_thread(self._handle_sync)
|
14
|
+
|
15
|
+
def _handle_sync(self) -> web.Response:
|
16
|
+
return web.Response(
|
17
|
+
body=generate_latest(), headers={"Content-Type": CONTENT_TYPE_LATEST}
|
18
|
+
)
|
@@ -0,0 +1,41 @@
|
|
1
|
+
from aiohttp import web
|
2
|
+
|
3
|
+
from .handler import Handler
|
4
|
+
|
5
|
+
|
6
|
+
class MonitoringServer:
|
7
|
+
def __init__(
|
8
|
+
self,
|
9
|
+
host: str,
|
10
|
+
port: int,
|
11
|
+
startup_probe_handler: Handler,
|
12
|
+
health_probe_handler: Handler,
|
13
|
+
metrics_handler: Handler,
|
14
|
+
):
|
15
|
+
self._host = host
|
16
|
+
self._port = port
|
17
|
+
self._app: web.Application = web.Application()
|
18
|
+
self._app.add_routes(
|
19
|
+
[
|
20
|
+
web.get("/monitoring/startup", startup_probe_handler.handle),
|
21
|
+
web.get("/monitoring/health", health_probe_handler.handle),
|
22
|
+
web.get("/monitoring/metrics", metrics_handler.handle),
|
23
|
+
]
|
24
|
+
)
|
25
|
+
self._app_runner: web.AppRunner = web.AppRunner(self._app)
|
26
|
+
|
27
|
+
async def run(self):
|
28
|
+
await self._app_runner.setup()
|
29
|
+
site = web.TCPSite(
|
30
|
+
runner=self._app_runner,
|
31
|
+
host=self._host,
|
32
|
+
port=self._port,
|
33
|
+
# Allow to listen when there's a closed socket in TIME_WAIT state
|
34
|
+
reuse_address=True,
|
35
|
+
# Don't allow other TCP sockets to actively listen on this address
|
36
|
+
reuse_port=False,
|
37
|
+
)
|
38
|
+
await site.start()
|
39
|
+
|
40
|
+
async def shutdown(self):
|
41
|
+
await self._app_runner.cleanup()
|
@@ -0,0 +1,17 @@
|
|
1
|
+
from aiohttp import web
|
2
|
+
|
3
|
+
from .handler import Handler
|
4
|
+
|
5
|
+
|
6
|
+
class StartupProbeHandler(Handler):
|
7
|
+
def __init__(self):
|
8
|
+
self._ready = False
|
9
|
+
|
10
|
+
def set_ready(self):
|
11
|
+
self._ready = True
|
12
|
+
|
13
|
+
async def handle(self, request: web.Request) -> web.Response:
|
14
|
+
if self._ready:
|
15
|
+
return web.json_response({"status": "ok"})
|
16
|
+
else:
|
17
|
+
return web.json_response({"status": "nok"}, status=503)
|
@@ -1,5 +1,5 @@
|
|
1
1
|
import json
|
2
|
-
|
2
|
+
import time
|
3
3
|
from typing import AsyncGenerator, List, Optional
|
4
4
|
|
5
5
|
import structlog
|
@@ -7,6 +7,11 @@ from httpx_sse import aconnect_sse
|
|
7
7
|
from tensorlake.utils.http_client import get_httpx_client
|
8
8
|
|
9
9
|
from .api_objects import ExecutorMetadata, FunctionURI, Task
|
10
|
+
from .metrics.task_fetcher import (
|
11
|
+
metric_server_registration_errors,
|
12
|
+
metric_server_registration_latency,
|
13
|
+
metric_server_registrations,
|
14
|
+
)
|
10
15
|
from .runtime_probes import ProbeInfo, RuntimeProbes
|
11
16
|
|
12
17
|
|
@@ -48,6 +53,9 @@ class TaskFetcher:
|
|
48
53
|
url=url,
|
49
54
|
executor_version=self._executor_metadata.executor_version,
|
50
55
|
)
|
56
|
+
metric_server_registrations.inc()
|
57
|
+
registration_start_time: float = time.monotonic()
|
58
|
+
|
51
59
|
async with get_httpx_client(
|
52
60
|
config_path=self.config_path, make_async=True
|
53
61
|
) as client:
|
@@ -61,16 +69,22 @@ class TaskFetcher:
|
|
61
69
|
try:
|
62
70
|
event_source.response.raise_for_status()
|
63
71
|
except Exception as e:
|
72
|
+
metric_server_registration_errors.inc()
|
64
73
|
await event_source.response.aread()
|
65
74
|
raise Exception(
|
66
75
|
"failed to register at server. "
|
67
76
|
f"Response code: {event_source.response.status_code}. "
|
68
77
|
f"Response text: '{event_source.response.text}'."
|
69
78
|
) from e
|
79
|
+
finally:
|
80
|
+
metric_server_registration_latency.observe(
|
81
|
+
time.monotonic() - registration_start_time
|
82
|
+
)
|
70
83
|
|
71
84
|
self._logger.info(
|
72
85
|
"executor_registered", executor_id=self._executor_metadata.id
|
73
86
|
)
|
87
|
+
|
74
88
|
async for sse in event_source.aiter_sse():
|
75
89
|
task_dicts = json.loads(sse.data)
|
76
90
|
for task_dict in task_dicts:
|
@@ -7,7 +7,17 @@ from httpx import Timeout
|
|
7
7
|
from tensorlake.function_executor.proto.function_executor_pb2 import FunctionOutput
|
8
8
|
from tensorlake.utils.http_client import get_httpx_client
|
9
9
|
|
10
|
-
from .api_objects import
|
10
|
+
from .api_objects import (
|
11
|
+
TASK_OUTCOME_FAILURE,
|
12
|
+
TASK_OUTCOME_SUCCESS,
|
13
|
+
RouterOutput,
|
14
|
+
TaskResult,
|
15
|
+
)
|
16
|
+
from .metrics.task_reporter import (
|
17
|
+
metric_server_ingest_files_errors,
|
18
|
+
metric_server_ingest_files_latency,
|
19
|
+
metric_server_ingest_files_requests,
|
20
|
+
)
|
11
21
|
from .task_runner import TaskOutput
|
12
22
|
|
13
23
|
|
@@ -77,13 +87,17 @@ class TaskReporter:
|
|
77
87
|
}
|
78
88
|
|
79
89
|
start_time = time.time()
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
90
|
+
with metric_server_ingest_files_latency.time():
|
91
|
+
metric_server_ingest_files_requests.inc()
|
92
|
+
# Run in a separate thread to not block the main event loop.
|
93
|
+
response = await asyncio.to_thread(
|
94
|
+
self._client.post,
|
95
|
+
url=f"{self._base_url}/internal/ingest_files",
|
96
|
+
**kwargs,
|
97
|
+
)
|
84
98
|
end_time = time.time()
|
85
99
|
logger.info(
|
86
|
-
"
|
100
|
+
"task outcome reported",
|
87
101
|
response_time=end_time - start_time,
|
88
102
|
response_code=response.status_code,
|
89
103
|
)
|
@@ -91,6 +105,7 @@ class TaskReporter:
|
|
91
105
|
try:
|
92
106
|
response.raise_for_status()
|
93
107
|
except Exception as e:
|
108
|
+
metric_server_ingest_files_errors.inc()
|
94
109
|
# Caller catches and logs the exception.
|
95
110
|
raise Exception(
|
96
111
|
"failed to report task outcome. "
|
@@ -115,7 +130,9 @@ class TaskReporter:
|
|
115
130
|
if output is None:
|
116
131
|
return task_result, output_files, summary
|
117
132
|
|
118
|
-
task_result.outcome =
|
133
|
+
task_result.outcome = (
|
134
|
+
TASK_OUTCOME_SUCCESS if output.success else TASK_OUTCOME_FAILURE
|
135
|
+
)
|
119
136
|
task_result.reducer = output.reducer
|
120
137
|
|
121
138
|
_process_function_output(
|
indexify/executor/task_runner.py
CHANGED
@@ -1,14 +1,27 @@
|
|
1
|
-
import
|
2
|
-
from typing import Any, Dict, Optional
|
1
|
+
from typing import Any, Optional
|
3
2
|
|
4
3
|
from .api_objects import Task
|
5
4
|
from .function_executor.function_executor_state import FunctionExecutorState
|
5
|
+
from .function_executor.function_executor_states_container import (
|
6
|
+
FunctionExecutorStatesContainer,
|
7
|
+
function_id_with_version,
|
8
|
+
)
|
6
9
|
from .function_executor.server.function_executor_server_factory import (
|
7
10
|
FunctionExecutorServerFactory,
|
8
11
|
)
|
9
12
|
from .function_executor.single_task_runner import SingleTaskRunner
|
10
13
|
from .function_executor.task_input import TaskInput
|
11
14
|
from .function_executor.task_output import TaskOutput
|
15
|
+
from .metrics.task_runner import (
|
16
|
+
metric_task_policy_errors,
|
17
|
+
metric_task_policy_latency,
|
18
|
+
metric_task_policy_runs,
|
19
|
+
metric_task_run_latency,
|
20
|
+
metric_task_run_platform_errors,
|
21
|
+
metric_task_runs,
|
22
|
+
metric_tasks_blocked_by_policy,
|
23
|
+
metric_tasks_running,
|
24
|
+
)
|
12
25
|
|
13
26
|
|
14
27
|
class TaskRunner:
|
@@ -21,8 +34,9 @@ class TaskRunner:
|
|
21
34
|
executor_id: str,
|
22
35
|
function_executor_server_factory: FunctionExecutorServerFactory,
|
23
36
|
base_url: str,
|
24
|
-
config_path: Optional[str],
|
25
37
|
disable_automatic_function_executor_management: bool,
|
38
|
+
function_executor_states: FunctionExecutorStatesContainer,
|
39
|
+
config_path: Optional[str],
|
26
40
|
):
|
27
41
|
self._executor_id: str = executor_id
|
28
42
|
self._factory: FunctionExecutorServerFactory = function_executor_server_factory
|
@@ -31,41 +45,61 @@ class TaskRunner:
|
|
31
45
|
self._disable_automatic_function_executor_management: bool = (
|
32
46
|
disable_automatic_function_executor_management
|
33
47
|
)
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
self._function_executor_states: Dict[str, FunctionExecutorState] = {}
|
48
|
+
self._function_executor_states: FunctionExecutorStatesContainer = (
|
49
|
+
function_executor_states
|
50
|
+
)
|
38
51
|
|
39
52
|
async def run(self, task_input: TaskInput, logger: Any) -> TaskOutput:
|
40
53
|
logger = logger.bind(module=__name__)
|
54
|
+
state: Optional[FunctionExecutorState] = None
|
55
|
+
|
41
56
|
try:
|
42
|
-
|
57
|
+
with (
|
58
|
+
metric_task_policy_errors.count_exceptions(),
|
59
|
+
metric_tasks_blocked_by_policy.track_inprogress(),
|
60
|
+
metric_task_policy_latency.time(),
|
61
|
+
):
|
62
|
+
metric_task_policy_runs.inc()
|
63
|
+
state = await self._acquire_function_executor_for_task_execution(
|
64
|
+
task_input, logger
|
65
|
+
)
|
66
|
+
|
67
|
+
with (
|
68
|
+
metric_task_run_platform_errors.count_exceptions(),
|
69
|
+
metric_tasks_running.track_inprogress(),
|
70
|
+
metric_task_run_latency.time(),
|
71
|
+
):
|
72
|
+
metric_task_runs.inc()
|
73
|
+
return await self._run_task(state, task_input, logger)
|
43
74
|
except Exception as e:
|
44
75
|
logger.error(
|
45
76
|
"failed running the task:",
|
46
77
|
exc_info=e,
|
47
78
|
)
|
48
79
|
return TaskOutput.internal_error(task_input.task)
|
80
|
+
finally:
|
81
|
+
if state is not None:
|
82
|
+
state.lock.release()
|
83
|
+
|
84
|
+
async def _acquire_function_executor_for_task_execution(
|
85
|
+
self, task_input: TaskInput, logger: Any
|
86
|
+
) -> FunctionExecutorState:
|
87
|
+
"""Waits until the task acquires a Function Executor state where the task can run.
|
88
|
+
|
89
|
+
The returned Function Executor state is locked and the caller is responsible for releasing the lock.
|
90
|
+
"""
|
91
|
+
logger.info("task is blocked by policy")
|
92
|
+
state = await self._function_executor_states.get_or_create_state(
|
93
|
+
task_input.task
|
94
|
+
)
|
95
|
+
await state.lock.acquire()
|
49
96
|
|
50
|
-
|
51
|
-
state = await self._get_or_create_state(task_input.task)
|
52
|
-
async with state.lock:
|
97
|
+
try:
|
53
98
|
await self._run_task_policy(state, task_input.task)
|
54
|
-
return
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
if self._is_shutdown:
|
59
|
-
raise RuntimeError("Task runner is shutting down.")
|
60
|
-
|
61
|
-
id = _function_id_without_version(task)
|
62
|
-
if id not in self._function_executor_states:
|
63
|
-
state = FunctionExecutorState(
|
64
|
-
function_id_with_version=_function_id_with_version(task),
|
65
|
-
function_id_without_version=id,
|
66
|
-
)
|
67
|
-
self._function_executor_states[id] = state
|
68
|
-
return self._function_executor_states[id]
|
99
|
+
return state
|
100
|
+
except Exception:
|
101
|
+
state.lock.release()
|
102
|
+
raise
|
69
103
|
|
70
104
|
async def _run_task_policy(self, state: FunctionExecutorState, task: Task) -> None:
|
71
105
|
# Current policy for running tasks:
|
@@ -80,15 +114,16 @@ class TaskRunner:
|
|
80
114
|
if self._disable_automatic_function_executor_management:
|
81
115
|
return # Disable Function Executor destroy in manual management mode.
|
82
116
|
|
83
|
-
if state.function_id_with_version !=
|
117
|
+
if state.function_id_with_version != function_id_with_version(task):
|
84
118
|
await state.destroy_function_executor()
|
85
|
-
state.function_id_with_version =
|
119
|
+
state.function_id_with_version = function_id_with_version(task)
|
86
120
|
# At this point the state belongs to the version of the function from the task
|
87
121
|
# and there are no running tasks in the Function Executor.
|
88
122
|
|
89
123
|
async def _run_task(
|
90
124
|
self, state: FunctionExecutorState, task_input: TaskInput, logger: Any
|
91
125
|
) -> TaskOutput:
|
126
|
+
logger.info("task execution started")
|
92
127
|
runner: SingleTaskRunner = SingleTaskRunner(
|
93
128
|
executor_id=self._executor_id,
|
94
129
|
function_executor_state=state,
|
@@ -101,21 +136,4 @@ class TaskRunner:
|
|
101
136
|
return await runner.run()
|
102
137
|
|
103
138
|
async def shutdown(self) -> None:
|
104
|
-
|
105
|
-
# so they need to get cleaned up explicitly and reliably.
|
106
|
-
async with self._lock:
|
107
|
-
self._is_shutdown = True # No new Function Executor States can be created.
|
108
|
-
while self._function_executor_states:
|
109
|
-
id, state = self._function_executor_states.popitem()
|
110
|
-
# Only ongoing tasks who have a reference to the state already can see it.
|
111
|
-
async with state.lock:
|
112
|
-
await state.shutdown()
|
113
|
-
# The task running inside the Function Executor will fail because it's destroyed.
|
114
|
-
|
115
|
-
|
116
|
-
def _function_id_with_version(task: Task) -> str:
|
117
|
-
return f"versioned/{task.namespace}/{task.compute_graph}/{task.graph_version}/{task.compute_fn}"
|
118
|
-
|
119
|
-
|
120
|
-
def _function_id_without_version(task: Task) -> str:
|
121
|
-
return f"not_versioned/{task.namespace}/{task.compute_graph}/{task.compute_fn}"
|
139
|
+
pass
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.3
|
2
2
|
Name: indexify
|
3
|
-
Version: 0.3.9
|
3
|
+
Version: 0.3.10
|
4
4
|
Summary: Open Source Indexify components and helper tools
|
5
5
|
Home-page: https://github.com/tensorlakeai/indexify
|
6
6
|
License: Apache 2.0
|
@@ -14,15 +14,17 @@ Classifier: Programming Language :: Python :: 3.10
|
|
14
14
|
Classifier: Programming Language :: Python :: 3.11
|
15
15
|
Classifier: Programming Language :: Python :: 3.12
|
16
16
|
Classifier: Programming Language :: Python :: 3.13
|
17
|
+
Requires-Dist: aiohttp (>=3.11.0,<4.0.0)
|
17
18
|
Requires-Dist: grpcio (==1.70.0)
|
18
19
|
Requires-Dist: httpx-sse (>=0.4.0,<0.5.0)
|
19
20
|
Requires-Dist: httpx[http2] (>=0.27,<0.28)
|
20
21
|
Requires-Dist: nanoid (>=2.0.0,<3.0.0)
|
22
|
+
Requires-Dist: prometheus-client (>=0.21.1,<0.22.0)
|
21
23
|
Requires-Dist: pydantic (==2.10.4)
|
22
24
|
Requires-Dist: pyyaml (>=6,<7)
|
23
25
|
Requires-Dist: rich (>=13.9.2,<14.0.0)
|
24
26
|
Requires-Dist: structlog (>=24.4.0,<25.0.0)
|
25
|
-
Requires-Dist: tensorlake (>=0.1.
|
27
|
+
Requires-Dist: tensorlake (>=0.1.20)
|
26
28
|
Requires-Dist: typer (>=0.12,<0.13)
|
27
29
|
Project-URL: Repository, https://github.com/tensorlakeai/indexify
|
28
30
|
Description-Content-Type: text/markdown
|
@@ -0,0 +1,46 @@
|
|
1
|
+
indexify/cli/cli.py,sha256=ssXc0zalHS7vbCvetZ0cX4BkxIuPIw590jam3c2YVME,10532
|
2
|
+
indexify/executor/README.md,sha256=ozC6_hMkhQQNVCMEpBxwiUALz6lwErPQxNxQfQDqnG4,2029
|
3
|
+
indexify/executor/api_objects.py,sha256=TaYwDoo7EjuLBusxH512-KdvAJRtBwbEP2IObWraabU,1100
|
4
|
+
indexify/executor/downloader.py,sha256=XjaGCzsGM3ex2HxbKGkIsB50OhlXgdnywWGCjUBdW2k,8127
|
5
|
+
indexify/executor/executor.py,sha256=hij_EkB1_lueakCf3pfi_0O3wxSLYFBkvbpwlsMm-ao,9784
|
6
|
+
indexify/executor/function_executor/function_executor.py,sha256=BxNhsW0uXxKjdKT5ixMAyIo5F7otdIT2c0FMQ09Lsrs,10234
|
7
|
+
indexify/executor/function_executor/function_executor_state.py,sha256=IWPLWa7LaN0Eq8PDu-0kFzkuKJB0COShu7wCO1oyiNA,3141
|
8
|
+
indexify/executor/function_executor/function_executor_states_container.py,sha256=x4hlF7ZBWswk9dkA06Rvgeqar6H9TWjZ7Etyy2CzBDE,2682
|
9
|
+
indexify/executor/function_executor/health_checker.py,sha256=YT24ajPLdYOpdt2UNyJGviGbivCDXIJpeQOxQofcl50,3258
|
10
|
+
indexify/executor/function_executor/invocation_state_client.py,sha256=p-xgM4__cHR1ApvMV9hShrGWee_Je0VDhICZUGjpQY4,9644
|
11
|
+
indexify/executor/function_executor/metrics/function_executor.py,sha256=y36cgZEp949HPCBVFF6uFQIxk2PvnSI74QSbxcAalI4,5659
|
12
|
+
indexify/executor/function_executor/metrics/function_executor_state.py,sha256=M7cMA7JY8_8FW9xjuSqtp6o2xxUgB31LJowo7kzcexg,352
|
13
|
+
indexify/executor/function_executor/metrics/function_executor_state_container.py,sha256=6rrAfml-TivjkHatCM4BLY7jmVs523Wzb6QIysncc-0,302
|
14
|
+
indexify/executor/function_executor/metrics/health_checker.py,sha256=EaeIYJPrQ-qqNMGZVGkvjPoeQSCl4FzPKXEv3Cly1NE,456
|
15
|
+
indexify/executor/function_executor/metrics/invocation_state_client.py,sha256=6FCW6rXHVZZSmwLquZdpjgQPSmE_99naDLke5rZiwMI,1867
|
16
|
+
indexify/executor/function_executor/metrics/single_task_runner.py,sha256=7BJlGkdPGKeufMs3zWNO_1GRVzjINRY5rW3Mp4oWWec,805
|
17
|
+
indexify/executor/function_executor/server/client_configuration.py,sha256=gOywMus0cotlX6NKIadEJwvOmBE-LbGE_wvoMi5-HzY,994
|
18
|
+
indexify/executor/function_executor/server/function_executor_server.py,sha256=_DLivLDikupZusRk8gVWDk7fWPT9XjZ4un1yWSlOObs,883
|
19
|
+
indexify/executor/function_executor/server/function_executor_server_factory.py,sha256=oBEuOwuGsZrvOZsv9v8M4JEtfproirE-cH28XCr0HoY,1739
|
20
|
+
indexify/executor/function_executor/server/subprocess_function_executor_server.py,sha256=JekDOqF7oFD4J6zcN3xB0Dxd1cgpEXMOsb_rKZOeBlI,668
|
21
|
+
indexify/executor/function_executor/server/subprocess_function_executor_server_factory.py,sha256=xm_EL2Gouwi0qAuKsyJVAQJRBZ0VB-NYsWUM4mOi3nQ,4187
|
22
|
+
indexify/executor/function_executor/single_task_runner.py,sha256=Td28vF6pNemKZEs4mNWeLKzy22XRmmMVgp2ipYO_OXU,9262
|
23
|
+
indexify/executor/function_executor/task_input.py,sha256=wSrHR4m0juiGClQyeVdhRC37QzDt6Rrjq-ZXJkfBi9k,584
|
24
|
+
indexify/executor/function_executor/task_output.py,sha256=Qg7vojYi0WmeHRf2qlC-5h46jcwis13EgiN63OmWpcg,1229
|
25
|
+
indexify/executor/metrics/downloader.py,sha256=lctPh8xjkXeLEFJnl1hNrD1yEhLhIl5sggsR4Yoe_Zc,2746
|
26
|
+
indexify/executor/metrics/executor.py,sha256=UkJTa8NerbdC1p7jrwl4obLUS2xGSzEYwONPZpoKhT0,2064
|
27
|
+
indexify/executor/metrics/task_fetcher.py,sha256=iJEwCLzYr2cuz7hRvNiqaa2nvQP4OrA0hm0iJY0YKG0,736
|
28
|
+
indexify/executor/metrics/task_reporter.py,sha256=zUA9RpkSgx5lG_ZqDDuela5VuhtsnC0IKoQcEvHND0Y,730
|
29
|
+
indexify/executor/metrics/task_runner.py,sha256=o5ERNePKPmVKknFoSZUr-r597dEOOWvWn3ocbiL2jxI,1699
|
30
|
+
indexify/executor/monitoring/function_allowlist.py,sha256=wUGeiv3aAGWMlQXzHXq9O6MVHby6Tu-zY4U0MyWiQu0,683
|
31
|
+
indexify/executor/monitoring/handler.py,sha256=Cj1cu_LcsAP0tdviqNhoEtGm4h0OJAxxzW9C2YdNXYU,240
|
32
|
+
indexify/executor/monitoring/health_check_handler.py,sha256=e1pEtWFKaVs6H57Z4YLejNECrJtC38PweZc7xTJeqVw,695
|
33
|
+
indexify/executor/monitoring/health_checker/generic_health_checker.py,sha256=pqytFlv7I7lnzZSzNJiylc-gtDohmuoT-Yb4lxfIe0E,2686
|
34
|
+
indexify/executor/monitoring/health_checker/health_checker.py,sha256=c6UooJUIKaj2dYwU3507nnOglU51TC4FB9npCnLHjbY,838
|
35
|
+
indexify/executor/monitoring/metrics.py,sha256=Dx2wPcTKvbd5Y5rGOfeyscFtAQ2DZ16_s5BX6d4nhI8,6660
|
36
|
+
indexify/executor/monitoring/prometheus_metrics_handler.py,sha256=KiGqSf7rkXTfbDwThyXFpFe2jnuZD5q-5SBP_0GDo8Y,591
|
37
|
+
indexify/executor/monitoring/server.py,sha256=yzdYhcxnmY6uTQUMt3vatF5jilN52ZtfFseOmHyQpTo,1254
|
38
|
+
indexify/executor/monitoring/startup_probe_handler.py,sha256=zXXsBU15SMlBx1bSFpxWDfed1VHtKKnwvLQ8-frpG98,425
|
39
|
+
indexify/executor/runtime_probes.py,sha256=bo6Dq6AGZpJH099j0DHtVSDEH80tv3j9MXf3VXSx_p8,2182
|
40
|
+
indexify/executor/task_fetcher.py,sha256=NpFfHgaY99bSL-K2D5kcDAMNUG2FArq0-qF_mgF-LBQ,3375
|
41
|
+
indexify/executor/task_reporter.py,sha256=t7FzSKV7fG1fEtxyTcFh-sTWZ8WXApbG-qXaAMq_gRQ,7363
|
42
|
+
indexify/executor/task_runner.py,sha256=tcS7hHdSVQexC3XEIsDTwX51bJK4iSQqC8CbWeNoud0,5590
|
43
|
+
indexify-0.3.10.dist-info/METADATA,sha256=WZfdgioEIs_yVFdyELZb9NLcHeKxgsxmxouLpKsZpZM,1428
|
44
|
+
indexify-0.3.10.dist-info/WHEEL,sha256=RaoafKOydTQ7I_I3JTrPCg6kUmTgtm4BornzOqyEfJ8,88
|
45
|
+
indexify-0.3.10.dist-info/entry_points.txt,sha256=GU9wmsgvN7nQw3N2X0PMYn1RSvF6CrhH9RuC2D8d3Gk,53
|
46
|
+
indexify-0.3.10.dist-info/RECORD,,
|