indexify 0.3.8__py3-none-any.whl → 0.3.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. indexify/cli/cli.py +38 -78
  2. indexify/executor/api_objects.py +4 -0
  3. indexify/executor/downloader.py +45 -5
  4. indexify/executor/executor.py +103 -16
  5. indexify/executor/function_executor/function_executor.py +174 -55
  6. indexify/executor/function_executor/function_executor_state.py +6 -0
  7. indexify/executor/function_executor/function_executor_states_container.py +64 -0
  8. indexify/executor/function_executor/health_checker.py +20 -10
  9. indexify/executor/function_executor/invocation_state_client.py +31 -6
  10. indexify/executor/function_executor/metrics/function_executor.py +142 -0
  11. indexify/executor/function_executor/metrics/function_executor_state.py +10 -0
  12. indexify/executor/function_executor/metrics/function_executor_state_container.py +10 -0
  13. indexify/executor/function_executor/metrics/health_checker.py +14 -0
  14. indexify/executor/function_executor/metrics/invocation_state_client.py +45 -0
  15. indexify/executor/function_executor/metrics/single_task_runner.py +22 -0
  16. indexify/executor/function_executor/single_task_runner.py +44 -15
  17. indexify/executor/function_executor/task_output.py +7 -1
  18. indexify/executor/metrics/downloader.py +69 -0
  19. indexify/executor/metrics/executor.py +51 -0
  20. indexify/executor/metrics/task_fetcher.py +21 -0
  21. indexify/executor/metrics/task_reporter.py +22 -0
  22. indexify/executor/metrics/task_runner.py +45 -0
  23. indexify/executor/monitoring/function_allowlist.py +25 -0
  24. indexify/executor/monitoring/handler.py +8 -0
  25. indexify/executor/monitoring/health_check_handler.py +20 -0
  26. indexify/executor/monitoring/health_checker/generic_health_checker.py +58 -0
  27. indexify/executor/monitoring/health_checker/health_checker.py +23 -0
  28. indexify/executor/monitoring/metrics.py +245 -0
  29. indexify/executor/monitoring/prometheus_metrics_handler.py +18 -0
  30. indexify/executor/monitoring/server.py +41 -0
  31. indexify/executor/monitoring/startup_probe_handler.py +17 -0
  32. indexify/executor/task_fetcher.py +15 -1
  33. indexify/executor/task_reporter.py +24 -7
  34. indexify/executor/task_runner.py +64 -46
  35. {indexify-0.3.8.dist-info → indexify-0.3.10.dist-info}/METADATA +4 -2
  36. indexify-0.3.10.dist-info/RECORD +46 -0
  37. indexify-0.3.8.dist-info/RECORD +0 -25
  38. {indexify-0.3.8.dist-info → indexify-0.3.10.dist-info}/WHEEL +0 -0
  39. {indexify-0.3.8.dist-info → indexify-0.3.10.dist-info}/entry_points.txt +0 -0
indexify/cli/cli.py CHANGED
@@ -6,8 +6,6 @@ from tensorlake.utils.logging import (
6
6
 
7
7
  configure_logging_early()
8
8
 
9
- import importlib.metadata
10
- import json
11
9
  import os
12
10
  import shutil
13
11
  import signal
@@ -17,10 +15,11 @@ import threading
17
15
  import time
18
16
  from importlib.metadata import version
19
17
  from pathlib import Path
18
+ from socket import gethostname
20
19
  from typing import Annotated, List, Optional, Tuple
21
20
 
22
- import docker
23
21
  import nanoid
22
+ import prometheus_client
24
23
  import structlog
25
24
  import typer
26
25
  from rich.console import Console
@@ -33,6 +32,9 @@ from indexify.executor.executor import Executor
33
32
  from indexify.executor.function_executor.server.subprocess_function_executor_server_factory import (
34
33
  SubprocessFunctionExecutorServerFactory,
35
34
  )
35
+ from indexify.executor.monitoring.health_checker.generic_health_checker import (
36
+ GenericHealthChecker,
37
+ )
36
38
 
37
39
  custom_theme = Theme(
38
40
  {
@@ -158,25 +160,6 @@ def build_image(
158
160
  _create_image(obj, python_sdk_path)
159
161
 
160
162
 
161
- @app.command(help="Build default image for indexify")
162
- def build_default_image(
163
- python_version: Optional[str] = typer.Option(
164
- f"{sys.version_info.major}.{sys.version_info.minor}",
165
- help="Python version to use in the base image",
166
- )
167
- ):
168
- image = GetDefaultPythonImage(python_version)
169
-
170
- _build_image(image=image)
171
-
172
- console.print(
173
- Text(f"Built default indexify image with hash {image.hash()}\n", style="cyan"),
174
- Text(
175
- f"Don't forget to update your executors to run this image!", style="yellow"
176
- ),
177
- )
178
-
179
-
180
163
  @app.command(
181
164
  help="Runs Executor that connects to the Indexify server and starts running its tasks"
182
165
  )
@@ -204,8 +187,23 @@ def executor(
204
187
  ),
205
188
  # Registered ports range ends at 49151.
206
189
  ports: Tuple[int, int] = typer.Option(
207
- (50000, 51000), help="Range of localhost TCP ports to be used by the executor"
190
+ (50000, 51000),
191
+ help="Range of localhost TCP ports to be used by Function Executors",
208
192
  ),
193
+ monitoring_server_host: Annotated[
194
+ str,
195
+ typer.Option(
196
+ "--monitoring-server-host",
197
+ help="IP address or hostname where to run Executor Monitoring server",
198
+ ),
199
+ ] = "localhost",
200
+ monitoring_server_port: Annotated[
201
+ int,
202
+ typer.Option(
203
+ "--monitoring-server-port",
204
+ help="Port where to run Executor Monitoring server",
205
+ ),
206
+ ] = 7000,
209
207
  disable_automatic_function_executor_management: Annotated[
210
208
  bool,
211
209
  typer.Option(
@@ -229,6 +227,7 @@ def executor(
229
227
 
230
228
  logger.info(
231
229
  "starting executor",
230
+ hostname=gethostname(),
232
231
  server_addr=server_addr,
233
232
  config_path=config_path,
234
233
  executor_version=executor_version,
@@ -236,6 +235,8 @@ def executor(
236
235
  ports=ports,
237
236
  functions=function_uris,
238
237
  dev_mode=dev,
238
+ monitoring_server_host=monitoring_server_host,
239
+ monitoring_server_port=monitoring_server_port,
239
240
  disable_automatic_function_executor_management=disable_automatic_function_executor_management,
240
241
  )
241
242
 
@@ -254,17 +255,26 @@ def executor(
254
255
  )
255
256
  exit(1)
256
257
 
258
+ prometheus_client.Info("cli", "CLI information").info(
259
+ {
260
+ "package": "indexify",
261
+ }
262
+ )
263
+
257
264
  Executor(
258
265
  id=id,
259
266
  version=executor_version,
260
- server_addr=server_addr,
261
- config_path=config_path,
267
+ health_checker=GenericHealthChecker(),
262
268
  code_path=executor_cache,
263
269
  function_allowlist=_parse_function_uris(function_uris),
264
270
  function_executor_server_factory=SubprocessFunctionExecutorServerFactory(
265
271
  development_mode=dev,
266
272
  server_ports=range(ports[0], ports[1]),
267
273
  ),
274
+ server_addr=server_addr,
275
+ config_path=config_path,
276
+ monitoring_server_host=monitoring_server_host,
277
+ monitoring_server_port=monitoring_server_port,
268
278
  disable_automatic_function_executor_management=disable_automatic_function_executor_management,
269
279
  ).run()
270
280
 
@@ -307,57 +317,7 @@ def _create_image(image: Image, python_sdk_path):
307
317
 
308
318
 
309
319
  def _build_image(image: Image, python_sdk_path: Optional[str] = None):
310
- docker_file = _generate_dockerfile(image, python_sdk_path=python_sdk_path)
311
- image_name = f"{image._image_name}:{image._tag}"
312
-
313
- # low_level_client = docker.APIClient(base_url=docker_client.api.base_url)
314
- docker_host = os.getenv("DOCKER_HOST", "unix:///var/run/docker.sock")
315
- low_level_client = docker.APIClient(base_url=docker_host)
316
- docker.api.build.process_dockerfile = lambda dockerfile, path: (
317
- "Dockerfile",
318
- dockerfile,
319
- )
320
- generator = low_level_client.build(
321
- dockerfile=docker_file,
322
- rm=True,
323
- path=".",
324
- tag=image_name,
325
- )
326
-
320
+ built_image, generator = image.build(python_sdk_path=python_sdk_path)
327
321
  for output in generator:
328
- for line in output.decode().splitlines():
329
- json_line = json.loads(line)
330
- if "stream" in json_line:
331
- print(json_line["stream"], end="")
332
-
333
- elif "errorDetail" in json_line:
334
- print(json_line["errorDetail"]["message"])
335
-
336
-
337
- def _generate_dockerfile(image, python_sdk_path: Optional[str] = None):
338
- docker_contents = [
339
- f"FROM {image._base_image}",
340
- "RUN mkdir -p ~/.indexify",
341
- f"RUN echo {image._image_name} > ~/.indexify/image_name", # TODO: Do we still use this in executors?
342
- f"RUN echo {image.hash()} > ~/.indexify/image_hash", # TODO: Do we still use this in executors?
343
- "WORKDIR /app",
344
- ]
345
-
346
- for build_op in image._build_ops:
347
- docker_contents.append(build_op.render())
348
-
349
- if python_sdk_path is not None:
350
- print(f"Building image {image._image_name} with local version of the SDK")
351
-
352
- if not os.path.exists(python_sdk_path):
353
- print(f"error: {python_sdk_path} does not exist")
354
- os.exit(1)
355
- docker_contents.append(f"COPY {python_sdk_path} /app/python-sdk")
356
- docker_contents.append("RUN (cd /app/python-sdk && pip install .)")
357
- else:
358
- docker_contents.append(
359
- f"RUN pip install indexify=={importlib.metadata.version('indexify')}"
360
- )
361
-
362
- docker_file = "\n".join(docker_contents)
363
- return docker_file
322
+ print(output)
323
+ print(f"built image: {built_image.tags[0]}")
@@ -45,3 +45,7 @@ class TaskResult(BaseModel):
45
45
  executor_id: str
46
46
  task_id: str
47
47
  reducer: bool = False
48
+
49
+
50
+ TASK_OUTCOME_SUCCESS = "success"
51
+ TASK_OUTCOME_FAILURE = "failure"
@@ -8,6 +8,21 @@ from tensorlake.function_executor.proto.function_executor_pb2 import SerializedO
8
8
  from tensorlake.utils.http_client import get_httpx_client
9
9
 
10
10
  from .api_objects import Task
11
+ from .metrics.downloader import (
12
+ metric_graph_download_errors,
13
+ metric_graph_download_latency,
14
+ metric_graph_downloads,
15
+ metric_graphs_from_cache,
16
+ metric_reducer_init_value_download_errors,
17
+ metric_reducer_init_value_download_latency,
18
+ metric_reducer_init_value_downloads,
19
+ metric_task_input_download_errors,
20
+ metric_task_input_download_latency,
21
+ metric_task_input_downloads,
22
+ metric_tasks_downloading_graphs,
23
+ metric_tasks_downloading_inputs,
24
+ metric_tasks_downloading_reducer_init_value,
25
+ )
11
26
 
12
27
 
13
28
  class Downloader:
@@ -19,6 +34,33 @@ class Downloader:
19
34
  self._client = get_httpx_client(config_path, make_async=True)
20
35
 
21
36
  async def download_graph(self, task: Task) -> SerializedObject:
37
+ with (
38
+ metric_graph_download_errors.count_exceptions(),
39
+ metric_tasks_downloading_graphs.track_inprogress(),
40
+ metric_graph_download_latency.time(),
41
+ ):
42
+ metric_graph_downloads.inc()
43
+ return await self._download_graph(task)
44
+
45
+ async def download_input(self, task: Task) -> SerializedObject:
46
+ with (
47
+ metric_task_input_download_errors.count_exceptions(),
48
+ metric_tasks_downloading_inputs.track_inprogress(),
49
+ metric_task_input_download_latency.time(),
50
+ ):
51
+ metric_task_input_downloads.inc()
52
+ return await self._download_input(task)
53
+
54
+ async def download_init_value(self, task: Task) -> SerializedObject:
55
+ with (
56
+ metric_reducer_init_value_download_errors.count_exceptions(),
57
+ metric_tasks_downloading_reducer_init_value.track_inprogress(),
58
+ metric_reducer_init_value_download_latency.time(),
59
+ ):
60
+ metric_reducer_init_value_downloads.inc()
61
+ return await self._download_init_value(task)
62
+
63
+ async def _download_graph(self, task: Task) -> SerializedObject:
22
64
  # Cache graph to reduce load on the server.
23
65
  graph_path = os.path.join(
24
66
  self.code_path,
@@ -33,6 +75,7 @@ class Downloader:
33
75
  self._read_cached_graph, graph_path
34
76
  )
35
77
  if graph is not None:
78
+ metric_graphs_from_cache.inc()
36
79
  return graph
37
80
 
38
81
  logger = self._task_logger(task)
@@ -71,7 +114,7 @@ class Downloader:
71
114
  # This also allows to share the same cache between multiple Executors.
72
115
  os.replace(tmp_path, path)
73
116
 
74
- async def download_input(self, task: Task) -> SerializedObject:
117
+ async def _download_input(self, task: Task) -> SerializedObject:
75
118
  logger = self._task_logger(task)
76
119
 
77
120
  first_function_in_graph = task.invocation_id == task.input_key.split("|")[-1]
@@ -81,10 +124,7 @@ class Downloader:
81
124
  else:
82
125
  return await self._fetch_function_input(task, logger)
83
126
 
84
- async def download_init_value(self, task: Task) -> Optional[SerializedObject]:
85
- if task.reducer_output_id is None:
86
- return None
87
-
127
+ async def _download_init_value(self, task: Task) -> SerializedObject:
88
128
  logger = self._task_logger(task)
89
129
  return await self._fetch_function_init_value(task, logger)
90
130
 
@@ -1,7 +1,8 @@
1
1
  import asyncio
2
2
  import signal
3
3
  from pathlib import Path
4
- from typing import Any, List, Optional
4
+ from socket import gethostname
5
+ from typing import Any, Dict, List, Optional
5
6
 
6
7
  import structlog
7
8
  from tensorlake.function_executor.proto.function_executor_pb2 import SerializedObject
@@ -9,13 +10,38 @@ from tensorlake.utils.logging import suppress as suppress_logging
9
10
 
10
11
  from .api_objects import FunctionURI, Task
11
12
  from .downloader import Downloader
13
+ from .function_executor.function_executor_states_container import (
14
+ FunctionExecutorStatesContainer,
15
+ )
12
16
  from .function_executor.server.function_executor_server_factory import (
13
17
  FunctionExecutorServerFactory,
14
18
  )
19
+ from .metrics.executor import (
20
+ METRIC_TASKS_COMPLETED_OUTCOME_ALL,
21
+ METRIC_TASKS_COMPLETED_OUTCOME_ERROR_CUSTOMER_CODE,
22
+ METRIC_TASKS_COMPLETED_OUTCOME_ERROR_PLATFORM,
23
+ METRIC_TASKS_COMPLETED_OUTCOME_SUCCESS,
24
+ metric_executor_info,
25
+ metric_executor_state,
26
+ metric_task_outcome_report_latency,
27
+ metric_task_outcome_report_retries,
28
+ metric_task_outcome_reports,
29
+ metric_tasks_completed,
30
+ metric_tasks_fetched,
31
+ metric_tasks_reporting_outcome,
32
+ )
33
+ from .monitoring.function_allowlist import function_allowlist_to_info_dict
34
+ from .monitoring.health_check_handler import HealthCheckHandler
35
+ from .monitoring.health_checker.health_checker import HealthChecker
36
+ from .monitoring.prometheus_metrics_handler import PrometheusMetricsHandler
37
+ from .monitoring.server import MonitoringServer
38
+ from .monitoring.startup_probe_handler import StartupProbeHandler
15
39
  from .task_fetcher import TaskFetcher
16
40
  from .task_reporter import TaskReporter
17
41
  from .task_runner import TaskInput, TaskOutput, TaskRunner
18
42
 
43
+ metric_executor_state.state("starting")
44
+
19
45
 
20
46
  class Executor:
21
47
  def __init__(
@@ -23,11 +49,14 @@ class Executor:
23
49
  id: str,
24
50
  version: str,
25
51
  code_path: Path,
52
+ health_checker: HealthChecker,
26
53
  function_allowlist: Optional[List[FunctionURI]],
27
54
  function_executor_server_factory: FunctionExecutorServerFactory,
28
- server_addr: str = "localhost:8900",
29
- config_path: Optional[str] = None,
30
- disable_automatic_function_executor_management: bool = False,
55
+ server_addr: str,
56
+ config_path: Optional[str],
57
+ monitoring_server_host: str,
58
+ monitoring_server_port: int,
59
+ disable_automatic_function_executor_management: bool,
31
60
  ):
32
61
  self._logger = structlog.get_logger(module=__name__)
33
62
  self._is_shutdown: bool = False
@@ -40,12 +69,25 @@ class Executor:
40
69
  self._server_addr = server_addr
41
70
  self._base_url = f"{protocol}://{self._server_addr}"
42
71
  self._code_path = code_path
72
+ self._startup_probe_handler = StartupProbeHandler()
73
+ self._monitoring_server = MonitoringServer(
74
+ host=monitoring_server_host,
75
+ port=monitoring_server_port,
76
+ startup_probe_handler=self._startup_probe_handler,
77
+ health_probe_handler=HealthCheckHandler(health_checker),
78
+ metrics_handler=PrometheusMetricsHandler(),
79
+ )
80
+ self._function_executor_states = FunctionExecutorStatesContainer()
81
+ health_checker.set_function_executor_states_container(
82
+ self._function_executor_states
83
+ )
43
84
  self._task_runner = TaskRunner(
44
85
  executor_id=id,
45
86
  function_executor_server_factory=function_executor_server_factory,
46
87
  base_url=self._base_url,
47
- config_path=config_path,
48
88
  disable_automatic_function_executor_management=disable_automatic_function_executor_management,
89
+ function_executor_states=self._function_executor_states,
90
+ config_path=config_path,
49
91
  )
50
92
  self._downloader = Downloader(
51
93
  code_path=code_path, base_url=self._base_url, config_path=config_path
@@ -63,8 +105,22 @@ class Executor:
63
105
  executor_id=id,
64
106
  config_path=self._config_path,
65
107
  )
108
+ executor_info: Dict[str, str] = {
109
+ "id": id,
110
+ "version": version,
111
+ "code_path": str(code_path),
112
+ "server_addr": server_addr,
113
+ "config_path": str(config_path),
114
+ "disable_automatic_function_executor_management": str(
115
+ disable_automatic_function_executor_management
116
+ ),
117
+ "hostname": gethostname(),
118
+ }
119
+ executor_info.update(function_allowlist_to_info_dict(function_allowlist))
120
+ metric_executor_info.info(executor_info)
66
121
 
67
122
  def run(self):
123
+ asyncio.new_event_loop()
68
124
  for signum in [
69
125
  signal.SIGABRT,
70
126
  signal.SIGINT,
@@ -76,15 +132,20 @@ class Executor:
76
132
  signum, self.shutdown, asyncio.get_event_loop()
77
133
  )
78
134
 
135
+ asyncio.get_event_loop().create_task(self._monitoring_server.run())
136
+
79
137
  try:
80
- asyncio.get_event_loop().run_until_complete(self._run_async())
138
+ asyncio.get_event_loop().run_until_complete(self._run_tasks_loop())
81
139
  except asyncio.CancelledError:
82
140
  pass # Suppress this expected exception and return without error (normally).
83
141
 
84
- async def _run_async(self):
142
+ async def _run_tasks_loop(self):
143
+ metric_executor_state.state("running")
144
+ self._startup_probe_handler.set_ready()
85
145
  while not self._is_shutdown:
86
146
  try:
87
147
  async for task in self._task_fetcher.run():
148
+ metric_tasks_fetched.inc()
88
149
  asyncio.create_task(self._run_task(task))
89
150
  except Exception as e:
90
151
  self._logger.error(
@@ -103,9 +164,10 @@ class Executor:
103
164
  graph: SerializedObject = await self._downloader.download_graph(task)
104
165
  input: SerializedObject = await self._downloader.download_input(task)
105
166
  init_value: Optional[SerializedObject] = (
106
- await self._downloader.download_init_value(task)
167
+ None
168
+ if task.reducer_output_id is None
169
+ else (await self._downloader.download_init_value(task))
107
170
  )
108
- logger.info("task_execution_started")
109
171
  output: TaskOutput = await self._task_runner.run(
110
172
  TaskInput(
111
173
  task=task,
@@ -115,15 +177,22 @@ class Executor:
115
177
  ),
116
178
  logger=logger,
117
179
  )
118
- logger.info("task_execution_finished", success=output.success)
180
+ logger.info("task execution finished", success=output.success)
119
181
  except Exception as e:
120
182
  output = TaskOutput.internal_error(task)
121
- logger.error("task_execution_failed", exc_info=e)
183
+ logger.error("task execution failed", exc_info=e)
122
184
 
123
- await self._report_task_outcome(output=output, logger=logger)
185
+ with (
186
+ metric_tasks_reporting_outcome.track_inprogress(),
187
+ metric_task_outcome_report_latency.time(),
188
+ ):
189
+ metric_task_outcome_reports.inc()
190
+ await self._report_task_outcome(output=output, logger=logger)
124
191
 
125
192
  async def _report_task_outcome(self, output: TaskOutput, logger: Any) -> None:
126
- """Reports the task with the given output to the server."""
193
+ """Reports the task with the given output to the server.
194
+
195
+ Doesn't raise any Exceptions. Runs till the reporting is successful."""
127
196
  reporting_retries: int = 0
128
197
 
129
198
  while True:
@@ -133,22 +202,40 @@ class Executor:
133
202
  break
134
203
  except Exception as e:
135
204
  logger.error(
136
- "failed_to_report_task",
205
+ "failed to report task",
137
206
  exc_info=e,
138
207
  )
139
208
  reporting_retries += 1
209
+ metric_task_outcome_report_retries.inc()
140
210
  await asyncio.sleep(5)
141
211
 
212
+ metric_tasks_completed.labels(outcome=METRIC_TASKS_COMPLETED_OUTCOME_ALL).inc()
213
+ if output.is_internal_error:
214
+ metric_tasks_completed.labels(
215
+ outcome=METRIC_TASKS_COMPLETED_OUTCOME_ERROR_PLATFORM
216
+ ).inc()
217
+ elif output.success:
218
+ metric_tasks_completed.labels(
219
+ outcome=METRIC_TASKS_COMPLETED_OUTCOME_SUCCESS
220
+ ).inc()
221
+ else:
222
+ metric_tasks_completed.labels(
223
+ outcome=METRIC_TASKS_COMPLETED_OUTCOME_ERROR_CUSTOMER_CODE
224
+ ).inc()
225
+
142
226
  async def _shutdown(self, loop):
143
- self._logger.info("shutting_down")
227
+ self._logger.info("shutting down")
228
+ metric_executor_state.state("shutting_down")
144
229
  # There will be lots of task cancellation exceptions and "X is shutting down"
145
230
  # exceptions logged during Executor shutdown. Suppress their logs as they are
146
231
  # expected and are confusing for users.
147
232
  suppress_logging()
148
233
 
149
234
  self._is_shutdown = True
235
+ await self._monitoring_server.shutdown()
150
236
  await self._task_runner.shutdown()
151
- # We mainly need to cancel the task that runs _run_async() loop.
237
+ await self._function_executor_states.shutdown()
238
+ # We mainly need to cancel the task that runs _run_tasks_loop().
152
239
  for task in asyncio.all_tasks(loop):
153
240
  task.cancel()
154
241