indexify 0.3.17__tar.gz → 0.3.18__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58) hide show
  1. {indexify-0.3.17 → indexify-0.3.18}/PKG-INFO +1 -1
  2. {indexify-0.3.17 → indexify-0.3.18}/pyproject.toml +1 -1
  3. {indexify-0.3.17 → indexify-0.3.18}/src/indexify/cli/cli.py +19 -2
  4. {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/executor.py +24 -9
  5. indexify-0.3.18/src/indexify/executor/executor_flavor.py +7 -0
  6. {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/function_executor/health_checker.py +20 -2
  7. indexify-0.3.18/src/indexify/executor/grpc/channel_manager.py +160 -0
  8. {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/grpc/state_reconciler.py +14 -9
  9. {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/grpc/state_reporter.py +72 -14
  10. {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/task_fetcher.py +8 -3
  11. {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/task_reporter.py +17 -0
  12. indexify-0.3.17/src/indexify/proto/task_scheduler.proto → indexify-0.3.18/src/indexify/proto/executor_api.proto +23 -6
  13. indexify-0.3.18/src/indexify/proto/executor_api_pb2.py +70 -0
  14. indexify-0.3.17/src/indexify/proto/task_scheduler_pb2.pyi → indexify-0.3.18/src/indexify/proto/executor_api_pb2.pyi +44 -4
  15. indexify-0.3.17/src/indexify/proto/task_scheduler_pb2_grpc.py → indexify-0.3.18/src/indexify/proto/executor_api_pb2_grpc.py +36 -26
  16. indexify-0.3.17/src/indexify/executor/grpc/channel_creator.py +0 -53
  17. indexify-0.3.17/src/indexify/proto/task_scheduler_pb2.py +0 -64
  18. {indexify-0.3.17 → indexify-0.3.18}/README.md +0 -0
  19. {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/README.md +0 -0
  20. {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/api_objects.py +0 -0
  21. {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/downloader.py +0 -0
  22. {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/function_executor/function_executor.py +0 -0
  23. {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/function_executor/function_executor_state.py +0 -0
  24. {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/function_executor/function_executor_states_container.py +0 -0
  25. {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/function_executor/function_executor_status.py +0 -0
  26. {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/function_executor/invocation_state_client.py +0 -0
  27. {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/function_executor/metrics/function_executor.py +0 -0
  28. {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/function_executor/metrics/function_executor_state.py +0 -0
  29. {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/function_executor/metrics/function_executor_state_container.py +0 -0
  30. {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/function_executor/metrics/health_checker.py +0 -0
  31. {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/function_executor/metrics/invocation_state_client.py +0 -0
  32. {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/function_executor/metrics/single_task_runner.py +0 -0
  33. {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/function_executor/server/client_configuration.py +0 -0
  34. {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/function_executor/server/function_executor_server.py +0 -0
  35. {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/function_executor/server/function_executor_server_factory.py +0 -0
  36. {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/function_executor/server/subprocess_function_executor_server.py +0 -0
  37. {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/function_executor/server/subprocess_function_executor_server_factory.py +0 -0
  38. {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/function_executor/single_task_runner.py +0 -0
  39. {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/function_executor/task_input.py +0 -0
  40. {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/function_executor/task_output.py +0 -0
  41. /indexify-0.3.17/src/indexify/executor/grpc/metrics/channel_creator.py → /indexify-0.3.18/src/indexify/executor/grpc/metrics/channel_manager.py +0 -0
  42. {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/grpc/metrics/state_reporter.py +0 -0
  43. {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/metrics/downloader.py +0 -0
  44. {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/metrics/executor.py +0 -0
  45. {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/metrics/task_fetcher.py +0 -0
  46. {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/metrics/task_reporter.py +0 -0
  47. {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/metrics/task_runner.py +0 -0
  48. {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/monitoring/function_allowlist.py +0 -0
  49. {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/monitoring/handler.py +0 -0
  50. {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/monitoring/health_check_handler.py +0 -0
  51. {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/monitoring/health_checker/generic_health_checker.py +0 -0
  52. {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/monitoring/health_checker/health_checker.py +0 -0
  53. {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/monitoring/metrics.py +0 -0
  54. {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/monitoring/prometheus_metrics_handler.py +0 -0
  55. {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/monitoring/server.py +0 -0
  56. {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/monitoring/startup_probe_handler.py +0 -0
  57. {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/runtime_probes.py +0 -0
  58. {indexify-0.3.17 → indexify-0.3.18}/src/indexify/executor/task_runner.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: indexify
3
- Version: 0.3.17
3
+ Version: 0.3.18
4
4
  Summary: Open Source Indexify components and helper tools
5
5
  Home-page: https://github.com/tensorlakeai/indexify
6
6
  License: Apache 2.0
@@ -1,7 +1,7 @@
1
1
  [tool.poetry]
2
2
  name = "indexify"
3
3
  # Incremented if any of the components provided in this package are updated.
4
- version = "0.3.17"
4
+ version = "0.3.18"
5
5
  description = "Open Source Indexify components and helper tools"
6
6
  authors = ["Tensorlake Inc. <support@tensorlake.ai>"]
7
7
  license = "Apache 2.0"
@@ -13,7 +13,7 @@ import sys
13
13
  from importlib.metadata import version
14
14
  from pathlib import Path
15
15
  from socket import gethostname
16
- from typing import Annotated, List, Optional, Tuple
16
+ from typing import Annotated, Dict, List, Optional, Tuple
17
17
 
18
18
  import nanoid
19
19
  import prometheus_client
@@ -26,6 +26,7 @@ from tensorlake.functions_sdk.image import Image
26
26
 
27
27
  from indexify.executor.api_objects import FunctionURI
28
28
  from indexify.executor.executor import Executor
29
+ from indexify.executor.executor_flavor import ExecutorFlavor
29
30
  from indexify.executor.function_executor.server.subprocess_function_executor_server_factory import (
30
31
  SubprocessFunctionExecutorServerFactory,
31
32
  )
@@ -119,7 +120,6 @@ def executor(
119
120
  help="Port where to run Executor Monitoring server",
120
121
  ),
121
122
  ] = 7000,
122
- # TODO: Figure out mTLS for gRPC.
123
123
  grpc_server_addr: Annotated[
124
124
  Optional[str],
125
125
  typer.Option(
@@ -140,6 +140,15 @@ def executor(
140
140
  ),
141
141
  ),
142
142
  ] = False,
143
+ labels: Annotated[
144
+ List[str],
145
+ typer.Option(
146
+ "--label",
147
+ "-l",
148
+ help="Executor key-value label to be sent to the Server. "
149
+ "Specified as <key>=<value>",
150
+ ),
151
+ ] = [],
143
152
  ):
144
153
  if dev:
145
154
  configure_development_mode_logging()
@@ -162,6 +171,11 @@ def executor(
162
171
  "--grpc-server-addr must be set when --enable-grpc-state-reconciler is set"
163
172
  )
164
173
 
174
+ kv_labels: Dict[str, str] = {}
175
+ for label in labels:
176
+ key, value = label.split("=")
177
+ kv_labels[key] = value
178
+
165
179
  executor_version = version("indexify")
166
180
  logger = structlog.get_logger(module=__name__, executor_id=executor_id)
167
181
 
@@ -171,6 +185,7 @@ def executor(
171
185
  server_addr=server_addr,
172
186
  config_path=config_path,
173
187
  executor_version=executor_version,
188
+ labels=kv_labels,
174
189
  executor_cache=executor_cache,
175
190
  ports=ports,
176
191
  functions=function_uris,
@@ -205,7 +220,9 @@ def executor(
205
220
  Executor(
206
221
  id=executor_id,
207
222
  development_mode=dev,
223
+ flavor=ExecutorFlavor.OSS,
208
224
  version=executor_version,
225
+ labels=kv_labels,
209
226
  health_checker=GenericHealthChecker(),
210
227
  code_path=executor_cache,
211
228
  function_allowlist=_parse_function_uris(function_uris),
@@ -9,17 +9,18 @@ import structlog
9
9
  from tensorlake.function_executor.proto.function_executor_pb2 import SerializedObject
10
10
  from tensorlake.utils.logging import suppress as suppress_logging
11
11
 
12
- from indexify.proto.task_scheduler_pb2 import ExecutorStatus
12
+ from indexify.proto.executor_api_pb2 import ExecutorStatus
13
13
 
14
14
  from .api_objects import FunctionURI, Task
15
15
  from .downloader import Downloader
16
+ from .executor_flavor import ExecutorFlavor
16
17
  from .function_executor.function_executor_states_container import (
17
18
  FunctionExecutorStatesContainer,
18
19
  )
19
20
  from .function_executor.server.function_executor_server_factory import (
20
21
  FunctionExecutorServerFactory,
21
22
  )
22
- from .grpc.channel_creator import ChannelCreator
23
+ from .grpc.channel_manager import ChannelManager
23
24
  from .grpc.state_reconciler import ExecutorStateReconciler
24
25
  from .grpc.state_reporter import ExecutorStateReporter
25
26
  from .metrics.executor import (
@@ -55,7 +56,9 @@ class Executor:
55
56
  self,
56
57
  id: str,
57
58
  development_mode: bool,
59
+ flavor: ExecutorFlavor,
58
60
  version: str,
61
+ labels: Dict[str, str],
59
62
  code_path: Path,
60
63
  health_checker: HealthChecker,
61
64
  function_allowlist: Optional[List[FunctionURI]],
@@ -106,18 +109,25 @@ class Executor:
106
109
  self._task_runner: Optional[TaskRunner] = None
107
110
  self._task_fetcher: Optional[TaskFetcher] = None
108
111
  # gRPC mode services
109
- self._channel_creator: Optional[ChannelCreator] = None
112
+ self._channel_manager: Optional[ChannelManager] = None
110
113
  self._state_reporter: Optional[ExecutorStateReporter] = None
111
114
  self._state_reconciler: Optional[ExecutorStateReconciler] = None
112
115
 
113
116
  if grpc_server_addr is not None:
114
- self._channel_creator = ChannelCreator(grpc_server_addr, self._logger)
117
+ self._channel_manager = ChannelManager(
118
+ server_address=grpc_server_addr,
119
+ config_path=config_path,
120
+ logger=self._logger,
121
+ )
115
122
  self._state_reporter = ExecutorStateReporter(
116
123
  executor_id=id,
124
+ flavor=flavor,
125
+ version=version,
126
+ labels=labels,
117
127
  development_mode=development_mode,
118
128
  function_allowlist=self._function_allowlist,
119
129
  function_executor_states=self._function_executor_states,
120
- channel_creator=self._channel_creator,
130
+ channel_manager=self._channel_manager,
121
131
  logger=self._logger,
122
132
  )
123
133
  self._state_reporter.update_executor_status(
@@ -133,7 +143,8 @@ class Executor:
133
143
  config_path=config_path,
134
144
  downloader=self._downloader,
135
145
  task_reporter=self._task_reporter,
136
- channel_creator=self._channel_creator,
146
+ channel_manager=self._channel_manager,
147
+ state_reporter=self._state_reporter,
137
148
  logger=self._logger,
138
149
  )
139
150
  else:
@@ -147,6 +158,7 @@ class Executor:
147
158
  self._task_fetcher = TaskFetcher(
148
159
  executor_id=id,
149
160
  executor_version=version,
161
+ labels=labels,
150
162
  function_allowlist=function_allowlist,
151
163
  protocol=protocol,
152
164
  indexify_server_addr=self._server_addr,
@@ -326,7 +338,9 @@ class Executor:
326
338
  ).inc()
327
339
 
328
340
  async def _shutdown(self, loop):
329
- self._logger.info("shutting down")
341
+ self._logger.info(
342
+ "shutting down, all Executor logs are suppressed, no task outcomes will be reported to Server from this point"
343
+ )
330
344
  if self._state_reporter is not None:
331
345
  self._state_reporter.update_executor_status(
332
346
  ExecutorStatus.EXECUTOR_STATUS_STOPPING
@@ -339,12 +353,13 @@ class Executor:
339
353
 
340
354
  self._is_shutdown = True
341
355
  await self._monitoring_server.shutdown()
356
+ await self._task_reporter.shutdown()
342
357
 
343
358
  if self._task_runner is not None:
344
359
  await self._task_runner.shutdown()
345
360
 
346
- if self._channel_creator is not None:
347
- await self._channel_creator.shutdown()
361
+ if self._channel_manager is not None:
362
+ await self._channel_manager.shutdown()
348
363
  if self._state_reporter is not None:
349
364
  await self._state_reporter.shutdown()
350
365
  if self._state_reconciler is not None:
@@ -0,0 +1,7 @@
1
+ from enum import Enum
2
+
3
+
4
+ class ExecutorFlavor(Enum):
5
+ UNKNOWN = "unknown"
6
+ OSS = "oss"
7
+ PLATFORM = "platform"
@@ -70,8 +70,10 @@ class HealthChecker:
70
70
  # code is not involved when TCP connections are established to FE. Problems reestablishing
71
71
  # the TCP connection are usually due to the FE process crashing and its gRPC server socket
72
72
  # not being available anymore or due to prolonged local networking failures on Executor.
73
- channel_connectivity = self._channel.get_state()
74
- if channel_connectivity == grpc.ChannelConnectivity.TRANSIENT_FAILURE:
73
+ if (
74
+ _channel_state(self._channel, self._logger)
75
+ == grpc.ChannelConnectivity.TRANSIENT_FAILURE
76
+ ):
75
77
  return HealthCheckResult(
76
78
  is_healthy=False,
77
79
  reason="Channel is in TRANSIENT_FAILURE state, assuming Function Executor crashed.",
@@ -126,3 +128,19 @@ class HealthChecker:
126
128
 
127
129
  asyncio.create_task(self._health_check_failed_callback(result))
128
130
  self._health_check_loop_task = None
131
+
132
+
133
+ def _channel_state(channel: grpc.aio.Channel, logger: Any) -> grpc.ChannelConnectivity:
134
+ """Gets channel connectivity state and suppresses all exceptions.
135
+
136
+ Suppressing the exceptions is important because the channel connectivity state is an experimental
137
+ feature. On error falls back to READY state which assumes that the channel is okay.
138
+ """
139
+ try:
140
+ return channel.get_state()
141
+ except Exception as e:
142
+ logger.error(
143
+ "Failed getting channel state, falling back to default READY state",
144
+ exc_info=e,
145
+ )
146
+ return grpc.ChannelConnectivity.READY
@@ -0,0 +1,160 @@
1
+ import asyncio
2
+ from typing import Any, Dict, Optional
3
+
4
+ import grpc.aio
5
+ import yaml
6
+
7
+ from .metrics.channel_manager import (
8
+ metric_grpc_server_channel_creation_latency,
9
+ metric_grpc_server_channel_creation_retries,
10
+ metric_grpc_server_channel_creations,
11
+ )
12
+
13
+ _RETRY_INTERVAL_SEC = 5
14
+ _CONNECT_TIMEOUT_SEC = 5
15
+
16
+
17
+ class ChannelManager:
18
+ def __init__(self, server_address: str, config_path: Optional[str], logger: Any):
19
+ self._logger: Any = logger.bind(module=__name__, server_address=server_address)
20
+ self._server_address: str = server_address
21
+ self._channel_credentials: Optional[grpc.ChannelCredentials] = None
22
+ # This lock protects the fields below.
23
+ self._lock = asyncio.Lock()
24
+ self._channel: Optional[grpc.aio.Channel] = None
25
+
26
+ self._init_tls(config_path)
27
+
28
+ def _init_tls(self, config_path: Optional[str]):
29
+ if config_path is None:
30
+ return
31
+
32
+ # The same config file format as in Tensorlake SDK HTTP client, see:
33
+ # https://github.com/tensorlakeai/tensorlake/blob/main/src/tensorlake/utils/http_client.py
34
+ with open(config_path, "r") as config_file:
35
+ config = yaml.safe_load(config_file)
36
+
37
+ if not config.get("use_tls", False):
38
+ return
39
+
40
+ tls_config: Dict[str, str] = config["tls_config"]
41
+ cert_path: Optional[str] = tls_config.get("cert_path", None)
42
+ key_path: Optional[str] = tls_config.get("key_path", None)
43
+ ca_bundle_path: Optional[str] = tls_config.get("ca_bundle_path", None)
44
+
45
+ self._logger = self._logger.bind(
46
+ cert_path=cert_path,
47
+ key_path=key_path,
48
+ ca_bundle_path=ca_bundle_path,
49
+ )
50
+ self._logger.info("TLS is enabled for grpc channels to server")
51
+
52
+ private_key: Optional[bytes] = None
53
+ certificate_chain: Optional[bytes] = None
54
+ root_certificates: Optional[bytes] = None
55
+
56
+ if cert_path is not None:
57
+ with open(cert_path, "rb") as cert_file:
58
+ certificate_chain = cert_file.read()
59
+ if key_path is not None:
60
+ with open(key_path, "rb") as key_file:
61
+ private_key = key_file.read()
62
+ if ca_bundle_path is not None:
63
+ with open(ca_bundle_path, "rb") as ca_bundle_file:
64
+ root_certificates = ca_bundle_file.read()
65
+
66
+ self._channel_credentials = grpc.ssl_channel_credentials(
67
+ root_certificates=root_certificates,
68
+ private_key=private_key,
69
+ certificate_chain=certificate_chain,
70
+ )
71
+
72
+ async def get_channel(self) -> grpc.aio.Channel:
73
+ """Returns a channel to the gRPC server.
74
+
75
+ Returns a ready to use channel. Blocks until the channel is ready,
76
+ never raises any exceptions.
77
+ If previously returned channel is healthy then returns it again.
78
+ Otherwise, returns a new channel but closes the previously returned one.
79
+ """
80
+ # Use the lock to ensure that we only create one channel without race conditions.
81
+ async with self._lock:
82
+ if self._channel is None:
83
+ self._channel = await self._create_channel()
84
+ elif not await self._locked_channel_is_healthy():
85
+ self._logger.info("grpc channel to server is unhealthy")
86
+ await self._destroy_locked_channel()
87
+ self._channel = await self._create_channel()
88
+
89
+ return self._channel
90
+
91
+ async def _create_channel(self) -> grpc.aio.Channel:
92
+ """Creates a new channel to the gRPC server.
93
+
94
+ Returns a ready to use channel. Blocks until the channel
95
+ is ready, never raises any exceptions.
96
+ """
97
+ self._logger.info("creating new grpc server channel")
98
+
99
+ with metric_grpc_server_channel_creation_latency.time():
100
+ metric_grpc_server_channel_creations.inc()
101
+ while True:
102
+ try:
103
+ if self._channel_credentials is None:
104
+ channel = grpc.aio.insecure_channel(target=self._server_address)
105
+ else:
106
+ channel = grpc.aio.secure_channel(
107
+ target=self._server_address,
108
+ credentials=self._channel_credentials,
109
+ )
110
+
111
+ await asyncio.wait_for(
112
+ channel.channel_ready(),
113
+ timeout=_CONNECT_TIMEOUT_SEC,
114
+ )
115
+ return channel
116
+ except Exception:
117
+ self._logger.error(
118
+ f"failed establishing grpc server channel in {_CONNECT_TIMEOUT_SEC} sec, retrying in {_RETRY_INTERVAL_SEC} sec"
119
+ )
120
+ try:
121
+ await channel.close()
122
+ except Exception as e:
123
+ self._logger.error(
124
+ "failed closing not established channel", exc_info=e
125
+ )
126
+
127
+ metric_grpc_server_channel_creation_retries.inc()
128
+ await asyncio.sleep(_RETRY_INTERVAL_SEC)
129
+
130
+ async def _locked_channel_is_healthy(self) -> bool:
131
+ """Checks if the channel is healthy.
132
+
133
+ Returns True if the channel is healthy, False otherwise.
134
+ self._lock must be acquired before calling this method.
135
+ Never raises any exceptions.
136
+ """
137
+ try:
138
+ return self._channel.get_state() == grpc.ChannelConnectivity.READY
139
+ except Exception as e:
140
+ # Assume that the channel is healthy because get_state() method is marked as experimental
141
+ # so we can't fully trust it.
142
+ self._logger.error(
143
+ "failed getting channel state, assuming channel is healthy", exc_info=e
144
+ )
145
+ return True
146
+
147
+ async def _destroy_locked_channel(self):
148
+ """Closes the existing channel.
149
+
150
+ self._lock must be acquired before calling this method.
151
+ Never raises any exceptions.
152
+ """
153
+ try:
154
+ await self._channel.close()
155
+ except Exception as e:
156
+ self._logger.error("failed closing channel", exc_info=e)
157
+ self._channel = None
158
+
159
+ async def shutdown(self):
160
+ pass
@@ -7,14 +7,14 @@ from tensorlake.function_executor.proto.function_executor_pb2 import (
7
7
  SerializedObject,
8
8
  )
9
9
 
10
- from indexify.proto.task_scheduler_pb2 import (
10
+ from indexify.proto.executor_api_pb2 import (
11
11
  DesiredExecutorState,
12
12
  FunctionExecutorDescription,
13
13
  FunctionExecutorStatus,
14
14
  GetDesiredExecutorStatesRequest,
15
15
  )
16
- from indexify.proto.task_scheduler_pb2_grpc import (
17
- TaskSchedulerServiceStub,
16
+ from indexify.proto.executor_api_pb2_grpc import (
17
+ ExecutorAPIStub,
18
18
  )
19
19
 
20
20
  from ..downloader import Downloader
@@ -43,7 +43,8 @@ from ..metrics.executor import (
43
43
  metric_tasks_reporting_outcome,
44
44
  )
45
45
  from ..task_reporter import TaskReporter
46
- from .channel_creator import ChannelCreator
46
+ from .channel_manager import ChannelManager
47
+ from .state_reporter import ExecutorStateReporter
47
48
 
48
49
  _RECONCILE_STREAM_BACKOFF_INTERVAL_SEC = 5
49
50
 
@@ -58,7 +59,8 @@ class ExecutorStateReconciler:
58
59
  config_path: Optional[str],
59
60
  downloader: Downloader,
60
61
  task_reporter: TaskReporter,
61
- channel_creator: ChannelCreator,
62
+ channel_manager: ChannelManager,
63
+ state_reporter: ExecutorStateReporter,
62
64
  logger: Any,
63
65
  ):
64
66
  self._executor_id: str = executor_id
@@ -72,7 +74,8 @@ class ExecutorStateReconciler:
72
74
  self._function_executor_states: FunctionExecutorStatesContainer = (
73
75
  function_executor_states
74
76
  )
75
- self._channel_creator = channel_creator
77
+ self._channel_manager: ChannelManager = channel_manager
78
+ self._state_reporter: ExecutorStateReporter = state_reporter
76
79
  self._logger: Any = logger.bind(module=__name__)
77
80
  self._is_shutdown: bool = False
78
81
  self._server_last_clock: Optional[int] = None
@@ -83,12 +86,14 @@ class ExecutorStateReconciler:
83
86
  Never raises any exceptions.
84
87
  """
85
88
  while not self._is_shutdown:
86
- async with await self._channel_creator.create() as server_channel:
89
+ async with await self._channel_manager.get_channel() as server_channel:
87
90
  server_channel: grpc.aio.Channel
88
- stub = TaskSchedulerServiceStub(server_channel)
91
+ stub = ExecutorAPIStub(server_channel)
89
92
  while not self._is_shutdown:
90
93
  try:
91
- # TODO: Report state once before starting the stream.
94
+ # Report state once before starting the stream so Server
95
+ # doesn't use old state it knew about this Executor in the past.
96
+ await self._state_reporter.report_state(stub)
92
97
  desired_states_stream: AsyncGenerator[
93
98
  DesiredExecutorState, None
94
99
  ] = stub.get_desired_executor_states(
@@ -1,37 +1,44 @@
1
1
  import asyncio
2
+ import hashlib
3
+ from socket import gethostname
2
4
  from typing import Any, Dict, List, Optional
3
5
 
4
6
  import grpc
5
7
 
6
- from indexify.proto.task_scheduler_pb2 import (
8
+ from indexify.proto.executor_api_pb2 import (
7
9
  AllowedFunction,
10
+ )
11
+ from indexify.proto.executor_api_pb2 import ExecutorFlavor as ExecutorFlavorProto
12
+ from indexify.proto.executor_api_pb2 import (
8
13
  ExecutorState,
9
14
  ExecutorStatus,
10
15
  FunctionExecutorDescription,
11
16
  )
12
- from indexify.proto.task_scheduler_pb2 import (
17
+ from indexify.proto.executor_api_pb2 import (
13
18
  FunctionExecutorState as FunctionExecutorStateProto,
14
19
  )
15
- from indexify.proto.task_scheduler_pb2 import (
20
+ from indexify.proto.executor_api_pb2 import (
16
21
  FunctionExecutorStatus as FunctionExecutorStatusProto,
17
22
  )
18
- from indexify.proto.task_scheduler_pb2 import (
23
+ from indexify.proto.executor_api_pb2 import (
19
24
  GPUModel,
20
25
  GPUResources,
21
26
  HostResources,
22
27
  ReportExecutorStateRequest,
23
28
  )
24
- from indexify.proto.task_scheduler_pb2_grpc import (
25
- TaskSchedulerServiceStub,
29
+ from indexify.proto.executor_api_pb2_grpc import (
30
+ ExecutorAPIStub,
26
31
  )
27
32
 
28
33
  from ..api_objects import FunctionURI
34
+ from ..executor_flavor import ExecutorFlavor
29
35
  from ..function_executor.function_executor_state import FunctionExecutorState
30
36
  from ..function_executor.function_executor_states_container import (
31
37
  FunctionExecutorStatesContainer,
32
38
  )
33
39
  from ..function_executor.function_executor_status import FunctionExecutorStatus
34
- from .channel_creator import ChannelCreator
40
+ from ..runtime_probes import RuntimeProbes
41
+ from .channel_manager import ChannelManager
35
42
  from .metrics.state_reporter import (
36
43
  metric_state_report_errors,
37
44
  metric_state_report_latency,
@@ -47,24 +54,32 @@ class ExecutorStateReporter:
47
54
  def __init__(
48
55
  self,
49
56
  executor_id: str,
57
+ flavor: ExecutorFlavor,
58
+ version: str,
59
+ labels: Dict[str, str],
50
60
  development_mode: bool,
51
61
  function_allowlist: Optional[List[FunctionURI]],
52
62
  function_executor_states: FunctionExecutorStatesContainer,
53
- channel_creator: ChannelCreator,
63
+ channel_manager: ChannelManager,
54
64
  logger: Any,
55
65
  ):
56
66
  self._executor_id: str = executor_id
67
+ self._flavor: ExecutorFlavor = flavor
68
+ self._version: str = version
69
+ self._labels: Dict[str, str] = labels.copy()
57
70
  self._development_mode: bool = development_mode
71
+ self._hostname: str = gethostname()
58
72
  self._function_executor_states: FunctionExecutorStatesContainer = (
59
73
  function_executor_states
60
74
  )
61
- self._channel_creator = channel_creator
75
+ self._channel_manager = channel_manager
62
76
  self._logger: Any = logger.bind(module=__name__)
63
77
  self._is_shutdown: bool = False
64
78
  self._executor_status: ExecutorStatus = ExecutorStatus.EXECUTOR_STATUS_UNKNOWN
65
79
  self._allowed_functions: List[AllowedFunction] = _to_grpc_allowed_functions(
66
80
  function_allowlist
67
81
  )
82
+ self._labels.update(_label_values_to_strings(RuntimeProbes().probe().labels))
68
83
 
69
84
  def update_executor_status(self, value: ExecutorStatus):
70
85
  self._executor_status = value
@@ -75,12 +90,16 @@ class ExecutorStateReporter:
75
90
  Never raises any exceptions.
76
91
  """
77
92
  while not self._is_shutdown:
78
- async with await self._channel_creator.create() as server_channel:
93
+ async with await self._channel_manager.get_channel() as server_channel:
79
94
  server_channel: grpc.aio.Channel
80
- stub = TaskSchedulerServiceStub(server_channel)
95
+ stub = ExecutorAPIStub(server_channel)
81
96
  while not self._is_shutdown:
82
97
  try:
83
- await self._report_state(stub)
98
+ # The periodic state reports serve as channel health monitoring requests
99
+ # (same as TCP keep-alive). Channel Manager returns the same healthy channel
100
+ # for all RPCs that we do from Executor to Server. So all the RPCs benefit
101
+ # from this channel health monitoring.
102
+ await self.report_state(stub)
84
103
  await asyncio.sleep(_REPORTING_INTERVAL_SEC)
85
104
  except Exception as e:
86
105
  self._logger.error(
@@ -92,7 +111,11 @@ class ExecutorStateReporter:
92
111
 
93
112
  self._logger.info("State reporter shutdown")
94
113
 
95
- async def _report_state(self, stub: TaskSchedulerServiceStub):
114
+ async def report_state(self, stub: ExecutorAPIStub):
115
+ """Reports the current state to the server represented by the supplied stub.
116
+
117
+ Raises exceptions on failure.
118
+ """
96
119
  with (
97
120
  metric_state_report_errors.count_exceptions(),
98
121
  metric_state_report_latency.time(),
@@ -101,11 +124,16 @@ class ExecutorStateReporter:
101
124
  state = ExecutorState(
102
125
  executor_id=self._executor_id,
103
126
  development_mode=self._development_mode,
104
- executor_status=self._executor_status,
127
+ hostname=self._hostname,
128
+ flavor=_to_grpc_executor_flavor(self._flavor, self._logger),
129
+ version=self._version,
130
+ status=self._executor_status,
105
131
  free_resources=await self._fetch_free_host_resources(),
106
132
  allowed_functions=self._allowed_functions,
107
133
  function_executor_states=await self._fetch_function_executor_states(),
134
+ labels=self._labels,
108
135
  )
136
+ state.state_hash = _state_hash(state)
109
137
 
110
138
  await stub.report_executor_state(
111
139
  ReportExecutorStateRequest(executor_state=state),
@@ -197,3 +225,33 @@ def _to_grpc_function_executor_status(
197
225
  logger.error("Unexpected Function Executor status", status=status)
198
226
 
199
227
  return result
228
+
229
+
230
+ _FLAVOR_MAPPING = {
231
+ ExecutorFlavor.OSS: ExecutorFlavorProto.EXECUTOR_FLAVOR_OSS,
232
+ ExecutorFlavor.PLATFORM: ExecutorFlavorProto.EXECUTOR_FLAVOR_PLATFORM,
233
+ }
234
+
235
+
236
+ def _to_grpc_executor_flavor(
237
+ flavor: ExecutorFlavor, logger: Any
238
+ ) -> ExecutorFlavorProto:
239
+ result: ExecutorFlavorProto = _FLAVOR_MAPPING.get(
240
+ flavor, ExecutorFlavorProto.EXECUTOR_FLAVOR_UNKNOWN
241
+ )
242
+
243
+ if result == ExecutorFlavorProto.EXECUTOR_FLAVOR_UNKNOWN:
244
+ logger.error("Unexpected Executor flavor", flavor=flavor)
245
+
246
+ return result
247
+
248
+
249
+ def _label_values_to_strings(labels: Dict[str, Any]) -> Dict[str, str]:
250
+ return {k: str(v) for k, v in labels.items()}
251
+
252
+
253
+ def _state_hash(state: ExecutorState) -> str:
254
+ serialized_state: bytes = state.SerializeToString(deterministic=True)
255
+ hasher = hashlib.sha256(usedforsecurity=False)
256
+ hasher.update(serialized_state)
257
+ return hasher.hexdigest()
@@ -1,6 +1,7 @@
1
1
  import json
2
2
  import time
3
- from typing import AsyncGenerator, List, Optional
3
+ from socket import gethostname
4
+ from typing import AsyncGenerator, Dict, List, Optional
4
5
 
5
6
  import structlog
6
7
  from httpx_sse import aconnect_sse
@@ -22,6 +23,7 @@ class TaskFetcher:
22
23
  self,
23
24
  executor_id: str,
24
25
  executor_version: str,
26
+ labels: Dict[str, str],
25
27
  function_allowlist: Optional[List[FunctionURI]],
26
28
  protocol: str,
27
29
  indexify_server_addr: str,
@@ -33,12 +35,15 @@ class TaskFetcher:
33
35
  self._logger = structlog.get_logger(module=__name__)
34
36
 
35
37
  probe_info: ProbeInfo = RuntimeProbes().probe()
38
+ all_labels = probe_info.labels.copy()
39
+ all_labels.update(labels)
40
+
36
41
  self._executor_metadata: ExecutorMetadata = ExecutorMetadata(
37
42
  id=executor_id,
38
43
  executor_version=executor_version,
39
- addr="",
44
+ addr=gethostname(),
40
45
  function_allowlist=function_allowlist,
41
- labels=probe_info.labels,
46
+ labels=all_labels,
42
47
  )
43
48
 
44
49
  async def run(self) -> AsyncGenerator[Task, None]: