indexify 0.3.16__tar.gz → 0.3.18__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. {indexify-0.3.16 → indexify-0.3.18}/PKG-INFO +1 -1
  2. {indexify-0.3.16 → indexify-0.3.18}/pyproject.toml +1 -1
  3. {indexify-0.3.16 → indexify-0.3.18}/src/indexify/cli/cli.py +19 -2
  4. {indexify-0.3.16 → indexify-0.3.18}/src/indexify/executor/executor.py +24 -9
  5. indexify-0.3.18/src/indexify/executor/executor_flavor.py +7 -0
  6. {indexify-0.3.16 → indexify-0.3.18}/src/indexify/executor/function_executor/function_executor.py +5 -2
  7. {indexify-0.3.16 → indexify-0.3.18}/src/indexify/executor/function_executor/health_checker.py +55 -13
  8. indexify-0.3.18/src/indexify/executor/grpc/channel_manager.py +160 -0
  9. {indexify-0.3.16 → indexify-0.3.18}/src/indexify/executor/grpc/state_reconciler.py +14 -9
  10. {indexify-0.3.16 → indexify-0.3.18}/src/indexify/executor/grpc/state_reporter.py +72 -14
  11. {indexify-0.3.16 → indexify-0.3.18}/src/indexify/executor/metrics/task_runner.py +7 -0
  12. {indexify-0.3.16 → indexify-0.3.18}/src/indexify/executor/task_fetcher.py +8 -3
  13. {indexify-0.3.16 → indexify-0.3.18}/src/indexify/executor/task_reporter.py +17 -0
  14. {indexify-0.3.16 → indexify-0.3.18}/src/indexify/executor/task_runner.py +4 -0
  15. indexify-0.3.16/src/indexify/proto/task_scheduler.proto → indexify-0.3.18/src/indexify/proto/executor_api.proto +23 -6
  16. indexify-0.3.18/src/indexify/proto/executor_api_pb2.py +70 -0
  17. indexify-0.3.16/src/indexify/proto/task_scheduler_pb2.pyi → indexify-0.3.18/src/indexify/proto/executor_api_pb2.pyi +44 -4
  18. indexify-0.3.16/src/indexify/proto/task_scheduler_pb2_grpc.py → indexify-0.3.18/src/indexify/proto/executor_api_pb2_grpc.py +36 -26
  19. indexify-0.3.16/src/indexify/executor/grpc/channel_creator.py +0 -53
  20. indexify-0.3.16/src/indexify/proto/task_scheduler_pb2.py +0 -64
  21. {indexify-0.3.16 → indexify-0.3.18}/README.md +0 -0
  22. {indexify-0.3.16 → indexify-0.3.18}/src/indexify/executor/README.md +0 -0
  23. {indexify-0.3.16 → indexify-0.3.18}/src/indexify/executor/api_objects.py +0 -0
  24. {indexify-0.3.16 → indexify-0.3.18}/src/indexify/executor/downloader.py +0 -0
  25. {indexify-0.3.16 → indexify-0.3.18}/src/indexify/executor/function_executor/function_executor_state.py +0 -0
  26. {indexify-0.3.16 → indexify-0.3.18}/src/indexify/executor/function_executor/function_executor_states_container.py +0 -0
  27. {indexify-0.3.16 → indexify-0.3.18}/src/indexify/executor/function_executor/function_executor_status.py +0 -0
  28. {indexify-0.3.16 → indexify-0.3.18}/src/indexify/executor/function_executor/invocation_state_client.py +0 -0
  29. {indexify-0.3.16 → indexify-0.3.18}/src/indexify/executor/function_executor/metrics/function_executor.py +0 -0
  30. {indexify-0.3.16 → indexify-0.3.18}/src/indexify/executor/function_executor/metrics/function_executor_state.py +0 -0
  31. {indexify-0.3.16 → indexify-0.3.18}/src/indexify/executor/function_executor/metrics/function_executor_state_container.py +0 -0
  32. {indexify-0.3.16 → indexify-0.3.18}/src/indexify/executor/function_executor/metrics/health_checker.py +0 -0
  33. {indexify-0.3.16 → indexify-0.3.18}/src/indexify/executor/function_executor/metrics/invocation_state_client.py +0 -0
  34. {indexify-0.3.16 → indexify-0.3.18}/src/indexify/executor/function_executor/metrics/single_task_runner.py +0 -0
  35. {indexify-0.3.16 → indexify-0.3.18}/src/indexify/executor/function_executor/server/client_configuration.py +0 -0
  36. {indexify-0.3.16 → indexify-0.3.18}/src/indexify/executor/function_executor/server/function_executor_server.py +0 -0
  37. {indexify-0.3.16 → indexify-0.3.18}/src/indexify/executor/function_executor/server/function_executor_server_factory.py +0 -0
  38. {indexify-0.3.16 → indexify-0.3.18}/src/indexify/executor/function_executor/server/subprocess_function_executor_server.py +0 -0
  39. {indexify-0.3.16 → indexify-0.3.18}/src/indexify/executor/function_executor/server/subprocess_function_executor_server_factory.py +0 -0
  40. {indexify-0.3.16 → indexify-0.3.18}/src/indexify/executor/function_executor/single_task_runner.py +0 -0
  41. {indexify-0.3.16 → indexify-0.3.18}/src/indexify/executor/function_executor/task_input.py +0 -0
  42. {indexify-0.3.16 → indexify-0.3.18}/src/indexify/executor/function_executor/task_output.py +0 -0
  43. /indexify-0.3.16/src/indexify/executor/grpc/metrics/channel_creator.py → /indexify-0.3.18/src/indexify/executor/grpc/metrics/channel_manager.py +0 -0
  44. {indexify-0.3.16 → indexify-0.3.18}/src/indexify/executor/grpc/metrics/state_reporter.py +0 -0
  45. {indexify-0.3.16 → indexify-0.3.18}/src/indexify/executor/metrics/downloader.py +0 -0
  46. {indexify-0.3.16 → indexify-0.3.18}/src/indexify/executor/metrics/executor.py +0 -0
  47. {indexify-0.3.16 → indexify-0.3.18}/src/indexify/executor/metrics/task_fetcher.py +0 -0
  48. {indexify-0.3.16 → indexify-0.3.18}/src/indexify/executor/metrics/task_reporter.py +0 -0
  49. {indexify-0.3.16 → indexify-0.3.18}/src/indexify/executor/monitoring/function_allowlist.py +0 -0
  50. {indexify-0.3.16 → indexify-0.3.18}/src/indexify/executor/monitoring/handler.py +0 -0
  51. {indexify-0.3.16 → indexify-0.3.18}/src/indexify/executor/monitoring/health_check_handler.py +0 -0
  52. {indexify-0.3.16 → indexify-0.3.18}/src/indexify/executor/monitoring/health_checker/generic_health_checker.py +0 -0
  53. {indexify-0.3.16 → indexify-0.3.18}/src/indexify/executor/monitoring/health_checker/health_checker.py +0 -0
  54. {indexify-0.3.16 → indexify-0.3.18}/src/indexify/executor/monitoring/metrics.py +0 -0
  55. {indexify-0.3.16 → indexify-0.3.18}/src/indexify/executor/monitoring/prometheus_metrics_handler.py +0 -0
  56. {indexify-0.3.16 → indexify-0.3.18}/src/indexify/executor/monitoring/server.py +0 -0
  57. {indexify-0.3.16 → indexify-0.3.18}/src/indexify/executor/monitoring/startup_probe_handler.py +0 -0
  58. {indexify-0.3.16 → indexify-0.3.18}/src/indexify/executor/runtime_probes.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: indexify
3
- Version: 0.3.16
3
+ Version: 0.3.18
4
4
  Summary: Open Source Indexify components and helper tools
5
5
  Home-page: https://github.com/tensorlakeai/indexify
6
6
  License: Apache 2.0
@@ -1,7 +1,7 @@
1
1
  [tool.poetry]
2
2
  name = "indexify"
3
3
  # Incremented if any of the components provided in this package are updated.
4
- version = "0.3.16"
4
+ version = "0.3.18"
5
5
  description = "Open Source Indexify components and helper tools"
6
6
  authors = ["Tensorlake Inc. <support@tensorlake.ai>"]
7
7
  license = "Apache 2.0"
@@ -13,7 +13,7 @@ import sys
13
13
  from importlib.metadata import version
14
14
  from pathlib import Path
15
15
  from socket import gethostname
16
- from typing import Annotated, List, Optional, Tuple
16
+ from typing import Annotated, Dict, List, Optional, Tuple
17
17
 
18
18
  import nanoid
19
19
  import prometheus_client
@@ -26,6 +26,7 @@ from tensorlake.functions_sdk.image import Image
26
26
 
27
27
  from indexify.executor.api_objects import FunctionURI
28
28
  from indexify.executor.executor import Executor
29
+ from indexify.executor.executor_flavor import ExecutorFlavor
29
30
  from indexify.executor.function_executor.server.subprocess_function_executor_server_factory import (
30
31
  SubprocessFunctionExecutorServerFactory,
31
32
  )
@@ -119,7 +120,6 @@ def executor(
119
120
  help="Port where to run Executor Monitoring server",
120
121
  ),
121
122
  ] = 7000,
122
- # TODO: Figure out mTLS for gRPC.
123
123
  grpc_server_addr: Annotated[
124
124
  Optional[str],
125
125
  typer.Option(
@@ -140,6 +140,15 @@ def executor(
140
140
  ),
141
141
  ),
142
142
  ] = False,
143
+ labels: Annotated[
144
+ List[str],
145
+ typer.Option(
146
+ "--label",
147
+ "-l",
148
+ help="Executor key-value label to be sent to the Server. "
149
+ "Specified as <key>=<value>",
150
+ ),
151
+ ] = [],
143
152
  ):
144
153
  if dev:
145
154
  configure_development_mode_logging()
@@ -162,6 +171,11 @@ def executor(
162
171
  "--grpc-server-addr must be set when --enable-grpc-state-reconciler is set"
163
172
  )
164
173
 
174
+ kv_labels: Dict[str, str] = {}
175
+ for label in labels:
176
+ key, value = label.split("=")
177
+ kv_labels[key] = value
178
+
165
179
  executor_version = version("indexify")
166
180
  logger = structlog.get_logger(module=__name__, executor_id=executor_id)
167
181
 
@@ -171,6 +185,7 @@ def executor(
171
185
  server_addr=server_addr,
172
186
  config_path=config_path,
173
187
  executor_version=executor_version,
188
+ labels=kv_labels,
174
189
  executor_cache=executor_cache,
175
190
  ports=ports,
176
191
  functions=function_uris,
@@ -205,7 +220,9 @@ def executor(
205
220
  Executor(
206
221
  id=executor_id,
207
222
  development_mode=dev,
223
+ flavor=ExecutorFlavor.OSS,
208
224
  version=executor_version,
225
+ labels=kv_labels,
209
226
  health_checker=GenericHealthChecker(),
210
227
  code_path=executor_cache,
211
228
  function_allowlist=_parse_function_uris(function_uris),
@@ -9,17 +9,18 @@ import structlog
9
9
  from tensorlake.function_executor.proto.function_executor_pb2 import SerializedObject
10
10
  from tensorlake.utils.logging import suppress as suppress_logging
11
11
 
12
- from indexify.proto.task_scheduler_pb2 import ExecutorStatus
12
+ from indexify.proto.executor_api_pb2 import ExecutorStatus
13
13
 
14
14
  from .api_objects import FunctionURI, Task
15
15
  from .downloader import Downloader
16
+ from .executor_flavor import ExecutorFlavor
16
17
  from .function_executor.function_executor_states_container import (
17
18
  FunctionExecutorStatesContainer,
18
19
  )
19
20
  from .function_executor.server.function_executor_server_factory import (
20
21
  FunctionExecutorServerFactory,
21
22
  )
22
- from .grpc.channel_creator import ChannelCreator
23
+ from .grpc.channel_manager import ChannelManager
23
24
  from .grpc.state_reconciler import ExecutorStateReconciler
24
25
  from .grpc.state_reporter import ExecutorStateReporter
25
26
  from .metrics.executor import (
@@ -55,7 +56,9 @@ class Executor:
55
56
  self,
56
57
  id: str,
57
58
  development_mode: bool,
59
+ flavor: ExecutorFlavor,
58
60
  version: str,
61
+ labels: Dict[str, str],
59
62
  code_path: Path,
60
63
  health_checker: HealthChecker,
61
64
  function_allowlist: Optional[List[FunctionURI]],
@@ -106,18 +109,25 @@ class Executor:
106
109
  self._task_runner: Optional[TaskRunner] = None
107
110
  self._task_fetcher: Optional[TaskFetcher] = None
108
111
  # gRPC mode services
109
- self._channel_creator: Optional[ChannelCreator] = None
112
+ self._channel_manager: Optional[ChannelManager] = None
110
113
  self._state_reporter: Optional[ExecutorStateReporter] = None
111
114
  self._state_reconciler: Optional[ExecutorStateReconciler] = None
112
115
 
113
116
  if grpc_server_addr is not None:
114
- self._channel_creator = ChannelCreator(grpc_server_addr, self._logger)
117
+ self._channel_manager = ChannelManager(
118
+ server_address=grpc_server_addr,
119
+ config_path=config_path,
120
+ logger=self._logger,
121
+ )
115
122
  self._state_reporter = ExecutorStateReporter(
116
123
  executor_id=id,
124
+ flavor=flavor,
125
+ version=version,
126
+ labels=labels,
117
127
  development_mode=development_mode,
118
128
  function_allowlist=self._function_allowlist,
119
129
  function_executor_states=self._function_executor_states,
120
- channel_creator=self._channel_creator,
130
+ channel_manager=self._channel_manager,
121
131
  logger=self._logger,
122
132
  )
123
133
  self._state_reporter.update_executor_status(
@@ -133,7 +143,8 @@ class Executor:
133
143
  config_path=config_path,
134
144
  downloader=self._downloader,
135
145
  task_reporter=self._task_reporter,
136
- channel_creator=self._channel_creator,
146
+ channel_manager=self._channel_manager,
147
+ state_reporter=self._state_reporter,
137
148
  logger=self._logger,
138
149
  )
139
150
  else:
@@ -147,6 +158,7 @@ class Executor:
147
158
  self._task_fetcher = TaskFetcher(
148
159
  executor_id=id,
149
160
  executor_version=version,
161
+ labels=labels,
150
162
  function_allowlist=function_allowlist,
151
163
  protocol=protocol,
152
164
  indexify_server_addr=self._server_addr,
@@ -326,7 +338,9 @@ class Executor:
326
338
  ).inc()
327
339
 
328
340
  async def _shutdown(self, loop):
329
- self._logger.info("shutting down")
341
+ self._logger.info(
342
+ "shutting down, all Executor logs are suppressed, no task outcomes will be reported to Server from this point"
343
+ )
330
344
  if self._state_reporter is not None:
331
345
  self._state_reporter.update_executor_status(
332
346
  ExecutorStatus.EXECUTOR_STATUS_STOPPING
@@ -339,12 +353,13 @@ class Executor:
339
353
 
340
354
  self._is_shutdown = True
341
355
  await self._monitoring_server.shutdown()
356
+ await self._task_reporter.shutdown()
342
357
 
343
358
  if self._task_runner is not None:
344
359
  await self._task_runner.shutdown()
345
360
 
346
- if self._channel_creator is not None:
347
- await self._channel_creator.shutdown()
361
+ if self._channel_manager is not None:
362
+ await self._channel_manager.shutdown()
348
363
  if self._state_reporter is not None:
349
364
  await self._state_reporter.shutdown()
350
365
  if self._state_reconciler is not None:
@@ -0,0 +1,7 @@
1
+ from enum import Enum
2
+
3
+
4
+ class ExecutorFlavor(Enum):
5
+ UNKNOWN = "unknown"
6
+ OSS = "oss"
7
+ PLATFORM = "platform"
@@ -110,7 +110,7 @@ class FunctionExecutor:
110
110
  config_path=config_path,
111
111
  initialize_request=initialize_request,
112
112
  )
113
- await self._create_health_checker(stub)
113
+ await self._create_health_checker(self._channel, stub)
114
114
  self._initialized = True
115
115
  except Exception:
116
116
  await self.destroy()
@@ -243,12 +243,15 @@ class FunctionExecutor:
243
243
  finally:
244
244
  self._invocation_state_client = None
245
245
 
246
- async def _create_health_checker(self, stub: FunctionExecutorStub) -> None:
246
+ async def _create_health_checker(
247
+ self, channel: grpc.aio.Channel, stub: FunctionExecutorStub
248
+ ) -> None:
247
249
  with (
248
250
  metric_create_health_checker_errors.count_exceptions(),
249
251
  metric_create_health_checker_latency.time(),
250
252
  ):
251
253
  self._health_checker = HealthChecker(
254
+ channel=channel,
252
255
  stub=stub,
253
256
  logger=self._logger,
254
257
  )
@@ -1,8 +1,10 @@
1
1
  import asyncio
2
+ import os
2
3
  from collections.abc import Awaitable, Callable
3
4
  from typing import Any, Optional
4
5
 
5
- from grpc.aio import AioRpcError
6
+ import grpc
7
+ import grpc.aio
6
8
  from tensorlake.function_executor.proto.function_executor_pb2 import (
7
9
  HealthCheckRequest,
8
10
  HealthCheckResponse,
@@ -27,7 +29,10 @@ class HealthCheckResult:
27
29
 
28
30
 
29
31
  class HealthChecker:
30
- def __init__(self, stub: FunctionExecutorStub, logger: Any):
32
+ def __init__(
33
+ self, channel: grpc.aio.Channel, stub: FunctionExecutorStub, logger: Any
34
+ ):
35
+ self._channel: grpc.aio.Channel = channel
31
36
  self._stub: FunctionExecutorStub = stub
32
37
  self._logger: Any = logger.bind(module=__name__)
33
38
  self._health_check_loop_task: Optional[asyncio.Task] = None
@@ -39,6 +44,12 @@ class HealthChecker:
39
44
  """Runs the health check once and returns the result.
40
45
 
41
46
  Does not raise any exceptions."""
47
+ if os.getenv("INDEXIFY_DISABLE_FUNCTION_EXECUTOR_HEALTH_CHECKS", "0") == "1":
48
+ return HealthCheckResult(
49
+ is_healthy=True,
50
+ reason="Function Executor health checks are disabled using INDEXIFY_DISABLE_FUNCTION_EXECUTOR_HEALTH_CHECKS env var.",
51
+ )
52
+
42
53
  with metric_health_check_latency.time():
43
54
  try:
44
55
  response: HealthCheckResponse = await self._stub.check_health(
@@ -49,19 +60,34 @@ class HealthChecker:
49
60
  return HealthCheckResult(
50
61
  is_healthy=response.healthy, reason=response.status_message
51
62
  )
52
- except AioRpcError as e:
53
- metric_failed_health_checks.inc()
54
- # Expected exception when there are problems with communication because e.g. the server is unhealthy.
55
- return HealthCheckResult(
56
- is_healthy=False,
57
- reason=f"Executor side RPC channel error: {str(e)}",
58
- )
63
+ except grpc.aio.AioRpcError as e:
64
+ # Due to the customer code running in Function Executor we can't reliably conclude
65
+ # that the FE is unhealthy when RPC status code is not OK. E.g. customer code can
66
+ # hold Python GIL and prevent the health check RPC from being processed by FE Python code.
67
+ #
68
+ # The only unhealthy condition we can be sure about is when the channel can't re-establish
69
+ # the TCP connection within HEALTH_CHECK_TIMEOUT_SEC deadline. This is because FE Python
70
+ # code is not involved when TCP connections are established to FE. Problems reestablishing
71
+ # the TCP connection are usually due to the FE process crashing and its gRPC server socket
72
+ # not being available anymore or due to prolonged local networking failures on Executor.
73
+ if (
74
+ _channel_state(self._channel, self._logger)
75
+ == grpc.ChannelConnectivity.TRANSIENT_FAILURE
76
+ ):
77
+ return HealthCheckResult(
78
+ is_healthy=False,
79
+ reason="Channel is in TRANSIENT_FAILURE state, assuming Function Executor crashed.",
80
+ )
81
+ else:
82
+ return HealthCheckResult(
83
+ is_healthy=True,
84
+ reason=f"Health check RPC failed with status code: {e.code().name}. Assuming Function Executor is healthy.",
85
+ )
59
86
  except Exception as e:
60
- metric_failed_health_checks.inc()
61
- self._logger.warning("Got unexpected exception, ignoring", exc_info=e)
87
+ self._logger.error("Got unexpected exception, ignoring", exc_info=e)
62
88
  return HealthCheckResult(
63
- is_healthy=False,
64
- reason=f"Unexpected exception in Executor: {str(e)}",
89
+ is_healthy=True,
90
+ reason=f"Unexpected exception in Executor: {str(e)}. Assuming Function Executor is healthy.",
65
91
  )
66
92
 
67
93
  def start(self, callback: Callable[[HealthCheckResult], Awaitable[None]]) -> None:
@@ -102,3 +128,19 @@ class HealthChecker:
102
128
 
103
129
  asyncio.create_task(self._health_check_failed_callback(result))
104
130
  self._health_check_loop_task = None
131
+
132
+
133
+ def _channel_state(channel: grpc.aio.Channel, logger: Any) -> grpc.ChannelConnectivity:
134
+ """Gets channel connectivity state and suppresses all exceptions.
135
+
136
+ Suppressing the exceptions is important because the channel connectivity state is an experimental
137
+ feature. On error falls back to READY state which assumes that the channel is okay.
138
+ """
139
+ try:
140
+ return channel.get_state()
141
+ except Exception as e:
142
+ logger.error(
143
+ "Failed getting channel state, falling back to default READY state",
144
+ exc_info=e,
145
+ )
146
+ return grpc.ChannelConnectivity.READY
@@ -0,0 +1,160 @@
1
+ import asyncio
2
+ from typing import Any, Dict, Optional
3
+
4
+ import grpc.aio
5
+ import yaml
6
+
7
+ from .metrics.channel_manager import (
8
+ metric_grpc_server_channel_creation_latency,
9
+ metric_grpc_server_channel_creation_retries,
10
+ metric_grpc_server_channel_creations,
11
+ )
12
+
13
+ _RETRY_INTERVAL_SEC = 5
14
+ _CONNECT_TIMEOUT_SEC = 5
15
+
16
+
17
+ class ChannelManager:
18
+ def __init__(self, server_address: str, config_path: Optional[str], logger: Any):
19
+ self._logger: Any = logger.bind(module=__name__, server_address=server_address)
20
+ self._server_address: str = server_address
21
+ self._channel_credentials: Optional[grpc.ChannelCredentials] = None
22
+ # This lock protects the fields below.
23
+ self._lock = asyncio.Lock()
24
+ self._channel: Optional[grpc.aio.Channel] = None
25
+
26
+ self._init_tls(config_path)
27
+
28
+ def _init_tls(self, config_path: Optional[str]):
29
+ if config_path is None:
30
+ return
31
+
32
+ # The same config file format as in Tensorlake SDK HTTP client, see:
33
+ # https://github.com/tensorlakeai/tensorlake/blob/main/src/tensorlake/utils/http_client.py
34
+ with open(config_path, "r") as config_file:
35
+ config = yaml.safe_load(config_file)
36
+
37
+ if not config.get("use_tls", False):
38
+ return
39
+
40
+ tls_config: Dict[str, str] = config["tls_config"]
41
+ cert_path: Optional[str] = tls_config.get("cert_path", None)
42
+ key_path: Optional[str] = tls_config.get("key_path", None)
43
+ ca_bundle_path: Optional[str] = tls_config.get("ca_bundle_path", None)
44
+
45
+ self._logger = self._logger.bind(
46
+ cert_path=cert_path,
47
+ key_path=key_path,
48
+ ca_bundle_path=ca_bundle_path,
49
+ )
50
+ self._logger.info("TLS is enabled for grpc channels to server")
51
+
52
+ private_key: Optional[bytes] = None
53
+ certificate_chain: Optional[bytes] = None
54
+ root_certificates: Optional[bytes] = None
55
+
56
+ if cert_path is not None:
57
+ with open(cert_path, "rb") as cert_file:
58
+ certificate_chain = cert_file.read()
59
+ if key_path is not None:
60
+ with open(key_path, "rb") as key_file:
61
+ private_key = key_file.read()
62
+ if ca_bundle_path is not None:
63
+ with open(ca_bundle_path, "rb") as ca_bundle_file:
64
+ root_certificates = ca_bundle_file.read()
65
+
66
+ self._channel_credentials = grpc.ssl_channel_credentials(
67
+ root_certificates=root_certificates,
68
+ private_key=private_key,
69
+ certificate_chain=certificate_chain,
70
+ )
71
+
72
+ async def get_channel(self) -> grpc.aio.Channel:
73
+ """Returns a channel to the gRPC server.
74
+
75
+ Returns a ready to use channel. Blocks until the channel is ready,
76
+ never raises any exceptions.
77
+ If previously returned channel is healthy then returns it again.
78
+ Otherwise, returns a new channel but closes the previously returned one.
79
+ """
80
+ # Use the lock to ensure that we only create one channel without race conditions.
81
+ async with self._lock:
82
+ if self._channel is None:
83
+ self._channel = await self._create_channel()
84
+ elif not await self._locked_channel_is_healthy():
85
+ self._logger.info("grpc channel to server is unhealthy")
86
+ await self._destroy_locked_channel()
87
+ self._channel = await self._create_channel()
88
+
89
+ return self._channel
90
+
91
+ async def _create_channel(self) -> grpc.aio.Channel:
92
+ """Creates a new channel to the gRPC server.
93
+
94
+ Returns a ready to use channel. Blocks until the channel
95
+ is ready, never raises any exceptions.
96
+ """
97
+ self._logger.info("creating new grpc server channel")
98
+
99
+ with metric_grpc_server_channel_creation_latency.time():
100
+ metric_grpc_server_channel_creations.inc()
101
+ while True:
102
+ try:
103
+ if self._channel_credentials is None:
104
+ channel = grpc.aio.insecure_channel(target=self._server_address)
105
+ else:
106
+ channel = grpc.aio.secure_channel(
107
+ target=self._server_address,
108
+ credentials=self._channel_credentials,
109
+ )
110
+
111
+ await asyncio.wait_for(
112
+ channel.channel_ready(),
113
+ timeout=_CONNECT_TIMEOUT_SEC,
114
+ )
115
+ return channel
116
+ except Exception:
117
+ self._logger.error(
118
+ f"failed establishing grpc server channel in {_CONNECT_TIMEOUT_SEC} sec, retrying in {_RETRY_INTERVAL_SEC} sec"
119
+ )
120
+ try:
121
+ await channel.close()
122
+ except Exception as e:
123
+ self._logger.error(
124
+ "failed closing not established channel", exc_info=e
125
+ )
126
+
127
+ metric_grpc_server_channel_creation_retries.inc()
128
+ await asyncio.sleep(_RETRY_INTERVAL_SEC)
129
+
130
+ async def _locked_channel_is_healthy(self) -> bool:
131
+ """Checks if the channel is healthy.
132
+
133
+ Returns True if the channel is healthy, False otherwise.
134
+ self._lock must be acquired before calling this method.
135
+ Never raises any exceptions.
136
+ """
137
+ try:
138
+ return self._channel.get_state() == grpc.ChannelConnectivity.READY
139
+ except Exception as e:
140
+ # Assume that the channel is healthy because get_state() method is marked as experimental
141
+ # so we can't fully trust it.
142
+ self._logger.error(
143
+ "failed getting channel state, assuming channel is healthy", exc_info=e
144
+ )
145
+ return True
146
+
147
+ async def _destroy_locked_channel(self):
148
+ """Closes the existing channel.
149
+
150
+ self._lock must be acquired before calling this method.
151
+ Never raises any exceptions.
152
+ """
153
+ try:
154
+ await self._channel.close()
155
+ except Exception as e:
156
+ self._logger.error("failed closing channel", exc_info=e)
157
+ self._channel = None
158
+
159
+ async def shutdown(self):
160
+ pass
@@ -7,14 +7,14 @@ from tensorlake.function_executor.proto.function_executor_pb2 import (
7
7
  SerializedObject,
8
8
  )
9
9
 
10
- from indexify.proto.task_scheduler_pb2 import (
10
+ from indexify.proto.executor_api_pb2 import (
11
11
  DesiredExecutorState,
12
12
  FunctionExecutorDescription,
13
13
  FunctionExecutorStatus,
14
14
  GetDesiredExecutorStatesRequest,
15
15
  )
16
- from indexify.proto.task_scheduler_pb2_grpc import (
17
- TaskSchedulerServiceStub,
16
+ from indexify.proto.executor_api_pb2_grpc import (
17
+ ExecutorAPIStub,
18
18
  )
19
19
 
20
20
  from ..downloader import Downloader
@@ -43,7 +43,8 @@ from ..metrics.executor import (
43
43
  metric_tasks_reporting_outcome,
44
44
  )
45
45
  from ..task_reporter import TaskReporter
46
- from .channel_creator import ChannelCreator
46
+ from .channel_manager import ChannelManager
47
+ from .state_reporter import ExecutorStateReporter
47
48
 
48
49
  _RECONCILE_STREAM_BACKOFF_INTERVAL_SEC = 5
49
50
 
@@ -58,7 +59,8 @@ class ExecutorStateReconciler:
58
59
  config_path: Optional[str],
59
60
  downloader: Downloader,
60
61
  task_reporter: TaskReporter,
61
- channel_creator: ChannelCreator,
62
+ channel_manager: ChannelManager,
63
+ state_reporter: ExecutorStateReporter,
62
64
  logger: Any,
63
65
  ):
64
66
  self._executor_id: str = executor_id
@@ -72,7 +74,8 @@ class ExecutorStateReconciler:
72
74
  self._function_executor_states: FunctionExecutorStatesContainer = (
73
75
  function_executor_states
74
76
  )
75
- self._channel_creator = channel_creator
77
+ self._channel_manager: ChannelManager = channel_manager
78
+ self._state_reporter: ExecutorStateReporter = state_reporter
76
79
  self._logger: Any = logger.bind(module=__name__)
77
80
  self._is_shutdown: bool = False
78
81
  self._server_last_clock: Optional[int] = None
@@ -83,12 +86,14 @@ class ExecutorStateReconciler:
83
86
  Never raises any exceptions.
84
87
  """
85
88
  while not self._is_shutdown:
86
- async with await self._channel_creator.create() as server_channel:
89
+ async with await self._channel_manager.get_channel() as server_channel:
87
90
  server_channel: grpc.aio.Channel
88
- stub = TaskSchedulerServiceStub(server_channel)
91
+ stub = ExecutorAPIStub(server_channel)
89
92
  while not self._is_shutdown:
90
93
  try:
91
- # TODO: Report state once before starting the stream.
94
+ # Report state once before starting the stream so Server
95
+ # doesn't use old state it knew about this Executor in the past.
96
+ await self._state_reporter.report_state(stub)
92
97
  desired_states_stream: AsyncGenerator[
93
98
  DesiredExecutorState, None
94
99
  ] = stub.get_desired_executor_states(