indexify 0.3.17__py3-none-any.whl → 0.3.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
indexify/cli/cli.py CHANGED
@@ -13,7 +13,7 @@ import sys
13
13
  from importlib.metadata import version
14
14
  from pathlib import Path
15
15
  from socket import gethostname
16
- from typing import Annotated, List, Optional, Tuple
16
+ from typing import Annotated, Dict, List, Optional, Tuple
17
17
 
18
18
  import nanoid
19
19
  import prometheus_client
@@ -26,6 +26,7 @@ from tensorlake.functions_sdk.image import Image
26
26
 
27
27
  from indexify.executor.api_objects import FunctionURI
28
28
  from indexify.executor.executor import Executor
29
+ from indexify.executor.executor_flavor import ExecutorFlavor
29
30
  from indexify.executor.function_executor.server.subprocess_function_executor_server_factory import (
30
31
  SubprocessFunctionExecutorServerFactory,
31
32
  )
@@ -119,7 +120,6 @@ def executor(
119
120
  help="Port where to run Executor Monitoring server",
120
121
  ),
121
122
  ] = 7000,
122
- # TODO: Figure out mTLS for gRPC.
123
123
  grpc_server_addr: Annotated[
124
124
  Optional[str],
125
125
  typer.Option(
@@ -140,6 +140,15 @@ def executor(
140
140
  ),
141
141
  ),
142
142
  ] = False,
143
+ labels: Annotated[
144
+ List[str],
145
+ typer.Option(
146
+ "--label",
147
+ "-l",
148
+ help="Executor key-value label to be sent to the Server. "
149
+ "Specified as <key>=<value>",
150
+ ),
151
+ ] = [],
143
152
  ):
144
153
  if dev:
145
154
  configure_development_mode_logging()
@@ -162,6 +171,11 @@ def executor(
162
171
  "--grpc-server-addr must be set when --enable-grpc-state-reconciler is set"
163
172
  )
164
173
 
174
+ kv_labels: Dict[str, str] = {}
175
+ for label in labels:
176
+ key, value = label.split("=")
177
+ kv_labels[key] = value
178
+
165
179
  executor_version = version("indexify")
166
180
  logger = structlog.get_logger(module=__name__, executor_id=executor_id)
167
181
 
@@ -171,6 +185,7 @@ def executor(
171
185
  server_addr=server_addr,
172
186
  config_path=config_path,
173
187
  executor_version=executor_version,
188
+ labels=kv_labels,
174
189
  executor_cache=executor_cache,
175
190
  ports=ports,
176
191
  functions=function_uris,
@@ -205,7 +220,9 @@ def executor(
205
220
  Executor(
206
221
  id=executor_id,
207
222
  development_mode=dev,
223
+ flavor=ExecutorFlavor.OSS,
208
224
  version=executor_version,
225
+ labels=kv_labels,
209
226
  health_checker=GenericHealthChecker(),
210
227
  code_path=executor_cache,
211
228
  function_allowlist=_parse_function_uris(function_uris),
@@ -9,17 +9,18 @@ import structlog
9
9
  from tensorlake.function_executor.proto.function_executor_pb2 import SerializedObject
10
10
  from tensorlake.utils.logging import suppress as suppress_logging
11
11
 
12
- from indexify.proto.task_scheduler_pb2 import ExecutorStatus
12
+ from indexify.proto.executor_api_pb2 import ExecutorStatus
13
13
 
14
14
  from .api_objects import FunctionURI, Task
15
15
  from .downloader import Downloader
16
+ from .executor_flavor import ExecutorFlavor
16
17
  from .function_executor.function_executor_states_container import (
17
18
  FunctionExecutorStatesContainer,
18
19
  )
19
20
  from .function_executor.server.function_executor_server_factory import (
20
21
  FunctionExecutorServerFactory,
21
22
  )
22
- from .grpc.channel_creator import ChannelCreator
23
+ from .grpc.channel_manager import ChannelManager
23
24
  from .grpc.state_reconciler import ExecutorStateReconciler
24
25
  from .grpc.state_reporter import ExecutorStateReporter
25
26
  from .metrics.executor import (
@@ -55,7 +56,9 @@ class Executor:
55
56
  self,
56
57
  id: str,
57
58
  development_mode: bool,
59
+ flavor: ExecutorFlavor,
58
60
  version: str,
61
+ labels: Dict[str, str],
59
62
  code_path: Path,
60
63
  health_checker: HealthChecker,
61
64
  function_allowlist: Optional[List[FunctionURI]],
@@ -106,18 +109,25 @@ class Executor:
106
109
  self._task_runner: Optional[TaskRunner] = None
107
110
  self._task_fetcher: Optional[TaskFetcher] = None
108
111
  # gRPC mode services
109
- self._channel_creator: Optional[ChannelCreator] = None
112
+ self._channel_manager: Optional[ChannelManager] = None
110
113
  self._state_reporter: Optional[ExecutorStateReporter] = None
111
114
  self._state_reconciler: Optional[ExecutorStateReconciler] = None
112
115
 
113
116
  if grpc_server_addr is not None:
114
- self._channel_creator = ChannelCreator(grpc_server_addr, self._logger)
117
+ self._channel_manager = ChannelManager(
118
+ server_address=grpc_server_addr,
119
+ config_path=config_path,
120
+ logger=self._logger,
121
+ )
115
122
  self._state_reporter = ExecutorStateReporter(
116
123
  executor_id=id,
124
+ flavor=flavor,
125
+ version=version,
126
+ labels=labels,
117
127
  development_mode=development_mode,
118
128
  function_allowlist=self._function_allowlist,
119
129
  function_executor_states=self._function_executor_states,
120
- channel_creator=self._channel_creator,
130
+ channel_manager=self._channel_manager,
121
131
  logger=self._logger,
122
132
  )
123
133
  self._state_reporter.update_executor_status(
@@ -133,7 +143,8 @@ class Executor:
133
143
  config_path=config_path,
134
144
  downloader=self._downloader,
135
145
  task_reporter=self._task_reporter,
136
- channel_creator=self._channel_creator,
146
+ channel_manager=self._channel_manager,
147
+ state_reporter=self._state_reporter,
137
148
  logger=self._logger,
138
149
  )
139
150
  else:
@@ -147,6 +158,7 @@ class Executor:
147
158
  self._task_fetcher = TaskFetcher(
148
159
  executor_id=id,
149
160
  executor_version=version,
161
+ labels=labels,
150
162
  function_allowlist=function_allowlist,
151
163
  protocol=protocol,
152
164
  indexify_server_addr=self._server_addr,
@@ -326,7 +338,9 @@ class Executor:
326
338
  ).inc()
327
339
 
328
340
  async def _shutdown(self, loop):
329
- self._logger.info("shutting down")
341
+ self._logger.info(
342
+ "shutting down, all Executor logs are suppressed, no task outcomes will be reported to Server from this point"
343
+ )
330
344
  if self._state_reporter is not None:
331
345
  self._state_reporter.update_executor_status(
332
346
  ExecutorStatus.EXECUTOR_STATUS_STOPPING
@@ -339,12 +353,13 @@ class Executor:
339
353
 
340
354
  self._is_shutdown = True
341
355
  await self._monitoring_server.shutdown()
356
+ await self._task_reporter.shutdown()
342
357
 
343
358
  if self._task_runner is not None:
344
359
  await self._task_runner.shutdown()
345
360
 
346
- if self._channel_creator is not None:
347
- await self._channel_creator.shutdown()
361
+ if self._channel_manager is not None:
362
+ await self._channel_manager.shutdown()
348
363
  if self._state_reporter is not None:
349
364
  await self._state_reporter.shutdown()
350
365
  if self._state_reconciler is not None:
@@ -0,0 +1,7 @@
1
+ from enum import Enum
2
+
3
+
4
+ class ExecutorFlavor(Enum):
5
+ UNKNOWN = "unknown"
6
+ OSS = "oss"
7
+ PLATFORM = "platform"
@@ -70,8 +70,10 @@ class HealthChecker:
70
70
  # code is not involved when TCP connections are established to FE. Problems reestablishing
71
71
  # the TCP connection are usually due to the FE process crashing and its gRPC server socket
72
72
  # not being available anymore or due to prolonged local networking failures on Executor.
73
- channel_connectivity = self._channel.get_state()
74
- if channel_connectivity == grpc.ChannelConnectivity.TRANSIENT_FAILURE:
73
+ if (
74
+ _channel_state(self._channel, self._logger)
75
+ == grpc.ChannelConnectivity.TRANSIENT_FAILURE
76
+ ):
75
77
  return HealthCheckResult(
76
78
  is_healthy=False,
77
79
  reason="Channel is in TRANSIENT_FAILURE state, assuming Function Executor crashed.",
@@ -126,3 +128,19 @@ class HealthChecker:
126
128
 
127
129
  asyncio.create_task(self._health_check_failed_callback(result))
128
130
  self._health_check_loop_task = None
131
+
132
+
133
+ def _channel_state(channel: grpc.aio.Channel, logger: Any) -> grpc.ChannelConnectivity:
134
+ """Gets channel connectivity state and suppresses all exceptions.
135
+
136
+ Suppressing the exceptions is important because the channel connectivity state is an experimental
137
+ feature. On error falls back to READY state which assumes that the channel is okay.
138
+ """
139
+ try:
140
+ return channel.get_state()
141
+ except Exception as e:
142
+ logger.error(
143
+ "Failed getting channel state, falling back to default READY state",
144
+ exc_info=e,
145
+ )
146
+ return grpc.ChannelConnectivity.READY
@@ -0,0 +1,160 @@
1
+ import asyncio
2
+ from typing import Any, Dict, Optional
3
+
4
+ import grpc.aio
5
+ import yaml
6
+
7
+ from .metrics.channel_manager import (
8
+ metric_grpc_server_channel_creation_latency,
9
+ metric_grpc_server_channel_creation_retries,
10
+ metric_grpc_server_channel_creations,
11
+ )
12
+
13
+ _RETRY_INTERVAL_SEC = 5
14
+ _CONNECT_TIMEOUT_SEC = 5
15
+
16
+
17
+ class ChannelManager:
18
+ def __init__(self, server_address: str, config_path: Optional[str], logger: Any):
19
+ self._logger: Any = logger.bind(module=__name__, server_address=server_address)
20
+ self._server_address: str = server_address
21
+ self._channel_credentials: Optional[grpc.ChannelCredentials] = None
22
+ # This lock protects the fields below.
23
+ self._lock = asyncio.Lock()
24
+ self._channel: Optional[grpc.aio.Channel] = None
25
+
26
+ self._init_tls(config_path)
27
+
28
+ def _init_tls(self, config_path: Optional[str]):
29
+ if config_path is None:
30
+ return
31
+
32
+ # The same config file format as in Tensorlake SDK HTTP client, see:
33
+ # https://github.com/tensorlakeai/tensorlake/blob/main/src/tensorlake/utils/http_client.py
34
+ with open(config_path, "r") as config_file:
35
+ config = yaml.safe_load(config_file)
36
+
37
+ if not config.get("use_tls", False):
38
+ return
39
+
40
+ tls_config: Dict[str, str] = config["tls_config"]
41
+ cert_path: Optional[str] = tls_config.get("cert_path", None)
42
+ key_path: Optional[str] = tls_config.get("key_path", None)
43
+ ca_bundle_path: Optional[str] = tls_config.get("ca_bundle_path", None)
44
+
45
+ self._logger = self._logger.bind(
46
+ cert_path=cert_path,
47
+ key_path=key_path,
48
+ ca_bundle_path=ca_bundle_path,
49
+ )
50
+ self._logger.info("TLS is enabled for grpc channels to server")
51
+
52
+ private_key: Optional[bytes] = None
53
+ certificate_chain: Optional[bytes] = None
54
+ root_certificates: Optional[bytes] = None
55
+
56
+ if cert_path is not None:
57
+ with open(cert_path, "rb") as cert_file:
58
+ certificate_chain = cert_file.read()
59
+ if key_path is not None:
60
+ with open(key_path, "rb") as key_file:
61
+ private_key = key_file.read()
62
+ if ca_bundle_path is not None:
63
+ with open(ca_bundle_path, "rb") as ca_bundle_file:
64
+ root_certificates = ca_bundle_file.read()
65
+
66
+ self._channel_credentials = grpc.ssl_channel_credentials(
67
+ root_certificates=root_certificates,
68
+ private_key=private_key,
69
+ certificate_chain=certificate_chain,
70
+ )
71
+
72
+ async def get_channel(self) -> grpc.aio.Channel:
73
+ """Returns a channel to the gRPC server.
74
+
75
+ Returns a ready to use channel. Blocks until the channel is ready,
76
+ never raises any exceptions.
77
+ If previously returned channel is healthy then returns it again.
78
+ Otherwise, returns a new channel but closes the previously returned one.
79
+ """
80
+ # Use the lock to ensure that we only create one channel without race conditions.
81
+ async with self._lock:
82
+ if self._channel is None:
83
+ self._channel = await self._create_channel()
84
+ elif not await self._locked_channel_is_healthy():
85
+ self._logger.info("grpc channel to server is unhealthy")
86
+ await self._destroy_locked_channel()
87
+ self._channel = await self._create_channel()
88
+
89
+ return self._channel
90
+
91
+ async def _create_channel(self) -> grpc.aio.Channel:
92
+ """Creates a new channel to the gRPC server.
93
+
94
+ Returns a ready to use channel. Blocks until the channel
95
+ is ready, never raises any exceptions.
96
+ """
97
+ self._logger.info("creating new grpc server channel")
98
+
99
+ with metric_grpc_server_channel_creation_latency.time():
100
+ metric_grpc_server_channel_creations.inc()
101
+ while True:
102
+ try:
103
+ if self._channel_credentials is None:
104
+ channel = grpc.aio.insecure_channel(target=self._server_address)
105
+ else:
106
+ channel = grpc.aio.secure_channel(
107
+ target=self._server_address,
108
+ credentials=self._channel_credentials,
109
+ )
110
+
111
+ await asyncio.wait_for(
112
+ channel.channel_ready(),
113
+ timeout=_CONNECT_TIMEOUT_SEC,
114
+ )
115
+ return channel
116
+ except Exception:
117
+ self._logger.error(
118
+ f"failed establishing grpc server channel in {_CONNECT_TIMEOUT_SEC} sec, retrying in {_RETRY_INTERVAL_SEC} sec"
119
+ )
120
+ try:
121
+ await channel.close()
122
+ except Exception as e:
123
+ self._logger.error(
124
+ "failed closing not established channel", exc_info=e
125
+ )
126
+
127
+ metric_grpc_server_channel_creation_retries.inc()
128
+ await asyncio.sleep(_RETRY_INTERVAL_SEC)
129
+
130
+ async def _locked_channel_is_healthy(self) -> bool:
131
+ """Checks if the channel is healthy.
132
+
133
+ Returns True if the channel is healthy, False otherwise.
134
+ self._lock must be acquired before calling this method.
135
+ Never raises any exceptions.
136
+ """
137
+ try:
138
+ return self._channel.get_state() == grpc.ChannelConnectivity.READY
139
+ except Exception as e:
140
+ # Assume that the channel is healthy because get_state() method is marked as experimental
141
+ # so we can't fully trust it.
142
+ self._logger.error(
143
+ "failed getting channel state, assuming channel is healthy", exc_info=e
144
+ )
145
+ return True
146
+
147
+ async def _destroy_locked_channel(self):
148
+ """Closes the existing channel.
149
+
150
+ self._lock must be acquired before calling this method.
151
+ Never raises any exceptions.
152
+ """
153
+ try:
154
+ await self._channel.close()
155
+ except Exception as e:
156
+ self._logger.error("failed closing channel", exc_info=e)
157
+ self._channel = None
158
+
159
+ async def shutdown(self):
160
+ pass
@@ -7,14 +7,14 @@ from tensorlake.function_executor.proto.function_executor_pb2 import (
7
7
  SerializedObject,
8
8
  )
9
9
 
10
- from indexify.proto.task_scheduler_pb2 import (
10
+ from indexify.proto.executor_api_pb2 import (
11
11
  DesiredExecutorState,
12
12
  FunctionExecutorDescription,
13
13
  FunctionExecutorStatus,
14
14
  GetDesiredExecutorStatesRequest,
15
15
  )
16
- from indexify.proto.task_scheduler_pb2_grpc import (
17
- TaskSchedulerServiceStub,
16
+ from indexify.proto.executor_api_pb2_grpc import (
17
+ ExecutorAPIStub,
18
18
  )
19
19
 
20
20
  from ..downloader import Downloader
@@ -43,7 +43,8 @@ from ..metrics.executor import (
43
43
  metric_tasks_reporting_outcome,
44
44
  )
45
45
  from ..task_reporter import TaskReporter
46
- from .channel_creator import ChannelCreator
46
+ from .channel_manager import ChannelManager
47
+ from .state_reporter import ExecutorStateReporter
47
48
 
48
49
  _RECONCILE_STREAM_BACKOFF_INTERVAL_SEC = 5
49
50
 
@@ -58,7 +59,8 @@ class ExecutorStateReconciler:
58
59
  config_path: Optional[str],
59
60
  downloader: Downloader,
60
61
  task_reporter: TaskReporter,
61
- channel_creator: ChannelCreator,
62
+ channel_manager: ChannelManager,
63
+ state_reporter: ExecutorStateReporter,
62
64
  logger: Any,
63
65
  ):
64
66
  self._executor_id: str = executor_id
@@ -72,7 +74,8 @@ class ExecutorStateReconciler:
72
74
  self._function_executor_states: FunctionExecutorStatesContainer = (
73
75
  function_executor_states
74
76
  )
75
- self._channel_creator = channel_creator
77
+ self._channel_manager: ChannelManager = channel_manager
78
+ self._state_reporter: ExecutorStateReporter = state_reporter
76
79
  self._logger: Any = logger.bind(module=__name__)
77
80
  self._is_shutdown: bool = False
78
81
  self._server_last_clock: Optional[int] = None
@@ -83,12 +86,14 @@ class ExecutorStateReconciler:
83
86
  Never raises any exceptions.
84
87
  """
85
88
  while not self._is_shutdown:
86
- async with await self._channel_creator.create() as server_channel:
89
+ async with await self._channel_manager.get_channel() as server_channel:
87
90
  server_channel: grpc.aio.Channel
88
- stub = TaskSchedulerServiceStub(server_channel)
91
+ stub = ExecutorAPIStub(server_channel)
89
92
  while not self._is_shutdown:
90
93
  try:
91
- # TODO: Report state once before starting the stream.
94
+ # Report state once before starting the stream so Server
95
+ # doesn't use old state it knew about this Executor in the past.
96
+ await self._state_reporter.report_state(stub)
92
97
  desired_states_stream: AsyncGenerator[
93
98
  DesiredExecutorState, None
94
99
  ] = stub.get_desired_executor_states(
@@ -1,37 +1,44 @@
1
1
  import asyncio
2
+ import hashlib
3
+ from socket import gethostname
2
4
  from typing import Any, Dict, List, Optional
3
5
 
4
6
  import grpc
5
7
 
6
- from indexify.proto.task_scheduler_pb2 import (
8
+ from indexify.proto.executor_api_pb2 import (
7
9
  AllowedFunction,
10
+ )
11
+ from indexify.proto.executor_api_pb2 import ExecutorFlavor as ExecutorFlavorProto
12
+ from indexify.proto.executor_api_pb2 import (
8
13
  ExecutorState,
9
14
  ExecutorStatus,
10
15
  FunctionExecutorDescription,
11
16
  )
12
- from indexify.proto.task_scheduler_pb2 import (
17
+ from indexify.proto.executor_api_pb2 import (
13
18
  FunctionExecutorState as FunctionExecutorStateProto,
14
19
  )
15
- from indexify.proto.task_scheduler_pb2 import (
20
+ from indexify.proto.executor_api_pb2 import (
16
21
  FunctionExecutorStatus as FunctionExecutorStatusProto,
17
22
  )
18
- from indexify.proto.task_scheduler_pb2 import (
23
+ from indexify.proto.executor_api_pb2 import (
19
24
  GPUModel,
20
25
  GPUResources,
21
26
  HostResources,
22
27
  ReportExecutorStateRequest,
23
28
  )
24
- from indexify.proto.task_scheduler_pb2_grpc import (
25
- TaskSchedulerServiceStub,
29
+ from indexify.proto.executor_api_pb2_grpc import (
30
+ ExecutorAPIStub,
26
31
  )
27
32
 
28
33
  from ..api_objects import FunctionURI
34
+ from ..executor_flavor import ExecutorFlavor
29
35
  from ..function_executor.function_executor_state import FunctionExecutorState
30
36
  from ..function_executor.function_executor_states_container import (
31
37
  FunctionExecutorStatesContainer,
32
38
  )
33
39
  from ..function_executor.function_executor_status import FunctionExecutorStatus
34
- from .channel_creator import ChannelCreator
40
+ from ..runtime_probes import RuntimeProbes
41
+ from .channel_manager import ChannelManager
35
42
  from .metrics.state_reporter import (
36
43
  metric_state_report_errors,
37
44
  metric_state_report_latency,
@@ -47,24 +54,32 @@ class ExecutorStateReporter:
47
54
  def __init__(
48
55
  self,
49
56
  executor_id: str,
57
+ flavor: ExecutorFlavor,
58
+ version: str,
59
+ labels: Dict[str, str],
50
60
  development_mode: bool,
51
61
  function_allowlist: Optional[List[FunctionURI]],
52
62
  function_executor_states: FunctionExecutorStatesContainer,
53
- channel_creator: ChannelCreator,
63
+ channel_manager: ChannelManager,
54
64
  logger: Any,
55
65
  ):
56
66
  self._executor_id: str = executor_id
67
+ self._flavor: ExecutorFlavor = flavor
68
+ self._version: str = version
69
+ self._labels: Dict[str, str] = labels.copy()
57
70
  self._development_mode: bool = development_mode
71
+ self._hostname: str = gethostname()
58
72
  self._function_executor_states: FunctionExecutorStatesContainer = (
59
73
  function_executor_states
60
74
  )
61
- self._channel_creator = channel_creator
75
+ self._channel_manager = channel_manager
62
76
  self._logger: Any = logger.bind(module=__name__)
63
77
  self._is_shutdown: bool = False
64
78
  self._executor_status: ExecutorStatus = ExecutorStatus.EXECUTOR_STATUS_UNKNOWN
65
79
  self._allowed_functions: List[AllowedFunction] = _to_grpc_allowed_functions(
66
80
  function_allowlist
67
81
  )
82
+ self._labels.update(_label_values_to_strings(RuntimeProbes().probe().labels))
68
83
 
69
84
  def update_executor_status(self, value: ExecutorStatus):
70
85
  self._executor_status = value
@@ -75,12 +90,16 @@ class ExecutorStateReporter:
75
90
  Never raises any exceptions.
76
91
  """
77
92
  while not self._is_shutdown:
78
- async with await self._channel_creator.create() as server_channel:
93
+ async with await self._channel_manager.get_channel() as server_channel:
79
94
  server_channel: grpc.aio.Channel
80
- stub = TaskSchedulerServiceStub(server_channel)
95
+ stub = ExecutorAPIStub(server_channel)
81
96
  while not self._is_shutdown:
82
97
  try:
83
- await self._report_state(stub)
98
+ # The periodic state reports serve as channel health monitoring requests
99
+ # (same as TCP keep-alive). Channel Manager returns the same healthy channel
100
+ # for all RPCs that we do from Executor to Server. So all the RPCs benefit
101
+ # from this channel health monitoring.
102
+ await self.report_state(stub)
84
103
  await asyncio.sleep(_REPORTING_INTERVAL_SEC)
85
104
  except Exception as e:
86
105
  self._logger.error(
@@ -92,7 +111,11 @@ class ExecutorStateReporter:
92
111
 
93
112
  self._logger.info("State reporter shutdown")
94
113
 
95
- async def _report_state(self, stub: TaskSchedulerServiceStub):
114
+ async def report_state(self, stub: ExecutorAPIStub):
115
+ """Reports the current state to the server represented by the supplied stub.
116
+
117
+ Raises exceptions on failure.
118
+ """
96
119
  with (
97
120
  metric_state_report_errors.count_exceptions(),
98
121
  metric_state_report_latency.time(),
@@ -101,11 +124,16 @@ class ExecutorStateReporter:
101
124
  state = ExecutorState(
102
125
  executor_id=self._executor_id,
103
126
  development_mode=self._development_mode,
104
- executor_status=self._executor_status,
127
+ hostname=self._hostname,
128
+ flavor=_to_grpc_executor_flavor(self._flavor, self._logger),
129
+ version=self._version,
130
+ status=self._executor_status,
105
131
  free_resources=await self._fetch_free_host_resources(),
106
132
  allowed_functions=self._allowed_functions,
107
133
  function_executor_states=await self._fetch_function_executor_states(),
134
+ labels=self._labels,
108
135
  )
136
+ state.state_hash = _state_hash(state)
109
137
 
110
138
  await stub.report_executor_state(
111
139
  ReportExecutorStateRequest(executor_state=state),
@@ -197,3 +225,33 @@ def _to_grpc_function_executor_status(
197
225
  logger.error("Unexpected Function Executor status", status=status)
198
226
 
199
227
  return result
228
+
229
+
230
+ _FLAVOR_MAPPING = {
231
+ ExecutorFlavor.OSS: ExecutorFlavorProto.EXECUTOR_FLAVOR_OSS,
232
+ ExecutorFlavor.PLATFORM: ExecutorFlavorProto.EXECUTOR_FLAVOR_PLATFORM,
233
+ }
234
+
235
+
236
+ def _to_grpc_executor_flavor(
237
+ flavor: ExecutorFlavor, logger: Any
238
+ ) -> ExecutorFlavorProto:
239
+ result: ExecutorFlavorProto = _FLAVOR_MAPPING.get(
240
+ flavor, ExecutorFlavorProto.EXECUTOR_FLAVOR_UNKNOWN
241
+ )
242
+
243
+ if result == ExecutorFlavorProto.EXECUTOR_FLAVOR_UNKNOWN:
244
+ logger.error("Unexpected Executor flavor", flavor=flavor)
245
+
246
+ return result
247
+
248
+
249
+ def _label_values_to_strings(labels: Dict[str, Any]) -> Dict[str, str]:
250
+ return {k: str(v) for k, v in labels.items()}
251
+
252
+
253
+ def _state_hash(state: ExecutorState) -> str:
254
+ serialized_state: bytes = state.SerializeToString(deterministic=True)
255
+ hasher = hashlib.sha256(usedforsecurity=False)
256
+ hasher.update(serialized_state)
257
+ return hasher.hexdigest()
@@ -1,6 +1,7 @@
1
1
  import json
2
2
  import time
3
- from typing import AsyncGenerator, List, Optional
3
+ from socket import gethostname
4
+ from typing import AsyncGenerator, Dict, List, Optional
4
5
 
5
6
  import structlog
6
7
  from httpx_sse import aconnect_sse
@@ -22,6 +23,7 @@ class TaskFetcher:
22
23
  self,
23
24
  executor_id: str,
24
25
  executor_version: str,
26
+ labels: Dict[str, str],
25
27
  function_allowlist: Optional[List[FunctionURI]],
26
28
  protocol: str,
27
29
  indexify_server_addr: str,
@@ -33,12 +35,15 @@ class TaskFetcher:
33
35
  self._logger = structlog.get_logger(module=__name__)
34
36
 
35
37
  probe_info: ProbeInfo = RuntimeProbes().probe()
38
+ all_labels = probe_info.labels.copy()
39
+ all_labels.update(labels)
40
+
36
41
  self._executor_metadata: ExecutorMetadata = ExecutorMetadata(
37
42
  id=executor_id,
38
43
  executor_version=executor_version,
39
- addr="",
44
+ addr=gethostname(),
40
45
  function_allowlist=function_allowlist,
41
- labels=probe_info.labels,
46
+ labels=all_labels,
42
47
  )
43
48
 
44
49
  async def run(self) -> AsyncGenerator[Task, None]:
@@ -49,6 +49,7 @@ class TaskReporter:
49
49
  ):
50
50
  self._base_url = base_url
51
51
  self._executor_id = executor_id
52
+ self._is_shutdown = False
52
53
  # Use thread-safe sync client due to issues with async client.
53
54
  # Async client attempts to use connections it already closed.
54
55
  # See e.g. https://github.com/encode/httpx/issues/2337.
@@ -56,9 +57,25 @@ class TaskReporter:
56
57
  # results in not reusing established TCP connections to server.
57
58
  self._client = get_httpx_client(config_path, make_async=False)
58
59
 
60
+ async def shutdown(self):
61
+ """Shuts down the task reporter.
62
+
63
+ Task reporter stops reporting all task outcomes to the Server.
64
+ There are many task failures due to Executor shutdown. We give wrong
65
+ signals to Server if we report such failures.
66
+ """
67
+ self._is_shutdown = True
68
+
59
69
  async def report(self, output: TaskOutput, logger: Any):
60
70
  """Reports result of the supplied task."""
61
71
  logger = logger.bind(module=__name__)
72
+
73
+ if self._is_shutdown:
74
+ logger.warning(
75
+ "task reporter got shutdown, skipping task outcome reporting"
76
+ )
77
+ return
78
+
62
79
  task_result, output_files, output_summary = self._process_task_output(output)
63
80
  task_result_data = task_result.model_dump_json(exclude_none=True)
64
81
 
@@ -1,6 +1,8 @@
1
1
  syntax = "proto3";
2
2
 
3
- package task_scheduler_service;
3
+ // Rename with caution. The package name is part of gRPC service name.
4
+ // Existing clients won't find the service if the package name changes.
5
+ package executor_api_pb;
4
6
 
5
7
  // ===== ReportExecutorState RPC =====
6
8
 
@@ -81,15 +83,26 @@ enum ExecutorStatus {
81
83
  EXECUTOR_STATUS_STOPPED = 5;
82
84
  }
83
85
 
86
+ enum ExecutorFlavor {
87
+ EXECUTOR_FLAVOR_UNKNOWN = 0;
88
+ EXECUTOR_FLAVOR_OSS = 1;
89
+ EXECUTOR_FLAVOR_PLATFORM = 2;
90
+ }
91
+
84
92
  message ExecutorState {
85
93
  optional string executor_id = 1;
86
94
  optional bool development_mode = 2;
87
- optional ExecutorStatus executor_status = 3;
95
+ optional string hostname = 3;
96
+ optional ExecutorFlavor flavor = 4;
97
+ optional string version = 5;
98
+ optional ExecutorStatus status = 6;
88
99
  // Free resources available at the Executor.
89
- optional HostResources free_resources = 4;
100
+ optional HostResources free_resources = 7;
90
101
  // Empty allowed_functions list means that any function can run on the Executor.
91
- repeated AllowedFunction allowed_functions = 5;
92
- repeated FunctionExecutorState function_executor_states = 6;
102
+ repeated AllowedFunction allowed_functions = 8;
103
+ repeated FunctionExecutorState function_executor_states = 9;
104
+ map<string, string> labels = 10;
105
+ optional string state_hash = 11;
93
106
  }
94
107
 
95
108
  // A message sent by Executor to report its up to date state to Server.
@@ -136,7 +149,11 @@ message DesiredExecutorState {
136
149
 
137
150
  // Internal API for scheduling and running tasks on Executors. Executors are acting as clients of this API.
138
151
  // Server is responsible for scheduling tasks on Executors and Executors are responsible for running the tasks.
139
- service TaskSchedulerService {
152
+ //
153
+ // Rename with caution. Existing clients won't find the service if the service name changes. An HTTP2 ingress proxy
154
+ // might use the service name in its HTTP2 path-based routing rules. See how gRPC uses service names in its HTTP2 paths
155
+ // at https://github.com/grpc/grpc/blob/master/doc/PROTOCOL-HTTP2.md.
156
+ service ExecutorAPI {
140
157
  // Called by Executor every 5 seconds to report that it's still alive and provide its current state.
141
158
  //
142
159
  // Missing 3 reports will result in the Executor being deregistered by Server.
@@ -0,0 +1,70 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Generated by the protocol buffer compiler. DO NOT EDIT!
3
+ # NO CHECKED-IN PROTOBUF GENCODE
4
+ # source: indexify/proto/executor_api.proto
5
+ # Protobuf Python Version: 5.29.0
6
+ """Generated protocol buffer code."""
7
+ from google.protobuf import descriptor as _descriptor
8
+ from google.protobuf import descriptor_pool as _descriptor_pool
9
+ from google.protobuf import runtime_version as _runtime_version
10
+ from google.protobuf import symbol_database as _symbol_database
11
+ from google.protobuf.internal import builder as _builder
12
+
13
+ _runtime_version.ValidateProtobufRuntimeVersion(
14
+ _runtime_version.Domain.PUBLIC, 5, 29, 0, "", "indexify/proto/executor_api.proto"
15
+ )
16
+ # @@protoc_insertion_point(imports)
17
+
18
+ _sym_db = _symbol_database.Default()
19
+
20
+
21
+ DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(
22
+ b'\n!indexify/proto/executor_api.proto\x12\x0f\x65xecutor_api_pb"e\n\x0cGPUResources\x12\x12\n\x05\x63ount\x18\x01 \x01(\rH\x00\x88\x01\x01\x12-\n\x05model\x18\x02 \x01(\x0e\x32\x19.executor_api_pb.GPUModelH\x01\x88\x01\x01\x42\x08\n\x06_countB\x08\n\x06_model"\xc2\x01\n\rHostResources\x12\x16\n\tcpu_count\x18\x01 \x01(\rH\x00\x88\x01\x01\x12\x19\n\x0cmemory_bytes\x18\x02 \x01(\x04H\x01\x88\x01\x01\x12\x17\n\ndisk_bytes\x18\x03 \x01(\x04H\x02\x88\x01\x01\x12/\n\x03gpu\x18\x04 \x01(\x0b\x32\x1d.executor_api_pb.GPUResourcesH\x03\x88\x01\x01\x42\x0c\n\n_cpu_countB\x0f\n\r_memory_bytesB\r\n\x0b_disk_bytesB\x06\n\x04_gpu"\xbb\x01\n\x0f\x41llowedFunction\x12\x16\n\tnamespace\x18\x01 \x01(\tH\x00\x88\x01\x01\x12\x17\n\ngraph_name\x18\x02 \x01(\tH\x01\x88\x01\x01\x12\x1a\n\rfunction_name\x18\x03 \x01(\tH\x02\x88\x01\x01\x12\x1a\n\rgraph_version\x18\x04 \x01(\tH\x03\x88\x01\x01\x42\x0c\n\n_namespaceB\r\n\x0b_graph_nameB\x10\n\x0e_function_nameB\x10\n\x0e_graph_version"\xed\x02\n\x1b\x46unctionExecutorDescription\x12\x0f\n\x02id\x18\x01 \x01(\tH\x00\x88\x01\x01\x12\x16\n\tnamespace\x18\x02 \x01(\tH\x01\x88\x01\x01\x12\x17\n\ngraph_name\x18\x03 \x01(\tH\x02\x88\x01\x01\x12\x1a\n\rgraph_version\x18\x04 \x01(\tH\x03\x88\x01\x01\x12\x1a\n\rfunction_name\x18\x05 \x01(\tH\x04\x88\x01\x01\x12\x16\n\timage_uri\x18\x06 \x01(\tH\x05\x88\x01\x01\x12\x14\n\x0csecret_names\x18\x07 \x03(\t\x12<\n\x0fresource_limits\x18\x08 \x01(\x0b\x32\x1e.executor_api_pb.HostResourcesH\x06\x88\x01\x01\x42\x05\n\x03_idB\x0c\n\n_namespaceB\r\n\x0b_graph_nameB\x10\n\x0e_graph_versionB\x10\n\x0e_function_nameB\x0c\n\n_image_uriB\x12\n\x10_resource_limits"\xb8\x01\n\x15\x46unctionExecutorState\x12\x46\n\x0b\x64\x65scription\x18\x01 \x01(\x0b\x32,.executor_api_pb.FunctionExecutorDescriptionH\x00\x88\x01\x01\x12<\n\x06status\x18\x02 \x01(\x0e\x32\'.executor_api_pb.FunctionExecutorStatusH\x01\x88\x01\x01\x42\x0e\n\x0c_descriptionB\t\n\x07_status"\x9f\x05\n\rExecutorState\x12\x18\n\x0b\x65xecutor_id\x18\x01 
\x01(\tH\x00\x88\x01\x01\x12\x1d\n\x10\x64\x65velopment_mode\x18\x02 \x01(\x08H\x01\x88\x01\x01\x12\x15\n\x08hostname\x18\x03 \x01(\tH\x02\x88\x01\x01\x12\x34\n\x06\x66lavor\x18\x04 \x01(\x0e\x32\x1f.executor_api_pb.ExecutorFlavorH\x03\x88\x01\x01\x12\x14\n\x07version\x18\x05 \x01(\tH\x04\x88\x01\x01\x12\x34\n\x06status\x18\x06 \x01(\x0e\x32\x1f.executor_api_pb.ExecutorStatusH\x05\x88\x01\x01\x12;\n\x0e\x66ree_resources\x18\x07 \x01(\x0b\x32\x1e.executor_api_pb.HostResourcesH\x06\x88\x01\x01\x12;\n\x11\x61llowed_functions\x18\x08 \x03(\x0b\x32 .executor_api_pb.AllowedFunction\x12H\n\x18\x66unction_executor_states\x18\t \x03(\x0b\x32&.executor_api_pb.FunctionExecutorState\x12:\n\x06labels\x18\n \x03(\x0b\x32*.executor_api_pb.ExecutorState.LabelsEntry\x12\x17\n\nstate_hash\x18\x0b \x01(\tH\x07\x88\x01\x01\x1a-\n\x0bLabelsEntry\x12\x0b\n\x03key\x18\x01 \x01(\t\x12\r\n\x05value\x18\x02 \x01(\t:\x02\x38\x01\x42\x0e\n\x0c_executor_idB\x13\n\x11_development_modeB\x0b\n\t_hostnameB\t\n\x07_flavorB\n\n\x08_versionB\t\n\x07_statusB\x11\n\x0f_free_resourcesB\r\n\x0b_state_hash"l\n\x1aReportExecutorStateRequest\x12;\n\x0e\x65xecutor_state\x18\x01 \x01(\x0b\x32\x1e.executor_api_pb.ExecutorStateH\x00\x88\x01\x01\x42\x11\n\x0f_executor_state"\x1d\n\x1bReportExecutorStateResponse"\x88\x03\n\x04Task\x12\x0f\n\x02id\x18\x01 \x01(\tH\x00\x88\x01\x01\x12\x16\n\tnamespace\x18\x02 \x01(\tH\x01\x88\x01\x01\x12\x17\n\ngraph_name\x18\x03 \x01(\tH\x02\x88\x01\x01\x12\x1a\n\rgraph_version\x18\x04 \x01(\tH\x03\x88\x01\x01\x12\x1a\n\rfunction_name\x18\x05 \x01(\tH\x04\x88\x01\x01\x12 \n\x13graph_invocation_id\x18\x06 \x01(\tH\x05\x88\x01\x01\x12\x16\n\tinput_key\x18\x08 \x01(\tH\x06\x88\x01\x01\x12\x1f\n\x12reducer_output_key\x18\t \x01(\tH\x07\x88\x01\x01\x12\x17\n\ntimeout_ms\x18\n 
\x01(\tH\x08\x88\x01\x01\x42\x05\n\x03_idB\x0c\n\n_namespaceB\r\n\x0b_graph_nameB\x10\n\x0e_graph_versionB\x10\n\x0e_function_nameB\x16\n\x14_graph_invocation_idB\x0c\n\n_input_keyB\x15\n\x13_reducer_output_keyB\r\n\x0b_timeout_ms"\x7f\n\x0eTaskAllocation\x12!\n\x14\x66unction_executor_id\x18\x01 \x01(\tH\x00\x88\x01\x01\x12(\n\x04task\x18\x02 \x01(\x0b\x32\x15.executor_api_pb.TaskH\x01\x88\x01\x01\x42\x17\n\x15_function_executor_idB\x07\n\x05_task"K\n\x1fGetDesiredExecutorStatesRequest\x12\x18\n\x0b\x65xecutor_id\x18\x01 \x01(\tH\x00\x88\x01\x01\x42\x0e\n\x0c_executor_id"\xb9\x01\n\x14\x44\x65siredExecutorState\x12H\n\x12\x66unction_executors\x18\x01 \x03(\x0b\x32,.executor_api_pb.FunctionExecutorDescription\x12\x39\n\x10task_allocations\x18\x02 \x03(\x0b\x32\x1f.executor_api_pb.TaskAllocation\x12\x12\n\x05\x63lock\x18\x03 \x01(\x04H\x00\x88\x01\x01\x42\x08\n\x06_clock*\x86\x03\n\x08GPUModel\x12\x15\n\x11GPU_MODEL_UNKNOWN\x10\x00\x12"\n\x1eGPU_MODEL_NVIDIA_TESLA_T4_16GB\x10\n\x12$\n GPU_MODEL_NVIDIA_TESLA_V100_16GB\x10\x14\x12\x1d\n\x19GPU_MODEL_NVIDIA_A10_24GB\x10\x1e\x12\x1f\n\x1bGPU_MODEL_NVIDIA_A6000_48GB\x10(\x12#\n\x1fGPU_MODEL_NVIDIA_A100_SXM4_40GB\x10\x32\x12#\n\x1fGPU_MODEL_NVIDIA_A100_SXM4_80GB\x10\x33\x12"\n\x1eGPU_MODEL_NVIDIA_A100_PCI_40GB\x10\x34\x12#\n\x1fGPU_MODEL_NVIDIA_H100_SXM5_80GB\x10<\x12"\n\x1eGPU_MODEL_NVIDIA_H100_PCI_80GB\x10=\x12"\n\x1eGPU_MODEL_NVIDIA_RTX_6000_24GB\x10>*\xa3\x03\n\x16\x46unctionExecutorStatus\x12$\n FUNCTION_EXECUTOR_STATUS_UNKNOWN\x10\x00\x12(\n$FUNCTION_EXECUTOR_STATUS_STARTING_UP\x10\x01\x12:\n6FUNCTION_EXECUTOR_STATUS_STARTUP_FAILED_CUSTOMER_ERROR\x10\x02\x12:\n6FUNCTION_EXECUTOR_STATUS_STARTUP_FAILED_PLATFORM_ERROR\x10\x03\x12!\n\x1d\x46UNCTION_EXECUTOR_STATUS_IDLE\x10\x04\x12)\n%FUNCTION_EXECUTOR_STATUS_RUNNING_TASK\x10\x05\x12&\n"FUNCTION_EXECUTOR_STATUS_UNHEALTHY\x10\x06\x12%\n!FUNCTION_EXECUTOR_STATUS_STOPPING\x10\x07\x12$\n 
FUNCTION_EXECUTOR_STATUS_STOPPED\x10\x08*\xc3\x01\n\x0e\x45xecutorStatus\x12\x1b\n\x17\x45XECUTOR_STATUS_UNKNOWN\x10\x00\x12\x1f\n\x1b\x45XECUTOR_STATUS_STARTING_UP\x10\x01\x12\x1b\n\x17\x45XECUTOR_STATUS_RUNNING\x10\x02\x12\x1b\n\x17\x45XECUTOR_STATUS_DRAINED\x10\x03\x12\x1c\n\x18\x45XECUTOR_STATUS_STOPPING\x10\x04\x12\x1b\n\x17\x45XECUTOR_STATUS_STOPPED\x10\x05*d\n\x0e\x45xecutorFlavor\x12\x1b\n\x17\x45XECUTOR_FLAVOR_UNKNOWN\x10\x00\x12\x17\n\x13\x45XECUTOR_FLAVOR_OSS\x10\x01\x12\x1c\n\x18\x45XECUTOR_FLAVOR_PLATFORM\x10\x02\x32\xff\x01\n\x0b\x45xecutorAPI\x12t\n\x15report_executor_state\x12+.executor_api_pb.ReportExecutorStateRequest\x1a,.executor_api_pb.ReportExecutorStateResponse"\x00\x12z\n\x1bget_desired_executor_states\x12\x30.executor_api_pb.GetDesiredExecutorStatesRequest\x1a%.executor_api_pb.DesiredExecutorState"\x00\x30\x01\x62\x06proto3'
23
+ )
24
+
25
+ _globals = globals()
26
+ _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
27
+ _builder.BuildTopDescriptorsAndMessages(
28
+ DESCRIPTOR, "indexify.proto.executor_api_pb2", _globals
29
+ )
30
+ if not _descriptor._USE_C_DESCRIPTORS:
31
+ DESCRIPTOR._loaded_options = None
32
+ _globals["_EXECUTORSTATE_LABELSENTRY"]._loaded_options = None
33
+ _globals["_EXECUTORSTATE_LABELSENTRY"]._serialized_options = b"8\001"
34
+ _globals["_GPUMODEL"]._serialized_start = 2704
35
+ _globals["_GPUMODEL"]._serialized_end = 3094
36
+ _globals["_FUNCTIONEXECUTORSTATUS"]._serialized_start = 3097
37
+ _globals["_FUNCTIONEXECUTORSTATUS"]._serialized_end = 3516
38
+ _globals["_EXECUTORSTATUS"]._serialized_start = 3519
39
+ _globals["_EXECUTORSTATUS"]._serialized_end = 3714
40
+ _globals["_EXECUTORFLAVOR"]._serialized_start = 3716
41
+ _globals["_EXECUTORFLAVOR"]._serialized_end = 3816
42
+ _globals["_GPURESOURCES"]._serialized_start = 54
43
+ _globals["_GPURESOURCES"]._serialized_end = 155
44
+ _globals["_HOSTRESOURCES"]._serialized_start = 158
45
+ _globals["_HOSTRESOURCES"]._serialized_end = 352
46
+ _globals["_ALLOWEDFUNCTION"]._serialized_start = 355
47
+ _globals["_ALLOWEDFUNCTION"]._serialized_end = 542
48
+ _globals["_FUNCTIONEXECUTORDESCRIPTION"]._serialized_start = 545
49
+ _globals["_FUNCTIONEXECUTORDESCRIPTION"]._serialized_end = 910
50
+ _globals["_FUNCTIONEXECUTORSTATE"]._serialized_start = 913
51
+ _globals["_FUNCTIONEXECUTORSTATE"]._serialized_end = 1097
52
+ _globals["_EXECUTORSTATE"]._serialized_start = 1100
53
+ _globals["_EXECUTORSTATE"]._serialized_end = 1771
54
+ _globals["_EXECUTORSTATE_LABELSENTRY"]._serialized_start = 1608
55
+ _globals["_EXECUTORSTATE_LABELSENTRY"]._serialized_end = 1653
56
+ _globals["_REPORTEXECUTORSTATEREQUEST"]._serialized_start = 1773
57
+ _globals["_REPORTEXECUTORSTATEREQUEST"]._serialized_end = 1881
58
+ _globals["_REPORTEXECUTORSTATERESPONSE"]._serialized_start = 1883
59
+ _globals["_REPORTEXECUTORSTATERESPONSE"]._serialized_end = 1912
60
+ _globals["_TASK"]._serialized_start = 1915
61
+ _globals["_TASK"]._serialized_end = 2307
62
+ _globals["_TASKALLOCATION"]._serialized_start = 2309
63
+ _globals["_TASKALLOCATION"]._serialized_end = 2436
64
+ _globals["_GETDESIREDEXECUTORSTATESREQUEST"]._serialized_start = 2438
65
+ _globals["_GETDESIREDEXECUTORSTATESREQUEST"]._serialized_end = 2513
66
+ _globals["_DESIREDEXECUTORSTATE"]._serialized_start = 2516
67
+ _globals["_DESIREDEXECUTORSTATE"]._serialized_end = 2701
68
+ _globals["_EXECUTORAPI"]._serialized_start = 3819
69
+ _globals["_EXECUTORAPI"]._serialized_end = 4074
70
+ # @@protoc_insertion_point(module_scope)
@@ -50,6 +50,12 @@ class ExecutorStatus(int, metaclass=_enum_type_wrapper.EnumTypeWrapper):
50
50
  EXECUTOR_STATUS_STOPPING: _ClassVar[ExecutorStatus]
51
51
  EXECUTOR_STATUS_STOPPED: _ClassVar[ExecutorStatus]
52
52
 
53
+ class ExecutorFlavor(int, metaclass=_enum_type_wrapper.EnumTypeWrapper):
54
+ __slots__ = ()
55
+ EXECUTOR_FLAVOR_UNKNOWN: _ClassVar[ExecutorFlavor]
56
+ EXECUTOR_FLAVOR_OSS: _ClassVar[ExecutorFlavor]
57
+ EXECUTOR_FLAVOR_PLATFORM: _ClassVar[ExecutorFlavor]
58
+
53
59
  GPU_MODEL_UNKNOWN: GPUModel
54
60
  GPU_MODEL_NVIDIA_TESLA_T4_16GB: GPUModel
55
61
  GPU_MODEL_NVIDIA_TESLA_V100_16GB: GPUModel
@@ -76,6 +82,9 @@ EXECUTOR_STATUS_RUNNING: ExecutorStatus
76
82
  EXECUTOR_STATUS_DRAINED: ExecutorStatus
77
83
  EXECUTOR_STATUS_STOPPING: ExecutorStatus
78
84
  EXECUTOR_STATUS_STOPPED: ExecutorStatus
85
+ EXECUTOR_FLAVOR_UNKNOWN: ExecutorFlavor
86
+ EXECUTOR_FLAVOR_OSS: ExecutorFlavor
87
+ EXECUTOR_FLAVOR_PLATFORM: ExecutorFlavor
79
88
 
80
89
  class GPUResources(_message.Message):
81
90
  __slots__ = ("count", "model")
@@ -178,30 +187,59 @@ class ExecutorState(_message.Message):
178
187
  __slots__ = (
179
188
  "executor_id",
180
189
  "development_mode",
181
- "executor_status",
190
+ "hostname",
191
+ "flavor",
192
+ "version",
193
+ "status",
182
194
  "free_resources",
183
195
  "allowed_functions",
184
196
  "function_executor_states",
197
+ "labels",
198
+ "state_hash",
185
199
  )
200
+
201
+ class LabelsEntry(_message.Message):
202
+ __slots__ = ("key", "value")
203
+ KEY_FIELD_NUMBER: _ClassVar[int]
204
+ VALUE_FIELD_NUMBER: _ClassVar[int]
205
+ key: str
206
+ value: str
207
+ def __init__(
208
+ self, key: _Optional[str] = ..., value: _Optional[str] = ...
209
+ ) -> None: ...
210
+
186
211
  EXECUTOR_ID_FIELD_NUMBER: _ClassVar[int]
187
212
  DEVELOPMENT_MODE_FIELD_NUMBER: _ClassVar[int]
188
- EXECUTOR_STATUS_FIELD_NUMBER: _ClassVar[int]
213
+ HOSTNAME_FIELD_NUMBER: _ClassVar[int]
214
+ FLAVOR_FIELD_NUMBER: _ClassVar[int]
215
+ VERSION_FIELD_NUMBER: _ClassVar[int]
216
+ STATUS_FIELD_NUMBER: _ClassVar[int]
189
217
  FREE_RESOURCES_FIELD_NUMBER: _ClassVar[int]
190
218
  ALLOWED_FUNCTIONS_FIELD_NUMBER: _ClassVar[int]
191
219
  FUNCTION_EXECUTOR_STATES_FIELD_NUMBER: _ClassVar[int]
220
+ LABELS_FIELD_NUMBER: _ClassVar[int]
221
+ STATE_HASH_FIELD_NUMBER: _ClassVar[int]
192
222
  executor_id: str
193
223
  development_mode: bool
194
- executor_status: ExecutorStatus
224
+ hostname: str
225
+ flavor: ExecutorFlavor
226
+ version: str
227
+ status: ExecutorStatus
195
228
  free_resources: HostResources
196
229
  allowed_functions: _containers.RepeatedCompositeFieldContainer[AllowedFunction]
197
230
  function_executor_states: _containers.RepeatedCompositeFieldContainer[
198
231
  FunctionExecutorState
199
232
  ]
233
+ labels: _containers.ScalarMap[str, str]
234
+ state_hash: str
200
235
  def __init__(
201
236
  self,
202
237
  executor_id: _Optional[str] = ...,
203
238
  development_mode: bool = ...,
204
- executor_status: _Optional[_Union[ExecutorStatus, str]] = ...,
239
+ hostname: _Optional[str] = ...,
240
+ flavor: _Optional[_Union[ExecutorFlavor, str]] = ...,
241
+ version: _Optional[str] = ...,
242
+ status: _Optional[_Union[ExecutorStatus, str]] = ...,
205
243
  free_resources: _Optional[_Union[HostResources, _Mapping]] = ...,
206
244
  allowed_functions: _Optional[
207
245
  _Iterable[_Union[AllowedFunction, _Mapping]]
@@ -209,6 +247,8 @@ class ExecutorState(_message.Message):
209
247
  function_executor_states: _Optional[
210
248
  _Iterable[_Union[FunctionExecutorState, _Mapping]]
211
249
  ] = ...,
250
+ labels: _Optional[_Mapping[str, str]] = ...,
251
+ state_hash: _Optional[str] = ...,
212
252
  ) -> None: ...
213
253
 
214
254
  class ReportExecutorStateRequest(_message.Message):
@@ -4,9 +4,7 @@ import warnings
4
4
 
5
5
  import grpc
6
6
 
7
- from indexify.proto import (
8
- task_scheduler_pb2 as indexify_dot_proto_dot_task__scheduler__pb2,
9
- )
7
+ from indexify.proto import executor_api_pb2 as indexify_dot_proto_dot_executor__api__pb2
10
8
 
11
9
  GRPC_GENERATED_VERSION = "1.70.0"
12
10
  GRPC_VERSION = grpc.__version__
@@ -24,16 +22,20 @@ except ImportError:
24
22
  if _version_not_supported:
25
23
  raise RuntimeError(
26
24
  f"The grpc package installed is at version {GRPC_VERSION},"
27
- + f" but the generated code in indexify/proto/task_scheduler_pb2_grpc.py depends on"
25
+ + f" but the generated code in indexify/proto/executor_api_pb2_grpc.py depends on"
28
26
  + f" grpcio>={GRPC_GENERATED_VERSION}."
29
27
  + f" Please upgrade your grpc module to grpcio>={GRPC_GENERATED_VERSION}"
30
28
  + f" or downgrade your generated code using grpcio-tools<={GRPC_VERSION}."
31
29
  )
32
30
 
33
31
 
34
- class TaskSchedulerServiceStub(object):
32
+ class ExecutorAPIStub(object):
35
33
  """Internal API for scheduling and running tasks on Executors. Executors are acting as clients of this API.
36
34
  Server is responsible for scheduling tasks on Executors and Executors are responsible for running the tasks.
35
+
36
+ Rename with caution. Existing clients won't find the service if the service name changes. A HTTP2 ingress proxy
37
+ might use the service name in it HTTP2 path based routing rules. See how gRPC uses service names in its HTTP2 paths
38
+ at https://github.com/grpc/grpc/blob/master/doc/PROTOCOL-HTTP2.md.
37
39
  """
38
40
 
39
41
  def __init__(self, channel):
@@ -43,22 +45,26 @@ class TaskSchedulerServiceStub(object):
43
45
  channel: A grpc.Channel.
44
46
  """
45
47
  self.report_executor_state = channel.unary_unary(
46
- "/task_scheduler_service.TaskSchedulerService/report_executor_state",
47
- request_serializer=indexify_dot_proto_dot_task__scheduler__pb2.ReportExecutorStateRequest.SerializeToString,
48
- response_deserializer=indexify_dot_proto_dot_task__scheduler__pb2.ReportExecutorStateResponse.FromString,
48
+ "/executor_api_pb.ExecutorAPI/report_executor_state",
49
+ request_serializer=indexify_dot_proto_dot_executor__api__pb2.ReportExecutorStateRequest.SerializeToString,
50
+ response_deserializer=indexify_dot_proto_dot_executor__api__pb2.ReportExecutorStateResponse.FromString,
49
51
  _registered_method=True,
50
52
  )
51
53
  self.get_desired_executor_states = channel.unary_stream(
52
- "/task_scheduler_service.TaskSchedulerService/get_desired_executor_states",
53
- request_serializer=indexify_dot_proto_dot_task__scheduler__pb2.GetDesiredExecutorStatesRequest.SerializeToString,
54
- response_deserializer=indexify_dot_proto_dot_task__scheduler__pb2.DesiredExecutorState.FromString,
54
+ "/executor_api_pb.ExecutorAPI/get_desired_executor_states",
55
+ request_serializer=indexify_dot_proto_dot_executor__api__pb2.GetDesiredExecutorStatesRequest.SerializeToString,
56
+ response_deserializer=indexify_dot_proto_dot_executor__api__pb2.DesiredExecutorState.FromString,
55
57
  _registered_method=True,
56
58
  )
57
59
 
58
60
 
59
- class TaskSchedulerServiceServicer(object):
61
+ class ExecutorAPIServicer(object):
60
62
  """Internal API for scheduling and running tasks on Executors. Executors are acting as clients of this API.
61
63
  Server is responsible for scheduling tasks on Executors and Executors are responsible for running the tasks.
64
+
65
+ Rename with caution. Existing clients won't find the service if the service name changes. A HTTP2 ingress proxy
66
+ might use the service name in it HTTP2 path based routing rules. See how gRPC uses service names in its HTTP2 paths
67
+ at https://github.com/grpc/grpc/blob/master/doc/PROTOCOL-HTTP2.md.
62
68
  """
63
69
 
64
70
  def report_executor_state(self, request, context):
@@ -81,32 +87,36 @@ class TaskSchedulerServiceServicer(object):
81
87
  raise NotImplementedError("Method not implemented!")
82
88
 
83
89
 
84
- def add_TaskSchedulerServiceServicer_to_server(servicer, server):
90
+ def add_ExecutorAPIServicer_to_server(servicer, server):
85
91
  rpc_method_handlers = {
86
92
  "report_executor_state": grpc.unary_unary_rpc_method_handler(
87
93
  servicer.report_executor_state,
88
- request_deserializer=indexify_dot_proto_dot_task__scheduler__pb2.ReportExecutorStateRequest.FromString,
89
- response_serializer=indexify_dot_proto_dot_task__scheduler__pb2.ReportExecutorStateResponse.SerializeToString,
94
+ request_deserializer=indexify_dot_proto_dot_executor__api__pb2.ReportExecutorStateRequest.FromString,
95
+ response_serializer=indexify_dot_proto_dot_executor__api__pb2.ReportExecutorStateResponse.SerializeToString,
90
96
  ),
91
97
  "get_desired_executor_states": grpc.unary_stream_rpc_method_handler(
92
98
  servicer.get_desired_executor_states,
93
- request_deserializer=indexify_dot_proto_dot_task__scheduler__pb2.GetDesiredExecutorStatesRequest.FromString,
94
- response_serializer=indexify_dot_proto_dot_task__scheduler__pb2.DesiredExecutorState.SerializeToString,
99
+ request_deserializer=indexify_dot_proto_dot_executor__api__pb2.GetDesiredExecutorStatesRequest.FromString,
100
+ response_serializer=indexify_dot_proto_dot_executor__api__pb2.DesiredExecutorState.SerializeToString,
95
101
  ),
96
102
  }
97
103
  generic_handler = grpc.method_handlers_generic_handler(
98
- "task_scheduler_service.TaskSchedulerService", rpc_method_handlers
104
+ "executor_api_pb.ExecutorAPI", rpc_method_handlers
99
105
  )
100
106
  server.add_generic_rpc_handlers((generic_handler,))
101
107
  server.add_registered_method_handlers(
102
- "task_scheduler_service.TaskSchedulerService", rpc_method_handlers
108
+ "executor_api_pb.ExecutorAPI", rpc_method_handlers
103
109
  )
104
110
 
105
111
 
106
112
  # This class is part of an EXPERIMENTAL API.
107
- class TaskSchedulerService(object):
113
+ class ExecutorAPI(object):
108
114
  """Internal API for scheduling and running tasks on Executors. Executors are acting as clients of this API.
109
115
  Server is responsible for scheduling tasks on Executors and Executors are responsible for running the tasks.
116
+
117
+ Rename with caution. Existing clients won't find the service if the service name changes. A HTTP2 ingress proxy
118
+ might use the service name in it HTTP2 path based routing rules. See how gRPC uses service names in its HTTP2 paths
119
+ at https://github.com/grpc/grpc/blob/master/doc/PROTOCOL-HTTP2.md.
110
120
  """
111
121
 
112
122
  @staticmethod
@@ -125,9 +135,9 @@ class TaskSchedulerService(object):
125
135
  return grpc.experimental.unary_unary(
126
136
  request,
127
137
  target,
128
- "/task_scheduler_service.TaskSchedulerService/report_executor_state",
129
- indexify_dot_proto_dot_task__scheduler__pb2.ReportExecutorStateRequest.SerializeToString,
130
- indexify_dot_proto_dot_task__scheduler__pb2.ReportExecutorStateResponse.FromString,
138
+ "/executor_api_pb.ExecutorAPI/report_executor_state",
139
+ indexify_dot_proto_dot_executor__api__pb2.ReportExecutorStateRequest.SerializeToString,
140
+ indexify_dot_proto_dot_executor__api__pb2.ReportExecutorStateResponse.FromString,
131
141
  options,
132
142
  channel_credentials,
133
143
  insecure,
@@ -155,9 +165,9 @@ class TaskSchedulerService(object):
155
165
  return grpc.experimental.unary_stream(
156
166
  request,
157
167
  target,
158
- "/task_scheduler_service.TaskSchedulerService/get_desired_executor_states",
159
- indexify_dot_proto_dot_task__scheduler__pb2.GetDesiredExecutorStatesRequest.SerializeToString,
160
- indexify_dot_proto_dot_task__scheduler__pb2.DesiredExecutorState.FromString,
168
+ "/executor_api_pb.ExecutorAPI/get_desired_executor_states",
169
+ indexify_dot_proto_dot_executor__api__pb2.GetDesiredExecutorStatesRequest.SerializeToString,
170
+ indexify_dot_proto_dot_executor__api__pb2.DesiredExecutorState.FromString,
161
171
  options,
162
172
  channel_credentials,
163
173
  insecure,
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: indexify
3
- Version: 0.3.17
3
+ Version: 0.3.18
4
4
  Summary: Open Source Indexify components and helper tools
5
5
  Home-page: https://github.com/tensorlakeai/indexify
6
6
  License: Apache 2.0
@@ -1,13 +1,14 @@
1
- indexify/cli/cli.py,sha256=-ngLINNhZ2Req18_dQ1rHEnusY2feQ-9gYxOP8jcorA,8768
1
+ indexify/cli/cli.py,sha256=YGIpXDtWnA3uj9VYhH8PeFGiRBbGBWLb8SyyzIVRKFg,9255
2
2
  indexify/executor/README.md,sha256=ozC6_hMkhQQNVCMEpBxwiUALz6lwErPQxNxQfQDqnG4,2029
3
3
  indexify/executor/api_objects.py,sha256=oUlH-GQPuPmwgcBzMpI2HehXeElBTCULECk-oHiBHwU,1263
4
4
  indexify/executor/downloader.py,sha256=LkvAXfKxddnDzgfmwHcpDB_n795-eVKzn-hLjq4nUEM,9412
5
- indexify/executor/executor.py,sha256=FTp05YxuKlMUbI99OV7NkL3KuFD12evKcqxzb-fXWBk,14641
5
+ indexify/executor/executor.py,sha256=0wsWDchr4ocLBk2JVVFEA9be-8Qz09kbxPLDUnrJuV0,15198
6
+ indexify/executor/executor_flavor.py,sha256=uilzDQVVYlQGR1MVnrUC4NevUActDWHdnJkr38M6kTk,118
6
7
  indexify/executor/function_executor/function_executor.py,sha256=s1mc7g6b8ilc98Fp7RFElEBSLJl0UGNQY0iZzCpuR2A,11334
7
8
  indexify/executor/function_executor/function_executor_state.py,sha256=b2taGClg0BUnlD_rYGkpom6syXBMUp7UWWrjLrUCwyo,3966
8
9
  indexify/executor/function_executor/function_executor_states_container.py,sha256=RclJDJqIr8ywKipPBC6_idnPAqYi0dPa1d4QUAaXqbw,3460
9
10
  indexify/executor/function_executor/function_executor_status.py,sha256=U4p1fcdVWlHr7uPY7e7ZSb2_WelUmPeH-WgboQQ9mw4,3336
10
- indexify/executor/function_executor/health_checker.py,sha256=CPUWvvtJtJCwbEsfr_BOhSKkRki4qOoSk1oeyBosWz0,5464
11
+ indexify/executor/function_executor/health_checker.py,sha256=Fvd1gmrcjyJqP-8vcsUxfnTHQIMNlHeMWCS70PAVr9E,6095
11
12
  indexify/executor/function_executor/invocation_state_client.py,sha256=p-xgM4__cHR1ApvMV9hShrGWee_Je0VDhICZUGjpQY4,9644
12
13
  indexify/executor/function_executor/metrics/function_executor.py,sha256=TDksxLRJr-P9ZKhF2Orsaxzzb4lVIBxFEjd_9Zv53Ng,6313
13
14
  indexify/executor/function_executor/metrics/function_executor_state.py,sha256=qheMhnoiYLiZB7ky5EyegfDy4Mr0Zh83bOE0gJ38YmU,1607
@@ -23,11 +24,11 @@ indexify/executor/function_executor/server/subprocess_function_executor_server_f
23
24
  indexify/executor/function_executor/single_task_runner.py,sha256=iWnJsB2BGqdgAkrlJHbOvSIhVXc88X0AYbB2_o-bB-E,13547
24
25
  indexify/executor/function_executor/task_input.py,sha256=wSrHR4m0juiGClQyeVdhRC37QzDt6Rrjq-ZXJkfBi9k,584
25
26
  indexify/executor/function_executor/task_output.py,sha256=SQJSlrknB7Ylf5IOeINfBEgiplS5hAPJh1hYulhyvfU,1962
26
- indexify/executor/grpc/channel_creator.py,sha256=Z_DU212-wkaU_m-I14OBbWKVeHo3aG5vPmF_ebJaZGc,1849
27
- indexify/executor/grpc/metrics/channel_creator.py,sha256=k-WArgklmP5WhjcmFmrgRblB7yc3XlaOXO8owRyV-mw,649
27
+ indexify/executor/grpc/channel_manager.py,sha256=THamn5VghCxRkXDlu2WEXtC6-SNKGc0xoa718bw9A4k,6257
28
+ indexify/executor/grpc/metrics/channel_manager.py,sha256=k-WArgklmP5WhjcmFmrgRblB7yc3XlaOXO8owRyV-mw,649
28
29
  indexify/executor/grpc/metrics/state_reporter.py,sha256=GggBEjMzQUYIG95LtTS4fUg1u9jYowkaXoUXppAXucs,543
29
- indexify/executor/grpc/state_reconciler.py,sha256=g7Qi6t79vuldh4y0Ue2mfnU2Jj8J-rsDtQzNZZlbLfE,12973
30
- indexify/executor/grpc/state_reporter.py,sha256=kk1kqvuUfgNZQriWj2FWjkPyloLcWPq7cCuNCK7mb5I,7669
30
+ indexify/executor/grpc/state_reconciler.py,sha256=RvlY2k6QwxryjOYxhf1AMb1T8BRadEYzsU03mS0nQFY,13300
31
+ indexify/executor/grpc/state_reporter.py,sha256=tpbg4A3nMyvwEsrYd-whET821a2ZuS8OLyu89Y3DvBw,9876
31
32
  indexify/executor/metrics/downloader.py,sha256=lctPh8xjkXeLEFJnl1hNrD1yEhLhIl5sggsR4Yoe_Zc,2746
32
33
  indexify/executor/metrics/executor.py,sha256=ua-Vv_k1CB4juJdF7tEBQbBMksqWAA3iXKKMKXZUCLk,2369
33
34
  indexify/executor/metrics/task_fetcher.py,sha256=iJEwCLzYr2cuz7hRvNiqaa2nvQP4OrA0hm0iJY0YKG0,736
@@ -43,14 +44,14 @@ indexify/executor/monitoring/prometheus_metrics_handler.py,sha256=KiGqSf7rkXTfbD
43
44
  indexify/executor/monitoring/server.py,sha256=yzdYhcxnmY6uTQUMt3vatF5jilN52ZtfFseOmHyQpTo,1254
44
45
  indexify/executor/monitoring/startup_probe_handler.py,sha256=zXXsBU15SMlBx1bSFpxWDfed1VHtKKnwvLQ8-frpG98,425
45
46
  indexify/executor/runtime_probes.py,sha256=bo6Dq6AGZpJH099j0DHtVSDEH80tv3j9MXf3VXSx_p8,2182
46
- indexify/executor/task_fetcher.py,sha256=NpFfHgaY99bSL-K2D5kcDAMNUG2FArq0-qF_mgF-LBQ,3375
47
- indexify/executor/task_reporter.py,sha256=mYgwozUO95PEwYMmeeIS0-HfMrO4z3Nhy6IduMsMahM,7367
47
+ indexify/executor/task_fetcher.py,sha256=p3iEsWyGi0ZMPAv0183smzOUD1KycQ_dXsyd9mpB9IU,3529
48
+ indexify/executor/task_reporter.py,sha256=0D6ToLhDvd9U0ZPRaDMsZJYBsdzZUqcdkpIxHDUrvdk,7892
48
49
  indexify/executor/task_runner.py,sha256=1zYH03yS_FaFk9xXBl-ioM74-L2xdW3vHJt522mseds,7073
49
- indexify/proto/task_scheduler.proto,sha256=kxMIJCj1pXG-fHeJGHXlthZTsB1dy_yvshQLt0UJRTM,5672
50
- indexify/proto/task_scheduler_pb2.py,sha256=X97JBJZ2n6ToDtUlDjPFV66_vZ05-vO8wPATrpzAonA,9085
51
- indexify/proto/task_scheduler_pb2.pyi,sha256=aXrB7-eNwgchy2OVlvEfPXtr9EyYoU-sgbdSRVNEI8s,11357
52
- indexify/proto/task_scheduler_pb2_grpc.py,sha256=STtk9XrBzLbmWdLwpL55Obyf9ehUesfxxysxER32SEE,6854
53
- indexify-0.3.17.dist-info/METADATA,sha256=hREMWJfSrd4Vcclp2w8fcUnjtvkiXHw6jMCTWECKAtw,1158
54
- indexify-0.3.17.dist-info/WHEEL,sha256=RaoafKOydTQ7I_I3JTrPCg6kUmTgtm4BornzOqyEfJ8,88
55
- indexify-0.3.17.dist-info/entry_points.txt,sha256=GU9wmsgvN7nQw3N2X0PMYn1RSvF6CrhH9RuC2D8d3Gk,53
56
- indexify-0.3.17.dist-info/RECORD,,
50
+ indexify/proto/executor_api.proto,sha256=-dSnBE35OUoknDDV0HvSOVk11-pPqIjbQ5X22uJ_rSs,6399
51
+ indexify/proto/executor_api_pb2.py,sha256=WffUqYV39xoDmdaHYpckR3XX-pdbOzMoaYO5ghcA1Lg,9949
52
+ indexify/proto/executor_api_pb2.pyi,sha256=lXP79CootL4pHghuVIv1wgR0Y0YPl0wIVUiHKY7PM2s,12677
53
+ indexify/proto/executor_api_pb2_grpc.py,sha256=i8LEPG6esub6C-xxJ7S3vEJSgWCOxSqElNjMW3Imqg8,7607
54
+ indexify-0.3.18.dist-info/METADATA,sha256=T7_EDOfiMyAn0dpZ-m96vgiDVT2oGJDn0N7UYzDYNSA,1158
55
+ indexify-0.3.18.dist-info/WHEEL,sha256=RaoafKOydTQ7I_I3JTrPCg6kUmTgtm4BornzOqyEfJ8,88
56
+ indexify-0.3.18.dist-info/entry_points.txt,sha256=GU9wmsgvN7nQw3N2X0PMYn1RSvF6CrhH9RuC2D8d3Gk,53
57
+ indexify-0.3.18.dist-info/RECORD,,
@@ -1,53 +0,0 @@
1
- import asyncio
2
- from typing import Any
3
-
4
- import grpc.aio
5
-
6
- from .metrics.channel_creator import (
7
- metric_grpc_server_channel_creation_latency,
8
- metric_grpc_server_channel_creation_retries,
9
- metric_grpc_server_channel_creations,
10
- )
11
-
12
- _RETRY_INTERVAL_SEC = 5
13
- _CONNECT_TIMEOUT_SEC = 5
14
-
15
-
16
- class ChannelCreator:
17
- def __init__(self, server_address: str, logger: Any):
18
- self._logger = logger.bind(module=__name__)
19
- self._server_address = server_address
20
- self._is_shutdown = False
21
-
22
- async def create(self) -> grpc.aio.Channel:
23
- """Creates a channel to the gRPC server.
24
-
25
- Blocks until the channel is ready.
26
- Never raises any exceptions.
27
- """
28
- with metric_grpc_server_channel_creation_latency.time():
29
- metric_grpc_server_channel_creations.inc()
30
- while not self._is_shutdown:
31
- try:
32
- channel = grpc.aio.insecure_channel(self._server_address)
33
- await asyncio.wait_for(
34
- channel.channel_ready(),
35
- timeout=_CONNECT_TIMEOUT_SEC,
36
- )
37
- return channel
38
- except Exception:
39
- self._logger.error(
40
- f"failed establishing grpc server channel in {_CONNECT_TIMEOUT_SEC} sec, retrying in {_RETRY_INTERVAL_SEC} sec"
41
- )
42
- try:
43
- await channel.close()
44
- except Exception as e:
45
- self._logger.error(
46
- "failed closing not established channel", exc_info=e
47
- )
48
-
49
- metric_grpc_server_channel_creation_retries.inc()
50
- await asyncio.sleep(_RETRY_INTERVAL_SEC)
51
-
52
- async def shutdown(self):
53
- self._is_shutdown = True
@@ -1,64 +0,0 @@
1
- # -*- coding: utf-8 -*-
2
- # Generated by the protocol buffer compiler. DO NOT EDIT!
3
- # NO CHECKED-IN PROTOBUF GENCODE
4
- # source: indexify/proto/task_scheduler.proto
5
- # Protobuf Python Version: 5.29.0
6
- """Generated protocol buffer code."""
7
- from google.protobuf import descriptor as _descriptor
8
- from google.protobuf import descriptor_pool as _descriptor_pool
9
- from google.protobuf import runtime_version as _runtime_version
10
- from google.protobuf import symbol_database as _symbol_database
11
- from google.protobuf.internal import builder as _builder
12
-
13
- _runtime_version.ValidateProtobufRuntimeVersion(
14
- _runtime_version.Domain.PUBLIC, 5, 29, 0, "", "indexify/proto/task_scheduler.proto"
15
- )
16
- # @@protoc_insertion_point(imports)
17
-
18
- _sym_db = _symbol_database.Default()
19
-
20
-
21
- DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(
22
- b'\n#indexify/proto/task_scheduler.proto\x12\x16task_scheduler_service"l\n\x0cGPUResources\x12\x12\n\x05\x63ount\x18\x01 \x01(\rH\x00\x88\x01\x01\x12\x34\n\x05model\x18\x02 \x01(\x0e\x32 .task_scheduler_service.GPUModelH\x01\x88\x01\x01\x42\x08\n\x06_countB\x08\n\x06_model"\xc9\x01\n\rHostResources\x12\x16\n\tcpu_count\x18\x01 \x01(\rH\x00\x88\x01\x01\x12\x19\n\x0cmemory_bytes\x18\x02 \x01(\x04H\x01\x88\x01\x01\x12\x17\n\ndisk_bytes\x18\x03 \x01(\x04H\x02\x88\x01\x01\x12\x36\n\x03gpu\x18\x04 \x01(\x0b\x32$.task_scheduler_service.GPUResourcesH\x03\x88\x01\x01\x42\x0c\n\n_cpu_countB\x0f\n\r_memory_bytesB\r\n\x0b_disk_bytesB\x06\n\x04_gpu"\xbb\x01\n\x0f\x41llowedFunction\x12\x16\n\tnamespace\x18\x01 \x01(\tH\x00\x88\x01\x01\x12\x17\n\ngraph_name\x18\x02 \x01(\tH\x01\x88\x01\x01\x12\x1a\n\rfunction_name\x18\x03 \x01(\tH\x02\x88\x01\x01\x12\x1a\n\rgraph_version\x18\x04 \x01(\tH\x03\x88\x01\x01\x42\x0c\n\n_namespaceB\r\n\x0b_graph_nameB\x10\n\x0e_function_nameB\x10\n\x0e_graph_version"\xf4\x02\n\x1b\x46unctionExecutorDescription\x12\x0f\n\x02id\x18\x01 \x01(\tH\x00\x88\x01\x01\x12\x16\n\tnamespace\x18\x02 \x01(\tH\x01\x88\x01\x01\x12\x17\n\ngraph_name\x18\x03 \x01(\tH\x02\x88\x01\x01\x12\x1a\n\rgraph_version\x18\x04 \x01(\tH\x03\x88\x01\x01\x12\x1a\n\rfunction_name\x18\x05 \x01(\tH\x04\x88\x01\x01\x12\x16\n\timage_uri\x18\x06 \x01(\tH\x05\x88\x01\x01\x12\x14\n\x0csecret_names\x18\x07 \x03(\t\x12\x43\n\x0fresource_limits\x18\x08 \x01(\x0b\x32%.task_scheduler_service.HostResourcesH\x06\x88\x01\x01\x42\x05\n\x03_idB\x0c\n\n_namespaceB\r\n\x0b_graph_nameB\x10\n\x0e_graph_versionB\x10\n\x0e_function_nameB\x0c\n\n_image_uriB\x12\n\x10_resource_limits"\xc6\x01\n\x15\x46unctionExecutorState\x12M\n\x0b\x64\x65scription\x18\x01 \x01(\x0b\x32\x33.task_scheduler_service.FunctionExecutorDescriptionH\x00\x88\x01\x01\x12\x43\n\x06status\x18\x02 
\x01(\x0e\x32..task_scheduler_service.FunctionExecutorStatusH\x01\x88\x01\x01\x42\x0e\n\x0c_descriptionB\t\n\x07_status"\xb3\x03\n\rExecutorState\x12\x18\n\x0b\x65xecutor_id\x18\x01 \x01(\tH\x00\x88\x01\x01\x12\x1d\n\x10\x64\x65velopment_mode\x18\x02 \x01(\x08H\x01\x88\x01\x01\x12\x44\n\x0f\x65xecutor_status\x18\x03 \x01(\x0e\x32&.task_scheduler_service.ExecutorStatusH\x02\x88\x01\x01\x12\x42\n\x0e\x66ree_resources\x18\x04 \x01(\x0b\x32%.task_scheduler_service.HostResourcesH\x03\x88\x01\x01\x12\x42\n\x11\x61llowed_functions\x18\x05 \x03(\x0b\x32\'.task_scheduler_service.AllowedFunction\x12O\n\x18\x66unction_executor_states\x18\x06 \x03(\x0b\x32-.task_scheduler_service.FunctionExecutorStateB\x0e\n\x0c_executor_idB\x13\n\x11_development_modeB\x12\n\x10_executor_statusB\x11\n\x0f_free_resources"s\n\x1aReportExecutorStateRequest\x12\x42\n\x0e\x65xecutor_state\x18\x01 \x01(\x0b\x32%.task_scheduler_service.ExecutorStateH\x00\x88\x01\x01\x42\x11\n\x0f_executor_state"\x1d\n\x1bReportExecutorStateResponse"\x88\x03\n\x04Task\x12\x0f\n\x02id\x18\x01 \x01(\tH\x00\x88\x01\x01\x12\x16\n\tnamespace\x18\x02 \x01(\tH\x01\x88\x01\x01\x12\x17\n\ngraph_name\x18\x03 \x01(\tH\x02\x88\x01\x01\x12\x1a\n\rgraph_version\x18\x04 \x01(\tH\x03\x88\x01\x01\x12\x1a\n\rfunction_name\x18\x05 \x01(\tH\x04\x88\x01\x01\x12 \n\x13graph_invocation_id\x18\x06 \x01(\tH\x05\x88\x01\x01\x12\x16\n\tinput_key\x18\x08 \x01(\tH\x06\x88\x01\x01\x12\x1f\n\x12reducer_output_key\x18\t \x01(\tH\x07\x88\x01\x01\x12\x17\n\ntimeout_ms\x18\n \x01(\tH\x08\x88\x01\x01\x42\x05\n\x03_idB\x0c\n\n_namespaceB\r\n\x0b_graph_nameB\x10\n\x0e_graph_versionB\x10\n\x0e_function_nameB\x16\n\x14_graph_invocation_idB\x0c\n\n_input_keyB\x15\n\x13_reducer_output_keyB\r\n\x0b_timeout_ms"\x86\x01\n\x0eTaskAllocation\x12!\n\x14\x66unction_executor_id\x18\x01 \x01(\tH\x00\x88\x01\x01\x12/\n\x04task\x18\x02 
\x01(\x0b\x32\x1c.task_scheduler_service.TaskH\x01\x88\x01\x01\x42\x17\n\x15_function_executor_idB\x07\n\x05_task"K\n\x1fGetDesiredExecutorStatesRequest\x12\x18\n\x0b\x65xecutor_id\x18\x01 \x01(\tH\x00\x88\x01\x01\x42\x0e\n\x0c_executor_id"\xc7\x01\n\x14\x44\x65siredExecutorState\x12O\n\x12\x66unction_executors\x18\x01 \x03(\x0b\x32\x33.task_scheduler_service.FunctionExecutorDescription\x12@\n\x10task_allocations\x18\x02 \x03(\x0b\x32&.task_scheduler_service.TaskAllocation\x12\x12\n\x05\x63lock\x18\x03 \x01(\x04H\x00\x88\x01\x01\x42\x08\n\x06_clock*\x86\x03\n\x08GPUModel\x12\x15\n\x11GPU_MODEL_UNKNOWN\x10\x00\x12"\n\x1eGPU_MODEL_NVIDIA_TESLA_T4_16GB\x10\n\x12$\n GPU_MODEL_NVIDIA_TESLA_V100_16GB\x10\x14\x12\x1d\n\x19GPU_MODEL_NVIDIA_A10_24GB\x10\x1e\x12\x1f\n\x1bGPU_MODEL_NVIDIA_A6000_48GB\x10(\x12#\n\x1fGPU_MODEL_NVIDIA_A100_SXM4_40GB\x10\x32\x12#\n\x1fGPU_MODEL_NVIDIA_A100_SXM4_80GB\x10\x33\x12"\n\x1eGPU_MODEL_NVIDIA_A100_PCI_40GB\x10\x34\x12#\n\x1fGPU_MODEL_NVIDIA_H100_SXM5_80GB\x10<\x12"\n\x1eGPU_MODEL_NVIDIA_H100_PCI_80GB\x10=\x12"\n\x1eGPU_MODEL_NVIDIA_RTX_6000_24GB\x10>*\xa3\x03\n\x16\x46unctionExecutorStatus\x12$\n FUNCTION_EXECUTOR_STATUS_UNKNOWN\x10\x00\x12(\n$FUNCTION_EXECUTOR_STATUS_STARTING_UP\x10\x01\x12:\n6FUNCTION_EXECUTOR_STATUS_STARTUP_FAILED_CUSTOMER_ERROR\x10\x02\x12:\n6FUNCTION_EXECUTOR_STATUS_STARTUP_FAILED_PLATFORM_ERROR\x10\x03\x12!\n\x1d\x46UNCTION_EXECUTOR_STATUS_IDLE\x10\x04\x12)\n%FUNCTION_EXECUTOR_STATUS_RUNNING_TASK\x10\x05\x12&\n"FUNCTION_EXECUTOR_STATUS_UNHEALTHY\x10\x06\x12%\n!FUNCTION_EXECUTOR_STATUS_STOPPING\x10\x07\x12$\n 
FUNCTION_EXECUTOR_STATUS_STOPPED\x10\x08*\xc3\x01\n\x0e\x45xecutorStatus\x12\x1b\n\x17\x45XECUTOR_STATUS_UNKNOWN\x10\x00\x12\x1f\n\x1b\x45XECUTOR_STATUS_STARTING_UP\x10\x01\x12\x1b\n\x17\x45XECUTOR_STATUS_RUNNING\x10\x02\x12\x1b\n\x17\x45XECUTOR_STATUS_DRAINED\x10\x03\x12\x1c\n\x18\x45XECUTOR_STATUS_STOPPING\x10\x04\x12\x1b\n\x17\x45XECUTOR_STATUS_STOPPED\x10\x05\x32\xa6\x02\n\x14TaskSchedulerService\x12\x82\x01\n\x15report_executor_state\x12\x32.task_scheduler_service.ReportExecutorStateRequest\x1a\x33.task_scheduler_service.ReportExecutorStateResponse"\x00\x12\x88\x01\n\x1bget_desired_executor_states\x12\x37.task_scheduler_service.GetDesiredExecutorStatesRequest\x1a,.task_scheduler_service.DesiredExecutorState"\x00\x30\x01\x62\x06proto3'
23
- )
24
-
25
- _globals = globals()
26
- _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, _globals)
27
- _builder.BuildTopDescriptorsAndMessages(
28
- DESCRIPTOR, "indexify.proto.task_scheduler_pb2", _globals
29
- )
30
- if not _descriptor._USE_C_DESCRIPTORS:
31
- DESCRIPTOR._loaded_options = None
32
- _globals["_GPUMODEL"]._serialized_start = 2541
33
- _globals["_GPUMODEL"]._serialized_end = 2931
34
- _globals["_FUNCTIONEXECUTORSTATUS"]._serialized_start = 2934
35
- _globals["_FUNCTIONEXECUTORSTATUS"]._serialized_end = 3353
36
- _globals["_EXECUTORSTATUS"]._serialized_start = 3356
37
- _globals["_EXECUTORSTATUS"]._serialized_end = 3551
38
- _globals["_GPURESOURCES"]._serialized_start = 63
39
- _globals["_GPURESOURCES"]._serialized_end = 171
40
- _globals["_HOSTRESOURCES"]._serialized_start = 174
41
- _globals["_HOSTRESOURCES"]._serialized_end = 375
42
- _globals["_ALLOWEDFUNCTION"]._serialized_start = 378
43
- _globals["_ALLOWEDFUNCTION"]._serialized_end = 565
44
- _globals["_FUNCTIONEXECUTORDESCRIPTION"]._serialized_start = 568
45
- _globals["_FUNCTIONEXECUTORDESCRIPTION"]._serialized_end = 940
46
- _globals["_FUNCTIONEXECUTORSTATE"]._serialized_start = 943
47
- _globals["_FUNCTIONEXECUTORSTATE"]._serialized_end = 1141
48
- _globals["_EXECUTORSTATE"]._serialized_start = 1144
49
- _globals["_EXECUTORSTATE"]._serialized_end = 1579
50
- _globals["_REPORTEXECUTORSTATEREQUEST"]._serialized_start = 1581
51
- _globals["_REPORTEXECUTORSTATEREQUEST"]._serialized_end = 1696
52
- _globals["_REPORTEXECUTORSTATERESPONSE"]._serialized_start = 1698
53
- _globals["_REPORTEXECUTORSTATERESPONSE"]._serialized_end = 1727
54
- _globals["_TASK"]._serialized_start = 1730
55
- _globals["_TASK"]._serialized_end = 2122
56
- _globals["_TASKALLOCATION"]._serialized_start = 2125
57
- _globals["_TASKALLOCATION"]._serialized_end = 2259
58
- _globals["_GETDESIREDEXECUTORSTATESREQUEST"]._serialized_start = 2261
59
- _globals["_GETDESIREDEXECUTORSTATESREQUEST"]._serialized_end = 2336
60
- _globals["_DESIREDEXECUTORSTATE"]._serialized_start = 2339
61
- _globals["_DESIREDEXECUTORSTATE"]._serialized_end = 2538
62
- _globals["_TASKSCHEDULERSERVICE"]._serialized_start = 3554
63
- _globals["_TASKSCHEDULERSERVICE"]._serialized_end = 3848
64
- # @@protoc_insertion_point(module_scope)