indexify 0.3.17__py3-none-any.whl → 0.3.19__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. indexify/cli/cli.py +21 -18
  2. indexify/executor/api_objects.py +12 -0
  3. indexify/executor/downloader.py +4 -1
  4. indexify/executor/executor.py +65 -28
  5. indexify/executor/executor_flavor.py +7 -0
  6. indexify/executor/function_executor/function_executor.py +24 -11
  7. indexify/executor/function_executor/function_executor_state.py +9 -1
  8. indexify/executor/function_executor/function_executor_states_container.py +3 -1
  9. indexify/executor/function_executor/function_executor_status.py +2 -0
  10. indexify/executor/function_executor/health_checker.py +20 -2
  11. indexify/executor/function_executor/server/subprocess_function_executor_server_factory.py +6 -0
  12. indexify/executor/function_executor/single_task_runner.py +15 -11
  13. indexify/executor/function_executor/task_output.py +35 -2
  14. indexify/executor/grpc/channel_manager.py +160 -0
  15. indexify/executor/grpc/completed_tasks_container.py +26 -0
  16. indexify/executor/grpc/function_executor_controller.py +421 -0
  17. indexify/executor/grpc/state_reconciler.py +33 -38
  18. indexify/executor/grpc/state_reporter.py +100 -39
  19. indexify/executor/grpc/task_controller.py +449 -0
  20. indexify/executor/metrics/task_reporter.py +14 -0
  21. indexify/executor/task_fetcher.py +8 -3
  22. indexify/executor/task_reporter.py +112 -4
  23. indexify/executor/task_runner.py +1 -0
  24. indexify/proto/{task_scheduler.proto → executor_api.proto} +86 -11
  25. indexify/proto/executor_api_pb2.py +80 -0
  26. indexify/proto/{task_scheduler_pb2.pyi → executor_api_pb2.pyi} +162 -7
  27. indexify/proto/executor_api_pb2_grpc.py +227 -0
  28. {indexify-0.3.17.dist-info → indexify-0.3.19.dist-info}/METADATA +1 -1
  29. {indexify-0.3.17.dist-info → indexify-0.3.19.dist-info}/RECORD +32 -28
  30. indexify/executor/grpc/channel_creator.py +0 -53
  31. indexify/proto/task_scheduler_pb2.py +0 -64
  32. indexify/proto/task_scheduler_pb2_grpc.py +0 -170
  33. /indexify/executor/grpc/metrics/{channel_creator.py → channel_manager.py} +0 -0
  34. {indexify-0.3.17.dist-info → indexify-0.3.19.dist-info}/WHEEL +0 -0
  35. {indexify-0.3.17.dist-info → indexify-0.3.19.dist-info}/entry_points.txt +0 -0

indexify/executor/grpc/state_reconciler.py
@@ -7,14 +7,14 @@ from tensorlake.function_executor.proto.function_executor_pb2 import (
     SerializedObject,
 )
 
-from indexify.proto.task_scheduler_pb2 import (
+from indexify.proto.executor_api_pb2 import (
     DesiredExecutorState,
     FunctionExecutorDescription,
     FunctionExecutorStatus,
     GetDesiredExecutorStatesRequest,
 )
-from indexify.proto.task_scheduler_pb2_grpc import (
-    TaskSchedulerServiceStub,
+from indexify.proto.executor_api_pb2_grpc import (
+    ExecutorAPIStub,
 )
 
 from ..downloader import Downloader
@@ -30,20 +30,11 @@ from ..function_executor.server.function_executor_server_factory import (
 from ..function_executor.task_input import TaskInput
 from ..function_executor.task_output import TaskOutput
 from ..metrics.executor import (
-    METRIC_TASKS_COMPLETED_OUTCOME_ALL,
-    METRIC_TASKS_COMPLETED_OUTCOME_ERROR_CUSTOMER_CODE,
-    METRIC_TASKS_COMPLETED_OUTCOME_ERROR_PLATFORM,
-    METRIC_TASKS_COMPLETED_OUTCOME_SUCCESS,
-    metric_task_completion_latency,
-    metric_task_outcome_report_latency,
-    metric_task_outcome_report_retries,
-    metric_task_outcome_reports,
-    metric_tasks_completed,
     metric_tasks_fetched,
-    metric_tasks_reporting_outcome,
 )
 from ..task_reporter import TaskReporter
-from .channel_creator import ChannelCreator
+from .channel_manager import ChannelManager
+from .state_reporter import ExecutorStateReporter
 
 _RECONCILE_STREAM_BACKOFF_INTERVAL_SEC = 5
 
@@ -58,7 +49,8 @@ class ExecutorStateReconciler:
         config_path: Optional[str],
         downloader: Downloader,
         task_reporter: TaskReporter,
-        channel_creator: ChannelCreator,
+        channel_manager: ChannelManager,
+        state_reporter: ExecutorStateReporter,
         logger: Any,
     ):
         self._executor_id: str = executor_id
@@ -72,7 +64,8 @@ class ExecutorStateReconciler:
         self._function_executor_states: FunctionExecutorStatesContainer = (
             function_executor_states
         )
-        self._channel_creator = channel_creator
+        self._channel_manager: ChannelManager = channel_manager
+        self._state_reporter: ExecutorStateReporter = state_reporter
         self._logger: Any = logger.bind(module=__name__)
         self._is_shutdown: bool = False
         self._server_last_clock: Optional[int] = None
@@ -83,27 +76,25 @@ class ExecutorStateReconciler:
         Never raises any exceptions.
         """
         while not self._is_shutdown:
-            async with await self._channel_creator.create() as server_channel:
-                server_channel: grpc.aio.Channel
-                stub = TaskSchedulerServiceStub(server_channel)
-                while not self._is_shutdown:
-                    try:
-                        # TODO: Report state once before starting the stream.
-                        desired_states_stream: AsyncGenerator[
-                            DesiredExecutorState, None
-                        ] = stub.get_desired_executor_states(
-                            GetDesiredExecutorStatesRequest(
-                                executor_id=self._executor_id
-                            )
-                        )
-                        await self._process_desired_states_stream(desired_states_stream)
-                    except Exception as e:
-                        self._logger.error(
-                            f"Failed processing desired states stream, reconnecting in {_RECONCILE_STREAM_BACKOFF_INTERVAL_SEC} sec.",
-                            exc_info=e,
-                        )
-                        await asyncio.sleep(_RECONCILE_STREAM_BACKOFF_INTERVAL_SEC)
-                        break
+            stub = ExecutorAPIStub(await self._channel_manager.get_channel())
+            while not self._is_shutdown:
+                try:
+                    # Report state once before starting the stream so Server
+                    # doesn't use old state it knew about this Executor in the past.
+                    await self._state_reporter.report_state(stub)
+                    desired_states_stream: AsyncGenerator[
+                        DesiredExecutorState, None
+                    ] = stub.get_desired_executor_states(
+                        GetDesiredExecutorStatesRequest(executor_id=self._executor_id)
+                    )
+                    await self._process_desired_states_stream(desired_states_stream)
+                except Exception as e:
+                    self._logger.error(
+                        f"Failed processing desired states stream, reconnecting in {_RECONCILE_STREAM_BACKOFF_INTERVAL_SEC} sec.",
+                        exc_info=e,
+                    )
+                    await asyncio.sleep(_RECONCILE_STREAM_BACKOFF_INTERVAL_SEC)
+                    break
 
         self._logger.info("State reconciler shutdown.")
 
@@ -123,6 +114,7 @@ class ExecutorStateReconciler:
            await self._reconcile_state(new_state)
 
     async def _reconcile_state(self, new_state: DesiredExecutorState):
+        # TODO: use completed_tasks_container to ignore tasks that were already completed.
         await self._reconcile_function_executors(new_state)
         # TODO
         # await self._reconcile_task_allocations(new_state)
@@ -148,6 +140,7 @@
                graph_version=desired_function_executor.graph_version,
                function_name=desired_function_executor.function_name,
                image_uri=desired_function_executor.image_uri,
+               secret_names=list(desired_function_executor.secret_names),
            )
        )
 
@@ -291,7 +284,9 @@
        while True:
            logger = logger.bind(retries=reporting_retries)
            try:
-                await self._task_reporter.report(output=task_output, logger=logger)
+                await self._task_reporter.report(
+                    data_payload=task_output, logger=logger
+                )
                break
            except Exception as e:
                logger.error(
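
The reconciler hunks above replace the per-connection ChannelCreator with a shared ChannelManager and report state once before subscribing to the desired-states stream, retrying with a fixed backoff when the stream fails. Below is a minimal, self-contained sketch of that loop shape only; FakeChannelManager, the fake stream, and the constant are illustrative stand-ins, not the real ChannelManager or generated gRPC classes.

# Minimal sketch of the reconnect loop shape, under assumed stand-in types.
import asyncio
from typing import Any, AsyncGenerator

_BACKOFF_SEC = 5  # stands in for _RECONCILE_STREAM_BACKOFF_INTERVAL_SEC


class FakeChannelManager:
    """Hands out one shared channel object for every RPC, like ChannelManager."""

    def __init__(self) -> None:
        self._channel = object()

    async def get_channel(self) -> Any:
        return self._channel


async def fake_desired_states(channel: Any) -> AsyncGenerator[str, None]:
    """Stands in for stub.get_desired_executor_states(...)."""
    for state in ("desired-state-1", "desired-state-2"):
        yield state


async def run_reconciler(channel_manager: FakeChannelManager, rounds: int = 1) -> None:
    completed = 0
    while completed < rounds:  # the real loop runs until shutdown is requested
        channel = await channel_manager.get_channel()  # shared, reused channel
        try:
            # The real code first reports state over the same channel so the
            # server does not act on stale knowledge, then opens the stream.
            async for desired_state in fake_desired_states(channel):
                print("reconciling", desired_state)
            completed += 1
        except Exception:
            await asyncio.sleep(_BACKOFF_SEC)  # back off, then reconnect


asyncio.run(run_reconciler(FakeChannelManager()))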

indexify/executor/grpc/state_reporter.py
@@ -1,37 +1,44 @@
 import asyncio
+import hashlib
+from socket import gethostname
 from typing import Any, Dict, List, Optional
 
 import grpc
 
-from indexify.proto.task_scheduler_pb2 import (
+from indexify.proto.executor_api_pb2 import (
     AllowedFunction,
+)
+from indexify.proto.executor_api_pb2 import ExecutorFlavor as ExecutorFlavorProto
+from indexify.proto.executor_api_pb2 import (
     ExecutorState,
     ExecutorStatus,
     FunctionExecutorDescription,
 )
-from indexify.proto.task_scheduler_pb2 import (
+from indexify.proto.executor_api_pb2 import (
     FunctionExecutorState as FunctionExecutorStateProto,
 )
-from indexify.proto.task_scheduler_pb2 import (
+from indexify.proto.executor_api_pb2 import (
     FunctionExecutorStatus as FunctionExecutorStatusProto,
 )
-from indexify.proto.task_scheduler_pb2 import (
+from indexify.proto.executor_api_pb2 import (
     GPUModel,
     GPUResources,
     HostResources,
     ReportExecutorStateRequest,
 )
-from indexify.proto.task_scheduler_pb2_grpc import (
-    TaskSchedulerServiceStub,
+from indexify.proto.executor_api_pb2_grpc import (
+    ExecutorAPIStub,
 )
 
 from ..api_objects import FunctionURI
+from ..executor_flavor import ExecutorFlavor
 from ..function_executor.function_executor_state import FunctionExecutorState
 from ..function_executor.function_executor_states_container import (
     FunctionExecutorStatesContainer,
 )
 from ..function_executor.function_executor_status import FunctionExecutorStatus
-from .channel_creator import ChannelCreator
+from ..runtime_probes import RuntimeProbes
+from .channel_manager import ChannelManager
 from .metrics.state_reporter import (
     metric_state_report_errors,
     metric_state_report_latency,
@@ -47,24 +54,32 @@ class ExecutorStateReporter:
     def __init__(
         self,
         executor_id: str,
+        flavor: ExecutorFlavor,
+        version: str,
+        labels: Dict[str, str],
         development_mode: bool,
         function_allowlist: Optional[List[FunctionURI]],
         function_executor_states: FunctionExecutorStatesContainer,
-        channel_creator: ChannelCreator,
+        channel_manager: ChannelManager,
         logger: Any,
     ):
         self._executor_id: str = executor_id
+        self._flavor: ExecutorFlavor = flavor
+        self._version: str = version
+        self._labels: Dict[str, str] = labels.copy()
         self._development_mode: bool = development_mode
+        self._hostname: str = gethostname()
         self._function_executor_states: FunctionExecutorStatesContainer = (
             function_executor_states
         )
-        self._channel_creator = channel_creator
+        self._channel_manager = channel_manager
         self._logger: Any = logger.bind(module=__name__)
         self._is_shutdown: bool = False
         self._executor_status: ExecutorStatus = ExecutorStatus.EXECUTOR_STATUS_UNKNOWN
         self._allowed_functions: List[AllowedFunction] = _to_grpc_allowed_functions(
             function_allowlist
         )
+        self._labels.update(_label_values_to_strings(RuntimeProbes().probe().labels))
 
     def update_executor_status(self, value: ExecutorStatus):
         self._executor_status = value
@@ -75,24 +90,30 @@
         Never raises any exceptions.
         """
         while not self._is_shutdown:
-            async with await self._channel_creator.create() as server_channel:
-                server_channel: grpc.aio.Channel
-                stub = TaskSchedulerServiceStub(server_channel)
-                while not self._is_shutdown:
-                    try:
-                        await self._report_state(stub)
-                        await asyncio.sleep(_REPORTING_INTERVAL_SEC)
-                    except Exception as e:
-                        self._logger.error(
-                            f"Failed to report state to the server, reconnecting in {_REPORT_BACKOFF_ON_ERROR_SEC} sec.",
-                            exc_info=e,
-                        )
-                        await asyncio.sleep(_REPORT_BACKOFF_ON_ERROR_SEC)
-                        break
+            stub = ExecutorAPIStub(await self._channel_manager.get_channel())
+            while not self._is_shutdown:
+                try:
+                    # The periodic state reports serve as channel health monitoring requests
+                    # (same as TCP keep-alive). Channel Manager returns the same healthy channel
+                    # for all RPCs that we do from Executor to Server. So all the RPCs benefit
+                    # from this channel health monitoring.
+                    await self.report_state(stub)
+                    await asyncio.sleep(_REPORTING_INTERVAL_SEC)
+                except Exception as e:
+                    self._logger.error(
+                        f"Failed to report state to the server, reconnecting in {_REPORT_BACKOFF_ON_ERROR_SEC} sec.",
+                        exc_info=e,
+                    )
+                    await asyncio.sleep(_REPORT_BACKOFF_ON_ERROR_SEC)
+                    break
 
         self._logger.info("State reporter shutdown")
 
-    async def _report_state(self, stub: TaskSchedulerServiceStub):
+    async def report_state(self, stub: ExecutorAPIStub):
+        """Reports the current state to the server represented by the supplied stub.
+
+        Raises exceptions on failure.
+        """
         with (
             metric_state_report_errors.count_exceptions(),
             metric_state_report_latency.time(),
@@ -101,11 +122,16 @@
             state = ExecutorState(
                 executor_id=self._executor_id,
                 development_mode=self._development_mode,
-                executor_status=self._executor_status,
+                hostname=self._hostname,
+                flavor=_to_grpc_executor_flavor(self._flavor, self._logger),
+                version=self._version,
+                status=self._executor_status,
                 free_resources=await self._fetch_free_host_resources(),
                 allowed_functions=self._allowed_functions,
                 function_executor_states=await self._fetch_function_executor_states(),
+                labels=self._labels,
             )
+            state.state_hash = _state_hash(state)
 
             await stub.report_executor_state(
                 ReportExecutorStateRequest(executor_state=state),
@@ -129,20 +155,25 @@
 
         async for function_executor_state in self._function_executor_states:
             function_executor_state: FunctionExecutorState
-            states.append(
-                FunctionExecutorStateProto(
-                    description=FunctionExecutorDescription(
-                        id=function_executor_state.id,
-                        namespace=function_executor_state.namespace,
-                        graph_name=function_executor_state.graph_name,
-                        graph_version=function_executor_state.graph_version,
-                        function_name=function_executor_state.function_name,
-                    ),
-                    status=_to_grpc_function_executor_status(
-                        function_executor_state.status, self._logger
-                    ),
-                )
+            function_executor_state_proto = FunctionExecutorStateProto(
+                description=FunctionExecutorDescription(
+                    id=function_executor_state.id,
+                    namespace=function_executor_state.namespace,
+                    graph_name=function_executor_state.graph_name,
+                    graph_version=function_executor_state.graph_version,
+                    function_name=function_executor_state.function_name,
+                    secret_names=function_executor_state.secret_names,
+                ),
+                status=_to_grpc_function_executor_status(
+                    function_executor_state.status, self._logger
+                ),
+                status_message=function_executor_state.status_message,
             )
+            if function_executor_state.image_uri:
+                function_executor_state_proto.description.image_uri = (
+                    function_executor_state.image_uri
+                )
+            states.append(function_executor_state_proto)
 
         return states
 
@@ -182,7 +213,7 @@ _STATUS_MAPPING: Dict[FunctionExecutorStatus, Any] = {
     FunctionExecutorStatus.UNHEALTHY: FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_UNHEALTHY,
     FunctionExecutorStatus.DESTROYING: FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STOPPING,
     FunctionExecutorStatus.DESTROYED: FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STOPPED,
-    FunctionExecutorStatus.SHUTDOWN: FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STOPPED,
+    FunctionExecutorStatus.SHUTDOWN: FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_SHUTDOWN,
 }
 
 
@@ -197,3 +228,33 @@ def _to_grpc_function_executor_status(
         logger.error("Unexpected Function Executor status", status=status)
 
     return result
+
+
+_FLAVOR_MAPPING = {
+    ExecutorFlavor.OSS: ExecutorFlavorProto.EXECUTOR_FLAVOR_OSS,
+    ExecutorFlavor.PLATFORM: ExecutorFlavorProto.EXECUTOR_FLAVOR_PLATFORM,
+}
+
+
+def _to_grpc_executor_flavor(
+    flavor: ExecutorFlavor, logger: Any
+) -> ExecutorFlavorProto:
+    result: ExecutorFlavorProto = _FLAVOR_MAPPING.get(
+        flavor, ExecutorFlavorProto.EXECUTOR_FLAVOR_UNKNOWN
+    )
+
+    if result == ExecutorFlavorProto.EXECUTOR_FLAVOR_UNKNOWN:
+        logger.error("Unexpected Executor flavor", flavor=flavor)
+
+    return result
+
+
+def _label_values_to_strings(labels: Dict[str, Any]) -> Dict[str, str]:
+    return {k: str(v) for k, v in labels.items()}
+
+
+def _state_hash(state: ExecutorState) -> str:
+    serialized_state: bytes = state.SerializeToString(deterministic=True)
+    hasher = hashlib.sha256(usedforsecurity=False)
+    hasher.update(serialized_state)
+    return hasher.hexdigest()
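
The new _state_hash helper fingerprints the reported ExecutorState by hashing its deterministic protobuf serialization, presumably so the server can cheaply tell whether anything changed between periodic reports. Below is a rough, self-contained sketch of the same idea; it hashes a canonically serialized dict instead of the real protobuf message, and the field names are illustrative assumptions only.

# Rough sketch of a deterministic state fingerprint, assuming a plain dict in
# place of the real ExecutorState protobuf (which uses
# SerializeToString(deterministic=True)).
import hashlib
import json
from typing import Any, Dict


def state_hash(state: Dict[str, Any]) -> str:
    # Canonical serialization: sorted keys and fixed separators yield the same
    # bytes for the same logical state, mirroring deterministic proto encoding.
    serialized = json.dumps(state, sort_keys=True, separators=(",", ":")).encode()
    hasher = hashlib.sha256(usedforsecurity=False)  # fingerprint, not a security boundary
    hasher.update(serialized)
    return hasher.hexdigest()


a = state_hash({"executor_id": "e1", "status": "RUNNING", "labels": {"zone": "a"}})
b = state_hash({"labels": {"zone": "a"}, "status": "RUNNING", "executor_id": "e1"})
assert a == b  # key order does not change the fingerprint
print(a)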