indexify-0.3.31-py3-none-any.whl → indexify-0.4.3-py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (75)
  1. indexify/cli/__init__.py +18 -0
  2. indexify/cli/build_image.py +51 -0
  3. indexify/cli/deploy.py +57 -0
  4. indexify/cli/executor.py +205 -0
  5. indexify/executor/{grpc/channel_manager.py → channel_manager.py} +17 -11
  6. indexify/executor/executor.py +57 -313
  7. indexify/executor/function_allowlist.py +59 -0
  8. indexify/executor/function_executor/function_executor.py +12 -6
  9. indexify/executor/function_executor/invocation_state_client.py +25 -3
  10. indexify/executor/function_executor/server/function_executor_server_factory.py +3 -3
  11. indexify/executor/function_executor/server/subprocess_function_executor_server_factory.py +22 -11
  12. indexify/executor/function_executor_controller/__init__.py +13 -0
  13. indexify/executor/function_executor_controller/completed_task_metrics.py +82 -0
  14. indexify/executor/function_executor_controller/create_function_executor.py +158 -0
  15. indexify/executor/function_executor_controller/debug_event_loop.py +37 -0
  16. indexify/executor/function_executor_controller/destroy_function_executor.py +28 -0
  17. indexify/executor/function_executor_controller/downloads.py +199 -0
  18. indexify/executor/function_executor_controller/events.py +172 -0
  19. indexify/executor/function_executor_controller/function_executor_controller.py +759 -0
  20. indexify/executor/function_executor_controller/loggers.py +57 -0
  21. indexify/executor/function_executor_controller/message_validators.py +69 -0
  22. indexify/executor/function_executor_controller/metrics/completed_task_metrics.py +68 -0
  23. indexify/executor/{metrics/downloader.py → function_executor_controller/metrics/downloads.py} +1 -3
  24. indexify/executor/function_executor_controller/metrics/function_executor_controller.py +60 -0
  25. indexify/executor/{function_executor/metrics/single_task_runner.py → function_executor_controller/metrics/run_task.py} +9 -3
  26. indexify/executor/function_executor_controller/metrics/upload_task_output.py +39 -0
  27. indexify/executor/function_executor_controller/prepare_task.py +38 -0
  28. indexify/executor/function_executor_controller/run_task.py +201 -0
  29. indexify/executor/function_executor_controller/task_info.py +33 -0
  30. indexify/executor/function_executor_controller/task_output.py +122 -0
  31. indexify/executor/function_executor_controller/upload_task_output.py +234 -0
  32. indexify/executor/host_resources/host_resources.py +20 -25
  33. indexify/executor/host_resources/nvidia_gpu_allocator.py +8 -1
  34. indexify/executor/{grpc/metrics → metrics}/channel_manager.py +1 -1
  35. indexify/executor/metrics/executor.py +0 -47
  36. indexify/executor/{grpc/metrics → metrics}/state_reconciler.py +1 -1
  37. indexify/executor/{grpc/metrics → metrics}/state_reporter.py +1 -1
  38. indexify/executor/monitoring/health_checker/generic_health_checker.py +6 -59
  39. indexify/executor/monitoring/health_checker/health_checker.py +0 -11
  40. indexify/executor/{grpc/state_reconciler.py → state_reconciler.py} +139 -141
  41. indexify/executor/state_reporter.py +364 -0
  42. indexify/proto/executor_api.proto +68 -60
  43. indexify/proto/executor_api_pb2.py +52 -52
  44. indexify/proto/executor_api_pb2.pyi +129 -108
  45. indexify/proto/executor_api_pb2_grpc.py +0 -47
  46. {indexify-0.3.31.dist-info → indexify-0.4.3.dist-info}/METADATA +2 -5
  47. indexify-0.4.3.dist-info/RECORD +68 -0
  48. indexify-0.4.3.dist-info/entry_points.txt +3 -0
  49. indexify/cli/cli.py +0 -268
  50. indexify/executor/api_objects.py +0 -92
  51. indexify/executor/downloader.py +0 -417
  52. indexify/executor/executor_flavor.py +0 -7
  53. indexify/executor/function_executor/function_executor_state.py +0 -107
  54. indexify/executor/function_executor/function_executor_states_container.py +0 -93
  55. indexify/executor/function_executor/function_executor_status.py +0 -95
  56. indexify/executor/function_executor/metrics/function_executor_state.py +0 -46
  57. indexify/executor/function_executor/metrics/function_executor_state_container.py +0 -10
  58. indexify/executor/function_executor/single_task_runner.py +0 -345
  59. indexify/executor/function_executor/task_input.py +0 -21
  60. indexify/executor/function_executor/task_output.py +0 -105
  61. indexify/executor/grpc/function_executor_controller.py +0 -418
  62. indexify/executor/grpc/metrics/task_controller.py +0 -8
  63. indexify/executor/grpc/state_reporter.py +0 -317
  64. indexify/executor/grpc/task_controller.py +0 -508
  65. indexify/executor/metrics/task_fetcher.py +0 -21
  66. indexify/executor/metrics/task_reporter.py +0 -53
  67. indexify/executor/metrics/task_runner.py +0 -52
  68. indexify/executor/monitoring/function_allowlist.py +0 -25
  69. indexify/executor/runtime_probes.py +0 -68
  70. indexify/executor/task_fetcher.py +0 -96
  71. indexify/executor/task_reporter.py +0 -459
  72. indexify/executor/task_runner.py +0 -177
  73. indexify-0.3.31.dist-info/RECORD +0 -68
  74. indexify-0.3.31.dist-info/entry_points.txt +0 -3
  75. {indexify-0.3.31.dist-info → indexify-0.4.3.dist-info}/WHEEL +0 -0
--- a/indexify/executor/executor.py
+++ b/indexify/executor/executor.py
@@ -1,54 +1,35 @@
 import asyncio
 import signal
-import time
 from pathlib import Path
 from socket import gethostname
-from typing import Any, Dict, List, Optional
+from typing import Dict, List, Optional
 
 import structlog
-from tensorlake.function_executor.proto.function_executor_pb2 import SerializedObject
-from tensorlake.utils.logging import suppress as suppress_logging
 
 from indexify.proto.executor_api_pb2 import ExecutorStatus
 
-from .api_objects import FunctionURI, Task
 from .blob_store.blob_store import BLOBStore
-from .downloader import Downloader
-from .executor_flavor import ExecutorFlavor
-from .function_executor.function_executor_states_container import (
-    FunctionExecutorStatesContainer,
+from .channel_manager import ChannelManager
+from .function_allowlist import (
+    FunctionURI,
+    function_allowlist_to_indexed_dict,
+    parse_function_uris,
 )
 from .function_executor.server.function_executor_server_factory import (
     FunctionExecutorServerFactory,
 )
-from .grpc.channel_manager import ChannelManager
-from .grpc.state_reconciler import ExecutorStateReconciler
-from .grpc.state_reporter import ExecutorStateReporter
 from .host_resources.host_resources import HostResourcesProvider
 from .metrics.executor import (
-    METRIC_TASKS_COMPLETED_OUTCOME_ALL,
-    METRIC_TASKS_COMPLETED_OUTCOME_ERROR_CUSTOMER_CODE,
-    METRIC_TASKS_COMPLETED_OUTCOME_ERROR_PLATFORM,
-    METRIC_TASKS_COMPLETED_OUTCOME_SUCCESS,
     metric_executor_info,
     metric_executor_state,
-    metric_task_completion_latency,
-    metric_task_outcome_report_latency,
-    metric_task_outcome_report_retries,
-    metric_task_outcome_reports,
-    metric_tasks_completed,
-    metric_tasks_fetched,
-    metric_tasks_reporting_outcome,
 )
-from .monitoring.function_allowlist import function_allowlist_to_info_dict
 from .monitoring.health_check_handler import HealthCheckHandler
 from .monitoring.health_checker.health_checker import HealthChecker
 from .monitoring.prometheus_metrics_handler import PrometheusMetricsHandler
 from .monitoring.server import MonitoringServer
 from .monitoring.startup_probe_handler import StartupProbeHandler
-from .task_fetcher import TaskFetcher
-from .task_reporter import TaskReporter
-from .task_runner import TaskInput, TaskOutput, TaskRunner
+from .state_reconciler import ExecutorStateReconciler
+from .state_reporter import ExecutorStateReporter
 
 metric_executor_state.state("starting")
 
@@ -57,33 +38,26 @@ class Executor:
     def __init__(
         self,
         id: str,
-        development_mode: bool,
-        flavor: ExecutorFlavor,
         version: str,
         labels: Dict[str, str],
-        code_path: Path,
+        cache_path: Path,
         health_checker: HealthChecker,
-        function_allowlist: Optional[List[FunctionURI]],
+        function_uris: List[str],
         function_executor_server_factory: FunctionExecutorServerFactory,
         server_addr: str,
        grpc_server_addr: str,
         config_path: Optional[str],
         monitoring_server_host: str,
         monitoring_server_port: int,
-        enable_grpc_state_reconciler: bool,
         blob_store: BLOBStore,
         host_resources_provider: HostResourcesProvider,
     ):
         self._logger = structlog.get_logger(module=__name__)
-        self._is_shutdown: bool = False
         protocol: str = "http"
         if config_path:
             self._logger.info("running the extractor with TLS enabled")
             protocol = "https"
 
-        self._server_addr = server_addr
-        self._base_url = f"{protocol}://{self._server_addr}"
-        self._code_path = code_path
         self._startup_probe_handler = StartupProbeHandler()
         self._monitoring_server = MonitoringServer(
             host=monitoring_server_host,
@@ -92,33 +66,17 @@ class Executor:
             health_probe_handler=HealthCheckHandler(health_checker),
             metrics_handler=PrometheusMetricsHandler(),
         )
-        self._function_executor_states = FunctionExecutorStatesContainer(
-            logger=self._logger
-        )
-        health_checker.set_function_executor_states_container(
-            self._function_executor_states
-        )
-        self._downloader = Downloader(
-            code_path=code_path,
-            base_url=self._base_url,
-            blob_store=blob_store,
-            config_path=config_path,
-        )
-        self._function_allowlist: Optional[List[FunctionURI]] = function_allowlist
-        self._function_executor_server_factory = function_executor_server_factory
         self._channel_manager = ChannelManager(
             server_address=grpc_server_addr,
             config_path=config_path,
             logger=self._logger,
         )
+        function_allowlist: List[FunctionURI] = parse_function_uris(function_uris)
         self._state_reporter = ExecutorStateReporter(
             executor_id=id,
-            development_mode=development_mode,
-            flavor=flavor,
             version=version,
             labels=labels,
-            function_allowlist=self._function_allowlist,
-            function_executor_states=self._function_executor_states,
+            function_allowlist=function_allowlist,
             channel_manager=self._channel_manager,
             host_resources_provider=host_resources_provider,
             logger=self._logger,
@@ -126,69 +84,48 @@ class Executor:
         self._state_reporter.update_executor_status(
             ExecutorStatus.EXECUTOR_STATUS_STARTING_UP
         )
-        self._task_reporter = TaskReporter(
-            base_url=self._base_url,
+        self._state_reconciler = ExecutorStateReconciler(
             executor_id=id,
+            function_executor_server_factory=function_executor_server_factory,
+            base_url=f"{protocol}://{server_addr}",
             config_path=config_path,
-            channel_manager=self._channel_manager,
+            cache_path=cache_path,
             blob_store=blob_store,
+            channel_manager=self._channel_manager,
+            state_reporter=self._state_reporter,
+            logger=self._logger,
         )
-
-        # HTTP mode task runner
-        self._task_runner: Optional[TaskRunner] = None
-        self._task_fetcher: Optional[TaskFetcher] = None
-        # gRPC mode state reconciler that runs tasks
-        self._state_reconciler: Optional[ExecutorStateReconciler] = None
-
-        if enable_grpc_state_reconciler:
-            self._state_reconciler = ExecutorStateReconciler(
-                executor_id=id,
-                function_executor_server_factory=self._function_executor_server_factory,
-                base_url=self._base_url,
-                function_executor_states=self._function_executor_states,
-                config_path=config_path,
-                downloader=self._downloader,
-                task_reporter=self._task_reporter,
-                channel_manager=self._channel_manager,
-                state_reporter=self._state_reporter,
-                logger=self._logger,
-            )
-        else:
-            self._task_runner = TaskRunner(
-                executor_id=id,
-                function_executor_server_factory=function_executor_server_factory,
-                base_url=self._base_url,
-                function_executor_states=self._function_executor_states,
-                config_path=config_path,
-            )
-            self._task_fetcher = TaskFetcher(
-                executor_id=id,
-                executor_version=version,
-                labels=labels,
-                function_allowlist=function_allowlist,
-                protocol=protocol,
-                indexify_server_addr=self._server_addr,
-                config_path=config_path,
-            )
+        self._run_aio_task: Optional[asyncio.Task] = None
+        self._shutdown_aio_task: Optional[asyncio.Task] = None
 
         executor_info: Dict[str, str] = {
             "id": id,
-            "flavor": flavor.name,
             "version": version,
-            "code_path": str(code_path),
+            "cache_path": str(cache_path),
             "server_addr": server_addr,
             "grpc_server_addr": str(grpc_server_addr),
             "config_path": str(config_path),
-            "enable_grpc_state_reconciler": str(enable_grpc_state_reconciler),
             "hostname": gethostname(),
         }
         for key, value in labels.items():
             executor_info["label_" + key] = value
-        executor_info.update(function_allowlist_to_info_dict(function_allowlist))
+        executor_info.update(function_allowlist_to_indexed_dict(function_allowlist))
         metric_executor_info.info(executor_info)
 
     def run(self):
         asyncio.new_event_loop()
+
+        self._run_aio_task = asyncio.get_event_loop().create_task(
+            self._run(),
+            name="executor startup and run loop",
+        )
+
+        try:
+            asyncio.get_event_loop().run_until_complete(self._run_aio_task)
+        except asyncio.CancelledError:
+            pass  # Expected exception on shutdown
+
+    async def _run(self):
         for signum in [
             signal.SIGABRT,
             signal.SIGINT,
@@ -197,235 +134,42 @@ class Executor:
             signal.SIGHUP,
         ]:
             asyncio.get_event_loop().add_signal_handler(
-                signum, self.shutdown, asyncio.get_event_loop()
+                signum, self._shutdown_signal_handler, asyncio.get_event_loop()
             )
 
-        asyncio.get_event_loop().create_task(
+        asyncio.create_task(
             self._monitoring_server.run(), name="monitoring server runner"
         )
         self._state_reporter.update_executor_status(
             ExecutorStatus.EXECUTOR_STATUS_RUNNING
         )
-        asyncio.get_event_loop().create_task(
-            self._state_reporter.run(), name="state reporter runner"
-        )
-
+        self._state_reporter.run()
+        self._state_reconciler.run()
         metric_executor_state.state("running")
         self._startup_probe_handler.set_ready()
 
-        try:
-            if self._state_reconciler is None:
-                asyncio.get_event_loop().run_until_complete(
-                    self._http_task_runner_loop()
-                )
-            else:
-                asyncio.get_event_loop().run_until_complete(
-                    self._grpc_state_reconciler_loop()
-                )
-        except asyncio.CancelledError:
-            pass  # Suppress this expected exception and return without error (normally).
-
-    async def _grpc_state_reconciler_loop(self):
-        """Runs the gRPC state reconciler and state reporter.
-
-        Never raises any exceptions."""
-        await self._state_reconciler.run()
-
-    async def _http_task_runner_loop(self):
-        while not self._is_shutdown:
-            try:
-                async for task in self._task_fetcher.run():
-                    metric_tasks_fetched.inc()
-                    if not self._is_shutdown:
-                        asyncio.create_task(
-                            self._run_task(task), name="task runner (http mode)"
-                        )
-                self._logger.info("fetching tasks finished, reconnecting in 5 seconds")
-            except Exception as e:
-                self._logger.error(
-                    "failed fetching tasks, retrying in 5 seconds", exc_info=e
-                )
-            if not self._is_shutdown:
-                await asyncio.sleep(5)
-
-    async def _run_task(self, task: Task) -> None:
-        """Runs the supplied task.
-
-        Doesn't raise any Exceptions. All errors are reported to the server."""
-        start_time: float = time.monotonic()
-        logger = self._task_logger(task)
-        output: Optional[TaskOutput] = None
-
-        try:
-            output = await self._run_task_and_get_output(task, logger)
-            logger.info("task execution finished", success=output.success)
-        except Exception as e:
-            output = TaskOutput.internal_error(
-                task_id=task.id,
-                namespace=task.namespace,
-                graph_name=task.compute_graph,
-                function_name=task.compute_fn,
-                graph_version=task.graph_version,
-                graph_invocation_id=task.invocation_id,
-                output_payload_uri_prefix=task.output_payload_uri_prefix,
-            )
-            logger.error("task execution failed", exc_info=e)
-
-        if output.metrics is not None:
-            self.log_function_metrics(output)
-
-        with (
-            metric_tasks_reporting_outcome.track_inprogress(),
-            metric_task_outcome_report_latency.time(),
-        ):
-            metric_task_outcome_reports.inc()
-            await self._report_task_outcome(output=output, logger=logger)
-
-        metric_task_completion_latency.observe(time.monotonic() - start_time)
-
-    def log_function_metrics(self, output: TaskOutput):
-        for counter_name, counter_value in output.metrics.counters.items():
-            self._logger.info(
-                f"function_metric",
-                counter_name=counter_name,
-                counter_value=counter_value,
-                invocation_id=output.graph_invocation_id,
-                function_name=output.function_name,
-                graph_name=output.graph_name,
-                namespace=output.namespace,
-            )
-        for timer_name, timer_value in output.metrics.timers.items():
-            self._logger.info(
-                f"function_metric",
-                timer_name=timer_name,
-                timer_value=timer_value,
-                invocation_id=output.graph_invocation_id,
-                function_name=output.function_name,
-                graph_name=output.graph_name,
-                namespace=output.namespace,
-            )
-
-    async def _run_task_and_get_output(self, task: Task, logger: Any) -> TaskOutput:
-        graph: SerializedObject = await self._downloader.download_graph(
-            namespace=task.namespace,
-            graph_name=task.compute_graph,
-            graph_version=task.graph_version,
-            logger=logger,
-            data_payload=task.graph_payload,
-        )
-        input: SerializedObject = await self._downloader.download_input(
-            namespace=task.namespace,
-            graph_name=task.compute_graph,
-            graph_invocation_id=task.invocation_id,
-            input_key=task.input_key,
-            data_payload=task.input_payload,
-            logger=logger,
-        )
-        init_value: Optional[SerializedObject] = (
-            None
-            if task.reducer_output_id is None and task.reducer_input_payload is None
-            else (
-                await self._downloader.download_init_value(
-                    namespace=task.namespace,
-                    graph_name=task.compute_graph,
-                    function_name=task.compute_fn,
-                    graph_invocation_id=task.invocation_id,
-                    reducer_output_key=task.reducer_output_id,
-                    data_payload=task.reducer_input_payload,
-                    logger=logger,
-                )
-            )
-        )
-        return await self._task_runner.run(
-            TaskInput(
-                task=task,
-                graph=graph,
-                input=input,
-                init_value=init_value,
-            ),
-            logger=logger,
-        )
-
-    async def _report_task_outcome(self, output: TaskOutput, logger: Any) -> None:
-        """Reports the task with the given output to the server.
-
-        Doesn't raise any Exceptions. Runs till the reporting is successful."""
-        reporting_retries: int = 0
-
+        # Run the Executor forever until it is shut down.
         while True:
-            logger = logger.bind(retries=reporting_retries)
-            try:
-                await self._task_reporter.report(output=output, logger=logger)
-                break
-            except Exception as e:
-                logger.error(
-                    "failed to report task",
-                    exc_info=e,
-                )
-                reporting_retries += 1
-                metric_task_outcome_report_retries.inc()
-                await asyncio.sleep(5)
+            await asyncio.sleep(10)
 
-        metric_tasks_completed.labels(outcome=METRIC_TASKS_COMPLETED_OUTCOME_ALL).inc()
-        if output.is_internal_error:
-            metric_tasks_completed.labels(
-                outcome=METRIC_TASKS_COMPLETED_OUTCOME_ERROR_PLATFORM
-            ).inc()
-        elif output.success:
-            metric_tasks_completed.labels(
-                outcome=METRIC_TASKS_COMPLETED_OUTCOME_SUCCESS
-            ).inc()
-        else:
-            metric_tasks_completed.labels(
-                outcome=METRIC_TASKS_COMPLETED_OUTCOME_ERROR_CUSTOMER_CODE
-            ).inc()
-
-    async def _shutdown(self, loop):
-        self._logger.info(
-            "shutting down, all Executor logs are suppressed, no task outcomes will be reported to Server from this point"
-        )
-        if self._state_reporter is not None:
-            self._state_reporter.update_executor_status(
-                ExecutorStatus.EXECUTOR_STATUS_STOPPING
+    def _shutdown_signal_handler(self, loop):
+        if self._shutdown_aio_task is None:
+            self._shutdown_aio_task = loop.create_task(
+                self._shutdown(), name="executor shutdown"
             )
-        metric_executor_state.state("shutting_down")
-        # There will be lots of task cancellation exceptions and "X is shutting down"
-        # exceptions logged during Executor shutdown. Suppress their logs as they are
-        # expected and are confusing for users.
-        suppress_logging()
 
-        self._is_shutdown = True
-        await self._monitoring_server.shutdown()
-        await self._task_reporter.shutdown()
-
-        if self._task_runner is not None:
-            await self._task_runner.shutdown()
-
-        if self._state_reporter is not None:
-            await self._state_reporter.shutdown()
-        if self._state_reconciler is not None:
-            await self._state_reconciler.shutdown()
-        if self._channel_manager is not None:
-            await self._channel_manager.destroy()
-
-        # We need to shutdown all users of FE states first,
-        # otherwise states might disappear unexpectedly and we might
-        # report errors, etc that are expected.
-        await self._function_executor_states.shutdown()
-        # We mainly need to cancel the task that runs _.*_mode_loop().
-        for task in asyncio.all_tasks(loop):
-            task.cancel()
-        # The current task is cancelled, the code after this line will not run.
+    async def _shutdown(self):
+        self._logger.info("shutting down Executor")
+        metric_executor_state.state("shutting_down")
 
-    def shutdown(self, loop):
-        loop.create_task(self._shutdown(loop), name="executor shutdown")
+        # Shutdown state reconciler first because it changes reported state on shutdown.
+        await self._state_reconciler.shutdown()
 
-    def _task_logger(self, task: Task) -> Any:
-        return self._logger.bind(
-            namespace=task.namespace,
-            graph=task.compute_graph,
-            graph_version=task.graph_version,
-            invocation_id=task.invocation_id,
-            function_name=task.compute_fn,
-            task_id=task.id,
+        # Do one last state report with STOPPED status. This reduces latency in the system.
+        self._state_reporter.update_executor_status(
+            ExecutorStatus.EXECUTOR_STATUS_STOPPED
         )
+        await self._state_reporter.shutdown()
+        await self._channel_manager.destroy()
+        await self._monitoring_server.shutdown()
+        self._run_aio_task.cancel()
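
Taken together, the executor.py changes collapse the former dual-mode design (HTTP task runner vs. gRPC state reconciler) into a single gRPC-driven lifecycle: run() parks the event loop on a long-lived _run() task, signal handlers schedule exactly one shutdown task, and shutdown finishes by cancelling the run task. Below is a minimal, self-contained sketch of that pattern; the class and method names are illustrative, not the indexify API.

# Sketch of the run/shutdown lifecycle pattern used above (not the actual
# indexify code). Unix only: add_signal_handler is unsupported on Windows.
import asyncio
import signal
from typing import Optional


class Service:
    def __init__(self) -> None:
        self._run_task: Optional[asyncio.Task] = None
        self._shutdown_task: Optional[asyncio.Task] = None

    def run(self) -> None:
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        self._run_task = loop.create_task(self._run(), name="run loop")
        try:
            loop.run_until_complete(self._run_task)
        except asyncio.CancelledError:
            pass  # Expected: _shutdown() cancels the run task.

    async def _run(self) -> None:
        loop = asyncio.get_running_loop()
        for signum in (signal.SIGINT, signal.SIGTERM):
            loop.add_signal_handler(signum, self._on_signal, loop)
        while True:  # Stay alive until cancelled by _shutdown().
            await asyncio.sleep(10)

    def _on_signal(self, loop: asyncio.AbstractEventLoop) -> None:
        if self._shutdown_task is None:  # Schedule shutdown at most once.
            self._shutdown_task = loop.create_task(self._shutdown(), name="shutdown")

    async def _shutdown(self) -> None:
        # Release resources here (reconciler, reporter, channels, ...), then
        # cancel the run task last so run_until_complete() returns.
        self._run_task.cancel()


if __name__ == "__main__":
    Service().run()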
--- /dev/null
+++ b/indexify/executor/function_allowlist.py
@@ -0,0 +1,59 @@
+from dataclasses import dataclass
+from typing import Dict, List, Optional
+
+
+@dataclass
+class FunctionURI:
+    namespace: str
+    compute_graph: str
+    compute_fn: str
+    version: Optional[str] = None
+
+
+def function_allowlist_to_indexed_dict(
+    function_allowlist: List[FunctionURI],
+) -> Dict[str, str]:
+    """Returns a dictionary with each function URI in the allowlist as a key-value pair.
+
+    The keys are prefixed indexes in the function allowlist, and the values are the function URIs.
+    """
+    indexed_dict = {}
+    counter = 0
+    for function_uri in function_allowlist:
+        function_uri: FunctionURI
+        indexed_dict[f"function_allowlist_{counter}"] = ":".join(
+            [
+                function_uri.namespace,
+                function_uri.compute_graph,
+                function_uri.compute_fn,
+                str(function_uri.version),
+            ]
+        )
+        counter += 1
+    return indexed_dict
+
+
+def parse_function_uris(function_uri_strs: List[str]) -> List[FunctionURI]:
+    """Parses a list of function URIs from strings to FunctionURI objects."""
+    uris: List[FunctionURI] = []
+    for uri_str in function_uri_strs:
+        tokens = uri_str.split(":")
+        if len(tokens) < 3 or len(tokens) > 4:
+            raise ValueError(
+                "Function should be specified as <namespace>:<workflow>:<function>:<version> or "
+                "<namespace>:<workflow>:<function>"
+            )
+        version: Optional[str] = None
+        if len(tokens) == 4:
+            version = tokens[3]
+
+        uris.append(
+            FunctionURI(
+                namespace=tokens[0],
+                compute_graph=tokens[1],
+                compute_fn=tokens[2],
+                version=version,
+            )
+        )
+
+    return uris
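
For reference, a short usage sketch of the two helpers above; the URI values are invented. parse_function_uris accepts 3- or 4-token URIs, and function_allowlist_to_indexed_dict stringifies a missing version as the literal "None".

from indexify.executor.function_allowlist import (
    function_allowlist_to_indexed_dict,
    parse_function_uris,
)

uris = parse_function_uris(["prod:summarize:extract:3", "prod:summarize:embed"])
assert uris[0].version == "3" and uris[1].version is None
print(function_allowlist_to_indexed_dict(uris))
# {'function_allowlist_0': 'prod:summarize:extract:3',
#  'function_allowlist_1': 'prod:summarize:embed:None'}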
--- a/indexify/executor/function_executor/function_executor.py
+++ b/indexify/executor/function_executor/function_executor.py
@@ -56,7 +56,11 @@ from .server.function_executor_server_factory import (
 )
 
 
-class CustomerError(RuntimeError):
+class FunctionError(RuntimeError):
+    pass
+
+
+class FunctionTimeoutError(FunctionError):
     pass
 
 
@@ -92,7 +96,7 @@ class FunctionExecutor:
     ):
         """Creates and initializes a FunctionExecutorServer and all resources associated with it.
 
-        Raises CustomerError if the server failed to initialize due to an error in customer owned code or data.
+        Raises FunctionError if the server failed to initialize due to an error in customer owned code or data.
         Raises an Exception if an internal error occured."""
         try:
             with (
@@ -134,7 +138,9 @@ class FunctionExecutor:
     async def destroy(self):
         """Destroys all resources owned by this FunctionExecutor.
 
-        Never raises any exceptions but logs them."""
+        Never raises any exceptions but logs them.
+        Idempotent.
+        """
         try:
             with (
                 metric_destroy_errors.count_exceptions(),
@@ -312,12 +318,12 @@ async def _initialize_server(
        if initialize_response.success:
            return
        if initialize_response.HasField("customer_error"):
-            raise CustomerError(initialize_response.customer_error)
+            raise FunctionError(initialize_response.customer_error)
        else:
            raise Exception("initialize RPC failed at function executor server")
    except grpc.aio.AioRpcError as e:
        if e.code() == grpc.StatusCode.DEADLINE_EXCEEDED:
-            raise CustomerError(
-                f"Customer code timeout of {customer_code_timeout_sec:.3f} sec expired"
+            raise FunctionTimeoutError(
+                f"Function initialization exceeded its configured timeout of {customer_code_timeout_sec:.3f} sec."
            ) from e
        raise
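
The CustomerError rename splits one exception into a small hierarchy, so callers can handle initialization timeouts separately from other errors in customer code while a catch-all `except FunctionError` keeps working. A standalone sketch of why handler order matters; the classes are redeclared locally so the snippet runs on its own.

class FunctionError(RuntimeError):
    pass


class FunctionTimeoutError(FunctionError):
    pass


def classify(err: Exception) -> str:
    try:
        raise err
    except FunctionTimeoutError:  # must precede FunctionError, its base class
        return "timeout"
    except FunctionError:
        return "function error"


assert classify(FunctionTimeoutError("init timed out")) == "timeout"
assert classify(FunctionError("bad customer code")) == "function error"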
--- a/indexify/executor/function_executor/invocation_state_client.py
+++ b/indexify/executor/function_executor/invocation_state_client.py
@@ -15,7 +15,6 @@ from tensorlake.function_executor.proto.function_executor_pb2_grpc import (
 )
 from tensorlake.function_executor.proto.message_validator import MessageValidator
 
-from ..downloader import serialized_object_from_http_response
 from .metrics.invocation_state_client import (
     metric_request_read_errors,
     metric_server_get_state_request_errors,
@@ -78,11 +77,18 @@ class InvocationStateClient:
         If a request is not comming from the task ID that was added here then it will
         be rejected. It's caller's responsibility to only add task IDs that are being
         executed by the Function Executor so the Function Executor can't get access to
-        invocation state of tasks it doesn't run."""
+        invocation state of tasks it doesn't run.
+
+        Doesn't raise any exceptions.
+        """
         self._task_id_to_invocation_id[task_id] = invocation_id
 
     def remove_task_to_invocation_id_entry(self, task_id: str) -> None:
-        del self._task_id_to_invocation_id[task_id]
+        """Removes a task ID to invocation ID entry from the client's internal state.
+
+        Doesn't raise any exceptions.
+        """
+        self._task_id_to_invocation_id.pop(task_id, None)
 
     async def destroy(self) -> None:
         if self._request_loop_task is not None:
@@ -257,3 +263,19 @@ class InvocationStateClient:
            )
        else:
            raise ValueError("unknown request type")
+
+
+def serialized_object_from_http_response(response: httpx.Response) -> SerializedObject:
+    # We're hardcoding the content type currently used by Python SDK. It might change in the future.
+    # There's no other way for now to determine if the response is a bytes or string.
+    if response.headers["content-type"] in [
+        "application/octet-stream",
+        "application/pickle",
+    ]:
+        return SerializedObject(
+            bytes=response.content, content_type=response.headers["content-type"]
+        )
+    else:
+        return SerializedObject(
+            string=response.text, content_type=response.headers["content-type"]
+        )
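
The switch from del to dict.pop(task_id, None) is what makes the new "Doesn't raise any exceptions" docstring true: removal becomes idempotent, as this two-line illustration shows.

mapping = {"task-1": "inv-1"}
mapping.pop("task-1", None)  # removes the entry
mapping.pop("task-1", None)  # already gone: a no-op, whereas `del mapping["task-1"]` would raise KeyError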
--- a/indexify/executor/function_executor/server/function_executor_server_factory.py
+++ b/indexify/executor/function_executor/server/function_executor_server_factory.py
@@ -24,9 +24,9 @@ class FunctionExecutorServerConfiguration:
     graph_version: str
     image_uri: Optional[str]
     secret_names: List[str]
-    cpu_ms_per_sec: Optional[int]
-    memory_bytes: Optional[int]
-    disk_bytes: Optional[int]
+    cpu_ms_per_sec: int
+    memory_bytes: int
+    disk_bytes: int
     gpu_count: int
 
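
Dropping Optional from the resource fields tightens the contract: a server configuration must now always carry concrete CPU, memory, and disk limits. A sketch using an abbreviated local stand-in for the dataclass; only the fields visible in this hunk are included, the values are invented, and reading cpu_ms_per_sec as CPU milliseconds per wall-clock second is an assumption.

from dataclasses import dataclass
from typing import List, Optional


@dataclass
class ResourceLimits:  # abbreviated stand-in for FunctionExecutorServerConfiguration
    image_uri: Optional[str]
    secret_names: List[str]
    cpu_ms_per_sec: int  # no longer Optional: a value is always required
    memory_bytes: int
    disk_bytes: int
    gpu_count: int


limits = ResourceLimits(
    image_uri=None,
    secret_names=[],
    cpu_ms_per_sec=2000,       # assumed to mean roughly 2 CPU cores
    memory_bytes=1 * 1024**3,  # 1 GiB
    disk_bytes=10 * 1024**3,   # 10 GiB
    gpu_count=0,
)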