indexify-0.3.30-py3-none-any.whl → indexify-0.4.2-py3-none-any.whl

This diff shows the contents of publicly released package versions as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (74)
  1. indexify/cli/__init__.py +18 -0
  2. indexify/cli/build_image.py +51 -0
  3. indexify/cli/deploy.py +57 -0
  4. indexify/cli/executor.py +205 -0
  5. indexify/executor/{grpc/channel_manager.py → channel_manager.py} +17 -11
  6. indexify/executor/executor.py +57 -311
  7. indexify/executor/function_allowlist.py +59 -0
  8. indexify/executor/function_executor/function_executor.py +12 -6
  9. indexify/executor/function_executor/invocation_state_client.py +25 -3
  10. indexify/executor/function_executor/server/function_executor_server_factory.py +3 -3
  11. indexify/executor/function_executor/server/subprocess_function_executor_server_factory.py +22 -11
  12. indexify/executor/function_executor_controller/__init__.py +13 -0
  13. indexify/executor/function_executor_controller/completed_task_metrics.py +82 -0
  14. indexify/executor/function_executor_controller/create_function_executor.py +154 -0
  15. indexify/executor/function_executor_controller/debug_event_loop.py +37 -0
  16. indexify/executor/function_executor_controller/destroy_function_executor.py +28 -0
  17. indexify/executor/function_executor_controller/downloads.py +199 -0
  18. indexify/executor/function_executor_controller/events.py +172 -0
  19. indexify/executor/function_executor_controller/function_executor_controller.py +759 -0
  20. indexify/executor/function_executor_controller/loggers.py +57 -0
  21. indexify/executor/function_executor_controller/message_validators.py +65 -0
  22. indexify/executor/function_executor_controller/metrics/completed_task_metrics.py +68 -0
  23. indexify/executor/{metrics/downloader.py → function_executor_controller/metrics/downloads.py} +1 -3
  24. indexify/executor/function_executor_controller/metrics/function_executor_controller.py +60 -0
  25. indexify/executor/{function_executor/metrics/single_task_runner.py → function_executor_controller/metrics/run_task.py} +9 -3
  26. indexify/executor/function_executor_controller/metrics/upload_task_output.py +39 -0
  27. indexify/executor/function_executor_controller/prepare_task.py +38 -0
  28. indexify/executor/function_executor_controller/run_task.py +201 -0
  29. indexify/executor/function_executor_controller/task_info.py +33 -0
  30. indexify/executor/function_executor_controller/task_output.py +122 -0
  31. indexify/executor/function_executor_controller/upload_task_output.py +234 -0
  32. indexify/executor/host_resources/host_resources.py +20 -25
  33. indexify/executor/{grpc/metrics → metrics}/channel_manager.py +1 -1
  34. indexify/executor/metrics/executor.py +0 -47
  35. indexify/executor/{grpc/metrics → metrics}/state_reconciler.py +1 -1
  36. indexify/executor/{grpc/metrics → metrics}/state_reporter.py +1 -1
  37. indexify/executor/monitoring/health_checker/generic_health_checker.py +6 -59
  38. indexify/executor/monitoring/health_checker/health_checker.py +0 -11
  39. indexify/executor/{grpc/state_reconciler.py → state_reconciler.py} +139 -141
  40. indexify/executor/state_reporter.py +364 -0
  41. indexify/proto/executor_api.proto +67 -59
  42. indexify/proto/executor_api_pb2.py +52 -52
  43. indexify/proto/executor_api_pb2.pyi +125 -104
  44. indexify/proto/executor_api_pb2_grpc.py +0 -47
  45. {indexify-0.3.30.dist-info → indexify-0.4.2.dist-info}/METADATA +1 -3
  46. indexify-0.4.2.dist-info/RECORD +68 -0
  47. indexify-0.4.2.dist-info/entry_points.txt +3 -0
  48. indexify/cli/cli.py +0 -267
  49. indexify/executor/api_objects.py +0 -92
  50. indexify/executor/downloader.py +0 -417
  51. indexify/executor/executor_flavor.py +0 -7
  52. indexify/executor/function_executor/function_executor_state.py +0 -107
  53. indexify/executor/function_executor/function_executor_states_container.py +0 -93
  54. indexify/executor/function_executor/function_executor_status.py +0 -95
  55. indexify/executor/function_executor/metrics/function_executor_state.py +0 -46
  56. indexify/executor/function_executor/metrics/function_executor_state_container.py +0 -10
  57. indexify/executor/function_executor/single_task_runner.py +0 -345
  58. indexify/executor/function_executor/task_input.py +0 -21
  59. indexify/executor/function_executor/task_output.py +0 -105
  60. indexify/executor/grpc/function_executor_controller.py +0 -418
  61. indexify/executor/grpc/metrics/task_controller.py +0 -8
  62. indexify/executor/grpc/state_reporter.py +0 -314
  63. indexify/executor/grpc/task_controller.py +0 -508
  64. indexify/executor/metrics/task_fetcher.py +0 -21
  65. indexify/executor/metrics/task_reporter.py +0 -53
  66. indexify/executor/metrics/task_runner.py +0 -52
  67. indexify/executor/monitoring/function_allowlist.py +0 -25
  68. indexify/executor/runtime_probes.py +0 -68
  69. indexify/executor/task_fetcher.py +0 -96
  70. indexify/executor/task_reporter.py +0 -459
  71. indexify/executor/task_runner.py +0 -177
  72. indexify-0.3.30.dist-info/RECORD +0 -68
  73. indexify-0.3.30.dist-info/entry_points.txt +0 -3
  74. {indexify-0.3.30.dist-info → indexify-0.4.2.dist-info}/WHEEL +0 -0
indexify/executor/executor.py
@@ -1,54 +1,35 @@
 import asyncio
 import signal
-import time
 from pathlib import Path
 from socket import gethostname
-from typing import Any, Dict, List, Optional
+from typing import Dict, List, Optional

 import structlog
-from tensorlake.function_executor.proto.function_executor_pb2 import SerializedObject
-from tensorlake.utils.logging import suppress as suppress_logging

 from indexify.proto.executor_api_pb2 import ExecutorStatus

-from .api_objects import FunctionURI, Task
 from .blob_store.blob_store import BLOBStore
-from .downloader import Downloader
-from .executor_flavor import ExecutorFlavor
-from .function_executor.function_executor_states_container import (
-    FunctionExecutorStatesContainer,
+from .channel_manager import ChannelManager
+from .function_allowlist import (
+    FunctionURI,
+    function_allowlist_to_indexed_dict,
+    parse_function_uris,
 )
 from .function_executor.server.function_executor_server_factory import (
     FunctionExecutorServerFactory,
 )
-from .grpc.channel_manager import ChannelManager
-from .grpc.state_reconciler import ExecutorStateReconciler
-from .grpc.state_reporter import ExecutorStateReporter
 from .host_resources.host_resources import HostResourcesProvider
 from .metrics.executor import (
-    METRIC_TASKS_COMPLETED_OUTCOME_ALL,
-    METRIC_TASKS_COMPLETED_OUTCOME_ERROR_CUSTOMER_CODE,
-    METRIC_TASKS_COMPLETED_OUTCOME_ERROR_PLATFORM,
-    METRIC_TASKS_COMPLETED_OUTCOME_SUCCESS,
     metric_executor_info,
     metric_executor_state,
-    metric_task_completion_latency,
-    metric_task_outcome_report_latency,
-    metric_task_outcome_report_retries,
-    metric_task_outcome_reports,
-    metric_tasks_completed,
-    metric_tasks_fetched,
-    metric_tasks_reporting_outcome,
 )
-from .monitoring.function_allowlist import function_allowlist_to_info_dict
 from .monitoring.health_check_handler import HealthCheckHandler
 from .monitoring.health_checker.health_checker import HealthChecker
 from .monitoring.prometheus_metrics_handler import PrometheusMetricsHandler
 from .monitoring.server import MonitoringServer
 from .monitoring.startup_probe_handler import StartupProbeHandler
-from .task_fetcher import TaskFetcher
-from .task_reporter import TaskReporter
-from .task_runner import TaskInput, TaskOutput, TaskRunner
+from .state_reconciler import ExecutorStateReconciler
+from .state_reporter import ExecutorStateReporter

 metric_executor_state.state("starting")

@@ -57,32 +38,26 @@ class Executor:
     def __init__(
         self,
         id: str,
-        flavor: ExecutorFlavor,
         version: str,
         labels: Dict[str, str],
-        code_path: Path,
+        cache_path: Path,
         health_checker: HealthChecker,
-        function_allowlist: Optional[List[FunctionURI]],
+        function_uris: List[str],
         function_executor_server_factory: FunctionExecutorServerFactory,
         server_addr: str,
         grpc_server_addr: str,
        config_path: Optional[str],
         monitoring_server_host: str,
         monitoring_server_port: int,
-        enable_grpc_state_reconciler: bool,
         blob_store: BLOBStore,
         host_resources_provider: HostResourcesProvider,
     ):
         self._logger = structlog.get_logger(module=__name__)
-        self._is_shutdown: bool = False
         protocol: str = "http"
         if config_path:
             self._logger.info("running the extractor with TLS enabled")
             protocol = "https"

-        self._server_addr = server_addr
-        self._base_url = f"{protocol}://{self._server_addr}"
-        self._code_path = code_path
         self._startup_probe_handler = StartupProbeHandler()
         self._monitoring_server = MonitoringServer(
             host=monitoring_server_host,
@@ -91,32 +66,17 @@ class Executor:
             health_probe_handler=HealthCheckHandler(health_checker),
             metrics_handler=PrometheusMetricsHandler(),
         )
-        self._function_executor_states = FunctionExecutorStatesContainer(
-            logger=self._logger
-        )
-        health_checker.set_function_executor_states_container(
-            self._function_executor_states
-        )
-        self._downloader = Downloader(
-            code_path=code_path,
-            base_url=self._base_url,
-            blob_store=blob_store,
-            config_path=config_path,
-        )
-        self._function_allowlist: Optional[List[FunctionURI]] = function_allowlist
-        self._function_executor_server_factory = function_executor_server_factory
         self._channel_manager = ChannelManager(
             server_address=grpc_server_addr,
             config_path=config_path,
             logger=self._logger,
         )
+        function_allowlist: List[FunctionURI] = parse_function_uris(function_uris)
         self._state_reporter = ExecutorStateReporter(
             executor_id=id,
-            flavor=flavor,
             version=version,
             labels=labels,
-            function_allowlist=self._function_allowlist,
-            function_executor_states=self._function_executor_states,
+            function_allowlist=function_allowlist,
             channel_manager=self._channel_manager,
             host_resources_provider=host_resources_provider,
             logger=self._logger,
@@ -124,69 +84,48 @@ class Executor:
         self._state_reporter.update_executor_status(
             ExecutorStatus.EXECUTOR_STATUS_STARTING_UP
         )
-        self._task_reporter = TaskReporter(
-            base_url=self._base_url,
+        self._state_reconciler = ExecutorStateReconciler(
             executor_id=id,
+            function_executor_server_factory=function_executor_server_factory,
+            base_url=f"{protocol}://{server_addr}",
             config_path=config_path,
-            channel_manager=self._channel_manager,
+            cache_path=cache_path,
             blob_store=blob_store,
+            channel_manager=self._channel_manager,
+            state_reporter=self._state_reporter,
+            logger=self._logger,
         )
-
-        # HTTP mode task runner
-        self._task_runner: Optional[TaskRunner] = None
-        self._task_fetcher: Optional[TaskFetcher] = None
-        # gRPC mode state reconciler that runs tasks
-        self._state_reconciler: Optional[ExecutorStateReconciler] = None
-
-        if enable_grpc_state_reconciler:
-            self._state_reconciler = ExecutorStateReconciler(
-                executor_id=id,
-                function_executor_server_factory=self._function_executor_server_factory,
-                base_url=self._base_url,
-                function_executor_states=self._function_executor_states,
-                config_path=config_path,
-                downloader=self._downloader,
-                task_reporter=self._task_reporter,
-                channel_manager=self._channel_manager,
-                state_reporter=self._state_reporter,
-                logger=self._logger,
-            )
-        else:
-            self._task_runner = TaskRunner(
-                executor_id=id,
-                function_executor_server_factory=function_executor_server_factory,
-                base_url=self._base_url,
-                function_executor_states=self._function_executor_states,
-                config_path=config_path,
-            )
-            self._task_fetcher = TaskFetcher(
-                executor_id=id,
-                executor_version=version,
-                labels=labels,
-                function_allowlist=function_allowlist,
-                protocol=protocol,
-                indexify_server_addr=self._server_addr,
-                config_path=config_path,
-            )
+        self._run_aio_task: Optional[asyncio.Task] = None
+        self._shutdown_aio_task: Optional[asyncio.Task] = None

         executor_info: Dict[str, str] = {
             "id": id,
-            "flavor": flavor.name,
             "version": version,
-            "code_path": str(code_path),
+            "cache_path": str(cache_path),
             "server_addr": server_addr,
             "grpc_server_addr": str(grpc_server_addr),
             "config_path": str(config_path),
-            "enable_grpc_state_reconciler": str(enable_grpc_state_reconciler),
             "hostname": gethostname(),
         }
         for key, value in labels.items():
             executor_info["label_" + key] = value
-        executor_info.update(function_allowlist_to_info_dict(function_allowlist))
+        executor_info.update(function_allowlist_to_indexed_dict(function_allowlist))
         metric_executor_info.info(executor_info)

     def run(self):
         asyncio.new_event_loop()
+
+        self._run_aio_task = asyncio.get_event_loop().create_task(
+            self._run(),
+            name="executor startup and run loop",
+        )
+
+        try:
+            asyncio.get_event_loop().run_until_complete(self._run_aio_task)
+        except asyncio.CancelledError:
+            pass  # Expected exception on shutdown
+
+    async def _run(self):
         for signum in [
             signal.SIGABRT,
             signal.SIGINT,
@@ -195,235 +134,42 @@ class Executor:
             signal.SIGHUP,
         ]:
             asyncio.get_event_loop().add_signal_handler(
-                signum, self.shutdown, asyncio.get_event_loop()
+                signum, self._shutdown_signal_handler, asyncio.get_event_loop()
             )

-        asyncio.get_event_loop().create_task(
+        asyncio.create_task(
             self._monitoring_server.run(), name="monitoring server runner"
         )
         self._state_reporter.update_executor_status(
             ExecutorStatus.EXECUTOR_STATUS_RUNNING
         )
-        asyncio.get_event_loop().create_task(
-            self._state_reporter.run(), name="state reporter runner"
-        )
-
+        self._state_reporter.run()
+        self._state_reconciler.run()
         metric_executor_state.state("running")
         self._startup_probe_handler.set_ready()

-        try:
-            if self._state_reconciler is None:
-                asyncio.get_event_loop().run_until_complete(
-                    self._http_task_runner_loop()
-                )
-            else:
-                asyncio.get_event_loop().run_until_complete(
-                    self._grpc_state_reconciler_loop()
-                )
-        except asyncio.CancelledError:
-            pass  # Suppress this expected exception and return without error (normally).
-
-    async def _grpc_state_reconciler_loop(self):
-        """Runs the gRPC state reconciler and state reporter.
-
-        Never raises any exceptions."""
-        await self._state_reconciler.run()
-
-    async def _http_task_runner_loop(self):
-        while not self._is_shutdown:
-            try:
-                async for task in self._task_fetcher.run():
-                    metric_tasks_fetched.inc()
-                    if not self._is_shutdown:
-                        asyncio.create_task(
-                            self._run_task(task), name="task runner (http mode)"
-                        )
-                self._logger.info("fetching tasks finished, reconnecting in 5 seconds")
-            except Exception as e:
-                self._logger.error(
-                    "failed fetching tasks, retrying in 5 seconds", exc_info=e
-                )
-            if not self._is_shutdown:
-                await asyncio.sleep(5)
-
-    async def _run_task(self, task: Task) -> None:
-        """Runs the supplied task.
-
-        Doesn't raise any Exceptions. All errors are reported to the server."""
-        start_time: float = time.monotonic()
-        logger = self._task_logger(task)
-        output: Optional[TaskOutput] = None
-
-        try:
-            output = await self._run_task_and_get_output(task, logger)
-            logger.info("task execution finished", success=output.success)
-        except Exception as e:
-            output = TaskOutput.internal_error(
-                task_id=task.id,
-                namespace=task.namespace,
-                graph_name=task.compute_graph,
-                function_name=task.compute_fn,
-                graph_version=task.graph_version,
-                graph_invocation_id=task.invocation_id,
-                output_payload_uri_prefix=task.output_payload_uri_prefix,
-            )
-            logger.error("task execution failed", exc_info=e)
-
-        if output.metrics is not None:
-            self.log_function_metrics(output)
-
-        with (
-            metric_tasks_reporting_outcome.track_inprogress(),
-            metric_task_outcome_report_latency.time(),
-        ):
-            metric_task_outcome_reports.inc()
-            await self._report_task_outcome(output=output, logger=logger)
-
-        metric_task_completion_latency.observe(time.monotonic() - start_time)
-
-    def log_function_metrics(self, output: TaskOutput):
-        for counter_name, counter_value in output.metrics.counters.items():
-            self._logger.info(
-                f"function_metric",
-                counter_name=counter_name,
-                counter_value=counter_value,
-                invocation_id=output.graph_invocation_id,
-                function_name=output.function_name,
-                graph_name=output.graph_name,
-                namespace=output.namespace,
-            )
-        for timer_name, timer_value in output.metrics.timers.items():
-            self._logger.info(
-                f"function_metric",
-                timer_name=timer_name,
-                timer_value=timer_value,
-                invocation_id=output.graph_invocation_id,
-                function_name=output.function_name,
-                graph_name=output.graph_name,
-                namespace=output.namespace,
-            )
-
-    async def _run_task_and_get_output(self, task: Task, logger: Any) -> TaskOutput:
-        graph: SerializedObject = await self._downloader.download_graph(
-            namespace=task.namespace,
-            graph_name=task.compute_graph,
-            graph_version=task.graph_version,
-            logger=logger,
-            data_payload=task.graph_payload,
-        )
-        input: SerializedObject = await self._downloader.download_input(
-            namespace=task.namespace,
-            graph_name=task.compute_graph,
-            graph_invocation_id=task.invocation_id,
-            input_key=task.input_key,
-            data_payload=task.input_payload,
-            logger=logger,
-        )
-        init_value: Optional[SerializedObject] = (
-            None
-            if task.reducer_output_id is None and task.reducer_input_payload is None
-            else (
-                await self._downloader.download_init_value(
-                    namespace=task.namespace,
-                    graph_name=task.compute_graph,
-                    function_name=task.compute_fn,
-                    graph_invocation_id=task.invocation_id,
-                    reducer_output_key=task.reducer_output_id,
-                    data_payload=task.reducer_input_payload,
-                    logger=logger,
-                )
-            )
-        )
-        return await self._task_runner.run(
-            TaskInput(
-                task=task,
-                graph=graph,
-                input=input,
-                init_value=init_value,
-            ),
-            logger=logger,
-        )
-
-    async def _report_task_outcome(self, output: TaskOutput, logger: Any) -> None:
-        """Reports the task with the given output to the server.
-
-        Doesn't raise any Exceptions. Runs till the reporting is successful."""
-        reporting_retries: int = 0
-
+        # Run the Executor forever until it is shut down.
         while True:
-            logger = logger.bind(retries=reporting_retries)
-            try:
-                await self._task_reporter.report(output=output, logger=logger)
-                break
-            except Exception as e:
-                logger.error(
-                    "failed to report task",
-                    exc_info=e,
-                )
-                reporting_retries += 1
-                metric_task_outcome_report_retries.inc()
-                await asyncio.sleep(5)
+            await asyncio.sleep(10)

-        metric_tasks_completed.labels(outcome=METRIC_TASKS_COMPLETED_OUTCOME_ALL).inc()
-        if output.is_internal_error:
-            metric_tasks_completed.labels(
-                outcome=METRIC_TASKS_COMPLETED_OUTCOME_ERROR_PLATFORM
-            ).inc()
-        elif output.success:
-            metric_tasks_completed.labels(
-                outcome=METRIC_TASKS_COMPLETED_OUTCOME_SUCCESS
-            ).inc()
-        else:
-            metric_tasks_completed.labels(
-                outcome=METRIC_TASKS_COMPLETED_OUTCOME_ERROR_CUSTOMER_CODE
-            ).inc()
-
-    async def _shutdown(self, loop):
-        self._logger.info(
-            "shutting down, all Executor logs are suppressed, no task outcomes will be reported to Server from this point"
-        )
-        if self._state_reporter is not None:
-            self._state_reporter.update_executor_status(
-                ExecutorStatus.EXECUTOR_STATUS_STOPPING
+    def _shutdown_signal_handler(self, loop):
+        if self._shutdown_aio_task is None:
+            self._shutdown_aio_task = loop.create_task(
+                self._shutdown(), name="executor shutdown"
             )
-        metric_executor_state.state("shutting_down")
-        # There will be lots of task cancellation exceptions and "X is shutting down"
-        # exceptions logged during Executor shutdown. Suppress their logs as they are
-        # expected and are confusing for users.
-        suppress_logging()

-        self._is_shutdown = True
-        await self._monitoring_server.shutdown()
-        await self._task_reporter.shutdown()
-
-        if self._task_runner is not None:
-            await self._task_runner.shutdown()
-
-        if self._state_reporter is not None:
-            await self._state_reporter.shutdown()
-        if self._state_reconciler is not None:
-            await self._state_reconciler.shutdown()
-        if self._channel_manager is not None:
-            await self._channel_manager.destroy()
-
-        # We need to shutdown all users of FE states first,
-        # otherwise states might disappear unexpectedly and we might
-        # report errors, etc that are expected.
-        await self._function_executor_states.shutdown()
-        # We mainly need to cancel the task that runs _.*_mode_loop().
-        for task in asyncio.all_tasks(loop):
-            task.cancel()
-        # The current task is cancelled, the code after this line will not run.
+    async def _shutdown(self):
+        self._logger.info("shutting down Executor")
+        metric_executor_state.state("shutting_down")

-    def shutdown(self, loop):
-        loop.create_task(self._shutdown(loop), name="executor shutdown")
+        # Shutdown state reconciler first because it changes reported state on shutdown.
+        await self._state_reconciler.shutdown()

-    def _task_logger(self, task: Task) -> Any:
-        return self._logger.bind(
-            namespace=task.namespace,
-            graph=task.compute_graph,
-            graph_version=task.graph_version,
-            invocation_id=task.invocation_id,
-            function_name=task.compute_fn,
-            task_id=task.id,
+        # Do one last state report with STOPPED status. This reduces latency in the system.
+        self._state_reporter.update_executor_status(
+            ExecutorStatus.EXECUTOR_STATUS_STOPPED
         )
+        await self._state_reporter.shutdown()
+        await self._channel_manager.destroy()
+        await self._monitoring_server.shutdown()
+        self._run_aio_task.cancel()
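The rewritten executor.py replaces the dual HTTP/gRPC plumbing with a single gRPC-driven lifecycle: run() parks one long-lived asyncio task, the signal handler schedules exactly one shutdown task, and _shutdown() cancels the run task as its final step, which unblocks run_until_complete(). A minimal, self-contained sketch of that pattern follows; the class and its method bodies are illustrative stand-ins, not the real Executor:

import asyncio
import signal
from typing import Optional


class LifecycleSketch:
    def __init__(self) -> None:
        self._run_aio_task: Optional[asyncio.Task] = None
        self._shutdown_aio_task: Optional[asyncio.Task] = None

    def run(self) -> None:
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        self._run_aio_task = loop.create_task(self._run(), name="run loop")
        try:
            loop.run_until_complete(self._run_aio_task)
        except asyncio.CancelledError:
            pass  # Expected: _shutdown() cancels the run task.

    async def _run(self) -> None:
        loop = asyncio.get_running_loop()
        for signum in (signal.SIGINT, signal.SIGTERM):
            loop.add_signal_handler(signum, self._shutdown_signal_handler, loop)
        while True:  # Park forever until a signal triggers shutdown.
            await asyncio.sleep(10)

    def _shutdown_signal_handler(self, loop: asyncio.AbstractEventLoop) -> None:
        if self._shutdown_aio_task is None:  # Repeated signals schedule shutdown once.
            self._shutdown_aio_task = loop.create_task(
                self._shutdown(), name="shutdown"
            )

    async def _shutdown(self) -> None:
        # Orderly subsystem teardown would happen here, then:
        self._run_aio_task.cancel()  # Unblocks run_until_complete() in run().


if __name__ == "__main__":
    LifecycleSketch().run()

Guarding on self._shutdown_aio_task mirrors the diff's approach to making shutdown idempotent under repeated signals.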
indexify/executor/function_allowlist.py (new file)
@@ -0,0 +1,59 @@
+from dataclasses import dataclass
+from typing import Dict, List, Optional
+
+
+@dataclass
+class FunctionURI:
+    namespace: str
+    compute_graph: str
+    compute_fn: str
+    version: Optional[str] = None
+
+
+def function_allowlist_to_indexed_dict(
+    function_allowlist: List[FunctionURI],
+) -> Dict[str, str]:
+    """Returns a dictionary with each function URI in the allowlist as a key-value pair.
+
+    The keys are prefixed indexes into the function allowlist, and the values are the function URIs.
+    """
+    indexed_dict = {}
+    counter = 0
+    for function_uri in function_allowlist:
+        function_uri: FunctionURI
+        indexed_dict[f"function_allowlist_{counter}"] = ":".join(
+            [
+                function_uri.namespace,
+                function_uri.compute_graph,
+                function_uri.compute_fn,
+                str(function_uri.version),
+            ]
+        )
+        counter += 1
+    return indexed_dict
+
+
+def parse_function_uris(function_uri_strs: List[str]) -> List[FunctionURI]:
+    """Parses a list of function URIs from strings to FunctionURI objects."""
+    uris: List[FunctionURI] = []
+    for uri_str in function_uri_strs:
+        tokens = uri_str.split(":")
+        if len(tokens) < 3 or len(tokens) > 4:
+            raise ValueError(
+                "Function should be specified as <namespace>:<workflow>:<function>:<version> or "
+                "<namespace>:<workflow>:<function>"
+            )
+        version: Optional[str] = None
+        if len(tokens) == 4:
+            version = tokens[3]
+
+        uris.append(
+            FunctionURI(
+                namespace=tokens[0],
+                compute_graph=tokens[1],
+                compute_fn=tokens[2],
+                version=version,
+            )
+        )
+
+    return uris
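For reference, here is how the two new helpers compose; the URI strings are made-up examples, and the printed output follows directly from the code above:

from indexify.executor.function_allowlist import (
    function_allowlist_to_indexed_dict,
    parse_function_uris,
)

uris = parse_function_uris(
    [
        "prod:image_pipeline:resize:1.2.3",  # pinned to a version
        "prod:image_pipeline:classify",  # no version; defaults to None
    ]
)
print(function_allowlist_to_indexed_dict(uris))
# {'function_allowlist_0': 'prod:image_pipeline:resize:1.2.3',
#  'function_allowlist_1': 'prod:image_pipeline:classify:None'}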
indexify/executor/function_executor/function_executor.py
@@ -56,7 +56,11 @@ from .server.function_executor_server_factory import (
 )


-class CustomerError(RuntimeError):
+class FunctionError(RuntimeError):
+    pass
+
+
+class FunctionTimeoutError(FunctionError):
     pass

@@ -92,7 +96,7 @@ class FunctionExecutor:
     ):
         """Creates and initializes a FunctionExecutorServer and all resources associated with it.

-        Raises CustomerError if the server failed to initialize due to an error in customer owned code or data.
+        Raises FunctionError if the server failed to initialize due to an error in customer owned code or data.
        Raises an Exception if an internal error occurred."""
        try:
            with (
@@ -134,7 +138,9 @@ class FunctionExecutor:
    async def destroy(self):
        """Destroys all resources owned by this FunctionExecutor.

-        Never raises any exceptions but logs them."""
+        Never raises any exceptions but logs them.
+        Idempotent.
+        """
        try:
            with (
                metric_destroy_errors.count_exceptions(),
@@ -312,12 +318,12 @@ async def _initialize_server(
        if initialize_response.success:
            return
        if initialize_response.HasField("customer_error"):
-            raise CustomerError(initialize_response.customer_error)
+            raise FunctionError(initialize_response.customer_error)
        else:
            raise Exception("initialize RPC failed at function executor server")
    except grpc.aio.AioRpcError as e:
        if e.code() == grpc.StatusCode.DEADLINE_EXCEEDED:
-            raise CustomerError(
-                f"Customer code timeout of {customer_code_timeout_sec:.3f} sec expired"
+            raise FunctionTimeoutError(
+                f"Function initialization exceeded its configured timeout of {customer_code_timeout_sec:.3f} sec."
            ) from e
        raise
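Because FunctionTimeoutError subclasses FunctionError, callers can treat timeouts specially while still catching every function-side failure with one handler. A self-contained sketch, with the exception classes mirroring the diff and the failing coroutine invented for illustration:

import asyncio


class FunctionError(RuntimeError):
    pass


class FunctionTimeoutError(FunctionError):
    pass


async def initialize_stub() -> None:
    # Stand-in for FunctionExecutor.initialize(); always times out here.
    raise FunctionTimeoutError(
        "Function initialization exceeded its configured timeout of 5.000 sec."
    )


async def main() -> None:
    try:
        await initialize_stub()
    except FunctionTimeoutError:
        # Must come first: it is the more specific subclass.
        print("report as a timeout")
    except FunctionError:
        print("report as a failure in customer-owned code or data")


asyncio.run(main())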
indexify/executor/function_executor/invocation_state_client.py
@@ -15,7 +15,6 @@ from tensorlake.function_executor.proto.function_executor_pb2_grpc import (
 )
 from tensorlake.function_executor.proto.message_validator import MessageValidator

-from ..downloader import serialized_object_from_http_response
 from .metrics.invocation_state_client import (
     metric_request_read_errors,
     metric_server_get_state_request_errors,
@@ -78,11 +77,18 @@ class InvocationStateClient:
        If a request is not coming from the task ID that was added here then it will
        be rejected. It's the caller's responsibility to only add task IDs that are being
        executed by the Function Executor so the Function Executor can't get access to
-        invocation state of tasks it doesn't run."""
+        invocation state of tasks it doesn't run.
+
+        Doesn't raise any exceptions.
+        """
        self._task_id_to_invocation_id[task_id] = invocation_id

    def remove_task_to_invocation_id_entry(self, task_id: str) -> None:
-        del self._task_id_to_invocation_id[task_id]
+        """Removes a task ID to invocation ID entry from the client's internal state.
+
+        Doesn't raise any exceptions.
+        """
+        self._task_id_to_invocation_id.pop(task_id, None)

    async def destroy(self) -> None:
        if self._request_loop_task is not None:
@@ -257,3 +263,19 @@ class InvocationStateClient:
            )
        else:
            raise ValueError("unknown request type")
+
+
+def serialized_object_from_http_response(response: httpx.Response) -> SerializedObject:
+    # We're hardcoding the content type currently used by Python SDK. It might change in the future.
+    # There's no other way for now to determine if the response is bytes or a string.
+    if response.headers["content-type"] in [
+        "application/octet-stream",
+        "application/pickle",
+    ]:
+        return SerializedObject(
+            bytes=response.content, content_type=response.headers["content-type"]
+        )
+    else:
+        return SerializedObject(
+            string=response.text, content_type=response.headers["content-type"]
+        )
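A hedged usage sketch for the relocated helper, exercising it with locally constructed httpx responses instead of real network calls; it assumes httpx and the tensorlake proto package are installed, and imports the function from its new home per the diff above:

import httpx

from indexify.executor.function_executor.invocation_state_client import (
    serialized_object_from_http_response,
)

binary = httpx.Response(
    200,
    content=b"\x80\x04K\x01.",  # a tiny pickle payload (the integer 1)
    headers={"content-type": "application/pickle"},
)
obj = serialized_object_from_http_response(binary)
assert obj.bytes == binary.content  # binary content types map to `bytes`

textual = httpx.Response(
    200,
    content=b'{"ok": true}',
    headers={"content-type": "application/json"},
)
obj = serialized_object_from_http_response(textual)
assert obj.string == '{"ok": true}'  # everything else maps to `string`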
indexify/executor/function_executor/server/function_executor_server_factory.py
@@ -24,9 +24,9 @@ class FunctionExecutorServerConfiguration:
    graph_version: str
    image_uri: Optional[str]
    secret_names: List[str]
-    cpu_ms_per_sec: Optional[int]
-    memory_bytes: Optional[int]
-    disk_bytes: Optional[int]
+    cpu_ms_per_sec: int
+    memory_bytes: int
+    disk_bytes: int
    gpu_count: int

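With Optional dropped, callers must now supply concrete resource amounts for these three fields. A hedged arithmetic sketch of plausible values; the 1000-CPU-milliseconds-per-second-per-core convention is an assumption based on the field name, not something this diff confirms:

cores = 2
cpu_ms_per_sec = cores * 1000  # assumed: 1000 CPU-ms/sec equals one full core
memory_bytes = 4 * 1024**3  # 4 GiB
disk_bytes = 20 * 1024**3  # 20 GiB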