indexify 0.3.18__py3-none-any.whl → 0.3.20__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.
Files changed (35)
  1. indexify/cli/cli.py +15 -17
  2. indexify/executor/api_objects.py +12 -0
  3. indexify/executor/blob_store/blob_store.py +69 -0
  4. indexify/executor/blob_store/local_fs_blob_store.py +48 -0
  5. indexify/executor/blob_store/metrics/blob_store.py +33 -0
  6. indexify/executor/blob_store/s3_blob_store.py +85 -0
  7. indexify/executor/downloader.py +149 -25
  8. indexify/executor/executor.py +77 -41
  9. indexify/executor/function_executor/function_executor.py +24 -11
  10. indexify/executor/function_executor/function_executor_state.py +9 -1
  11. indexify/executor/function_executor/function_executor_states_container.py +8 -1
  12. indexify/executor/function_executor/function_executor_status.py +4 -0
  13. indexify/executor/function_executor/health_checker.py +7 -2
  14. indexify/executor/function_executor/invocation_state_client.py +4 -2
  15. indexify/executor/function_executor/server/subprocess_function_executor_server_factory.py +6 -0
  16. indexify/executor/function_executor/single_task_runner.py +15 -11
  17. indexify/executor/function_executor/task_output.py +36 -2
  18. indexify/executor/grpc/channel_manager.py +4 -3
  19. indexify/executor/grpc/function_executor_controller.py +391 -0
  20. indexify/executor/grpc/metrics/state_reconciler.py +17 -0
  21. indexify/executor/grpc/metrics/task_controller.py +8 -0
  22. indexify/executor/grpc/state_reconciler.py +324 -217
  23. indexify/executor/grpc/state_reporter.py +52 -41
  24. indexify/executor/grpc/task_controller.py +492 -0
  25. indexify/executor/metrics/task_reporter.py +14 -0
  26. indexify/executor/task_reporter.py +115 -6
  27. indexify/executor/task_runner.py +1 -0
  28. indexify/proto/executor_api.proto +91 -7
  29. indexify/proto/executor_api_pb2.py +49 -37
  30. indexify/proto/executor_api_pb2.pyi +158 -3
  31. indexify/proto/executor_api_pb2_grpc.py +47 -0
  32. {indexify-0.3.18.dist-info → indexify-0.3.20.dist-info}/METADATA +2 -1
  33. {indexify-0.3.18.dist-info → indexify-0.3.20.dist-info}/RECORD +35 -27
  34. {indexify-0.3.18.dist-info → indexify-0.3.20.dist-info}/WHEEL +0 -0
  35. {indexify-0.3.18.dist-info → indexify-0.3.20.dist-info}/entry_points.txt +0 -0
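The new `blob_store` package (files 3-6 above) suggests the executor can now fetch graph code and task data from blob storage (local filesystem or S3) rather than only over HTTP. The sketch below is a guess at the shape of such an abstraction based purely on the module layout; the method names and signatures are illustrative assumptions, not the actual `BLOBStore` API.

```python
# Hypothetical sketch of a blob store abstraction dispatching on URI scheme.
# Only the module layout (blob_store.py, local_fs_blob_store.py,
# s3_blob_store.py) and the BLOBStore class name come from the diff.
import asyncio
from typing import Any


class LocalFSBlobStore:
    async def get(self, uri: str, logger: Any) -> bytes:
        # file:///path/to/blob -> read from local disk off the event loop
        path = uri.removeprefix("file://")
        return await asyncio.to_thread(lambda: open(path, "rb").read())


class S3BlobStore:
    async def get(self, uri: str, logger: Any) -> bytes:
        # s3://bucket/key -> fetch via an S3 client (e.g. boto3), omitted here
        raise NotImplementedError


class BLOBStore:
    """Dispatches blob reads to the store that owns the URI scheme."""

    def __init__(self, local: LocalFSBlobStore, s3: S3BlobStore):
        self._local = local
        self._s3 = s3

    async def get(self, uri: str, logger: Any) -> bytes:
        if uri.startswith("file://"):
            return await self._local.get(uri, logger)
        if uri.startswith("s3://"):
            return await self._s3.get(uri, logger)
        raise ValueError(f"unsupported blob URI: {uri}")
```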
--- a/indexify/executor/executor.py
+++ b/indexify/executor/executor.py
@@ -12,6 +12,7 @@ from tensorlake.utils.logging import suppress as suppress_logging
 from indexify.proto.executor_api_pb2 import ExecutorStatus
 
 from .api_objects import FunctionURI, Task
+from .blob_store.blob_store import BLOBStore
 from .downloader import Downloader
 from .executor_flavor import ExecutorFlavor
 from .function_executor.function_executor_states_container import (
@@ -64,11 +65,12 @@ class Executor:
         function_allowlist: Optional[List[FunctionURI]],
         function_executor_server_factory: FunctionExecutorServerFactory,
         server_addr: str,
+        grpc_server_addr: str,
         config_path: Optional[str],
         monitoring_server_host: str,
         monitoring_server_port: int,
-        grpc_server_addr: Optional[str],
         enable_grpc_state_reconciler: bool,
+        blob_store: BLOBStore,
     ):
         self._logger = structlog.get_logger(module=__name__)
         self._is_shutdown: bool = False
@@ -95,45 +97,45 @@
             self._function_executor_states
         )
         self._downloader = Downloader(
-            code_path=code_path, base_url=self._base_url, config_path=config_path
+            code_path=code_path,
+            base_url=self._base_url,
+            blob_store=blob_store,
+            config_path=config_path,
+        )
+        self._function_allowlist: Optional[List[FunctionURI]] = function_allowlist
+        self._function_executor_server_factory = function_executor_server_factory
+        self._channel_manager = ChannelManager(
+            server_address=grpc_server_addr,
+            config_path=config_path,
+            logger=self._logger,
+        )
+        self._state_reporter = ExecutorStateReporter(
+            executor_id=id,
+            flavor=flavor,
+            version=version,
+            labels=labels,
+            development_mode=development_mode,
+            function_allowlist=self._function_allowlist,
+            function_executor_states=self._function_executor_states,
+            channel_manager=self._channel_manager,
+            logger=self._logger,
+        )
+        self._state_reporter.update_executor_status(
+            ExecutorStatus.EXECUTOR_STATUS_STARTING_UP
         )
         self._task_reporter = TaskReporter(
             base_url=self._base_url,
             executor_id=id,
             config_path=config_path,
+            channel_manager=self._channel_manager,
         )
-        self._function_allowlist: Optional[List[FunctionURI]] = function_allowlist
-        self._function_executor_server_factory = function_executor_server_factory
 
-        # HTTP mode services
+        # HTTP mode task runner
         self._task_runner: Optional[TaskRunner] = None
         self._task_fetcher: Optional[TaskFetcher] = None
-        # gRPC mode services
-        self._channel_manager: Optional[ChannelManager] = None
-        self._state_reporter: Optional[ExecutorStateReporter] = None
+        # gRPC mode state reconciler that runs tasks
         self._state_reconciler: Optional[ExecutorStateReconciler] = None
 
-        if grpc_server_addr is not None:
-            self._channel_manager = ChannelManager(
-                server_address=grpc_server_addr,
-                config_path=config_path,
-                logger=self._logger,
-            )
-            self._state_reporter = ExecutorStateReporter(
-                executor_id=id,
-                flavor=flavor,
-                version=version,
-                labels=labels,
-                development_mode=development_mode,
-                function_allowlist=self._function_allowlist,
-                function_executor_states=self._function_executor_states,
-                channel_manager=self._channel_manager,
-                logger=self._logger,
-            )
-            self._state_reporter.update_executor_status(
-                ExecutorStatus.EXECUTOR_STATUS_STARTING_UP
-            )
-
         if enable_grpc_state_reconciler:
             self._state_reconciler = ExecutorStateReconciler(
                 executor_id=id,
@@ -171,8 +173,8 @@
             "version": version,
             "code_path": str(code_path),
             "server_addr": server_addr,
-            "config_path": str(config_path),
             "grpc_server_addr": str(grpc_server_addr),
+            "config_path": str(config_path),
             "enable_grpc_state_reconciler": str(enable_grpc_state_reconciler),
             "hostname": gethostname(),
         }
@@ -192,12 +194,15 @@
                 signum, self.shutdown, asyncio.get_event_loop()
             )
 
-        asyncio.get_event_loop().create_task(self._monitoring_server.run())
-        if self._state_reporter is not None:
-            self._state_reporter.update_executor_status(
-                ExecutorStatus.EXECUTOR_STATUS_RUNNING
-            )
-            asyncio.get_event_loop().create_task(self._state_reporter.run())
+        asyncio.get_event_loop().create_task(
+            self._monitoring_server.run(), name="monitoring server runner"
+        )
+        self._state_reporter.update_executor_status(
+            ExecutorStatus.EXECUTOR_STATUS_RUNNING
+        )
+        asyncio.get_event_loop().create_task(
+            self._state_reporter.run(), name="state reporter runner"
+        )
 
         metric_executor_state.state("running")
         self._startup_probe_handler.set_ready()
@@ -218,7 +223,6 @@
         """Runs the gRPC state reconciler and state reporter.
 
         Never raises any exceptions."""
-        asyncio.create_task(self._state_reporter.run())
         await self._state_reconciler.run()
 
     async def _http_task_runner_loop(self):
@@ -227,11 +231,15 @@
             async for task in self._task_fetcher.run():
                 metric_tasks_fetched.inc()
                 if not self._is_shutdown:
-                    asyncio.create_task(self._run_task(task))
+                    asyncio.create_task(
+                        self._run_task(task), name="task runner (http mode)"
+                    )
+            self._logger.info("fetching tasks finished, reconnecting in 5 seconds")
         except Exception as e:
             self._logger.error(
                 "failed fetching tasks, retrying in 5 seconds", exc_info=e
             )
+        if not self._is_shutdown:
             await asyncio.sleep(5)
 
     async def _run_task(self, task: Task) -> None:
@@ -256,6 +264,9 @@
             )
             logger.error("task execution failed", exc_info=e)
 
+        if output.metrics is not None:
+            self.log_function_metrics(output)
+
         with (
             metric_tasks_reporting_outcome.track_inprogress(),
             metric_task_outcome_report_latency.time(),
@@ -265,18 +276,42 @@
 
         metric_task_completion_latency.observe(time.monotonic() - start_time)
 
+    def log_function_metrics(self, output: TaskOutput):
+        for counter_name, counter_value in output.metrics.counters.items():
+            self._logger.info(
+                f"function_metric",
+                counter_name=counter_name,
+                counter_value=counter_value,
+                invocation_id=output.graph_invocation_id,
+                function_name=output.function_name,
+                graph_name=output.graph_name,
+                namespace=output.namespace,
+            )
+        for timer_name, timer_value in output.metrics.timers.items():
+            self._logger.info(
+                f"function_metric",
+                timer_name=timer_name,
+                timer_value=timer_value,
+                invocation_id=output.graph_invocation_id,
+                function_name=output.function_name,
+                graph_name=output.graph_name,
+                namespace=output.namespace,
+            )
+
     async def _run_task_and_get_output(self, task: Task, logger: Any) -> TaskOutput:
         graph: SerializedObject = await self._downloader.download_graph(
             namespace=task.namespace,
             graph_name=task.compute_graph,
             graph_version=task.graph_version,
             logger=logger,
+            data_payload=None,
         )
         input: SerializedObject = await self._downloader.download_input(
             namespace=task.namespace,
             graph_name=task.compute_graph,
             graph_invocation_id=task.invocation_id,
             input_key=task.input_key,
+            data_payload=None,
             logger=logger,
         )
         init_value: Optional[SerializedObject] = (
@@ -289,6 +324,7 @@
                 function_name=task.compute_fn,
                 graph_invocation_id=task.invocation_id,
                 reducer_output_key=task.reducer_output_id,
+                data_payload=None,
                 logger=logger,
             )
         )
@@ -358,12 +394,12 @@
         if self._task_runner is not None:
             await self._task_runner.shutdown()
 
-        if self._channel_manager is not None:
-            await self._channel_manager.shutdown()
         if self._state_reporter is not None:
             await self._state_reporter.shutdown()
         if self._state_reconciler is not None:
             await self._state_reconciler.shutdown()
+        if self._channel_manager is not None:
+            await self._channel_manager.destroy()
 
         # We need to shutdown all users of FE states first,
         # otherwise states might disappear unexpectedly and we might
@@ -375,7 +411,7 @@
         # The current task is cancelled, the code after this line will not run.
 
     def shutdown(self, loop):
-        loop.create_task(self._shutdown(loop))
+        loop.create_task(self._shutdown(loop), name="executor shutdown")
 
     def _task_logger(self, task: Task) -> Any:
         return self._logger.bind(
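Several hunks above replace bare `asyncio.create_task(coro)` calls with named tasks. The `name=` parameter is standard `asyncio` (Python 3.8+) and pays off when debugging hung or leaked tasks, since diagnostics report the names. A small self-contained illustration (the task name is taken from the diff; the rest is a generic example):

```python
import asyncio


async def main() -> None:
    # Naming a task makes it identifiable in diagnostics.
    task = asyncio.create_task(asyncio.sleep(60), name="state reporter runner")

    # Every running task shows up with its name, e.g. in a debug dump
    # logged when shutdown hangs.
    for t in asyncio.all_tasks():
        print(t.get_name())  # "state reporter runner", plus the main "Task-1"

    task.cancel()


asyncio.run(main())
```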
--- a/indexify/executor/function_executor/function_executor.py
+++ b/indexify/executor/function_executor/function_executor.py
@@ -88,6 +88,7 @@ class FunctionExecutor:
         initialize_request: InitializeRequest,
         base_url: str,
         config_path: Optional[str],
+        customer_code_timeout_sec: Optional[float] = None,
     ):
         """Creates and initializes a FunctionExecutorServer and all resources associated with it.
 
@@ -103,7 +104,9 @@
             await self._establish_channel()
             stub: FunctionExecutorStub = FunctionExecutorStub(self._channel)
             await _collect_server_info(stub)
-            await _initialize_server(stub, initialize_request)
+            await _initialize_server(
+                stub, initialize_request, customer_code_timeout_sec
+            )
             await self._create_invocation_state_client(
                 stub=stub,
                 base_url=base_url,
@@ -293,18 +296,28 @@ async def _collect_server_info(stub: FunctionExecutorStub) -> None:
 
 
 async def _initialize_server(
-    stub: FunctionExecutorStub, initialize_request: InitializeRequest
+    stub: FunctionExecutorStub,
+    initialize_request: InitializeRequest,
+    customer_code_timeout_sec: Optional[float],
 ) -> None:
     with (
         metric_initialize_rpc_errors.count_exceptions(),
         metric_initialize_rpc_latency.time(),
    ):
-        initialize_response: InitializeResponse = await stub.initialize(
-            initialize_request
-        )
-        if initialize_response.success:
-            return
-        if initialize_response.HasField("customer_error"):
-            raise CustomerError(initialize_response.customer_error)
-        else:
-            raise Exception("initialize RPC failed at function executor server")
+        try:
+            initialize_response: InitializeResponse = await stub.initialize(
+                initialize_request,
+                timeout=customer_code_timeout_sec,
+            )
+            if initialize_response.success:
+                return
+            if initialize_response.HasField("customer_error"):
+                raise CustomerError(initialize_response.customer_error)
+            else:
+                raise Exception("initialize RPC failed at function executor server")
+        except grpc.aio.AioRpcError as e:
+            if e.code() == grpc.StatusCode.DEADLINE_EXCEEDED:
+                raise CustomerError(
+                    f"Customer code timeout of {customer_code_timeout_sec:.3f} sec expired"
+                ) from e
+            raise
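The `_initialize_server` change above enforces the customer-code time budget by passing it as a gRPC deadline (`timeout=None` means no deadline) and translating a `DEADLINE_EXCEEDED` status into `CustomerError`, so a slow initializer is attributed to customer code rather than the platform. A standalone sketch of the same pattern against an arbitrary `grpc.aio` stub; `stub_method` and `request` here are placeholders:

```python
import grpc
import grpc.aio


class CustomerError(Exception):
    """Raised when customer-supplied code fails or exceeds its time budget."""


async def call_with_customer_timeout(stub_method, request, timeout_sec):
    # timeout=None disables the deadline entirely, matching the
    # Optional[float] parameter in the diff above.
    try:
        return await stub_method(request, timeout=timeout_sec)
    except grpc.aio.AioRpcError as e:
        if e.code() == grpc.StatusCode.DEADLINE_EXCEEDED:
            raise CustomerError(
                f"Customer code timeout of {timeout_sec:.3f} sec expired"
            ) from e
        raise  # other RPC failures are platform-side; propagate unchanged
```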
--- a/indexify/executor/function_executor/function_executor_state.py
+++ b/indexify/executor/function_executor/function_executor_state.py
@@ -25,6 +25,7 @@ class FunctionExecutorState:
         graph_version: str,
         function_name: str,
         image_uri: Optional[str],
+        secret_names: List[str],
         logger: Any,
     ):
         # Read only fields.
@@ -33,6 +34,7 @@ class FunctionExecutorState:
         self.graph_name: str = graph_name
         self.function_name: str = function_name
         self.image_uri: Optional[str] = image_uri
+        self.secret_names: List[str] = secret_names
         self._logger: Any = logger.bind(
             module=__name__,
             function_executor_id=id,
@@ -47,6 +49,7 @@
         # TODO: Move graph_version to immutable fields once we migrate to gRPC State Reconciler.
         self.graph_version: str = graph_version
         self.status: FunctionExecutorStatus = FunctionExecutorStatus.DESTROYED
+        self.status_message: str = ""
         self.status_change_notifier: asyncio.Condition = asyncio.Condition(
             lock=self.lock
         )
@@ -62,7 +65,9 @@
         while self.status not in allowlist:
             await self.status_change_notifier.wait()
 
-    async def set_status(self, new_status: FunctionExecutorStatus) -> None:
+    async def set_status(
+        self, new_status: FunctionExecutorStatus, status_message: str = ""
+    ) -> None:
         """Sets the status of the Function Executor.
 
         The caller must hold the lock.
@@ -70,6 +75,7 @@
         """
         self.check_locked()
         if is_status_change_allowed(self.status, new_status):
+            # If status didn't change then still log it for visibility.
             self._logger.info(
                 "function executor status changed",
                 old_status=self.status.name,
@@ -78,12 +84,14 @@
             metric_function_executors_with_status.labels(status=self.status.name).dec()
             metric_function_executors_with_status.labels(status=new_status.name).inc()
             self.status = new_status
+            self.status_message = status_message
             self.status_change_notifier.notify_all()
         else:
             raise ValueError(
                 f"Invalid status change from {self.status} to {new_status}"
             )
 
+    # TODO: Delete this method once HTTP protocol is removed as it's used only there.
     async def destroy_function_executor(self) -> None:
         """Destroys the Function Executor if it exists.
 
--- a/indexify/executor/function_executor/function_executor_states_container.py
+++ b/indexify/executor/function_executor/function_executor_states_container.py
@@ -1,5 +1,5 @@
 import asyncio
-from typing import Any, AsyncGenerator, Dict, Optional
+from typing import Any, AsyncGenerator, Dict, List, Optional
 
 from .function_executor_state import FunctionExecutorState
 from .function_executor_status import FunctionExecutorStatus
@@ -26,6 +26,7 @@ class FunctionExecutorStatesContainer:
         graph_version: str,
         function_name: str,
         image_uri: Optional[str],
+        secret_names: List[str],
     ) -> FunctionExecutorState:
         """Get or create a function executor state with the given ID.
 
@@ -45,6 +46,7 @@
                 graph_version=graph_version,
                 function_name=function_name,
                 image_uri=image_uri,
+                secret_names=secret_names,
                 logger=self._logger,
             )
             self._states[id] = state
@@ -52,6 +54,11 @@
 
         return self._states[id]
 
+    async def get(self, id: str) -> FunctionExecutorState:
+        """Get the state with the given ID. Raises Exception if the state does not exist."""
+        async with self._lock:
+            return self._states[id]
+
     async def __aiter__(self) -> AsyncGenerator[FunctionExecutorState, None]:
         async with self._lock:
             for state in self._states.values():
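A note on the new `get()` above: the bare dict lookup `self._states[id]` raises `KeyError` when the ID is unknown, which is the "Exception" the docstring promises. Callers that can race with Function Executor removal should therefore treat a missing state as already gone; a hypothetical caller sketch:

```python
# Hypothetical caller: treat a missing Function Executor state as
# already-removed rather than as a fatal error.
async def try_get_state(container, fe_id: str, logger):
    try:
        return await container.get(fe_id)
    except KeyError:
        logger.info("function executor state already removed", fe_id=fe_id)
        return None
```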
--- a/indexify/executor/function_executor/function_executor_status.py
+++ b/indexify/executor/function_executor/function_executor_status.py
@@ -23,6 +23,7 @@ class FunctionExecutorStatus(Enum):
     UNHEALTHY = "Unhealthy"
     # STARTUP_FAILED_CUSTOMER_ERROR -> DESTROYING
     # STARTUP_FAILED_PLATFORM_ERROR -> DESTROYING
+    # RUNNING_TASK -> DESTROYING
     # UNHEALTHY -> DESTROYING
     # IDLE -> DESTROYING
     DESTROYING = "Destroying"
@@ -33,6 +34,8 @@ class FunctionExecutorStatus(Enum):
     SHUTDOWN = "Shutdown"  # Permanent stop state
 
 
+# TODO: After removing HTTP code, simplify state transitions by not allowing an FE
+# to start up after it was destroyed. The gRPC protocol treats FEs as ephemeral and never revives them.
 def is_status_change_allowed(
     current_status: FunctionExecutorStatus, new_status: FunctionExecutorStatus
 ) -> bool:
@@ -69,6 +72,7 @@
     ],
     FunctionExecutorStatus.RUNNING_TASK: [
         FunctionExecutorStatus.RUNNING_TASK,
+        FunctionExecutorStatus.DESTROYING,
         FunctionExecutorStatus.IDLE,
         FunctionExecutorStatus.UNHEALTHY,
         FunctionExecutorStatus.SHUTDOWN,
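The hunk above adds `RUNNING_TASK -> DESTROYING` to the allowed transitions, which is what lets the gRPC protocol tear down a Function Executor while a task is still running. The `is_status_change_allowed` check is a plain allowlist lookup; a condensed sketch of the pattern with a made-up three-state enum (not the real `FunctionExecutorStatus` table):

```python
from enum import Enum


class Status(Enum):
    IDLE = "Idle"
    RUNNING_TASK = "RunningTask"
    DESTROYING = "Destroying"


# Allowed transitions, mirroring the dict-of-lists shape in the diff above.
_ALLOWED = {
    Status.IDLE: [Status.RUNNING_TASK, Status.DESTROYING],
    Status.RUNNING_TASK: [Status.IDLE, Status.DESTROYING],
    Status.DESTROYING: [],  # terminal in this toy example
}


def is_status_change_allowed(current: Status, new: Status) -> bool:
    return new in _ALLOWED[current]


assert is_status_change_allowed(Status.RUNNING_TASK, Status.DESTROYING)
assert not is_status_change_allowed(Status.DESTROYING, Status.RUNNING_TASK)
```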
--- a/indexify/executor/function_executor/health_checker.py
+++ b/indexify/executor/function_executor/health_checker.py
@@ -107,7 +107,9 @@ class HealthChecker:
             return
 
         self._health_check_failed_callback = callback
-        self._health_check_loop_task = asyncio.create_task(self._health_check_loop())
+        self._health_check_loop_task = asyncio.create_task(
+            self._health_check_loop(), name="function executor health checker loop"
+        )
 
     def stop(self) -> None:
         """Stops the periodic health checks.
@@ -126,7 +128,10 @@
                 break
             await asyncio.sleep(HEALTH_CHECK_POLL_PERIOD_SEC)
 
-        asyncio.create_task(self._health_check_failed_callback(result))
+        asyncio.create_task(
+            self._health_check_failed_callback(result),
+            name="function executor health check failure callback",
+        )
         self._health_check_loop_task = None
 
 
--- a/indexify/executor/function_executor/invocation_state_client.py
+++ b/indexify/executor/function_executor/invocation_state_client.py
@@ -67,7 +67,8 @@ class InvocationStateClient:
             self._response_generator()
         )
         self._request_loop_task = asyncio.create_task(
-            self._request_loop(server_requests)
+            self._request_loop(server_requests),
+            name="graph invocation state client request processing loop",
         )
 
     def add_task_to_invocation_id_entry(self, task_id: str, invocation_id: str) -> None:
@@ -100,7 +101,8 @@
             pass
         except asyncio.CancelledError:
             # This async task was cancelled by destroy(). Normal situation too.
-            pass
+            # This exception should not be suppressed, see Python asyncio docs.
+            raise
         except Exception as e:
             metric_request_read_errors.inc()
             self._logger.error(
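The `pass` → `raise` change above follows asyncio's guidance: `asyncio.CancelledError` derives from `BaseException` (since Python 3.8) precisely so that ordinary `except Exception` handlers don't swallow it, and a coroutine that catches it should re-raise so `task.cancel()` completes as a real cancellation. A minimal illustration of the difference:

```python
import asyncio


async def suppresses_cancel():
    try:
        await asyncio.sleep(10)
    except asyncio.CancelledError:
        pass  # BAD: the coroutine returns normally, hiding the cancellation


async def propagates_cancel():
    try:
        await asyncio.sleep(10)
    except asyncio.CancelledError:
        raise  # GOOD: lets task.cancel() complete as a cancellation


async def main():
    t1 = asyncio.create_task(suppresses_cancel())
    t2 = asyncio.create_task(propagates_cancel())
    await asyncio.sleep(0)  # let both tasks start
    t1.cancel()
    t2.cancel()
    await asyncio.gather(t1, t2, return_exceptions=True)
    print(t1.cancelled())  # False: the cancellation was swallowed
    print(t2.cancelled())  # True


asyncio.run(main())
```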
--- a/indexify/executor/function_executor/server/subprocess_function_executor_server_factory.py
+++ b/indexify/executor/function_executor/server/subprocess_function_executor_server_factory.py
@@ -25,6 +25,12 @@ class SubprocessFunctionExecutorServerFactory(FunctionExecutorServerFactory):
         logger = logger.bind(module=__name__)
         port: Optional[int] = None
 
+        if len(config.secret_names) > 0:
+            logger.warning(
+                "Subprocess Function Executor does not support secrets. Please supply secrets as environment variables.",
+                secret_names=config.secret_names,
+            )
+
         try:
             port = self._allocate_port()
             args = [
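Context for the warning above: a subprocess Function Executor runs directly on the host, so the factory has no secrets mechanism to inject named secrets into; the warning points operators at environment variables instead. A hypothetical sketch of that workaround (the executable name, flag, and variable names are placeholders, not the actual launch command):

```python
import os
import subprocess

# Hypothetical: pass secret values to a subprocess Function Executor via its
# environment, since the subprocess factory cannot inject named secrets.
env = dict(os.environ)
env["MY_API_KEY"] = "..."  # populated from your own secret source

subprocess.Popen(
    ["function-executor-server", "--port", "50051"],  # placeholder command
    env=env,
)
```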
--- a/indexify/executor/function_executor/single_task_runner.py
+++ b/indexify/executor/function_executor/single_task_runner.py
@@ -10,6 +10,7 @@ from tensorlake.function_executor.proto.function_executor_pb2 import (
 from tensorlake.function_executor.proto.function_executor_pb2_grpc import (
     FunctionExecutorStub,
 )
+from tensorlake.function_executor.proto.message_validator import MessageValidator
 
 from ..api_objects import Task
 from .function_executor import CustomerError, FunctionExecutor
@@ -26,7 +27,7 @@ from .server.function_executor_server_factory import (
     FunctionExecutorServerFactory,
 )
 from .task_input import TaskInput
-from .task_output import TaskOutput
+from .task_output import TaskMetrics, TaskOutput
 
 
 class SingleTaskRunner:
@@ -286,16 +287,17 @@ class _RunningTaskContextManager:
 
 
 def _task_output(task: Task, response: RunTaskResponse) -> TaskOutput:
-    required_fields = [
-        "stdout",
-        "stderr",
-        "is_reducer",
-        "success",
-    ]
-
-    for field in required_fields:
-        if not response.HasField(field):
-            raise ValueError(f"Response is missing required field: {field}")
+    response_validator = MessageValidator(response)
+    response_validator.required_field("stdout")
+    response_validator.required_field("stderr")
+    response_validator.required_field("is_reducer")
+    response_validator.required_field("success")
+
+    metrics = TaskMetrics(counters={}, timers={})
+    if response.HasField("metrics"):
+        # Can be None if e.g. function failed.
+        metrics.counters = dict(response.metrics.counters)
+        metrics.timers = dict(response.metrics.timers)
 
     output = TaskOutput(
         task_id=task.id,
@@ -308,10 +310,12 @@ def _task_output(task: Task, response: RunTaskResponse) -> TaskOutput:
         stderr=response.stderr,
         reducer=response.is_reducer,
         success=response.success,
+        metrics=metrics,
     )
 
     if response.HasField("function_output"):
         output.function_output = response.function_output
+        output.output_encoding = response.function_output.output_encoding
     if response.HasField("router_output"):
         output.router_output = response.router_output
 
--- a/indexify/executor/function_executor/task_output.py
+++ b/indexify/executor/function_executor/task_output.py
@@ -1,11 +1,17 @@
-from typing import Optional
+from typing import Dict, Optional
 
 from tensorlake.function_executor.proto.function_executor_pb2 import (
     FunctionOutput,
     RouterOutput,
 )
 
-from ..api_objects import Task
+
+class TaskMetrics:
+    """Metrics for a task."""
+
+    def __init__(self, counters: Dict[str, int], timers: Dict[str, float]):
+        self.counters = counters
+        self.timers = timers
 
 
 class TaskOutput:
@@ -19,6 +25,7 @@ class TaskOutput:
         function_name: str,
         graph_version: str,
         graph_invocation_id: str,
+        output_encoding: Optional[str] = None,
         function_output: Optional[FunctionOutput] = None,
         router_output: Optional[RouterOutput] = None,
         stdout: Optional[str] = None,
@@ -26,6 +33,7 @@
         reducer: bool = False,
         success: bool = False,
         is_internal_error: bool = False,
+        metrics: Optional[TaskMetrics] = None,
     ):
         self.task_id = task_id
         self.namespace = namespace
@@ -40,6 +48,8 @@
         self.reducer = reducer
         self.success = success
         self.is_internal_error = is_internal_error
+        self.metrics = metrics
+        self.output_encoding = output_encoding
 
     @classmethod
     def internal_error(
@@ -63,3 +73,27 @@
             stderr="Platform failed to execute the function.",
             is_internal_error=True,
         )
+
+    @classmethod
+    def function_timeout(
+        cls,
+        task_id: str,
+        namespace: str,
+        graph_name: str,
+        function_name: str,
+        graph_version: str,
+        graph_invocation_id: str,
+        timeout_sec: float,
+    ) -> "TaskOutput":
+        """Creates a TaskOutput for a function timeout error."""
+        # Task stdout, stderr is not available.
+        return TaskOutput(
+            task_id=task_id,
+            namespace=namespace,
+            graph_name=graph_name,
+            function_name=function_name,
+            graph_version=graph_version,
+            graph_invocation_id=graph_invocation_id,
+            stderr=f"Function exceeded its configured timeout of {timeout_sec:.3f} sec.",
+            is_internal_error=False,
+        )
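The new `function_timeout` constructor gives timeouts a distinct outcome: `is_internal_error=False` attributes the failure to the function rather than the platform, and the synthetic stderr line stands in for the real stream, which is unavailable once the executor gives up on the task. A hedged sketch of how a call site might use it (the `run_task_rpc` coroutine and surrounding wiring are assumptions; the `task.*` field names appear in the executor.py hunks above):

```python
import asyncio

from indexify.executor.function_executor.task_output import TaskOutput


async def run_with_timeout(task, run_task_rpc, task_timeout_sec: float):
    # Hypothetical call site: turn an asyncio timeout into a reportable outcome.
    try:
        return await asyncio.wait_for(run_task_rpc(), timeout=task_timeout_sec)
    except asyncio.TimeoutError:
        return TaskOutput.function_timeout(
            task_id=task.id,
            namespace=task.namespace,
            graph_name=task.compute_graph,
            function_name=task.compute_fn,
            graph_version=task.graph_version,
            graph_invocation_id=task.invocation_id,
            timeout_sec=task_timeout_sec,
        )
```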
--- a/indexify/executor/grpc/channel_manager.py
+++ b/indexify/executor/grpc/channel_manager.py
@@ -69,6 +69,10 @@ class ChannelManager:
             certificate_chain=certificate_chain,
         )
 
+    async def destroy(self):
+        if self._channel is not None:
+            await self._destroy_locked_channel()
+
     async def get_channel(self) -> grpc.aio.Channel:
         """Returns a channel to the gRPC server.
 
@@ -155,6 +159,3 @@
         except Exception as e:
             self._logger.error("failed closing channel", exc_info=e)
         self._channel = None
-
-    async def shutdown(self):
-        pass
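The rename from the no-op `shutdown()` to `destroy()` makes teardown real: the executor's shutdown path (see the executor.py hunk above) now closes the underlying channel last, after the state reporter and reconciler that use it have stopped. Closing a `grpc.aio` channel is itself a coroutine; a minimal sketch:

```python
# grpc.aio channels are closed asynchronously; grace=None (the default)
# cancels any RPCs still in flight rather than waiting for them.
import grpc.aio


async def close_channel(channel: grpc.aio.Channel) -> None:
    await channel.close(grace=None)
```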