indexify 0.4.9__py3-none-any.whl → 0.4.11__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,7 @@
1
1
  import asyncio
2
2
  import time
3
3
  from collections.abc import Coroutine
4
+ from enum import Enum
4
5
  from pathlib import Path
5
6
  from typing import Any, Dict, List, Optional
6
7
 
@@ -43,12 +44,14 @@ from .events import (
43
44
  from .function_executor_startup_output import FunctionExecutorStartupOutput
44
45
  from .loggers import function_executor_logger, task_allocation_logger
45
46
  from .metrics.function_executor_controller import (
46
- METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_PENDING,
47
- METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_RUNNING,
48
- METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_TERMINATED,
49
- METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_UNKNOWN,
47
+ METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_NOT_STARTED,
48
+ METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_RUNNING,
49
+ METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_STARTING_UP,
50
+ METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_TERMINATED,
51
+ METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_TERMINATING,
52
+ METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_UNKNOWN,
50
53
  metric_control_loop_handle_event_latency,
51
- metric_function_executors_with_status,
54
+ metric_function_executors_with_state,
52
55
  metric_runnable_tasks,
53
56
  metric_runnable_tasks_per_function_name,
54
57
  metric_schedule_task_latency,
@@ -61,6 +64,16 @@ from .task_output import TaskOutput
61
64
  from .upload_task_output import upload_task_output
62
65
 
63
66
 
67
+ # Actual FE controller states, they are a bit different from statuses reported to the Server.
68
+ # All the valid state transitions are forward only (can skip multiple states in a row).
69
+ class _FE_CONTROLLER_STATE(Enum):
70
+ NOT_STARTED = 1
71
+ STARTING_UP = 2
72
+ RUNNING = 3
73
+ TERMINATING = 4
74
+ TERMINATED = 5
75
+
76
+
64
77
  class FunctionExecutorController:
65
78
  def __init__(
66
79
  self,
@@ -94,19 +107,18 @@ class FunctionExecutorController:
94
107
  self._logger: Any = function_executor_logger(
95
108
  function_executor_description, logger.bind(module=__name__)
96
109
  )
97
- # Mutable state. No lock needed as it's modified by async tasks running in
98
- # the same event loop.
110
+ self._destroy_lock: asyncio.Lock = asyncio.Lock()
111
+ # Mutable state. No lock needed as it's modified by async tasks running in the same event loop.
99
112
  self._fe: Optional[FunctionExecutor] = None
100
- self._fe_termination_reason: FunctionExecutorTerminationReason = (
101
- None # Optional
102
- )
103
- # FE Status reported to Server.
104
- self._status: FunctionExecutorStatus = (
105
- FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_UNKNOWN
106
- )
107
- metric_function_executors_with_status.labels(
108
- status=_to_fe_status_metric_label(self._status, self._logger)
113
+ self._fe_termination_reason: Optional[FunctionExecutorTerminationReason] = None
114
+ self._internal_state = _FE_CONTROLLER_STATE.NOT_STARTED
115
+ metric_function_executors_with_state.labels(
116
+ state=_to_fe_state_metric_label(self._internal_state, self._logger)
109
117
  ).inc()
118
+ self._reported_state: FunctionExecutorState = FunctionExecutorState(
119
+ description=function_executor_description,
120
+ status=FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_UNKNOWN,
121
+ )
110
122
  # Ordered list of events to be processed by the control loop.
111
123
  self._events: List[BaseEvent] = []
112
124
  # Asyncio event used to notify the control loop that there are new events to process.
@@ -124,13 +136,6 @@ class FunctionExecutorController:
124
136
  def function_executor_id(self) -> str:
125
137
  return self._fe_description.id
126
138
 
127
- def status(self) -> FunctionExecutorStatus:
128
- """Returns the current status of the Function Executor.
129
-
130
- Not blocking.
131
- """
132
- return self._status
133
-
134
139
  def add_task_allocation(self, task_allocation: TaskAllocation) -> None:
135
140
  """Adds a task to the Function Executor.
136
141
 
@@ -197,7 +202,6 @@ class FunctionExecutorController:
197
202
  task_info.is_cancelled = True
198
203
  logger.info(
199
204
  "cancelling task",
200
- allocation_id=task_info.allocation.allocation_id,
201
205
  )
202
206
  if task_info.aio_task is not None:
203
207
  task_info.aio_task.cancel()
@@ -206,9 +210,10 @@ class FunctionExecutorController:
206
210
  """Starts up the Function Executor and prepares it to run tasks.
207
211
 
208
212
  Not blocking. Never raises exceptions."""
209
- if self._control_loop_aio_task is not None:
213
+ if self._internal_state != _FE_CONTROLLER_STATE.NOT_STARTED:
210
214
  self._logger.warning(
211
- "ignoring startup call as the Function Executor is already started"
215
+ "function executor state is not NOT_STARTED, ignoring startup call",
216
+ internal_state=self._internal_state.name,
212
217
  )
213
218
  return
214
219
 
@@ -216,7 +221,13 @@ class FunctionExecutorController:
216
221
  self._control_loop(),
217
222
  name="function executor control loop",
218
223
  )
219
- self._set_status(FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_PENDING)
224
+ self._update_internal_state(_FE_CONTROLLER_STATE.STARTING_UP)
225
+ self._update_reported_state(
226
+ FunctionExecutorState(
227
+ description=self._fe_description,
228
+ status=FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_PENDING,
229
+ )
230
+ )
220
231
  next_aio = create_function_executor(
221
232
  function_executor_description=self._fe_description,
222
233
  function_executor_server_factory=self._fe_server_factory,
@@ -238,17 +249,13 @@ class FunctionExecutorController:
238
249
  ),
239
250
  )
240
251
 
241
- async def shutdown(
242
- self, termination_reason: FunctionExecutorTerminationReason
243
- ) -> None:
252
+ async def shutdown(self) -> None:
244
253
  """Shutsdown the Function Executor and frees all of its resources.
245
254
 
246
- All the tasks are reported as failed with FE Terminated failure code.
255
+ No task outcomes and outputs are getting reported to Server after this call.
247
256
  Doesn't raise any exceptions. Blocks until the shutdown is complete.
248
257
  """
249
- self._add_event(
250
- ShutdownInitiated(termination_reason=termination_reason), source="shutdown"
251
- )
258
+ self._add_event(ShutdownInitiated(), source="shutdown")
252
259
  try:
253
260
  await self._control_loop_aio_task
254
261
  except asyncio.CancelledError:
@@ -260,51 +267,49 @@ class FunctionExecutorController:
260
267
  )
261
268
  self._logger.info("function executor controller shutdown finished")
262
269
 
263
- def _set_status(
270
+ def _update_internal_state(self, new_state: _FE_CONTROLLER_STATE) -> None:
271
+ """Updates the internal state of the Function Executor Controller.
272
+
273
+ Not blocking. Never raises exceptions."""
274
+ old_state: _FE_CONTROLLER_STATE = self._internal_state
275
+ self._internal_state = new_state
276
+
277
+ self._logger.info(
278
+ "function executor internal state changed",
279
+ old_state=old_state.name,
280
+ new_state=new_state.name,
281
+ )
282
+
283
+ metric_function_executors_with_state.labels(
284
+ state=_to_fe_state_metric_label(old_state, self._logger)
285
+ ).dec()
286
+ metric_function_executors_with_state.labels(
287
+ state=_to_fe_state_metric_label(new_state, self._logger)
288
+ ).inc()
289
+
290
+ def _update_reported_state(
264
291
  self,
265
- status: FunctionExecutorStatus,
292
+ new_state: FunctionExecutorState,
266
293
  ) -> None:
267
- """Sets Function Executor status and reports it to the Server.
294
+ """Sets new Function Executor state and reports it to the Server.
268
295
 
269
296
  Not blocking. Never raises exceptions."""
270
- old_status: FunctionExecutorStatus = self._status
271
- new_status: FunctionExecutorStatus = status
272
- self._status: FunctionExecutorStatus = new_status
297
+ old_state: FunctionExecutorState = self._reported_state
298
+ self._reported_state = new_state
273
299
 
274
300
  self._logger.info(
275
- "function executor status changed",
276
- old_status=FunctionExecutorStatus.Name(old_status),
277
- new_status=FunctionExecutorStatus.Name(new_status),
301
+ "function executor grpc status changed",
302
+ old_status=FunctionExecutorStatus.Name(old_state.status),
303
+ new_status=FunctionExecutorStatus.Name(new_state.status),
278
304
  termination_reason=_termination_reason_to_short_name(
279
- self._fe_termination_reason
305
+ new_state.termination_reason
280
306
  ),
281
307
  )
282
- metric_function_executors_with_status.labels(
283
- status=_to_fe_status_metric_label(old_status, self._logger)
284
- ).dec()
285
- metric_function_executors_with_status.labels(
286
- status=_to_fe_status_metric_label(new_status, self._logger)
287
- ).inc()
288
308
 
289
- self._state_reporter.update_function_executor_state(self._current_state())
309
+ self._state_reporter.update_function_executor_state(new_state)
290
310
  # Report the status change to the Server asap to reduce latency in the system.
291
311
  self._state_reporter.schedule_state_report()
292
312
 
293
- def _current_state(self) -> FunctionExecutorState:
294
- """Returns the current state of the Function Executor.
295
-
296
- Not blocking. Never raises exceptions.
297
- """
298
- termination_reason: Optional[FunctionExecutorTerminationReason] = None
299
- if self._fe_termination_reason is not None:
300
- termination_reason = self._fe_termination_reason
301
-
302
- return FunctionExecutorState(
303
- description=self._fe_description,
304
- status=self._status,
305
- termination_reason=termination_reason,
306
- )
307
-
308
313
  async def _control_loop(self) -> None:
309
314
  """Runs control loop that coordinates all the work done by the Function Executor.
310
315
 
@@ -332,7 +337,7 @@ class FunctionExecutorController:
332
337
  self._logger.error(
333
338
  "unexpected exception in function executor controller control loop",
334
339
  exc_info=e,
335
- fe_event=str(event),
340
+ event_type=event.event_type.name,
336
341
  )
337
342
 
338
343
  def _handle_event(self, event: BaseEvent) -> None:
@@ -455,13 +460,17 @@ class FunctionExecutorController:
455
460
  self._state_reporter.schedule_state_report()
456
461
 
457
462
  if event.function_executor is None:
458
- self._destroy_function_executor_before_termination(
459
- event.output.termination_reason
460
- )
463
+ self._start_termination(termination_reason=event.output.termination_reason)
461
464
  return
462
465
 
463
466
  self._fe = event.function_executor
464
- self._set_status(FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_RUNNING)
467
+ self._update_internal_state(_FE_CONTROLLER_STATE.RUNNING)
468
+ self._update_reported_state(
469
+ FunctionExecutorState(
470
+ description=self._fe_description,
471
+ status=FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_RUNNING,
472
+ )
473
+ )
465
474
  # Health checker starts after FE creation and gets automatically stopped on FE destroy.
466
475
  self._fe.health_checker().start(self._health_check_failed_callback)
467
476
  self._add_event(
@@ -480,9 +489,18 @@ class FunctionExecutorController:
480
489
  self._logger.error(
481
490
  "Function Executor destroy failed unexpectedly, this should never happen",
482
491
  )
483
- # Set the status only after the FE got destroyed because Server assumes that all FE resources are freed when the status changes.
484
- self._fe_termination_reason = event.termination_reason
485
- self._set_status(FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_TERMINATED)
492
+
493
+ self._fe = None
494
+ # Set reported status only after the FE got destroyed because Server assumes that all FE resources are freed when the status changes.
495
+ self._update_reported_state(
496
+ FunctionExecutorState(
497
+ description=self._fe_description,
498
+ status=FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_TERMINATED,
499
+ termination_reason=self._fe_termination_reason,
500
+ )
501
+ )
502
+ self._update_internal_state(_FE_CONTROLLER_STATE.TERMINATED)
503
+
486
504
  # Invoke the scheduler so it can fail runnable tasks with FE Terminated error.
487
505
  self._add_event(
488
506
  ScheduleTaskExecution(),
@@ -494,7 +512,7 @@ class FunctionExecutorController:
494
512
  "Function Executor health check failed, terminating Function Executor",
495
513
  reason=result.reason,
496
514
  )
497
- self._destroy_function_executor_before_termination(
515
+ self._start_termination(
498
516
  termination_reason=FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_UNHEALTHY
499
517
  )
500
518
 
@@ -533,14 +551,15 @@ class FunctionExecutorController:
533
551
  if len(self._runnable_tasks) == 0:
534
552
  return
535
553
 
536
- if self._status not in [
537
- FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_RUNNING,
538
- FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_TERMINATED,
554
+ if self._internal_state not in [
555
+ _FE_CONTROLLER_STATE.RUNNING,
556
+ _FE_CONTROLLER_STATE.TERMINATING,
557
+ _FE_CONTROLLER_STATE.TERMINATED,
539
558
  ]:
540
- return # Can't progress pending task with the current status.
559
+ return # Can't progress runnable tasks in the current state.
541
560
 
542
561
  if (
543
- self._status == FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_RUNNING
562
+ self._internal_state == _FE_CONTROLLER_STATE.RUNNING
544
563
  and self._running_task is not None
545
564
  ):
546
565
  return
@@ -556,12 +575,15 @@ class FunctionExecutorController:
556
575
  if task_info.is_cancelled:
557
576
  task_info.output = TaskOutput.task_cancelled(task_info.allocation)
558
577
  self._start_task_output_upload(task_info)
559
- elif self._status == FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_TERMINATED:
578
+ elif self._internal_state in [
579
+ _FE_CONTROLLER_STATE.TERMINATING,
580
+ _FE_CONTROLLER_STATE.TERMINATED,
581
+ ]:
560
582
  task_info.output = TaskOutput.function_executor_terminated(
561
583
  task_info.allocation
562
584
  )
563
585
  self._start_task_output_upload(task_info)
564
- elif self._status == FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_RUNNING:
586
+ elif self._internal_state == _FE_CONTROLLER_STATE.RUNNING:
565
587
  self._running_task = task_info
566
588
  next_aio = run_task_on_function_executor(
567
589
  task_info=task_info,
@@ -604,7 +626,7 @@ class FunctionExecutorController:
604
626
  ScheduleTaskExecution(), source="_handle_event_task_execution_finished"
605
627
  )
606
628
  else:
607
- self._destroy_function_executor_before_termination(
629
+ self._start_termination(
608
630
  termination_reason=event.function_executor_termination_reason
609
631
  )
610
632
 
@@ -661,24 +683,31 @@ class FunctionExecutorController:
661
683
  )
662
684
  self._state_reporter.schedule_state_report()
663
685
 
664
- def _destroy_function_executor_before_termination(
686
+ def _start_termination(
665
687
  self, termination_reason: FunctionExecutorTerminationReason
666
688
  ) -> None:
667
- """Destroys the Function Executor and frees all its resources to prepare for transitioning to the TERMINATED state.
689
+ """Starts termination of the Function Executor if it's not started yet.
668
690
 
669
691
  Doesn't raise any exceptions. Doesn't block.
670
692
  """
693
+ if self._internal_state in [
694
+ _FE_CONTROLLER_STATE.TERMINATING,
695
+ _FE_CONTROLLER_STATE.TERMINATED,
696
+ ]:
697
+ # _start_termination() can be called multiple times, e.g. by each failed task alloc
698
+ # when the FE is unhealthy. Dedup the calls to keep state machine consistent.
699
+ return
700
+
701
+ self._fe_termination_reason = termination_reason
702
+ self._update_internal_state(_FE_CONTROLLER_STATE.TERMINATING)
671
703
  next_aio = destroy_function_executor(
672
704
  function_executor=self._fe,
673
- termination_reason=termination_reason,
705
+ lock=self._destroy_lock,
674
706
  logger=self._logger,
675
707
  )
676
- self._fe = None
677
708
  self._spawn_aio_for_fe(
678
709
  aio=next_aio,
679
- on_exception=FunctionExecutorDestroyed(
680
- is_success=False, termination_reason=termination_reason
681
- ),
710
+ on_exception=FunctionExecutorDestroyed(is_success=False),
682
711
  )
683
712
 
684
713
  async def _shutdown_no_exceptions(self, event: ShutdownInitiated) -> None:
@@ -717,16 +746,15 @@ class FunctionExecutorController:
717
746
  # BaseException includes asyncio.CancelledError which is always raised here.
718
747
  pass
719
748
 
720
- if self._status != FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_TERMINATED:
721
- self._handle_event_function_executor_destroyed(
722
- await destroy_function_executor(
723
- function_executor=self._fe,
724
- termination_reason=event.termination_reason,
725
- logger=self._logger,
726
- )
727
- )
728
- metric_function_executors_with_status.labels(
729
- status=_to_fe_status_metric_label(self._status, self._logger)
749
+ await destroy_function_executor(
750
+ function_executor=self._fe,
751
+ lock=self._destroy_lock,
752
+ logger=self._logger,
753
+ )
754
+
755
+ # Cleanup the metric from this FE.
756
+ metric_function_executors_with_state.labels(
757
+ state=_to_fe_state_metric_label(self._internal_state, self._logger)
730
758
  ).dec()
731
759
 
732
760
  self._state_reporter.remove_function_executor_state(self.function_executor_id())
@@ -736,21 +764,23 @@ class FunctionExecutorController:
736
764
  debug_print_events(events=self._events, logger=self._logger)
737
765
 
738
766
 
739
- def _to_fe_status_metric_label(status: FunctionExecutorStatus, logger: Any) -> str:
740
- if status == FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_UNKNOWN:
741
- return METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_UNKNOWN
742
- elif status == FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_PENDING:
743
- return METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_PENDING
744
- elif status == FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_RUNNING:
745
- return METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_RUNNING
746
- elif status == FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_TERMINATED:
747
- return METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_TERMINATED
767
+ def _to_fe_state_metric_label(state: _FE_CONTROLLER_STATE, logger: Any) -> str:
768
+ if state == _FE_CONTROLLER_STATE.NOT_STARTED:
769
+ return METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_NOT_STARTED
770
+ elif state == _FE_CONTROLLER_STATE.STARTING_UP:
771
+ return METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_STARTING_UP
772
+ elif state == _FE_CONTROLLER_STATE.RUNNING:
773
+ return METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_RUNNING
774
+ elif state == _FE_CONTROLLER_STATE.TERMINATING:
775
+ return METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_TERMINATING
776
+ elif state == _FE_CONTROLLER_STATE.TERMINATED:
777
+ return METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_TERMINATED
748
778
  else:
749
779
  logger.error(
750
- "unexpected Function Executor status",
751
- status=FunctionExecutorStatus.Name(status),
780
+ "unexpected Function Executor internal state",
781
+ state=state.name,
752
782
  )
753
- return METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_UNKNOWN
783
+ return METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_UNKNOWN
754
784
 
755
785
 
756
786
  _termination_reason_to_short_name_map = {
@@ -758,8 +788,6 @@ _termination_reason_to_short_name_map = {
758
788
  FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_STARTUP_FAILED_INTERNAL_ERROR: "STARTUP_FAILED_INTERNAL_ERROR",
759
789
  FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_STARTUP_FAILED_FUNCTION_ERROR: "STARTUP_FAILED_FUNCTION_ERROR",
760
790
  FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_STARTUP_FAILED_FUNCTION_TIMEOUT: "STARTUP_FAILED_FUNCTION_TIMEOUT",
761
- FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_EXECUTOR_SHUTDOWN: "EXECUTOR_SHUTDOWN",
762
- FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_REMOVED_FROM_DESIRED_STATE: "REMOVED_FROM_DESIRED_STATE",
763
791
  FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_UNHEALTHY: "UNHEALTHY",
764
792
  FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_INTERNAL_ERROR: "INTERNAL_ERROR",
765
793
  FunctionExecutorTerminationReason.FUNCTION_EXECUTOR_TERMINATION_REASON_FUNCTION_TIMEOUT: "FUNCTION_TIMEOUT",
@@ -786,16 +814,14 @@ def _to_task_result_proto(output: TaskOutput) -> TaskResult:
786
814
  graph_invocation_id=output.allocation.task.graph_invocation_id,
787
815
  reducer=output.reducer,
788
816
  outcome_code=output.outcome_code,
789
- next_functions=(output.router_output.edges if output.router_output else []),
817
+ failure_reason=output.failure_reason,
818
+ next_functions=output.next_functions,
790
819
  function_outputs=output.uploaded_data_payloads,
820
+ invocation_error_output=output.uploaded_invocation_error_output,
791
821
  )
792
- if output.failure_reason is not None:
793
- task_result.failure_reason = output.failure_reason
794
822
  if output.uploaded_stdout is not None:
795
823
  task_result.stdout.CopyFrom(output.uploaded_stdout)
796
824
  if output.uploaded_stderr is not None:
797
825
  task_result.stderr.CopyFrom(output.uploaded_stderr)
798
- if output.router_output is not None:
799
- task_result.routing.next_functions[:] = output.router_output.edges
800
826
 
801
827
  return task_result
@@ -34,27 +34,34 @@ metric_runnable_tasks_per_function_name: prometheus_client.Gauge = (
34
34
  )
35
35
  )
36
36
 
37
- metric_function_executors_with_status: prometheus_client.Gauge = (
38
- prometheus_client.Gauge(
39
- "function_executors_with_status",
40
- "Number of Function Executors with a particular status",
41
- ["status"],
42
- )
37
+ metric_function_executors_with_state: prometheus_client.Gauge = prometheus_client.Gauge(
38
+ "function_executors_with_state",
39
+ "Number of Function Executors with a particular internal state",
40
+ ["state"],
43
41
  )
44
- METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_UNKNOWN = "unknown"
45
- METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_PENDING = "pending"
46
- METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_RUNNING = "running"
47
- METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_TERMINATED = "terminated"
42
+ METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_UNKNOWN = "unknown"
43
+ METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_NOT_STARTED = "not_started"
44
+ METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_STARTING_UP = "starting_up"
45
+ METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_RUNNING = "running"
46
+ METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_TERMINATING = "terminating"
47
+ METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_TERMINATED = "terminated"
48
+
48
49
 
49
- metric_function_executors_with_status.labels(
50
- status=METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_UNKNOWN
50
+ metric_function_executors_with_state.labels(
51
+ state=METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_UNKNOWN
52
+ )
53
+ metric_function_executors_with_state.labels(
54
+ state=METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_NOT_STARTED
55
+ )
56
+ metric_function_executors_with_state.labels(
57
+ state=METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_STARTING_UP
51
58
  )
52
- metric_function_executors_with_status.labels(
53
- status=METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_PENDING
59
+ metric_function_executors_with_state.labels(
60
+ state=METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_RUNNING
54
61
  )
55
- metric_function_executors_with_status.labels(
56
- status=METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_RUNNING
62
+ metric_function_executors_with_state.labels(
63
+ state=METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_TERMINATING
57
64
  )
58
- metric_function_executors_with_status.labels(
59
- status=METRIC_FUNCTION_EXECUTORS_WITH_STATUS_LABEL_TERMINATED
65
+ metric_function_executors_with_state.labels(
66
+ state=METRIC_FUNCTION_EXECUTORS_WITH_STATE_LABEL_TERMINATED
60
67
  )
@@ -1,4 +1,6 @@
1
1
  import asyncio
2
+ import os
3
+ import random
2
4
  import time
3
5
  from typing import Any, Optional
4
6
 
@@ -6,6 +8,13 @@ import grpc
6
8
  from tensorlake.function_executor.proto.function_executor_pb2 import (
7
9
  RunTaskRequest,
8
10
  RunTaskResponse,
11
+ SerializedObject,
12
+ )
13
+ from tensorlake.function_executor.proto.function_executor_pb2 import (
14
+ TaskFailureReason as FETaskFailureReason,
15
+ )
16
+ from tensorlake.function_executor.proto.function_executor_pb2 import (
17
+ TaskOutcomeCode as FETaskOutcomeCode,
9
18
  )
10
19
  from tensorlake.function_executor.proto.function_executor_pb2_grpc import (
11
20
  FunctionExecutorStub,
@@ -31,6 +40,10 @@ from .metrics.run_task import (
31
40
  from .task_info import TaskInfo
32
41
  from .task_output import TaskMetrics, TaskOutput
33
42
 
43
+ _ENABLE_INJECT_TASK_CANCELLATIONS = (
44
+ os.getenv("INDEXIFY_INJECT_TASK_CANCELLATIONS", "0") == "1"
45
+ )
46
+
34
47
 
35
48
  async def run_task_on_function_executor(
36
49
  task_info: TaskInfo, function_executor: FunctionExecutor, logger: Any
@@ -83,6 +96,7 @@ async def run_task_on_function_executor(
83
96
  task_info.output = _task_output_from_function_executor_response(
84
97
  allocation=task_info.allocation,
85
98
  response=response,
99
+ logger=logger,
86
100
  )
87
101
  except grpc.aio.AioRpcError as e:
88
102
  if e.code() == grpc.StatusCode.DEADLINE_EXCEEDED:
@@ -140,13 +154,13 @@ async def run_task_on_function_executor(
140
154
 
141
155
 
142
156
  def _task_output_from_function_executor_response(
143
- allocation: TaskAllocation, response: RunTaskResponse
157
+ allocation: TaskAllocation, response: RunTaskResponse, logger: Any
144
158
  ) -> TaskOutput:
145
159
  response_validator = MessageValidator(response)
146
160
  response_validator.required_field("stdout")
147
161
  response_validator.required_field("stderr")
148
162
  response_validator.required_field("is_reducer")
149
- response_validator.required_field("success")
163
+ response_validator.required_field("outcome_code")
150
164
 
151
165
  metrics = TaskMetrics(counters={}, timers={})
152
166
  if response.HasField("metrics"):
@@ -154,31 +168,42 @@ def _task_output_from_function_executor_response(
154
168
  metrics.counters = dict(response.metrics.counters)
155
169
  metrics.timers = dict(response.metrics.timers)
156
170
 
157
- output = TaskOutput(
171
+ outcome_code: TaskOutcomeCode = _to_task_outcome_code(
172
+ response.outcome_code, logger=logger
173
+ )
174
+ failure_reason: Optional[TaskFailureReason] = None
175
+ invocation_error_output: Optional[SerializedObject] = None
176
+
177
+ if outcome_code == TaskOutcomeCode.TASK_OUTCOME_CODE_FAILURE:
178
+ response_validator.required_field("failure_reason")
179
+ failure_reason: Optional[TaskFailureReason] = _to_task_failure_reason(
180
+ response.failure_reason, logger
181
+ )
182
+ if failure_reason == TaskFailureReason.TASK_FAILURE_REASON_INVOCATION_ERROR:
183
+ response_validator.required_field("invocation_error_output")
184
+ invocation_error_output = response.invocation_error_output
185
+
186
+ if _ENABLE_INJECT_TASK_CANCELLATIONS:
187
+ logger.warning("injecting cancellation failure for the task allocation")
188
+ if (
189
+ random.random() < 0.5
190
+ ): # 50% chance to get stable reproduction in manual testing
191
+ outcome_code = TaskOutcomeCode.TASK_OUTCOME_CODE_FAILURE
192
+ failure_reason = TaskFailureReason.TASK_FAILURE_REASON_TASK_CANCELLED
193
+
194
+ return TaskOutput(
158
195
  allocation=allocation,
159
- outcome_code=(
160
- TaskOutcomeCode.TASK_OUTCOME_CODE_SUCCESS
161
- if response.success
162
- else TaskOutcomeCode.TASK_OUTCOME_CODE_FAILURE
163
- ),
164
- failure_reason=(
165
- None
166
- if response.success
167
- else TaskFailureReason.TASK_FAILURE_REASON_FUNCTION_ERROR
168
- ),
196
+ outcome_code=outcome_code,
197
+ failure_reason=failure_reason,
198
+ invocation_error_output=invocation_error_output,
199
+ function_outputs=response.function_outputs,
200
+ next_functions=response.next_functions,
169
201
  stdout=response.stdout,
170
202
  stderr=response.stderr,
171
203
  reducer=response.is_reducer,
172
204
  metrics=metrics,
173
205
  )
174
206
 
175
- if response.HasField("function_output"):
176
- output.function_output = response.function_output
177
- if response.HasField("router_output"):
178
- output.router_output = response.router_output
179
-
180
- return output
181
-
182
207
 
183
208
  def _log_task_execution_finished(output: TaskOutput, logger: Any) -> None:
184
209
  logger.info(
@@ -191,3 +216,40 @@ def _log_task_execution_finished(output: TaskOutput, logger: Any) -> None:
191
216
  else None
192
217
  ),
193
218
  )
219
+
220
+
221
+ def _to_task_outcome_code(
222
+ fe_task_outcome_code: FETaskOutcomeCode, logger
223
+ ) -> TaskOutcomeCode:
224
+ if fe_task_outcome_code == FETaskOutcomeCode.TASK_OUTCOME_CODE_SUCCESS:
225
+ return TaskOutcomeCode.TASK_OUTCOME_CODE_SUCCESS
226
+ elif fe_task_outcome_code == FETaskOutcomeCode.TASK_OUTCOME_CODE_FAILURE:
227
+ return TaskOutcomeCode.TASK_OUTCOME_CODE_FAILURE
228
+ else:
229
+ logger.warning(
230
+ "Unknown TaskOutcomeCode received from Function Executor",
231
+ value=FETaskOutcomeCode.Name(fe_task_outcome_code),
232
+ )
233
+ return TaskOutcomeCode.TASK_OUTCOME_CODE_UNKNOWN
234
+
235
+
236
+ def _to_task_failure_reason(
237
+ fe_task_failure_reason: FETaskFailureReason, logger: Any
238
+ ) -> TaskFailureReason:
239
+ if fe_task_failure_reason == FETaskFailureReason.TASK_FAILURE_REASON_FUNCTION_ERROR:
240
+ return TaskFailureReason.TASK_FAILURE_REASON_FUNCTION_ERROR
241
+ elif (
242
+ fe_task_failure_reason
243
+ == FETaskFailureReason.TASK_FAILURE_REASON_INVOCATION_ERROR
244
+ ):
245
+ return TaskFailureReason.TASK_FAILURE_REASON_INVOCATION_ERROR
246
+ elif (
247
+ fe_task_failure_reason == FETaskFailureReason.TASK_FAILURE_REASON_INTERNAL_ERROR
248
+ ):
249
+ return TaskFailureReason.TASK_FAILURE_REASON_INTERNAL_ERROR
250
+ else:
251
+ logger.warning(
252
+ "Unknown TaskFailureReason received from Function Executor",
253
+ value=FETaskFailureReason.Name(fe_task_failure_reason),
254
+ )
255
+ return TaskFailureReason.TASK_FAILURE_REASON_UNKNOWN