indexify 0.3.12__py3-none-any.whl → 0.3.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,6 @@
1
1
  import asyncio
2
- from typing import AsyncGenerator, Dict
2
+ from typing import AsyncGenerator, Dict, Optional
3
3
 
4
- from ..api_objects import Task
5
4
  from .function_executor_state import FunctionExecutorState
6
5
  from .metrics.function_executor_state_container import (
7
6
  metric_function_executor_states_count,
@@ -17,19 +16,33 @@ class FunctionExecutorStatesContainer:
17
16
  self._states: Dict[str, FunctionExecutorState] = {}
18
17
  self._is_shutdown: bool = False
19
18
 
20
- async def get_or_create_state(self, task: Task) -> FunctionExecutorState:
21
- """Get or create a function executor state for the given task.
19
+ async def get_or_create_state(
20
+ self,
21
+ id: str,
22
+ namespace: str,
23
+ graph_name: str,
24
+ graph_version: str,
25
+ function_name: str,
26
+ image_uri: Optional[str],
27
+ ) -> FunctionExecutorState:
28
+ """Get or create a function executor state with the given ID.
22
29
 
30
+ If the state already exists, it is returned. Otherwise, a new state is created from the supplied arguments.
23
31
  Raises Exception if it's not possible to create a new state at this time."""
24
32
  async with self._lock:
25
33
  if self._is_shutdown:
26
- raise RuntimeError("Task runner is shutting down.")
34
+ raise RuntimeError(
35
+ "Function Executor states container is shutting down."
36
+ )
27
37
 
28
- id = function_id_without_version(task)
29
38
  if id not in self._states:
30
39
  state = FunctionExecutorState(
31
- function_id_with_version=function_id_with_version(task),
32
- function_id_without_version=id,
40
+ id=id,
41
+ namespace=namespace,
42
+ graph_name=graph_name,
43
+ graph_version=graph_version,
44
+ function_name=function_name,
45
+ image_uri=image_uri,
33
46
  )
34
47
  self._states[id] = state
35
48
  metric_function_executor_states_count.set(len(self._states))
@@ -41,6 +54,13 @@ class FunctionExecutorStatesContainer:
41
54
  for state in self._states.values():
42
55
  yield state
43
56
 
57
+ async def pop(self, id: str) -> FunctionExecutorState:
58
+ """Removes the state with the given ID and returns it."""
59
+ async with self._lock:
60
+ state = self._states.pop(id)
61
+ metric_function_executor_states_count.set(len(self._states))
62
+ return state
63
+
44
64
  async def shutdown(self):
45
65
  # Function Executors are outside the Executor process
46
66
  # so they need to get cleaned up explicitly and reliably.
@@ -54,11 +74,3 @@ class FunctionExecutorStatesContainer:
54
74
  async with state.lock:
55
75
  await state.shutdown()
56
76
  # The task running inside the Function Executor will fail because it's destroyed.
57
-
58
-
59
- def function_id_with_version(task: Task) -> str:
60
- return f"versioned/{task.namespace}/{task.compute_graph}/{task.graph_version}/{task.compute_fn}"
61
-
62
-
63
- def function_id_without_version(task: Task) -> str:
64
- return f"not_versioned/{task.namespace}/{task.compute_graph}/{task.compute_fn}"
@@ -20,16 +20,22 @@ from .server.client_configuration import HEALTH_CHECK_TIMEOUT_SEC
20
20
  HEALTH_CHECK_POLL_PERIOD_SEC = 10
21
21
 
22
22
 
23
+ class HealthCheckResult:
24
+ def __init__(self, is_healthy: bool, reason: str):
25
+ self.is_healthy: bool = is_healthy
26
+ self.reason: str = reason
27
+
28
+
23
29
  class HealthChecker:
24
30
  def __init__(self, stub: FunctionExecutorStub, logger: Any):
25
31
  self._stub: FunctionExecutorStub = stub
26
32
  self._logger: Any = logger.bind(module=__name__)
27
33
  self._health_check_loop_task: Optional[asyncio.Task] = None
28
- self._health_check_failed_callback: Optional[Callable[[], Awaitable[None]]] = (
29
- None
30
- )
34
+ self._health_check_failed_callback: Optional[
35
+ Callable[[HealthCheckResult], Awaitable[None]]
36
+ ] = None
31
37
 
32
- async def check(self) -> bool:
38
+ async def check(self) -> HealthCheckResult:
33
39
  """Runs the health check once and returns the result.
34
40
 
35
41
  Does not raise any exceptions."""
@@ -40,17 +46,25 @@ class HealthChecker:
40
46
  )
41
47
  if not response.healthy:
42
48
  metric_failed_health_checks.inc()
43
- return response.healthy
44
- except AioRpcError:
49
+ return HealthCheckResult(
50
+ is_healthy=response.healthy, reason=response.status_message
51
+ )
52
+ except AioRpcError as e:
45
53
  metric_failed_health_checks.inc()
46
54
  # Expected exception when there are problems with communication because e.g. the server is unhealthy.
47
- return False
55
+ return HealthCheckResult(
56
+ is_healthy=False,
57
+ reason=f"Executor side RPC channel error: {str(e)}",
58
+ )
48
59
  except Exception as e:
49
60
  metric_failed_health_checks.inc()
50
61
  self._logger.warning("Got unexpected exception, ignoring", exc_info=e)
51
- return False
62
+ return HealthCheckResult(
63
+ is_healthy=False,
64
+ reason=f"Unexpected exception in Executor: {str(e)}",
65
+ )
52
66
 
53
- def start(self, callback: Callable[[], Awaitable[None]]) -> None:
67
+ def start(self, callback: Callable[[HealthCheckResult], Awaitable[None]]) -> None:
54
68
  """Starts periodic health checks.
55
69
 
56
70
  The supplied callback is an async function called in the calling thread's
@@ -81,9 +95,10 @@ class HealthChecker:
81
95
 
82
96
  async def _health_check_loop(self) -> None:
83
97
  while True:
84
- if not await self.check():
98
+ result: HealthCheckResult = await self.check()
99
+ if not result.is_healthy:
85
100
  break
86
101
  await asyncio.sleep(HEALTH_CHECK_POLL_PERIOD_SEC)
87
102
 
88
- asyncio.create_task(self._health_check_failed_callback())
103
+ asyncio.create_task(self._health_check_failed_callback(result))
89
104
  self._health_check_loop_task = None
@@ -78,6 +78,22 @@ metric_destroy_channel_errors: prometheus_client.Counter = prometheus_client.Cou
78
78
  "Number of Function Executor channel destruction errors",
79
79
  )
80
80
 
81
+ # FE get_info RPC metrics.
82
+ metric_get_info_rpc_latency: prometheus_client.Histogram = (
83
+ latency_metric_for_fast_operation(
84
+ "function_executor_get_info_rpc", "Function Executor get_info RPC"
85
+ )
86
+ )
87
+ metric_get_info_rpc_errors: prometheus_client.Counter = prometheus_client.Counter(
88
+ "function_executor_get_info_rpc_errors",
89
+ "Number of Function Executor get_info RPC errors",
90
+ )
91
+ metric_function_executor_infos: prometheus_client.Counter = prometheus_client.Counter(
92
+ "function_executor_infos",
93
+ "Number of Function Executors with particular info",
94
+ ["version", "sdk_version", "sdk_language", "sdk_language_version"],
95
+ )
96
+
81
97
  # FE initialization RPC metrics.
82
98
  metric_initialize_rpc_latency: prometheus_client.Histogram = (
83
99
  latency_metric_for_customer_controlled_operation(
@@ -14,8 +14,11 @@ class FunctionExecutorServerConfiguration:
14
14
  configuration parameters or raise an exception if it can't implement
15
15
  them."""
16
16
 
17
- def __init__(self, executor_id: str, image_uri: Optional[str]):
17
+ def __init__(
18
+ self, executor_id: str, function_executor_id: str, image_uri: Optional[str]
19
+ ):
18
20
  self.executor_id: str = executor_id
21
+ self.function_executor_id: str = function_executor_id
19
22
  # Container image URI of the Function Executor Server.
20
23
  self.image_uri: Optional[str] = image_uri
21
24
 
@@ -14,6 +14,7 @@ from tensorlake.function_executor.proto.function_executor_pb2_grpc import (
14
14
  from ..api_objects import Task
15
15
  from .function_executor import CustomerError, FunctionExecutor
16
16
  from .function_executor_state import FunctionExecutorState
17
+ from .health_checker import HealthChecker, HealthCheckResult
17
18
  from .metrics.single_task_runner import (
18
19
  metric_function_executor_run_task_rpc_errors,
19
20
  metric_function_executor_run_task_rpc_latency,
@@ -69,7 +70,12 @@ class SingleTaskRunner:
69
70
  await self._create_function_executor()
70
71
  except CustomerError as e:
71
72
  return TaskOutput(
72
- task=self._task_input.task,
73
+ task_id=self._task_input.task.id,
74
+ namespace=self._task_input.task.namespace,
75
+ graph_name=self._task_input.task.compute_graph,
76
+ function_name=self._task_input.task.compute_fn,
77
+ graph_version=self._task_input.task.graph_version,
78
+ graph_invocation_id=self._task_input.task.invocation_id,
73
79
  stderr=str(e),
74
80
  success=False,
75
81
  )
@@ -88,6 +94,7 @@ class SingleTaskRunner:
88
94
  config: FunctionExecutorServerConfiguration = (
89
95
  FunctionExecutorServerConfiguration(
90
96
  executor_id=self._executor_id,
97
+ function_executor_id=self._state.id,
91
98
  image_uri=self._task_input.task.image_uri,
92
99
  )
93
100
  )
@@ -144,24 +151,32 @@ class SingleTaskRunner:
144
151
  ).run_task(request)
145
152
  return _task_output(task=self._task_input.task, response=response)
146
153
 
147
- async def _health_check_failed_callback(self):
154
+ async def _health_check_failed_callback(self, result: HealthCheckResult):
148
155
  # Function Executor destroy due to the periodic health check failure ensures that
149
156
  # a running task RPC stuck in unhealthy Function Executor fails immediately.
150
157
  async with self._state.lock:
151
158
  if self._state.function_executor is not None:
152
- await self._destroy_function_executor_on_failed_health_check()
159
+ await self._destroy_function_executor_on_failed_health_check(
160
+ result.reason
161
+ )
153
162
 
154
163
  async def _destroy_existing_function_executor_if_unhealthy(self):
155
164
  self._state.check_locked()
156
165
  if self._state.function_executor is None:
157
166
  return
158
- if await self._state.function_executor.health_checker().check():
167
+ result: HealthCheckResult = (
168
+ await self._state.function_executor.health_checker().check()
169
+ )
170
+ if result.is_healthy:
159
171
  return
160
- await self._destroy_function_executor_on_failed_health_check()
172
+ await self._destroy_function_executor_on_failed_health_check(result.reason)
161
173
 
162
- async def _destroy_function_executor_on_failed_health_check(self):
174
+ async def _destroy_function_executor_on_failed_health_check(self, reason: str):
163
175
  self._state.check_locked()
164
- self._logger.error("Health check failed, destroying FunctionExecutor.")
176
+ self._logger.error(
177
+ "Function Executor health check failed, destroying Function Executor",
178
+ health_check_fail_reason=reason,
179
+ )
165
180
  self._state.health_check_failed = True
166
181
  await self._state.destroy_function_executor()
167
182
 
@@ -220,7 +235,12 @@ def _task_output(task: Task, response: RunTaskResponse) -> TaskOutput:
220
235
  raise ValueError(f"Response is missing required field: {field}")
221
236
 
222
237
  output = TaskOutput(
223
- task=task,
238
+ task_id=task.id,
239
+ namespace=task.namespace,
240
+ graph_name=task.compute_graph,
241
+ function_name=task.compute_fn,
242
+ graph_version=task.graph_version,
243
+ graph_invocation_id=task.invocation_id,
224
244
  stdout=response.stdout,
225
245
  stderr=response.stderr,
226
246
  reducer=response.is_reducer,
@@ -13,7 +13,12 @@ class TaskOutput:
13
13
 
14
14
  def __init__(
15
15
  self,
16
- task: Task,
16
+ task_id: str,
17
+ namespace: str,
18
+ graph_name: str,
19
+ function_name: str,
20
+ graph_version: str,
21
+ graph_invocation_id: str,
17
22
  function_output: Optional[FunctionOutput] = None,
18
23
  router_output: Optional[RouterOutput] = None,
19
24
  stdout: Optional[str] = None,
@@ -22,7 +27,12 @@ class TaskOutput:
22
27
  success: bool = False,
23
28
  is_internal_error: bool = False,
24
29
  ):
25
- self.task = task
30
+ self.task_id = task_id
31
+ self.namespace = namespace
32
+ self.graph_name = graph_name
33
+ self.function_name = function_name
34
+ self.graph_version = graph_version
35
+ self.graph_invocation_id = graph_invocation_id
26
36
  self.function_output = function_output
27
37
  self.router_output = router_output
28
38
  self.stdout = stdout
@@ -32,11 +42,24 @@ class TaskOutput:
32
42
  self.is_internal_error = is_internal_error
33
43
 
34
44
  @classmethod
35
- def internal_error(cls, task: Task) -> "TaskOutput":
45
+ def internal_error(
46
+ cls,
47
+ task_id: str,
48
+ namespace: str,
49
+ graph_name: str,
50
+ function_name: str,
51
+ graph_version: str,
52
+ graph_invocation_id: str,
53
+ ) -> "TaskOutput":
36
54
  """Creates a TaskOutput for an internal error."""
37
55
  # We are not sharing internal error messages with the customer.
38
56
  return TaskOutput(
39
- task=task,
57
+ task_id=task_id,
58
+ namespace=namespace,
59
+ graph_name=graph_name,
60
+ function_name=function_name,
61
+ graph_version=graph_version,
62
+ graph_invocation_id=graph_invocation_id,
40
63
  stderr="Platform failed to execute the function.",
41
64
  is_internal_error=True,
42
65
  )
@@ -0,0 +1,288 @@
1
+ import asyncio
2
+ from typing import Any, AsyncGenerator, List, Optional, Set
3
+
4
+ import grpc
5
+ from tensorlake.function_executor.proto.function_executor_pb2 import (
6
+ InitializeRequest,
7
+ SerializedObject,
8
+ )
9
+
10
+ from indexify.task_scheduler.proto.task_scheduler_pb2 import (
11
+ DesiredExecutorState,
12
+ FunctionExecutorDescription,
13
+ FunctionExecutorStatus,
14
+ GetDesiredExecutorStatesRequest,
15
+ )
16
+ from indexify.task_scheduler.proto.task_scheduler_pb2_grpc import (
17
+ TaskSchedulerServiceStub,
18
+ )
19
+
20
+ from .downloader import Downloader
21
+ from .function_executor.function_executor import CustomerError, FunctionExecutor
22
+ from .function_executor.function_executor_state import FunctionExecutorState
23
+ from .function_executor.function_executor_states_container import (
24
+ FunctionExecutorStatesContainer,
25
+ )
26
+ from .function_executor.server.function_executor_server_factory import (
27
+ FunctionExecutorServerConfiguration,
28
+ FunctionExecutorServerFactory,
29
+ )
30
+ from .function_executor.task_input import TaskInput
31
+ from .function_executor.task_output import TaskOutput
32
+ from .metrics.executor import (
33
+ METRIC_TASKS_COMPLETED_OUTCOME_ALL,
34
+ METRIC_TASKS_COMPLETED_OUTCOME_ERROR_CUSTOMER_CODE,
35
+ METRIC_TASKS_COMPLETED_OUTCOME_ERROR_PLATFORM,
36
+ METRIC_TASKS_COMPLETED_OUTCOME_SUCCESS,
37
+ metric_task_completion_latency,
38
+ metric_task_outcome_report_latency,
39
+ metric_task_outcome_report_retries,
40
+ metric_task_outcome_reports,
41
+ metric_tasks_completed,
42
+ metric_tasks_fetched,
43
+ metric_tasks_reporting_outcome,
44
+ )
45
+ from .task_reporter import TaskReporter
46
+
47
+
48
+ class ExecutorStateReconciler:
49
+ def __init__(
50
+ self,
51
+ executor_id: str,
52
+ function_executor_server_factory: FunctionExecutorServerFactory,
53
+ base_url: str,
54
+ function_executor_states: FunctionExecutorStatesContainer,
55
+ config_path: Optional[str],
56
+ downloader: Downloader,
57
+ task_reporter: TaskReporter,
58
+ server_channel: grpc.aio.Channel,
59
+ logger: Any,
60
+ ):
61
+ self._executor_id: str = executor_id
62
+ self._factory: FunctionExecutorServerFactory = function_executor_server_factory
63
+ self._base_url: str = base_url
64
+ self._config_path: Optional[str] = config_path
65
+ self._downloader: Downloader = downloader
66
+ self._task_reporter: TaskReporter = task_reporter
67
+ self._function_executor_states: FunctionExecutorStatesContainer = (
68
+ function_executor_states
69
+ )
70
+ self._stub: TaskSchedulerServiceStub = TaskSchedulerServiceStub(server_channel)
71
+ self._logger: Any = logger.bind(module=__name__)
72
+ self._is_shutdown: bool = False
73
+ self._reconciliation_lock: asyncio.Lock = asyncio.Lock()
74
+ self._server_last_clock: Optional[int] = None
75
+
76
+ async def run(self):
77
+ desired_states: AsyncGenerator[DesiredExecutorState, None] = (
78
+ self._stub.get_desired_executor_states(
79
+ GetDesiredExecutorStatesRequest(executor_id=self._executor_id)
80
+ )
81
+ )
82
+ async for new_state in desired_states:
83
+ if self._is_shutdown:
84
+ return
85
+ new_state: DesiredExecutorState
86
+ if self._server_last_clock is not None:
87
+ if self._server_last_clock >= new_state.clock:
88
+ continue  # Duplicate or outdated state message sent by the Server.
89
+
90
+ self._server_last_clock = new_state.clock
91
+ asyncio.create_task(self._reconcile_state(new_state))
92
+
93
+ async def _reconcile_state(self, new_state: DesiredExecutorState):
94
+ if self._is_shutdown:
95
+ return
96
+
97
+ # Simple non concurrent implementation for now for the PoC.
98
+ # Obtain this lock to force only a single coroutine doing the reconciliation.
99
+ async with self._reconciliation_lock:
100
+ await self._reconcile_function_executors(new_state)
101
+ # TODO
102
+ # await self._reconcile_task_allocations(new_state)
103
+
104
+ async def shutdown(self):
105
+ """Shuts down the state reconciler.
106
+
107
+ Never raises any exceptions.
108
+ """
109
+ self._is_shutdown = True
110
+
111
+ async def _reconcile_function_executors(self, desired_state: DesiredExecutorState):
112
+ desired_function_executor_ids: Set[str] = set()
113
+ for desired_function_executor in desired_state.function_executors:
114
+ desired_function_executor: FunctionExecutorDescription
115
+ desired_function_executor_ids.add(desired_function_executor.id)
116
+
117
+ function_executor_state: FunctionExecutorState = (
118
+ self._function_executor_states.get_or_create_state(
119
+ id=desired_function_executor.id,
120
+ namespace=desired_function_executor.namespace,
121
+ graph_name=desired_function_executor.graph_name,
122
+ graph_version=desired_function_executor.graph_version,
123
+ function_name=desired_function_executor.function_name,
124
+ )
125
+ )
126
+
127
+ async with function_executor_state.lock:
128
+ if (
129
+ function_executor_state.status
130
+ == FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_STOPPED
131
+ ):
132
+ function_executor_state.status = (
133
+ FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_STARTING_UP
134
+ )
135
+ try:
136
+ function_executor_state.function_executor = (
137
+ await self._create_function_executor()
138
+ )
139
+ function_executor_state.status = (
140
+ FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_IDLE
141
+ )
142
+ except CustomerError as e:
143
+ function_executor_state.status = (
144
+ FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_STARTUP_FAILED_CUSTOMER_ERROR
145
+ )
146
+ except Exception as e:
147
+ function_executor_state.status = (
148
+ FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_STARTUP_FAILED_PLATFORM_ERROR
149
+ )
150
+ self._logger.error(
151
+ f"Failed to create Function Executor", exc_info=e
152
+ )
153
+
154
+ function_executor_state_ids_to_destroy: List[str] = []
155
+ async for function_executor_state in self._function_executor_states:
156
+ function_executor_state: FunctionExecutorState
157
+ if function_executor_state.id not in desired_function_executor_ids:
158
+ function_executor_state_ids_to_destroy.append(
159
+ function_executor_state.id
160
+ )
161
+
162
+ for function_executor_state_id in function_executor_state_ids_to_destroy:
163
+ function_executor_state: FunctionExecutorState = (
164
+ self._function_executor_states.pop_state(function_executor_state_id)
165
+ )
166
+ async with function_executor_state.lock:
167
+ logger = self._function_executor_logger(
168
+ id=function_executor_state.id,
169
+ namespace=function_executor_state.namespace,
170
+ graph_name=function_executor_state.graph_name,
171
+ graph_version=function_executor_state.graph_version,
172
+ function_name=function_executor_state.function_name,
173
+ )
174
+ if (
175
+ function_executor_state.status
176
+ == FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_RUNNING_TASK
177
+ ):
178
+ logger.warning(
179
+ "Destroying Function Executor that is running a task. No task output will be reported as this is expected by the Server."
180
+ )
181
+ function_executor_state.status = (
182
+ FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_STOPPING
183
+ )
184
+ await function_executor_state.destroy_function_executor()
185
+ function_executor_state.status = (
186
+ FunctionExecutorStatus.FUNCTION_EXECUTOR_STATUS_STOPPED
187
+ )
188
+
189
+ async def _create_function_executor(
190
+ self, description: FunctionExecutorDescription
191
+ ) -> FunctionExecutor:
192
+ logger = self._function_executor_logger(
193
+ id=description.id,
194
+ namespace=description.namespace,
195
+ graph_name=description.graph_name,
196
+ graph_version=description.graph_version,
197
+ function_name=description.function_name,
198
+ )
199
+ graph: SerializedObject = await self._downloader.download_graph(
200
+ namespace=description.namespace,
201
+ graph_name=description.graph_name,
202
+ graph_version=description.graph_version,
203
+ logger=logger,
204
+ )
205
+ function_executor: FunctionExecutor = FunctionExecutor(
206
+ server_factory=self._factory, logger=logger
207
+ )
208
+ config: FunctionExecutorServerConfiguration = (
209
+ FunctionExecutorServerConfiguration(
210
+ executor_id=self._executor_id,
211
+ function_executor_id=description.id,
212
+ image_uri=description.image_uri,
213
+ )
214
+ )
215
+ initialize_request: InitializeRequest = InitializeRequest(
216
+ namespace=description.namespace,
217
+ graph_name=description.graph_name,
218
+ graph_version=description.graph_version,
219
+ function_name=description.function_name,
220
+ graph=graph,
221
+ )
222
+
223
+ try:
224
+ await function_executor.initialize(
225
+ config=config,
226
+ initialize_request=initialize_request,
227
+ base_url=self._base_url,
228
+ config_path=self._config_path,
229
+ )
230
+ return function_executor
231
+ except Exception:
232
+ await function_executor.destroy()
233
+ raise
234
+
235
+ async def _cancel_running_tasks(
236
+ self, function_executor_state: FunctionExecutorState
237
+ ):
238
+ pass
239
+
240
+ def _function_executor_logger(
241
+ self,
242
+ id: str,
243
+ namespace: str,
244
+ graph_name: str,
245
+ graph_version: str,
246
+ function_name: str,
247
+ ) -> Any:
248
+ return self._logger.bind(
249
+ id=id,
250
+ namespace=namespace,
251
+ graph=graph_name,
252
+ graph_version=graph_version,
253
+ function_name=function_name,
254
+ )
255
+
256
+ async def _report_task_outcome(self, task_output: TaskOutput):
257
+ """Reports the task with the given output to the server.
258
+
259
+ Doesn't raise any Exceptions. Runs till the reporting is successful."""
260
+ reporting_retries: int = 0
261
+
262
+ while True:
263
+ logger = logger.bind(retries=reporting_retries)
264
+ try:
265
+ await self._task_reporter.report(output=task_output, logger=logger)
266
+ break
267
+ except Exception as e:
268
+ logger.error(
269
+ "failed to report task",
270
+ exc_info=e,
271
+ )
272
+ reporting_retries += 1
273
+ metric_task_outcome_report_retries.inc()
274
+ await asyncio.sleep(5)
275
+
276
+ metric_tasks_completed.labels(outcome=METRIC_TASKS_COMPLETED_OUTCOME_ALL).inc()
277
+ if task_output.is_internal_error:
278
+ metric_tasks_completed.labels(
279
+ outcome=METRIC_TASKS_COMPLETED_OUTCOME_ERROR_PLATFORM
280
+ ).inc()
281
+ elif task_output.success:
282
+ metric_tasks_completed.labels(
283
+ outcome=METRIC_TASKS_COMPLETED_OUTCOME_SUCCESS
284
+ ).inc()
285
+ else:
286
+ metric_tasks_completed.labels(
287
+ outcome=METRIC_TASKS_COMPLETED_OUTCOME_ERROR_CUSTOMER_CODE
288
+ ).inc()