indexify-0.3.18-py3-none-any.whl → indexify-0.3.20-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. indexify/cli/cli.py +15 -17
  2. indexify/executor/api_objects.py +12 -0
  3. indexify/executor/blob_store/blob_store.py +69 -0
  4. indexify/executor/blob_store/local_fs_blob_store.py +48 -0
  5. indexify/executor/blob_store/metrics/blob_store.py +33 -0
  6. indexify/executor/blob_store/s3_blob_store.py +85 -0
  7. indexify/executor/downloader.py +149 -25
  8. indexify/executor/executor.py +77 -41
  9. indexify/executor/function_executor/function_executor.py +24 -11
  10. indexify/executor/function_executor/function_executor_state.py +9 -1
  11. indexify/executor/function_executor/function_executor_states_container.py +8 -1
  12. indexify/executor/function_executor/function_executor_status.py +4 -0
  13. indexify/executor/function_executor/health_checker.py +7 -2
  14. indexify/executor/function_executor/invocation_state_client.py +4 -2
  15. indexify/executor/function_executor/server/subprocess_function_executor_server_factory.py +6 -0
  16. indexify/executor/function_executor/single_task_runner.py +15 -11
  17. indexify/executor/function_executor/task_output.py +36 -2
  18. indexify/executor/grpc/channel_manager.py +4 -3
  19. indexify/executor/grpc/function_executor_controller.py +391 -0
  20. indexify/executor/grpc/metrics/state_reconciler.py +17 -0
  21. indexify/executor/grpc/metrics/task_controller.py +8 -0
  22. indexify/executor/grpc/state_reconciler.py +324 -217
  23. indexify/executor/grpc/state_reporter.py +52 -41
  24. indexify/executor/grpc/task_controller.py +492 -0
  25. indexify/executor/metrics/task_reporter.py +14 -0
  26. indexify/executor/task_reporter.py +115 -6
  27. indexify/executor/task_runner.py +1 -0
  28. indexify/proto/executor_api.proto +91 -7
  29. indexify/proto/executor_api_pb2.py +49 -37
  30. indexify/proto/executor_api_pb2.pyi +158 -3
  31. indexify/proto/executor_api_pb2_grpc.py +47 -0
  32. {indexify-0.3.18.dist-info → indexify-0.3.20.dist-info}/METADATA +2 -1
  33. {indexify-0.3.18.dist-info → indexify-0.3.20.dist-info}/RECORD +35 -27
  34. {indexify-0.3.18.dist-info → indexify-0.3.20.dist-info}/WHEEL +0 -0
  35. {indexify-0.3.18.dist-info → indexify-0.3.20.dist-info}/entry_points.txt +0 -0
indexify/executor/grpc/state_reporter.py
@@ -3,8 +3,6 @@ import hashlib
 from socket import gethostname
 from typing import Any, Dict, List, Optional
 
-import grpc
-
 from indexify.proto.executor_api_pb2 import (
     AllowedFunction,
 )
@@ -62,6 +60,7 @@ class ExecutorStateReporter:
         function_executor_states: FunctionExecutorStatesContainer,
         channel_manager: ChannelManager,
         logger: Any,
+        reporting_interval_sec: int = _REPORTING_INTERVAL_SEC,
     ):
         self._executor_id: str = executor_id
         self._flavor: ExecutorFlavor = flavor
@@ -74,40 +73,45 @@ class ExecutorStateReporter:
         )
         self._channel_manager = channel_manager
         self._logger: Any = logger.bind(module=__name__)
+        self._reporting_interval_sec: int = reporting_interval_sec
+
         self._is_shutdown: bool = False
         self._executor_status: ExecutorStatus = ExecutorStatus.EXECUTOR_STATUS_UNKNOWN
         self._allowed_functions: List[AllowedFunction] = _to_grpc_allowed_functions(
             function_allowlist
         )
         self._labels.update(_label_values_to_strings(RuntimeProbes().probe().labels))
+        self._last_server_clock: Optional[int] = None
 
     def update_executor_status(self, value: ExecutorStatus):
         self._executor_status = value
 
+    def update_last_server_clock(self, value: int):
+        self._last_server_clock = value
+
     async def run(self):
         """Runs the state reporter.
 
         Never raises any exceptions.
         """
+        # TODO: Move this into a new async task and cancel it in shutdown().
         while not self._is_shutdown:
-            async with await self._channel_manager.get_channel() as server_channel:
-                server_channel: grpc.aio.Channel
-                stub = ExecutorAPIStub(server_channel)
-                while not self._is_shutdown:
-                    try:
-                        # The periodic state reports serve as channel health monitoring requests
-                        # (same as TCP keep-alive). Channel Manager returns the same healthy channel
-                        # for all RPCs that we do from Executor to Server. So all the RPCs benefit
-                        # from this channel health monitoring.
-                        await self.report_state(stub)
-                        await asyncio.sleep(_REPORTING_INTERVAL_SEC)
-                    except Exception as e:
-                        self._logger.error(
-                            f"Failed to report state to the server, reconnecting in {_REPORT_BACKOFF_ON_ERROR_SEC} sec.",
-                            exc_info=e,
-                        )
-                        await asyncio.sleep(_REPORT_BACKOFF_ON_ERROR_SEC)
-                        break
+            stub = ExecutorAPIStub(await self._channel_manager.get_channel())
+            while not self._is_shutdown:
+                try:
+                    # The periodic state reports serve as channel health monitoring requests
+                    # (same as TCP keep-alive). Channel Manager returns the same healthy channel
+                    # for all RPCs that we do from Executor to Server. So all the RPCs benefit
+                    # from this channel health monitoring.
+                    await self.report_state(stub)
+                    await asyncio.sleep(self._reporting_interval_sec)
+                except Exception as e:
+                    self._logger.error(
+                        f"Failed to report state to the server, reconnecting in {_REPORT_BACKOFF_ON_ERROR_SEC} sec.",
+                        exc_info=e,
+                    )
+                    await asyncio.sleep(_REPORT_BACKOFF_ON_ERROR_SEC)
+                    break
 
         self._logger.info("State reporter shutdown")
 
@@ -134,12 +138,21 @@ class ExecutorStateReporter:
             labels=self._labels,
         )
         state.state_hash = _state_hash(state)
+        if self._last_server_clock is not None:
+            state.server_clock = self._last_server_clock
 
         await stub.report_executor_state(
             ReportExecutorStateRequest(executor_state=state),
             timeout=_REPORT_RPC_TIMEOUT_SEC,
        )
 
+    async def shutdown(self):
+        """Shuts down the state reporter.
+
+        Never raises any exceptions.
+        """
+        self._is_shutdown = True
+
     async def _fetch_free_host_resources(self) -> HostResources:
         # TODO: Implement host resource metrics reporting.
         return HostResources(
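
Taken together, the reporter hunks above add three caller-facing knobs: a configurable reporting interval, a server clock echoed back in each report, and an explicit shutdown(). A minimal driving sketch; the reporter's full constructor is not shown in these hunks, and the helper and argument names here (drive_state_reporter, latest_server_clock) are illustrative only:

    import asyncio

    async def drive_state_reporter(reporter, latest_server_clock: int) -> None:
        # Start the periodic reporting loop in the background.
        reporting_loop = asyncio.create_task(reporter.run())

        # Whatever component handles Server messages (e.g. the state reconciler)
        # can feed the last observed server clock back into the reporter so it
        # is echoed in the next reported ExecutorState.
        reporter.update_last_server_clock(latest_server_clock)

        # On executor shutdown, flip the flag; run() notices it on its next
        # iteration, so the loop ends within roughly one reporting interval.
        await reporter.shutdown()
        await reporting_loop
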
@@ -157,30 +170,28 @@
 
         async for function_executor_state in self._function_executor_states:
             function_executor_state: FunctionExecutorState
-            states.append(
-                FunctionExecutorStateProto(
-                    description=FunctionExecutorDescription(
-                        id=function_executor_state.id,
-                        namespace=function_executor_state.namespace,
-                        graph_name=function_executor_state.graph_name,
-                        graph_version=function_executor_state.graph_version,
-                        function_name=function_executor_state.function_name,
-                    ),
-                    status=_to_grpc_function_executor_status(
-                        function_executor_state.status, self._logger
-                    ),
-                )
+            function_executor_state_proto = FunctionExecutorStateProto(
+                description=FunctionExecutorDescription(
+                    id=function_executor_state.id,
+                    namespace=function_executor_state.namespace,
+                    graph_name=function_executor_state.graph_name,
+                    graph_version=function_executor_state.graph_version,
+                    function_name=function_executor_state.function_name,
+                    secret_names=function_executor_state.secret_names,
+                ),
+                status=_to_grpc_function_executor_status(
+                    function_executor_state.status, self._logger
+                ),
+                status_message=function_executor_state.status_message,
             )
+            if function_executor_state.image_uri:
+                function_executor_state_proto.description.image_uri = (
+                    function_executor_state.image_uri
+                )
+            states.append(function_executor_state_proto)
 
         return states
 
-    async def shutdown(self):
-        """Shuts down the state reporter.
-
-        Never raises any exceptions.
-        """
-        self._is_shutdown = True
-
 
 def _to_grpc_allowed_functions(function_allowlist: Optional[List[FunctionURI]]):
     if function_allowlist is None:
@@ -210,7 +221,7 @@ _STATUS_MAPPING: Dict[FunctionExecutorStatus, Any] = {
     FunctionExecutorStatus.UNHEALTHY: FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_UNHEALTHY,
     FunctionExecutorStatus.DESTROYING: FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STOPPING,
     FunctionExecutorStatus.DESTROYED: FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STOPPED,
-    FunctionExecutorStatus.SHUTDOWN: FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_STOPPED,
+    FunctionExecutorStatus.SHUTDOWN: FunctionExecutorStatusProto.FUNCTION_EXECUTOR_STATUS_SHUTDOWN,
 }
 
 
indexify/executor/grpc/task_controller.py
@@ -0,0 +1,492 @@
+import asyncio
+from typing import Any, Optional
+
+import grpc
+from tensorlake.function_executor.proto.function_executor_pb2 import (
+    RunTaskRequest,
+    RunTaskResponse,
+    SerializedObject,
+)
+from tensorlake.function_executor.proto.function_executor_pb2_grpc import (
+    FunctionExecutorStub,
+)
+from tensorlake.function_executor.proto.message_validator import MessageValidator
+
+from indexify.proto.executor_api_pb2 import Task
+
+from ..downloader import Downloader
+from ..function_executor.function_executor import FunctionExecutor
+from ..function_executor.function_executor_state import FunctionExecutorState
+from ..function_executor.function_executor_status import FunctionExecutorStatus
+from ..function_executor.metrics.single_task_runner import (
+    metric_function_executor_run_task_rpc_errors,
+    metric_function_executor_run_task_rpc_latency,
+    metric_function_executor_run_task_rpcs,
+)
+from ..function_executor.task_output import TaskMetrics, TaskOutput
+
+# TODO: combine these metrics into a single python file once gRPC migration is over and old code is removed.
+from ..metrics.executor import (
+    METRIC_TASKS_COMPLETED_OUTCOME_ALL,
+    METRIC_TASKS_COMPLETED_OUTCOME_ERROR_CUSTOMER_CODE,
+    METRIC_TASKS_COMPLETED_OUTCOME_ERROR_PLATFORM,
+    METRIC_TASKS_COMPLETED_OUTCOME_SUCCESS,
+    metric_task_completion_latency,
+    metric_task_outcome_report_latency,
+    metric_task_outcome_report_retries,
+    metric_task_outcome_reports,
+    metric_tasks_completed,
+    metric_tasks_fetched,
+    metric_tasks_reporting_outcome,
+)
+from ..metrics.task_runner import (
+    metric_task_policy_latency,
+    metric_task_policy_runs,
+    metric_task_run_latency,
+    metric_task_run_platform_errors,
+    metric_task_runs,
+    metric_tasks_blocked_by_policy,
+    metric_tasks_blocked_by_policy_per_function_name,
+    metric_tasks_running,
+)
+from ..task_reporter import TaskReporter
+from .metrics.task_controller import metric_task_cancellations
+
+_TASK_OUTCOME_REPORT_BACKOFF_SEC = 5.0
+
+
+def validate_task(task: Task) -> None:
+    """Validates the supplied Task.
+
+    Raises ValueError if the Task is not valid.
+    """
+    validator = MessageValidator(task)
+    validator.required_field("id")
+    validator.required_field("namespace")
+    validator.required_field("graph_name")
+    validator.required_field("graph_version")
+    validator.required_field("function_name")
+    validator.required_field("graph_invocation_id")
+    if not (task.HasField("input_key") or task.HasField("input")):
+        raise ValueError(
+            "Task must have either input_key or input field set. " f"Got task: {task}"
+        )
+
+
+def task_logger(task: Task, logger: Any) -> Any:
+    """Returns a logger bound with the task's metadata.
+
+    The function assumes that the task might be invalid."""
+    return logger.bind(
+        task_id=task.id if task.HasField("id") else None,
+        namespace=task.namespace if task.HasField("namespace") else None,
+        graph_name=task.graph_name if task.HasField("graph_name") else None,
+        graph_version=task.graph_version if task.HasField("graph_version") else None,
+        function_name=task.function_name if task.HasField("function_name") else None,
+        graph_invocation_id=(
+            task.graph_invocation_id if task.HasField("graph_invocation_id") else None
+        ),
+    )
+
+
+class TaskController:
+    def __init__(
+        self,
+        task: Task,
+        downloader: Downloader,
+        task_reporter: TaskReporter,
+        function_executor_id: str,
+        function_executor_state: FunctionExecutorState,
+        logger: Any,
+    ):
+        """Creates a new TaskController instance.
+
+        The supplied Task must be already validated by the caller using validate_task().
+        """
+        self._task: Task = task
+        self._downloader: Downloader = downloader
+        self._task_reporter: TaskReporter = task_reporter
+        self._function_executor_id: str = function_executor_id
+        self._function_executor_state: FunctionExecutorState = function_executor_state
+        self._logger: Any = task_logger(task, logger).bind(
+            function_executor_id=function_executor_id,
+            module=__name__,
+        )
+
+        self._input: Optional[SerializedObject] = None
+        self._init_value: Optional[SerializedObject] = None
+        self._is_timed_out: bool = False
+        # Automatically start the controller on creation.
+        self._task_runner: asyncio.Task = asyncio.create_task(
+            self._run(), name="task controller task runner"
+        )
+
+    def function_executor_id(self) -> str:
+        return self._function_executor_id
+
+    def task(self) -> Task:
+        return self._task
+
+    async def destroy(self) -> None:
+        """Destroys the controller and cancells the task if it didn't finish yet.
+
+        A running task is cancelled by destroying its Function Executor.
+        Doesn't raise any exceptions.
+        """
+        if self._task_runner.done():
+            return  # Nothin to do, the task is finished already.
+
+        # The task runner code handles asyncio.CancelledError properly.
+        self._task_runner.cancel()
+        # Don't await the cancelled task to not block the caller unnecessary.
+
+    async def _run(self) -> None:
+        metric_tasks_fetched.inc()
+        with metric_task_completion_latency.time():
+            await self._run_task()
+
+    async def _run_task(self) -> None:
+        """Runs the supplied task and does full managemenet of its lifecycle.
+
+        Doesn't raise any exceptions."""
+        output: Optional[TaskOutput] = None
+
+        try:
+            await self._download_inputs()
+            output = await self._run_task_when_function_executor_is_available()
+            self._logger.info("task execution finished", success=output.success)
+            _log_function_metrics(output, self._logger)
+        except Exception as e:
+            metric_task_run_platform_errors.inc(),
+            output = self._internal_error_output()
+            self._logger.error("task execution failed", exc_info=e)
+        except asyncio.CancelledError:
+            metric_task_cancellations.inc()
+            self._logger.info("task execution cancelled")
+            # Don't report task outcome according to the current policy.
+            # asyncio.CancelledError can't be suppressed, see Python docs.
+            raise
+
+        # Current task outcome reporting policy:
+        # Don't report task outcomes for tasks that didn't fail with internal or customer error.
+        # This is required to simplify the protocol so Server doesn't need to care about task states
+        # and cancel each tasks carefully to not get its outcome as failed.
+        with (
+            metric_tasks_reporting_outcome.track_inprogress(),
+            metric_task_outcome_report_latency.time(),
+        ):
+            metric_task_outcome_reports.inc()
+            await self._report_task_outcome(output)
+
+    async def _download_inputs(self) -> None:
+        """Downloads the task inputs and init value.
+
+        Raises an Exception if the inputs failed to download.
+        """
+        self._input = await self._downloader.download_input(
+            namespace=self._task.namespace,
+            graph_name=self._task.graph_name,
+            graph_invocation_id=self._task.graph_invocation_id,
+            input_key=self._task.input_key,
+            data_payload=self._task.input if self._task.HasField("input") else None,
+            logger=self._logger,
+        )
+
+        if self._task.HasField("reducer_output_key") or self._task.HasField(
+            "reducer_input"
+        ):
+            self._init_value = await self._downloader.download_init_value(
+                namespace=self._task.namespace,
+                graph_name=self._task.graph_name,
+                function_name=self._task.function_name,
+                graph_invocation_id=self._task.graph_invocation_id,
+                reducer_output_key=(
+                    self._task.reducer_output_key
+                    if self._task.HasField("reducer_output_key")
+                    else ""
+                ),
+                data_payload=(
+                    self._task.reducer_input
+                    if self._task.HasField("reducer_input")
+                    else None
+                ),
+                logger=self._logger,
+            )
+
+    async def _run_task_when_function_executor_is_available(self) -> TaskOutput:
+        """Runs the task on the Function Executor when it's available.
+
+        Raises an Exception if task failed due to an internal error."""
+        await self._acquire_function_executor()
+
+        next_status: FunctionExecutorStatus = FunctionExecutorStatus.IDLE
+        try:
+            return await self._run_task_on_acquired_function_executor()
+        except asyncio.CancelledError:
+            # This one is raised here when destroy() was called while we were running the task on this FE.
+            next_status = FunctionExecutorStatus.UNHEALTHY
+            # asyncio.CancelledError can't be suppressed, see Python docs.
+            raise
+        finally:
+            # If the task finished running on FE then put it into IDLE state so other tasks can run on it.
+            # Otherwise, mark the FE as unhealthy to force its destruction so the task stops running on it eventually
+            # and no other tasks run on this FE because it'd result in undefined behavior.
+            if self._is_timed_out:
+                next_status = FunctionExecutorStatus.UNHEALTHY
+            await self._release_function_executor(next_status=next_status)
+
+    async def _acquire_function_executor(self) -> None:
+        """Waits until the Function Executor is in IDLE state and then locks it so the task can run on it.
+
+        Doesn't raise any exceptions.
+        """
+        with (
+            metric_tasks_blocked_by_policy.track_inprogress(),
+            metric_tasks_blocked_by_policy_per_function_name.labels(
+                function_name=self._task.function_name
+            ).track_inprogress(),
+            metric_task_policy_latency.time(),
+        ):
+            metric_task_policy_runs.inc()
+            self._logger.info(
+                "task is blocked by policy: waiting for idle function executor"
+            )
+            async with self._function_executor_state.lock:
+                await self._function_executor_state.wait_status(
+                    allowlist=[FunctionExecutorStatus.IDLE]
+                )
+                await self._function_executor_state.set_status(
+                    FunctionExecutorStatus.RUNNING_TASK
+                )
+
+        # At this point the Function Executor belongs to this task controller due to RUNNING_TASK status.
+        # We can now unlock the FE state. We have to update the FE status once the task succeeds or fails.
+
+    async def _release_function_executor(
+        self, next_status: FunctionExecutorStatus
+    ) -> None:
+        # Release the Function Executor so others can run tasks on it if FE status didn't change.
+        # If FE status changed, then it means that we're off normal task execution path, e.g.
+        # Server decided to do something with FE.
+        async with self._function_executor_state.lock:
+            if (
+                self._function_executor_state.status
+                == FunctionExecutorStatus.RUNNING_TASK
+            ):
+                await self._function_executor_state.set_status(next_status)
+                if next_status == FunctionExecutorStatus.UNHEALTHY:
+                    # Destroy the unhealthy FE asap so it doesn't consume resources.
+                    # Don't do it under the state lock to not add unnecessary delays.
+                    asyncio.create_task(
+                        self._function_executor_state.function_executor.destroy()
+                    )
+                    self._function_executor_state.function_executor = None
+            else:
+                self._logger.warning(
+                    "skipping releasing Function Executor after running the task due to unexpected Function Executor status",
+                    status=self._function_executor_state.status.name,
+                    next_status=next_status.name,
+                )
+
+    async def _run_task_on_acquired_function_executor(self) -> TaskOutput:
+        """Runs the task on the Function Executor acquired by this task already and returns the output.
+
+        Raises an Exception if the task failed to run due to an internal error."""
+        with metric_tasks_running.track_inprogress(), metric_task_run_latency.time():
+            metric_task_runs.inc()
+            return await self._run_task_rpc_on_function_executor()
+
+    async def _run_task_rpc_on_function_executor(self) -> TaskOutput:
+        """Runs the task on the Function Executor and returns the output.
+
+        Raises an Exception if the task failed to run due to an internal error.
+        """
+        request: RunTaskRequest = RunTaskRequest(
+            namespace=self._task.namespace,
+            graph_name=self._task.graph_name,
+            graph_version=self._task.graph_version,
+            function_name=self._task.function_name,
+            graph_invocation_id=self._task.graph_invocation_id,
+            task_id=self._task.id,
+            function_input=self._input,
+        )
+        # Don't keep the input in memory after we started running the task.
+        self._input = None
+
+        if self._init_value is not None:
+            request.function_init_value.CopyFrom(self._init_value)
+            # Don't keep the init value in memory after we started running the task.
+            self._init_value = None
+
+        channel: grpc.aio.Channel = (
+            self._function_executor_state.function_executor.channel()
+        )
+
+        timeout_sec: Optional[float] = None
+        if self._task.HasField("timeout_ms"):
+            # TODO: Add integration tests with function timeout when end-to-end implementation is done.
+            timeout_sec = self._task.timeout_ms / 1000.0
+
+        async with _RunningTaskContextManager(
+            task=self._task,
+            function_executor=self._function_executor_state.function_executor,
+        ):
+            with (
+                metric_function_executor_run_task_rpc_errors.count_exceptions(),
+                metric_function_executor_run_task_rpc_latency.time(),
+            ):
+                metric_function_executor_run_task_rpcs.inc()
+                # If this RPC failed due to customer code crashing the server we won't be
+                # able to detect this. We'll treat this as our own error for now and thus
+                # let the AioRpcError to be raised here.
+                try:
+                    response: RunTaskResponse = await FunctionExecutorStub(
+                        channel
+                    ).run_task(request, timeout=timeout_sec)
+                except grpc.aio.AioRpcError as e:
+                    if e.code() == grpc.StatusCode.DEADLINE_EXCEEDED:
+                        # Not logging customer error.
+                        self._is_timed_out = True
+                        return self._function_timeout_output(timeout_sec=timeout_sec)
+                    raise
+
+        return _task_output_from_function_executor_response(
+            task=self._task, response=response
+        )
+
+    async def _report_task_outcome(self, output: TaskOutput) -> None:
+        """Reports the task with the given output to the server.
+
+        Doesn't raise any Exceptions. Runs till the reporting is successful."""
+        reporting_retries: int = 0
+
+        while True:
+            logger = self._logger.bind(retries=reporting_retries)
+            try:
+                await self._task_reporter.report(output=output, logger=logger)
+                break
+            except Exception as e:
+                logger.error(
+                    "failed to report task",
+                    exc_info=e,
+                )
+                reporting_retries += 1
+                metric_task_outcome_report_retries.inc()
+                await asyncio.sleep(_TASK_OUTCOME_REPORT_BACKOFF_SEC)
+
+        metric_tasks_completed.labels(outcome=METRIC_TASKS_COMPLETED_OUTCOME_ALL).inc()
+        if output.is_internal_error:
+            metric_tasks_completed.labels(
+                outcome=METRIC_TASKS_COMPLETED_OUTCOME_ERROR_PLATFORM
+            ).inc()
+        elif output.success:
+            metric_tasks_completed.labels(
+                outcome=METRIC_TASKS_COMPLETED_OUTCOME_SUCCESS
+            ).inc()
+        else:
+            metric_tasks_completed.labels(
+                outcome=METRIC_TASKS_COMPLETED_OUTCOME_ERROR_CUSTOMER_CODE
+            ).inc()
+
+    def _internal_error_output(self) -> TaskOutput:
+        return TaskOutput.internal_error(
+            task_id=self._task.id,
+            namespace=self._task.namespace,
+            graph_name=self._task.graph_name,
+            function_name=self._task.function_name,
+            graph_version=self._task.graph_version,
+            graph_invocation_id=self._task.graph_invocation_id,
+        )
+
+    def _function_timeout_output(self, timeout_sec: float) -> TaskOutput:
+        return TaskOutput.function_timeout(
+            task_id=self._task.id,
+            namespace=self._task.namespace,
+            graph_name=self._task.graph_name,
+            function_name=self._task.function_name,
+            graph_version=self._task.graph_version,
+            graph_invocation_id=self._task.graph_invocation_id,
+            timeout_sec=timeout_sec,
+        )
+
+
+def _task_output_from_function_executor_response(
+    task: Task, response: RunTaskResponse
+) -> TaskOutput:
+    response_validator = MessageValidator(response)
+    response_validator.required_field("stdout")
+    response_validator.required_field("stderr")
+    response_validator.required_field("is_reducer")
+    response_validator.required_field("success")
+
+    metrics = TaskMetrics(counters={}, timers={})
+    if response.HasField("metrics"):
+        # Can be None if e.g. function failed.
+        metrics.counters = dict(response.metrics.counters)
+        metrics.timers = dict(response.metrics.timers)
+
+    output = TaskOutput(
+        task_id=task.id,
+        namespace=task.namespace,
+        graph_name=task.graph_name,
+        function_name=task.function_name,
+        graph_version=task.graph_version,
+        graph_invocation_id=task.graph_invocation_id,
+        stdout=response.stdout,
+        stderr=response.stderr,
+        reducer=response.is_reducer,
+        success=response.success,
+        metrics=metrics,
+    )
+
+    if response.HasField("function_output"):
+        output.function_output = response.function_output
+    if response.HasField("router_output"):
+        output.router_output = response.router_output
+
+    return output
+
+
+# Temporary workaround is logging customer metrics until we store them somewhere
+# for future retrieval and processing.
+def _log_function_metrics(output: TaskOutput, logger: Any):
+    if output.metrics is None:
+        return
+
+    logger = logger.bind(
+        invocation_id=output.graph_invocation_id,
+        function_name=output.function_name,
+        graph_name=output.graph_name,
+        namespace=output.namespace,
+    )
+
+    for counter_name, counter_value in output.metrics.counters.items():
+        logger.info(
+            "function_metric", counter_name=counter_name, counter_value=counter_value
+        )
+    for timer_name, timer_value in output.metrics.timers.items():
+        logger.info("function_metric", timer_name=timer_name, timer_value=timer_value)
+
+
+class _RunningTaskContextManager:
+    """Performs all the actions required before and after running a task."""
+
+    def __init__(
+        self,
+        task: Task,
+        function_executor: FunctionExecutor,
+    ):
+        self._task = task
+        self._function_executor: FunctionExecutor = function_executor
+
+    async def __aenter__(self):
+        self._function_executor.invocation_state_client().add_task_to_invocation_id_entry(
+            task_id=self._task.id,
+            invocation_id=self._task.graph_invocation_id,
+        )
+        return self
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        self._function_executor.invocation_state_client().remove_task_to_invocation_id_entry(
+            task_id=self._task.id,
+        )
indexify/executor/metrics/task_reporter.py
@@ -20,3 +20,17 @@ metric_server_ingest_files_latency: prometheus_client.Histogram = (
         "server_ingest_files_request", "Ingest files request to Server"
     )
 )
+
+metric_report_task_outcome_rpcs = prometheus_client.Counter(
+    "report_task_outcome_rpcs",
+    "Number of report task outcome RPCs to Server",
+)
+metric_report_task_outcome_errors = prometheus_client.Counter(
+    "report_task_outcome_rpc_errors",
+    "Number of report task outcome RPC errors",
+)
+metric_report_task_outcome_latency: prometheus_client.Histogram = (
+    latency_metric_for_fast_operation(
+        "report_task_outcome_rpc", "Report task outcome RPC to Server"
+    )
+)
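
These follow the usual prometheus_client pattern of a request counter, an error counter, and a latency histogram. A sketch of how they would typically wrap the report RPC; the corresponding task_reporter.py changes are not included in this excerpt, so both the function and the stub method name below are illustrative:

    async def _report_task_outcome_rpc(stub, request, timeout_sec: float):
        with (
            metric_report_task_outcome_errors.count_exceptions(),
            metric_report_task_outcome_latency.time(),
        ):
            metric_report_task_outcome_rpcs.inc()
            # The stub method name is illustrative; the RPC added to
            # executor_api.proto in this release is not shown in this excerpt.
            return await stub.report_task_outcome(request, timeout=timeout_sec)
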