indexify 0.3.18-py3-none-any.whl → 0.3.19-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- indexify/cli/cli.py +3 -17
- indexify/executor/api_objects.py +12 -0
- indexify/executor/downloader.py +4 -1
- indexify/executor/executor.py +51 -29
- indexify/executor/function_executor/function_executor.py +24 -11
- indexify/executor/function_executor/function_executor_state.py +9 -1
- indexify/executor/function_executor/function_executor_states_container.py +3 -1
- indexify/executor/function_executor/function_executor_status.py +2 -0
- indexify/executor/function_executor/server/subprocess_function_executor_server_factory.py +6 -0
- indexify/executor/function_executor/single_task_runner.py +15 -11
- indexify/executor/function_executor/task_output.py +35 -2
- indexify/executor/grpc/completed_tasks_container.py +26 -0
- indexify/executor/grpc/function_executor_controller.py +421 -0
- indexify/executor/grpc/state_reconciler.py +24 -34
- indexify/executor/grpc/state_reporter.py +35 -32
- indexify/executor/grpc/task_controller.py +449 -0
- indexify/executor/metrics/task_reporter.py +14 -0
- indexify/executor/task_reporter.py +95 -4
- indexify/executor/task_runner.py +1 -0
- indexify/proto/executor_api.proto +63 -5
- indexify/proto/executor_api_pb2.py +40 -30
- indexify/proto/executor_api_pb2.pyi +118 -3
- indexify/proto/executor_api_pb2_grpc.py +47 -0
- {indexify-0.3.18.dist-info → indexify-0.3.19.dist-info}/METADATA +1 -1
- {indexify-0.3.18.dist-info → indexify-0.3.19.dist-info}/RECORD +27 -24
- {indexify-0.3.18.dist-info → indexify-0.3.19.dist-info}/WHEEL +0 -0
- {indexify-0.3.18.dist-info → indexify-0.3.19.dist-info}/entry_points.txt +0 -0
indexify/executor/grpc/task_controller.py
ADDED
@@ -0,0 +1,449 @@
+import asyncio
+import time
+from typing import Any, Optional
+
+import grpc
+from tensorlake.function_executor.proto.function_executor_pb2 import (
+    RunTaskRequest,
+    RunTaskResponse,
+    SerializedObject,
+)
+from tensorlake.function_executor.proto.function_executor_pb2_grpc import (
+    FunctionExecutorStub,
+)
+from tensorlake.function_executor.proto.message_validator import MessageValidator
+
+from indexify.proto.executor_api_pb2 import Task
+
+from ..downloader import Downloader
+from ..function_executor.function_executor_state import FunctionExecutorState
+from ..function_executor.function_executor_status import FunctionExecutorStatus
+from ..function_executor.metrics.single_task_runner import (
+    metric_function_executor_run_task_rpc_errors,
+    metric_function_executor_run_task_rpc_latency,
+    metric_function_executor_run_task_rpcs,
+)
+from ..function_executor.task_output import TaskMetrics, TaskOutput
+
+# TODO: combine these metrics into a single python file once gRPC migration is over and old code is removed.
+from ..metrics.executor import (
+    METRIC_TASKS_COMPLETED_OUTCOME_ALL,
+    METRIC_TASKS_COMPLETED_OUTCOME_ERROR_CUSTOMER_CODE,
+    METRIC_TASKS_COMPLETED_OUTCOME_ERROR_PLATFORM,
+    METRIC_TASKS_COMPLETED_OUTCOME_SUCCESS,
+    metric_task_completion_latency,
+    metric_task_outcome_report_latency,
+    metric_task_outcome_report_retries,
+    metric_task_outcome_reports,
+    metric_tasks_completed,
+    metric_tasks_reporting_outcome,
+)
+from ..metrics.task_runner import (
+    metric_task_policy_errors,
+    metric_task_policy_latency,
+    metric_task_policy_runs,
+    metric_task_run_latency,
+    metric_task_run_platform_errors,
+    metric_task_runs,
+    metric_tasks_blocked_by_policy,
+    metric_tasks_blocked_by_policy_per_function_name,
+    metric_tasks_running,
+)
+from ..task_reporter import TaskReporter
+from .completed_tasks_container import CompletedTasksContainer
+
+_TASK_OUTCOME_REPORT_BACKOFF_SEC = 5.0
+
+
+class FunctionTimeoutError(Exception):
+    """Exception raised when a customer's task execution exceeds the allowed timeout."""
+
+    def __init__(self, message: str):
+        super().__init__(message)
+
+
+class TaskController:
+    def __init__(
+        self,
+        task: Task,
+        function_executor_state: FunctionExecutorState,
+        downloader: Downloader,
+        task_reporter: TaskReporter,
+        completed_tasks_container: CompletedTasksContainer,
+        logger: Any,
+    ):
+        """Creates a new TaskController instance.
+
+        Raises ValueError if the supplied Task is not valid.
+        """
+        _validate_task(task)
+        self._task: Task = task
+        self._function_executor_state: FunctionExecutorState = function_executor_state
+        self._downloader: Downloader = downloader
+        self._task_reporter: TaskReporter = task_reporter
+        self._completed_tasks_container: CompletedTasksContainer = (
+            completed_tasks_container
+        )
+        self._logger: Any = logger.bind(
+            function_executor_id=function_executor_state.id,
+            task_id=task.id,
+            module=__name__,
+            namespace=task.namespace,
+            graph_name=task.graph_name,
+            graph_version=task.graph_version,
+            function_name=task.function_name,
+            invocation_id=task.graph_invocation_id,
+        )
+        self._is_running: bool = False
+        self._is_cancelled: bool = False
+        self._input: Optional[SerializedObject] = None
+        self._init_value: Optional[SerializedObject] = None
+        self._output: Optional[TaskOutput] = None
+
+    async def cancel_task(self) -> None:
+        """Cancels the task."""
+        self._is_cancelled = True
+
+        async with self._function_executor_state.lock:
+            if not self._is_running:
+                return
+
+            # Mark the Function Executor as unhealthy to destroy it to cancel the running function.
+            # If FE status changed, then it means that we're off normal task execution path, e.g.
+            # Server decided to do something with FE.
+            if (
+                self._function_executor_state.status
+                == FunctionExecutorStatus.RUNNING_TASK
+            ):
+                # TODO: Add a separate FE status for cancelled function so we don't lie to server that FE is unhealthy to destroy it.
+                await self._function_executor_state.set_status(
+                    FunctionExecutorStatus.UNHEALTHY,
+                )
+                self._logger.warning("task is cancelled")
+            else:
+                self._logger.warning(
+                    "skipping marking Function Executor unhealthy on task cancellation due to unexpected FE status",
+                    status=self._function_executor_state.status.name,
+                )
+
+    async def run_task(self) -> None:
+        """Runs the supplied task and does full management of its lifecycle.
+
+        Doesn't raise any exceptions."""
+        start_time: float = time.monotonic()
+
+        try:
+            # The task can be cancelled at any time but we'll just wait until FE gets shutdown
+            # because we require this to happen from the cancel_task() caller.
+            self._input = await self._downloader.download_input(
+                namespace=self._task.namespace,
+                graph_name=self._task.graph_name,
+                graph_invocation_id=self._task.graph_invocation_id,
+                input_key=self._task.input_key,
+                logger=self._logger,
+            )
+            if self._task.HasField("reducer_output_key"):
+                self._init_value = await self._downloader.download_init_value(
+                    namespace=self._task.namespace,
+                    graph_name=self._task.graph_name,
+                    function_name=self._task.function_name,
+                    graph_invocation_id=self._task.graph_invocation_id,
+                    reducer_output_key=self._task.reducer_output_key,
+                    logger=self._logger,
+                )
+
+            await self._wait_for_idle_function_executor()
+
+            with (
+                metric_task_run_platform_errors.count_exceptions(),
+                metric_tasks_running.track_inprogress(),
+                metric_task_run_latency.time(),
+            ):
+                metric_task_runs.inc()
+                await self._run_task()
+
+            self._logger.info("task execution finished", success=self._output.success)
+        except FunctionTimeoutError:
+            self._output = TaskOutput.function_timeout(
+                task_id=self._task.id,
+                namespace=self._task.namespace,
+                graph_name=self._task.graph_name,
+                function_name=self._task.function_name,
+                graph_version=self._task.graph_version,
+                graph_invocation_id=self._task.graph_invocation_id,
+            )
+            async with self._function_executor_state.lock:
+                # Mark the Function Executor as unhealthy to destroy it to cancel the running function.
+                # If FE status changed, then it means that we're off normal task execution path, e.g.
+                # Server decided to do something with FE.
+                if (
+                    self._function_executor_state.status
+                    == FunctionExecutorStatus.RUNNING_TASK
+                ):
+                    # TODO: Add a separate FE status for timed out function so we don't lie to server that FE is unhealthy to destroy it.
+                    await self._function_executor_state.set_status(
+                        FunctionExecutorStatus.UNHEALTHY,
+                    )
+                else:
+                    self._logger.warning(
+                        "skipping marking Function Executor unhealthy on task timeout due to unexpected FE status",
+                        status=self._function_executor_state.status.name,
+                    )
+        except Exception as e:
+            self._output = TaskOutput.internal_error(
+                task_id=self._task.id,
+                namespace=self._task.namespace,
+                graph_name=self._task.graph_name,
+                function_name=self._task.function_name,
+                graph_version=self._task.graph_version,
+                graph_invocation_id=self._task.graph_invocation_id,
+            )
+            self._logger.error("task execution failed", exc_info=e)
+        finally:
+            # Release the Function Executor so others can run tasks on it if FE status didn't change.
+            # If FE status changed, then it means that we're off normal task execution path, e.g.
+            # Server decided to do something with FE.
+            async with self._function_executor_state.lock:
+                if (
+                    self._function_executor_state.status
+                    == FunctionExecutorStatus.RUNNING_TASK
+                ):
+                    await self._function_executor_state.set_status(
+                        FunctionExecutorStatus.IDLE
+                    )
+                else:
+                    self._logger.warning(
+                        "skipping marking Function Executor IDLE due to unexpected FE status",
+                        status=self._function_executor_state.status,
+                    )
+
+        _log_function_metrics(self._output, self._logger)
+
+        with (
+            metric_tasks_reporting_outcome.track_inprogress(),
+            metric_task_outcome_report_latency.time(),
+        ):
+            metric_task_outcome_reports.inc()
+            await self._report_task_outcome()
+
+        metric_task_completion_latency.observe(time.monotonic() - start_time)
+
+    async def _wait_for_idle_function_executor(self) -> None:
+        """Waits until the Function Executor is in IDLE state.
+
+        Raises an Exception if the Function Executor is in SHUTDOWN state.
+        """
+        with (
+            metric_task_policy_errors.count_exceptions(),
+            metric_tasks_blocked_by_policy.track_inprogress(),
+            metric_tasks_blocked_by_policy_per_function_name.labels(
+                function_name=self._task.function_name
+            ).track_inprogress(),
+            metric_task_policy_latency.time(),
+        ):
+            metric_task_policy_runs.inc()
+            self._logger.info(
+                "task is blocked by policy: waiting for idle function executor"
+            )
+            async with self._function_executor_state.lock:
+                await self._function_executor_state.wait_status(
+                    allowlist=[
+                        FunctionExecutorStatus.IDLE,
+                        FunctionExecutorStatus.SHUTDOWN,
+                    ]
+                )
+                if (
+                    self._function_executor_state.status
+                    == FunctionExecutorStatus.SHUTDOWN
+                ):
+                    raise Exception(
+                        "Task's Function Executor got shutdown, can't run task"
+                    )
+                await self._function_executor_state.set_status(
+                    FunctionExecutorStatus.RUNNING_TASK
+                )
+
+            # At this point the Function Executor belongs to this task controller due to RUNNING_TASK status.
+            # We can now unlock the FE state. We have to update the FE status once the task succeeds or fails.
+
+    async def _run_task(self) -> None:
+        request: RunTaskRequest = RunTaskRequest(
+            namespace=self._task.namespace,
+            graph_name=self._task.graph_name,
+            graph_version=self._task.graph_version,
+            function_name=self._task.function_name,
+            graph_invocation_id=self._task.graph_invocation_id,
+            task_id=self._task.id,
+            function_input=self._input,
+        )
+        if self._init_value is not None:
+            request.function_init_value.CopyFrom(self._init_value)
+        channel: grpc.aio.Channel = (
+            self._function_executor_state.function_executor.channel()
+        )
+
+        timeout_sec: Optional[float] = None
+        if self._task.HasField("timeout_ms"):
+            # TODO: Add integration tests with function timeout when end-to-end implementation is done.
+            timeout_sec = self._task.timeout_ms / 1000.0
+
+        async with _RunningTaskContextManager(
+            task=self._task,
+            function_executor_state=self._function_executor_state,
+        ):
+            with (
+                metric_function_executor_run_task_rpc_errors.count_exceptions(),
+                metric_function_executor_run_task_rpc_latency.time(),
+            ):
+                metric_function_executor_run_task_rpcs.inc()
+                # If this RPC failed due to customer code crashing the server we won't be
+                # able to detect this. We'll treat this as our own error for now and thus
+                # let the AioRpcError to be raised here.
+                try:
+                    response: RunTaskResponse = await FunctionExecutorStub(
+                        channel
+                    ).run_task(request, timeout=timeout_sec)
+                except grpc.aio.AioRpcError as e:
+                    if e.code() == grpc.StatusCode.DEADLINE_EXCEEDED:
+                        raise FunctionTimeoutError(
+                            f"Task execution timeout {timeout_sec} expired"
+                        ) from e
+                    raise
+
+        self._output = _task_output(task=self._task, response=response)
+
+    async def _report_task_outcome(self) -> None:
+        """Reports the task with the given output to the server.
+
+        Doesn't raise any Exceptions. Runs till the reporting is successful."""
+        reporting_retries: int = 0
+
+        while True:
+            logger = self._logger.bind(retries=reporting_retries)
+            if self._is_cancelled:
+                logger.warning(
+                    "task is cancelled, skipping its outcome reporting to workaround lack of server side retries"
+                )
+                break
+
+            try:
+                await self._task_reporter.report(output=self._output, logger=logger)
+                break
+            except Exception as e:
+                logger.error(
+                    "failed to report task",
+                    exc_info=e,
+                )
+                reporting_retries += 1
+                metric_task_outcome_report_retries.inc()
+                await asyncio.sleep(_TASK_OUTCOME_REPORT_BACKOFF_SEC)
+
+        await self._completed_tasks_container.add(self._task.id)
+        metric_tasks_completed.labels(outcome=METRIC_TASKS_COMPLETED_OUTCOME_ALL).inc()
+        if self._output.is_internal_error:
+            metric_tasks_completed.labels(
+                outcome=METRIC_TASKS_COMPLETED_OUTCOME_ERROR_PLATFORM
+            ).inc()
+        elif self._output.success:
+            metric_tasks_completed.labels(
+                outcome=METRIC_TASKS_COMPLETED_OUTCOME_SUCCESS
+            ).inc()
+        else:
+            metric_tasks_completed.labels(
+                outcome=METRIC_TASKS_COMPLETED_OUTCOME_ERROR_CUSTOMER_CODE
+            ).inc()
+
+
+def _validate_task(task: Task) -> None:
+    """Validates the supplied Task.
+
+    Raises ValueError if the Task is not valid.
+    """
+    validator = MessageValidator(task)
+    validator.required_field("id")
+    validator.required_field("namespace")
+    validator.required_field("graph_name")
+    validator.required_field("graph_version")
+    validator.required_field("function_name")
+    validator.required_field("graph_invocation_id")
+    validator.required_field("input_key")
+
+
+def _task_output(task: Task, response: RunTaskResponse) -> TaskOutput:
+    response_validator = MessageValidator(response)
+    response_validator.required_field("stdout")
+    response_validator.required_field("stderr")
+    response_validator.required_field("is_reducer")
+    response_validator.required_field("success")
+
+    metrics = TaskMetrics(counters={}, timers={})
+    if response.HasField("metrics"):
+        # Can be None if e.g. function failed.
+        metrics.counters = dict(response.metrics.counters)
+        metrics.timers = dict(response.metrics.timers)
+
+    output = TaskOutput(
+        task_id=task.id,
+        namespace=task.namespace,
+        graph_name=task.graph_name,
+        function_name=task.function_name,
+        graph_version=task.graph_version,
+        graph_invocation_id=task.graph_invocation_id,
+        stdout=response.stdout,
+        stderr=response.stderr,
+        reducer=response.is_reducer,
+        success=response.success,
+        metrics=metrics,
+    )
+
+    if response.HasField("function_output"):
+        output.function_output = response.function_output
+    if response.HasField("router_output"):
+        output.router_output = response.router_output
+
+    return output
+
+
+# Temporary workaround is logging customer metrics until we store them somewhere
+# for future retrieval and processing.
+def _log_function_metrics(output: TaskOutput, logger: Any):
+    if output.metrics is None:
+        return
+
+    logger = logger.bind(
+        invocation_id=output.graph_invocation_id,
+        function_name=output.function_name,
+        graph_name=output.graph_name,
+        namespace=output.namespace,
+    )
+
+    for counter_name, counter_value in output.metrics.counters.items():
+        logger.info(
+            "function_metric", counter_name=counter_name, counter_value=counter_value
+        )
+    for timer_name, timer_value in output.metrics.timers.items():
+        logger.info("function_metric", timer_name=timer_name, timer_value=timer_value)
+
+
+class _RunningTaskContextManager:
+    """Performs all the actions required before and after running a task."""
+
+    def __init__(
+        self,
+        task_controller: TaskController,
+    ):
+        self._task_controller: TaskController = task_controller
+
+    async def __aenter__(self):
+        self._task_controller._function_executor_state.function_executor.invocation_state_client().add_task_to_invocation_id_entry(
+            task_id=self._task_controller._task.id,
+            invocation_id=self._task_controller._task.graph_invocation_id,
+        )
+        self._task_controller._is_running = True
+        return self
+
+    async def __aexit__(self, exc_type, exc_val, exc_tb):
+        self._task_controller._is_running = False
+        self._task_controller._function_executor_state.function_executor.invocation_state_client().remove_task_to_invocation_id_entry(
+            task_id=self._task_controller._task.id,
+        )
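
The TaskController added above owns a single task's lifecycle on the new gRPC code path. The sketch below is illustrative only and is not part of the diff: it shows how a caller might drive the controller, assuming the task, Function Executor state, downloader, task reporter, completed-tasks container, and logger objects are already built by the surrounding executor code (every name in the sketch other than TaskController itself is hypothetical).

import asyncio

async def run_one_task(task, fe_state, downloader, reporter, completed, logger) -> None:
    # Hypothetical wiring sketch; all arguments are assumed to be constructed elsewhere.
    controller = TaskController(
        task=task,
        function_executor_state=fe_state,
        downloader=downloader,
        task_reporter=reporter,
        completed_tasks_container=completed,
        logger=logger,
    )
    # run_task() owns the whole lifecycle: download inputs, wait for an IDLE
    # Function Executor, run the RunTask RPC, then report the outcome. It never raises.
    run = asyncio.create_task(controller.run_task())
    # If the Server withdraws the task, the controller's owner would call
    # `await controller.cancel_task()`, which flags the task and marks the
    # Function Executor UNHEALTHY so the running function gets torn down.
    await run
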
indexify/executor/metrics/task_reporter.py
CHANGED
@@ -20,3 +20,17 @@ metric_server_ingest_files_latency: prometheus_client.Histogram = (
         "server_ingest_files_request", "Ingest files request to Server"
     )
 )
+
+metric_report_task_outcome_rpcs = prometheus_client.Counter(
+    "report_task_outcome_rpcs",
+    "Number of report task outcome RPCs to Server",
+)
+metric_report_task_outcome_errors = prometheus_client.Counter(
+    "report_task_outcome_rpc_errors",
+    "Number of report task outcome RPC errors",
+)
+metric_report_task_outcome_latency: prometheus_client.Histogram = (
+    latency_metric_for_fast_operation(
+        "report_task_outcome_rpc", "Report task outcome RPC to Server"
+    )
+)
indexify/executor/task_reporter.py
CHANGED
@@ -7,14 +7,27 @@ from httpx import Timeout
 from tensorlake.function_executor.proto.function_executor_pb2 import FunctionOutput
 from tensorlake.utils.http_client import get_httpx_client
 
+from indexify.proto.executor_api_pb2 import (
+    DataPayload,
+    OutputEncoding,
+    ReportTaskOutcomeRequest,
+    TaskOutcome,
+)
+from indexify.proto.executor_api_pb2_grpc import ExecutorAPIStub
+
 from .api_objects import (
     TASK_OUTCOME_FAILURE,
     TASK_OUTCOME_SUCCESS,
+    IngestFnOutputsResponse,
     RouterOutput,
     TaskResult,
 )
 from .function_executor.task_output import TaskOutput
+from .grpc.channel_manager import ChannelManager
 from .metrics.task_reporter import (
+    metric_report_task_outcome_errors,
+    metric_report_task_outcome_latency,
+    metric_report_task_outcome_rpcs,
     metric_server_ingest_files_errors,
     metric_server_ingest_files_latency,
     metric_server_ingest_files_requests,
@@ -45,7 +58,11 @@ class TaskOutputSummary:
 
 class TaskReporter:
     def __init__(
-        self,
+        self,
+        base_url: str,
+        executor_id: str,
+        channel_manager: ChannelManager,
+        config_path: Optional[str] = None,
     ):
         self._base_url = base_url
         self._executor_id = executor_id
@@ -56,6 +73,7 @@ class TaskReporter:
         # Creating a new async client for each request fixes this but it
         # results in not reusing established TCP connections to server.
         self._client = get_httpx_client(config_path, make_async=False)
+        self._channel_manager = channel_manager
 
     async def shutdown(self):
         """Shuts down the task reporter.
@@ -109,12 +127,12 @@ class TaskReporter:
             # Run in a separate thread to not block the main event loop.
             response = await asyncio.to_thread(
                 self._client.post,
-                url=f"{self._base_url}/internal/
+                url=f"{self._base_url}/internal/ingest_fn_outputs",
                 **kwargs,
             )
             end_time = time.time()
             logger.info(
-                "
+                "files uploaded",
                 response_time=end_time - start_time,
                 response_code=response.status_code,
             )
@@ -125,11 +143,70 @@ class TaskReporter:
             metric_server_ingest_files_errors.inc()
             # Caller catches and logs the exception.
             raise Exception(
-                "failed to
+                "failed to upload files. "
                 f"Response code: {response.status_code}. "
                 f"Response text: '{response.text}'."
             ) from e
 
+        # TODO: If the files are uploaded successfully,
+        # we should record that so that if we fail to report
+        # the task outcome, we don't retry the upload.
+        # This will save us some time and resources.
+
+        ingested_files_response = response.json()
+        ingested_files = IngestFnOutputsResponse.model_validate(ingested_files_response)
+        fn_outputs = []
+        for data_payload in ingested_files.data_payloads:
+            fn_outputs.append(
+                DataPayload(
+                    path=data_payload.path,
+                    size=data_payload.size,
+                    sha256_hash=data_payload.sha256_hash,
+                )
+            )
+        stdout, stderr = None, None
+        if ingested_files.stdout:
+            stdout = DataPayload(
+                path=ingested_files.stdout.path,
+                size=ingested_files.stdout.size,
+                sha256_hash=ingested_files.stdout.sha256_hash,
+            )
+        if ingested_files.stderr:
+            stderr = DataPayload(
+                path=ingested_files.stderr.path,
+                size=ingested_files.stderr.size,
+                sha256_hash=ingested_files.stderr.sha256_hash,
+            )
+
+        request = ReportTaskOutcomeRequest(
+            task_id=output.task_id,
+            namespace=output.namespace,
+            graph_name=output.graph_name,
+            function_name=output.function_name,
+            graph_invocation_id=output.graph_invocation_id,
+            outcome=_to_grpc_task_outcome(output),
+            invocation_id=output.graph_invocation_id,
+            executor_id=self._executor_id,
+            reducer=output.reducer,
+            next_functions=(output.router_output.edges if output.router_output else []),
+            fn_outputs=fn_outputs,
+            stdout=stdout,
+            stderr=stderr,
+            output_encoding=_to_grpc_output_encoding(output),
+            output_encoding_version=0,
+        )
+        try:
+            stub = ExecutorAPIStub(await self._channel_manager.get_channel())
+            with (
+                metric_report_task_outcome_latency.time(),
+                metric_report_task_outcome_errors.count_exceptions(),
+            ):
+                metric_report_task_outcome_rpcs.inc()
+                await stub.report_task_outcome(request, timeout=5.0)
+        except Exception as e:
+            logger.error("failed to report task outcome", error=e)
+            raise e
+
     def _process_task_output(
         self, output: TaskOutput
     ) -> Tuple[TaskResult, List[Any], TaskOutputSummary]:
@@ -246,3 +323,17 @@ def _process_stderr(
     )
     summary.stderr_count += 1
     summary.stderr_total_bytes += len(stderr)
+
+
+def _to_grpc_task_outcome(task_output: TaskOutput) -> TaskOutcome:
+    if task_output.success:
+        return TaskOutcome.TASK_OUTCOME_SUCCESS
+    else:
+        return TaskOutcome.TASK_OUTCOME_FAILURE
+
+
+def _to_grpc_output_encoding(task_output: TaskOutput) -> OutputEncoding:
+    if task_output.output_encoding == "json":
+        return OutputEncoding.OUTPUT_ENCODING_JSON
+    else:
+        return OutputEncoding.OUTPUT_ENCODING_PICKLE
indexify/executor/task_runner.py
CHANGED
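
For orientation, here is a minimal sketch (not part of the diff) of using the updated TaskReporter from indexify/executor/task_reporter.py shown above. Only the constructor signature and the report() call come from the diff; the base_url and executor_id values are placeholders, and the ChannelManager instance is assumed to be the executor's existing gRPC channel manager.

async def report_task_output(channel_manager, output, logger) -> None:
    # `channel_manager` is the executor's ChannelManager, `output` a TaskOutput,
    # `logger` a structlog-style bound logger; all are assumed to exist already.
    reporter = TaskReporter(
        base_url="http://server-address:8900",  # placeholder, not from the diff
        executor_id="executor-123",             # placeholder, not from the diff
        channel_manager=channel_manager,
        config_path=None,
    )
    # report() first uploads function outputs and stdout/stderr over HTTP
    # (POST {base_url}/internal/ingest_fn_outputs), then reports the task
    # outcome to the Server over gRPC via ExecutorAPIStub.report_task_outcome().
    await reporter.report(output=output, logger=logger)
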