gooddata-flight-server 1.28.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of gooddata-flight-server might be problematic. Click here for more details.
- gooddata_flight_server/__init__.py +23 -0
- gooddata_flight_server/_version.py +7 -0
- gooddata_flight_server/cli.py +137 -0
- gooddata_flight_server/config/__init__.py +1 -0
- gooddata_flight_server/config/config.py +536 -0
- gooddata_flight_server/errors/__init__.py +1 -0
- gooddata_flight_server/errors/error_code.py +209 -0
- gooddata_flight_server/errors/error_info.py +475 -0
- gooddata_flight_server/exceptions.py +16 -0
- gooddata_flight_server/health/__init__.py +1 -0
- gooddata_flight_server/health/health_check_http_server.py +103 -0
- gooddata_flight_server/health/server_health_monitor.py +83 -0
- gooddata_flight_server/metrics.py +16 -0
- gooddata_flight_server/py.typed +1 -0
- gooddata_flight_server/server/__init__.py +1 -0
- gooddata_flight_server/server/auth/__init__.py +1 -0
- gooddata_flight_server/server/auth/auth_middleware.py +83 -0
- gooddata_flight_server/server/auth/token_verifier.py +62 -0
- gooddata_flight_server/server/auth/token_verifier_factory.py +55 -0
- gooddata_flight_server/server/auth/token_verifier_impl.py +41 -0
- gooddata_flight_server/server/base.py +63 -0
- gooddata_flight_server/server/default.logging.ini +28 -0
- gooddata_flight_server/server/flight_rpc/__init__.py +1 -0
- gooddata_flight_server/server/flight_rpc/flight_middleware.py +162 -0
- gooddata_flight_server/server/flight_rpc/flight_server.py +230 -0
- gooddata_flight_server/server/flight_rpc/flight_service.py +281 -0
- gooddata_flight_server/server/flight_rpc/server_methods.py +200 -0
- gooddata_flight_server/server/server_base.py +321 -0
- gooddata_flight_server/server/server_main.py +116 -0
- gooddata_flight_server/tasks/__init__.py +1 -0
- gooddata_flight_server/tasks/base.py +21 -0
- gooddata_flight_server/tasks/metrics.py +115 -0
- gooddata_flight_server/tasks/task.py +193 -0
- gooddata_flight_server/tasks/task_error.py +60 -0
- gooddata_flight_server/tasks/task_executor.py +96 -0
- gooddata_flight_server/tasks/task_result.py +363 -0
- gooddata_flight_server/tasks/temporal_container.py +247 -0
- gooddata_flight_server/tasks/thread_task_executor.py +639 -0
- gooddata_flight_server/utils/__init__.py +1 -0
- gooddata_flight_server/utils/libc_utils.py +35 -0
- gooddata_flight_server/utils/logging.py +158 -0
- gooddata_flight_server/utils/methods_discovery.py +98 -0
- gooddata_flight_server/utils/otel_tracing.py +142 -0
- gooddata_flight_server-1.28.0.data/scripts/gooddata-flight-server +10 -0
- gooddata_flight_server-1.28.0.dist-info/LICENSE.txt +1066 -0
- gooddata_flight_server-1.28.0.dist-info/METADATA +737 -0
- gooddata_flight_server-1.28.0.dist-info/RECORD +49 -0
- gooddata_flight_server-1.28.0.dist-info/WHEEL +5 -0
- gooddata_flight_server-1.28.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,639 @@
|
|
|
1
|
+
# (C) 2024 GoodData Corporation
|
|
2
|
+
import abc
|
|
3
|
+
import threading
|
|
4
|
+
import time
|
|
5
|
+
from collections.abc import Generator
|
|
6
|
+
from concurrent.futures import CancelledError, Future, ThreadPoolExecutor
|
|
7
|
+
from contextlib import contextmanager
|
|
8
|
+
from dataclasses import dataclass
|
|
9
|
+
from typing import Any, Optional, Union
|
|
10
|
+
|
|
11
|
+
import opentelemetry.context as otelctx
|
|
12
|
+
import pyarrow.flight
|
|
13
|
+
import structlog
|
|
14
|
+
from opentelemetry import trace
|
|
15
|
+
|
|
16
|
+
from gooddata_flight_server.errors.error_code import ErrorCode
|
|
17
|
+
from gooddata_flight_server.errors.error_info import ErrorInfo
|
|
18
|
+
from gooddata_flight_server.tasks.base import TaskWaitTimeoutError
|
|
19
|
+
from gooddata_flight_server.tasks.metrics import TaskExecutorMetrics
|
|
20
|
+
from gooddata_flight_server.tasks.task import Task
|
|
21
|
+
from gooddata_flight_server.tasks.task_error import TaskError
|
|
22
|
+
from gooddata_flight_server.tasks.task_executor import (
|
|
23
|
+
TaskAttributes,
|
|
24
|
+
TaskExecutor,
|
|
25
|
+
)
|
|
26
|
+
from gooddata_flight_server.tasks.task_result import (
|
|
27
|
+
FlightDataTaskResult,
|
|
28
|
+
TaskExecutionResult,
|
|
29
|
+
TaskResult,
|
|
30
|
+
)
|
|
31
|
+
from gooddata_flight_server.tasks.temporal_container import TemporalContainer
|
|
32
|
+
from gooddata_flight_server.utils.otel_tracing import SERVER_TRACER
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@dataclass()
|
|
36
|
+
class _TaskExecutionStats:
|
|
37
|
+
"""
|
|
38
|
+
Container for task execution statistics.
|
|
39
|
+
|
|
40
|
+
Most of the fields here are optional. When the task completes, only fields
|
|
41
|
+
that describe work that was actually done will be set (e.g. so say prereq fields
|
|
42
|
+
will not be set if the task had no prerequisites).
|
|
43
|
+
"""
|
|
44
|
+
|
|
45
|
+
created: float
|
|
46
|
+
"""
|
|
47
|
+
time when the task was created
|
|
48
|
+
"""
|
|
49
|
+
|
|
50
|
+
run_submitted: Optional[float] = None
|
|
51
|
+
"""
|
|
52
|
+
time when task was submitted to thread pool to invoke the run()
|
|
53
|
+
"""
|
|
54
|
+
|
|
55
|
+
run_started: Optional[float] = None
|
|
56
|
+
"""
|
|
57
|
+
time when some thread actually started the run()
|
|
58
|
+
"""
|
|
59
|
+
|
|
60
|
+
run_completed: Optional[float] = None
|
|
61
|
+
"""
|
|
62
|
+
time when the run() completed (regardless of the result)
|
|
63
|
+
"""
|
|
64
|
+
|
|
65
|
+
completed: Optional[float] = None
|
|
66
|
+
"""
|
|
67
|
+
time when all work for task execution completed. if the task was actually run,
|
|
68
|
+
then this value is same as `run_completed`. if the task failed/was cancelled
|
|
69
|
+
during prerequisite resolution, then this will be equal to `prereq_completed`.
|
|
70
|
+
"""
|
|
71
|
+
|
|
72
|
+
@property
|
|
73
|
+
def run_waited_duration(self) -> float:
|
|
74
|
+
if self.run_submitted is None or self.run_started is None:
|
|
75
|
+
return -1
|
|
76
|
+
|
|
77
|
+
return self.run_started - self.run_submitted
|
|
78
|
+
|
|
79
|
+
@property
|
|
80
|
+
def run_duration(self) -> float:
|
|
81
|
+
if self.run_started is None or self.run_completed is None:
|
|
82
|
+
return -1
|
|
83
|
+
|
|
84
|
+
return self.run_completed - self.run_started
|
|
85
|
+
|
|
86
|
+
@property
|
|
87
|
+
def duration(self) -> float:
|
|
88
|
+
if self.completed is None:
|
|
89
|
+
return -1
|
|
90
|
+
|
|
91
|
+
return self.completed - self.created
|
|
92
|
+
|
|
93
|
+
@property
|
|
94
|
+
def durations_to_dict(self) -> dict[str, float]:
|
|
95
|
+
return {
|
|
96
|
+
"run_waited_duration": self.run_waited_duration,
|
|
97
|
+
"run_duration": self.run_duration,
|
|
98
|
+
"duration": self.duration,
|
|
99
|
+
}
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
class _TaskExecutionCallbacks(abc.ABC):
    """
    Contract between a TaskExecution and the executor that owns it.

    The TaskExecution drives the lifecycle of one particular task; the object implementing
    these callbacks is the one that knows how to perform the individual steps.
    """

    def run_task(self, task_execution: "_TaskExecution") -> Future:
        """
        Start running the task asynchronously.

        :param task_execution: task execution whose task is ready to run
        :return: future that will hold the outcome of the task run
        """
        raise NotImplementedError()

    def process_task_result(self, task_execution: "_TaskExecution", result: Future) -> TaskExecutionResult:
        """
        Called once the task run has finished; the `result` future is guaranteed
        to be completed at that point.

        A failure (cancellation included) may show up in two ways: as a TaskError value
        held by the future, or as an exception raised when the future's result is read.
        Implementations have to cope with both.

        Once this method returns, the executor must have a result associated with the task
        and must be able to hand it to any caller who asks for it.

        :param task_execution: task execution whose task run finished
        :param result: completed future
        :return: the execution result of the task
        """
        raise NotImplementedError()
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
class _TaskExecution:
    """
    This class represents a task execution and is responsible for its orchestration.

    The class itself does not do any heavy lifting, but keeps track of the task execution
    state and interacts with the callbacks object (`cb`) to actually perform the next steps
    (running the task itself, processing results / errors etc).

    The constructor snapshots the structlog context variables and the current OpenTelemetry
    context and starts a `task_execution` span; the span is ended after the result of the
    task run has been processed.
    """

    __slots__ = (
        "_task",
        "_cb",
        "_logging_ctx",
        "_trace_exec",
        "_result_future",
        "_lock",
        "_completed",
        "_done",
        "_stats",
    )

    def __init__(
        self,
        task: Task,
        cb: _TaskExecutionCallbacks,
    ) -> None:
        """
        :param task: task to execute
        :param cb: callbacks used to run the task and to process the outcome of the run
        """
        self._task = task
        self._cb = cb
        self._logging_ctx = structlog.contextvars.get_contextvars() or {}
        # (otel context at creation time, span covering the whole execution)
        self._trace_exec = (
            otelctx.get_current(),
            SERVER_TRACER.start_span(
                "task_execution",
                attributes={TaskAttributes.TaskId: self._task.task_id},
            ),
        )

        self._stats = _TaskExecutionStats(created=time.perf_counter())
        # lock for state synchro, intentionally reused in conditional variable
        #
        # note: reentrant lock is necessary because of the cancel() method:
        # - future cancellation happens while holding this lock
        # - during future cancellation, the code in Future invokes the
        #   registered callbacks and these also use this same lock
        self._lock = threading.RLock()

        # all these are protected using the lock
        self._result_future: Optional[Future[Union[TaskResult, TaskError]]] = None
        self._completed: threading.Condition = threading.Condition(self._lock)
        # set once the outcome of the run was processed; this is the predicate that
        # waiters on `_completed` check, so that a notification sent before a waiter
        # starts waiting is not lost
        self._done: bool = False

    @property
    def task(self) -> Task:
        return self._task

    @property
    def task_id(self) -> str:
        return self._task.task_id

    @property
    def stats(self) -> _TaskExecutionStats:
        return self._stats

    @property
    def logging_ctx(self) -> dict[str, Any]:
        return self._logging_ctx

    def _complete_execution_span(self, execution_result: TaskExecutionResult) -> None:
        """
        Sets the final status on the execution span (plus an event describing the error
        or the cancellation, if any) and ends the span.

        :param execution_result: result of the task execution
        """
        execution_span = self._trace_exec[1]

        if execution_result.error is not None:
            task_error = execution_result.error
            execution_span.set_status(trace.StatusCode.ERROR)

            execution_span.add_event(
                TaskAttributes.TaskError,
                attributes={
                    TaskAttributes.TaskErrorCode: ErrorCode.name(task_error.error_info.code),
                    TaskAttributes.TaskErrorMsg: task_error.error_info.msg,
                    TaskAttributes.TaskErrorDetail: task_error.error_info.detail or "",
                },
            )
        elif execution_result.cancelled:
            # a cancelled execution is not treated as an error on the span
            execution_span.set_status(trace.StatusCode.OK)
            execution_span.add_event(TaskAttributes.TaskCancelled)
        else:
            execution_span.set_status(trace.StatusCode.OK)

        execution_span.end()

    def on_result_done(self, fut: Future) -> None:
        """
        Done-callback registered on the result future. Lets the callbacks object process
        the outcome, wakes up everyone blocked in `wait_for_completion` and then completes
        the execution span.

        :param fut: the completed future; must be the future created in `start`
        """
        assert fut == self._result_future

        with self._lock:
            try:
                execution_result = self._cb.process_task_result(self, self._result_future)
            finally:
                # mark completion and wake the waiters even if result processing raised;
                # otherwise anyone waiting without a timeout would stay blocked forever
                self._done = True
                self._completed.notify_all()

        self._complete_execution_span(execution_result)

    @contextmanager
    def use_execution_span(self) -> Generator[trace.Span, None, None]:
        """
        Context manager which attaches the otel context captured at creation time and makes
        the execution span the current span. The span is not ended on exit and exceptions
        raised inside the block are neither recorded on it nor reflected in its status.

        :return: generator yielding the execution span
        """
        prev_otel_ctx = otelctx.attach(self._trace_exec[0])
        try:
            with trace.use_span(
                self._trace_exec[1],
                end_on_exit=False,
                record_exception=False,
                set_status_on_exception=False,
            ) as span:
                yield span
        finally:
            otelctx.detach(prev_otel_ctx)

    def start(self) -> None:
        """
        Starts the task execution:

        - submits the task for execution & sets up callbacks

        Note: this method is called at point where no one else yet knows about the
        task. Thus, there is no need for synchro.
        """
        assert self._result_future is None

        self._result_future = self._cb.run_task(self)
        self._result_future.add_done_callback(self.on_result_done)

    def cancel(self) -> bool:
        """
        Cancels the execution.

        IMPORTANT: task executor must not hold any locks at the time of cancellation.

        :return: True if cancel was successful, false if it was not possible
        """
        assert self._result_future is not None

        with self._lock:
            if self._result_future.cancel():
                # if future cancel() succeeded, it means the task never run, it was just
                # sitting in the queue and got dropped; note: the done callback registered
                # on the future itself will take care of cleaning the books
                return True

            # otherwise, the task must be running already, so try to force cancel
            # as it is running; depending on the state of the task, this may or
            # may not be possible
            return self._task.cancel()

    def wait_for_completion(self, timeout: Optional[float] = None) -> None:
        """
        Blocks until the outcome of the task run has been processed. Returns immediately
        if that already happened before the call.

        :param timeout: maximum time to wait; None means wait without a time limit
        :raises TaskWaitTimeoutError: if the execution did not complete within the timeout
        """
        with self._lock:
            # wait on a predicate instead of a bare wait(): the completion may have been
            # signalled before this thread got here, in which case a bare wait() would
            # never be woken up
            completed = self._completed.wait_for(lambda: self._done, timeout=timeout)

        if not completed:
            raise TaskWaitTimeoutError(task_id=self._task.task_id, cmd=self._task.cmd)
|
|
297
|
+
|
|
298
|
+
|
|
299
|
+
def _create_task_error(e: Exception) -> TaskError:
    """
    Converts an exception into a TaskError.

    - pyarrow FlightError: error info is extracted from the error and the original
      error type is kept as the error factory
    - ValueError: reported with the BAD_ARGUMENT code as a FlightServerError
    - anything else: reported with the COMMAND_FAILED code (traceback included)
      as a FlightServerError

    :param e: exception to convert
    :return: task error describing the exception
    """
    if isinstance(e, pyarrow.flight.FlightError):
        # captured flight errors are propagated as-is
        flight_info = ErrorInfo.from_pyarrow_error(e)

        return TaskError(error_info=flight_info, error_factory=type(e))

    if isinstance(e, ValueError):
        bad_argument_info = ErrorInfo.for_reason(ErrorCode.BAD_ARGUMENT, str(e))

        return TaskError(
            error_info=bad_argument_info,
            error_factory=pyarrow.flight.FlightServerError,
        )

    # every other, non-flight error gets wrapped into the more generic
    # FlightServerError carrying the COMMAND_FAILED error code
    generic_info = ErrorInfo.for_exc(
        ErrorCode.COMMAND_FAILED,
        e=e,
        extra_msg="There was an error while running task",
        include_traceback=True,
    )

    return TaskError(
        error_info=generic_info,
        error_factory=pyarrow.flight.FlightServerError,
    )
|
|
329
|
+
|
|
330
|
+
|
|
331
|
+
class ThreadTaskExecutor(TaskExecutor, _TaskExecutionCallbacks):
    """
    Implementation of TaskExecutor interface that submits tasks into a ThreadPoolExecutor.

    Executions that have not finished yet are tracked in `_executions`; once a task run
    finishes (successfully, with an error or by cancellation), its TaskExecutionResult is
    moved into a TemporalContainer. Results of type FlightDataTaskResult that leave the
    container are closed asynchronously on a separate thread pool.
    """

    def __init__(
        self,
        metric_prefix: str,
        task_threads: int = 4,
        result_close_threads: int = 2,
        keep_results_for: int = 15,
    ) -> None:
        """
        :param metric_prefix: prefix passed to TaskExecutorMetrics
        :param task_threads: max number of worker threads used to run tasks
        :param result_close_threads: max number of worker threads used to close results
        :param keep_results_for: grace period passed to the container holding the results
        """
        self._logger = structlog.get_logger("gooddata_flight_server.task_executor")
        self._metric_prefix = metric_prefix

        self._metrics = TaskExecutorMetrics(prefix=metric_prefix)
        self._executor = ThreadPoolExecutor(
            max_workers=task_threads,
            thread_name_prefix="gooddata_flight_server.task",
        )
        self._close_executor = ThreadPoolExecutor(
            max_workers=result_close_threads,
            thread_name_prefix="gooddata_flight_server.result_close",
        )

        # protects `_queue_size` and the move of a task from `_executions` to `_results`
        self._task_lock = threading.Lock()
        self._queue_size: int = 0
        self._executions: dict[str, _TaskExecution] = {}

        self._results: TemporalContainer[TaskExecutionResult] = TemporalContainer(
            logger_name="gooddata_flight_server.result_container",
            grace_period=keep_results_for,
            entry_evict_fun=self._on_finished_task_evicted,
        )

    def _async_close_result(self, task_id: str, task_result: FlightDataTaskResult) -> None:
        """
        Closes the result; runs on the close executor. A failure to close is logged
        as a warning and otherwise ignored.

        :param task_id: id of the task that produced the result
        :param task_result: result to close
        """
        self._metrics.close_queue_size.dec()

        try:
            task_result.close()
        except Exception:
            self._logger.warning("expired_result_close_failed", task_id=task_id, exc_info=True)

    def _on_finished_task_evicted(self, result: TaskExecutionResult) -> None:
        """
        When a finished task is evicted from the temporal container, it means the
        GetFlightInfo for this particular task can no longer be answered on this node.

        :param result: execution result that was evicted
        """
        self._logger.debug(
            "result_evicted",
            task_id=result.task_id,
        )

        task_result = result.result
        if isinstance(task_result, FlightDataTaskResult):
            # closing is done on a dedicated pool so that it does not block the caller
            self._metrics.close_queue_size.inc()
            self._close_executor.submit(self._async_close_result, result.task_id, task_result)

        self._executions.pop(result.task_id, None)

    def _create_task_exec_result(
        self,
        task_execution: _TaskExecution,
        f: Future,
    ) -> TaskExecutionResult:
        """
        Turns a completed future into a TaskExecutionResult and does the related
        logging and metrics updates.

        The outcome of the run is one of:

        - the future holds a TaskError -> failed result
        - the future holds any other value -> successful result
        - reading the future raises CancelledError -> cancelled result
        - reading the future raises any other exception -> failed result with
          a TaskError created from the exception

        In both failure cases the task's `on_task_error` hook is called and may
        replace the error.

        :param task_execution: execution whose task run finished
        :param f: completed future of the task run
        :return: execution result
        """
        assert f.done()

        task = task_execution.task
        durations = task_execution.stats.durations_to_dict
        self._metrics.task_completed.inc()

        try:
            r = f.result()

            if isinstance(r, TaskError):
                task_error = task.on_task_error(r) or r

                if task_error.client_error:
                    self._logger.info("task_failed", task_id=task.task_id, **durations)
                else:
                    # NOTE(review): the error metric is incremented for non-client errors
                    #  only - confirm this matches the intended meaning of the metric
                    self._logger.error("task_failed", task_id=task.task_id, **durations)
                    self._metrics.task_errors.inc()

                return TaskExecutionResult(
                    task_id=task.task_id,
                    cmd=task.cmd,
                    result=None,
                    error=task_error,
                    cancelled=False,
                )

            self._logger.info("task_finished", task_id=task.task_id, **durations)

            return TaskExecutionResult(
                task_id=task.task_id,
                cmd=task.cmd,
                result=r,
                error=None,
                cancelled=False,
            )
        except CancelledError:
            self._metrics.task_cancelled.inc()

            self._logger.info("task_cancelled", task_id=task.task_id, **durations)

            return TaskExecutionResult(
                task_id=task.task_id,
                cmd=task.cmd,
                result=None,
                error=None,
                cancelled=True,
            )
        except Exception as e:
            task_error = _create_task_error(e)
            error_data: dict[str, Any] = {}

            if isinstance(e, pyarrow.flight.FlightError):
                # the FlightError's usually come when interfacing with other services such as
                # doing DoPut to shard or when doing DoExchange to some worker process.
                #
                # FlightError raised by task will usually (in case of serious, unexpected errors)
                # include additional detail (such as stacktrace) - this is normally omitted when
                # just logging the exception itself because they are part of the extra_info
                #
                # so this code unpacks the error and does additional logging
                error_data = {
                    "nested_msg": task_error.error_info.msg,
                    "nested_stacktrace": task_error.error_info.detail,
                    "nested_code": ErrorCode.name(task_error.error_info.code),
                }

            try:
                task_error = task.on_task_error(task_error) or task_error
            except Exception:
                # the hook is best-effort: if it fails, the original error is reported;
                # the failure of the hook itself is logged so that it does not go unnoticed
                self._logger.warning("task_error_handler_failed", task_id=task.task_id, exc_info=True)

            if task_error.client_error:
                self._logger.info(
                    "task_failed",
                    task_id=task.task_id,
                    **durations,
                    **error_data,
                )
            else:
                self._logger.error(
                    "task_failed",
                    task_id=task.task_id,
                    exc_info=e,
                    **durations,
                    **error_data,
                )
                self._metrics.task_errors.inc()

            return TaskExecutionResult(
                task_id=task.task_id,
                cmd=task.cmd,
                result=None,
                cancelled=False,
                error=task_error,
            )

    def _task_run_wrapper(self, task_execution: _TaskExecution) -> Any:
        """
        Runs the task; this is the function submitted to the thread pool.

        Before the run, the structlog context variables of the worker thread are replaced
        with those captured by the execution and the execution span is made current. The
        timing stats and duration metrics are recorded regardless of how the run ends.

        :param task_execution: execution whose task should run
        :return: whatever the task's run() returns
        """
        task = task_execution.task
        logging_ctx = task_execution.logging_ctx
        stats = task_execution.stats

        stats.run_started = time.perf_counter()
        # worker threads are reused; drop whatever context the previous task left behind
        structlog.contextvars.clear_contextvars()
        structlog.contextvars.bind_contextvars(**logging_ctx)

        with (
            task_execution.use_execution_span(),
            SERVER_TRACER.start_as_current_span("task_run", attributes={TaskAttributes.TaskId: task.task_id}),
        ):
            self._logger.info(
                "task_run",
                task_id=task.task_id,
                waited=stats.run_waited_duration,
            )
            self._metrics.wait_time.observe(stats.run_waited_duration)

            try:
                return task.run()
            finally:
                stats.run_completed = time.perf_counter()
                stats.completed = stats.run_completed

                self._metrics.task_duration.observe(stats.run_duration)
                self._metrics.task_e2e_duration.observe(stats.duration)

    def _finish_task_with_result(self, task_execution: "_TaskExecution", result: TaskExecutionResult) -> None:
        """
        Moves the task from the running executions to the finished results and
        updates the queue size.

        :param task_execution: execution that finished
        :param result: result to associate with the task
        """
        task = task_execution.task
        with self._task_lock:
            self._executions.pop(task.task_id)
            self._results[task.task_id] = result
            self._queue_size -= 1

            if self._queue_size < 0:
                self._logger.warning("queue_size_corrupt", queue_size=self._queue_size)

            self._metrics.queue_size.set(self._queue_size)

    def run_task(
        self,
        task_execution: _TaskExecution,
    ) -> Future:
        """
        Submits the task run to the thread pool.

        :param task_execution: execution whose task should run
        :return: future of the task run
        """
        with task_execution.use_execution_span(), SERVER_TRACER.start_as_current_span("task_run_submit"):
            task_execution.stats.run_submitted = time.perf_counter()

            return self._executor.submit(self._task_run_wrapper, task_execution)

    def process_task_result(
        self,
        task_execution: "_TaskExecution",
        future: Future,
    ) -> TaskExecutionResult:
        """
        Creates the execution result from the completed future and stores it so that
        it can be returned to callers asking for it.

        :param task_execution: execution whose task run finished
        :param future: completed future of the task run
        :return: execution result
        """
        result = self._create_task_exec_result(task_execution, future)
        self._finish_task_with_result(task_execution, result)

        return result

    def submit(
        self,
        task: Task,
    ) -> None:
        """
        Registers the task and starts its execution.

        :param task: task to run
        """
        # note: task execution constructor will snapshot current logging and tracing context
        execution = _TaskExecution(task=task, cb=self)

        with self._task_lock:
            self._queue_size += 1
            self._executions[task.task_id] = execution

        execution.start()
        self._metrics.queue_size.set(self._queue_size)

    def wait_for_result(self, task_id: str, timeout: Optional[float] = None) -> Optional[TaskExecutionResult]:
        """
        Returns the result of the task, waiting for the task to complete if needed.

        :param task_id: id of the task
        :param timeout: maximum time to wait for a task that is still executing
        :return: the result; None if neither a result nor an execution exists for the task id
        """
        with self._task_lock:
            execution = self._executions.get(task_id)
            result = self._results.get_entry(task_id)

        if result is not None:
            return result
        elif execution is not None:
            execution.wait_for_completion(timeout=timeout)

            return self._results.get_entry(task_id)

        return None

    def cancel(self, task_id: str) -> bool:
        """
        Cancels the task. If the task already has a result, the result is evicted.

        :param task_id: id of the task
        :return: True if cancelled (or the result was thrown away), False otherwise
        """
        with self._task_lock:
            execution = self._executions.get(task_id)
            result = self._results.get_entry(task_id)

        if result is not None:
            # task has already completed and there is a result associated
            #
            # interpret cancel as client not being interested in this anymore
            # and throw the result away
            #
            # unless the task resulted in a flight path (e.g. possibly a persisted
            # result), treat the cancellation as successful
            self._results.evict_entry(task_id)

            return True

        if execution is None:
            # the task was not and is not running - cancel not possible
            return False

        # important: the execution must be cancelled without holding `_task_lock`
        return execution.cancel()

    def close_result(self, task_id: str) -> bool:
        """
        Removes the result of the task from the result container and processes it
        the same way as an evicted result.

        :param task_id: id of the task
        :return: True if there was a result to close, False otherwise
        """
        with self._task_lock:
            result = self._results.pop_entry(entry_id=task_id)

        if result is None:
            return False

        self._on_finished_task_evicted(result)
        return True

    def stop(self, cancel_running: bool = True, timeout: Optional[float] = None) -> None:
        """
        Stops the service. Any pending tasks will be immediately cancelled. Tasks that are already
        executing are asked to cancel if `cancel_running` is set, otherwise they are allowed to complete.

        :param cancel_running: whether to cancel already running tasks
        :param timeout: time to wait for all running tasks to finish
        :return: nothing
        """
        self._logger.info("task_exec_stopping", pending_tasks=len(self._executions))
        self._executor.shutdown(wait=False, cancel_futures=True)

        if cancel_running:
            # take a snapshot and cancel outside of the lock: processing the result of
            # a finished/cancelled task takes the execution's own lock first and
            # `_task_lock` second (and removes the task from `_executions`); cancelling
            # while holding `_task_lock` would take the two locks in the opposite order
            # and could deadlock against a task that is just finishing
            with self._task_lock:
                executions = list(self._executions.values())

            for execution in executions:
                execution.cancel()

        def _shutdown_executor() -> None:
            self._executor.shutdown(wait=True, cancel_futures=True)

        # waiting for the pool is done on a helper thread so that the wait can be
        # bounded by the timeout
        shutdown_thd = threading.Thread(target=_shutdown_executor)
        shutdown_thd.start()
        shutdown_thd.join(timeout=timeout)

        self._results.close()
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
# (C) 2024 GoodData Corporation
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
# (C) 2024 GoodData Corporation
|
|
2
|
+
import ctypes
|
|
3
|
+
from typing import Any, Union
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
class NoLibc:
    """
    Stand-in for a libc handle. Any attribute that is looked up on an instance and not
    found the regular way resolves to a function which accepts arbitrary arguments,
    does nothing and returns None.
    """

    @staticmethod
    def _noop(*args: Any, **kwargs: Any) -> None:
        # intentionally empty: swallow the call
        pass

    def __getattr__(self, item: Any) -> Any:
        # every unknown name maps to the same do-nothing function
        fallback = NoLibc._noop

        return fallback
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class LibcUtils:
    """
    Thin wrapper around functions of libc that are called via ctypes.
    """

    def __init__(self) -> None:
        # also see
        # https://stackoverflow.com/questions/67338017/does-calling-a-c-function-via-ctypes-in-python-release-the-gil-during-execution
        libc: Union[ctypes.CDLL, NoLibc]

        try:
            libc = ctypes.CDLL("libc.so.6")
        except OSError:
            # library could not be loaded; calls made through this wrapper become no-ops
            libc = NoLibc()

        self._libc = libc

    def malloc_trim(self) -> None:
        """
        Invoke libc's malloc_trim with argument 0 - cutting away excess allocations
        that were not yet returned to the OS.

        See: https://issues.apache.org/jira/browse/ARROW-16697
        :return: nothing
        """
        libc = self._libc
        libc.malloc_trim(0)
|