pixeltable 0.3.2__py3-none-any.whl → 0.3.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pixeltable/__init__.py +64 -11
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/__init__.py +1 -1
- pixeltable/catalog/catalog.py +50 -27
- pixeltable/catalog/column.py +27 -11
- pixeltable/catalog/dir.py +6 -4
- pixeltable/catalog/globals.py +8 -1
- pixeltable/catalog/insertable_table.py +22 -12
- pixeltable/catalog/named_function.py +10 -6
- pixeltable/catalog/path.py +3 -2
- pixeltable/catalog/path_dict.py +8 -6
- pixeltable/catalog/schema_object.py +2 -1
- pixeltable/catalog/table.py +121 -101
- pixeltable/catalog/table_version.py +291 -142
- pixeltable/catalog/table_version_path.py +8 -5
- pixeltable/catalog/view.py +67 -26
- pixeltable/dataframe.py +102 -72
- pixeltable/env.py +20 -21
- pixeltable/exec/__init__.py +2 -2
- pixeltable/exec/aggregation_node.py +10 -4
- pixeltable/exec/cache_prefetch_node.py +5 -3
- pixeltable/exec/component_iteration_node.py +9 -8
- pixeltable/exec/data_row_batch.py +21 -10
- pixeltable/exec/exec_context.py +10 -3
- pixeltable/exec/exec_node.py +23 -12
- pixeltable/exec/expr_eval/evaluators.py +13 -7
- pixeltable/exec/expr_eval/expr_eval_node.py +24 -15
- pixeltable/exec/expr_eval/globals.py +30 -7
- pixeltable/exec/expr_eval/row_buffer.py +5 -6
- pixeltable/exec/expr_eval/schedulers.py +151 -31
- pixeltable/exec/in_memory_data_node.py +8 -7
- pixeltable/exec/row_update_node.py +15 -5
- pixeltable/exec/sql_node.py +56 -27
- pixeltable/exprs/__init__.py +2 -2
- pixeltable/exprs/arithmetic_expr.py +57 -26
- pixeltable/exprs/array_slice.py +1 -1
- pixeltable/exprs/column_property_ref.py +2 -1
- pixeltable/exprs/column_ref.py +20 -15
- pixeltable/exprs/comparison.py +6 -2
- pixeltable/exprs/compound_predicate.py +1 -3
- pixeltable/exprs/data_row.py +2 -2
- pixeltable/exprs/expr.py +101 -72
- pixeltable/exprs/expr_dict.py +2 -1
- pixeltable/exprs/expr_set.py +3 -1
- pixeltable/exprs/function_call.py +39 -41
- pixeltable/exprs/globals.py +1 -0
- pixeltable/exprs/in_predicate.py +2 -2
- pixeltable/exprs/inline_expr.py +20 -17
- pixeltable/exprs/json_mapper.py +4 -2
- pixeltable/exprs/json_path.py +12 -18
- pixeltable/exprs/literal.py +5 -9
- pixeltable/exprs/method_ref.py +1 -0
- pixeltable/exprs/object_ref.py +1 -1
- pixeltable/exprs/row_builder.py +32 -17
- pixeltable/exprs/rowid_ref.py +14 -5
- pixeltable/exprs/similarity_expr.py +11 -6
- pixeltable/exprs/sql_element_cache.py +1 -1
- pixeltable/exprs/type_cast.py +24 -9
- pixeltable/ext/__init__.py +1 -0
- pixeltable/ext/functions/__init__.py +1 -0
- pixeltable/ext/functions/whisperx.py +2 -2
- pixeltable/ext/functions/yolox.py +11 -11
- pixeltable/func/aggregate_function.py +17 -13
- pixeltable/func/callable_function.py +6 -6
- pixeltable/func/expr_template_function.py +15 -14
- pixeltable/func/function.py +16 -16
- pixeltable/func/function_registry.py +11 -8
- pixeltable/func/globals.py +4 -2
- pixeltable/func/query_template_function.py +12 -13
- pixeltable/func/signature.py +18 -9
- pixeltable/func/tools.py +10 -17
- pixeltable/func/udf.py +106 -11
- pixeltable/functions/__init__.py +21 -2
- pixeltable/functions/anthropic.py +16 -12
- pixeltable/functions/fireworks.py +63 -5
- pixeltable/functions/gemini.py +13 -3
- pixeltable/functions/globals.py +18 -6
- pixeltable/functions/huggingface.py +20 -38
- pixeltable/functions/image.py +7 -3
- pixeltable/functions/json.py +1 -0
- pixeltable/functions/llama_cpp.py +1 -4
- pixeltable/functions/mistralai.py +31 -20
- pixeltable/functions/ollama.py +4 -18
- pixeltable/functions/openai.py +201 -108
- pixeltable/functions/replicate.py +11 -10
- pixeltable/functions/string.py +70 -7
- pixeltable/functions/timestamp.py +21 -8
- pixeltable/functions/together.py +66 -52
- pixeltable/functions/video.py +1 -0
- pixeltable/functions/vision.py +14 -11
- pixeltable/functions/whisper.py +2 -1
- pixeltable/globals.py +60 -26
- pixeltable/index/__init__.py +1 -1
- pixeltable/index/btree.py +5 -3
- pixeltable/index/embedding_index.py +15 -14
- pixeltable/io/__init__.py +1 -1
- pixeltable/io/external_store.py +30 -25
- pixeltable/io/fiftyone.py +6 -14
- pixeltable/io/globals.py +33 -27
- pixeltable/io/hf_datasets.py +2 -1
- pixeltable/io/label_studio.py +77 -68
- pixeltable/io/pandas.py +33 -9
- pixeltable/io/parquet.py +9 -12
- pixeltable/iterators/__init__.py +1 -0
- pixeltable/iterators/audio.py +205 -0
- pixeltable/iterators/document.py +19 -8
- pixeltable/iterators/image.py +6 -24
- pixeltable/iterators/string.py +3 -6
- pixeltable/iterators/video.py +1 -7
- pixeltable/metadata/__init__.py +7 -1
- pixeltable/metadata/converters/convert_10.py +2 -2
- pixeltable/metadata/converters/convert_15.py +1 -5
- pixeltable/metadata/converters/convert_16.py +2 -4
- pixeltable/metadata/converters/convert_17.py +2 -4
- pixeltable/metadata/converters/convert_18.py +2 -4
- pixeltable/metadata/converters/convert_19.py +2 -5
- pixeltable/metadata/converters/convert_20.py +1 -4
- pixeltable/metadata/converters/convert_21.py +4 -6
- pixeltable/metadata/converters/convert_22.py +1 -0
- pixeltable/metadata/converters/convert_23.py +5 -5
- pixeltable/metadata/converters/convert_24.py +12 -13
- pixeltable/metadata/converters/convert_26.py +23 -0
- pixeltable/metadata/converters/util.py +3 -4
- pixeltable/metadata/notes.py +1 -0
- pixeltable/metadata/schema.py +13 -2
- pixeltable/plan.py +173 -98
- pixeltable/store.py +42 -26
- pixeltable/type_system.py +62 -54
- pixeltable/utils/arrow.py +1 -2
- pixeltable/utils/coco.py +16 -17
- pixeltable/utils/code.py +1 -1
- pixeltable/utils/console_output.py +6 -3
- pixeltable/utils/description_helper.py +7 -7
- pixeltable/utils/documents.py +3 -1
- pixeltable/utils/filecache.py +12 -7
- pixeltable/utils/http_server.py +9 -8
- pixeltable/utils/media_store.py +2 -1
- pixeltable/utils/pytorch.py +11 -14
- pixeltable/utils/s3.py +1 -0
- pixeltable/utils/sql.py +1 -0
- pixeltable/utils/transactional_directory.py +2 -2
- {pixeltable-0.3.2.dist-info → pixeltable-0.3.3.dist-info}/METADATA +6 -8
- pixeltable-0.3.3.dist-info/RECORD +163 -0
- pixeltable-0.3.2.dist-info/RECORD +0 -161
- {pixeltable-0.3.2.dist-info → pixeltable-0.3.3.dist-info}/LICENSE +0 -0
- {pixeltable-0.3.2.dist-info → pixeltable-0.3.3.dist-info}/WHEEL +0 -0
- {pixeltable-0.3.2.dist-info → pixeltable-0.3.3.dist-info}/entry_points.txt +0 -0
pixeltable/exec/expr_eval/globals.py
CHANGED
@@ -1,16 +1,18 @@
+from __future__ import annotations
+
 import abc
 import asyncio
 from dataclasses import dataclass
 from types import TracebackType
-from typing import Any,
+from typing import Any, Optional, Protocol

-from pixeltable import exprs
-from pixeltable import func
+from pixeltable import exprs, func


 @dataclass
 class FnCallArgs:
     """Container for everything needed to execute a FunctionCall against one or more DataRows"""
+
     fn_call: exprs.FunctionCall
     rows: list[exprs.DataRow]
     # single call
@@ -37,16 +39,36 @@ class FnCallArgs:

 class Scheduler(abc.ABC):
     """
-    Base class for schedulers. A scheduler executes FunctionCalls against a limited resource pool.
+    Base class for queueing schedulers. A scheduler executes FunctionCalls against a limited resource pool.

     Expected behavior:
     - all created tasks must be recorded in dispatcher.tasks
     - schedulers are responsible for aborting execution when a) the task is cancelled or b) when an exception occurred
       elsewhere (indicated by dispatcher.exc_event)
     """
-
+
+    @dataclass(frozen=True)
+    class QueueItem:
+        """Container of work items for queueing schedulers"""
+
+        request: FnCallArgs
+        num_retries: int
+
+        def __lt__(self, other: Scheduler.QueueItem) -> bool:
+            # prioritize by number of retries (more retries = higher priority)
+            return self.num_retries > other.num_retries
+
+    resource_pool: str
+    queue: asyncio.PriorityQueue[QueueItem]  # prioritizes retries
+    dispatcher: Dispatcher
+
+    def __init__(self, resource_pool: str, dispatcher: Dispatcher):
+        self.resource_pool = resource_pool
+        self.queue = asyncio.PriorityQueue()
+        self.dispatcher = dispatcher
+
     def submit(self, item: FnCallArgs) -> None:
-
+        self.queue.put_nowait(self.QueueItem(item, 0))

     @classmethod
     @abc.abstractmethod
@@ -63,6 +85,7 @@ class Dispatcher(Protocol):
     Exceptions: evaluators/schedulers need to check exc_event prior to starting long-running (non-interruptible)
     computations
     """
+
     row_builder: exprs.RowBuilder
     exc_event: asyncio.Event
     schedulers: dict[str, Scheduler]  # key: resource pool id
@@ -90,6 +113,7 @@ class Evaluator(abc.ABC):
     - evaluators are responsible for aborting execution when a) the task is cancelled or b) when an exception occurred
       elsewhere (indicated by dispatcher.exc_event)
     """
+
     dispatcher: Dispatcher
     is_closed: bool

@@ -110,4 +134,3 @@ class Evaluator(abc.ABC):
         """Indicates that there may not be any more rows getting scheduled"""
         self.is_closed = True
         self._close()
-
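A note on the new Scheduler.QueueItem: asyncio.PriorityQueue pops the smallest item first, so the inverted __lt__ (an item with more retries compares as "less") is what lets retried requests jump ahead of fresh ones. A minimal standalone sketch of that ordering, with a string standing in for FnCallArgs:

import asyncio
from dataclasses import dataclass

@dataclass(frozen=True)
class QueueItem:
    request: str  # stand-in for FnCallArgs
    num_retries: int

    def __lt__(self, other: 'QueueItem') -> bool:
        # more retries = higher priority (PriorityQueue pops the "smallest" item)
        return self.num_retries > other.num_retries

async def main() -> None:
    queue: asyncio.PriorityQueue[QueueItem] = asyncio.PriorityQueue()
    queue.put_nowait(QueueItem('fresh', 0))
    queue.put_nowait(QueueItem('retried twice', 2))
    queue.put_nowait(QueueItem('retried once', 1))
    while not queue.empty():
        print(await queue.get())  # retried twice, then retried once, then fresh

asyncio.run(main())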
pixeltable/exec/expr_eval/row_buffer.py
CHANGED
@@ -62,15 +62,14 @@ class RowBuffer:
             return []
         rows: list[exprs.DataRow]
         if self.head_idx + n <= self.size:
-            rows = self.buffer[self.head_idx:self.head_idx + n].tolist()
-            self.buffer[self.head_idx:self.head_idx + n] = None
+            rows = self.buffer[self.head_idx : self.head_idx + n].tolist()
+            self.buffer[self.head_idx : self.head_idx + n] = None
         else:
-            rows = np.concatenate([self.buffer[self.head_idx:], self.buffer[:self.head_idx + n - self.size]]).tolist()
-            self.buffer[self.head_idx:] = None
-            self.buffer[:self.head_idx + n - self.size] = None
+            rows = np.concatenate([self.buffer[self.head_idx :], self.buffer[: self.head_idx + n - self.size]]).tolist()
+            self.buffer[self.head_idx :] = None
+            self.buffer[: self.head_idx + n - self.size] = None
         self.head_pos += n
         self.head_idx = (self.head_idx + n) % self.size
         self.num_rows -= n
         self.num_ready -= n
         return rows
-
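These whitespace-only changes (Black's slice formatting) leave the ring-buffer logic intact: when head_idx + n runs past the end of the fixed-size array, the read is stitched together from the tail and the front, and both slices are cleared. A self-contained sketch of the wrap-around branch, with hypothetical values in place of RowBuffer's actual state:

import numpy as np

size = 8
buffer = np.array([f'row{i}' for i in range(size)], dtype=object)
head_idx, n = 6, 4  # read 4 rows starting at slot 6: wraps past the end

if head_idx + n <= size:
    rows = buffer[head_idx : head_idx + n].tolist()
    buffer[head_idx : head_idx + n] = None
else:
    # wrap-around: concatenate the tail of the array with its front
    rows = np.concatenate([buffer[head_idx:], buffer[: head_idx + n - size]]).tolist()
    buffer[head_idx:] = None
    buffer[: head_idx + n - size] = None

print(rows)  # ['row6', 'row7', 'row0', 'row1']
head_idx = (head_idx + n) % size  # new head slot: 2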
pixeltable/exec/expr_eval/schedulers.py
CHANGED
@@ -5,12 +5,12 @@ import datetime
 import inspect
 import logging
 import sys
-
-from typing import
+import time
+from typing import Awaitable, Collection, Optional

-from pixeltable import env
-
-from .globals import
+from pixeltable import env, func
+
+from .globals import Dispatcher, FnCallArgs, Scheduler

 _logger = logging.getLogger('pixeltable')

@@ -29,19 +29,7 @@ class RateLimitsScheduler(Scheduler):
     TODO:
     - limit the number of in-flight requests based on the open file limit
     """
-
-    class QueueItem:
-        request: FnCallArgs
-        num_retries: int
-
-        def __lt__(self, other: RateLimitsScheduler.QueueItem) -> bool:
-            # prioritize by number of retries
-            return self.num_retries > other.num_retries
-
-    resource_pool: str
-    queue: asyncio.PriorityQueue[QueueItem]  # prioritizes retries
-    loop_task: asyncio.Task
-    dispatcher: Dispatcher
+
     get_request_resources_param_names: list[str]  # names of parameters of RateLimitsInfo.get_request_resources()

     # scheduling-related state
@@ -58,11 +46,9 @@ class RateLimitsScheduler(Scheduler):
     MAX_RETRIES = 10

     def __init__(self, resource_pool: str, dispatcher: Dispatcher):
-
-
-        self.dispatcher
-        self.loop_task = asyncio.create_task(self._main_loop())
-        self.dispatcher.register_task(self.loop_task)
+        super().__init__(resource_pool, dispatcher)
+        loop_task = asyncio.create_task(self._main_loop())
+        self.dispatcher.register_task(loop_task)
         self.pool_info = None  # initialized in _main_loop by the first request
         self.est_usage = {}
         self.num_in_flight = 0
@@ -104,6 +90,7 @@ class RateLimitsScheduler(Scheduler):
                 # wait for a single request to get rate limits
                 _logger.debug(f'initializing rate limits for {self.resource_pool}')
                 await self._exec(item.request, item.num_retries, is_task=False)
+                _logger.debug(f'initialized rate limits for {self.resource_pool}')
                 item = None
                 # if this was the first request, it created the pool_info
                 if self.pool_info is None:
@@ -111,6 +98,7 @@ class RateLimitsScheduler(Scheduler):
                     continue

             # check rate limits
+            _logger.debug(f'checking rate limits for {self.resource_pool}')
             request_resources = self._get_request_resources(item.request)
             limits_info = self._check_resource_limits(request_resources)
             aws: list[Awaitable[None]] = []
@@ -169,7 +157,6 @@ class RateLimitsScheduler(Scheduler):
         constant_kwargs, batch_kwargs = request.pxt_fn.create_batch_kwargs(batch_kwargs)
         return self.pool_info.get_request_resources(**constant_kwargs, **batch_kwargs)

-
     def _check_resource_limits(self, request_resources: dict[str, int]) -> Optional[env.RateLimitInfo]:
         """Returns the most depleted resource, relative to its limit, or None if all resources are within limits"""
         candidates: list[tuple[env.RateLimitInfo, float]] = []  # (info, relative usage)
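_check_resource_limits (unchanged here apart from the deleted blank line) returns the most depleted resource relative to its limit. A rough standalone illustration of that selection rule, using plain dicts instead of env.RateLimitInfo and an assumed 95% threshold (both are simplifications, not Pixeltable's actual API):

from typing import Optional

def most_depleted(usage: dict[str, int], limits: dict[str, int], threshold: float = 0.95) -> Optional[str]:
    """Return the resource closest to its limit, or None if all are within limits."""
    candidates: list[tuple[str, float]] = []  # (resource, relative usage)
    for resource, limit in limits.items():
        rel_usage = usage.get(resource, 0) / limit
        if rel_usage >= threshold:
            candidates.append((resource, rel_usage))
    if len(candidates) == 0:
        return None
    # pick the resource with the highest usage relative to its limit
    return max(candidates, key=lambda c: c[1])[0]

print(most_depleted({'requests': 59, 'tokens': 99_000}, {'requests': 60, 'tokens': 100_000}))  # 'tokens'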
@@ -191,7 +178,9 @@
             start_ts = datetime.datetime.now(tz=datetime.timezone.utc)
             pxt_fn = request.fn_call.fn
             assert isinstance(pxt_fn, func.CallableFunction)
-            _logger.debug(
+            _logger.debug(
+                f'scheduler {self.resource_pool}: start evaluating slot {request.fn_call.slot_idx}, batch_size={len(request.rows)}'
+            )
             self.total_requests += 1
             if request.is_batched:
                 batch_result = await pxt_fn.aexec_batch(*request.batch_args, **request.batch_kwargs)
@@ -202,7 +191,9 @@
                 result = await pxt_fn.aexec(*request.args, **request.kwargs)
                 request.row[request.fn_call.slot_idx] = result
             end_ts = datetime.datetime.now(tz=datetime.timezone.utc)
-            _logger.debug(
+            _logger.debug(
+                f'scheduler {self.resource_pool}: evaluated slot {request.fn_call.slot_idx} in {end_ts - start_ts}, batch_size={len(request.rows)}'
+            )

             # purge accumulated usage estimate, now that we have a new report
             self.est_usage = {r: 0 for r in self._resources}
@@ -210,10 +201,11 @@
             self.dispatcher.dispatch(request.rows)
         except Exception as exc:
             _logger.debug(f'scheduler {self.resource_pool}: exception in slot {request.fn_call.slot_idx}: {exc}')
-            if
+            if self.pool_info is None:
                 # our pool info should be available at this point
                 self._set_pool_info()
-
+            assert self.pool_info is not None
+            if num_retries < self.MAX_RETRIES:
                 retry_delay = self.pool_info.get_retry_delay(exc)
                 if retry_delay is not None:
                     self.total_retried += 1
@@ -229,12 +221,140 @@
                 row.set_exc(request.fn_call.slot_idx, exc)
             self.dispatcher.dispatch_exc(request.rows, request.fn_call.slot_idx, exc_tb)
         finally:
-            _logger.debug(
-                f'Scheduler stats: #requests={self.total_requests}, #retried={self.total_retried}')
+            _logger.debug(f'Scheduler stats: #requests={self.total_requests}, #retried={self.total_retried}')
             if is_task:
                 self.num_in_flight -= 1
                 self.request_completed.set()


+class RequestRateScheduler(Scheduler):
+    """
+    Scheduler for FunctionCalls with a fixed request rate limit and no runtime resource usage reports.
+
+    Rate limits are supplied in the config, in one of two ways:
+    - resource_pool='request-rate:<endpoint>':
+      * a single rate limit for all calls against that endpoint
+      * in the config: section '<endpoint>', key 'rate_limit'
+    - resource_pool='request-rate:<endpoint>:<model>':
+      * a single rate limit for all calls against that model
+      * in the config: section '<endpoint>.rate_limits', key '<model>'
+    - if no rate limit is found in the config, uses a default of 600 RPM
+
+    TODO:
+    - adaptive rate limiting based on 429 errors
+    """
+
+    secs_per_request: float  # inverted rate limit
+    num_in_flight: int
+    total_requests: int
+    total_retried: int
+
+    TIME_FORMAT = '%H:%M.%S %f'
+    MAX_RETRIES = 10
+    DEFAULT_RATE_LIMIT = 600  # requests per minute
+
+    def __init__(self, resource_pool: str, dispatcher: Dispatcher):
+        super().__init__(resource_pool, dispatcher)
+        loop_task = asyncio.create_task(self._main_loop())
+        self.dispatcher.register_task(loop_task)
+        self.num_in_flight = 0
+        self.total_requests = 0
+        self.total_retried = 0
+
+        # try to get the rate limit from the config
+        elems = resource_pool.split(':')
+        section: str
+        key: str
+        if len(elems) == 2:
+            # resource_pool: request-rate:endpoint
+            _, endpoint = elems
+            section = endpoint
+            key = 'rate_limit'
+        else:
+            # resource_pool: request-rate:endpoint:model
+            assert len(elems) == 3
+            _, endpoint, model = elems
+            section = f'{endpoint}.rate_limits'
+            key = model
+        requests_per_min = env.Env.get().config.get_int_value(key, section=section)
+        requests_per_min = requests_per_min or self.DEFAULT_RATE_LIMIT
+        self.secs_per_request = 1 / (requests_per_min / 60)
+
+    @classmethod
+    def matches(cls, resource_pool: str) -> bool:
+        return resource_pool.startswith('request-rate:')
+
+    async def _main_loop(self) -> None:
+        last_request_ts = 0.0
+        while True:
+            item = await self.queue.get()
+            if item.num_retries > 0:
+                self.total_retried += 1
+            now = time.monotonic()
+            if now - last_request_ts < self.secs_per_request:
+                wait_duration = self.secs_per_request - (now - last_request_ts)
+                _logger.debug(f'waiting for {wait_duration} for {self.resource_pool}')
+                await asyncio.sleep(wait_duration)
+
+            last_request_ts = time.monotonic()
+            if item.num_retries > 0:
+                # the last request encountered some problem: retry it synchronously, to wait for the problem to pass
+                _logger.debug(f'retrying request for {self.resource_pool}: #retries={item.num_retries}')
+                await self._exec(item.request, item.num_retries, is_task=False)
+                _logger.debug(f'retried request for {self.resource_pool}: #retries={item.num_retries}')
+            else:
+                _logger.debug(f'creating task for {self.resource_pool}')
+                self.num_in_flight += 1
+                task = asyncio.create_task(self._exec(item.request, item.num_retries, is_task=True))
+                self.dispatcher.register_task(task)
+
+    async def _exec(self, request: FnCallArgs, num_retries: int, is_task: bool) -> None:
+        assert all(not row.has_val[request.fn_call.slot_idx] for row in request.rows)
+        assert all(not row.has_exc(request.fn_call.slot_idx) for row in request.rows)
+
+        try:
+            start_ts = datetime.datetime.now(tz=datetime.timezone.utc)
+            pxt_fn = request.fn_call.fn
+            assert isinstance(pxt_fn, func.CallableFunction)
+            _logger.debug(
+                f'scheduler {self.resource_pool}: start evaluating slot {request.fn_call.slot_idx}, batch_size={len(request.rows)}'
+            )
+            self.total_requests += 1
+            if request.is_batched:
+                batch_result = await pxt_fn.aexec_batch(*request.batch_args, **request.batch_kwargs)
+                assert len(batch_result) == len(request.rows)
+                for row, result in zip(request.rows, batch_result):
+                    row[request.fn_call.slot_idx] = result
+            else:
+                result = await pxt_fn.aexec(*request.args, **request.kwargs)
+                request.row[request.fn_call.slot_idx] = result
+            end_ts = datetime.datetime.now(tz=datetime.timezone.utc)
+            _logger.debug(
+                f'scheduler {self.resource_pool}: evaluated slot {request.fn_call.slot_idx} in {end_ts - start_ts}, batch_size={len(request.rows)}'
+            )
+            self.dispatcher.dispatch(request.rows)
+
+        except Exception as exc:
+            # TODO: which exception can be retried?
+            _logger.debug(f'exception for {self.resource_pool}: {exc}')
+            status = getattr(exc, 'status', None)
+            _logger.debug(f'type={type(exc)} has_status={hasattr(exc, "status")} status={status}')
+            if num_retries < self.MAX_RETRIES:
+                self.queue.put_nowait(self.QueueItem(request, num_retries + 1))
+                return
+
+            # record the exception
+            _, _, exc_tb = sys.exc_info()
+            for row in request.rows:
+                row.set_exc(request.fn_call.slot_idx, exc)
+            self.dispatcher.dispatch_exc(request.rows, request.fn_call.slot_idx, exc_tb)
+        finally:
+            _logger.debug(
+                f'Scheduler stats: #in-flight={self.num_in_flight} #requests={self.total_requests}, #retried={self.total_retried}'
+            )
+            if is_task:
+                self.num_in_flight -= 1
+
+
 # all concrete Scheduler subclasses that implement matches()
-SCHEDULERS = [RateLimitsScheduler]
+SCHEDULERS = [RateLimitsScheduler, RequestRateScheduler]
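The new RequestRateScheduler converts a requests-per-minute limit into a minimum gap between requests: secs_per_request = 1 / (rpm / 60), so the default 600 RPM means one request every 0.1s. A minimal sketch of the same monotonic-clock pacing, detached from the dispatcher machinery (print stands in for the actual API call):

import asyncio
import time

async def paced_calls(requests_per_min: int, n_calls: int) -> None:
    secs_per_request = 1 / (requests_per_min / 60)  # 600 RPM -> 0.1s between requests
    last_request_ts = 0.0
    for i in range(n_calls):
        now = time.monotonic()
        if now - last_request_ts < secs_per_request:
            # sleep off the remainder of the minimum gap before the next request
            await asyncio.sleep(secs_per_request - (now - last_request_ts))
        last_request_ts = time.monotonic()
        print(f'request {i} at t={last_request_ts:.3f}')

asyncio.run(paced_calls(requests_per_min=600, n_calls=5))

Per the class docstring, the RPM value is read from the config (section '<endpoint>' or '<endpoint>.rate_limits', depending on whether the resource pool names a model), with 600 RPM as the fallback.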
pixeltable/exec/in_memory_data_node.py
CHANGED
@@ -1,5 +1,5 @@
 import logging
-from typing import Any, Iterator, Optional
+from typing import Any, AsyncIterator, Iterator, Optional

 import pixeltable.catalog as catalog
 import pixeltable.exprs as exprs
@@ -10,6 +10,7 @@ from .exec_node import ExecNode

 _logger = logging.getLogger('pixeltable')

+
 class InMemoryDataNode(ExecNode):
     """
     Outputs in-memory data as a DataRowBatch of a particular table.
@@ -18,6 +19,7 @@ class InMemoryDataNode(ExecNode):
     - with the values provided in the input rows
     - if an input row doesn't provide a value, sets the slot to the column default
     """
+
     tbl: catalog.TableVersion
     input_rows: list[dict[str, Any]]
     start_row_id: int
@@ -27,8 +29,7 @@ class InMemoryDataNode(ExecNode):
     output_exprs: list[exprs.ColumnRef]

     def __init__(
-        self, tbl: catalog.TableVersion, rows: list[dict[str, Any]],
-        row_builder: exprs.RowBuilder, start_row_id: int,
+        self, tbl: catalog.TableVersion, rows: list[dict[str, Any]], row_builder: exprs.RowBuilder, start_row_id: int
     ):
         # we materialize the input slots
         output_exprs = list(row_builder.input_exprs)
@@ -43,11 +44,11 @@ class InMemoryDataNode(ExecNode):
         """Create row batch and populate with self.input_rows"""
         user_cols_by_name = {
             col_ref.col.name: exprs.ColumnSlotIdx(col_ref.col, col_ref.slot_idx)
-            for col_ref in self.output_exprs
+            for col_ref in self.output_exprs
+            if col_ref.col.name is not None
         }
         output_cols_by_idx = {
-            col_ref.slot_idx: exprs.ColumnSlotIdx(col_ref.col, col_ref.slot_idx)
-            for col_ref in self.output_exprs
+            col_ref.slot_idx: exprs.ColumnSlotIdx(col_ref.col, col_ref.slot_idx) for col_ref in self.output_exprs
         }
         output_slot_idxs = {e.slot_idx for e in self.output_exprs}

@@ -68,7 +69,7 @@ class InMemoryDataNode(ExecNode):
             input_slot_idxs.add(col_info.slot_idx)

         # set the remaining output slots to their default values (presently None)
-        missing_slot_idxs =
+        missing_slot_idxs = output_slot_idxs - input_slot_idxs
         for slot_idx in missing_slot_idxs:
             col_info = output_cols_by_idx.get(slot_idx)
             assert col_info is not None
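The restored line makes the defaulting step concrete: the slots left unset by the input rows are the set difference between all output slots and the slots actually provided, and each missing slot gets the column default (presently None). A toy version of that bookkeeping, with hypothetical slot indexes in place of the RowBuilder's:

output_slot_idxs = {0, 1, 2, 3}  # every slot the node must materialize
input_slot_idxs = {0, 2}         # slots the input row actually provided

row = {0: 'a', 2: 'b'}
missing_slot_idxs = output_slot_idxs - input_slot_idxs
for slot_idx in missing_slot_idxs:
    row[slot_idx] = None  # column default (presently None)

print(row)  # {0: 'a', 2: 'b', 1: None, 3: None}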
pixeltable/exec/row_update_node.py
CHANGED
@@ -4,11 +4,13 @@ from typing import Any, AsyncIterator
 import pixeltable.catalog as catalog
 import pixeltable.exprs as exprs
 from pixeltable.utils.media_store import MediaStore
+
 from .data_row_batch import DataRowBatch
 from .exec_node import ExecNode

 _logger = logging.getLogger('pixeltable')

+
 class RowUpdateNode(ExecNode):
     """
     Update individual rows in the input batches, identified by key columns.
@@ -17,9 +19,15 @@ class RowUpdateNode(ExecNode):
     The node assumes that all update dicts contain the same keys, and it populates the slots of the columns present in
     the update list.
     """
+
     def __init__(
-
-
+        self,
+        tbl: catalog.TableVersionPath,
+        key_vals_batch: list[tuple],
+        is_rowid_key: bool,
+        col_vals_batch: list[dict[catalog.Column, Any]],
+        row_builder: exprs.RowBuilder,
+        input: ExecNode,
     ):
         super().__init__(row_builder, [], [], input)
         self.updates = {key_vals: col_vals for key_vals, col_vals in zip(key_vals_batch, col_vals_batch)}
@@ -28,7 +36,8 @@ class RowUpdateNode(ExecNode):
         # retrieve ColumnRefs from the RowBuilder (has slot_idx set)
         all_col_slot_idxs = {
             col_ref.col: col_ref.slot_idx
-            for col_ref in row_builder.unique_exprs
+            for col_ref in row_builder.unique_exprs
+            if isinstance(col_ref, exprs.ColumnRef)
         }
         self.col_slot_idxs = {col: all_col_slot_idxs[col] for col in col_vals_batch[0].keys()}
         self.key_slot_idxs = {col: all_col_slot_idxs[col] for col in tbl.tbl_version.primary_key_columns()}
@@ -37,8 +46,9 @@ class RowUpdateNode(ExecNode):
     async def __aiter__(self) -> AsyncIterator[DataRowBatch]:
         async for batch in self.input:
             for row in batch:
-                key_vals =
-                    tuple(row[slot_idx] for slot_idx in self.key_slot_idxs.values())
+                key_vals = (
+                    row.rowid if self.is_rowid_key else tuple(row[slot_idx] for slot_idx in self.key_slot_idxs.values())
+                )
                 if key_vals not in self.updates:
                     continue
                 self.matched_key_vals.add(key_vals)
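RowUpdateNode's update map is keyed by tuples, and the new is_rowid_key flag decides whether a row is matched by its rowid or by its primary-key values. A simplified sketch of that lookup, with plain dicts standing in for DataRows and slot indexes:

from typing import Any

# updates keyed by primary-key tuple (built by zipping key_vals_batch with col_vals_batch)
updates: dict[tuple, dict[str, Any]] = {
    ('u1',): {'score': 10},
    ('u3',): {'score': 99},
}

rows = [
    {'rowid': (0,), 'pk': ('u1',), 'score': 1},
    {'rowid': (1,), 'pk': ('u2',), 'score': 2},
]

is_rowid_key = False  # True would match on row['rowid'] instead of the primary key
for row in rows:
    key_vals = row['rowid'] if is_rowid_key else row['pk']
    if key_vals not in updates:
        continue
    row.update(updates[key_vals])  # populate the slots of the updated columns

print(rows)  # u1's score becomes 10; u2 is left untouched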
pixeltable/exec/sql_node.py
CHANGED
@@ -1,13 +1,14 @@
 import logging
 import warnings
 from decimal import Decimal
-from typing import Iterable, Iterator, NamedTuple, Optional,
+from typing import TYPE_CHECKING, AsyncIterator, Iterable, Iterator, NamedTuple, Optional, Sequence
 from uuid import UUID

 import sqlalchemy as sql

 import pixeltable.catalog as catalog
 import pixeltable.exprs as exprs
+
 from .data_row_batch import DataRowBatch
 from .exec_node import ExecNode

@@ -53,10 +54,12 @@ def combine_order_by_clauses(clauses: Iterable[OrderByClause]) -> Optional[OrderByClause]:


 def print_order_by_clause(clause: OrderByClause) -> str:
-    return ', '.join(
-
-
-
+    return ', '.join(
+        [
+            f'({item.expr}{", asc=True" if item.asc is True else ""}{", asc=False" if item.asc is False else ""})'
+            for item in clause
+        ]
+    )


 class SqlNode(ExecNode):
@@ -82,8 +85,12 @@ class SqlNode(ExecNode):
     limit: Optional[int]

     def __init__(
-
-
+        self,
+        tbl: Optional[catalog.TableVersionPath],
+        row_builder: exprs.RowBuilder,
+        select_list: Iterable[exprs.Expr],
+        sql_elements: exprs.SqlElementCache,
+        set_pk: bool = False,
     ):
         """
         If row_builder contains references to unstored iter columns, expands the select list to include their
@@ -186,8 +193,11 @@ class SqlNode(ExecNode):

     @classmethod
     def create_from_clause(
-
-
+        cls,
+        tbl: catalog.TableVersionPath,
+        stmt: sql.Select,
+        refd_tbl_ids: Optional[set[UUID]] = None,
+        exact_version_only: Optional[set[UUID]] = None,
     ) -> sql.Select:
         """Add From clause to stmt for tables/views referenced by materialized_exprs
         Args:
@@ -220,15 +230,14 @@ class SqlNode(ExecNode):
             # join tbl to prev_tbl on prev_tbl's rowid cols
             prev_tbl_rowid_cols = prev_tbl.store_tbl.rowid_columns()
             tbl_rowid_cols = tbl.store_tbl.rowid_columns()
-            rowid_clauses =
-
+            rowid_clauses = [
+                c1 == c2 for c1, c2 in zip(prev_tbl_rowid_cols, tbl_rowid_cols[: len(prev_tbl_rowid_cols)])
+            ]
             stmt = stmt.join(tbl.store_tbl.sa_tbl, sql.and_(*rowid_clauses))
             if tbl.id in exact_version_only:
                 stmt = stmt.where(tbl.store_tbl.v_min_col == tbl.version)
             else:
-                stmt = stmt \
-                    .where(tbl.store_tbl.v_min_col <= tbl.version) \
-                    .where(tbl.store_tbl.v_max_col > tbl.version)
+                stmt = stmt.where(tbl.store_tbl.v_min_col <= tbl.version).where(tbl.store_tbl.v_max_col > tbl.version)
             prev_tbl = tbl
         return stmt

@@ -291,7 +300,7 @@ class SqlNode(ExecNode):

             # populate output_row
             if self.num_pk_cols > 0:
-                output_row.set_pk(tuple(sql_row[-self.num_pk_cols:]))
+                output_row.set_pk(tuple(sql_row[-self.num_pk_cols :]))
             # copy the output of the SQL query into the output row
             for i, e in enumerate(self.select_list):
                 slot_idx = e.slot_idx
@@ -341,12 +350,16 @@ class SqlScanNode(SqlNode):

     Supports filtering and ordering.
     """
+
     exact_version_only: list[catalog.TableVersion]

     def __init__(
-        self,
+        self,
+        tbl: catalog.TableVersionPath,
+        row_builder: exprs.RowBuilder,
         select_list: Iterable[exprs.Expr],
-        set_pk: bool = False,
+        set_pk: bool = False,
+        exact_version_only: Optional[list[catalog.TableVersion]] = None,
     ):
         """
         Args:
@@ -367,7 +380,8 @@ class SqlScanNode(SqlNode):
         where_clause_tbl_ids = self.where_clause.tbl_ids() if self.where_clause is not None else set()
         refd_tbl_ids = exprs.Expr.all_tbl_ids(self.select_list) | where_clause_tbl_ids | self._ordering_tbl_ids()
         stmt = self.create_from_clause(
-            self.tbl, stmt, refd_tbl_ids, exact_version_only={t.id for t in self.exact_version_only})
+            self.tbl, stmt, refd_tbl_ids, exact_version_only={t.id for t in self.exact_version_only}
+        )
         return stmt


@@ -377,8 +391,12 @@ class SqlLookupNode(SqlNode):
     """

     def __init__(
-        self,
-
+        self,
+        tbl: catalog.TableVersionPath,
+        row_builder: exprs.RowBuilder,
+        select_list: Iterable[exprs.Expr],
+        sa_key_cols: list[sql.Column],
+        key_vals: list[tuple],
     ):
         """
         Args:
@@ -406,11 +424,13 @@ class SqlAggregationNode(SqlNode):
     group_by_items: Optional[list[exprs.Expr]]

     def __init__(
-        self,
+        self,
+        row_builder: exprs.RowBuilder,
         input: SqlNode,
         select_list: Iterable[exprs.Expr],
         group_by_items: Optional[list[exprs.Expr]] = None,
-        limit: Optional[int] = None,
+        limit: Optional[int] = None,
+        exact_version_only: Optional[list[catalog.TableVersion]] = None,
     ):
         """
         Args:
@@ -436,12 +456,16 @@ class SqlJoinNode(SqlNode):
     """
     Materializes data from the store via a Select ... From ... that contains joins
     """
+
     input_ctes: list[sql.CTE]
     join_clauses: list['pixeltable.plan.JoinClause']

     def __init__(
-        self,
-
+        self,
+        row_builder: exprs.RowBuilder,
+        inputs: Sequence[SqlNode],
+        join_clauses: list['pixeltable.plan.JoinClause'],
+        select_list: Iterable[exprs.Expr],
     ):
         assert len(inputs) > 1
         assert len(inputs) == len(join_clauses) + 1
@@ -456,16 +480,21 @@ class SqlJoinNode(SqlNode):

     def _create_stmt(self) -> sql.Select:
         from pixeltable import plan
+
         stmt = super()._create_stmt()
         stmt = stmt.select_from(self.input_ctes[0])
         for i in range(len(self.join_clauses)):
             join_clause = self.join_clauses[i]
             on_clause = (
-                self.sql_elements.get(join_clause.join_predicate)
+                self.sql_elements.get(join_clause.join_predicate)
+                if join_clause.join_type != plan.JoinType.CROSS
                 else sql.sql.expression.literal(True)
             )
             is_outer = join_clause.join_type == plan.JoinType.LEFT or join_clause.join_type == plan.JoinType.FULL_OUTER
             stmt = stmt.join(
-                self.input_ctes[i + 1],
-
+                self.input_ctes[i + 1],
+                onclause=on_clause,
+                isouter=is_outer,
+                full=join_clause == plan.JoinType.FULL_OUTER,
+            )
         return stmt
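The rewritten create_from_clause chains the version predicates in a single expression: a row is visible at table version v when v_min <= v < v_max, unless the caller pins an exact version, in which case v_min == v. A rough SQLAlchemy Core sketch of those two cases (the table and column names here are illustrative, not Pixeltable's actual store schema):

import sqlalchemy as sql

metadata = sql.MetaData()
store_tbl = sql.Table(
    'tbl_store', metadata,
    sql.Column('rowid', sql.Integer),
    sql.Column('v_min', sql.BigInteger),  # version that created the row
    sql.Column('v_max', sql.BigInteger),  # version that deleted it (open-ended while live)
    sql.Column('payload', sql.String),
)

version = 42
stmt = sql.select(store_tbl.c.rowid, store_tbl.c.payload)

exact_version_only = False
if exact_version_only:
    stmt = stmt.where(store_tbl.c.v_min == version)
else:
    # visible: created at or before `version`, not yet deleted at `version`
    stmt = stmt.where(store_tbl.c.v_min <= version).where(store_tbl.c.v_max > version)

print(stmt)  # renders the SELECT with the version-window predicate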