pixeltable 0.3.0__py3-none-any.whl → 0.3.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries; it is provided for informational purposes only and reflects the changes between the two versions as they appear in the public registry.
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/insertable_table.py +3 -3
- pixeltable/catalog/table.py +2 -2
- pixeltable/catalog/table_version.py +3 -2
- pixeltable/catalog/view.py +1 -1
- pixeltable/dataframe.py +52 -27
- pixeltable/env.py +109 -4
- pixeltable/exec/__init__.py +1 -1
- pixeltable/exec/aggregation_node.py +3 -3
- pixeltable/exec/cache_prefetch_node.py +13 -7
- pixeltable/exec/component_iteration_node.py +3 -9
- pixeltable/exec/data_row_batch.py +17 -5
- pixeltable/exec/exec_node.py +32 -12
- pixeltable/exec/expr_eval/__init__.py +1 -0
- pixeltable/exec/expr_eval/evaluators.py +240 -0
- pixeltable/exec/expr_eval/expr_eval_node.py +408 -0
- pixeltable/exec/expr_eval/globals.py +113 -0
- pixeltable/exec/expr_eval/row_buffer.py +76 -0
- pixeltable/exec/expr_eval/schedulers.py +240 -0
- pixeltable/exec/in_memory_data_node.py +2 -2
- pixeltable/exec/row_update_node.py +14 -14
- pixeltable/exec/sql_node.py +2 -2
- pixeltable/exprs/column_ref.py +5 -1
- pixeltable/exprs/data_row.py +50 -40
- pixeltable/exprs/expr.py +57 -12
- pixeltable/exprs/function_call.py +54 -19
- pixeltable/exprs/inline_expr.py +12 -21
- pixeltable/exprs/literal.py +25 -8
- pixeltable/exprs/row_builder.py +25 -2
- pixeltable/func/aggregate_function.py +4 -0
- pixeltable/func/callable_function.py +54 -4
- pixeltable/func/expr_template_function.py +5 -1
- pixeltable/func/function.py +48 -7
- pixeltable/func/query_template_function.py +16 -7
- pixeltable/func/udf.py +7 -1
- pixeltable/functions/__init__.py +1 -1
- pixeltable/functions/anthropic.py +97 -21
- pixeltable/functions/gemini.py +2 -6
- pixeltable/functions/openai.py +219 -28
- pixeltable/globals.py +2 -3
- pixeltable/io/hf_datasets.py +1 -1
- pixeltable/io/label_studio.py +5 -5
- pixeltable/io/parquet.py +1 -1
- pixeltable/metadata/__init__.py +2 -1
- pixeltable/plan.py +24 -9
- pixeltable/store.py +6 -0
- pixeltable/type_system.py +73 -36
- pixeltable/utils/arrow.py +3 -8
- pixeltable/utils/console_output.py +41 -0
- pixeltable/utils/filecache.py +1 -1
- {pixeltable-0.3.0.dist-info → pixeltable-0.3.2.dist-info}/METADATA +4 -1
- {pixeltable-0.3.0.dist-info → pixeltable-0.3.2.dist-info}/RECORD +55 -49
- pixeltable/exec/expr_eval_node.py +0 -232
- {pixeltable-0.3.0.dist-info → pixeltable-0.3.2.dist-info}/LICENSE +0 -0
- {pixeltable-0.3.0.dist-info → pixeltable-0.3.2.dist-info}/WHEEL +0 -0
- {pixeltable-0.3.0.dist-info → pixeltable-0.3.2.dist-info}/entry_points.txt +0 -0
pixeltable/exec/expr_eval/globals.py
ADDED

@@ -0,0 +1,113 @@
+import abc
+import asyncio
+from dataclasses import dataclass
+from types import TracebackType
+from typing import Any, Protocol, Optional
+
+from pixeltable import exprs
+from pixeltable import func
+
+
+@dataclass
+class FnCallArgs:
+    """Container for everything needed to execute a FunctionCall against one or more DataRows"""
+    fn_call: exprs.FunctionCall
+    rows: list[exprs.DataRow]
+    # single call
+    args: Optional[list[Any]] = None
+    kwargs: Optional[dict[str, Any]] = None
+    # batch call
+    batch_args: Optional[list[list[Optional[Any]]]] = None
+    batch_kwargs: Optional[dict[str, list[Optional[Any]]]] = None
+
+    @property
+    def pxt_fn(self) -> func.CallableFunction:
+        assert isinstance(self.fn_call.fn, func.CallableFunction)
+        return self.fn_call.fn
+
+    @property
+    def is_batched(self) -> bool:
+        return self.batch_args is not None
+
+    @property
+    def row(self) -> exprs.DataRow:
+        assert len(self.rows) == 1
+        return self.rows[0]
+
+
+class Scheduler(abc.ABC):
+    """
+    Base class for schedulers. A scheduler executes FunctionCalls against a limited resource pool.
+
+    Expected behavior:
+    - all created tasks must be recorded in dispatcher.tasks
+    - schedulers are responsible for aborting execution when a) the task is cancelled or b) when an exception occurred
+      elsewhere (indicated by dispatcher.exc_event)
+    """
+    @abc.abstractmethod
+    def submit(self, item: FnCallArgs) -> None:
+        pass
+
+    @classmethod
+    @abc.abstractmethod
+    def matches(cls, resource_pool: str) -> bool:
+        """Returns True if the scheduler can handle the given resource pool"""
+        pass
+
+
+class Dispatcher(Protocol):
+    """
+    Row dispatcher used by Evaluators/Schedulers for post-processing after slot materialization and for task management.
+
+    Task management: all tasks need to be registered via register_task()
+    Exceptions: evaluators/schedulers need to check exc_event prior to starting long-running (non-interruptible)
+    computations
+    """
+    row_builder: exprs.RowBuilder
+    exc_event: asyncio.Event
+    schedulers: dict[str, Scheduler]  # key: resource pool id
+
+    def dispatch(self, rows: list[exprs.DataRow]) -> None:
+        """Dispatches row slots to the appropriate schedulers; does not block"""
+        ...
+
+    def dispatch_exc(self, rows: list[exprs.DataRow], slot_with_exc: int, exc_tb: TracebackType) -> None:
+        """Propagates exception in slot_with_exc to all dependent slots and dispatches the rest; does not block"""
+        ...
+
+    def register_task(self, f: asyncio.Task) -> None:
+        """Register task with dispatcher for subsequent cleanup; does not block"""
+        ...
+
+
+class Evaluator(abc.ABC):
+    """
+    Base class for expression evaluators. Each DataRow slot is assigned an evaluator, which is responsible for the
+    execution of the expression evaluation logic as well as the scheduling/task breakdown of that execution.
+
+    Expected behavior:
+    - all created tasks must be recorded in dispatcher.tasks
+    - evaluators are responsible for aborting execution when a) the task is cancelled or b) when an exception occurred
+      elsewhere (indicated by dispatcher.exc_event)
+    """
+    dispatcher: Dispatcher
+    is_closed: bool
+
+    def __init__(self, dispatcher: Dispatcher):
+        self.dispatcher = dispatcher
+        self.is_closed = False
+
+    @abc.abstractmethod
+    def schedule(self, rows: list[exprs.DataRow], slot_idx: int) -> None:
+        """Create tasks to evaluate the expression in the given slot for the given rows; must not block."""
+        ...
+
+    def _close(self) -> None:
+        """Close the evaluator; must not block"""
+        pass
+
+    def close(self) -> None:
+        """Indicates that there may not be any more rows getting scheduled"""
+        self.is_closed = True
+        self._close()
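Taken together, these interfaces split expression execution into three roles: an Evaluator breaks a slot's evaluation into tasks, a Scheduler runs FunctionCalls against one resource pool, and the Dispatcher routes finished rows and tracks tasks. As a reading aid only, here is a minimal sketch of a scheduler written against this contract; the class name and the 'unbounded:' pool prefix are invented for illustration and are not part of this release:

import asyncio

from pixeltable.exec.expr_eval.globals import Dispatcher, FnCallArgs, Scheduler


class UnboundedScheduler(Scheduler):
    """Hypothetical scheduler: starts every submitted call immediately, with no rate limiting."""

    def __init__(self, dispatcher: Dispatcher):
        self.dispatcher = dispatcher

    @classmethod
    def matches(cls, resource_pool: str) -> bool:
        # assumed pool-id convention, mirroring RateLimitsScheduler's 'rate-limits:' prefix
        return resource_pool.startswith('unbounded:')

    def submit(self, item: FnCallArgs) -> None:
        # per the contract: submit() must not block, and every created task must be registered
        task = asyncio.create_task(self._exec(item))
        self.dispatcher.register_task(task)

    async def _exec(self, item: FnCallArgs) -> None:
        # assumes a non-batched call, i.e. item.args/item.kwargs are set
        if self.dispatcher.exc_event.is_set():
            return  # an exception occurred elsewhere; abort instead of starting new work
        result = await item.pxt_fn.aexec(*item.args, **item.kwargs)
        item.row[item.fn_call.slot_idx] = result
        self.dispatcher.dispatch(item.rows)  # hand the materialized slot back for post-processing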
pixeltable/exec/expr_eval/row_buffer.py
ADDED

@@ -0,0 +1,76 @@
+from __future__ import annotations
+
+import logging
+from typing import Optional
+
+import numpy as np
+
+from pixeltable import exprs
+
+_logger = logging.getLogger('pixeltable')
+
+
+class RowBuffer:
+    """Fixed-length circular buffer of DataRows; knows how to maintain input order"""
+
+    size: int
+    row_pos_map: Optional[dict[int, int]]  # id(row) -> position of row in output; None if not maintaining order
+    num_rows: int  # number of rows in the buffer
+    num_ready: int  # number of consecutive non-None rows at head
+    buffer: np.ndarray  # of object
+    head_idx: int  # index of beginning of the buffer
+    head_pos: int  # row position of the beginning of the buffer
+
+    def __init__(self, size: int):
+        self.size = size
+        self.row_pos_map = None
+        self.num_rows = 0
+        self.num_ready = 0
+        self.buffer = np.full(size, None, dtype=object)
+        self.head_pos = 0
+        self.head_idx = 0
+
+    def set_row_pos_map(self, row_pos_map: dict[int, int]) -> None:
+        self.row_pos_map = row_pos_map
+
+    def add_row(self, row: exprs.DataRow) -> None:
+        offset: int  # of new row from head
+        if self.row_pos_map is not None:
+            pos = self.row_pos_map.get(id(row))
+            assert pos is not None and (pos - self.head_pos < self.size), f'{pos} {self.head_pos} {self.size}'
+            offset = pos - self.head_pos
+        else:
+            offset = self.num_rows
+        idx = (self.head_idx + offset) % self.size
+        assert self.buffer[idx] is None
+
+        self.buffer[idx] = row
+        self.num_rows += 1
+        if self.row_pos_map is not None:
+            if offset == self.num_ready:
+                # we have new ready rows; find out how many
+                while offset < self.size and self.buffer[(self.head_idx + offset) % self.size] is not None:
+                    offset += 1
+                self.num_ready = offset
+        else:
+            self.num_ready += 1
+
+    def get_rows(self, n: int) -> list[exprs.DataRow]:
+        """Get up to n ready rows from head"""
+        n = min(n, self.num_ready)
+        if n == 0:
+            return []
+        rows: list[exprs.DataRow]
+        if self.head_idx + n <= self.size:
+            rows = self.buffer[self.head_idx:self.head_idx + n].tolist()
+            self.buffer[self.head_idx:self.head_idx + n] = None
+        else:
+            rows = np.concatenate([self.buffer[self.head_idx:], self.buffer[:self.head_idx + n - self.size]]).tolist()
+            self.buffer[self.head_idx:] = None
+            self.buffer[:self.head_idx + n - self.size] = None
+        self.head_pos += n
+        self.head_idx = (self.head_idx + n) % self.size
+        self.num_rows -= n
+        self.num_ready -= n
+        return rows
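The ordering logic is the subtle part: with a row_pos_map installed, num_ready only advances while the prefix at the head of the buffer is contiguous, so rows may arrive out of order but always leave in input order. A small usage sketch, under the assumption that RowBuffer is content-agnostic (it only uses id() and object identity), with plain objects standing in for DataRows:

buf = RowBuffer(size=4)
rows = [object() for _ in range(3)]                      # stand-ins for DataRows
buf.set_row_pos_map({id(r): pos for pos, r in enumerate(rows)})

buf.add_row(rows[2])             # arrives first, but belongs at position 2
assert buf.get_rows(4) == []     # position 0 still missing: nothing is ready yet

buf.add_row(rows[0])             # head arrives; ready prefix is [rows[0]]
buf.add_row(rows[1])             # gap closes; positions 0-2 are now contiguous
assert buf.get_rows(4) == [rows[0], rows[1], rows[2]]    # returned in input order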
pixeltable/exec/expr_eval/schedulers.py
ADDED

@@ -0,0 +1,240 @@
+from __future__ import annotations
+
+import asyncio
+import datetime
+import inspect
+import logging
+import sys
+from dataclasses import dataclass
+from typing import Optional, Awaitable, Collection
+
+from pixeltable import env
+from pixeltable import func
+from .globals import Scheduler, FnCallArgs, Dispatcher
+
+_logger = logging.getLogger('pixeltable')
+
+
+class RateLimitsScheduler(Scheduler):
+    """
+    Scheduler for FunctionCalls with a RateLimitsInfo pool, which provides information about actual resource usage.
+
+    Scheduling strategy:
+    - try to stay below resource limits by utilizing reported RateLimitInfo.remaining
+    - also take into account the estimated resource usage for in-flight requests
+      (obtained via RateLimitsInfo.get_request_resources())
+    - issue synchronous requests when we don't have a RateLimitsInfo yet or when we depleted a resource and need to
+      wait for a reset
+
+    TODO:
+    - limit the number of in-flight requests based on the open file limit
+    """
+    @dataclass(frozen=True)
+    class QueueItem:
+        request: FnCallArgs
+        num_retries: int
+
+        def __lt__(self, other: RateLimitsScheduler.QueueItem) -> bool:
+            # prioritize by number of retries
+            return self.num_retries > other.num_retries
+
+    resource_pool: str
+    queue: asyncio.PriorityQueue[QueueItem]  # prioritizes retries
+    loop_task: asyncio.Task
+    dispatcher: Dispatcher
+    get_request_resources_param_names: list[str]  # names of parameters of RateLimitsInfo.get_request_resources()
+
+    # scheduling-related state
+    pool_info: Optional[env.RateLimitsInfo]
+    est_usage: dict[str, int]  # value per resource; accumulated estimates since the last util. report
+
+    num_in_flight: int  # unfinished tasks
+    request_completed: asyncio.Event
+
+    total_requests: int
+    total_retried: int
+
+    TIME_FORMAT = '%H:%M.%S %f'
+    MAX_RETRIES = 10
+
+    def __init__(self, resource_pool: str, dispatcher: Dispatcher):
+        self.resource_pool = resource_pool
+        self.queue = asyncio.PriorityQueue()
+        self.dispatcher = dispatcher
+        self.loop_task = asyncio.create_task(self._main_loop())
+        self.dispatcher.register_task(self.loop_task)
+        self.pool_info = None  # initialized in _main_loop by the first request
+        self.est_usage = {}
+        self.num_in_flight = 0
+        self.request_completed = asyncio.Event()
+        self.total_requests = 0
+        self.total_retried = 0
+        self.get_request_resources_param_names = []
+
+    @classmethod
+    def matches(cls, resource_pool: str) -> bool:
+        return resource_pool.startswith('rate-limits:')
+
+    def submit(self, item: FnCallArgs) -> None:
+        self.queue.put_nowait(self.QueueItem(item, 0))
+
+    def _set_pool_info(self) -> None:
+        """Initialize pool_info with the RateLimitsInfo for the resource pool, if available"""
+        if self.pool_info is not None:
+            return
+        self.pool_info = env.Env.get().get_resource_pool_info(self.resource_pool, None)
+        if self.pool_info is None:
+            return
+        assert isinstance(self.pool_info, env.RateLimitsInfo)
+        assert hasattr(self.pool_info, 'get_request_resources')
+        sig = inspect.signature(self.pool_info.get_request_resources)
+        self.get_request_resources_param_names = [p.name for p in sig.parameters.values()]
+        self.est_usage = {r: 0 for r in self._resources}
+
+    async def _main_loop(self) -> None:
+        item: Optional[RateLimitsScheduler.QueueItem] = None
+        while True:
+            if item is None:
+                item = await self.queue.get()
+                if item.num_retries > 0:
+                    self.total_retried += 1
+
+            now = datetime.datetime.now(tz=datetime.timezone.utc)
+            if self.pool_info is None or not self.pool_info.is_initialized():
+                # wait for a single request to get rate limits
+                _logger.debug(f'initializing rate limits for {self.resource_pool}')
+                await self._exec(item.request, item.num_retries, is_task=False)
+                item = None
+                # if this was the first request, it created the pool_info
+                if self.pool_info is None:
+                    self._set_pool_info()
+                continue
+
+            # check rate limits
+            request_resources = self._get_request_resources(item.request)
+            limits_info = self._check_resource_limits(request_resources)
+            aws: list[Awaitable[None]] = []
+            completed_aw: Optional[asyncio.Task] = None
+            wait_for_reset: Optional[asyncio.Task] = None
+            if limits_info is not None:
+                # limits_info's resource is depleted, wait for capacity to free up
+
+                if self.num_in_flight > 0:
+                    # a completed request can free up capacity
+                    self.request_completed.clear()
+                    completed_aw = asyncio.create_task(self.request_completed.wait())
+                    aws.append(completed_aw)
+                    _logger.debug(f'waiting for completed request for {self.resource_pool}')
+
+                reset_at = limits_info.reset_at
+                if reset_at > now:
+                    # we're waiting for the rate limit to reset
+                    wait_for_reset = asyncio.create_task(asyncio.sleep((reset_at - now).total_seconds()))
+                    aws.append(wait_for_reset)
+                    _logger.debug(f'waiting for rate limit reset for {self.resource_pool}')
+
+            if len(aws) > 0:
+                # we have something to wait for
+                done, pending = await asyncio.wait(aws, return_when=asyncio.FIRST_COMPLETED)
+                for task in pending:
+                    task.cancel()
+                if completed_aw in done:
+                    _logger.debug(f'wait(): completed request for {self.resource_pool}')
+                if wait_for_reset in done:
+                    _logger.debug(f'wait(): rate limit reset for {self.resource_pool}')
+                    # force waiting for another rate limit report before making any scheduling decisions
+                    self.pool_info.reset()
+                # re-evaluate current capacity for current item
+                continue
+
+            # we have a new in-flight request
+            for resource, val in request_resources.items():
+                self.est_usage[resource] += val
+            _logger.debug(f'creating task for {self.resource_pool}')
+            self.num_in_flight += 1
+            task = asyncio.create_task(self._exec(item.request, item.num_retries, is_task=True))
+            self.dispatcher.register_task(task)
+            item = None
+
+    @property
+    def _resources(self) -> Collection[str]:
+        return self.pool_info.resource_limits.keys() if self.pool_info is not None else []
+
+    def _get_request_resources(self, request: FnCallArgs) -> dict[str, int]:
+        kwargs_batch = request.fn_call.get_param_values(self.get_request_resources_param_names, request.rows)
+        if not request.is_batched:
+            return self.pool_info.get_request_resources(**kwargs_batch[0])
+        else:
+            batch_kwargs = {k: [d[k] for d in kwargs_batch] for k in kwargs_batch[0]}
+            constant_kwargs, batch_kwargs = request.pxt_fn.create_batch_kwargs(batch_kwargs)
+            return self.pool_info.get_request_resources(**constant_kwargs, **batch_kwargs)
+
+    def _check_resource_limits(self, request_resources: dict[str, int]) -> Optional[env.RateLimitInfo]:
+        """Returns the most depleted resource, relative to its limit, or None if all resources are within limits"""
+        candidates: list[tuple[env.RateLimitInfo, float]] = []  # (info, relative usage)
+        for resource, usage in request_resources.items():
+            # 0.05: leave some headroom, we don't have perfect information
+            info = self.pool_info.resource_limits[resource]
+            est_remaining = info.remaining - self.est_usage[resource] - usage
+            if est_remaining < 0.05 * info.limit:
+                candidates.append((info, est_remaining / info.limit))
+        if len(candidates) == 0:
+            return None
+        return min(candidates, key=lambda x: x[1])[0]
+
+    async def _exec(self, request: FnCallArgs, num_retries: int, is_task: bool) -> None:
+        assert all(not row.has_val[request.fn_call.slot_idx] for row in request.rows)
+        assert all(not row.has_exc(request.fn_call.slot_idx) for row in request.rows)
+
+        try:
+            start_ts = datetime.datetime.now(tz=datetime.timezone.utc)
+            pxt_fn = request.fn_call.fn
+            assert isinstance(pxt_fn, func.CallableFunction)
+            _logger.debug(f'scheduler {self.resource_pool}: start evaluating slot {request.fn_call.slot_idx}, batch_size={len(request.rows)}')
+            self.total_requests += 1
+            if request.is_batched:
+                batch_result = await pxt_fn.aexec_batch(*request.batch_args, **request.batch_kwargs)
+                assert len(batch_result) == len(request.rows)
+                for row, result in zip(request.rows, batch_result):
+                    row[request.fn_call.slot_idx] = result
+            else:
+                result = await pxt_fn.aexec(*request.args, **request.kwargs)
+                request.row[request.fn_call.slot_idx] = result
+            end_ts = datetime.datetime.now(tz=datetime.timezone.utc)
+            _logger.debug(f'scheduler {self.resource_pool}: evaluated slot {request.fn_call.slot_idx} in {end_ts - start_ts}, batch_size={len(request.rows)}')
+
+            # purge accumulated usage estimate, now that we have a new report
+            self.est_usage = {r: 0 for r in self._resources}
+
+            self.dispatcher.dispatch(request.rows)
+        except Exception as exc:
+            _logger.debug(f'scheduler {self.resource_pool}: exception in slot {request.fn_call.slot_idx}: {exc}')
+            if self.pool_info is None:
+                # our pool info should be available at this point
+                self._set_pool_info()
+            if num_retries < self.MAX_RETRIES and self.pool_info is not None:
+                retry_delay = self.pool_info.get_retry_delay(exc)
+                if retry_delay is not None:
+                    self.total_retried += 1
+                    _logger.debug(f'scheduler {self.resource_pool}: retrying in {retry_delay} seconds')
+                    await asyncio.sleep(retry_delay)
+                    self.queue.put_nowait(self.QueueItem(request, num_retries + 1))
+                    return
+            # TODO: update resource limits reported in exc.response.headers, if present
+
+            # record the exception
+            _, _, exc_tb = sys.exc_info()
+            for row in request.rows:
+                row.set_exc(request.fn_call.slot_idx, exc)
+            self.dispatcher.dispatch_exc(request.rows, request.fn_call.slot_idx, exc_tb)
+        finally:
+            _logger.debug(
+                f'Scheduler stats: #requests={self.total_requests}, #retried={self.total_retried}')
+            if is_task:
+                self.num_in_flight -= 1
+                self.request_completed.set()
+
+
+# all concrete Scheduler subclasses that implement matches()
+SCHEDULERS = [RateLimitsScheduler]
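The scheduling decision in _check_resource_limits comes down to simple arithmetic: a resource counts as depleted when the last reported remaining capacity, minus the estimated cost of in-flight requests and of the candidate request, falls below a 5% headroom margin. A standalone sketch of that check, with made-up numbers:

limit = 10_000        # e.g. tokens per minute, as reported by the provider
remaining = 900       # RateLimitInfo.remaining from the last usage report
est_usage = 500       # accumulated estimates for in-flight requests since that report
request = 250         # estimated cost of the candidate request

est_remaining = remaining - est_usage - request   # 150
depleted = est_remaining < 0.05 * limit           # 150 < 500 -> True: wait for a reset or a completion

When several resources fall below the margin, the candidates are ranked by est_remaining / limit and the most depleted one drives the wait.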
pixeltable/exec/in_memory_data_node.py
CHANGED

@@ -1,5 +1,5 @@
 import logging
-from typing import Any, Iterator, Optional
+from typing import Any, Iterator, Optional, AsyncIterator
 
 import pixeltable.catalog as catalog
 import pixeltable.exprs as exprs
@@ -76,6 +76,6 @@ class InMemoryDataNode(ExecNode):
 
         self.ctx.num_rows = len(self.output_rows)
 
-    def __iter__(self) -> Iterator[DataRowBatch]:
+    async def __aiter__(self) -> AsyncIterator[DataRowBatch]:
         _logger.debug(f'InMemoryDataNode: created row batch with {len(self.output_rows)} output_rows')
         yield self.output_rows
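This one-line change is part of a release-wide conversion of ExecNode iteration from sync to async: nodes now implement __aiter__ as an async generator and pull from their input with async for, which is what lets the new expr_eval schedulers overlap I/O-bound UDF calls with row production. A self-contained sketch of the pattern with toy classes (not the pixeltable API):

import asyncio
from typing import AsyncIterator

class Source:
    """Toy stand-in for a producer ExecNode."""
    async def __aiter__(self) -> AsyncIterator[list[int]]:
        for batch in ([1, 2], [3, 4]):
            yield batch

class Transform:
    """Toy stand-in for a pass-through node like RowUpdateNode below."""
    def __init__(self, input_node: Source):
        self.input = input_node

    async def __aiter__(self) -> AsyncIterator[list[int]]:
        async for batch in self.input:        # pulls batches from the upstream node
            yield [v * 10 for v in batch]     # rewrites rows, re-yields the batch

async def main() -> None:
    async for batch in Transform(Source()):
        print(batch)                          # [10, 20], then [30, 40]

asyncio.run(main())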
pixeltable/exec/row_update_node.py
CHANGED

@@ -1,5 +1,5 @@
 import logging
-from typing import Any
+from typing import Any, AsyncIterator
 
 import pixeltable.catalog as catalog
 import pixeltable.exprs as exprs
@@ -34,19 +34,19 @@ class RowUpdateNode(ExecNode):
         self.key_slot_idxs = {col: all_col_slot_idxs[col] for col in tbl.tbl_version.primary_key_columns()}
         self.matched_key_vals: set[tuple] = set()
 
-    def …
-    batch …
-    …
+    async def __aiter__(self) -> AsyncIterator[DataRowBatch]:
+        async for batch in self.input:
+            for row in batch:
+                key_vals = row.rowid if self.is_rowid_key else \
+                    tuple(row[slot_idx] for slot_idx in self.key_slot_idxs.values())
+                if key_vals not in self.updates:
+                    continue
+                self.matched_key_vals.add(key_vals)
+                col_vals = self.updates[key_vals]
+                for col, val in col_vals.items():
+                    slot_idx = self.col_slot_idxs[col]
+                    row[slot_idx] = val
+            yield batch
 
     def unmatched_rows(self) -> list[dict[str, Any]]:
         """Return rows that didn't get used in the updates as a list of dicts compatible with TableVersion.insert()."""
pixeltable/exec/sql_node.py
CHANGED

@@ -1,7 +1,7 @@
 import logging
 import warnings
 from decimal import Decimal
-from typing import Iterable, Iterator, NamedTuple, Optional, TYPE_CHECKING, Sequence
+from typing import Iterable, Iterator, NamedTuple, Optional, TYPE_CHECKING, Sequence, AsyncIterator
 from uuid import UUID
 
 import sqlalchemy as sql
@@ -264,7 +264,7 @@ class SqlNode(ExecNode):
         except Exception as e:
             _logger.warning(f'EXPLAIN failed with error: {e}')
 
-    def __iter__(self) -> Iterator[DataRowBatch]:
+    async def __aiter__(self) -> AsyncIterator[DataRowBatch]:
         # run the query; do this here rather than in _open(), exceptions are only expected during iteration
         assert self.ctx.conn is not None
         with warnings.catch_warnings(record=True) as w:
pixeltable/exprs/column_ref.py
CHANGED

@@ -101,7 +101,8 @@ class ColumnRef(Expr):
         # resolve column properties
         if name == ColumnPropertyRef.Property.ERRORTYPE.name.lower() \
                 or name == ColumnPropertyRef.Property.ERRORMSG.name.lower():
-            …
+            property_is_present = self.col.is_stored and (self.col.is_computed or self.col_type.is_media_type())
+            if not property_is_present:
                 raise excs.Error(f'{name} only valid for a stored computed or media column: {self}')
             return ColumnPropertyRef(self, ColumnPropertyRef.Property[name.upper()])
         if name == ColumnPropertyRef.Property.FILEURL.name.lower() \
@@ -239,3 +240,6 @@
         col = cls.get_column(d)
         perform_validation = d['perform_validation']
         return cls(col, perform_validation=perform_validation)
+
+    def is_constant(self) -> bool:
+        return False
pixeltable/exprs/data_row.py
CHANGED

@@ -6,10 +6,10 @@ import urllib.parse
 import urllib.request
 from typing import Any, Optional
 
-import numpy as np
-import pgvector.sqlalchemy  # type: ignore[import-untyped]
 import PIL
 import PIL.Image
+import numpy as np
+import pgvector.sqlalchemy  # type: ignore[import-untyped]
 import sqlalchemy as sql
 
 from pixeltable import env
@@ -34,9 +34,14 @@
     - VideoType: local path if available, otherwise url
     """
 
-    vals: …
-    has_val: …
-    excs: …
+    vals: np.ndarray  # of object
+    has_val: np.ndarray  # of bool
+    excs: np.ndarray  # of object
+
+    # expr evaluation state; indexed by slot idx
+    missing_slots: np.ndarray  # of bool; number of missing dependencies
+    missing_dependents: np.ndarray  # of int16; number of missing dependents
+    is_scheduled: np.ndarray  # of bool; True if this slot is scheduled for evaluation
 
     # control structures that are shared across all DataRows in a batch
     img_slot_idxs: list[int]
@@ -50,32 +55,47 @@
     # - stored url of file for media in vals[i]
    # - None if vals[i] is not media type
     # - not None if file_paths[i] is not None
-    file_urls: …
+    file_urls: np.ndarray  # of str
 
     # file_paths:
     # - local path of media file in vals[i]; points to the file cache if file_urls[i] is remote
     # - None if vals[i] is not a media type or if there is no local file yet for file_urls[i]
-    file_paths: …
+    file_paths: np.ndarray  # of str
 
     def __init__(self, size: int, img_slot_idxs: list[int], media_slot_idxs: list[int], array_slot_idxs: list[int]):
-        self.vals = [None] * size
-        self.has_val = [False] * size
-        self.excs = [None] * size
         self.img_slot_idxs = img_slot_idxs
         self.media_slot_idxs = media_slot_idxs
         self.array_slot_idxs = array_slot_idxs
+        self.init(size)
+
+    def init(self, num_slots: int) -> None:
+        self.vals = np.full(num_slots, None, dtype=object)
+        self.has_val = np.zeros(num_slots, dtype=bool)
+        self.excs = np.full(num_slots, None, dtype=object)
+        self.missing_slots = np.zeros(num_slots, dtype=bool)
+        self.missing_dependents = np.zeros(num_slots, dtype=np.int16)
+        self.is_scheduled = np.zeros(num_slots, dtype=bool)
         self.pk = None
-        self.file_urls = …
-        self.file_paths = …
-
-    def clear(self) -> None:
-        …
+        self.file_urls = np.full(num_slots, None, dtype=object)
+        self.file_paths = np.full(num_slots, None, dtype=object)
+
+    def clear(self, idxs: Optional[np.ndarray] = None) -> None:
+        if idxs is not None:
+            self.has_val[idxs] = False
+            self.vals[idxs] = None
+            self.excs[idxs] = None
+            self.file_urls[idxs] = None
+            self.file_paths[idxs] = None
+        else:
+            self.init(len(self.vals))
+
+    def set_file_path(self, idx: int, path: str) -> None:
+        """Augment an existing url with a local file path"""
+        assert self.has_val[idx]
+        assert idx in self.img_slot_idxs or idx in self.media_slot_idxs
+        self.file_paths[idx] = path
+        if idx in self.media_slot_idxs:
+            self.vals[idx] = path
 
     def copy(self, target: DataRow) -> None:
         """Create a copy of the contents of this DataRow in target
@@ -98,16 +118,18 @@
         """
         if slot_idx is not None:
             return self.excs[slot_idx] is not None
-        return …
+        return (self.excs != None).any()
 
     def get_exc(self, slot_idx: int) -> Optional[Exception]:
-        …
+        exc = self.excs[slot_idx]
+        assert exc is None or isinstance(exc, Exception)
+        return exc
 
     def get_first_exc(self) -> Optional[Exception]:
-        …
-        return …
+        mask = self.excs != None
+        if not mask.any():
+            return None
+        return self.excs[mask][0]
 
     def set_exc(self, slot_idx: int, exc: Exception) -> None:
         assert self.excs[slot_idx] is None
@@ -119,9 +141,6 @@
         self.file_paths[slot_idx] = None
         self.file_urls[slot_idx] = None
 
-    def __len__(self) -> int:
-        return len(self.vals)
-
     def __getitem__(self, index: object) -> Any:
         """Returns in-memory value, ie, what is needed for expr evaluation"""
         assert isinstance(index, int)
@@ -171,11 +190,10 @@
 
         return self.vals[index]
 
-    def __setitem__(self, idx: …
+    def __setitem__(self, idx: int, val: Any) -> None:
         """Assign in-memory cell value
         This allows overwriting
         """
-        assert isinstance(idx, int)
         assert self.excs[idx] is None
 
         if (idx in self.img_slot_idxs or idx in self.media_slot_idxs) and isinstance(val, str):
@@ -207,14 +225,6 @@
             self.vals[idx] = val
             self.has_val[idx] = True
 
-    def set_file_path(self, idx: int, path: str) -> None:
-        """Augment an existing url with a local file path"""
-        assert self.has_val[idx]
-        assert idx in self.img_slot_idxs or idx in self.media_slot_idxs
-        self.file_paths[idx] = path
-        if idx in self.media_slot_idxs:
-            self.vals[idx] = path
-
     def flush_img(self, index: int, filepath: Optional[str] = None) -> None:
         """Discard the in-memory value and save it to a local file, if filepath is not None"""
         if self.vals[index] is None: