pixeltable 0.3.0__py3-none-any.whl → 0.3.1__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release.
Files changed (47)
  1. pixeltable/__version__.py +2 -2
  2. pixeltable/catalog/table_version.py +2 -1
  3. pixeltable/dataframe.py +52 -27
  4. pixeltable/env.py +92 -4
  5. pixeltable/exec/__init__.py +1 -1
  6. pixeltable/exec/aggregation_node.py +3 -3
  7. pixeltable/exec/cache_prefetch_node.py +13 -7
  8. pixeltable/exec/component_iteration_node.py +3 -9
  9. pixeltable/exec/data_row_batch.py +17 -5
  10. pixeltable/exec/exec_node.py +32 -12
  11. pixeltable/exec/expr_eval/__init__.py +1 -0
  12. pixeltable/exec/expr_eval/evaluators.py +245 -0
  13. pixeltable/exec/expr_eval/expr_eval_node.py +404 -0
  14. pixeltable/exec/expr_eval/globals.py +114 -0
  15. pixeltable/exec/expr_eval/row_buffer.py +76 -0
  16. pixeltable/exec/expr_eval/schedulers.py +232 -0
  17. pixeltable/exec/in_memory_data_node.py +2 -2
  18. pixeltable/exec/row_update_node.py +14 -14
  19. pixeltable/exec/sql_node.py +2 -2
  20. pixeltable/exprs/column_ref.py +5 -1
  21. pixeltable/exprs/data_row.py +50 -40
  22. pixeltable/exprs/expr.py +57 -12
  23. pixeltable/exprs/function_call.py +54 -19
  24. pixeltable/exprs/inline_expr.py +12 -21
  25. pixeltable/exprs/literal.py +25 -8
  26. pixeltable/exprs/row_builder.py +23 -0
  27. pixeltable/func/aggregate_function.py +4 -0
  28. pixeltable/func/callable_function.py +54 -4
  29. pixeltable/func/expr_template_function.py +5 -1
  30. pixeltable/func/function.py +48 -7
  31. pixeltable/func/query_template_function.py +16 -7
  32. pixeltable/func/udf.py +7 -1
  33. pixeltable/functions/__init__.py +1 -1
  34. pixeltable/functions/anthropic.py +95 -21
  35. pixeltable/functions/gemini.py +2 -6
  36. pixeltable/functions/openai.py +207 -28
  37. pixeltable/globals.py +1 -1
  38. pixeltable/plan.py +24 -9
  39. pixeltable/store.py +6 -0
  40. pixeltable/type_system.py +3 -3
  41. pixeltable/utils/arrow.py +3 -3
  42. {pixeltable-0.3.0.dist-info → pixeltable-0.3.1.dist-info}/METADATA +3 -1
  43. {pixeltable-0.3.0.dist-info → pixeltable-0.3.1.dist-info}/RECORD +46 -41
  44. pixeltable/exec/expr_eval_node.py +0 -232
  45. {pixeltable-0.3.0.dist-info → pixeltable-0.3.1.dist-info}/LICENSE +0 -0
  46. {pixeltable-0.3.0.dist-info → pixeltable-0.3.1.dist-info}/WHEEL +0 -0
  47. {pixeltable-0.3.0.dist-info → pixeltable-0.3.1.dist-info}/entry_points.txt +0 -0
pixeltable/exec/expr_eval/globals.py (new file)
@@ -0,0 +1,114 @@
+ import abc
+ import asyncio
+ from dataclasses import dataclass
+ from types import TracebackType
+ from typing import Any, Protocol, Optional
+
+ from pixeltable import exprs
+ from pixeltable import func
+
+
+ @dataclass
+ class FnCallArgs:
+     """Container for everything needed to execute a FunctionCall against one or more DataRows"""
+     fn_call: exprs.FunctionCall
+     rows: list[exprs.DataRow]
+     # single call
+     args: Optional[list[Any]] = None
+     kwargs: Optional[dict[str, Any]] = None
+     # batch call
+     batch_args: Optional[list[list[Optional[Any]]]] = None
+     batch_kwargs: Optional[dict[str, list[Optional[Any]]]] = None
+
+     @property
+     def pxt_fn(self) -> func.CallableFunction:
+         assert isinstance(self.fn_call.fn, func.CallableFunction)
+         return self.fn_call.fn
+
+     @property
+     def is_batched(self) -> bool:
+         return self.batch_args is not None
+
+     @property
+     def row(self) -> exprs.DataRow:
+         assert len(self.rows) == 1
+         return self.rows[0]
+
+
+ class Scheduler(abc.ABC):
+     """
+     Base class for schedulers. A scheduler executes FunctionCalls against a limited resource pool.
+
+     Expected behavior:
+     - all created tasks must be recorded in dispatcher.tasks
+     - schedulers are responsible for aborting execution when a) the task is cancelled or b) an exception occurred
+       elsewhere (indicated by dispatcher.exc_event)
+     """
+     @abc.abstractmethod
+     def submit(self, item: FnCallArgs) -> None:
+         pass
+
+     @classmethod
+     @abc.abstractmethod
+     def matches(cls, resource_pool: str) -> bool:
+         """Returns True if the scheduler can handle the given resource pool"""
+         pass
+
+
+ class Dispatcher(Protocol):
+     """
+     Row dispatcher used by Evaluators/Schedulers for post-processing after slot materialization and for task management.
+
+     Task management: all tasks need to be recorded in tasks and have done_cb registered with add_done_callback().
+     Exceptions: evaluators/schedulers need to check exc_event prior to starting long-running (non-interruptible)
+     computations.
+     """
+     tasks: set[asyncio.Task]
+     row_builder: exprs.RowBuilder
+     exc_event: asyncio.Event
+     schedulers: dict[str, Scheduler]  # key: resource pool id
+
+     def dispatch(self, rows: list[exprs.DataRow]) -> None:
+         """Dispatches row slots to the appropriate schedulers; does not block"""
+         ...
+
+     def dispatch_exc(self, rows: list[exprs.DataRow], slot_with_exc: int, exc_tb: TracebackType) -> None:
+         """Propagates the exception in slot_with_exc to all dependent slots and dispatches the rest; does not block"""
+         ...
+
+     def done_cb(self, f: asyncio.Task) -> None:
+         """Callback for task completion; does not block"""
+         ...
+
+
+ class Evaluator(abc.ABC):
+     """
+     Base class for expression evaluators. Each DataRow slot is assigned an evaluator, which is responsible for the
+     execution of the expression evaluation logic as well as the scheduling/task breakdown of that execution.
+
+     Expected behavior:
+     - all created tasks must be recorded in dispatcher.tasks
+     - evaluators are responsible for aborting execution when a) the task is cancelled or b) an exception occurred
+       elsewhere (indicated by dispatcher.exc_event)
+     """
+     dispatcher: Dispatcher
+     is_closed: bool
+
+     def __init__(self, dispatcher: Dispatcher):
+         self.dispatcher = dispatcher
+         self.is_closed = False
+
+     @abc.abstractmethod
+     def schedule(self, rows: list[exprs.DataRow], slot_idx: int) -> None:
+         """Create tasks to evaluate the expression in the given slot for the given rows; must not block."""
+         ...
+
+     def _close(self) -> None:
+         """Close the evaluator; must not block"""
+         pass
+
+     def close(self) -> None:
+         """Indicates that there may not be any more rows getting scheduled"""
+         self.is_closed = True
+         self._close()
+
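Note: the Scheduler/Dispatcher contracts above are easiest to see in a toy implementation. The sketch below is not part of the release; SerialScheduler and its 'serial:' pool prefix are hypothetical, and it assumes non-batched calls with args/kwargs populated and a running event loop. It illustrates the two obligations the docstrings spell out: record every created task in dispatcher.tasks with done_cb attached, and check exc_event before starting work.

    import asyncio

    from pixeltable.exec.expr_eval.globals import Dispatcher, FnCallArgs, Scheduler


    class SerialScheduler(Scheduler):
        """Hypothetical scheduler: one task per non-batched FunctionCall, no rate limiting."""

        def __init__(self, dispatcher: Dispatcher):
            self.dispatcher = dispatcher

        @classmethod
        def matches(cls, resource_pool: str) -> bool:
            # claim pools with a made-up 'serial:' prefix
            return resource_pool.startswith('serial:')

        def submit(self, item: FnCallArgs) -> None:
            # contract: every created task is recorded in dispatcher.tasks
            # and gets done_cb as its completion callback
            task = asyncio.create_task(self._run(item))
            self.dispatcher.tasks.add(task)
            task.add_done_callback(self.dispatcher.done_cb)

        async def _run(self, item: FnCallArgs) -> None:
            # contract: abort if an exception occurred elsewhere
            if self.dispatcher.exc_event.is_set():
                return
            result = await item.pxt_fn.aexec(*item.args, **item.kwargs)
            item.row[item.fn_call.slot_idx] = result
            self.dispatcher.dispatch(item.rows)  # hand completed rows back for post-processing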
pixeltable/exec/expr_eval/row_buffer.py (new file)
@@ -0,0 +1,76 @@
+ from __future__ import annotations
+
+ import logging
+ from typing import Optional
+
+ import numpy as np
+
+ from pixeltable import exprs
+
+ _logger = logging.getLogger('pixeltable')
+
+
+ class RowBuffer:
+     """Fixed-length circular buffer of DataRows; knows how to maintain input order"""
+
+     size: int
+     row_pos_map: Optional[dict[int, int]]  # id(row) -> position of row in output; None if not maintaining order
+     num_rows: int  # number of rows in the buffer
+     num_ready: int  # number of consecutive non-None rows at head
+     buffer: np.ndarray  # of object
+     head_idx: int  # index of beginning of the buffer
+     head_pos: int  # row position of the beginning of the buffer
+
+     def __init__(self, size: int):
+         self.size = size
+         self.row_pos_map = None
+         self.num_rows = 0
+         self.num_ready = 0
+         self.buffer = np.full(size, None, dtype=object)
+         self.head_pos = 0
+         self.head_idx = 0
+
+     def set_row_pos_map(self, row_pos_map: dict[int, int]) -> None:
+         self.row_pos_map = row_pos_map
+
+     def add_row(self, row: exprs.DataRow) -> None:
+         offset: int  # of new row from head
+         if self.row_pos_map is not None:
+             pos = self.row_pos_map.get(id(row))
+             assert pos is not None and (pos - self.head_pos < self.size), f'{pos} {self.head_pos} {self.size}'
+             offset = pos - self.head_pos
+         else:
+             offset = self.num_rows
+         idx = (self.head_idx + offset) % self.size
+         assert self.buffer[idx] is None
+
+         self.buffer[idx] = row
+         self.num_rows += 1
+         if self.row_pos_map is not None:
+             if offset == self.num_ready:
+                 # we have new ready rows; find out how many
+                 while offset < self.size and self.buffer[(self.head_idx + offset) % self.size] is not None:
+                     offset += 1
+                 self.num_ready = offset
+         else:
+             self.num_ready += 1
+
+     def get_rows(self, n: int) -> list[exprs.DataRow]:
+         """Get up to n ready rows from head"""
+         n = min(n, self.num_ready)
+         if n == 0:
+             return []
+         rows: list[exprs.DataRow]
+         if self.head_idx + n <= self.size:
+             rows = self.buffer[self.head_idx:self.head_idx + n].tolist()
+             self.buffer[self.head_idx:self.head_idx + n] = None
+         else:
+             rows = np.concatenate([self.buffer[self.head_idx:], self.buffer[:self.head_idx + n - self.size]]).tolist()
+             self.buffer[self.head_idx:] = None
+             self.buffer[:self.head_idx + n - self.size] = None
+         self.head_pos += n
+         self.head_idx = (self.head_idx + n) % self.size
+         self.num_rows -= n
+         self.num_ready -= n
+         return rows
+
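A quick usage sketch (not from the package) makes the ordering behavior concrete. It uses strings as stand-ins for DataRows, which works because the buffer only keys on id(row): rows that arrive out of order are parked until the head position fills in, at which point num_ready advances past every contiguous row.

    from pixeltable.exec.expr_eval.row_buffer import RowBuffer

    rows = ['r0', 'r1', 'r2', 'r3']  # stand-ins for DataRows
    buf = RowBuffer(size=4)
    buf.set_row_pos_map({id(r): pos for pos, r in enumerate(rows)})

    buf.add_row(rows[2])              # out of order: parked, not ready
    assert buf.get_rows(4) == []
    buf.add_row(rows[0])              # head arrives: position 0 becomes ready
    assert buf.get_rows(4) == ['r0']
    buf.add_row(rows[1])              # fills the gap: 'r1' and the parked 'r2' become ready
    assert buf.get_rows(4) == ['r1', 'r2']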
pixeltable/exec/expr_eval/schedulers.py (new file)
@@ -0,0 +1,232 @@
+ from __future__ import annotations
+
+ import asyncio
+ import datetime
+ import inspect
+ import logging
+ import sys
+ from dataclasses import dataclass
+ from typing import Optional, Awaitable, Collection
+
+ from pixeltable import env
+ from pixeltable import func
+ from .globals import Scheduler, FnCallArgs, Dispatcher
+
+ _logger = logging.getLogger('pixeltable')
+
+
+ class RateLimitsScheduler(Scheduler):
+     """
+     Scheduler for FunctionCalls with a RateLimitsInfo pool, which provides information about actual resource usage.
+
+     Scheduling strategy:
+     - try to stay below resource limits by utilizing reported RateLimitInfo.remaining
+     - also take into account the estimated resource usage for in-flight requests
+       (obtained via RateLimitsInfo.get_request_resources())
+     - issue synchronous requests when we don't have a RateLimitsInfo yet or when we depleted a resource and need to
+       wait for a reset
+
+     TODO:
+     - limit the number of in-flight requests based on the open file limit
+     """
+     @dataclass(frozen=True)
+     class QueueItem:
+         request: FnCallArgs
+         num_retries: int
+
+         def __lt__(self, other: RateLimitsScheduler.QueueItem) -> bool:
+             # prioritize by number of retries
+             return self.num_retries > other.num_retries
+
+     resource_pool: str
+     queue: asyncio.PriorityQueue[QueueItem]  # prioritizes retries
+     loop_task: asyncio.Task
+     dispatcher: Dispatcher
+     get_request_resources_param_names: list[str]  # names of parameters of RateLimitsInfo.get_request_resources()
+
+     # scheduling-related state
+     pool_info: Optional[env.RateLimitsInfo]
+     est_usage: dict[str, int]  # value per resource; accumulated estimates since the last util. report
+
+     num_in_flight: int  # unfinished tasks
+     request_completed: asyncio.Event
+
+     total_requests: int
+     total_retried: int
+
+     TIME_FORMAT = '%H:%M.%S %f'
+     MAX_RETRIES = 10
+
+     def __init__(self, resource_pool: str, dispatcher: Dispatcher):
+         self.resource_pool = resource_pool
+         self.queue = asyncio.PriorityQueue()
+         self.dispatcher = dispatcher
+         self.loop_task = asyncio.create_task(self._main_loop())
+         self.dispatcher.tasks.add(self.loop_task)
+         self.loop_task.add_done_callback(self.dispatcher.done_cb)
+         self.pool_info = None  # initialized in _main_loop by the first request
+         self.est_usage = {}
+         self.num_in_flight = 0
+         self.request_completed = asyncio.Event()
+         self.total_requests = 0
+         self.total_retried = 0
+         self.get_request_resources_param_names = []
+
+     @classmethod
+     def matches(cls, resource_pool: str) -> bool:
+         return resource_pool.startswith('rate-limits:')
+
+     def submit(self, item: FnCallArgs) -> None:
+         self.queue.put_nowait(self.QueueItem(item, 0))
+
+     async def _main_loop(self) -> None:
+         item: Optional[RateLimitsScheduler.QueueItem] = None
+         while True:
+             if item is None:
+                 item = await self.queue.get()
+                 if item.num_retries > 0:
+                     self.total_retried += 1
+
+             now = datetime.datetime.now(tz=datetime.timezone.utc)
+             if self.pool_info is None or not self.pool_info.is_initialized():
+                 # wait for a single request to get rate limits
+                 _logger.debug(f'initializing rate limits for {self.resource_pool}')
+                 await self._exec(item.request, item.num_retries, is_task=False)
+                 item = None
+                 # if this was the first request, it created the pool_info
+                 if self.pool_info is None:
+                     self.pool_info = env.Env.get().get_resource_pool_info(self.resource_pool, None)
+                     if self.pool_info is None:
+                         # we still don't have rate limits, wait for the next request
+                         continue
+                     assert isinstance(self.pool_info, env.RateLimitsInfo)
+                     assert hasattr(self.pool_info, 'get_request_resources')
+                     sig = inspect.signature(self.pool_info.get_request_resources)
+                     self.get_request_resources_param_names = [p.name for p in sig.parameters.values()]
+                     self.est_usage = {r: 0 for r in self._resources}
+                 continue
+
+             # check rate limits
+             request_resources = self._get_request_resources(item.request)
+             limits_info = self._check_resource_limits(request_resources)
+             aws: list[Awaitable[None]] = []
+             completed_aw: Optional[asyncio.Task] = None
+             wait_for_reset: Optional[asyncio.Task] = None
+             if limits_info is not None:
+                 # limits_info's resource is depleted, wait for capacity to free up
+
+                 if self.num_in_flight > 0:
+                     # a completed request can free up capacity
+                     self.request_completed.clear()
+                     completed_aw = asyncio.create_task(self.request_completed.wait())
+                     aws.append(completed_aw)
+                     _logger.debug(f'waiting for completed request for {self.resource_pool}')
+
+                 reset_at = limits_info.reset_at
+                 if reset_at > now:
+                     # we're waiting for the rate limit to reset
+                     wait_for_reset = asyncio.create_task(asyncio.sleep((reset_at - now).total_seconds()))
+                     aws.append(wait_for_reset)
+                     _logger.debug(f'waiting for rate limit reset for {self.resource_pool}')
+
+             if len(aws) > 0:
+                 # we have something to wait for
+                 done, pending = await asyncio.wait(aws, return_when=asyncio.FIRST_COMPLETED)
+                 for task in pending:
+                     task.cancel()
+                 if completed_aw in done:
+                     _logger.debug(f'wait(): completed request for {self.resource_pool}')
+                 if wait_for_reset in done:
+                     _logger.debug(f'wait(): rate limit reset for {self.resource_pool}')
+                     # force waiting for another rate limit report before making any scheduling decisions
+                     self.pool_info.reset()
+                 # re-evaluate current capacity for current item
+                 continue
+
+             # we have a new in-flight request
+             for resource, val in request_resources.items():
+                 self.est_usage[resource] += val
+             _logger.debug(f'creating task for {self.resource_pool}')
+             self.num_in_flight += 1
+             task = asyncio.create_task(self._exec(item.request, item.num_retries, is_task=True))
+             self.dispatcher.tasks.add(task)
+             task.add_done_callback(self.dispatcher.done_cb)
+             item = None
+
+     @property
+     def _resources(self) -> Collection[str]:
+         return self.pool_info.resource_limits.keys() if self.pool_info is not None else []
+
+     def _get_request_resources(self, request: FnCallArgs) -> dict[str, int]:
+         kwargs_batch = request.fn_call.get_param_values(self.get_request_resources_param_names, request.rows)
+         if not request.is_batched:
+             return self.pool_info.get_request_resources(**kwargs_batch[0])
+         else:
+             batch_kwargs = {k: [d[k] for d in kwargs_batch] for k in kwargs_batch[0]}
+             constant_kwargs, batch_kwargs = request.pxt_fn.create_batch_kwargs(batch_kwargs)
+             return self.pool_info.get_request_resources(**constant_kwargs, **batch_kwargs)
+
+
+     def _check_resource_limits(self, request_resources: dict[str, int]) -> Optional[env.RateLimitInfo]:
+         """Returns the most depleted resource, relative to its limit, or None if all resources are within limits"""
+         candidates: list[tuple[env.RateLimitInfo, float]] = []  # (info, relative usage)
+         for resource, usage in request_resources.items():
+             # 0.05: leave some headroom, we don't have perfect information
+             info = self.pool_info.resource_limits[resource]
+             est_remaining = info.remaining - self.est_usage[resource] - usage
+             if est_remaining < 0.05 * info.limit:
+                 candidates.append((info, est_remaining / info.limit))
+         if len(candidates) == 0:
+             return None
+         return min(candidates, key=lambda x: x[1])[0]
+
+     async def _exec(self, request: FnCallArgs, num_retries: int, is_task: bool) -> None:
+         assert all(not row.has_val[request.fn_call.slot_idx] for row in request.rows)
+         assert all(not row.has_exc(request.fn_call.slot_idx) for row in request.rows)
+
+         try:
+             start_ts = datetime.datetime.now(tz=datetime.timezone.utc)
+             pxt_fn = request.fn_call.fn
+             assert isinstance(pxt_fn, func.CallableFunction)
+             _logger.debug(f'scheduler {self.resource_pool}: start evaluating slot {request.fn_call.slot_idx}, batch_size={len(request.rows)}')
+             self.total_requests += 1
+             if request.is_batched:
+                 batch_result = await pxt_fn.aexec_batch(*request.batch_args, **request.batch_kwargs)
+                 assert len(batch_result) == len(request.rows)
+                 for row, result in zip(request.rows, batch_result):
+                     row[request.fn_call.slot_idx] = result
+             else:
+                 result = await pxt_fn.aexec(*request.args, **request.kwargs)
+                 request.row[request.fn_call.slot_idx] = result
+             end_ts = datetime.datetime.now(tz=datetime.timezone.utc)
+             _logger.debug(f'scheduler {self.resource_pool}: evaluated slot {request.fn_call.slot_idx} in {end_ts - start_ts}, batch_size={len(request.rows)}')
+
+             # purge accumulated usage estimate, now that we have a new report
+             self.est_usage = {r: 0 for r in self._resources}
+
+             self.dispatcher.dispatch(request.rows)
+         except Exception as exc:
+             if num_retries < self.MAX_RETRIES and self.pool_info is not None:
+                 retry_delay = self.pool_info.get_retry_delay(exc)
+                 if retry_delay is not None:
+                     self.total_retried += 1
+                     await asyncio.sleep(retry_delay)
+                     self.queue.put_nowait(self.QueueItem(request, num_retries + 1))
+                     return
+             # TODO: update resource limits reported in exc.response.headers, if present
+
+             # record the exception
+             _, _, exc_tb = sys.exc_info()
+             for row in request.rows:
+                 row.set_exc(request.fn_call.slot_idx, exc)
+             self.dispatcher.dispatch_exc(request.rows, request.fn_call.slot_idx, exc_tb)
+         finally:
+             _logger.debug(
+                 f'Scheduler stats: #requests={self.total_requests}, #retried={self.total_retried}')
+             if is_task:
+                 self.num_in_flight -= 1
+                 self.request_completed.set()
+
+
+ # all concrete Scheduler subclasses that implement matches()
+ SCHEDULERS = [RateLimitsScheduler]
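The core of _check_resource_limits is a headroom rule: a resource counts as depleted once the estimated remaining capacity, after subtracting in-flight estimates and the candidate request, would fall below 5% of its limit. A worked example with made-up numbers:

    limit = 100_000        # e.g. a tokens-per-minute limit reported by the provider
    remaining = 12_000     # last reported RateLimitInfo.remaining
    est_in_flight = 6_500  # self.est_usage[resource]: estimates for requests already issued
    request = 1_000        # estimate for the candidate request

    est_remaining = remaining - est_in_flight - request  # 4_500
    is_depleted = est_remaining < 0.05 * limit           # 4_500 < 5_000 -> True
    # the scheduler then waits for a completed request or the rate-limit reset
    # instead of issuing the call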
pixeltable/exec/in_memory_data_node.py
@@ -1,5 +1,5 @@
  import logging
- from typing import Any, Iterator, Optional
+ from typing import Any, Iterator, Optional, AsyncIterator
 
  import pixeltable.catalog as catalog
  import pixeltable.exprs as exprs
@@ -76,6 +76,6 @@ class InMemoryDataNode(ExecNode):
 
          self.ctx.num_rows = len(self.output_rows)
 
-     def __iter__(self) -> Iterator[DataRowBatch]:
+     async def __aiter__(self) -> AsyncIterator[DataRowBatch]:
          _logger.debug(f'InMemoryDataNode: created row batch with {len(self.output_rows)} output_rows')
          yield self.output_rows
pixeltable/exec/row_update_node.py
@@ -1,5 +1,5 @@
  import logging
- from typing import Any
+ from typing import Any, AsyncIterator
 
  import pixeltable.catalog as catalog
  import pixeltable.exprs as exprs
@@ -34,19 +34,19 @@ class RowUpdateNode(ExecNode):
          self.key_slot_idxs = {col: all_col_slot_idxs[col] for col in tbl.tbl_version.primary_key_columns()}
          self.matched_key_vals: set[tuple] = set()
 
-     def __next__(self) -> DataRowBatch:
-         batch = next(self.input)
-         for row in batch:
-             key_vals = row.rowid if self.is_rowid_key else \
-                 tuple(row[slot_idx] for slot_idx in self.key_slot_idxs.values())
-             if key_vals not in self.updates:
-                 continue
-             self.matched_key_vals.add(key_vals)
-             col_vals = self.updates[key_vals]
-             for col, val in col_vals.items():
-                 slot_idx = self.col_slot_idxs[col]
-                 row[slot_idx] = val
-         return batch
+     async def __aiter__(self) -> AsyncIterator[DataRowBatch]:
+         async for batch in self.input:
+             for row in batch:
+                 key_vals = row.rowid if self.is_rowid_key else \
+                     tuple(row[slot_idx] for slot_idx in self.key_slot_idxs.values())
+                 if key_vals not in self.updates:
+                     continue
+                 self.matched_key_vals.add(key_vals)
+                 col_vals = self.updates[key_vals]
+                 for col, val in col_vals.items():
+                     slot_idx = self.col_slot_idxs[col]
+                     row[slot_idx] = val
+             yield batch
 
      def unmatched_rows(self) -> list[dict[str, Any]]:
          """Return rows that didn't get used in the updates as a list of dicts compatible with TableVersion.insert()."""
pixeltable/exec/sql_node.py
@@ -1,7 +1,7 @@
  import logging
  import warnings
  from decimal import Decimal
- from typing import Iterable, Iterator, NamedTuple, Optional, TYPE_CHECKING, Sequence
+ from typing import Iterable, Iterator, NamedTuple, Optional, TYPE_CHECKING, Sequence, AsyncIterator
  from uuid import UUID
 
  import sqlalchemy as sql
@@ -264,7 +264,7 @@ class SqlNode(ExecNode):
          except Exception as e:
              _logger.warning(f'EXPLAIN failed with error: {e}')
 
-     def __iter__(self) -> Iterator[DataRowBatch]:
+     async def __aiter__(self) -> AsyncIterator[DataRowBatch]:
          # run the query; do this here rather than in _open(), exceptions are only expected during iteration
          assert self.ctx.conn is not None
          with warnings.catch_warnings(record=True) as w:
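The three hunks above are the same mechanical migration: ExecNode subclasses stop implementing the synchronous iterator protocol (__iter__/__next__) and become async generators via async def __aiter__, which lets the new expr_eval machinery interleave node execution with scheduler tasks on a single event loop. A sketch of the consumption pattern this implies (drain and plan_root are hypothetical names, not part of the diff; it assumes DataRowBatch supports len()):

    import asyncio

    async def drain(node) -> int:
        """Consume any ExecNode that defines `async def __aiter__`."""
        num_rows = 0
        async for batch in node:   # replaces the old `for batch in node`
            num_rows += len(batch)
        return num_rows

    # asyncio.run(drain(plan_root))  # plan_root: root ExecNode of a query plan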
pixeltable/exprs/column_ref.py
@@ -101,7 +101,8 @@ class ColumnRef(Expr):
          # resolve column properties
          if name == ColumnPropertyRef.Property.ERRORTYPE.name.lower() \
                  or name == ColumnPropertyRef.Property.ERRORMSG.name.lower():
-             if not (self.col.is_computed and self.col.is_stored) and not self.col.col_type.is_media_type():
+             property_is_present = self.col.is_stored and (self.col.is_computed or self.col_type.is_media_type())
+             if not property_is_present:
                  raise excs.Error(f'{name} only valid for a stored computed or media column: {self}')
              return ColumnPropertyRef(self, ColumnPropertyRef.Property[name.upper()])
          if name == ColumnPropertyRef.Property.FILEURL.name.lower() \
@@ -239,3 +240,6 @@
          col = cls.get_column(d)
          perform_validation = d['perform_validation']
          return cls(col, perform_validation=perform_validation)
+
+     def is_constant(self) -> bool:
+         return False
pixeltable/exprs/data_row.py
@@ -6,10 +6,10 @@ import urllib.parse
  import urllib.request
  from typing import Any, Optional
 
- import numpy as np
- import pgvector.sqlalchemy  # type: ignore[import-untyped]
  import PIL
  import PIL.Image
+ import numpy as np
+ import pgvector.sqlalchemy  # type: ignore[import-untyped]
  import sqlalchemy as sql
 
  from pixeltable import env
@@ -34,9 +34,14 @@ class DataRow:
      - VideoType: local path if available, otherwise url
      """
 
-     vals: list[Any]
-     has_val: list[bool]
-     excs: list[Optional[Exception]]
+     vals: np.ndarray  # of object
+     has_val: np.ndarray  # of bool
+     excs: np.ndarray  # of object
+
+     # expr evaluation state; indexed by slot idx
+     missing_slots: np.ndarray  # of bool
+     missing_dependents: np.ndarray  # of int16; number of missing dependents
+     is_scheduled: np.ndarray  # of bool; True if this slot is scheduled for evaluation
 
      # control structures that are shared across all DataRows in a batch
      img_slot_idxs: list[int]
@@ -50,32 +55,47 @@ class DataRow:
      # - stored url of file for media in vals[i]
      # - None if vals[i] is not media type
      # - not None if file_paths[i] is not None
-     file_urls: list[Optional[str]]
+     file_urls: np.ndarray  # of str
 
      # file_paths:
      # - local path of media file in vals[i]; points to the file cache if file_urls[i] is remote
      # - None if vals[i] is not a media type or if there is no local file yet for file_urls[i]
-     file_paths: list[Optional[str]]
+     file_paths: np.ndarray  # of str
 
      def __init__(self, size: int, img_slot_idxs: list[int], media_slot_idxs: list[int], array_slot_idxs: list[int]):
-         self.vals = [None] * size
-         self.has_val = [False] * size
-         self.excs = [None] * size
          self.img_slot_idxs = img_slot_idxs
         self.media_slot_idxs = media_slot_idxs
          self.array_slot_idxs = array_slot_idxs
+         self.init(size)
+
+     def init(self, num_slots: int) -> None:
+         self.vals = np.full(num_slots, None, dtype=object)
+         self.has_val = np.zeros(num_slots, dtype=bool)
+         self.excs = np.full(num_slots, None, dtype=object)
+         self.missing_slots = np.zeros(num_slots, dtype=bool)
+         self.missing_dependents = np.zeros(num_slots, dtype=np.int16)
+         self.is_scheduled = np.zeros(num_slots, dtype=bool)
          self.pk = None
-         self.file_urls = [None] * size
-         self.file_paths = [None] * size
-
-     def clear(self) -> None:
-         size = len(self.vals)
-         self.vals = [None] * size
-         self.has_val = [False] * size
-         self.excs = [None] * size
-         self.pk = None
-         self.file_urls = [None] * size
-         self.file_paths = [None] * size
+         self.file_urls = np.full(num_slots, None, dtype=object)
+         self.file_paths = np.full(num_slots, None, dtype=object)
+
+     def clear(self, idxs: Optional[np.ndarray] = None) -> None:
+         if idxs is not None:
+             self.has_val[idxs] = False
+             self.vals[idxs] = None
+             self.excs[idxs] = None
+             self.file_urls[idxs] = None
+             self.file_paths[idxs] = None
+         else:
+             self.init(len(self.vals))
+
+     def set_file_path(self, idx: int, path: str) -> None:
+         """Augment an existing url with a local file path"""
+         assert self.has_val[idx]
+         assert idx in self.img_slot_idxs or idx in self.media_slot_idxs
+         self.file_paths[idx] = path
+         if idx in self.media_slot_idxs:
+             self.vals[idx] = path
 
      def copy(self, target: DataRow) -> None:
          """Create a copy of the contents of this DataRow in target
@@ -98,16 +118,18 @@
          """
          if slot_idx is not None:
              return self.excs[slot_idx] is not None
-         return any(exc is not None for exc in self.excs)
+         return (self.excs != None).any()
 
      def get_exc(self, slot_idx: int) -> Optional[Exception]:
-         return self.excs[slot_idx]
+         exc = self.excs[slot_idx]
+         assert exc is None or isinstance(exc, Exception)
+         return exc
 
      def get_first_exc(self) -> Optional[Exception]:
-         for exc in self.excs:
-             if exc is not None:
-                 return exc
-         return None
+         mask = self.excs != None
+         if not mask.any():
+             return None
+         return self.excs[mask][0]
 
      def set_exc(self, slot_idx: int, exc: Exception) -> None:
          assert self.excs[slot_idx] is None
@@ -119,9 +141,6 @@
          self.file_paths[slot_idx] = None
          self.file_urls[slot_idx] = None
 
-     def __len__(self) -> int:
-         return len(self.vals)
-
      def __getitem__(self, index: object) -> Any:
          """Returns in-memory value, ie, what is needed for expr evaluation"""
          assert isinstance(index, int)
@@ -171,11 +190,10 @@
 
          return self.vals[index]
 
-     def __setitem__(self, idx: object, val: Any) -> None:
+     def __setitem__(self, idx: int, val: Any) -> None:
          """Assign in-memory cell value
          This allows overwriting
          """
-         assert isinstance(idx, int)
          assert self.excs[idx] is None
 
          if (idx in self.img_slot_idxs or idx in self.media_slot_idxs) and isinstance(val, str):
@@ -207,14 +225,6 @@
          self.vals[idx] = val
          self.has_val[idx] = True
 
-     def set_file_path(self, idx: int, path: str) -> None:
-         """Augment an existing url with a local file path"""
-         assert self.has_val[idx]
-         assert idx in self.img_slot_idxs or idx in self.media_slot_idxs
-         self.file_paths[idx] = path
-         if idx in self.media_slot_idxs:
-             self.vals[idx] = path
-
      def flush_img(self, index: int, filepath: Optional[str] = None) -> None:
          """Discard the in-memory value and save it to a local file, if filepath is not None"""
          if self.vals[index] is None:
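Taken together, the data_row.py changes swap DataRow's per-slot Python lists for numpy arrays, which is what makes the new selective clear(idxs) and the vectorized exception scans ((self.excs != None).any()) cheap. A standalone sketch of the idiom (illustrative slot layout, not package code):

    import numpy as np

    num_slots = 6
    vals = np.full(num_slots, None, dtype=object)
    has_val = np.zeros(num_slots, dtype=bool)
    excs = np.full(num_slots, None, dtype=object)

    vals[1] = 'computed'
    has_val[1] = True
    excs[4] = ValueError('bad slot')

    # vectorized equivalents of has_exc() / get_first_exc():
    mask = excs != None  # noqa: E711  elementwise comparison on an object array
    assert mask.any()
    assert isinstance(excs[mask][0], ValueError)

    # selective reset, as in the new DataRow.clear(idxs):
    idxs = np.array([1, 4])
    vals[idxs] = None
    has_val[idxs] = False
    excs[idxs] = None
    assert not has_val.any() and not (excs != None).any()  # noqa: E711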