pixeltable 0.2.15__py3-none-any.whl → 0.2.17__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/column.py +3 -0
- pixeltable/catalog/dir.py +1 -1
- pixeltable/catalog/globals.py +15 -6
- pixeltable/catalog/insertable_table.py +23 -8
- pixeltable/catalog/named_function.py +1 -1
- pixeltable/catalog/path_dict.py +4 -4
- pixeltable/catalog/schema_object.py +30 -18
- pixeltable/catalog/table.py +84 -99
- pixeltable/catalog/table_version.py +35 -24
- pixeltable/catalog/table_version_path.py +2 -2
- pixeltable/catalog/view.py +15 -8
- pixeltable/dataframe.py +56 -56
- pixeltable/env.py +7 -5
- pixeltable/exec/__init__.py +3 -3
- pixeltable/exec/aggregation_node.py +3 -3
- pixeltable/exec/expr_eval_node.py +3 -3
- pixeltable/exec/in_memory_data_node.py +4 -4
- pixeltable/exec/sql_node.py +4 -1
- pixeltable/exprs/array_slice.py +3 -4
- pixeltable/exprs/column_ref.py +20 -4
- pixeltable/exprs/comparison.py +11 -6
- pixeltable/exprs/data_row.py +3 -0
- pixeltable/exprs/expr.py +51 -23
- pixeltable/exprs/function_call.py +8 -1
- pixeltable/exprs/inline_array.py +2 -2
- pixeltable/exprs/json_path.py +36 -20
- pixeltable/exprs/row_builder.py +4 -4
- pixeltable/exprs/rowid_ref.py +1 -1
- pixeltable/functions/__init__.py +1 -2
- pixeltable/functions/anthropic.py +97 -0
- pixeltable/functions/audio.py +32 -0
- pixeltable/functions/fireworks.py +1 -1
- pixeltable/functions/huggingface.py +4 -4
- pixeltable/functions/image.py +1 -1
- pixeltable/functions/together.py +1 -1
- pixeltable/functions/video.py +5 -1
- pixeltable/functions/vision.py +2 -6
- pixeltable/globals.py +57 -28
- pixeltable/io/external_store.py +4 -4
- pixeltable/io/globals.py +12 -13
- pixeltable/io/label_studio.py +6 -6
- pixeltable/io/pandas.py +27 -12
- pixeltable/io/parquet.py +14 -14
- pixeltable/iterators/document.py +7 -7
- pixeltable/plan.py +58 -29
- pixeltable/store.py +32 -31
- pixeltable/tool/create_test_db_dump.py +12 -6
- pixeltable/type_system.py +89 -97
- pixeltable/utils/pytorch.py +12 -10
- {pixeltable-0.2.15.dist-info → pixeltable-0.2.17.dist-info}/METADATA +10 -10
- {pixeltable-0.2.15.dist-info → pixeltable-0.2.17.dist-info}/RECORD +55 -53
- {pixeltable-0.2.15.dist-info → pixeltable-0.2.17.dist-info}/LICENSE +0 -0
- {pixeltable-0.2.15.dist-info → pixeltable-0.2.17.dist-info}/WHEEL +0 -0
- {pixeltable-0.2.15.dist-info → pixeltable-0.2.17.dist-info}/entry_points.txt +0 -0
pixeltable/dataframe.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
+
import builtins
|
|
3
4
|
import copy
|
|
4
5
|
import hashlib
|
|
5
6
|
import json
|
|
@@ -7,7 +8,7 @@ import logging
|
|
|
7
8
|
import mimetypes
|
|
8
9
|
import traceback
|
|
9
10
|
from pathlib import Path
|
|
10
|
-
from typing import
|
|
11
|
+
from typing import TYPE_CHECKING, Any, Callable, Dict, Hashable, Iterator, List, Optional, Set, Tuple
|
|
11
12
|
|
|
12
13
|
import pandas as pd
|
|
13
14
|
import pandas.io.formats.style
|
|
@@ -16,6 +17,7 @@ import sqlalchemy as sql
|
|
|
16
17
|
import pixeltable.catalog as catalog
|
|
17
18
|
import pixeltable.exceptions as excs
|
|
18
19
|
import pixeltable.exprs as exprs
|
|
20
|
+
from pixeltable import exec
|
|
19
21
|
from pixeltable.catalog import is_valid_identifier
|
|
20
22
|
from pixeltable.catalog.globals import UpdateStatus
|
|
21
23
|
from pixeltable.env import Env
|
|
@@ -24,6 +26,9 @@ from pixeltable.type_system import ColumnType
|
|
|
24
26
|
from pixeltable.utils.formatter import Formatter
|
|
25
27
|
from pixeltable.utils.http_server import get_file_uri
|
|
26
28
|
|
|
29
|
+
if TYPE_CHECKING:
|
|
30
|
+
import torch
|
|
31
|
+
|
|
27
32
|
__all__ = ['DataFrame']
|
|
28
33
|
|
|
29
34
|
_logger = logging.getLogger('pixeltable')
|
|
@@ -38,27 +43,25 @@ def _create_source_tag(file_path: str) -> str:
|
|
|
38
43
|
|
|
39
44
|
|
|
40
45
|
class DataFrameResultSet:
|
|
41
|
-
def __init__(self, rows:
|
|
46
|
+
def __init__(self, rows: list[list[Any]], schema: dict[str, ColumnType]):
|
|
42
47
|
self._rows = rows
|
|
43
|
-
self._col_names =
|
|
44
|
-
self.
|
|
48
|
+
self._col_names = list(schema.keys())
|
|
49
|
+
self.__schema = schema
|
|
45
50
|
self.__formatter = Formatter(len(self._rows), len(self._col_names), Env.get().http_address)
|
|
46
51
|
|
|
52
|
+
@property
|
|
53
|
+
def schema(self) -> dict[str, ColumnType]:
|
|
54
|
+
return self.__schema
|
|
55
|
+
|
|
47
56
|
def __len__(self) -> int:
|
|
48
57
|
return len(self._rows)
|
|
49
58
|
|
|
50
|
-
def column_names(self) -> List[str]:
|
|
51
|
-
return self._col_names
|
|
52
|
-
|
|
53
|
-
def column_types(self) -> List[ColumnType]:
|
|
54
|
-
return self._col_types
|
|
55
|
-
|
|
56
59
|
def __repr__(self) -> str:
|
|
57
60
|
return self.to_pandas().__repr__()
|
|
58
61
|
|
|
59
62
|
def _repr_html_(self) -> str:
|
|
60
|
-
formatters: dict[
|
|
61
|
-
for col_name, col_type in
|
|
63
|
+
formatters: dict[Hashable, Callable[[object], str]] = {}
|
|
64
|
+
for col_name, col_type in self.schema.items():
|
|
62
65
|
formatter = self.__formatter.get_pandas_formatter(col_type)
|
|
63
66
|
if formatter is not None:
|
|
64
67
|
formatters[col_name] = formatter
|
|
@@ -169,8 +172,9 @@ class DataFrame:
|
|
|
169
172
|
DataFrame._select_list_check_rep(list(zip(select_list_exprs, column_names)))
|
|
170
173
|
# check select list after expansion to catch early
|
|
171
174
|
# the following two lists are always non empty, even if select list is None.
|
|
175
|
+
assert len(column_names) == len(select_list_exprs)
|
|
172
176
|
self._select_list_exprs = select_list_exprs
|
|
173
|
-
self.
|
|
177
|
+
self._schema = {column_names[i]: select_list_exprs[i].col_type for i in range(len(column_names))}
|
|
174
178
|
self.select_list = select_list
|
|
175
179
|
|
|
176
180
|
self.where_clause = copy.deepcopy(where_clause)
|
|
@@ -202,22 +206,20 @@ class DataFrame:
|
|
|
202
206
|
def _normalize_select_list(
|
|
203
207
|
cls,
|
|
204
208
|
tbl: catalog.TableVersionPath,
|
|
205
|
-
select_list: Optional[
|
|
206
|
-
) ->
|
|
209
|
+
select_list: Optional[list[tuple[exprs.Expr, Optional[str]]]],
|
|
210
|
+
) -> tuple[list[exprs.Expr], list[str]]:
|
|
207
211
|
"""
|
|
208
212
|
Expand select list information with all columns and their names
|
|
209
213
|
Returns:
|
|
210
214
|
a pair composed of the list of expressions and the list of corresponding names
|
|
211
215
|
"""
|
|
212
216
|
if select_list is None:
|
|
213
|
-
|
|
214
|
-
else:
|
|
215
|
-
expanded_list = select_list
|
|
217
|
+
select_list = [(exprs.ColumnRef(col), None) for col in tbl.columns()]
|
|
216
218
|
|
|
217
|
-
out_exprs:
|
|
218
|
-
out_names:
|
|
219
|
+
out_exprs: list[exprs.Expr] = []
|
|
220
|
+
out_names: list[str] = [] # keep track of order
|
|
219
221
|
seen_out_names: set[str] = set() # use to check for duplicates in loop, avoid square complexity
|
|
220
|
-
for i, (expr, name) in enumerate(
|
|
222
|
+
for i, (expr, name) in enumerate(select_list):
|
|
221
223
|
if name is None:
|
|
222
224
|
# use default, add suffix if needed so default adds no duplicates
|
|
223
225
|
default_name = expr.default_column_name()
|
|
@@ -275,6 +277,24 @@ class DataFrame:
|
|
|
275
277
|
"""Run the query and return rows as a generator.
|
|
276
278
|
This function must not modify the state of the DataFrame, otherwise it breaks dataset caching.
|
|
277
279
|
"""
|
|
280
|
+
plan = self._create_query_plan()
|
|
281
|
+
|
|
282
|
+
def exec_plan(conn: sql.engine.Connection) -> Iterator[exec.DataRowBatch]:
|
|
283
|
+
plan.ctx.set_conn(conn)
|
|
284
|
+
plan.open()
|
|
285
|
+
try:
|
|
286
|
+
for row_batch in plan:
|
|
287
|
+
yield from row_batch
|
|
288
|
+
finally:
|
|
289
|
+
plan.close()
|
|
290
|
+
|
|
291
|
+
if conn is None:
|
|
292
|
+
with Env.get().engine.begin() as conn:
|
|
293
|
+
yield from exec_plan(conn)
|
|
294
|
+
else:
|
|
295
|
+
yield from exec_plan(conn)
|
|
296
|
+
|
|
297
|
+
def _create_query_plan(self) -> exec.ExecNode:
|
|
278
298
|
# construct a group-by clause if we're grouping by a table
|
|
279
299
|
group_by_clause: List[exprs.Expr] = []
|
|
280
300
|
if self.grouping_tbl is not None:
|
|
@@ -289,7 +309,7 @@ class DataFrame:
|
|
|
289
309
|
for item in self._select_list_exprs:
|
|
290
310
|
item.bind_rel_paths(None)
|
|
291
311
|
|
|
292
|
-
|
|
312
|
+
return Planner.create_query_plan(
|
|
293
313
|
self.tbl,
|
|
294
314
|
self._select_list_exprs,
|
|
295
315
|
where_clause=self.where_clause,
|
|
@@ -298,21 +318,6 @@ class DataFrame:
|
|
|
298
318
|
limit=self.limit_val if self.limit_val is not None else 0,
|
|
299
319
|
) # limit_val == 0: no limit_val
|
|
300
320
|
|
|
301
|
-
def exec_plan(conn: sql.engine.Connection) -> Iterator[exprs.DataRow]:
|
|
302
|
-
plan.ctx.set_conn(conn)
|
|
303
|
-
plan.open()
|
|
304
|
-
try:
|
|
305
|
-
for row_batch in plan:
|
|
306
|
-
for data_row in row_batch:
|
|
307
|
-
yield data_row
|
|
308
|
-
finally:
|
|
309
|
-
plan.close()
|
|
310
|
-
|
|
311
|
-
if conn is None:
|
|
312
|
-
with Env.get().engine.begin() as conn:
|
|
313
|
-
yield from exec_plan(conn)
|
|
314
|
-
else:
|
|
315
|
-
yield from exec_plan(conn)
|
|
316
321
|
|
|
317
322
|
def show(self, n: int = 20) -> DataFrameResultSet:
|
|
318
323
|
assert n is not None
|
|
@@ -334,11 +339,9 @@ class DataFrame:
|
|
|
334
339
|
result._reverse()
|
|
335
340
|
return result
|
|
336
341
|
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
def get_column_types(self) -> List[ColumnType]:
|
|
341
|
-
return [expr.col_type for expr in self._select_list_exprs]
|
|
342
|
+
@property
|
|
343
|
+
def schema(self) -> dict[str, ColumnType]:
|
|
344
|
+
return self._schema
|
|
342
345
|
|
|
343
346
|
def bind(self, args: dict[str, Any]) -> DataFrame:
|
|
344
347
|
"""Bind arguments to parameters and return a new DataFrame."""
|
|
@@ -369,7 +372,7 @@ class DataFrame:
|
|
|
369
372
|
if order_by_exprs is not None:
|
|
370
373
|
exprs.Expr.list_substitute(order_by_exprs, var_exprs)
|
|
371
374
|
|
|
372
|
-
select_list = list(zip(select_list_exprs, self.
|
|
375
|
+
select_list = list(zip(select_list_exprs, self.schema.keys()))
|
|
373
376
|
order_by_clause: Optional[list[tuple[exprs.Expr, bool]]] = None
|
|
374
377
|
if order_by_exprs is not None:
|
|
375
378
|
order_by_clause = [
|
|
@@ -409,8 +412,7 @@ class DataFrame:
|
|
|
409
412
|
except sql.exc.DBAPIError as e:
|
|
410
413
|
raise excs.Error(f'Error during SQL execution:\n{e}')
|
|
411
414
|
|
|
412
|
-
|
|
413
|
-
return DataFrameResultSet(result_rows, self._column_names, col_types)
|
|
415
|
+
return DataFrameResultSet(result_rows, self.schema)
|
|
414
416
|
|
|
415
417
|
def count(self) -> int:
|
|
416
418
|
from pixeltable.plan import Planner
|
|
@@ -429,7 +431,7 @@ class DataFrame:
|
|
|
429
431
|
assert len(self.select_list) > 0
|
|
430
432
|
heading_vals.append('Select')
|
|
431
433
|
heading_vals.extend([''] * (len(self.select_list) - 1))
|
|
432
|
-
info_vals.extend(self.
|
|
434
|
+
info_vals.extend(self.schema.keys())
|
|
433
435
|
if self.where_clause is not None:
|
|
434
436
|
heading_vals.append('Where')
|
|
435
437
|
info_vals.append(self.where_clause.display_str(inline=False))
|
|
@@ -457,7 +459,7 @@ class DataFrame:
|
|
|
457
459
|
# white-space: pre-wrap: print \n as newline
|
|
458
460
|
# th: center-align headings
|
|
459
461
|
return (
|
|
460
|
-
pd_df.style.set_properties(**{'white-space': 'pre-wrap', 'text-align': 'left'})
|
|
462
|
+
pd_df.style.set_properties(None, **{'white-space': 'pre-wrap', 'text-align': 'left'})
|
|
461
463
|
.set_table_styles([dict(selector='th', props=[('text-align', 'center')])])
|
|
462
464
|
.hide(axis='index')
|
|
463
465
|
.hide(axis='columns')
|
|
@@ -469,19 +471,17 @@ class DataFrame:
|
|
|
469
471
|
The description has two columns, heading and info, which list the contents of each 'component'
|
|
470
472
|
(select list, where clause, ...) vertically.
|
|
471
473
|
"""
|
|
472
|
-
|
|
473
|
-
__IPYTHON__
|
|
474
|
+
if getattr(builtins, '__IPYTHON__', False):
|
|
474
475
|
from IPython.display import display
|
|
475
|
-
|
|
476
476
|
display(self._description_html())
|
|
477
|
-
|
|
477
|
+
else:
|
|
478
478
|
print(self.__repr__())
|
|
479
479
|
|
|
480
480
|
def __repr__(self) -> str:
|
|
481
481
|
return self._description().to_string(header=False, index=False)
|
|
482
482
|
|
|
483
483
|
def _repr_html_(self) -> str:
|
|
484
|
-
return self._description_html()._repr_html_()
|
|
484
|
+
return self._description_html()._repr_html_() # type: ignore[attr-defined]
|
|
485
485
|
|
|
486
486
|
def select(self, *items: Any, **named_items: Any) -> DataFrame:
|
|
487
487
|
if self.select_list is not None:
|
|
@@ -562,7 +562,7 @@ class DataFrame:
|
|
|
562
562
|
# we need to make sure that the grouping table is a base of self.tbl
|
|
563
563
|
base = self.tbl.find_tbl_version(item._tbl_version_path.tbl_id())
|
|
564
564
|
if base is None or base.id == self.tbl.tbl_id():
|
|
565
|
-
raise excs.Error(f'group_by(): {item.
|
|
565
|
+
raise excs.Error(f'group_by(): {item._name} is not a base table of {self.tbl.tbl_name()}')
|
|
566
566
|
grouping_tbl = item._tbl_version_path.tbl_version
|
|
567
567
|
break
|
|
568
568
|
if not isinstance(item, exprs.Expr):
|
|
@@ -756,12 +756,12 @@ class DataFrame:
|
|
|
756
756
|
Env.get().require_package('torch')
|
|
757
757
|
Env.get().require_package('torchvision')
|
|
758
758
|
|
|
759
|
-
from pixeltable.io.parquet import save_parquet
|
|
760
|
-
from pixeltable.utils.pytorch import PixeltablePytorchDataset
|
|
759
|
+
from pixeltable.io.parquet import save_parquet
|
|
760
|
+
from pixeltable.utils.pytorch import PixeltablePytorchDataset
|
|
761
761
|
|
|
762
762
|
cache_key = self._hash_result_set()
|
|
763
763
|
|
|
764
|
-
dest_path = (Env.get().dataset_cache_dir / f'df_{cache_key}').with_suffix('.parquet')
|
|
764
|
+
dest_path = (Env.get().dataset_cache_dir / f'df_{cache_key}').with_suffix('.parquet')
|
|
765
765
|
if dest_path.exists(): # fast path: use cache
|
|
766
766
|
assert dest_path.is_dir()
|
|
767
767
|
else:
|
pixeltable/env.py
CHANGED
|
@@ -268,7 +268,7 @@ class Env:
|
|
|
268
268
|
|
|
269
269
|
# in pixeltable_pgserver.get_server(): cleanup_mode=None will leave db on for debugging purposes
|
|
270
270
|
self._db_server = pixeltable_pgserver.get_server(self._pgdata_dir, cleanup_mode=None)
|
|
271
|
-
self._db_url = self._db_server.get_uri(database=self._db_name)
|
|
271
|
+
self._db_url = self._db_server.get_uri(database=self._db_name, driver='psycopg')
|
|
272
272
|
|
|
273
273
|
if reinit_db:
|
|
274
274
|
if self._store_db_exists():
|
|
@@ -297,7 +297,7 @@ class Env:
|
|
|
297
297
|
def _store_db_exists(self) -> bool:
|
|
298
298
|
assert self._db_name is not None
|
|
299
299
|
# don't try to connect to self.db_name, it may not exist
|
|
300
|
-
db_url = self._db_server.get_uri(database='postgres')
|
|
300
|
+
db_url = self._db_server.get_uri(database='postgres', driver='psycopg')
|
|
301
301
|
engine = sql.create_engine(db_url, future=True)
|
|
302
302
|
try:
|
|
303
303
|
with engine.begin() as conn:
|
|
@@ -312,7 +312,7 @@ class Env:
|
|
|
312
312
|
def _create_store_db(self) -> None:
|
|
313
313
|
assert self._db_name is not None
|
|
314
314
|
# create the db
|
|
315
|
-
pg_db_url = self._db_server.get_uri(database='postgres')
|
|
315
|
+
pg_db_url = self._db_server.get_uri(database='postgres', driver='psycopg')
|
|
316
316
|
engine = sql.create_engine(pg_db_url, future=True, isolation_level='AUTOCOMMIT')
|
|
317
317
|
preparer = engine.dialect.identifier_preparer
|
|
318
318
|
try:
|
|
@@ -327,7 +327,7 @@ class Env:
|
|
|
327
327
|
engine.dispose()
|
|
328
328
|
|
|
329
329
|
# enable pgvector
|
|
330
|
-
store_db_url = self._db_server.get_uri(database=self._db_name)
|
|
330
|
+
store_db_url = self._db_server.get_uri(database=self._db_name, driver='psycopg')
|
|
331
331
|
engine = sql.create_engine(store_db_url, future=True, isolation_level='AUTOCOMMIT')
|
|
332
332
|
try:
|
|
333
333
|
with engine.begin() as conn:
|
|
@@ -337,7 +337,7 @@ class Env:
|
|
|
337
337
|
|
|
338
338
|
def _drop_store_db(self) -> None:
|
|
339
339
|
assert self._db_name is not None
|
|
340
|
-
db_url = self._db_server.get_uri(database='postgres')
|
|
340
|
+
db_url = self._db_server.get_uri(database='postgres', driver='psycopg')
|
|
341
341
|
engine = sql.create_engine(db_url, future=True, isolation_level='AUTOCOMMIT')
|
|
342
342
|
preparer = engine.dialect.identifier_preparer
|
|
343
343
|
try:
|
|
@@ -425,6 +425,7 @@ class Env:
|
|
|
425
425
|
else:
|
|
426
426
|
self._installed_packages[package] = None
|
|
427
427
|
|
|
428
|
+
check('toml')
|
|
428
429
|
check('datasets')
|
|
429
430
|
check('torch')
|
|
430
431
|
check('torchvision')
|
|
@@ -443,6 +444,7 @@ class Env:
|
|
|
443
444
|
self._spacy_nlp = spacy.load('en_core_web_sm')
|
|
444
445
|
check('tiktoken')
|
|
445
446
|
check('openai')
|
|
447
|
+
check('anthropic')
|
|
446
448
|
check('together')
|
|
447
449
|
check('fireworks')
|
|
448
450
|
check('label_studio_sdk')
|
pixeltable/exec/__init__.py
CHANGED
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
from .aggregation_node import AggregationNode
|
|
2
2
|
from .cache_prefetch_node import CachePrefetchNode
|
|
3
3
|
from .component_iteration_node import ComponentIterationNode
|
|
4
|
+
from .data_row_batch import DataRowBatch
|
|
4
5
|
from .exec_context import ExecContext
|
|
5
6
|
from .exec_node import ExecNode
|
|
6
7
|
from .expr_eval_node import ExprEvalNode
|
|
7
8
|
from .in_memory_data_node import InMemoryDataNode
|
|
8
|
-
from .sql_node import SqlScanNode, SqlLookupNode
|
|
9
|
-
from .row_update_node import RowUpdateNode
|
|
10
9
|
from .media_validation_node import MediaValidationNode
|
|
11
|
-
from .
|
|
10
|
+
from .row_update_node import RowUpdateNode
|
|
11
|
+
from .sql_node import SqlLookupNode, SqlScanNode
|
|
@@ -2,7 +2,7 @@ from __future__ import annotations
|
|
|
2
2
|
|
|
3
3
|
import logging
|
|
4
4
|
import sys
|
|
5
|
-
from typing import List, Optional, Any
|
|
5
|
+
from typing import Iterable, List, Optional, Any
|
|
6
6
|
|
|
7
7
|
import pixeltable.catalog as catalog
|
|
8
8
|
import pixeltable.exceptions as excs
|
|
@@ -15,12 +15,12 @@ _logger = logging.getLogger('pixeltable')
|
|
|
15
15
|
class AggregationNode(ExecNode):
|
|
16
16
|
def __init__(
|
|
17
17
|
self, tbl: catalog.TableVersion, row_builder: exprs.RowBuilder, group_by: List[exprs.Expr],
|
|
18
|
-
agg_fn_calls: List[exprs.FunctionCall], input_exprs:
|
|
18
|
+
agg_fn_calls: List[exprs.FunctionCall], input_exprs: Iterable[exprs.Expr], input: ExecNode
|
|
19
19
|
):
|
|
20
20
|
super().__init__(row_builder, group_by + agg_fn_calls, input_exprs, input)
|
|
21
21
|
self.input = input
|
|
22
22
|
self.group_by = group_by
|
|
23
|
-
self.input_exprs = input_exprs
|
|
23
|
+
self.input_exprs = list(input_exprs)
|
|
24
24
|
self.agg_fn_calls = agg_fn_calls
|
|
25
25
|
self.agg_fn_eval_ctx = row_builder.create_eval_ctx(agg_fn_calls, exclude=input_exprs)
|
|
26
26
|
self.output_batch = DataRowBatch(tbl, row_builder, 0)
|
|
@@ -3,7 +3,7 @@ import sys
|
|
|
3
3
|
import time
|
|
4
4
|
import warnings
|
|
5
5
|
from dataclasses import dataclass
|
|
6
|
-
from typing import List, Optional
|
|
6
|
+
from typing import Iterable, List, Optional
|
|
7
7
|
|
|
8
8
|
from tqdm import tqdm, TqdmWarning
|
|
9
9
|
|
|
@@ -23,12 +23,12 @@ class ExprEvalNode(ExecNode):
|
|
|
23
23
|
"""List of exprs that form an evaluation context and contain calls to at most one external function"""
|
|
24
24
|
exprs: List[exprs.Expr]
|
|
25
25
|
batched_fn: Optional[CallableFunction]
|
|
26
|
-
segment_ctxs: List[exprs.RowBuilder.EvalCtx]
|
|
26
|
+
segment_ctxs: List['exprs.RowBuilder.EvalCtx']
|
|
27
27
|
target_slot_idxs: List[int]
|
|
28
28
|
batch_size: int = 8
|
|
29
29
|
|
|
30
30
|
def __init__(
|
|
31
|
-
self, row_builder: exprs.RowBuilder, output_exprs:
|
|
31
|
+
self, row_builder: exprs.RowBuilder, output_exprs: Iterable[exprs.Expr], input_exprs: Iterable[exprs.Expr],
|
|
32
32
|
input: ExecNode
|
|
33
33
|
):
|
|
34
34
|
super().__init__(row_builder, output_exprs, input_exprs, input)
|
|
@@ -1,9 +1,10 @@
|
|
|
1
1
|
import logging
|
|
2
|
-
from typing import
|
|
2
|
+
from typing import Any, Optional
|
|
3
3
|
|
|
4
4
|
import pixeltable.catalog as catalog
|
|
5
5
|
import pixeltable.exprs as exprs
|
|
6
6
|
from pixeltable.utils.media_store import MediaStore
|
|
7
|
+
|
|
7
8
|
from .data_row_batch import DataRowBatch
|
|
8
9
|
from .exec_node import ExecNode
|
|
9
10
|
|
|
@@ -18,8 +19,8 @@ class InMemoryDataNode(ExecNode):
|
|
|
18
19
|
- if an input row doesn't provide a value, sets the slot to the column default
|
|
19
20
|
"""
|
|
20
21
|
def __init__(
|
|
21
|
-
|
|
22
|
-
|
|
22
|
+
self, tbl: catalog.TableVersion, rows: list[dict[str, Any]],
|
|
23
|
+
row_builder: exprs.RowBuilder, start_row_id: int,
|
|
23
24
|
):
|
|
24
25
|
# we materialize all output slots
|
|
25
26
|
output_exprs = [e for e in row_builder.get_output_exprs() if isinstance(e, exprs.ColumnRef)]
|
|
@@ -75,4 +76,3 @@ class InMemoryDataNode(ExecNode):
|
|
|
75
76
|
self.has_returned_data = True
|
|
76
77
|
_logger.debug(f'InMemoryDataNode: created row batch with {len(self.output_rows)} output_rows')
|
|
77
78
|
return self.output_rows
|
|
78
|
-
|
pixeltable/exec/sql_node.py
CHANGED
|
@@ -258,6 +258,10 @@ class SqlLookupNode(SqlNode):
|
|
|
258
258
|
"""
|
|
259
259
|
Materializes data from the store via a Select stmt with a WHERE clause that matches a list of key values
|
|
260
260
|
"""
|
|
261
|
+
|
|
262
|
+
stmt: sql.Select
|
|
263
|
+
where_clause: sql.ColumnElement[bool]
|
|
264
|
+
|
|
261
265
|
def __init__(
|
|
262
266
|
self, tbl: catalog.TableVersionPath, row_builder: exprs.RowBuilder,
|
|
263
267
|
select_list: Iterable[exprs.Expr], sa_key_cols: list[sql.Column], key_vals: list[tuple],
|
|
@@ -287,4 +291,3 @@ class SqlLookupNode(SqlNode):
|
|
|
287
291
|
_logger.debug(f'SqlLookupNode stmt:\n{stmt_str}')
|
|
288
292
|
except Exception as e:
|
|
289
293
|
pass
|
|
290
|
-
|
pixeltable/exprs/array_slice.py
CHANGED
|
@@ -1,14 +1,13 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
|
-
|
|
3
|
-
import
|
|
2
|
+
|
|
3
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
4
4
|
|
|
5
5
|
import sqlalchemy as sql
|
|
6
6
|
|
|
7
|
+
from .data_row import DataRow
|
|
7
8
|
from .expr import Expr
|
|
8
9
|
from .globals import print_slice
|
|
9
|
-
from .data_row import DataRow
|
|
10
10
|
from .row_builder import RowBuilder
|
|
11
|
-
import pixeltable.catalog as catalog
|
|
12
11
|
|
|
13
12
|
|
|
14
13
|
class ArraySlice(Expr):
|
pixeltable/exprs/column_ref.py
CHANGED
|
@@ -19,19 +19,29 @@ class ColumnRef(Expr):
|
|
|
19
19
|
For that reason, a ColumnRef needs to be serialized with the qualifying table id (column ids are only
|
|
20
20
|
unique in the context of a particular table).
|
|
21
21
|
"""
|
|
22
|
+
|
|
23
|
+
col: catalog.Column
|
|
24
|
+
is_unstored_iter_col: bool
|
|
25
|
+
iter_arg_ctx: Optional[RowBuilder.EvalCtx]
|
|
26
|
+
base_rowid_len: int
|
|
27
|
+
base_rowid: list[Optional[Any]]
|
|
28
|
+
iterator: Optional[iters.ComponentIterator]
|
|
29
|
+
pos_idx: Optional[int]
|
|
30
|
+
id: int
|
|
31
|
+
|
|
22
32
|
def __init__(self, col: catalog.Column):
|
|
23
33
|
super().__init__(col.col_type)
|
|
24
34
|
assert col.tbl is not None
|
|
25
35
|
self.col = col
|
|
26
36
|
self.is_unstored_iter_col = \
|
|
27
37
|
col.tbl.is_component_view() and col.tbl.is_iterator_column(col) and not col.is_stored
|
|
28
|
-
self.iter_arg_ctx
|
|
38
|
+
self.iter_arg_ctx = None
|
|
29
39
|
# number of rowid columns in the base table
|
|
30
40
|
self.base_rowid_len = col.tbl.base.num_rowid_columns() if self.is_unstored_iter_col else 0
|
|
31
41
|
self.base_rowid = [None] * self.base_rowid_len
|
|
32
|
-
self.iterator
|
|
42
|
+
self.iterator = None
|
|
33
43
|
# index of the position column in the view's primary key; don't try to reference tbl.store_tbl here
|
|
34
|
-
self.pos_idx
|
|
44
|
+
self.pos_idx = col.tbl.num_rowid_columns() - 1 if self.is_unstored_iter_col else None
|
|
35
45
|
self.id = self._create_id()
|
|
36
46
|
|
|
37
47
|
def set_iter_arg_ctx(self, iter_arg_ctx: RowBuilder.EvalCtx) -> None:
|
|
@@ -74,7 +84,13 @@ class ColumnRef(Expr):
|
|
|
74
84
|
return self.col == other.col
|
|
75
85
|
|
|
76
86
|
def __str__(self) -> str:
|
|
77
|
-
|
|
87
|
+
if self.col.name is None:
|
|
88
|
+
return f'<unnamed column {self.col.id}>'
|
|
89
|
+
else:
|
|
90
|
+
return self.col.name
|
|
91
|
+
|
|
92
|
+
def __repr__(self) -> str:
|
|
93
|
+
return f'ColumnRef({self.col!r})'
|
|
78
94
|
|
|
79
95
|
def sql_expr(self) -> Optional[sql.ClauseElement]:
|
|
80
96
|
return self.col.sa_col
|
pixeltable/exprs/comparison.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
+
from datetime import datetime
|
|
3
4
|
from typing import Optional, List, Any, Dict
|
|
4
5
|
|
|
5
6
|
import sqlalchemy as sql
|
|
@@ -78,6 +79,7 @@ class Comparison(Expr):
|
|
|
78
79
|
right = self._op2.sql_expr()
|
|
79
80
|
if left is None or right is None:
|
|
80
81
|
return None
|
|
82
|
+
|
|
81
83
|
if self.operator == ComparisonOperator.LT:
|
|
82
84
|
return left < right
|
|
83
85
|
if self.operator == ComparisonOperator.LE:
|
|
@@ -92,18 +94,21 @@ class Comparison(Expr):
|
|
|
92
94
|
return left >= right
|
|
93
95
|
|
|
94
96
|
def eval(self, data_row: DataRow, row_builder: RowBuilder) -> None:
|
|
97
|
+
left = data_row[self._op1.slot_idx]
|
|
98
|
+
right = data_row[self._op2.slot_idx]
|
|
99
|
+
|
|
95
100
|
if self.operator == ComparisonOperator.LT:
|
|
96
|
-
data_row[self.slot_idx] =
|
|
101
|
+
data_row[self.slot_idx] = left < right
|
|
97
102
|
elif self.operator == ComparisonOperator.LE:
|
|
98
|
-
data_row[self.slot_idx] =
|
|
103
|
+
data_row[self.slot_idx] = left <= right
|
|
99
104
|
elif self.operator == ComparisonOperator.EQ:
|
|
100
|
-
data_row[self.slot_idx] =
|
|
105
|
+
data_row[self.slot_idx] = left == right
|
|
101
106
|
elif self.operator == ComparisonOperator.NE:
|
|
102
|
-
data_row[self.slot_idx] =
|
|
107
|
+
data_row[self.slot_idx] = left != right
|
|
103
108
|
elif self.operator == ComparisonOperator.GT:
|
|
104
|
-
data_row[self.slot_idx] =
|
|
109
|
+
data_row[self.slot_idx] = left > right
|
|
105
110
|
elif self.operator == ComparisonOperator.GE:
|
|
106
|
-
data_row[self.slot_idx] =
|
|
111
|
+
data_row[self.slot_idx] = left >= right
|
|
107
112
|
|
|
108
113
|
def _as_dict(self) -> Dict:
|
|
109
114
|
return {'operator': self.operator.value, **super()._as_dict()}
|
pixeltable/exprs/data_row.py
CHANGED
|
@@ -96,6 +96,9 @@ class DataRow:
|
|
|
96
96
|
self.file_paths[slot_idx] = None
|
|
97
97
|
self.file_urls[slot_idx] = None
|
|
98
98
|
|
|
99
|
+
def __len__(self) -> int:
|
|
100
|
+
return len(self.vals)
|
|
101
|
+
|
|
99
102
|
def __getitem__(self, index: object) -> Any:
|
|
100
103
|
"""Returns in-memory value, ie, what is needed for expr evaluation"""
|
|
101
104
|
if not self.has_val[index]:
|