pixeltable 0.2.19__py3-none-any.whl → 0.2.21__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__init__.py +7 -19
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/__init__.py +7 -7
- pixeltable/catalog/globals.py +3 -0
- pixeltable/catalog/insertable_table.py +9 -7
- pixeltable/catalog/table.py +220 -143
- pixeltable/catalog/table_version.py +36 -18
- pixeltable/catalog/table_version_path.py +0 -8
- pixeltable/catalog/view.py +3 -3
- pixeltable/dataframe.py +9 -24
- pixeltable/env.py +107 -36
- pixeltable/exceptions.py +7 -4
- pixeltable/exec/__init__.py +1 -1
- pixeltable/exec/aggregation_node.py +22 -15
- pixeltable/exec/component_iteration_node.py +62 -41
- pixeltable/exec/data_row_batch.py +7 -7
- pixeltable/exec/exec_node.py +35 -7
- pixeltable/exec/expr_eval_node.py +2 -1
- pixeltable/exec/in_memory_data_node.py +9 -9
- pixeltable/exec/sql_node.py +265 -136
- pixeltable/exprs/__init__.py +1 -0
- pixeltable/exprs/data_row.py +30 -19
- pixeltable/exprs/expr.py +15 -14
- pixeltable/exprs/expr_dict.py +55 -0
- pixeltable/exprs/expr_set.py +21 -15
- pixeltable/exprs/function_call.py +21 -8
- pixeltable/exprs/json_path.py +3 -6
- pixeltable/exprs/rowid_ref.py +2 -2
- pixeltable/exprs/sql_element_cache.py +5 -1
- pixeltable/ext/functions/whisperx.py +7 -2
- pixeltable/func/callable_function.py +2 -2
- pixeltable/func/function_registry.py +6 -7
- pixeltable/func/query_template_function.py +11 -12
- pixeltable/func/signature.py +17 -15
- pixeltable/func/udf.py +0 -4
- pixeltable/functions/__init__.py +1 -1
- pixeltable/functions/audio.py +4 -6
- pixeltable/functions/globals.py +86 -42
- pixeltable/functions/huggingface.py +12 -14
- pixeltable/functions/image.py +59 -45
- pixeltable/functions/json.py +0 -1
- pixeltable/functions/mistralai.py +2 -2
- pixeltable/functions/openai.py +22 -25
- pixeltable/functions/string.py +50 -50
- pixeltable/functions/timestamp.py +20 -20
- pixeltable/functions/together.py +26 -12
- pixeltable/functions/video.py +11 -20
- pixeltable/functions/whisper.py +2 -20
- pixeltable/globals.py +57 -56
- pixeltable/index/base.py +2 -2
- pixeltable/index/btree.py +7 -7
- pixeltable/index/embedding_index.py +8 -10
- pixeltable/io/external_store.py +11 -5
- pixeltable/io/globals.py +3 -1
- pixeltable/io/hf_datasets.py +4 -4
- pixeltable/io/label_studio.py +6 -6
- pixeltable/io/parquet.py +14 -13
- pixeltable/iterators/document.py +10 -8
- pixeltable/iterators/video.py +10 -1
- pixeltable/metadata/__init__.py +3 -2
- pixeltable/metadata/converters/convert_14.py +4 -2
- pixeltable/metadata/converters/convert_15.py +1 -1
- pixeltable/metadata/converters/convert_19.py +1 -0
- pixeltable/metadata/converters/convert_20.py +1 -1
- pixeltable/metadata/converters/util.py +9 -8
- pixeltable/metadata/schema.py +32 -21
- pixeltable/plan.py +136 -154
- pixeltable/store.py +51 -36
- pixeltable/tool/create_test_db_dump.py +7 -7
- pixeltable/tool/doc_plugins/griffe.py +3 -34
- pixeltable/tool/mypy_plugin.py +32 -0
- pixeltable/type_system.py +243 -60
- pixeltable/utils/arrow.py +10 -9
- pixeltable/utils/coco.py +4 -4
- pixeltable/utils/documents.py +1 -1
- pixeltable/utils/filecache.py +131 -84
- pixeltable/utils/formatter.py +1 -1
- pixeltable/utils/http_server.py +2 -5
- pixeltable/utils/media_store.py +6 -6
- pixeltable/utils/pytorch.py +10 -11
- pixeltable/utils/sql.py +2 -1
- {pixeltable-0.2.19.dist-info → pixeltable-0.2.21.dist-info}/METADATA +16 -7
- pixeltable-0.2.21.dist-info/RECORD +148 -0
- pixeltable/utils/help.py +0 -11
- pixeltable-0.2.19.dist-info/RECORD +0 -147
- {pixeltable-0.2.19.dist-info → pixeltable-0.2.21.dist-info}/LICENSE +0 -0
- {pixeltable-0.2.19.dist-info → pixeltable-0.2.21.dist-info}/WHEEL +0 -0
- {pixeltable-0.2.19.dist-info → pixeltable-0.2.21.dist-info}/entry_points.txt +0 -0
|
@@ -1,10 +1,12 @@
|
|
|
1
|
-
|
|
1
|
+
import inspect
|
|
2
|
+
from typing import Iterator, Optional
|
|
2
3
|
|
|
3
|
-
from .data_row_batch import DataRowBatch
|
|
4
|
-
from .exec_node import ExecNode
|
|
5
4
|
import pixeltable.catalog as catalog
|
|
6
|
-
import pixeltable.exprs as exprs
|
|
7
5
|
import pixeltable.exceptions as excs
|
|
6
|
+
import pixeltable.exprs as exprs
|
|
7
|
+
|
|
8
|
+
from .data_row_batch import DataRowBatch
|
|
9
|
+
from .exec_node import ExecNode
|
|
8
10
|
|
|
9
11
|
|
|
10
12
|
class ComponentIterationNode(ExecNode):
|
|
@@ -12,7 +14,7 @@ class ComponentIterationNode(ExecNode):
|
|
|
12
14
|
|
|
13
15
|
Returns row batches of OUTPUT_BATCH_SIZE size.
|
|
14
16
|
"""
|
|
15
|
-
|
|
17
|
+
__OUTPUT_BATCH_SIZE = 1024
|
|
16
18
|
|
|
17
19
|
def __init__(self, view: catalog.TableVersion, input: ExecNode):
|
|
18
20
|
assert view.is_component_view()
|
|
@@ -23,57 +25,76 @@ class ComponentIterationNode(ExecNode):
|
|
|
23
25
|
self.iterator_args = iterator_args[0]
|
|
24
26
|
assert isinstance(self.iterator_args, exprs.InlineDict)
|
|
25
27
|
self.iterator_args_ctx = self.row_builder.create_eval_ctx([self.iterator_args])
|
|
26
|
-
self.iterator_output_schema, self.unstored_column_names =
|
|
28
|
+
self.iterator_output_schema, self.unstored_column_names = (
|
|
27
29
|
self.view.iterator_cls.output_schema(**self.iterator_args.to_kwargs())
|
|
30
|
+
)
|
|
28
31
|
self.iterator_output_fields = list(self.iterator_output_schema.keys())
|
|
29
|
-
self.iterator_output_cols =
|
|
30
|
-
|
|
32
|
+
self.iterator_output_cols = {
|
|
33
|
+
field_name: self.view.cols_by_name[field_name] for field_name in self.iterator_output_fields
|
|
34
|
+
}
|
|
31
35
|
# referenced iterator output fields
|
|
32
36
|
self.refd_output_slot_idxs = {
|
|
33
37
|
e.col.name: e.slot_idx for e in self.row_builder.unique_exprs
|
|
34
38
|
if isinstance(e, exprs.ColumnRef) and e.col.name in self.iterator_output_fields
|
|
35
39
|
}
|
|
36
|
-
self.
|
|
40
|
+
self.__output: Optional[Iterator[DataRowBatch]] = None
|
|
37
41
|
|
|
38
|
-
def
|
|
42
|
+
def __output_batches(self) -> Iterator[DataRowBatch]:
|
|
39
43
|
output_batch = DataRowBatch(self.view, self.row_builder)
|
|
40
44
|
for input_batch in self.input:
|
|
41
45
|
for input_row in input_batch:
|
|
42
46
|
self.row_builder.eval(input_row, self.iterator_args_ctx)
|
|
43
47
|
iterator_args = input_row[self.iterator_args.slot_idx]
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
# we can ignore this
|
|
59
|
-
continue
|
|
60
|
-
output_col = self.iterator_output_cols[field_name]
|
|
61
|
-
output_col.col_type.validate_literal(field_val)
|
|
62
|
-
output_row[self.refd_output_slot_idxs[field_name]] = field_val
|
|
63
|
-
if len(component_dict) != len(self.iterator_output_fields):
|
|
64
|
-
missing_fields = set(self.refd_output_slot_idxs.keys()) - set(component_dict.keys())
|
|
65
|
-
raise excs.Error(
|
|
66
|
-
f'Invalid output of {self.view.iterator_cls.__name__}: '
|
|
67
|
-
f'missing fields {", ".join(missing_fields)}')
|
|
68
|
-
|
|
69
|
-
if len(output_batch) == self.OUTPUT_BATCH_SIZE:
|
|
70
|
-
yield output_batch
|
|
71
|
-
output_batch = DataRowBatch(self.view, self.row_builder)
|
|
48
|
+
assert isinstance(iterator_args, dict)
|
|
49
|
+
# We need to ensure that all of the required (non-nullable) parameters of the iterator are
|
|
50
|
+
# specified and are not null. If any of them are null, then we skip this row (i.e., we emit 0
|
|
51
|
+
# output rows for this input row).
|
|
52
|
+
if self.__non_nullable_args_specified(iterator_args):
|
|
53
|
+
iterator = self.view.iterator_cls(**iterator_args)
|
|
54
|
+
for pos, component_dict in enumerate(iterator):
|
|
55
|
+
output_row = output_batch.add_row()
|
|
56
|
+
input_row.copy(output_row)
|
|
57
|
+
# we're expanding the input and need to add the iterator position to the pk
|
|
58
|
+
self.__populate_output_row(output_row, pos, component_dict)
|
|
59
|
+
if len(output_batch) == self.__OUTPUT_BATCH_SIZE:
|
|
60
|
+
yield output_batch
|
|
61
|
+
output_batch = DataRowBatch(self.view, self.row_builder)
|
|
72
62
|
|
|
73
63
|
if len(output_batch) > 0:
|
|
74
64
|
yield output_batch
|
|
75
65
|
|
|
66
|
+
def __non_nullable_args_specified(self, iterator_args: dict) -> bool:
|
|
67
|
+
"""
|
|
68
|
+
Returns true if all non-nullable iterator arguments are not `None`.
|
|
69
|
+
"""
|
|
70
|
+
input_schema = self.view.iterator_cls.input_schema()
|
|
71
|
+
for arg_name, arg_value in iterator_args.items():
|
|
72
|
+
col_type = input_schema[arg_name]
|
|
73
|
+
if arg_value is None and not col_type.nullable:
|
|
74
|
+
return False
|
|
75
|
+
return True
|
|
76
|
+
|
|
77
|
+
def __populate_output_row(self, output_row: exprs.DataRow, pos: int, component_dict: dict) -> None:
|
|
78
|
+
pk = output_row.pk[:-1] + (pos,) + output_row.pk[-1:]
|
|
79
|
+
output_row.set_pk(pk)
|
|
80
|
+
# verify and copy component_dict fields to their respective slots in output_row
|
|
81
|
+
for field_name, field_val in component_dict.items():
|
|
82
|
+
if field_name not in self.iterator_output_fields:
|
|
83
|
+
raise excs.Error(
|
|
84
|
+
f'Invalid field name {field_name} in output of {self.view.iterator_cls.__name__}')
|
|
85
|
+
if field_name not in self.refd_output_slot_idxs:
|
|
86
|
+
# we can ignore this
|
|
87
|
+
continue
|
|
88
|
+
output_col = self.iterator_output_cols[field_name]
|
|
89
|
+
output_col.col_type.validate_literal(field_val)
|
|
90
|
+
output_row[self.refd_output_slot_idxs[field_name]] = field_val
|
|
91
|
+
if len(component_dict) != len(self.iterator_output_fields):
|
|
92
|
+
missing_fields = set(self.refd_output_slot_idxs.keys()) - set(component_dict.keys())
|
|
93
|
+
raise excs.Error(
|
|
94
|
+
f'Invalid output of {self.view.iterator_cls.__name__}: '
|
|
95
|
+
f'missing fields {", ".join(missing_fields)}')
|
|
96
|
+
|
|
76
97
|
def __next__(self) -> DataRowBatch:
|
|
77
|
-
if self.
|
|
78
|
-
self.
|
|
79
|
-
return next(self.
|
|
98
|
+
if self.__output is None:
|
|
99
|
+
self.__output = self.__output_batches()
|
|
100
|
+
return next(self.__output)
|
|
@@ -14,6 +14,13 @@ class DataRowBatch:
|
|
|
14
14
|
|
|
15
15
|
Contains the metadata needed to initialize DataRows.
|
|
16
16
|
"""
|
|
17
|
+
tbl: Optional[catalog.TableVersion]
|
|
18
|
+
row_builder: exprs.RowBuilder
|
|
19
|
+
img_slot_idxs: list[int]
|
|
20
|
+
media_slot_idxs: list[int] # non-image media slots
|
|
21
|
+
array_slot_idxs: list[int]
|
|
22
|
+
rows: list[exprs.DataRow]
|
|
23
|
+
|
|
17
24
|
def __init__(self, tbl: Optional[catalog.TableVersion], row_builder: exprs.RowBuilder, len: int = 0):
|
|
18
25
|
self.tbl = tbl
|
|
19
26
|
self.row_builder = row_builder
|
|
@@ -39,13 +46,6 @@ class DataRowBatch:
|
|
|
39
46
|
def pop_row(self) -> exprs.DataRow:
|
|
40
47
|
return self.rows.pop()
|
|
41
48
|
|
|
42
|
-
def set_row_ids(self, row_ids: List[int]) -> None:
|
|
43
|
-
"""Sets pks for rows in batch"""
|
|
44
|
-
assert self.tbl is not None
|
|
45
|
-
assert len(row_ids) == len(self.rows)
|
|
46
|
-
for row, row_id in zip(self.rows, row_ids):
|
|
47
|
-
row.set_pk((row_id, self.tbl))
|
|
48
|
-
|
|
49
49
|
def __len__(self) -> int:
|
|
50
50
|
return len(self.rows)
|
|
51
51
|
|
pixeltable/exec/exec_node.py
CHANGED
|
@@ -1,13 +1,25 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
|
-
|
|
2
|
+
|
|
3
3
|
import abc
|
|
4
|
+
from typing import Iterable, Optional, List, TYPE_CHECKING, Iterator
|
|
4
5
|
|
|
6
|
+
import pixeltable.exprs as exprs
|
|
5
7
|
from .data_row_batch import DataRowBatch
|
|
6
8
|
from .exec_context import ExecContext
|
|
7
|
-
|
|
9
|
+
|
|
10
|
+
if TYPE_CHECKING:
|
|
11
|
+
from pixeltable import exec
|
|
8
12
|
|
|
9
13
|
class ExecNode(abc.ABC):
|
|
10
14
|
"""Base class of all execution nodes"""
|
|
15
|
+
output_exprs: Iterable[exprs.Expr]
|
|
16
|
+
row_builder: exprs.RowBuilder
|
|
17
|
+
input: Optional[ExecNode]
|
|
18
|
+
flushed_img_slots: list[int] # idxs of image slots of our output_exprs dependencies
|
|
19
|
+
stored_img_cols: list[exprs.ColumnSlotIdx]
|
|
20
|
+
ctx: Optional[ExecContext]
|
|
21
|
+
__iter: Optional[Iterator[DataRowBatch]]
|
|
22
|
+
|
|
11
23
|
def __init__(
|
|
12
24
|
self, row_builder: exprs.RowBuilder, output_exprs: Iterable[exprs.Expr],
|
|
13
25
|
input_exprs: Iterable[exprs.Expr], input: Optional[ExecNode] = None):
|
|
@@ -21,8 +33,9 @@ class ExecNode(abc.ABC):
|
|
|
21
33
|
e.slot_idx for e in output_dependencies
|
|
22
34
|
if e.col_type.is_image_type() and e.slot_idx not in output_slot_idxs
|
|
23
35
|
]
|
|
24
|
-
self.stored_img_cols
|
|
25
|
-
self.ctx
|
|
36
|
+
self.stored_img_cols = []
|
|
37
|
+
self.ctx = None # all nodes of a tree share the same context
|
|
38
|
+
self.__iter = None
|
|
26
39
|
|
|
27
40
|
def set_ctx(self, ctx: ExecContext) -> None:
|
|
28
41
|
self.ctx = ctx
|
|
@@ -35,12 +48,15 @@ class ExecNode(abc.ABC):
|
|
|
35
48
|
if self.input is not None:
|
|
36
49
|
self.input.set_stored_img_cols(stored_img_cols)
|
|
37
50
|
|
|
38
|
-
|
|
51
|
+
# TODO: make this an abstractmethod when __next__() is removed
|
|
52
|
+
def __iter__(self) -> Iterator[DataRowBatch]:
|
|
39
53
|
return self
|
|
40
54
|
|
|
41
|
-
|
|
55
|
+
# TODO: remove this and switch every subclass over to implementing __iter__
|
|
42
56
|
def __next__(self) -> DataRowBatch:
|
|
43
|
-
|
|
57
|
+
if self.__iter is None:
|
|
58
|
+
self.__iter = iter(self)
|
|
59
|
+
return next(self.__iter)
|
|
44
60
|
|
|
45
61
|
def open(self) -> None:
|
|
46
62
|
"""Bottom-up initialization of nodes for execution. Must be called before __next__."""
|
|
@@ -60,3 +76,15 @@ class ExecNode(abc.ABC):
|
|
|
60
76
|
def _close(self) -> None:
|
|
61
77
|
pass
|
|
62
78
|
|
|
79
|
+
def get_sql_node(self) -> Optional['exec.SqlNode']:
|
|
80
|
+
from .sql_node import SqlNode
|
|
81
|
+
if isinstance(self, SqlNode):
|
|
82
|
+
return self
|
|
83
|
+
if self.input is not None:
|
|
84
|
+
return self.input.get_sql_node()
|
|
85
|
+
return None
|
|
86
|
+
|
|
87
|
+
def set_limit(self, limit: int) -> None:
|
|
88
|
+
"""Default implementation propagates to input"""
|
|
89
|
+
if self.input is not None:
|
|
90
|
+
self.input.set_limit(limit)
|
|
@@ -5,10 +5,11 @@ import warnings
|
|
|
5
5
|
from dataclasses import dataclass
|
|
6
6
|
from typing import Iterable, List, Optional
|
|
7
7
|
|
|
8
|
-
from tqdm import
|
|
8
|
+
from tqdm import TqdmWarning, tqdm
|
|
9
9
|
|
|
10
10
|
import pixeltable.exprs as exprs
|
|
11
11
|
from pixeltable.func import CallableFunction
|
|
12
|
+
|
|
12
13
|
from .data_row_batch import DataRowBatch
|
|
13
14
|
from .exec_node import ExecNode
|
|
14
15
|
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import logging
|
|
2
|
-
from typing import Any, Optional
|
|
2
|
+
from typing import Any, Optional, Iterator
|
|
3
3
|
|
|
4
4
|
import pixeltable.catalog as catalog
|
|
5
5
|
import pixeltable.exprs as exprs
|
|
@@ -18,6 +18,11 @@ class InMemoryDataNode(ExecNode):
|
|
|
18
18
|
- with the values provided in the input rows
|
|
19
19
|
- if an input row doesn't provide a value, sets the slot to the column default
|
|
20
20
|
"""
|
|
21
|
+
tbl: catalog.TableVersion
|
|
22
|
+
input_rows: list[dict[str, Any]]
|
|
23
|
+
start_row_id: int
|
|
24
|
+
output_rows: Optional[DataRowBatch]
|
|
25
|
+
|
|
21
26
|
def __init__(
|
|
22
27
|
self, tbl: catalog.TableVersion, rows: list[dict[str, Any]],
|
|
23
28
|
row_builder: exprs.RowBuilder, start_row_id: int,
|
|
@@ -29,8 +34,7 @@ class InMemoryDataNode(ExecNode):
|
|
|
29
34
|
self.tbl = tbl
|
|
30
35
|
self.input_rows = rows
|
|
31
36
|
self.start_row_id = start_row_id
|
|
32
|
-
self.
|
|
33
|
-
self.output_rows: Optional[DataRowBatch] = None
|
|
37
|
+
self.output_rows = None
|
|
34
38
|
|
|
35
39
|
def _open(self) -> None:
|
|
36
40
|
"""Create row batch and populate with self.input_rows"""
|
|
@@ -67,12 +71,8 @@ class InMemoryDataNode(ExecNode):
|
|
|
67
71
|
assert col_info is not None
|
|
68
72
|
self.output_rows[row_idx][col_info.slot_idx] = None
|
|
69
73
|
|
|
70
|
-
self.output_rows.set_row_ids([self.start_row_id + i for i in range(len(self.output_rows))])
|
|
71
74
|
self.ctx.num_rows = len(self.output_rows)
|
|
72
75
|
|
|
73
|
-
def
|
|
74
|
-
if self.has_returned_data:
|
|
75
|
-
raise StopIteration
|
|
76
|
-
self.has_returned_data = True
|
|
76
|
+
def __iter__(self) -> Iterator[DataRowBatch]:
|
|
77
77
|
_logger.debug(f'InMemoryDataNode: created row batch with {len(self.output_rows)} output_rows')
|
|
78
|
-
|
|
78
|
+
yield self.output_rows
|