pixeltable 0.2.20__py3-none-any.whl → 0.2.22__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__init__.py +7 -19
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/__init__.py +7 -7
- pixeltable/catalog/column.py +37 -11
- pixeltable/catalog/globals.py +21 -0
- pixeltable/catalog/insertable_table.py +6 -4
- pixeltable/catalog/table.py +227 -148
- pixeltable/catalog/table_version.py +66 -28
- pixeltable/catalog/table_version_path.py +0 -8
- pixeltable/catalog/view.py +18 -19
- pixeltable/dataframe.py +16 -32
- pixeltable/env.py +6 -1
- pixeltable/exec/__init__.py +1 -2
- pixeltable/exec/aggregation_node.py +27 -17
- pixeltable/exec/cache_prefetch_node.py +1 -1
- pixeltable/exec/data_row_batch.py +9 -26
- pixeltable/exec/exec_node.py +36 -7
- pixeltable/exec/expr_eval_node.py +19 -11
- pixeltable/exec/in_memory_data_node.py +14 -11
- pixeltable/exec/sql_node.py +266 -138
- pixeltable/exprs/__init__.py +1 -0
- pixeltable/exprs/arithmetic_expr.py +3 -1
- pixeltable/exprs/array_slice.py +7 -7
- pixeltable/exprs/column_property_ref.py +37 -10
- pixeltable/exprs/column_ref.py +93 -14
- pixeltable/exprs/comparison.py +5 -5
- pixeltable/exprs/compound_predicate.py +8 -7
- pixeltable/exprs/data_row.py +56 -36
- pixeltable/exprs/expr.py +65 -63
- pixeltable/exprs/expr_dict.py +55 -0
- pixeltable/exprs/expr_set.py +26 -15
- pixeltable/exprs/function_call.py +53 -24
- pixeltable/exprs/globals.py +4 -1
- pixeltable/exprs/in_predicate.py +8 -7
- pixeltable/exprs/inline_expr.py +4 -4
- pixeltable/exprs/is_null.py +4 -4
- pixeltable/exprs/json_mapper.py +11 -12
- pixeltable/exprs/json_path.py +5 -10
- pixeltable/exprs/literal.py +5 -5
- pixeltable/exprs/method_ref.py +5 -4
- pixeltable/exprs/object_ref.py +2 -1
- pixeltable/exprs/row_builder.py +88 -36
- pixeltable/exprs/rowid_ref.py +14 -13
- pixeltable/exprs/similarity_expr.py +12 -7
- pixeltable/exprs/sql_element_cache.py +12 -6
- pixeltable/exprs/type_cast.py +8 -6
- pixeltable/exprs/variable.py +5 -4
- pixeltable/ext/functions/whisperx.py +7 -2
- pixeltable/func/aggregate_function.py +1 -1
- pixeltable/func/callable_function.py +2 -2
- pixeltable/func/function.py +11 -10
- pixeltable/func/function_registry.py +6 -7
- pixeltable/func/query_template_function.py +11 -12
- pixeltable/func/signature.py +17 -15
- pixeltable/func/udf.py +0 -4
- pixeltable/functions/__init__.py +2 -2
- pixeltable/functions/audio.py +4 -6
- pixeltable/functions/globals.py +84 -42
- pixeltable/functions/huggingface.py +31 -34
- pixeltable/functions/image.py +59 -45
- pixeltable/functions/json.py +0 -1
- pixeltable/functions/llama_cpp.py +106 -0
- pixeltable/functions/mistralai.py +2 -2
- pixeltable/functions/ollama.py +147 -0
- pixeltable/functions/openai.py +22 -25
- pixeltable/functions/replicate.py +72 -0
- pixeltable/functions/string.py +59 -50
- pixeltable/functions/timestamp.py +20 -20
- pixeltable/functions/together.py +2 -2
- pixeltable/functions/video.py +11 -20
- pixeltable/functions/whisper.py +2 -20
- pixeltable/globals.py +65 -74
- pixeltable/index/base.py +2 -2
- pixeltable/index/btree.py +20 -7
- pixeltable/index/embedding_index.py +12 -14
- pixeltable/io/__init__.py +1 -2
- pixeltable/io/external_store.py +11 -5
- pixeltable/io/fiftyone.py +178 -0
- pixeltable/io/globals.py +98 -2
- pixeltable/io/hf_datasets.py +1 -1
- pixeltable/io/label_studio.py +6 -6
- pixeltable/io/parquet.py +14 -13
- pixeltable/iterators/base.py +3 -2
- pixeltable/iterators/document.py +10 -8
- pixeltable/iterators/video.py +126 -60
- pixeltable/metadata/__init__.py +4 -3
- pixeltable/metadata/converters/convert_14.py +4 -2
- pixeltable/metadata/converters/convert_15.py +1 -1
- pixeltable/metadata/converters/convert_19.py +1 -0
- pixeltable/metadata/converters/convert_20.py +1 -1
- pixeltable/metadata/converters/convert_21.py +34 -0
- pixeltable/metadata/converters/util.py +54 -12
- pixeltable/metadata/notes.py +1 -0
- pixeltable/metadata/schema.py +40 -21
- pixeltable/plan.py +149 -165
- pixeltable/py.typed +0 -0
- pixeltable/store.py +57 -37
- pixeltable/tool/create_test_db_dump.py +6 -6
- pixeltable/tool/create_test_video.py +1 -1
- pixeltable/tool/doc_plugins/griffe.py +3 -34
- pixeltable/tool/embed_udf.py +1 -1
- pixeltable/tool/mypy_plugin.py +55 -0
- pixeltable/type_system.py +260 -61
- pixeltable/utils/arrow.py +10 -9
- pixeltable/utils/coco.py +4 -4
- pixeltable/utils/documents.py +16 -2
- pixeltable/utils/filecache.py +9 -9
- pixeltable/utils/formatter.py +10 -11
- pixeltable/utils/http_server.py +2 -5
- pixeltable/utils/media_store.py +6 -6
- pixeltable/utils/pytorch.py +10 -11
- pixeltable/utils/sql.py +2 -1
- {pixeltable-0.2.20.dist-info → pixeltable-0.2.22.dist-info}/METADATA +50 -13
- pixeltable-0.2.22.dist-info/RECORD +153 -0
- pixeltable/exec/media_validation_node.py +0 -43
- pixeltable/utils/help.py +0 -11
- pixeltable-0.2.20.dist-info/RECORD +0 -147
- {pixeltable-0.2.20.dist-info → pixeltable-0.2.22.dist-info}/LICENSE +0 -0
- {pixeltable-0.2.20.dist-info → pixeltable-0.2.22.dist-info}/WHEEL +0 -0
- {pixeltable-0.2.20.dist-info → pixeltable-0.2.22.dist-info}/entry_points.txt +0 -0
pixeltable/exec/exec_node.py
CHANGED
|
@@ -1,13 +1,26 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
|
-
|
|
2
|
+
|
|
3
3
|
import abc
|
|
4
|
+
from typing import TYPE_CHECKING, Iterable, Iterator, List, Optional
|
|
5
|
+
|
|
6
|
+
import pixeltable.exprs as exprs
|
|
4
7
|
|
|
5
8
|
from .data_row_batch import DataRowBatch
|
|
6
9
|
from .exec_context import ExecContext
|
|
7
|
-
|
|
10
|
+
|
|
11
|
+
if TYPE_CHECKING:
|
|
12
|
+
from pixeltable import exec
|
|
8
13
|
|
|
9
14
|
class ExecNode(abc.ABC):
|
|
10
15
|
"""Base class of all execution nodes"""
|
|
16
|
+
output_exprs: Iterable[exprs.Expr]
|
|
17
|
+
row_builder: exprs.RowBuilder
|
|
18
|
+
input: Optional[ExecNode]
|
|
19
|
+
flushed_img_slots: list[int] # idxs of image slots of our output_exprs dependencies
|
|
20
|
+
stored_img_cols: list[exprs.ColumnSlotIdx]
|
|
21
|
+
ctx: Optional[ExecContext]
|
|
22
|
+
__iter: Optional[Iterator[DataRowBatch]]
|
|
23
|
+
|
|
11
24
|
def __init__(
|
|
12
25
|
self, row_builder: exprs.RowBuilder, output_exprs: Iterable[exprs.Expr],
|
|
13
26
|
input_exprs: Iterable[exprs.Expr], input: Optional[ExecNode] = None):
|
|
@@ -21,8 +34,9 @@ class ExecNode(abc.ABC):
|
|
|
21
34
|
e.slot_idx for e in output_dependencies
|
|
22
35
|
if e.col_type.is_image_type() and e.slot_idx not in output_slot_idxs
|
|
23
36
|
]
|
|
24
|
-
self.stored_img_cols
|
|
25
|
-
self.ctx
|
|
37
|
+
self.stored_img_cols = []
|
|
38
|
+
self.ctx = None # all nodes of a tree share the same context
|
|
39
|
+
self.__iter = None
|
|
26
40
|
|
|
27
41
|
def set_ctx(self, ctx: ExecContext) -> None:
|
|
28
42
|
self.ctx = ctx
|
|
@@ -35,12 +49,15 @@ class ExecNode(abc.ABC):
|
|
|
35
49
|
if self.input is not None:
|
|
36
50
|
self.input.set_stored_img_cols(stored_img_cols)
|
|
37
51
|
|
|
38
|
-
|
|
52
|
+
# TODO: make this an abstractmethod when __next__() is removed
|
|
53
|
+
def __iter__(self) -> Iterator[DataRowBatch]:
|
|
39
54
|
return self
|
|
40
55
|
|
|
41
|
-
|
|
56
|
+
# TODO: remove this and switch every subclass over to implementing __iter__
|
|
42
57
|
def __next__(self) -> DataRowBatch:
|
|
43
|
-
|
|
58
|
+
if self.__iter is None:
|
|
59
|
+
self.__iter = iter(self)
|
|
60
|
+
return next(self.__iter)
|
|
44
61
|
|
|
45
62
|
def open(self) -> None:
|
|
46
63
|
"""Bottom-up initialization of nodes for execution. Must be called before __next__."""
|
|
@@ -60,3 +77,15 @@ class ExecNode(abc.ABC):
|
|
|
60
77
|
def _close(self) -> None:
|
|
61
78
|
pass
|
|
62
79
|
|
|
80
|
+
def get_sql_node(self) -> Optional['exec.SqlNode']:
|
|
81
|
+
from .sql_node import SqlNode
|
|
82
|
+
if isinstance(self, SqlNode):
|
|
83
|
+
return self
|
|
84
|
+
if self.input is not None:
|
|
85
|
+
return self.input.get_sql_node()
|
|
86
|
+
return None
|
|
87
|
+
|
|
88
|
+
def set_limit(self, limit: int) -> None:
|
|
89
|
+
"""Default implementation propagates to input"""
|
|
90
|
+
if self.input is not None:
|
|
91
|
+
self.input.set_limit(limit)
|
|
@@ -5,10 +5,11 @@ import warnings
|
|
|
5
5
|
from dataclasses import dataclass
|
|
6
6
|
from typing import Iterable, List, Optional
|
|
7
7
|
|
|
8
|
-
from tqdm import
|
|
8
|
+
from tqdm import TqdmWarning, tqdm
|
|
9
9
|
|
|
10
|
-
|
|
10
|
+
from pixeltable import exprs
|
|
11
11
|
from pixeltable.func import CallableFunction
|
|
12
|
+
|
|
12
13
|
from .data_row_batch import DataRowBatch
|
|
13
14
|
from .exec_node import ExecNode
|
|
14
15
|
|
|
@@ -21,7 +22,7 @@ class ExprEvalNode(ExecNode):
|
|
|
21
22
|
@dataclass
|
|
22
23
|
class Cohort:
|
|
23
24
|
"""List of exprs that form an evaluation context and contain calls to at most one external function"""
|
|
24
|
-
|
|
25
|
+
exprs_: List[exprs.Expr]
|
|
25
26
|
batched_fn: Optional[CallableFunction]
|
|
26
27
|
segment_ctxs: List['exprs.RowBuilder.EvalCtx']
|
|
27
28
|
target_slot_idxs: List[int]
|
|
@@ -37,7 +38,7 @@ class ExprEvalNode(ExecNode):
|
|
|
37
38
|
# we're only materializing exprs that are not already in the input
|
|
38
39
|
self.target_exprs = [e for e in output_exprs if e.slot_idx not in input_slot_idxs]
|
|
39
40
|
self.pbar: Optional[tqdm] = None
|
|
40
|
-
self.cohorts: List[
|
|
41
|
+
self.cohorts: List[ExprEvalNode.Cohort] = []
|
|
41
42
|
self._create_cohorts()
|
|
42
43
|
|
|
43
44
|
def __next__(self) -> DataRowBatch:
|
|
@@ -87,6 +88,8 @@ class ExprEvalNode(ExecNode):
|
|
|
87
88
|
for e in all_exprs:
|
|
88
89
|
if not self._is_batched_fn_call(e):
|
|
89
90
|
continue
|
|
91
|
+
assert isinstance(e, exprs.FunctionCall)
|
|
92
|
+
assert isinstance(e.fn, CallableFunction)
|
|
90
93
|
if current_batched_fn is None or current_batched_fn != e.fn:
|
|
91
94
|
# create a new cohort
|
|
92
95
|
cohorts.append([])
|
|
@@ -95,8 +98,8 @@ class ExprEvalNode(ExecNode):
|
|
|
95
98
|
|
|
96
99
|
# expand the cohorts to include all exprs that are in the same evaluation context as the external calls;
|
|
97
100
|
# cohorts are evaluated in order, so we can exclude the target slots from preceding cohorts and input slots
|
|
98
|
-
exclude = set(
|
|
99
|
-
all_target_slot_idxs = set(
|
|
101
|
+
exclude = set(e.slot_idx for e in self.input_exprs)
|
|
102
|
+
all_target_slot_idxs = set(e.slot_idx for e in self.target_exprs)
|
|
100
103
|
target_slot_idxs: List[List[int]] = [] # the ones materialized by each cohort
|
|
101
104
|
for i in range(len(cohorts)):
|
|
102
105
|
cohorts[i] = self.row_builder.get_dependencies(
|
|
@@ -105,7 +108,7 @@ class ExprEvalNode(ExecNode):
|
|
|
105
108
|
[e.slot_idx for e in cohorts[i] if e.slot_idx in all_target_slot_idxs])
|
|
106
109
|
exclude.update(target_slot_idxs[-1])
|
|
107
110
|
|
|
108
|
-
all_cohort_slot_idxs = set(
|
|
111
|
+
all_cohort_slot_idxs = set(e.slot_idx for cohort in cohorts for e in cohort)
|
|
109
112
|
remaining_slot_idxs = set(all_target_slot_idxs) - all_cohort_slot_idxs
|
|
110
113
|
if len(remaining_slot_idxs) > 0:
|
|
111
114
|
cohorts.append(self.row_builder.get_dependencies(
|
|
@@ -163,9 +166,10 @@ class ExprEvalNode(ExecNode):
|
|
|
163
166
|
rows[row_idx], segment_ctx, self.ctx.profile, ignore_errors=self.ctx.ignore_errors)
|
|
164
167
|
else:
|
|
165
168
|
fn_call = segment_ctx.exprs[0]
|
|
169
|
+
assert isinstance(fn_call, exprs.FunctionCall)
|
|
166
170
|
# make a batched external function call
|
|
167
|
-
arg_batches = [[] for _ in range(len(fn_call.args))]
|
|
168
|
-
kwarg_batches = {k: [] for k in fn_call.kwargs.keys()}
|
|
171
|
+
arg_batches: list[list[exprs.Expr]] = [[] for _ in range(len(fn_call.args))]
|
|
172
|
+
kwarg_batches: dict[str, list[exprs.Expr]] = {k: [] for k in fn_call.kwargs.keys()}
|
|
169
173
|
|
|
170
174
|
valid_batch_idxs: List[int] = [] # rows with exceptions are not valid
|
|
171
175
|
for row_idx in range(batch_start_idx, batch_start_idx + num_batch_rows):
|
|
@@ -175,12 +179,15 @@ class ExprEvalNode(ExecNode):
|
|
|
175
179
|
continue
|
|
176
180
|
valid_batch_idxs.append(row_idx)
|
|
177
181
|
args, kwargs = fn_call._make_args(row)
|
|
178
|
-
|
|
179
|
-
|
|
182
|
+
for i in range(len(args)):
|
|
183
|
+
arg_batches[i].append(args[i])
|
|
184
|
+
for k in kwargs.keys():
|
|
185
|
+
kwarg_batches[k].append(kwargs[k])
|
|
180
186
|
num_valid_batch_rows = len(valid_batch_idxs)
|
|
181
187
|
|
|
182
188
|
if ext_batch_size is None:
|
|
183
189
|
# we need to choose a batch size based on the args
|
|
190
|
+
assert isinstance(fn_call.fn, CallableFunction)
|
|
184
191
|
sample_args = [arg_batches[i][0] for i in range(len(arg_batches))]
|
|
185
192
|
ext_batch_size = fn_call.fn.get_batch_size(*sample_args)
|
|
186
193
|
|
|
@@ -200,6 +207,7 @@ class ExprEvalNode(ExecNode):
|
|
|
200
207
|
for k in kwarg_batches.keys()
|
|
201
208
|
}
|
|
202
209
|
start_ts = time.perf_counter()
|
|
210
|
+
assert isinstance(fn_call.fn, CallableFunction)
|
|
203
211
|
result_batch = fn_call.fn.exec_batch(*call_args, **call_kwargs)
|
|
204
212
|
self.ctx.profile.eval_time[fn_call.slot_idx] += time.perf_counter() - start_ts
|
|
205
213
|
self.ctx.profile.eval_count[fn_call.slot_idx] += num_ext_batch_rows
|
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import logging
|
|
2
|
-
from typing import Any, Optional
|
|
2
|
+
from typing import Any, Iterator, Optional
|
|
3
3
|
|
|
4
4
|
import pixeltable.catalog as catalog
|
|
5
5
|
import pixeltable.exprs as exprs
|
|
@@ -18,19 +18,26 @@ class InMemoryDataNode(ExecNode):
|
|
|
18
18
|
- with the values provided in the input rows
|
|
19
19
|
- if an input row doesn't provide a value, sets the slot to the column default
|
|
20
20
|
"""
|
|
21
|
+
tbl: catalog.TableVersion
|
|
22
|
+
input_rows: list[dict[str, Any]]
|
|
23
|
+
start_row_id: int
|
|
24
|
+
output_rows: Optional[DataRowBatch]
|
|
25
|
+
|
|
26
|
+
# output_exprs is declared in the superclass, but we redeclare it here with a more specific type
|
|
27
|
+
output_exprs: list[exprs.ColumnRef]
|
|
28
|
+
|
|
21
29
|
def __init__(
|
|
22
30
|
self, tbl: catalog.TableVersion, rows: list[dict[str, Any]],
|
|
23
31
|
row_builder: exprs.RowBuilder, start_row_id: int,
|
|
24
32
|
):
|
|
25
|
-
# we materialize
|
|
26
|
-
output_exprs =
|
|
33
|
+
# we materialize the input slots
|
|
34
|
+
output_exprs = list(row_builder.input_exprs)
|
|
27
35
|
super().__init__(row_builder, output_exprs, [], None)
|
|
28
36
|
assert tbl.is_insertable()
|
|
29
37
|
self.tbl = tbl
|
|
30
38
|
self.input_rows = rows
|
|
31
39
|
self.start_row_id = start_row_id
|
|
32
|
-
self.
|
|
33
|
-
self.output_rows: Optional[DataRowBatch] = None
|
|
40
|
+
self.output_rows = None
|
|
34
41
|
|
|
35
42
|
def _open(self) -> None:
|
|
36
43
|
"""Create row batch and populate with self.input_rows"""
|
|
@@ -67,12 +74,8 @@ class InMemoryDataNode(ExecNode):
|
|
|
67
74
|
assert col_info is not None
|
|
68
75
|
self.output_rows[row_idx][col_info.slot_idx] = None
|
|
69
76
|
|
|
70
|
-
self.output_rows.set_row_ids([self.start_row_id + i for i in range(len(self.output_rows))])
|
|
71
77
|
self.ctx.num_rows = len(self.output_rows)
|
|
72
78
|
|
|
73
|
-
def
|
|
74
|
-
if self.has_returned_data:
|
|
75
|
-
raise StopIteration
|
|
76
|
-
self.has_returned_data = True
|
|
79
|
+
def __iter__(self) -> Iterator[DataRowBatch]:
|
|
77
80
|
_logger.debug(f'InMemoryDataNode: created row batch with {len(self.output_rows)} output_rows')
|
|
78
|
-
|
|
81
|
+
yield self.output_rows
|