pixeltable 0.3.2__py3-none-any.whl → 0.3.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__init__.py +64 -11
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/__init__.py +1 -1
- pixeltable/catalog/catalog.py +50 -27
- pixeltable/catalog/column.py +27 -11
- pixeltable/catalog/dir.py +6 -4
- pixeltable/catalog/globals.py +8 -1
- pixeltable/catalog/insertable_table.py +22 -12
- pixeltable/catalog/named_function.py +10 -6
- pixeltable/catalog/path.py +3 -2
- pixeltable/catalog/path_dict.py +8 -6
- pixeltable/catalog/schema_object.py +2 -1
- pixeltable/catalog/table.py +121 -101
- pixeltable/catalog/table_version.py +291 -142
- pixeltable/catalog/table_version_path.py +8 -5
- pixeltable/catalog/view.py +67 -26
- pixeltable/dataframe.py +106 -81
- pixeltable/env.py +28 -24
- pixeltable/exec/__init__.py +2 -2
- pixeltable/exec/aggregation_node.py +10 -4
- pixeltable/exec/cache_prefetch_node.py +5 -3
- pixeltable/exec/component_iteration_node.py +9 -9
- pixeltable/exec/data_row_batch.py +21 -10
- pixeltable/exec/exec_context.py +10 -3
- pixeltable/exec/exec_node.py +23 -12
- pixeltable/exec/expr_eval/evaluators.py +13 -7
- pixeltable/exec/expr_eval/expr_eval_node.py +24 -15
- pixeltable/exec/expr_eval/globals.py +30 -7
- pixeltable/exec/expr_eval/row_buffer.py +5 -6
- pixeltable/exec/expr_eval/schedulers.py +151 -31
- pixeltable/exec/in_memory_data_node.py +8 -7
- pixeltable/exec/row_update_node.py +15 -5
- pixeltable/exec/sql_node.py +56 -27
- pixeltable/exprs/__init__.py +2 -2
- pixeltable/exprs/arithmetic_expr.py +57 -26
- pixeltable/exprs/array_slice.py +1 -1
- pixeltable/exprs/column_property_ref.py +2 -1
- pixeltable/exprs/column_ref.py +20 -15
- pixeltable/exprs/comparison.py +6 -2
- pixeltable/exprs/compound_predicate.py +1 -3
- pixeltable/exprs/data_row.py +2 -2
- pixeltable/exprs/expr.py +108 -72
- pixeltable/exprs/expr_dict.py +2 -1
- pixeltable/exprs/expr_set.py +3 -1
- pixeltable/exprs/function_call.py +39 -41
- pixeltable/exprs/globals.py +1 -0
- pixeltable/exprs/in_predicate.py +2 -2
- pixeltable/exprs/inline_expr.py +20 -17
- pixeltable/exprs/json_mapper.py +4 -2
- pixeltable/exprs/json_path.py +12 -18
- pixeltable/exprs/literal.py +5 -9
- pixeltable/exprs/method_ref.py +1 -0
- pixeltable/exprs/object_ref.py +1 -1
- pixeltable/exprs/row_builder.py +32 -17
- pixeltable/exprs/rowid_ref.py +14 -5
- pixeltable/exprs/similarity_expr.py +11 -6
- pixeltable/exprs/sql_element_cache.py +1 -1
- pixeltable/exprs/type_cast.py +24 -9
- pixeltable/ext/__init__.py +1 -0
- pixeltable/ext/functions/__init__.py +1 -0
- pixeltable/ext/functions/whisperx.py +2 -2
- pixeltable/ext/functions/yolox.py +11 -11
- pixeltable/func/aggregate_function.py +17 -13
- pixeltable/func/callable_function.py +6 -6
- pixeltable/func/expr_template_function.py +15 -14
- pixeltable/func/function.py +16 -16
- pixeltable/func/function_registry.py +11 -8
- pixeltable/func/globals.py +4 -2
- pixeltable/func/query_template_function.py +12 -13
- pixeltable/func/signature.py +18 -9
- pixeltable/func/tools.py +10 -17
- pixeltable/func/udf.py +106 -11
- pixeltable/functions/__init__.py +21 -2
- pixeltable/functions/anthropic.py +16 -12
- pixeltable/functions/fireworks.py +63 -5
- pixeltable/functions/gemini.py +13 -3
- pixeltable/functions/globals.py +18 -6
- pixeltable/functions/huggingface.py +20 -38
- pixeltable/functions/image.py +7 -3
- pixeltable/functions/json.py +1 -0
- pixeltable/functions/llama_cpp.py +1 -4
- pixeltable/functions/mistralai.py +31 -20
- pixeltable/functions/ollama.py +4 -18
- pixeltable/functions/openai.py +231 -113
- pixeltable/functions/replicate.py +11 -10
- pixeltable/functions/string.py +70 -7
- pixeltable/functions/timestamp.py +21 -8
- pixeltable/functions/together.py +66 -52
- pixeltable/functions/video.py +1 -0
- pixeltable/functions/vision.py +14 -11
- pixeltable/functions/whisper.py +2 -1
- pixeltable/globals.py +60 -26
- pixeltable/index/__init__.py +1 -1
- pixeltable/index/btree.py +5 -3
- pixeltable/index/embedding_index.py +15 -14
- pixeltable/io/__init__.py +1 -1
- pixeltable/io/external_store.py +30 -25
- pixeltable/io/fiftyone.py +6 -14
- pixeltable/io/globals.py +33 -27
- pixeltable/io/hf_datasets.py +2 -1
- pixeltable/io/label_studio.py +77 -68
- pixeltable/io/pandas.py +36 -23
- pixeltable/io/parquet.py +9 -12
- pixeltable/iterators/__init__.py +1 -0
- pixeltable/iterators/audio.py +205 -0
- pixeltable/iterators/document.py +19 -8
- pixeltable/iterators/image.py +6 -24
- pixeltable/iterators/string.py +3 -6
- pixeltable/iterators/video.py +1 -7
- pixeltable/metadata/__init__.py +7 -1
- pixeltable/metadata/converters/convert_10.py +2 -2
- pixeltable/metadata/converters/convert_15.py +1 -5
- pixeltable/metadata/converters/convert_16.py +2 -4
- pixeltable/metadata/converters/convert_17.py +2 -4
- pixeltable/metadata/converters/convert_18.py +2 -4
- pixeltable/metadata/converters/convert_19.py +2 -5
- pixeltable/metadata/converters/convert_20.py +1 -4
- pixeltable/metadata/converters/convert_21.py +4 -6
- pixeltable/metadata/converters/convert_22.py +1 -0
- pixeltable/metadata/converters/convert_23.py +5 -5
- pixeltable/metadata/converters/convert_24.py +12 -13
- pixeltable/metadata/converters/convert_26.py +23 -0
- pixeltable/metadata/converters/util.py +3 -4
- pixeltable/metadata/notes.py +1 -0
- pixeltable/metadata/schema.py +13 -2
- pixeltable/plan.py +173 -98
- pixeltable/share/__init__.py +0 -0
- pixeltable/share/packager.py +218 -0
- pixeltable/store.py +42 -26
- pixeltable/type_system.py +102 -75
- pixeltable/utils/arrow.py +7 -8
- pixeltable/utils/coco.py +16 -17
- pixeltable/utils/code.py +1 -1
- pixeltable/utils/console_output.py +6 -3
- pixeltable/utils/description_helper.py +7 -7
- pixeltable/utils/documents.py +3 -1
- pixeltable/utils/filecache.py +12 -7
- pixeltable/utils/http_server.py +9 -8
- pixeltable/utils/iceberg.py +14 -0
- pixeltable/utils/media_store.py +3 -2
- pixeltable/utils/pytorch.py +11 -14
- pixeltable/utils/s3.py +1 -0
- pixeltable/utils/sql.py +1 -0
- pixeltable/utils/transactional_directory.py +2 -2
- {pixeltable-0.3.2.dist-info → pixeltable-0.3.4.dist-info}/METADATA +9 -9
- pixeltable-0.3.4.dist-info/RECORD +166 -0
- pixeltable-0.3.2.dist-info/RECORD +0 -161
- {pixeltable-0.3.2.dist-info → pixeltable-0.3.4.dist-info}/LICENSE +0 -0
- {pixeltable-0.3.2.dist-info → pixeltable-0.3.4.dist-info}/WHEEL +0 -0
- {pixeltable-0.3.2.dist-info → pixeltable-0.3.4.dist-info}/entry_points.txt +0 -0
|
@@ -84,6 +84,7 @@ class TableVersionPath:
|
|
|
84
84
|
def get_column_ref(self, col_name: str) -> exprs.ColumnRef:
|
|
85
85
|
"""Return a ColumnRef for the given column name."""
|
|
86
86
|
from pixeltable.exprs import ColumnRef
|
|
87
|
+
|
|
87
88
|
if col_name not in self.tbl_version.cols_by_name:
|
|
88
89
|
if self.base is None:
|
|
89
90
|
raise AttributeError(f'Column {col_name} unknown')
|
|
@@ -121,11 +122,13 @@ class TableVersionPath:
|
|
|
121
122
|
return None
|
|
122
123
|
|
|
123
124
|
def has_column(self, col: Column, include_bases: bool = True) -> bool:
|
|
124
|
-
"""Return True if this table has the given column.
|
|
125
|
-
"""
|
|
125
|
+
"""Return True if this table has the given column."""
|
|
126
126
|
assert col.tbl is not None
|
|
127
|
-
if
|
|
128
|
-
|
|
127
|
+
if (
|
|
128
|
+
col.tbl.id == self.tbl_version.id
|
|
129
|
+
and col.tbl.effective_version == self.tbl_version.effective_version
|
|
130
|
+
and col.id in self.tbl_version.cols_by_id
|
|
131
|
+
):
|
|
129
132
|
# the column is visible in this table version
|
|
130
133
|
return True
|
|
131
134
|
elif self.base is not None and include_bases:
|
|
@@ -136,7 +139,7 @@ class TableVersionPath:
|
|
|
136
139
|
def as_dict(self) -> dict:
|
|
137
140
|
return {
|
|
138
141
|
'tbl_version': self.tbl_version.as_dict(),
|
|
139
|
-
'base': self.base.as_dict() if self.base is not None else None
|
|
142
|
+
'base': self.base.as_dict() if self.base is not None else None,
|
|
140
143
|
}
|
|
141
144
|
|
|
142
145
|
@classmethod
|
pixeltable/catalog/view.py
CHANGED
|
@@ -35,9 +35,10 @@ class View(Table):
|
|
|
35
35
|
The exception is a snapshot view without a predicate and without additional columns: in that case, the view
|
|
36
36
|
is simply a reference to a specific set of base versions.
|
|
37
37
|
"""
|
|
38
|
+
|
|
38
39
|
def __init__(
|
|
39
|
-
|
|
40
|
-
|
|
40
|
+
self, id: UUID, dir_id: UUID, name: str, tbl_version_path: TableVersionPath, base_id: UUID, snapshot_only: bool
|
|
41
|
+
):
|
|
41
42
|
super().__init__(id, dir_id, name, tbl_version_path)
|
|
42
43
|
assert base_id in catalog.Catalog.get().tbl_dependents
|
|
43
44
|
self._base_id = base_id # keep a reference to the base Table ID, so that we can keep track of its dependents
|
|
@@ -49,10 +50,18 @@ class View(Table):
|
|
|
49
50
|
|
|
50
51
|
@classmethod
|
|
51
52
|
def _create(
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
53
|
+
cls,
|
|
54
|
+
dir_id: UUID,
|
|
55
|
+
name: str,
|
|
56
|
+
base: TableVersionPath,
|
|
57
|
+
additional_columns: dict[str, Any],
|
|
58
|
+
predicate: Optional['pxt.exprs.Expr'],
|
|
59
|
+
is_snapshot: bool,
|
|
60
|
+
num_retained_versions: int,
|
|
61
|
+
comment: str,
|
|
62
|
+
media_validation: MediaValidation,
|
|
63
|
+
iterator_cls: Optional[type[ComponentIterator]],
|
|
64
|
+
iterator_args: Optional[dict],
|
|
56
65
|
) -> View:
|
|
57
66
|
columns = cls._create_columns(additional_columns)
|
|
58
67
|
cls._verify_schema(columns)
|
|
@@ -71,7 +80,8 @@ class View(Table):
|
|
|
71
80
|
# make sure that the value can be computed in the context of the base
|
|
72
81
|
if col.value_expr is not None and not col.value_expr.is_bound_by([base]):
|
|
73
82
|
raise excs.Error(
|
|
74
|
-
f'Column {col.name}: value expression cannot be computed in the context of the base {base.tbl_name()}'
|
|
83
|
+
f'Column {col.name}: value expression cannot be computed in the context of the base {base.tbl_name()}'
|
|
84
|
+
)
|
|
75
85
|
|
|
76
86
|
if iterator_cls is not None:
|
|
77
87
|
assert iterator_args is not None
|
|
@@ -92,6 +102,7 @@ class View(Table):
|
|
|
92
102
|
]
|
|
93
103
|
sig = func.Signature(ts.InvalidType(), params)
|
|
94
104
|
from pixeltable.exprs import FunctionCall
|
|
105
|
+
|
|
95
106
|
FunctionCall.normalize_args(iterator_cls.__name__, sig, bound_args)
|
|
96
107
|
except TypeError as e:
|
|
97
108
|
raise excs.Error(f'Cannot instantiate iterator with given arguments: {e}')
|
|
@@ -102,22 +113,28 @@ class View(Table):
|
|
|
102
113
|
# stored=False: it is not stored separately (it's already stored as part of the rowid)
|
|
103
114
|
iterator_cols = [Column(_POS_COLUMN_NAME, ts.IntType(), stored=False)]
|
|
104
115
|
output_dict, unstored_cols = iterator_cls.output_schema(**bound_args)
|
|
105
|
-
iterator_cols.extend(
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
116
|
+
iterator_cols.extend(
|
|
117
|
+
[
|
|
118
|
+
Column(col_name, col_type, stored=col_name not in unstored_cols)
|
|
119
|
+
for col_name, col_type in output_dict.items()
|
|
120
|
+
]
|
|
121
|
+
)
|
|
109
122
|
|
|
110
123
|
iterator_col_names = {col.name for col in iterator_cols}
|
|
111
124
|
for col in columns:
|
|
112
125
|
if col.name in iterator_col_names:
|
|
113
|
-
raise excs.Error(
|
|
126
|
+
raise excs.Error(
|
|
127
|
+
f'Duplicate name: column {col.name} is already present in the iterator output schema'
|
|
128
|
+
)
|
|
114
129
|
columns = iterator_cols + columns
|
|
115
130
|
|
|
116
131
|
with orm.Session(Env.get().engine, future=True) as session:
|
|
117
132
|
from pixeltable.exprs import InlineDict
|
|
133
|
+
|
|
118
134
|
iterator_args_expr: exprs.Expr = InlineDict(iterator_args) if iterator_args is not None else None
|
|
119
|
-
iterator_class_fqn =
|
|
120
|
-
else None
|
|
135
|
+
iterator_class_fqn = (
|
|
136
|
+
f'{iterator_cls.__module__}.{iterator_cls.__name__}' if iterator_cls is not None else None
|
|
137
|
+
)
|
|
121
138
|
base_version_path = cls._get_snapshot_path(base) if is_snapshot else base
|
|
122
139
|
base_versions = [
|
|
123
140
|
(tbl_version.id.hex, tbl_version.version if is_snapshot or tbl_version.is_snapshot else None)
|
|
@@ -127,35 +144,53 @@ class View(Table):
|
|
|
127
144
|
# if this is a snapshot, we need to retarget all exprs to the snapshot tbl versions
|
|
128
145
|
if is_snapshot:
|
|
129
146
|
predicate = predicate.retarget(base_version_path) if predicate is not None else None
|
|
130
|
-
iterator_args_expr =
|
|
131
|
-
if iterator_args_expr is not None else None
|
|
147
|
+
iterator_args_expr = (
|
|
148
|
+
iterator_args_expr.retarget(base_version_path) if iterator_args_expr is not None else None
|
|
149
|
+
)
|
|
132
150
|
for col in columns:
|
|
133
151
|
if col.value_expr is not None:
|
|
134
152
|
col.set_value_expr(col.value_expr.retarget(base_version_path))
|
|
135
153
|
|
|
136
154
|
view_md = md_schema.ViewMd(
|
|
137
|
-
is_snapshot=is_snapshot,
|
|
155
|
+
is_snapshot=is_snapshot,
|
|
156
|
+
predicate=predicate.as_dict() if predicate is not None else None,
|
|
138
157
|
base_versions=base_versions,
|
|
139
158
|
iterator_class_fqn=iterator_class_fqn,
|
|
140
|
-
iterator_args=iterator_args_expr.as_dict() if iterator_args_expr is not None else None
|
|
159
|
+
iterator_args=iterator_args_expr.as_dict() if iterator_args_expr is not None else None,
|
|
160
|
+
)
|
|
141
161
|
|
|
142
162
|
id, tbl_version = TableVersion.create(
|
|
143
|
-
session,
|
|
144
|
-
|
|
163
|
+
session,
|
|
164
|
+
dir_id,
|
|
165
|
+
name,
|
|
166
|
+
columns,
|
|
167
|
+
num_retained_versions,
|
|
168
|
+
comment,
|
|
169
|
+
media_validation=media_validation,
|
|
170
|
+
base_path=base_version_path,
|
|
171
|
+
view_md=view_md,
|
|
172
|
+
)
|
|
145
173
|
if tbl_version is None:
|
|
146
174
|
# this is purely a snapshot: we use the base's tbl version path
|
|
147
175
|
view = cls(id, dir_id, name, base_version_path, base.tbl_id(), snapshot_only=True)
|
|
148
176
|
_logger.info(f'created snapshot {name}')
|
|
149
177
|
else:
|
|
150
178
|
view = cls(
|
|
151
|
-
id,
|
|
152
|
-
|
|
179
|
+
id,
|
|
180
|
+
dir_id,
|
|
181
|
+
name,
|
|
182
|
+
TableVersionPath(tbl_version, base=base_version_path),
|
|
183
|
+
base.tbl_id(),
|
|
184
|
+
snapshot_only=False,
|
|
185
|
+
)
|
|
153
186
|
_logger.info(f'Created view `{name}`, id={tbl_version.id}')
|
|
154
187
|
|
|
155
188
|
from pixeltable.plan import Planner
|
|
189
|
+
|
|
156
190
|
plan, num_values_per_row = Planner.create_view_load_plan(view._tbl_version_path)
|
|
157
191
|
num_rows, num_excs, cols_with_excs = tbl_version.store_tbl.insert_rows(
|
|
158
|
-
plan, session.connection(), v_min=tbl_version.version
|
|
192
|
+
plan, session.connection(), v_min=tbl_version.version
|
|
193
|
+
)
|
|
159
194
|
Env.get().console_logger.info(f'Created view `{name}` with {num_rows} rows, {num_excs} exceptions.')
|
|
160
195
|
|
|
161
196
|
session.commit()
|
|
@@ -188,7 +223,8 @@ class View(Table):
|
|
|
188
223
|
|
|
189
224
|
return TableVersionPath(
|
|
190
225
|
tbl_version,
|
|
191
|
-
base=cls._get_snapshot_path(tbl_version_path.base) if tbl_version_path.base is not None else None
|
|
226
|
+
base=cls._get_snapshot_path(tbl_version_path.base) if tbl_version_path.base is not None else None,
|
|
227
|
+
)
|
|
192
228
|
|
|
193
229
|
def _drop(self) -> None:
|
|
194
230
|
cat = catalog.Catalog.get()
|
|
@@ -216,8 +252,13 @@ class View(Table):
|
|
|
216
252
|
return md
|
|
217
253
|
|
|
218
254
|
def insert(
|
|
219
|
-
|
|
220
|
-
|
|
255
|
+
self,
|
|
256
|
+
rows: Optional[Iterable[dict[str, Any]]] = None,
|
|
257
|
+
/,
|
|
258
|
+
*,
|
|
259
|
+
print_stats: bool = False,
|
|
260
|
+
on_error: Literal['abort', 'ignore'] = 'abort',
|
|
261
|
+
**kwargs: Any,
|
|
221
262
|
) -> UpdateStatus:
|
|
222
263
|
raise excs.Error(f'{self._display_name()} {self._name!r}: cannot insert into view')
|
|
223
264
|
|
pixeltable/dataframe.py
CHANGED
|
@@ -8,7 +8,7 @@ import json
|
|
|
8
8
|
import logging
|
|
9
9
|
import traceback
|
|
10
10
|
from pathlib import Path
|
|
11
|
-
from typing import TYPE_CHECKING, Any, Callable, Hashable, Iterator, Optional, Sequence, Union
|
|
11
|
+
from typing import TYPE_CHECKING, Any, AsyncIterator, Callable, Hashable, Iterator, NoReturn, Optional, Sequence, Union
|
|
12
12
|
|
|
13
13
|
import numpy as np
|
|
14
14
|
import pandas as pd
|
|
@@ -138,7 +138,7 @@ class DataFrame:
|
|
|
138
138
|
group_by_clause: Optional[list[exprs.Expr]]
|
|
139
139
|
grouping_tbl: Optional[catalog.TableVersion]
|
|
140
140
|
order_by_clause: Optional[list[tuple[exprs.Expr, bool]]]
|
|
141
|
-
limit_val: Optional[
|
|
141
|
+
limit_val: Optional[exprs.Expr]
|
|
142
142
|
|
|
143
143
|
def __init__(
|
|
144
144
|
self,
|
|
@@ -148,7 +148,7 @@ class DataFrame:
|
|
|
148
148
|
group_by_clause: Optional[list[exprs.Expr]] = None,
|
|
149
149
|
grouping_tbl: Optional[catalog.TableVersion] = None,
|
|
150
150
|
order_by_clause: Optional[list[tuple[exprs.Expr, bool]]] = None, # list[(expr, asc)]
|
|
151
|
-
limit: Optional[
|
|
151
|
+
limit: Optional[exprs.Expr] = None,
|
|
152
152
|
):
|
|
153
153
|
self._from_clause = from_clause
|
|
154
154
|
|
|
@@ -171,9 +171,7 @@ class DataFrame:
|
|
|
171
171
|
|
|
172
172
|
@classmethod
|
|
173
173
|
def _normalize_select_list(
|
|
174
|
-
cls,
|
|
175
|
-
tbls: list[catalog.TableVersionPath],
|
|
176
|
-
select_list: Optional[list[tuple[exprs.Expr, Optional[str]]]],
|
|
174
|
+
cls, tbls: list[catalog.TableVersionPath], select_list: Optional[list[tuple[exprs.Expr, Optional[str]]]]
|
|
177
175
|
) -> tuple[list[exprs.Expr], list[str]]:
|
|
178
176
|
"""
|
|
179
177
|
Expand select list information with all columns and their names
|
|
@@ -227,6 +225,8 @@ class DataFrame:
|
|
|
227
225
|
all_exprs.extend(self.group_by_clause)
|
|
228
226
|
if self.order_by_clause is not None:
|
|
229
227
|
all_exprs.extend([expr for expr, _ in self.order_by_clause])
|
|
228
|
+
if self.limit_val is not None:
|
|
229
|
+
all_exprs.append(self.limit_val)
|
|
230
230
|
vars = exprs.Expr.list_subexprs(all_exprs, expr_class=exprs.Variable)
|
|
231
231
|
unique_vars: dict[str, exprs.Variable] = {}
|
|
232
232
|
for var in vars:
|
|
@@ -301,7 +301,7 @@ class DataFrame:
|
|
|
301
301
|
where_clause=self.where_clause,
|
|
302
302
|
group_by_clause=group_by_clause,
|
|
303
303
|
order_by_clause=self.order_by_clause if self.order_by_clause is not None else [],
|
|
304
|
-
limit=self.limit_val
|
|
304
|
+
limit=self.limit_val,
|
|
305
305
|
)
|
|
306
306
|
|
|
307
307
|
def _has_joins(self) -> bool:
|
|
@@ -369,8 +369,12 @@ class DataFrame:
|
|
|
369
369
|
select_list_exprs = copy.deepcopy(self._select_list_exprs)
|
|
370
370
|
where_clause = copy.deepcopy(self.where_clause)
|
|
371
371
|
group_by_clause = copy.deepcopy(self.group_by_clause)
|
|
372
|
-
order_by_exprs =
|
|
373
|
-
|
|
372
|
+
order_by_exprs = (
|
|
373
|
+
[copy.deepcopy(order_by_expr) for order_by_expr, _ in self.order_by_clause]
|
|
374
|
+
if self.order_by_clause is not None
|
|
375
|
+
else None
|
|
376
|
+
)
|
|
377
|
+
limit_val = copy.deepcopy(self.limit_val)
|
|
374
378
|
|
|
375
379
|
var_exprs: dict[exprs.Expr, exprs.Expr] = {}
|
|
376
380
|
vars = self._vars()
|
|
@@ -386,7 +390,7 @@ class DataFrame:
|
|
|
386
390
|
|
|
387
391
|
exprs.Expr.list_substitute(select_list_exprs, var_exprs)
|
|
388
392
|
if where_clause is not None:
|
|
389
|
-
where_clause.substitute(var_exprs)
|
|
393
|
+
where_clause = where_clause.substitute(var_exprs)
|
|
390
394
|
if group_by_clause is not None:
|
|
391
395
|
exprs.Expr.list_substitute(group_by_clause, var_exprs)
|
|
392
396
|
if order_by_exprs is not None:
|
|
@@ -398,14 +402,23 @@ class DataFrame:
|
|
|
398
402
|
order_by_clause = [
|
|
399
403
|
(expr, asc) for expr, asc in zip(order_by_exprs, [asc for _, asc in self.order_by_clause])
|
|
400
404
|
]
|
|
405
|
+
if limit_val is not None:
|
|
406
|
+
limit_val = limit_val.substitute(var_exprs)
|
|
407
|
+
if limit_val is not None and not isinstance(limit_val, exprs.Literal):
|
|
408
|
+
raise excs.Error(f'limit(): parameter must be a constant, but got {limit_val}')
|
|
401
409
|
|
|
402
410
|
return DataFrame(
|
|
403
|
-
from_clause=self._from_clause,
|
|
404
|
-
|
|
405
|
-
|
|
411
|
+
from_clause=self._from_clause,
|
|
412
|
+
select_list=select_list,
|
|
413
|
+
where_clause=where_clause,
|
|
414
|
+
group_by_clause=group_by_clause,
|
|
415
|
+
grouping_tbl=self.grouping_tbl,
|
|
416
|
+
order_by_clause=order_by_clause,
|
|
417
|
+
limit=limit_val,
|
|
418
|
+
)
|
|
406
419
|
|
|
407
420
|
def _raise_expr_eval_err(self, e: excs.ExprEvalError) -> NoReturn:
|
|
408
|
-
msg = f'In row {e.row_num} the {e.expr_msg} encountered exception
|
|
421
|
+
msg = f'In row {e.row_num} the {e.expr_msg} encountered exception {type(e.exc).__name__}:\n{str(e.exc)}'
|
|
409
422
|
if len(e.input_vals) > 0:
|
|
410
423
|
input_msgs = [
|
|
411
424
|
f"'{d}' = {d.col_type.print_value(e.input_vals[i])}" for i, d in enumerate(e.expr.dependencies())
|
|
@@ -419,7 +432,7 @@ class DataFrame:
|
|
|
419
432
|
nl = '\n'
|
|
420
433
|
# [-1:1:-1]: leave out entries 0 and 1 and reverse order, so that the most recent frame is at the top
|
|
421
434
|
msg += f'\nStack:\n{nl.join(stack_trace[-1:1:-1])}'
|
|
422
|
-
raise excs.Error(msg)
|
|
435
|
+
raise excs.Error(msg) from e
|
|
423
436
|
|
|
424
437
|
def _output_row_iterator(self, conn: Optional[sql.engine.Connection] = None) -> Iterator[list]:
|
|
425
438
|
try:
|
|
@@ -438,10 +451,7 @@ class DataFrame:
|
|
|
438
451
|
|
|
439
452
|
async def _acollect(self, conn: sql.engine.Connection) -> DataFrameResultSet:
|
|
440
453
|
try:
|
|
441
|
-
result = [
|
|
442
|
-
[row[e.slot_idx] for e in self._select_list_exprs]
|
|
443
|
-
async for row in self._aexec(conn)
|
|
444
|
-
]
|
|
454
|
+
result = [[row[e.slot_idx] for e in self._select_list_exprs] async for row in self._aexec(conn)]
|
|
445
455
|
return DataFrameResultSet(result, self.schema)
|
|
446
456
|
except excs.ExprEvalError as e:
|
|
447
457
|
self._raise_expr_eval_err(e)
|
|
@@ -471,14 +481,16 @@ class DataFrame:
|
|
|
471
481
|
return helper
|
|
472
482
|
|
|
473
483
|
def _col_descriptor(self) -> pd.DataFrame:
|
|
474
|
-
return pd.DataFrame(
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
484
|
+
return pd.DataFrame(
|
|
485
|
+
[
|
|
486
|
+
{
|
|
487
|
+
'Name': name,
|
|
488
|
+
'Type': expr.col_type._to_str(as_schema=True),
|
|
489
|
+
'Expression': expr.display_str(inline=False),
|
|
490
|
+
}
|
|
491
|
+
for name, expr in zip(self.schema.keys(), self._select_list_exprs)
|
|
492
|
+
]
|
|
493
|
+
)
|
|
482
494
|
|
|
483
495
|
def _query_descriptor(self) -> pd.DataFrame:
|
|
484
496
|
heading_vals: list[str] = []
|
|
@@ -500,7 +512,7 @@ class DataFrame:
|
|
|
500
512
|
)
|
|
501
513
|
if self.limit_val is not None:
|
|
502
514
|
heading_vals.append('Limit')
|
|
503
|
-
info_vals.append(
|
|
515
|
+
info_vals.append(self.limit_val.display_str(inline=False))
|
|
504
516
|
assert len(heading_vals) == len(info_vals)
|
|
505
517
|
return pd.DataFrame(info_vals, index=heading_vals)
|
|
506
518
|
|
|
@@ -512,6 +524,7 @@ class DataFrame:
|
|
|
512
524
|
"""
|
|
513
525
|
if getattr(builtins, '__IPYTHON__', False):
|
|
514
526
|
from IPython.display import display
|
|
527
|
+
|
|
515
528
|
display(self._repr_html_())
|
|
516
529
|
else:
|
|
517
530
|
print(repr(self))
|
|
@@ -523,7 +536,7 @@ class DataFrame:
|
|
|
523
536
|
return self._descriptors().to_html()
|
|
524
537
|
|
|
525
538
|
def select(self, *items: Any, **named_items: Any) -> DataFrame:
|
|
526
|
-
"""
|
|
539
|
+
"""Select columns or expressions from the DataFrame.
|
|
527
540
|
|
|
528
541
|
Args:
|
|
529
542
|
items: expressions to be selected
|
|
@@ -565,21 +578,17 @@ class DataFrame:
|
|
|
565
578
|
# analyze select list; wrap literals with the corresponding expressions
|
|
566
579
|
select_list: list[tuple[exprs.Expr, Optional[str]]] = []
|
|
567
580
|
for raw_expr, name in base_list:
|
|
568
|
-
|
|
569
|
-
|
|
570
|
-
|
|
571
|
-
select_list.append((exprs.Expr.from_object(raw_expr), name))
|
|
572
|
-
elif isinstance(raw_expr, np.ndarray):
|
|
573
|
-
select_list.append((exprs.Expr.from_array(raw_expr), name))
|
|
574
|
-
else:
|
|
575
|
-
select_list.append((exprs.Literal(raw_expr), name))
|
|
576
|
-
expr = select_list[-1][0]
|
|
581
|
+
expr = exprs.Expr.from_object(raw_expr)
|
|
582
|
+
if expr is None:
|
|
583
|
+
raise excs.Error(f'Invalid expression: {raw_expr}')
|
|
577
584
|
if expr.col_type.is_invalid_type():
|
|
578
585
|
raise excs.Error(f'Invalid type: {raw_expr}')
|
|
579
586
|
if not expr.is_bound_by(self._from_clause.tbls):
|
|
580
587
|
raise excs.Error(
|
|
581
588
|
f"Expression '{expr}' cannot be evaluated in the context of this query's tables "
|
|
582
|
-
f
|
|
589
|
+
f'({",".join(tbl.tbl_name() for tbl in self._from_clause.tbls)})'
|
|
590
|
+
)
|
|
591
|
+
select_list.append((expr, name))
|
|
583
592
|
|
|
584
593
|
# check user provided names do not conflict among themselves or with auto-generated ones
|
|
585
594
|
seen: set[str] = set()
|
|
@@ -640,7 +649,7 @@ class DataFrame:
|
|
|
640
649
|
)
|
|
641
650
|
|
|
642
651
|
def _create_join_predicate(
|
|
643
|
-
|
|
652
|
+
self, other: catalog.TableVersionPath, on: Union[exprs.Expr, Sequence[exprs.ColumnRef]]
|
|
644
653
|
) -> exprs.Expr:
|
|
645
654
|
"""Verifies user-specified 'on' argument and converts it into a join predicate."""
|
|
646
655
|
col_refs: list[exprs.ColumnRef] = []
|
|
@@ -656,14 +665,12 @@ class DataFrame:
|
|
|
656
665
|
return on
|
|
657
666
|
else:
|
|
658
667
|
if not isinstance(on, Sequence) or len(on) == 0:
|
|
659
|
-
raise excs.Error(
|
|
660
|
-
f"'on': must be a sequence of column references or a boolean expression")
|
|
668
|
+
raise excs.Error(f"'on': must be a sequence of column references or a boolean expression")
|
|
661
669
|
|
|
662
670
|
assert isinstance(on, Sequence)
|
|
663
671
|
for col_ref in on:
|
|
664
672
|
if not isinstance(col_ref, exprs.ColumnRef):
|
|
665
|
-
raise excs.Error(
|
|
666
|
-
f"'on': must be a sequence of column references or a boolean expression")
|
|
673
|
+
raise excs.Error(f"'on': must be a sequence of column references or a boolean expression")
|
|
667
674
|
if not col_ref.is_bound_by(joined_tbls):
|
|
668
675
|
raise excs.Error(f"'on': expression cannot be evaluated in the context of the joined tables: {col_ref}")
|
|
669
676
|
col_refs.append(col_ref)
|
|
@@ -693,8 +700,7 @@ class DataFrame:
|
|
|
693
700
|
lhs_col_ref = exprs.ColumnRef(col)
|
|
694
701
|
if lhs_col_ref is None:
|
|
695
702
|
tbl_names = [tbl.tbl_name() for tbl in self._from_clause.tbls]
|
|
696
|
-
raise excs.Error(
|
|
697
|
-
f"'on': column {col_ref.col.name!r} not found in any of: {' '.join(tbl_names)}")
|
|
703
|
+
raise excs.Error(f"'on': column {col_ref.col.name!r} not found in any of: {' '.join(tbl_names)}")
|
|
698
704
|
pred = exprs.Comparison(exprs.ComparisonOperator.EQ, lhs_col_ref, rhs_col_ref)
|
|
699
705
|
predicates.append(pred)
|
|
700
706
|
|
|
@@ -705,8 +711,10 @@ class DataFrame:
|
|
|
705
711
|
return exprs.CompoundPredicate(operator=exprs.LogicalOperator.AND, operands=predicates)
|
|
706
712
|
|
|
707
713
|
def join(
|
|
708
|
-
self,
|
|
709
|
-
|
|
714
|
+
self,
|
|
715
|
+
other: catalog.Table,
|
|
716
|
+
on: Optional[Union[exprs.Expr, Sequence[exprs.ColumnRef]]] = None,
|
|
717
|
+
how: plan.JoinType.LiteralType = 'inner',
|
|
710
718
|
) -> DataFrame:
|
|
711
719
|
"""
|
|
712
720
|
Join this DataFrame with a table.
|
|
@@ -766,16 +774,20 @@ class DataFrame:
|
|
|
766
774
|
join_clause = plan.JoinClause(join_type=plan.JoinType.validated(how, "'how'"), join_predicate=join_pred)
|
|
767
775
|
from_clause = plan.FromClause(
|
|
768
776
|
tbls=[*self._from_clause.tbls, other._tbl_version_path],
|
|
769
|
-
join_clauses=[*self._from_clause.join_clauses, join_clause]
|
|
777
|
+
join_clauses=[*self._from_clause.join_clauses, join_clause],
|
|
778
|
+
)
|
|
770
779
|
return DataFrame(
|
|
771
780
|
from_clause=from_clause,
|
|
772
|
-
select_list=self.select_list,
|
|
773
|
-
|
|
774
|
-
|
|
781
|
+
select_list=self.select_list,
|
|
782
|
+
where_clause=self.where_clause,
|
|
783
|
+
group_by_clause=self.group_by_clause,
|
|
784
|
+
grouping_tbl=self.grouping_tbl,
|
|
785
|
+
order_by_clause=self.order_by_clause,
|
|
786
|
+
limit=self.limit_val,
|
|
775
787
|
)
|
|
776
788
|
|
|
777
789
|
def group_by(self, *grouping_items: Any) -> DataFrame:
|
|
778
|
-
"""
|
|
790
|
+
"""Add a group-by clause to this DataFrame.
|
|
779
791
|
|
|
780
792
|
Variants:
|
|
781
793
|
- group_by(<base table>): group a component view by their respective base table rows
|
|
@@ -846,7 +858,7 @@ class DataFrame:
|
|
|
846
858
|
)
|
|
847
859
|
|
|
848
860
|
def order_by(self, *expr_list: exprs.Expr, asc: bool = True) -> DataFrame:
|
|
849
|
-
"""
|
|
861
|
+
"""Add an order-by clause to this DataFrame.
|
|
850
862
|
|
|
851
863
|
Args:
|
|
852
864
|
expr_list: expressions to order by
|
|
@@ -891,7 +903,7 @@ class DataFrame:
|
|
|
891
903
|
)
|
|
892
904
|
|
|
893
905
|
def limit(self, n: int) -> DataFrame:
|
|
894
|
-
"""
|
|
906
|
+
"""Limit the number of rows in the DataFrame.
|
|
895
907
|
|
|
896
908
|
Args:
|
|
897
909
|
n: Number of rows to select.
|
|
@@ -899,8 +911,10 @@ class DataFrame:
|
|
|
899
911
|
Returns:
|
|
900
912
|
A new DataFrame with the specified limited rows.
|
|
901
913
|
"""
|
|
902
|
-
|
|
903
|
-
|
|
914
|
+
assert n is not None
|
|
915
|
+
n = exprs.Expr.from_object(n)
|
|
916
|
+
if not n.col_type.is_int_type():
|
|
917
|
+
raise excs.Error(f'limit(): parameter must be of type int, instead of {n.col_type}')
|
|
904
918
|
return DataFrame(
|
|
905
919
|
from_clause=self._from_clause,
|
|
906
920
|
select_list=self.select_list,
|
|
@@ -912,7 +926,7 @@ class DataFrame:
|
|
|
912
926
|
)
|
|
913
927
|
|
|
914
928
|
def update(self, value_spec: dict[str, Any], cascade: bool = True) -> UpdateStatus:
|
|
915
|
-
"""
|
|
929
|
+
"""Update rows in the underlying table of the DataFrame.
|
|
916
930
|
|
|
917
931
|
Update rows in the table with the specified value_spec.
|
|
918
932
|
|
|
@@ -941,7 +955,7 @@ class DataFrame:
|
|
|
941
955
|
return self._first_tbl.tbl_version.update(value_spec, where=self.where_clause, cascade=cascade)
|
|
942
956
|
|
|
943
957
|
def delete(self) -> UpdateStatus:
|
|
944
|
-
"""
|
|
958
|
+
"""Delete rows from the underlying table of the DataFrame.
|
|
945
959
|
|
|
946
960
|
The delete operation is only allowed for DataFrames on base tables.
|
|
947
961
|
|
|
@@ -982,17 +996,20 @@ class DataFrame:
|
|
|
982
996
|
'_classname': 'DataFrame',
|
|
983
997
|
'from_clause': {
|
|
984
998
|
'tbls': [tbl.as_dict() for tbl in self._from_clause.tbls],
|
|
985
|
-
'join_clauses': [dataclasses.asdict(clause) for clause in self._from_clause.join_clauses]
|
|
999
|
+
'join_clauses': [dataclasses.asdict(clause) for clause in self._from_clause.join_clauses],
|
|
986
1000
|
},
|
|
987
|
-
'select_list':
|
|
988
|
-
|
|
1001
|
+
'select_list': [(e.as_dict(), name) for (e, name) in self.select_list]
|
|
1002
|
+
if self.select_list is not None
|
|
1003
|
+
else None,
|
|
989
1004
|
'where_clause': self.where_clause.as_dict() if self.where_clause is not None else None,
|
|
990
|
-
'group_by_clause':
|
|
991
|
-
|
|
1005
|
+
'group_by_clause': [e.as_dict() for e in self.group_by_clause]
|
|
1006
|
+
if self.group_by_clause is not None
|
|
1007
|
+
else None,
|
|
992
1008
|
'grouping_tbl': self.grouping_tbl.as_dict() if self.grouping_tbl is not None else None,
|
|
993
|
-
'order_by_clause':
|
|
994
|
-
|
|
995
|
-
|
|
1009
|
+
'order_by_clause': [(e.as_dict(), asc) for (e, asc) in self.order_by_clause]
|
|
1010
|
+
if self.order_by_clause is not None
|
|
1011
|
+
else None,
|
|
1012
|
+
'limit_val': self.limit_val.as_dict() if self.limit_val is not None else None,
|
|
996
1013
|
}
|
|
997
1014
|
return d
|
|
998
1015
|
|
|
@@ -1001,21 +1018,29 @@ class DataFrame:
|
|
|
1001
1018
|
tbls = [catalog.TableVersionPath.from_dict(tbl_dict) for tbl_dict in d['from_clause']['tbls']]
|
|
1002
1019
|
join_clauses = [plan.JoinClause(**clause_dict) for clause_dict in d['from_clause']['join_clauses']]
|
|
1003
1020
|
from_clause = plan.FromClause(tbls=tbls, join_clauses=join_clauses)
|
|
1004
|
-
select_list =
|
|
1005
|
-
if d['select_list'] is not None else None
|
|
1006
|
-
|
|
1007
|
-
|
|
1008
|
-
group_by_clause =
|
|
1009
|
-
if d['group_by_clause'] is not None else None
|
|
1010
|
-
|
|
1011
|
-
|
|
1012
|
-
order_by_clause =
|
|
1013
|
-
|
|
1014
|
-
|
|
1021
|
+
select_list = (
|
|
1022
|
+
[(exprs.Expr.from_dict(e), name) for e, name in d['select_list']] if d['select_list'] is not None else None
|
|
1023
|
+
)
|
|
1024
|
+
where_clause = exprs.Expr.from_dict(d['where_clause']) if d['where_clause'] is not None else None
|
|
1025
|
+
group_by_clause = (
|
|
1026
|
+
[exprs.Expr.from_dict(e) for e in d['group_by_clause']] if d['group_by_clause'] is not None else None
|
|
1027
|
+
)
|
|
1028
|
+
grouping_tbl = catalog.TableVersion.from_dict(d['grouping_tbl']) if d['grouping_tbl'] is not None else None
|
|
1029
|
+
order_by_clause = (
|
|
1030
|
+
[(exprs.Expr.from_dict(e), asc) for e, asc in d['order_by_clause']]
|
|
1031
|
+
if d['order_by_clause'] is not None
|
|
1032
|
+
else None
|
|
1033
|
+
)
|
|
1034
|
+
limit_val = exprs.Expr.from_dict(d['limit_val']) if d['limit_val'] is not None else None
|
|
1015
1035
|
return DataFrame(
|
|
1016
|
-
from_clause=from_clause,
|
|
1017
|
-
|
|
1018
|
-
|
|
1036
|
+
from_clause=from_clause,
|
|
1037
|
+
select_list=select_list,
|
|
1038
|
+
where_clause=where_clause,
|
|
1039
|
+
group_by_clause=group_by_clause,
|
|
1040
|
+
grouping_tbl=grouping_tbl,
|
|
1041
|
+
order_by_clause=order_by_clause,
|
|
1042
|
+
limit=limit_val,
|
|
1043
|
+
)
|
|
1019
1044
|
|
|
1020
1045
|
def _hash_result_set(self) -> str:
|
|
1021
1046
|
"""Return a hash that changes when the result set changes."""
|