pixeltable 0.3.2__py3-none-any.whl → 0.3.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__init__.py +64 -11
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/__init__.py +1 -1
- pixeltable/catalog/catalog.py +50 -27
- pixeltable/catalog/column.py +27 -11
- pixeltable/catalog/dir.py +6 -4
- pixeltable/catalog/globals.py +8 -1
- pixeltable/catalog/insertable_table.py +22 -12
- pixeltable/catalog/named_function.py +10 -6
- pixeltable/catalog/path.py +3 -2
- pixeltable/catalog/path_dict.py +8 -6
- pixeltable/catalog/schema_object.py +2 -1
- pixeltable/catalog/table.py +121 -101
- pixeltable/catalog/table_version.py +291 -142
- pixeltable/catalog/table_version_path.py +8 -5
- pixeltable/catalog/view.py +67 -26
- pixeltable/dataframe.py +102 -72
- pixeltable/env.py +20 -21
- pixeltable/exec/__init__.py +2 -2
- pixeltable/exec/aggregation_node.py +10 -4
- pixeltable/exec/cache_prefetch_node.py +5 -3
- pixeltable/exec/component_iteration_node.py +9 -8
- pixeltable/exec/data_row_batch.py +21 -10
- pixeltable/exec/exec_context.py +10 -3
- pixeltable/exec/exec_node.py +23 -12
- pixeltable/exec/expr_eval/evaluators.py +13 -7
- pixeltable/exec/expr_eval/expr_eval_node.py +24 -15
- pixeltable/exec/expr_eval/globals.py +30 -7
- pixeltable/exec/expr_eval/row_buffer.py +5 -6
- pixeltable/exec/expr_eval/schedulers.py +151 -31
- pixeltable/exec/in_memory_data_node.py +8 -7
- pixeltable/exec/row_update_node.py +15 -5
- pixeltable/exec/sql_node.py +56 -27
- pixeltable/exprs/__init__.py +2 -2
- pixeltable/exprs/arithmetic_expr.py +57 -26
- pixeltable/exprs/array_slice.py +1 -1
- pixeltable/exprs/column_property_ref.py +2 -1
- pixeltable/exprs/column_ref.py +20 -15
- pixeltable/exprs/comparison.py +6 -2
- pixeltable/exprs/compound_predicate.py +1 -3
- pixeltable/exprs/data_row.py +2 -2
- pixeltable/exprs/expr.py +101 -72
- pixeltable/exprs/expr_dict.py +2 -1
- pixeltable/exprs/expr_set.py +3 -1
- pixeltable/exprs/function_call.py +39 -41
- pixeltable/exprs/globals.py +1 -0
- pixeltable/exprs/in_predicate.py +2 -2
- pixeltable/exprs/inline_expr.py +20 -17
- pixeltable/exprs/json_mapper.py +4 -2
- pixeltable/exprs/json_path.py +12 -18
- pixeltable/exprs/literal.py +5 -9
- pixeltable/exprs/method_ref.py +1 -0
- pixeltable/exprs/object_ref.py +1 -1
- pixeltable/exprs/row_builder.py +32 -17
- pixeltable/exprs/rowid_ref.py +14 -5
- pixeltable/exprs/similarity_expr.py +11 -6
- pixeltable/exprs/sql_element_cache.py +1 -1
- pixeltable/exprs/type_cast.py +24 -9
- pixeltable/ext/__init__.py +1 -0
- pixeltable/ext/functions/__init__.py +1 -0
- pixeltable/ext/functions/whisperx.py +2 -2
- pixeltable/ext/functions/yolox.py +11 -11
- pixeltable/func/aggregate_function.py +17 -13
- pixeltable/func/callable_function.py +6 -6
- pixeltable/func/expr_template_function.py +15 -14
- pixeltable/func/function.py +16 -16
- pixeltable/func/function_registry.py +11 -8
- pixeltable/func/globals.py +4 -2
- pixeltable/func/query_template_function.py +12 -13
- pixeltable/func/signature.py +18 -9
- pixeltable/func/tools.py +10 -17
- pixeltable/func/udf.py +106 -11
- pixeltable/functions/__init__.py +21 -2
- pixeltable/functions/anthropic.py +16 -12
- pixeltable/functions/fireworks.py +63 -5
- pixeltable/functions/gemini.py +13 -3
- pixeltable/functions/globals.py +18 -6
- pixeltable/functions/huggingface.py +20 -38
- pixeltable/functions/image.py +7 -3
- pixeltable/functions/json.py +1 -0
- pixeltable/functions/llama_cpp.py +1 -4
- pixeltable/functions/mistralai.py +31 -20
- pixeltable/functions/ollama.py +4 -18
- pixeltable/functions/openai.py +201 -108
- pixeltable/functions/replicate.py +11 -10
- pixeltable/functions/string.py +70 -7
- pixeltable/functions/timestamp.py +21 -8
- pixeltable/functions/together.py +66 -52
- pixeltable/functions/video.py +1 -0
- pixeltable/functions/vision.py +14 -11
- pixeltable/functions/whisper.py +2 -1
- pixeltable/globals.py +60 -26
- pixeltable/index/__init__.py +1 -1
- pixeltable/index/btree.py +5 -3
- pixeltable/index/embedding_index.py +15 -14
- pixeltable/io/__init__.py +1 -1
- pixeltable/io/external_store.py +30 -25
- pixeltable/io/fiftyone.py +6 -14
- pixeltable/io/globals.py +33 -27
- pixeltable/io/hf_datasets.py +2 -1
- pixeltable/io/label_studio.py +77 -68
- pixeltable/io/pandas.py +33 -9
- pixeltable/io/parquet.py +9 -12
- pixeltable/iterators/__init__.py +1 -0
- pixeltable/iterators/audio.py +205 -0
- pixeltable/iterators/document.py +19 -8
- pixeltable/iterators/image.py +6 -24
- pixeltable/iterators/string.py +3 -6
- pixeltable/iterators/video.py +1 -7
- pixeltable/metadata/__init__.py +7 -1
- pixeltable/metadata/converters/convert_10.py +2 -2
- pixeltable/metadata/converters/convert_15.py +1 -5
- pixeltable/metadata/converters/convert_16.py +2 -4
- pixeltable/metadata/converters/convert_17.py +2 -4
- pixeltable/metadata/converters/convert_18.py +2 -4
- pixeltable/metadata/converters/convert_19.py +2 -5
- pixeltable/metadata/converters/convert_20.py +1 -4
- pixeltable/metadata/converters/convert_21.py +4 -6
- pixeltable/metadata/converters/convert_22.py +1 -0
- pixeltable/metadata/converters/convert_23.py +5 -5
- pixeltable/metadata/converters/convert_24.py +12 -13
- pixeltable/metadata/converters/convert_26.py +23 -0
- pixeltable/metadata/converters/util.py +3 -4
- pixeltable/metadata/notes.py +1 -0
- pixeltable/metadata/schema.py +13 -2
- pixeltable/plan.py +173 -98
- pixeltable/store.py +42 -26
- pixeltable/type_system.py +62 -54
- pixeltable/utils/arrow.py +1 -2
- pixeltable/utils/coco.py +16 -17
- pixeltable/utils/code.py +1 -1
- pixeltable/utils/console_output.py +6 -3
- pixeltable/utils/description_helper.py +7 -7
- pixeltable/utils/documents.py +3 -1
- pixeltable/utils/filecache.py +12 -7
- pixeltable/utils/http_server.py +9 -8
- pixeltable/utils/media_store.py +2 -1
- pixeltable/utils/pytorch.py +11 -14
- pixeltable/utils/s3.py +1 -0
- pixeltable/utils/sql.py +1 -0
- pixeltable/utils/transactional_directory.py +2 -2
- {pixeltable-0.3.2.dist-info → pixeltable-0.3.3.dist-info}/METADATA +6 -8
- pixeltable-0.3.3.dist-info/RECORD +163 -0
- pixeltable-0.3.2.dist-info/RECORD +0 -161
- {pixeltable-0.3.2.dist-info → pixeltable-0.3.3.dist-info}/LICENSE +0 -0
- {pixeltable-0.3.2.dist-info → pixeltable-0.3.3.dist-info}/WHEEL +0 -0
- {pixeltable-0.3.2.dist-info → pixeltable-0.3.3.dist-info}/entry_points.txt +0 -0
|
@@ -84,6 +84,7 @@ class TableVersionPath:
|
|
|
84
84
|
def get_column_ref(self, col_name: str) -> exprs.ColumnRef:
|
|
85
85
|
"""Return a ColumnRef for the given column name."""
|
|
86
86
|
from pixeltable.exprs import ColumnRef
|
|
87
|
+
|
|
87
88
|
if col_name not in self.tbl_version.cols_by_name:
|
|
88
89
|
if self.base is None:
|
|
89
90
|
raise AttributeError(f'Column {col_name} unknown')
|
|
@@ -121,11 +122,13 @@ class TableVersionPath:
|
|
|
121
122
|
return None
|
|
122
123
|
|
|
123
124
|
def has_column(self, col: Column, include_bases: bool = True) -> bool:
|
|
124
|
-
"""Return True if this table has the given column.
|
|
125
|
-
"""
|
|
125
|
+
"""Return True if this table has the given column."""
|
|
126
126
|
assert col.tbl is not None
|
|
127
|
-
if
|
|
128
|
-
|
|
127
|
+
if (
|
|
128
|
+
col.tbl.id == self.tbl_version.id
|
|
129
|
+
and col.tbl.effective_version == self.tbl_version.effective_version
|
|
130
|
+
and col.id in self.tbl_version.cols_by_id
|
|
131
|
+
):
|
|
129
132
|
# the column is visible in this table version
|
|
130
133
|
return True
|
|
131
134
|
elif self.base is not None and include_bases:
|
|
@@ -136,7 +139,7 @@ class TableVersionPath:
|
|
|
136
139
|
def as_dict(self) -> dict:
|
|
137
140
|
return {
|
|
138
141
|
'tbl_version': self.tbl_version.as_dict(),
|
|
139
|
-
'base': self.base.as_dict() if self.base is not None else None
|
|
142
|
+
'base': self.base.as_dict() if self.base is not None else None,
|
|
140
143
|
}
|
|
141
144
|
|
|
142
145
|
@classmethod
|
pixeltable/catalog/view.py
CHANGED
|
@@ -35,9 +35,10 @@ class View(Table):
|
|
|
35
35
|
The exception is a snapshot view without a predicate and without additional columns: in that case, the view
|
|
36
36
|
is simply a reference to a specific set of base versions.
|
|
37
37
|
"""
|
|
38
|
+
|
|
38
39
|
def __init__(
|
|
39
|
-
|
|
40
|
-
|
|
40
|
+
self, id: UUID, dir_id: UUID, name: str, tbl_version_path: TableVersionPath, base_id: UUID, snapshot_only: bool
|
|
41
|
+
):
|
|
41
42
|
super().__init__(id, dir_id, name, tbl_version_path)
|
|
42
43
|
assert base_id in catalog.Catalog.get().tbl_dependents
|
|
43
44
|
self._base_id = base_id # keep a reference to the base Table ID, so that we can keep track of its dependents
|
|
@@ -49,10 +50,18 @@ class View(Table):
|
|
|
49
50
|
|
|
50
51
|
@classmethod
|
|
51
52
|
def _create(
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
53
|
+
cls,
|
|
54
|
+
dir_id: UUID,
|
|
55
|
+
name: str,
|
|
56
|
+
base: TableVersionPath,
|
|
57
|
+
additional_columns: dict[str, Any],
|
|
58
|
+
predicate: Optional['pxt.exprs.Expr'],
|
|
59
|
+
is_snapshot: bool,
|
|
60
|
+
num_retained_versions: int,
|
|
61
|
+
comment: str,
|
|
62
|
+
media_validation: MediaValidation,
|
|
63
|
+
iterator_cls: Optional[type[ComponentIterator]],
|
|
64
|
+
iterator_args: Optional[dict],
|
|
56
65
|
) -> View:
|
|
57
66
|
columns = cls._create_columns(additional_columns)
|
|
58
67
|
cls._verify_schema(columns)
|
|
@@ -71,7 +80,8 @@ class View(Table):
|
|
|
71
80
|
# make sure that the value can be computed in the context of the base
|
|
72
81
|
if col.value_expr is not None and not col.value_expr.is_bound_by([base]):
|
|
73
82
|
raise excs.Error(
|
|
74
|
-
f'Column {col.name}: value expression cannot be computed in the context of the base {base.tbl_name()}'
|
|
83
|
+
f'Column {col.name}: value expression cannot be computed in the context of the base {base.tbl_name()}'
|
|
84
|
+
)
|
|
75
85
|
|
|
76
86
|
if iterator_cls is not None:
|
|
77
87
|
assert iterator_args is not None
|
|
@@ -92,6 +102,7 @@ class View(Table):
|
|
|
92
102
|
]
|
|
93
103
|
sig = func.Signature(ts.InvalidType(), params)
|
|
94
104
|
from pixeltable.exprs import FunctionCall
|
|
105
|
+
|
|
95
106
|
FunctionCall.normalize_args(iterator_cls.__name__, sig, bound_args)
|
|
96
107
|
except TypeError as e:
|
|
97
108
|
raise excs.Error(f'Cannot instantiate iterator with given arguments: {e}')
|
|
@@ -102,22 +113,28 @@ class View(Table):
|
|
|
102
113
|
# stored=False: it is not stored separately (it's already stored as part of the rowid)
|
|
103
114
|
iterator_cols = [Column(_POS_COLUMN_NAME, ts.IntType(), stored=False)]
|
|
104
115
|
output_dict, unstored_cols = iterator_cls.output_schema(**bound_args)
|
|
105
|
-
iterator_cols.extend(
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
116
|
+
iterator_cols.extend(
|
|
117
|
+
[
|
|
118
|
+
Column(col_name, col_type, stored=col_name not in unstored_cols)
|
|
119
|
+
for col_name, col_type in output_dict.items()
|
|
120
|
+
]
|
|
121
|
+
)
|
|
109
122
|
|
|
110
123
|
iterator_col_names = {col.name for col in iterator_cols}
|
|
111
124
|
for col in columns:
|
|
112
125
|
if col.name in iterator_col_names:
|
|
113
|
-
raise excs.Error(
|
|
126
|
+
raise excs.Error(
|
|
127
|
+
f'Duplicate name: column {col.name} is already present in the iterator output schema'
|
|
128
|
+
)
|
|
114
129
|
columns = iterator_cols + columns
|
|
115
130
|
|
|
116
131
|
with orm.Session(Env.get().engine, future=True) as session:
|
|
117
132
|
from pixeltable.exprs import InlineDict
|
|
133
|
+
|
|
118
134
|
iterator_args_expr: exprs.Expr = InlineDict(iterator_args) if iterator_args is not None else None
|
|
119
|
-
iterator_class_fqn =
|
|
120
|
-
else None
|
|
135
|
+
iterator_class_fqn = (
|
|
136
|
+
f'{iterator_cls.__module__}.{iterator_cls.__name__}' if iterator_cls is not None else None
|
|
137
|
+
)
|
|
121
138
|
base_version_path = cls._get_snapshot_path(base) if is_snapshot else base
|
|
122
139
|
base_versions = [
|
|
123
140
|
(tbl_version.id.hex, tbl_version.version if is_snapshot or tbl_version.is_snapshot else None)
|
|
@@ -127,35 +144,53 @@ class View(Table):
|
|
|
127
144
|
# if this is a snapshot, we need to retarget all exprs to the snapshot tbl versions
|
|
128
145
|
if is_snapshot:
|
|
129
146
|
predicate = predicate.retarget(base_version_path) if predicate is not None else None
|
|
130
|
-
iterator_args_expr =
|
|
131
|
-
if iterator_args_expr is not None else None
|
|
147
|
+
iterator_args_expr = (
|
|
148
|
+
iterator_args_expr.retarget(base_version_path) if iterator_args_expr is not None else None
|
|
149
|
+
)
|
|
132
150
|
for col in columns:
|
|
133
151
|
if col.value_expr is not None:
|
|
134
152
|
col.set_value_expr(col.value_expr.retarget(base_version_path))
|
|
135
153
|
|
|
136
154
|
view_md = md_schema.ViewMd(
|
|
137
|
-
is_snapshot=is_snapshot,
|
|
155
|
+
is_snapshot=is_snapshot,
|
|
156
|
+
predicate=predicate.as_dict() if predicate is not None else None,
|
|
138
157
|
base_versions=base_versions,
|
|
139
158
|
iterator_class_fqn=iterator_class_fqn,
|
|
140
|
-
iterator_args=iterator_args_expr.as_dict() if iterator_args_expr is not None else None
|
|
159
|
+
iterator_args=iterator_args_expr.as_dict() if iterator_args_expr is not None else None,
|
|
160
|
+
)
|
|
141
161
|
|
|
142
162
|
id, tbl_version = TableVersion.create(
|
|
143
|
-
session,
|
|
144
|
-
|
|
163
|
+
session,
|
|
164
|
+
dir_id,
|
|
165
|
+
name,
|
|
166
|
+
columns,
|
|
167
|
+
num_retained_versions,
|
|
168
|
+
comment,
|
|
169
|
+
media_validation=media_validation,
|
|
170
|
+
base_path=base_version_path,
|
|
171
|
+
view_md=view_md,
|
|
172
|
+
)
|
|
145
173
|
if tbl_version is None:
|
|
146
174
|
# this is purely a snapshot: we use the base's tbl version path
|
|
147
175
|
view = cls(id, dir_id, name, base_version_path, base.tbl_id(), snapshot_only=True)
|
|
148
176
|
_logger.info(f'created snapshot {name}')
|
|
149
177
|
else:
|
|
150
178
|
view = cls(
|
|
151
|
-
id,
|
|
152
|
-
|
|
179
|
+
id,
|
|
180
|
+
dir_id,
|
|
181
|
+
name,
|
|
182
|
+
TableVersionPath(tbl_version, base=base_version_path),
|
|
183
|
+
base.tbl_id(),
|
|
184
|
+
snapshot_only=False,
|
|
185
|
+
)
|
|
153
186
|
_logger.info(f'Created view `{name}`, id={tbl_version.id}')
|
|
154
187
|
|
|
155
188
|
from pixeltable.plan import Planner
|
|
189
|
+
|
|
156
190
|
plan, num_values_per_row = Planner.create_view_load_plan(view._tbl_version_path)
|
|
157
191
|
num_rows, num_excs, cols_with_excs = tbl_version.store_tbl.insert_rows(
|
|
158
|
-
plan, session.connection(), v_min=tbl_version.version
|
|
192
|
+
plan, session.connection(), v_min=tbl_version.version
|
|
193
|
+
)
|
|
159
194
|
Env.get().console_logger.info(f'Created view `{name}` with {num_rows} rows, {num_excs} exceptions.')
|
|
160
195
|
|
|
161
196
|
session.commit()
|
|
@@ -188,7 +223,8 @@ class View(Table):
|
|
|
188
223
|
|
|
189
224
|
return TableVersionPath(
|
|
190
225
|
tbl_version,
|
|
191
|
-
base=cls._get_snapshot_path(tbl_version_path.base) if tbl_version_path.base is not None else None
|
|
226
|
+
base=cls._get_snapshot_path(tbl_version_path.base) if tbl_version_path.base is not None else None,
|
|
227
|
+
)
|
|
192
228
|
|
|
193
229
|
def _drop(self) -> None:
|
|
194
230
|
cat = catalog.Catalog.get()
|
|
@@ -216,8 +252,13 @@ class View(Table):
|
|
|
216
252
|
return md
|
|
217
253
|
|
|
218
254
|
def insert(
|
|
219
|
-
|
|
220
|
-
|
|
255
|
+
self,
|
|
256
|
+
rows: Optional[Iterable[dict[str, Any]]] = None,
|
|
257
|
+
/,
|
|
258
|
+
*,
|
|
259
|
+
print_stats: bool = False,
|
|
260
|
+
on_error: Literal['abort', 'ignore'] = 'abort',
|
|
261
|
+
**kwargs: Any,
|
|
221
262
|
) -> UpdateStatus:
|
|
222
263
|
raise excs.Error(f'{self._display_name()} {self._name!r}: cannot insert into view')
|
|
223
264
|
|
pixeltable/dataframe.py
CHANGED
|
@@ -8,7 +8,7 @@ import json
|
|
|
8
8
|
import logging
|
|
9
9
|
import traceback
|
|
10
10
|
from pathlib import Path
|
|
11
|
-
from typing import TYPE_CHECKING, Any, Callable, Hashable, Iterator, Optional, Sequence, Union
|
|
11
|
+
from typing import TYPE_CHECKING, Any, AsyncIterator, Callable, Hashable, Iterator, NoReturn, Optional, Sequence, Union
|
|
12
12
|
|
|
13
13
|
import numpy as np
|
|
14
14
|
import pandas as pd
|
|
@@ -138,7 +138,7 @@ class DataFrame:
|
|
|
138
138
|
group_by_clause: Optional[list[exprs.Expr]]
|
|
139
139
|
grouping_tbl: Optional[catalog.TableVersion]
|
|
140
140
|
order_by_clause: Optional[list[tuple[exprs.Expr, bool]]]
|
|
141
|
-
limit_val: Optional[
|
|
141
|
+
limit_val: Optional[exprs.Expr]
|
|
142
142
|
|
|
143
143
|
def __init__(
|
|
144
144
|
self,
|
|
@@ -148,7 +148,7 @@ class DataFrame:
|
|
|
148
148
|
group_by_clause: Optional[list[exprs.Expr]] = None,
|
|
149
149
|
grouping_tbl: Optional[catalog.TableVersion] = None,
|
|
150
150
|
order_by_clause: Optional[list[tuple[exprs.Expr, bool]]] = None, # list[(expr, asc)]
|
|
151
|
-
limit: Optional[
|
|
151
|
+
limit: Optional[exprs.Expr] = None,
|
|
152
152
|
):
|
|
153
153
|
self._from_clause = from_clause
|
|
154
154
|
|
|
@@ -171,9 +171,7 @@ class DataFrame:
|
|
|
171
171
|
|
|
172
172
|
@classmethod
|
|
173
173
|
def _normalize_select_list(
|
|
174
|
-
cls,
|
|
175
|
-
tbls: list[catalog.TableVersionPath],
|
|
176
|
-
select_list: Optional[list[tuple[exprs.Expr, Optional[str]]]],
|
|
174
|
+
cls, tbls: list[catalog.TableVersionPath], select_list: Optional[list[tuple[exprs.Expr, Optional[str]]]]
|
|
177
175
|
) -> tuple[list[exprs.Expr], list[str]]:
|
|
178
176
|
"""
|
|
179
177
|
Expand select list information with all columns and their names
|
|
@@ -227,6 +225,8 @@ class DataFrame:
|
|
|
227
225
|
all_exprs.extend(self.group_by_clause)
|
|
228
226
|
if self.order_by_clause is not None:
|
|
229
227
|
all_exprs.extend([expr for expr, _ in self.order_by_clause])
|
|
228
|
+
if self.limit_val is not None:
|
|
229
|
+
all_exprs.append(self.limit_val)
|
|
230
230
|
vars = exprs.Expr.list_subexprs(all_exprs, expr_class=exprs.Variable)
|
|
231
231
|
unique_vars: dict[str, exprs.Variable] = {}
|
|
232
232
|
for var in vars:
|
|
@@ -301,7 +301,7 @@ class DataFrame:
|
|
|
301
301
|
where_clause=self.where_clause,
|
|
302
302
|
group_by_clause=group_by_clause,
|
|
303
303
|
order_by_clause=self.order_by_clause if self.order_by_clause is not None else [],
|
|
304
|
-
limit=self.limit_val
|
|
304
|
+
limit=self.limit_val,
|
|
305
305
|
)
|
|
306
306
|
|
|
307
307
|
def _has_joins(self) -> bool:
|
|
@@ -369,8 +369,12 @@ class DataFrame:
|
|
|
369
369
|
select_list_exprs = copy.deepcopy(self._select_list_exprs)
|
|
370
370
|
where_clause = copy.deepcopy(self.where_clause)
|
|
371
371
|
group_by_clause = copy.deepcopy(self.group_by_clause)
|
|
372
|
-
order_by_exprs =
|
|
373
|
-
|
|
372
|
+
order_by_exprs = (
|
|
373
|
+
[copy.deepcopy(order_by_expr) for order_by_expr, _ in self.order_by_clause]
|
|
374
|
+
if self.order_by_clause is not None
|
|
375
|
+
else None
|
|
376
|
+
)
|
|
377
|
+
limit_val = copy.deepcopy(self.limit_val)
|
|
374
378
|
|
|
375
379
|
var_exprs: dict[exprs.Expr, exprs.Expr] = {}
|
|
376
380
|
vars = self._vars()
|
|
@@ -386,7 +390,7 @@ class DataFrame:
|
|
|
386
390
|
|
|
387
391
|
exprs.Expr.list_substitute(select_list_exprs, var_exprs)
|
|
388
392
|
if where_clause is not None:
|
|
389
|
-
where_clause.substitute(var_exprs)
|
|
393
|
+
where_clause = where_clause.substitute(var_exprs)
|
|
390
394
|
if group_by_clause is not None:
|
|
391
395
|
exprs.Expr.list_substitute(group_by_clause, var_exprs)
|
|
392
396
|
if order_by_exprs is not None:
|
|
@@ -398,14 +402,23 @@ class DataFrame:
|
|
|
398
402
|
order_by_clause = [
|
|
399
403
|
(expr, asc) for expr, asc in zip(order_by_exprs, [asc for _, asc in self.order_by_clause])
|
|
400
404
|
]
|
|
405
|
+
if limit_val is not None:
|
|
406
|
+
limit_val = limit_val.substitute(var_exprs)
|
|
407
|
+
if limit_val is not None and not isinstance(limit_val, exprs.Literal):
|
|
408
|
+
raise excs.Error(f'limit(): parameter must be a constant, but got {limit_val}')
|
|
401
409
|
|
|
402
410
|
return DataFrame(
|
|
403
|
-
from_clause=self._from_clause,
|
|
404
|
-
|
|
405
|
-
|
|
411
|
+
from_clause=self._from_clause,
|
|
412
|
+
select_list=select_list,
|
|
413
|
+
where_clause=where_clause,
|
|
414
|
+
group_by_clause=group_by_clause,
|
|
415
|
+
grouping_tbl=self.grouping_tbl,
|
|
416
|
+
order_by_clause=order_by_clause,
|
|
417
|
+
limit=limit_val,
|
|
418
|
+
)
|
|
406
419
|
|
|
407
420
|
def _raise_expr_eval_err(self, e: excs.ExprEvalError) -> NoReturn:
|
|
408
|
-
msg = f'In row {e.row_num} the {e.expr_msg} encountered exception
|
|
421
|
+
msg = f'In row {e.row_num} the {e.expr_msg} encountered exception {type(e.exc).__name__}:\n{str(e.exc)}'
|
|
409
422
|
if len(e.input_vals) > 0:
|
|
410
423
|
input_msgs = [
|
|
411
424
|
f"'{d}' = {d.col_type.print_value(e.input_vals[i])}" for i, d in enumerate(e.expr.dependencies())
|
|
@@ -419,7 +432,7 @@ class DataFrame:
|
|
|
419
432
|
nl = '\n'
|
|
420
433
|
# [-1:1:-1]: leave out entries 0 and 1 and reverse order, so that the most recent frame is at the top
|
|
421
434
|
msg += f'\nStack:\n{nl.join(stack_trace[-1:1:-1])}'
|
|
422
|
-
raise excs.Error(msg)
|
|
435
|
+
raise excs.Error(msg) from e
|
|
423
436
|
|
|
424
437
|
def _output_row_iterator(self, conn: Optional[sql.engine.Connection] = None) -> Iterator[list]:
|
|
425
438
|
try:
|
|
@@ -438,10 +451,7 @@ class DataFrame:
|
|
|
438
451
|
|
|
439
452
|
async def _acollect(self, conn: sql.engine.Connection) -> DataFrameResultSet:
|
|
440
453
|
try:
|
|
441
|
-
result = [
|
|
442
|
-
[row[e.slot_idx] for e in self._select_list_exprs]
|
|
443
|
-
async for row in self._aexec(conn)
|
|
444
|
-
]
|
|
454
|
+
result = [[row[e.slot_idx] for e in self._select_list_exprs] async for row in self._aexec(conn)]
|
|
445
455
|
return DataFrameResultSet(result, self.schema)
|
|
446
456
|
except excs.ExprEvalError as e:
|
|
447
457
|
self._raise_expr_eval_err(e)
|
|
@@ -471,14 +481,16 @@ class DataFrame:
|
|
|
471
481
|
return helper
|
|
472
482
|
|
|
473
483
|
def _col_descriptor(self) -> pd.DataFrame:
|
|
474
|
-
return pd.DataFrame(
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
484
|
+
return pd.DataFrame(
|
|
485
|
+
[
|
|
486
|
+
{
|
|
487
|
+
'Name': name,
|
|
488
|
+
'Type': expr.col_type._to_str(as_schema=True),
|
|
489
|
+
'Expression': expr.display_str(inline=False),
|
|
490
|
+
}
|
|
491
|
+
for name, expr in zip(self.schema.keys(), self._select_list_exprs)
|
|
492
|
+
]
|
|
493
|
+
)
|
|
482
494
|
|
|
483
495
|
def _query_descriptor(self) -> pd.DataFrame:
|
|
484
496
|
heading_vals: list[str] = []
|
|
@@ -500,7 +512,7 @@ class DataFrame:
|
|
|
500
512
|
)
|
|
501
513
|
if self.limit_val is not None:
|
|
502
514
|
heading_vals.append('Limit')
|
|
503
|
-
info_vals.append(
|
|
515
|
+
info_vals.append(self.limit_val.display_str(inline=False))
|
|
504
516
|
assert len(heading_vals) == len(info_vals)
|
|
505
517
|
return pd.DataFrame(info_vals, index=heading_vals)
|
|
506
518
|
|
|
@@ -512,6 +524,7 @@ class DataFrame:
|
|
|
512
524
|
"""
|
|
513
525
|
if getattr(builtins, '__IPYTHON__', False):
|
|
514
526
|
from IPython.display import display
|
|
527
|
+
|
|
515
528
|
display(self._repr_html_())
|
|
516
529
|
else:
|
|
517
530
|
print(repr(self))
|
|
@@ -523,7 +536,7 @@ class DataFrame:
|
|
|
523
536
|
return self._descriptors().to_html()
|
|
524
537
|
|
|
525
538
|
def select(self, *items: Any, **named_items: Any) -> DataFrame:
|
|
526
|
-
"""
|
|
539
|
+
"""Select columns or expressions from the DataFrame.
|
|
527
540
|
|
|
528
541
|
Args:
|
|
529
542
|
items: expressions to be selected
|
|
@@ -579,7 +592,8 @@ class DataFrame:
|
|
|
579
592
|
if not expr.is_bound_by(self._from_clause.tbls):
|
|
580
593
|
raise excs.Error(
|
|
581
594
|
f"Expression '{expr}' cannot be evaluated in the context of this query's tables "
|
|
582
|
-
f
|
|
595
|
+
f'({",".join(tbl.tbl_name() for tbl in self._from_clause.tbls)})'
|
|
596
|
+
)
|
|
583
597
|
|
|
584
598
|
# check user provided names do not conflict among themselves or with auto-generated ones
|
|
585
599
|
seen: set[str] = set()
|
|
@@ -640,7 +654,7 @@ class DataFrame:
|
|
|
640
654
|
)
|
|
641
655
|
|
|
642
656
|
def _create_join_predicate(
|
|
643
|
-
|
|
657
|
+
self, other: catalog.TableVersionPath, on: Union[exprs.Expr, Sequence[exprs.ColumnRef]]
|
|
644
658
|
) -> exprs.Expr:
|
|
645
659
|
"""Verifies user-specified 'on' argument and converts it into a join predicate."""
|
|
646
660
|
col_refs: list[exprs.ColumnRef] = []
|
|
@@ -656,14 +670,12 @@ class DataFrame:
|
|
|
656
670
|
return on
|
|
657
671
|
else:
|
|
658
672
|
if not isinstance(on, Sequence) or len(on) == 0:
|
|
659
|
-
raise excs.Error(
|
|
660
|
-
f"'on': must be a sequence of column references or a boolean expression")
|
|
673
|
+
raise excs.Error(f"'on': must be a sequence of column references or a boolean expression")
|
|
661
674
|
|
|
662
675
|
assert isinstance(on, Sequence)
|
|
663
676
|
for col_ref in on:
|
|
664
677
|
if not isinstance(col_ref, exprs.ColumnRef):
|
|
665
|
-
raise excs.Error(
|
|
666
|
-
f"'on': must be a sequence of column references or a boolean expression")
|
|
678
|
+
raise excs.Error(f"'on': must be a sequence of column references or a boolean expression")
|
|
667
679
|
if not col_ref.is_bound_by(joined_tbls):
|
|
668
680
|
raise excs.Error(f"'on': expression cannot be evaluated in the context of the joined tables: {col_ref}")
|
|
669
681
|
col_refs.append(col_ref)
|
|
@@ -693,8 +705,7 @@ class DataFrame:
|
|
|
693
705
|
lhs_col_ref = exprs.ColumnRef(col)
|
|
694
706
|
if lhs_col_ref is None:
|
|
695
707
|
tbl_names = [tbl.tbl_name() for tbl in self._from_clause.tbls]
|
|
696
|
-
raise excs.Error(
|
|
697
|
-
f"'on': column {col_ref.col.name!r} not found in any of: {' '.join(tbl_names)}")
|
|
708
|
+
raise excs.Error(f"'on': column {col_ref.col.name!r} not found in any of: {' '.join(tbl_names)}")
|
|
698
709
|
pred = exprs.Comparison(exprs.ComparisonOperator.EQ, lhs_col_ref, rhs_col_ref)
|
|
699
710
|
predicates.append(pred)
|
|
700
711
|
|
|
@@ -705,8 +716,10 @@ class DataFrame:
|
|
|
705
716
|
return exprs.CompoundPredicate(operator=exprs.LogicalOperator.AND, operands=predicates)
|
|
706
717
|
|
|
707
718
|
def join(
|
|
708
|
-
self,
|
|
709
|
-
|
|
719
|
+
self,
|
|
720
|
+
other: catalog.Table,
|
|
721
|
+
on: Optional[Union[exprs.Expr, Sequence[exprs.ColumnRef]]] = None,
|
|
722
|
+
how: plan.JoinType.LiteralType = 'inner',
|
|
710
723
|
) -> DataFrame:
|
|
711
724
|
"""
|
|
712
725
|
Join this DataFrame with a table.
|
|
@@ -766,16 +779,20 @@ class DataFrame:
|
|
|
766
779
|
join_clause = plan.JoinClause(join_type=plan.JoinType.validated(how, "'how'"), join_predicate=join_pred)
|
|
767
780
|
from_clause = plan.FromClause(
|
|
768
781
|
tbls=[*self._from_clause.tbls, other._tbl_version_path],
|
|
769
|
-
join_clauses=[*self._from_clause.join_clauses, join_clause]
|
|
782
|
+
join_clauses=[*self._from_clause.join_clauses, join_clause],
|
|
783
|
+
)
|
|
770
784
|
return DataFrame(
|
|
771
785
|
from_clause=from_clause,
|
|
772
|
-
select_list=self.select_list,
|
|
773
|
-
|
|
774
|
-
|
|
786
|
+
select_list=self.select_list,
|
|
787
|
+
where_clause=self.where_clause,
|
|
788
|
+
group_by_clause=self.group_by_clause,
|
|
789
|
+
grouping_tbl=self.grouping_tbl,
|
|
790
|
+
order_by_clause=self.order_by_clause,
|
|
791
|
+
limit=self.limit_val,
|
|
775
792
|
)
|
|
776
793
|
|
|
777
794
|
def group_by(self, *grouping_items: Any) -> DataFrame:
|
|
778
|
-
"""
|
|
795
|
+
"""Add a group-by clause to this DataFrame.
|
|
779
796
|
|
|
780
797
|
Variants:
|
|
781
798
|
- group_by(<base table>): group a component view by their respective base table rows
|
|
@@ -846,7 +863,7 @@ class DataFrame:
|
|
|
846
863
|
)
|
|
847
864
|
|
|
848
865
|
def order_by(self, *expr_list: exprs.Expr, asc: bool = True) -> DataFrame:
|
|
849
|
-
"""
|
|
866
|
+
"""Add an order-by clause to this DataFrame.
|
|
850
867
|
|
|
851
868
|
Args:
|
|
852
869
|
expr_list: expressions to order by
|
|
@@ -891,7 +908,7 @@ class DataFrame:
|
|
|
891
908
|
)
|
|
892
909
|
|
|
893
910
|
def limit(self, n: int) -> DataFrame:
|
|
894
|
-
"""
|
|
911
|
+
"""Limit the number of rows in the DataFrame.
|
|
895
912
|
|
|
896
913
|
Args:
|
|
897
914
|
n: Number of rows to select.
|
|
@@ -899,8 +916,10 @@ class DataFrame:
|
|
|
899
916
|
Returns:
|
|
900
917
|
A new DataFrame with the specified limited rows.
|
|
901
918
|
"""
|
|
902
|
-
|
|
903
|
-
|
|
919
|
+
assert n is not None
|
|
920
|
+
n = exprs.Expr.from_object(n)
|
|
921
|
+
if not n.col_type.is_int_type():
|
|
922
|
+
raise excs.Error(f'limit(): parameter must be of type int, instead of {n.col_type}')
|
|
904
923
|
return DataFrame(
|
|
905
924
|
from_clause=self._from_clause,
|
|
906
925
|
select_list=self.select_list,
|
|
@@ -912,7 +931,7 @@ class DataFrame:
|
|
|
912
931
|
)
|
|
913
932
|
|
|
914
933
|
def update(self, value_spec: dict[str, Any], cascade: bool = True) -> UpdateStatus:
|
|
915
|
-
"""
|
|
934
|
+
"""Update rows in the underlying table of the DataFrame.
|
|
916
935
|
|
|
917
936
|
Update rows in the table with the specified value_spec.
|
|
918
937
|
|
|
@@ -941,7 +960,7 @@ class DataFrame:
|
|
|
941
960
|
return self._first_tbl.tbl_version.update(value_spec, where=self.where_clause, cascade=cascade)
|
|
942
961
|
|
|
943
962
|
def delete(self) -> UpdateStatus:
|
|
944
|
-
"""
|
|
963
|
+
"""Delete rows from the underlying table of the DataFrame.
|
|
945
964
|
|
|
946
965
|
The delete operation is only allowed for DataFrames on base tables.
|
|
947
966
|
|
|
@@ -982,17 +1001,20 @@ class DataFrame:
|
|
|
982
1001
|
'_classname': 'DataFrame',
|
|
983
1002
|
'from_clause': {
|
|
984
1003
|
'tbls': [tbl.as_dict() for tbl in self._from_clause.tbls],
|
|
985
|
-
'join_clauses': [dataclasses.asdict(clause) for clause in self._from_clause.join_clauses]
|
|
1004
|
+
'join_clauses': [dataclasses.asdict(clause) for clause in self._from_clause.join_clauses],
|
|
986
1005
|
},
|
|
987
|
-
'select_list':
|
|
988
|
-
|
|
1006
|
+
'select_list': [(e.as_dict(), name) for (e, name) in self.select_list]
|
|
1007
|
+
if self.select_list is not None
|
|
1008
|
+
else None,
|
|
989
1009
|
'where_clause': self.where_clause.as_dict() if self.where_clause is not None else None,
|
|
990
|
-
'group_by_clause':
|
|
991
|
-
|
|
1010
|
+
'group_by_clause': [e.as_dict() for e in self.group_by_clause]
|
|
1011
|
+
if self.group_by_clause is not None
|
|
1012
|
+
else None,
|
|
992
1013
|
'grouping_tbl': self.grouping_tbl.as_dict() if self.grouping_tbl is not None else None,
|
|
993
|
-
'order_by_clause':
|
|
994
|
-
|
|
995
|
-
|
|
1014
|
+
'order_by_clause': [(e.as_dict(), asc) for (e, asc) in self.order_by_clause]
|
|
1015
|
+
if self.order_by_clause is not None
|
|
1016
|
+
else None,
|
|
1017
|
+
'limit_val': self.limit_val.as_dict() if self.limit_val is not None else None,
|
|
996
1018
|
}
|
|
997
1019
|
return d
|
|
998
1020
|
|
|
@@ -1001,21 +1023,29 @@ class DataFrame:
|
|
|
1001
1023
|
tbls = [catalog.TableVersionPath.from_dict(tbl_dict) for tbl_dict in d['from_clause']['tbls']]
|
|
1002
1024
|
join_clauses = [plan.JoinClause(**clause_dict) for clause_dict in d['from_clause']['join_clauses']]
|
|
1003
1025
|
from_clause = plan.FromClause(tbls=tbls, join_clauses=join_clauses)
|
|
1004
|
-
select_list =
|
|
1005
|
-
if d['select_list'] is not None else None
|
|
1006
|
-
|
|
1007
|
-
|
|
1008
|
-
group_by_clause =
|
|
1009
|
-
if d['group_by_clause'] is not None else None
|
|
1010
|
-
|
|
1011
|
-
|
|
1012
|
-
order_by_clause =
|
|
1013
|
-
|
|
1014
|
-
|
|
1026
|
+
select_list = (
|
|
1027
|
+
[(exprs.Expr.from_dict(e), name) for e, name in d['select_list']] if d['select_list'] is not None else None
|
|
1028
|
+
)
|
|
1029
|
+
where_clause = exprs.Expr.from_dict(d['where_clause']) if d['where_clause'] is not None else None
|
|
1030
|
+
group_by_clause = (
|
|
1031
|
+
[exprs.Expr.from_dict(e) for e in d['group_by_clause']] if d['group_by_clause'] is not None else None
|
|
1032
|
+
)
|
|
1033
|
+
grouping_tbl = catalog.TableVersion.from_dict(d['grouping_tbl']) if d['grouping_tbl'] is not None else None
|
|
1034
|
+
order_by_clause = (
|
|
1035
|
+
[(exprs.Expr.from_dict(e), asc) for e, asc in d['order_by_clause']]
|
|
1036
|
+
if d['order_by_clause'] is not None
|
|
1037
|
+
else None
|
|
1038
|
+
)
|
|
1039
|
+
limit_val = exprs.Expr.from_dict(d['limit_val']) if d['limit_val'] is not None else None
|
|
1015
1040
|
return DataFrame(
|
|
1016
|
-
from_clause=from_clause,
|
|
1017
|
-
|
|
1018
|
-
|
|
1041
|
+
from_clause=from_clause,
|
|
1042
|
+
select_list=select_list,
|
|
1043
|
+
where_clause=where_clause,
|
|
1044
|
+
group_by_clause=group_by_clause,
|
|
1045
|
+
grouping_tbl=grouping_tbl,
|
|
1046
|
+
order_by_clause=order_by_clause,
|
|
1047
|
+
limit=limit_val,
|
|
1048
|
+
)
|
|
1019
1049
|
|
|
1020
1050
|
def _hash_result_set(self) -> str:
|
|
1021
1051
|
"""Return a hash that changes when the result set changes."""
|