pixeltable 0.4.0rc3__py3-none-any.whl → 0.4.20__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__init__.py +23 -5
- pixeltable/_version.py +1 -0
- pixeltable/catalog/__init__.py +5 -3
- pixeltable/catalog/catalog.py +1318 -404
- pixeltable/catalog/column.py +186 -115
- pixeltable/catalog/dir.py +1 -2
- pixeltable/catalog/globals.py +11 -43
- pixeltable/catalog/insertable_table.py +167 -79
- pixeltable/catalog/path.py +61 -23
- pixeltable/catalog/schema_object.py +9 -10
- pixeltable/catalog/table.py +626 -308
- pixeltable/catalog/table_metadata.py +101 -0
- pixeltable/catalog/table_version.py +713 -569
- pixeltable/catalog/table_version_handle.py +37 -6
- pixeltable/catalog/table_version_path.py +42 -29
- pixeltable/catalog/tbl_ops.py +50 -0
- pixeltable/catalog/update_status.py +191 -0
- pixeltable/catalog/view.py +108 -94
- pixeltable/config.py +128 -22
- pixeltable/dataframe.py +188 -100
- pixeltable/env.py +407 -136
- pixeltable/exceptions.py +6 -0
- pixeltable/exec/__init__.py +3 -0
- pixeltable/exec/aggregation_node.py +7 -8
- pixeltable/exec/cache_prefetch_node.py +83 -110
- pixeltable/exec/cell_materialization_node.py +231 -0
- pixeltable/exec/cell_reconstruction_node.py +135 -0
- pixeltable/exec/component_iteration_node.py +4 -3
- pixeltable/exec/data_row_batch.py +8 -65
- pixeltable/exec/exec_context.py +16 -4
- pixeltable/exec/exec_node.py +13 -36
- pixeltable/exec/expr_eval/evaluators.py +7 -6
- pixeltable/exec/expr_eval/expr_eval_node.py +27 -12
- pixeltable/exec/expr_eval/globals.py +8 -5
- pixeltable/exec/expr_eval/row_buffer.py +1 -2
- pixeltable/exec/expr_eval/schedulers.py +190 -30
- pixeltable/exec/globals.py +32 -0
- pixeltable/exec/in_memory_data_node.py +18 -18
- pixeltable/exec/object_store_save_node.py +293 -0
- pixeltable/exec/row_update_node.py +16 -9
- pixeltable/exec/sql_node.py +206 -101
- pixeltable/exprs/__init__.py +1 -1
- pixeltable/exprs/arithmetic_expr.py +27 -22
- pixeltable/exprs/array_slice.py +3 -3
- pixeltable/exprs/column_property_ref.py +34 -30
- pixeltable/exprs/column_ref.py +92 -96
- pixeltable/exprs/comparison.py +5 -5
- pixeltable/exprs/compound_predicate.py +5 -4
- pixeltable/exprs/data_row.py +152 -55
- pixeltable/exprs/expr.py +62 -43
- pixeltable/exprs/expr_dict.py +3 -3
- pixeltable/exprs/expr_set.py +17 -10
- pixeltable/exprs/function_call.py +75 -37
- pixeltable/exprs/globals.py +1 -2
- pixeltable/exprs/in_predicate.py +4 -4
- pixeltable/exprs/inline_expr.py +10 -27
- pixeltable/exprs/is_null.py +1 -3
- pixeltable/exprs/json_mapper.py +8 -8
- pixeltable/exprs/json_path.py +56 -22
- pixeltable/exprs/literal.py +5 -5
- pixeltable/exprs/method_ref.py +2 -2
- pixeltable/exprs/object_ref.py +2 -2
- pixeltable/exprs/row_builder.py +127 -53
- pixeltable/exprs/rowid_ref.py +8 -12
- pixeltable/exprs/similarity_expr.py +50 -25
- pixeltable/exprs/sql_element_cache.py +4 -4
- pixeltable/exprs/string_op.py +5 -5
- pixeltable/exprs/type_cast.py +3 -5
- pixeltable/func/__init__.py +1 -0
- pixeltable/func/aggregate_function.py +8 -8
- pixeltable/func/callable_function.py +9 -9
- pixeltable/func/expr_template_function.py +10 -10
- pixeltable/func/function.py +18 -20
- pixeltable/func/function_registry.py +6 -7
- pixeltable/func/globals.py +2 -3
- pixeltable/func/mcp.py +74 -0
- pixeltable/func/query_template_function.py +20 -18
- pixeltable/func/signature.py +43 -16
- pixeltable/func/tools.py +23 -13
- pixeltable/func/udf.py +18 -20
- pixeltable/functions/__init__.py +6 -0
- pixeltable/functions/anthropic.py +93 -33
- pixeltable/functions/audio.py +114 -10
- pixeltable/functions/bedrock.py +13 -6
- pixeltable/functions/date.py +1 -1
- pixeltable/functions/deepseek.py +20 -9
- pixeltable/functions/fireworks.py +2 -2
- pixeltable/functions/gemini.py +28 -11
- pixeltable/functions/globals.py +13 -13
- pixeltable/functions/groq.py +108 -0
- pixeltable/functions/huggingface.py +1046 -23
- pixeltable/functions/image.py +9 -18
- pixeltable/functions/llama_cpp.py +23 -8
- pixeltable/functions/math.py +3 -4
- pixeltable/functions/mistralai.py +4 -15
- pixeltable/functions/ollama.py +16 -9
- pixeltable/functions/openai.py +104 -82
- pixeltable/functions/openrouter.py +143 -0
- pixeltable/functions/replicate.py +2 -2
- pixeltable/functions/reve.py +250 -0
- pixeltable/functions/string.py +21 -28
- pixeltable/functions/timestamp.py +13 -14
- pixeltable/functions/together.py +4 -6
- pixeltable/functions/twelvelabs.py +92 -0
- pixeltable/functions/util.py +6 -1
- pixeltable/functions/video.py +1388 -106
- pixeltable/functions/vision.py +7 -7
- pixeltable/functions/whisper.py +15 -7
- pixeltable/functions/whisperx.py +179 -0
- pixeltable/{ext/functions → functions}/yolox.py +2 -4
- pixeltable/globals.py +332 -105
- pixeltable/index/base.py +13 -22
- pixeltable/index/btree.py +23 -22
- pixeltable/index/embedding_index.py +32 -44
- pixeltable/io/__init__.py +4 -2
- pixeltable/io/datarows.py +7 -6
- pixeltable/io/external_store.py +49 -77
- pixeltable/io/fiftyone.py +11 -11
- pixeltable/io/globals.py +29 -28
- pixeltable/io/hf_datasets.py +17 -9
- pixeltable/io/label_studio.py +70 -66
- pixeltable/io/lancedb.py +3 -0
- pixeltable/io/pandas.py +12 -11
- pixeltable/io/parquet.py +13 -93
- pixeltable/io/table_data_conduit.py +71 -47
- pixeltable/io/utils.py +3 -3
- pixeltable/iterators/__init__.py +2 -1
- pixeltable/iterators/audio.py +21 -11
- pixeltable/iterators/document.py +116 -55
- pixeltable/iterators/image.py +5 -2
- pixeltable/iterators/video.py +293 -13
- pixeltable/metadata/__init__.py +4 -2
- pixeltable/metadata/converters/convert_18.py +2 -2
- pixeltable/metadata/converters/convert_19.py +2 -2
- pixeltable/metadata/converters/convert_20.py +2 -2
- pixeltable/metadata/converters/convert_21.py +2 -2
- pixeltable/metadata/converters/convert_22.py +2 -2
- pixeltable/metadata/converters/convert_24.py +2 -2
- pixeltable/metadata/converters/convert_25.py +2 -2
- pixeltable/metadata/converters/convert_26.py +2 -2
- pixeltable/metadata/converters/convert_29.py +4 -4
- pixeltable/metadata/converters/convert_34.py +2 -2
- pixeltable/metadata/converters/convert_36.py +2 -2
- pixeltable/metadata/converters/convert_37.py +15 -0
- pixeltable/metadata/converters/convert_38.py +39 -0
- pixeltable/metadata/converters/convert_39.py +124 -0
- pixeltable/metadata/converters/convert_40.py +73 -0
- pixeltable/metadata/converters/util.py +13 -12
- pixeltable/metadata/notes.py +4 -0
- pixeltable/metadata/schema.py +79 -42
- pixeltable/metadata/utils.py +74 -0
- pixeltable/mypy/__init__.py +3 -0
- pixeltable/mypy/mypy_plugin.py +123 -0
- pixeltable/plan.py +274 -223
- pixeltable/share/__init__.py +1 -1
- pixeltable/share/packager.py +259 -129
- pixeltable/share/protocol/__init__.py +34 -0
- pixeltable/share/protocol/common.py +170 -0
- pixeltable/share/protocol/operation_types.py +33 -0
- pixeltable/share/protocol/replica.py +109 -0
- pixeltable/share/publish.py +213 -57
- pixeltable/store.py +238 -175
- pixeltable/type_system.py +104 -63
- pixeltable/utils/__init__.py +2 -3
- pixeltable/utils/arrow.py +108 -13
- pixeltable/utils/av.py +298 -0
- pixeltable/utils/azure_store.py +305 -0
- pixeltable/utils/code.py +3 -3
- pixeltable/utils/console_output.py +4 -1
- pixeltable/utils/coroutine.py +6 -23
- pixeltable/utils/dbms.py +31 -5
- pixeltable/utils/description_helper.py +4 -5
- pixeltable/utils/documents.py +5 -6
- pixeltable/utils/exception_handler.py +7 -30
- pixeltable/utils/filecache.py +6 -6
- pixeltable/utils/formatter.py +4 -6
- pixeltable/utils/gcs_store.py +283 -0
- pixeltable/utils/http_server.py +2 -3
- pixeltable/utils/iceberg.py +1 -2
- pixeltable/utils/image.py +17 -0
- pixeltable/utils/lancedb.py +88 -0
- pixeltable/utils/local_store.py +316 -0
- pixeltable/utils/misc.py +5 -0
- pixeltable/utils/object_stores.py +528 -0
- pixeltable/utils/pydantic.py +60 -0
- pixeltable/utils/pytorch.py +5 -6
- pixeltable/utils/s3_store.py +392 -0
- pixeltable-0.4.20.dist-info/METADATA +587 -0
- pixeltable-0.4.20.dist-info/RECORD +218 -0
- {pixeltable-0.4.0rc3.dist-info → pixeltable-0.4.20.dist-info}/WHEEL +1 -1
- pixeltable-0.4.20.dist-info/entry_points.txt +2 -0
- pixeltable/__version__.py +0 -3
- pixeltable/ext/__init__.py +0 -17
- pixeltable/ext/functions/__init__.py +0 -11
- pixeltable/ext/functions/whisperx.py +0 -77
- pixeltable/utils/media_store.py +0 -77
- pixeltable/utils/s3.py +0 -17
- pixeltable/utils/sample.py +0 -25
- pixeltable-0.4.0rc3.dist-info/METADATA +0 -435
- pixeltable-0.4.0rc3.dist-info/RECORD +0 -189
- pixeltable-0.4.0rc3.dist-info/entry_points.txt +0 -3
- {pixeltable-0.4.0rc3.dist-info → pixeltable-0.4.20.dist-info/licenses}/LICENSE +0 -0
pixeltable/dataframe.py
CHANGED
|
@@ -8,14 +8,15 @@ import json
|
|
|
8
8
|
import logging
|
|
9
9
|
import traceback
|
|
10
10
|
from pathlib import Path
|
|
11
|
-
from typing import TYPE_CHECKING, Any, AsyncIterator, Callable, Hashable, Iterator, NoReturn,
|
|
11
|
+
from typing import TYPE_CHECKING, Any, AsyncIterator, Callable, Hashable, Iterator, NoReturn, Sequence, TypeVar
|
|
12
12
|
|
|
13
13
|
import pandas as pd
|
|
14
|
-
import
|
|
14
|
+
import pydantic
|
|
15
|
+
import sqlalchemy.exc as sql_exc
|
|
15
16
|
|
|
16
17
|
from pixeltable import catalog, exceptions as excs, exec, exprs, plan, type_system as ts
|
|
17
18
|
from pixeltable.catalog import Catalog, is_valid_identifier
|
|
18
|
-
from pixeltable.catalog.
|
|
19
|
+
from pixeltable.catalog.update_status import UpdateStatus
|
|
19
20
|
from pixeltable.env import Env
|
|
20
21
|
from pixeltable.plan import Planner, SampleClause
|
|
21
22
|
from pixeltable.type_system import ColumnType
|
|
@@ -32,6 +33,11 @@ _logger = logging.getLogger('pixeltable')
|
|
|
32
33
|
|
|
33
34
|
|
|
34
35
|
class DataFrameResultSet:
|
|
36
|
+
_rows: list[list[Any]]
|
|
37
|
+
_col_names: list[str]
|
|
38
|
+
__schema: dict[str, ColumnType]
|
|
39
|
+
__formatter: Formatter
|
|
40
|
+
|
|
35
41
|
def __init__(self, rows: list[list[Any]], schema: dict[str, ColumnType]):
|
|
36
42
|
self._rows = rows
|
|
37
43
|
self._col_names = list(schema.keys())
|
|
@@ -66,6 +72,44 @@ class DataFrameResultSet:
|
|
|
66
72
|
def to_pandas(self) -> pd.DataFrame:
|
|
67
73
|
return pd.DataFrame.from_records(self._rows, columns=self._col_names)
|
|
68
74
|
|
|
75
|
+
BaseModelT = TypeVar('BaseModelT', bound=pydantic.BaseModel)
|
|
76
|
+
|
|
77
|
+
def to_pydantic(self, model: type[BaseModelT]) -> Iterator[BaseModelT]:
|
|
78
|
+
"""
|
|
79
|
+
Convert the DataFrameResultSet to a list of Pydantic model instances.
|
|
80
|
+
|
|
81
|
+
Args:
|
|
82
|
+
model: A Pydantic model class.
|
|
83
|
+
|
|
84
|
+
Returns:
|
|
85
|
+
An iterator over Pydantic model instances, one for each row in the result set.
|
|
86
|
+
|
|
87
|
+
Raises:
|
|
88
|
+
Error: If the row data doesn't match the model schema.
|
|
89
|
+
"""
|
|
90
|
+
model_fields = model.model_fields
|
|
91
|
+
model_config = getattr(model, 'model_config', {})
|
|
92
|
+
forbid_extra_fields = model_config.get('extra') == 'forbid'
|
|
93
|
+
|
|
94
|
+
# schema validation
|
|
95
|
+
required_fields = {name for name, field in model_fields.items() if field.is_required()}
|
|
96
|
+
col_names = set(self._col_names)
|
|
97
|
+
missing_fields = required_fields - col_names
|
|
98
|
+
if len(missing_fields) > 0:
|
|
99
|
+
raise excs.Error(
|
|
100
|
+
f'Required model fields {missing_fields} are missing from result set columns {self._col_names}'
|
|
101
|
+
)
|
|
102
|
+
if forbid_extra_fields:
|
|
103
|
+
extra_fields = col_names - set(model_fields.keys())
|
|
104
|
+
if len(extra_fields) > 0:
|
|
105
|
+
raise excs.Error(f"Extra fields {extra_fields} are not allowed in model with extra='forbid'")
|
|
106
|
+
|
|
107
|
+
for row in self:
|
|
108
|
+
try:
|
|
109
|
+
yield model(**row)
|
|
110
|
+
except pydantic.ValidationError as e:
|
|
111
|
+
raise excs.Error(str(e)) from e
|
|
112
|
+
|
|
69
113
|
def _row_to_dict(self, row_idx: int) -> dict[str, Any]:
|
|
70
114
|
return {self._col_names[i]: self._rows[row_idx][i] for i in range(len(self._col_names))}
|
|
71
115
|
|
|
@@ -107,14 +151,14 @@ class DataFrameResultSet:
|
|
|
107
151
|
# # output of the agg stage
|
|
108
152
|
# self.agg_output_exprs: list[exprs.Expr] = []
|
|
109
153
|
# # Where clause of the Select stmt of the SQL scan stage
|
|
110
|
-
# self.sql_where_clause:
|
|
154
|
+
# self.sql_where_clause: sql.ClauseElement | None = None
|
|
111
155
|
# # filter predicate applied to input rows of the SQL scan stage
|
|
112
|
-
# self.filter:
|
|
113
|
-
# self.similarity_clause:
|
|
156
|
+
# self.filter: exprs.Predicate | None = None
|
|
157
|
+
# self.similarity_clause: exprs.ImageSimilarityPredicate | None = None
|
|
114
158
|
# self.agg_fn_calls: list[exprs.FunctionCall] = [] # derived from unique_exprs
|
|
115
159
|
# self.has_frame_col: bool = False # True if we're referencing the frame col
|
|
116
160
|
#
|
|
117
|
-
# self.evaluator:
|
|
161
|
+
# self.evaluator: exprs.Evaluator | None = None
|
|
118
162
|
# self.sql_scan_eval_ctx: list[exprs.Expr] = [] # needed to materialize output of SQL scan stage
|
|
119
163
|
# self.agg_eval_ctx: list[exprs.Expr] = [] # needed to materialize output of agg stage
|
|
120
164
|
# self.filter_eval_ctx: list[exprs.Expr] = []
|
|
@@ -131,27 +175,29 @@ class DataFrameResultSet:
|
|
|
131
175
|
|
|
132
176
|
|
|
133
177
|
class DataFrame:
|
|
178
|
+
"""Represents a query for retrieving and transforming data from Pixeltable tables."""
|
|
179
|
+
|
|
134
180
|
_from_clause: plan.FromClause
|
|
135
181
|
_select_list_exprs: list[exprs.Expr]
|
|
136
182
|
_schema: dict[str, ts.ColumnType]
|
|
137
|
-
select_list:
|
|
138
|
-
where_clause:
|
|
139
|
-
group_by_clause:
|
|
140
|
-
grouping_tbl:
|
|
141
|
-
order_by_clause:
|
|
142
|
-
limit_val:
|
|
143
|
-
sample_clause:
|
|
183
|
+
select_list: list[tuple[exprs.Expr, str | None]] | None
|
|
184
|
+
where_clause: exprs.Expr | None
|
|
185
|
+
group_by_clause: list[exprs.Expr] | None
|
|
186
|
+
grouping_tbl: catalog.TableVersion | None
|
|
187
|
+
order_by_clause: list[tuple[exprs.Expr, bool]] | None
|
|
188
|
+
limit_val: exprs.Expr | None
|
|
189
|
+
sample_clause: SampleClause | None
|
|
144
190
|
|
|
145
191
|
def __init__(
|
|
146
192
|
self,
|
|
147
|
-
from_clause:
|
|
148
|
-
select_list:
|
|
149
|
-
where_clause:
|
|
150
|
-
group_by_clause:
|
|
151
|
-
grouping_tbl:
|
|
152
|
-
order_by_clause:
|
|
153
|
-
limit:
|
|
154
|
-
sample_clause:
|
|
193
|
+
from_clause: plan.FromClause | None = None,
|
|
194
|
+
select_list: list[tuple[exprs.Expr, str | None]] | None = None,
|
|
195
|
+
where_clause: exprs.Expr | None = None,
|
|
196
|
+
group_by_clause: list[exprs.Expr] | None = None,
|
|
197
|
+
grouping_tbl: catalog.TableVersion | None = None,
|
|
198
|
+
order_by_clause: list[tuple[exprs.Expr, bool]] | None = None, # list[(expr, asc)]
|
|
199
|
+
limit: exprs.Expr | None = None,
|
|
200
|
+
sample_clause: SampleClause | None = None,
|
|
155
201
|
):
|
|
156
202
|
self._from_clause = from_clause
|
|
157
203
|
|
|
@@ -175,7 +221,7 @@ class DataFrame:
|
|
|
175
221
|
|
|
176
222
|
@classmethod
|
|
177
223
|
def _normalize_select_list(
|
|
178
|
-
cls, tbls: list[catalog.TableVersionPath], select_list:
|
|
224
|
+
cls, tbls: list[catalog.TableVersionPath], select_list: list[tuple[exprs.Expr, str | None]] | None
|
|
179
225
|
) -> tuple[list[exprs.Expr], list[str]]:
|
|
180
226
|
"""
|
|
181
227
|
Expand select list information with all columns and their names
|
|
@@ -236,23 +282,23 @@ class DataFrame:
|
|
|
236
282
|
if var.name not in unique_vars:
|
|
237
283
|
unique_vars[var.name] = var
|
|
238
284
|
elif unique_vars[var.name].col_type != var.col_type:
|
|
239
|
-
raise excs.Error(f'Multiple definitions of parameter {var.name}')
|
|
285
|
+
raise excs.Error(f'Multiple definitions of parameter {var.name!r}')
|
|
240
286
|
return unique_vars
|
|
241
287
|
|
|
242
288
|
@classmethod
|
|
243
289
|
def _convert_param_to_typed_expr(
|
|
244
|
-
cls, v: Any, required_type: ts.ColumnType, required: bool, name: str, range:
|
|
245
|
-
) ->
|
|
290
|
+
cls, v: Any, required_type: ts.ColumnType, required: bool, name: str, range: tuple[Any, Any] | None = None
|
|
291
|
+
) -> exprs.Expr | None:
|
|
246
292
|
if v is None:
|
|
247
293
|
if required:
|
|
248
294
|
raise excs.Error(f'{name!r} parameter must be present')
|
|
249
295
|
return v
|
|
250
296
|
v_expr = exprs.Expr.from_object(v)
|
|
251
297
|
if not v_expr.col_type.matches(required_type):
|
|
252
|
-
raise excs.Error(f'{name!r} parameter must be of type {required_type
|
|
298
|
+
raise excs.Error(f'{name!r} parameter must be of type `{required_type}`; got `{v_expr.col_type}`')
|
|
253
299
|
if range is not None:
|
|
254
300
|
if not isinstance(v_expr, exprs.Literal):
|
|
255
|
-
raise excs.Error(f'{name!r} parameter must be a constant
|
|
301
|
+
raise excs.Error(f'{name!r} parameter must be a constant; got: {v_expr}')
|
|
256
302
|
if range[0] is not None and not (v_expr.val >= range[0]):
|
|
257
303
|
raise excs.Error(f'{name!r} parameter must be >= {range[0]}')
|
|
258
304
|
if range[1] is not None and not (v_expr.val <= range[1]):
|
|
@@ -261,7 +307,7 @@ class DataFrame:
|
|
|
261
307
|
|
|
262
308
|
@classmethod
|
|
263
309
|
def validate_constant_type_range(
|
|
264
|
-
cls, v: Any, required_type: ts.ColumnType, required: bool, name: str, range:
|
|
310
|
+
cls, v: Any, required_type: ts.ColumnType, required: bool, name: str, range: tuple[Any, Any] | None = None
|
|
265
311
|
) -> Any:
|
|
266
312
|
"""Validate that the given named parameter is a constant of the required type and within the specified range."""
|
|
267
313
|
v_expr = cls._convert_param_to_typed_expr(v, required_type, required, name, range)
|
|
@@ -307,7 +353,7 @@ class DataFrame:
|
|
|
307
353
|
|
|
308
354
|
def _create_query_plan(self) -> exec.ExecNode:
|
|
309
355
|
# construct a group-by clause if we're grouping by a table
|
|
310
|
-
group_by_clause:
|
|
356
|
+
group_by_clause: list[exprs.Expr] | None = None
|
|
311
357
|
if self.grouping_tbl is not None:
|
|
312
358
|
assert self.group_by_clause is None
|
|
313
359
|
num_rowid_cols = len(self.grouping_tbl.store_tbl.rowid_columns())
|
|
@@ -330,7 +376,7 @@ class DataFrame:
|
|
|
330
376
|
sample_clause=self.sample_clause,
|
|
331
377
|
)
|
|
332
378
|
|
|
333
|
-
def __rowid_columns(self, num_rowid_cols:
|
|
379
|
+
def __rowid_columns(self, num_rowid_cols: int | None = None) -> list[exprs.Expr]:
|
|
334
380
|
"""Return list of RowidRef for the given number of associated rowids"""
|
|
335
381
|
return Planner.rowid_columns(self._first_tbl.tbl_version, num_rowid_cols)
|
|
336
382
|
|
|
@@ -401,6 +447,7 @@ class DataFrame:
|
|
|
401
447
|
|
|
402
448
|
@property
|
|
403
449
|
def schema(self) -> dict[str, ColumnType]:
|
|
450
|
+
"""Column names and types in this DataFrame."""
|
|
404
451
|
return self._schema
|
|
405
452
|
|
|
406
453
|
def bind(self, args: dict[str, Any]) -> DataFrame:
|
|
@@ -425,7 +472,7 @@ class DataFrame:
|
|
|
425
472
|
var_expr = vars[arg_name]
|
|
426
473
|
arg_expr = exprs.Expr.from_object(arg_val)
|
|
427
474
|
if arg_expr is None:
|
|
428
|
-
raise excs.Error(f'
|
|
475
|
+
raise excs.Error(f'That argument cannot be converted to a Pixeltable expression: {arg_val}')
|
|
429
476
|
var_exprs[var_expr] = arg_expr
|
|
430
477
|
|
|
431
478
|
exprs.Expr.list_substitute(select_list_exprs, var_exprs)
|
|
@@ -437,7 +484,7 @@ class DataFrame:
|
|
|
437
484
|
exprs.Expr.list_substitute(order_by_exprs, var_exprs)
|
|
438
485
|
|
|
439
486
|
select_list = list(zip(select_list_exprs, self.schema.keys()))
|
|
440
|
-
order_by_clause:
|
|
487
|
+
order_by_clause: list[tuple[exprs.Expr, bool]] | None = None
|
|
441
488
|
if order_by_exprs is not None:
|
|
442
489
|
order_by_clause = [
|
|
443
490
|
(expr, asc) for expr, asc in zip(order_by_exprs, [asc for _, asc in self.order_by_clause])
|
|
@@ -445,7 +492,7 @@ class DataFrame:
|
|
|
445
492
|
if limit_val is not None:
|
|
446
493
|
limit_val = limit_val.substitute(var_exprs)
|
|
447
494
|
if limit_val is not None and not isinstance(limit_val, exprs.Literal):
|
|
448
|
-
raise excs.Error(f'limit(): parameter must be a constant
|
|
495
|
+
raise excs.Error(f'limit(): parameter must be a constant; got: {limit_val}')
|
|
449
496
|
|
|
450
497
|
return DataFrame(
|
|
451
498
|
from_clause=self._from_clause,
|
|
@@ -475,26 +522,31 @@ class DataFrame:
|
|
|
475
522
|
raise excs.Error(msg) from e
|
|
476
523
|
|
|
477
524
|
def _output_row_iterator(self) -> Iterator[list]:
|
|
478
|
-
|
|
525
|
+
# TODO: extend begin_xact() to accept multiple TVPs for joins
|
|
526
|
+
single_tbl = self._first_tbl if len(self._from_clause.tbls) == 1 else None
|
|
527
|
+
with Catalog.get().begin_xact(tbl=single_tbl, for_write=False):
|
|
479
528
|
try:
|
|
480
529
|
for data_row in self._exec():
|
|
481
530
|
yield [data_row[e.slot_idx] for e in self._select_list_exprs]
|
|
482
531
|
except excs.ExprEvalError as e:
|
|
483
532
|
self._raise_expr_eval_err(e)
|
|
484
|
-
except
|
|
485
|
-
|
|
533
|
+
except (sql_exc.DBAPIError, sql_exc.OperationalError, sql_exc.InternalError) as e:
|
|
534
|
+
Catalog.get().convert_sql_exc(e, tbl=(single_tbl.tbl_version if single_tbl is not None else None))
|
|
535
|
+
raise # just re-raise if not converted to a Pixeltable error
|
|
486
536
|
|
|
487
537
|
def collect(self) -> DataFrameResultSet:
|
|
488
538
|
return DataFrameResultSet(list(self._output_row_iterator()), self.schema)
|
|
489
539
|
|
|
490
540
|
async def _acollect(self) -> DataFrameResultSet:
|
|
541
|
+
single_tbl = self._first_tbl if len(self._from_clause.tbls) == 1 else None
|
|
491
542
|
try:
|
|
492
543
|
result = [[row[e.slot_idx] for e in self._select_list_exprs] async for row in self._aexec()]
|
|
493
544
|
return DataFrameResultSet(result, self.schema)
|
|
494
545
|
except excs.ExprEvalError as e:
|
|
495
546
|
self._raise_expr_eval_err(e)
|
|
496
|
-
except
|
|
497
|
-
|
|
547
|
+
except (sql_exc.DBAPIError, sql_exc.OperationalError, sql_exc.InternalError) as e:
|
|
548
|
+
Catalog.get().convert_sql_exc(e, tbl=(single_tbl.tbl_version if single_tbl is not None else None))
|
|
549
|
+
raise # just re-raise if not converted to a Pixeltable error
|
|
498
550
|
|
|
499
551
|
def count(self) -> int:
|
|
500
552
|
"""Return the number of rows in the DataFrame.
|
|
@@ -507,7 +559,7 @@ class DataFrame:
|
|
|
507
559
|
|
|
508
560
|
from pixeltable.plan import Planner
|
|
509
561
|
|
|
510
|
-
with Catalog.get().begin_xact(for_write=False) as conn:
|
|
562
|
+
with Catalog.get().begin_xact(tbl=self._first_tbl, for_write=False) as conn:
|
|
511
563
|
stmt = Planner.create_count_stmt(self._first_tbl, self.where_clause)
|
|
512
564
|
result: int = conn.execute(stmt).scalar_one()
|
|
513
565
|
assert isinstance(result, int)
|
|
@@ -620,7 +672,7 @@ class DataFrame:
|
|
|
620
672
|
return self
|
|
621
673
|
|
|
622
674
|
# analyze select list; wrap literals with the corresponding expressions
|
|
623
|
-
select_list: list[tuple[exprs.Expr,
|
|
675
|
+
select_list: list[tuple[exprs.Expr, str | None]] = []
|
|
624
676
|
for raw_expr, name in base_list:
|
|
625
677
|
expr = exprs.Expr.from_object(raw_expr)
|
|
626
678
|
if expr is None:
|
|
@@ -640,8 +692,8 @@ class DataFrame:
|
|
|
640
692
|
pass
|
|
641
693
|
if not expr.is_bound_by(self._from_clause.tbls):
|
|
642
694
|
raise excs.Error(
|
|
643
|
-
f"
|
|
644
|
-
f'({",".join(tbl.tbl_version.get().versioned_name for tbl in self._from_clause.tbls)})'
|
|
695
|
+
f"That expression cannot be evaluated in the context of this query's tables "
|
|
696
|
+
f'({",".join(tbl.tbl_version.get().versioned_name for tbl in self._from_clause.tbls)}): {expr}'
|
|
645
697
|
)
|
|
646
698
|
select_list.append((expr, name))
|
|
647
699
|
|
|
@@ -652,7 +704,7 @@ class DataFrame:
|
|
|
652
704
|
if name in seen:
|
|
653
705
|
repeated_names = [j for j, x in enumerate(names) if x == name]
|
|
654
706
|
pretty = ', '.join(map(str, repeated_names))
|
|
655
|
-
raise excs.Error(f'Repeated column name
|
|
707
|
+
raise excs.Error(f'Repeated column name {name!r} in select() at positions: {pretty}')
|
|
656
708
|
seen.add(name)
|
|
657
709
|
|
|
658
710
|
return DataFrame(
|
|
@@ -690,13 +742,13 @@ class DataFrame:
|
|
|
690
742
|
>>> df = person.where(t.age > 30)
|
|
691
743
|
"""
|
|
692
744
|
if self.where_clause is not None:
|
|
693
|
-
raise excs.Error('
|
|
745
|
+
raise excs.Error('where() clause already specified')
|
|
694
746
|
if self.sample_clause is not None:
|
|
695
|
-
raise excs.Error('where cannot be used after sample()')
|
|
747
|
+
raise excs.Error('where() cannot be used after sample()')
|
|
696
748
|
if not isinstance(pred, exprs.Expr):
|
|
697
|
-
raise excs.Error(f'
|
|
749
|
+
raise excs.Error(f'where() expects a Pixeltable expression; got: {pred}')
|
|
698
750
|
if not pred.col_type.is_bool_type():
|
|
699
|
-
raise excs.Error(f'
|
|
751
|
+
raise excs.Error(f'where() expression needs to return `Bool`, but instead returns `{pred.col_type}`')
|
|
700
752
|
return DataFrame(
|
|
701
753
|
from_clause=self._from_clause,
|
|
702
754
|
select_list=self.select_list,
|
|
@@ -708,7 +760,7 @@ class DataFrame:
|
|
|
708
760
|
)
|
|
709
761
|
|
|
710
762
|
def _create_join_predicate(
|
|
711
|
-
self, other: catalog.TableVersionPath, on:
|
|
763
|
+
self, other: catalog.TableVersionPath, on: exprs.Expr | Sequence[exprs.ColumnRef]
|
|
712
764
|
) -> exprs.Expr:
|
|
713
765
|
"""Verifies user-specified 'on' argument and converts it into a join predicate."""
|
|
714
766
|
col_refs: list[exprs.ColumnRef] = []
|
|
@@ -718,19 +770,21 @@ class DataFrame:
|
|
|
718
770
|
on = [on]
|
|
719
771
|
elif isinstance(on, exprs.Expr):
|
|
720
772
|
if not on.is_bound_by(joined_tbls):
|
|
721
|
-
raise excs.Error(f
|
|
773
|
+
raise excs.Error(f'`on` expression cannot be evaluated in the context of the joined tables: {on}')
|
|
722
774
|
if not on.col_type.is_bool_type():
|
|
723
|
-
raise excs.Error(
|
|
775
|
+
raise excs.Error(
|
|
776
|
+
f'`on` expects an expression of type `Bool`, but got one of type `{on.col_type}`: {on}'
|
|
777
|
+
)
|
|
724
778
|
return on
|
|
725
779
|
elif not isinstance(on, Sequence) or len(on) == 0:
|
|
726
|
-
raise excs.Error(
|
|
780
|
+
raise excs.Error('`on` must be a sequence of column references or a boolean expression')
|
|
727
781
|
|
|
728
782
|
assert isinstance(on, Sequence)
|
|
729
783
|
for col_ref in on:
|
|
730
784
|
if not isinstance(col_ref, exprs.ColumnRef):
|
|
731
|
-
raise excs.Error(
|
|
785
|
+
raise excs.Error('`on` must be a sequence of column references or a boolean expression')
|
|
732
786
|
if not col_ref.is_bound_by(joined_tbls):
|
|
733
|
-
raise excs.Error(f
|
|
787
|
+
raise excs.Error(f'`on` expression cannot be evaluated in the context of the joined tables: {col_ref}')
|
|
734
788
|
col_refs.append(col_ref)
|
|
735
789
|
|
|
736
790
|
predicates: list[exprs.Expr] = []
|
|
@@ -738,27 +792,27 @@ class DataFrame:
|
|
|
738
792
|
assert len(col_refs) > 0 and len(joined_tbls) >= 2
|
|
739
793
|
for col_ref in col_refs:
|
|
740
794
|
# identify the referenced column by name in 'other'
|
|
741
|
-
rhs_col = other.get_column(col_ref.col.name
|
|
795
|
+
rhs_col = other.get_column(col_ref.col.name)
|
|
742
796
|
if rhs_col is None:
|
|
743
|
-
raise excs.Error(f
|
|
797
|
+
raise excs.Error(f'`on` column {col_ref.col.name!r} not found in joined table')
|
|
744
798
|
rhs_col_ref = exprs.ColumnRef(rhs_col)
|
|
745
799
|
|
|
746
|
-
lhs_col_ref:
|
|
747
|
-
if any(tbl.has_column(col_ref.col
|
|
800
|
+
lhs_col_ref: exprs.ColumnRef | None = None
|
|
801
|
+
if any(tbl.has_column(col_ref.col) for tbl in self._from_clause.tbls):
|
|
748
802
|
# col_ref comes from the existing from_clause, we use that directly
|
|
749
803
|
lhs_col_ref = col_ref
|
|
750
804
|
else:
|
|
751
805
|
# col_ref comes from other, we need to look for a match in the existing from_clause by name
|
|
752
806
|
for tbl in self._from_clause.tbls:
|
|
753
|
-
col = tbl.get_column(col_ref.col.name
|
|
807
|
+
col = tbl.get_column(col_ref.col.name)
|
|
754
808
|
if col is None:
|
|
755
809
|
continue
|
|
756
810
|
if lhs_col_ref is not None:
|
|
757
|
-
raise excs.Error(f
|
|
811
|
+
raise excs.Error(f'`on`: ambiguous column reference: {col_ref.col.name}')
|
|
758
812
|
lhs_col_ref = exprs.ColumnRef(col)
|
|
759
813
|
if lhs_col_ref is None:
|
|
760
814
|
tbl_names = [tbl.tbl_name() for tbl in self._from_clause.tbls]
|
|
761
|
-
raise excs.Error(f
|
|
815
|
+
raise excs.Error(f'`on`: column {col_ref.col.name!r} not found in any of: {" ".join(tbl_names)}')
|
|
762
816
|
pred = exprs.Comparison(exprs.ComparisonOperator.EQ, lhs_col_ref, rhs_col_ref)
|
|
763
817
|
predicates.append(pred)
|
|
764
818
|
|
|
@@ -771,7 +825,7 @@ class DataFrame:
|
|
|
771
825
|
def join(
|
|
772
826
|
self,
|
|
773
827
|
other: catalog.Table,
|
|
774
|
-
on:
|
|
828
|
+
on: exprs.Expr | Sequence[exprs.ColumnRef] | None = None,
|
|
775
829
|
how: plan.JoinType.LiteralType = 'inner',
|
|
776
830
|
) -> DataFrame:
|
|
777
831
|
"""
|
|
@@ -822,16 +876,16 @@ class DataFrame:
|
|
|
822
876
|
"""
|
|
823
877
|
if self.sample_clause is not None:
|
|
824
878
|
raise excs.Error('join() cannot be used with sample()')
|
|
825
|
-
join_pred:
|
|
879
|
+
join_pred: exprs.Expr | None
|
|
826
880
|
if how == 'cross':
|
|
827
881
|
if on is not None:
|
|
828
|
-
raise excs.Error(
|
|
882
|
+
raise excs.Error('`on` not allowed for cross join')
|
|
829
883
|
join_pred = None
|
|
830
884
|
else:
|
|
831
885
|
if on is None:
|
|
832
|
-
raise excs.Error(f
|
|
886
|
+
raise excs.Error(f'`how={how!r}` requires `on` to be present')
|
|
833
887
|
join_pred = self._create_join_predicate(other._tbl_version_path, on)
|
|
834
|
-
join_clause = plan.JoinClause(join_type=plan.JoinType.validated(how,
|
|
888
|
+
join_clause = plan.JoinClause(join_type=plan.JoinType.validated(how, '`how`'), join_predicate=join_pred)
|
|
835
889
|
from_clause = plan.FromClause(
|
|
836
890
|
tbls=[*self._from_clause.tbls, other._tbl_version_path],
|
|
837
891
|
join_clauses=[*self._from_clause.join_clauses, join_clause],
|
|
@@ -888,24 +942,24 @@ class DataFrame:
|
|
|
888
942
|
>>> df = book.group_by(t.genre).select(t.genre, total=sum(t.price)).show()
|
|
889
943
|
"""
|
|
890
944
|
if self.group_by_clause is not None:
|
|
891
|
-
raise excs.Error('
|
|
945
|
+
raise excs.Error('group_by() already specified')
|
|
892
946
|
if self.sample_clause is not None:
|
|
893
947
|
raise excs.Error('group_by() cannot be used with sample()')
|
|
894
948
|
|
|
895
|
-
grouping_tbl:
|
|
896
|
-
group_by_clause:
|
|
949
|
+
grouping_tbl: catalog.TableVersion | None = None
|
|
950
|
+
group_by_clause: list[exprs.Expr] | None = None
|
|
897
951
|
for item in grouping_items:
|
|
898
952
|
if isinstance(item, (catalog.Table, catalog.TableVersion)):
|
|
899
953
|
if len(grouping_items) > 1:
|
|
900
|
-
raise excs.Error('group_by(): only one
|
|
954
|
+
raise excs.Error('group_by(): only one Table can be specified')
|
|
901
955
|
if len(self._from_clause.tbls) > 1:
|
|
902
956
|
raise excs.Error('group_by() with Table not supported for joins')
|
|
903
957
|
grouping_tbl = item if isinstance(item, catalog.TableVersion) else item._tbl_version.get()
|
|
904
958
|
# we need to make sure that the grouping table is a base of self.tbl
|
|
905
959
|
base = self._first_tbl.find_tbl_version(grouping_tbl.id)
|
|
906
|
-
if base is None or base.id == self._first_tbl.tbl_id
|
|
960
|
+
if base is None or base.id == self._first_tbl.tbl_id:
|
|
907
961
|
raise excs.Error(
|
|
908
|
-
f'group_by(): {grouping_tbl.name} is not a base table of {self._first_tbl.tbl_name()}'
|
|
962
|
+
f'group_by(): {grouping_tbl.name!r} is not a base table of {self._first_tbl.tbl_name()!r}'
|
|
909
963
|
)
|
|
910
964
|
break
|
|
911
965
|
if not isinstance(item, exprs.Expr):
|
|
@@ -976,7 +1030,7 @@ class DataFrame:
|
|
|
976
1030
|
>>> df = book.order_by(t.price, asc=False).order_by(t.pages)
|
|
977
1031
|
"""
|
|
978
1032
|
if self.sample_clause is not None:
|
|
979
|
-
raise excs.Error('
|
|
1033
|
+
raise excs.Error('order_by() cannot be used with sample()')
|
|
980
1034
|
for e in expr_list:
|
|
981
1035
|
if not isinstance(e, exprs.Expr):
|
|
982
1036
|
raise excs.Error(f'Invalid expression in order_by(): {e}')
|
|
@@ -1017,10 +1071,10 @@ class DataFrame:
|
|
|
1017
1071
|
|
|
1018
1072
|
def sample(
|
|
1019
1073
|
self,
|
|
1020
|
-
n:
|
|
1021
|
-
n_per_stratum:
|
|
1022
|
-
fraction:
|
|
1023
|
-
seed:
|
|
1074
|
+
n: int | None = None,
|
|
1075
|
+
n_per_stratum: int | None = None,
|
|
1076
|
+
fraction: float | None = None,
|
|
1077
|
+
seed: int | None = None,
|
|
1024
1078
|
stratify_by: Any = None,
|
|
1025
1079
|
) -> DataFrame:
|
|
1026
1080
|
"""
|
|
@@ -1074,7 +1128,7 @@ class DataFrame:
|
|
|
1074
1128
|
"""
|
|
1075
1129
|
# Check context of usage
|
|
1076
1130
|
if self.sample_clause is not None:
|
|
1077
|
-
raise excs.Error('sample()
|
|
1131
|
+
raise excs.Error('Multiple sample() clauses not allowed')
|
|
1078
1132
|
if self.group_by_clause is not None:
|
|
1079
1133
|
raise excs.Error('sample() cannot be used with group_by()')
|
|
1080
1134
|
if self.order_by_clause is not None:
|
|
@@ -1111,11 +1165,11 @@ class DataFrame:
|
|
|
1111
1165
|
if expr is None or not isinstance(expr, exprs.Expr):
|
|
1112
1166
|
raise excs.Error(f'Invalid expression: {expr}')
|
|
1113
1167
|
if not expr.col_type.is_scalar_type():
|
|
1114
|
-
raise excs.Error(f'Invalid type: expression must be a scalar type (not {expr.col_type})')
|
|
1168
|
+
raise excs.Error(f'Invalid type: expression must be a scalar type (not `{expr.col_type}`)')
|
|
1115
1169
|
if not expr.is_bound_by(self._from_clause.tbls):
|
|
1116
1170
|
raise excs.Error(
|
|
1117
|
-
f"
|
|
1118
|
-
f'({",".join(tbl.tbl_name() for tbl in self._from_clause.tbls)})'
|
|
1171
|
+
f"That expression cannot be evaluated in the context of this query's tables "
|
|
1172
|
+
f'({",".join(tbl.tbl_name() for tbl in self._from_clause.tbls)}): {expr}'
|
|
1119
1173
|
)
|
|
1120
1174
|
stratify_exprs.append(expr)
|
|
1121
1175
|
|
|
@@ -1153,18 +1207,42 @@ class DataFrame:
|
|
|
1153
1207
|
Via the above DataFrame person, update the column 'city' to 'Oakland'
|
|
1154
1208
|
and 'state' to 'CA' in the table t:
|
|
1155
1209
|
|
|
1156
|
-
>>>
|
|
1210
|
+
>>> person.update({'city': 'Oakland', 'state': 'CA'})
|
|
1157
1211
|
|
|
1158
1212
|
Via the above DataFrame person, update the column 'age' to 30 for any
|
|
1159
1213
|
rows where 'year' is 2014 in the table t:
|
|
1160
1214
|
|
|
1161
|
-
>>>
|
|
1215
|
+
>>> person.where(t.year == 2014).update({'age': 30})
|
|
1162
1216
|
"""
|
|
1163
1217
|
self._validate_mutable('update', False)
|
|
1164
|
-
|
|
1165
|
-
with Catalog.get().begin_xact(tbl_id=tbl_id, for_write=True):
|
|
1218
|
+
with Catalog.get().begin_xact(tbl=self._first_tbl, for_write=True, lock_mutable_tree=True):
|
|
1166
1219
|
return self._first_tbl.tbl_version.get().update(value_spec, where=self.where_clause, cascade=cascade)
|
|
1167
1220
|
|
|
1221
|
+
def recompute_columns(
|
|
1222
|
+
self, *columns: str | exprs.ColumnRef, errors_only: bool = False, cascade: bool = True
|
|
1223
|
+
) -> UpdateStatus:
|
|
1224
|
+
"""Recompute one or more computed columns of the underlying table of the DataFrame.
|
|
1225
|
+
|
|
1226
|
+
Args:
|
|
1227
|
+
columns: The names or references of the computed columns to recompute.
|
|
1228
|
+
errors_only: If True, only run the recomputation for rows that have errors in the column (i.e., the column's
|
|
1229
|
+
`errortype` property indicates that an error occurred). Only allowed for recomputing a single column.
|
|
1230
|
+
cascade: if True, also update all computed columns that transitively depend on the recomputed columns.
|
|
1231
|
+
|
|
1232
|
+
Returns:
|
|
1233
|
+
UpdateStatus: the status of the operation.
|
|
1234
|
+
|
|
1235
|
+
Example:
|
|
1236
|
+
For table `person` with column `age` and computed column `height`, recompute the value of `height` for all
|
|
1237
|
+
rows where `age` is less than 18:
|
|
1238
|
+
|
|
1239
|
+
>>> df = person.where(t.age < 18).recompute_columns(person.height)
|
|
1240
|
+
"""
|
|
1241
|
+
self._validate_mutable('recompute_columns', False)
|
|
1242
|
+
with Catalog.get().begin_xact(tbl=self._first_tbl, for_write=True, lock_mutable_tree=True):
|
|
1243
|
+
tbl = Catalog.get().get_table_by_id(self._first_tbl.tbl_id)
|
|
1244
|
+
return tbl.recompute_columns(*columns, where=self.where_clause, errors_only=errors_only, cascade=cascade)
|
|
1245
|
+
|
|
1168
1246
|
def delete(self) -> UpdateStatus:
|
|
1169
1247
|
"""Delete rows from the underlying table of the DataFrame.
|
|
1170
1248
|
|
|
@@ -1174,19 +1252,14 @@ class DataFrame:
|
|
|
1174
1252
|
UpdateStatus: the status of the delete operation.
|
|
1175
1253
|
|
|
1176
1254
|
Example:
|
|
1177
|
-
|
|
1255
|
+
For a table `person` with column `age`, delete all rows where 'age' is less than 18:
|
|
1178
1256
|
|
|
1179
|
-
>>> person
|
|
1180
|
-
|
|
1181
|
-
Via the above DataFrame person, delete all rows from the table t where the column 'age' is less than 18:
|
|
1182
|
-
|
|
1183
|
-
>>> df = person.where(t.age < 18).delete()
|
|
1257
|
+
>>> person.where(t.age < 18).delete()
|
|
1184
1258
|
"""
|
|
1185
1259
|
self._validate_mutable('delete', False)
|
|
1186
1260
|
if not self._first_tbl.is_insertable():
|
|
1187
|
-
raise excs.Error('Cannot delete
|
|
1188
|
-
|
|
1189
|
-
with Catalog.get().begin_xact(tbl_id=tbl_id, for_write=True):
|
|
1261
|
+
raise excs.Error('Cannot use `delete` on a view.')
|
|
1262
|
+
with Catalog.get().begin_xact(tbl=self._first_tbl, for_write=True, lock_mutable_tree=True):
|
|
1190
1263
|
return self._first_tbl.tbl_version.get().delete(where=self.where_clause)
|
|
1191
1264
|
|
|
1192
1265
|
def _validate_mutable(self, op_name: str, allow_select: bool) -> None:
|
|
@@ -1196,14 +1269,28 @@ class DataFrame:
|
|
|
1196
1269
|
op_name: The name of the operation for which the test is being performed.
|
|
1197
1270
|
allow_select: If True, allow a select() specification in the DataFrame.
|
|
1198
1271
|
"""
|
|
1272
|
+
self._validate_mutable_op_sequence(op_name, allow_select)
|
|
1273
|
+
|
|
1274
|
+
# TODO: Reconcile these with Table.__check_mutable()
|
|
1275
|
+
assert len(self._from_clause.tbls) == 1
|
|
1276
|
+
# First check if it's a replica, since every replica handle is also a snapshot
|
|
1277
|
+
if self._first_tbl.is_replica():
|
|
1278
|
+
raise excs.Error(f'Cannot use `{op_name}` on a replica.')
|
|
1279
|
+
if self._first_tbl.is_snapshot():
|
|
1280
|
+
raise excs.Error(f'Cannot use `{op_name}` on a snapshot.')
|
|
1281
|
+
|
|
1282
|
+
def _validate_mutable_op_sequence(self, op_name: str, allow_select: bool) -> None:
|
|
1283
|
+
"""Tests whether the sequence of operations on this DataFrame is valid for a mutation operation."""
|
|
1199
1284
|
if self.group_by_clause is not None or self.grouping_tbl is not None:
|
|
1200
|
-
raise excs.Error(f'Cannot use `{op_name}` after `group_by
|
|
1285
|
+
raise excs.Error(f'Cannot use `{op_name}` after `group_by`.')
|
|
1201
1286
|
if self.order_by_clause is not None:
|
|
1202
|
-
raise excs.Error(f'Cannot use `{op_name}` after `order_by
|
|
1287
|
+
raise excs.Error(f'Cannot use `{op_name}` after `order_by`.')
|
|
1203
1288
|
if self.select_list is not None and not allow_select:
|
|
1204
|
-
raise excs.Error(f'Cannot use `{op_name}` after `select
|
|
1289
|
+
raise excs.Error(f'Cannot use `{op_name}` after `select`.')
|
|
1205
1290
|
if self.limit_val is not None:
|
|
1206
|
-
raise excs.Error(f'Cannot use `{op_name}` after `limit
|
|
1291
|
+
raise excs.Error(f'Cannot use `{op_name}` after `limit`.')
|
|
1292
|
+
if self._has_joins():
|
|
1293
|
+
raise excs.Error(f'Cannot use `{op_name}` after `join`.')
|
|
1207
1294
|
|
|
1208
1295
|
def as_dict(self) -> dict[str, Any]:
|
|
1209
1296
|
"""
|
|
@@ -1307,7 +1394,8 @@ class DataFrame:
|
|
|
1307
1394
|
assert data_file_path.is_file()
|
|
1308
1395
|
return data_file_path
|
|
1309
1396
|
else:
|
|
1310
|
-
|
|
1397
|
+
# TODO: extend begin_xact() to accept multiple TVPs for joins
|
|
1398
|
+
with Catalog.get().begin_xact(tbl=self._first_tbl, for_write=False):
|
|
1311
1399
|
return write_coco_dataset(self, dest_path)
|
|
1312
1400
|
|
|
1313
1401
|
def to_pytorch_dataset(self, image_format: str = 'pt') -> 'torch.utils.data.IterableDataset':
|
|
@@ -1352,7 +1440,7 @@ class DataFrame:
|
|
|
1352
1440
|
if dest_path.exists(): # fast path: use cache
|
|
1353
1441
|
assert dest_path.is_dir()
|
|
1354
1442
|
else:
|
|
1355
|
-
with Catalog.get().begin_xact(for_write=False):
|
|
1443
|
+
with Catalog.get().begin_xact(tbl=self._first_tbl, for_write=False):
|
|
1356
1444
|
export_parquet(self, dest_path, inline_images=True)
|
|
1357
1445
|
|
|
1358
1446
|
return PixeltablePytorchDataset(path=dest_path, image_format=image_format)
|