pixeltable 0.4.17__py3-none-any.whl → 0.4.19__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__init__.py +1 -1
- pixeltable/_version.py +1 -0
- pixeltable/catalog/catalog.py +144 -118
- pixeltable/catalog/column.py +104 -115
- pixeltable/catalog/globals.py +1 -2
- pixeltable/catalog/insertable_table.py +44 -49
- pixeltable/catalog/path.py +3 -4
- pixeltable/catalog/schema_object.py +4 -4
- pixeltable/catalog/table.py +139 -124
- pixeltable/catalog/table_metadata.py +6 -6
- pixeltable/catalog/table_version.py +315 -246
- pixeltable/catalog/table_version_handle.py +4 -4
- pixeltable/catalog/table_version_path.py +9 -10
- pixeltable/catalog/tbl_ops.py +9 -3
- pixeltable/catalog/view.py +34 -28
- pixeltable/config.py +14 -10
- pixeltable/dataframe.py +69 -78
- pixeltable/env.py +78 -64
- pixeltable/exec/aggregation_node.py +6 -6
- pixeltable/exec/cache_prefetch_node.py +10 -10
- pixeltable/exec/data_row_batch.py +3 -3
- pixeltable/exec/exec_context.py +16 -4
- pixeltable/exec/exec_node.py +5 -5
- pixeltable/exec/expr_eval/evaluators.py +6 -6
- pixeltable/exec/expr_eval/expr_eval_node.py +8 -7
- pixeltable/exec/expr_eval/globals.py +6 -6
- pixeltable/exec/expr_eval/row_buffer.py +1 -2
- pixeltable/exec/expr_eval/schedulers.py +11 -11
- pixeltable/exec/in_memory_data_node.py +2 -2
- pixeltable/exec/object_store_save_node.py +14 -17
- pixeltable/exec/sql_node.py +28 -27
- pixeltable/exprs/arithmetic_expr.py +4 -4
- pixeltable/exprs/array_slice.py +2 -2
- pixeltable/exprs/column_property_ref.py +3 -3
- pixeltable/exprs/column_ref.py +61 -74
- pixeltable/exprs/comparison.py +5 -5
- pixeltable/exprs/compound_predicate.py +3 -3
- pixeltable/exprs/data_row.py +12 -12
- pixeltable/exprs/expr.py +41 -31
- pixeltable/exprs/expr_dict.py +3 -3
- pixeltable/exprs/expr_set.py +3 -3
- pixeltable/exprs/function_call.py +14 -14
- pixeltable/exprs/in_predicate.py +4 -4
- pixeltable/exprs/inline_expr.py +8 -8
- pixeltable/exprs/is_null.py +1 -3
- pixeltable/exprs/json_mapper.py +8 -8
- pixeltable/exprs/json_path.py +6 -6
- pixeltable/exprs/literal.py +5 -5
- pixeltable/exprs/method_ref.py +2 -2
- pixeltable/exprs/object_ref.py +2 -2
- pixeltable/exprs/row_builder.py +14 -14
- pixeltable/exprs/rowid_ref.py +8 -8
- pixeltable/exprs/similarity_expr.py +50 -25
- pixeltable/exprs/sql_element_cache.py +4 -4
- pixeltable/exprs/string_op.py +2 -2
- pixeltable/exprs/type_cast.py +3 -5
- pixeltable/func/aggregate_function.py +8 -8
- pixeltable/func/callable_function.py +9 -9
- pixeltable/func/expr_template_function.py +3 -3
- pixeltable/func/function.py +15 -17
- pixeltable/func/function_registry.py +6 -7
- pixeltable/func/globals.py +2 -3
- pixeltable/func/mcp.py +2 -2
- pixeltable/func/query_template_function.py +16 -16
- pixeltable/func/signature.py +14 -14
- pixeltable/func/tools.py +11 -11
- pixeltable/func/udf.py +16 -18
- pixeltable/functions/__init__.py +1 -0
- pixeltable/functions/anthropic.py +7 -7
- pixeltable/functions/audio.py +76 -0
- pixeltable/functions/bedrock.py +6 -6
- pixeltable/functions/deepseek.py +4 -4
- pixeltable/functions/fireworks.py +2 -2
- pixeltable/functions/gemini.py +6 -6
- pixeltable/functions/globals.py +12 -12
- pixeltable/functions/groq.py +4 -4
- pixeltable/functions/huggingface.py +1033 -6
- pixeltable/functions/image.py +7 -10
- pixeltable/functions/llama_cpp.py +7 -7
- pixeltable/functions/math.py +2 -3
- pixeltable/functions/mistralai.py +3 -3
- pixeltable/functions/ollama.py +9 -9
- pixeltable/functions/openai.py +21 -21
- pixeltable/functions/openrouter.py +7 -7
- pixeltable/functions/string.py +21 -28
- pixeltable/functions/timestamp.py +7 -8
- pixeltable/functions/together.py +4 -6
- pixeltable/functions/twelvelabs.py +92 -0
- pixeltable/functions/video.py +36 -31
- pixeltable/functions/vision.py +6 -6
- pixeltable/functions/whisper.py +7 -7
- pixeltable/functions/whisperx.py +16 -16
- pixeltable/globals.py +75 -40
- pixeltable/index/base.py +12 -8
- pixeltable/index/btree.py +19 -22
- pixeltable/index/embedding_index.py +30 -39
- pixeltable/io/datarows.py +3 -3
- pixeltable/io/external_store.py +13 -16
- pixeltable/io/fiftyone.py +5 -5
- pixeltable/io/globals.py +5 -5
- pixeltable/io/hf_datasets.py +4 -4
- pixeltable/io/label_studio.py +12 -12
- pixeltable/io/pandas.py +6 -6
- pixeltable/io/parquet.py +2 -2
- pixeltable/io/table_data_conduit.py +12 -12
- pixeltable/io/utils.py +2 -2
- pixeltable/iterators/audio.py +2 -2
- pixeltable/iterators/document.py +88 -57
- pixeltable/iterators/video.py +66 -37
- pixeltable/metadata/converters/convert_18.py +2 -2
- pixeltable/metadata/converters/convert_19.py +2 -2
- pixeltable/metadata/converters/convert_20.py +2 -2
- pixeltable/metadata/converters/convert_21.py +2 -2
- pixeltable/metadata/converters/convert_22.py +2 -2
- pixeltable/metadata/converters/convert_24.py +2 -2
- pixeltable/metadata/converters/convert_25.py +2 -2
- pixeltable/metadata/converters/convert_26.py +2 -2
- pixeltable/metadata/converters/convert_29.py +4 -4
- pixeltable/metadata/converters/convert_34.py +2 -2
- pixeltable/metadata/converters/convert_36.py +2 -2
- pixeltable/metadata/converters/convert_38.py +2 -2
- pixeltable/metadata/converters/convert_39.py +1 -2
- pixeltable/metadata/converters/util.py +11 -13
- pixeltable/metadata/schema.py +22 -21
- pixeltable/metadata/utils.py +2 -6
- pixeltable/mypy/mypy_plugin.py +5 -5
- pixeltable/plan.py +32 -34
- pixeltable/share/packager.py +7 -7
- pixeltable/share/publish.py +3 -3
- pixeltable/store.py +126 -41
- pixeltable/type_system.py +43 -46
- pixeltable/utils/__init__.py +1 -2
- pixeltable/utils/arrow.py +4 -4
- pixeltable/utils/av.py +74 -38
- pixeltable/utils/azure_store.py +305 -0
- pixeltable/utils/code.py +1 -2
- pixeltable/utils/dbms.py +15 -19
- pixeltable/utils/description_helper.py +2 -3
- pixeltable/utils/documents.py +5 -6
- pixeltable/utils/exception_handler.py +2 -2
- pixeltable/utils/filecache.py +5 -5
- pixeltable/utils/formatter.py +4 -6
- pixeltable/utils/gcs_store.py +9 -9
- pixeltable/utils/local_store.py +17 -17
- pixeltable/utils/object_stores.py +59 -43
- pixeltable/utils/s3_store.py +35 -30
- {pixeltable-0.4.17.dist-info → pixeltable-0.4.19.dist-info}/METADATA +4 -4
- pixeltable-0.4.19.dist-info/RECORD +213 -0
- pixeltable/__version__.py +0 -3
- pixeltable-0.4.17.dist-info/RECORD +0 -211
- {pixeltable-0.4.17.dist-info → pixeltable-0.4.19.dist-info}/WHEEL +0 -0
- {pixeltable-0.4.17.dist-info → pixeltable-0.4.19.dist-info}/entry_points.txt +0 -0
- {pixeltable-0.4.17.dist-info → pixeltable-0.4.19.dist-info}/licenses/LICENSE +0 -0
pixeltable/dataframe.py
CHANGED
|
@@ -8,18 +8,7 @@ import json
|
|
|
8
8
|
import logging
|
|
9
9
|
import traceback
|
|
10
10
|
from pathlib import Path
|
|
11
|
-
from typing import
|
|
12
|
-
TYPE_CHECKING,
|
|
13
|
-
Any,
|
|
14
|
-
AsyncIterator,
|
|
15
|
-
Callable,
|
|
16
|
-
Hashable,
|
|
17
|
-
Iterator,
|
|
18
|
-
NoReturn,
|
|
19
|
-
Optional,
|
|
20
|
-
Sequence,
|
|
21
|
-
TypeVar,
|
|
22
|
-
)
|
|
11
|
+
from typing import TYPE_CHECKING, Any, AsyncIterator, Callable, Hashable, Iterator, NoReturn, Sequence, TypeVar
|
|
23
12
|
|
|
24
13
|
import pandas as pd
|
|
25
14
|
import pydantic
|
|
@@ -162,14 +151,14 @@ class DataFrameResultSet:
|
|
|
162
151
|
# # output of the agg stage
|
|
163
152
|
# self.agg_output_exprs: list[exprs.Expr] = []
|
|
164
153
|
# # Where clause of the Select stmt of the SQL scan stage
|
|
165
|
-
# self.sql_where_clause:
|
|
154
|
+
# self.sql_where_clause: sql.ClauseElement | None = None
|
|
166
155
|
# # filter predicate applied to input rows of the SQL scan stage
|
|
167
|
-
# self.filter:
|
|
168
|
-
# self.similarity_clause:
|
|
156
|
+
# self.filter: exprs.Predicate | None = None
|
|
157
|
+
# self.similarity_clause: exprs.ImageSimilarityPredicate | None = None
|
|
169
158
|
# self.agg_fn_calls: list[exprs.FunctionCall] = [] # derived from unique_exprs
|
|
170
159
|
# self.has_frame_col: bool = False # True if we're referencing the frame col
|
|
171
160
|
#
|
|
172
|
-
# self.evaluator:
|
|
161
|
+
# self.evaluator: exprs.Evaluator | None = None
|
|
173
162
|
# self.sql_scan_eval_ctx: list[exprs.Expr] = [] # needed to materialize output of SQL scan stage
|
|
174
163
|
# self.agg_eval_ctx: list[exprs.Expr] = [] # needed to materialize output of agg stage
|
|
175
164
|
# self.filter_eval_ctx: list[exprs.Expr] = []
|
|
@@ -191,24 +180,24 @@ class DataFrame:
|
|
|
191
180
|
_from_clause: plan.FromClause
|
|
192
181
|
_select_list_exprs: list[exprs.Expr]
|
|
193
182
|
_schema: dict[str, ts.ColumnType]
|
|
194
|
-
select_list:
|
|
195
|
-
where_clause:
|
|
196
|
-
group_by_clause:
|
|
197
|
-
grouping_tbl:
|
|
198
|
-
order_by_clause:
|
|
199
|
-
limit_val:
|
|
200
|
-
sample_clause:
|
|
183
|
+
select_list: list[tuple[exprs.Expr, str | None]] | None
|
|
184
|
+
where_clause: exprs.Expr | None
|
|
185
|
+
group_by_clause: list[exprs.Expr] | None
|
|
186
|
+
grouping_tbl: catalog.TableVersion | None
|
|
187
|
+
order_by_clause: list[tuple[exprs.Expr, bool]] | None
|
|
188
|
+
limit_val: exprs.Expr | None
|
|
189
|
+
sample_clause: SampleClause | None
|
|
201
190
|
|
|
202
191
|
def __init__(
|
|
203
192
|
self,
|
|
204
|
-
from_clause:
|
|
205
|
-
select_list:
|
|
206
|
-
where_clause:
|
|
207
|
-
group_by_clause:
|
|
208
|
-
grouping_tbl:
|
|
209
|
-
order_by_clause:
|
|
210
|
-
limit:
|
|
211
|
-
sample_clause:
|
|
193
|
+
from_clause: plan.FromClause | None = None,
|
|
194
|
+
select_list: list[tuple[exprs.Expr, str | None]] | None = None,
|
|
195
|
+
where_clause: exprs.Expr | None = None,
|
|
196
|
+
group_by_clause: list[exprs.Expr] | None = None,
|
|
197
|
+
grouping_tbl: catalog.TableVersion | None = None,
|
|
198
|
+
order_by_clause: list[tuple[exprs.Expr, bool]] | None = None, # list[(expr, asc)]
|
|
199
|
+
limit: exprs.Expr | None = None,
|
|
200
|
+
sample_clause: SampleClause | None = None,
|
|
212
201
|
):
|
|
213
202
|
self._from_clause = from_clause
|
|
214
203
|
|
|
@@ -232,7 +221,7 @@ class DataFrame:
|
|
|
232
221
|
|
|
233
222
|
@classmethod
|
|
234
223
|
def _normalize_select_list(
|
|
235
|
-
cls, tbls: list[catalog.TableVersionPath], select_list:
|
|
224
|
+
cls, tbls: list[catalog.TableVersionPath], select_list: list[tuple[exprs.Expr, str | None]] | None
|
|
236
225
|
) -> tuple[list[exprs.Expr], list[str]]:
|
|
237
226
|
"""
|
|
238
227
|
Expand select list information with all columns and their names
|
|
@@ -293,23 +282,23 @@ class DataFrame:
|
|
|
293
282
|
if var.name not in unique_vars:
|
|
294
283
|
unique_vars[var.name] = var
|
|
295
284
|
elif unique_vars[var.name].col_type != var.col_type:
|
|
296
|
-
raise excs.Error(f'Multiple definitions of parameter {var.name}')
|
|
285
|
+
raise excs.Error(f'Multiple definitions of parameter {var.name!r}')
|
|
297
286
|
return unique_vars
|
|
298
287
|
|
|
299
288
|
@classmethod
|
|
300
289
|
def _convert_param_to_typed_expr(
|
|
301
|
-
cls, v: Any, required_type: ts.ColumnType, required: bool, name: str, range:
|
|
302
|
-
) ->
|
|
290
|
+
cls, v: Any, required_type: ts.ColumnType, required: bool, name: str, range: tuple[Any, Any] | None = None
|
|
291
|
+
) -> exprs.Expr | None:
|
|
303
292
|
if v is None:
|
|
304
293
|
if required:
|
|
305
294
|
raise excs.Error(f'{name!r} parameter must be present')
|
|
306
295
|
return v
|
|
307
296
|
v_expr = exprs.Expr.from_object(v)
|
|
308
297
|
if not v_expr.col_type.matches(required_type):
|
|
309
|
-
raise excs.Error(f'{name!r} parameter must be of type {required_type
|
|
298
|
+
raise excs.Error(f'{name!r} parameter must be of type `{required_type}`; got `{v_expr.col_type}`')
|
|
310
299
|
if range is not None:
|
|
311
300
|
if not isinstance(v_expr, exprs.Literal):
|
|
312
|
-
raise excs.Error(f'{name!r} parameter must be a constant
|
|
301
|
+
raise excs.Error(f'{name!r} parameter must be a constant; got: {v_expr}')
|
|
313
302
|
if range[0] is not None and not (v_expr.val >= range[0]):
|
|
314
303
|
raise excs.Error(f'{name!r} parameter must be >= {range[0]}')
|
|
315
304
|
if range[1] is not None and not (v_expr.val <= range[1]):
|
|
@@ -318,7 +307,7 @@ class DataFrame:
|
|
|
318
307
|
|
|
319
308
|
@classmethod
|
|
320
309
|
def validate_constant_type_range(
|
|
321
|
-
cls, v: Any, required_type: ts.ColumnType, required: bool, name: str, range:
|
|
310
|
+
cls, v: Any, required_type: ts.ColumnType, required: bool, name: str, range: tuple[Any, Any] | None = None
|
|
322
311
|
) -> Any:
|
|
323
312
|
"""Validate that the given named parameter is a constant of the required type and within the specified range."""
|
|
324
313
|
v_expr = cls._convert_param_to_typed_expr(v, required_type, required, name, range)
|
|
@@ -364,7 +353,7 @@ class DataFrame:
|
|
|
364
353
|
|
|
365
354
|
def _create_query_plan(self) -> exec.ExecNode:
|
|
366
355
|
# construct a group-by clause if we're grouping by a table
|
|
367
|
-
group_by_clause:
|
|
356
|
+
group_by_clause: list[exprs.Expr] | None = None
|
|
368
357
|
if self.grouping_tbl is not None:
|
|
369
358
|
assert self.group_by_clause is None
|
|
370
359
|
num_rowid_cols = len(self.grouping_tbl.store_tbl.rowid_columns())
|
|
@@ -387,7 +376,7 @@ class DataFrame:
|
|
|
387
376
|
sample_clause=self.sample_clause,
|
|
388
377
|
)
|
|
389
378
|
|
|
390
|
-
def __rowid_columns(self, num_rowid_cols:
|
|
379
|
+
def __rowid_columns(self, num_rowid_cols: int | None = None) -> list[exprs.Expr]:
|
|
391
380
|
"""Return list of RowidRef for the given number of associated rowids"""
|
|
392
381
|
return Planner.rowid_columns(self._first_tbl.tbl_version, num_rowid_cols)
|
|
393
382
|
|
|
@@ -483,7 +472,7 @@ class DataFrame:
|
|
|
483
472
|
var_expr = vars[arg_name]
|
|
484
473
|
arg_expr = exprs.Expr.from_object(arg_val)
|
|
485
474
|
if arg_expr is None:
|
|
486
|
-
raise excs.Error(f'
|
|
475
|
+
raise excs.Error(f'That argument cannot be converted to a Pixeltable expression: {arg_val}')
|
|
487
476
|
var_exprs[var_expr] = arg_expr
|
|
488
477
|
|
|
489
478
|
exprs.Expr.list_substitute(select_list_exprs, var_exprs)
|
|
@@ -495,7 +484,7 @@ class DataFrame:
|
|
|
495
484
|
exprs.Expr.list_substitute(order_by_exprs, var_exprs)
|
|
496
485
|
|
|
497
486
|
select_list = list(zip(select_list_exprs, self.schema.keys()))
|
|
498
|
-
order_by_clause:
|
|
487
|
+
order_by_clause: list[tuple[exprs.Expr, bool]] | None = None
|
|
499
488
|
if order_by_exprs is not None:
|
|
500
489
|
order_by_clause = [
|
|
501
490
|
(expr, asc) for expr, asc in zip(order_by_exprs, [asc for _, asc in self.order_by_clause])
|
|
@@ -503,7 +492,7 @@ class DataFrame:
|
|
|
503
492
|
if limit_val is not None:
|
|
504
493
|
limit_val = limit_val.substitute(var_exprs)
|
|
505
494
|
if limit_val is not None and not isinstance(limit_val, exprs.Literal):
|
|
506
|
-
raise excs.Error(f'limit(): parameter must be a constant
|
|
495
|
+
raise excs.Error(f'limit(): parameter must be a constant; got: {limit_val}')
|
|
507
496
|
|
|
508
497
|
return DataFrame(
|
|
509
498
|
from_clause=self._from_clause,
|
|
@@ -683,7 +672,7 @@ class DataFrame:
|
|
|
683
672
|
return self
|
|
684
673
|
|
|
685
674
|
# analyze select list; wrap literals with the corresponding expressions
|
|
686
|
-
select_list: list[tuple[exprs.Expr,
|
|
675
|
+
select_list: list[tuple[exprs.Expr, str | None]] = []
|
|
687
676
|
for raw_expr, name in base_list:
|
|
688
677
|
expr = exprs.Expr.from_object(raw_expr)
|
|
689
678
|
if expr is None:
|
|
@@ -703,8 +692,8 @@ class DataFrame:
|
|
|
703
692
|
pass
|
|
704
693
|
if not expr.is_bound_by(self._from_clause.tbls):
|
|
705
694
|
raise excs.Error(
|
|
706
|
-
f"
|
|
707
|
-
f'({",".join(tbl.tbl_version.get().versioned_name for tbl in self._from_clause.tbls)})'
|
|
695
|
+
f"That expression cannot be evaluated in the context of this query's tables "
|
|
696
|
+
f'({",".join(tbl.tbl_version.get().versioned_name for tbl in self._from_clause.tbls)}): {expr}'
|
|
708
697
|
)
|
|
709
698
|
select_list.append((expr, name))
|
|
710
699
|
|
|
@@ -715,7 +704,7 @@ class DataFrame:
|
|
|
715
704
|
if name in seen:
|
|
716
705
|
repeated_names = [j for j, x in enumerate(names) if x == name]
|
|
717
706
|
pretty = ', '.join(map(str, repeated_names))
|
|
718
|
-
raise excs.Error(f'Repeated column name
|
|
707
|
+
raise excs.Error(f'Repeated column name {name!r} in select() at positions: {pretty}')
|
|
719
708
|
seen.add(name)
|
|
720
709
|
|
|
721
710
|
return DataFrame(
|
|
@@ -753,13 +742,13 @@ class DataFrame:
|
|
|
753
742
|
>>> df = person.where(t.age > 30)
|
|
754
743
|
"""
|
|
755
744
|
if self.where_clause is not None:
|
|
756
|
-
raise excs.Error('
|
|
745
|
+
raise excs.Error('where() clause already specified')
|
|
757
746
|
if self.sample_clause is not None:
|
|
758
|
-
raise excs.Error('where cannot be used after sample()')
|
|
747
|
+
raise excs.Error('where() cannot be used after sample()')
|
|
759
748
|
if not isinstance(pred, exprs.Expr):
|
|
760
|
-
raise excs.Error(f'
|
|
749
|
+
raise excs.Error(f'where() expects a Pixeltable expression; got: {pred}')
|
|
761
750
|
if not pred.col_type.is_bool_type():
|
|
762
|
-
raise excs.Error(f'
|
|
751
|
+
raise excs.Error(f'where() expression needs to return `Bool`, but instead returns `{pred.col_type}`')
|
|
763
752
|
return DataFrame(
|
|
764
753
|
from_clause=self._from_clause,
|
|
765
754
|
select_list=self.select_list,
|
|
@@ -781,19 +770,21 @@ class DataFrame:
|
|
|
781
770
|
on = [on]
|
|
782
771
|
elif isinstance(on, exprs.Expr):
|
|
783
772
|
if not on.is_bound_by(joined_tbls):
|
|
784
|
-
raise excs.Error(f
|
|
773
|
+
raise excs.Error(f'`on` expression cannot be evaluated in the context of the joined tables: {on}')
|
|
785
774
|
if not on.col_type.is_bool_type():
|
|
786
|
-
raise excs.Error(
|
|
775
|
+
raise excs.Error(
|
|
776
|
+
f'`on` expects an expression of type `Bool`, but got one of type `{on.col_type}`: {on}'
|
|
777
|
+
)
|
|
787
778
|
return on
|
|
788
779
|
elif not isinstance(on, Sequence) or len(on) == 0:
|
|
789
|
-
raise excs.Error(
|
|
780
|
+
raise excs.Error('`on` must be a sequence of column references or a boolean expression')
|
|
790
781
|
|
|
791
782
|
assert isinstance(on, Sequence)
|
|
792
783
|
for col_ref in on:
|
|
793
784
|
if not isinstance(col_ref, exprs.ColumnRef):
|
|
794
|
-
raise excs.Error(
|
|
785
|
+
raise excs.Error('`on` must be a sequence of column references or a boolean expression')
|
|
795
786
|
if not col_ref.is_bound_by(joined_tbls):
|
|
796
|
-
raise excs.Error(f
|
|
787
|
+
raise excs.Error(f'`on` expression cannot be evaluated in the context of the joined tables: {col_ref}')
|
|
797
788
|
col_refs.append(col_ref)
|
|
798
789
|
|
|
799
790
|
predicates: list[exprs.Expr] = []
|
|
@@ -803,10 +794,10 @@ class DataFrame:
|
|
|
803
794
|
# identify the referenced column by name in 'other'
|
|
804
795
|
rhs_col = other.get_column(col_ref.col.name)
|
|
805
796
|
if rhs_col is None:
|
|
806
|
-
raise excs.Error(f
|
|
797
|
+
raise excs.Error(f'`on` column {col_ref.col.name!r} not found in joined table')
|
|
807
798
|
rhs_col_ref = exprs.ColumnRef(rhs_col)
|
|
808
799
|
|
|
809
|
-
lhs_col_ref:
|
|
800
|
+
lhs_col_ref: exprs.ColumnRef | None = None
|
|
810
801
|
if any(tbl.has_column(col_ref.col) for tbl in self._from_clause.tbls):
|
|
811
802
|
# col_ref comes from the existing from_clause, we use that directly
|
|
812
803
|
lhs_col_ref = col_ref
|
|
@@ -817,11 +808,11 @@ class DataFrame:
|
|
|
817
808
|
if col is None:
|
|
818
809
|
continue
|
|
819
810
|
if lhs_col_ref is not None:
|
|
820
|
-
raise excs.Error(f
|
|
811
|
+
raise excs.Error(f'`on`: ambiguous column reference: {col_ref.col.name}')
|
|
821
812
|
lhs_col_ref = exprs.ColumnRef(col)
|
|
822
813
|
if lhs_col_ref is None:
|
|
823
814
|
tbl_names = [tbl.tbl_name() for tbl in self._from_clause.tbls]
|
|
824
|
-
raise excs.Error(f
|
|
815
|
+
raise excs.Error(f'`on`: column {col_ref.col.name!r} not found in any of: {" ".join(tbl_names)}')
|
|
825
816
|
pred = exprs.Comparison(exprs.ComparisonOperator.EQ, lhs_col_ref, rhs_col_ref)
|
|
826
817
|
predicates.append(pred)
|
|
827
818
|
|
|
@@ -885,16 +876,16 @@ class DataFrame:
|
|
|
885
876
|
"""
|
|
886
877
|
if self.sample_clause is not None:
|
|
887
878
|
raise excs.Error('join() cannot be used with sample()')
|
|
888
|
-
join_pred:
|
|
879
|
+
join_pred: exprs.Expr | None
|
|
889
880
|
if how == 'cross':
|
|
890
881
|
if on is not None:
|
|
891
|
-
raise excs.Error(
|
|
882
|
+
raise excs.Error('`on` not allowed for cross join')
|
|
892
883
|
join_pred = None
|
|
893
884
|
else:
|
|
894
885
|
if on is None:
|
|
895
|
-
raise excs.Error(f
|
|
886
|
+
raise excs.Error(f'`how={how!r}` requires `on` to be present')
|
|
896
887
|
join_pred = self._create_join_predicate(other._tbl_version_path, on)
|
|
897
|
-
join_clause = plan.JoinClause(join_type=plan.JoinType.validated(how,
|
|
888
|
+
join_clause = plan.JoinClause(join_type=plan.JoinType.validated(how, '`how`'), join_predicate=join_pred)
|
|
898
889
|
from_clause = plan.FromClause(
|
|
899
890
|
tbls=[*self._from_clause.tbls, other._tbl_version_path],
|
|
900
891
|
join_clauses=[*self._from_clause.join_clauses, join_clause],
|
|
@@ -951,16 +942,16 @@ class DataFrame:
|
|
|
951
942
|
>>> df = book.group_by(t.genre).select(t.genre, total=sum(t.price)).show()
|
|
952
943
|
"""
|
|
953
944
|
if self.group_by_clause is not None:
|
|
954
|
-
raise excs.Error('
|
|
945
|
+
raise excs.Error('group_by() already specified')
|
|
955
946
|
if self.sample_clause is not None:
|
|
956
947
|
raise excs.Error('group_by() cannot be used with sample()')
|
|
957
948
|
|
|
958
|
-
grouping_tbl:
|
|
959
|
-
group_by_clause:
|
|
949
|
+
grouping_tbl: catalog.TableVersion | None = None
|
|
950
|
+
group_by_clause: list[exprs.Expr] | None = None
|
|
960
951
|
for item in grouping_items:
|
|
961
952
|
if isinstance(item, (catalog.Table, catalog.TableVersion)):
|
|
962
953
|
if len(grouping_items) > 1:
|
|
963
|
-
raise excs.Error('group_by(): only one
|
|
954
|
+
raise excs.Error('group_by(): only one Table can be specified')
|
|
964
955
|
if len(self._from_clause.tbls) > 1:
|
|
965
956
|
raise excs.Error('group_by() with Table not supported for joins')
|
|
966
957
|
grouping_tbl = item if isinstance(item, catalog.TableVersion) else item._tbl_version.get()
|
|
@@ -968,7 +959,7 @@ class DataFrame:
|
|
|
968
959
|
base = self._first_tbl.find_tbl_version(grouping_tbl.id)
|
|
969
960
|
if base is None or base.id == self._first_tbl.tbl_id:
|
|
970
961
|
raise excs.Error(
|
|
971
|
-
f'group_by(): {grouping_tbl.name} is not a base table of {self._first_tbl.tbl_name()}'
|
|
962
|
+
f'group_by(): {grouping_tbl.name!r} is not a base table of {self._first_tbl.tbl_name()!r}'
|
|
972
963
|
)
|
|
973
964
|
break
|
|
974
965
|
if not isinstance(item, exprs.Expr):
|
|
@@ -1039,7 +1030,7 @@ class DataFrame:
|
|
|
1039
1030
|
>>> df = book.order_by(t.price, asc=False).order_by(t.pages)
|
|
1040
1031
|
"""
|
|
1041
1032
|
if self.sample_clause is not None:
|
|
1042
|
-
raise excs.Error('
|
|
1033
|
+
raise excs.Error('order_by() cannot be used with sample()')
|
|
1043
1034
|
for e in expr_list:
|
|
1044
1035
|
if not isinstance(e, exprs.Expr):
|
|
1045
1036
|
raise excs.Error(f'Invalid expression in order_by(): {e}')
|
|
@@ -1080,10 +1071,10 @@ class DataFrame:
|
|
|
1080
1071
|
|
|
1081
1072
|
def sample(
|
|
1082
1073
|
self,
|
|
1083
|
-
n:
|
|
1084
|
-
n_per_stratum:
|
|
1085
|
-
fraction:
|
|
1086
|
-
seed:
|
|
1074
|
+
n: int | None = None,
|
|
1075
|
+
n_per_stratum: int | None = None,
|
|
1076
|
+
fraction: float | None = None,
|
|
1077
|
+
seed: int | None = None,
|
|
1087
1078
|
stratify_by: Any = None,
|
|
1088
1079
|
) -> DataFrame:
|
|
1089
1080
|
"""
|
|
@@ -1137,7 +1128,7 @@ class DataFrame:
|
|
|
1137
1128
|
"""
|
|
1138
1129
|
# Check context of usage
|
|
1139
1130
|
if self.sample_clause is not None:
|
|
1140
|
-
raise excs.Error('sample()
|
|
1131
|
+
raise excs.Error('Multiple sample() clauses not allowed')
|
|
1141
1132
|
if self.group_by_clause is not None:
|
|
1142
1133
|
raise excs.Error('sample() cannot be used with group_by()')
|
|
1143
1134
|
if self.order_by_clause is not None:
|
|
@@ -1174,11 +1165,11 @@ class DataFrame:
|
|
|
1174
1165
|
if expr is None or not isinstance(expr, exprs.Expr):
|
|
1175
1166
|
raise excs.Error(f'Invalid expression: {expr}')
|
|
1176
1167
|
if not expr.col_type.is_scalar_type():
|
|
1177
|
-
raise excs.Error(f'Invalid type: expression must be a scalar type (not {expr.col_type})')
|
|
1168
|
+
raise excs.Error(f'Invalid type: expression must be a scalar type (not `{expr.col_type}`)')
|
|
1178
1169
|
if not expr.is_bound_by(self._from_clause.tbls):
|
|
1179
1170
|
raise excs.Error(
|
|
1180
|
-
f"
|
|
1181
|
-
f'({",".join(tbl.tbl_name() for tbl in self._from_clause.tbls)})'
|
|
1171
|
+
f"That expression cannot be evaluated in the context of this query's tables "
|
|
1172
|
+
f'({",".join(tbl.tbl_name() for tbl in self._from_clause.tbls)}): {expr}'
|
|
1182
1173
|
)
|
|
1183
1174
|
stratify_exprs.append(expr)
|
|
1184
1175
|
|