pixeltable 0.3.15__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__init__.py +1 -1
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/__init__.py +9 -1
- pixeltable/catalog/catalog.py +559 -134
- pixeltable/catalog/column.py +36 -32
- pixeltable/catalog/dir.py +1 -2
- pixeltable/catalog/globals.py +12 -0
- pixeltable/catalog/insertable_table.py +30 -25
- pixeltable/catalog/schema_object.py +9 -6
- pixeltable/catalog/table.py +334 -267
- pixeltable/catalog/table_version.py +358 -241
- pixeltable/catalog/table_version_handle.py +18 -2
- pixeltable/catalog/table_version_path.py +86 -16
- pixeltable/catalog/view.py +47 -23
- pixeltable/dataframe.py +198 -19
- pixeltable/env.py +6 -4
- pixeltable/exceptions.py +6 -0
- pixeltable/exec/__init__.py +1 -1
- pixeltable/exec/exec_node.py +2 -0
- pixeltable/exec/expr_eval/evaluators.py +4 -1
- pixeltable/exec/expr_eval/expr_eval_node.py +4 -4
- pixeltable/exec/in_memory_data_node.py +1 -1
- pixeltable/exec/sql_node.py +188 -22
- pixeltable/exprs/column_property_ref.py +16 -6
- pixeltable/exprs/column_ref.py +33 -11
- pixeltable/exprs/comparison.py +1 -1
- pixeltable/exprs/data_row.py +5 -3
- pixeltable/exprs/expr.py +11 -4
- pixeltable/exprs/literal.py +2 -0
- pixeltable/exprs/row_builder.py +4 -6
- pixeltable/exprs/rowid_ref.py +8 -0
- pixeltable/exprs/similarity_expr.py +1 -0
- pixeltable/func/__init__.py +1 -0
- pixeltable/func/mcp.py +74 -0
- pixeltable/func/query_template_function.py +5 -3
- pixeltable/func/tools.py +12 -2
- pixeltable/func/udf.py +2 -2
- pixeltable/functions/__init__.py +1 -0
- pixeltable/functions/anthropic.py +19 -45
- pixeltable/functions/deepseek.py +19 -38
- pixeltable/functions/fireworks.py +9 -18
- pixeltable/functions/gemini.py +2 -3
- pixeltable/functions/groq.py +108 -0
- pixeltable/functions/llama_cpp.py +6 -6
- pixeltable/functions/mistralai.py +16 -53
- pixeltable/functions/ollama.py +1 -1
- pixeltable/functions/openai.py +82 -165
- pixeltable/functions/string.py +212 -58
- pixeltable/functions/together.py +22 -80
- pixeltable/globals.py +10 -4
- pixeltable/index/base.py +5 -0
- pixeltable/index/btree.py +5 -0
- pixeltable/index/embedding_index.py +5 -0
- pixeltable/io/external_store.py +10 -31
- pixeltable/io/label_studio.py +5 -5
- pixeltable/io/parquet.py +2 -2
- pixeltable/io/table_data_conduit.py +1 -32
- pixeltable/metadata/__init__.py +11 -2
- pixeltable/metadata/converters/convert_13.py +2 -2
- pixeltable/metadata/converters/convert_30.py +6 -11
- pixeltable/metadata/converters/convert_35.py +9 -0
- pixeltable/metadata/converters/convert_36.py +38 -0
- pixeltable/metadata/converters/convert_37.py +15 -0
- pixeltable/metadata/converters/util.py +3 -9
- pixeltable/metadata/notes.py +3 -0
- pixeltable/metadata/schema.py +13 -1
- pixeltable/plan.py +135 -12
- pixeltable/share/packager.py +138 -14
- pixeltable/share/publish.py +2 -2
- pixeltable/store.py +19 -13
- pixeltable/type_system.py +30 -0
- pixeltable/utils/dbms.py +1 -1
- pixeltable/utils/formatter.py +64 -42
- {pixeltable-0.3.15.dist-info → pixeltable-0.4.0.dist-info}/METADATA +2 -1
- {pixeltable-0.3.15.dist-info → pixeltable-0.4.0.dist-info}/RECORD +78 -73
- {pixeltable-0.3.15.dist-info → pixeltable-0.4.0.dist-info}/LICENSE +0 -0
- {pixeltable-0.3.15.dist-info → pixeltable-0.4.0.dist-info}/WHEEL +0 -0
- {pixeltable-0.3.15.dist-info → pixeltable-0.4.0.dist-info}/entry_points.txt +0 -0
pixeltable/dataframe.py
CHANGED
|
@@ -14,9 +14,10 @@ import pandas as pd
|
|
|
14
14
|
import sqlalchemy as sql
|
|
15
15
|
|
|
16
16
|
from pixeltable import catalog, exceptions as excs, exec, exprs, plan, type_system as ts
|
|
17
|
-
from pixeltable.catalog import is_valid_identifier
|
|
17
|
+
from pixeltable.catalog import Catalog, is_valid_identifier
|
|
18
18
|
from pixeltable.catalog.globals import UpdateStatus
|
|
19
19
|
from pixeltable.env import Env
|
|
20
|
+
from pixeltable.plan import Planner, SampleClause
|
|
20
21
|
from pixeltable.type_system import ColumnType
|
|
21
22
|
from pixeltable.utils.description_helper import DescriptionHelper
|
|
22
23
|
from pixeltable.utils.formatter import Formatter
|
|
@@ -139,6 +140,7 @@ class DataFrame:
|
|
|
139
140
|
grouping_tbl: Optional[catalog.TableVersion]
|
|
140
141
|
order_by_clause: Optional[list[tuple[exprs.Expr, bool]]]
|
|
141
142
|
limit_val: Optional[exprs.Expr]
|
|
143
|
+
sample_clause: Optional[SampleClause]
|
|
142
144
|
|
|
143
145
|
def __init__(
|
|
144
146
|
self,
|
|
@@ -149,6 +151,7 @@ class DataFrame:
|
|
|
149
151
|
grouping_tbl: Optional[catalog.TableVersion] = None,
|
|
150
152
|
order_by_clause: Optional[list[tuple[exprs.Expr, bool]]] = None, # list[(expr, asc)]
|
|
151
153
|
limit: Optional[exprs.Expr] = None,
|
|
154
|
+
sample_clause: Optional[SampleClause] = None,
|
|
152
155
|
):
|
|
153
156
|
self._from_clause = from_clause
|
|
154
157
|
|
|
@@ -168,6 +171,7 @@ class DataFrame:
|
|
|
168
171
|
self.grouping_tbl = grouping_tbl
|
|
169
172
|
self.order_by_clause = copy.deepcopy(order_by_clause)
|
|
170
173
|
self.limit_val = limit
|
|
174
|
+
self.sample_clause = sample_clause
|
|
171
175
|
|
|
172
176
|
@classmethod
|
|
173
177
|
def _normalize_select_list(
|
|
@@ -210,8 +214,7 @@ class DataFrame:
|
|
|
210
214
|
|
|
211
215
|
@property
|
|
212
216
|
def _first_tbl(self) -> catalog.TableVersionPath:
|
|
213
|
-
|
|
214
|
-
return self._from_clause.tbls[0]
|
|
217
|
+
return self._from_clause._first_tbl
|
|
215
218
|
|
|
216
219
|
def _vars(self) -> dict[str, exprs.Variable]:
|
|
217
220
|
"""
|
|
@@ -236,6 +239,36 @@ class DataFrame:
|
|
|
236
239
|
raise excs.Error(f'Multiple definitions of parameter {var.name}')
|
|
237
240
|
return unique_vars
|
|
238
241
|
|
|
242
|
+
@classmethod
|
|
243
|
+
def _convert_param_to_typed_expr(
|
|
244
|
+
cls, v: Any, required_type: ts.ColumnType, required: bool, name: str, range: Optional[tuple[Any, Any]] = None
|
|
245
|
+
) -> Optional[exprs.Expr]:
|
|
246
|
+
if v is None:
|
|
247
|
+
if required:
|
|
248
|
+
raise excs.Error(f'{name!r} parameter must be present')
|
|
249
|
+
return v
|
|
250
|
+
v_expr = exprs.Expr.from_object(v)
|
|
251
|
+
if not v_expr.col_type.matches(required_type):
|
|
252
|
+
raise excs.Error(f'{name!r} parameter must be of type {required_type!r}, instead of {v_expr.col_type}')
|
|
253
|
+
if range is not None:
|
|
254
|
+
if not isinstance(v_expr, exprs.Literal):
|
|
255
|
+
raise excs.Error(f'{name!r} parameter must be a constant, not {v_expr}')
|
|
256
|
+
if range[0] is not None and not (v_expr.val >= range[0]):
|
|
257
|
+
raise excs.Error(f'{name!r} parameter must be >= {range[0]}')
|
|
258
|
+
if range[1] is not None and not (v_expr.val <= range[1]):
|
|
259
|
+
raise excs.Error(f'{name!r} parameter must be <= {range[1]}')
|
|
260
|
+
return v_expr
|
|
261
|
+
|
|
262
|
+
@classmethod
|
|
263
|
+
def validate_constant_type_range(
|
|
264
|
+
cls, v: Any, required_type: ts.ColumnType, required: bool, name: str, range: Optional[tuple[Any, Any]] = None
|
|
265
|
+
) -> Any:
|
|
266
|
+
"""Validate that the given named parameter is a constant of the required type and within the specified range."""
|
|
267
|
+
v_expr = cls._convert_param_to_typed_expr(v, required_type, required, name, range)
|
|
268
|
+
if v_expr is None:
|
|
269
|
+
return None
|
|
270
|
+
return v_expr.val
|
|
271
|
+
|
|
239
272
|
def parameters(self) -> dict[str, ColumnType]:
|
|
240
273
|
"""Return a dict mapping parameter name to parameter type.
|
|
241
274
|
|
|
@@ -280,7 +313,7 @@ class DataFrame:
|
|
|
280
313
|
num_rowid_cols = len(self.grouping_tbl.store_tbl.rowid_columns())
|
|
281
314
|
# the grouping table must be a base of self.tbl
|
|
282
315
|
assert num_rowid_cols <= len(self._first_tbl.tbl_version.get().store_tbl.rowid_columns())
|
|
283
|
-
group_by_clause =
|
|
316
|
+
group_by_clause = self.__rowid_columns(num_rowid_cols)
|
|
284
317
|
elif self.group_by_clause is not None:
|
|
285
318
|
group_by_clause = self.group_by_clause
|
|
286
319
|
|
|
@@ -292,14 +325,21 @@ class DataFrame:
|
|
|
292
325
|
self._select_list_exprs,
|
|
293
326
|
where_clause=self.where_clause,
|
|
294
327
|
group_by_clause=group_by_clause,
|
|
295
|
-
order_by_clause=self.order_by_clause
|
|
328
|
+
order_by_clause=self.order_by_clause,
|
|
296
329
|
limit=self.limit_val,
|
|
330
|
+
sample_clause=self.sample_clause,
|
|
297
331
|
)
|
|
298
332
|
|
|
333
|
+
def __rowid_columns(self, num_rowid_cols: Optional[int] = None) -> list[exprs.Expr]:
|
|
334
|
+
"""Return list of RowidRef for the given number of associated rowids"""
|
|
335
|
+
return Planner.rowid_columns(self._first_tbl.tbl_version, num_rowid_cols)
|
|
336
|
+
|
|
299
337
|
def _has_joins(self) -> bool:
|
|
300
338
|
return len(self._from_clause.join_clauses) > 0
|
|
301
339
|
|
|
302
340
|
def show(self, n: int = 20) -> DataFrameResultSet:
|
|
341
|
+
if self.sample_clause is not None:
|
|
342
|
+
raise excs.Error('show() cannot be used with sample()')
|
|
303
343
|
assert n is not None
|
|
304
344
|
return self.limit(n).collect()
|
|
305
345
|
|
|
@@ -322,6 +362,8 @@ class DataFrame:
|
|
|
322
362
|
raise excs.Error('head() cannot be used with order_by()')
|
|
323
363
|
if self._has_joins():
|
|
324
364
|
raise excs.Error('head() not supported for joins')
|
|
365
|
+
if self.sample_clause is not None:
|
|
366
|
+
raise excs.Error('head() cannot be used with sample()')
|
|
325
367
|
if self.group_by_clause is not None:
|
|
326
368
|
raise excs.Error('head() cannot be used with group_by()')
|
|
327
369
|
num_rowid_cols = len(self._first_tbl.tbl_version.get().store_tbl.rowid_columns())
|
|
@@ -347,6 +389,8 @@ class DataFrame:
|
|
|
347
389
|
raise excs.Error('tail() cannot be used with order_by()')
|
|
348
390
|
if self._has_joins():
|
|
349
391
|
raise excs.Error('tail() not supported for joins')
|
|
392
|
+
if self.sample_clause is not None:
|
|
393
|
+
raise excs.Error('tail() cannot be used with sample()')
|
|
350
394
|
if self.group_by_clause is not None:
|
|
351
395
|
raise excs.Error('tail() cannot be used with group_by()')
|
|
352
396
|
num_rowid_cols = len(self._first_tbl.tbl_version.get().store_tbl.rowid_columns())
|
|
@@ -431,7 +475,9 @@ class DataFrame:
|
|
|
431
475
|
raise excs.Error(msg) from e
|
|
432
476
|
|
|
433
477
|
def _output_row_iterator(self) -> Iterator[list]:
|
|
434
|
-
|
|
478
|
+
# TODO: extend begin_xact() to accept multiple TVPs for joins
|
|
479
|
+
single_tbl = self._first_tbl if len(self._from_clause.tbls) == 1 else None
|
|
480
|
+
with Catalog.get().begin_xact(tbl=single_tbl, for_write=False):
|
|
435
481
|
try:
|
|
436
482
|
for data_row in self._exec():
|
|
437
483
|
yield [data_row[e.slot_idx] for e in self._select_list_exprs]
|
|
@@ -463,8 +509,8 @@ class DataFrame:
|
|
|
463
509
|
|
|
464
510
|
from pixeltable.plan import Planner
|
|
465
511
|
|
|
466
|
-
|
|
467
|
-
|
|
512
|
+
with Catalog.get().begin_xact(tbl=self._first_tbl, for_write=False) as conn:
|
|
513
|
+
stmt = Planner.create_count_stmt(self._first_tbl, self.where_clause)
|
|
468
514
|
result: int = conn.execute(stmt).scalar_one()
|
|
469
515
|
assert isinstance(result, int)
|
|
470
516
|
return result
|
|
@@ -510,6 +556,9 @@ class DataFrame:
|
|
|
510
556
|
if self.limit_val is not None:
|
|
511
557
|
heading_vals.append('Limit')
|
|
512
558
|
info_vals.append(self.limit_val.display_str(inline=False))
|
|
559
|
+
if self.sample_clause is not None:
|
|
560
|
+
heading_vals.append('Sample')
|
|
561
|
+
info_vals.append(self.sample_clause.display_str(inline=False))
|
|
513
562
|
assert len(heading_vals) == len(info_vals)
|
|
514
563
|
return pd.DataFrame(info_vals, index=heading_vals)
|
|
515
564
|
|
|
@@ -644,6 +693,8 @@ class DataFrame:
|
|
|
644
693
|
"""
|
|
645
694
|
if self.where_clause is not None:
|
|
646
695
|
raise excs.Error('Where clause already specified')
|
|
696
|
+
if self.sample_clause is not None:
|
|
697
|
+
raise excs.Error('where cannot be used after sample()')
|
|
647
698
|
if not isinstance(pred, exprs.Expr):
|
|
648
699
|
raise excs.Error(f'Where() requires a Pixeltable expression, but instead got {type(pred)}')
|
|
649
700
|
if not pred.col_type.is_bool_type():
|
|
@@ -771,6 +822,8 @@ class DataFrame:
|
|
|
771
822
|
|
|
772
823
|
>>> df = t.join(d, on=(t.d1 == d.pk1) & (t.d2 == d.pk2), how='left')
|
|
773
824
|
"""
|
|
825
|
+
if self.sample_clause is not None:
|
|
826
|
+
raise excs.Error('join() cannot be used with sample()')
|
|
774
827
|
join_pred: Optional[exprs.Expr]
|
|
775
828
|
if how == 'cross':
|
|
776
829
|
if on is not None:
|
|
@@ -838,6 +891,9 @@ class DataFrame:
|
|
|
838
891
|
"""
|
|
839
892
|
if self.group_by_clause is not None:
|
|
840
893
|
raise excs.Error('Group-by already specified')
|
|
894
|
+
if self.sample_clause is not None:
|
|
895
|
+
raise excs.Error('group_by() cannot be used with sample()')
|
|
896
|
+
|
|
841
897
|
grouping_tbl: Optional[catalog.TableVersion] = None
|
|
842
898
|
group_by_clause: Optional[list[exprs.Expr]] = None
|
|
843
899
|
for item in grouping_items:
|
|
@@ -849,7 +905,7 @@ class DataFrame:
|
|
|
849
905
|
grouping_tbl = item if isinstance(item, catalog.TableVersion) else item._tbl_version.get()
|
|
850
906
|
# we need to make sure that the grouping table is a base of self.tbl
|
|
851
907
|
base = self._first_tbl.find_tbl_version(grouping_tbl.id)
|
|
852
|
-
if base is None or base.id == self._first_tbl.tbl_id
|
|
908
|
+
if base is None or base.id == self._first_tbl.tbl_id:
|
|
853
909
|
raise excs.Error(
|
|
854
910
|
f'group_by(): {grouping_tbl.name} is not a base table of {self._first_tbl.tbl_name()}'
|
|
855
911
|
)
|
|
@@ -921,6 +977,8 @@ class DataFrame:
|
|
|
921
977
|
|
|
922
978
|
>>> df = book.order_by(t.price, asc=False).order_by(t.pages)
|
|
923
979
|
"""
|
|
980
|
+
if self.sample_clause is not None:
|
|
981
|
+
raise excs.Error('order_by() cannot be used with sample()')
|
|
924
982
|
for e in expr_list:
|
|
925
983
|
if not isinstance(e, exprs.Expr):
|
|
926
984
|
raise excs.Error(f'Invalid expression in order_by(): {e}')
|
|
@@ -945,10 +1003,10 @@ class DataFrame:
|
|
|
945
1003
|
Returns:
|
|
946
1004
|
A new DataFrame with the specified limited rows.
|
|
947
1005
|
"""
|
|
948
|
-
|
|
949
|
-
|
|
950
|
-
|
|
951
|
-
|
|
1006
|
+
if self.sample_clause is not None:
|
|
1007
|
+
raise excs.Error('limit() cannot be used with sample()')
|
|
1008
|
+
|
|
1009
|
+
limit_expr = self._convert_param_to_typed_expr(n, ts.IntType(nullable=False), True, 'limit()')
|
|
952
1010
|
return DataFrame(
|
|
953
1011
|
from_clause=self._from_clause,
|
|
954
1012
|
select_list=self.select_list,
|
|
@@ -956,7 +1014,124 @@ class DataFrame:
|
|
|
956
1014
|
group_by_clause=self.group_by_clause,
|
|
957
1015
|
grouping_tbl=self.grouping_tbl,
|
|
958
1016
|
order_by_clause=self.order_by_clause,
|
|
959
|
-
limit=
|
|
1017
|
+
limit=limit_expr,
|
|
1018
|
+
)
|
|
1019
|
+
|
|
1020
|
+
def sample(
|
|
1021
|
+
self,
|
|
1022
|
+
n: Optional[int] = None,
|
|
1023
|
+
n_per_stratum: Optional[int] = None,
|
|
1024
|
+
fraction: Optional[float] = None,
|
|
1025
|
+
seed: Optional[int] = None,
|
|
1026
|
+
stratify_by: Any = None,
|
|
1027
|
+
) -> DataFrame:
|
|
1028
|
+
"""
|
|
1029
|
+
Return a new DataFrame specifying a sample of rows from the DataFrame, considered in a shuffled order.
|
|
1030
|
+
|
|
1031
|
+
The size of the sample can be specified in three ways:
|
|
1032
|
+
|
|
1033
|
+
- `n`: the total number of rows to produce as a sample
|
|
1034
|
+
- `n_per_stratum`: the number of rows to produce per stratum as a sample
|
|
1035
|
+
- `fraction`: the fraction of available rows to produce as a sample
|
|
1036
|
+
|
|
1037
|
+
The sample can be stratified by one or more columns, which means that the sample will
|
|
1038
|
+
be selected from each stratum separately.
|
|
1039
|
+
|
|
1040
|
+
The data is shuffled before creating the sample.
|
|
1041
|
+
|
|
1042
|
+
Args:
|
|
1043
|
+
n: Total number of rows to produce as a sample.
|
|
1044
|
+
n_per_stratum: Number of rows to produce per stratum as a sample. This parameter is only valid if
|
|
1045
|
+
`stratify_by` is specified. Only one of `n` or `n_per_stratum` can be specified.
|
|
1046
|
+
fraction: Fraction of available rows to produce as a sample. This parameter is not usable with `n` or
|
|
1047
|
+
`n_per_stratum`. The fraction must be between 0.0 and 1.0.
|
|
1048
|
+
seed: Random seed for reproducible shuffling
|
|
1049
|
+
stratify_by: If specified, the sample will be stratified by these values.
|
|
1050
|
+
|
|
1051
|
+
Returns:
|
|
1052
|
+
A new DataFrame which specifies the sampled rows
|
|
1053
|
+
|
|
1054
|
+
Examples:
|
|
1055
|
+
Given the Table `person` containing the field 'age', we can create samples of the table in various ways:
|
|
1056
|
+
|
|
1057
|
+
Sample 100 rows from the above Table:
|
|
1058
|
+
|
|
1059
|
+
>>> df = person.sample(n=100)
|
|
1060
|
+
|
|
1061
|
+
Sample 10% of the rows from the above Table:
|
|
1062
|
+
|
|
1063
|
+
>>> df = person.sample(fraction=0.1)
|
|
1064
|
+
|
|
1065
|
+
Sample 10% of the rows from the above Table, stratified by the column 'age':
|
|
1066
|
+
|
|
1067
|
+
>>> df = person.sample(fraction=0.1, stratify_by=t.age)
|
|
1068
|
+
|
|
1069
|
+
Equal allocation sampling: Sample 2 rows from each age present in the above Table:
|
|
1070
|
+
|
|
1071
|
+
>>> df = person.sample(n_per_stratum=2, stratify_by=t.age)
|
|
1072
|
+
|
|
1073
|
+
Sampling is compatible with the where clause, so we can also sample from a filtered DataFrame:
|
|
1074
|
+
|
|
1075
|
+
>>> df = person.where(t.age > 30).sample(n=100)
|
|
1076
|
+
"""
|
|
1077
|
+
# Check context of usage
|
|
1078
|
+
if self.sample_clause is not None:
|
|
1079
|
+
raise excs.Error('sample() cannot be used with sample()')
|
|
1080
|
+
if self.group_by_clause is not None:
|
|
1081
|
+
raise excs.Error('sample() cannot be used with group_by()')
|
|
1082
|
+
if self.order_by_clause is not None:
|
|
1083
|
+
raise excs.Error('sample() cannot be used with order_by()')
|
|
1084
|
+
if self.limit_val is not None:
|
|
1085
|
+
raise excs.Error('sample() cannot be used with limit()')
|
|
1086
|
+
if self._has_joins():
|
|
1087
|
+
raise excs.Error('sample() cannot be used with join()')
|
|
1088
|
+
|
|
1089
|
+
# Check parameter combinations
|
|
1090
|
+
if (n is not None) + (n_per_stratum is not None) + (fraction is not None) != 1:
|
|
1091
|
+
raise excs.Error('Exactly one of `n`, `n_per_stratum`, or `fraction` must be specified.')
|
|
1092
|
+
if n_per_stratum is not None and stratify_by is None:
|
|
1093
|
+
raise excs.Error('Must specify `stratify_by` to use `n_per_stratum`')
|
|
1094
|
+
|
|
1095
|
+
# Check parameter types and values
|
|
1096
|
+
n = self.validate_constant_type_range(n, ts.IntType(nullable=False), False, 'n', (1, None))
|
|
1097
|
+
n_per_stratum = self.validate_constant_type_range(
|
|
1098
|
+
n_per_stratum, ts.IntType(nullable=False), False, 'n_per_stratum', (1, None)
|
|
1099
|
+
)
|
|
1100
|
+
fraction = self.validate_constant_type_range(
|
|
1101
|
+
fraction, ts.FloatType(nullable=False), False, 'fraction', (0.0, 1.0)
|
|
1102
|
+
)
|
|
1103
|
+
seed = self.validate_constant_type_range(seed, ts.IntType(nullable=False), False, 'seed')
|
|
1104
|
+
|
|
1105
|
+
# analyze stratify list
|
|
1106
|
+
stratify_exprs: list[exprs.Expr] = []
|
|
1107
|
+
if stratify_by is not None:
|
|
1108
|
+
if isinstance(stratify_by, exprs.Expr):
|
|
1109
|
+
stratify_by = [stratify_by]
|
|
1110
|
+
if not isinstance(stratify_by, (list, tuple)):
|
|
1111
|
+
raise excs.Error('`stratify_by` must be a list of scalar expressions')
|
|
1112
|
+
for expr in stratify_by:
|
|
1113
|
+
if expr is None or not isinstance(expr, exprs.Expr):
|
|
1114
|
+
raise excs.Error(f'Invalid expression: {expr}')
|
|
1115
|
+
if not expr.col_type.is_scalar_type():
|
|
1116
|
+
raise excs.Error(f'Invalid type: expression must be a scalar type (not {expr.col_type})')
|
|
1117
|
+
if not expr.is_bound_by(self._from_clause.tbls):
|
|
1118
|
+
raise excs.Error(
|
|
1119
|
+
f"Expression '{expr}' cannot be evaluated in the context of this query's tables "
|
|
1120
|
+
f'({",".join(tbl.tbl_name() for tbl in self._from_clause.tbls)})'
|
|
1121
|
+
)
|
|
1122
|
+
stratify_exprs.append(expr)
|
|
1123
|
+
|
|
1124
|
+
sample_clause = SampleClause(None, n, n_per_stratum, fraction, seed, stratify_exprs)
|
|
1125
|
+
|
|
1126
|
+
return DataFrame(
|
|
1127
|
+
from_clause=self._from_clause,
|
|
1128
|
+
select_list=self.select_list,
|
|
1129
|
+
where_clause=self.where_clause,
|
|
1130
|
+
group_by_clause=self.group_by_clause,
|
|
1131
|
+
grouping_tbl=self.grouping_tbl,
|
|
1132
|
+
order_by_clause=self.order_by_clause,
|
|
1133
|
+
limit=self.limit_val,
|
|
1134
|
+
sample_clause=sample_clause,
|
|
960
1135
|
)
|
|
961
1136
|
|
|
962
1137
|
def update(self, value_spec: dict[str, Any], cascade: bool = True) -> UpdateStatus:
|
|
@@ -988,7 +1163,7 @@ class DataFrame:
|
|
|
988
1163
|
>>> df = person.where(t.year == 2014).update({'age': 30})
|
|
989
1164
|
"""
|
|
990
1165
|
self._validate_mutable('update', False)
|
|
991
|
-
with
|
|
1166
|
+
with Catalog.get().begin_xact(tbl=self._first_tbl, for_write=True, lock_mutable_tree=True):
|
|
992
1167
|
return self._first_tbl.tbl_version.get().update(value_spec, where=self.where_clause, cascade=cascade)
|
|
993
1168
|
|
|
994
1169
|
def delete(self) -> UpdateStatus:
|
|
@@ -1011,7 +1186,7 @@ class DataFrame:
|
|
|
1011
1186
|
self._validate_mutable('delete', False)
|
|
1012
1187
|
if not self._first_tbl.is_insertable():
|
|
1013
1188
|
raise excs.Error('Cannot delete from view')
|
|
1014
|
-
with
|
|
1189
|
+
with Catalog.get().begin_xact(tbl=self._first_tbl, for_write=True, lock_mutable_tree=True):
|
|
1015
1190
|
return self._first_tbl.tbl_version.get().delete(where=self.where_clause)
|
|
1016
1191
|
|
|
1017
1192
|
def _validate_mutable(self, op_name: str, allow_select: bool) -> None:
|
|
@@ -1053,13 +1228,14 @@ class DataFrame:
|
|
|
1053
1228
|
if self.order_by_clause is not None
|
|
1054
1229
|
else None,
|
|
1055
1230
|
'limit_val': self.limit_val.as_dict() if self.limit_val is not None else None,
|
|
1231
|
+
'sample_clause': self.sample_clause.as_dict() if self.sample_clause is not None else None,
|
|
1056
1232
|
}
|
|
1057
1233
|
return d
|
|
1058
1234
|
|
|
1059
1235
|
@classmethod
|
|
1060
1236
|
def from_dict(cls, d: dict[str, Any]) -> 'DataFrame':
|
|
1061
1237
|
# we need to wrap the construction with a transaction, because it might need to load metadata
|
|
1062
|
-
with
|
|
1238
|
+
with Catalog.get().begin_xact(for_write=False):
|
|
1063
1239
|
tbls = [catalog.TableVersionPath.from_dict(tbl_dict) for tbl_dict in d['from_clause']['tbls']]
|
|
1064
1240
|
join_clauses = [plan.JoinClause(**clause_dict) for clause_dict in d['from_clause']['join_clauses']]
|
|
1065
1241
|
from_clause = plan.FromClause(tbls=tbls, join_clauses=join_clauses)
|
|
@@ -1079,6 +1255,7 @@ class DataFrame:
|
|
|
1079
1255
|
else None
|
|
1080
1256
|
)
|
|
1081
1257
|
limit_val = exprs.Expr.from_dict(d['limit_val']) if d['limit_val'] is not None else None
|
|
1258
|
+
sample_clause = SampleClause.from_dict(d['sample_clause']) if d['sample_clause'] is not None else None
|
|
1082
1259
|
|
|
1083
1260
|
return DataFrame(
|
|
1084
1261
|
from_clause=from_clause,
|
|
@@ -1088,6 +1265,7 @@ class DataFrame:
|
|
|
1088
1265
|
grouping_tbl=grouping_tbl,
|
|
1089
1266
|
order_by_clause=order_by_clause,
|
|
1090
1267
|
limit=limit_val,
|
|
1268
|
+
sample_clause=sample_clause,
|
|
1091
1269
|
)
|
|
1092
1270
|
|
|
1093
1271
|
def _hash_result_set(self) -> str:
|
|
@@ -1129,7 +1307,8 @@ class DataFrame:
|
|
|
1129
1307
|
assert data_file_path.is_file()
|
|
1130
1308
|
return data_file_path
|
|
1131
1309
|
else:
|
|
1132
|
-
|
|
1310
|
+
# TODO: extend begin_xact() to accept multiple TVPs for joins
|
|
1311
|
+
with Catalog.get().begin_xact(tbl=self._first_tbl, for_write=False):
|
|
1133
1312
|
return write_coco_dataset(self, dest_path)
|
|
1134
1313
|
|
|
1135
1314
|
def to_pytorch_dataset(self, image_format: str = 'pt') -> 'torch.utils.data.IterableDataset':
|
|
@@ -1174,7 +1353,7 @@ class DataFrame:
|
|
|
1174
1353
|
if dest_path.exists(): # fast path: use cache
|
|
1175
1354
|
assert dest_path.is_dir()
|
|
1176
1355
|
else:
|
|
1177
|
-
with
|
|
1356
|
+
with Catalog.get().begin_xact(tbl=self._first_tbl, for_write=False):
|
|
1178
1357
|
export_parquet(self, dest_path, inline_images=True)
|
|
1179
1358
|
|
|
1180
1359
|
return PixeltablePytorchDataset(path=dest_path, image_format=image_format)
|
pixeltable/env.py
CHANGED
|
@@ -25,6 +25,7 @@ from zoneinfo import ZoneInfo, ZoneInfoNotFoundError
|
|
|
25
25
|
|
|
26
26
|
import pixeltable_pgserver
|
|
27
27
|
import sqlalchemy as sql
|
|
28
|
+
from pillow_heif import register_heif_opener # type: ignore[import-untyped]
|
|
28
29
|
from tqdm import TqdmWarning
|
|
29
30
|
|
|
30
31
|
from pixeltable import exceptions as excs
|
|
@@ -191,6 +192,7 @@ class Env:
|
|
|
191
192
|
assert self._dbms is not None
|
|
192
193
|
return self._dbms
|
|
193
194
|
|
|
195
|
+
@property
|
|
194
196
|
def in_xact(self) -> bool:
|
|
195
197
|
return self._current_conn is not None
|
|
196
198
|
|
|
@@ -201,20 +203,17 @@ class Env:
|
|
|
201
203
|
|
|
202
204
|
@contextmanager
|
|
203
205
|
def begin_xact(self) -> Iterator[sql.Connection]:
|
|
204
|
-
"""
|
|
206
|
+
"""Call Catalog.begin_xact() instead, unless there is a specific reason to call this directly."""
|
|
205
207
|
if self._current_conn is None:
|
|
206
208
|
assert self._current_session is None
|
|
207
209
|
try:
|
|
208
210
|
with self.engine.begin() as conn, sql.orm.Session(conn) as session:
|
|
209
|
-
# TODO: remove print() once we're done with debugging the concurrent update behavior
|
|
210
|
-
# print(f'{datetime.datetime.now()}: start xact')
|
|
211
211
|
self._current_conn = conn
|
|
212
212
|
self._current_session = session
|
|
213
213
|
yield conn
|
|
214
214
|
finally:
|
|
215
215
|
self._current_session = None
|
|
216
216
|
self._current_conn = None
|
|
217
|
-
# print(f'{datetime.datetime.now()}: end xact')
|
|
218
217
|
else:
|
|
219
218
|
assert self._current_session is not None
|
|
220
219
|
yield self._current_conn
|
|
@@ -600,6 +599,7 @@ class Env:
|
|
|
600
599
|
|
|
601
600
|
def _set_up_runtime(self) -> None:
|
|
602
601
|
"""Check for and start runtime services"""
|
|
602
|
+
register_heif_opener()
|
|
603
603
|
self._start_web_server()
|
|
604
604
|
self.__register_packages()
|
|
605
605
|
|
|
@@ -611,9 +611,11 @@ class Env:
|
|
|
611
611
|
self.__register_package('fiftyone')
|
|
612
612
|
self.__register_package('fireworks', library_name='fireworks-ai')
|
|
613
613
|
self.__register_package('google.genai', library_name='google-genai')
|
|
614
|
+
self.__register_package('groq')
|
|
614
615
|
self.__register_package('huggingface_hub', library_name='huggingface-hub')
|
|
615
616
|
self.__register_package('label_studio_sdk', library_name='label-studio-sdk')
|
|
616
617
|
self.__register_package('llama_cpp', library_name='llama-cpp-python')
|
|
618
|
+
self.__register_package('mcp')
|
|
617
619
|
self.__register_package('mistralai')
|
|
618
620
|
self.__register_package('mistune')
|
|
619
621
|
self.__register_package('ollama')
|
pixeltable/exceptions.py
CHANGED
|
@@ -10,6 +10,12 @@ class Error(Exception):
|
|
|
10
10
|
|
|
11
11
|
|
|
12
12
|
class ExprEvalError(Exception):
|
|
13
|
+
"""
|
|
14
|
+
Used during query execution to signal expr evaluation failures.
|
|
15
|
+
|
|
16
|
+
NOT A USER-FACING EXCEPTION. All ExprEvalError instances need to be converted into Error instances.
|
|
17
|
+
"""
|
|
18
|
+
|
|
13
19
|
expr: 'exprs.Expr'
|
|
14
20
|
expr_msg: str
|
|
15
21
|
exc: Exception
|
pixeltable/exec/__init__.py
CHANGED
|
@@ -9,4 +9,4 @@ from .exec_node import ExecNode
|
|
|
9
9
|
from .expr_eval import ExprEvalNode
|
|
10
10
|
from .in_memory_data_node import InMemoryDataNode
|
|
11
11
|
from .row_update_node import RowUpdateNode
|
|
12
|
-
from .sql_node import SqlAggregationNode, SqlJoinNode, SqlLookupNode, SqlNode, SqlScanNode
|
|
12
|
+
from .sql_node import SqlAggregationNode, SqlJoinNode, SqlLookupNode, SqlNode, SqlSampleNode, SqlScanNode
|
pixeltable/exec/exec_node.py
CHANGED
|
@@ -73,6 +73,8 @@ class ExecNode(abc.ABC):
|
|
|
73
73
|
except RuntimeError:
|
|
74
74
|
loop = asyncio.new_event_loop()
|
|
75
75
|
asyncio.set_event_loop(loop)
|
|
76
|
+
# we set a deliberately long duration to avoid warnings getting printed to the console in debug mode
|
|
77
|
+
loop.slow_callback_duration = 3600
|
|
76
78
|
|
|
77
79
|
if _logger.isEnabledFor(logging.DEBUG):
|
|
78
80
|
loop.set_debug(True)
|
|
@@ -317,7 +317,10 @@ class JsonMapperDispatcher(Evaluator):
|
|
|
317
317
|
for _ in src
|
|
318
318
|
]
|
|
319
319
|
for nested_row, anchor_val in zip(nested_rows, src):
|
|
320
|
-
|
|
320
|
+
# It's possible that self.scope_anchor.slot_idx is None; this corresponds to the case where the
|
|
321
|
+
# mapper expression doesn't actually contain references to RELATIVE_PATH_ROOT.
|
|
322
|
+
if self.scope_anchor.slot_idx is not None:
|
|
323
|
+
nested_row[self.scope_anchor.slot_idx] = anchor_val
|
|
321
324
|
for slot_idx_, nested_slot_idx in self.external_slot_map.items():
|
|
322
325
|
nested_row[nested_slot_idx] = row[slot_idx_]
|
|
323
326
|
self.nested_exec_ctx.init_rows(nested_rows)
|
|
@@ -49,7 +49,7 @@ class ExprEvalNode(ExecNode):
|
|
|
49
49
|
# execution state
|
|
50
50
|
tasks: set[asyncio.Task] # collects all running tasks to prevent them from getting gc'd
|
|
51
51
|
exc_event: asyncio.Event # set if an exception needs to be propagated
|
|
52
|
-
error: Optional[Union[
|
|
52
|
+
error: Optional[Union[Exception]] # exception that needs to be propagated
|
|
53
53
|
completed_rows: asyncio.Queue[exprs.DataRow] # rows that have completed evaluation
|
|
54
54
|
completed_event: asyncio.Event # set when completed_rows is non-empty
|
|
55
55
|
input_iter: AsyncIterator[DataRowBatch]
|
|
@@ -133,10 +133,10 @@ class ExprEvalNode(ExecNode):
|
|
|
133
133
|
except StopAsyncIteration:
|
|
134
134
|
self.input_complete = True
|
|
135
135
|
_logger.debug(f'finished input: #input_rows={self.num_input_rows}, #avail={self.avail_input_rows}')
|
|
136
|
-
|
|
137
|
-
|
|
136
|
+
# make sure to pass DBAPIError through, so the transaction handling logic sees it
|
|
137
|
+
except Exception as exc:
|
|
138
|
+
self.error = exc
|
|
138
139
|
self.exc_event.set()
|
|
139
|
-
# TODO: should we also handle Exception here and create an excs.Error from it?
|
|
140
140
|
|
|
141
141
|
@property
|
|
142
142
|
def total_buffered(self) -> int:
|
|
@@ -38,7 +38,7 @@ class InMemoryDataNode(ExecNode):
|
|
|
38
38
|
# we materialize the input slots
|
|
39
39
|
output_exprs = list(row_builder.input_exprs)
|
|
40
40
|
super().__init__(row_builder, output_exprs, [], None)
|
|
41
|
-
assert tbl.get().is_insertable
|
|
41
|
+
assert tbl.get().is_insertable
|
|
42
42
|
self.tbl = tbl
|
|
43
43
|
self.input_rows = rows
|
|
44
44
|
self.start_row_id = start_row_id
|