pixeltable 0.3.15__py3-none-any.whl → 0.4.0rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of pixeltable might be problematic. Click here for more details.
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/catalog.py +296 -105
- pixeltable/catalog/column.py +10 -8
- pixeltable/catalog/dir.py +1 -2
- pixeltable/catalog/insertable_table.py +25 -20
- pixeltable/catalog/schema_object.py +3 -6
- pixeltable/catalog/table.py +261 -189
- pixeltable/catalog/table_version.py +333 -202
- pixeltable/catalog/table_version_handle.py +15 -2
- pixeltable/catalog/table_version_path.py +60 -14
- pixeltable/catalog/view.py +38 -6
- pixeltable/dataframe.py +196 -18
- pixeltable/env.py +4 -4
- pixeltable/exec/__init__.py +1 -1
- pixeltable/exec/expr_eval/evaluators.py +4 -1
- pixeltable/exec/in_memory_data_node.py +1 -1
- pixeltable/exec/sql_node.py +171 -22
- pixeltable/exprs/column_property_ref.py +15 -6
- pixeltable/exprs/column_ref.py +32 -11
- pixeltable/exprs/comparison.py +1 -1
- pixeltable/exprs/data_row.py +5 -3
- pixeltable/exprs/expr.py +7 -0
- pixeltable/exprs/literal.py +2 -0
- pixeltable/exprs/row_builder.py +4 -6
- pixeltable/exprs/rowid_ref.py +8 -0
- pixeltable/exprs/similarity_expr.py +1 -0
- pixeltable/func/query_template_function.py +1 -1
- pixeltable/func/tools.py +1 -1
- pixeltable/functions/gemini.py +0 -1
- pixeltable/functions/string.py +212 -58
- pixeltable/globals.py +12 -4
- pixeltable/index/base.py +5 -0
- pixeltable/index/btree.py +5 -0
- pixeltable/index/embedding_index.py +5 -0
- pixeltable/io/external_store.py +8 -29
- pixeltable/io/label_studio.py +1 -1
- pixeltable/io/parquet.py +2 -2
- pixeltable/io/table_data_conduit.py +0 -31
- pixeltable/metadata/__init__.py +11 -2
- pixeltable/metadata/converters/convert_13.py +2 -2
- pixeltable/metadata/converters/convert_30.py +6 -11
- pixeltable/metadata/converters/convert_35.py +9 -0
- pixeltable/metadata/converters/convert_36.py +38 -0
- pixeltable/metadata/converters/util.py +3 -9
- pixeltable/metadata/notes.py +2 -0
- pixeltable/metadata/schema.py +8 -1
- pixeltable/plan.py +221 -14
- pixeltable/share/packager.py +137 -13
- pixeltable/share/publish.py +2 -2
- pixeltable/store.py +19 -13
- pixeltable/utils/dbms.py +1 -1
- pixeltable/utils/formatter.py +64 -42
- pixeltable/utils/sample.py +25 -0
- {pixeltable-0.3.15.dist-info → pixeltable-0.4.0rc2.dist-info}/METADATA +2 -1
- {pixeltable-0.3.15.dist-info → pixeltable-0.4.0rc2.dist-info}/RECORD +58 -55
- {pixeltable-0.3.15.dist-info → pixeltable-0.4.0rc2.dist-info}/LICENSE +0 -0
- {pixeltable-0.3.15.dist-info → pixeltable-0.4.0rc2.dist-info}/WHEEL +0 -0
- {pixeltable-0.3.15.dist-info → pixeltable-0.4.0rc2.dist-info}/entry_points.txt +0 -0
pixeltable/dataframe.py
CHANGED
|
@@ -14,9 +14,10 @@ import pandas as pd
|
|
|
14
14
|
import sqlalchemy as sql
|
|
15
15
|
|
|
16
16
|
from pixeltable import catalog, exceptions as excs, exec, exprs, plan, type_system as ts
|
|
17
|
-
from pixeltable.catalog import is_valid_identifier
|
|
17
|
+
from pixeltable.catalog import Catalog, is_valid_identifier
|
|
18
18
|
from pixeltable.catalog.globals import UpdateStatus
|
|
19
19
|
from pixeltable.env import Env
|
|
20
|
+
from pixeltable.plan import Planner, SampleClause
|
|
20
21
|
from pixeltable.type_system import ColumnType
|
|
21
22
|
from pixeltable.utils.description_helper import DescriptionHelper
|
|
22
23
|
from pixeltable.utils.formatter import Formatter
|
|
@@ -139,6 +140,7 @@ class DataFrame:
|
|
|
139
140
|
grouping_tbl: Optional[catalog.TableVersion]
|
|
140
141
|
order_by_clause: Optional[list[tuple[exprs.Expr, bool]]]
|
|
141
142
|
limit_val: Optional[exprs.Expr]
|
|
143
|
+
sample_clause: Optional[SampleClause]
|
|
142
144
|
|
|
143
145
|
def __init__(
|
|
144
146
|
self,
|
|
@@ -149,6 +151,7 @@ class DataFrame:
|
|
|
149
151
|
grouping_tbl: Optional[catalog.TableVersion] = None,
|
|
150
152
|
order_by_clause: Optional[list[tuple[exprs.Expr, bool]]] = None, # list[(expr, asc)]
|
|
151
153
|
limit: Optional[exprs.Expr] = None,
|
|
154
|
+
sample_clause: Optional[SampleClause] = None,
|
|
152
155
|
):
|
|
153
156
|
self._from_clause = from_clause
|
|
154
157
|
|
|
@@ -168,6 +171,7 @@ class DataFrame:
|
|
|
168
171
|
self.grouping_tbl = grouping_tbl
|
|
169
172
|
self.order_by_clause = copy.deepcopy(order_by_clause)
|
|
170
173
|
self.limit_val = limit
|
|
174
|
+
self.sample_clause = sample_clause
|
|
171
175
|
|
|
172
176
|
@classmethod
|
|
173
177
|
def _normalize_select_list(
|
|
@@ -210,8 +214,7 @@ class DataFrame:
|
|
|
210
214
|
|
|
211
215
|
@property
|
|
212
216
|
def _first_tbl(self) -> catalog.TableVersionPath:
|
|
213
|
-
|
|
214
|
-
return self._from_clause.tbls[0]
|
|
217
|
+
return self._from_clause._first_tbl
|
|
215
218
|
|
|
216
219
|
def _vars(self) -> dict[str, exprs.Variable]:
|
|
217
220
|
"""
|
|
@@ -236,6 +239,36 @@ class DataFrame:
|
|
|
236
239
|
raise excs.Error(f'Multiple definitions of parameter {var.name}')
|
|
237
240
|
return unique_vars
|
|
238
241
|
|
|
242
|
+
@classmethod
|
|
243
|
+
def _convert_param_to_typed_expr(
|
|
244
|
+
cls, v: Any, required_type: ts.ColumnType, required: bool, name: str, range: Optional[tuple[Any, Any]] = None
|
|
245
|
+
) -> Optional[exprs.Expr]:
|
|
246
|
+
if v is None:
|
|
247
|
+
if required:
|
|
248
|
+
raise excs.Error(f'{name!r} parameter must be present')
|
|
249
|
+
return v
|
|
250
|
+
v_expr = exprs.Expr.from_object(v)
|
|
251
|
+
if not v_expr.col_type.matches(required_type):
|
|
252
|
+
raise excs.Error(f'{name!r} parameter must be of type {required_type!r}, instead of {v_expr.col_type}')
|
|
253
|
+
if range is not None:
|
|
254
|
+
if not isinstance(v_expr, exprs.Literal):
|
|
255
|
+
raise excs.Error(f'{name!r} parameter must be a constant, not {v_expr}')
|
|
256
|
+
if range[0] is not None and not (v_expr.val >= range[0]):
|
|
257
|
+
raise excs.Error(f'{name!r} parameter must be >= {range[0]}')
|
|
258
|
+
if range[1] is not None and not (v_expr.val <= range[1]):
|
|
259
|
+
raise excs.Error(f'{name!r} parameter must be <= {range[1]}')
|
|
260
|
+
return v_expr
|
|
261
|
+
|
|
262
|
+
@classmethod
|
|
263
|
+
def validate_constant_type_range(
|
|
264
|
+
cls, v: Any, required_type: ts.ColumnType, required: bool, name: str, range: Optional[tuple[Any, Any]] = None
|
|
265
|
+
) -> Any:
|
|
266
|
+
"""Validate that the given named parameter is a constant of the required type and within the specified range."""
|
|
267
|
+
v_expr = cls._convert_param_to_typed_expr(v, required_type, required, name, range)
|
|
268
|
+
if v_expr is None:
|
|
269
|
+
return None
|
|
270
|
+
return v_expr.val
|
|
271
|
+
|
|
239
272
|
def parameters(self) -> dict[str, ColumnType]:
|
|
240
273
|
"""Return a dict mapping parameter name to parameter type.
|
|
241
274
|
|
|
@@ -280,7 +313,7 @@ class DataFrame:
|
|
|
280
313
|
num_rowid_cols = len(self.grouping_tbl.store_tbl.rowid_columns())
|
|
281
314
|
# the grouping table must be a base of self.tbl
|
|
282
315
|
assert num_rowid_cols <= len(self._first_tbl.tbl_version.get().store_tbl.rowid_columns())
|
|
283
|
-
group_by_clause =
|
|
316
|
+
group_by_clause = self.__rowid_columns(num_rowid_cols)
|
|
284
317
|
elif self.group_by_clause is not None:
|
|
285
318
|
group_by_clause = self.group_by_clause
|
|
286
319
|
|
|
@@ -292,14 +325,21 @@ class DataFrame:
|
|
|
292
325
|
self._select_list_exprs,
|
|
293
326
|
where_clause=self.where_clause,
|
|
294
327
|
group_by_clause=group_by_clause,
|
|
295
|
-
order_by_clause=self.order_by_clause
|
|
328
|
+
order_by_clause=self.order_by_clause,
|
|
296
329
|
limit=self.limit_val,
|
|
330
|
+
sample_clause=self.sample_clause,
|
|
297
331
|
)
|
|
298
332
|
|
|
333
|
+
def __rowid_columns(self, num_rowid_cols: Optional[int] = None) -> list[exprs.Expr]:
|
|
334
|
+
"""Return list of RowidRef for the given number of associated rowids"""
|
|
335
|
+
return Planner.rowid_columns(self._first_tbl.tbl_version, num_rowid_cols)
|
|
336
|
+
|
|
299
337
|
def _has_joins(self) -> bool:
|
|
300
338
|
return len(self._from_clause.join_clauses) > 0
|
|
301
339
|
|
|
302
340
|
def show(self, n: int = 20) -> DataFrameResultSet:
|
|
341
|
+
if self.sample_clause is not None:
|
|
342
|
+
raise excs.Error('show() cannot be used with sample()')
|
|
303
343
|
assert n is not None
|
|
304
344
|
return self.limit(n).collect()
|
|
305
345
|
|
|
@@ -322,6 +362,8 @@ class DataFrame:
|
|
|
322
362
|
raise excs.Error('head() cannot be used with order_by()')
|
|
323
363
|
if self._has_joins():
|
|
324
364
|
raise excs.Error('head() not supported for joins')
|
|
365
|
+
if self.sample_clause is not None:
|
|
366
|
+
raise excs.Error('head() cannot be used with sample()')
|
|
325
367
|
if self.group_by_clause is not None:
|
|
326
368
|
raise excs.Error('head() cannot be used with group_by()')
|
|
327
369
|
num_rowid_cols = len(self._first_tbl.tbl_version.get().store_tbl.rowid_columns())
|
|
@@ -347,6 +389,8 @@ class DataFrame:
|
|
|
347
389
|
raise excs.Error('tail() cannot be used with order_by()')
|
|
348
390
|
if self._has_joins():
|
|
349
391
|
raise excs.Error('tail() not supported for joins')
|
|
392
|
+
if self.sample_clause is not None:
|
|
393
|
+
raise excs.Error('tail() cannot be used with sample()')
|
|
350
394
|
if self.group_by_clause is not None:
|
|
351
395
|
raise excs.Error('tail() cannot be used with group_by()')
|
|
352
396
|
num_rowid_cols = len(self._first_tbl.tbl_version.get().store_tbl.rowid_columns())
|
|
@@ -431,7 +475,7 @@ class DataFrame:
|
|
|
431
475
|
raise excs.Error(msg) from e
|
|
432
476
|
|
|
433
477
|
def _output_row_iterator(self) -> Iterator[list]:
|
|
434
|
-
with
|
|
478
|
+
with Catalog.get().begin_xact(for_write=False):
|
|
435
479
|
try:
|
|
436
480
|
for data_row in self._exec():
|
|
437
481
|
yield [data_row[e.slot_idx] for e in self._select_list_exprs]
|
|
@@ -463,8 +507,8 @@ class DataFrame:
|
|
|
463
507
|
|
|
464
508
|
from pixeltable.plan import Planner
|
|
465
509
|
|
|
466
|
-
|
|
467
|
-
|
|
510
|
+
with Catalog.get().begin_xact(for_write=False) as conn:
|
|
511
|
+
stmt = Planner.create_count_stmt(self._first_tbl, self.where_clause)
|
|
468
512
|
result: int = conn.execute(stmt).scalar_one()
|
|
469
513
|
assert isinstance(result, int)
|
|
470
514
|
return result
|
|
@@ -510,6 +554,9 @@ class DataFrame:
|
|
|
510
554
|
if self.limit_val is not None:
|
|
511
555
|
heading_vals.append('Limit')
|
|
512
556
|
info_vals.append(self.limit_val.display_str(inline=False))
|
|
557
|
+
if self.sample_clause is not None:
|
|
558
|
+
heading_vals.append('Sample')
|
|
559
|
+
info_vals.append(self.sample_clause.display_str(inline=False))
|
|
513
560
|
assert len(heading_vals) == len(info_vals)
|
|
514
561
|
return pd.DataFrame(info_vals, index=heading_vals)
|
|
515
562
|
|
|
@@ -644,6 +691,8 @@ class DataFrame:
|
|
|
644
691
|
"""
|
|
645
692
|
if self.where_clause is not None:
|
|
646
693
|
raise excs.Error('Where clause already specified')
|
|
694
|
+
if self.sample_clause is not None:
|
|
695
|
+
raise excs.Error('where cannot be used after sample()')
|
|
647
696
|
if not isinstance(pred, exprs.Expr):
|
|
648
697
|
raise excs.Error(f'Where() requires a Pixeltable expression, but instead got {type(pred)}')
|
|
649
698
|
if not pred.col_type.is_bool_type():
|
|
@@ -771,6 +820,8 @@ class DataFrame:
|
|
|
771
820
|
|
|
772
821
|
>>> df = t.join(d, on=(t.d1 == d.pk1) & (t.d2 == d.pk2), how='left')
|
|
773
822
|
"""
|
|
823
|
+
if self.sample_clause is not None:
|
|
824
|
+
raise excs.Error('join() cannot be used with sample()')
|
|
774
825
|
join_pred: Optional[exprs.Expr]
|
|
775
826
|
if how == 'cross':
|
|
776
827
|
if on is not None:
|
|
@@ -838,6 +889,9 @@ class DataFrame:
|
|
|
838
889
|
"""
|
|
839
890
|
if self.group_by_clause is not None:
|
|
840
891
|
raise excs.Error('Group-by already specified')
|
|
892
|
+
if self.sample_clause is not None:
|
|
893
|
+
raise excs.Error('group_by() cannot be used with sample()')
|
|
894
|
+
|
|
841
895
|
grouping_tbl: Optional[catalog.TableVersion] = None
|
|
842
896
|
group_by_clause: Optional[list[exprs.Expr]] = None
|
|
843
897
|
for item in grouping_items:
|
|
@@ -921,6 +975,8 @@ class DataFrame:
|
|
|
921
975
|
|
|
922
976
|
>>> df = book.order_by(t.price, asc=False).order_by(t.pages)
|
|
923
977
|
"""
|
|
978
|
+
if self.sample_clause is not None:
|
|
979
|
+
raise excs.Error('group_by() cannot be used with sample()')
|
|
924
980
|
for e in expr_list:
|
|
925
981
|
if not isinstance(e, exprs.Expr):
|
|
926
982
|
raise excs.Error(f'Invalid expression in order_by(): {e}')
|
|
@@ -945,10 +1001,10 @@ class DataFrame:
|
|
|
945
1001
|
Returns:
|
|
946
1002
|
A new DataFrame with the specified limited rows.
|
|
947
1003
|
"""
|
|
948
|
-
|
|
949
|
-
|
|
950
|
-
|
|
951
|
-
|
|
1004
|
+
if self.sample_clause is not None:
|
|
1005
|
+
raise excs.Error('limit() cannot be used with sample()')
|
|
1006
|
+
|
|
1007
|
+
limit_expr = self._convert_param_to_typed_expr(n, ts.IntType(nullable=False), True, 'limit()')
|
|
952
1008
|
return DataFrame(
|
|
953
1009
|
from_clause=self._from_clause,
|
|
954
1010
|
select_list=self.select_list,
|
|
@@ -956,7 +1012,124 @@ class DataFrame:
|
|
|
956
1012
|
group_by_clause=self.group_by_clause,
|
|
957
1013
|
grouping_tbl=self.grouping_tbl,
|
|
958
1014
|
order_by_clause=self.order_by_clause,
|
|
959
|
-
limit=
|
|
1015
|
+
limit=limit_expr,
|
|
1016
|
+
)
|
|
1017
|
+
|
|
1018
|
+
def sample(
|
|
1019
|
+
self,
|
|
1020
|
+
n: Optional[int] = None,
|
|
1021
|
+
n_per_stratum: Optional[int] = None,
|
|
1022
|
+
fraction: Optional[float] = None,
|
|
1023
|
+
seed: Optional[int] = None,
|
|
1024
|
+
stratify_by: Any = None,
|
|
1025
|
+
) -> DataFrame:
|
|
1026
|
+
"""
|
|
1027
|
+
Return a new DataFrame specifying a sample of rows from the DataFrame, considered in a shuffled order.
|
|
1028
|
+
|
|
1029
|
+
The size of the sample can be specified in three ways:
|
|
1030
|
+
|
|
1031
|
+
- `n`: the total number of rows to produce as a sample
|
|
1032
|
+
- `n_per_stratum`: the number of rows to produce per stratum as a sample
|
|
1033
|
+
- `fraction`: the fraction of available rows to produce as a sample
|
|
1034
|
+
|
|
1035
|
+
The sample can be stratified by one or more columns, which means that the sample will
|
|
1036
|
+
be selected from each stratum separately.
|
|
1037
|
+
|
|
1038
|
+
The data is shuffled before creating the sample.
|
|
1039
|
+
|
|
1040
|
+
Args:
|
|
1041
|
+
n: Total number of rows to produce as a sample.
|
|
1042
|
+
n_per_stratum: Number of rows to produce per stratum as a sample. This parameter is only valid if
|
|
1043
|
+
`stratify_by` is specified. Only one of `n` or `n_per_stratum` can be specified.
|
|
1044
|
+
fraction: Fraction of available rows to produce as a sample. This parameter is not usable with `n` or
|
|
1045
|
+
`n_per_stratum`. The fraction must be between 0.0 and 1.0.
|
|
1046
|
+
seed: Random seed for reproducible shuffling
|
|
1047
|
+
stratify_by: If specified, the sample will be stratified by these values.
|
|
1048
|
+
|
|
1049
|
+
Returns:
|
|
1050
|
+
A new DataFrame which specifies the sampled rows
|
|
1051
|
+
|
|
1052
|
+
Examples:
|
|
1053
|
+
Given the Table `person` containing the field 'age', we can create samples of the table in various ways:
|
|
1054
|
+
|
|
1055
|
+
Sample 100 rows from the above Table:
|
|
1056
|
+
|
|
1057
|
+
>>> df = person.sample(n=100)
|
|
1058
|
+
|
|
1059
|
+
Sample 10% of the rows from the above Table:
|
|
1060
|
+
|
|
1061
|
+
>>> df = person.sample(fraction=0.1)
|
|
1062
|
+
|
|
1063
|
+
Sample 10% of the rows from the above Table, stratified by the column 'age':
|
|
1064
|
+
|
|
1065
|
+
>>> df = person.sample(fraction=0.1, stratify_by=t.age)
|
|
1066
|
+
|
|
1067
|
+
Equal allocation sampling: Sample 2 rows from each age present in the above Table:
|
|
1068
|
+
|
|
1069
|
+
>>> df = person.sample(n_per_stratum=2, stratify_by=t.age)
|
|
1070
|
+
|
|
1071
|
+
Sampling is compatible with the where clause, so we can also sample from a filtered DataFrame:
|
|
1072
|
+
|
|
1073
|
+
>>> df = person.where(t.age > 30).sample(n=100)
|
|
1074
|
+
"""
|
|
1075
|
+
# Check context of usage
|
|
1076
|
+
if self.sample_clause is not None:
|
|
1077
|
+
raise excs.Error('sample() cannot be used with sample()')
|
|
1078
|
+
if self.group_by_clause is not None:
|
|
1079
|
+
raise excs.Error('sample() cannot be used with group_by()')
|
|
1080
|
+
if self.order_by_clause is not None:
|
|
1081
|
+
raise excs.Error('sample() cannot be used with order_by()')
|
|
1082
|
+
if self.limit_val is not None:
|
|
1083
|
+
raise excs.Error('sample() cannot be used with limit()')
|
|
1084
|
+
if self._has_joins():
|
|
1085
|
+
raise excs.Error('sample() cannot be used with join()')
|
|
1086
|
+
|
|
1087
|
+
# Check parameter combinations
|
|
1088
|
+
if (n is not None) + (n_per_stratum is not None) + (fraction is not None) != 1:
|
|
1089
|
+
raise excs.Error('Exactly one of `n`, `n_per_stratum`, or `fraction` must be specified.')
|
|
1090
|
+
if n_per_stratum is not None and stratify_by is None:
|
|
1091
|
+
raise excs.Error('Must specify `stratify_by` to use `n_per_stratum`')
|
|
1092
|
+
|
|
1093
|
+
# Check parameter types and values
|
|
1094
|
+
n = self.validate_constant_type_range(n, ts.IntType(nullable=False), False, 'n', (1, None))
|
|
1095
|
+
n_per_stratum = self.validate_constant_type_range(
|
|
1096
|
+
n_per_stratum, ts.IntType(nullable=False), False, 'n_per_stratum', (1, None)
|
|
1097
|
+
)
|
|
1098
|
+
fraction = self.validate_constant_type_range(
|
|
1099
|
+
fraction, ts.FloatType(nullable=False), False, 'fraction', (0.0, 1.0)
|
|
1100
|
+
)
|
|
1101
|
+
seed = self.validate_constant_type_range(seed, ts.IntType(nullable=False), False, 'seed')
|
|
1102
|
+
|
|
1103
|
+
# analyze stratify list
|
|
1104
|
+
stratify_exprs: list[exprs.Expr] = []
|
|
1105
|
+
if stratify_by is not None:
|
|
1106
|
+
if isinstance(stratify_by, exprs.Expr):
|
|
1107
|
+
stratify_by = [stratify_by]
|
|
1108
|
+
if not isinstance(stratify_by, (list, tuple)):
|
|
1109
|
+
raise excs.Error('`stratify_by` must be a list of scalar expressions')
|
|
1110
|
+
for expr in stratify_by:
|
|
1111
|
+
if expr is None or not isinstance(expr, exprs.Expr):
|
|
1112
|
+
raise excs.Error(f'Invalid expression: {expr}')
|
|
1113
|
+
if not expr.col_type.is_scalar_type():
|
|
1114
|
+
raise excs.Error(f'Invalid type: expression must be a scalar type (not {expr.col_type})')
|
|
1115
|
+
if not expr.is_bound_by(self._from_clause.tbls):
|
|
1116
|
+
raise excs.Error(
|
|
1117
|
+
f"Expression '{expr}' cannot be evaluated in the context of this query's tables "
|
|
1118
|
+
f'({",".join(tbl.tbl_name() for tbl in self._from_clause.tbls)})'
|
|
1119
|
+
)
|
|
1120
|
+
stratify_exprs.append(expr)
|
|
1121
|
+
|
|
1122
|
+
sample_clause = SampleClause(None, n, n_per_stratum, fraction, seed, stratify_exprs)
|
|
1123
|
+
|
|
1124
|
+
return DataFrame(
|
|
1125
|
+
from_clause=self._from_clause,
|
|
1126
|
+
select_list=self.select_list,
|
|
1127
|
+
where_clause=self.where_clause,
|
|
1128
|
+
group_by_clause=self.group_by_clause,
|
|
1129
|
+
grouping_tbl=self.grouping_tbl,
|
|
1130
|
+
order_by_clause=self.order_by_clause,
|
|
1131
|
+
limit=self.limit_val,
|
|
1132
|
+
sample_clause=sample_clause,
|
|
960
1133
|
)
|
|
961
1134
|
|
|
962
1135
|
def update(self, value_spec: dict[str, Any], cascade: bool = True) -> UpdateStatus:
|
|
@@ -988,7 +1161,8 @@ class DataFrame:
|
|
|
988
1161
|
>>> df = person.where(t.year == 2014).update({'age': 30})
|
|
989
1162
|
"""
|
|
990
1163
|
self._validate_mutable('update', False)
|
|
991
|
-
|
|
1164
|
+
tbl_id = self._first_tbl.tbl_id()
|
|
1165
|
+
with Catalog.get().begin_xact(tbl_id=tbl_id, for_write=True):
|
|
992
1166
|
return self._first_tbl.tbl_version.get().update(value_spec, where=self.where_clause, cascade=cascade)
|
|
993
1167
|
|
|
994
1168
|
def delete(self) -> UpdateStatus:
|
|
@@ -1011,7 +1185,8 @@ class DataFrame:
|
|
|
1011
1185
|
self._validate_mutable('delete', False)
|
|
1012
1186
|
if not self._first_tbl.is_insertable():
|
|
1013
1187
|
raise excs.Error('Cannot delete from view')
|
|
1014
|
-
|
|
1188
|
+
tbl_id = self._first_tbl.tbl_id()
|
|
1189
|
+
with Catalog.get().begin_xact(tbl_id=tbl_id, for_write=True):
|
|
1015
1190
|
return self._first_tbl.tbl_version.get().delete(where=self.where_clause)
|
|
1016
1191
|
|
|
1017
1192
|
def _validate_mutable(self, op_name: str, allow_select: bool) -> None:
|
|
@@ -1053,13 +1228,14 @@ class DataFrame:
|
|
|
1053
1228
|
if self.order_by_clause is not None
|
|
1054
1229
|
else None,
|
|
1055
1230
|
'limit_val': self.limit_val.as_dict() if self.limit_val is not None else None,
|
|
1231
|
+
'sample_clause': self.sample_clause.as_dict() if self.sample_clause is not None else None,
|
|
1056
1232
|
}
|
|
1057
1233
|
return d
|
|
1058
1234
|
|
|
1059
1235
|
@classmethod
|
|
1060
1236
|
def from_dict(cls, d: dict[str, Any]) -> 'DataFrame':
|
|
1061
1237
|
# we need to wrap the construction with a transaction, because it might need to load metadata
|
|
1062
|
-
with
|
|
1238
|
+
with Catalog.get().begin_xact(for_write=False):
|
|
1063
1239
|
tbls = [catalog.TableVersionPath.from_dict(tbl_dict) for tbl_dict in d['from_clause']['tbls']]
|
|
1064
1240
|
join_clauses = [plan.JoinClause(**clause_dict) for clause_dict in d['from_clause']['join_clauses']]
|
|
1065
1241
|
from_clause = plan.FromClause(tbls=tbls, join_clauses=join_clauses)
|
|
@@ -1079,6 +1255,7 @@ class DataFrame:
|
|
|
1079
1255
|
else None
|
|
1080
1256
|
)
|
|
1081
1257
|
limit_val = exprs.Expr.from_dict(d['limit_val']) if d['limit_val'] is not None else None
|
|
1258
|
+
sample_clause = SampleClause.from_dict(d['sample_clause']) if d['sample_clause'] is not None else None
|
|
1082
1259
|
|
|
1083
1260
|
return DataFrame(
|
|
1084
1261
|
from_clause=from_clause,
|
|
@@ -1088,6 +1265,7 @@ class DataFrame:
|
|
|
1088
1265
|
grouping_tbl=grouping_tbl,
|
|
1089
1266
|
order_by_clause=order_by_clause,
|
|
1090
1267
|
limit=limit_val,
|
|
1268
|
+
sample_clause=sample_clause,
|
|
1091
1269
|
)
|
|
1092
1270
|
|
|
1093
1271
|
def _hash_result_set(self) -> str:
|
|
@@ -1129,7 +1307,7 @@ class DataFrame:
|
|
|
1129
1307
|
assert data_file_path.is_file()
|
|
1130
1308
|
return data_file_path
|
|
1131
1309
|
else:
|
|
1132
|
-
with
|
|
1310
|
+
with Catalog.get().begin_xact(for_write=False):
|
|
1133
1311
|
return write_coco_dataset(self, dest_path)
|
|
1134
1312
|
|
|
1135
1313
|
def to_pytorch_dataset(self, image_format: str = 'pt') -> 'torch.utils.data.IterableDataset':
|
|
@@ -1174,7 +1352,7 @@ class DataFrame:
|
|
|
1174
1352
|
if dest_path.exists(): # fast path: use cache
|
|
1175
1353
|
assert dest_path.is_dir()
|
|
1176
1354
|
else:
|
|
1177
|
-
with
|
|
1355
|
+
with Catalog.get().begin_xact(for_write=False):
|
|
1178
1356
|
export_parquet(self, dest_path, inline_images=True)
|
|
1179
1357
|
|
|
1180
1358
|
return PixeltablePytorchDataset(path=dest_path, image_format=image_format)
|
pixeltable/env.py
CHANGED
|
@@ -25,6 +25,7 @@ from zoneinfo import ZoneInfo, ZoneInfoNotFoundError
|
|
|
25
25
|
|
|
26
26
|
import pixeltable_pgserver
|
|
27
27
|
import sqlalchemy as sql
|
|
28
|
+
from pillow_heif import register_heif_opener # type: ignore[import-untyped]
|
|
28
29
|
from tqdm import TqdmWarning
|
|
29
30
|
|
|
30
31
|
from pixeltable import exceptions as excs
|
|
@@ -191,6 +192,7 @@ class Env:
|
|
|
191
192
|
assert self._dbms is not None
|
|
192
193
|
return self._dbms
|
|
193
194
|
|
|
195
|
+
@property
|
|
194
196
|
def in_xact(self) -> bool:
|
|
195
197
|
return self._current_conn is not None
|
|
196
198
|
|
|
@@ -201,20 +203,17 @@ class Env:
|
|
|
201
203
|
|
|
202
204
|
@contextmanager
|
|
203
205
|
def begin_xact(self) -> Iterator[sql.Connection]:
|
|
204
|
-
"""
|
|
206
|
+
"""Call Catalog.begin_xact() instead, unless there is a specific reason to call this directly."""
|
|
205
207
|
if self._current_conn is None:
|
|
206
208
|
assert self._current_session is None
|
|
207
209
|
try:
|
|
208
210
|
with self.engine.begin() as conn, sql.orm.Session(conn) as session:
|
|
209
|
-
# TODO: remove print() once we're done with debugging the concurrent update behavior
|
|
210
|
-
# print(f'{datetime.datetime.now()}: start xact')
|
|
211
211
|
self._current_conn = conn
|
|
212
212
|
self._current_session = session
|
|
213
213
|
yield conn
|
|
214
214
|
finally:
|
|
215
215
|
self._current_session = None
|
|
216
216
|
self._current_conn = None
|
|
217
|
-
# print(f'{datetime.datetime.now()}: end xact')
|
|
218
217
|
else:
|
|
219
218
|
assert self._current_session is not None
|
|
220
219
|
yield self._current_conn
|
|
@@ -600,6 +599,7 @@ class Env:
|
|
|
600
599
|
|
|
601
600
|
def _set_up_runtime(self) -> None:
|
|
602
601
|
"""Check for and start runtime services"""
|
|
602
|
+
register_heif_opener()
|
|
603
603
|
self._start_web_server()
|
|
604
604
|
self.__register_packages()
|
|
605
605
|
|
pixeltable/exec/__init__.py
CHANGED
|
@@ -9,4 +9,4 @@ from .exec_node import ExecNode
|
|
|
9
9
|
from .expr_eval import ExprEvalNode
|
|
10
10
|
from .in_memory_data_node import InMemoryDataNode
|
|
11
11
|
from .row_update_node import RowUpdateNode
|
|
12
|
-
from .sql_node import SqlAggregationNode, SqlJoinNode, SqlLookupNode, SqlNode, SqlScanNode
|
|
12
|
+
from .sql_node import SqlAggregationNode, SqlJoinNode, SqlLookupNode, SqlNode, SqlSampleNode, SqlScanNode
|
|
@@ -317,7 +317,10 @@ class JsonMapperDispatcher(Evaluator):
|
|
|
317
317
|
for _ in src
|
|
318
318
|
]
|
|
319
319
|
for nested_row, anchor_val in zip(nested_rows, src):
|
|
320
|
-
|
|
320
|
+
# It's possible that self.scope_anchor.slot_idx is None; this corresponds to the case where the
|
|
321
|
+
# mapper expression doesn't actually contain references to RELATIVE_PATH_ROOT.
|
|
322
|
+
if self.scope_anchor.slot_idx is not None:
|
|
323
|
+
nested_row[self.scope_anchor.slot_idx] = anchor_val
|
|
321
324
|
for slot_idx_, nested_slot_idx in self.external_slot_map.items():
|
|
322
325
|
nested_row[nested_slot_idx] = row[slot_idx_]
|
|
323
326
|
self.nested_exec_ctx.init_rows(nested_rows)
|
|
@@ -38,7 +38,7 @@ class InMemoryDataNode(ExecNode):
|
|
|
38
38
|
# we materialize the input slots
|
|
39
39
|
output_exprs = list(row_builder.input_exprs)
|
|
40
40
|
super().__init__(row_builder, output_exprs, [], None)
|
|
41
|
-
assert tbl.get().is_insertable
|
|
41
|
+
assert tbl.get().is_insertable
|
|
42
42
|
self.tbl = tbl
|
|
43
43
|
self.input_rows = rows
|
|
44
44
|
self.start_row_id = start_row_id
|