pixeltable 0.3.12__py3-none-any.whl → 0.3.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pixeltable/__init__.py +2 -27
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/catalog.py +9 -7
- pixeltable/catalog/column.py +6 -2
- pixeltable/catalog/dir.py +2 -1
- pixeltable/catalog/insertable_table.py +11 -0
- pixeltable/catalog/schema_object.py +2 -1
- pixeltable/catalog/table.py +27 -38
- pixeltable/catalog/table_version.py +19 -0
- pixeltable/catalog/table_version_path.py +7 -0
- pixeltable/catalog/view.py +31 -0
- pixeltable/dataframe.py +50 -7
- pixeltable/env.py +1 -1
- pixeltable/exceptions.py +20 -2
- pixeltable/exec/aggregation_node.py +14 -0
- pixeltable/exec/cache_prefetch_node.py +1 -1
- pixeltable/exec/expr_eval/evaluators.py +0 -4
- pixeltable/exec/expr_eval/expr_eval_node.py +1 -2
- pixeltable/exec/sql_node.py +3 -2
- pixeltable/exprs/column_ref.py +42 -17
- pixeltable/exprs/data_row.py +3 -0
- pixeltable/exprs/globals.py +1 -1
- pixeltable/exprs/literal.py +11 -1
- pixeltable/exprs/rowid_ref.py +4 -1
- pixeltable/exprs/similarity_expr.py +1 -1
- pixeltable/func/function.py +1 -1
- pixeltable/func/udf.py +1 -1
- pixeltable/functions/__init__.py +2 -0
- pixeltable/functions/anthropic.py +1 -1
- pixeltable/functions/bedrock.py +130 -0
- pixeltable/functions/date.py +185 -0
- pixeltable/functions/gemini.py +22 -20
- pixeltable/functions/globals.py +1 -16
- pixeltable/functions/huggingface.py +7 -6
- pixeltable/functions/image.py +15 -16
- pixeltable/functions/json.py +2 -1
- pixeltable/functions/math.py +40 -0
- pixeltable/functions/mistralai.py +3 -2
- pixeltable/functions/openai.py +9 -8
- pixeltable/functions/string.py +1 -2
- pixeltable/functions/together.py +4 -3
- pixeltable/functions/video.py +2 -2
- pixeltable/globals.py +26 -9
- pixeltable/io/datarows.py +4 -3
- pixeltable/io/hf_datasets.py +2 -2
- pixeltable/io/label_studio.py +17 -17
- pixeltable/io/pandas.py +29 -16
- pixeltable/io/parquet.py +2 -0
- pixeltable/io/table_data_conduit.py +8 -2
- pixeltable/metadata/__init__.py +1 -1
- pixeltable/metadata/converters/convert_19.py +2 -2
- pixeltable/metadata/converters/convert_34.py +21 -0
- pixeltable/metadata/notes.py +1 -0
- pixeltable/plan.py +12 -5
- pixeltable/share/__init__.py +1 -1
- pixeltable/share/packager.py +219 -119
- pixeltable/share/publish.py +61 -16
- pixeltable/store.py +45 -20
- pixeltable/type_system.py +46 -2
- pixeltable/utils/arrow.py +8 -2
- pixeltable/utils/pytorch.py +4 -0
- {pixeltable-0.3.12.dist-info → pixeltable-0.3.14.dist-info}/METADATA +2 -4
- {pixeltable-0.3.12.dist-info → pixeltable-0.3.14.dist-info}/RECORD +66 -63
- {pixeltable-0.3.12.dist-info → pixeltable-0.3.14.dist-info}/WHEEL +1 -1
- {pixeltable-0.3.12.dist-info → pixeltable-0.3.14.dist-info}/LICENSE +0 -0
- {pixeltable-0.3.12.dist-info → pixeltable-0.3.14.dist-info}/entry_points.txt +0 -0
pixeltable/io/pandas.py
CHANGED
|
@@ -8,6 +8,8 @@ from pandas.api.types import is_datetime64_any_dtype, is_extension_array_dtype
|
|
|
8
8
|
|
|
9
9
|
import pixeltable as pxt
|
|
10
10
|
import pixeltable.exceptions as excs
|
|
11
|
+
import pixeltable.type_system as ts
|
|
12
|
+
from pixeltable.env import Env
|
|
11
13
|
|
|
12
14
|
|
|
13
15
|
def import_pandas(
|
|
@@ -119,15 +121,15 @@ def _df_check_primary_key_values(df: pd.DataFrame, primary_key: list[str]) -> No
|
|
|
119
121
|
|
|
120
122
|
|
|
121
123
|
def df_infer_schema(
|
|
122
|
-
df: pd.DataFrame, schema_overrides: dict[str,
|
|
123
|
-
) -> dict[str,
|
|
124
|
+
df: pd.DataFrame, schema_overrides: dict[str, ts.ColumnType], primary_key: list[str]
|
|
125
|
+
) -> dict[str, ts.ColumnType]:
|
|
124
126
|
"""
|
|
125
127
|
Infers a Pixeltable schema from a Pandas DataFrame.
|
|
126
128
|
|
|
127
129
|
Returns:
|
|
128
130
|
A Pixeltable schema: a dict mapping each column name to its Pixeltable column type.
|
|
129
131
|
"""
|
|
130
|
-
pd_schema: dict[str,
|
|
132
|
+
pd_schema: dict[str, ts.ColumnType] = {}
|
|
131
133
|
for pd_name, pd_dtype in zip(df.columns, df.dtypes):
|
|
132
134
|
if pd_name in schema_overrides:
|
|
133
135
|
pxt_type = schema_overrides[pd_name]
|
|
@@ -138,7 +140,7 @@ def df_infer_schema(
|
|
|
138
140
|
return pd_schema
|
|
139
141
|
|
|
140
142
|
|
|
141
|
-
def __pd_dtype_to_pxt_type(pd_dtype: DtypeObj, nullable: bool) -> Optional[
|
|
143
|
+
def __pd_dtype_to_pxt_type(pd_dtype: DtypeObj, nullable: bool) -> Optional[ts.ColumnType]:
|
|
142
144
|
"""
|
|
143
145
|
Determines a pixeltable ColumnType from a pandas dtype
|
|
144
146
|
|
|
@@ -146,21 +148,21 @@ def __pd_dtype_to_pxt_type(pd_dtype: DtypeObj, nullable: bool) -> Optional[pxt.C
|
|
|
146
148
|
pd_dtype: A pandas dtype object
|
|
147
149
|
|
|
148
150
|
Returns:
|
|
149
|
-
|
|
151
|
+
ts.ColumnType: A pixeltable ColumnType
|
|
150
152
|
"""
|
|
151
153
|
# Pandas extension arrays / types (Int64, boolean, string[pyarrow], etc.) are not directly
|
|
152
154
|
# compatible with NumPy dtypes
|
|
153
155
|
# The timezone-aware datetime64[ns, tz=] dtype is a pandas extension dtype
|
|
154
156
|
if is_datetime64_any_dtype(pd_dtype):
|
|
155
|
-
return
|
|
157
|
+
return ts.TimestampType(nullable=nullable)
|
|
156
158
|
if is_extension_array_dtype(pd_dtype):
|
|
157
159
|
return None
|
|
158
160
|
# Most other pandas dtypes are directly NumPy compatible
|
|
159
161
|
assert isinstance(pd_dtype, np.dtype)
|
|
160
|
-
return
|
|
162
|
+
return ts.ArrayType.from_np_dtype(pd_dtype, nullable)
|
|
161
163
|
|
|
162
164
|
|
|
163
|
-
def __pd_coltype_to_pxt_type(pd_dtype: DtypeObj, data_col: pd.Series, nullable: bool) ->
|
|
165
|
+
def __pd_coltype_to_pxt_type(pd_dtype: DtypeObj, data_col: pd.Series, nullable: bool) -> ts.ColumnType:
|
|
164
166
|
"""
|
|
165
167
|
Infers a Pixeltable type based on a pandas dtype.
|
|
166
168
|
"""
|
|
@@ -176,12 +178,12 @@ def __pd_coltype_to_pxt_type(pd_dtype: DtypeObj, data_col: pd.Series, nullable:
|
|
|
176
178
|
|
|
177
179
|
if len(data_col) == 0:
|
|
178
180
|
# No non-null values; default to FloatType (the Pandas type of an all-NaN column)
|
|
179
|
-
return
|
|
181
|
+
return ts.FloatType(nullable=nullable)
|
|
180
182
|
|
|
181
|
-
inferred_type =
|
|
183
|
+
inferred_type = ts.ColumnType.infer_common_literal_type(data_col)
|
|
182
184
|
if inferred_type is None:
|
|
183
185
|
# Fallback on StringType if everything else fails
|
|
184
|
-
return
|
|
186
|
+
return ts.StringType(nullable=nullable)
|
|
185
187
|
else:
|
|
186
188
|
return inferred_type.copy(nullable=nullable)
|
|
187
189
|
|
|
@@ -189,7 +191,7 @@ def __pd_coltype_to_pxt_type(pd_dtype: DtypeObj, data_col: pd.Series, nullable:
|
|
|
189
191
|
|
|
190
192
|
|
|
191
193
|
def _df_row_to_pxt_row(
|
|
192
|
-
row: tuple[Any, ...], schema: dict[str,
|
|
194
|
+
row: tuple[Any, ...], schema: dict[str, ts.ColumnType], col_mapping: Optional[dict[str, str]]
|
|
193
195
|
) -> dict[str, Any]:
|
|
194
196
|
"""Convert a row to insertable format"""
|
|
195
197
|
pxt_row: dict[str, Any] = {}
|
|
@@ -208,14 +210,25 @@ def _df_row_to_pxt_row(
|
|
|
208
210
|
nval = bool(val)
|
|
209
211
|
elif pxt_type.is_string_type():
|
|
210
212
|
nval = str(val)
|
|
213
|
+
elif pxt_type.is_date_type():
|
|
214
|
+
if pd.isnull(val):
|
|
215
|
+
# pandas has the bespoke 'NaT' value for a missing timestamp
|
|
216
|
+
# This is not supported by postgres, and must be converted to None
|
|
217
|
+
nval = None
|
|
218
|
+
else:
|
|
219
|
+
nval = pd.Timestamp(val).date()
|
|
211
220
|
elif pxt_type.is_timestamp_type():
|
|
212
221
|
if pd.isnull(val):
|
|
213
|
-
# pandas has the bespoke 'NaT'
|
|
214
|
-
#
|
|
215
|
-
# table contents as a pandas DataFrame, it will correctly restore the 'NaT'!)
|
|
222
|
+
# pandas has the bespoke 'NaT' value for a missing timestamp
|
|
223
|
+
# This is not supported by postgres, and must be converted to None
|
|
216
224
|
nval = None
|
|
217
225
|
else:
|
|
218
|
-
|
|
226
|
+
tval = pd.Timestamp(val)
|
|
227
|
+
# pandas supports tz-aware and naive timestamps.
|
|
228
|
+
if tval.tz is None:
|
|
229
|
+
nval = pd.Timestamp(tval).tz_localize(tz=Env.get().default_time_zone)
|
|
230
|
+
else:
|
|
231
|
+
nval = tval.astimezone(Env.get().default_time_zone)
|
|
219
232
|
else:
|
|
220
233
|
nval = val
|
|
221
234
|
pxt_row[pxt_name] = nval
|
pixeltable/io/parquet.py
CHANGED
|
@@ -15,6 +15,7 @@ from pyarrow.parquet import ParquetDataset
|
|
|
15
15
|
|
|
16
16
|
import pixeltable as pxt
|
|
17
17
|
import pixeltable.exceptions as excs
|
|
18
|
+
import pixeltable.type_system as ts
|
|
18
19
|
from pixeltable.io.pandas import _df_check_primary_key_values, _df_row_to_pxt_row, df_infer_schema
|
|
19
20
|
from pixeltable.utils import parse_local_file_path
|
|
20
21
|
|
|
@@ -72,6 +73,11 @@ class TableDataConduit:
|
|
|
72
73
|
def check_source_format(self) -> None:
|
|
73
74
|
assert self.source_format is None or TableDataConduitFormat.is_valid(self.source_format)
|
|
74
75
|
|
|
76
|
+
def __post_init__(self) -> None:
|
|
77
|
+
"""If no extra_fields were provided, initialize to empty dict"""
|
|
78
|
+
if self.extra_fields is None:
|
|
79
|
+
self.extra_fields = {}
|
|
80
|
+
|
|
75
81
|
@classmethod
|
|
76
82
|
def is_rowdata_structure(cls, d: TableDataSource) -> bool:
|
|
77
83
|
if not isinstance(d, list) or len(d) == 0:
|
|
@@ -83,7 +89,7 @@ class TableDataConduit:
|
|
|
83
89
|
|
|
84
90
|
def normalize_pxt_schema_types(self) -> None:
|
|
85
91
|
for name, coltype in self.pxt_schema.items():
|
|
86
|
-
self.pxt_schema[name] =
|
|
92
|
+
self.pxt_schema[name] = ts.ColumnType.normalize_type(coltype)
|
|
87
93
|
|
|
88
94
|
def infer_schema(self) -> dict[str, Any]:
|
|
89
95
|
raise NotImplementedError
|
|
@@ -393,7 +399,7 @@ class HFTableDataConduit(TableDataConduit):
|
|
|
393
399
|
f'Column name `{self.column_name_for_split}` already exists in dataset schema;'
|
|
394
400
|
f'provide a different `column_name_for_split`'
|
|
395
401
|
)
|
|
396
|
-
self.src_schema[self.column_name_for_split] =
|
|
402
|
+
self.src_schema[self.column_name_for_split] = ts.StringType(nullable=True)
|
|
397
403
|
|
|
398
404
|
inferred_schema, inferred_pk, self.source_column_map = normalize_schema_names(
|
|
399
405
|
self.src_schema, self.src_pk, self.src_schema_overrides, True
|
pixeltable/metadata/__init__.py
CHANGED
|
@@ -16,7 +16,7 @@ _console_logger = ConsoleLogger(logging.getLogger('pixeltable'))
|
|
|
16
16
|
|
|
17
17
|
|
|
18
18
|
# current version of the metadata; this is incremented whenever the metadata schema changes
|
|
19
|
-
VERSION =
|
|
19
|
+
VERSION = 35
|
|
20
20
|
|
|
21
21
|
|
|
22
22
|
def create_system_info(engine: sql.engine.Engine) -> None:
|
|
@@ -3,7 +3,7 @@ from typing import Any, Optional
|
|
|
3
3
|
|
|
4
4
|
import sqlalchemy as sql
|
|
5
5
|
|
|
6
|
-
import pixeltable as
|
|
6
|
+
import pixeltable.type_system as ts
|
|
7
7
|
from pixeltable.metadata import register_converter, schema
|
|
8
8
|
from pixeltable.metadata.converters.util import convert_table_md
|
|
9
9
|
|
|
@@ -34,7 +34,7 @@ def __update_timestamp_literals(k: Any, v: Any) -> Optional[tuple[Any, Any]]:
|
|
|
34
34
|
# timestamp literal, which (in version 19) is stored in the DB as a naive datetime.
|
|
35
35
|
# We convert it to an aware datetime, stored in UTC.
|
|
36
36
|
assert v['_classname'] == 'Literal'
|
|
37
|
-
assert v['val_t'] ==
|
|
37
|
+
assert v['val_t'] == ts.ColumnType.Type.TIMESTAMP.name
|
|
38
38
|
assert isinstance(v['val'], str)
|
|
39
39
|
dt = datetime.datetime.fromisoformat(v['val'])
|
|
40
40
|
assert dt.tzinfo is None # In version 19 all timestamps are naive
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
from typing import Any, Optional
|
|
2
|
+
|
|
3
|
+
import sqlalchemy as sql
|
|
4
|
+
|
|
5
|
+
from pixeltable.metadata import register_converter
|
|
6
|
+
from pixeltable.metadata.converters.util import convert_table_md
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@register_converter(version=34)
|
|
10
|
+
def _(engine: sql.engine.Engine) -> None:
|
|
11
|
+
convert_table_md(engine, substitution_fn=__substitute_md)
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
def __substitute_md(k: Optional[str], v: Any) -> Optional[tuple[Optional[str], Any]]:
|
|
15
|
+
if isinstance(v, dict) and '_classname' in v and v['_classname'] == 'ColumnRef':
|
|
16
|
+
# Add reference_tbl to ColumnRef; for historical metadata it is always equal to tbl
|
|
17
|
+
assert 'reference_tbl' not in v
|
|
18
|
+
v['reference_tbl'] = None
|
|
19
|
+
return k, v
|
|
20
|
+
|
|
21
|
+
return None
|
pixeltable/metadata/notes.py
CHANGED
|
@@ -2,6 +2,7 @@
|
|
|
2
2
|
# rather than as a comment, so that the existence of a description can be enforced by
|
|
3
3
|
# the unit tests when new versions are added.
|
|
4
4
|
VERSION_NOTES = {
|
|
5
|
+
35: 'Track reference_tbl in ColumnRef',
|
|
5
6
|
34: 'Set default value for is_pk field in column metadata to False',
|
|
6
7
|
33: 'Add is_replica field to table metadata',
|
|
7
8
|
32: 'Add the lock_dummy BIGINT column to the dirs table',
|
pixeltable/plan.py
CHANGED
|
@@ -635,8 +635,8 @@ class Planner:
|
|
|
635
635
|
raise excs.Error(f'Join predicate {join_clause.join_predicate} not expressible in SQL')
|
|
636
636
|
|
|
637
637
|
@classmethod
|
|
638
|
-
def
|
|
639
|
-
"""Verify that the various ordering requirements don't conflict"""
|
|
638
|
+
def _create_combined_ordering(cls, analyzer: Analyzer, verify_agg: bool) -> Optional[OrderByClause]:
|
|
639
|
+
"""Verify that the various ordering requirements don't conflict and return a combined ordering"""
|
|
640
640
|
ob_clauses: list[OrderByClause] = [analyzer.order_by_clause.copy()]
|
|
641
641
|
|
|
642
642
|
if verify_agg:
|
|
@@ -652,8 +652,11 @@ class Planner:
|
|
|
652
652
|
OrderByItem(e, True) for e in fn_call.get_agg_order_by()
|
|
653
653
|
]
|
|
654
654
|
ob_clauses.append(ordering)
|
|
655
|
-
|
|
656
|
-
|
|
655
|
+
|
|
656
|
+
if len(ob_clauses) == 0:
|
|
657
|
+
return None
|
|
658
|
+
elif len(ob_clauses) == 1:
|
|
659
|
+
return ob_clauses[0]
|
|
657
660
|
|
|
658
661
|
combined_ordering = ob_clauses[0]
|
|
659
662
|
for ordering in ob_clauses[1:]:
|
|
@@ -664,6 +667,7 @@ class Planner:
|
|
|
664
667
|
f'{print_order_by_clause(combined_ordering)} vs {print_order_by_clause(ordering)}'
|
|
665
668
|
)
|
|
666
669
|
combined_ordering = combined
|
|
670
|
+
return combined_ordering
|
|
667
671
|
|
|
668
672
|
@classmethod
|
|
669
673
|
def _is_contained_in(cls, l1: Iterable[exprs.Expr], l2: Iterable[exprs.Expr]) -> bool:
|
|
@@ -761,7 +765,7 @@ class Planner:
|
|
|
761
765
|
analyzer.window_fn_calls
|
|
762
766
|
)
|
|
763
767
|
ctx = exec.ExecContext(row_builder)
|
|
764
|
-
cls.
|
|
768
|
+
combined_ordering = cls._create_combined_ordering(analyzer, verify_agg=is_python_agg)
|
|
765
769
|
cls._verify_join_clauses(analyzer)
|
|
766
770
|
|
|
767
771
|
# materialized with SQL table scans (ie, single-table SELECT statements):
|
|
@@ -859,6 +863,9 @@ class Planner:
|
|
|
859
863
|
row_builder, input=plan, select_list=analyzer.select_list, group_by_items=analyzer.group_by_clause
|
|
860
864
|
)
|
|
861
865
|
else:
|
|
866
|
+
input_sql_node = plan.get_node(exec.SqlNode)
|
|
867
|
+
assert combined_ordering is not None
|
|
868
|
+
input_sql_node.set_order_by(combined_ordering)
|
|
862
869
|
plan = exec.AggregationNode(
|
|
863
870
|
tbl.tbl_version,
|
|
864
871
|
row_builder,
|
pixeltable/share/__init__.py
CHANGED