PyPI - pixeltable - Versions diffs - 0.3.12__py3-none-any.whl → 0.3.14__py3-none-any.whl - Mend

pixeltable 0.3.12__py3-none-any.whl → 0.3.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (66)

pixeltable/__init__.py +2 -27
pixeltable/__version__.py +2 -2
pixeltable/catalog/catalog.py +9 -7
pixeltable/catalog/column.py +6 -2
pixeltable/catalog/dir.py +2 -1
pixeltable/catalog/insertable_table.py +11 -0
pixeltable/catalog/schema_object.py +2 -1
pixeltable/catalog/table.py +27 -38
pixeltable/catalog/table_version.py +19 -0
pixeltable/catalog/table_version_path.py +7 -0
pixeltable/catalog/view.py +31 -0
pixeltable/dataframe.py +50 -7
pixeltable/env.py +1 -1
pixeltable/exceptions.py +20 -2
pixeltable/exec/aggregation_node.py +14 -0
pixeltable/exec/cache_prefetch_node.py +1 -1
pixeltable/exec/expr_eval/evaluators.py +0 -4
pixeltable/exec/expr_eval/expr_eval_node.py +1 -2
pixeltable/exec/sql_node.py +3 -2
pixeltable/exprs/column_ref.py +42 -17
pixeltable/exprs/data_row.py +3 -0
pixeltable/exprs/globals.py +1 -1
pixeltable/exprs/literal.py +11 -1
pixeltable/exprs/rowid_ref.py +4 -1
pixeltable/exprs/similarity_expr.py +1 -1
pixeltable/func/function.py +1 -1
pixeltable/func/udf.py +1 -1
pixeltable/functions/__init__.py +2 -0
pixeltable/functions/anthropic.py +1 -1
pixeltable/functions/bedrock.py +130 -0
pixeltable/functions/date.py +185 -0
pixeltable/functions/gemini.py +22 -20
pixeltable/functions/globals.py +1 -16
pixeltable/functions/huggingface.py +7 -6
pixeltable/functions/image.py +15 -16
pixeltable/functions/json.py +2 -1
pixeltable/functions/math.py +40 -0
pixeltable/functions/mistralai.py +3 -2
pixeltable/functions/openai.py +9 -8
pixeltable/functions/string.py +1 -2
pixeltable/functions/together.py +4 -3
pixeltable/functions/video.py +2 -2
pixeltable/globals.py +26 -9
pixeltable/io/datarows.py +4 -3
pixeltable/io/hf_datasets.py +2 -2
pixeltable/io/label_studio.py +17 -17
pixeltable/io/pandas.py +29 -16
pixeltable/io/parquet.py +2 -0
pixeltable/io/table_data_conduit.py +8 -2
pixeltable/metadata/__init__.py +1 -1
pixeltable/metadata/converters/convert_19.py +2 -2
pixeltable/metadata/converters/convert_34.py +21 -0
pixeltable/metadata/notes.py +1 -0
pixeltable/plan.py +12 -5
pixeltable/share/__init__.py +1 -1
pixeltable/share/packager.py +219 -119
pixeltable/share/publish.py +61 -16
pixeltable/store.py +45 -20
pixeltable/type_system.py +46 -2
pixeltable/utils/arrow.py +8 -2
pixeltable/utils/pytorch.py +4 -0
{pixeltable-0.3.12.dist-info → pixeltable-0.3.14.dist-info}/METADATA +2 -4
{pixeltable-0.3.12.dist-info → pixeltable-0.3.14.dist-info}/RECORD +66 -63
{pixeltable-0.3.12.dist-info → pixeltable-0.3.14.dist-info}/WHEEL +1 -1
{pixeltable-0.3.12.dist-info → pixeltable-0.3.14.dist-info}/LICENSE +0 -0
{pixeltable-0.3.12.dist-info → pixeltable-0.3.14.dist-info}/entry_points.txt +0 -0

pixeltable/io/pandas.py CHANGED Viewed

@@ -8,6 +8,8 @@ from pandas.api.types import is_datetime64_any_dtype, is_extension_array_dtype
 import pixeltable as pxt
 import pixeltable.exceptions as excs
+import pixeltable.type_system as ts
+from pixeltable.env import Env
 def import_pandas(
@@ -119,15 +121,15 @@ def _df_check_primary_key_values(df: pd.DataFrame, primary_key: list[str]) -> No
 def df_infer_schema(
-    df: pd.DataFrame, schema_overrides: dict[str, pxt.ColumnType], primary_key: list[str]
-) -> dict[str, pxt.ColumnType]:
+    df: pd.DataFrame, schema_overrides: dict[str, ts.ColumnType], primary_key: list[str]
+) -> dict[str, ts.ColumnType]:
     """
     Infers a Pixeltable schema from a Pandas DataFrame.
     Returns:
         A tuple containing a Pixeltable schema and a list of primary key column names.
     """
-    pd_schema: dict[str, pxt.ColumnType] = {}
+    pd_schema: dict[str, ts.ColumnType] = {}
     for pd_name, pd_dtype in zip(df.columns, df.dtypes):
         if pd_name in schema_overrides:
             pxt_type = schema_overrides[pd_name]
@@ -138,7 +140,7 @@ def df_infer_schema(
     return pd_schema
-def __pd_dtype_to_pxt_type(pd_dtype: DtypeObj, nullable: bool) -> Optional[pxt.ColumnType]:
+def __pd_dtype_to_pxt_type(pd_dtype: DtypeObj, nullable: bool) -> Optional[ts.ColumnType]:
     """
     Determines a pixeltable ColumnType from a pandas dtype
@@ -146,21 +148,21 @@ def __pd_dtype_to_pxt_type(pd_dtype: DtypeObj, nullable: bool) -> Optional[pxt.C
         pd_dtype: A pandas dtype object
     Returns:
-        pxt.ColumnType: A pixeltable ColumnType
+        ts.ColumnType: A pixeltable ColumnType
     """
     # Pandas extension arrays / types (Int64, boolean, string[pyarrow], etc.) are not directly
     # compatible with NumPy dtypes
     # The timezone-aware datetime64[ns, tz=] dtype is a pandas extension dtype
     if is_datetime64_any_dtype(pd_dtype):
-        return pxt.TimestampType(nullable=nullable)
+        return ts.TimestampType(nullable=nullable)
     if is_extension_array_dtype(pd_dtype):
         return None
     # Most other pandas dtypes are directly NumPy compatible
     assert isinstance(pd_dtype, np.dtype)
-    return pxt.ArrayType.from_np_dtype(pd_dtype, nullable)
+    return ts.ArrayType.from_np_dtype(pd_dtype, nullable)
-def __pd_coltype_to_pxt_type(pd_dtype: DtypeObj, data_col: pd.Series, nullable: bool) -> pxt.ColumnType:
+def __pd_coltype_to_pxt_type(pd_dtype: DtypeObj, data_col: pd.Series, nullable: bool) -> ts.ColumnType:
     """
     Infers a Pixeltable type based on a pandas dtype.
     """
@@ -176,12 +178,12 @@ def __pd_coltype_to_pxt_type(pd_dtype: DtypeObj, data_col: pd.Series, nullable:
         if len(data_col) == 0:
             # No non-null values; default to FloatType (the Pandas type of an all-NaN column)
-            return pxt.FloatType(nullable=nullable)
+            return ts.FloatType(nullable=nullable)
-        inferred_type = pxt.ColumnType.infer_common_literal_type(data_col)
+        inferred_type = ts.ColumnType.infer_common_literal_type(data_col)
         if inferred_type is None:
             # Fallback on StringType if everything else fails
-            return pxt.StringType(nullable=nullable)
+            return ts.StringType(nullable=nullable)
         else:
             return inferred_type.copy(nullable=nullable)
@@ -189,7 +191,7 @@ def __pd_coltype_to_pxt_type(pd_dtype: DtypeObj, data_col: pd.Series, nullable:
 def _df_row_to_pxt_row(
-    row: tuple[Any, ...], schema: dict[str, pxt.ColumnType], col_mapping: Optional[dict[str, str]]
+    row: tuple[Any, ...], schema: dict[str, ts.ColumnType], col_mapping: Optional[dict[str, str]]
 ) -> dict[str, Any]:
     """Convert a row to insertable format"""
     pxt_row: dict[str, Any] = {}
@@ -208,14 +210,25 @@ def _df_row_to_pxt_row(
             nval = bool(val)
         elif pxt_type.is_string_type():
             nval = str(val)
+        elif pxt_type.is_date_type():
+            if pd.isnull(val):
+                # pandas has the bespoke 'NaT' valud for a missing timestamp
+                # This is not supported by postgres, and must be converted to None
+                nval = None
+            else:
+                nval = pd.Timestamp(val).date()
         elif pxt_type.is_timestamp_type():
             if pd.isnull(val):
-                # pandas has the bespoke 'NaT' type for a missing timestamp; postgres is very
-                # much not-ok with it. (But if we convert it to None and then load out the
-                # table contents as a pandas DataFrame, it will correctly restore the 'NaT'!)
+                # pandas has the bespoke 'NaT' value for a missing timestamp
+                # This is not supported by postgres, and must be converted to None
                 nval = None
             else:
-                nval = pd.Timestamp(val).to_pydatetime()
+                tval = pd.Timestamp(val)
+                # pandas supports tz-aware and naive timestamps.
+                if tval.tz is None:
+                    nval = pd.Timestamp(tval).tz_localize(tz=Env.get().default_time_zone)
+                else:
+                    nval = tval.astimezone(Env.get().default_time_zone)
         else:
             nval = val
         pxt_row[pxt_name] = nval

pixeltable/io/parquet.py CHANGED Viewed

@@ -127,6 +127,8 @@ def export_parquet(
                         length = 8
                     elif col_type.is_bool_type():
                         length = 1
+                    elif col_type.is_date_type():
+                        length = 4
                     elif col_type.is_timestamp_type():
                         val = val.astimezone(datetime.timezone.utc)
                         length = 8

pixeltable/io/table_data_conduit.py CHANGED Viewed

@@ -15,6 +15,7 @@ from pyarrow.parquet import ParquetDataset
 import pixeltable as pxt
 import pixeltable.exceptions as excs
+import pixeltable.type_system as ts
 from pixeltable.io.pandas import _df_check_primary_key_values, _df_row_to_pxt_row, df_infer_schema
 from pixeltable.utils import parse_local_file_path
@@ -72,6 +73,11 @@ class TableDataConduit:
     def check_source_format(self) -> None:
         assert self.source_format is None or TableDataConduitFormat.is_valid(self.source_format)
+    def __post_init__(self) -> None:
+        """If no extra_fields were provided, initialize to empty dict"""
+        if self.extra_fields is None:
+            self.extra_fields = {}
     @classmethod
     def is_rowdata_structure(cls, d: TableDataSource) -> bool:
         if not isinstance(d, list) or len(d) == 0:
@@ -83,7 +89,7 @@ class TableDataConduit:
     def normalize_pxt_schema_types(self) -> None:
         for name, coltype in self.pxt_schema.items():
-            self.pxt_schema[name] = pxt.ColumnType.normalize_type(coltype)
+            self.pxt_schema[name] = ts.ColumnType.normalize_type(coltype)
     def infer_schema(self) -> dict[str, Any]:
         raise NotImplementedError
@@ -393,7 +399,7 @@ class HFTableDataConduit(TableDataConduit):
                         f'Column name `{self.column_name_for_split}` already exists in dataset schema;'
                         f'provide a different `column_name_for_split`'
                     )
-                self.src_schema[self.column_name_for_split] = pxt.StringType(nullable=True)
+                self.src_schema[self.column_name_for_split] = ts.StringType(nullable=True)
             inferred_schema, inferred_pk, self.source_column_map = normalize_schema_names(
                 self.src_schema, self.src_pk, self.src_schema_overrides, True

pixeltable/metadata/__init__.py CHANGED Viewed

@@ -16,7 +16,7 @@ _console_logger = ConsoleLogger(logging.getLogger('pixeltable'))
 # current version of the metadata; this is incremented whenever the metadata schema changes
-VERSION = 34
+VERSION = 35
 def create_system_info(engine: sql.engine.Engine) -> None:

pixeltable/metadata/converters/convert_19.py CHANGED Viewed

@@ -3,7 +3,7 @@ from typing import Any, Optional
 import sqlalchemy as sql
-import pixeltable as pxt
+import pixeltable.type_system as ts
 from pixeltable.metadata import register_converter, schema
 from pixeltable.metadata.converters.util import convert_table_md
@@ -34,7 +34,7 @@ def __update_timestamp_literals(k: Any, v: Any) -> Optional[tuple[Any, Any]]:
         # timestamp literal, which (in version 19) is stored in the DB as a naive datetime.
         # We convert it to an aware datetime, stored in UTC.
         assert v['_classname'] == 'Literal'
-        assert v['val_t'] == pxt.ColumnType.Type.TIMESTAMP.name
+        assert v['val_t'] == ts.ColumnType.Type.TIMESTAMP.name
         assert isinstance(v['val'], str)
         dt = datetime.datetime.fromisoformat(v['val'])
         assert dt.tzinfo is None  # In version 19 all timestamps are naive

pixeltable/metadata/converters/convert_34.py ADDED Viewed

@@ -0,0 +1,21 @@
+from typing import Any, Optional
+import sqlalchemy as sql
+from pixeltable.metadata import register_converter
+from pixeltable.metadata.converters.util import convert_table_md
+@register_converter(version=34)
+def _(engine: sql.engine.Engine) -> None:
+    convert_table_md(engine, substitution_fn=__substitute_md)
+def __substitute_md(k: Optional[str], v: Any) -> Optional[tuple[Optional[str], Any]]:
+    if isinstance(v, dict) and '_classname' in v and v['_classname'] == 'ColumnRef':
+        # Add reference_tbl to ColumnRef; for historical metadata it is always equal to tbl
+        assert 'reference_tbl' not in v
+        v['reference_tbl'] = None
+        return k, v
+    return None

pixeltable/metadata/notes.py CHANGED Viewed

@@ -2,6 +2,7 @@
 # rather than as a comment, so that the existence of a description can be enforced by
 # the unit tests when new versions are added.
 VERSION_NOTES = {
+    35: 'Track reference_tbl in ColumnRef',
     34: 'Set default value for is_pk field in column metadata to False',
     33: 'Add is_replica field to table metadata',
     32: 'Add the lock_dummy BIGINT column to the dirs table',

pixeltable/plan.py CHANGED Viewed

@@ -635,8 +635,8 @@ class Planner:
                 raise excs.Error(f'Join predicate {join_clause.join_predicate} not expressible in SQL')
     @classmethod
-    def _verify_ordering(cls, analyzer: Analyzer, verify_agg: bool) -> None:
-        """Verify that the various ordering requirements don't conflict"""
+    def _create_combined_ordering(cls, analyzer: Analyzer, verify_agg: bool) -> Optional[OrderByClause]:
+        """Verify that the various ordering requirements don't conflict and return a combined ordering"""
         ob_clauses: list[OrderByClause] = [analyzer.order_by_clause.copy()]
         if verify_agg:
@@ -652,8 +652,11 @@ class Planner:
                     OrderByItem(e, True) for e in fn_call.get_agg_order_by()
                 ]
                 ob_clauses.append(ordering)
-        if len(ob_clauses) <= 1:
-            return
+        if len(ob_clauses) == 0:
+            return None
+        elif len(ob_clauses) == 1:
+            return ob_clauses[0]
         combined_ordering = ob_clauses[0]
         for ordering in ob_clauses[1:]:
@@ -664,6 +667,7 @@ class Planner:
                     f'{print_order_by_clause(combined_ordering)} vs {print_order_by_clause(ordering)}'
                 )
             combined_ordering = combined
+        return combined_ordering
     @classmethod
     def _is_contained_in(cls, l1: Iterable[exprs.Expr], l2: Iterable[exprs.Expr]) -> bool:
@@ -761,7 +765,7 @@ class Planner:
             analyzer.window_fn_calls
         )
         ctx = exec.ExecContext(row_builder)
-        cls._verify_ordering(analyzer, verify_agg=is_python_agg)
+        combined_ordering = cls._create_combined_ordering(analyzer, verify_agg=is_python_agg)
         cls._verify_join_clauses(analyzer)
         # materialized with SQL table scans (ie, single-table SELECT statements):
@@ -859,6 +863,9 @@ class Planner:
                     row_builder, input=plan, select_list=analyzer.select_list, group_by_items=analyzer.group_by_clause
                 )
             else:
+                input_sql_node = plan.get_node(exec.SqlNode)
+                assert combined_ordering is not None
+                input_sql_node.set_order_by(combined_ordering)
                 plan = exec.AggregationNode(
                     tbl.tbl_version,
                     row_builder,

pixeltable/share/__init__.py CHANGED Viewed

@@ -1,3 +1,3 @@
 # ruff: noqa: F401
-from .publish import publish_snapshot
+from .publish import pull_replica, push_replica

pixeltable 0.3.12__py3-none-any.whl → 0.3.14__py3-none-any.whl

pixeltable 0.3.12__py3-none-any.whl → 0.3.14__py3-none-any.whl