pixeltable-0.4.15-py3-none-any.whl → pixeltable-0.4.17-py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release.


Files changed (68)
  1. pixeltable/__init__.py +4 -0
  2. pixeltable/catalog/catalog.py +125 -63
  3. pixeltable/catalog/column.py +7 -2
  4. pixeltable/catalog/table.py +1 -0
  5. pixeltable/catalog/table_metadata.py +4 -0
  6. pixeltable/catalog/table_version.py +174 -117
  7. pixeltable/catalog/table_version_handle.py +4 -1
  8. pixeltable/catalog/table_version_path.py +0 -11
  9. pixeltable/catalog/view.py +6 -0
  10. pixeltable/config.py +7 -0
  11. pixeltable/dataframe.py +10 -5
  12. pixeltable/env.py +56 -19
  13. pixeltable/exec/__init__.py +2 -0
  14. pixeltable/exec/cell_materialization_node.py +231 -0
  15. pixeltable/exec/cell_reconstruction_node.py +135 -0
  16. pixeltable/exec/exec_node.py +1 -1
  17. pixeltable/exec/expr_eval/evaluators.py +1 -0
  18. pixeltable/exec/expr_eval/expr_eval_node.py +3 -0
  19. pixeltable/exec/expr_eval/globals.py +2 -0
  20. pixeltable/exec/globals.py +32 -0
  21. pixeltable/exec/object_store_save_node.py +1 -4
  22. pixeltable/exec/row_update_node.py +16 -9
  23. pixeltable/exec/sql_node.py +107 -14
  24. pixeltable/exprs/__init__.py +1 -1
  25. pixeltable/exprs/arithmetic_expr.py +23 -18
  26. pixeltable/exprs/column_property_ref.py +10 -10
  27. pixeltable/exprs/column_ref.py +2 -2
  28. pixeltable/exprs/data_row.py +106 -37
  29. pixeltable/exprs/expr.py +9 -0
  30. pixeltable/exprs/expr_set.py +14 -7
  31. pixeltable/exprs/inline_expr.py +2 -19
  32. pixeltable/exprs/json_path.py +45 -12
  33. pixeltable/exprs/row_builder.py +54 -22
  34. pixeltable/functions/__init__.py +1 -0
  35. pixeltable/functions/bedrock.py +7 -0
  36. pixeltable/functions/deepseek.py +11 -4
  37. pixeltable/functions/llama_cpp.py +7 -0
  38. pixeltable/functions/math.py +1 -1
  39. pixeltable/functions/ollama.py +7 -0
  40. pixeltable/functions/openai.py +4 -4
  41. pixeltable/functions/openrouter.py +143 -0
  42. pixeltable/functions/video.py +110 -28
  43. pixeltable/globals.py +10 -4
  44. pixeltable/io/globals.py +18 -17
  45. pixeltable/io/parquet.py +1 -1
  46. pixeltable/io/table_data_conduit.py +47 -22
  47. pixeltable/iterators/document.py +61 -23
  48. pixeltable/iterators/video.py +126 -53
  49. pixeltable/metadata/__init__.py +1 -1
  50. pixeltable/metadata/converters/convert_40.py +73 -0
  51. pixeltable/metadata/notes.py +1 -0
  52. pixeltable/plan.py +175 -46
  53. pixeltable/share/packager.py +155 -26
  54. pixeltable/store.py +2 -3
  55. pixeltable/type_system.py +5 -3
  56. pixeltable/utils/arrow.py +6 -6
  57. pixeltable/utils/av.py +65 -0
  58. pixeltable/utils/console_output.py +4 -1
  59. pixeltable/utils/exception_handler.py +5 -28
  60. pixeltable/utils/image.py +7 -0
  61. pixeltable/utils/misc.py +5 -0
  62. pixeltable/utils/object_stores.py +16 -1
  63. pixeltable/utils/s3_store.py +44 -11
  64. {pixeltable-0.4.15.dist-info → pixeltable-0.4.17.dist-info}/METADATA +29 -28
  65. {pixeltable-0.4.15.dist-info → pixeltable-0.4.17.dist-info}/RECORD +68 -61
  66. {pixeltable-0.4.15.dist-info → pixeltable-0.4.17.dist-info}/WHEEL +0 -0
  67. {pixeltable-0.4.15.dist-info → pixeltable-0.4.17.dist-info}/entry_points.txt +0 -0
  68. {pixeltable-0.4.15.dist-info → pixeltable-0.4.17.dist-info}/licenses/LICENSE +0 -0
pixeltable/share/packager.py CHANGED
@@ -1,5 +1,4 @@
 import base64
-import datetime
 import io
 import json
 import logging
@@ -13,6 +12,7 @@ from uuid import UUID
 
 import more_itertools
 import numpy as np
+import pgvector.sqlalchemy as sql_vector  # type: ignore[import-untyped]
 import PIL.Image
 import pyarrow as pa
 import pyarrow.parquet as pq
@@ -21,6 +21,7 @@ import sqlalchemy as sql
 import pixeltable as pxt
 from pixeltable import catalog, exceptions as excs, metadata, type_system as ts
 from pixeltable.env import Env
+from pixeltable.exprs.data_row import CellMd
 from pixeltable.metadata import schema
 from pixeltable.utils import sha256sum
 from pixeltable.utils.formatter import Formatter
@@ -109,9 +110,12 @@ class TablePackager:
         assert any(tv.id == base.id for base in self.table._tbl_version_path.get_tbl_versions())
         sql_types = {col.name: col.type for col in tv.store_tbl.sa_tbl.columns}
         media_cols: set[str] = set()
+        cellmd_cols: set[str] = set()
         for col in tv.cols:
             if col.is_stored and col.col_type.is_media_type():
                 media_cols.add(col.store_name())
+            if col.stores_cellmd:
+                cellmd_cols.add(col.cellmd_store_name())
 
         parquet_schema = self.__to_parquet_schema(tv.store_tbl.sa_tbl)
         # TODO: Partition larger tables into multiple parquet files. (The parquet file naming scheme anticipates
@@ -126,10 +130,10 @@
         # excessive memory usage. The pyarrow tables are then amalgamated into the (single) Parquet table on disk.
         # We use snappy compression for the Parquet tables; the entire bundle will be bzip2-compressed later, so
         # faster compression should provide good performance while still reducing temporary storage utilization.
-        parquet_writer = pq.ParquetWriter(parquet_file, parquet_schema, compression='SNAPPY')
+        parquet_writer = pq.ParquetWriter(parquet_file, parquet_schema, compression='snappy')
         filter_tv = self.table._tbl_version_path.tbl_version.get()
         row_iter = tv.store_tbl.dump_rows(tv.version, filter_tv.store_tbl, filter_tv.version)
-        for pa_table in self.__to_pa_tables(row_iter, sql_types, media_cols, parquet_schema):
+        for pa_table in self.__to_pa_tables(row_iter, sql_types, media_cols, cellmd_cols, parquet_schema):
             parquet_writer.write_table(pa_table)
         parquet_writer.close()
 
@@ -138,7 +142,7 @@
     @classmethod
     def __to_parquet_schema(cls, store_tbl: sql.Table) -> pa.Schema:
         entries = [(col_name, cls.__to_parquet_type(col.type)) for col_name, col in store_tbl.columns.items()]
-        return pa.schema(entries)  # type: ignore[arg-type]
+        return pa.schema(entries)
 
     @classmethod
     def __to_parquet_type(cls, col_type: sql.types.TypeEngine[Any]) -> pa.DataType:
@@ -151,13 +155,17 @@
         if isinstance(col_type, sql.Float):
             return pa.float32()
         if isinstance(col_type, sql.TIMESTAMP):
-            return pa.timestamp('us', tz=datetime.timezone.utc)
+            return pa.timestamp('us', tz='UTC')
         if isinstance(col_type, sql.Date):
             return pa.date32()
         if isinstance(col_type, sql.JSON):
             return pa.string()  # JSON will be exported as strings
         if isinstance(col_type, sql.LargeBinary):
             return pa.binary()
+        if isinstance(col_type, sql_vector.Vector):
+            # Parquet/pyarrow do not handle null values properly for fixed_shape_tensor(), so we have to use list_()
+            # here instead.
+            return pa.list_(pa.float32())
         raise AssertionError(f'Unrecognized SQL type: {col_type} (type {type(col_type)})')
 
     def __to_pa_tables(
@@ -165,6 +173,7 @@
         row_iter: Iterator[dict[str, Any]],
         sql_types: dict[str, sql.types.TypeEngine[Any]],
         media_cols: set[str],
+        cellmd_cols: set[str],
         arrow_schema: pa.Schema,
         batch_size: int = 1_000,
     ) -> Iterator[pa.Table]:
@@ -176,14 +185,21 @@
        for rows in more_itertools.batched(row_iter, batch_size):
            cols = {}
            for name, sql_type in sql_types.items():
-                is_media_col = name in media_cols
-                values = [self.__to_pa_value(row.get(name), sql_type, is_media_col) for row in rows]
+                values = [
+                    self.__to_pa_value(row.get(name), sql_type, name in media_cols, name in cellmd_cols) for row in rows
+                ]
                cols[name] = values
            yield pa.Table.from_pydict(cols, schema=arrow_schema)
 
-    def __to_pa_value(self, val: Any, sql_type: sql.types.TypeEngine[Any], is_media_col: bool) -> Any:
+    def __to_pa_value(
+        self, val: Any, sql_type: sql.types.TypeEngine[Any], is_media_col: bool, is_cellmd_col: bool
+    ) -> Any:
         if val is None:
             return None
+        if is_cellmd_col:
+            assert isinstance(val, dict)
+            # Export JSON as strings
+            return json.dumps(self.__process_cellmd(val))
         if isinstance(sql_type, sql.JSON):
             # Export JSON as strings
             return json.dumps(val)
@@ -194,6 +210,10 @@
         return val
 
     def __process_media_url(self, url: str) -> str:
+        """
+        Process a media URL for export. If it's a local file URL (file://), then replace it with a pxtmedia:// URI,
+        copying the file into the tarball if necessary. If it's any other type of URL, return it unchanged.
+        """
         parsed_url = urllib.parse.urlparse(url)
         if parsed_url.scheme == 'file':
             # It's the URL of a local file. Replace it with a pxtmedia:// URI.
@@ -214,6 +234,21 @@
         # For any type of URL other than a local file, just return the URL as-is.
         return url
 
+    def __process_cellmd(self, cellmd: dict[str, Any]) -> dict[str, Any]:
+        """
+        Process a cellmd dictionary for export. This involves replacing any local file references
+        with pxtmedia:// URIs, as described above.
+        """
+        cellmd_ = CellMd.from_dict(cellmd)
+        if cellmd_.file_urls is None:
+            return cellmd  # No changes
+
+        updated_urls: list[str] = []
+        for url in cellmd_.file_urls:
+            updated_urls.append(self.__process_media_url(url))
+        cellmd_.file_urls = updated_urls
+        return cellmd_.as_dict()
+
     def __build_tarball(self) -> Path:
         bundle_path = self.tmp_dir / 'bundle.tar.bz2'
         with tarfile.open(bundle_path, 'w:bz2') as tf:
@@ -409,6 +444,9 @@ class TableRestorer:
         # 2. "rectify" the v_max values in both the temporary table and the existing table (more on this below);
         # 3. Delete any row instances from the temporary table that are already present in the existing table;
         # 4. Copy the remaining rows from the temporary table into the existing table.
+        # 5. Rectify any index columns.
+
+        # STEP 1: Import the parquet data into a temporary table.
 
         # Create a temporary table for the initial data load, containing columns for all columns present in the
         # parquet table. The parquet columns have identical names to those in the store table, so we can use the
@@ -416,7 +454,7 @@
         # e.g., pa.string() may hold either VARCHAR or serialized JSONB).
         temp_cols: dict[str, sql.Column] = {}
         for field in parquet_table.schema:
-            assert field.name in store_sa_tbl.columns
+            assert field.name in store_sa_tbl.columns, f'{field.name} not in {list(store_sa_tbl.columns)}'
             col_type = store_sa_tbl.columns[field.name].type
             temp_cols[field.name] = sql.Column(field.name, col_type)
         temp_sa_tbl_name = f'temp_{uuid.uuid4().hex}'
@@ -432,6 +470,8 @@
             rows = self.__from_pa_pydict(tv, pydict)
             conn.execute(sql.insert(temp_sa_tbl), rows)
 
+        # STEP 2: Rectify v_max values.
+
         # Each row version is identified uniquely by its pk, a tuple (row_id, pos_0, pos_1, ..., pos_k, v_min).
         # Conversely, v_max is not part of the primary key, but is simply a bookkeeping device.
         # In an original table, v_max is always equal to the v_min of the succeeding row instance with the same
@@ -540,6 +580,8 @@
         result = conn.execute(q)
         _logger.debug(f'Rectified {result.rowcount} row(s) in {store_sa_tbl_name!r}.')
 
+        # STEP 3: Delete any row instances from the temporary table that are already present in the existing table.
+
         # Now we need to update rows in the existing table that are also present in the temporary table. This is to
         # account for the scenario where the temporary table has columns that are not present in the existing table.
         # (We can't simply replace the rows with their versions in the temporary table, because the converse scenario
@@ -570,7 +612,9 @@
         result = conn.execute(q)
         _logger.debug(f'Deleted {result.rowcount} row(s) from {temp_sa_tbl_name!r}.')
 
-        # Finally, copy the remaining data (consisting entirely of new row instances) from the temporary table into
+        # STEP 4: Copy the remaining rows from the temporary table into the existing table.
+
+        # Now copy the remaining data (consisting entirely of new row instances) from the temporary table into
         # the actual table.
         q = store_sa_tbl.insert().from_select(
             [store_sa_tbl.c[col_name] for col_name in temp_cols], sql.select(*temp_cols.values())
@@ -579,39 +623,113 @@
         result = conn.execute(q)
         _logger.debug(f'Inserted {result.rowcount} row(s) from {temp_sa_tbl_name!r} into {store_sa_tbl_name!r}.')
 
+        # STEP 5: Rectify any index columns.
+
+        # Finally, rectify any index columns in the table. This involves shuffling data between the index's val and
+        # undo columns to ensure they appropriately reflect the most recent replicated version of the table.
+
+        # Get the most recent replicated version of the table. This might be the version we're currently importing,
+        # but it might be a different version of the table that was previously imported.
+        head_version_md = catalog.Catalog.get()._collect_tbl_history(tv.id, n=1)[0]
+        head_version = head_version_md.version_md.version
+        _logger.debug(f'Head version for index rectification is {head_version}.')
+
+        # Get the index info from the table metadata. Here we use the tbl_md that we just collected from the DB.
+        # This is to ensure we pick up ALL indices, including dropped indices and indices that are present in
+        # a previously replicated version of the table, but not in the one currently being imported.
+        index_md = head_version_md.tbl_md.index_md
+
+        # Now update the table. We can do this for all indices together with just two SQL queries. For each index,
+        # at most one of the val or undo columns will be non-NULL in any given row.
+        # For rows where v_min <= head_version < v_max, we set, for all indices:
+        #     val_col = whichever of (val_col, undo_col) is non-NULL (or NULL if both are, e.g., for a dropped index)
+        #     undo_col = NULL
+        # For rows where head_version < v_min or v_max <= head_version, vice versa.
+        val_sql_clauses: dict[str, sql.ColumnElement] = {}
+        undo_sql_clauses: dict[str, sql.ColumnElement] = {}
+        for index in index_md.values():
+            if index.class_fqn.endswith('.EmbeddingIndex'):
+                val_col_name = f'col_{index.index_val_col_id}'
+                undo_col_name = f'col_{index.index_val_undo_col_id}'
+                # Check that the val column for the index is actually present in the store table. We need to do this
+                # to properly handle the case where the replica represents a table version that was *not* the most
+                # recent version at the time it was published. In that case, it is possible for tbl_md to contain
+                # metadata for indices not known to any version that has been replicated. (However, the converse
+                # *does* hold: all replicated indices must have metadata in tbl_md; and that's what's important.)
+                if val_col_name in store_sa_tbl.c:
+                    assert undo_col_name in store_sa_tbl.c
+                    coalesce = sql.func.coalesce(store_sa_tbl.c[val_col_name], store_sa_tbl.c[undo_col_name])
+                    val_sql_clauses[val_col_name] = coalesce
+                    val_sql_clauses[undo_col_name] = sql.null()
+                    undo_sql_clauses[undo_col_name] = coalesce
+                    undo_sql_clauses[val_col_name] = sql.null()
+
+        if len(val_sql_clauses) > 0:
+            q2 = (
+                store_sa_tbl.update()
+                .values(**val_sql_clauses)
+                .where(sql.and_(tv.store_tbl.v_min_col <= head_version, tv.store_tbl.v_max_col > head_version))
+            )
+            _logger.debug(q2.compile())
+            _ = conn.execute(q2)
+            q2 = (
+                store_sa_tbl.update()
+                .values(**undo_sql_clauses)
+                .where(sql.or_(tv.store_tbl.v_min_col > head_version, tv.store_tbl.v_max_col <= head_version))
+            )
+            _logger.debug(q2.compile())
+            _ = conn.execute(q2)
+            _logger.debug(f'Rectified index columns in {store_sa_tbl_name!r}.')
+        else:
+            _logger.debug(f'No index columns to rectify in {store_sa_tbl_name!r}.')
+
     def __from_pa_pydict(self, tv: catalog.TableVersion, pydict: dict[str, Any]) -> list[dict[str, Any]]:
         # Data conversions from pyarrow to Pixeltable
         sql_types: dict[str, sql.types.TypeEngine[Any]] = {}
         for col_name in pydict:
             assert col_name in tv.store_tbl.sa_tbl.columns
             sql_types[col_name] = tv.store_tbl.sa_tbl.columns[col_name].type
-        media_cols: dict[str, catalog.Column] = {}
-        for col in tv.cols:
-            if col.is_stored and col.col_type.is_media_type():
-                assert tv.id == col.tbl.id
-                assert tv.version == col.tbl.version
-                media_cols[col.store_name()] = col
+        stored_cols: dict[str, catalog.Column] = {col.store_name(): col for col in tv.cols if col.is_stored}
+        stored_cols |= {col.cellmd_store_name(): col for col in tv.cols if col.stores_cellmd}
 
         row_count = len(next(iter(pydict.values())))
-        rows: list[dict[str, Any]] = []
-        for i in range(row_count):
-            row = {
-                col_name: self.__from_pa_value(col_vals[i], sql_types[col_name], media_cols.get(col_name))
-                for col_name, col_vals in pydict.items()
-            }
-            rows.append(row)
+        rows: list[dict[str, Any]] = [{} for _ in range(row_count)]
+        for col_name, col_vals in pydict.items():
+            assert len(col_vals) == row_count
+            col = stored_cols.get(col_name)  # Will be None for system columns
+            is_media_col = col is not None and col.is_stored and col.col_type.is_media_type()
+            is_cellmd_col = col is not None and col.stores_cellmd and col_name == col.cellmd_store_name()
+            assert col is None or is_cellmd_col or col_name == col.store_name()
+
+            for i, val in enumerate(col_vals):
+                rows[i][col_name] = self.__from_pa_value(val, sql_types[col_name], col, is_media_col, is_cellmd_col)
 
         return rows
 
     def __from_pa_value(
-        self, val: Any, sql_type: sql.types.TypeEngine[Any], media_col: Optional[catalog.Column]
+        self,
+        val: Any,
+        sql_type: sql.types.TypeEngine[Any],
+        col: Optional[catalog.Column],
+        is_media_col: bool,
+        is_cellmd_col: bool,
     ) -> Any:
         if val is None:
             return None
+        if isinstance(sql_type, sql_vector.Vector):
+            if isinstance(val, list):
+                val = np.array(val, dtype=np.float32)
+            assert isinstance(val, np.ndarray) and val.dtype == np.float32 and val.ndim == 1
+            return val
+        if is_cellmd_col:
+            assert col is not None
+            assert isinstance(val, str)
+            return self.__restore_cellmd(col, json.loads(val))
         if isinstance(sql_type, sql.JSON):
             return json.loads(val)
-        if media_col is not None:
-            return self.__relocate_media_file(media_col, val)
+        if is_media_col:
+            assert col is not None
+            return self.__relocate_media_file(col, val)
         return val
 
     def __relocate_media_file(self, media_col: catalog.Column, url: str) -> str:
@@ -629,3 +747,14 @@
             return self.media_files[url]
         # For any type of URL other than a local file, just return the URL as-is.
         return url
+
+    def __restore_cellmd(self, col: catalog.Column, cellmd: dict[str, Any]) -> dict[str, Any]:
+        cellmd_ = CellMd.from_dict(cellmd)
+        if cellmd_.file_urls is None:
+            return cellmd  # No changes
+
+        updated_urls: list[str] = []
+        for url in cellmd_.file_urls:
+            updated_urls.append(self.__relocate_media_file(col, url))
+        cellmd_.file_urls = updated_urls
+        return cellmd_.as_dict()
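
Note: __process_cellmd (export) and __restore_cellmd (import) follow the same round-trip: deserialize the cellmd dict, rewrite any file URLs, and re-serialize. A minimal standalone sketch of that pattern, using a hypothetical simplified stand-in rather than the real pixeltable.exprs.data_row.CellMd:

    from dataclasses import dataclass
    from typing import Any, Callable, Optional

    @dataclass
    class CellMdSketch:  # hypothetical stand-in for CellMd
        file_urls: Optional[list[str]] = None

        @classmethod
        def from_dict(cls, d: dict[str, Any]) -> 'CellMdSketch':
            return cls(file_urls=d.get('file_urls'))

        def as_dict(self) -> dict[str, Any]:
            return {'file_urls': self.file_urls}

    def rewrite_cellmd(cellmd: dict[str, Any], rewrite_url: Callable[[str], str]) -> dict[str, Any]:
        md = CellMdSketch.from_dict(cellmd)
        if md.file_urls is None:
            return cellmd  # nothing to rewrite
        md.file_urls = [rewrite_url(u) for u in md.file_urls]
        return md.as_dict()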
pixeltable/store.py CHANGED
@@ -321,7 +321,7 @@ class StoreBase:
             table_row, num_row_exc = row_builder.create_store_table_row(row, cols_with_excs, pk)
             num_excs += num_row_exc
 
-            if show_progress:
+            if show_progress and Env.get().verbosity >= 1:
                 if progress_bar is None:
                     warnings.simplefilter('ignore', category=TqdmWarning)
                     progress_bar = tqdm(
@@ -434,8 +434,7 @@
             *[c1 == c2 for c1, c2 in zip(self.rowid_columns(), filter_view.rowid_columns())],
         )
         stmt = (
-            sql.select('*')  # TODO: Use a more specific list of columns?
-            .select_from(self.sa_tbl)
+            sql.select(self.sa_tbl)
            .where(self.v_min_col <= version)
            .where(self.v_max_col > version)
            .where(sql.exists().where(filter_predicate))
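
Note on the select change above: in SQLAlchemy 2.x, passing a Table to select() produces an explicit column list for that table, replacing the older select('*').select_from(...) form. A small standalone illustration (toy table, not pixeltable's actual schema):

    import sqlalchemy as sql

    md = sql.MetaData()
    t = sql.Table('t', md, sql.Column('id', sql.Integer), sql.Column('v_min', sql.BigInteger))
    stmt = sql.select(t).where(t.c.v_min <= 5)  # selects t.id, t.v_min explicitly
    print(stmt)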
pixeltable/type_system.py CHANGED
@@ -25,6 +25,7 @@ import sqlalchemy as sql
 from typing_extensions import _AnnotatedAlias
 
 import pixeltable.exceptions as excs
+from pixeltable.env import Env
 from pixeltable.utils import parse_local_file_path
 
 
@@ -673,8 +674,9 @@ class TimestampType(ColumnType):
     def _create_literal(self, val: Any) -> Any:
         if isinstance(val, str):
             return datetime.datetime.fromisoformat(val)
-        if isinstance(val, datetime.datetime):
-            return val
+        # Place naive timestamps in the default time zone
+        if isinstance(val, datetime.datetime) and val.tzinfo is None:
+            return val.replace(tzinfo=Env.get().default_time_zone)
         return val
 
 
@@ -760,7 +762,7 @@ class JsonType(ColumnType):
 
     @classmethod
     def __is_valid_json(cls, val: Any) -> bool:
-        if val is None or isinstance(val, (str, int, float, bool)):
+        if val is None or isinstance(val, (str, int, float, bool, np.ndarray, PIL.Image.Image)):
             return True
         if isinstance(val, (list, tuple)):
             return all(cls.__is_valid_json(v) for v in val)
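
Note: the _create_literal change above localizes naive datetime literals to the configured default time zone instead of passing them through unchanged. A small standalone illustration of that behavior (the time zone here is an arbitrary example, not pixeltable's default):

    import datetime
    from zoneinfo import ZoneInfo

    default_tz = ZoneInfo('America/Los_Angeles')  # stand-in for Env.get().default_time_zone
    naive = datetime.datetime(2024, 1, 1, 12, 0)
    aware = naive.replace(tzinfo=default_tz) if naive.tzinfo is None else naive
    print(aware.isoformat())  # 2024-01-01T12:00:00-08:00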
pixeltable/utils/arrow.py CHANGED
@@ -16,7 +16,7 @@ if TYPE_CHECKING:
 PA_TO_PXT_TYPES: dict[pa.DataType, ts.ColumnType] = {
     pa.string(): ts.StringType(nullable=True),
     pa.large_string(): ts.StringType(nullable=True),
-    pa.timestamp('us', tz=datetime.timezone.utc): ts.TimestampType(nullable=True),
+    pa.timestamp('us', tz='UTC'): ts.TimestampType(nullable=True),
     pa.bool_(): ts.BoolType(nullable=True),
     pa.int8(): ts.IntType(nullable=True),
     pa.int16(): ts.IntType(nullable=True),
@@ -35,7 +35,7 @@ PA_TO_PXT_TYPES: dict[pa.DataType, ts.ColumnType] = {
 
 PXT_TO_PA_TYPES: dict[type[ts.ColumnType], pa.DataType] = {
     ts.StringType: pa.string(),
-    ts.TimestampType: pa.timestamp('us', tz=datetime.timezone.utc),  # postgres timestamp is microseconds
+    ts.TimestampType: pa.timestamp('us', tz='UTC'),  # postgres timestamp is microseconds
     ts.DateType: pa.date32(),  # This could be date64
     ts.BoolType: pa.bool_(),
     ts.IntType: pa.int64(),
@@ -61,7 +61,7 @@ def to_pixeltable_type(arrow_type: pa.DataType, nullable: bool) -> Optional[ts.C
         dtype = to_pixeltable_type(arrow_type.value_type, nullable)
         if dtype is None:
             return None
-        return ts.ArrayType(shape=arrow_type.shape, dtype=dtype, nullable=nullable)
+        return ts.ArrayType(shape=tuple(arrow_type.shape), dtype=dtype, nullable=nullable)
     else:
         return None
 
@@ -92,7 +92,7 @@ def to_pxt_schema(
 
 
 def to_arrow_schema(pixeltable_schema: dict[str, Any]) -> pa.Schema:
-    return pa.schema((name, to_arrow_type(typ)) for name, typ in pixeltable_schema.items())  # type: ignore[misc]
+    return pa.schema((name, to_arrow_type(typ)) for name, typ in pixeltable_schema.items())
 
 
 def _to_record_batch(column_vals: dict[str, list[Any]], schema: pa.Schema) -> pa.RecordBatch:
@@ -106,7 +106,7 @@ def _to_record_batch(column_vals: dict[str, list[Any]], schema: pa.Schema) -> pa
         else:
             pa_array = cast(pa.Array, pa.array(column_vals[field.name]))
         pa_arrays.append(pa_array)
-    return pa.RecordBatch.from_arrays(pa_arrays, schema=schema)  # type: ignore
+    return pa.RecordBatch.from_arrays(pa_arrays, schema=schema)
 
 
 def to_record_batches(df: 'pxt.DataFrame', batch_size_bytes: int) -> Iterator[pa.RecordBatch]:
@@ -192,7 +192,7 @@ def to_pydict(batch: pa.Table | pa.RecordBatch) -> dict[str, list | np.ndarray]:
         col = batch.column(k)
         if isinstance(col.type, pa.FixedShapeTensorType):
             # treat array columns as numpy arrays to easily preserve numpy type
-            out[name] = col.to_numpy(zero_copy_only=False)  # type: ignore[call-arg]
+            out[name] = col.to_numpy(zero_copy_only=False)
         else:
             # for the rest, use pydict to preserve python types
             out[name] = col.to_pylist()
pixeltable/utils/av.py CHANGED
@@ -3,6 +3,8 @@ from typing import Any
 import av
 import av.stream
 
+from pixeltable.env import Env
+
 
 def get_metadata(path: str) -> dict:
     with av.open(path) as container:
@@ -109,3 +111,66 @@ def ffmpeg_clip_cmd(input_path: str, output_path: str, start_time: float, durati
         ]
     )
     return cmd
+
+
+def ffmpeg_segment_cmd(
+    input_path: str,
+    output_pattern: str,
+    segment_duration: float | None = None,
+    segment_times: list[float] | None = None,
+    video_encoder: str | None = None,
+    video_encoder_args: dict[str, Any] | None = None,
+) -> list[str]:
+    """Commandline for frame-accurate segmentation"""
+    assert (segment_duration is None) != (segment_times is None)
+    if video_encoder is None:
+        video_encoder = Env.get().default_video_encoder
+
+    cmd = [
+        'ffmpeg',
+        '-i',
+        input_path,
+        '-f',
+        'segment',  # Use segment muxer
+    ]
+
+    if segment_duration is not None:
+        cmd.extend(
+            [
+                '-segment_time',
+                str(segment_duration),  # Target segment duration
+                '-break_non_keyframes',
+                '1',  # need to break at non-keyframes to get frame-accurate segments
+                '-force_key_frames',
+                f'expr:gte(t,n_forced*{segment_duration})',  # Force keyframe at each segment boundary
+            ]
+        )
+    else:
+        assert segment_times is not None
+        times_str = ','.join([str(t) for t in segment_times])
+        cmd.extend(['-segment_times', times_str, '-force_key_frames', times_str])
+
+    cmd.extend(
+        [
+            '-reset_timestamps',
+            '1',  # Reset timestamps for each segment
+            '-map',
+            '0',  # Copy all streams from input
+            '-c:a',
+            'copy',  # don't re-encode audio
+            '-c:v',
+            video_encoder,  # re-encode video
+        ]
+    )
+    if video_encoder_args is not None:
+        for k, v in video_encoder_args.items():
+            cmd.extend([f'-{k}', str(v)])
+
+    cmd.extend(
+        [
+            '-loglevel',
+            'error',  # Only show errors
+            output_pattern,
+        ]
+    )
+    return cmd
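
Note: a hypothetical usage sketch for the new ffmpeg_segment_cmd helper; the paths, encoder and encoder args below are placeholders, and ffmpeg must be on PATH:

    import subprocess

    cmd = ffmpeg_segment_cmd(
        input_path='input.mp4',
        output_pattern='segment_%03d.mp4',   # segment muxer output pattern
        segment_duration=10.0,               # ~10s frame-accurate segments
        video_encoder='libx264',             # otherwise the configured default encoder is used
        video_encoder_args={'crf': '23'},    # forwarded as '-crf 23'
    )
    subprocess.run(cmd, check=True)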
pixeltable/utils/console_output.py CHANGED
@@ -1,6 +1,8 @@
 import logging
 from typing import TextIO
 
+from pixeltable import exceptions as excs
+
 
 def map_level(verbosity: int) -> int:
     """
@@ -19,7 +21,8 @@ def map_level(verbosity: int) -> int:
         return logging.INFO
     if verbosity == 2:
         return logging.DEBUG
-    return logging.INFO
+
+    raise excs.Error(f'Invalid verbosity level: {verbosity}')
 
 
 class ConsoleOutputHandler(logging.StreamHandler):
pixeltable/utils/exception_handler.py CHANGED
@@ -1,32 +1,9 @@
 import logging
-import sys
 from typing import Any, Callable, Optional, TypeVar
 
 R = TypeVar('R')
 
-
-def _is_in_exception() -> bool:
-    """
-    Check if code is currently executing within an exception context.
-    """
-    current_exception = sys.exc_info()[1]
-    return current_exception is not None
-
-
-def run_cleanup_on_exception(cleanup_func: Callable[..., R], *args: Any, **kwargs: Any) -> Optional[R]:
-    """
-    Runs cleanup only when running in exception context.
-
-    The function `run_cleanup_on_exception()` should be used to clean up resources when an operation fails.
-    This is typically done using a try, except, and finally block, with the resource cleanup logic placed within
-    the except block. However, this pattern may not handle KeyboardInterrupt exceptions.
-    To ensure that resources are always cleaned up at least once when an exception or KeyboardInterrupt occurs,
-    create an idempotent function for cleaning up resources and pass it to the `run_cleanup_on_exception()` function
-    from the finally block.
-    """
-    if _is_in_exception():
-        return run_cleanup(cleanup_func, *args, raise_error=False, **kwargs)
-    return None
+logger = logging.getLogger('pixeltable')
 
 
 def run_cleanup(cleanup_func: Callable[..., R], *args: Any, raise_error: bool = True, **kwargs: Any) -> Optional[R]:
@@ -40,20 +17,20 @@ def run_cleanup(cleanup_func: Callable[..., R], *args: Any, raise_error: bool =
         raise_error: raise an exception if an error occurs during cleanup.
     """
     try:
-        logging.debug(f'Running cleanup function: {cleanup_func.__name__!r}')
+        logger.debug(f'Running cleanup function: {cleanup_func.__name__!r}')
         return cleanup_func(*args, **kwargs)
     except KeyboardInterrupt as interrupt:
         # Save original exception and re-attempt cleanup
         original_exception = interrupt
-        logging.debug(f'Cleanup {cleanup_func.__name__!r} interrupted, retrying')
+        logger.debug(f'Cleanup {cleanup_func.__name__!r} interrupted, retrying')
         try:
             return cleanup_func(*args, **kwargs)
         except Exception as e:
             # Suppress this exception
-            logging.error(f'Cleanup {cleanup_func.__name__!r} failed with exception {e}')
+            logger.error(f'Cleanup {cleanup_func.__name__!r} failed with exception {e.__class__}: {e}')
             raise KeyboardInterrupt from original_exception
     except Exception as e:
-        logging.error(f'Cleanup {cleanup_func.__name__!r} failed with exception {e}')
+        logger.error(f'Cleanup {cleanup_func.__name__!r} failed with exception {e.__class__}: {e}')
         if raise_error:
             raise e
         return None
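
Note: a minimal usage sketch for run_cleanup (defined above in pixeltable/utils/exception_handler.py), pairing it with an idempotent cleanup function called from a finally block; the work and cleanup functions here are hypothetical:

    def remove_temp_files() -> None:
        # Idempotent cleanup; safe to call more than once.
        ...

    def do_work() -> None:
        raise RuntimeError('simulated failure')

    try:
        do_work()
    except RuntimeError:
        pass
    finally:
        # raise_error=False suppresses cleanup errors so they never mask the original failure
        run_cleanup(remove_temp_files, raise_error=False)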
pixeltable/utils/image.py ADDED
@@ -0,0 +1,7 @@
+import PIL.Image
+
+
+def default_format(img: PIL.Image.Image) -> str:
+    # Default to JPEG unless the image has a transparency layer (which isn't supported by JPEG).
+    # In that case, use WebP instead.
+    return 'webp' if img.has_transparency_data else 'jpeg'
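
Note: a hypothetical usage sketch for default_format (from the new pixeltable/utils/image.py above); the image is synthetic and WebP encoding requires a Pillow build with WebP support:

    import io
    import PIL.Image

    img = PIL.Image.new('RGBA', (64, 64), (255, 0, 0, 128))  # has an alpha channel
    fmt = default_format(img)  # 'webp', since JPEG cannot store transparency
    buf = io.BytesIO()
    img.save(buf, format=fmt)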
pixeltable/utils/misc.py ADDED
@@ -0,0 +1,5 @@
+from typing import Any
+
+
+def non_none_dict_factory(d: list[tuple[str, Any]]) -> dict:
+    return {k: v for (k, v) in d if v is not None}
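
Note: non_none_dict_factory matches the dict_factory hook of dataclasses.asdict(); a plausible use is dropping None-valued fields when serializing a dataclass (the Example class below is hypothetical):

    import dataclasses
    from typing import Optional

    @dataclasses.dataclass
    class Example:
        name: str
        comment: Optional[str] = None

    d = dataclasses.asdict(Example(name='a'), dict_factory=non_none_dict_factory)
    assert d == {'name': 'a'}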
pixeltable/utils/object_stores.py CHANGED
@@ -22,6 +22,7 @@ class StorageTarget(enum.Enum):
     LOCAL_STORE = 'os'  # Local file system
     S3_STORE = 's3'  # Amazon S3
     R2_STORE = 'r2'  # Cloudflare R2
+    B2_STORE = 'b2'  # Backblaze B2
     GCS_STORE = 'gs'  # Google Cloud Storage
     AZURE_STORE = 'az'  # Azure Blob Storage
     HTTP_STORE = 'http'  # HTTP/HTTPS
@@ -63,6 +64,7 @@ class StorageObjectAddress(NamedTuple):
         StorageTarget.LOCAL_STORE,
         StorageTarget.S3_STORE,
         StorageTarget.R2_STORE,
+        StorageTarget.B2_STORE,
         StorageTarget.GCS_STORE,
         StorageTarget.AZURE_STORE,
         StorageTarget.HTTP_STORE,
@@ -218,15 +220,23 @@ class ObjectPath:
         # Standard HTTP(S) URL format
         # https://account.blob.core.windows.net/container/<optional path>/<optional object>
         # https://account.r2.cloudflarestorage.com/container/<optional path>/<optional object>
+        # https://s3.us-west-004.backblazeb2.com/container/<optional path>/<optional object>
         # and possibly others
         key = parsed.path
         if 'cloudflare' in parsed.netloc:
             storage_target = StorageTarget.R2_STORE
+        elif 'backblazeb2' in parsed.netloc:
+            storage_target = StorageTarget.B2_STORE
         elif 'windows' in parsed.netloc:
             storage_target = StorageTarget.AZURE_STORE
         else:
             storage_target = StorageTarget.HTTP_STORE
-        if storage_target in [StorageTarget.S3_STORE, StorageTarget.AZURE_STORE, StorageTarget.R2_STORE]:
+        if storage_target in (
+            StorageTarget.S3_STORE,
+            StorageTarget.AZURE_STORE,
+            StorageTarget.R2_STORE,
+            StorageTarget.B2_STORE,
+        ):
             account_name = parsed.netloc.split('.', 1)[0]
             account_extension = parsed.netloc.split('.', 1)[1]
             path_parts = key.lstrip('/').split('/', 1)
@@ -370,6 +380,11 @@ class ObjectOps:
             env.Env.get().require_package('boto3')
             from pixeltable.utils.s3_store import S3Store
 
+            return S3Store(soa)
+        if soa.storage_target == StorageTarget.B2_STORE:
+            env.Env.get().require_package('boto3')
+            from pixeltable.utils.s3_store import S3Store
+
             return S3Store(soa)
         if soa.storage_target == StorageTarget.GCS_STORE and soa.scheme == 'gs':
             env.Env.get().require_package('google.cloud.storage')