pixeltable 0.3.15__py3-none-any.whl → 0.4.0rc2__py3-none-any.whl

This diff shows the changes between two publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.


Files changed (58)
  1. pixeltable/__version__.py +2 -2
  2. pixeltable/catalog/catalog.py +296 -105
  3. pixeltable/catalog/column.py +10 -8
  4. pixeltable/catalog/dir.py +1 -2
  5. pixeltable/catalog/insertable_table.py +25 -20
  6. pixeltable/catalog/schema_object.py +3 -6
  7. pixeltable/catalog/table.py +261 -189
  8. pixeltable/catalog/table_version.py +333 -202
  9. pixeltable/catalog/table_version_handle.py +15 -2
  10. pixeltable/catalog/table_version_path.py +60 -14
  11. pixeltable/catalog/view.py +38 -6
  12. pixeltable/dataframe.py +196 -18
  13. pixeltable/env.py +4 -4
  14. pixeltable/exec/__init__.py +1 -1
  15. pixeltable/exec/expr_eval/evaluators.py +4 -1
  16. pixeltable/exec/in_memory_data_node.py +1 -1
  17. pixeltable/exec/sql_node.py +171 -22
  18. pixeltable/exprs/column_property_ref.py +15 -6
  19. pixeltable/exprs/column_ref.py +32 -11
  20. pixeltable/exprs/comparison.py +1 -1
  21. pixeltable/exprs/data_row.py +5 -3
  22. pixeltable/exprs/expr.py +7 -0
  23. pixeltable/exprs/literal.py +2 -0
  24. pixeltable/exprs/row_builder.py +4 -6
  25. pixeltable/exprs/rowid_ref.py +8 -0
  26. pixeltable/exprs/similarity_expr.py +1 -0
  27. pixeltable/func/query_template_function.py +1 -1
  28. pixeltable/func/tools.py +1 -1
  29. pixeltable/functions/gemini.py +0 -1
  30. pixeltable/functions/string.py +212 -58
  31. pixeltable/globals.py +12 -4
  32. pixeltable/index/base.py +5 -0
  33. pixeltable/index/btree.py +5 -0
  34. pixeltable/index/embedding_index.py +5 -0
  35. pixeltable/io/external_store.py +8 -29
  36. pixeltable/io/label_studio.py +1 -1
  37. pixeltable/io/parquet.py +2 -2
  38. pixeltable/io/table_data_conduit.py +0 -31
  39. pixeltable/metadata/__init__.py +11 -2
  40. pixeltable/metadata/converters/convert_13.py +2 -2
  41. pixeltable/metadata/converters/convert_30.py +6 -11
  42. pixeltable/metadata/converters/convert_35.py +9 -0
  43. pixeltable/metadata/converters/convert_36.py +38 -0
  44. pixeltable/metadata/converters/util.py +3 -9
  45. pixeltable/metadata/notes.py +2 -0
  46. pixeltable/metadata/schema.py +8 -1
  47. pixeltable/plan.py +221 -14
  48. pixeltable/share/packager.py +137 -13
  49. pixeltable/share/publish.py +2 -2
  50. pixeltable/store.py +19 -13
  51. pixeltable/utils/dbms.py +1 -1
  52. pixeltable/utils/formatter.py +64 -42
  53. pixeltable/utils/sample.py +25 -0
  54. {pixeltable-0.3.15.dist-info → pixeltable-0.4.0rc2.dist-info}/METADATA +2 -1
  55. {pixeltable-0.3.15.dist-info → pixeltable-0.4.0rc2.dist-info}/RECORD +58 -55
  56. {pixeltable-0.3.15.dist-info → pixeltable-0.4.0rc2.dist-info}/LICENSE +0 -0
  57. {pixeltable-0.3.15.dist-info → pixeltable-0.4.0rc2.dist-info}/WHEEL +0 -0
  58. {pixeltable-0.3.15.dist-info → pixeltable-0.4.0rc2.dist-info}/entry_points.txt +0 -0
pixeltable/share/packager.py CHANGED
@@ -1,4 +1,7 @@
+ import base64
  import datetime
+ import io
+ import itertools
  import json
  import logging
  import tarfile
@@ -7,17 +10,21 @@ import urllib.request
  import uuid
  from pathlib import Path
  from typing import Any, Iterator, Optional
+ from uuid import UUID

  import more_itertools
+ import numpy as np
+ import PIL.Image
  import pyarrow as pa
  import pyarrow.parquet as pq
  import sqlalchemy as sql

  import pixeltable as pxt
- from pixeltable import catalog, exceptions as excs, metadata
+ from pixeltable import catalog, exceptions as excs, metadata, type_system as ts
  from pixeltable.env import Env
  from pixeltable.metadata import schema
  from pixeltable.utils import sha256sum
+ from pixeltable.utils.formatter import Formatter
  from pixeltable.utils.media_store import MediaStore

  _logger = logging.getLogger('pixeltable')
@@ -45,13 +52,17 @@ class TablePackager:
      media_files: dict[Path, str]  # Mapping from local media file paths to their tarball names
      md: dict[str, Any]

+     bundle_path: Path
+     preview_header: dict[str, str]
+     preview: list[list[Any]]
+
      def __init__(self, table: catalog.Table, additional_md: Optional[dict[str, Any]] = None) -> None:
          self.table = table
          self.tmp_dir = Path(Env.get().create_tmp_path())
          self.media_files = {}

          # Load metadata
-         with Env.get().begin_xact():
+         with catalog.Catalog.get().begin_xact(for_write=False):
              tbl_md = catalog.Catalog.get().load_replica_md(table)
              self.md = {
                  'pxt_version': pxt.__version__,
@@ -66,20 +77,29 @@ class TablePackager:
          Export the table to a tarball containing Parquet tables and media files.
          """
          assert not self.tmp_dir.exists()  # Packaging can only be done once per TablePackager instance
-         _logger.info(f"Packaging table '{self.table._path}' and its ancestors in: {self.tmp_dir}")
+
+         _logger.info(f'Packaging table {self.table._path()!r} and its ancestors in: {self.tmp_dir}')
          self.tmp_dir.mkdir()
          with open(self.tmp_dir / 'metadata.json', 'w', encoding='utf8') as fp:
              json.dump(self.md, fp)
          self.tables_dir = self.tmp_dir / 'tables'
          self.tables_dir.mkdir()
-         with Env.get().begin_xact():
+         with catalog.Catalog.get().begin_xact(for_write=False):
              for tv in self.table._tbl_version_path.get_tbl_versions():
-                 _logger.info(f"Exporting table '{tv.get().name}:{tv.get().version}'.")
+                 _logger.info(f'Exporting table {tv.get().versioned_name!r}.')
                  self.__export_table(tv.get())
+
          _logger.info('Building archive.')
-         bundle_path = self.__build_tarball()
-         _logger.info(f'Packaging complete: {bundle_path}')
-         return bundle_path
+         self.bundle_path = self.__build_tarball()
+
+         _logger.info('Extracting preview data.')
+         self.md['count'] = self.table.count()
+         preview_header, preview = self.__extract_preview_data()
+         self.md['preview_header'] = preview_header
+         self.md['preview'] = preview
+
+         _logger.info(f'Packaging complete: {self.bundle_path}')
+         return self.bundle_path

      def __export_table(self, tv: catalog.TableVersion) -> None:
          """
@@ -206,6 +226,96 @@ class TablePackager:
              tf.add(src_file, arcname=f'media/{dest_name}')
          return bundle_path

+     def __extract_preview_data(self) -> tuple[dict[str, str], list[list[Any]]]:
+         """
+         Extract a preview of the table data for display in the UI.
+
+         In order to bound the size of the output data, all "unbounded" data types are resized:
+         - Strings are abbreviated as per Formatter.abbreviate()
+         - Arrays and JSON are shortened and formatted as strings
+         - Images are resized to thumbnail size as a base64-encoded webp
+         - Videos are replaced by their first frame and resized as above
+         - Documents are replaced by a thumbnail as a base64-encoded webp
+         """
+         # First 8 columns
+         preview_cols = dict(itertools.islice(self.table._schema.items(), 0, 8))
+         select_list = [self.table[col_name] for col_name in preview_cols]
+         # First 5 rows
+         rows = list(self.table.select(*select_list).head(n=5))
+
+         preview_header = {col_name: str(col_type._type) for col_name, col_type in preview_cols.items()}
+         preview = [
+             [self.__encode_preview_data(val, col_type)]
+             for row in rows
+             for val, col_type in zip(row.values(), preview_cols.values())
+         ]
+
+         return preview_header, preview
+
+     def __encode_preview_data(self, val: Any, col_type: ts.ColumnType) -> Any:
+         if val is None:
+             return None
+
+         match col_type._type:
+             case ts.ColumnType.Type.STRING:
+                 assert isinstance(val, str)
+                 return Formatter.abbreviate(val)
+
+             case ts.ColumnType.Type.INT | ts.ColumnType.Type.FLOAT | ts.ColumnType.Type.BOOL:
+                 return val
+
+             case ts.ColumnType.Type.TIMESTAMP | ts.ColumnType.Type.DATE:
+                 return str(val)
+
+             case ts.ColumnType.Type.ARRAY:
+                 assert isinstance(val, np.ndarray)
+                 return Formatter.format_array(val)
+
+             case ts.ColumnType.Type.JSON:
+                 # We need to escape the JSON string server-side for security reasons.
+                 # Therefore we don't escape it here, in order to avoid double-escaping.
+                 return Formatter.format_json(val, escape_strings=False)
+
+             case ts.ColumnType.Type.IMAGE:
+                 # Rescale the image to minimize data transfer size
+                 assert isinstance(val, PIL.Image.Image)
+                 return self.__encode_image(val)
+
+             case ts.ColumnType.Type.VIDEO:
+                 assert isinstance(val, str)
+                 return self.__encode_video(val)
+
+             case ts.ColumnType.Type.AUDIO:
+                 return None
+
+             case ts.ColumnType.Type.DOCUMENT:
+                 assert isinstance(val, str)
+                 return self.__encode_document(val)
+
+             case _:
+                 raise AssertionError(f'Unrecognized column type: {col_type._type}')
+
+     def __encode_image(self, img: PIL.Image.Image) -> str:
+         # Heuristic for thumbnail sizing:
+         # Standardize on a width of 240 pixels (to most efficiently utilize the columnar display).
+         # But, if the aspect ratio is below 2:3, bound the height at 360 pixels (to avoid unboundedly tall thumbnails
+         # in the case of highly oblong images).
+         if img.height > img.width * 1.5:
+             scaled_img = img.resize((img.width * 360 // img.height, 360))
+         else:
+             scaled_img = img.resize((240, img.height * 240 // img.width))
+         with io.BytesIO() as buffer:
+             scaled_img.save(buffer, 'webp')
+             return base64.b64encode(buffer.getvalue()).decode()
+
+     def __encode_video(self, video_path: str) -> Optional[str]:
+         thumb = Formatter.extract_first_video_frame(video_path)
+         return self.__encode_image(thumb) if thumb is not None else None
+
+     def __encode_document(self, doc_path: str) -> Optional[str]:
+         thumb = Formatter.make_document_thumbnail(doc_path)
+         return self.__encode_image(thumb) if thumb is not None else None
+

  class TableRestorer:
      """
@@ -253,13 +363,26 @@
          tbl_md = [schema.FullTableMd.from_dict(t) for t in self.md['md']['tables']]

          # Create the replica table
-         # TODO: This needs to be made concurrency-safe.
-         replica_tbl = catalog.Catalog.get().create_replica(catalog.Path(self.tbl_path), tbl_md)
-         assert replica_tbl._tbl_version.get().is_snapshot
+         # The logic here needs to be completely restructured in order to make it concurrency-safe.
+         # - Catalog.create_replica() needs to write the metadata and also create the physical store tables
+         #   and populate them, otherwise concurrent readers will see an inconsistent state (table metadata w/o
+         #   an actual table)
+         # - this could be done one replica at a time (instead of the entire hierarchy)
+         cat = catalog.Catalog.get()
+         cat.create_replica(catalog.Path(self.tbl_path), tbl_md)
+         # don't call get_table() until after the calls to create_replica() and __import_table() below;
+         # the TV instances created by get_table() would be replaced by create_replica(), which creates duplicate
+         # TV instances for the same replica version, which then leads to failures when constructing queries

          # Now we need to instantiate and load data for replica_tbl and its ancestors, except that we skip
          # replica_tbl itself if it's a pure snapshot.
-         if replica_tbl._id != replica_tbl._tbl_version.id:
+         target_md = tbl_md[0]
+         is_pure_snapshot = (
+             target_md.tbl_md.view_md is not None
+             and target_md.tbl_md.view_md.predicate is None
+             and len(target_md.schema_version_md.columns) == 0
+         )
+         if is_pure_snapshot:
              ancestor_md = tbl_md[1:]  # Pure snapshot; skip replica_tbl
          else:
              ancestor_md = tbl_md  # Not a pure snapshot; include replica_tbl
@@ -273,7 +396,8 @@
              _logger.info(f'Importing table {tv.name!r}.')
              self.__import_table(self.tmp_dir, tv, md)

-         return replica_tbl
+         with cat.begin_xact(for_write=False):
+             return cat.get_table_by_id(UUID(tbl_md[0].tbl_md.tbl_id))

      def __import_table(self, bundle_path: Path, tv: catalog.TableVersion, tbl_md: schema.FullTableMd) -> None:
          """
pixeltable/share/publish.py CHANGED
@@ -35,7 +35,7 @@ def push_replica(dest_tbl_uri: str, src_tbl: pxt.Table) -> str:
      upload_id = response_json['upload_id']
      destination_uri = response_json['destination_uri']

-     Env.get().console_logger.info(f"Creating a snapshot of '{src_tbl._path}' at: {dest_tbl_uri}")
+     Env.get().console_logger.info(f"Creating a snapshot of '{src_tbl._path()}' at: {dest_tbl_uri}")

      bundle = packager.package()

@@ -117,7 +117,7 @@ def pull_replica(dest_path: str, src_tbl_uri: str) -> pxt.Table:

      restorer = TableRestorer(dest_path, response_json)
      tbl = restorer.restore(bundle_path)
-     Env.get().console_logger.info(f'Created local replica {tbl._path!r} from URI: {src_tbl_uri}')
+     Env.get().console_logger.info(f'Created local replica {tbl._path()!r} from URI: {src_tbl_uri}')
      return tbl

pixeltable/store.py CHANGED
@@ -52,7 +52,8 @@ class StoreBase:
          # We need to declare a `base` variable here, even though it's only defined for instances of `StoreView`,
          # since it's referenced by various methods of `StoreBase`
          self.base = tbl_version.base.get().store_tbl if tbl_version.base is not None else None
-         self.create_sa_tbl()
+         # we're passing in tbl_version to avoid a circular call to TableVersionHandle.get()
+         self.create_sa_tbl(tbl_version)

      def system_columns(self) -> list[sql.Column]:
          return [*self._pk_cols, self.v_max_col]
@@ -77,11 +78,13 @@
          self._pk_cols = [*rowid_cols, self.v_min_col]
          return [*rowid_cols, self.v_min_col, self.v_max_col]

-     def create_sa_tbl(self) -> None:
+     def create_sa_tbl(self, tbl_version: Optional[catalog.TableVersion] = None) -> None:
          """Create self.sa_tbl from self.tbl_version."""
+         if tbl_version is None:
+             tbl_version = self.tbl_version.get()
          system_cols = self._create_system_columns()
          all_cols = system_cols.copy()
-         for col in [c for c in self.tbl_version.get().cols if c.is_stored]:
+         for col in [c for c in tbl_version.cols if c.is_stored]:
              # re-create sql.Column for each column, regardless of whether it already has sa_col set: it was bound
              # to the last sql.Table version we created and cannot be reused
              col.create_sa_cols()
@@ -99,16 +102,17 @@
          # - base x view joins can be executed as merge joins
          # - speeds up ORDER BY rowid DESC
          # - allows filtering for a particular table version in index scan
-         idx_name = f'sys_cols_idx_{self.tbl_version.id.hex}'
+         idx_name = f'sys_cols_idx_{tbl_version.id.hex}'
          idxs.append(sql.Index(idx_name, *system_cols))

          # v_min/v_max indices: speeds up base table scans needed to propagate a base table insert or delete
-         idx_name = f'vmin_idx_{self.tbl_version.id.hex}'
+         idx_name = f'vmin_idx_{tbl_version.id.hex}'
          idxs.append(sql.Index(idx_name, self.v_min_col, postgresql_using=Env.get().dbms.version_index_type))
-         idx_name = f'vmax_idx_{self.tbl_version.id.hex}'
+         idx_name = f'vmax_idx_{tbl_version.id.hex}'
          idxs.append(sql.Index(idx_name, self.v_max_col, postgresql_using=Env.get().dbms.version_index_type))

          self.sa_tbl = sql.Table(self._storage_name(), self.sa_md, *all_cols, *idxs)
+         # _logger.debug(f'created sa tbl for {tbl_version.id!s} (sa_tbl={id(self.sa_tbl):x}, tv={id(tbl_version):x})')

      @abc.abstractmethod
      def _rowid_join_predicate(self) -> sql.ColumnElement[bool]:
@@ -285,7 +289,7 @@
          else:
              if col.col_type.is_image_type() and result_row.file_urls[value_expr_slot_idx] is None:
                  # we have yet to store this image
-                 filepath = str(MediaStore.prepare_media_path(col.tbl.id, col.id, col.tbl.get().version))
+                 filepath = str(MediaStore.prepare_media_path(col.tbl.id, col.id, col.tbl.version))
                  result_row.flush_img(value_expr_slot_idx, filepath)
              val = result_row.get_stored_val(value_expr_slot_idx, col.sa_col.type)
          if col.col_type.is_media_type():
@@ -415,9 +419,7 @@
              number of deleted rows
          """
          where_clause = sql.true() if where_clause is None else where_clause
-         where_clause = sql.and_(
-             self.v_min_col < current_version, self.v_max_col == schema.Table.MAX_VERSION, where_clause
-         )
+         version_clause = sql.and_(self.v_min_col < current_version, self.v_max_col == schema.Table.MAX_VERSION)
          rowid_join_clause = self._rowid_join_predicate()
          base_versions_clause = (
              sql.true() if len(base_versions) == 0 else self.base._versions_clause(base_versions, match_on_vmin)
@@ -428,10 +430,12 @@
              set_clause[index_info.undo_col.sa_col] = index_info.val_col.sa_col
              # set value column to NULL
              set_clause[index_info.val_col.sa_col] = None
+
          stmt = (
              sql.update(self.sa_tbl)
              .values(set_clause)
              .where(where_clause)
+             .where(version_clause)
              .where(rowid_join_clause)
              .where(base_versions_clause)
          )
@@ -528,10 +532,12 @@ class StoreComponentView(StoreView):
              self.rowid_cols.append(self.pos_col)
          return self.rowid_cols

-     def create_sa_tbl(self) -> None:
-         super().create_sa_tbl()
+     def create_sa_tbl(self, tbl_version: Optional[catalog.TableVersion] = None) -> None:
+         if tbl_version is None:
+             tbl_version = self.tbl_version.get()
+         super().create_sa_tbl(tbl_version)
          # we need to fix up the 'pos' column in TableVersion
-         self.tbl_version.get().cols_by_name['pos'].sa_col = self.pos_col
+         tbl_version.cols_by_name['pos'].sa_col = self.pos_col

      def _rowid_join_predicate(self) -> sql.ColumnElement[bool]:
          return sql.and_(
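Note on the update-statement change above: the old code folded the version predicate into where_clause; the new code keeps where_clause and version_clause separate and relies on the fact that chained .where() calls on a SQLAlchemy statement are ANDed together. A minimal sketch of that equivalence (table and column names are illustrative, not pixeltable's):

import sqlalchemy as sql

t = sql.table('t', sql.column('v_min'), sql.column('v_max'), sql.column('x'))
combined = sql.update(t).values(x=None).where(sql.and_(t.c.v_min < 5, t.c.v_max == 99, t.c.x != 0))
chained = sql.update(t).values(x=None).where(t.c.x != 0).where(sql.and_(t.c.v_min < 5, t.c.v_max == 99))
# Both statements compile to the same conjunction of predicates (modulo ordering),
# so splitting the clauses preserves the original semantics.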
pixeltable/utils/dbms.py CHANGED
@@ -35,7 +35,7 @@ class PostgresqlDbms(Dbms):
      """

      def __init__(self, db_url: URL):
-         super().__init__('postgresql', 'REPEATABLE READ', 'brin', db_url)
+         super().__init__('postgresql', 'SERIALIZABLE', 'brin', db_url)

      def drop_db_stmt(self, database: str) -> str:
          return f'DROP DATABASE {database}'
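Note: this raises the PostgreSQL transaction isolation level from REPEATABLE READ to SERIALIZABLE. As a rough sketch of what the setting amounts to in SQLAlchemy terms (the URL is illustrative; pixeltable actually routes the value through its Dbms abstraction):

import sqlalchemy as sql

engine = sql.create_engine(
    'postgresql+psycopg://localhost/pixeltable',  # illustrative URL
    isolation_level='SERIALIZABLE',  # 0.3.15 used 'REPEATABLE READ'
)
# Under SERIALIZABLE, concurrent transactions whose interleaving cannot be
# serialized fail with a serialization error and must be retried by the caller.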
pixeltable/utils/formatter.py CHANGED
@@ -63,10 +63,10 @@ class Formatter:
          """
          Escapes special characters in `val`, and abbreviates `val` if its length exceeds `_STRING_MAX_LEN`.
          """
-         return cls.__escape(cls.__abbreviate(val, cls.__STRING_MAX_LEN))
+         return cls.__escape(cls.abbreviate(val))

      @classmethod
-     def __abbreviate(cls, val: str, max_len: int) -> str:
+     def abbreviate(cls, val: str, max_len: int = __STRING_MAX_LEN) -> str:
          if len(val) > max_len:
              edgeitems = (max_len - len(cls.__STRING_SEP)) // 2
              return f'{val[:edgeitems]}{cls.__STRING_SEP}{val[-edgeitems:]}'
@@ -94,41 +94,45 @@
          )

      @classmethod
-     def format_json(cls, val: Any) -> str:
+     def format_json(cls, val: Any, escape_strings: bool = True) -> str:
          if isinstance(val, str):
              # JSON-like formatting will be applied to strings that appear nested within a list or dict
              # (quote the string; escape any quotes inside the string; shorter abbreviations).
              # However, if the string appears in top-level position (i.e., the entire JSON value is a
              # string), then we format it like an ordinary string.
-             return cls.format_string(val)
+             return cls.format_string(val) if escape_strings else cls.abbreviate(val)
          # In all other cases, dump the JSON struct recursively.
-         return cls.__format_json_rec(val)
+         return cls.__format_json_rec(val, escape_strings)

      @classmethod
-     def __format_json_rec(cls, val: Any) -> str:
+     def __format_json_rec(cls, val: Any, escape_strings: bool) -> str:
          if isinstance(val, str):
-             return cls.__escape(json.dumps(cls.__abbreviate(val, cls.__NESTED_STRING_MAX_LEN)))
+             formatted = json.dumps(cls.abbreviate(val, cls.__NESTED_STRING_MAX_LEN))
+             return cls.__escape(formatted) if escape_strings else formatted
          if isinstance(val, float):
              return cls.format_float(val)
          if isinstance(val, np.ndarray):
              return cls.format_array(val)
          if isinstance(val, list):
              if len(val) < cls.__LIST_THRESHOLD:
-                 components = [cls.__format_json_rec(x) for x in val]
+                 components = [cls.__format_json_rec(x, escape_strings) for x in val]
              else:
-                 components = [cls.__format_json_rec(x) for x in val[: cls.__LIST_EDGEITEMS]]
+                 components = [cls.__format_json_rec(x, escape_strings) for x in val[: cls.__LIST_EDGEITEMS]]
                  components.append('...')
-                 components.extend(cls.__format_json_rec(x) for x in val[-cls.__LIST_EDGEITEMS :])
+                 components.extend(cls.__format_json_rec(x, escape_strings) for x in val[-cls.__LIST_EDGEITEMS :])
              return '[' + ', '.join(components) + ']'
          if isinstance(val, dict):
-             kv_pairs = (f'{cls.__format_json_rec(k)}: {cls.__format_json_rec(v)}' for k, v in val.items())
+             kv_pairs = (
+                 f'{cls.__format_json_rec(k, escape_strings)}: {cls.__format_json_rec(v, escape_strings)}'
+                 for k, v in val.items()
+             )
              return '{' + ', '.join(kv_pairs) + '}'

          # Everything else
          try:
              return json.dumps(val)
          except TypeError:  # Not JSON serializable
-             return str(val)
+             return cls.__escape(str(val))

      def format_img(self, img: Image.Image) -> str:
          """
@@ -152,22 +156,19 @@
          """

      def format_video(self, file_path: str) -> str:
-         thumb_tag = ''
          # Attempt to extract the first frame of the video to use as a thumbnail,
          # so that the notebook can be exported as HTML and viewed in contexts where
          # the video itself is not accessible.
          # TODO(aaron-siegel): If the video is backed by a concrete external URL,
          # should we link to that instead?
-         with av.open(file_path) as container:
-             try:
-                 thumb = next(container.decode(video=0)).to_image()
-                 assert isinstance(thumb, Image.Image)
-                 with io.BytesIO() as buffer:
-                     thumb.save(buffer, 'jpeg')
-                     thumb_base64 = base64.b64encode(buffer.getvalue()).decode()
-                     thumb_tag = f'poster="data:image/jpeg;base64,{thumb_base64}"'
-             except Exception:
-                 pass
+         thumb = self.extract_first_video_frame(file_path)
+         if thumb is None:
+             thumb_tag = ''
+         else:
+             with io.BytesIO() as buffer:
+                 thumb.save(buffer, 'jpeg')
+                 thumb_base64 = base64.b64encode(buffer.getvalue()).decode()
+                 thumb_tag = f'poster="data:image/jpeg;base64,{thumb_base64}"'
          if self.__num_rows > 1:
              width = 320
          elif self.__num_cols > 1:
@@ -182,6 +183,16 @@
          </div>
          """

+     @classmethod
+     def extract_first_video_frame(cls, file_path: str) -> Optional[Image.Image]:
+         with av.open(file_path) as container:
+             try:
+                 img = next(container.decode(video=0)).to_image()
+                 assert isinstance(img, Image.Image)
+                 return img
+             except Exception:
+                 return None
+
      def format_audio(self, file_path: str) -> str:
          return f"""
          <div class="pxt_audio">
@@ -191,29 +202,18 @@
          </div>
          """

-     def format_document(self, file_path: str) -> str:
-         max_width = max_height = 320
+     def format_document(self, file_path: str, max_width: int = 320, max_height: int = 320) -> str:
          # by default, file path will be shown as a link
          inner_element = file_path
          inner_element = html.escape(inner_element)
-         # try generating a thumbnail for different types and use that if successful
-         if file_path.lower().endswith('.pdf'):
-             try:
-                 import fitz  # type: ignore[import-untyped]

-                 doc = fitz.open(file_path)
-                 p = doc.get_page_pixmap(0)
-                 while p.width > max_width or p.height > max_height:
-                     # shrink(1) will halve each dimension
-                     p.shrink(1)
-                 data = p.tobytes(output='jpeg')
-                 thumb_base64 = base64.b64encode(data).decode()
-                 img_src = f'data:image/jpeg;base64,{thumb_base64}'
-                 inner_element = f"""
-                     <img style="object-fit: contain; border: 1px solid black;" src="{img_src}" />
-                 """
-             except Exception:
-                 logging.warning(f'Failed to produce PDF thumbnail {file_path}. Make sure you have PyMuPDF installed.')
+         thumb = self.make_document_thumbnail(file_path, max_width, max_height)
+         if thumb is not None:
+             with io.BytesIO() as buffer:
+                 thumb.save(buffer, 'webp')
+                 thumb_base64 = base64.b64encode(buffer.getvalue()).decode()
+             thumb_tag = f'data:image/webp;base64,{thumb_base64}'
+             inner_element = f'<img style="object-fit: contain; border: 1px solid black;" src="{thumb_tag}" />'

          return f"""
          <div class="pxt_document" style="width:{max_width}px;">
@@ -223,6 +223,28 @@
          </div>
          """

+     @classmethod
+     def make_document_thumbnail(
+         cls, file_path: str, max_width: int = 320, max_height: int = 320
+     ) -> Optional[Image.Image]:
+         """
+         Returns a thumbnail image of a document.
+         """
+         if file_path.lower().endswith('.pdf'):
+             try:
+                 import fitz  # type: ignore[import-untyped]
+
+                 doc = fitz.open(file_path)
+                 pixmap = doc.get_page_pixmap(0)
+                 while pixmap.width > max_width or pixmap.height > max_height:
+                     # shrink(1) will halve each dimension
+                     pixmap.shrink(1)
+                 return pixmap.pil_image()
+             except Exception:
+                 logging.warning(f'Failed to produce PDF thumbnail {file_path}. Make sure you have PyMuPDF installed.')
+
+         return None
+
      @classmethod
      def __create_source_tag(cls, http_address: str, file_path: str) -> str:
          src_url = get_file_uri(http_address, file_path)
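Note: __abbreviate is now public as Formatter.abbreviate(), with max_len defaulting to the private __STRING_MAX_LEN, so that TablePackager can reuse it for preview strings. Standalone, the rule behaves like this sketch (the default length and separator here are illustrative placeholders for the real class attributes):

def abbreviate(val: str, max_len: int = 32, sep: str = ' ...... ') -> str:
    if len(val) <= max_len:
        return val
    edgeitems = (max_len - len(sep)) // 2  # characters kept at each end
    return f'{val[:edgeitems]}{sep}{val[-edgeitems:]}'

assert abbreviate('short') == 'short'
assert len(abbreviate('x' * 1000, max_len=32)) == 32  # 12 + 8 + 12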
pixeltable/utils/sample.py ADDED
@@ -0,0 +1,25 @@
+ import sqlalchemy as sql
+
+ from pixeltable.func.udf import udf
+
+
+ @udf
+ def sample_key(seed: int, *key_fields: int) -> str:
+     """
+     Create a sample key from the given seed and key fields.
+
+     Args:
+         seed: The seed value.
+         key_fields: The rowids to include in the sample key.
+
+     Returns:
+         A string key for each row.
+     """
+     raise NotImplementedError('SampleKey creation is not implemented in python.')
+
+
+ @sample_key.to_sql
+ def _(seed: sql.ColumnElement, *key_fields: sql.ColumnElement) -> sql.ColumnElement:
+     from pixeltable.exec.sql_node import SqlSampleNode
+
+     return SqlSampleNode.key_sql_expr(seed, key_fields)
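Note: sample_key uses a SQL-only UDF pattern: the Python body raises NotImplementedError, and the @sample_key.to_sql registration supplies the translation applied at query-compile time, so the function can only run where it is pushed down to SQL. The same pattern applied to a hypothetical UDF (names and SQL expression invented for illustration):

import sqlalchemy as sql
from pixeltable.func.udf import udf

@udf
def row_digest(seed: int, rowid: int) -> str:
    raise NotImplementedError('row_digest is evaluated in SQL only.')

@row_digest.to_sql
def _(seed: sql.ColumnElement, rowid: sql.ColumnElement) -> sql.ColumnElement:
    # md5 over the concatenated operands, computed entirely in the database
    return sql.func.md5(sql.cast(seed, sql.Text) + sql.cast(rowid, sql.Text))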
{pixeltable-0.3.15.dist-info → pixeltable-0.4.0rc2.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: pixeltable
- Version: 0.3.15
+ Version: 0.4.0rc2
  Summary: AI Data Infrastructure: Declarative, Multimodal, and Incremental
  License: Apache-2.0
  Keywords: data-science,machine-learning,database,ai,computer-vision,chatbot,ml,artificial-intelligence,feature-engineering,multimodal,mlops,feature-store,vector-database,llm,genai
@@ -36,6 +36,7 @@ Requires-Dist: numpy (>=1.25)
  Requires-Dist: pandas (>=2.0,<3.0)
  Requires-Dist: pgvector (>=0.2.1)
  Requires-Dist: pillow (>=9.3.0)
+ Requires-Dist: pillow-heif (>=0.15.0)
  Requires-Dist: pixeltable-pgserver (==0.3.1)
  Requires-Dist: psutil (>=5.9.5)
  Requires-Dist: psycopg[binary] (>=3.1.18)