PyPI - pixeltable - Versions diffs - 0.4.0rc1__py3-none-any.whl → 0.4.0rc2__py3-none-any.whl - Mend

pixeltable 0.4.0rc1__py3-none-any.whl → 0.4.0rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of pixeltable might be problematic. Click here for more details.

Files changed (29) hide show

pixeltable/__version__.py +2 -2
pixeltable/catalog/catalog.py +4 -0
pixeltable/catalog/table.py +16 -0
pixeltable/catalog/table_version.py +17 -2
pixeltable/catalog/view.py +24 -1
pixeltable/dataframe.py +185 -9
pixeltable/env.py +2 -0
pixeltable/exec/__init__.py +1 -1
pixeltable/exec/expr_eval/evaluators.py +4 -1
pixeltable/exec/sql_node.py +152 -12
pixeltable/exprs/data_row.py +5 -3
pixeltable/exprs/expr.py +7 -0
pixeltable/exprs/literal.py +2 -0
pixeltable/func/tools.py +1 -1
pixeltable/functions/gemini.py +0 -1
pixeltable/globals.py +5 -0
pixeltable/metadata/__init__.py +11 -2
pixeltable/metadata/converters/convert_36.py +38 -0
pixeltable/metadata/notes.py +1 -0
pixeltable/metadata/schema.py +3 -0
pixeltable/plan.py +217 -10
pixeltable/share/packager.py +115 -6
pixeltable/utils/formatter.py +64 -42
pixeltable/utils/sample.py +25 -0
{pixeltable-0.4.0rc1.dist-info → pixeltable-0.4.0rc2.dist-info}/METADATA +2 -1
{pixeltable-0.4.0rc1.dist-info → pixeltable-0.4.0rc2.dist-info}/RECORD +29 -27
{pixeltable-0.4.0rc1.dist-info → pixeltable-0.4.0rc2.dist-info}/LICENSE +0 -0
{pixeltable-0.4.0rc1.dist-info → pixeltable-0.4.0rc2.dist-info}/WHEEL +0 -0
{pixeltable-0.4.0rc1.dist-info → pixeltable-0.4.0rc2.dist-info}/entry_points.txt +0 -0

pixeltable/plan.py CHANGED Viewed

@@ -3,7 +3,7 @@ from __future__ import annotations
 import dataclasses
 import enum
 from textwrap import dedent
-from typing import Any, Iterable, Literal, Optional, Sequence
+from typing import Any, Iterable, Literal, NamedTuple, Optional, Sequence
 from uuid import UUID
 import sqlalchemy as sql
@@ -12,6 +12,7 @@ import pixeltable as pxt
 from pixeltable import catalog, exceptions as excs, exec, exprs
 from pixeltable.catalog import Column, TableVersionHandle
 from pixeltable.exec.sql_node import OrderByClause, OrderByItem, combine_order_by_clauses, print_order_by_clause
+from pixeltable.utils.sample import sample_key
 def _is_agg_fn_call(e: exprs.Expr) -> bool:
@@ -75,6 +76,98 @@ class FromClause:
     tbls: list[catalog.TableVersionPath]
     join_clauses: list[JoinClause] = dataclasses.field(default_factory=list)
+    @property
+    def _first_tbl(self) -> catalog.TableVersionPath:
+        assert len(self.tbls) == 1
+        return self.tbls[0]
+@dataclasses.dataclass
+class SampleClause:
+    """Defines a sampling clause for a table."""
+    version: Optional[int]
+    n: Optional[int]
+    n_per_stratum: Optional[int]
+    fraction: Optional[float]
+    seed: Optional[int]
+    stratify_exprs: Optional[list[exprs.Expr]]
+    # This seed value is used if one is not supplied
+    DEFAULT_SEED = 0
+    # The version of the hashing algorithm used for ordering and fractional sampling.
+    CURRENT_VERSION = 1
+    def __post_init__(self) -> None:
+        """If no version was provided, provide the default version"""
+        if self.version is None:
+            self.version = self.CURRENT_VERSION
+        if self.seed is None:
+            self.seed = self.DEFAULT_SEED
+    @property
+    def is_stratified(self) -> bool:
+        """Check if the sampling is stratified"""
+        return self.stratify_exprs is not None and len(self.stratify_exprs) > 0
+    @property
+    def is_repeatable(self) -> bool:
+        """Return true if the same rows will continue to be sampled if source rows are added or deleted."""
+        return not self.is_stratified and self.fraction is not None
+    def display_str(self, inline: bool = False) -> str:
+        return str(self)
+    def as_dict(self) -> dict:
+        """Return a dictionary representation of the object"""
+        d = dataclasses.asdict(self)
+        d['_classname'] = self.__class__.__name__
+        if self.is_stratified:
+            d['stratify_exprs'] = [e.as_dict() for e in self.stratify_exprs]
+        return d
+    @classmethod
+    def from_dict(cls, d: dict) -> SampleClause:
+        """Create a SampleClause from a dictionary representation"""
+        d_cleaned = {key: value for key, value in d.items() if key != '_classname'}
+        s = cls(**d_cleaned)
+        if s.is_stratified:
+            s.stratify_exprs = [exprs.Expr.from_dict(e) for e in d_cleaned.get('stratify_exprs', [])]
+        return s
+    def __repr__(self) -> str:
+        s = ','.join(e.display_str(inline=True) for e in self.stratify_exprs)
+        return (
+            f'sample_{self.version}(n={self.n}, n_per_stratum={self.n_per_stratum}, '
+            f'fraction={self.fraction}, seed={self.seed}, [{s}])'
+        )
+    @classmethod
+    def fraction_to_md5_hex(cls, fraction: float) -> str:
+        """Return the string representation of an approximation (to ~1e-9) of a fraction of the total space
+        of md5 hash values.
+        This is used for fractional sampling.
+        """
+        # Maximum count for the upper 32 bits of MD5: 2^32
+        max_md5_value = (2**32) - 1
+        # Calculate the fraction of this value
+        threshold_int = max_md5_value * int(1_000_000_000 * fraction) // 1_000_000_000
+        # Convert to hexadecimal string with padding
+        return format(threshold_int, '08x') + 'ffffffffffffffffffffffff'
+class SamplingClauses(NamedTuple):
+    """Clauses provided when rewriting a SampleClause"""
+    where: exprs.Expr
+    group_by_clause: Optional[list[exprs.Expr]]
+    order_by_clause: Optional[list[tuple[exprs.Expr, bool]]]
+    limit: Optional[exprs.Expr]
+    sample_clause: Optional[SampleClause]
 class Analyzer:
     """
@@ -260,7 +353,7 @@ class Planner:
     # TODO: create an exec.CountNode and change this to create_count_plan()
     @classmethod
     def create_count_stmt(cls, tbl: catalog.TableVersionPath, where_clause: Optional[exprs.Expr] = None) -> sql.Select:
-        stmt = sql.select(sql.func.count())
+        stmt = sql.select(sql.func.count().label('all_count'))
         refd_tbl_ids: set[UUID] = set()
         if where_clause is not None:
             analyzer = cls.analyze(tbl, where_clause)
@@ -322,6 +415,13 @@ class Planner:
         )
         return plan
+    @classmethod
+    def rowid_columns(cls, target: TableVersionHandle, num_rowid_cols: Optional[int] = None) -> list[exprs.Expr]:
+        """Return list of RowidRef for the given number of associated rowids"""
+        if num_rowid_cols is None:
+            num_rowid_cols = target.get().num_rowid_columns()
+        return [exprs.RowidRef(target, i) for i in range(num_rowid_cols)]
     @classmethod
     def create_df_insert_plan(
         cls, tbl: catalog.TableVersion, df: 'pxt.DataFrame', ignore_errors: bool
@@ -591,7 +691,24 @@ class Planner:
         # 2. for component views: iterator args
         iterator_args = [target.iterator_args] if target.iterator_args is not None else []
-        row_builder = exprs.RowBuilder(iterator_args, stored_cols, [])
+        # If this contains a sample specification, modify / create where, group_by, order_by, and limit clauses
+        from_clause = FromClause(tbls=[view.base])
+        where, group_by_clause, order_by_clause, limit, sample_clause = cls.create_sample_clauses(
+            from_clause, target.sample_clause, target.predicate, None, [], None
+        )
+        # if we're propagating an insert, we only want to see those base rows that were created for the current version
+        base_analyzer = Analyzer(
+            from_clause,
+            iterator_args,
+            where_clause=where,
+            group_by_clause=group_by_clause,
+            order_by_clause=order_by_clause,
+        )
+        row_builder = exprs.RowBuilder(base_analyzer.all_exprs, stored_cols, [])
+        if target.sample_clause is not None and base_analyzer.filter is not None:
+            raise excs.Error(f'Filter {base_analyzer.filter} not expressible in SQL')
         # execution plan:
         # 1. materialize exprs computed from the base that are needed for stored view columns
@@ -603,13 +720,22 @@ class Planner:
             for e in row_builder.default_eval_ctx.target_exprs
             if e.is_bound_by([view]) and not e.is_bound_by([view.base])
         ]
-        # if we're propagating an insert, we only want to see those base rows that were created for the current version
-        base_analyzer = Analyzer(FromClause(tbls=[view.base]), base_output_exprs, where_clause=target.predicate)
+        # Create a new analyzer reflecting exactly what is required from the base table
+        base_analyzer = Analyzer(
+            from_clause,
+            base_output_exprs,
+            where_clause=where,
+            group_by_clause=group_by_clause,
+            order_by_clause=order_by_clause,
+        )
         base_eval_ctx = row_builder.create_eval_ctx(base_analyzer.all_exprs)
         plan = cls._create_query_plan(
             row_builder=row_builder,
             analyzer=base_analyzer,
             eval_ctx=base_eval_ctx,
+            limit=limit,
+            sample_clause=sample_clause,
             with_pk=True,
             exact_version_only=view.get_bases() if propagates_insert else [],
         )
@@ -692,6 +818,62 @@ class Planner:
         prefetch_node = exec.CachePrefetchNode(tbl_id, file_col_info, input_node)
         return prefetch_node
+    @classmethod
+    def create_sample_clauses(
+        cls,
+        from_clause: FromClause,
+        sample_clause: SampleClause,
+        where_clause: Optional[exprs.Expr],
+        group_by_clause: Optional[list[exprs.Expr]],
+        order_by_clause: Optional[list[tuple[exprs.Expr, bool]]],
+        limit: Optional[exprs.Expr],
+    ) -> SamplingClauses:
+        """tuple[
+            exprs.Expr,
+            Optional[list[exprs.Expr]],
+            Optional[list[tuple[exprs.Expr, bool]]],
+            Optional[exprs.Expr],
+            Optional[SampleClause],
+        ]:"""
+        """Construct clauses required for sampling under various conditions.
+        If there is no sampling, then return the original clauses.
+        If the sample is stratified, then return only the group by clause. The rest of the
+        mechanism for stratified sampling is provided by the SampleSqlNode.
+        If the sample is non-stratified, then rewrite the query to accommodate the supplied where clause,
+        and provide the other clauses required for sampling
+        """
+        # If no sample clause, return the original clauses
+        if sample_clause is None:
+            return SamplingClauses(where_clause, group_by_clause, order_by_clause, limit, None)
+        # If the sample clause is stratified, create a group by clause
+        if sample_clause.is_stratified:
+            group_by = sample_clause.stratify_exprs
+            # Note that limit is not possible here
+            return SamplingClauses(where_clause, group_by, order_by_clause, None, sample_clause)
+        else:
+            # If non-stratified sampling, construct a where clause, order_by, and limit clauses
+            # Construct an expression for sorting rows and limiting row counts
+            s_key = sample_key(
+                exprs.Literal(sample_clause.seed), *cls.rowid_columns(from_clause._first_tbl.tbl_version)
+            )
+            # Construct a suitable where clause
+            where = where_clause
+            if sample_clause.fraction is not None:
+                fraction_md5_hex = exprs.Expr.from_object(
+                    sample_clause.fraction_to_md5_hex(float(sample_clause.fraction))
+                )
+                f_where = s_key < fraction_md5_hex
+                where = where & f_where if where is not None else f_where
+            order_by: list[tuple[exprs.Expr, bool]] = [(s_key, True)]
+            limit = exprs.Literal(sample_clause.n)
+            # Note that group_by is not possible here
+            return SamplingClauses(where, None, order_by, limit, None)
     @classmethod
     def create_query_plan(
         cls,
@@ -701,6 +883,7 @@ class Planner:
         group_by_clause: Optional[list[exprs.Expr]] = None,
         order_by_clause: Optional[list[tuple[exprs.Expr, bool]]] = None,
         limit: Optional[exprs.Expr] = None,
+        sample_clause: Optional[SampleClause] = None,
         ignore_errors: bool = False,
         exact_version_only: Optional[list[catalog.TableVersionHandle]] = None,
     ) -> exec.ExecNode:
@@ -714,14 +897,22 @@ class Planner:
             order_by_clause = []
         if exact_version_only is None:
             exact_version_only = []
+        # Modify clauses to include sample clause
+        where, group_by_clause, order_by_clause, limit, sample = cls.create_sample_clauses(
+            from_clause, sample_clause, where_clause, group_by_clause, order_by_clause, limit
+        )
         analyzer = Analyzer(
             from_clause,
             select_list,
-            where_clause=where_clause,
+            where_clause=where,
             group_by_clause=group_by_clause,
             order_by_clause=order_by_clause,
         )
         row_builder = exprs.RowBuilder(analyzer.all_exprs, [], [])
+        if sample_clause is not None and analyzer.filter is not None:
+            raise excs.Error(f'Filter {analyzer.filter} not expressible in SQL')
         analyzer.finalize(row_builder)
         # select_list: we need to materialize everything that's been collected
@@ -732,6 +923,7 @@ class Planner:
             analyzer=analyzer,
             eval_ctx=eval_ctx,
             limit=limit,
+            sample_clause=sample,
             with_pk=True,
             exact_version_only=exact_version_only,
         )
@@ -747,6 +939,7 @@ class Planner:
         analyzer: Analyzer,
         eval_ctx: exprs.RowBuilder.EvalCtx,
         limit: Optional[exprs.Expr] = None,
+        sample_clause: Optional[SampleClause] = None,
         with_pk: bool = False,
         exact_version_only: Optional[list[catalog.TableVersionHandle]] = None,
     ) -> exec.ExecNode:
@@ -857,12 +1050,26 @@ class Planner:
                 sql_elements.contains_all(analyzer.select_list)
                 and sql_elements.contains_all(analyzer.grouping_exprs)
                 and isinstance(plan, exec.SqlNode)
-                and plan.to_cte() is not None
+                and plan.to_cte(keep_pk=(sample_clause is not None)) is not None
             ):
-                plan = exec.SqlAggregationNode(
-                    row_builder, input=plan, select_list=analyzer.select_list, group_by_items=analyzer.group_by_clause
-                )
+                if sample_clause is not None:
+                    plan = exec.SqlSampleNode(
+                        row_builder,
+                        input=plan,
+                        select_list=analyzer.select_list,
+                        stratify_exprs=analyzer.group_by_clause,
+                        sample_clause=sample_clause,
+                    )
+                else:
+                    plan = exec.SqlAggregationNode(
+                        row_builder,
+                        input=plan,
+                        select_list=analyzer.select_list,
+                        group_by_items=analyzer.group_by_clause,
+                    )
             else:
+                if sample_clause is not None:
+                    raise excs.Error('Sample clause not supported with Python aggregation')
                 input_sql_node = plan.get_node(exec.SqlNode)
                 assert combined_ordering is not None
                 input_sql_node.set_order_by(combined_ordering)

pixeltable/share/packager.py CHANGED Viewed

@@ -1,4 +1,7 @@
+import base64
 import datetime
+import io
+import itertools
 import json
 import logging
 import tarfile
@@ -10,15 +13,18 @@ from typing import Any, Iterator, Optional
 from uuid import UUID
 import more_itertools
+import numpy as np
+import PIL.Image
 import pyarrow as pa
 import pyarrow.parquet as pq
 import sqlalchemy as sql
 import pixeltable as pxt
-from pixeltable import catalog, exceptions as excs, metadata
+from pixeltable import catalog, exceptions as excs, metadata, type_system as ts
 from pixeltable.env import Env
 from pixeltable.metadata import schema
 from pixeltable.utils import sha256sum
+from pixeltable.utils.formatter import Formatter
 from pixeltable.utils.media_store import MediaStore
 _logger = logging.getLogger('pixeltable')
@@ -46,6 +52,10 @@ class TablePackager:
     media_files: dict[Path, str]  # Mapping from local media file paths to their tarball names
     md: dict[str, Any]
+    bundle_path: Path
+    preview_header: dict[str, str]
+    preview: list[list[Any]]
     def __init__(self, table: catalog.Table, additional_md: Optional[dict[str, Any]] = None) -> None:
         self.table = table
         self.tmp_dir = Path(Env.get().create_tmp_path())
@@ -67,7 +77,8 @@ class TablePackager:
         Export the table to a tarball containing Parquet tables and media files.
         """
         assert not self.tmp_dir.exists()  # Packaging can only be done once per TablePackager instance
-        _logger.info(f"Packaging table '{self.table._path()}' and its ancestors in: {self.tmp_dir}")
+        _logger.info(f'Packaging table {self.table._path()!r} and its ancestors in: {self.tmp_dir}')
         self.tmp_dir.mkdir()
         with open(self.tmp_dir / 'metadata.json', 'w', encoding='utf8') as fp:
             json.dump(self.md, fp)
@@ -75,12 +86,20 @@ class TablePackager:
         self.tables_dir.mkdir()
         with catalog.Catalog.get().begin_xact(for_write=False):
             for tv in self.table._tbl_version_path.get_tbl_versions():
-                _logger.info(f"Exporting table '{tv.get().versioned_name}'.")
+                _logger.info(f'Exporting table {tv.get().versioned_name!r}.')
                 self.__export_table(tv.get())
         _logger.info('Building archive.')
-        bundle_path = self.__build_tarball()
-        _logger.info(f'Packaging complete: {bundle_path}')
-        return bundle_path
+        self.bundle_path = self.__build_tarball()
+        _logger.info('Extracting preview data.')
+        self.md['count'] = self.table.count()
+        preview_header, preview = self.__extract_preview_data()
+        self.md['preview_header'] = preview_header
+        self.md['preview'] = preview
+        _logger.info(f'Packaging complete: {self.bundle_path}')
+        return self.bundle_path
     def __export_table(self, tv: catalog.TableVersion) -> None:
         """
@@ -207,6 +226,96 @@ class TablePackager:
                 tf.add(src_file, arcname=f'media/{dest_name}')
         return bundle_path
+    def __extract_preview_data(self) -> tuple[dict[str, str], list[list[Any]]]:
+        """
+        Extract a preview of the table data for display in the UI.
+        In order to bound the size of the output data, all "unbounded" data types are resized:
+        - Strings are abbreviated as per Formatter.abbreviate()
+        - Arrays and JSON are shortened and formatted as strings
+        - Images are resized to thumbnail size as a base64-encoded webp
+        - Videos are replaced by their first frame and resized as above
+        - Documents are replaced by a thumbnail as a base64-encoded webp
+        """
+        # First 8 columns
+        preview_cols = dict(itertools.islice(self.table._schema.items(), 0, 8))
+        select_list = [self.table[col_name] for col_name in preview_cols]
+        # First 5 rows
+        rows = list(self.table.select(*select_list).head(n=5))
+        preview_header = {col_name: str(col_type._type) for col_name, col_type in preview_cols.items()}
+        preview = [
+            [self.__encode_preview_data(val, col_type)]
+            for row in rows
+            for val, col_type in zip(row.values(), preview_cols.values())
+        ]
+        return preview_header, preview
+    def __encode_preview_data(self, val: Any, col_type: ts.ColumnType) -> Any:
+        if val is None:
+            return None
+        match col_type._type:
+            case ts.ColumnType.Type.STRING:
+                assert isinstance(val, str)
+                return Formatter.abbreviate(val)
+            case ts.ColumnType.Type.INT | ts.ColumnType.Type.FLOAT | ts.ColumnType.Type.BOOL:
+                return val
+            case ts.ColumnType.Type.TIMESTAMP | ts.ColumnType.Type.DATE:
+                return str(val)
+            case ts.ColumnType.Type.ARRAY:
+                assert isinstance(val, np.ndarray)
+                return Formatter.format_array(val)
+            case ts.ColumnType.Type.JSON:
+                # We need to escape the JSON string server-side for security reasons.
+                # Therefore we don't escape it here, in order to avoid double-escaping.
+                return Formatter.format_json(val, escape_strings=False)
+            case ts.ColumnType.Type.IMAGE:
+                # Rescale the image to minimize data transfer size
+                assert isinstance(val, PIL.Image.Image)
+                return self.__encode_image(val)
+            case ts.ColumnType.Type.VIDEO:
+                assert isinstance(val, str)
+                return self.__encode_video(val)
+            case ts.ColumnType.Type.AUDIO:
+                return None
+            case ts.ColumnType.Type.DOCUMENT:
+                assert isinstance(val, str)
+                return self.__encode_document(val)
+            case _:
+                raise AssertionError(f'Unrecognized column type: {col_type._type}')
+    def __encode_image(self, img: PIL.Image.Image) -> str:
+        # Heuristic for thumbnail sizing:
+        # Standardize on a width of 240 pixels (to most efficiently utilize the columnar display).
+        # But, if the aspect ratio is below 2:3, bound the height at 360 pixels (to avoid unboundedly tall thumbnails
+        #     in the case of highly oblong images).
+        if img.height > img.width * 1.5:
+            scaled_img = img.resize((img.width * 360 // img.height, 360))
+        else:
+            scaled_img = img.resize((240, img.height * 240 // img.width))
+        with io.BytesIO() as buffer:
+            scaled_img.save(buffer, 'webp')
+            return base64.b64encode(buffer.getvalue()).decode()
+    def __encode_video(self, video_path: str) -> Optional[str]:
+        thumb = Formatter.extract_first_video_frame(video_path)
+        return self.__encode_image(thumb) if thumb is not None else None
+    def __encode_document(self, doc_path: str) -> Optional[str]:
+        thumb = Formatter.make_document_thumbnail(doc_path)
+        return self.__encode_image(thumb) if thumb is not None else None
 class TableRestorer:
     """

pixeltable/utils/formatter.py CHANGED Viewed

@@ -63,10 +63,10 @@ class Formatter:
         """
         Escapes special characters in `val`, and abbreviates `val` if its length exceeds `_STRING_MAX_LEN`.
         """
-        return cls.__escape(cls.__abbreviate(val, cls.__STRING_MAX_LEN))
+        return cls.__escape(cls.abbreviate(val))
     @classmethod
-    def __abbreviate(cls, val: str, max_len: int) -> str:
+    def abbreviate(cls, val: str, max_len: int = __STRING_MAX_LEN) -> str:
         if len(val) > max_len:
             edgeitems = (max_len - len(cls.__STRING_SEP)) // 2
             return f'{val[:edgeitems]}{cls.__STRING_SEP}{val[-edgeitems:]}'
@@ -94,41 +94,45 @@ class Formatter:
         )
     @classmethod
-    def format_json(cls, val: Any) -> str:
+    def format_json(cls, val: Any, escape_strings: bool = True) -> str:
         if isinstance(val, str):
             # JSON-like formatting will be applied to strings that appear nested within a list or dict
             # (quote the string; escape any quotes inside the string; shorter abbreviations).
             # However, if the string appears in top-level position (i.e., the entire JSON value is a
             # string), then we format it like an ordinary string.
-            return cls.format_string(val)
+            return cls.format_string(val) if escape_strings else cls.abbreviate(val)
         # In all other cases, dump the JSON struct recursively.
-        return cls.__format_json_rec(val)
+        return cls.__format_json_rec(val, escape_strings)
     @classmethod
-    def __format_json_rec(cls, val: Any) -> str:
+    def __format_json_rec(cls, val: Any, escape_strings: bool) -> str:
         if isinstance(val, str):
-            return cls.__escape(json.dumps(cls.__abbreviate(val, cls.__NESTED_STRING_MAX_LEN)))
+            formatted = json.dumps(cls.abbreviate(val, cls.__NESTED_STRING_MAX_LEN))
+            return cls.__escape(formatted) if escape_strings else formatted
         if isinstance(val, float):
             return cls.format_float(val)
         if isinstance(val, np.ndarray):
             return cls.format_array(val)
         if isinstance(val, list):
             if len(val) < cls.__LIST_THRESHOLD:
-                components = [cls.__format_json_rec(x) for x in val]
+                components = [cls.__format_json_rec(x, escape_strings) for x in val]
             else:
-                components = [cls.__format_json_rec(x) for x in val[: cls.__LIST_EDGEITEMS]]
+                components = [cls.__format_json_rec(x, escape_strings) for x in val[: cls.__LIST_EDGEITEMS]]
                 components.append('...')
-                components.extend(cls.__format_json_rec(x) for x in val[-cls.__LIST_EDGEITEMS :])
+                components.extend(cls.__format_json_rec(x, escape_strings) for x in val[-cls.__LIST_EDGEITEMS :])
             return '[' + ', '.join(components) + ']'
         if isinstance(val, dict):
-            kv_pairs = (f'{cls.__format_json_rec(k)}: {cls.__format_json_rec(v)}' for k, v in val.items())
+            kv_pairs = (
+                f'{cls.__format_json_rec(k, escape_strings)}: {cls.__format_json_rec(v, escape_strings)}'
+                for k, v in val.items()
+            )
             return '{' + ', '.join(kv_pairs) + '}'
         # Everything else
         try:
             return json.dumps(val)
         except TypeError:  # Not JSON serializable
-            return str(val)
+            return cls.__escape(str(val))
     def format_img(self, img: Image.Image) -> str:
         """
@@ -152,22 +156,19 @@ class Formatter:
             """
     def format_video(self, file_path: str) -> str:
-        thumb_tag = ''
         # Attempt to extract the first frame of the video to use as a thumbnail,
         # so that the notebook can be exported as HTML and viewed in contexts where
         # the video itself is not accessible.
         # TODO(aaron-siegel): If the video is backed by a concrete external URL,
         # should we link to that instead?
-        with av.open(file_path) as container:
-            try:
-                thumb = next(container.decode(video=0)).to_image()
-                assert isinstance(thumb, Image.Image)
-                with io.BytesIO() as buffer:
-                    thumb.save(buffer, 'jpeg')
-                    thumb_base64 = base64.b64encode(buffer.getvalue()).decode()
-                    thumb_tag = f'poster="data:image/jpeg;base64,{thumb_base64}"'
-            except Exception:
-                pass
+        thumb = self.extract_first_video_frame(file_path)
+        if thumb is None:
+            thumb_tag = ''
+        else:
+            with io.BytesIO() as buffer:
+                thumb.save(buffer, 'jpeg')
+                thumb_base64 = base64.b64encode(buffer.getvalue()).decode()
+                thumb_tag = f'poster="data:image/jpeg;base64,{thumb_base64}"'
         if self.__num_rows > 1:
             width = 320
         elif self.__num_cols > 1:
@@ -182,6 +183,16 @@ class Formatter:
         </div>
         """
+    @classmethod
+    def extract_first_video_frame(cls, file_path: str) -> Optional[Image.Image]:
+        with av.open(file_path) as container:
+            try:
+                img = next(container.decode(video=0)).to_image()
+                assert isinstance(img, Image.Image)
+                return img
+            except Exception:
+                return None
     def format_audio(self, file_path: str) -> str:
         return f"""
         <div class="pxt_audio">
@@ -191,29 +202,18 @@ class Formatter:
         </div>
         """
-    def format_document(self, file_path: str) -> str:
-        max_width = max_height = 320
+    def format_document(self, file_path: str, max_width: int = 320, max_height: int = 320) -> str:
         # by default, file path will be shown as a link
         inner_element = file_path
         inner_element = html.escape(inner_element)
-        # try generating a thumbnail for different types and use that if successful
-        if file_path.lower().endswith('.pdf'):
-            try:
-                import fitz  # type: ignore[import-untyped]
-                doc = fitz.open(file_path)
-                p = doc.get_page_pixmap(0)
-                while p.width > max_width or p.height > max_height:
-                    # shrink(1) will halve each dimension
-                    p.shrink(1)
-                data = p.tobytes(output='jpeg')
-                thumb_base64 = base64.b64encode(data).decode()
-                img_src = f'data:image/jpeg;base64,{thumb_base64}'
-                inner_element = f"""
-                    <img style="object-fit: contain; border: 1px solid black;" src="{img_src}" />
-                """
-            except Exception:
-                logging.warning(f'Failed to produce PDF thumbnail {file_path}. Make sure you have PyMuPDF installed.')
+        thumb = self.make_document_thumbnail(file_path, max_width, max_height)
+        if thumb is not None:
+            with io.BytesIO() as buffer:
+                thumb.save(buffer, 'webp')
+                thumb_base64 = base64.b64encode(buffer.getvalue()).decode()
+                thumb_tag = f'data:image/webp;base64,{thumb_base64}'
+            inner_element = f'<img style="object-fit: contain; border: 1px solid black;" src="{thumb_tag}" />'
         return f"""
         <div class="pxt_document" style="width:{max_width}px;">
@@ -223,6 +223,28 @@ class Formatter:
         </div>
         """
+    @classmethod
+    def make_document_thumbnail(
+        cls, file_path: str, max_width: int = 320, max_height: int = 320
+    ) -> Optional[Image.Image]:
+        """
+        Returns a thumbnail image of a document.
+        """
+        if file_path.lower().endswith('.pdf'):
+            try:
+                import fitz  # type: ignore[import-untyped]
+                doc = fitz.open(file_path)
+                pixmap = doc.get_page_pixmap(0)
+                while pixmap.width > max_width or pixmap.height > max_height:
+                    # shrink(1) will halve each dimension
+                    pixmap.shrink(1)
+                return pixmap.pil_image()
+            except Exception:
+                logging.warning(f'Failed to produce PDF thumbnail {file_path}. Make sure you have PyMuPDF installed.')
+        return None
     @classmethod
     def __create_source_tag(cls, http_address: str, file_path: str) -> str:
         src_url = get_file_uri(http_address, file_path)

pixeltable 0.4.0rc1__py3-none-any.whl → 0.4.0rc2__py3-none-any.whl

Potentially problematic release.

pixeltable 0.4.0rc1__py3-none-any.whl → 0.4.0rc2__py3-none-any.whl