pixeltable 0.3.15__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pixeltable/__init__.py +1 -1
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/__init__.py +9 -1
- pixeltable/catalog/catalog.py +559 -134
- pixeltable/catalog/column.py +36 -32
- pixeltable/catalog/dir.py +1 -2
- pixeltable/catalog/globals.py +12 -0
- pixeltable/catalog/insertable_table.py +30 -25
- pixeltable/catalog/schema_object.py +9 -6
- pixeltable/catalog/table.py +334 -267
- pixeltable/catalog/table_version.py +358 -241
- pixeltable/catalog/table_version_handle.py +18 -2
- pixeltable/catalog/table_version_path.py +86 -16
- pixeltable/catalog/view.py +47 -23
- pixeltable/dataframe.py +198 -19
- pixeltable/env.py +6 -4
- pixeltable/exceptions.py +6 -0
- pixeltable/exec/__init__.py +1 -1
- pixeltable/exec/exec_node.py +2 -0
- pixeltable/exec/expr_eval/evaluators.py +4 -1
- pixeltable/exec/expr_eval/expr_eval_node.py +4 -4
- pixeltable/exec/in_memory_data_node.py +1 -1
- pixeltable/exec/sql_node.py +188 -22
- pixeltable/exprs/column_property_ref.py +16 -6
- pixeltable/exprs/column_ref.py +33 -11
- pixeltable/exprs/comparison.py +1 -1
- pixeltable/exprs/data_row.py +5 -3
- pixeltable/exprs/expr.py +11 -4
- pixeltable/exprs/literal.py +2 -0
- pixeltable/exprs/row_builder.py +4 -6
- pixeltable/exprs/rowid_ref.py +8 -0
- pixeltable/exprs/similarity_expr.py +1 -0
- pixeltable/func/__init__.py +1 -0
- pixeltable/func/mcp.py +74 -0
- pixeltable/func/query_template_function.py +5 -3
- pixeltable/func/tools.py +12 -2
- pixeltable/func/udf.py +2 -2
- pixeltable/functions/__init__.py +1 -0
- pixeltable/functions/anthropic.py +19 -45
- pixeltable/functions/deepseek.py +19 -38
- pixeltable/functions/fireworks.py +9 -18
- pixeltable/functions/gemini.py +2 -3
- pixeltable/functions/groq.py +108 -0
- pixeltable/functions/llama_cpp.py +6 -6
- pixeltable/functions/mistralai.py +16 -53
- pixeltable/functions/ollama.py +1 -1
- pixeltable/functions/openai.py +82 -165
- pixeltable/functions/string.py +212 -58
- pixeltable/functions/together.py +22 -80
- pixeltable/globals.py +10 -4
- pixeltable/index/base.py +5 -0
- pixeltable/index/btree.py +5 -0
- pixeltable/index/embedding_index.py +5 -0
- pixeltable/io/external_store.py +10 -31
- pixeltable/io/label_studio.py +5 -5
- pixeltable/io/parquet.py +2 -2
- pixeltable/io/table_data_conduit.py +1 -32
- pixeltable/metadata/__init__.py +11 -2
- pixeltable/metadata/converters/convert_13.py +2 -2
- pixeltable/metadata/converters/convert_30.py +6 -11
- pixeltable/metadata/converters/convert_35.py +9 -0
- pixeltable/metadata/converters/convert_36.py +38 -0
- pixeltable/metadata/converters/convert_37.py +15 -0
- pixeltable/metadata/converters/util.py +3 -9
- pixeltable/metadata/notes.py +3 -0
- pixeltable/metadata/schema.py +13 -1
- pixeltable/plan.py +135 -12
- pixeltable/share/packager.py +138 -14
- pixeltable/share/publish.py +2 -2
- pixeltable/store.py +19 -13
- pixeltable/type_system.py +30 -0
- pixeltable/utils/dbms.py +1 -1
- pixeltable/utils/formatter.py +64 -42
- {pixeltable-0.3.15.dist-info → pixeltable-0.4.0.dist-info}/METADATA +2 -1
- {pixeltable-0.3.15.dist-info → pixeltable-0.4.0.dist-info}/RECORD +78 -73
- {pixeltable-0.3.15.dist-info → pixeltable-0.4.0.dist-info}/LICENSE +0 -0
- {pixeltable-0.3.15.dist-info → pixeltable-0.4.0.dist-info}/WHEEL +0 -0
- {pixeltable-0.3.15.dist-info → pixeltable-0.4.0.dist-info}/entry_points.txt +0 -0
pixeltable/plan.py
CHANGED

@@ -75,6 +75,88 @@ class FromClause:
     tbls: list[catalog.TableVersionPath]
     join_clauses: list[JoinClause] = dataclasses.field(default_factory=list)
 
+    @property
+    def _first_tbl(self) -> catalog.TableVersionPath:
+        assert len(self.tbls) == 1
+        return self.tbls[0]
+
+
+@dataclasses.dataclass
+class SampleClause:
+    """Defines a sampling clause for a table."""
+
+    version: Optional[int]
+    n: Optional[int]
+    n_per_stratum: Optional[int]
+    fraction: Optional[float]
+    seed: Optional[int]
+    stratify_exprs: Optional[list[exprs.Expr]]
+
+    # This seed value is used if one is not supplied
+    DEFAULT_SEED = 0
+
+    # The version of the hashing algorithm used for ordering and fractional sampling.
+    CURRENT_VERSION = 1
+
+    def __post_init__(self) -> None:
+        """If no version was provided, provide the default version"""
+        if self.version is None:
+            self.version = self.CURRENT_VERSION
+        if self.seed is None:
+            self.seed = self.DEFAULT_SEED
+
+    @property
+    def is_stratified(self) -> bool:
+        """Check if the sampling is stratified"""
+        return self.stratify_exprs is not None and len(self.stratify_exprs) > 0
+
+    @property
+    def is_repeatable(self) -> bool:
+        """Return true if the same rows will continue to be sampled if source rows are added or deleted."""
+        return not self.is_stratified and self.fraction is not None
+
+    def display_str(self, inline: bool = False) -> str:
+        return str(self)
+
+    def as_dict(self) -> dict:
+        """Return a dictionary representation of the object"""
+        d = dataclasses.asdict(self)
+        d['_classname'] = self.__class__.__name__
+        if self.is_stratified:
+            d['stratify_exprs'] = [e.as_dict() for e in self.stratify_exprs]
+        return d
+
+    @classmethod
+    def from_dict(cls, d: dict) -> SampleClause:
+        """Create a SampleClause from a dictionary representation"""
+        d_cleaned = {key: value for key, value in d.items() if key != '_classname'}
+        s = cls(**d_cleaned)
+        if s.is_stratified:
+            s.stratify_exprs = [exprs.Expr.from_dict(e) for e in d_cleaned.get('stratify_exprs', [])]
+        return s
+
+    def __repr__(self) -> str:
+        s = ','.join(e.display_str(inline=True) for e in self.stratify_exprs)
+        return (
+            f'sample_{self.version}(n={self.n}, n_per_stratum={self.n_per_stratum}, '
+            f'fraction={self.fraction}, seed={self.seed}, [{s}])'
+        )
+
+    @classmethod
+    def fraction_to_md5_hex(cls, fraction: float) -> str:
+        """Return the string representation of an approximation (to ~1e-9) of a fraction of the total space
+        of md5 hash values.
+        This is used for fractional sampling.
+        """
+        # Maximum count for the upper 32 bits of MD5: 2^32
+        max_md5_value = (2**32) - 1
+
+        # Calculate the fraction of this value
+        threshold_int = max_md5_value * int(1_000_000_000 * fraction) // 1_000_000_000
+
+        # Convert to hexadecimal string with padding
+        return format(threshold_int, '08x') + 'ffffffffffffffffffffffff'
+
 
 class Analyzer:
     """
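SampleClause is the backbone of the new sampling support in 0.4.0. A standalone sketch (not part of the diff) of the non-stratified fractional case, assuming only that SampleClause is importable from pixeltable.plan, where this hunk defines it:

# Standalone sketch, not from the diff: SampleClause as added to pixeltable/plan.py in 0.4.0.
from pixeltable.plan import SampleClause

# non-stratified fractional sample: keep ~10% of rows
clause = SampleClause(version=None, n=None, n_per_stratum=None, fraction=0.1, seed=None, stratify_exprs=None)
assert clause.version == SampleClause.CURRENT_VERSION  # filled in by __post_init__
assert clause.seed == SampleClause.DEFAULT_SEED
assert not clause.is_stratified
assert clause.is_repeatable  # fractional + non-stratified: stable under inserts/deletes
assert SampleClause.from_dict(clause.as_dict()) == clause  # serialization round-trip

# threshold covering ~10% of the md5 hash space (0.1 of the 32-bit prefix, padded to 32 hex digits)
print(SampleClause.fraction_to_md5_hex(0.1))  # -> 19999999ffffffffffffffffffffffff

Fractional sampling thus reduces to a hex-string comparison: under a uniform hash, roughly 10% of digests sort below the printed threshold, and inserting or deleting other rows cannot change a given row's digest, which is why is_repeatable holds for this case.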
@@ -87,6 +169,8 @@
     group_by_clause: Optional[list[exprs.Expr]]  # None for non-aggregate queries; [] for agg query w/o grouping
     grouping_exprs: list[exprs.Expr]  # [] for non-aggregate queries or agg query w/o grouping
     order_by_clause: OrderByClause
+    stratify_exprs: list[exprs.Expr]  # [] if no stratiifcation is required
+    sample_clause: Optional[SampleClause]  # None if no sampling clause is present
 
     sql_elements: exprs.SqlElementCache
 
@@ -107,6 +191,7 @@
         where_clause: Optional[exprs.Expr] = None,
         group_by_clause: Optional[list[exprs.Expr]] = None,
         order_by_clause: Optional[list[tuple[exprs.Expr, bool]]] = None,
+        sample_clause: Optional[SampleClause] = None,
     ):
         if order_by_clause is None:
             order_by_clause = []
@@ -120,6 +205,11 @@
         self.group_by_clause = (
             [e.resolve_computed_cols() for e in group_by_clause] if group_by_clause is not None else None
         )
+        self.sample_clause = sample_clause
+        if self.sample_clause is not None and self.sample_clause.is_stratified:
+            self.stratify_exprs = [e.resolve_computed_cols() for e in sample_clause.stratify_exprs]
+        else:
+            self.stratify_exprs = []
         self.order_by_clause = [OrderByItem(e.resolve_computed_cols(), asc) for e, asc in order_by_clause]
 
         self.sql_where_clause = None
@@ -135,8 +225,11 @@
             self.all_exprs.append(join_clause.join_predicate)
         if self.group_by_clause is not None:
             self.all_exprs.extend(self.group_by_clause)
+        self.all_exprs.extend(self.stratify_exprs)
         self.all_exprs.extend(e for e, _ in self.order_by_clause)
         if self.filter is not None:
+            if sample_clause is not None:
+                raise excs.Error(f'Filter {self.filter} not expressible in SQL')
             self.all_exprs.append(self.filter)
 
         self.agg_order_by = []
@@ -260,7 +353,7 @@ class Planner:
     # TODO: create an exec.CountNode and change this to create_count_plan()
     @classmethod
     def create_count_stmt(cls, tbl: catalog.TableVersionPath, where_clause: Optional[exprs.Expr] = None) -> sql.Select:
-        stmt = sql.select(sql.func.count())
+        stmt = sql.select(sql.func.count().label('all_count'))
        refd_tbl_ids: set[UUID] = set()
        if where_clause is not None:
            analyzer = cls.analyze(tbl, where_clause)
@@ -289,7 +382,7 @@
 
         # create InMemoryDataNode for 'rows'
         plan: exec.ExecNode = exec.InMemoryDataNode(
-            TableVersionHandle(tbl.id, tbl.effective_version), rows, row_builder, tbl.
+            TableVersionHandle(tbl.id, tbl.effective_version), rows, row_builder, tbl.next_row_id
         )
 
         media_input_col_info = [
@@ -322,6 +415,13 @@
         )
         return plan
 
+    @classmethod
+    def rowid_columns(cls, target: TableVersionHandle, num_rowid_cols: Optional[int] = None) -> list[exprs.Expr]:
+        """Return list of RowidRef for the given number of associated rowids"""
+        if num_rowid_cols is None:
+            num_rowid_cols = target.get().num_rowid_columns()
+        return [exprs.RowidRef(target, i) for i in range(num_rowid_cols)]
+
     @classmethod
     def create_df_insert_plan(
         cls, tbl: catalog.TableVersion, df: 'pxt.DataFrame', ignore_errors: bool
@@ -385,7 +485,7 @@
 
         cls.__check_valid_columns(tbl.tbl_version.get(), recomputed_cols, 'updated in')
 
-        recomputed_base_cols = {col for col in recomputed_cols if col.tbl == tbl.tbl_version}
+        recomputed_base_cols = {col for col in recomputed_cols if col.tbl.id == tbl.tbl_version.id}
         copied_cols = [
             col
             for col in target.cols_by_id.values()
@@ -409,7 +509,7 @@
         for i, col in enumerate(all_base_cols):
             plan.row_builder.add_table_column(col, select_list[i].slot_idx)
         recomputed_user_cols = [c for c in recomputed_cols if c.name is not None]
-        return plan, [f'{c.tbl.
+        return plan, [f'{c.tbl.name}.{c.name}' for c in updated_cols + recomputed_user_cols], recomputed_user_cols
 
     @classmethod
     def __check_valid_columns(
@@ -465,7 +565,7 @@
         recomputed_cols.update(idx_val_cols)
         # we only need to recompute stored columns (unstored ones are substituted away)
         recomputed_cols = {c for c in recomputed_cols if c.is_stored}
-        recomputed_base_cols = {col for col in recomputed_cols if col.tbl == target}
+        recomputed_base_cols = {col for col in recomputed_cols if col.tbl.id == target.id}
         copied_cols = [
             col
             for col in target.cols_by_id.values()
@@ -591,8 +691,13 @@
         # 2. for component views: iterator args
         iterator_args = [target.iterator_args] if target.iterator_args is not None else []
 
-
+        from_clause = FromClause(tbls=[view.base])
+        base_analyzer = Analyzer(
+            from_clause, iterator_args, where_clause=target.predicate, sample_clause=target.sample_clause
+        )
+        row_builder = exprs.RowBuilder(base_analyzer.all_exprs, stored_cols, [])
 
+        # if we're propagating an insert, we only want to see those base rows that were created for the current version
         # execution plan:
         # 1. materialize exprs computed from the base that are needed for stored view columns
         # 2. if it's an iterator view, expand the base rows into component rows
@@ -603,8 +708,11 @@
             for e in row_builder.default_eval_ctx.target_exprs
             if e.is_bound_by([view]) and not e.is_bound_by([view.base])
         ]
-
-
+
+        # Create a new analyzer reflecting exactly what is required from the base table
+        base_analyzer = Analyzer(
+            from_clause, base_output_exprs, where_clause=target.predicate, sample_clause=target.sample_clause
+        )
         base_eval_ctx = row_builder.create_eval_ctx(base_analyzer.all_exprs)
         plan = cls._create_query_plan(
             row_builder=row_builder,
@@ -701,6 +809,7 @@
         group_by_clause: Optional[list[exprs.Expr]] = None,
         order_by_clause: Optional[list[tuple[exprs.Expr, bool]]] = None,
         limit: Optional[exprs.Expr] = None,
+        sample_clause: Optional[SampleClause] = None,
         ignore_errors: bool = False,
         exact_version_only: Optional[list[catalog.TableVersionHandle]] = None,
     ) -> exec.ExecNode:
@@ -714,12 +823,14 @@
             order_by_clause = []
         if exact_version_only is None:
             exact_version_only = []
+
         analyzer = Analyzer(
             from_clause,
             select_list,
             where_clause=where_clause,
             group_by_clause=group_by_clause,
             order_by_clause=order_by_clause,
+            sample_clause=sample_clause,
         )
         row_builder = exprs.RowBuilder(analyzer.all_exprs, [], [])
 
@@ -773,6 +884,7 @@
         # - join clause subexprs
         # - subexprs of Where clause conjuncts that can't be run in SQL
         # - all grouping exprs
+        # - all stratify exprs
         candidates = list(
             exprs.Expr.list_subexprs(
                 analyzer.select_list,
@@ -787,10 +899,12 @@
             candidates.extend(
                 exprs.Expr.subexprs(analyzer.filter, filter=sql_elements.contains, traverse_matches=False)
             )
-
-
-
-
+        candidates.extend(
+            exprs.Expr.list_subexprs(analyzer.grouping_exprs, filter=sql_elements.contains, traverse_matches=False)
+        )
+        candidates.extend(
+            exprs.Expr.list_subexprs(analyzer.stratify_exprs, filter=sql_elements.contains, traverse_matches=False)
+        )
         # not isinstance(...): we don't want to materialize Literals via a Select
         sql_exprs = exprs.ExprSet(e for e in candidates if not isinstance(e, exprs.Literal))
 
@@ -835,6 +949,15 @@
         # we need to order the input for window functions
         plan.set_order_by(analyzer.get_window_fn_ob_clause())
 
+        if analyzer.sample_clause is not None:
+            plan = exec.SqlSampleNode(
+                row_builder,
+                input=plan,
+                select_list=tbl_scan_exprs,
+                sample_clause=analyzer.sample_clause,
+                stratify_exprs=analyzer.stratify_exprs,
+            )
+
         plan = cls._insert_prefetch_node(tbl.tbl_version.id, row_builder, plan)
 
         if analyzer.group_by_clause is not None:
pixeltable/share/packager.py
CHANGED

@@ -1,4 +1,7 @@
+import base64
 import datetime
+import io
+import itertools
 import json
 import logging
 import tarfile
@@ -7,17 +10,21 @@ import urllib.request
 import uuid
 from pathlib import Path
 from typing import Any, Iterator, Optional
+from uuid import UUID
 
 import more_itertools
+import numpy as np
+import PIL.Image
 import pyarrow as pa
 import pyarrow.parquet as pq
 import sqlalchemy as sql
 
 import pixeltable as pxt
-from pixeltable import catalog, exceptions as excs, metadata
+from pixeltable import catalog, exceptions as excs, metadata, type_system as ts
 from pixeltable.env import Env
 from pixeltable.metadata import schema
 from pixeltable.utils import sha256sum
+from pixeltable.utils.formatter import Formatter
 from pixeltable.utils.media_store import MediaStore
 
 _logger = logging.getLogger('pixeltable')
@@ -45,13 +52,17 @@ class TablePackager:
     media_files: dict[Path, str]  # Mapping from local media file paths to their tarball names
     md: dict[str, Any]
 
+    bundle_path: Path
+    preview_header: dict[str, str]
+    preview: list[list[Any]]
+
     def __init__(self, table: catalog.Table, additional_md: Optional[dict[str, Any]] = None) -> None:
         self.table = table
         self.tmp_dir = Path(Env.get().create_tmp_path())
         self.media_files = {}
 
         # Load metadata
-        with
+        with catalog.Catalog.get().begin_xact(for_write=False):
             tbl_md = catalog.Catalog.get().load_replica_md(table)
             self.md = {
                 'pxt_version': pxt.__version__,
@@ -66,20 +77,29 @@ class TablePackager:
         Export the table to a tarball containing Parquet tables and media files.
         """
         assert not self.tmp_dir.exists()  # Packaging can only be done once per TablePackager instance
-
+
+        _logger.info(f'Packaging table {self.table._path()!r} and its ancestors in: {self.tmp_dir}')
         self.tmp_dir.mkdir()
         with open(self.tmp_dir / 'metadata.json', 'w', encoding='utf8') as fp:
             json.dump(self.md, fp)
         self.tables_dir = self.tmp_dir / 'tables'
         self.tables_dir.mkdir()
-        with
+        with catalog.Catalog.get().begin_xact(for_write=False):
             for tv in self.table._tbl_version_path.get_tbl_versions():
-                _logger.info(f
+                _logger.info(f'Exporting table {tv.get().versioned_name!r}.')
                 self.__export_table(tv.get())
+
         _logger.info('Building archive.')
-        bundle_path = self.__build_tarball()
-
-
+        self.bundle_path = self.__build_tarball()
+
+        _logger.info('Extracting preview data.')
+        self.md['count'] = self.table.count()
+        preview_header, preview = self.__extract_preview_data()
+        self.md['preview_header'] = preview_header
+        self.md['preview'] = preview
+
+        _logger.info(f'Packaging complete: {self.bundle_path}')
+        return self.bundle_path
 
     def __export_table(self, tv: catalog.TableVersion) -> None:
         """
@@ -107,7 +127,7 @@
         # We use snappy compression for the Parquet tables; the entire bundle will be bzip2-compressed later, so
         # faster compression should provide good performance while still reducing temporary storage utilization.
         parquet_writer = pq.ParquetWriter(parquet_file, parquet_schema, compression='SNAPPY')
-        filter_tv = self.table.
+        filter_tv = self.table._tbl_version_path.tbl_version.get()
         row_iter = tv.store_tbl.dump_rows(tv.version, filter_tv.store_tbl, filter_tv.version)
         for pa_table in self.__to_pa_tables(row_iter, sql_types, media_cols, parquet_schema):
             parquet_writer.write_table(pa_table)
@@ -206,6 +226,96 @@
             tf.add(src_file, arcname=f'media/{dest_name}')
         return bundle_path
 
+    def __extract_preview_data(self) -> tuple[dict[str, str], list[list[Any]]]:
+        """
+        Extract a preview of the table data for display in the UI.
+
+        In order to bound the size of the output data, all "unbounded" data types are resized:
+        - Strings are abbreviated as per Formatter.abbreviate()
+        - Arrays and JSON are shortened and formatted as strings
+        - Images are resized to thumbnail size as a base64-encoded webp
+        - Videos are replaced by their first frame and resized as above
+        - Documents are replaced by a thumbnail as a base64-encoded webp
+        """
+        # First 8 columns
+        preview_cols = dict(itertools.islice(self.table._get_schema().items(), 0, 8))
+        select_list = [self.table[col_name] for col_name in preview_cols]
+        # First 5 rows
+        rows = list(self.table.select(*select_list).head(n=5))
+
+        preview_header = {col_name: str(col_type._type) for col_name, col_type in preview_cols.items()}
+        preview = [
+            [self.__encode_preview_data(val, col_type)]
+            for row in rows
+            for val, col_type in zip(row.values(), preview_cols.values())
+        ]
+
+        return preview_header, preview
+
+    def __encode_preview_data(self, val: Any, col_type: ts.ColumnType) -> Any:
+        if val is None:
+            return None
+
+        match col_type._type:
+            case ts.ColumnType.Type.STRING:
+                assert isinstance(val, str)
+                return Formatter.abbreviate(val)
+
+            case ts.ColumnType.Type.INT | ts.ColumnType.Type.FLOAT | ts.ColumnType.Type.BOOL:
+                return val
+
+            case ts.ColumnType.Type.TIMESTAMP | ts.ColumnType.Type.DATE:
+                return str(val)
+
+            case ts.ColumnType.Type.ARRAY:
+                assert isinstance(val, np.ndarray)
+                return Formatter.format_array(val)
+
+            case ts.ColumnType.Type.JSON:
+                # We need to escape the JSON string server-side for security reasons.
+                # Therefore we don't escape it here, in order to avoid double-escaping.
+                return Formatter.format_json(val, escape_strings=False)
+
+            case ts.ColumnType.Type.IMAGE:
+                # Rescale the image to minimize data transfer size
+                assert isinstance(val, PIL.Image.Image)
+                return self.__encode_image(val)
+
+            case ts.ColumnType.Type.VIDEO:
+                assert isinstance(val, str)
+                return self.__encode_video(val)
+
+            case ts.ColumnType.Type.AUDIO:
+                return None
+
+            case ts.ColumnType.Type.DOCUMENT:
+                assert isinstance(val, str)
+                return self.__encode_document(val)
+
+            case _:
+                raise AssertionError(f'Unrecognized column type: {col_type._type}')
+
+    def __encode_image(self, img: PIL.Image.Image) -> str:
+        # Heuristic for thumbnail sizing:
+        # Standardize on a width of 240 pixels (to most efficiently utilize the columnar display).
+        # But, if the aspect ratio is below 2:3, bound the height at 360 pixels (to avoid unboundedly tall thumbnails
+        # in the case of highly oblong images).
+        if img.height > img.width * 1.5:
+            scaled_img = img.resize((img.width * 360 // img.height, 360))
+        else:
+            scaled_img = img.resize((240, img.height * 240 // img.width))
+        with io.BytesIO() as buffer:
+            scaled_img.save(buffer, 'webp')
+            return base64.b64encode(buffer.getvalue()).decode()
+
+    def __encode_video(self, video_path: str) -> Optional[str]:
+        thumb = Formatter.extract_first_video_frame(video_path)
+        return self.__encode_image(thumb) if thumb is not None else None
+
+    def __encode_document(self, doc_path: str) -> Optional[str]:
+        thumb = Formatter.make_document_thumbnail(doc_path)
+        return self.__encode_image(thumb) if thumb is not None else None
+
 
 class TableRestorer:
     """
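Note the sizing heuristic in __encode_image: width is pinned to 240px, with height capped at 360px for images taller than 2:3. A standalone sketch (not from the diff; encode_thumbnail is a hypothetical stand-in for the private method):

# Standalone sketch, not from the diff; encode_thumbnail mirrors TablePackager.__encode_image.
import base64
import io

import PIL.Image

def encode_thumbnail(img: PIL.Image.Image) -> str:
    if img.height > img.width * 1.5:  # taller than 2:3: cap height at 360px
        scaled = img.resize((img.width * 360 // img.height, 360))
    else:  # otherwise: pin width to 240px
        scaled = img.resize((240, img.height * 240 // img.width))
    with io.BytesIO() as buffer:
        scaled.save(buffer, 'webp')
        return base64.b64encode(buffer.getvalue()).decode()

# a 4000x1000 panorama scales to 240x60; a 1000x4000 scan is capped at 90x360
thumb_b64 = encode_thumbnail(PIL.Image.new('RGB', (4000, 1000)))
print(len(thumb_b64))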
@@ -253,13 +363,26 @@
         tbl_md = [schema.FullTableMd.from_dict(t) for t in self.md['md']['tables']]
 
         # Create the replica table
-        #
-
-
+        # The logic here needs to be completely restructured in order to make it concurrency-safe.
+        # - Catalog.create_replica() needs to write the metadata and also create the physical store tables
+        #   and populate them, otherwise concurrent readers will see an inconsistent state (table metadata w/o
+        #   an actual table)
+        # - this could be done one replica at a time (instead of the entire hierarchy)
+        cat = catalog.Catalog.get()
+        cat.create_replica(catalog.Path(self.tbl_path), tbl_md)
+        # don't call get_table() until after the calls to create_replica() and __import_table() below;
+        # the TV instances created by get_table() would be replaced by create_replica(), which creates duplicate
+        # TV instances for the same replica version, which then leads to failures when constructing queries
 
         # Now we need to instantiate and load data for replica_tbl and its ancestors, except that we skip
         # replica_tbl itself if it's a pure snapshot.
-
+        target_md = tbl_md[0]
+        is_pure_snapshot = (
+            target_md.tbl_md.view_md is not None
+            and target_md.tbl_md.view_md.predicate is None
+            and len(target_md.schema_version_md.columns) == 0
+        )
+        if is_pure_snapshot:
             ancestor_md = tbl_md[1:]  # Pure snapshot; skip replica_tbl
         else:
             ancestor_md = tbl_md  # Not a pure snapshot; include replica_tbl
@@ -273,7 +396,8 @@
             _logger.info(f'Importing table {tv.name!r}.')
             self.__import_table(self.tmp_dir, tv, md)
 
-
+        with cat.begin_xact(for_write=False):
+            return cat.get_table_by_id(UUID(tbl_md[0].tbl_md.tbl_id))
 
     def __import_table(self, bundle_path: Path, tv: catalog.TableVersion, tbl_md: schema.FullTableMd) -> None:
         """
pixeltable/share/publish.py
CHANGED

@@ -35,7 +35,7 @@ def push_replica(dest_tbl_uri: str, src_tbl: pxt.Table) -> str:
     upload_id = response_json['upload_id']
     destination_uri = response_json['destination_uri']
 
-    Env.get().console_logger.info(f"Creating a snapshot of '{src_tbl._path}' at: {dest_tbl_uri}")
+    Env.get().console_logger.info(f"Creating a snapshot of '{src_tbl._path()}' at: {dest_tbl_uri}")
 
     bundle = packager.package()
 
@@ -117,7 +117,7 @@ def pull_replica(dest_path: str, src_tbl_uri: str) -> pxt.Table:
 
     restorer = TableRestorer(dest_path, response_json)
     tbl = restorer.restore(bundle_path)
-    Env.get().console_logger.info(f'Created local replica {tbl._path!r} from URI: {src_tbl_uri}')
+    Env.get().console_logger.info(f'Created local replica {tbl._path()!r} from URI: {src_tbl_uri}')
     return tbl
 
 
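Both hunks fix the same bug class: the old f-strings interpolated the bound method (src_tbl._path, tbl._path) instead of calling it, so the log lines rendered a '<bound method ...>' repr. A self-contained illustration with a hypothetical Table class (not pixeltable's):

# Illustration of the bug class; Table here is hypothetical.
class Table:
    def _path(self) -> str:
        return 'dir1.my_table'

t = Table()
print(f"snapshot of '{t._path}'")    # snapshot of '<bound method Table._path of ...>'
print(f"snapshot of '{t._path()}'")  # snapshot of 'dir1.my_table'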
pixeltable/store.py
CHANGED

@@ -52,7 +52,8 @@ class StoreBase:
         # We need to declare a `base` variable here, even though it's only defined for instances of `StoreView`,
         # since it's referenced by various methods of `StoreBase`
         self.base = tbl_version.base.get().store_tbl if tbl_version.base is not None else None
-
+        # we're passing in tbl_version to avoid a circular call to TableVersionHandle.get()
+        self.create_sa_tbl(tbl_version)
 
     def system_columns(self) -> list[sql.Column]:
         return [*self._pk_cols, self.v_max_col]
@@ -77,11 +78,13 @@ class StoreBase:
         self._pk_cols = [*rowid_cols, self.v_min_col]
         return [*rowid_cols, self.v_min_col, self.v_max_col]
 
-    def create_sa_tbl(self) -> None:
+    def create_sa_tbl(self, tbl_version: Optional[catalog.TableVersion] = None) -> None:
         """Create self.sa_tbl from self.tbl_version."""
+        if tbl_version is None:
+            tbl_version = self.tbl_version.get()
         system_cols = self._create_system_columns()
         all_cols = system_cols.copy()
-        for col in [c for c in
+        for col in [c for c in tbl_version.cols if c.is_stored]:
             # re-create sql.Column for each column, regardless of whether it already has sa_col set: it was bound
             # to the last sql.Table version we created and cannot be reused
             col.create_sa_cols()
@@ -99,16 +102,17 @@
         # - base x view joins can be executed as merge joins
         # - speeds up ORDER BY rowid DESC
         # - allows filtering for a particular table version in index scan
-        idx_name = f'sys_cols_idx_{
+        idx_name = f'sys_cols_idx_{tbl_version.id.hex}'
         idxs.append(sql.Index(idx_name, *system_cols))
 
         # v_min/v_max indices: speeds up base table scans needed to propagate a base table insert or delete
-        idx_name = f'vmin_idx_{
+        idx_name = f'vmin_idx_{tbl_version.id.hex}'
         idxs.append(sql.Index(idx_name, self.v_min_col, postgresql_using=Env.get().dbms.version_index_type))
-        idx_name = f'vmax_idx_{
+        idx_name = f'vmax_idx_{tbl_version.id.hex}'
         idxs.append(sql.Index(idx_name, self.v_max_col, postgresql_using=Env.get().dbms.version_index_type))
 
         self.sa_tbl = sql.Table(self._storage_name(), self.sa_md, *all_cols, *idxs)
+        # _logger.debug(f'created sa tbl for {tbl_version.id!s} (sa_tbl={id(self.sa_tbl):x}, tv={id(tbl_version):x})')
 
     @abc.abstractmethod
     def _rowid_join_predicate(self) -> sql.ColumnElement[bool]:
@@ -285,7 +289,7 @@
             else:
                 if col.col_type.is_image_type() and result_row.file_urls[value_expr_slot_idx] is None:
                     # we have yet to store this image
-                    filepath = str(MediaStore.prepare_media_path(col.tbl.id, col.id, col.tbl.
+                    filepath = str(MediaStore.prepare_media_path(col.tbl.id, col.id, col.tbl.version))
                     result_row.flush_img(value_expr_slot_idx, filepath)
                 val = result_row.get_stored_val(value_expr_slot_idx, col.sa_col.type)
                 if col.col_type.is_media_type():
@@ -415,9 +419,7 @@
             number of deleted rows
         """
         where_clause = sql.true() if where_clause is None else where_clause
-
-            self.v_min_col < current_version, self.v_max_col == schema.Table.MAX_VERSION, where_clause
-        )
+        version_clause = sql.and_(self.v_min_col < current_version, self.v_max_col == schema.Table.MAX_VERSION)
         rowid_join_clause = self._rowid_join_predicate()
         base_versions_clause = (
             sql.true() if len(base_versions) == 0 else self.base._versions_clause(base_versions, match_on_vmin)
@@ -428,10 +430,12 @@
             set_clause[index_info.undo_col.sa_col] = index_info.val_col.sa_col
             # set value column to NULL
             set_clause[index_info.val_col.sa_col] = None
+
         stmt = (
             sql.update(self.sa_tbl)
             .values(set_clause)
             .where(where_clause)
+            .where(version_clause)
            .where(rowid_join_clause)
            .where(base_versions_clause)
        )
@@ -528,10 +532,12 @@ class StoreComponentView(StoreView):
         self.rowid_cols.append(self.pos_col)
         return self.rowid_cols
 
-    def create_sa_tbl(self) -> None:
-
+    def create_sa_tbl(self, tbl_version: Optional[catalog.TableVersion] = None) -> None:
+        if tbl_version is None:
+            tbl_version = self.tbl_version.get()
+        super().create_sa_tbl(tbl_version)
         # we need to fix up the 'pos' column in TableVersion
-
+        tbl_version.cols_by_name['pos'].sa_col = self.pos_col
 
     def _rowid_join_predicate(self) -> sql.ColumnElement[bool]:
         return sql.and_(
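The update fix above splits the version predicate out into its own version_clause and reattaches it with an extra .where(); chained .where() calls on a SQLAlchemy statement are AND-ed together. A minimal sketch (hypothetical demo table; MAX_VERSION stands in for schema.Table.MAX_VERSION):

# Minimal sketch, not from the diff: chained .where() calls are AND-ed together.
import sqlalchemy as sql

md = sql.MetaData()
t = sql.Table(
    'demo', md,
    sql.Column('v_min', sql.BigInteger),
    sql.Column('v_max', sql.BigInteger),
    sql.Column('val', sql.Integer),
)

MAX_VERSION = 9223372036854775807  # stand-in for schema.Table.MAX_VERSION
current_version = 5
version_clause = sql.and_(t.c.v_min < current_version, t.c.v_max == MAX_VERSION)

stmt = sql.update(t).values(val=None).where(sql.true()).where(version_clause)
print(stmt)  # ... WHERE true AND demo.v_min < :v_min_1 AND demo.v_max = :v_max_1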
pixeltable/type_system.py
CHANGED

@@ -395,6 +395,36 @@ class ColumnType:
             raise excs.Error(f'Standard Python type `{name}` cannot be used here; use `{suggestion}` instead')
         raise excs.Error(f'Unknown type: {t}')
 
+    @classmethod
+    def from_json_schema(cls, schema: dict[str, Any]) -> Optional[ColumnType]:
+        # We first express the JSON schema as a Python type, and then convert it to a Pixeltable type.
+        # TODO: Is there a meaningful fallback if one of these operations fails? (Maybe another use case for a pxt Any
+        # type?)
+        py_type = cls.__json_schema_to_py_type(schema)
+        return cls.from_python_type(py_type) if py_type is not None else None
+
+    @classmethod
+    def __json_schema_to_py_type(cls, schema: dict[str, Any]) -> Union[type, _GenericAlias, None]:
+        if 'type' in schema:
+            if schema['type'] == 'null':
+                return type(None)
+            if schema['type'] == 'string':
+                return str
+            if schema['type'] == 'integer':
+                return int
+            if schema['type'] == 'number':
+                return float
+            if schema['type'] == 'boolean':
+                return bool
+            if schema['type'] in ('array', 'object'):
+                return list
+        elif 'anyOf' in schema:
+            subscripts = tuple(cls.__json_schema_to_py_type(subschema) for subschema in schema['anyOf'])
+            if all(subscript is not None for subscript in subscripts):
+                return Union[subscripts]
+
+        return None
+
     def validate_literal(self, val: Any) -> None:
         """Raise TypeError if val is not a valid literal for this type"""
         if val is None:
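from_json_schema maps a JSON schema onto a Pixeltable type by way of a Python type: scalar types map directly, array and object both land on Python list, anyOf becomes a Union, and anything unmapped yields None. A quick sketch of the observable behavior (the exact reprs come from from_python_type and aren't shown; requires pixeltable 0.4.0):

# Sketch of the mapping defined in the hunk above.
from pixeltable import type_system as ts

print(ts.ColumnType.from_json_schema({'type': 'string'}))   # via Python str
print(ts.ColumnType.from_json_schema({'type': 'number'}))   # via Python float
print(ts.ColumnType.from_json_schema({'type': 'object'}))   # via Python list, per the hunk
print(ts.ColumnType.from_json_schema({'anyOf': [{'type': 'integer'}, {'type': 'null'}]}))  # via Union[int, None]
print(ts.ColumnType.from_json_schema({'type': 'tuple'}))    # unmapped schema -> None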
pixeltable/utils/dbms.py
CHANGED

@@ -35,7 +35,7 @@ class PostgresqlDbms(Dbms):
     """
 
     def __init__(self, db_url: URL):
-        super().__init__('postgresql', '
+        super().__init__('postgresql', 'SERIALIZABLE', 'brin', db_url)
 
     def drop_db_stmt(self, database: str) -> str:
         return f'DROP DATABASE {database}'
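The truncated removed line leaves the old second argument unknown, but judging by the new call and the index code in pixeltable/store.py above (postgresql_using=Env.get().dbms.version_index_type), the two string arguments plausibly set the transaction isolation level and the access method for the v_min/v_max indices; BRIN is a natural fit for monotonically increasing version columns. A sketch of how such an index renders (hypothetical table name; standard SQLAlchemy API):

# Sketch, not from the diff: rendering a BRIN index with SQLAlchemy's postgresql dialect.
import sqlalchemy as sql
from sqlalchemy.dialects import postgresql

md = sql.MetaData()
t = sql.Table('demo', md, sql.Column('v_min', sql.BigInteger))
idx = sql.Index('vmin_idx_demo', t.c.v_min, postgresql_using='brin')

ddl = sql.schema.CreateIndex(idx).compile(dialect=postgresql.dialect())
print(ddl)  # CREATE INDEX vmin_idx_demo ON demo USING brin (v_min)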
|