pixeltable 0.3.4__py3-none-any.whl → 0.3.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pixeltable/__init__.py +1 -0
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/catalog.py +9 -2
- pixeltable/catalog/column.py +1 -1
- pixeltable/catalog/dir.py +1 -1
- pixeltable/catalog/table.py +1 -1
- pixeltable/catalog/table_version.py +12 -2
- pixeltable/catalog/table_version_path.py +2 -2
- pixeltable/catalog/view.py +64 -20
- pixeltable/dataframe.py +10 -5
- pixeltable/env.py +12 -0
- pixeltable/exec/expr_eval/evaluators.py +4 -2
- pixeltable/exec/expr_eval/expr_eval_node.py +4 -1
- pixeltable/exprs/comparison.py +8 -4
- pixeltable/exprs/data_row.py +5 -3
- pixeltable/exprs/expr.py +2 -2
- pixeltable/exprs/function_call.py +155 -313
- pixeltable/func/aggregate_function.py +29 -15
- pixeltable/func/callable_function.py +11 -8
- pixeltable/func/expr_template_function.py +3 -9
- pixeltable/func/function.py +148 -74
- pixeltable/func/signature.py +65 -30
- pixeltable/func/udf.py +1 -1
- pixeltable/functions/__init__.py +1 -0
- pixeltable/functions/deepseek.py +121 -0
- pixeltable/functions/image.py +7 -7
- pixeltable/functions/openai.py +23 -9
- pixeltable/functions/video.py +14 -7
- pixeltable/globals.py +14 -3
- pixeltable/index/embedding_index.py +4 -13
- pixeltable/io/globals.py +88 -77
- pixeltable/io/hf_datasets.py +34 -34
- pixeltable/io/pandas.py +75 -76
- pixeltable/io/parquet.py +19 -27
- pixeltable/io/utils.py +115 -0
- pixeltable/iterators/audio.py +2 -1
- pixeltable/iterators/video.py +1 -1
- pixeltable/metadata/__init__.py +2 -1
- pixeltable/metadata/converters/convert_15.py +18 -8
- pixeltable/metadata/converters/convert_27.py +31 -0
- pixeltable/metadata/converters/convert_28.py +15 -0
- pixeltable/metadata/converters/convert_29.py +111 -0
- pixeltable/metadata/converters/util.py +12 -1
- pixeltable/metadata/notes.py +3 -0
- pixeltable/metadata/schema.py +8 -0
- pixeltable/share/__init__.py +1 -0
- pixeltable/share/packager.py +41 -13
- pixeltable/share/publish.py +97 -0
- pixeltable/type_system.py +40 -14
- pixeltable/utils/__init__.py +41 -0
- pixeltable/utils/arrow.py +40 -7
- pixeltable/utils/formatter.py +1 -1
- {pixeltable-0.3.4.dist-info → pixeltable-0.3.5.dist-info}/METADATA +34 -49
- {pixeltable-0.3.4.dist-info → pixeltable-0.3.5.dist-info}/RECORD +57 -51
- {pixeltable-0.3.4.dist-info → pixeltable-0.3.5.dist-info}/WHEEL +1 -1
- {pixeltable-0.3.4.dist-info → pixeltable-0.3.5.dist-info}/LICENSE +0 -0
- {pixeltable-0.3.4.dist-info → pixeltable-0.3.5.dist-info}/entry_points.txt +0 -0
pixeltable/metadata/converters/convert_15.py
CHANGED
@@ -17,7 +17,7 @@ _logger = logging.getLogger('pixeltable')
 def _(engine: sql.engine.Engine) -> None:
     with engine.begin() as conn:
         for row in conn.execute(sql.select(Function)):
-            id,
+            id, _, md, binary_obj = row
             md['md'] = __update_md(md['md'], binary_obj)
             _logger.info(f'Updating function: {id}')
             conn.execute(sql.update(Function).where(Function.id == id).values(md=md))
@@ -27,14 +27,24 @@ def __update_md(orig_d: dict, binary_obj: bytes) -> Any:
     # construct dict produced by CallableFunction.to_store()
     py_fn = cloudpickle.loads(binary_obj)
     py_params = inspect.signature(py_fn).parameters
-    return_type =
-    params: list[
+    return_type = orig_d['return_type']
+    params: list[dict] = []
     for name, col_type_dict, kind_int, is_batched in orig_d['parameters']:
-        col_type = ts.ColumnType.from_dict(col_type_dict) if col_type_dict is not None else None
         default = py_params[name].default
-        kind = inspect._ParameterKind(kind_int)
-        params.append(
+        kind = inspect._ParameterKind(kind_int)
+        params.append(
+            {
+                'name': name,
+                'col_type': col_type_dict,
+                'kind': str(kind),
+                'is_batched': is_batched,
+                'has_default': default is not inspect.Parameter.empty,
+                'default': None if default is inspect.Parameter.empty else default,
+            }
+        )
     is_batched = 'batch_size' in orig_d
-
-
+    d = {
+        'signature': {'return_type': return_type, 'parameters': params, 'is_batched': is_batched},
+        'batch_size': orig_d['batch_size'] if is_batched else None,
+    }
     return d
pixeltable/metadata/converters/convert_27.py
CHANGED
@@ -0,0 +1,31 @@
+import logging
+from typing import Any, Optional
+from uuid import UUID
+
+import sqlalchemy as sql
+
+from pixeltable.metadata import register_converter
+from pixeltable.metadata.converters.util import convert_table_md
+from pixeltable.metadata.schema import Table
+
+_logger = logging.getLogger('pixeltable')
+
+
+@register_converter(version=27)
+def _(engine: sql.engine.Engine) -> None:
+    convert_table_md(engine, table_md_updater=__update_table_md)
+
+
+def __update_table_md(table_md: dict, table_id: UUID) -> None:
+    """Update the view metadata to add the include_base_columns boolean if it is missing
+
+    Args:
+        table_md (dict): copy of the original table metadata. this gets updated in place.
+        table_id (UUID): the table id
+
+    """
+    if table_md['view_md'] is None:
+        return
+    if 'include_base_columns' not in table_md['view_md']:
+        table_md['view_md']['include_base_columns'] = True
+        _logger.info(f'Updating view metadata for table: {table_id}')
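For orientation: every migration in pixeltable/metadata/converters follows this pattern. A module registers a callback for a source schema version, and convert_table_md applies it to each stored table's metadata. A minimal sketch of a hypothetical converter using the same API (the version number and field name are invented for illustration):

from uuid import UUID

import sqlalchemy as sql

from pixeltable.metadata import register_converter
from pixeltable.metadata.converters.util import convert_table_md


@register_converter(version=99)  # hypothetical version number
def _(engine: sql.engine.Engine) -> None:
    convert_table_md(engine, table_md_updater=__add_new_field)


def __add_new_field(table_md: dict, table_id: UUID) -> None:
    # Mutates the metadata dict in place, mirroring __update_table_md above.
    table_md.setdefault('new_field', None)  # 'new_field' is illustrative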
pixeltable/metadata/converters/convert_28.py
CHANGED
@@ -0,0 +1,15 @@
+import logging
+
+import sqlalchemy as sql
+
+from pixeltable.metadata import register_converter
+from pixeltable.metadata.schema import Dir, Table, TableSchemaVersion, TableVersion
+
+
+@register_converter(version=28)
+def _(engine: sql.engine.Engine) -> None:
+    with engine.begin() as conn:
+        conn.execute(sql.update(Dir).values(md=Dir.md.concat({'user': None, 'additional_md': {}})))
+        conn.execute(sql.update(Table).values(md=Table.md.concat({'user': None, 'additional_md': {}})))
+        conn.execute(sql.update(TableVersion).values(md=TableVersion.md.concat({'additional_md': {}})))
+        conn.execute(sql.update(TableSchemaVersion).values(md=TableSchemaVersion.md.concat({'additional_md': {}})))
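The version-28 converter adds the new metadata fields (see the schema.py changes below) by concatenating keys onto the JSON md column of every row, one UPDATE per table. Per row, the effect is roughly this dict merge (a sketch of the semantics, not the actual execution path):

# Rough per-row equivalent of Dir.md.concat({'user': None, 'additional_md': {}}):
dir_md = {'name': 'my_dir'}  # existing md JSON (illustrative)
dir_md = {**dir_md, 'user': None, 'additional_md': {}}  # keys added by the migration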
pixeltable/metadata/converters/convert_29.py
CHANGED
@@ -0,0 +1,111 @@
+from typing import Any, Optional
+
+import sqlalchemy as sql
+
+from pixeltable import exprs
+from pixeltable.metadata import register_converter
+from pixeltable.metadata.converters.util import convert_table_md
+
+
+@register_converter(version=29)
+def _(engine: sql.engine.Engine) -> None:
+    convert_table_md(engine, substitution_fn=__substitute_md)
+
+
+def __substitute_md(k: Optional[str], v: Any) -> Optional[tuple[Optional[str], Any]]:
+    # Defaults are now stored as literals in signatures
+    if k == 'parameters':
+        for param in v:
+            assert isinstance(param, dict)
+            has_default = param.get('has_default') or (param.get('default') is not None)
+            if 'has_default' in param:
+                del param['has_default']
+            literal = exprs.Expr.from_object(param['default']) if has_default else None
+            assert literal is None or isinstance(literal, exprs.Literal)
+            param['default'] = None if literal is None else literal.as_dict()
+        return k, v
+
+    # Method of organizing argument expressions has changed
+    if isinstance(v, dict) and v.get('_classname') == 'FunctionCall':
+        args = v['args']
+        kwargs = v['kwargs']
+        components = v['components']
+        group_by_start_idx = v['group_by_start_idx']
+        group_by_stop_idx = v['group_by_stop_idx']
+        order_by_start_idx = v['order_by_start_idx']
+
+        new_args = []
+        for arg in args:
+            if arg[0] is not None:
+                assert isinstance(arg[0], int)
+                new_args.append(components[arg[0]])
+            else:
+                literal = exprs.Expr.from_object(arg[1])
+                new_args.append(literal.as_dict())
+
+        new_kwargs = {}
+        for name, kwarg in kwargs.items():
+            if kwarg[0] is not None:
+                assert isinstance(kwarg[0], int)
+                new_kwargs[name] = components[kwarg[0]]
+            else:
+                literal = exprs.Expr.from_object(kwarg[1])
+                new_kwargs[name] = literal.as_dict()
+
+        # We need to expand ("unroll") any var-args or var-kwargs.
+
+        new_args_len = len(new_args)
+        rolled_args: Optional[dict] = None
+        rolled_kwargs: Optional[dict] = None
+
+        if 'signature' in v['fn']:
+            # If it's a pickled function, there's no signature, so we're out of luck; varargs in a pickled function
+            # is an edge case that won't migrate properly.
+            parameters: list[dict] = v['fn']['signature']['parameters']
+            for i, param in enumerate(parameters):
+                if param['kind'] == 'VAR_POSITIONAL':
+                    if new_args_len > i:
+                        # For peculiar historical reasons, variable kwargs might show up in args. Thus variable
+                        # positional args is not necessarily the last element of args; it might be the second-to-last.
+                        assert new_args_len <= i + 2, new_args
+                        rolled_args = new_args[i]
+                        new_args = new_args[:i] + new_args[i + 1 :]
+                if param['kind'] == 'VAR_KEYWORD':
+                    # As noted above, variable kwargs might show up either in args or in kwargs. If it's in args, it
+                    # is necessarily the last element.
+                    if new_args_len > i:
+                        assert new_args_len <= i + 1, new_args
+                        rolled_kwargs = new_args.pop()
+                    if param['name'] in kwargs:
+                        assert rolled_kwargs is None
+                        rolled_kwargs = kwargs.pop(param['name'])
+
+        if rolled_args is not None:
+            assert rolled_args['_classname'] in ('InlineArray', 'InlineList')
+            new_args.extend(rolled_args['components'])
+        if rolled_kwargs is not None:
+            assert rolled_kwargs['_classname'] == 'InlineDict'
+            new_kwargs.update(zip(rolled_kwargs['keys'], rolled_kwargs['components']))
+
+        group_by_exprs = [components[i] for i in range(group_by_start_idx, group_by_stop_idx)]
+        order_by_exprs = [components[i] for i in range(order_by_start_idx, len(components))]
+
+        new_components = [*new_args, *new_kwargs.values(), *group_by_exprs, *order_by_exprs]
+
+        newv = {
+            'fn': v['fn'],
+            'arg_idxs': list(range(len(new_args))),
+            'kwarg_idxs': {name: i + len(new_args) for i, name in enumerate(new_kwargs.keys())},
+            'group_by_start_idx': len(new_args) + len(new_kwargs),
+            'group_by_stop_idx': len(new_args) + len(new_kwargs) + len(group_by_exprs),
+            'order_by_start_idx': len(new_args) + len(new_kwargs) + len(group_by_exprs),
+            'is_method_call': False,
+            '_classname': 'FunctionCall',
+            'components': new_components,
+        }
+        if 'return_type' in v:
+            newv['return_type'] = v['return_type']
+
+        return k, newv
+
+    return None
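To make the version-29 migration concrete: the old FunctionCall serialization stored each argument as a pair (index into components, raw literal value), with exactly one of the two set; the new serialization turns raw literals into Literal exprs, stores every argument in components, and keeps only index lists. A schematic before/after (all values invented for illustration):

# Old serialization (schematic):
old_md = {
    '_classname': 'FunctionCall',
    'args': [[0, None], [None, 'hello']],  # [component idx, raw literal] pairs
    'kwargs': {},
    'components': [{'_classname': 'ColumnRef'}],
    'group_by_start_idx': 1, 'group_by_stop_idx': 1, 'order_by_start_idx': 1,
}

# New serialization produced by __substitute_md (schematic):
new_md = {
    '_classname': 'FunctionCall',
    'arg_idxs': [0, 1],  # every argument is now a component
    'kwarg_idxs': {},
    'components': [{'_classname': 'ColumnRef'}, {'_classname': 'Literal', 'val': 'hello'}],
    'group_by_start_idx': 2, 'group_by_stop_idx': 2, 'order_by_start_idx': 2,
    'is_method_call': False,
}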
pixeltable/metadata/converters/util.py
CHANGED
@@ -5,7 +5,7 @@ from uuid import UUID
 
 import sqlalchemy as sql
 
-from pixeltable.metadata.schema import Table, TableSchemaVersion
+from pixeltable.metadata.schema import Function, Table, TableSchemaVersion
 
 __logger = logging.getLogger('pixeltable')
 
@@ -50,6 +50,17 @@ def convert_table_md(
             __logger.info(f'Updating schema for table: {id}')
             conn.execute(sql.update(Table).where(Table.id == id).values(md=updated_table_md))
 
+        for row in conn.execute(sql.select(Function)):
+            id = row[0]
+            function_md = row[2]
+            assert isinstance(function_md, dict)
+            updated_function_md = copy.deepcopy(function_md)
+            if substitution_fn is not None:
+                updated_function_md = __substitute_md_rec(updated_function_md, substitution_fn)
+            if updated_function_md != function_md:
+                __logger.info(f'Updating function: {id}')
+                conn.execute(sql.update(Function).where(Function.id == id).values(md=updated_function_md))
+
 
 def __update_column_md(table_md: dict, column_md_updater: Callable[[dict], None]) -> None:
     columns_md = table_md['column_md']
pixeltable/metadata/notes.py
CHANGED
@@ -2,6 +2,9 @@
 # rather than as a comment, so that the existence of a description can be enforced by
 # the unit tests when new versions are added.
 VERSION_NOTES = {
+    30: 'Store default values and constant arguments as literals',
+    29: 'Add user and additional_md fields to metadata structs',
+    28: 'Enable view creation from DataFrame with select clause',
     27: 'Enable pxt.query parameterization of limit clauses',
     26: 'Rename clip_text and clip_image to clip',
     25: 'Functions with multiple signatures',
pixeltable/metadata/schema.py
CHANGED
@@ -74,6 +74,8 @@ class SystemInfo(Base):
 @dataclasses.dataclass
 class DirMd:
     name: str
+    user: Optional[str]
+    additional_md: dict[str, Any]
 
 
 class Dir(Base):
@@ -132,6 +134,7 @@ class IndexMd:
 @dataclasses.dataclass
 class ViewMd:
     is_snapshot: bool
+    include_base_columns: bool
 
     # (table id, version); for mutable views, all versions are None
     base_versions: list[tuple[str, Optional[int]]]
@@ -150,6 +153,8 @@ class ViewMd:
 class TableMd:
     name: str
 
+    user: Optional[str]
+
     # monotonically increasing w/in Table for both data and schema changes, starting at 0
     current_version: int
     # each version has a corresponding schema version (current_version >= current_schema_version)
@@ -169,6 +174,7 @@ class TableMd:
     column_md: dict[int, ColumnMd]  # col_id -> ColumnMd
     index_md: dict[int, IndexMd]  # index_id -> IndexMd
     view_md: Optional[ViewMd]
+    additional_md: dict[str, Any]
 
 
 class Table(Base):
@@ -194,6 +200,7 @@ class TableVersionMd:
     created_at: float  # time.time()
     version: int
     schema_version: int
+    additional_md: dict[str, Any]
 
 
 class TableVersion(Base):
@@ -232,6 +239,7 @@ class TableSchemaVersionMd:
     # default validation strategy for any media column of this table
     # stores column.MediaValiation.name.lower()
     media_validation: str
+    additional_md: dict[str, Any]
 
 
 # versioning: each table schema change results in a new record
pixeltable/share/__init__.py
CHANGED
@@ -0,0 +1 @@
+from .publish import publish_snapshot
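This export makes publish_snapshot (defined in publish.py below) available as pixeltable.share.publish_snapshot. A minimal usage sketch, assuming an existing local table and a configured API key; the destination URI format is invented for illustration:

import pixeltable as pxt
from pixeltable.share import publish_snapshot

tbl = pxt.get_table('films')  # assumes this table already exists locally
uri = publish_snapshot('pxt://my-org/films-snapshot', tbl)
print(f'published at: {uri}')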
pixeltable/share/packager.py
CHANGED
@@ -1,3 +1,4 @@
+import dataclasses
 import io
 import json
 import logging
@@ -5,8 +6,9 @@ import tarfile
 import urllib.parse
 import urllib.request
 import uuid
+from datetime import datetime
 from pathlib import Path
-from typing import Any, Iterator
+from typing import Any, Iterator, Optional
 
 import more_itertools
 import numpy as np
@@ -15,7 +17,8 @@ import pyiceberg.catalog
 
 import pixeltable as pxt
 import pixeltable.type_system as ts
-from pixeltable import exprs
+from pixeltable import catalog, exprs, metadata
+from pixeltable.dataframe import DataFrame
 from pixeltable.env import Env
 from pixeltable.utils.arrow import PXT_TO_PA_TYPES
 from pixeltable.utils.iceberg import sqlite_catalog
@@ -28,6 +31,7 @@ class TablePackager:
     Packages a pixeltable Table into a tarball containing Iceberg tables and media files. The structure of the tarball
     is as follows:
 
+    metadata.json  # Pixeltable metadata for the packaged table
     warehouse/catalog.db  # sqlite Iceberg catalog
     warehouse/pxt.db/**  # Iceberg metadata and data files (parquet/avro/json)
     media/**  # Local media files
@@ -43,16 +47,40 @@ class TablePackager:
     'media/{uuid}{extension}', and the Iceberg table will contain the ephemeral URI 'pxtmedia://{uuid}{extension}'.
     """
 
-    table:
+    table: catalog.Table  # The table to be packaged
     tmp_dir: Path  # Temporary directory where the package will reside
     iceberg_catalog: pyiceberg.catalog.Catalog
     media_files: dict[Path, str]  # Mapping from local media file paths to their tarball names
+    md: dict[str, Any]
 
-    def __init__(self, table:
+    def __init__(self, table: catalog.Table, additional_md: Optional[dict[str, Any]] = None) -> None:
         self.table = table
         self.tmp_dir = Path(Env.get().create_tmp_path())
         self.media_files = {}
 
+        # Generate metadata
+        self.md = {
+            'pxt_version': pxt.__version__,
+            'pxt_md_version': metadata.VERSION,
+            'md': {
+                'tables': [
+                    {
+                        'table_id': str(t._tbl_version.id),
+                        # These are temporary; will replace with a better solution once the concurrency changes to catalog have
+                        # been merged
+                        'table_md': dataclasses.asdict(t._tbl_version._create_tbl_md()),
+                        'table_version_md': dataclasses.asdict(
+                            t._tbl_version._create_version_md(datetime.now().timestamp())
+                        ),
+                        'table_schema_version_md': dataclasses.asdict(t._tbl_version._create_schema_version_md(0)),
+                    }
+                    for t in (table, *table._bases)
+                ]
+            },
+        }
+        if additional_md is not None:
+            self.md.update(additional_md)
+
     def package(self) -> Path:
         """
         Export the table to a tarball containing Iceberg tables and media files.
@@ -60,8 +88,10 @@ class TablePackager:
         assert not self.tmp_dir.exists()  # Packaging can only be done once per TablePackager instance
         _logger.info(f"Packaging table '{self.table._path}' and its ancestors in: {self.tmp_dir}")
         self.tmp_dir.mkdir()
+        with open(self.tmp_dir / 'metadata.json', 'w', encoding='utf8') as fp:
+            json.dump(self.md, fp)
         self.iceberg_catalog = sqlite_catalog(self.tmp_dir / 'warehouse')
-        ancestors =
+        ancestors = (self.table, *self.table._bases)
         for t in ancestors:
             _logger.info(f"Exporting table '{t._path}'.")
             self.__export_table(t)
@@ -70,7 +100,7 @@ class TablePackager:
         _logger.info(f'Packaging complete: {bundle_path}')
         return bundle_path
 
-    def __export_table(self, t:
+    def __export_table(self, t: catalog.Table) -> None:
         """
         Exports the data from `t` into an Iceberg table.
         """
@@ -116,7 +146,7 @@ class TablePackager:
         iceberg_tbl.append(pa_table)
 
     @classmethod
-    def __iceberg_namespace(cls, table:
+    def __iceberg_namespace(cls, table: catalog.Table) -> str:
         """
         Iceberg tables must have a namespace, which cannot be the empty string, so we prepend `pxt` to the table path.
         """
@@ -149,11 +179,7 @@ class TablePackager:
         return PXT_TO_PA_TYPES.get(col_type.__class__)
 
     def __to_pa_tables(
-        self,
-        df: pxt.DataFrame,
-        actual_col_types: list[pxt.ColumnType],
-        arrow_schema: pa.Schema,
-        batch_size: int = 1_000,
+        self, df: DataFrame, actual_col_types: list[ts.ColumnType], arrow_schema: pa.Schema, batch_size: int = 1_000
     ) -> Iterator[pa.Table]:
         """
         Load a DataFrame as a sequence of pyarrow tables. The pyarrow tables are batched into smaller chunks
@@ -165,7 +191,7 @@ class TablePackager:
         cols['_v_min'] = [row[-1] for row in rows]
         yield pa.Table.from_pydict(cols, schema=arrow_schema)
 
-    def __to_pa_rows(self, df:
+    def __to_pa_rows(self, df: DataFrame, actual_col_types: list[ts.ColumnType]) -> Iterator[list]:
        for row in df._exec():
             vals = [row[e.slot_idx] for e in df._select_list_exprs]
             result = [self.__to_pa_value(val, col_type) for val, col_type in zip(vals, actual_col_types)]
@@ -210,6 +236,8 @@ class TablePackager:
     def __build_tarball(self) -> Path:
         bundle_path = self.tmp_dir / 'bundle.tar.bz2'
         with tarfile.open(bundle_path, 'w:bz2') as tf:
+            # Add metadata json
+            tf.add(self.tmp_dir / 'metadata.json', arcname='metadata.json')
             # Add the Iceberg warehouse dir (including the catalog)
             tf.add(self.tmp_dir / 'warehouse', arcname='warehouse', recursive=True)
             # Add the media files
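Based on the constructor above, the new metadata.json has roughly the following shape (values are illustrative; the *_md entries are dataclasses.asdict dumps of the TableMd/TableVersionMd/TableSchemaVersionMd structs from schema.py):

bundle_md = {
    'pxt_version': '0.3.5',
    'pxt_md_version': 30,  # metadata.VERSION at packaging time (assumed)
    'md': {
        'tables': [
            {
                'table_id': '123e4567-e89b-12d3-a456-426614174000',
                'table_md': {},  # dataclasses.asdict(...) output, elided here
                'table_version_md': {},  # elided
                'table_schema_version_md': {},  # elided
            }
        ]
    },
    # plus any additional_md keys, e.g. 'table_uri' merged in by publish_snapshot below
}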
pixeltable/share/publish.py
CHANGED
@@ -0,0 +1,97 @@
+import dataclasses
+import os
+import sys
+import urllib.parse
+import urllib.request
+from datetime import datetime
+from pathlib import Path
+
+import requests
+from tqdm import tqdm
+
+import pixeltable as pxt
+from pixeltable import exceptions as excs, metadata
+from pixeltable.env import Env
+from pixeltable.utils import sha256sum
+
+from .packager import TablePackager
+
+# These URLs are abstracted out for now, but will be replaced with actual (hard-coded) URLs once the
+# pixeltable.com URLs are available.
+_PUBLISH_URL = os.environ.get('PIXELTABLE_PUBLISH_URL')
+_FINALIZE_URL = os.environ.get('PIXELTABLE_FINALIZE_URL')
+
+
+def publish_snapshot(dest_tbl_uri: str, src_tbl: pxt.Table) -> str:
+    packager = TablePackager(src_tbl, additional_md={'table_uri': dest_tbl_uri})
+    request_json = packager.md
+    headers_json = {'X-api-key': Env.get().pxt_api_key}
+
+    response = requests.post(_PUBLISH_URL, json=request_json, headers=headers_json)
+    if response.status_code != 200:
+        raise excs.Error(f'Error publishing snapshot: {response.text}')
+    response_json = response.json()
+    if not isinstance(response_json, dict) or response_json.get('destination') != 's3':
+        raise excs.Error(f'Error publishing snapshot: unexpected response from server.\n{response_json}')
+    upload_id = response_json['upload_id']
+    destination_uri = response_json['destination_uri']
+
+    Env.get().console_logger.info(f"Creating a snapshot of '{src_tbl._path}' at: {dest_tbl_uri}")
+
+    bundle = packager.package()
+
+    parsed_location = urllib.parse.urlparse(destination_uri)
+    if parsed_location.scheme == 's3':
+        _upload_bundle_to_s3(bundle, parsed_location)
+    else:
+        raise excs.Error(f'Unsupported destination: {destination_uri}')
+
+    Env.get().console_logger.info(f'Finalizing snapshot ...')
+
+    finalize_request_json = {
+        'upload_id': upload_id,
+        'datafile': bundle.name,
+        'size': bundle.stat().st_size,
+        'sha256': sha256sum(bundle),  # Generate our own SHA for independent verification
+    }
+
+    # TODO: Use Pydantic for validation
+    finalize_response = requests.post(_FINALIZE_URL, json=finalize_request_json, headers=headers_json)
+    if finalize_response.status_code != 200:
+        raise excs.Error(f'Error finalizing snapshot: {finalize_response.text}')
+    finalize_response_json = finalize_response.json()
+    if not isinstance(finalize_response_json, dict) or 'confirmed_table_uri' not in finalize_response_json:
+        raise excs.Error(f'Error finalizing snapshot: unexpected response from server.\n{finalize_response_json}')
+
+    confirmed_tbl_uri = finalize_response_json['confirmed_table_uri']
+    Env.get().console_logger.info(f'The published snapshot is now available at: {confirmed_tbl_uri}')
+    return confirmed_tbl_uri
+
+
+def _upload_bundle_to_s3(bundle: Path, parsed_location: urllib.parse.ParseResult) -> None:
+    from pixeltable.utils.s3 import get_client
+
+    bucket = parsed_location.netloc
+    remote_dir = Path(urllib.parse.unquote(urllib.request.url2pathname(parsed_location.path)))
+    remote_path = str(remote_dir / bundle.name)[1:]  # Remove initial /
+
+    Env.get().console_logger.info(f'Uploading snapshot to: {bucket}:{remote_path}')
+
+    boto_config = {'max_pool_connections': 5, 'connect_timeout': 15, 'retries': {'max_attempts': 3, 'mode': 'adaptive'}}
+    s3_client = get_client(**boto_config)
+
+    upload_args = {'ChecksumAlgorithm': 'SHA256'}
+
+    progress_bar = tqdm(
+        desc=f'Uploading',
+        total=bundle.stat().st_size,
+        unit='B',
+        unit_scale=True,
+        unit_divisor=1024,
+        miniters=1,  # Update every iteration (should be fine for an upload)
+        ncols=100,
+        file=sys.stdout,
+    )
+    s3_client.upload_file(
+        Filename=str(bundle), Bucket=bucket, Key=str(remote_path), ExtraArgs=upload_args, Callback=progress_bar.update
+    )
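publish_snapshot depends on pixeltable.utils.sha256sum, added in pixeltable/utils/__init__.py (+41, not shown in this diff). A typical chunked implementation, offered only as a sketch of what such a helper presumably does:

import hashlib
from pathlib import Path


def sha256sum(path: Path) -> str:
    # Hash the file in fixed-size chunks so large bundles never need to fit in memory.
    h = hashlib.sha256()
    with open(path, 'rb') as fp:
        for chunk in iter(lambda: fp.read(1 << 20), b''):  # 1 MiB chunks
            h.update(chunk)
    return h.hexdigest()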
pixeltable/type_system.py
CHANGED
@@ -8,10 +8,9 @@ import json
 import typing
 import urllib.parse
 import urllib.request
-from pathlib import Path
 from typing import Any, Iterable, Literal, Mapping, Optional, Sequence, Union
 
-import av
+import av
 import jsonschema
 import jsonschema.protocols
 import jsonschema.validators
@@ -22,6 +21,7 @@ import sqlalchemy as sql
 from typing_extensions import _AnnotatedAlias
 
 import pixeltable.exceptions as excs
+from pixeltable.utils import parse_local_file_path
 
 from typing import _GenericAlias  # type: ignore[attr-defined]  # isort: skip
 
@@ -47,8 +47,8 @@ class ColumnType:
     @classmethod
     def supertype(
         cls,
-        type1: 'ColumnType.Type',
-        type2: 'ColumnType.Type',
+        type1: Optional['ColumnType.Type'],
+        type2: Optional['ColumnType.Type'],
         # we need to pass this in because we can't easily append it as a class member
        common_supertypes: dict[tuple['ColumnType.Type', 'ColumnType.Type'], 'ColumnType.Type'],
    ) -> Optional['ColumnType.Type']:
@@ -93,6 +93,9 @@ class ColumnType:
         self._type = t
         self._nullable = nullable
 
+    def has_supertype(self) -> bool:
+        return True
+
     @property
     def nullable(self) -> bool:
         return self._nullable
@@ -271,8 +274,10 @@ class ColumnType:
                 inferred_type = val_type
             else:
                 inferred_type = inferred_type.supertype(val_type)
-
-
+            if inferred_type is None:
+                return None
+            if not inferred_type.has_supertype():
+                return inferred_type
         return inferred_type
 
     @classmethod
@@ -397,12 +402,9 @@ class ColumnType:
     def _validate_file_path(self, val: Any) -> None:
         """Raises TypeError if not a valid local file path or not a path/byte sequence"""
         if isinstance(val, str):
-
-            if
-
-            path = Path(urllib.parse.unquote(urllib.request.url2pathname(parsed.path)))
-            if not path.is_file():
-                raise TypeError(f'File not found: {str(path)}')
+            path = parse_local_file_path(val)
+            if path is not None and not path.is_file():
+                raise TypeError(f'File not found: {path}')
         else:
             if not isinstance(val, bytes):
                 raise TypeError(f'expected file path or bytes, got {type(val)}')
@@ -495,7 +497,7 @@ class InvalidType(ColumnType):
         super().__init__(self.Type.INVALID, nullable=nullable)
 
     def to_sa_type(self) -> sql.types.TypeEngine:
-
+        return sql.types.NullType()
 
     def print_value(self, val: Any) -> str:
         return str(val)
@@ -508,6 +510,9 @@ class StringType(ColumnType):
     def __init__(self, nullable: bool = False):
         super().__init__(self.Type.STRING, nullable=nullable)
 
+    def has_supertype(self):
+        return not self.nullable
+
     def to_sa_type(self) -> sql.types.TypeEngine:
         return sql.String()
 
@@ -591,6 +596,9 @@ class TimestampType(ColumnType):
     def __init__(self, nullable: bool = False):
         super().__init__(self.Type.TIMESTAMP, nullable=nullable)
 
+    def has_supertype(self):
+        return not self.nullable
+
     def to_sa_type(self) -> sql.types.TypeEngine:
         return sql.TIMESTAMP(timezone=True)
 
@@ -601,6 +609,8 @@ class TimestampType(ColumnType):
     def _create_literal(self, val: Any) -> Any:
         if isinstance(val, str):
             return datetime.datetime.fromisoformat(val)
+        if isinstance(val, datetime.datetime):
+            return val
         return val
 
 
@@ -651,6 +661,10 @@ class JsonType(ColumnType):
         return val_type.print_value(val)
 
     def _validate_literal(self, val: Any) -> None:
+        if isinstance(val, tuple):
+            val = list(val)
+        if isinstance(val, pydantic.BaseModel):
+            val = val.model_dump()
         if not self.__is_valid_json(val):
             raise TypeError(f'That literal is not a valid Pixeltable JSON object: {val}')
         if self.__validator is not None:
@@ -818,14 +832,20 @@ class ArrayType(ColumnType):
         return hash((self._type, self.nullable, self.shape, self.dtype))
 
     def supertype(self, other: ColumnType) -> Optional[ArrayType]:
+        basic_supertype = super().supertype(other)
+        if basic_supertype is not None:
+            assert isinstance(basic_supertype, ArrayType)
+            return basic_supertype
+
         if not isinstance(other, ArrayType):
             return None
+
         super_dtype = self.Type.supertype(self.dtype, other.dtype, self.common_supertypes)
         if super_dtype is None:
             # if the dtypes are incompatible, then the supertype is a fully general array
             return ArrayType(nullable=(self.nullable or other.nullable))
         super_shape: Optional[tuple[Optional[int], ...]]
-        if len(self.shape) != len(other.shape):
+        if self.shape is None or other.shape is None or len(self.shape) != len(other.shape):
             super_shape = None
         else:
             super_shape = tuple(n1 if n1 == n2 else None for n1, n2 in zip(self.shape, other.shape))
@@ -1009,8 +1029,14 @@ class ImageType(ColumnType):
         return hash((self._type, self.nullable, self.size, self.mode))
 
     def supertype(self, other: ColumnType) -> Optional[ImageType]:
+        basic_supertype = super().supertype(other)
+        if basic_supertype is not None:
+            assert isinstance(basic_supertype, ImageType)
+            return basic_supertype
+
         if not isinstance(other, ImageType):
             return None
+
         width = self.width if self.width == other.width else None
         height = self.height if self.height == other.height else None
         mode = self.mode if self.mode == other.mode else None