PyPI - pixeltable - Versions diffs - 0.3.3__py3-none-any.whl → 0.3.4__py3-none-any.whl - Mend

pixeltable 0.3.3-py3-none-any.whl → 0.3.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of pixeltable might be problematic. Click here for more details.

Files changed (18)

pixeltable/__version__.py +2 -2
pixeltable/dataframe.py +4 -9
pixeltable/env.py +8 -3
pixeltable/exec/component_iteration_node.py +1 -2
pixeltable/exprs/expr.py +7 -0
pixeltable/functions/openai.py +35 -10
pixeltable/io/pandas.py +3 -14
pixeltable/share/__init__.py +0 -0
pixeltable/share/packager.py +218 -0
pixeltable/type_system.py +47 -28
pixeltable/utils/arrow.py +6 -6
pixeltable/utils/iceberg.py +14 -0
pixeltable/utils/media_store.py +1 -1
{pixeltable-0.3.3.dist-info → pixeltable-0.3.4.dist-info}/METADATA +4 -2
{pixeltable-0.3.3.dist-info → pixeltable-0.3.4.dist-info}/RECORD +18 -15
{pixeltable-0.3.3.dist-info → pixeltable-0.3.4.dist-info}/LICENSE +0 -0
{pixeltable-0.3.3.dist-info → pixeltable-0.3.4.dist-info}/WHEEL +0 -0
{pixeltable-0.3.3.dist-info → pixeltable-0.3.4.dist-info}/entry_points.txt +0 -0

pixeltable/__version__.py CHANGED Viewed

@@ -1,3 +1,3 @@
 # These version placeholders will be replaced during build.
-__version__ = '0.3.3'
-__version_tuple__ = (0, 3, 3)
+__version__ = '0.3.4'
+__version_tuple__ = (0, 3, 4)

pixeltable/dataframe.py CHANGED Viewed

@@ -578,15 +578,9 @@ class DataFrame:
         # analyze select list; wrap literals with the corresponding expressions
         select_list: list[tuple[exprs.Expr, Optional[str]]] = []
         for raw_expr, name in base_list:
-            if isinstance(raw_expr, exprs.Expr):
-                select_list.append((raw_expr, name))
-            elif isinstance(raw_expr, (dict, list, tuple)):
-                select_list.append((exprs.Expr.from_object(raw_expr), name))
-            elif isinstance(raw_expr, np.ndarray):
-                select_list.append((exprs.Expr.from_array(raw_expr), name))
-            else:
-                select_list.append((exprs.Literal(raw_expr), name))
-            expr = select_list[-1][0]
+            expr = exprs.Expr.from_object(raw_expr)
+            if expr is None:
+                raise excs.Error(f'Invalid expression: {raw_expr}')
             if expr.col_type.is_invalid_type():
                 raise excs.Error(f'Invalid type: {raw_expr}')
             if not expr.is_bound_by(self._from_clause.tbls):
@@ -594,6 +588,7 @@ class DataFrame:
                     f"Expression '{expr}' cannot be evaluated in the context of this query's tables "
                     f'({",".join(tbl.tbl_name() for tbl in self._from_clause.tbls)})'
                 )
+            select_list.append((expr, name))
         # check user provided names do not conflict among themselves or with auto-generated ones
         seen: set[str] = set()

pixeltable/env.py CHANGED Viewed

@@ -333,9 +333,7 @@ class Env:
         http_logger.addHandler(http_fh)
         http_logger.propagate = False
-        # empty tmp dir
-        for path in glob.glob(f'{self._tmp_dir}/*'):
-            os.remove(path)
+        self.clear_tmp_dir()
         self._db_name = os.environ.get('PIXELTABLE_DB', 'pixeltable')
         self._pgdata_dir = Path(os.environ.get('PIXELTABLE_PGDATA', str(self._home / 'pgdata')))
@@ -628,6 +626,13 @@ class Env:
             )
             self.__optional_packages['spacy'].is_installed = False
+    def clear_tmp_dir(self) -> None:
+        for path in glob.glob(f'{self._tmp_dir}/*'):
+            if os.path.isdir(path):
+                shutil.rmtree(path)
+            else:
+                os.remove(path)
     def num_tmp_files(self) -> int:
         return len(glob.glob(f'{self._tmp_dir}/*'))

pixeltable/exec/component_iteration_node.py CHANGED Viewed

@@ -1,5 +1,4 @@
-import inspect
-from typing import AsyncIterator, Iterator, Optional
+from typing import AsyncIterator
 import pixeltable.catalog as catalog
 import pixeltable.exceptions as excs

pixeltable/exprs/expr.py CHANGED Viewed

@@ -10,6 +10,7 @@ import typing
 from typing import TYPE_CHECKING, Any, Callable, Iterable, Iterator, Optional, TypeVar, Union, overload
 from uuid import UUID
+import numpy as np
 import sqlalchemy as sql
 from typing_extensions import Self, _AnnotatedAlias
@@ -379,6 +380,12 @@ class Expr(abc.ABC):
     @classmethod
     def from_array(cls, elements: Iterable) -> Optional[Expr]:
         from .inline_expr import InlineArray
+        from .literal import Literal
+        if isinstance(elements, np.ndarray):
+            pxttype = ts.ArrayType.from_literal(elements)
+            if pxttype is not None:
+                return Literal(elements, col_type=pxttype)
         inline_array = InlineArray(elements)
         return inline_array.maybe_literal()

pixeltable/functions/openai.py CHANGED Viewed

@@ -14,7 +14,7 @@ import math
 import pathlib
 import re
 import uuid
-from typing import TYPE_CHECKING, Any, Callable, Optional, Type, TypeVar, Union, cast
+from typing import TYPE_CHECKING, Any, Callable, Literal, Optional, Type, TypeVar, Union, cast
 import httpx
 import numpy as np
@@ -324,10 +324,17 @@ async def translations(
 # Chat Endpoints
+def _default_max_tokens(model: str) -> int:
+    if model in ('o1', 'o3-mini'):
+        return 65536
+    else:
+        return 1024
 def _chat_completions_get_request_resources(
-    messages: list, max_tokens: Optional[int], n: Optional[int]
+    messages: list, model: str, max_completion_tokens: Optional[int], max_tokens: Optional[int], n: Optional[int]
 ) -> dict[str, int]:
-    completion_tokens = n * max_tokens
+    completion_tokens = (n or 1) * (max_completion_tokens or max_tokens or _default_max_tokens(model))
     num_tokens = 0.0
     for message in messages:
@@ -349,16 +356,18 @@ async def chat_completions(
     logit_bias: Optional[dict[str, int]] = None,
     logprobs: Optional[bool] = None,
     top_logprobs: Optional[int] = None,
-    max_tokens: Optional[int] = 1024,
-    n: Optional[int] = 1,
+    max_completion_tokens: Optional[int] = None,
+    max_tokens: Optional[int] = None,
+    n: Optional[int] = None,
     presence_penalty: Optional[float] = None,
+    reasoning_effort: Optional[Literal['low', 'medium', 'high']] = None,
     response_format: Optional[dict] = None,
     seed: Optional[int] = None,
     stop: Optional[list[str]] = None,
     temperature: Optional[float] = None,
-    top_p: Optional[float] = None,
     tools: Optional[list[dict]] = None,
     tool_choice: Optional[dict] = None,
+    top_p: Optional[float] = None,
     user: Optional[str] = None,
     timeout: Optional[float] = None,
 ) -> dict:
@@ -418,6 +427,9 @@ async def chat_completions(
         resource_pool, lambda: OpenAIRateLimitsInfo(_chat_completions_get_request_resources)
     )
+    if max_completion_tokens is None and max_tokens is None:
+        max_completion_tokens = _default_max_tokens(model)
     # cast(Any, ...): avoid mypy errors
     result = await _openai_client().chat.completions.with_raw_response.create(
         messages=messages,
@@ -426,16 +438,18 @@ async def chat_completions(
         logit_bias=_opt(logit_bias),
         logprobs=_opt(logprobs),
         top_logprobs=_opt(top_logprobs),
+        max_completion_tokens=_opt(max_completion_tokens),
         max_tokens=_opt(max_tokens),
         n=_opt(n),
         presence_penalty=_opt(presence_penalty),
+        reasoning_effort=_opt(reasoning_effort),
         response_format=_opt(cast(Any, response_format)),
         seed=_opt(seed),
         stop=_opt(stop),
         temperature=_opt(temperature),
-        top_p=_opt(top_p),
         tools=_opt(cast(Any, tools)),
         tool_choice=_opt(cast(Any, tool_choice_)),
+        top_p=_opt(top_p),
         user=_opt(user),
         timeout=_opt(timeout),
         extra_body=extra_body,
@@ -448,9 +462,14 @@ async def chat_completions(
 def _vision_get_request_resources(
-    prompt: str, image: PIL.Image.Image, max_tokens: Optional[int], n: Optional[int]
+    prompt: str,
+    image: PIL.Image.Image,
+    model: str,
+    max_completion_tokens: Optional[int],
+    max_tokens: Optional[int],
+    n: Optional[int],
 ) -> dict[str, int]:
-    completion_tokens = n * max_tokens
+    completion_tokens = (n or 1) * (max_completion_tokens or max_tokens or _default_max_tokens(model))
     prompt_tokens = len(prompt) / 4
     # calculate image tokens based on
@@ -482,7 +501,8 @@ async def vision(
     image: PIL.Image.Image,
     *,
     model: str,
-    max_tokens: Optional[int] = 1024,
+    max_completion_tokens: Optional[int] = None,
+    max_tokens: Optional[int] = None,
     n: Optional[int] = 1,
     timeout: Optional[float] = None,
 ) -> str:
@@ -534,9 +554,14 @@ async def vision(
     rate_limits_info = env.Env.get().get_resource_pool_info(
         resource_pool, lambda: OpenAIRateLimitsInfo(_vision_get_request_resources)
     )
+    if max_completion_tokens is None and max_tokens is None:
+        max_completion_tokens = _default_max_tokens(model)
     result = await _openai_client().chat.completions.with_raw_response.create(
         messages=messages,  # type: ignore
         model=model,
+        max_completion_tokens=_opt(max_completion_tokens),
         max_tokens=_opt(max_tokens),
         n=_opt(n),
         timeout=_opt(timeout),

pixeltable/io/pandas.py CHANGED Viewed

@@ -185,20 +185,9 @@ def __np_dtype_to_pxt_type(np_dtype: np.dtype, data_col: pd.Series, nullable: bo
     """
     Infers a Pixeltable type based on a Numpy dtype.
     """
-    if np.issubdtype(np_dtype, np.integer):
-        return pxt.IntType(nullable=nullable)
-    if np.issubdtype(np_dtype, np.floating):
-        return pxt.FloatType(nullable=nullable)
-    if np.issubdtype(np_dtype, np.bool_):
-        return pxt.BoolType(nullable=nullable)
-    if np.issubdtype(np_dtype, np.character):
-        return pxt.StringType(nullable=nullable)
-    if np.issubdtype(np_dtype, np.datetime64):
-        return pxt.TimestampType(nullable=nullable)
+    pxttype = ts.ArrayType.from_np_dtype(np_dtype, nullable)
+    if pxttype is not None:
+        return pxttype
     if np_dtype == np.object_:
         # The `object_` dtype can mean all sorts of things; see if we can infer the Pixeltable type

pixeltable/share/__init__.py ADDED Viewed

File without changes

pixeltable/share/packager.py ADDED Viewed

@@ -0,0 +1,218 @@
+import io
+import json
+import logging
+import tarfile
+import urllib.parse
+import urllib.request
+import uuid
+from pathlib import Path
+from typing import Any, Iterator
+import more_itertools
+import numpy as np
+import pyarrow as pa
+import pyiceberg.catalog
+import pixeltable as pxt
+import pixeltable.type_system as ts
+from pixeltable import exprs
+from pixeltable.env import Env
+from pixeltable.utils.arrow import PXT_TO_PA_TYPES
+from pixeltable.utils.iceberg import sqlite_catalog
+_logger = logging.getLogger('pixeltable')
+class TablePackager:
+    """
+    Packages a pixeltable Table into a tarball containing Iceberg tables and media files. The structure of the tarball
+    is as follows:
+    warehouse/catalog.db  # sqlite Iceberg catalog
+    warehouse/pxt.db/**  # Iceberg metadata and data files (parquet/avro/json)
+    media/**  # Local media files
+    If the table being archived is a view, then the Iceberg catalog will contain separate tables for the view and each
+    of its ancestors. All rows will be exported with additional _rowid and _v_min columns. Currently, only the most
+    recent version of the table can be exported, and only the full table contents.
+    If the table contains media columns, they are handled as follows:
+    - If a media file has an external URL (any URL scheme other than file://), then the URL will be preserved as-is and
+      stored in the Iceberg table.
+    - If a media file is a local file, then it will be copied into the tarball as a file of the form
+      'media/{uuid}{extension}', and the Iceberg table will contain the ephemeral URI 'pxtmedia://{uuid}{extension}'.
+    """
+    table: pxt.Table  # The table to be packaged
+    tmp_dir: Path  # Temporary directory where the package will reside
+    iceberg_catalog: pyiceberg.catalog.Catalog
+    media_files: dict[Path, str]  # Mapping from local media file paths to their tarball names
+    def __init__(self, table: pxt.Table) -> None:
+        self.table = table
+        self.tmp_dir = Path(Env.get().create_tmp_path())
+        self.media_files = {}
+    def package(self) -> Path:
+        """
+        Export the table to a tarball containing Iceberg tables and media files.
+        """
+        assert not self.tmp_dir.exists()  # Packaging can only be done once per TablePackager instance
+        _logger.info(f"Packaging table '{self.table._path}' and its ancestors in: {self.tmp_dir}")
+        self.tmp_dir.mkdir()
+        self.iceberg_catalog = sqlite_catalog(self.tmp_dir / 'warehouse')
+        ancestors = [self.table] + self.table._bases
+        for t in ancestors:
+            _logger.info(f"Exporting table '{t._path}'.")
+            self.__export_table(t)
+        _logger.info(f'Building archive.')
+        bundle_path = self.__build_tarball()
+        _logger.info(f'Packaging complete: {bundle_path}')
+        return bundle_path
+    def __export_table(self, t: pxt.Table) -> None:
+        """
+        Exports the data from `t` into an Iceberg table.
+        """
+        # First generate a select list for the data we want to extract from `t`. This includes:
+        # - all stored columns, including computed columns;
+        # - errortype and errormsg fields whenever they're defined.
+        # We select only those columns that are defined in this table (columns inherited from ancestor tables will be
+        # handled separately).
+        # For media columns, we substitute `col.fileurl` so that we always get the URL (which may be a file:// URL;
+        # these will be specially handled later)
+        select_exprs: dict[str, exprs.Expr] = {}
+        # As we generate the select list, we construct a separate list of column types. We can't rely on df._schema
+        # to get the column types, since we'll be substituting `fileurl`s for media columns.
+        actual_col_types: list[ts.ColumnType] = []
+        for col_name, col in t._tbl_version.cols_by_name.items():
+            if not col.is_stored:
+                continue
+            if col.col_type.is_media_type():
+                select_exprs[col_name] = t[col_name].fileurl
+            else:
+                select_exprs[col_name] = t[col_name]
+            actual_col_types.append(col.col_type)
+            if col.records_errors:
+                select_exprs[f'{col_name}_errortype'] = t[col_name].errortype
+                actual_col_types.append(ts.StringType())
+                select_exprs[f'{col_name}_errormsg'] = t[col_name].errormsg
+                actual_col_types.append(ts.StringType())
+        # Run the select() on `self.table`, not `t`, so that we export only those rows that are actually present in
+        # `self.table`.
+        df = self.table.select(**select_exprs)
+        namespace = self.__iceberg_namespace(t)
+        self.iceberg_catalog.create_namespace_if_not_exists(namespace)
+        iceberg_schema = self.__to_iceberg_schema(df._schema)
+        iceberg_tbl = self.iceberg_catalog.create_table(f'{namespace}.{t._name}', schema=iceberg_schema)
+        # Populate the Iceberg table with data.
+        # The data is first loaded from the DataFrame into a sequence of pyarrow tables, batched in order to avoid
+        # excessive memory usage. The pyarrow tables are then amalgamated into the (single) Iceberg table on disk.
+        for pa_table in self.__to_pa_tables(df, actual_col_types, iceberg_schema):
+            iceberg_tbl.append(pa_table)
+    @classmethod
+    def __iceberg_namespace(cls, table: pxt.Table) -> str:
+        """
+        Iceberg tables must have a namespace, which cannot be the empty string, so we prepend `pxt` to the table path.
+        """
+        parent_path = table._parent._path
+        if len(parent_path) == 0:
+            return 'pxt'
+        else:
+            return f'pxt.{parent_path}'
+    # The following methods are responsible for schema and data conversion from Pixeltable to Iceberg. Some of this
+    # logic might be consolidated into arrow.py and unified with general Parquet export, but there are several
+    # major differences:
+    # - Iceberg has no array type; we export all arrays as binary blobs
+    # - We include _rowid and _v_min columns in the Iceberg table
+    # - Media columns are handled specially as indicated above
+    @classmethod
+    def __to_iceberg_schema(cls, pxt_schema: dict[str, ts.ColumnType]) -> pa.Schema:
+        entries = [(name, cls.__to_iceberg_type(col_type)) for name, col_type in pxt_schema.items()]
+        entries.append(('_rowid', pa.list_(pa.int64())))
+        entries.append(('_v_min', pa.int64()))
+        return pa.schema(entries)  # type: ignore[arg-type]
+    @classmethod
+    def __to_iceberg_type(cls, col_type: ts.ColumnType) -> pa.DataType:
+        if col_type.is_array_type():
+            return pa.binary()
+        if col_type.is_media_type():
+            return pa.string()
+        return PXT_TO_PA_TYPES.get(col_type.__class__)
+    def __to_pa_tables(
+        self,
+        df: pxt.DataFrame,
+        actual_col_types: list[pxt.ColumnType],
+        arrow_schema: pa.Schema,
+        batch_size: int = 1_000,
+    ) -> Iterator[pa.Table]:
+        """
+        Load a DataFrame as a sequence of pyarrow tables. The pyarrow tables are batched into smaller chunks
+        to avoid excessive memory usage.
+        """
+        for rows in more_itertools.batched(self.__to_pa_rows(df, actual_col_types), batch_size):
+            cols = {col_name: [row[idx] for row in rows] for idx, col_name in enumerate(df._schema.keys())}
+            cols['_rowid'] = [row[-2] for row in rows]
+            cols['_v_min'] = [row[-1] for row in rows]
+            yield pa.Table.from_pydict(cols, schema=arrow_schema)
+    def __to_pa_rows(self, df: pxt.DataFrame, actual_col_types: list[pxt.ColumnType]) -> Iterator[list]:
+        for row in df._exec():
+            vals = [row[e.slot_idx] for e in df._select_list_exprs]
+            result = [self.__to_pa_value(val, col_type) for val, col_type in zip(vals, actual_col_types)]
+            result.append(row.rowid)
+            result.append(row.v_min)
+            yield result
+    def __to_pa_value(self, val: Any, col_type: ts.ColumnType) -> Any:
+        if val is None:
+            return None
+        if col_type.is_array_type():
+            # Export arrays as binary
+            assert isinstance(val, np.ndarray)
+            arr = io.BytesIO()
+            np.save(arr, val)
+            return arr.getvalue()
+        if col_type.is_json_type():
+            # Export JSON as strings
+            return json.dumps(val)
+        if col_type.is_media_type():
+            # Handle media files as described above
+            assert isinstance(val, str)  # Media columns are always referenced by `fileurl`
+            return self.__process_media_url(val)
+        return val
+    def __process_media_url(self, url: str) -> str:
+        parsed_url = urllib.parse.urlparse(url)
+        if parsed_url.scheme == 'file':
+            # It's the URL of a local file. Replace it with a pxtmedia:// URI.
+            # (We can't use an actual pxt:// URI, because the eventual pxt:// table name might not be known at this
+            # time. The pxtmedia:// URI serves as a relative reference into the tarball that can be replaced with an
+            # actual URL when the table is reconstituted.)
+            path = Path(urllib.parse.unquote(urllib.request.url2pathname(parsed_url.path)))
+            if path not in self.media_files:
+                # Create a new entry in the `media_files` dict so that we can copy the file into the tarball later.
+                dest_name = f'{uuid.uuid4().hex}{path.suffix}'
+                self.media_files[path] = dest_name
+            return f'pxtmedia://{self.media_files[path]}'
+        # For any type of URL other than a local file, just return the URL as-is.
+        return url
+    def __build_tarball(self) -> Path:
+        bundle_path = self.tmp_dir / 'bundle.tar.bz2'
+        with tarfile.open(bundle_path, 'w:bz2') as tf:
+            # Add the Iceberg warehouse dir (including the catalog)
+            tf.add(self.tmp_dir / 'warehouse', arcname='warehouse', recursive=True)
+            # Add the media files
+            for src_file, dest_name in self.media_files.items():
+                tf.add(src_file, arcname=f'media/{dest_name}')
+        return bundle_path

pixeltable/type_system.py CHANGED Viewed

@@ -9,9 +9,7 @@ import typing
 import urllib.parse
 import urllib.request
 from pathlib import Path
-from typing import _GenericAlias  # type: ignore[attr-defined]  # isort: skip
-from typing import Any, Iterable, Mapping, Optional, Sequence, Union
+from typing import Any, Iterable, Literal, Mapping, Optional, Sequence, Union
 import av  # type: ignore
 import jsonschema
@@ -25,6 +23,8 @@ from typing_extensions import _AnnotatedAlias
 import pixeltable.exceptions as excs
+from typing import _GenericAlias  # type: ignore[attr-defined]  # isort: skip
 class ColumnType:
     @enum.unique
@@ -213,9 +213,9 @@ class ColumnType:
             return self.copy(nullable=(self.nullable or other.nullable))
         if self.is_invalid_type():
-            return other
+            return other.copy(nullable=(self.nullable or other.nullable))
         if other.is_invalid_type():
-            return self
+            return self.copy(nullable=(self.nullable or other.nullable))
         if self.is_scalar_type() and other.is_scalar_type():
             t = self.Type.supertype(self._type, other._type, self.common_supertypes)
@@ -292,26 +292,24 @@ class ColumnType:
                 designations will be allowed regardless.
         """
         origin = typing.get_origin(t)
+        type_args = typing.get_args(t)
         if origin is typing.Union:
             # Check if `t` has the form Optional[T].
-            union_args = typing.get_args(t)
-            if len(union_args) == 2 and type(None) in union_args:
+            if len(type_args) == 2 and type(None) in type_args:
                 # `t` is a type of the form Optional[T] (equivalently, Union[T, None] or Union[None, T]).
                 # We treat it as the underlying type but with nullable=True.
-                underlying_py_type = union_args[0] if union_args[1] is type(None) else union_args[1]
+                underlying_py_type = type_args[0] if type_args[1] is type(None) else type_args[1]
                 underlying = cls.from_python_type(underlying_py_type, allow_builtin_types=allow_builtin_types)
                 if underlying is not None:
                     return underlying.copy(nullable=True)
         elif origin is Required:
-            required_args = typing.get_args(t)
-            assert len(required_args) == 1
+            assert len(type_args) == 1
             return cls.from_python_type(
-                required_args[0], nullable_default=False, allow_builtin_types=allow_builtin_types
-            )
+                type_args[0], nullable_default=False, allow_builtin_types=allow_builtin_types
+            ).copy(nullable=False)
         elif origin is typing.Annotated:
-            annotated_args = typing.get_args(t)
-            origin = annotated_args[0]
-            parameters = annotated_args[1]
+            origin = type_args[0]
+            parameters = type_args[1]
             if isinstance(parameters, ColumnType):
                 return parameters.copy(nullable=nullable_default)
         else:
@@ -323,6 +321,11 @@ class ColumnType:
             if isinstance(t, type) and issubclass(t, _PxtType):
                 return t.as_col_type(nullable=nullable_default)
             elif allow_builtin_types:
+                if t is Literal and len(type_args) > 0:
+                    literal_type = cls.infer_common_literal_type(type_args)
+                    if literal_type is None:
+                        return None
+                    return literal_type.copy(nullable=(literal_type.nullable or nullable_default))
                 if t is str:
                     return StringType(nullable=nullable_default)
                 if t is int:
@@ -335,7 +338,7 @@ class ColumnType:
                     return TimestampType(nullable=nullable_default)
                 if t is PIL.Image.Image:
                     return ImageType(nullable=nullable_default)
-                if issubclass(t, Sequence) or issubclass(t, Mapping) or issubclass(t, pydantic.BaseModel):
+                if isinstance(t, type) and issubclass(t, (Sequence, Mapping, pydantic.BaseModel)):
                     return JsonType(nullable=nullable_default)
         return None
@@ -851,23 +854,39 @@ class ArrayType(ColumnType):
         dtype = None if d['dtype'] is None else cls.make_type(cls.Type(d['dtype']))
         return cls(shape, dtype, nullable=d['nullable'])
+    @classmethod
+    def from_np_dtype(cls, dtype: np.dtype, nullable: bool) -> Optional[ColumnType]:
+        """
+        Return pixeltable type corresponding to a given simple numpy dtype
+        """
+        if np.issubdtype(dtype, np.integer):
+            return IntType(nullable=nullable)
+        if np.issubdtype(dtype, np.floating):
+            return FloatType(nullable=nullable)
+        if dtype == np.bool_:
+            return BoolType(nullable=nullable)
+        if np.issubdtype(dtype, np.str_):
+            return StringType(nullable=nullable)
+        if np.issubdtype(dtype, np.character):
+            return StringType(nullable=nullable)
+        if np.issubdtype(dtype, np.datetime64):
+            return TimestampType(nullable=nullable)
+        return None
     @classmethod
     def from_literal(cls, val: np.ndarray, nullable: bool = False) -> Optional[ArrayType]:
         # determine our dtype
         assert isinstance(val, np.ndarray)
-        dtype: ColumnType
-        if np.issubdtype(val.dtype, np.integer):
-            dtype = IntType()
-        elif np.issubdtype(val.dtype, np.floating):
-            dtype = FloatType()
-        elif val.dtype == np.bool_:
-            dtype = BoolType()
-        elif np.issubdtype(val.dtype, np.str_):
-            # Note that this includes NumPy types like '<U1' -- arrays of single Unicode characters
-            dtype = StringType()
-        else:
+        pxttype: Optional[ColumnType] = cls.from_np_dtype(val.dtype, nullable)
+        if pxttype == None:
             return None
-        return cls(val.shape, dtype=dtype, nullable=nullable)
+        return cls(val.shape, dtype=pxttype, nullable=nullable)
     def is_valid_literal(self, val: np.ndarray) -> bool:
         if not isinstance(val, np.ndarray):

pixeltable/utils/arrow.py CHANGED Viewed

@@ -6,7 +6,7 @@ import pyarrow as pa
 import pixeltable.type_system as ts
-_pa_to_pt: dict[pa.DataType, ts.ColumnType] = {
+PA_TO_PXT_TYPES: dict[pa.DataType, ts.ColumnType] = {
     pa.string(): ts.StringType(nullable=True),
     pa.bool_(): ts.BoolType(nullable=True),
     pa.uint8(): ts.IntType(nullable=True),
@@ -18,7 +18,7 @@ _pa_to_pt: dict[pa.DataType, ts.ColumnType] = {
     pa.float32(): ts.FloatType(nullable=True),
 }
-_pt_to_pa: dict[type[ts.ColumnType], pa.DataType] = {
+PXT_TO_PA_TYPES: dict[type[ts.ColumnType], pa.DataType] = {
     ts.StringType: pa.string(),
     ts.TimestampType: pa.timestamp('us', tz=datetime.timezone.utc),  # postgres timestamp is microseconds
     ts.BoolType: pa.bool_(),
@@ -38,8 +38,8 @@ def to_pixeltable_type(arrow_type: pa.DataType) -> Optional[ts.ColumnType]:
     """
     if isinstance(arrow_type, pa.TimestampType):
         return ts.TimestampType(nullable=True)
-    elif arrow_type in _pa_to_pt:
-        return _pa_to_pt[arrow_type]
+    elif arrow_type in PA_TO_PXT_TYPES:
+        return PA_TO_PXT_TYPES[arrow_type]
     elif isinstance(arrow_type, pa.FixedShapeTensorType):
         dtype = to_pixeltable_type(arrow_type.value_type)
         if dtype is None:
@@ -53,8 +53,8 @@ def to_arrow_type(pixeltable_type: ts.ColumnType) -> Optional[pa.DataType]:
     """Convert a pixeltable DataType to a pyarrow datatype if one is defined.
     Returns None if no conversion is currently implemented.
     """
-    if pixeltable_type.__class__ in _pt_to_pa:
-        return _pt_to_pa[pixeltable_type.__class__]
+    if pixeltable_type.__class__ in PXT_TO_PA_TYPES:
+        return PXT_TO_PA_TYPES[pixeltable_type.__class__]
     elif isinstance(pixeltable_type, ts.ArrayType):
         return pa.fixed_shape_tensor(pa.from_numpy_dtype(pixeltable_type.numpy_dtype()), pixeltable_type.shape)
     else:

pixeltable/utils/iceberg.py ADDED Viewed

@@ -0,0 +1,14 @@
+from pathlib import Path
+from typing import Union
+from pyiceberg.catalog.sql import SqlCatalog
+def sqlite_catalog(warehouse_path: Union[str, Path], name: str = 'pixeltable') -> SqlCatalog:
+    """
+    Instantiate a sqlite Iceberg catalog at the specified path. If no catalog exists, one will be created.
+    """
+    if isinstance(warehouse_path, str):
+        warehouse_path = Path(warehouse_path)
+    warehouse_path.mkdir(exist_ok=True)
+    return SqlCatalog(name, uri=f'sqlite:///{warehouse_path}/catalog.db', warehouse=f'file://{warehouse_path}')

pixeltable/utils/media_store.py CHANGED Viewed

@@ -30,7 +30,7 @@ class MediaStore:
         the environment's media_dir.
         """
         id_hex = uuid.uuid4().hex
-        parent = Env.get().media_dir / tbl_id.hex / id_hex[0:2] / id_hex[0:4]
+        parent = Env.get().media_dir / tbl_id.hex / id_hex[:2] / id_hex[:4]
         parent.mkdir(parents=True, exist_ok=True)
         return parent / f'{tbl_id.hex}_{col_id}_{version}_{id_hex}{ext or ""}'

{pixeltable-0.3.3.dist-info → pixeltable-0.3.4.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: pixeltable
-Version: 0.3.3
+Version: 0.3.4
 Summary: AI Data Infrastructure: Declarative, Multimodal, and Incremental
 Home-page: https://pixeltable.com/
 License: Apache-2.0
@@ -39,11 +39,13 @@ Requires-Dist: numpy (>=1.25,<2.0)
 Requires-Dist: pandas (>=2.0,<3.0)
 Requires-Dist: pgvector (>=0.2.1,<0.3.0)
 Requires-Dist: pillow (>=9.3.0)
-Requires-Dist: pixeltable-pgserver (==0.2.9)
+Requires-Dist: pixeltable-pgserver (==0.3.1)
 Requires-Dist: psutil (>=5.9.5,<6.0.0)
 Requires-Dist: psycopg[binary] (>=3.1.18)
 Requires-Dist: puremagic (>=1.20)
+Requires-Dist: pyarrow (>=13.0.0)
 Requires-Dist: pydantic (>=2.7.4)
+Requires-Dist: pyiceberg (>=0.6.0)
 Requires-Dist: pymupdf (>=1.24.1,<2.0.0)
 Requires-Dist: pyyaml (>=6.0.1,<7.0.0)
 Requires-Dist: requests (>=2.31.0,<3.0.0)

{pixeltable-0.3.3.dist-info → pixeltable-0.3.4.dist-info}/RECORD RENAMED Viewed

@@ -1,5 +1,5 @@
 pixeltable/__init__.py,sha256=FeL_ABFaY6QiShtTao1cfhSAwXV_2dkhL_4-qXoHbPE,1616
-pixeltable/__version__.py,sha256=NMCNPWfp4W0_zblLn-1M1FNbW4Fe6XSxnsm2uSwk7eA,112
+pixeltable/__version__.py,sha256=a50-dZlwYU667r1CN3zUS6OONPFGlyZFnAAe8vTD1k8,112
 pixeltable/catalog/__init__.py,sha256=bACh33HpWQed86eV8t9of_ClSXqZx5blZi4y8vJ7-EA,517
 pixeltable/catalog/catalog.py,sha256=LFaOtHoGJM306jDlyyQRqCaPR6K4nrN-jPu3_vyZNvc,8267
 pixeltable/catalog/column.py,sha256=9Rm4DCP-uUCl3P44uTsD89P63jxmvv9emD2Rc7Bw_us,9684
@@ -14,13 +14,13 @@ pixeltable/catalog/table.py,sha256=qfTI7obvSanFt96-jbjSXU9PyninU3_B9K4pnaxlJdM,6
 pixeltable/catalog/table_version.py,sha256=rWBtgnIepVgq5tZ4vb9RzAL5peHnze5ZMOr-7gqMpog,60354
 pixeltable/catalog/table_version_path.py,sha256=yDU_KXriAckJqKPfKYhLVDig7glUc--_Fda9X7ekfGo,5810
 pixeltable/catalog/view.py,sha256=cTL1jBYHa3RweODoD-y_I9NjAntqJPSofP4BJdSWaBA,11226
-pixeltable/dataframe.py,sha256=hGYjMFE3Fwftgdsveo4eXd5SiGXl3uJOaIoH3wm61Po,49473
-pixeltable/env.py,sha256=8gWyNYnIufet8kbGpa-QNsVaEdTJGbCymUwq4XQpC2k,35723
+pixeltable/dataframe.py,sha256=9eMkOUKYpcml6y_Nsj9nTY_UHaDyzo1GT1c6IfzWfXo,49177
+pixeltable/env.py,sha256=1IN2Tju45H-ADNhMfVRDOQ11udBxo4L_euZ6gQKiRC8,35860
 pixeltable/exceptions.py,sha256=NuFY2WtkQpLfLHT_J70kOw9Tr0kEDkkgo-u7As4Gaq4,410
 pixeltable/exec/__init__.py,sha256=Qi0s2BEM8O8MPdYGQAIzclv2GNFsoCPJFvA6s5Tjc_o,489
 pixeltable/exec/aggregation_node.py,sha256=KR7OLQOfAL4KTF6_vKSuJvFC2ntwWf0NJxhQ9i340-4,4072
 pixeltable/exec/cache_prefetch_node.py,sha256=fwO-xUQfSOMWQMbrJplFXvjcKjLVjPz93O0HttSD3A8,12211
-pixeltable/exec/component_iteration_node.py,sha256=vYELAMtc4jKOxC0aZFjjx6UBlBcjC3LXG93epGHPJn0,4713
+pixeltable/exec/component_iteration_node.py,sha256=b3tyspAuYLYHlb7BvAWqDpMGJojSeqtP-l8x72OGjvA,4678
 pixeltable/exec/data_row_batch.py,sha256=E0SVjyOBc237DopT0TwqK7JzcgFTEpE3xOS9K0-WFh8,3407
 pixeltable/exec/exec_context.py,sha256=l7GWAbt57H9VEksrDCeocmlc-MgUp8w_nDdAau8Cfqw,1115
 pixeltable/exec/exec_node.py,sha256=RbMJLDy7jwphNCEphSL0w50Dy1lrpjtEEugzyL6pqlA,4006
@@ -41,7 +41,7 @@ pixeltable/exprs/column_ref.py,sha256=MBWrNwnbRe0Hswu0q_Arerm9JoQs_0pNSsCYVxXONx
 pixeltable/exprs/comparison.py,sha256=5Bw6fEvVq-ynt3ciGLCouse7ZWFGPA-egsEkgUjUvsc,5132
 pixeltable/exprs/compound_predicate.py,sha256=ZN_JL97OZfTwdfgXF2t57EGyTYrpsBHaduZWRuBAekk,3832
 pixeltable/exprs/data_row.py,sha256=4lEyTxTw95v3ERuG9mFUBla8FfhPueoZyltcpTsWLK0,10577
-pixeltable/exprs/expr.py,sha256=uE8_hMVF1fCILVR4DWKR6WyC7ovp9iY1mCpsrI3eQ_U,32208
+pixeltable/exprs/expr.py,sha256=r7eS6-7RCHemYBv_Ap1U9IKcZHpVqAghpxHcCpuk6uY,32463
 pixeltable/exprs/expr_dict.py,sha256=wf82K-aCPHZcM2A-VbE_0p5OzQFfVsI65uzMLp4Uwu4,1589
 pixeltable/exprs/expr_set.py,sha256=kkcG9df8fQOblNIKz2xciw9qfu2CnTWb4qwJKYVTUx8,2578
 pixeltable/exprs/function_call.py,sha256=3zjWl_vAKHpClR61-wpNNfPWYp5ccHO8CXD3Dts2bcs,28123
@@ -88,7 +88,7 @@ pixeltable/functions/llama_cpp.py,sha256=1nVXgU5ymuNblVNqRQv3iAEvlYpqzDZPAjYnAOH
 pixeltable/functions/math.py,sha256=WPoH9zD9_GdwvBs-FSC3Sqb70gOPNouhPcBZABsuLwI,1541
 pixeltable/functions/mistralai.py,sha256=H2onsnW1R_SaFN5SI_JWO0A5lJdlsnKxmtIu2m19cEg,6212
 pixeltable/functions/ollama.py,sha256=Et0l7XEMaNLxDwy3qTblljomjCkOQroY1Z7a-Ajmshk,4218
-pixeltable/functions/openai.py,sha256=mdeo4Y-wg-9LJAlmLxydu3VAS4NGgRQQvVb_7Gkefpc,28109
+pixeltable/functions/openai.py,sha256=Oc_WApfR8M_-EgUEwV1BBuQwkmhunLUGqUVl5CWDTnA,29083
 pixeltable/functions/replicate.py,sha256=BQ5iaFJnw5MioL3X08DQiH41xQ_Pi2H5DDEasux9-fE,2454
 pixeltable/functions/string.py,sha256=1vFlbqKVm2n6jdh23BIA_8MBJJiNyxbQoFs5tJPgpy4,20433
 pixeltable/functions/timestamp.py,sha256=KKOw7l1hErYp8QQfFiWVTf7QowZszOyHJu-OJDKaXSg,9114
@@ -108,7 +108,7 @@ pixeltable/io/fiftyone.py,sha256=nviYiqDOGS5Os374Tl5knGNXpjFlgqcKnSPsBzz60vU,685
 pixeltable/io/globals.py,sha256=0X0sLpVrqPlgNna_vQX4KcBuerdUojZDTyTaX2sKV4I,17838
 pixeltable/io/hf_datasets.py,sha256=DV_bHB-LOQB8YC9FK1KYTEgaBPFelk31fYpq8h72eEE,8321
 pixeltable/io/label_studio.py,sha256=Dlq-2iVBadDnU0xOn3duLbpBJxiegY04XkWsmqQTXwk,31242
-pixeltable/io/pandas.py,sha256=Z-hBUbC6t-dGfJe8ksYXjp8k6T9xGvwvpbIXZLekHbw,9952
+pixeltable/io/pandas.py,sha256=eKoo0tTPnKJUGOIc8VUV1gamsoeOPO6pOtXJyEV_W84,9594
 pixeltable/io/parquet.py,sha256=2i3YAQd-ZifxJv4JUU5Ysh7p6SemozBncd989bSl_qw,8745
 pixeltable/iterators/__init__.py,sha256=r5NYNF7qsepOPJnywG5N7jTz3Z1ubrbSzD19JK97cCM,431
 pixeltable/iterators/audio.py,sha256=UfWAzUAq33bqN5R7-kFK4LN2VUukhgZhAsnoHuOm2CU,9092
@@ -139,10 +139,12 @@ pixeltable/metadata/notes.py,sha256=2gQ0fAdAWOKxvzZ5DVBdmTk62q_KFGRFmv0tzi7tklE,
 pixeltable/metadata/schema.py,sha256=kv-PIMfG_NysET1k71iwIkBVlK5HwdnotXUvFeLaxaY,9470
 pixeltable/plan.py,sha256=ZTXpt10Rexvfm3_68CLQzUAS7YubZjbUJLbAN-RZDps,42385
 pixeltable/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+pixeltable/share/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+pixeltable/share/packager.py,sha256=QcMRI5qihNzO9Wcku-KpA8N7jUCkygrJUyyHB5XAGAA,10233
 pixeltable/store.py,sha256=uQKW9A3RWVVuP6bnQx22jhs5_WxQKx3rV0sGpdoBUzY,22555
-pixeltable/type_system.py,sha256=yTMSt8hljouXH3jZ0xMinhNDMCVZB0dVTZRXejBcODU,50183
+pixeltable/type_system.py,sha256=c1kVcnX2Siu_V4DDn6DVF7nnDSNzFlDFw583WnWsUIc,50927
 pixeltable/utils/__init__.py,sha256=UYlrf6TIWJT0g-Hac0b34-dEk478B5Qx8dGco34YlIk,439
-pixeltable/utils/arrow.py,sha256=L0JFj6YQry1iHqhom6Zc9zWa8j6VCEUgQ0OfKqTiukY,3865
+pixeltable/utils/arrow.py,sha256=EVFTHXt1r1b-rbvgG-TOjvl6GiAtm1hH-86A449cKTw,3901
 pixeltable/utils/coco.py,sha256=dl-IYO4VgfFly4-TvvF9Rw9XK2yY6HGTuL7LcyQk_RA,7290
 pixeltable/utils/code.py,sha256=SbG5OUF_fQAbOgGZHDuENijmbzisVqa4VS9guaZ0KtU,1231
 pixeltable/utils/console_output.py,sha256=GJ1oJWanP8_an343CEB35rtc1kcVW1FQtT3vRT4SZPs,1148
@@ -151,13 +153,14 @@ pixeltable/utils/documents.py,sha256=APFujdYq1qe2Do4KAUI0te35jh4925geR9UB8GeFQ1w
 pixeltable/utils/filecache.py,sha256=sYofh-6TwkQbwe8X64eUt27itSJ8o5rY10HYZJShbbI,10703
 pixeltable/utils/formatter.py,sha256=5E_gDg11ClFI-5SthwkiqyE3hAok3JHDj4OSK9cJklM,9257
 pixeltable/utils/http_server.py,sha256=zsESVjtG1P6hrz-d2N1m6_BChqPt8N3f-EO9sJbWnLs,2388
-pixeltable/utils/media_store.py,sha256=LcVTF8CW9C54mGg6OHI5u9W-gh5CkIfxbQaP9WAkmag,3093
+pixeltable/utils/iceberg.py,sha256=L_s9G9NMIGMQdRHtNkks6ntTVW4DKKAw97R9gRmtw5s,553
+pixeltable/utils/media_store.py,sha256=kSQ6YwQPRQzOhhCChS2hYmY9HxXX1fRq_M_FgkfsYU8,3091
 pixeltable/utils/pytorch.py,sha256=8lJT1SyP9jTMN7uLtrj9T_rGPEYRID44rWXbjBhRUrU,3422
 pixeltable/utils/s3.py,sha256=pxip2MlCqd2Qon2dzJXzfxvwtZyc-BAsjAnLL4J_OXY,587
 pixeltable/utils/sql.py,sha256=JX_fNI_SJWVUcXif5ho5qVhfJKFupOCFLLrHCMcbzLk,796
 pixeltable/utils/transactional_directory.py,sha256=4Q8UTylEyw-aZa-NVjfjGR9_JHRJTGQH1k1LNFaZukY,1349
-pixeltable-0.3.3.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-pixeltable-0.3.3.dist-info/METADATA,sha256=s4trJASrbIe9hPC3MHXe0Tsvo7Fc0avMjgOpukZ7Hsw,19359
-pixeltable-0.3.3.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
-pixeltable-0.3.3.dist-info/entry_points.txt,sha256=ToOd-pRgG7AitEBgYoBCRRB4-KVDQ0pj_9T4a1LgwA4,97
-pixeltable-0.3.3.dist-info/RECORD,,
+pixeltable-0.3.4.dist-info/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+pixeltable-0.3.4.dist-info/METADATA,sha256=nM9QtJyu9ljdyn9ktpCuNLf9uaReun1Lo83BG9zR9Z4,19428
+pixeltable-0.3.4.dist-info/WHEEL,sha256=Nq82e9rUAnEjt98J6MlVmMCZb-t9cYE2Ir1kpBmnWfs,88
+pixeltable-0.3.4.dist-info/entry_points.txt,sha256=ToOd-pRgG7AitEBgYoBCRRB4-KVDQ0pj_9T4a1LgwA4,97
+pixeltable-0.3.4.dist-info/RECORD,,

{pixeltable-0.3.3.dist-info → pixeltable-0.3.4.dist-info}/LICENSE RENAMED Viewed

File without changes

{pixeltable-0.3.3.dist-info → pixeltable-0.3.4.dist-info}/WHEEL RENAMED Viewed

File without changes

{pixeltable-0.3.3.dist-info → pixeltable-0.3.4.dist-info}/entry_points.txt RENAMED Viewed

File without changes

pixeltable 0.3.3__py3-none-any.whl → 0.3.4__py3-none-any.whl

Potentially problematic release.

pixeltable 0.3.3__py3-none-any.whl → 0.3.4__py3-none-any.whl