pixeltable 0.3.4__py3-none-any.whl → 0.3.6__py3-none-any.whl

This diff shows the contents of publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release.

Files changed (63)
  1. pixeltable/__init__.py +1 -0
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/catalog.py +9 -2
  4. pixeltable/catalog/column.py +1 -1
  5. pixeltable/catalog/dir.py +1 -1
  6. pixeltable/catalog/table.py +3 -1
  7. pixeltable/catalog/table_version.py +12 -2
  8. pixeltable/catalog/table_version_path.py +2 -2
  9. pixeltable/catalog/view.py +64 -20
  10. pixeltable/dataframe.py +11 -6
  11. pixeltable/env.py +12 -0
  12. pixeltable/exec/expr_eval/evaluators.py +4 -2
  13. pixeltable/exec/expr_eval/expr_eval_node.py +4 -1
  14. pixeltable/exprs/comparison.py +8 -4
  15. pixeltable/exprs/data_row.py +9 -7
  16. pixeltable/exprs/expr.py +2 -2
  17. pixeltable/exprs/function_call.py +155 -313
  18. pixeltable/exprs/json_mapper.py +25 -8
  19. pixeltable/exprs/json_path.py +6 -5
  20. pixeltable/exprs/object_ref.py +16 -5
  21. pixeltable/exprs/row_builder.py +10 -3
  22. pixeltable/func/aggregate_function.py +29 -15
  23. pixeltable/func/callable_function.py +11 -8
  24. pixeltable/func/expr_template_function.py +3 -9
  25. pixeltable/func/function.py +148 -74
  26. pixeltable/func/signature.py +65 -30
  27. pixeltable/func/tools.py +26 -26
  28. pixeltable/func/udf.py +1 -1
  29. pixeltable/functions/__init__.py +1 -0
  30. pixeltable/functions/anthropic.py +9 -3
  31. pixeltable/functions/deepseek.py +121 -0
  32. pixeltable/functions/image.py +7 -7
  33. pixeltable/functions/openai.py +30 -13
  34. pixeltable/functions/video.py +14 -7
  35. pixeltable/globals.py +14 -3
  36. pixeltable/index/embedding_index.py +4 -13
  37. pixeltable/io/globals.py +88 -77
  38. pixeltable/io/hf_datasets.py +34 -34
  39. pixeltable/io/pandas.py +75 -76
  40. pixeltable/io/parquet.py +19 -27
  41. pixeltable/io/utils.py +115 -0
  42. pixeltable/iterators/audio.py +2 -1
  43. pixeltable/iterators/video.py +1 -1
  44. pixeltable/metadata/__init__.py +2 -1
  45. pixeltable/metadata/converters/convert_15.py +18 -8
  46. pixeltable/metadata/converters/convert_27.py +31 -0
  47. pixeltable/metadata/converters/convert_28.py +15 -0
  48. pixeltable/metadata/converters/convert_29.py +111 -0
  49. pixeltable/metadata/converters/util.py +12 -1
  50. pixeltable/metadata/notes.py +3 -0
  51. pixeltable/metadata/schema.py +8 -0
  52. pixeltable/share/__init__.py +1 -0
  53. pixeltable/share/packager.py +41 -13
  54. pixeltable/share/publish.py +97 -0
  55. pixeltable/type_system.py +40 -14
  56. pixeltable/utils/__init__.py +41 -0
  57. pixeltable/utils/arrow.py +40 -7
  58. pixeltable/utils/formatter.py +1 -1
  59. {pixeltable-0.3.4.dist-info → pixeltable-0.3.6.dist-info}/METADATA +34 -49
  60. {pixeltable-0.3.4.dist-info → pixeltable-0.3.6.dist-info}/RECORD +63 -57
  61. {pixeltable-0.3.4.dist-info → pixeltable-0.3.6.dist-info}/WHEEL +1 -1
  62. {pixeltable-0.3.4.dist-info → pixeltable-0.3.6.dist-info}/LICENSE +0 -0
  63. {pixeltable-0.3.4.dist-info → pixeltable-0.3.6.dist-info}/entry_points.txt +0 -0
pixeltable/share/packager.py CHANGED
@@ -1,3 +1,4 @@
+import dataclasses
 import io
 import json
 import logging
@@ -5,8 +6,9 @@ import tarfile
 import urllib.parse
 import urllib.request
 import uuid
+from datetime import datetime
 from pathlib import Path
-from typing import Any, Iterator
+from typing import Any, Iterator, Optional

 import more_itertools
 import numpy as np
@@ -15,7 +17,8 @@ import pyiceberg.catalog

 import pixeltable as pxt
 import pixeltable.type_system as ts
-from pixeltable import exprs
+from pixeltable import catalog, exprs, metadata
+from pixeltable.dataframe import DataFrame
 from pixeltable.env import Env
 from pixeltable.utils.arrow import PXT_TO_PA_TYPES
 from pixeltable.utils.iceberg import sqlite_catalog
@@ -28,6 +31,7 @@ class TablePackager:
     Packages a pixeltable Table into a tarball containing Iceberg tables and media files. The structure of the tarball
     is as follows:

+    metadata.json         # Pixeltable metadata for the packaged table
     warehouse/catalog.db  # sqlite Iceberg catalog
     warehouse/pxt.db/**   # Iceberg metadata and data files (parquet/avro/json)
     media/**              # Local media files
@@ -43,16 +47,40 @@ class TablePackager:
    'media/{uuid}{extension}', and the Iceberg table will contain the ephemeral URI 'pxtmedia://{uuid}{extension}'.
    """

-    table: pxt.Table  # The table to be packaged
+    table: catalog.Table  # The table to be packaged
     tmp_dir: Path  # Temporary directory where the package will reside
     iceberg_catalog: pyiceberg.catalog.Catalog
     media_files: dict[Path, str]  # Mapping from local media file paths to their tarball names
+    md: dict[str, Any]

-    def __init__(self, table: pxt.Table) -> None:
+    def __init__(self, table: catalog.Table, additional_md: Optional[dict[str, Any]] = None) -> None:
         self.table = table
         self.tmp_dir = Path(Env.get().create_tmp_path())
         self.media_files = {}

+        # Generate metadata
+        self.md = {
+            'pxt_version': pxt.__version__,
+            'pxt_md_version': metadata.VERSION,
+            'md': {
+                'tables': [
+                    {
+                        'table_id': str(t._tbl_version.id),
+                        # These are temporary; will replace with a better solution once the concurrency changes to
+                        # catalog have been merged
+                        'table_md': dataclasses.asdict(t._tbl_version._create_tbl_md()),
+                        'table_version_md': dataclasses.asdict(
+                            t._tbl_version._create_version_md(datetime.now().timestamp())
+                        ),
+                        'table_schema_version_md': dataclasses.asdict(t._tbl_version._create_schema_version_md(0)),
+                    }
+                    for t in (table, *table._bases)
+                ]
+            },
+        }
+        if additional_md is not None:
+            self.md.update(additional_md)
+
     def package(self) -> Path:
         """
         Export the table to a tarball containing Iceberg tables and media files.
@@ -60,8 +88,10 @@ class TablePackager:
         assert not self.tmp_dir.exists()  # Packaging can only be done once per TablePackager instance
         _logger.info(f"Packaging table '{self.table._path}' and its ancestors in: {self.tmp_dir}")
         self.tmp_dir.mkdir()
+        with open(self.tmp_dir / 'metadata.json', 'w', encoding='utf8') as fp:
+            json.dump(self.md, fp)
         self.iceberg_catalog = sqlite_catalog(self.tmp_dir / 'warehouse')
-        ancestors = [self.table] + self.table._bases
+        ancestors = (self.table, *self.table._bases)
         for t in ancestors:
             _logger.info(f"Exporting table '{t._path}'.")
             self.__export_table(t)
@@ -70,7 +100,7 @@ class TablePackager:
         _logger.info(f'Packaging complete: {bundle_path}')
         return bundle_path

-    def __export_table(self, t: pxt.Table) -> None:
+    def __export_table(self, t: catalog.Table) -> None:
         """
         Exports the data from `t` into an Iceberg table.
         """
@@ -116,7 +146,7 @@ class TablePackager:
         iceberg_tbl.append(pa_table)

     @classmethod
-    def __iceberg_namespace(cls, table: pxt.Table) -> str:
+    def __iceberg_namespace(cls, table: catalog.Table) -> str:
         """
         Iceberg tables must have a namespace, which cannot be the empty string, so we prepend `pxt` to the table path.
         """
@@ -149,11 +179,7 @@ class TablePackager:
         return PXT_TO_PA_TYPES.get(col_type.__class__)

     def __to_pa_tables(
-        self,
-        df: pxt.DataFrame,
-        actual_col_types: list[pxt.ColumnType],
-        arrow_schema: pa.Schema,
-        batch_size: int = 1_000,
+        self, df: DataFrame, actual_col_types: list[ts.ColumnType], arrow_schema: pa.Schema, batch_size: int = 1_000
     ) -> Iterator[pa.Table]:
         """
         Load a DataFrame as a sequence of pyarrow tables. The pyarrow tables are batched into smaller chunks
@@ -165,7 +191,7 @@ class TablePackager:
             cols['_v_min'] = [row[-1] for row in rows]
             yield pa.Table.from_pydict(cols, schema=arrow_schema)

-    def __to_pa_rows(self, df: pxt.DataFrame, actual_col_types: list[pxt.ColumnType]) -> Iterator[list]:
+    def __to_pa_rows(self, df: DataFrame, actual_col_types: list[ts.ColumnType]) -> Iterator[list]:
         for row in df._exec():
             vals = [row[e.slot_idx] for e in df._select_list_exprs]
             result = [self.__to_pa_value(val, col_type) for val, col_type in zip(vals, actual_col_types)]
@@ -210,6 +236,8 @@ class TablePackager:
     def __build_tarball(self) -> Path:
         bundle_path = self.tmp_dir / 'bundle.tar.bz2'
        with tarfile.open(bundle_path, 'w:bz2') as tf:
+            # Add metadata json
+            tf.add(self.tmp_dir / 'metadata.json', arcname='metadata.json')
             # Add the Iceberg warehouse dir (including the catalog)
             tf.add(self.tmp_dir / 'warehouse', arcname='warehouse', recursive=True)
             # Add the media files
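The new `metadata.json` makes the bundle self-describing: version info plus per-table metadata for the table and each of its bases. A minimal sketch of reading it back out of a finished bundle (file names per the diff above; the bundle path is illustrative):

```python
import json
import tarfile

# Illustrative only: inspect the metadata.json that TablePackager now writes
# into the tarball (key names taken from the __init__ diff above).
with tarfile.open('bundle.tar.bz2', 'r:bz2') as tf:
    with tf.extractfile('metadata.json') as fp:
        md = json.load(fp)

print(md['pxt_version'], md['pxt_md_version'])
for tbl_md in md['md']['tables']:
    print(tbl_md['table_id'])  # one entry per table in (table, *table._bases)
```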
pixeltable/share/publish.py ADDED
@@ -0,0 +1,97 @@
+import dataclasses
+import os
+import sys
+import urllib.parse
+import urllib.request
+from datetime import datetime
+from pathlib import Path
+
+import requests
+from tqdm import tqdm
+
+import pixeltable as pxt
+from pixeltable import exceptions as excs, metadata
+from pixeltable.env import Env
+from pixeltable.utils import sha256sum
+
+from .packager import TablePackager
+
+# These URLs are abstracted out for now, but will be replaced with actual (hard-coded) URLs once the
+# pixeltable.com URLs are available.
+_PUBLISH_URL = os.environ.get('PIXELTABLE_PUBLISH_URL')
+_FINALIZE_URL = os.environ.get('PIXELTABLE_FINALIZE_URL')
+
+
+def publish_snapshot(dest_tbl_uri: str, src_tbl: pxt.Table) -> str:
+    packager = TablePackager(src_tbl, additional_md={'table_uri': dest_tbl_uri})
+    request_json = packager.md
+    headers_json = {'X-api-key': Env.get().pxt_api_key}
+
+    response = requests.post(_PUBLISH_URL, json=request_json, headers=headers_json)
+    if response.status_code != 200:
+        raise excs.Error(f'Error publishing snapshot: {response.text}')
+    response_json = response.json()
+    if not isinstance(response_json, dict) or response_json.get('destination') != 's3':
+        raise excs.Error(f'Error publishing snapshot: unexpected response from server.\n{response_json}')
+    upload_id = response_json['upload_id']
+    destination_uri = response_json['destination_uri']
+
+    Env.get().console_logger.info(f"Creating a snapshot of '{src_tbl._path}' at: {dest_tbl_uri}")
+
+    bundle = packager.package()
+
+    parsed_location = urllib.parse.urlparse(destination_uri)
+    if parsed_location.scheme == 's3':
+        _upload_bundle_to_s3(bundle, parsed_location)
+    else:
+        raise excs.Error(f'Unsupported destination: {destination_uri}')
+
+    Env.get().console_logger.info(f'Finalizing snapshot ...')
+
+    finalize_request_json = {
+        'upload_id': upload_id,
+        'datafile': bundle.name,
+        'size': bundle.stat().st_size,
+        'sha256': sha256sum(bundle),  # Generate our own SHA for independent verification
+    }
+
+    # TODO: Use Pydantic for validation
+    finalize_response = requests.post(_FINALIZE_URL, json=finalize_request_json, headers=headers_json)
+    if finalize_response.status_code != 200:
+        raise excs.Error(f'Error finalizing snapshot: {finalize_response.text}')
+    finalize_response_json = finalize_response.json()
+    if not isinstance(finalize_response_json, dict) or 'confirmed_table_uri' not in finalize_response_json:
+        raise excs.Error(f'Error finalizing snapshot: unexpected response from server.\n{finalize_response_json}')
+
+    confirmed_tbl_uri = finalize_response_json['confirmed_table_uri']
+    Env.get().console_logger.info(f'The published snapshot is now available at: {confirmed_tbl_uri}')
+    return confirmed_tbl_uri
+
+
+def _upload_bundle_to_s3(bundle: Path, parsed_location: urllib.parse.ParseResult) -> None:
+    from pixeltable.utils.s3 import get_client
+
+    bucket = parsed_location.netloc
+    remote_dir = Path(urllib.parse.unquote(urllib.request.url2pathname(parsed_location.path)))
+    remote_path = str(remote_dir / bundle.name)[1:]  # Remove initial /
+
+    Env.get().console_logger.info(f'Uploading snapshot to: {bucket}:{remote_path}')
+
+    boto_config = {'max_pool_connections': 5, 'connect_timeout': 15, 'retries': {'max_attempts': 3, 'mode': 'adaptive'}}
+    s3_client = get_client(**boto_config)
+
+    upload_args = {'ChecksumAlgorithm': 'SHA256'}
+
+    progress_bar = tqdm(
+        desc=f'Uploading',
+        total=bundle.stat().st_size,
+        unit='B',
+        unit_scale=True,
+        unit_divisor=1024,
+        miniters=1,  # Update every iteration (should be fine for an upload)
+        ncols=100,
+        file=sys.stdout,
+    )
+    s3_client.upload_file(
+        Filename=str(bundle), Bucket=bucket, Key=str(remote_path), ExtraArgs=upload_args, Callback=progress_bar.update
+    )
pixeltable/type_system.py CHANGED
@@ -8,10 +8,9 @@ import json
 import typing
 import urllib.parse
 import urllib.request
-from pathlib import Path
 from typing import Any, Iterable, Literal, Mapping, Optional, Sequence, Union

-import av  # type: ignore
+import av
 import jsonschema
 import jsonschema.protocols
 import jsonschema.validators
@@ -22,6 +21,7 @@ import sqlalchemy as sql
 from typing_extensions import _AnnotatedAlias

 import pixeltable.exceptions as excs
+from pixeltable.utils import parse_local_file_path

 from typing import _GenericAlias  # type: ignore[attr-defined]  # isort: skip

@@ -47,8 +47,8 @@ class ColumnType:
     @classmethod
     def supertype(
         cls,
-        type1: 'ColumnType.Type',
-        type2: 'ColumnType.Type',
+        type1: Optional['ColumnType.Type'],
+        type2: Optional['ColumnType.Type'],
         # we need to pass this in because we can't easily append it as a class member
         common_supertypes: dict[tuple['ColumnType.Type', 'ColumnType.Type'], 'ColumnType.Type'],
     ) -> Optional['ColumnType.Type']:
@@ -93,6 +93,9 @@
         self._type = t
         self._nullable = nullable

+    def has_supertype(self) -> bool:
+        return True
+
     @property
     def nullable(self) -> bool:
         return self._nullable
@@ -271,8 +274,10 @@
                 inferred_type = val_type
             else:
                 inferred_type = inferred_type.supertype(val_type)
-                if inferred_type is None:
-                    return None
+            if inferred_type is None:
+                return None
+            if not inferred_type.has_supertype():
+                return inferred_type
         return inferred_type

     @classmethod
@@ -397,12 +402,9 @@
     def _validate_file_path(self, val: Any) -> None:
         """Raises TypeError if not a valid local file path or not a path/byte sequence"""
         if isinstance(val, str):
-            parsed = urllib.parse.urlparse(val)
-            if parsed.scheme != '' and parsed.scheme != 'file':
-                return
-            path = Path(urllib.parse.unquote(urllib.request.url2pathname(parsed.path)))
-            if not path.is_file():
-                raise TypeError(f'File not found: {str(path)}')
+            path = parse_local_file_path(val)
+            if path is not None and not path.is_file():
+                raise TypeError(f'File not found: {path}')
         else:
             if not isinstance(val, bytes):
                 raise TypeError(f'expected file path or bytes, got {type(val)}')
@@ -495,7 +497,7 @@ class InvalidType(ColumnType):
         super().__init__(self.Type.INVALID, nullable=nullable)

     def to_sa_type(self) -> sql.types.TypeEngine:
-        assert False
+        return sql.types.NullType()

     def print_value(self, val: Any) -> str:
         return str(val)
@@ -508,6 +510,9 @@ class StringType(ColumnType):
     def __init__(self, nullable: bool = False):
         super().__init__(self.Type.STRING, nullable=nullable)

+    def has_supertype(self):
+        return not self.nullable
+
     def to_sa_type(self) -> sql.types.TypeEngine:
         return sql.String()

@@ -591,6 +596,9 @@ class TimestampType(ColumnType):
     def __init__(self, nullable: bool = False):
         super().__init__(self.Type.TIMESTAMP, nullable=nullable)

+    def has_supertype(self):
+        return not self.nullable
+
     def to_sa_type(self) -> sql.types.TypeEngine:
         return sql.TIMESTAMP(timezone=True)

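The new `has_supertype()` hook lets literal-type inference (the `@@ -271` hunk above) stop early once a type cannot get any more general. A small illustration, with behavior read off the overrides as written (a nullable `StringType` already admits any string or None, so nothing strictly more general exists):

```python
import pixeltable.type_system as ts

# Per the diff: the base ColumnType answers True; StringType and
# TimestampType answer `not self.nullable`.
print(ts.StringType(nullable=False).has_supertype())    # True
print(ts.StringType(nullable=True).has_supertype())     # False
print(ts.TimestampType(nullable=True).has_supertype())  # False
print(ts.IntType(nullable=True).has_supertype())        # True (base default)
```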
@@ -601,6 +609,8 @@
     def _create_literal(self, val: Any) -> Any:
         if isinstance(val, str):
             return datetime.datetime.fromisoformat(val)
+        if isinstance(val, datetime.datetime):
+            return val
         return val


@@ -651,6 +661,10 @@ class JsonType(ColumnType):
         return val_type.print_value(val)

     def _validate_literal(self, val: Any) -> None:
+        if isinstance(val, tuple):
+            val = list(val)
+        if isinstance(val, pydantic.BaseModel):
+            val = val.model_dump()
         if not self.__is_valid_json(val):
             raise TypeError(f'That literal is not a valid Pixeltable JSON object: {val}')
         if self.__validator is not None:
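JSON literal validation now coerces two extra input shapes before validating: tuples become lists, and pydantic models are dumped to plain dicts. A sketch (it calls the private method directly, purely to illustrate the coercion; the model class is made up):

```python
import pydantic

import pixeltable.type_system as ts

class Point(pydantic.BaseModel):
    x: int
    y: int

jt = ts.JsonType()
jt._validate_literal((1, 2, 3))        # tuple -> list, then validated as JSON
jt._validate_literal(Point(x=1, y=2))  # model_dump() -> {'x': 1, 'y': 2}
```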
@@ -818,14 +832,20 @@ class ArrayType(ColumnType):
         return hash((self._type, self.nullable, self.shape, self.dtype))

     def supertype(self, other: ColumnType) -> Optional[ArrayType]:
+        basic_supertype = super().supertype(other)
+        if basic_supertype is not None:
+            assert isinstance(basic_supertype, ArrayType)
+            return basic_supertype
+
         if not isinstance(other, ArrayType):
             return None
+
         super_dtype = self.Type.supertype(self.dtype, other.dtype, self.common_supertypes)
         if super_dtype is None:
             # if the dtypes are incompatible, then the supertype is a fully general array
             return ArrayType(nullable=(self.nullable or other.nullable))
         super_shape: Optional[tuple[Optional[int], ...]]
-        if len(self.shape) != len(other.shape):
+        if self.shape is None or other.shape is None or len(self.shape) != len(other.shape):
             super_shape = None
         else:
             super_shape = tuple(n1 if n1 == n2 else None for n1, n2 in zip(self.shape, other.shape))
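`ArrayType.supertype` now consults the generic `ColumnType.supertype` first and tolerates `None` shapes (fully general arrays), which previously would have failed on `len(None)`. A sketch of the shape-widening rule; the constructor arguments mirror the `ArrayType(shape=..., dtype=..., nullable=...)` call visible in the arrow.py diff below:

```python
import pixeltable.type_system as ts

a = ts.ArrayType(shape=(3, 4), dtype=ts.FloatType(), nullable=False)
b = ts.ArrayType(shape=(3, 5), dtype=ts.FloatType(), nullable=True)
# Matching dims are kept, differing dims widen to None; a None shape on
# either side now yields shape=None instead of raising.
print(a.supertype(b))
```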
@@ -1009,8 +1029,14 @@ class ImageType(ColumnType):
         return hash((self._type, self.nullable, self.size, self.mode))

     def supertype(self, other: ColumnType) -> Optional[ImageType]:
+        basic_supertype = super().supertype(other)
+        if basic_supertype is not None:
+            assert isinstance(basic_supertype, ImageType)
+            return basic_supertype
+
         if not isinstance(other, ImageType):
             return None
+
         width = self.width if self.width == other.width else None
         height = self.height if self.height == other.height else None
         mode = self.mode if self.mode == other.mode else None
pixeltable/utils/__init__.py CHANGED
@@ -1,3 +1,10 @@
+import hashlib
+import urllib.parse
+import urllib.request
+from pathlib import Path
+from typing import Optional, Union
+
+
 def print_perf_counter_delta(delta: float) -> str:
     """Prints a performance counter delta in a human-readable format.

@@ -15,3 +22,37 @@ def print_perf_counter_delta(delta: float) -> str:
         return f'{delta * 1e3:.2f} ms'
     else:
         return f'{delta:.2f} s'
+
+
+def sha256sum(path: Union[Path, str]) -> str:
+    """
+    Compute the SHA256 hash of a file.
+    """
+    if isinstance(path, str):
+        path = Path(path)
+
+    h = hashlib.sha256()
+    with open(path, 'rb') as file:
+        while chunk := file.read(h.block_size):
+            h.update(chunk)
+
+    return h.hexdigest()
+
+
+def parse_local_file_path(file_or_url: str) -> Optional[Path]:
+    """
+    Parses a string that may be either a URL or a local file path.
+
+    If the string is a local file path or a file-scheme URL (file://), then a Path object will be returned.
+    Otherwise, None will be returned.
+    """
+    parsed = urllib.parse.urlparse(file_or_url)
+    if len(parsed.scheme) <= 1:
+        # We're using `urlparse` to help distinguish file paths from URLs. If there is no scheme, then it's a file path.
+        # If there's a single-character scheme, we also interpret this as a file path; this ensures that drive letters
+        # on Windows pathnames are correctly handled.
+        return Path(file_or_url).absolute()
+    elif parsed.scheme == 'file':
+        return Path(urllib.parse.unquote(urllib.request.url2pathname(parsed.path)))
+    else:
+        return None
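Both helpers are deliberately small; a quick illustration of how they behave (the paths are made up):

```python
from pixeltable.utils import parse_local_file_path, sha256sum

print(parse_local_file_path('data/clip.mp4'))              # no scheme -> absolute Path
print(parse_local_file_path('C:\\data\\clip.mp4'))         # 1-char scheme -> Windows path
print(parse_local_file_path('file:///tmp/clip.mp4'))       # file:// -> Path('/tmp/clip.mp4')
print(parse_local_file_path('https://example.com/x.mp4'))  # other scheme -> None

# sha256sum streams the file in block_size chunks, so large bundles never
# need to fit in memory:
# print(sha256sum('bundle.tar.bz2'))
```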
pixeltable/utils/arrow.py CHANGED
@@ -8,6 +8,8 @@ import pixeltable.type_system as ts

 PA_TO_PXT_TYPES: dict[pa.DataType, ts.ColumnType] = {
     pa.string(): ts.StringType(nullable=True),
+    pa.large_string(): ts.StringType(nullable=True),
+    pa.timestamp('us', tz=datetime.timezone.utc): ts.TimestampType(nullable=True),
     pa.bool_(): ts.BoolType(nullable=True),
     pa.uint8(): ts.IntType(nullable=True),
     pa.int8(): ts.IntType(nullable=True),
@@ -16,6 +18,7 @@ PA_TO_PXT_TYPES: dict[pa.DataType, ts.ColumnType] = {
     pa.int32(): ts.IntType(nullable=True),
     pa.int64(): ts.IntType(nullable=True),
     pa.float32(): ts.FloatType(nullable=True),
+    pa.float64(): ts.FloatType(nullable=True),
 }

 PXT_TO_PA_TYPES: dict[type[ts.ColumnType], pa.DataType] = {
@@ -32,19 +35,20 @@ PXT_TO_PA_TYPES: dict[type[ts.ColumnType], pa.DataType] = {
 }


-def to_pixeltable_type(arrow_type: pa.DataType) -> Optional[ts.ColumnType]:
+def to_pixeltable_type(arrow_type: pa.DataType, nullable: bool) -> Optional[ts.ColumnType]:
     """Convert a pyarrow DataType to a pixeltable ColumnType if one is defined.
     Returns None if no conversion is currently implemented.
     """
     if isinstance(arrow_type, pa.TimestampType):
-        return ts.TimestampType(nullable=True)
+        return ts.TimestampType(nullable=nullable)
     elif arrow_type in PA_TO_PXT_TYPES:
-        return PA_TO_PXT_TYPES[arrow_type]
+        pt = PA_TO_PXT_TYPES[arrow_type]
+        return pt.copy(nullable=nullable)
     elif isinstance(arrow_type, pa.FixedShapeTensorType):
-        dtype = to_pixeltable_type(arrow_type.value_type)
+        dtype = to_pixeltable_type(arrow_type.value_type, nullable)
         if dtype is None:
             return None
-        return ts.ArrayType(shape=arrow_type.shape, dtype=dtype)
+        return ts.ArrayType(shape=arrow_type.shape, dtype=dtype, nullable=nullable)
     else:
         return None

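With nullability now an explicit argument, the caller decides whether a column admits nulls (`ar_infer_schema` below uses this to make primary-key columns non-nullable). A sketch; the unmapped `pa.date32()` case is an assumption based on the mappings shown above:

```python
import pyarrow as pa

from pixeltable.utils.arrow import to_pixeltable_type

print(to_pixeltable_type(pa.int64(), False))        # non-nullable IntType
print(to_pixeltable_type(pa.large_string(), True))  # nullable StringType (new mapping)
print(to_pixeltable_type(pa.date32(), True))        # None: no conversion defined
```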
@@ -61,8 +65,17 @@ def to_arrow_type(pixeltable_type: ts.ColumnType) -> Optional[pa.DataType]:
     return None


-def to_pixeltable_schema(arrow_schema: pa.Schema) -> dict[str, ts.ColumnType]:
-    return {field.name: to_pixeltable_type(field.type) for field in arrow_schema}
+def ar_infer_schema(
+    arrow_schema: pa.Schema, schema_overrides: dict[str, Any], primary_key: list[str]
+) -> dict[str, ts.ColumnType]:
+    """Convert a pyarrow Schema to a schema using pyarrow names and pixeltable types."""
+    ar_schema = {
+        field.name: to_pixeltable_type(field.type, field.name not in primary_key)
+        if field.name not in schema_overrides
+        else schema_overrides[field.name]
+        for field in arrow_schema
+    }
+    return ar_schema


 def to_arrow_schema(pixeltable_schema: dict[str, Any]) -> pa.Schema:
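`ar_infer_schema` folds the per-field nullability decision and user-supplied overrides into a single pass over the Arrow schema. A sketch (the override type is chosen arbitrarily):

```python
import pyarrow as pa

import pixeltable.type_system as ts
from pixeltable.utils.arrow import ar_infer_schema

arrow_schema = pa.schema([('id', pa.int64()), ('name', pa.large_string())])
pxt_schema = ar_infer_schema(
    arrow_schema,
    schema_overrides={'name': ts.StringType(nullable=False)},  # wins over inference
    primary_key=['id'],  # 'id' is inferred as non-nullable
)
```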
@@ -96,3 +109,23 @@ def iter_tuples(batch: Union[pa.Table, pa.RecordBatch]) -> Iterator[dict[str, Any]]:

     for i in range(batch_size):
         yield {col_name: values[i] for col_name, values in pydict.items()}
+
+
+def iter_tuples2(
+    batch: Union[pa.Table, pa.RecordBatch], col_mapping: Optional[dict[str, str]], schema: dict[str, ts.ColumnType]
+) -> Iterator[dict[str, Any]]:
+    """Convert a RecordBatch to an iterator of dictionaries. Also works with pa.Table and pa.RowGroup."""
+    pydict = to_pydict(batch)
+    assert len(pydict) > 0, 'empty record batch'
+    for _, v in pydict.items():
+        batch_size = len(v)
+        break
+
+    for i in range(batch_size):
+        # Convert a row to insertable format
+        yield {
+            (pxt_name := col_name if col_mapping is None else col_mapping[col_name]): schema[pxt_name].create_literal(
+                values[i]
+            )
+            for col_name, values in pydict.items()
+        }
@@ -6,7 +6,7 @@ import logging
 import mimetypes
 from typing import Any, Callable, Optional

-import av  # type: ignore[import-untyped]
+import av
 import numpy as np
 import PIL
 import PIL.Image as Image