pixeltable 0.4.12__py3-none-any.whl → 0.4.14__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.
Files changed (55)
  1. pixeltable/__init__.py +11 -1
  2. pixeltable/catalog/__init__.py +2 -1
  3. pixeltable/catalog/catalog.py +179 -63
  4. pixeltable/catalog/column.py +24 -20
  5. pixeltable/catalog/table.py +96 -124
  6. pixeltable/catalog/table_metadata.py +96 -0
  7. pixeltable/catalog/table_version.py +15 -6
  8. pixeltable/catalog/view.py +22 -22
  9. pixeltable/config.py +2 -0
  10. pixeltable/dataframe.py +3 -2
  11. pixeltable/env.py +43 -21
  12. pixeltable/exec/__init__.py +1 -0
  13. pixeltable/exec/aggregation_node.py +0 -1
  14. pixeltable/exec/cache_prefetch_node.py +74 -98
  15. pixeltable/exec/data_row_batch.py +2 -18
  16. pixeltable/exec/in_memory_data_node.py +1 -1
  17. pixeltable/exec/object_store_save_node.py +299 -0
  18. pixeltable/exec/sql_node.py +28 -33
  19. pixeltable/exprs/data_row.py +31 -25
  20. pixeltable/exprs/json_path.py +6 -5
  21. pixeltable/exprs/row_builder.py +6 -12
  22. pixeltable/functions/gemini.py +1 -1
  23. pixeltable/functions/openai.py +1 -1
  24. pixeltable/functions/video.py +5 -6
  25. pixeltable/globals.py +6 -7
  26. pixeltable/index/embedding_index.py +5 -8
  27. pixeltable/io/__init__.py +2 -1
  28. pixeltable/io/fiftyone.py +1 -1
  29. pixeltable/io/label_studio.py +4 -5
  30. pixeltable/io/lancedb.py +3 -0
  31. pixeltable/io/parquet.py +9 -89
  32. pixeltable/io/table_data_conduit.py +2 -2
  33. pixeltable/iterators/audio.py +1 -1
  34. pixeltable/iterators/document.py +10 -12
  35. pixeltable/iterators/video.py +1 -1
  36. pixeltable/metadata/schema.py +7 -0
  37. pixeltable/plan.py +26 -1
  38. pixeltable/share/packager.py +8 -2
  39. pixeltable/share/publish.py +3 -9
  40. pixeltable/type_system.py +1 -3
  41. pixeltable/utils/arrow.py +97 -2
  42. pixeltable/utils/dbms.py +31 -5
  43. pixeltable/utils/gcs_store.py +283 -0
  44. pixeltable/utils/lancedb.py +88 -0
  45. pixeltable/utils/local_store.py +316 -0
  46. pixeltable/utils/object_stores.py +497 -0
  47. pixeltable/utils/pytorch.py +5 -6
  48. pixeltable/utils/s3_store.py +354 -0
  49. {pixeltable-0.4.12.dist-info → pixeltable-0.4.14.dist-info}/METADATA +162 -127
  50. {pixeltable-0.4.12.dist-info → pixeltable-0.4.14.dist-info}/RECORD +53 -47
  51. pixeltable/utils/media_store.py +0 -248
  52. pixeltable/utils/s3.py +0 -17
  53. {pixeltable-0.4.12.dist-info → pixeltable-0.4.14.dist-info}/WHEEL +0 -0
  54. {pixeltable-0.4.12.dist-info → pixeltable-0.4.14.dist-info}/entry_points.txt +0 -0
  55. {pixeltable-0.4.12.dist-info → pixeltable-0.4.14.dist-info}/licenses/LICENSE +0 -0
pixeltable/index/embedding_index.py CHANGED
@@ -138,15 +138,12 @@ class EmbeddingIndex(IndexBase):
 
     def create_index(self, index_name: str, index_value_col: catalog.Column) -> None:
         """Create the index on the index value column"""
-        idx = sql.Index(
-            index_name,
-            index_value_col.sa_col,
-            postgresql_using='hnsw',
-            postgresql_with={'m': 16, 'ef_construction': 64},
-            postgresql_ops={index_value_col.sa_col.name: self.PGVECTOR_OPS[self.metric]},
+        Env.get().dbms.create_vector_index(
+            index_name=index_name,
+            index_value_sa_col=index_value_col.sa_col,
+            conn=Env.get().conn,
+            metric=self.PGVECTOR_OPS[self.metric],
         )
-        conn = Env.get().conn
-        idx.create(bind=conn)
 
     def drop_index(self, index_name: str, index_value_col: catalog.Column) -> None:
         """Drop the index on the index value column"""
pixeltable/io/__init__.py CHANGED
@@ -4,11 +4,12 @@ from .datarows import import_json, import_rows
 from .external_store import ExternalStore
 from .globals import create_label_studio_project, export_images_as_fo_dataset
 from .hf_datasets import import_huggingface_dataset
+from .lancedb import export_lancedb
 from .pandas import import_csv, import_excel, import_pandas
 from .parquet import export_parquet, import_parquet
 
 __default_dir = {symbol for symbol in dir() if not symbol.startswith('_')}
-__removed_symbols = {'globals', 'hf_datasets', 'pandas', 'parquet', 'datarows'}
+__removed_symbols = {'globals', 'hf_datasets', 'pandas', 'parquet', 'datarows', 'lancedb'}
 __all__ = sorted(__default_dir - __removed_symbols)
 
 
pixeltable/io/fiftyone.py CHANGED
@@ -9,7 +9,7 @@ import puremagic
 import pixeltable as pxt
 import pixeltable.exceptions as excs
 from pixeltable import exprs
-from pixeltable.utils.media_store import TempStore
+from pixeltable.utils.local_store import TempStore
 
 
 class PxtImageDatasetImporter(foud.LabeledImageDatasetImporter):
pixeltable/io/label_studio.py CHANGED
@@ -19,7 +19,7 @@ from pixeltable.config import Config
 from pixeltable.exprs import ColumnRef, DataRow, Expr
 from pixeltable.io.external_store import Project
 from pixeltable.utils import coco
-from pixeltable.utils.media_store import TempStore
+from pixeltable.utils.local_store import TempStore
 
 # label_studio_sdk>=1 and label_studio_sdk<1 are not compatible, so we need to try
 # the import two different ways to insure intercompatibility
@@ -46,6 +46,9 @@ class LabelStudioProject(Project):
     """
     An [`ExternalStore`][pixeltable.io.ExternalStore] that represents a Label Studio project, providing functionality
     for synchronizing between a Pixeltable table and a Label Studio project.
+
+    The constructor will NOT create a new Label Studio project; it is also used when loading
+    metadata for existing projects.
     """
 
     project_id: int  # Label Studio project ID
@@ -60,10 +63,6 @@ class LabelStudioProject(Project):
         col_mapping: dict[ColumnHandle, str],
         stored_proxies: Optional[dict[ColumnHandle, ColumnHandle]] = None,
     ):
-        """
-        The constructor will NOT create a new Label Studio project; it is also used when loading
-        metadata for existing projects.
-        """
         self.project_id = project_id
         self.media_import_method = media_import_method
         self._project = None
pixeltable/io/lancedb.py ADDED
@@ -0,0 +1,3 @@
+from pixeltable.utils.lancedb import export_lancedb
+
+__all__ = ['export_lancedb']
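
This re-export makes `export_lancedb` available as `pxt.io.export_lancedb`. Its signature is defined in `pixeltable/utils/lancedb.py` (+88 lines, not shown here), so the call below is only a hypothetical shape, assuming it accepts a table or DataFrame plus a LanceDB location and table name:

```python
import pixeltable as pxt

films = pxt.get_table('films')

# Hypothetical arguments; consult pixeltable/utils/lancedb.py for the real parameter names.
pxt.io.export_lancedb(films, 'path/to/lancedb_dir', 'films')
```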
pixeltable/io/parquet.py CHANGED
@@ -1,46 +1,22 @@
 from __future__ import annotations
 
-import datetime
-import io
 import json
 import logging
 import typing
-from collections import deque
 from pathlib import Path
 from typing import Any, Optional
 
-import numpy as np
-import PIL.Image
-
 import pixeltable as pxt
 import pixeltable.exceptions as excs
 from pixeltable.catalog import Catalog
 from pixeltable.utils.transactional_directory import transactional_directory
 
 if typing.TYPE_CHECKING:
-    import pyarrow as pa
-
     import pixeltable as pxt
 
 _logger = logging.getLogger('pixeltable')
 
 
-def _write_batch(value_batch: dict[str, deque], schema: pa.Schema, output_path: Path) -> None:
-    import pyarrow as pa
-    from pyarrow import parquet
-
-    pydict = {}
-    for field in schema:
-        if isinstance(field.type, pa.FixedShapeTensorType):
-            stacked_arr = np.stack(value_batch[field.name])
-            pydict[field.name] = pa.FixedShapeTensorArray.from_numpy_ndarray(stacked_arr)
-        else:
-            pydict[field.name] = value_batch[field.name]
-
-    tab = pa.Table.from_pydict(pydict, schema=schema)
-    parquet.write_table(tab, str(output_path))
-
-
 def export_parquet(
     table_or_df: pxt.Table | pxt.DataFrame,
     parquet_path: Path,
@@ -63,7 +39,9 @@ def export_parquet(
         If False, will raise an error if the Dataframe has any image column.
         Default False.
     """
-    from pixeltable.utils.arrow import to_arrow_schema
+    import pyarrow as pa
+
+    from pixeltable.utils.arrow import to_record_batches
 
     df: pxt.DataFrame
     if isinstance(table_or_df, pxt.catalog.Table):
@@ -71,9 +49,6 @@
     else:
         df = table_or_df
 
-    type_dict = {k: v.as_dict() for k, v in df.schema.items()}
-    arrow_schema = to_arrow_schema(df.schema)
-
     if not inline_images and any(col_type.is_image_type() for col_type in df.schema.values()):
         raise excs.Error('Cannot export Dataframe with image columns when inline_images is False')
 
@@ -81,70 +56,15 @@
     with transactional_directory(parquet_path) as temp_path:
         # dump metadata json file so we can inspect what was the source of the parquet file later on.
         json.dump(df.as_dict(), (temp_path / '.pixeltable.json').open('w'))
+        type_dict = {k: v.as_dict() for k, v in df.schema.items()}
         json.dump(type_dict, (temp_path / '.pixeltable.column_types.json').open('w'))  # keep type metadata
-
         batch_num = 0
-        current_value_batch: dict[str, deque] = {k: deque() for k in df.schema}
-        current_byte_estimate = 0
-
        with Catalog.get().begin_xact(for_write=False):
-            for data_row in df._exec():
-                for (col_name, col_type), e in zip(df.schema.items(), df._select_list_exprs):
-                    val = data_row[e.slot_idx]
-                    if val is None:
-                        current_value_batch[col_name].append(val)
-                        continue
-
-                    assert val is not None
-                    if col_type.is_image_type():
-                        # images get inlined into the parquet file
-                        if data_row.file_paths is not None and data_row.file_paths[e.slot_idx] is not None:
-                            # if there is a file, read directly to preserve information
-                            with open(data_row.file_paths[e.slot_idx], 'rb') as f:
-                                val = f.read()
-                        elif isinstance(val, PIL.Image.Image):
-                            # if no file available, eg. bc it is computed, convert to png
-                            buf = io.BytesIO()
-                            val.save(buf, format='PNG')
-                            val = buf.getvalue()
-                        else:
-                            raise excs.Error(f'unknown image type {type(val)}')
-                        length = len(val)
-                    elif col_type.is_string_type():
-                        length = len(val)
-                    elif col_type.is_video_type() or col_type.is_audio_type():
-                        if data_row.file_paths is not None and data_row.file_paths[e.slot_idx] is not None:
-                            val = data_row.file_paths[e.slot_idx]
-                        else:
-                            raise excs.Error(f'unknown audio/video type {type(val)}')
-                        length = len(val)
-                    elif col_type.is_json_type():
-                        val = json.dumps(val)
-                        length = len(val)
-                    elif col_type.is_array_type():
-                        length = val.nbytes
-                    elif col_type.is_int_type() or col_type.is_float_type():
-                        length = 8
-                    elif col_type.is_bool_type():
-                        length = 1
-                    elif col_type.is_date_type():
-                        length = 4
-                    elif col_type.is_timestamp_type():
-                        val = val.astimezone(datetime.timezone.utc)
-                        length = 8
-                    else:
-                        raise excs.Error(f'unknown type {col_type} for {col_name}')
-
-                    current_value_batch[col_name].append(val)
-                    current_byte_estimate += length
-                    if current_byte_estimate > partition_size_bytes:
-                        assert batch_num < 100_000, 'wrote too many parquet files, unclear ordering'
-                        _write_batch(current_value_batch, arrow_schema, temp_path / f'part-{batch_num:05d}.parquet')
-                        batch_num += 1
-                        current_value_batch = {k: deque() for k in df.schema}
-                        current_byte_estimate = 0
-
-            _write_batch(current_value_batch, arrow_schema, temp_path / f'part-{batch_num:05d}.parquet')
+            for record_batch in to_record_batches(df, partition_size_bytes):
+                output_path = temp_path / f'part-{batch_num:05d}.parquet'
+                arrow_tbl = pa.Table.from_batches([record_batch])  # type: ignore
+                pa.parquet.write_table(arrow_tbl, str(output_path))
+                batch_num += 1
 
 
 def import_parquet(
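
With this change, `export_parquet` is a thin wrapper that writes one `part-NNNNN.parquet` file per record batch produced by `to_record_batches`. A brief usage sketch (the table name is illustrative; `inline_images` follows the docstring fragment above):

```python
from pathlib import Path

import pixeltable as pxt

t = pxt.get_table('images')  # assumed table with an image column

# inline_images=True embeds image data as bytes in the parquet output;
# without it, exporting a table with image columns raises an error.
pxt.io.export_parquet(t, Path('/tmp/images_export'), inline_images=True)
```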
pixeltable/io/table_data_conduit.py CHANGED
@@ -469,12 +469,12 @@ class ParquetTableDataConduit(TableDataConduit):
         return t
 
     def infer_schema_part1(self) -> tuple[dict[str, ts.ColumnType], list[str]]:
-        from pixeltable.utils.arrow import ar_infer_schema
+        from pixeltable.utils.arrow import to_pxt_schema
 
         if self.source_column_map is None:
             if self.src_schema_overrides is None:
                 self.src_schema_overrides = {}
-            self.src_schema = ar_infer_schema(self.pq_ds.schema, self.src_schema_overrides, self.src_pk)
+            self.src_schema = to_pxt_schema(self.pq_ds.schema, self.src_schema_overrides, self.src_pk)
             inferred_schema, inferred_pk, self.source_column_map = normalize_schema_names(
                 self.src_schema, self.src_pk, self.src_schema_overrides
             )
pixeltable/iterators/audio.py CHANGED
@@ -6,7 +6,7 @@ from typing import Any, ClassVar, Optional
 import av
 
 from pixeltable import exceptions as excs, type_system as ts
-from pixeltable.utils.media_store import TempStore
+from pixeltable.utils.local_store import TempStore
 
 from .base import ComponentIterator
 
pixeltable/iterators/document.py CHANGED
@@ -94,6 +94,16 @@ class DocumentSplitter(ComponentIterator):
     include additional metadata fields if specified in the `metadata` parameter, as explained below.
 
     Chunked text will be cleaned with `ftfy.fix_text` to fix up common problems with unicode sequences.
+
+    Args:
+        separators: separators to use to chunk the document. Options are:
+            `'heading'`, `'paragraph'`, `'sentence'`, `'token_limit'`, `'char_limit'`, `'page'`.
+            This may be a comma-separated string, e.g., `'heading,token_limit'`.
+        limit: the maximum number of tokens or characters in each chunk, if `'token_limit'`
+            or `'char_limit'` is specified.
+        metadata: additional metadata fields to include in the output. Options are:
+            `'title'`, `'heading'` (HTML and Markdown), `'sourceline'` (HTML), `'page'` (PDF), `'bounding_box'`
+            (PDF). The input may be a comma-separated string, e.g., `'title,heading,sourceline'`.
     """
 
     METADATA_COLUMN_TYPES: ClassVar[dict[ChunkMetadata, ColumnType]] = {
@@ -116,18 +126,6 @@ class DocumentSplitter(ComponentIterator):
         tiktoken_encoding: Optional[str] = 'cl100k_base',
         tiktoken_target_model: Optional[str] = None,
     ):
-        """Init method for `DocumentSplitter` class.
-
-        Args:
-            separators: separators to use to chunk the document. Options are:
-                `'heading'`, `'paragraph'`, `'sentence'`, `'token_limit'`, `'char_limit'`, `'page'`.
-                This may be a comma-separated string, e.g., `'heading,token_limit'`.
-            limit: the maximum number of tokens or characters in each chunk, if `'token_limit'`
-                or `'char_limit'` is specified.
-            metadata: additional metadata fields to include in the output. Options are:
-                `'title'`, `'heading'` (HTML and Markdown), `'sourceline'` (HTML), `'page'` (PDF), `'bounding_box'`
-                (PDF). The input may be a comma-separated string, e.g., `'title,heading,sourceline'`.
-        """
         if html_skip_tags is None:
             html_skip_tags = ['nav']
         self._doc_handle = get_document_handle(document)
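
Moving the parameter docs into the class docstring is how Pixeltable surfaces iterator arguments in generated documentation. For orientation, a usage sketch following Pixeltable's usual `create_view(..., iterator=...)` pattern, assuming a table `docs` with a `document` column (names are illustrative):

```python
import pixeltable as pxt
from pixeltable.iterators import DocumentSplitter

docs = pxt.get_table('docs')

# One output row per chunk; separators/limit follow the options listed in the docstring above.
chunks = pxt.create_view(
    'doc_chunks',
    docs,
    iterator=DocumentSplitter.create(document=docs.document, separators='token_limit', limit=300),
)
```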
pixeltable/iterators/video.py CHANGED
@@ -14,7 +14,7 @@ import pixeltable as pxt
 import pixeltable.exceptions as excs
 import pixeltable.type_system as ts
 import pixeltable.utils.av as av_utils
-from pixeltable.utils.media_store import TempStore
+from pixeltable.utils.local_store import TempStore
 
 from .base import ComponentIterator
 
pixeltable/metadata/schema.py CHANGED
@@ -115,6 +115,9 @@ class ColumnMd:
     # if True, the column is present in the stored table
     stored: Optional[bool]
 
+    # If present, the URI for the destination for column values
+    destination: Optional[str] = None
+
 
 @dataclasses.dataclass
 class IndexMd:
@@ -244,6 +247,9 @@ class TableVersionMd:
     schema_version: int
     user: Optional[str] = None  # User that created this version
    update_status: Optional[UpdateStatus] = None  # UpdateStatus of the change that created this version
+    # A version fragment cannot be queried or instantiated via get_table(). A fragment represents a version of a
+    # replica table that has incomplete data, and exists only to provide base table support for a dependent view.
+    is_fragment: bool = False
     additional_md: dict[str, Any] = dataclasses.field(default_factory=dict)
 
 
@@ -353,6 +359,7 @@ class FullTableMd(NamedTuple):
     def is_pure_snapshot(self) -> bool:
         return (
             self.tbl_md.view_md is not None
+            and self.tbl_md.view_md.is_snapshot
             and self.tbl_md.view_md.predicate is None
             and len(self.schema_version_md.columns) == 0
         )
pixeltable/plan.py CHANGED
@@ -403,6 +403,8 @@ class Planner:
                 ignore_errors=ignore_errors,
             )
         )
+        plan = cls._insert_save_node(tbl.id, row_builder.stored_media_cols, input_node=plan)
+
         return plan
 
     @classmethod
@@ -499,6 +501,9 @@
         for i, col in enumerate(all_base_cols):
             plan.row_builder.add_table_column(col, select_list[i].slot_idx)
         plan.ctx.num_computed_exprs = len(recomputed_exprs)
+
+        plan = cls._insert_save_node(tbl.tbl_version.id, plan.row_builder.stored_media_cols, input_node=plan)
+
         recomputed_user_cols = [c for c in recomputed_cols if c.name is not None]
         return plan, [f'{c.tbl.name}.{c.name}' for c in updated_cols + recomputed_user_cols], recomputed_user_cols
 
@@ -597,6 +602,7 @@
         # we're returning everything to the user, so we might as well do it in a single batch
         ctx.batch_size = 0
         plan.set_ctx(ctx)
+        plan = cls._insert_save_node(tbl.tbl_version.id, plan.row_builder.stored_media_cols, input_node=plan)
         recomputed_user_cols = [c for c in recomputed_cols if c.name is not None]
         return (
             plan,
@@ -650,6 +656,8 @@
         for i, col in enumerate(copied_cols + list(recomputed_cols)):  # same order as select_list
             plan.row_builder.add_table_column(col, select_list[i].slot_idx)
         # TODO: avoid duplication with view_load_plan() logic (where does this belong?)
+        plan = cls._insert_save_node(view.tbl_version.id, plan.row_builder.stored_media_cols, input_node=plan)
+
         return plan
 
     @classmethod
@@ -718,6 +726,8 @@
 
         exec_ctx.ignore_errors = True
         plan.set_ctx(exec_ctx)
+        plan = cls._insert_save_node(view.tbl_version.id, plan.row_builder.stored_media_cols, input_node=plan)
+
         return plan, len(row_builder.default_eval_ctx.target_exprs)
 
     @classmethod
@@ -762,6 +772,17 @@
             combined_ordering = combined
         return combined_ordering
 
+    @classmethod
+    def _insert_save_node(
+        cls, tbl_id: UUID, stored_media_cols: list[exprs.ColumnSlotIdx], input_node: exec.ExecNode
+    ) -> exec.ExecNode:
+        """Return an ObjectStoreSaveNode if stored media columns are present, otherwise return input"""
+        if len(stored_media_cols) == 0:
+            return input_node
+        save_node = exec.ObjectStoreSaveNode(tbl_id, stored_media_cols, input_node)
+        save_node.set_ctx(input_node.ctx)
+        return save_node
+
     @classmethod
     def _is_contained_in(cls, l1: Iterable[exprs.Expr], l2: Iterable[exprs.Expr]) -> bool:
         """Returns True if l1 is contained in l2"""
@@ -771,7 +792,7 @@
     def _insert_prefetch_node(
         cls, tbl_id: UUID, expressions: Iterable[exprs.Expr], input_node: exec.ExecNode
     ) -> exec.ExecNode:
-        """Return a CachePrefetchNode if needed, otherwise return input"""
+        """Return a node to prefetch data if needed, otherwise return input"""
         # we prefetch external files for all media ColumnRefs, even those that aren't part of the dependencies
         # of output_exprs: if unstored iterator columns are present, we might need to materialize ColumnRefs that
         # aren't explicitly captured as dependencies
@@ -989,6 +1010,7 @@
             if not agg_output.issuperset(exprs.ExprSet(eval_ctx.target_exprs)):
                 # we need an ExprEvalNode to evaluate the remaining output exprs
                 plan = exec.ExprEvalNode(row_builder, eval_ctx.target_exprs, agg_output, input=plan)
+            plan = cls._insert_save_node(tbl.tbl_version.id, row_builder.stored_media_cols, input_node=plan)
         else:
             if not exprs.ExprSet(sql_exprs).issuperset(exprs.ExprSet(eval_ctx.target_exprs)):
                 # we need an ExprEvalNode to evaluate the remaining output exprs
@@ -1034,10 +1056,13 @@
         plan = cls._create_query_plan(
             row_builder=row_builder, analyzer=analyzer, eval_ctx=row_builder.default_eval_ctx, with_pk=True
         )
+
         plan.ctx.batch_size = 16
         plan.ctx.show_pbar = True
         plan.ctx.ignore_errors = True
         computed_exprs = row_builder.output_exprs - row_builder.input_exprs
         plan.ctx.num_computed_exprs = len(computed_exprs)  # we are adding a computed column, so we need to evaluate it
 
+        plan = cls._insert_save_node(tbl.tbl_version.id, row_builder.stored_media_cols, input_node=plan)
+
         return plan
pixeltable/share/packager.py CHANGED
@@ -24,7 +24,8 @@ from pixeltable.env import Env
 from pixeltable.metadata import schema
 from pixeltable.utils import sha256sum
 from pixeltable.utils.formatter import Formatter
-from pixeltable.utils.media_store import MediaStore, TempStore
+from pixeltable.utils.local_store import TempStore
+from pixeltable.utils.object_stores import ObjectOps
 
 _logger = logging.getLogger('pixeltable')
 
@@ -362,6 +363,8 @@ class TableRestorer:
         for md in tbl_md:
             md.tbl_md.is_replica = True
 
+        assert not tbl_md[0].version_md.is_fragment  # Top-level table cannot be a version fragment
+
         cat = catalog.Catalog.get()
 
         with cat.begin_xact(for_write=True):
@@ -369,6 +372,9 @@
             # versions that have not been seen before.
             cat.create_replica(catalog.Path.parse(self.tbl_path), tbl_md)
 
+            _logger.debug(f'Now will import data for {len(tbl_md)} table(s):')
+            _logger.debug(repr([md.tbl_md.tbl_id for md in tbl_md[::-1]]))
+
             # Now we need to load data for replica_tbl and its ancestors, except that we skip
             # replica_tbl itself if it's a pure snapshot.
             for md in tbl_md[::-1]:  # Base table first
@@ -619,7 +625,7 @@
             # in self.media_files.
             src_path = self.tmp_dir / 'media' / parsed_url.netloc
             # Move the file to the media store and update the URL.
-            self.media_files[url] = MediaStore.get().relocate_local_media_file(src_path, media_col)
+            self.media_files[url] = ObjectOps.put_file(media_col, src_path, relocate_or_delete=True)
             return self.media_files[url]
         # For any type of URL other than a local file, just return the URL as-is.
         return url
pixeltable/share/publish.py CHANGED
@@ -14,7 +14,7 @@ import pixeltable as pxt
 from pixeltable import exceptions as excs
 from pixeltable.env import Env
 from pixeltable.utils import sha256sum
-from pixeltable.utils.media_store import TempStore
+from pixeltable.utils.local_store import TempStore
 
 from .packager import TablePackager, TableRestorer
 
@@ -79,16 +79,13 @@ def push_replica(
 
 
 def _upload_bundle_to_s3(bundle: Path, parsed_location: urllib.parse.ParseResult) -> None:
-    from pixeltable.utils.s3 import get_client
-
     bucket = parsed_location.netloc
     remote_dir = Path(urllib.parse.unquote(urllib.request.url2pathname(parsed_location.path)))
     remote_path = str(remote_dir / bundle.name)[1:]  # Remove initial /
 
     Env.get().console_logger.info(f'Uploading snapshot to: {bucket}:{remote_path}')
 
-    boto_config = {'max_pool_connections': 5, 'connect_timeout': 15, 'retries': {'max_attempts': 3, 'mode': 'adaptive'}}
-    s3_client = get_client(**boto_config)
+    s3_client = Env.get().get_client('s3')
 
     upload_args = {'ChecksumAlgorithm': 'SHA256'}
 
@@ -135,16 +132,13 @@ def pull_replica(dest_path: str, src_tbl_uri: str) -> pxt.Table:
 
 
 def _download_bundle_from_s3(parsed_location: urllib.parse.ParseResult, bundle_filename: str) -> Path:
-    from pixeltable.utils.s3 import get_client
-
     bucket = parsed_location.netloc
     remote_dir = Path(urllib.parse.unquote(urllib.request.url2pathname(parsed_location.path)))
     remote_path = str(remote_dir / bundle_filename)[1:]  # Remove initial /
 
     Env.get().console_logger.info(f'Downloading snapshot from: {bucket}:{remote_path}')
 
-    boto_config = {'max_pool_connections': 5, 'connect_timeout': 15, 'retries': {'max_attempts': 3, 'mode': 'adaptive'}}
-    s3_client = get_client(**boto_config)
+    s3_client = Env.get().get_client('s3')
 
     obj = s3_client.head_object(Bucket=bucket, Key=remote_path)  # Check if the object exists
     bundle_size = obj['ContentLength']
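
Both helpers now obtain their S3 client from `Env.get().get_client('s3')`, which presumably centralizes the boto3 configuration that the deleted `pixeltable/utils/s3.py` spelled out. For reference, an equivalent client built directly with boto3 under the removed settings (a sketch, not necessarily what `Env.get_client` does internally):

```python
import boto3
from botocore.config import Config

# Mirrors the removed boto_config: small connection pool, 15 s connect timeout,
# and up to 3 adaptive retries.
s3_client = boto3.client(
    's3',
    config=Config(
        max_pool_connections=5,
        connect_timeout=15,
        retries={'max_attempts': 3, 'mode': 'adaptive'},
    ),
)
```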
pixeltable/type_system.py CHANGED
@@ -1081,9 +1081,7 @@ class ImageType(ColumnType):
         mode: Optional[str] = None,
         nullable: bool = False,
     ):
-        """
-        TODO: does it make sense to specify only width or height?
-        """
+        # TODO: does it make sense to specify only width or height?
         super().__init__(self.Type.IMAGE, nullable=nullable)
         assert not (width is not None and size is not None)
         assert not (height is not None and size is not None)
pixeltable/utils/arrow.py CHANGED
@@ -1,11 +1,18 @@
 import datetime
-from typing import Any, Iterator, Optional
+import io
+import json
+from typing import TYPE_CHECKING, Any, Iterator, Optional, cast
 
 import numpy as np
+import PIL.Image
 import pyarrow as pa
 
+import pixeltable.exceptions as excs
 import pixeltable.type_system as ts
 
+if TYPE_CHECKING:
+    import pixeltable as pxt
+
 PA_TO_PXT_TYPES: dict[pa.DataType, ts.ColumnType] = {
     pa.string(): ts.StringType(nullable=True),
     pa.large_string(): ts.StringType(nullable=True),
@@ -71,7 +78,7 @@ def to_arrow_type(pixeltable_type: ts.ColumnType) -> Optional[pa.DataType]:
     return None
 
 
-def ar_infer_schema(
+def to_pxt_schema(
     arrow_schema: pa.Schema, schema_overrides: dict[str, Any], primary_key: list[str]
 ) -> dict[str, ts.ColumnType]:
     """Convert a pyarrow Schema to a schema using pyarrow names and pixeltable types."""
@@ -88,6 +95,94 @@ def to_arrow_schema(pixeltable_schema: dict[str, Any]) -> pa.Schema:
     return pa.schema((name, to_arrow_type(typ)) for name, typ in pixeltable_schema.items())  # type: ignore[misc]
 
 
+def _to_record_batch(column_vals: dict[str, list[Any]], schema: pa.Schema) -> pa.RecordBatch:
+    import pyarrow as pa
+
+    pa_arrays: list[pa.Array] = []
+    for field in schema:
+        if isinstance(field.type, pa.FixedShapeTensorType):
+            stacked_arr = np.stack(column_vals[field.name])
+            pa_arrays.append(pa.FixedShapeTensorArray.from_numpy_ndarray(stacked_arr))
+        else:
+            pa_array = cast(pa.Array, pa.array(column_vals[field.name]))
+            pa_arrays.append(pa_array)
+    return pa.RecordBatch.from_arrays(pa_arrays, schema=schema)  # type: ignore
+
+
+def to_record_batches(df: 'pxt.DataFrame', batch_size_bytes: int) -> Iterator[pa.RecordBatch]:
+    arrow_schema = to_arrow_schema(df.schema)
+    batch_columns: dict[str, list[Any]] = {k: [] for k in df.schema}
+    current_byte_estimate = 0
+    num_batch_rows = 0
+
+    # TODO: in order to avoid having to deal with ExprEvalError here, DataFrameResultSet should be an iterator
+    # over _exec()
+    try:
+        for data_row in df._exec():
+            num_batch_rows += 1
+            for (col_name, col_type), e in zip(df.schema.items(), df._select_list_exprs):
+                val = data_row[e.slot_idx]
+                val_size_bytes: int
+                if val is None:
+                    batch_columns[col_name].append(val)
+                    continue
+
+                assert val is not None
+                if col_type.is_image_type():
+                    # images get inlined into the parquet file
+                    if data_row.file_paths[e.slot_idx] is not None:
+                        # if there is a file, read directly to preserve information
+                        with open(data_row.file_paths[e.slot_idx], 'rb') as f:
+                            val = f.read()
+                    elif isinstance(val, PIL.Image.Image):
+                        # no file available: save as png
+                        buf = io.BytesIO()
+                        val.save(buf, format='png')
+                        val = buf.getvalue()
+                    else:
+                        raise excs.Error(f'unknown image type {type(val)}')
+                    val_size_bytes = len(val)
+                elif col_type.is_string_type():
+                    val_size_bytes = len(val)
+                elif col_type.is_media_type():
+                    assert data_row.file_paths[e.slot_idx] is not None
+                    val = data_row.file_paths[e.slot_idx]
+                    val_size_bytes = len(val)
+                elif col_type.is_json_type():
+                    val = json.dumps(val)
+                    val_size_bytes = len(val)
+                elif col_type.is_array_type():
+                    val_size_bytes = val.nbytes
+                elif col_type.is_int_type() or col_type.is_float_type():
+                    val_size_bytes = 8
+                elif col_type.is_bool_type():
+                    val_size_bytes = 1
+                elif col_type.is_date_type():
+                    val_size_bytes = 4
+                elif col_type.is_timestamp_type():
+                    val = val.astimezone(datetime.timezone.utc)
+                    val_size_bytes = 8
+                else:
+                    raise excs.Error(f'unknown type {col_type} for {col_name}')
+
+                batch_columns[col_name].append(val)
+                current_byte_estimate += val_size_bytes
+
+            if current_byte_estimate > batch_size_bytes and num_batch_rows > 0:
+                record_batch = _to_record_batch(batch_columns, arrow_schema)
+                yield record_batch
+                batch_columns = {k: [] for k in df.schema}
+                current_byte_estimate = 0
+                num_batch_rows = 0
+
+    except excs.ExprEvalError as e:
+        df._raise_expr_eval_err(e)
+
+    if num_batch_rows > 0:
+        record_batch = _to_record_batch(batch_columns, arrow_schema)
+        yield record_batch
+
+
 def to_pydict(batch: pa.Table | pa.RecordBatch) -> dict[str, list | np.ndarray]:
     """Convert a RecordBatch to a dictionary of lists, unlike pa.lib.RecordBatch.to_pydict,
     this function will not convert numpy arrays to lists, and will preserve the original numpy dtype.
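
The new `to_record_batches` helper is also usable on its own wherever Arrow data is needed. A minimal sketch that materializes a query into a single in-memory Arrow table, run inside a read transaction as `export_parquet` does above (table and column names are illustrative; the 32 MB batch budget is arbitrary):

```python
import pyarrow as pa

import pixeltable as pxt
from pixeltable.catalog import Catalog
from pixeltable.utils.arrow import to_record_batches

films = pxt.get_table('films')                 # illustrative table
df = films.select(films.title, films.rating)   # illustrative columns

with Catalog.get().begin_xact(for_write=False):
    # Each yielded pa.RecordBatch stays roughly under the requested byte budget.
    arrow_tbl = pa.Table.from_batches(to_record_batches(df, 32 * 1024 * 1024))
```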