pixeltable 0.4.11__py3-none-any.whl → 0.4.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of pixeltable has been flagged as potentially problematic.

@@ -41,9 +41,9 @@ def _anthropic_client() -> 'anthropic.AsyncAnthropic':
  def _get_header_info(
  headers: httpx.Headers,
  ) -> tuple[
- Optional[tuple[int, int, datetime.datetime]],
- Optional[tuple[int, int, datetime.datetime]],
- Optional[tuple[int, int, datetime.datetime]],
+ tuple[int, int, datetime.datetime] | None,
+ tuple[int, int, datetime.datetime] | None,
+ tuple[int, int, datetime.datetime] | None,
  ]:
  """Extract rate limit info from Anthropic API response headers."""
  requests_limit_str = headers.get('anthropic-ratelimit-requests-limit')
@@ -54,7 +54,9 @@ def _get_header_info(
  requests_reset = (
  datetime.datetime.fromisoformat(requests_reset_str.replace('Z', '+00:00')) if requests_reset_str else None
  )
- requests_info = (requests_limit, requests_remaining, requests_reset) if requests_reset else None
+ requests_info = (
+ (requests_limit, requests_remaining, requests_reset) if requests_reset and requests_remaining else None
+ )

  input_tokens_limit_str = headers.get('anthropic-ratelimit-input-tokens-limit')
  input_tokens_limit = int(input_tokens_limit_str) if input_tokens_limit_str is not None else None
@@ -66,7 +68,11 @@ def _get_header_info(
  if input_tokens_reset_str
  else None
  )
- input_tokens_info = (input_tokens_limit, input_tokens_remaining, input_tokens_reset) if input_tokens_reset else None
+ input_tokens_info = (
+ (input_tokens_limit, input_tokens_remaining, input_tokens_reset)
+ if input_tokens_reset and input_tokens_remaining
+ else None
+ )

  output_tokens_limit_str = headers.get('anthropic-ratelimit-output-tokens-limit')
  output_tokens_limit = int(output_tokens_limit_str) if output_tokens_limit_str is not None else None
@@ -79,9 +85,14 @@ def _get_header_info(
  else None
  )
  output_tokens_info = (
- (output_tokens_limit, output_tokens_remaining, output_tokens_reset) if output_tokens_reset else None
+ (output_tokens_limit, output_tokens_remaining, output_tokens_reset)
+ if output_tokens_reset and output_tokens_remaining
+ else None
  )

+ if requests_info is None or input_tokens_info is None or output_tokens_info is None:
+ _logger.debug(f'get_header_info(): incomplete rate limit info: {headers}')
+
  return requests_info, input_tokens_info, output_tokens_info

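A minimal sketch (not taken from the package; header values are hypothetical) of how the revised Anthropic `_get_header_info()` helper behaves: each of the three slots is populated only when its `remaining`/`reset` headers are present, otherwise it becomes `None` and the incomplete headers are logged at DEBUG level.

>>> import httpx
>>> headers = httpx.Headers({
...     'anthropic-ratelimit-requests-limit': '50',
...     'anthropic-ratelimit-requests-remaining': '49',
...     'anthropic-ratelimit-requests-reset': '2025-01-01T00:00:30Z',
... })
>>> _get_header_info(headers)  # token headers absent -> those slots are None, a debug line is logged
((50, 49, datetime.datetime(2025, 1, 1, 0, 0, 30, tzinfo=datetime.timezone.utc)), None, None)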
@@ -62,11 +62,11 @@ async def chat_completions(
  A dictionary containing the response and other metadata.

  Examples:
- Add a computed column that applies the model `llama3-8b-8192`
+ Add a computed column that applies the model `llama-3.1-8b-instant`
  to an existing Pixeltable column `tbl.prompt` of the table `tbl`:

  >>> messages = [{'role': 'user', 'content': tbl.prompt}]
- ... tbl.add_computed_column(response=chat_completions(messages, model='llama3-8b-8192'))
+ ... tbl.add_computed_column(response=chat_completions(messages, model='llama-3.1-8b-instant'))
  """
  if model_kwargs is None:
  model_kwargs = {}
@@ -113,7 +113,7 @@ def _parse_header_duration(duration_str: str) -> datetime.timedelta:

  def _get_header_info(
  headers: httpx.Headers,
- ) -> tuple[Optional[tuple[int, int, datetime.datetime]], Optional[tuple[int, int, datetime.datetime]]]:
+ ) -> tuple[tuple[int, int, datetime.datetime] | None, tuple[int, int, datetime.datetime] | None]:
  now = datetime.datetime.now(tz=datetime.timezone.utc)

  requests_limit_str = headers.get('x-ratelimit-limit-requests')
@@ -122,7 +122,7 @@ def _get_header_info(
  requests_remaining = int(requests_remaining_str) if requests_remaining_str is not None else None
  requests_reset_str = headers.get('x-ratelimit-reset-requests', '5s') # Default to 5 seconds
  requests_reset_ts = now + _parse_header_duration(requests_reset_str)
- requests_info = (requests_limit, requests_remaining, requests_reset_ts)
+ requests_info = (requests_limit, requests_remaining, requests_reset_ts) if requests_remaining is not None else None

  tokens_limit_str = headers.get('x-ratelimit-limit-tokens')
  tokens_limit = int(tokens_limit_str) if tokens_limit_str is not None else None
@@ -130,7 +130,10 @@ def _get_header_info(
  tokens_remaining = int(tokens_remaining_str) if tokens_remaining_str is not None else None
  tokens_reset_str = headers.get('x-ratelimit-reset-tokens', '5s') # Default to 5 seconds
  tokens_reset_ts = now + _parse_header_duration(tokens_reset_str)
- tokens_info = (tokens_limit, tokens_remaining, tokens_reset_ts)
+ tokens_info = (tokens_limit, tokens_remaining, tokens_reset_ts) if tokens_remaining is not None else None
+
+ if requests_info is None or tokens_info is None:
+ _logger.debug(f'get_header_info(): incomplete rate limit info: {headers}')

  return requests_info, tokens_info

pixeltable/globals.py CHANGED
@@ -3,7 +3,7 @@ from __future__ import annotations
  import logging
  import os
  from pathlib import Path
- from typing import TYPE_CHECKING, Any, Iterable, Iterator, Literal, NamedTuple, Optional, Sequence, Union
+ from typing import TYPE_CHECKING, Any, Iterable, Literal, NamedTuple, Optional, Union

  import pandas as pd
  import pydantic
@@ -24,9 +24,8 @@ if TYPE_CHECKING:
  str,
  os.PathLike,
  Path, # OS paths, filenames, URLs
- Iterator[dict[str, Any]], # iterator producing dictionaries of values
- RowData, # list of dictionaries
- Sequence[pydantic.BaseModel], # list of Pydantic models
+ Iterable[dict[str, Any]], # dictionaries of values
+ Iterable[pydantic.BaseModel], # Pydantic model instances
  DataFrame, # Pixeltable DataFrame
  pd.DataFrame, # pandas DataFrame
  datasets.Dataset,
@@ -542,9 +541,14 @@ def drop_table(
  assert isinstance(table, str)
  tbl_path = table

- path_obj = catalog.Path.parse(tbl_path)
- if_not_exists_ = catalog.IfNotExistsParam.validated(if_not_exists, 'if_not_exists')
- Catalog.get().drop_table(path_obj, force=force, if_not_exists=if_not_exists_)
+ if tbl_path.startswith('pxt://'):
+ # Remote table
+ share.delete_replica(tbl_path)
+ else:
+ # Local table
+ path_obj = catalog.Path.parse(tbl_path)
+ if_not_exists_ = catalog.IfNotExistsParam.validated(if_not_exists, 'if_not_exists')
+ Catalog.get().drop_table(path_obj, force=force, if_not_exists=if_not_exists_)


  def get_dir_contents(dir_path: str = '', recursive: bool = True) -> 'DirContents':
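With this change, `pxt.drop_table` also accepts a cloud-replica URI and routes it to `share.delete_replica()`; everything else goes through the local catalog as before. A hedged usage sketch (the exact `pxt://` URI format shown is illustrative only):

>>> import pixeltable as pxt
>>> pxt.drop_table('my_dir.my_table')                # local table: parsed and dropped via the catalog
>>> pxt.drop_table('pxt://my-org/my_dir.my_table')   # cloud replica: delegated to share.delete_replica()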
pixeltable/io/__init__.py CHANGED
@@ -4,11 +4,12 @@ from .datarows import import_json, import_rows
  from .external_store import ExternalStore
  from .globals import create_label_studio_project, export_images_as_fo_dataset
  from .hf_datasets import import_huggingface_dataset
+ from .lancedb import export_lancedb
  from .pandas import import_csv, import_excel, import_pandas
  from .parquet import export_parquet, import_parquet

  __default_dir = {symbol for symbol in dir() if not symbol.startswith('_')}
- __removed_symbols = {'globals', 'hf_datasets', 'pandas', 'parquet', 'datarows'}
+ __removed_symbols = {'globals', 'hf_datasets', 'pandas', 'parquet', 'datarows', 'lancedb'}
  __all__ = sorted(__default_dir - __removed_symbols)

pixeltable/io/lancedb.py ADDED
@@ -0,0 +1,3 @@
+ from pixeltable.utils.lancedb import export_lancedb
+
+ __all__ = ['export_lancedb']
pixeltable/io/parquet.py CHANGED
@@ -1,46 +1,22 @@
  from __future__ import annotations

- import datetime
- import io
  import json
  import logging
  import typing
- from collections import deque
  from pathlib import Path
  from typing import Any, Optional

- import numpy as np
- import PIL.Image
-
  import pixeltable as pxt
  import pixeltable.exceptions as excs
  from pixeltable.catalog import Catalog
  from pixeltable.utils.transactional_directory import transactional_directory

  if typing.TYPE_CHECKING:
- import pyarrow as pa
-
  import pixeltable as pxt

  _logger = logging.getLogger('pixeltable')


- def _write_batch(value_batch: dict[str, deque], schema: pa.Schema, output_path: Path) -> None:
- import pyarrow as pa
- from pyarrow import parquet
-
- pydict = {}
- for field in schema:
- if isinstance(field.type, pa.FixedShapeTensorType):
- stacked_arr = np.stack(value_batch[field.name])
- pydict[field.name] = pa.FixedShapeTensorArray.from_numpy_ndarray(stacked_arr)
- else:
- pydict[field.name] = value_batch[field.name]
-
- tab = pa.Table.from_pydict(pydict, schema=schema)
- parquet.write_table(tab, str(output_path))
-
-
  def export_parquet(
  table_or_df: pxt.Table | pxt.DataFrame,
  parquet_path: Path,
@@ -63,7 +39,9 @@ def export_parquet(
  If False, will raise an error if the Dataframe has any image column.
  Default False.
  """
- from pixeltable.utils.arrow import to_arrow_schema
+ import pyarrow as pa
+
+ from pixeltable.utils.arrow import to_record_batches

  df: pxt.DataFrame
  if isinstance(table_or_df, pxt.catalog.Table):
@@ -71,9 +49,6 @@ def export_parquet(
  else:
  df = table_or_df

- type_dict = {k: v.as_dict() for k, v in df.schema.items()}
- arrow_schema = to_arrow_schema(df.schema)
-
  if not inline_images and any(col_type.is_image_type() for col_type in df.schema.values()):
  raise excs.Error('Cannot export Dataframe with image columns when inline_images is False')

@@ -81,70 +56,15 @@ def export_parquet(
  with transactional_directory(parquet_path) as temp_path:
  # dump metadata json file so we can inspect what was the source of the parquet file later on.
  json.dump(df.as_dict(), (temp_path / '.pixeltable.json').open('w'))
+ type_dict = {k: v.as_dict() for k, v in df.schema.items()}
  json.dump(type_dict, (temp_path / '.pixeltable.column_types.json').open('w')) # keep type metadata
-
  batch_num = 0
- current_value_batch: dict[str, deque] = {k: deque() for k in df.schema}
- current_byte_estimate = 0
-
  with Catalog.get().begin_xact(for_write=False):
- for data_row in df._exec():
- for (col_name, col_type), e in zip(df.schema.items(), df._select_list_exprs):
- val = data_row[e.slot_idx]
- if val is None:
- current_value_batch[col_name].append(val)
- continue
-
- assert val is not None
- if col_type.is_image_type():
- # images get inlined into the parquet file
- if data_row.file_paths is not None and data_row.file_paths[e.slot_idx] is not None:
- # if there is a file, read directly to preserve information
- with open(data_row.file_paths[e.slot_idx], 'rb') as f:
- val = f.read()
- elif isinstance(val, PIL.Image.Image):
- # if no file available, eg. bc it is computed, convert to png
- buf = io.BytesIO()
- val.save(buf, format='PNG')
- val = buf.getvalue()
- else:
- raise excs.Error(f'unknown image type {type(val)}')
- length = len(val)
- elif col_type.is_string_type():
- length = len(val)
- elif col_type.is_video_type() or col_type.is_audio_type():
- if data_row.file_paths is not None and data_row.file_paths[e.slot_idx] is not None:
- val = data_row.file_paths[e.slot_idx]
- else:
- raise excs.Error(f'unknown audio/video type {type(val)}')
- length = len(val)
- elif col_type.is_json_type():
- val = json.dumps(val)
- length = len(val)
- elif col_type.is_array_type():
- length = val.nbytes
- elif col_type.is_int_type() or col_type.is_float_type():
- length = 8
- elif col_type.is_bool_type():
- length = 1
- elif col_type.is_date_type():
- length = 4
- elif col_type.is_timestamp_type():
- val = val.astimezone(datetime.timezone.utc)
- length = 8
- else:
- raise excs.Error(f'unknown type {col_type} for {col_name}')
-
- current_value_batch[col_name].append(val)
- current_byte_estimate += length
- if current_byte_estimate > partition_size_bytes:
- assert batch_num < 100_000, 'wrote too many parquet files, unclear ordering'
- _write_batch(current_value_batch, arrow_schema, temp_path / f'part-{batch_num:05d}.parquet')
- batch_num += 1
- current_value_batch = {k: deque() for k in df.schema}
- current_byte_estimate = 0
-
- _write_batch(current_value_batch, arrow_schema, temp_path / f'part-{batch_num:05d}.parquet')
+ for record_batch in to_record_batches(df, partition_size_bytes):
+ output_path = temp_path / f'part-{batch_num:05d}.parquet'
+ arrow_tbl = pa.Table.from_batches([record_batch]) # type: ignore
+ pa.parquet.write_table(arrow_tbl, str(output_path))
+ batch_num += 1


  def import_parquet(
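The rewritten `export_parquet` now streams pyarrow `RecordBatch`es from `to_record_batches()` and writes one `part-NNNNN.parquet` file per batch instead of batching rows itself. A usage sketch (table name and output path are hypothetical):

>>> from pathlib import Path
>>> import pixeltable as pxt
>>> from pixeltable.io import export_parquet
>>> tbl = pxt.get_table('my_dir.my_table')
>>> export_parquet(tbl, Path('/tmp/my_table_export'), inline_images=True)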
@@ -469,12 +469,12 @@ class ParquetTableDataConduit(TableDataConduit):
  return t

  def infer_schema_part1(self) -> tuple[dict[str, ts.ColumnType], list[str]]:
- from pixeltable.utils.arrow import ar_infer_schema
+ from pixeltable.utils.arrow import to_pxt_schema

  if self.source_column_map is None:
  if self.src_schema_overrides is None:
  self.src_schema_overrides = {}
- self.src_schema = ar_infer_schema(self.pq_ds.schema, self.src_schema_overrides, self.src_pk)
+ self.src_schema = to_pxt_schema(self.pq_ds.schema, self.src_schema_overrides, self.src_pk)
  inferred_schema, inferred_pk, self.source_column_map = normalize_schema_names(
  self.src_schema, self.src_pk, self.src_schema_overrides
  )
@@ -1,3 +1,3 @@
  # ruff: noqa: F401

- from .publish import pull_replica, push_replica
+ from .publish import delete_replica, pull_replica, push_replica
@@ -27,15 +27,11 @@ PIXELTABLE_API_URL = os.environ.get('PIXELTABLE_API_URL', 'https://internal-api.
  def push_replica(
  dest_tbl_uri: str, src_tbl: pxt.Table, bucket: str | None = None, access: Literal['public', 'private'] = 'private'
  ) -> str:
- if not src_tbl._tbl_version_path.is_snapshot():
- raise excs.Error('Only snapshots may be published.')
-
  packager = TablePackager(
  src_tbl, additional_md={'table_uri': dest_tbl_uri, 'bucket_name': bucket, 'is_public': access == 'public'}
  )
  request_json = packager.md | {'operation_type': 'publish_snapshot'}
- headers_json = {'X-api-key': Env.get().pxt_api_key, 'Content-Type': 'application/json'}
- response = requests.post(PIXELTABLE_API_URL, json=request_json, headers=headers_json)
+ response = requests.post(PIXELTABLE_API_URL, json=request_json, headers=_api_headers())
  if response.status_code != 200:
  raise excs.Error(f'Error publishing snapshot: {response.text}')
  response_json = response.json()
@@ -70,7 +66,7 @@ def push_replica(
  'preview_data': packager.md['preview_data'],
  }
  # TODO: Use Pydantic for validation
- finalize_response = requests.post(PIXELTABLE_API_URL, json=finalize_request_json, headers=headers_json)
+ finalize_response = requests.post(PIXELTABLE_API_URL, json=finalize_request_json, headers=_api_headers())
  if finalize_response.status_code != 200:
  raise excs.Error(f'Error finalizing snapshot: {finalize_response.text}')
  finalize_response_json = finalize_response.json()
@@ -112,9 +108,8 @@ def _upload_bundle_to_s3(bundle: Path, parsed_location: urllib.parse.ParseResult


  def pull_replica(dest_path: str, src_tbl_uri: str) -> pxt.Table:
- headers_json = {'X-api-key': Env.get().pxt_api_key, 'Content-Type': 'application/json'}
  clone_request_json = {'operation_type': 'clone_snapshot', 'table_uri': src_tbl_uri}
- response = requests.post(PIXELTABLE_API_URL, json=clone_request_json, headers=headers_json)
+ response = requests.post(PIXELTABLE_API_URL, json=clone_request_json, headers=_api_headers())
  if response.status_code != 200:
  raise excs.Error(f'Error cloning snapshot: {response.text}')
  response_json = response.json()
@@ -268,11 +263,18 @@ def _download_from_presigned_url(
  # TODO: This will be replaced by drop_table with cloud table uri
  def delete_replica(dest_path: str) -> None:
  """Delete cloud replica"""
- headers_json = {'X-api-key': Env.get().pxt_api_key, 'Content-Type': 'application/json'}
  delete_request_json = {'operation_type': 'delete_snapshot', 'table_uri': dest_path}
- response = requests.post(PIXELTABLE_API_URL, json=delete_request_json, headers=headers_json)
+ response = requests.post(PIXELTABLE_API_URL, json=delete_request_json, headers=_api_headers())
  if response.status_code != 200:
  raise excs.Error(f'Error deleting replica: {response.text}')
  response_json = response.json()
  if not isinstance(response_json, dict) or 'table_uri' not in response_json:
  raise excs.Error(f'Error deleting replica: unexpected response from server.\n{response_json}')
+
+
+ def _api_headers() -> dict[str, str]:
+ headers = {'Content-Type': 'application/json'}
+ api_key = Env.get().pxt_api_key
+ if api_key is not None:
+ headers['X-api-key'] = api_key
+ return headers
pixeltable/utils/arrow.py CHANGED
@@ -1,11 +1,18 @@
  import datetime
- from typing import Any, Iterator, Optional
+ import io
+ import json
+ from typing import TYPE_CHECKING, Any, Iterator, Optional, cast

  import numpy as np
+ import PIL.Image
  import pyarrow as pa

+ import pixeltable.exceptions as excs
  import pixeltable.type_system as ts

+ if TYPE_CHECKING:
+ import pixeltable as pxt
+
  PA_TO_PXT_TYPES: dict[pa.DataType, ts.ColumnType] = {
  pa.string(): ts.StringType(nullable=True),
  pa.large_string(): ts.StringType(nullable=True),
@@ -71,7 +78,7 @@ def to_arrow_type(pixeltable_type: ts.ColumnType) -> Optional[pa.DataType]:
  return None


- def ar_infer_schema(
+ def to_pxt_schema(
  arrow_schema: pa.Schema, schema_overrides: dict[str, Any], primary_key: list[str]
  ) -> dict[str, ts.ColumnType]:
  """Convert a pyarrow Schema to a schema using pyarrow names and pixeltable types."""
@@ -88,6 +95,94 @@ def to_arrow_schema(pixeltable_schema: dict[str, Any]) -> pa.Schema:
  return pa.schema((name, to_arrow_type(typ)) for name, typ in pixeltable_schema.items()) # type: ignore[misc]


+ def _to_record_batch(column_vals: dict[str, list[Any]], schema: pa.Schema) -> pa.RecordBatch:
+ import pyarrow as pa
+
+ pa_arrays: list[pa.Array] = []
+ for field in schema:
+ if isinstance(field.type, pa.FixedShapeTensorType):
+ stacked_arr = np.stack(column_vals[field.name])
+ pa_arrays.append(pa.FixedShapeTensorArray.from_numpy_ndarray(stacked_arr))
+ else:
+ pa_array = cast(pa.Array, pa.array(column_vals[field.name]))
+ pa_arrays.append(pa_array)
+ return pa.RecordBatch.from_arrays(pa_arrays, schema=schema) # type: ignore
+
+
+ def to_record_batches(df: 'pxt.DataFrame', batch_size_bytes: int) -> Iterator[pa.RecordBatch]:
+ arrow_schema = to_arrow_schema(df.schema)
+ batch_columns: dict[str, list[Any]] = {k: [] for k in df.schema}
+ current_byte_estimate = 0
+ num_batch_rows = 0
+
+ # TODO: in order to avoid having to deal with ExprEvalError here, DataFrameResultSet should be an iterator
+ # over _exec()
+ try:
+ for data_row in df._exec():
+ num_batch_rows += 1
+ for (col_name, col_type), e in zip(df.schema.items(), df._select_list_exprs):
+ val = data_row[e.slot_idx]
+ val_size_bytes: int
+ if val is None:
+ batch_columns[col_name].append(val)
+ continue
+
+ assert val is not None
+ if col_type.is_image_type():
+ # images get inlined into the parquet file
+ if data_row.file_paths[e.slot_idx] is not None:
+ # if there is a file, read directly to preserve information
+ with open(data_row.file_paths[e.slot_idx], 'rb') as f:
+ val = f.read()
+ elif isinstance(val, PIL.Image.Image):
+ # no file available: save as png
+ buf = io.BytesIO()
+ val.save(buf, format='png')
+ val = buf.getvalue()
+ else:
+ raise excs.Error(f'unknown image type {type(val)}')
+ val_size_bytes = len(val)
+ elif col_type.is_string_type():
+ val_size_bytes = len(val)
+ elif col_type.is_media_type():
+ assert data_row.file_paths[e.slot_idx] is not None
+ val = data_row.file_paths[e.slot_idx]
+ val_size_bytes = len(val)
+ elif col_type.is_json_type():
+ val = json.dumps(val)
+ val_size_bytes = len(val)
+ elif col_type.is_array_type():
+ val_size_bytes = val.nbytes
+ elif col_type.is_int_type() or col_type.is_float_type():
+ val_size_bytes = 8
+ elif col_type.is_bool_type():
+ val_size_bytes = 1
+ elif col_type.is_date_type():
+ val_size_bytes = 4
+ elif col_type.is_timestamp_type():
+ val = val.astimezone(datetime.timezone.utc)
+ val_size_bytes = 8
+ else:
+ raise excs.Error(f'unknown type {col_type} for {col_name}')
+
+ batch_columns[col_name].append(val)
+ current_byte_estimate += val_size_bytes
+
+ if current_byte_estimate > batch_size_bytes and num_batch_rows > 0:
+ record_batch = _to_record_batch(batch_columns, arrow_schema)
+ yield record_batch
+ batch_columns = {k: [] for k in df.schema}
+ current_byte_estimate = 0
+ num_batch_rows = 0
+
+ except excs.ExprEvalError as e:
+ df._raise_expr_eval_err(e)
+
+ if num_batch_rows > 0:
+ record_batch = _to_record_batch(batch_columns, arrow_schema)
+ yield record_batch
+
+
  def to_pydict(batch: pa.Table | pa.RecordBatch) -> dict[str, list | np.ndarray]:
  """Convert a RecordBatch to a dictionary of lists, unlike pa.lib.RecordBatch.to_pydict,
  this function will not convert numpy arrays to lists, and will preserve the original numpy dtype.
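The new `to_record_batches()` generalizes the byte-budgeted batching that previously lived inside `export_parquet`, so any Arrow consumer can reuse it. A sketch, assuming a table named `my_dir.my_table` and following the read-transaction pattern the callers in this diff use:

>>> import pyarrow as pa
>>> import pixeltable as pxt
>>> from pixeltable.catalog import Catalog
>>> from pixeltable.utils.arrow import to_arrow_schema, to_record_batches
>>> df = pxt.get_table('my_dir.my_table')._df()
>>> with Catalog.get().begin_xact(for_write=False):
...     batches = list(to_record_batches(df, batch_size_bytes=32 * 2**20))
>>> arrow_tbl = pa.Table.from_batches(batches, schema=to_arrow_schema(df.schema))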
pixeltable/utils/lancedb.py ADDED
@@ -0,0 +1,88 @@
+ from __future__ import annotations
+
+ import logging
+ import shutil
+ from pathlib import Path
+ from typing import Literal
+
+ import pixeltable as pxt
+ import pixeltable.exceptions as excs
+ from pixeltable.catalog import Catalog
+ from pixeltable.env import Env
+
+ _logger = logging.getLogger('pixeltable')
+
+
+ def export_lancedb(
+ table_or_df: pxt.Table | pxt.DataFrame,
+ db_uri: Path,
+ table_name: str,
+ batch_size_bytes: int = 128 * 2**20,
+ if_exists: Literal['error', 'overwrite', 'append'] = 'error',
+ ) -> None:
+ """
+ Exports a dataframe's data to a LanceDB table.
+
+ This utilizes LanceDB's streaming interface for efficient table creation, via a sequence of in-memory pyarrow
+ `RecordBatches`, the size of which can be controlled with the `batch_size_bytes` parameter.
+
+ __Requirements:__
+
+ - `pip install lancedb`
+
+ Args:
+ table_or_df : Table or Dataframe to export.
+ db_uri: Local Path to the LanceDB database.
+ table_name : Name of the table in the LanceDB database.
+ batch_size_bytes : Maximum size in bytes for each batch.
+ if_exists: Determines the behavior if the table already exists. Must be one of the following:
+
+ - `'error'`: raise an error
+ - `'overwrite'`: overwrite the existing table
+ - `'append'`: append to the existing table
+ """
+ Env.get().require_package('lancedb')
+
+ import lancedb # type: ignore[import-untyped]
+
+ from pixeltable.utils.arrow import to_arrow_schema, to_record_batches
+
+ if if_exists not in ('error', 'overwrite', 'append'):
+ raise excs.Error("export_lancedb(): 'if_exists' must be one of: ['error', 'overwrite', 'append']")
+
+ df: pxt.DataFrame
+ if isinstance(table_or_df, pxt.catalog.Table):
+ df = table_or_df._df()
+ else:
+ df = table_or_df
+
+ db_exists = False
+ if db_uri.exists():
+ if not db_uri.is_dir():
+ raise excs.Error(f"export_lancedb(): '{db_uri!s}' exists and is not a directory")
+ db_exists = True
+
+ try:
+ db = lancedb.connect(str(db_uri))
+ lance_tbl: lancedb.LanceTable | None = None
+ try:
+ lance_tbl = db.open_table(table_name)
+ if if_exists == 'error':
+ raise excs.Error(f'export_lancedb(): table {table_name!r} already exists in {db_uri!r}')
+ except ValueError:
+ # table doesn't exist
+ pass
+
+ with Catalog.get().begin_xact(for_write=False):
+ if lance_tbl is None or if_exists == 'overwrite':
+ mode = 'overwrite' if lance_tbl is not None else 'create'
+ arrow_schema = to_arrow_schema(df.schema)
+ _ = db.create_table(table_name, to_record_batches(df, batch_size_bytes), schema=arrow_schema, mode=mode)
+ else:
+ lance_tbl.add(to_record_batches(df, batch_size_bytes))
+
+ except Exception as e:
+ # cleanup
+ if not db_exists:
+ shutil.rmtree(db_uri)
+ raise e
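A usage sketch for the new LanceDB export (database path and table names are hypothetical; requires `pip install lancedb`):

>>> from pathlib import Path
>>> import pixeltable as pxt
>>> from pixeltable.io import export_lancedb
>>> tbl = pxt.get_table('my_dir.my_table')
>>> export_lancedb(tbl, Path('/tmp/lance_demo'), 'my_table', if_exists='overwrite')
>>> import lancedb
>>> lancedb.connect('/tmp/lance_demo').open_table('my_table').count_rows()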
@@ -189,6 +189,12 @@ class MediaStore:
  result.sort(key=lambda e: e[3], reverse=True)
  return result

+ def clear(self) -> None:
+ """Clear all files from the media store."""
+ assert self.__base_dir.exists()
+ shutil.rmtree(self.__base_dir)
+ self.__base_dir.mkdir()
+

  class TempStore:
  """
@@ -235,3 +241,8 @@ class TempStore:
  if tbl_id is not None:
  return MediaStore(cls._tmp_dir())._prepare_media_path_raw(tbl_id, 0, 0, extension)
  return cls._tmp_dir() / f'{uuid.uuid4()}{extension}'
+
+ @classmethod
+ def clear(cls) -> None:
+ """Clear all files from the temporary store."""
+ MediaStore(cls._tmp_dir()).clear()