pixeltable 0.3.9__py3-none-any.whl → 0.3.11__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pixeltable/__init__.py +2 -3
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/__init__.py +2 -1
- pixeltable/catalog/catalog.py +63 -36
- pixeltable/catalog/column.py +11 -4
- pixeltable/catalog/dir.py +5 -5
- pixeltable/catalog/globals.py +28 -14
- pixeltable/catalog/insertable_table.py +81 -43
- pixeltable/catalog/path.py +2 -2
- pixeltable/catalog/table.py +140 -109
- pixeltable/catalog/table_version.py +60 -43
- pixeltable/catalog/table_version_handle.py +3 -0
- pixeltable/catalog/table_version_path.py +1 -1
- pixeltable/catalog/view.py +17 -9
- pixeltable/dataframe.py +5 -3
- pixeltable/env.py +109 -43
- pixeltable/exec/__init__.py +2 -0
- pixeltable/exec/aggregation_node.py +6 -8
- pixeltable/exec/cache_prefetch_node.py +4 -7
- pixeltable/exec/component_iteration_node.py +1 -3
- pixeltable/exec/data_row_batch.py +1 -2
- pixeltable/exec/exec_context.py +1 -1
- pixeltable/exec/exec_node.py +2 -3
- pixeltable/exec/expr_eval/__init__.py +2 -0
- pixeltable/exec/expr_eval/evaluators.py +137 -20
- pixeltable/exec/expr_eval/expr_eval_node.py +43 -64
- pixeltable/exec/expr_eval/globals.py +68 -7
- pixeltable/exec/expr_eval/schedulers.py +25 -23
- pixeltable/exec/in_memory_data_node.py +8 -6
- pixeltable/exec/row_update_node.py +3 -4
- pixeltable/exec/sql_node.py +16 -17
- pixeltable/exprs/__init__.py +3 -2
- pixeltable/exprs/arithmetic_expr.py +2 -0
- pixeltable/exprs/column_property_ref.py +1 -1
- pixeltable/exprs/column_ref.py +39 -3
- pixeltable/exprs/compound_predicate.py +1 -1
- pixeltable/exprs/data_row.py +17 -1
- pixeltable/exprs/expr.py +51 -21
- pixeltable/exprs/function_call.py +34 -2
- pixeltable/exprs/globals.py +12 -0
- pixeltable/exprs/json_mapper.py +95 -48
- pixeltable/exprs/json_path.py +3 -10
- pixeltable/exprs/method_ref.py +2 -2
- pixeltable/exprs/object_ref.py +2 -2
- pixeltable/exprs/row_builder.py +33 -6
- pixeltable/exprs/similarity_expr.py +6 -21
- pixeltable/exprs/sql_element_cache.py +1 -1
- pixeltable/exprs/string_op.py +107 -0
- pixeltable/ext/__init__.py +1 -1
- pixeltable/ext/functions/__init__.py +1 -1
- pixeltable/ext/functions/whisperx.py +1 -1
- pixeltable/ext/functions/yolox.py +22 -65
- pixeltable/func/aggregate_function.py +1 -1
- pixeltable/func/callable_function.py +2 -5
- pixeltable/func/expr_template_function.py +22 -2
- pixeltable/func/function.py +4 -5
- pixeltable/func/function_registry.py +1 -1
- pixeltable/func/signature.py +1 -1
- pixeltable/func/tools.py +2 -2
- pixeltable/func/udf.py +2 -2
- pixeltable/functions/__init__.py +2 -2
- pixeltable/functions/anthropic.py +2 -2
- pixeltable/functions/audio.py +1 -1
- pixeltable/functions/deepseek.py +1 -1
- pixeltable/functions/fireworks.py +1 -1
- pixeltable/functions/globals.py +22 -11
- pixeltable/functions/huggingface.py +1 -1
- pixeltable/functions/image.py +1 -1
- pixeltable/functions/json.py +1 -1
- pixeltable/functions/llama_cpp.py +1 -1
- pixeltable/functions/math.py +1 -1
- pixeltable/functions/mistralai.py +1 -1
- pixeltable/functions/ollama.py +1 -1
- pixeltable/functions/openai.py +2 -2
- pixeltable/functions/replicate.py +1 -1
- pixeltable/functions/string.py +1 -1
- pixeltable/functions/timestamp.py +1 -1
- pixeltable/functions/together.py +1 -1
- pixeltable/functions/util.py +1 -1
- pixeltable/functions/video.py +2 -2
- pixeltable/functions/vision.py +2 -2
- pixeltable/globals.py +85 -33
- pixeltable/index/embedding_index.py +12 -1
- pixeltable/io/__init__.py +8 -5
- pixeltable/io/datarows.py +138 -0
- pixeltable/io/external_store.py +8 -5
- pixeltable/io/fiftyone.py +6 -7
- pixeltable/io/globals.py +7 -160
- pixeltable/io/hf_datasets.py +21 -98
- pixeltable/io/label_studio.py +21 -20
- pixeltable/io/pandas.py +35 -48
- pixeltable/io/parquet.py +17 -42
- pixeltable/io/table_data_conduit.py +569 -0
- pixeltable/io/utils.py +6 -21
- pixeltable/iterators/__init__.py +1 -1
- pixeltable/metadata/__init__.py +6 -4
- pixeltable/metadata/converters/convert_24.py +3 -3
- pixeltable/metadata/converters/convert_25.py +1 -1
- pixeltable/metadata/converters/convert_29.py +1 -1
- pixeltable/metadata/converters/convert_30.py +50 -0
- pixeltable/metadata/converters/util.py +26 -1
- pixeltable/metadata/notes.py +1 -0
- pixeltable/metadata/schema.py +3 -0
- pixeltable/store.py +2 -2
- pixeltable/type_system.py +19 -7
- pixeltable/utils/arrow.py +32 -7
- pixeltable/utils/console_output.py +3 -2
- pixeltable/utils/coroutine.py +3 -3
- pixeltable/utils/dbms.py +66 -0
- pixeltable/utils/documents.py +61 -67
- pixeltable/utils/filecache.py +1 -1
- pixeltable/utils/http_server.py +3 -2
- pixeltable/utils/pytorch.py +1 -1
- pixeltable/utils/sql.py +1 -1
- pixeltable-0.3.11.dist-info/METADATA +436 -0
- pixeltable-0.3.11.dist-info/RECORD +179 -0
- {pixeltable-0.3.9.dist-info → pixeltable-0.3.11.dist-info}/WHEEL +1 -1
- pixeltable/catalog/path_dict.py +0 -169
- pixeltable-0.3.9.dist-info/METADATA +0 -382
- pixeltable-0.3.9.dist-info/RECORD +0 -175
- {pixeltable-0.3.9.dist-info → pixeltable-0.3.11.dist-info}/LICENSE +0 -0
- {pixeltable-0.3.9.dist-info → pixeltable-0.3.11.dist-info}/entry_points.txt +0 -0
pixeltable/io/hf_datasets.py
CHANGED
@@ -1,41 +1,38 @@
 from __future__ import annotations

-import logging
-import math
-import random
 import typing
 from typing import Any, Optional, Union

 import pixeltable as pxt
 import pixeltable.type_system as ts
-from pixeltable import exceptions as excs
-
-from .utils import normalize_import_parameters, normalize_schema_names

 if typing.TYPE_CHECKING:
     import datasets  # type: ignore[import-untyped]

-_logger = logging.getLogger('pixeltable')
-
-# use 100MB as the batch size limit for loading a huggingface dataset into pixeltable.
-# The primary goal is to bound memory use, regardless of dataset size.
-# Second goal is to limit overhead. 100MB is presumed to be reasonable for a lot of storage systems.
-_K_BATCH_SIZE_BYTES = 100_000_000

-# note, there are many more types. we allow overrides in the
+# note, there are many more types. we allow overrides in the schema_overrides parameter
 # to handle cases where the appropriate type is not yet mapped, or to override this mapping.
 # https://huggingface.co/docs/datasets/v2.17.0/en/package_reference/main_classes#datasets.Value
 _hf_to_pxt: dict[str, ts.ColumnType] = {
-    'int32': ts.IntType(nullable=True),  # pixeltable widens to big int
-    'int64': ts.IntType(nullable=True),
     'bool': ts.BoolType(nullable=True),
+    'int8': ts.IntType(nullable=True),
+    'int16': ts.IntType(nullable=True),
+    'int32': ts.IntType(nullable=True),
+    'int64': ts.IntType(nullable=True),
+    'uint8': ts.IntType(nullable=True),
+    'uint16': ts.IntType(nullable=True),
+    'uint32': ts.IntType(nullable=True),
+    'uint64': ts.IntType(nullable=True),
+    'float16': ts.FloatType(nullable=True),
     'float32': ts.FloatType(nullable=True),
     'float64': ts.FloatType(nullable=True),
-    'large_string': ts.StringType(nullable=True),
     'string': ts.StringType(nullable=True),
+    'large_string': ts.StringType(nullable=True),
     'timestamp[s]': ts.TimestampType(nullable=True),
     'timestamp[ms]': ts.TimestampType(nullable=True),  # HF dataset iterator converts timestamps to datetime.datetime
     'timestamp[us]': ts.TimestampType(nullable=True),
+    'date32': ts.StringType(nullable=True),  # date32 is not supported in pixeltable, use string
+    'date64': ts.StringType(nullable=True),  # date64 is not supported in pixeltable, use string
 }


@@ -88,7 +85,6 @@ def import_huggingface_dataset(
     table_path: str,
     dataset: Union[datasets.Dataset, datasets.DatasetDict],
     *,
-    column_name_for_split: Optional[str] = None,
     schema_overrides: Optional[dict[str, Any]] = None,
     primary_key: Optional[Union[str, list[str]]] = None,
     **kwargs: Any,
@@ -101,91 +97,18 @@ def import_huggingface_dataset(
         dataset: Huggingface [`datasets.Dataset`](https://huggingface.co/docs/datasets/en/package_reference/main_classes#datasets.Dataset)
             or [`datasets.DatasetDict`](https://huggingface.co/docs/datasets/en/package_reference/main_classes#datasets.DatasetDict)
             to insert into the table.
-        column_name_for_split: column name to use for split information. If None, no split information will be stored.
         schema_overrides: If specified, then for each (name, type) pair in `schema_overrides`, the column with
-            name `name` will be given type `type`, instead of being inferred from the `Dataset` or `DatasetDict`.
-            `schema_overrides` should be the column names of the `Dataset` or `DatasetDict` (whether or not
-            Pixeltable identifiers).
+            name `name` will be given type `type`, instead of being inferred from the `Dataset` or `DatasetDict`.
+            The keys in `schema_overrides` should be the column names of the `Dataset` or `DatasetDict` (whether or not
+            they are valid Pixeltable identifiers).
         primary_key: The primary key of the table (see [`create_table()`][pixeltable.create_table]).
         kwargs: Additional arguments to pass to `create_table`.
+            An argument of `column_name_for_split` must be provided if the source is a DatasetDict.
+            This column name will contain the split information. If None, no split information will be stored.

     Returns:
         A handle to the newly created [`Table`][pixeltable.Table].
     """
-
-
-
-
-    if not isinstance(dataset, (datasets.Dataset, datasets.DatasetDict)):
-        raise excs.Error(f'`type(dataset)` must be `datasets.Dataset` or `datasets.DatasetDict`. Got {type(dataset)=}')
-
-    # Create the pixeltable schema from the huggingface schema
-    hf_schema_source = _get_hf_schema(dataset)
-    schema_overrides, primary_key = normalize_import_parameters(schema_overrides, primary_key)
-    hf_schema = huggingface_schema_to_pxt_schema(hf_schema_source, schema_overrides, primary_key)
-
-    # Add the split column to the schema if requested
-    if column_name_for_split is not None:
-        if column_name_for_split in hf_schema:
-            raise excs.Error(
-                f'Column name `{column_name_for_split}` already exists in dataset schema; provide a different `column_name_for_split`'
-            )
-        hf_schema[column_name_for_split] = ts.StringType(nullable=True)
-
-    schema, pxt_pk, _ = normalize_schema_names(hf_schema, primary_key, schema_overrides, True)
-
-    # Prepare to create table and insert data
-    if table_path in pxt.list_tables():
-        raise excs.Error(f'table {table_path} already exists')
-
-    if isinstance(dataset, datasets.Dataset):
-        # when loading an hf dataset partially, dataset.split._name is sometimes the form "train[0:1000]"
-        raw_name = dataset.split._name
-        split_name = raw_name.split('[')[0] if raw_name is not None else None
-        dataset_dict = {split_name: dataset}
-    else:
-        dataset_dict = dataset
-
-    # extract all class labels from the dataset to translate category ints to strings
-    categorical_features = {
-        feature_name: feature_type.names
-        for (feature_name, feature_type) in hf_schema_source.items()
-        if isinstance(feature_type, datasets.ClassLabel)
-    }
-
-    try:
-        # random tmp name
-        tmp_name = f'{table_path}_tmp_{random.randint(0, 100000000)}'
-        tab = pxt.create_table(tmp_name, schema, primary_key=pxt_pk, **kwargs)
-
-        def _translate_row(row: dict[str, Any], split_name: str) -> dict[str, Any]:
-            output_row = row.copy()
-            # map all class labels to strings
-            for field, values in categorical_features.items():
-                output_row[field] = values[row[field]]
-            # add split name to row
-            if column_name_for_split is not None:
-                output_row[column_name_for_split] = split_name
-            return output_row
-
-        for split_name, split_dataset in dataset_dict.items():
-            num_batches = split_dataset.size_in_bytes / _K_BATCH_SIZE_BYTES
-            tuples_per_batch = math.ceil(split_dataset.num_rows / num_batches)
-            assert tuples_per_batch > 0
-
-            batch = []
-            for row in split_dataset:
-                batch.append(_translate_row(row, split_name))
-                if len(batch) >= tuples_per_batch:
-                    tab.insert(batch)
-                    batch = []
-            # last batch
-            if len(batch) > 0:
-                tab.insert(batch)
-
-    except Exception as e:
-        _logger.error(f'Error while inserting dataset into table: {tmp_name}')
-        raise e
-
-    pxt.move(tmp_name, table_path)
-    return pxt.get_table(table_path)
+    return pxt.create_table(
+        table_path, source=dataset, schema_overrides=schema_overrides, primary_key=primary_key, extra_args=kwargs
+    )
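The net effect of this change: the hand-rolled batching and temp-table logic is gone, and import_huggingface_dataset becomes a thin wrapper over pxt.create_table with a source argument. A minimal usage sketch under that reading (the table names and the rotten_tomatoes dataset are illustrative, not taken from this diff):

import datasets
import pixeltable as pxt

ds = datasets.load_dataset('rotten_tomatoes', split='train')

# column_name_for_split now travels via **kwargs instead of a dedicated parameter.
t = pxt.io.import_huggingface_dataset('reviews', ds, column_name_for_split='split')

# Per the new function body, this should be equivalent to:
t2 = pxt.create_table('reviews_direct', source=ds)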
pixeltable/io/label_studio.py
CHANGED
@@ -5,16 +5,14 @@ import os
 from dataclasses import dataclass
 from pathlib import Path
 from typing import Any, Iterator, Literal, Optional, cast
-from xml.etree import ElementTree
+from xml.etree import ElementTree as ET

 import label_studio_sdk  # type: ignore[import-untyped]
 import PIL.Image
 from requests.exceptions import HTTPError

 import pixeltable as pxt
-import pixeltable.env as env
-import pixeltable.exceptions as excs
-from pixeltable import Column, Table
+from pixeltable import Column, Table, env, exceptions as excs
 from pixeltable.config import Config
 from pixeltable.exprs import ColumnRef, DataRow, Expr
 from pixeltable.io.external_store import Project, SyncStatus
@@ -140,7 +138,8 @@ class LabelStudioProject(Project):
             page += 1
         if unknown_task_count > 0:
             _logger.warning(
-                f'Skipped {unknown_task_count} unrecognized task(s) when syncing Label Studio project {self.project_title!r}.'
+                f'Skipped {unknown_task_count} unrecognized task(s) when syncing '
+                f'Label Studio project {self.project_title!r}.'
             )

     def __update_tasks(self, t: Table, existing_tasks: dict[tuple, dict]) -> SyncStatus:
@@ -174,11 +173,11 @@ class LabelStudioProject(Project):
             # Send media to Label Studio by HTTP post.
             assert len(t_data_cols) == 1  # This was verified when the project was set up
             return self.__update_tasks_by_post(t, existing_tasks, t_data_cols[0], t_rl_cols, rl_info)
-        elif self.media_import_method == 'file' or self.media_import_method == 'url':
+        elif self.media_import_method in ('file', 'url'):
             # Send media to Label Studio by file reference (local file or URL).
             return self.__update_tasks_by_files(t, existing_tasks, t_data_cols, t_rl_cols, rl_info)
         else:
-            assert False
+            raise AssertionError()

     def __update_tasks_by_post(
         self,
@@ -227,7 +226,7 @@ class LabelStudioProject(Project):
                 )
                 for i in range(len(coco_annotations))
             ]
-            _logger.debug(f'`predictions`: {predictions}')
+            _logger.debug('`predictions`: {%s}', predictions)
             self.project.create_predictions(predictions)
             tasks_created += 1
@@ -358,7 +357,7 @@ class LabelStudioProject(Project):
     def __localpath_to_lspath(cls, localpath: str) -> str:
         # Transform the local path into Label Studio's bespoke path format.
         relpath = Path(localpath).relative_to(Config.get().home)
-        return f'/data/local-files/?d={
+        return f'/data/local-files/?d={relpath}'

     def __delete_stale_tasks(
         self, existing_tasks: dict[tuple, dict], row_ids_in_pxt: set[tuple], tasks_created: int
@@ -405,7 +404,8 @@ class LabelStudioProject(Project):
         updates = [{'_rowid': rowid, local_annotations_col.name: ann} for rowid, ann in annotations.items()]
         if len(updates) > 0:
             _logger.info(
-                f'Updating table {t._name!r}, column {local_annotations_col.name!r} with {len(updates)} total annotations.'
+                f'Updating table {t._name!r}, column {local_annotations_col.name!r} '
+                f'with {len(updates)} total annotations.'
             )
             # batch_update currently doesn't propagate from views to base tables. As a workaround, we call
             # batch_update on the actual ancestor table that holds the annotations column.
@@ -451,7 +451,7 @@ class LabelStudioProject(Project):
        Parses a Label Studio XML config, extracting the names and Pixeltable types of
        all input variables.
        """
-        root: ElementTree.Element = ElementTree.fromstring(xml_config)
+        root: ET.Element = ET.fromstring(xml_config)
         if root.tag.lower() != 'view':
             raise excs.Error('Root of Label Studio config must be a `View`')
         config = _LabelStudioConfig(
@@ -461,7 +461,7 @@ class LabelStudioProject(Project):
         return config

     @classmethod
-    def __parse_data_keys_config(cls, root: ElementTree.Element) -> dict[str, '_DataKey']:
+    def __parse_data_keys_config(cls, root: ET.Element) -> dict[str, '_DataKey']:
         """Parses the data keys from a Label Studio XML config."""
         config: dict[str, '_DataKey'] = {}
         for element in root:
@@ -477,7 +477,7 @@ class LabelStudioProject(Project):
         return config

     @classmethod
-    def __parse_rectangle_labels_config(cls, root: ElementTree.Element) -> dict[str, '_RectangleLabel']:
+    def __parse_rectangle_labels_config(cls, root: ET.Element) -> dict[str, '_RectangleLabel']:
         """Parses the RectangleLabels from a Label Studio XML config."""
         config: dict[str, '_RectangleLabel'] = {}
         for element in root:
@@ -534,7 +534,7 @@ class LabelStudioProject(Project):
         _label_studio_client().delete_project(self.project_id)
         env.Env.get().console_logger.info(f'Deleted Label Studio project: {title}')

-    def __eq__(self, other) -> bool:
+    def __eq__(self, other: object) -> bool:
         return isinstance(other, LabelStudioProject) and self.project_id == other.project_id

     def __hash__(self) -> int:
@@ -576,7 +576,7 @@ class LabelStudioProject(Project):
             local_annotations_column = ANNOTATIONS_COLUMN
         else:
             local_annotations_column = next(k for k, v in col_mapping.items() if v == ANNOTATIONS_COLUMN)
-        if local_annotations_column not in t._schema.keys():
+        if local_annotations_column not in t._schema:
             t.add_columns({local_annotations_column: pxt.JsonType(nullable=True)})

         resolved_col_mapping = cls.validate_columns(
@@ -591,9 +591,9 @@ class LabelStudioProject(Project):
         if media_import_method != 'url':
             raise excs.Error("`s3_configuration` is only valid when `media_import_method == 'url'`")
         s3_configuration = copy.copy(s3_configuration)
-        if not 'bucket' in s3_configuration:
+        if 'bucket' not in s3_configuration:
             raise excs.Error('`s3_configuration` must contain a `bucket` field')
-        if not 'title' in s3_configuration:
+        if 'title' not in s3_configuration:
             s3_configuration['title'] = 'Pixeltable-S3-Import-Storage'
         if (
             'aws_access_key_id' not in s3_configuration
@@ -633,7 +633,8 @@ class LabelStudioProject(Project):
                 raise excs.Error(
                     '`media_import_method` is set to `file`, but your Label Studio server is not configured '
                     'for local file storage.\nPlease set the `LABEL_STUDIO_LOCAL_FILES_SERVING_ENABLED` '
-                    'environment variable to `true` in the environment where your Label Studio server is running.'
+                    'environment variable to `true` in the environment where your Label Studio server '
+                    'is running.'
                 ) from exc
             raise  # Handle any other exception type normally

@@ -663,7 +664,7 @@ class _LabelStudioConfig:
     rectangle_labels: dict[str, _RectangleLabel]

     def validate(self) -> None:
-        data_key_names = set(key.name for key in self.data_keys.values() if key.name is not None)
+        data_key_names = {key.name for key in self.data_keys.values() if key.name is not None}
         for name, rl in self.rectangle_labels.items():
             if rl.to_name not in data_key_names:
                 raise excs.Error(
@@ -674,7 +675,7 @@ class _LabelStudioConfig:
     @property
     def export_columns(self) -> dict[str, pxt.ColumnType]:
         data_key_cols = {key_id: key_info.column_type for key_id, key_info in self.data_keys.items()}
-        rl_cols = {name: pxt.JsonType() for name in self.rectangle_labels.keys()}
+        rl_cols = {name: pxt.JsonType() for name in self.rectangle_labels}
         return {**data_key_cols, **rl_cols}
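Most of these hunks are mechanical cleanups (string-literal splits, membership tests, type annotations); the recurring substantive one is the ElementTree import alias. A self-contained sketch of the XML-config parsing these hunks touch, with an invented config for illustration:

from xml.etree import ElementTree as ET

xml_config = '''
<View>
  <Image name="frame" value="$frame"/>
  <RectangleLabels name="label" toName="frame">
    <Label value="Defect"/>
  </RectangleLabels>
</View>
'''

root: ET.Element = ET.fromstring(xml_config)
# LabelStudioProject requires the root element to be a <View>, as in the hunk above.
if root.tag.lower() != 'view':
    raise ValueError('Root of Label Studio config must be a `View`')
for element in root:
    print(element.tag, element.attrib)  # Image, RectangleLabels, ...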
pixeltable/io/pandas.py
CHANGED
@@ -1,3 +1,4 @@
+import os
 from typing import Any, Optional, Union

 import numpy as np
@@ -7,9 +8,6 @@ from pandas.api.types import is_datetime64_any_dtype, is_extension_array_dtype

 import pixeltable as pxt
 import pixeltable.exceptions as excs
-from pixeltable import Table
-
-from .utils import find_or_create_table, normalize_import_parameters, normalize_schema_names


 def import_pandas(
@@ -43,30 +41,24 @@ def import_pandas(
     Returns:
         A handle to the newly created [`Table`][pixeltable.Table].
     """
-
-
-
-
-
-
-
-    tbl_rows = [__df_row_to_pxt_row(row, pd_schema, col_mapping) for row in df.itertuples()]
-
-    table = find_or_create_table(
-        tbl_name, schema, primary_key=pxt_pk, num_retained_versions=num_retained_versions, comment=comment
+    return pxt.create_table(
+        tbl_name,
+        source=df,
+        schema_overrides=schema_overrides,
+        primary_key=primary_key,
+        num_retained_versions=num_retained_versions,
+        comment=comment,
     )
-    table.insert(tbl_rows)
-    return table


 def import_csv(
     tbl_name: str,
-    filepath_or_buffer,
+    filepath_or_buffer: Union[str, os.PathLike],
     schema_overrides: Optional[dict[str, Any]] = None,
     primary_key: Optional[Union[str, list[str]]] = None,
     num_retained_versions: int = 10,
     comment: str = '',
-    **kwargs,
+    **kwargs: Any,
 ) -> pxt.Table:
     """
     Creates a new base table from a csv file. This is a convenience method and is equivalent
@@ -77,26 +69,26 @@ def import_csv(
     Returns:
         A handle to the newly created [`Table`][pixeltable.Table].
     """
-
-    return import_pandas(
+    return pxt.create_table(
         tbl_name,
-
+        source=filepath_or_buffer,
         schema_overrides=schema_overrides,
         primary_key=primary_key,
         num_retained_versions=num_retained_versions,
         comment=comment,
+        extra_args=kwargs,
     )


 def import_excel(
     tbl_name: str,
-    io,
-
+    io: Union[str, os.PathLike],
+    *,
     schema_overrides: Optional[dict[str, Any]] = None,
     primary_key: Optional[Union[str, list[str]]] = None,
     num_retained_versions: int = 10,
     comment: str = '',
-    **kwargs,
+    **kwargs: Any,
 ) -> pxt.Table:
     """
     Creates a new base table from an Excel (.xlsx) file. This is a convenience method and is
@@ -107,18 +99,18 @@ def import_excel(
     Returns:
         A handle to the newly created [`Table`][pixeltable.Table].
     """
-
-    return import_pandas(
+    return pxt.create_table(
         tbl_name,
-
+        source=io,
         schema_overrides=schema_overrides,
         primary_key=primary_key,
         num_retained_versions=num_retained_versions,
         comment=comment,
+        extra_args=kwargs,
    )


-def __df_check_primary_key_values(df: pd.DataFrame, primary_key: list[str]) -> None:
+def _df_check_primary_key_values(df: pd.DataFrame, primary_key: list[str]) -> None:
     for pd_name in primary_key:
         # This can be faster for large DataFrames
         has_nulls = df[pd_name].count() < len(df)
@@ -146,15 +138,6 @@ def df_infer_schema(
     return pd_schema


-"""
-# Check if a datetime64[ns, UTC] dtype
-def is_datetime_tz_utc(x: Any) -> bool:
-    if isinstance(x, pd.Timestamp) and x.tzinfo is not None and str(x.tzinfo) == 'UTC':
-        return True
-    return pd.api.types.is_datetime64tz_dtype(x) and str(x).endswith('UTC]')
-"""
-
-
 def __pd_dtype_to_pxt_type(pd_dtype: DtypeObj, nullable: bool) -> Optional[pxt.ColumnType]:
     """
     Determines a pixeltable ColumnType from a pandas dtype
@@ -165,7 +148,8 @@ def __pd_dtype_to_pxt_type(pd_dtype: DtypeObj, nullable: bool) -> Optional[pxt.ColumnType]:
     Returns:
         pxt.ColumnType: A pixeltable ColumnType
     """
-    # Pandas extension arrays / types (Int64, boolean, string[pyarrow], etc.) are not directly compatible with NumPy dtypes
+    # Pandas extension arrays / types (Int64, boolean, string[pyarrow], etc.) are not directly
+    # compatible with NumPy dtypes
     # The timezone-aware datetime64[ns, tz=] dtype is a pandas extension dtype
     if is_datetime64_any_dtype(pd_dtype):
         return pxt.TimestampType(nullable=nullable)
@@ -204,32 +188,35 @@ def __pd_coltype_to_pxt_type(pd_dtype: DtypeObj, data_col: pd.Series, nullable: bool) -> Optional[pxt.ColumnType]:
     raise excs.Error(f'Could not infer Pixeltable type of column: {data_col.name} (dtype: {pd_dtype})')


-def __df_row_to_pxt_row(
+def _df_row_to_pxt_row(
     row: tuple[Any, ...], schema: dict[str, pxt.ColumnType], col_mapping: Optional[dict[str, str]]
 ) -> dict[str, Any]:
     """Convert a row to insertable format"""
     pxt_row: dict[str, Any] = {}
     for val, (col_name, pxt_type) in zip(row[1:], schema.items()):
+        pxt_name = col_mapping.get(col_name, col_name)
+        nval: Any
         if pxt_type.is_float_type():
-
+            nval = float(val)
         elif isinstance(val, float) and np.isnan(val):
             # pandas uses NaN for empty cells, even for types other than float;
             # for any type but a float, convert these to None
-
+            nval = None
         elif pxt_type.is_int_type():
-
+            nval = int(val)
         elif pxt_type.is_bool_type():
-
+            nval = bool(val)
         elif pxt_type.is_string_type():
-
+            nval = str(val)
         elif pxt_type.is_timestamp_type():
             if pd.isnull(val):
                 # pandas has the bespoke 'NaT' type for a missing timestamp; postgres is very
                 # much not-ok with it. (But if we convert it to None and then load out the
                 # table contents as a pandas DataFrame, it will correctly restore the 'NaT'!)
-
+                nval = None
             else:
-
-
-
+                nval = pd.Timestamp(val).to_pydatetime()
+        else:
+            nval = val
+        pxt_row[pxt_name] = nval
     return pxt_row
pixeltable/io/parquet.py
CHANGED
@@ -4,7 +4,6 @@ import datetime
 import io
 import json
 import logging
-import random
 import typing
 from collections import deque
 from pathlib import Path
@@ -14,12 +13,10 @@ import numpy as np
 import PIL.Image

 import pixeltable as pxt
-import pixeltable.exceptions as exc
+import pixeltable.exceptions as excs
 from pixeltable.env import Env
 from pixeltable.utils.transactional_directory import transactional_directory

-from .utils import normalize_import_parameters, normalize_schema_names
-
 if typing.TYPE_CHECKING:
     import pyarrow as pa

@@ -78,7 +75,7 @@ def export_parquet(
     arrow_schema = to_arrow_schema(df.schema)

     if not inline_images and any(col_type.is_image_type() for col_type in df.schema.values()):
-        raise exc.Error('Cannot export Dataframe with image columns when inline_images is False')
+        raise excs.Error('Cannot export Dataframe with image columns when inline_images is False')

     # store the changes atomically
     with transactional_directory(parquet_path) as temp_path:
@@ -87,7 +84,7 @@
         json.dump(type_dict, (temp_path / '.pixeltable.column_types.json').open('w'))  # keep type metadata

         batch_num = 0
-        current_value_batch: dict[str, deque] = {k: deque() for k in df.schema.keys()}
+        current_value_batch: dict[str, deque] = {k: deque() for k in df.schema}
         current_byte_estimate = 0

         with Env.get().begin_xact():
@@ -111,7 +108,7 @@
                             val.save(buf, format='PNG')
                             val = buf.getvalue()
                         else:
-
+                            raise excs.Error(f'unknown image type {type(val)}')
                         length = len(val)
                     elif col_type.is_string_type():
                         length = len(val)
@@ -119,16 +116,14 @@
                         if data_row.file_paths is not None and data_row.file_paths[e.slot_idx] is not None:
                             val = data_row.file_paths[e.slot_idx]
                         else:
-
+                            raise excs.Error(f'unknown video type {type(val)}')
                         length = len(val)
                     elif col_type.is_json_type():
                         val = json.dumps(val)
                         length = len(val)
                     elif col_type.is_array_type():
                         length = val.nbytes
-                    elif col_type.is_int_type():
-                        length = 8
-                    elif col_type.is_float_type():
+                    elif col_type.is_int_type() or col_type.is_float_type():
                         length = 8
                     elif col_type.is_bool_type():
                         length = 1
@@ -136,7 +131,7 @@
                         val = val.astimezone(datetime.timezone.utc)
                         length = 8
                     else:
-
+                        raise excs.Error(f'unknown type {col_type} for {col_name}')

                     current_value_batch[col_name].append(val)
                     current_byte_estimate += length
@@ -144,7 +139,7 @@
                     assert batch_num < 100_000, 'wrote too many parquet files, unclear ordering'
                     _write_batch(current_value_batch, arrow_schema, temp_path / f'part-{batch_num:05d}.parquet')
                     batch_num += 1
-                    current_value_batch = {k: deque() for k in df.schema.keys()}
+                    current_value_batch = {k: deque() for k in df.schema}
                     current_byte_estimate = 0

         _write_batch(current_value_batch, arrow_schema, temp_path / f'part-{batch_num:05d}.parquet')
@@ -173,32 +168,12 @@ def import_parquet(
     Returns:
         A handle to the newly created table.
     """
-
-
-
-
-
-
-
-
-
-    schema, pxt_pk, col_mapping = normalize_schema_names(ar_schema, primary_key, schema_overrides, False)
-
-    if table in pxt.list_tables():
-        raise exc.Error(f'Table {table} already exists')
-
-    tmp_name = f'{table}_tmp_{random.randint(0, 100000000)}'
-    total_rows = 0
-    try:
-        tab = pxt.create_table(tmp_name, schema, primary_key=pxt_pk, **kwargs)
-        for fragment in parquet_dataset.fragments:  # type: ignore[attr-defined]
-            for batch in fragment.to_batches():
-                dict_batch = list(iter_tuples2(batch, col_mapping, schema))
-                total_rows += len(dict_batch)
-                tab.insert(dict_batch)
-    except Exception as e:
-        _logger.error(f'Error after inserting {total_rows} rows from Parquet file into table: {e}')
-        raise e
-
-    pxt.move(tmp_name, table)
-    return pxt.get_table(table)
+    value = kwargs.pop('source_format', None)
+    return pxt.create_table(
+        table,
+        source=parquet_path,
+        source_format=value,
+        schema_overrides=schema_overrides,
+        primary_key=primary_key,
+        extra_args=kwargs,
+    )
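import_parquet gets the same treatment, with one wrinkle: source_format is popped out of kwargs and forwarded explicitly, while the remaining kwargs become extra_args. A sketch of the resulting call surface (path and override names are illustrative; the full signature is not shown in this diff):

import pixeltable as pxt

t = pxt.io.import_parquet(
    'events',
    parquet_path='data/events.parquet',
    schema_overrides={'payload': pxt.Json},
)
# Per the new body, this roughly amounts to:
#   pxt.create_table('events', source='data/events.parquet', source_format=None,
#                    schema_overrides={'payload': pxt.Json}, primary_key=None, extra_args={})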