pixeltable 0.3.9__py3-none-any.whl → 0.3.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of pixeltable might be problematic.

Files changed (44)
  1. pixeltable/__init__.py +1 -2
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/column.py +5 -0
  4. pixeltable/catalog/globals.py +16 -0
  5. pixeltable/catalog/insertable_table.py +82 -41
  6. pixeltable/catalog/table.py +78 -55
  7. pixeltable/catalog/table_version.py +18 -3
  8. pixeltable/catalog/view.py +9 -2
  9. pixeltable/env.py +1 -1
  10. pixeltable/exec/exec_node.py +1 -1
  11. pixeltable/exprs/__init__.py +2 -1
  12. pixeltable/exprs/arithmetic_expr.py +2 -0
  13. pixeltable/exprs/column_ref.py +36 -0
  14. pixeltable/exprs/expr.py +39 -9
  15. pixeltable/exprs/globals.py +12 -0
  16. pixeltable/exprs/json_mapper.py +1 -1
  17. pixeltable/exprs/json_path.py +0 -6
  18. pixeltable/exprs/similarity_expr.py +5 -20
  19. pixeltable/exprs/string_op.py +107 -0
  20. pixeltable/ext/functions/yolox.py +21 -64
  21. pixeltable/func/tools.py +2 -2
  22. pixeltable/functions/__init__.py +1 -1
  23. pixeltable/functions/globals.py +16 -5
  24. pixeltable/globals.py +85 -33
  25. pixeltable/io/__init__.py +3 -2
  26. pixeltable/io/datarows.py +138 -0
  27. pixeltable/io/external_store.py +8 -5
  28. pixeltable/io/globals.py +7 -160
  29. pixeltable/io/hf_datasets.py +21 -98
  30. pixeltable/io/pandas.py +29 -43
  31. pixeltable/io/parquet.py +17 -42
  32. pixeltable/io/table_data_conduit.py +569 -0
  33. pixeltable/io/utils.py +6 -21
  34. pixeltable/metadata/__init__.py +1 -1
  35. pixeltable/metadata/converters/convert_30.py +50 -0
  36. pixeltable/metadata/converters/util.py +26 -1
  37. pixeltable/metadata/notes.py +1 -0
  38. pixeltable/metadata/schema.py +3 -0
  39. pixeltable/utils/arrow.py +32 -7
  40. {pixeltable-0.3.9.dist-info → pixeltable-0.3.10.dist-info}/METADATA +1 -1
  41. {pixeltable-0.3.9.dist-info → pixeltable-0.3.10.dist-info}/RECORD +44 -40
  42. {pixeltable-0.3.9.dist-info → pixeltable-0.3.10.dist-info}/WHEEL +1 -1
  43. {pixeltable-0.3.9.dist-info → pixeltable-0.3.10.dist-info}/LICENSE +0 -0
  44. {pixeltable-0.3.9.dist-info → pixeltable-0.3.10.dist-info}/entry_points.txt +0 -0
pixeltable/io/hf_datasets.py CHANGED
@@ -1,41 +1,38 @@
  from __future__ import annotations
 
- import logging
- import math
- import random
  import typing
  from typing import Any, Optional, Union
 
  import pixeltable as pxt
  import pixeltable.type_system as ts
- from pixeltable import exceptions as excs
-
- from .utils import normalize_import_parameters, normalize_schema_names
 
  if typing.TYPE_CHECKING:
      import datasets  # type: ignore[import-untyped]
 
- _logger = logging.getLogger('pixeltable')
-
- # use 100MB as the batch size limit for loading a huggingface dataset into pixeltable.
- # The primary goal is to bound memory use, regardless of dataset size.
- # Second goal is to limit overhead. 100MB is presumed to be reasonable for a lot of storage systems.
- _K_BATCH_SIZE_BYTES = 100_000_000
 
- # note, there are many more types. we allow overrides in the schema_override parameter
+ # note, there are many more types. we allow overrides in the schema_overrides parameter
  # to handle cases where the appropriate type is not yet mapped, or to override this mapping.
  # https://huggingface.co/docs/datasets/v2.17.0/en/package_reference/main_classes#datasets.Value
  _hf_to_pxt: dict[str, ts.ColumnType] = {
-     'int32': ts.IntType(nullable=True),  # pixeltable widens to big int
-     'int64': ts.IntType(nullable=True),
      'bool': ts.BoolType(nullable=True),
+     'int8': ts.IntType(nullable=True),
+     'int16': ts.IntType(nullable=True),
+     'int32': ts.IntType(nullable=True),
+     'int64': ts.IntType(nullable=True),
+     'uint8': ts.IntType(nullable=True),
+     'uint16': ts.IntType(nullable=True),
+     'uint32': ts.IntType(nullable=True),
+     'uint64': ts.IntType(nullable=True),
+     'float16': ts.FloatType(nullable=True),
      'float32': ts.FloatType(nullable=True),
      'float64': ts.FloatType(nullable=True),
-     'large_string': ts.StringType(nullable=True),
      'string': ts.StringType(nullable=True),
+     'large_string': ts.StringType(nullable=True),
      'timestamp[s]': ts.TimestampType(nullable=True),
      'timestamp[ms]': ts.TimestampType(nullable=True),  # HF dataset iterator converts timestamps to datetime.datetime
      'timestamp[us]': ts.TimestampType(nullable=True),
+     'date32': ts.StringType(nullable=True),  # date32 is not supported in pixeltable, use string
+     'date64': ts.StringType(nullable=True),  # date64 is not supported in pixeltable, use string
  }
 
 
@@ -88,7 +85,6 @@ def import_huggingface_dataset(
      table_path: str,
      dataset: Union[datasets.Dataset, datasets.DatasetDict],
      *,
-     column_name_for_split: Optional[str] = None,
      schema_overrides: Optional[dict[str, Any]] = None,
      primary_key: Optional[Union[str, list[str]]] = None,
      **kwargs: Any,
@@ -101,91 +97,18 @@ def import_huggingface_dataset(
          dataset: Huggingface [`datasets.Dataset`](https://huggingface.co/docs/datasets/en/package_reference/main_classes#datasets.Dataset)
              or [`datasets.DatasetDict`](https://huggingface.co/docs/datasets/en/package_reference/main_classes#datasets.DatasetDict)
              to insert into the table.
-         column_name_for_split: column name to use for split information. If None, no split information will be stored.
          schema_overrides: If specified, then for each (name, type) pair in `schema_overrides`, the column with
-             name `name` will be given type `type`, instead of being inferred from the `Dataset` or `DatasetDict`. The keys in
-             `schema_overrides` should be the column names of the `Dataset` or `DatasetDict` (whether or not they are valid
-             Pixeltable identifiers).
+             name `name` will be given type `type`, instead of being inferred from the `Dataset` or `DatasetDict`.
+             The keys in `schema_overrides` should be the column names of the `Dataset` or `DatasetDict` (whether or not
+             they are valid Pixeltable identifiers).
          primary_key: The primary key of the table (see [`create_table()`][pixeltable.create_table]).
          kwargs: Additional arguments to pass to `create_table`.
+             An argument of `column_name_for_split` must be provided if the source is a DatasetDict.
+             This column name will contain the split information. If None, no split information will be stored.
 
      Returns:
          A handle to the newly created [`Table`][pixeltable.Table].
      """
-     import datasets
-
-     import pixeltable as pxt
-
-     if not isinstance(dataset, (datasets.Dataset, datasets.DatasetDict)):
-         raise excs.Error(f'`type(dataset)` must be `datasets.Dataset` or `datasets.DatasetDict`. Got {type(dataset)=}')
-
-     # Create the pixeltable schema from the huggingface schema
-     hf_schema_source = _get_hf_schema(dataset)
-     schema_overrides, primary_key = normalize_import_parameters(schema_overrides, primary_key)
-     hf_schema = huggingface_schema_to_pxt_schema(hf_schema_source, schema_overrides, primary_key)
-
-     # Add the split column to the schema if requested
-     if column_name_for_split is not None:
-         if column_name_for_split in hf_schema:
-             raise excs.Error(
-                 f'Column name `{column_name_for_split}` already exists in dataset schema; provide a different `column_name_for_split`'
-             )
-         hf_schema[column_name_for_split] = ts.StringType(nullable=True)
-
-     schema, pxt_pk, _ = normalize_schema_names(hf_schema, primary_key, schema_overrides, True)
-
-     # Prepare to create table and insert data
-     if table_path in pxt.list_tables():
-         raise excs.Error(f'table {table_path} already exists')
-
-     if isinstance(dataset, datasets.Dataset):
-         # when loading an hf dataset partially, dataset.split._name is sometimes the form "train[0:1000]"
-         raw_name = dataset.split._name
-         split_name = raw_name.split('[')[0] if raw_name is not None else None
-         dataset_dict = {split_name: dataset}
-     else:
-         dataset_dict = dataset
-
-     # extract all class labels from the dataset to translate category ints to strings
-     categorical_features = {
-         feature_name: feature_type.names
-         for (feature_name, feature_type) in hf_schema_source.items()
-         if isinstance(feature_type, datasets.ClassLabel)
-     }
-
-     try:
-         # random tmp name
-         tmp_name = f'{table_path}_tmp_{random.randint(0, 100000000)}'
-         tab = pxt.create_table(tmp_name, schema, primary_key=pxt_pk, **kwargs)
-
-         def _translate_row(row: dict[str, Any], split_name: str) -> dict[str, Any]:
-             output_row = row.copy()
-             # map all class labels to strings
-             for field, values in categorical_features.items():
-                 output_row[field] = values[row[field]]
-             # add split name to row
-             if column_name_for_split is not None:
-                 output_row[column_name_for_split] = split_name
-             return output_row
-
-         for split_name, split_dataset in dataset_dict.items():
-             num_batches = split_dataset.size_in_bytes / _K_BATCH_SIZE_BYTES
-             tuples_per_batch = math.ceil(split_dataset.num_rows / num_batches)
-             assert tuples_per_batch > 0
-
-             batch = []
-             for row in split_dataset:
-                 batch.append(_translate_row(row, split_name))
-                 if len(batch) >= tuples_per_batch:
-                     tab.insert(batch)
-                     batch = []
-             # last batch
-             if len(batch) > 0:
-                 tab.insert(batch)
-
-     except Exception as e:
-         _logger.error(f'Error while inserting dataset into table: {tmp_name}')
-         raise e
-
-     pxt.move(tmp_name, table_path)
-     return pxt.get_table(table_path)
+     return pxt.create_table(
+         table_path, source=dataset, schema_overrides=schema_overrides, primary_key=primary_key, extra_args=kwargs
+     )
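
The practical upshot of the hf_datasets.py rewrite: `import_huggingface_dataset()` no longer builds a temp table and batches inserts itself; it is now a thin wrapper over `pxt.create_table()` with the dataset passed as `source`. A minimal usage sketch, assuming `import_huggingface_dataset` remains exported from `pixeltable.io` and using an illustrative dataset name that is not part of this diff:

```python
import datasets
from pixeltable.io import import_huggingface_dataset

# Illustrative dataset/split; any datasets.Dataset or datasets.DatasetDict works.
ds = datasets.load_dataset('rotten_tomatoes', split='train[:100]')

# As of 0.3.10, schema inference, batching, and table creation all happen
# inside pxt.create_table(); this call just forwards its arguments there.
tbl = import_huggingface_dataset('films', ds)

# column_name_for_split is no longer a named parameter: per the new docstring,
# it travels through **kwargs (as extra_args) when importing a DatasetDict.
```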
pixeltable/io/pandas.py CHANGED
@@ -7,9 +7,6 @@ from pandas.api.types import is_datetime64_any_dtype, is_extension_array_dtype
 
  import pixeltable as pxt
  import pixeltable.exceptions as excs
- from pixeltable import Table
-
- from .utils import find_or_create_table, normalize_import_parameters, normalize_schema_names
 
 
  def import_pandas(
@@ -43,20 +40,14 @@ def import_pandas(
      Returns:
          A handle to the newly created [`Table`][pixeltable.Table].
      """
-     schema_overrides, primary_key = normalize_import_parameters(schema_overrides, primary_key)
-     pd_schema = df_infer_schema(df, schema_overrides, primary_key)
-     schema, pxt_pk, col_mapping = normalize_schema_names(pd_schema, primary_key, schema_overrides, False)
-
-     __check_primary_key_values(df, primary_key)
-
-     # Convert all rows to insertable format
-     tbl_rows = [__df_row_to_pxt_row(row, pd_schema, col_mapping) for row in df.itertuples()]
-
-     table = find_or_create_table(
-         tbl_name, schema, primary_key=pxt_pk, num_retained_versions=num_retained_versions, comment=comment
+     return pxt.create_table(
+         tbl_name,
+         source=df,
+         schema_overrides=schema_overrides,
+         primary_key=primary_key,
+         num_retained_versions=num_retained_versions,
+         comment=comment,
      )
-     table.insert(tbl_rows)
-     return table
 
 
  def import_csv(
@@ -77,14 +68,14 @@ def import_csv(
      Returns:
          A handle to the newly created [`Table`][pixeltable.Table].
      """
-     df = pd.read_csv(filepath_or_buffer, **kwargs)
-     return import_pandas(
+     return pxt.create_table(
          tbl_name,
-         df,
+         source=filepath_or_buffer,
          schema_overrides=schema_overrides,
          primary_key=primary_key,
          num_retained_versions=num_retained_versions,
          comment=comment,
+         extra_args=kwargs,
      )
 
 
@@ -107,18 +98,18 @@ def import_excel(
      Returns:
          A handle to the newly created [`Table`][pixeltable.Table].
      """
-     df = pd.read_excel(io, *args, **kwargs)
-     return import_pandas(
+     return pxt.create_table(
          tbl_name,
-         df,
+         source=io,
          schema_overrides=schema_overrides,
          primary_key=primary_key,
          num_retained_versions=num_retained_versions,
          comment=comment,
+         extra_args=kwargs,
      )
 
 
- def __check_primary_key_values(df: pd.DataFrame, primary_key: list[str]) -> None:
+ def _df_check_primary_key_values(df: pd.DataFrame, primary_key: list[str]) -> None:
      for pd_name in primary_key:
          # This can be faster for large DataFrames
          has_nulls = df[pd_name].count() < len(df)
@@ -146,15 +137,6 @@ def df_infer_schema(
      return pd_schema
 
 
- """
- # Check if a datetime64[ns, UTC] dtype
- def is_datetime_tz_utc(x: Any) -> bool:
-     if isinstance(x, pd.Timestamp) and x.tzinfo is not None and str(x.tzinfo) == 'UTC':
-         return True
-     return pd.api.types.is_datetime64tz_dtype(x) and str(x).endswith('UTC]')
- """
-
-
  def __pd_dtype_to_pxt_type(pd_dtype: DtypeObj, nullable: bool) -> Optional[pxt.ColumnType]:
      """
      Determines a pixeltable ColumnType from a pandas dtype
@@ -165,7 +147,8 @@ def __pd_dtype_to_pxt_type(pd_dtype: DtypeObj, nullable: bool) -> Optional[pxt.C
      Returns:
          pxt.ColumnType: A pixeltable ColumnType
      """
-     # Pandas extension arrays / types (Int64, boolean, string[pyarrow], etc.) are not directly compatible with NumPy dtypes
+     # Pandas extension arrays / types (Int64, boolean, string[pyarrow], etc.) are not directly
+     # compatible with NumPy dtypes
      # The timezone-aware datetime64[ns, tz=] dtype is a pandas extension dtype
      if is_datetime64_any_dtype(pd_dtype):
          return pxt.TimestampType(nullable=nullable)
@@ -204,32 +187,35 @@ def __pd_coltype_to_pxt_type(pd_dtype: DtypeObj, data_col: pd.Series, nullable:
          raise excs.Error(f'Could not infer Pixeltable type of column: {data_col.name} (dtype: {pd_dtype})')
 
 
- def __df_row_to_pxt_row(
+ def _df_row_to_pxt_row(
      row: tuple[Any, ...], schema: dict[str, pxt.ColumnType], col_mapping: Optional[dict[str, str]]
  ) -> dict[str, Any]:
      """Convert a row to insertable format"""
      pxt_row: dict[str, Any] = {}
      for val, (col_name, pxt_type) in zip(row[1:], schema.items()):
+         pxt_name = col_mapping.get(col_name, col_name)
+         nval: Any
          if pxt_type.is_float_type():
-             val = float(val)
+             nval = float(val)
          elif isinstance(val, float) and np.isnan(val):
              # pandas uses NaN for empty cells, even for types other than float;
              # for any type but a float, convert these to None
-             val = None
+             nval = None
          elif pxt_type.is_int_type():
-             val = int(val)
+             nval = int(val)
          elif pxt_type.is_bool_type():
-             val = bool(val)
+             nval = bool(val)
          elif pxt_type.is_string_type():
-             val = str(val)
+             nval = str(val)
          elif pxt_type.is_timestamp_type():
              if pd.isnull(val):
                  # pandas has the bespoke 'NaT' type for a missing timestamp; postgres is very
                  # much not-ok with it. (But if we convert it to None and then load out the
                  # table contents as a pandas DataFrame, it will correctly restore the 'NaT'!)
-                 val = None
+                 nval = None
              else:
-                 val = pd.Timestamp(val).to_pydatetime()
-         pxt_name = col_name if col_mapping is None else col_mapping[col_name]
-         pxt_row[pxt_name] = val
+                 nval = pd.Timestamp(val).to_pydatetime()
+         else:
+             nval = val
+         pxt_row[pxt_name] = nval
      return pxt_row
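
pandas.py follows the same pattern: `import_pandas()`, `import_csv()`, and `import_excel()` now forward directly to `pxt.create_table()`, with reader keyword arguments carried in `extra_args` instead of being applied via `pd.read_csv`/`pd.read_excel` inside these functions. The helpers also lose their double-underscore name mangling (`_df_check_primary_key_values`, `_df_row_to_pxt_row`), presumably so the new `table_data_conduit.py` can reuse them. A sketch of the new call shapes, with illustrative data and file names:

```python
import pandas as pd
from pixeltable.io import import_csv, import_pandas

df = pd.DataFrame({'id': [1, 2, 3], 'name': ['a', 'b', 'c']})

# Now equivalent to pxt.create_table('people', source=df, primary_key='id', ...).
t1 = import_pandas('people', df, primary_key='id')

# The path/buffer itself becomes the source; reader kwargs such as sep reach
# the underlying CSV reader via extra_args rather than pd.read_csv here.
t2 = import_csv('people_csv', 'people.csv', sep=';')  # illustrative file name
```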
pixeltable/io/parquet.py CHANGED
@@ -4,7 +4,6 @@ import datetime
  import io
  import json
  import logging
- import random
  import typing
  from collections import deque
  from pathlib import Path
@@ -14,12 +13,10 @@ import numpy as np
  import PIL.Image
 
  import pixeltable as pxt
- import pixeltable.exceptions as exc
+ import pixeltable.exceptions as excs
  from pixeltable.env import Env
  from pixeltable.utils.transactional_directory import transactional_directory
 
- from .utils import normalize_import_parameters, normalize_schema_names
-
  if typing.TYPE_CHECKING:
      import pyarrow as pa
 
@@ -78,7 +75,7 @@ def export_parquet(
      arrow_schema = to_arrow_schema(df.schema)
 
      if not inline_images and any(col_type.is_image_type() for col_type in df.schema.values()):
-         raise exc.Error('Cannot export Dataframe with image columns when inline_images is False')
+         raise excs.Error('Cannot export Dataframe with image columns when inline_images is False')
 
      # store the changes atomically
      with transactional_directory(parquet_path) as temp_path:
@@ -87,7 +84,7 @@ def export_parquet(
          json.dump(type_dict, (temp_path / '.pixeltable.column_types.json').open('w'))  # keep type metadata
 
          batch_num = 0
-         current_value_batch: dict[str, deque] = {k: deque() for k in df.schema.keys()}
+         current_value_batch: dict[str, deque] = {k: deque() for k in df.schema}
          current_byte_estimate = 0
 
          with Env.get().begin_xact():
@@ -111,7 +108,7 @@ def export_parquet(
                          val.save(buf, format='PNG')
                          val = buf.getvalue()
                      else:
-                         assert False, f'unknown image type {type(val)}'
+                         raise excs.Error(f'unknown image type {type(val)}')
                      length = len(val)
                  elif col_type.is_string_type():
                      length = len(val)
@@ -119,16 +116,14 @@ def export_parquet(
                      if data_row.file_paths is not None and data_row.file_paths[e.slot_idx] is not None:
                          val = data_row.file_paths[e.slot_idx]
                      else:
-                         assert False, f'unknown video type {type(val)}'
+                         raise excs.Error(f'unknown video type {type(val)}')
                      length = len(val)
                  elif col_type.is_json_type():
                      val = json.dumps(val)
                      length = len(val)
                  elif col_type.is_array_type():
                      length = val.nbytes
-                 elif col_type.is_int_type():
-                     length = 8
-                 elif col_type.is_float_type():
+                 elif col_type.is_int_type() or col_type.is_float_type():
                      length = 8
                  elif col_type.is_bool_type():
                      length = 1
@@ -136,7 +131,7 @@ def export_parquet(
                      val = val.astimezone(datetime.timezone.utc)
                      length = 8
                  else:
-                     assert False, f'unknown type {col_type} for {col_name}'
+                     raise excs.Error(f'unknown type {col_type} for {col_name}')
 
                  current_value_batch[col_name].append(val)
                  current_byte_estimate += length
@@ -144,7 +139,7 @@ def export_parquet(
              assert batch_num < 100_000, 'wrote too many parquet files, unclear ordering'
              _write_batch(current_value_batch, arrow_schema, temp_path / f'part-{batch_num:05d}.parquet')
              batch_num += 1
-             current_value_batch = {k: deque() for k in df.schema.keys()}
+             current_value_batch = {k: deque() for k in df.schema}
              current_byte_estimate = 0
 
      _write_batch(current_value_batch, arrow_schema, temp_path / f'part-{batch_num:05d}.parquet')
@@ -173,32 +168,12 @@ def import_parquet(
      Returns:
          A handle to the newly created table.
      """
-     from pyarrow import parquet
-
-     from pixeltable.utils.arrow import ar_infer_schema, iter_tuples2
-
-     input_path = Path(parquet_path).expanduser()
-     parquet_dataset = parquet.ParquetDataset(str(input_path))
-
-     schema_overrides, primary_key = normalize_import_parameters(schema_overrides, primary_key)
-     ar_schema = ar_infer_schema(parquet_dataset.schema, schema_overrides, primary_key)
-     schema, pxt_pk, col_mapping = normalize_schema_names(ar_schema, primary_key, schema_overrides, False)
-
-     if table in pxt.list_tables():
-         raise exc.Error(f'Table {table} already exists')
-
-     tmp_name = f'{table}_tmp_{random.randint(0, 100000000)}'
-     total_rows = 0
-     try:
-         tab = pxt.create_table(tmp_name, schema, primary_key=pxt_pk, **kwargs)
-         for fragment in parquet_dataset.fragments:  # type: ignore[attr-defined]
-             for batch in fragment.to_batches():
-                 dict_batch = list(iter_tuples2(batch, col_mapping, schema))
-                 total_rows += len(dict_batch)
-                 tab.insert(dict_batch)
-     except Exception as e:
-         _logger.error(f'Error after inserting {total_rows} rows from Parquet file into table: {e}')
-         raise e
-
-     pxt.move(tmp_name, table)
-     return pxt.get_table(table)
+     value = kwargs.pop('source_format', None)
+     return pxt.create_table(
+         table,
+         source=parquet_path,
+         source_format=value,
+         schema_overrides=schema_overrides,
+         primary_key=primary_key,
+         extra_args=kwargs,
+     )
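
`import_parquet()` gets the same treatment: the ParquetDataset iteration, random temp-table naming, and final `pxt.move()` are gone, replaced by a single `pxt.create_table()` call. Note the small wrinkle that a `source_format` entry, if present in `**kwargs`, is popped and passed as a named argument (its accepted values aren't shown in this diff). A sketch with an illustrative path:

```python
from pixeltable.io import import_parquet

# All schema inference and batched insertion now happens inside
# pxt.create_table(source=parquet_path, ...).
t = import_parquet('sensor_data', parquet_path='data/readings.parquet')

# Equivalent direct call after this change (path illustrative):
# t = pxt.create_table('sensor_data', source='data/readings.parquet')
```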