PyPI - pixeltable - Versions diffs - 0.3.4__py3-none-any.whl → 0.3.6__py3-none-any.whl - Mend

pixeltable 0.3.4__py3-none-any.whl → 0.3.6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of pixeltable might be problematic. Click here for more details.

Files changed (63)

pixeltable/__init__.py +1 -0
pixeltable/__version__.py +2 -2
pixeltable/catalog/catalog.py +9 -2
pixeltable/catalog/column.py +1 -1
pixeltable/catalog/dir.py +1 -1
pixeltable/catalog/table.py +3 -1
pixeltable/catalog/table_version.py +12 -2
pixeltable/catalog/table_version_path.py +2 -2
pixeltable/catalog/view.py +64 -20
pixeltable/dataframe.py +11 -6
pixeltable/env.py +12 -0
pixeltable/exec/expr_eval/evaluators.py +4 -2
pixeltable/exec/expr_eval/expr_eval_node.py +4 -1
pixeltable/exprs/comparison.py +8 -4
pixeltable/exprs/data_row.py +9 -7
pixeltable/exprs/expr.py +2 -2
pixeltable/exprs/function_call.py +155 -313
pixeltable/exprs/json_mapper.py +25 -8
pixeltable/exprs/json_path.py +6 -5
pixeltable/exprs/object_ref.py +16 -5
pixeltable/exprs/row_builder.py +10 -3
pixeltable/func/aggregate_function.py +29 -15
pixeltable/func/callable_function.py +11 -8
pixeltable/func/expr_template_function.py +3 -9
pixeltable/func/function.py +148 -74
pixeltable/func/signature.py +65 -30
pixeltable/func/tools.py +26 -26
pixeltable/func/udf.py +1 -1
pixeltable/functions/__init__.py +1 -0
pixeltable/functions/anthropic.py +9 -3
pixeltable/functions/deepseek.py +121 -0
pixeltable/functions/image.py +7 -7
pixeltable/functions/openai.py +30 -13
pixeltable/functions/video.py +14 -7
pixeltable/globals.py +14 -3
pixeltable/index/embedding_index.py +4 -13
pixeltable/io/globals.py +88 -77
pixeltable/io/hf_datasets.py +34 -34
pixeltable/io/pandas.py +75 -76
pixeltable/io/parquet.py +19 -27
pixeltable/io/utils.py +115 -0
pixeltable/iterators/audio.py +2 -1
pixeltable/iterators/video.py +1 -1
pixeltable/metadata/__init__.py +2 -1
pixeltable/metadata/converters/convert_15.py +18 -8
pixeltable/metadata/converters/convert_27.py +31 -0
pixeltable/metadata/converters/convert_28.py +15 -0
pixeltable/metadata/converters/convert_29.py +111 -0
pixeltable/metadata/converters/util.py +12 -1
pixeltable/metadata/notes.py +3 -0
pixeltable/metadata/schema.py +8 -0
pixeltable/share/__init__.py +1 -0
pixeltable/share/packager.py +41 -13
pixeltable/share/publish.py +97 -0
pixeltable/type_system.py +40 -14
pixeltable/utils/__init__.py +41 -0
pixeltable/utils/arrow.py +40 -7
pixeltable/utils/formatter.py +1 -1
{pixeltable-0.3.4.dist-info → pixeltable-0.3.6.dist-info}/METADATA +34 -49
{pixeltable-0.3.4.dist-info → pixeltable-0.3.6.dist-info}/RECORD +63 -57
{pixeltable-0.3.4.dist-info → pixeltable-0.3.6.dist-info}/WHEEL +1 -1
{pixeltable-0.3.4.dist-info → pixeltable-0.3.6.dist-info}/LICENSE +0 -0
{pixeltable-0.3.4.dist-info → pixeltable-0.3.6.dist-info}/entry_points.txt +0 -0

pixeltable/index/embedding_index.py CHANGED Viewed

@@ -99,10 +99,10 @@ class EmbeddingIndex(IndexBase):
         # Now validate the return types of the embedding functions.
         if self.string_embed is not None:
-            self._validate_embedding_fn(self.string_embed, ts.ColumnType.Type.STRING)
+            self._validate_embedding_fn(self.string_embed)
         if self.image_embed is not None:
-            self._validate_embedding_fn(self.image_embed, ts.ColumnType.Type.IMAGE)
+            self._validate_embedding_fn(self.image_embed)
         if c.col_type.is_string_type() and self.string_embed is None:
             raise excs.Error(f"Text embedding function is required for column {c.name} (parameter 'string_embed')")
@@ -206,21 +206,12 @@ class EmbeddingIndex(IndexBase):
         return None
     @classmethod
-    def _validate_embedding_fn(cls, embed_fn: func.Function, expected_type: ts.ColumnType.Type) -> None:
+    def _validate_embedding_fn(cls, embed_fn: func.Function) -> None:
         """Validate the given embedding function."""
         assert not embed_fn.is_polymorphic
-        sig = embed_fn.signature
-        # validate return type
-        param_name = sig.parameters_by_pos[0].name
-        if expected_type == ts.ColumnType.Type.STRING:
-            return_type = embed_fn.call_return_type([], {param_name: 'dummy'})
-        else:
-            assert expected_type == ts.ColumnType.Type.IMAGE
-            img = PIL.Image.new('RGB', (512, 512))
-            return_type = embed_fn.call_return_type([], {param_name: img})
+        return_type = embed_fn.signature.return_type
-        assert return_type is not None
         if not isinstance(return_type, ts.ArrayType):
             raise excs.Error(
                 f'The function `{embed_fn.name}` is not a valid embedding: '

pixeltable/io/globals.py CHANGED Viewed

@@ -1,3 +1,7 @@
+import json
+import urllib.parse
+import urllib.request
+from pathlib import Path
 from typing import TYPE_CHECKING, Any, Literal, Optional, Union
 import pixeltable as pxt
@@ -5,11 +9,61 @@ import pixeltable.exceptions as excs
 from pixeltable import Table, exprs
 from pixeltable.env import Env
 from pixeltable.io.external_store import SyncStatus
+from pixeltable.utils import parse_local_file_path
 if TYPE_CHECKING:
     import fiftyone as fo  # type: ignore[import-untyped]
+from .utils import find_or_create_table, normalize_import_parameters, normalize_schema_names
+def _infer_schema_from_rows(
+    rows: list[dict[str, Any]], schema_overrides: dict[str, Any], primary_key: list[str]
+) -> dict[str, pxt.ColumnType]:
+    schema: dict[str, pxt.ColumnType] = {}
+    cols_with_nones: set[str] = set()
+    for n, row in enumerate(rows):
+        for col_name, value in row.items():
+            if col_name in schema_overrides:
+                # We do the insertion here; this will ensure that the column order matches the order
+                # in which the column names are encountered in the input data, even if `schema_overrides`
+                # is specified.
+                if col_name not in schema:
+                    schema[col_name] = schema_overrides[col_name]
+            elif value is not None:
+                # If `key` is not in `schema_overrides`, then we infer its type from the data.
+                # The column type will always be nullable by default.
+                col_type = pxt.ColumnType.infer_literal_type(value, nullable=col_name not in primary_key)
+                if col_type is None:
+                    raise excs.Error(
+                        f'Could not infer type for column `{col_name}`; the value in row {n} has an unsupported type: {type(value)}'
+                    )
+                if col_name not in schema:
+                    schema[col_name] = col_type
+                else:
+                    supertype = schema[col_name].supertype(col_type)
+                    if supertype is None:
+                        raise excs.Error(
+                            f'Could not infer type of column `{col_name}`; the value in row {n} does not match preceding type {schema[col_name]}: {value!r}\n'
+                            'Consider specifying the type explicitly in `schema_overrides`.'
+                        )
+                    schema[col_name] = supertype
+            else:
+                cols_with_nones.add(col_name)
+    entirely_none_cols = cols_with_nones - schema.keys()
+    if len(entirely_none_cols) > 0:
+        # A column can only end up in `entirely_none_cols` if it was not in `schema_overrides` and
+        # was not encountered in any row with a non-None value.
+        raise excs.Error(
+            f'The following columns have no non-null values: {", ".join(entirely_none_cols)}\n'
+            'Consider specifying the type(s) explicitly in `schema_overrides`.'
+        )
+    return schema
 def create_label_studio_project(
     t: Table,
     label_config: str,
@@ -140,7 +194,7 @@ def import_rows(
     tbl_path: str,
     rows: list[dict[str, Any]],
     *,
-    schema_overrides: Optional[dict[str, pxt.ColumnType]] = None,
+    schema_overrides: Optional[dict[str, Any]] = None,
     primary_key: Optional[Union[str, list[str]]] = None,
     num_retained_versions: int = 10,
     comment: str = '',
@@ -169,67 +223,22 @@ def import_rows(
     Returns:
         A handle to the newly created [`Table`][pixeltable.Table].
     """
-    if schema_overrides is None:
-        schema_overrides = {}
-    schema: dict[str, pxt.ColumnType] = {}
-    cols_with_nones: set[str] = set()
+    schema_overrides, primary_key = normalize_import_parameters(schema_overrides, primary_key)
+    row_schema = _infer_schema_from_rows(rows, schema_overrides, primary_key)
+    schema, pxt_pk, _ = normalize_schema_names(row_schema, primary_key, schema_overrides, True)
-    for n, row in enumerate(rows):
-        for col_name, value in row.items():
-            if col_name in schema_overrides:
-                # We do the insertion here; this will ensure that the column order matches the order
-                # in which the column names are encountered in the input data, even if `schema_overrides`
-                # is specified.
-                if col_name not in schema:
-                    schema[col_name] = schema_overrides[col_name]
-            elif value is not None:
-                # If `key` is not in `schema_overrides`, then we infer its type from the data.
-                # The column type will always be nullable by default.
-                col_type = pxt.ColumnType.infer_literal_type(value, nullable=True)
-                if col_type is None:
-                    raise excs.Error(
-                        f'Could not infer type for column `{col_name}`; the value in row {n} has an unsupported type: {type(value)}'
-                    )
-                if col_name not in schema:
-                    schema[col_name] = col_type
-                else:
-                    supertype = schema[col_name].supertype(col_type)
-                    if supertype is None:
-                        raise excs.Error(
-                            f'Could not infer type of column `{col_name}`; the value in row {n} does not match preceding type {schema[col_name]}: {value!r}\n'
-                            'Consider specifying the type explicitly in `schema_overrides`.'
-                        )
-                    schema[col_name] = supertype
-            else:
-                cols_with_nones.add(col_name)
-    extraneous_keys = schema_overrides.keys() - schema.keys()
-    if len(extraneous_keys) > 0:
-        raise excs.Error(
-            f'The following columns specified in `schema_overrides` are not present in the data: {", ".join(extraneous_keys)}'
-        )
-    entirely_none_cols = cols_with_nones - schema.keys()
-    if len(entirely_none_cols) > 0:
-        # A column can only end up in `entirely_null_cols` if it was not in `schema_overrides` and
-        # was not encountered in any row with a non-None value.
-        raise excs.Error(
-            f'The following columns have no non-null values: {", ".join(entirely_none_cols)}\n'
-            'Consider specifying the type(s) explicitly in `schema_overrides`.'
-        )
-    t = pxt.create_table(
-        tbl_path, schema, primary_key=primary_key, num_retained_versions=num_retained_versions, comment=comment
+    table = find_or_create_table(
+        tbl_path, schema, primary_key=pxt_pk, num_retained_versions=num_retained_versions, comment=comment
     )
-    t.insert(rows)
-    return t
+    table.insert(rows)
+    return table
 def import_json(
     tbl_path: str,
     filepath_or_url: str,
     *,
-    schema_overrides: Optional[dict[str, pxt.ColumnType]] = None,
+    schema_overrides: Optional[dict[str, Any]] = None,
     primary_key: Optional[Union[str, list[str]]] = None,
     num_retained_versions: int = 10,
     comment: str = '',
@@ -253,33 +262,35 @@ def import_json(
     Returns:
         A handle to the newly created [`Table`][pixeltable.Table].
     """
-    import json
-    import urllib.parse
-    import urllib.request
-    # TODO Consolidate this logic with other places where files/URLs are parsed
-    parsed = urllib.parse.urlparse(filepath_or_url)
-    if len(parsed.scheme) <= 1 or parsed.scheme == 'file':
-        # local file path
-        if len(parsed.scheme) <= 1:
-            filepath = filepath_or_url
-        else:
-            filepath = urllib.parse.unquote(urllib.request.url2pathname(parsed.path))
-        with open(filepath) as fp:
+    path = parse_local_file_path(filepath_or_url)
+    if path is None:  # it's a URL
+        # TODO: This should read from S3 as well.
+        contents = urllib.request.urlopen(filepath_or_url).read()
+    else:
+        with open(path) as fp:
             contents = fp.read()
+    rows = json.loads(contents, **kwargs)
+    schema_overrides, primary_key = normalize_import_parameters(schema_overrides, primary_key)
+    row_schema = _infer_schema_from_rows(rows, schema_overrides, primary_key)
+    schema, pxt_pk, col_mapping = normalize_schema_names(row_schema, primary_key, schema_overrides, False)
+    # Convert all rows to insertable format - not needed, misnamed columns and types are errors in the incoming row format
+    if col_mapping is not None:
+        tbl_rows = [
+            {field if col_mapping is None else col_mapping[field]: val for field, val in row.items()} for row in rows
+        ]
     else:
-        # URL
-        contents = urllib.request.urlopen(filepath_or_url).read()
-    data = json.loads(contents, **kwargs)
-    return import_rows(
-        tbl_path,
-        data,
-        schema_overrides=schema_overrides,
-        primary_key=primary_key,
-        num_retained_versions=num_retained_versions,
-        comment=comment,
+        tbl_rows = rows
+    table = find_or_create_table(
+        tbl_path, schema, primary_key=pxt_pk, num_retained_versions=num_retained_versions, comment=comment
     )
+    table.insert(tbl_rows)
+    return table
 def export_images_as_fo_dataset(
     tbl: pxt.Table,

pixeltable/io/hf_datasets.py CHANGED Viewed

@@ -10,6 +10,8 @@ import pixeltable as pxt
 import pixeltable.type_system as ts
 from pixeltable import exceptions as excs
+from .utils import normalize_import_parameters, normalize_schema_names
 if typing.TYPE_CHECKING:
     import datasets  # type: ignore[import-untyped]
@@ -28,29 +30,33 @@ _hf_to_pxt: dict[str, ts.ColumnType] = {
     'int64': ts.IntType(nullable=True),
     'bool': ts.BoolType(nullable=True),
     'float32': ts.FloatType(nullable=True),
+    'float64': ts.FloatType(nullable=True),
+    'large_string': ts.StringType(nullable=True),
     'string': ts.StringType(nullable=True),
     'timestamp[s]': ts.TimestampType(nullable=True),
     'timestamp[ms]': ts.TimestampType(nullable=True),  # HF dataset iterator converts timestamps to datetime.datetime
+    'timestamp[us]': ts.TimestampType(nullable=True),
 }
-def _to_pixeltable_type(feature_type: Any) -> Optional[ts.ColumnType]:
+def _to_pixeltable_type(feature_type: Any, nullable: bool) -> Optional[ts.ColumnType]:
     """Convert a huggingface feature type to a pixeltable ColumnType if one is defined."""
     import datasets
     if isinstance(feature_type, datasets.ClassLabel):
         # enum, example: ClassLabel(names=['neg', 'pos'], id=None)
-        return ts.StringType(nullable=True)
+        return ts.StringType(nullable=nullable)
     elif isinstance(feature_type, datasets.Value):
         # example: Value(dtype='int64', id=None)
-        return _hf_to_pxt.get(feature_type.dtype, None)
+        pt = _hf_to_pxt.get(feature_type.dtype, None)
+        return pt.copy(nullable=nullable) if pt is not None else None
     elif isinstance(feature_type, datasets.Sequence):
         # example: cohere wiki. Sequence(feature=Value(dtype='float32', id=None), length=-1, id=None)
-        dtype = _to_pixeltable_type(feature_type.feature)
+        dtype = _to_pixeltable_type(feature_type.feature, nullable)
         length = feature_type.length if feature_type.length != -1 else None
         return ts.ArrayType(shape=(length,), dtype=dtype)
     elif isinstance(feature_type, datasets.Image):
-        return ts.ImageType(nullable=True)
+        return ts.ImageType(nullable=nullable)
     else:
         return None
@@ -63,15 +69,17 @@ def _get_hf_schema(dataset: Union[datasets.Dataset, datasets.DatasetDict]) -> da
     return first_dataset.features
-def huggingface_schema_to_pixeltable_schema(
-    hf_dataset: Union[datasets.Dataset, datasets.DatasetDict],
+def huggingface_schema_to_pxt_schema(
+    hf_schema: datasets.Features, schema_overrides: dict[str, Any], primary_key: list[str]
 ) -> dict[str, Optional[ts.ColumnType]]:
     """Generate a pixeltable schema from a huggingface dataset schema.
     Columns without a known mapping are mapped to None
     """
-    hf_schema = _get_hf_schema(hf_dataset)
     pixeltable_schema = {
-        column_name: _to_pixeltable_type(feature_type) for column_name, feature_type in hf_schema.items()
+        column_name: _to_pixeltable_type(feature_type, column_name not in primary_key)
+        if column_name not in schema_overrides
+        else schema_overrides[column_name]
+        for column_name, feature_type in hf_schema.items()
     }
     return pixeltable_schema
@@ -82,6 +90,7 @@ def import_huggingface_dataset(
     *,
     column_name_for_split: Optional[str] = None,
     schema_overrides: Optional[dict[str, Any]] = None,
+    primary_key: Optional[Union[str, list[str]]] = None,
     **kwargs: Any,
 ) -> pxt.Table:
     """Create a new base table from a Huggingface dataset, or dataset dict with multiple splits.
@@ -97,6 +106,7 @@ def import_huggingface_dataset(
             name `name` will be given type `type`, instead of being inferred from the `Dataset` or `DatasetDict`. The keys in
             `schema_overrides` should be the column names of the `Dataset` or `DatasetDict` (whether or not they are valid
             Pixeltable identifiers).
+        primary_key: The primary key of the table (see [`create_table()`][pixeltable.create_table]).
         kwargs: Additional arguments to pass to `create_table`.
     Returns:
@@ -106,57 +116,47 @@ def import_huggingface_dataset(
     import pixeltable as pxt
-    if table_path in pxt.list_tables():
-        raise excs.Error(f'table {table_path} already exists')
     if not isinstance(dataset, (datasets.Dataset, datasets.DatasetDict)):
         raise excs.Error(f'`type(dataset)` must be `datasets.Dataset` or `datasets.DatasetDict`. Got {type(dataset)=}')
-    if isinstance(dataset, datasets.Dataset):
-        # when loading an hf dataset partially, dataset.split._name is sometimes the form "train[0:1000]"
-        raw_name = dataset.split._name
-        split_name = raw_name.split('[')[0] if raw_name is not None else None
-        dataset_dict = {split_name: dataset}
-    else:
-        dataset_dict = dataset
-    pixeltable_schema = huggingface_schema_to_pixeltable_schema(dataset)
-    if schema_overrides is not None:
-        pixeltable_schema.update(schema_overrides)
+    # Create the pixeltable schema from the huggingface schema
+    hf_schema_source = _get_hf_schema(dataset)
+    schema_overrides, primary_key = normalize_import_parameters(schema_overrides, primary_key)
+    hf_schema = huggingface_schema_to_pxt_schema(hf_schema_source, schema_overrides, primary_key)
+    # Add the split column to the schema if requested
     if column_name_for_split is not None:
-        if column_name_for_split in pixeltable_schema:
+        if column_name_for_split in hf_schema:
             raise excs.Error(
                 f'Column name `{column_name_for_split}` already exists in dataset schema; provide a different `column_name_for_split`'
             )
-        pixeltable_schema[column_name_for_split] = ts.StringType(nullable=True)
+        hf_schema[column_name_for_split] = ts.StringType(nullable=True)
-    for field, column_type in pixeltable_schema.items():
-        if column_type is None:
-            raise excs.Error(f'Could not infer pixeltable type for feature `{field}` in huggingface dataset')
+    schema, pxt_pk, _ = normalize_schema_names(hf_schema, primary_key, schema_overrides, True)
+    # Prepare to create table and insert data
+    if table_path in pxt.list_tables():
+        raise excs.Error(f'table {table_path} already exists')
     if isinstance(dataset, datasets.Dataset):
         # when loading an hf dataset partially, dataset.split._name is sometimes the form "train[0:1000]"
         raw_name = dataset.split._name
         split_name = raw_name.split('[')[0] if raw_name is not None else None
         dataset_dict = {split_name: dataset}
-    elif isinstance(dataset, datasets.DatasetDict):
-        dataset_dict = dataset
     else:
-        raise excs.Error(f'`type(dataset)` must be `datasets.Dataset` or `datasets.DatasetDict`. Got {type(dataset)=}')
+        dataset_dict = dataset
     # extract all class labels from the dataset to translate category ints to strings
-    hf_schema = _get_hf_schema(dataset)
     categorical_features = {
         feature_name: feature_type.names
-        for (feature_name, feature_type) in hf_schema.items()
+        for (feature_name, feature_type) in hf_schema_source.items()
         if isinstance(feature_type, datasets.ClassLabel)
     }
     try:
         # random tmp name
         tmp_name = f'{table_path}_tmp_{random.randint(0, 100000000)}'
-        tab = pxt.create_table(tmp_name, pixeltable_schema, **kwargs)
+        tab = pxt.create_table(tmp_name, schema, primary_key=pxt_pk, **kwargs)
         def _translate_row(row: dict[str, Any], split_name: str) -> dict[str, Any]:
             output_row = row.copy()

pixeltable/io/pandas.py CHANGED Viewed

@@ -2,17 +2,21 @@ from typing import Any, Optional, Union
 import numpy as np
 import pandas as pd
+from pandas._typing import DtypeObj  # For pandas dtype type hints
+from pandas.api.types import is_datetime64_any_dtype, is_extension_array_dtype
 import pixeltable as pxt
 import pixeltable.exceptions as excs
-import pixeltable.type_system as ts
+from pixeltable import Table
+from .utils import find_or_create_table, normalize_import_parameters, normalize_schema_names
 def import_pandas(
     tbl_name: str,
     df: pd.DataFrame,
     *,
-    schema_overrides: Optional[dict[str, pxt.ColumnType]] = None,
+    schema_overrides: Optional[dict[str, Any]] = None,
     primary_key: Optional[Union[str, list[str]]] = None,
     num_retained_versions: int = 10,
     comment: str = '',
@@ -39,16 +43,16 @@ def import_pandas(
     Returns:
         A handle to the newly created [`Table`][pixeltable.Table].
     """
-    if schema_overrides is None:
-        schema_overrides = {}
-    if primary_key is None:
-        primary_key = []
-    elif isinstance(primary_key, str):
-        primary_key = [primary_key]
-    schema, pxt_pk = __df_to_pxt_schema(df, schema_overrides, primary_key)
-    tbl_rows = (dict(__df_row_to_pxt_row(row, schema)) for row in df.itertuples())
-    table = pxt.create_table(
+    schema_overrides, primary_key = normalize_import_parameters(schema_overrides, primary_key)
+    pd_schema = df_infer_schema(df, schema_overrides, primary_key)
+    schema, pxt_pk, col_mapping = normalize_schema_names(pd_schema, primary_key, schema_overrides, False)
+    __check_primary_key_values(df, primary_key)
+    # Convert all rows to insertable format
+    tbl_rows = [__df_row_to_pxt_row(row, pd_schema, col_mapping) for row in df.itertuples()]
+    table = find_or_create_table(
         tbl_name, schema, primary_key=pxt_pk, num_retained_versions=num_retained_versions, comment=comment
     )
     table.insert(tbl_rows)
@@ -58,7 +62,7 @@ def import_pandas(
 def import_csv(
     tbl_name: str,
     filepath_or_buffer,
-    schema_overrides: Optional[dict[str, ts.ColumnType]] = None,
+    schema_overrides: Optional[dict[str, Any]] = None,
     primary_key: Optional[Union[str, list[str]]] = None,
     num_retained_versions: int = 10,
     comment: str = '',
@@ -88,7 +92,7 @@ def import_excel(
     tbl_name: str,
     io,
     *args,
-    schema_overrides: Optional[dict[str, ts.ColumnType]] = None,
+    schema_overrides: Optional[dict[str, Any]] = None,
     primary_key: Optional[Union[str, list[str]]] = None,
     num_retained_versions: int = 10,
     comment: str = '',
@@ -114,82 +118,73 @@ def import_excel(
     )
-def __df_to_pxt_schema(
+def __check_primary_key_values(df: pd.DataFrame, primary_key: list[str]) -> None:
+    for pd_name in primary_key:
+        # This can be faster for large DataFrames
+        has_nulls = df[pd_name].count() < len(df)
+        if has_nulls:
+            raise excs.Error(f'Primary key column `{pd_name}` cannot contain null values.')
+def df_infer_schema(
     df: pd.DataFrame, schema_overrides: dict[str, pxt.ColumnType], primary_key: list[str]
-) -> tuple[dict[str, pxt.ColumnType], list[str]]:
+) -> dict[str, pxt.ColumnType]:
     """
     Infers a Pixeltable schema from a Pandas DataFrame.
     Returns:
         A tuple containing a Pixeltable schema and a list of primary key column names.
     """
-    for pd_name in schema_overrides:
-        if pd_name not in df.columns:
-            raise excs.Error(
-                f'Column `{pd_name}` specified in `schema_overrides` does not exist in the given `DataFrame`.'
-            )
-    for pd_name in primary_key:
-        if pd_name not in df.columns:
-            raise excs.Error(f'Primary key column `{pd_name}` does not exist in the given `DataFrame`.')
-    schema: dict[str, pxt.ColumnType] = {}
-    col_mapping: dict[str, str] = {}  # Maps Pandas column names to Pixeltable column names
+    pd_schema: dict[str, pxt.ColumnType] = {}
     for pd_name, pd_dtype in zip(df.columns, df.dtypes):
         if pd_name in schema_overrides:
             pxt_type = schema_overrides[pd_name]
         else:
-            # This complicated-looking condition is necessary because we cannot safely call `pd.isna()` on
-            # general objects, so we need to check for nulls in the specific cases where we might expect them.
-            # isinstance(val, float) will check for NaN values in float columns *as well as* floats appearing
-            # in object columns (where Pandas uses NaN as a general null).
-            # np.issubdtype(pd_dtype, np.datetime64) checks for NaT values specifically in datetime columns.
-            has_na = any(
-                (isinstance(val, float) or np.issubdtype(pd_dtype, np.datetime64)) and pd.isna(val)
-                for val in df[pd_name]
-            )
-            if has_na and pd_name in primary_key:
-                raise excs.Error(f'Primary key column `{pd_name}` cannot contain null values.')
-            pxt_type = __np_dtype_to_pxt_type(pd_dtype, df[pd_name], pd_name not in primary_key)
-        pxt_name = __normalize_pxt_col_name(pd_name)
-        # Ensure that column names are unique by appending a distinguishing suffix
-        # to any collisions
-        if pxt_name in schema:
-            n = 2
-            while f'{pxt_name}_{n}' in schema:
-                n += 1
-            pxt_name = f'{pxt_name}_{n}'
-        schema[pxt_name] = pxt_type
-        col_mapping[pd_name] = pxt_name
-    pxt_pk = [col_mapping[pk] for pk in primary_key]
-    return schema, pxt_pk
-def __normalize_pxt_col_name(pd_name: str) -> str:
-    """
-    Normalizes an arbitrary DataFrame column name into a valid Pixeltable identifier by:
-    - replacing any non-ascii or non-alphanumeric characters with an underscore _
-    - prefixing the result with the letter 'c' if it starts with an underscore or a number
+            pxt_type = __pd_coltype_to_pxt_type(pd_dtype, df[pd_name], pd_name not in primary_key)
+        pd_schema[pd_name] = pxt_type
+    return pd_schema
+"""
+# Check if a datetime64[ns, UTC] dtype
+def is_datetime_tz_utc(x: Any) -> bool:
+    if isinstance(x, pd.Timestamp) and x.tzinfo is not None and str(x.tzinfo) == 'UTC':
+        return True
+    return pd.api.types.is_datetime64tz_dtype(x) and str(x).endswith('UTC]')
+"""
+def __pd_dtype_to_pxt_type(pd_dtype: DtypeObj, nullable: bool) -> Optional[pxt.ColumnType]:
     """
-    id = ''.join(ch if ch.isascii() and ch.isalnum() else '_' for ch in pd_name)
-    if id[0].isnumeric():
-        id = f'c_{id}'
-    elif id[0] == '_':
-        id = f'c{id}'
-    assert pxt.catalog.is_valid_identifier(id), id
-    return id
+    Determines a pixeltable ColumnType from a pandas dtype
+    Args:
+        pd_dtype: A pandas dtype object
-def __np_dtype_to_pxt_type(np_dtype: np.dtype, data_col: pd.Series, nullable: bool) -> pxt.ColumnType:
+    Returns:
+        pxt.ColumnType: A pixeltable ColumnType
+    """
+    # Pandas extension arrays / types (Int64, boolean, string[pyarrow], etc.) are not directly compatible with NumPy dtypes
+    # The timezone-aware datetime64[ns, tz=] dtype is a pandas extension dtype
+    if is_datetime64_any_dtype(pd_dtype):
+        return pxt.TimestampType(nullable=nullable)
+    if is_extension_array_dtype(pd_dtype):
+        return None
+    # Most other pandas dtypes are directly NumPy compatible
+    assert isinstance(pd_dtype, np.dtype)
+    return pxt.ArrayType.from_np_dtype(pd_dtype, nullable)
+def __pd_coltype_to_pxt_type(pd_dtype: DtypeObj, data_col: pd.Series, nullable: bool) -> pxt.ColumnType:
     """
-    Infers a Pixeltable type based on a Numpy dtype.
+    Infers a Pixeltable type based on a pandas dtype.
     """
-    pxttype = ts.ArrayType.from_np_dtype(np_dtype, nullable)
+    pxttype = __pd_dtype_to_pxt_type(pd_dtype, nullable)
     if pxttype is not None:
         return pxttype
-    if np_dtype == np.object_:
+    if pd_dtype == np.object_:
         # The `object_` dtype can mean all sorts of things; see if we can infer the Pixeltable type
         # based on the actual data in `data_col`.
         # First drop any null values (they don't contribute to type inference).
@@ -206,11 +201,14 @@ def __np_dtype_to_pxt_type(np_dtype: np.dtype, data_col: pd.Series, nullable: bo
         else:
             return inferred_type.copy(nullable=nullable)
-    raise excs.Error(f'Could not infer Pixeltable type of column: {data_col.name} (dtype: {np_dtype})')
+    raise excs.Error(f'Could not infer Pixeltable type of column: {data_col.name} (dtype: {pd_dtype})')
-def __df_row_to_pxt_row(row: tuple[Any, ...], schema: dict[str, pxt.ColumnType]) -> dict[str, Any]:
-    rows = {}
+def __df_row_to_pxt_row(
+    row: tuple[Any, ...], schema: dict[str, pxt.ColumnType], col_mapping: Optional[dict[str, str]]
+) -> dict[str, Any]:
+    """Convert a row to insertable format"""
+    pxt_row: dict[str, Any] = {}
     for val, (col_name, pxt_type) in zip(row[1:], schema.items()):
         if pxt_type.is_float_type():
             val = float(val)
@@ -232,5 +230,6 @@ def __df_row_to_pxt_row(row: tuple[Any, ...], schema: dict[str, pxt.ColumnType])
                 val = None
             else:
                 val = pd.Timestamp(val).to_pydatetime()
-        rows[col_name] = val
-    return rows
+        pxt_name = col_name if col_mapping is None else col_mapping[col_name]
+        pxt_row[pxt_name] = val
+    return pxt_row

pixeltable 0.3.4__py3-none-any.whl → 0.3.6__py3-none-any.whl

Potentially problematic release.

pixeltable 0.3.4__py3-none-any.whl → 0.3.6__py3-none-any.whl