PyPI - pixeltable - Versions diffs - 0.3.8__py3-none-any.whl → 0.3.10__py3-none-any.whl - Mend

pixeltable 0.3.8__py3-none-any.whl → 0.3.10__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of pixeltable might be problematic. Click here for more details.

Files changed (52)

pixeltable/__init__.py +1 -2
pixeltable/__version__.py +2 -2
pixeltable/catalog/catalog.py +509 -103
pixeltable/catalog/column.py +5 -0
pixeltable/catalog/dir.py +15 -6
pixeltable/catalog/globals.py +16 -0
pixeltable/catalog/insertable_table.py +82 -41
pixeltable/catalog/path.py +15 -0
pixeltable/catalog/schema_object.py +7 -12
pixeltable/catalog/table.py +81 -67
pixeltable/catalog/table_version.py +23 -7
pixeltable/catalog/view.py +9 -6
pixeltable/env.py +15 -9
pixeltable/exec/exec_node.py +1 -1
pixeltable/exprs/__init__.py +2 -1
pixeltable/exprs/arithmetic_expr.py +2 -0
pixeltable/exprs/column_ref.py +38 -2
pixeltable/exprs/expr.py +61 -12
pixeltable/exprs/function_call.py +1 -4
pixeltable/exprs/globals.py +12 -0
pixeltable/exprs/json_mapper.py +4 -4
pixeltable/exprs/json_path.py +10 -11
pixeltable/exprs/similarity_expr.py +5 -20
pixeltable/exprs/string_op.py +107 -0
pixeltable/ext/functions/yolox.py +21 -64
pixeltable/func/callable_function.py +5 -2
pixeltable/func/query_template_function.py +6 -18
pixeltable/func/tools.py +2 -2
pixeltable/functions/__init__.py +1 -1
pixeltable/functions/globals.py +16 -5
pixeltable/globals.py +172 -262
pixeltable/io/__init__.py +3 -2
pixeltable/io/datarows.py +138 -0
pixeltable/io/external_store.py +8 -5
pixeltable/io/globals.py +7 -160
pixeltable/io/hf_datasets.py +21 -98
pixeltable/io/pandas.py +29 -43
pixeltable/io/parquet.py +17 -42
pixeltable/io/table_data_conduit.py +569 -0
pixeltable/io/utils.py +6 -21
pixeltable/metadata/__init__.py +1 -1
pixeltable/metadata/converters/convert_30.py +50 -0
pixeltable/metadata/converters/util.py +26 -1
pixeltable/metadata/notes.py +1 -0
pixeltable/metadata/schema.py +3 -0
pixeltable/utils/arrow.py +32 -7
pixeltable/utils/coroutine.py +41 -0
{pixeltable-0.3.8.dist-info → pixeltable-0.3.10.dist-info}/METADATA +1 -1
{pixeltable-0.3.8.dist-info → pixeltable-0.3.10.dist-info}/RECORD +52 -47
{pixeltable-0.3.8.dist-info → pixeltable-0.3.10.dist-info}/WHEEL +1 -1
{pixeltable-0.3.8.dist-info → pixeltable-0.3.10.dist-info}/LICENSE +0 -0
{pixeltable-0.3.8.dist-info → pixeltable-0.3.10.dist-info}/entry_points.txt +0 -0

pixeltable/io/pandas.py CHANGED Viewed

@@ -7,9 +7,6 @@ from pandas.api.types import is_datetime64_any_dtype, is_extension_array_dtype
 import pixeltable as pxt
 import pixeltable.exceptions as excs
-from pixeltable import Table
-from .utils import find_or_create_table, normalize_import_parameters, normalize_schema_names
 def import_pandas(
@@ -43,20 +40,14 @@ def import_pandas(
     Returns:
         A handle to the newly created [`Table`][pixeltable.Table].
     """
-    schema_overrides, primary_key = normalize_import_parameters(schema_overrides, primary_key)
-    pd_schema = df_infer_schema(df, schema_overrides, primary_key)
-    schema, pxt_pk, col_mapping = normalize_schema_names(pd_schema, primary_key, schema_overrides, False)
-    __check_primary_key_values(df, primary_key)
-    # Convert all rows to insertable format
-    tbl_rows = [__df_row_to_pxt_row(row, pd_schema, col_mapping) for row in df.itertuples()]
-    table = find_or_create_table(
-        tbl_name, schema, primary_key=pxt_pk, num_retained_versions=num_retained_versions, comment=comment
+    return pxt.create_table(
+        tbl_name,
+        source=df,
+        schema_overrides=schema_overrides,
+        primary_key=primary_key,
+        num_retained_versions=num_retained_versions,
+        comment=comment,
     )
-    table.insert(tbl_rows)
-    return table
 def import_csv(
@@ -77,14 +68,14 @@ def import_csv(
     Returns:
         A handle to the newly created [`Table`][pixeltable.Table].
     """
-    df = pd.read_csv(filepath_or_buffer, **kwargs)
-    return import_pandas(
+    return pxt.create_table(
         tbl_name,
-        df,
+        source=filepath_or_buffer,
         schema_overrides=schema_overrides,
         primary_key=primary_key,
         num_retained_versions=num_retained_versions,
         comment=comment,
+        extra_args=kwargs,
     )
@@ -107,18 +98,18 @@ def import_excel(
     Returns:
         A handle to the newly created [`Table`][pixeltable.Table].
     """
-    df = pd.read_excel(io, *args, **kwargs)
-    return import_pandas(
+    return pxt.create_table(
         tbl_name,
-        df,
+        source=io,
         schema_overrides=schema_overrides,
         primary_key=primary_key,
         num_retained_versions=num_retained_versions,
         comment=comment,
+        extra_args=kwargs,
     )
-def __check_primary_key_values(df: pd.DataFrame, primary_key: list[str]) -> None:
+def _df_check_primary_key_values(df: pd.DataFrame, primary_key: list[str]) -> None:
     for pd_name in primary_key:
         # This can be faster for large DataFrames
         has_nulls = df[pd_name].count() < len(df)
@@ -146,15 +137,6 @@ def df_infer_schema(
     return pd_schema
-"""
-# Check if a datetime64[ns, UTC] dtype
-def is_datetime_tz_utc(x: Any) -> bool:
-    if isinstance(x, pd.Timestamp) and x.tzinfo is not None and str(x.tzinfo) == 'UTC':
-        return True
-    return pd.api.types.is_datetime64tz_dtype(x) and str(x).endswith('UTC]')
-"""
 def __pd_dtype_to_pxt_type(pd_dtype: DtypeObj, nullable: bool) -> Optional[pxt.ColumnType]:
     """
     Determines a pixeltable ColumnType from a pandas dtype
@@ -165,7 +147,8 @@ def __pd_dtype_to_pxt_type(pd_dtype: DtypeObj, nullable: bool) -> Optional[pxt.C
     Returns:
         pxt.ColumnType: A pixeltable ColumnType
     """
-    # Pandas extension arrays / types (Int64, boolean, string[pyarrow], etc.) are not directly compatible with NumPy dtypes
+    # Pandas extension arrays / types (Int64, boolean, string[pyarrow], etc.) are not directly
+    # compatible with NumPy dtypes
     # The timezone-aware datetime64[ns, tz=] dtype is a pandas extension dtype
     if is_datetime64_any_dtype(pd_dtype):
         return pxt.TimestampType(nullable=nullable)
@@ -204,32 +187,35 @@ def __pd_coltype_to_pxt_type(pd_dtype: DtypeObj, data_col: pd.Series, nullable:
     raise excs.Error(f'Could not infer Pixeltable type of column: {data_col.name} (dtype: {pd_dtype})')
-def __df_row_to_pxt_row(
+def _df_row_to_pxt_row(
     row: tuple[Any, ...], schema: dict[str, pxt.ColumnType], col_mapping: Optional[dict[str, str]]
 ) -> dict[str, Any]:
     """Convert a row to insertable format"""
     pxt_row: dict[str, Any] = {}
     for val, (col_name, pxt_type) in zip(row[1:], schema.items()):
+        pxt_name = col_mapping.get(col_name, col_name)
+        nval: Any
         if pxt_type.is_float_type():
-            val = float(val)
+            nval = float(val)
         elif isinstance(val, float) and np.isnan(val):
             # pandas uses NaN for empty cells, even for types other than float;
             # for any type but a float, convert these to None
-            val = None
+            nval = None
         elif pxt_type.is_int_type():
-            val = int(val)
+            nval = int(val)
         elif pxt_type.is_bool_type():
-            val = bool(val)
+            nval = bool(val)
         elif pxt_type.is_string_type():
-            val = str(val)
+            nval = str(val)
         elif pxt_type.is_timestamp_type():
             if pd.isnull(val):
                 # pandas has the bespoke 'NaT' type for a missing timestamp; postgres is very
                 # much not-ok with it. (But if we convert it to None and then load out the
                 # table contents as a pandas DataFrame, it will correctly restore the 'NaT'!)
-                val = None
+                nval = None
             else:
-                val = pd.Timestamp(val).to_pydatetime()
-        pxt_name = col_name if col_mapping is None else col_mapping[col_name]
-        pxt_row[pxt_name] = val
+                nval = pd.Timestamp(val).to_pydatetime()
+        else:
+            nval = val
+        pxt_row[pxt_name] = nval
     return pxt_row

pixeltable/io/parquet.py CHANGED Viewed

@@ -4,7 +4,6 @@ import datetime
 import io
 import json
 import logging
-import random
 import typing
 from collections import deque
 from pathlib import Path
@@ -14,12 +13,10 @@ import numpy as np
 import PIL.Image
 import pixeltable as pxt
-import pixeltable.exceptions as exc
+import pixeltable.exceptions as excs
 from pixeltable.env import Env
 from pixeltable.utils.transactional_directory import transactional_directory
-from .utils import normalize_import_parameters, normalize_schema_names
 if typing.TYPE_CHECKING:
     import pyarrow as pa
@@ -78,7 +75,7 @@ def export_parquet(
     arrow_schema = to_arrow_schema(df.schema)
     if not inline_images and any(col_type.is_image_type() for col_type in df.schema.values()):
-        raise exc.Error('Cannot export Dataframe with image columns when inline_images is False')
+        raise excs.Error('Cannot export Dataframe with image columns when inline_images is False')
     # store the changes atomically
     with transactional_directory(parquet_path) as temp_path:
@@ -87,7 +84,7 @@ def export_parquet(
         json.dump(type_dict, (temp_path / '.pixeltable.column_types.json').open('w'))  # keep type metadata
         batch_num = 0
-        current_value_batch: dict[str, deque] = {k: deque() for k in df.schema.keys()}
+        current_value_batch: dict[str, deque] = {k: deque() for k in df.schema}
         current_byte_estimate = 0
         with Env.get().begin_xact():
@@ -111,7 +108,7 @@ def export_parquet(
                             val.save(buf, format='PNG')
                             val = buf.getvalue()
                         else:
-                            assert False, f'unknown image type {type(val)}'
+                            raise excs.Error(f'unknown image type {type(val)}')
                         length = len(val)
                     elif col_type.is_string_type():
                         length = len(val)
@@ -119,16 +116,14 @@ def export_parquet(
                         if data_row.file_paths is not None and data_row.file_paths[e.slot_idx] is not None:
                             val = data_row.file_paths[e.slot_idx]
                         else:
-                            assert False, f'unknown video type {type(val)}'
+                            raise excs.Error(f'unknown video type {type(val)}')
                         length = len(val)
                     elif col_type.is_json_type():
                         val = json.dumps(val)
                         length = len(val)
                     elif col_type.is_array_type():
                         length = val.nbytes
-                    elif col_type.is_int_type():
-                        length = 8
-                    elif col_type.is_float_type():
+                    elif col_type.is_int_type() or col_type.is_float_type():
                         length = 8
                     elif col_type.is_bool_type():
                         length = 1
@@ -136,7 +131,7 @@ def export_parquet(
                         val = val.astimezone(datetime.timezone.utc)
                         length = 8
                     else:
-                        assert False, f'unknown type {col_type} for {col_name}'
+                        raise excs.Error(f'unknown type {col_type} for {col_name}')
                     current_value_batch[col_name].append(val)
                     current_byte_estimate += length
@@ -144,7 +139,7 @@ def export_parquet(
                     assert batch_num < 100_000, 'wrote too many parquet files, unclear ordering'
                     _write_batch(current_value_batch, arrow_schema, temp_path / f'part-{batch_num:05d}.parquet')
                     batch_num += 1
-                    current_value_batch = {k: deque() for k in df.schema.keys()}
+                    current_value_batch = {k: deque() for k in df.schema}
                     current_byte_estimate = 0
             _write_batch(current_value_batch, arrow_schema, temp_path / f'part-{batch_num:05d}.parquet')
@@ -173,32 +168,12 @@ def import_parquet(
     Returns:
         A handle to the newly created table.
     """
-    from pyarrow import parquet
-    from pixeltable.utils.arrow import ar_infer_schema, iter_tuples2
-    input_path = Path(parquet_path).expanduser()
-    parquet_dataset = parquet.ParquetDataset(str(input_path))
-    schema_overrides, primary_key = normalize_import_parameters(schema_overrides, primary_key)
-    ar_schema = ar_infer_schema(parquet_dataset.schema, schema_overrides, primary_key)
-    schema, pxt_pk, col_mapping = normalize_schema_names(ar_schema, primary_key, schema_overrides, False)
-    if table in pxt.list_tables():
-        raise exc.Error(f'Table {table} already exists')
-    tmp_name = f'{table}_tmp_{random.randint(0, 100000000)}'
-    total_rows = 0
-    try:
-        tab = pxt.create_table(tmp_name, schema, primary_key=pxt_pk, **kwargs)
-        for fragment in parquet_dataset.fragments:  # type: ignore[attr-defined]
-            for batch in fragment.to_batches():
-                dict_batch = list(iter_tuples2(batch, col_mapping, schema))
-                total_rows += len(dict_batch)
-                tab.insert(dict_batch)
-    except Exception as e:
-        _logger.error(f'Error after inserting {total_rows} rows from Parquet file into table: {e}')
-        raise e
-    pxt.move(tmp_name, table)
-    return pxt.get_table(table)
+    value = kwargs.pop('source_format', None)
+    return pxt.create_table(
+        table,
+        source=parquet_path,
+        source_format=value,
+        schema_overrides=schema_overrides,
+        primary_key=primary_key,
+        extra_args=kwargs,
+    )

pixeltable 0.3.8__py3-none-any.whl → 0.3.10__py3-none-any.whl

Potentially problematic release.

pixeltable 0.3.8__py3-none-any.whl → 0.3.10__py3-none-any.whl