PyPI - pixeltable - Versions diffs - 0.2.5__py3-none-any.whl → 0.2.7__py3-none-any.whl - Mend

pixeltable 0.2.5py3-none-any.whl → 0.2.7py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of pixeltable might be problematic. Click here for more details.

Files changed (110) hide show

pixeltable/__init__.py +20 -9
pixeltable/__version__.py +3 -0
pixeltable/catalog/column.py +23 -7
pixeltable/catalog/insertable_table.py +32 -19
pixeltable/catalog/table.py +210 -20
pixeltable/catalog/table_version.py +272 -111
pixeltable/catalog/table_version_path.py +6 -1
pixeltable/dataframe.py +184 -110
pixeltable/datatransfer/__init__.py +1 -0
pixeltable/datatransfer/label_studio.py +526 -0
pixeltable/datatransfer/remote.py +113 -0
pixeltable/env.py +213 -79
pixeltable/exec/__init__.py +2 -1
pixeltable/exec/data_row_batch.py +6 -7
pixeltable/exec/expr_eval_node.py +28 -28
pixeltable/exec/sql_scan_node.py +7 -6
pixeltable/exprs/__init__.py +4 -3
pixeltable/exprs/column_ref.py +11 -2
pixeltable/exprs/comparison.py +39 -1
pixeltable/exprs/data_row.py +7 -0
pixeltable/exprs/expr.py +26 -19
pixeltable/exprs/function_call.py +17 -18
pixeltable/exprs/globals.py +14 -2
pixeltable/exprs/image_member_access.py +9 -28
pixeltable/exprs/in_predicate.py +96 -0
pixeltable/exprs/inline_array.py +13 -11
pixeltable/exprs/inline_dict.py +15 -13
pixeltable/exprs/row_builder.py +7 -1
pixeltable/exprs/similarity_expr.py +67 -0
pixeltable/ext/functions/whisperx.py +30 -0
pixeltable/ext/functions/yolox.py +16 -0
pixeltable/func/__init__.py +0 -2
pixeltable/func/aggregate_function.py +5 -2
pixeltable/func/callable_function.py +57 -13
pixeltable/func/expr_template_function.py +14 -3
pixeltable/func/function.py +35 -4
pixeltable/func/signature.py +5 -15
pixeltable/func/udf.py +8 -12
pixeltable/functions/fireworks.py +9 -4
pixeltable/functions/huggingface.py +48 -5
pixeltable/functions/openai.py +49 -11
pixeltable/functions/pil/image.py +61 -64
pixeltable/functions/together.py +32 -6
pixeltable/functions/util.py +0 -43
pixeltable/functions/video.py +46 -8
pixeltable/globals.py +443 -0
pixeltable/index/__init__.py +1 -0
pixeltable/index/base.py +9 -2
pixeltable/index/btree.py +54 -0
pixeltable/index/embedding_index.py +91 -15
pixeltable/io/__init__.py +4 -0
pixeltable/io/globals.py +59 -0
pixeltable/{utils → io}/hf_datasets.py +48 -17
pixeltable/io/pandas.py +148 -0
pixeltable/{utils → io}/parquet.py +58 -33
pixeltable/iterators/__init__.py +1 -1
pixeltable/iterators/base.py +8 -4
pixeltable/iterators/document.py +225 -93
pixeltable/iterators/video.py +16 -9
pixeltable/metadata/__init__.py +8 -4
pixeltable/metadata/converters/convert_12.py +3 -0
pixeltable/metadata/converters/convert_13.py +41 -0
pixeltable/metadata/converters/convert_14.py +13 -0
pixeltable/metadata/converters/convert_15.py +29 -0
pixeltable/metadata/converters/util.py +63 -0
pixeltable/metadata/schema.py +12 -6
pixeltable/plan.py +11 -24
pixeltable/store.py +16 -23
pixeltable/tool/create_test_db_dump.py +49 -14
pixeltable/type_system.py +27 -58
pixeltable/utils/coco.py +94 -0
pixeltable/utils/documents.py +42 -12
pixeltable/utils/http_server.py +70 -0
pixeltable-0.2.7.dist-info/METADATA +137 -0
pixeltable-0.2.7.dist-info/RECORD +126 -0
{pixeltable-0.2.5.dist-info → pixeltable-0.2.7.dist-info}/WHEEL +1 -1
pixeltable/client.py +0 -600
pixeltable/exprs/image_similarity_predicate.py +0 -58
pixeltable/func/batched_function.py +0 -53
pixeltable/func/nos_function.py +0 -202
pixeltable/tests/conftest.py +0 -171
pixeltable/tests/ext/test_yolox.py +0 -21
pixeltable/tests/functions/test_fireworks.py +0 -43
pixeltable/tests/functions/test_functions.py +0 -60
pixeltable/tests/functions/test_huggingface.py +0 -158
pixeltable/tests/functions/test_openai.py +0 -162
pixeltable/tests/functions/test_together.py +0 -112
pixeltable/tests/test_audio.py +0 -65
pixeltable/tests/test_catalog.py +0 -27
pixeltable/tests/test_client.py +0 -21
pixeltable/tests/test_component_view.py +0 -379
pixeltable/tests/test_dataframe.py +0 -440
pixeltable/tests/test_dirs.py +0 -107
pixeltable/tests/test_document.py +0 -120
pixeltable/tests/test_exprs.py +0 -802
pixeltable/tests/test_function.py +0 -332
pixeltable/tests/test_index.py +0 -138
pixeltable/tests/test_migration.py +0 -44
pixeltable/tests/test_nos.py +0 -54
pixeltable/tests/test_snapshot.py +0 -231
pixeltable/tests/test_table.py +0 -1343
pixeltable/tests/test_transactional_directory.py +0 -42
pixeltable/tests/test_types.py +0 -52
pixeltable/tests/test_video.py +0 -159
pixeltable/tests/test_view.py +0 -535
pixeltable/tests/utils.py +0 -442
pixeltable/utils/clip.py +0 -18
pixeltable-0.2.5.dist-info/METADATA +0 -128
pixeltable-0.2.5.dist-info/RECORD +0 -139
{pixeltable-0.2.5.dist-info → pixeltable-0.2.7.dist-info}/LICENSE +0 -0

pixeltable/io/globals.py ADDED Viewed

@@ -0,0 +1,59 @@
+from typing import Any, Optional, Literal
+import pixeltable as pxt
+from pixeltable import Table
+def create_label_studio_project(
+        t: Table,
+        label_config: str,
+        col_mapping: Optional[dict[str, str]] = None,
+        title: Optional[str] = None,
+        media_import_method: Literal['post', 'file'] = 'file',
+        sync_immediately: bool = True,
+        **kwargs: Any
+) -> None:
+    """
+    Create a new Label Studio project and link it to the given `Table`.
+    `label_config` is the Label Studio project configuration, in XML format, as described
+    in the Label Studio documentation. The linked project gets one column per data field
+    in the configuration; for example, a configuration entry
+    ```
+    <Image name="image_obj" value="$image"/>
+    ```
+    yields a project column named `image`. The linked project additionally always has a
+    JSON-typed `annotations` column representing the output.
+    By default each project column is linked to the column of `t` with the same name. A
+    missing data-field column raises an exception; a missing `annotations` column is created
+    on `t`. `col_mapping` overrides the default names: its keys are Pixeltable column names
+    and its values are Label Studio field names.
+    Args:
+        t: The Table to link to.
+        label_config: The Label Studio project configuration, in XML format.
+        col_mapping: An optional mapping of local column names to remote column names.
+        title: An optional title for the Label Studio project. If not specified (or empty),
+            the name of the `Table` is used.
+        media_import_method: Either `'post'` or `'file'`; passed through unchanged to
+            `LabelStudioProject.create`.
+        sync_immediately: If `True`, immediately perform an initial synchronization by
+            importing all rows of the `Table` as Label Studio tasks.
+        kwargs: Additional keyword arguments forwarded to `LabelStudioProject.create`.
+    """
+    from pixeltable.datatransfer.label_studio import ANNOTATIONS_COLUMN, LabelStudioProject
+    project_title = title or t.get_name()
+    ls_project = LabelStudioProject.create(project_title, label_config, media_import_method, **kwargs)
+    # Work out which local column holds the annotations: the first `col_mapping` key that maps
+    # to the remote annotations field, or else the default annotations column name.
+    local_annotations_column = ANNOTATIONS_COLUMN
+    if col_mapping is not None:
+        for local_name, remote_name in col_mapping.items():
+            if remote_name == ANNOTATIONS_COLUMN:
+                local_annotations_column = local_name
+                break
+    # Create the annotations column if the table does not have it yet.
+    if local_annotations_column not in t.column_names():
+        t[local_annotations_column] = pxt.JsonType(nullable=True)
+    # Link the project to `t`, then run the initial sync if requested.
+    t._link(ls_project, col_mapping)
+    if sync_immediately:
+        t.sync()

pixeltable/{utils → io}/hf_datasets.py RENAMED Viewed

@@ -1,11 +1,17 @@
-import datasets
-from typing import Union, Optional, List, Dict, Any
-import pixeltable.type_system as ts
-from pixeltable import exceptions as excs
-import math
+from __future__ import annotations
 import logging
-import pixeltable
+import math
 import random
+import typing
+from typing import Union, Optional, Any
+import pixeltable
+import pixeltable.type_system as ts
+from pixeltable import exceptions as excs
+if typing.TYPE_CHECKING:
+    import datasets
 _logger = logging.getLogger(__name__)
@@ -17,7 +23,7 @@ _K_BATCH_SIZE_BYTES = 100_000_000
 # note, there are many more types. we allow overrides in the schema_override parameter
 # to handle cases where the appropriate type is not yet mapped, or to override this mapping.
 # https://huggingface.co/docs/datasets/v2.17.0/en/package_reference/main_classes#datasets.Value
-_hf_to_pxt: Dict[str, ts.ColumnType] = {
+_hf_to_pxt: dict[str, ts.ColumnType] = {
     'int32': ts.IntType(nullable=True),  # pixeltable widens to big int
     'int64': ts.IntType(nullable=True),
     'bool': ts.BoolType(nullable=True),
@@ -27,10 +33,13 @@ _hf_to_pxt: Dict[str, ts.ColumnType] = {
     'timestamp[ms]': ts.TimestampType(nullable=True),  # HF dataset iterator converts timestamps to datetime.datetime
 }
 def _to_pixeltable_type(
     feature_type: Union[datasets.ClassLabel, datasets.Value, datasets.Sequence],
 ) -> Optional[ts.ColumnType]:
     """Convert a huggingface feature type to a pixeltable ColumnType if one is defined."""
+    import datasets
     if isinstance(feature_type, datasets.ClassLabel):
         # enum, example: ClassLabel(names=['neg', 'pos'], id=None)
         return ts.StringType(nullable=True)
@@ -45,14 +54,18 @@ def _to_pixeltable_type(
     else:
         return None
 def _get_hf_schema(dataset: Union[datasets.Dataset, datasets.DatasetDict]) -> datasets.Features:
     """Get the schema of a huggingface dataset as a dictionary."""
+    import datasets
     first_dataset = dataset if isinstance(dataset, datasets.Dataset) else next(iter(dataset.values()))
     return first_dataset.features
 def huggingface_schema_to_pixeltable_schema(
     hf_dataset: Union[datasets.Dataset, datasets.DatasetDict],
-) -> Dict[str, Optional[ts.ColumnType]]:
+) -> dict[str, Optional[ts.ColumnType]]:
     """Generate a pixeltable schema from a huggingface dataset schema.
     Columns without a known mapping are mapped to None
     """
@@ -62,17 +75,35 @@ def huggingface_schema_to_pixeltable_schema(
     }
     return pixeltable_schema
 def import_huggingface_dataset(
-    cl: 'pixeltable.Client',
     table_path: str,
     dataset: Union[datasets.Dataset, datasets.DatasetDict],
     *,
-    column_name_for_split: Optional[str],
-    schema_override: Optional[Dict[str, Any]],
+    column_name_for_split: Optional[str] = None,
+    schema_override: Optional[dict[str, Any]] = None,
     **kwargs,
 ) -> 'pixeltable.InsertableTable':
-    """See `pixeltable.Client.import_huggingface_dataset` for documentation"""
-    if table_path in cl.list_tables():
+    """Create a new `Table` from a Huggingface dataset, or dataset dict with multiple splits.
+        Requires datasets library to be installed.
+    Args:
+        table_path: Path to the table.
+        dataset: Huggingface datasets.Dataset or datasets.DatasetDict to insert into the table.
+        column_name_for_split: column name to use for split information. If None, no split information will be stored.
+        schema_override: Optional dictionary mapping column names to column type to override the corresponding defaults from
+        `pixeltable.io.hf_datasets.huggingface_schema_to_pixeltable_schema`. The column type should be a pixeltable ColumnType.
+        For example, {'col_vid': VideoType()}, rather than {'col_vid': StringType()}.
+        kwargs: Additional arguments to pass to `create_table`.
+    Returns:
+        The newly created table. The table will have loaded the data from the dataset.
+    """
+    import datasets
+    import pixeltable as pxt
+    if table_path in pxt.list_tables():
         raise excs.Error(f'table {table_path} already exists')
     if not isinstance(dataset, (datasets.Dataset, datasets.DatasetDict)):
@@ -122,9 +153,9 @@ def import_huggingface_dataset(
     try:
         # random tmp name
         tmp_name = f'{table_path}_tmp_{random.randint(0, 100000000)}'
-        tab = cl.create_table(tmp_name, pixeltable_schema, **kwargs)
+        tab = pxt.create_table(tmp_name, pixeltable_schema, **kwargs)
-        def _translate_row(row: Dict[str, Any], split_name: str) -> Dict[str, Any]:
+        def _translate_row(row: dict[str, Any], split_name: str) -> dict[str, Any]:
             output_row = row.copy()
             # map all class labels to strings
             for field, values in categorical_features.items():
@@ -153,5 +184,5 @@ def import_huggingface_dataset(
         _logger.error(f'Error while inserting dataset into table: {tmp_name}')
         raise e
-    cl.move(tmp_name, table_path)
-    return cl.get_table(table_path)
+    pxt.move(tmp_name, table_path)
+    return pxt.get_table(table_path)

pixeltable/io/pandas.py ADDED Viewed

@@ -0,0 +1,148 @@
+from typing import Optional, Any, Iterable
+import numpy as np
+import pandas as pd
+import pixeltable as pxt
+import pixeltable.exceptions as excs
+import pixeltable.type_system as ts
+def import_pandas(
+    tbl_name: str, df: pd.DataFrame, *, schema_overrides: Optional[dict[str, pxt.ColumnType]] = None
+) -> pxt.catalog.InsertableTable:
+    """Create a new `Table` with the given name from a Pandas `DataFrame`.
+    The table schema is inferred from the `DataFrame`, except for columns listed in
+    `schema_overrides`.
+    Column names are carried over from the `DataFrame` when they are valid Pixeltable
+    identifiers. Any other column name is normalized as follows:
+    - first replace any non-alphanumeric characters with underscores;
+    - then, preface the result with the letter 'c' if it begins with a number or an underscore;
+    - then, if there are any duplicate column names, suffix the duplicates with '_2', '_3', etc., in column order.
+    Args:
+        tbl_name: The name of the table to create.
+        df: The Pandas `DataFrame`.
+        schema_overrides: If specified, then for each (name, type) pair in `schema_overrides`, the column with
+            name `name` will be given type `type`, instead of being inferred from the `DataFrame`. The keys in
+            `schema_overrides` should be the column names of the `DataFrame` (whether or not they are valid
+            Pixeltable identifiers).
+    Returns:
+        The newly created table, populated with the rows of `df`.
+    """
+    schema = _df_to_pxt_schema(df, schema_overrides)
+    # Rows are converted lazily, one at a time, while `insert` consumes the generator.
+    converted_rows = (dict(_df_row_to_pxt_row(df_row, schema)) for df_row in df.itertuples())
+    new_table = pxt.create_table(tbl_name, schema)
+    new_table.insert(converted_rows)
+    return new_table
+def import_csv(
+    table_path: str, filepath_or_buffer, schema_overrides: Optional[dict[str, ts.ColumnType]] = None, **kwargs
+) -> pxt.catalog.InsertableTable:
+    """
+    Create a new `Table` from a CSV file.
+    Convenience wrapper, equivalent to calling
+    `import_pandas(table_path, pd.read_csv(filepath_or_buffer, **kwargs), schema_overrides=schema_overrides)`.
+    Extra keyword arguments are forwarded to `pd.read_csv`; see the Pandas documentation
+    for `read_csv` for details.
+    """
+    return import_pandas(
+        table_path, pd.read_csv(filepath_or_buffer, **kwargs), schema_overrides=schema_overrides
+    )
+def import_excel(
+    table_path: str, io, *args, schema_overrides: Optional[dict[str, ts.ColumnType]] = None, **kwargs
+) -> pxt.catalog.InsertableTable:
+    """
+    Create a new `Table` from an Excel (.xlsx) file.
+    Convenience wrapper, equivalent to calling
+    `import_pandas(table_path, pd.read_excel(io, *args, **kwargs), schema_overrides=schema_overrides)`.
+    Extra positional and keyword arguments are forwarded to `pd.read_excel`; see the Pandas
+    documentation for `read_excel` for details.
+    """
+    return import_pandas(
+        table_path, pd.read_excel(io, *args, **kwargs), schema_overrides=schema_overrides
+    )
+def _df_to_pxt_schema(
+    df: pd.DataFrame, schema_overrides: Optional[dict[str, pxt.ColumnType]]
+) -> dict[str, pxt.ColumnType]:
+    """
+    Build a Pixeltable schema (normalized column name -> column type) for `df`.
+    Types come from `schema_overrides` (keyed by the original DataFrame column name) when
+    present, and are otherwise inferred from the column's dtype. Entries are added in
+    DataFrame column order.
+    Raises:
+        excs.Error: if `schema_overrides` names a column that is not in `df`.
+    """
+    overrides = {} if schema_overrides is None else schema_overrides
+    # Reject overrides that do not correspond to any DataFrame column.
+    for override_name in overrides:
+        if override_name not in df.columns:
+            raise excs.Error(
+                f'Column `{override_name}` specified in `schema_overrides` does not exist in the given `DataFrame`.'
+            )
+    schema = {}
+    for pd_name, pd_dtype in zip(df.columns, df.dtypes):
+        pxt_type = overrides[pd_name] if pd_name in overrides else _np_dtype_to_pxt_type(pd_dtype, df[pd_name])
+        base_name = _normalize_pxt_col_name(pd_name)
+        # Resolve collisions between normalized names by trying the suffixes _2, _3, ...
+        # until an unused name is found.
+        unique_name = base_name
+        suffix = 1
+        while unique_name in schema:
+            suffix += 1
+            unique_name = f'{base_name}_{suffix}'
+        schema[unique_name] = pxt_type
+    return schema
+def _normalize_pxt_col_name(pd_name: str) -> str:
+    """
+    Normalizes an arbitrary DataFrame column label into a valid Pixeltable identifier by:
+    - converting the label to a string (DataFrame labels may be integers or other objects)
+    - replacing any non-ascii or non-alphanumeric characters with an underscore _
+    - treating an empty label as a single underscore
+    - prefixing the result with 'c_' if it starts with a digit, or with 'c' if it starts
+      with an underscore
+    Args:
+        pd_name: The original DataFrame column label.
+    Returns:
+        The normalized column name.
+    """
+    # Labels are not guaranteed to be strings (e.g. the integer labels of a header-less
+    # frame); iterating a non-string label character by character would raise TypeError.
+    raw_name = str(pd_name)
+    normalized = ''.join(ch if ch.isascii() and ch.isalnum() else '_' for ch in raw_name)
+    if not normalized:
+        # An empty label has no first character to inspect; fall back to '_' so the
+        # prefixing rule below still produces a non-empty name.
+        normalized = '_'
+    if normalized[0].isnumeric():
+        normalized = f'c_{normalized}'
+    elif normalized[0] == '_':
+        normalized = f'c{normalized}'
+    # Internal sanity check: the steps above are expected to always yield a valid identifier.
+    assert pxt.catalog.is_valid_identifier(normalized), normalized
+    return normalized
+def _np_dtype_to_pxt_type(np_dtype: np.dtype, data_col: pd.Series) -> pxt.ColumnType:
+    """
+    Infer the Pixeltable column type for a DataFrame column from its Numpy dtype.
+    Integer, floating and boolean dtypes map to `IntType`, `FloatType` and `BoolType`.
+    Object/character dtypes map to `StringType`, and datetime64 dtypes to `TimestampType`;
+    for those two the values in `data_col` are scanned, and the type is made nullable when
+    a missing value (a float NaN for strings, a null timestamp for datetimes) is found.
+    Raises:
+        excs.Error: if the dtype matches none of the supported categories.
+    """
+    # Dtype categories whose Pixeltable type does not depend on the column's values.
+    simple_kinds = (
+        (np.integer, pxt.IntType),
+        (np.floating, pxt.FloatType),
+        (np.bool_, pxt.BoolType),
+    )
+    for np_kind, make_type in simple_kinds:
+        if np.issubdtype(np_dtype, np_kind):
+            return make_type()
+    if np_dtype == np.object_ or np.issubdtype(np_dtype, np.character):
+        contains_nan = any(isinstance(item, float) and np.isnan(item) for item in data_col)
+        return pxt.StringType(nullable=contains_nan)
+    if np.issubdtype(np_dtype, np.datetime64):
+        contains_nat = any(pd.isnull(item) for item in data_col)
+        return pxt.TimestampType(nullable=contains_nat)
+    raise excs.Error(f'Unsupported dtype: {np_dtype}')
+def _df_row_to_pxt_row(row: tuple[Any, ...], schema: dict[str, pxt.ColumnType]) -> dict[str, Any]:
+    """
+    Convert one DataFrame row into a dict keyed by Pixeltable column name.
+    `row` is a tuple as yielded by `DataFrame.itertuples()`: its first element (the index)
+    is skipped and the remaining values are paired positionally with the entries of `schema`.
+    Each value is coerced to the Python type matching its Pixeltable column type.
+    """
+    pxt_row = {}
+    for (col_name, col_type), cell in zip(schema.items(), row[1:]):
+        if col_type.is_float_type():
+            # Float columns keep NaN as a float value.
+            converted = float(cell)
+        elif isinstance(cell, float) and np.isnan(cell):
+            # pandas uses NaN for empty cells, even for types other than float;
+            # for any type but a float, convert these to None
+            converted = None
+        elif col_type.is_int_type():
+            converted = int(cell)
+        elif col_type.is_bool_type():
+            converted = bool(cell)
+        elif col_type.is_string_type():
+            converted = str(cell)
+        elif col_type.is_timestamp_type():
+            # pandas has the bespoke 'NaT' type for a missing timestamp; postgres is very
+            # much not-ok with it. (But if we convert it to None and then load out the
+            # table contents as a pandas DataFrame, it will correctly restore the 'NaT'!)
+            converted = None if pd.isnull(cell) else pd.Timestamp(cell).to_pydatetime()
+        else:
+            # Any other column type: pass the value through unchanged.
+            converted = cell
+        pxt_row[col_name] = converted
+    return pxt_row

pixeltable/{utils → io}/parquet.py RENAMED Viewed

@@ -1,26 +1,31 @@
+from __future__ import annotations
 import io
 import json
 import logging
+import random
+import typing
 from collections import deque
 from pathlib import Path
-from typing import Dict, List, Optional, Union
+from typing import Dict, Optional
-import numpy as np
 import PIL.Image
-import pyarrow as pa
-import pyarrow.parquet
+import numpy as np
+import pixeltable.exceptions as exc
 import pixeltable.type_system as ts
-from pixeltable.utils.arrow import iter_tuples, to_arrow_schema, to_pixeltable_schema
 from pixeltable.utils.transactional_directory import transactional_directory
-import pixeltable.exceptions as exc
-import random
+if typing.TYPE_CHECKING:
+    import pixeltable as pxt
+    import pyarrow as pa
 _logger = logging.getLogger(__name__)
-def _write_batch(value_batch : Dict[str, deque], schema : pa.Schema, output_path : Path) -> None:
+def _write_batch(value_batch: Dict[str, deque], schema: pa.Schema, output_path: Path) -> None:
+    import pyarrow as pa
     pydict = {}
     for field in schema:
         if isinstance(field.type, pa.FixedShapeTensorType):
@@ -32,21 +37,24 @@ def _write_batch(value_batch : Dict[str, deque], schema : pa.Schema, output_path
     tab = pa.Table.from_pydict(pydict, schema=schema)
     pa.parquet.write_table(tab, output_path)
-def save_parquet(df: 'pixeltable.DataFrame', dest_path: Path, partition_size_bytes: int = 100_000_000) -> None:
+def save_parquet(df: pxt.DataFrame, dest_path: Path, partition_size_bytes: int = 100_000_000) -> None:
     """
-        Internal method to stream dataframe data to parquet format.
-        Does not materialize the dataset to memory.
+    Internal method to stream dataframe data to parquet format.
+    Does not materialize the dataset to memory.
-        It preserves pixeltable type metadata in a json file, which would otherwise
-        not be available in the parquet format.
+    It preserves pixeltable type metadata in a json file, which would otherwise
+    not be available in the parquet format.
-        Images are stored inline in a compressed format in their parquet file.
+    Images are stored inline in a compressed format in their parquet file.
-        Args:
-            df : dataframe to save.
-            dest_path : path to directory to save the parquet files to.
-            partition_size_bytes : maximum target size for each chunk. Default 100_000_000 bytes.
+    Args:
+        df : dataframe to save.
+        dest_path : path to directory to save the parquet files to.
+        partition_size_bytes : maximum target size for each chunk. Default 100_000_000 bytes.
     """
+    from pixeltable.utils.arrow import to_arrow_schema
     column_names = df.get_column_names()
     column_types = df.get_column_types()
     type_dict = {k: v.as_dict() for k, v in zip(column_names, column_types)}
@@ -55,15 +63,15 @@ def save_parquet(df: 'pixeltable.DataFrame', dest_path: Path, partition_size_byt
     # store the changes atomically
     with transactional_directory(dest_path) as temp_path:
         # dump metadata json file so we can inspect what was the source of the parquet file later on.
-        json.dump(df._as_dict(), (temp_path / '.pixeltable.json').open('w')) # pylint: disable=protected-access
-        json.dump(type_dict, (temp_path / '.pixeltable.column_types.json').open('w')) # keep type metadata
+        json.dump(df._as_dict(), (temp_path / '.pixeltable.json').open('w'))  # pylint: disable=protected-access
+        json.dump(type_dict, (temp_path / '.pixeltable.column_types.json').open('w'))  # keep type metadata
         batch_num = 0
-        current_value_batch : Dict[str, deque] = {k:deque() for k in column_names}
+        current_value_batch: Dict[str, deque] = {k: deque() for k in column_names}
         current_byte_estimate = 0
-        for data_row in df._exec(): # pylint: disable=protected-access
-            for (col_name, col_type, e) in zip(column_names, column_types, df._select_list_exprs): # pylint: disable=protected-access
+        for data_row in df._exec():  # pylint: disable=protected-access
+            for col_name, col_type, e in zip(column_names, column_types, df._select_list_exprs):  # pylint: disable=protected-access
                 val = data_row[e.slot_idx]
                 if val is None:
                     current_value_batch[col_name].append(val)
@@ -112,9 +120,9 @@ def save_parquet(df: 'pixeltable.DataFrame', dest_path: Path, partition_size_byt
                 current_byte_estimate += length
             if current_byte_estimate > partition_size_bytes:
                 assert batch_num < 100_000, 'wrote too many parquet files, unclear ordering'
-                _write_batch(current_value_batch, arrow_schema,  temp_path / f'part-{batch_num:05d}.parquet')
+                _write_batch(current_value_batch, arrow_schema, temp_path / f'part-{batch_num:05d}.parquet')
                 batch_num += 1
-                current_value_batch = {k:deque() for k in column_names}
+                current_value_batch = {k: deque() for k in column_names}
                 current_byte_estimate = 0
         _write_batch(current_value_batch, arrow_schema, temp_path / f'part-{batch_num:05d}.parquet')
@@ -122,6 +130,8 @@ def save_parquet(df: 'pixeltable.DataFrame', dest_path: Path, partition_size_byt
 def parquet_schema_to_pixeltable_schema(parquet_path: str) -> Dict[str, Optional[ts.ColumnType]]:
     """Generate a default pixeltable schema for the given parquet file. Returns None for unknown types."""
+    import pyarrow as pa
+    from pixeltable.utils.arrow import to_pixeltable_schema
     input_path = Path(parquet_path).expanduser()
     parquet_dataset = pa.parquet.ParquetDataset(input_path)
@@ -129,14 +139,29 @@ def parquet_schema_to_pixeltable_schema(parquet_path: str) -> Dict[str, Optional
 def import_parquet(
-    cl: 'pixeltable.Client',
     table_path: str,
     *,
     parquet_path: str,
-    schema_override: Optional[Dict[str, ts.ColumnType]],
+    schema_override: Optional[Dict[str, ts.ColumnType]] = None,
     **kwargs,
-) -> 'catalog.InsertableTable':
-    """See `pixeltable.Client.import_parquet` for documentation"""
+) -> pxt.catalog.InsertableTable:
+    """Create a new `Table` from a Parquet file or set of files. Requires pyarrow to be installed.
+    Args:
+        table_path: Path to the table within pixeltable.
+        parquet_path: Path to an individual Parquet file or directory of Parquet files.
+        schema_override: Optional dictionary mapping column names to column type to override the default
+                        schema inferred from the Parquet file. The column type should be a pixeltable ColumnType.
+                        For example, {'col_vid': VideoType()}, rather than {'col_vid': StringType()}.
+                        Any fields not provided explicitly will map to types with `pixeltable.io.parquet.parquet_schema_to_pixeltable_schema`
+        kwargs: Additional arguments to pass to `create_table`.
+    Returns:
+        The newly created table. The table will have loaded the data from the Parquet file(s).
+    """
+    import pixeltable as pxt
+    import pyarrow as pa
+    from pixeltable.utils.arrow import iter_tuples
     input_path = Path(parquet_path).expanduser()
     parquet_dataset = pa.parquet.ParquetDataset(input_path)
@@ -149,12 +174,12 @@ def import_parquet(
         if v is None:
             raise exc.Error(f'Could not infer pixeltable type for column {k} from parquet file')
-    if table_path in cl.list_tables():
+    if table_path in pxt.list_tables():
         raise exc.Error(f'Table {table_path} already exists')
     try:
         tmp_name = f'{table_path}_tmp_{random.randint(0, 100000000)}'
-        tab = cl.create_table(tmp_name, schema, **kwargs)
+        tab = pxt.create_table(tmp_name, schema, **kwargs)
         for fragment in parquet_dataset.fragments:
             for batch in fragment.to_batches():
                 dict_batch = list(iter_tuples(batch))
@@ -163,5 +188,5 @@ def import_parquet(
         _logger.error(f'Error while inserting Parquet file into table: {e}')
         raise e
-    cl.move(tmp_name, table_path)
-    return cl.get_table(table_path)
+    pxt.move(tmp_name, table_path)
+    return pxt.get_table(table_path)

pixeltable/iterators/__init__.py CHANGED Viewed

@@ -1,3 +1,3 @@
 from .base import ComponentIterator
+from .document import DocumentSplitter
 from .video import FrameIterator

pixeltable/iterators/base.py CHANGED Viewed

@@ -6,11 +6,11 @@ from pixeltable.type_system import ColumnType
 class ComponentIterator(ABC):
-    """Base class for iterators."""
+    """Base class for Pixeltable iterators."""
     @classmethod
     @abstractmethod
-    def input_schema(cls) -> Dict[str, ColumnType]:
+    def input_schema(cls) -> dict[str, ColumnType]:
         """Provide the Pixeltable types of the init() parameters
         The keys need to match the names of the init() parameters. This is equivalent to the parameters_types
@@ -20,7 +20,7 @@ class ComponentIterator(ABC):
     @classmethod
     @abstractmethod
-    def output_schema(cls, *args: Any, **kwargs: Any) -> Tuple[Dict[str, ColumnType], List[str]]:
+    def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ColumnType], list[str]]:
         """Specify the dictionary returned by next() and a list of unstored column names
         Returns:
@@ -33,7 +33,7 @@ class ComponentIterator(ABC):
         return self
     @abstractmethod
-    def __next__(self) -> Dict[str, Any]:
+    def __next__(self) -> dict[str, Any]:
         """Return the next element of the iterator as a dictionary or raise StopIteration"""
         raise NotImplementedError
@@ -46,3 +46,7 @@ class ComponentIterator(ABC):
     def set_pos(self, pos: int) -> None:
         """Set the iterator position to pos"""
         raise NotImplementedError
+    @classmethod
+    def create(cls, **kwargs: Any) -> tuple[type[ComponentIterator], dict[str, Any]]:
+        return cls, kwargs

pixeltable 0.2.5__py3-none-any.whl → 0.2.7__py3-none-any.whl

Potentially problematic release.

pixeltable 0.2.5py3-none-any.whl → 0.2.7py3-none-any.whl