pixeltable 0.2.15__py3-none-any.whl → 0.2.16__py3-none-any.whl
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/column.py +3 -0
- pixeltable/catalog/dir.py +1 -1
- pixeltable/catalog/globals.py +15 -6
- pixeltable/catalog/insertable_table.py +23 -8
- pixeltable/catalog/named_function.py +1 -1
- pixeltable/catalog/path_dict.py +4 -4
- pixeltable/catalog/schema_object.py +30 -18
- pixeltable/catalog/table.py +84 -99
- pixeltable/catalog/table_version.py +35 -24
- pixeltable/catalog/table_version_path.py +2 -2
- pixeltable/catalog/view.py +15 -8
- pixeltable/dataframe.py +56 -56
- pixeltable/env.py +6 -5
- pixeltable/exec/__init__.py +3 -3
- pixeltable/exec/aggregation_node.py +3 -3
- pixeltable/exec/expr_eval_node.py +3 -3
- pixeltable/exec/in_memory_data_node.py +4 -4
- pixeltable/exec/sql_node.py +4 -1
- pixeltable/exprs/array_slice.py +3 -4
- pixeltable/exprs/column_ref.py +20 -4
- pixeltable/exprs/comparison.py +11 -6
- pixeltable/exprs/data_row.py +3 -0
- pixeltable/exprs/expr.py +51 -23
- pixeltable/exprs/function_call.py +8 -1
- pixeltable/exprs/inline_array.py +2 -2
- pixeltable/exprs/json_path.py +36 -20
- pixeltable/exprs/row_builder.py +4 -4
- pixeltable/exprs/rowid_ref.py +1 -1
- pixeltable/functions/__init__.py +1 -2
- pixeltable/functions/audio.py +32 -0
- pixeltable/functions/huggingface.py +4 -4
- pixeltable/functions/image.py +1 -1
- pixeltable/functions/video.py +5 -1
- pixeltable/functions/vision.py +2 -6
- pixeltable/globals.py +57 -28
- pixeltable/io/external_store.py +4 -4
- pixeltable/io/globals.py +12 -13
- pixeltable/io/label_studio.py +6 -6
- pixeltable/io/pandas.py +27 -12
- pixeltable/io/parquet.py +14 -14
- pixeltable/iterators/document.py +7 -7
- pixeltable/plan.py +58 -29
- pixeltable/store.py +32 -31
- pixeltable/tool/create_test_db_dump.py +12 -6
- pixeltable/type_system.py +89 -97
- pixeltable/utils/pytorch.py +12 -10
- {pixeltable-0.2.15.dist-info → pixeltable-0.2.16.dist-info}/METADATA +10 -10
- {pixeltable-0.2.15.dist-info → pixeltable-0.2.16.dist-info}/RECORD +52 -51
- {pixeltable-0.2.15.dist-info → pixeltable-0.2.16.dist-info}/LICENSE +0 -0
- {pixeltable-0.2.15.dist-info → pixeltable-0.2.16.dist-info}/WHEEL +0 -0
- {pixeltable-0.2.15.dist-info → pixeltable-0.2.16.dist-info}/entry_points.txt +0 -0
pixeltable/globals.py CHANGED

@@ -1,15 +1,18 @@
 import dataclasses
 import logging
 from typing import Any, Optional, Union
+from uuid import UUID

 import pandas as pd
 import sqlalchemy as sql
+from pandas.io.formats.style import Styler
 from sqlalchemy.util.preloaded import orm

 import pixeltable.exceptions as excs
 import pixeltable.exprs as exprs
-from pixeltable import catalog, func
+from pixeltable import DataFrame, catalog, func
 from pixeltable.catalog import Catalog
+from pixeltable.dataframe import DataFrameResultSet
 from pixeltable.env import Env
 from pixeltable.iterators import ComponentIterator
 from pixeltable.metadata import schema
@@ -24,21 +27,25 @@ def init() -> None:

 def create_table(
     path_str: str,
-    schema: dict[str, Any],
+    schema_or_df: Union[dict[str, Any], DataFrame],
     *,
     primary_key: Optional[Union[str, list[str]]] = None,
     num_retained_versions: int = 10,
     comment: str = '',
-) -> catalog.InsertableTable:
-    """Create a new table.
+) -> catalog.Table:
+    """Create a new base table.

     Args:
         path_str: Path to the table.
-        schema: dictionary mapping column names to column types.
+        schema_or_df: Either a dictionary that maps column names to column types, or a
+            [`DataFrame`][pixeltable.DataFrame] whose contents and schema will be used to pre-populate the table.
+        primary_key: An optional column name or list of column names to use as the primary key(s) of the
+            table.
         num_retained_versions: Number of versions of the table to retain.
+        comment: An optional comment; its meaning is user-defined.

     Returns:
-        The newly created table.
+        A handle to the newly created [`Table`][pixeltable.Table].

     Raises:
         Error: if the path already exists or is invalid.
@@ -46,12 +53,27 @@ def create_table(
     Examples:
         Create a table with an int and a string column:

-        >>> table = cl.create_table('my_table', schema={'col1': IntType(), 'col2': StringType()})
+        >>> table = pxt.create_table('my_table', schema={'col1': IntType(), 'col2': StringType()})
+
+        Create a table from a select statement over an existing table `tbl`:
+
+        >>> table = pxt.create_table('my_table', tbl.where(tbl.col1 < 10).select(tbl.col2))
     """
     path = catalog.Path(path_str)
     Catalog.get().paths.check_is_valid(path, expected=None)
     dir = Catalog.get().paths[path.parent]

+    df: Optional[DataFrame] = None
+    if isinstance(schema_or_df, dict):
+        schema = schema_or_df
+    elif isinstance(schema_or_df, DataFrame):
+        df = schema_or_df
+        schema = df.schema
+    elif isinstance(schema_or_df, DataFrameResultSet):
+        raise excs.Error('`schema_or_df` must be either a schema dictionary or a Pixeltable DataFrame. (Is there an extraneous call to `collect()`?)')
+    else:
+        raise excs.Error('`schema_or_df` must be either a schema dictionary or a Pixeltable DataFrame.')
+
     if len(schema) == 0:
         raise excs.Error(f'Table schema is empty: `{path_str}`')

@@ -63,15 +85,17 @@ def create_table(
     if not isinstance(primary_key, list) or not all(isinstance(pk, str) for pk in primary_key):
         raise excs.Error('primary_key must be a single column name or a list of column names')

-    tbl = catalog.InsertableTable.create(
+    tbl = catalog.InsertableTable._create(
         dir._id,
         path.name,
         schema,
+        df,
         primary_key=primary_key,
         num_retained_versions=num_retained_versions,
         comment=comment,
     )
     Catalog.get().paths[path] = tbl
+
     _logger.info(f'Created table `{path_str}`.')
     return tbl

@@ -87,25 +111,28 @@ def create_view(
     num_retained_versions: int = 10,
     comment: str = '',
     ignore_errors: bool = False,
-) -> catalog.View:
-    """Create a view.
+) -> Optional[catalog.Table]:
+    """Create a view of an existing table object (which itself can be a view or a snapshot or a base table).

     Args:
         path_str: Path to the view.
-        base: Table (i.e., table or view or snapshot) or DataFrame to base the view on.
+        base: [`Table`][pixeltable.Table] (i.e., table or view or snapshot) or [`DataFrame`][pixeltable.DataFrame] to
+            base the view on.
         schema: dictionary mapping column names to column types, value expressions, or to column specifications.
         filter: predicate to filter rows of the base table.
         is_snapshot: Whether the view is a snapshot.
         iterator: The iterator to use for this view. If specified, then this view will be a one-to-many view of
             the base table.
         num_retained_versions: Number of versions of the view to retain.
+        comment: Optional comment for the view.
         ignore_errors: if True, fail silently if the path already exists or is invalid.

     Returns:
-        The newly created view.
+        A handle to the [`Table`][pixeltable.Table] representing the newly created view. If the path already
+        exists or is invalid and `ignore_errors=True`, returns `None`.

     Raises:
-        Error: if the path already exists or is invalid
+        Error: if the path already exists or is invalid and `ignore_errors=False`.

     Examples:
         Create a view with an additional int and a string column and a filter:
@@ -140,7 +167,7 @@ def create_view(
         Catalog.get().paths.check_is_valid(path, expected=None)
     except Exception as e:
         if ignore_errors:
-            return
+            return None
         else:
             raise e
     dir = Catalog.get().paths[path.parent]
@@ -152,7 +179,7 @@ def create_view(
     else:
         iterator_class, iterator_args = iterator

-    view = catalog.View.create(
+    view = catalog.View._create(
         dir._id,
         path.name,
         base=tbl_version_path,
@@ -170,16 +197,16 @@


 def get_table(path: str) -> catalog.Table:
-    """Get a handle to a table.
+    """Get a handle to an existing table or view or snapshot.

     Args:
         path: Path to the table.

     Returns:
-        A handle to the table.
+        A handle to the [`Table`][pixeltable.Table].

     Raises:
-        Error: If the path does not exist or does not designate a table.
+        Error: If the path does not exist or does not designate a table object.

     Examples:
         Get handle for a table in the top-level directory:
@@ -197,6 +224,7 @@ def get_table(path: str) -> catalog.Table:
     p = catalog.Path(path)
     Catalog.get().paths.check_is_valid(p, expected=catalog.Table)
     obj = Catalog.get().paths[p]
+    assert isinstance(obj, catalog.Table)
     return obj


@@ -230,15 +258,15 @@ def move(path: str, new_path: str) -> None:


 def drop_table(path: str, force: bool = False, ignore_errors: bool = False) -> None:
-    """Drop a table.
+    """Drop a table or view or snapshot.

     Args:
-        path: Path to the table.
+        path: Path to the [`Table`][pixeltable.Table].
         force: If `True`, will also drop all views or sub-views of this table.
         ignore_errors: Whether to ignore errors if the table does not exist.

     Raises:
-        Error: If the path does not exist or does not designate a table and ignore_errors is False.
+        Error: If the path does not exist or does not designate a table object and ignore_errors is False.

     Examples:
         >>> cl.drop_table('my_table')
@@ -256,7 +284,7 @@ def drop_table(path: str, force: bool = False, ignore_errors: bool = False) -> None:
     tbl = cat.paths[path_obj]
     assert isinstance(tbl, catalog.Table)
     if len(cat.tbl_dependents[tbl._id]) > 0:
-        dependent_paths = [dep.path for dep in cat.tbl_dependents[tbl._id]]
+        dependent_paths = [dep._path for dep in cat.tbl_dependents[tbl._id]]
         if force:
             for dependent_path in dependent_paths:
                 drop_table(dependent_path, force=True)
@@ -268,14 +296,14 @@ def drop_table(path: str, force: bool = False, ignore_errors: bool = False) -> None:


 def list_tables(dir_path: str = '', recursive: bool = True) -> list[str]:
-    """List the tables in a directory.
+    """List the [`Table`][pixeltable.Table]s in a directory.

     Args:
         dir_path: Path to the directory. Defaults to the root directory.
         recursive: Whether to list tables in subdirectories as well.

     Returns:
-        A list of table paths.
+        A list of [`Table`][pixeltable.Table] paths.

     Raises:
         Error: If the path does not exist or does not designate a directory.
@@ -297,7 +325,7 @@ def list_tables(dir_path: str = '', recursive: bool = True) -> list[str]:
     return [str(p) for p in Catalog.get().paths.get_children(path, child_type=catalog.Table, recursive=recursive)]


-def create_dir(path_str: str, ignore_errors: bool = False) -> catalog.Dir:
+def create_dir(path_str: str, ignore_errors: bool = False) -> Optional[catalog.Dir]:
     """Create a directory.

     Args:
@@ -325,6 +353,7 @@ def create_dir(path_str: str, ignore_errors: bool = False) -> catalog.Dir:
             session.add(dir_record)
             session.flush()
             assert dir_record.id is not None
+            assert isinstance(dir_record.id, UUID)
             dir = catalog.Dir(dir_record.id, parent._id, path.name)
             Catalog.get().paths[path] = dir
             session.commit()
@@ -333,7 +362,7 @@ def create_dir(path_str: str, ignore_errors: bool = False) -> catalog.Dir:
             return dir
     except excs.Error as e:
         if ignore_errors:
-            return
+            return None
         else:
             raise e

@@ -415,7 +444,7 @@ def list_dirs(path_str: str = '', recursive: bool = True) -> list[str]:
     return [str(p) for p in Catalog.get().paths.get_children(path, child_type=catalog.Dir, recursive=recursive)]


-def list_functions() -> pd.DataFrame:
+def list_functions() -> Styler:
     """Returns information about all registered functions.

     Returns:
@@ -436,7 +465,7 @@ def list_functions() -> pd.DataFrame:
             'Return Type': [str(f.signature.get_return_type()) for f in functions],
         }
     )
-    pd_df = pd_df.style.set_properties(**{'text-align': 'left'}).set_table_styles(
+    pd_df = pd_df.style.set_properties(None, **{'text-align': 'left'}).set_table_styles(
         [dict(selector='th', props=[('text-align', 'center')])]
     )  # center-align headings
     return pd_df.hide(axis='index')
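The substantive change above is that `create_table` now dispatches on the type of its second argument (`schema_or_df`), accepting either a schema dictionary or a `DataFrame` whose schema and rows seed the new table. A minimal usage sketch of both call forms; the table names and columns are illustrative, not taken from the diff:

    import pixeltable as pxt

    # Form 1: explicit schema dictionary (pre-0.2.16 behavior).
    tbl = pxt.create_table('demo', {'col1': pxt.IntType(), 'col2': pxt.StringType()})
    tbl.insert([{'col1': 1, 'col2': 'a'}, {'col1': 20, 'col2': 'b'}])

    # Form 2 (new): pass a DataFrame; its schema and contents pre-populate the table.
    filtered = pxt.create_table('demo_filtered', tbl.where(tbl.col1 < 10).select(tbl.col2))

    # The new DataFrameResultSet branch catches a common slip: collect() materializes
    # results instead of returning a DataFrame, so the following would raise an Error
    # with a hint about the extraneous collect():
    #     pxt.create_table('bad', tbl.where(tbl.col1 < 10).collect())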
pixeltable/io/external_store.py CHANGED

@@ -217,17 +217,17 @@ class Project(ExternalStore, abc.ABC):
         resolved_col_mapping: dict[Column, str] = {}

         # Validate names
-        t_cols = table.column_names()
+        t_cols = set(table._schema.keys())
         for t_col, ext_col in col_mapping.items():
             if t_col not in t_cols:
                 if is_user_specified_col_mapping:
                     raise excs.Error(
-                        f'Column name `{t_col}` appears as a key in `col_mapping`, but Table `{table.name}` '
+                        f'Column name `{t_col}` appears as a key in `col_mapping`, but Table `{table._name}` '
                         'contains no such column.'
                     )
                 else:
                     raise excs.Error(
-                        f'Column `{t_col}` does not exist in Table `{table.name}`. Either add a column `{t_col}`, '
+                        f'Column `{t_col}` does not exist in Table `{table._name}`. Either add a column `{t_col}`, '
                         f'or specify a `col_mapping` to associate a different column with the external field `{ext_col}`.'
                     )
             if ext_col not in export_cols and ext_col not in import_cols:
@@ -238,7 +238,7 @@ class Project(ExternalStore, abc.ABC):
             col = table[t_col].col
             resolved_col_mapping[col] = ext_col
         # Validate column specs
-        t_col_types = table.column_types()
+        t_col_types = table._schema
         for t_col, ext_col in col_mapping.items():
             t_col_type = t_col_types[t_col]
             if ext_col in export_cols:
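For orientation, `validate_columns` checks a `col_mapping` (Pixeltable column name to external field name) against the table's columns and the store's declared export/import fields; this release only changes how the schema is read (`table._schema` instead of the former public accessors). A toy re-creation of the name checks, with plain dicts and sets standing in for the real objects (all names invented for illustration):

    table_schema = {'frame': 'Image', 'annotations': 'Json'}  # column name -> type
    export_cols = {'image'}        # fields the external store consumes
    import_cols = {'annotations'}  # fields the external store produces

    col_mapping = {'frame': 'image', 'annotations': 'annotations'}
    for t_col, ext_col in col_mapping.items():
        if t_col not in table_schema:
            raise ValueError(f'Column `{t_col}` does not exist in the table')
        if ext_col not in export_cols and ext_col not in import_cols:
            raise ValueError(f'`{ext_col}` is not a field of the external store')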
pixeltable/io/globals.py CHANGED

@@ -1,5 +1,4 @@
 from typing import Any, Literal, Optional, Union
-import urllib.request

 import pixeltable as pxt
 import pixeltable.exceptions as excs
@@ -19,7 +18,7 @@ def create_label_studio_project(
     **kwargs: Any
 ) -> SyncStatus:
     """
-    Create a new Label Studio project and link it to the specified `Table`.
+    Create a new Label Studio project and link it to the specified [`Table`][pixeltable.Table].

     - A tutorial notebook with fully worked examples can be found here:
       [Using Label Studio for Annotations with Pixeltable](https://pixeltable.readme.io/docs/label-studio)
@@ -34,7 +33,7 @@ def create_label_studio_project(
     then the linked project will have a column named `image`. In addition, the linked project
     will always have a JSON-typed column `annotations` representing the output.

-    By default, Pixeltable will link each of these columns to a column of the specified `Table`
+    By default, Pixeltable will link each of these columns to a column of the specified [`Table`][pixeltable.Table]
     with the same name. If any of the data fields are missing, an exception will be raised. If
     the `annotations` column is missing, it will be created. The default names can be overridden
     by specifying an optional `col_mapping`, with Pixeltable column names as keys and Label
@@ -52,7 +51,7 @@ def create_label_studio_project(
     - `pip install boto3` (if using S3 import storage)

     Args:
-        t: The `Table` to link to.
+        t: The table to link to.
         label_config: The Label Studio project configuration, in XML format.
         name: An optional name for the new project in Pixeltable. If specified, must be a valid
             Pixeltable identifier and must not be the name of any other external data store
@@ -73,7 +72,7 @@ def create_label_studio_project(
             The default is `post`.
         col_mapping: An optional mapping of local column names to Label Studio fields.
         sync_immediately: If `True`, immediately perform an initial synchronization by
-            exporting all rows of the `Table` as Label Studio tasks.
+            exporting all rows of the table as Label Studio tasks.
         s3_configuration: If specified, S3 import storage will be configured for the new project. This can only
             be used with `media_import_method='url'`, and if `media_import_method='url'` and any of the media data is
             referenced by `s3://` URLs, then it must be specified in order for such media to display correctly
@@ -148,15 +147,15 @@ def import_rows(
     comment: str = ''
 ) -> Table:
     """
-    Creates a new `Table` from a list of dictionaries. The dictionaries must be of the form
-    `{column_name: value, ...}`. Pixeltable will attempt to infer the schema of the table from the
+    Creates a new base table from a list of dictionaries. The dictionaries must be of the
+    form `{column_name: value, ...}`. Pixeltable will attempt to infer the schema of the table from the
     supplied data, using the most specific type that can represent all the values in a column.

     If `schema_overrides` is specified, then for each entry `(column_name, type)` in `schema_overrides`,
     Pixeltable will force the specified column to the specified type (and will not attempt any type inference
     for that column).

-    All column types of the new `Table` will be nullable unless explicitly specified as non-nullable in
+    All column types of the new table will be nullable unless explicitly specified as non-nullable in
     `schema_overrides`.

     Args:
@@ -169,7 +168,7 @@ def import_rows(
         comment: A comment to attach to the table (see [`create_table()`][pixeltable.create_table]).

     Returns:
-        The newly created table.
+        A handle to the newly created [`Table`][pixeltable.Table].
     """
     if schema_overrides is None:
         schema_overrides = {}
@@ -187,7 +186,7 @@ def import_rows(
         elif value is not None:
             # If `key` is not in `schema_overrides`, then we infer its type from the data.
             # The column type will always be nullable by default.
-            col_type = pxt.ColumnType.infer_literal_type(value)
+            col_type = pxt.ColumnType.infer_literal_type(value, nullable=True)
             if col_name not in schema:
                 schema[col_name] = col_type
             else:
@@ -230,8 +229,8 @@ def import_json(
     **kwargs: Any
 ) -> Table:
     """
-    Creates a new `Table` from a JSON file. This is a convenience method and is equivalent
-    to calling `import_data(table_path, json.loads(file_contents, **kwargs), ...)`, where `file_contents`
+    Creates a new base table from a JSON file. This is a convenience method and is
+    equivalent to calling `import_data(table_path, json.loads(file_contents, **kwargs), ...)`, where `file_contents`
     is the contents of the specified `filepath_or_url`.

     Args:
@@ -245,7 +244,7 @@ def import_json(
         kwargs: Additional keyword arguments to pass to `json.loads`.

     Returns:
-        The newly created table.
+        A handle to the newly created [`Table`][pixeltable.Table].
     """
     import json
     import urllib.parse
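`import_rows` infers each column's type from the values it sees, and after this change the inferred type is explicitly nullable (`infer_literal_type(value, nullable=True)`); `schema_overrides` pins a column to a chosen type instead. A hedged usage sketch, assuming the function is exposed as `pxt.io.import_rows` as in the released API, with illustrative table and column names:

    import pixeltable as pxt

    rows = [
        {'name': 'Alice', 'score': 9.5},
        {'name': 'Bob'},  # missing values are fine: inferred column types are nullable
    ]
    t = pxt.io.import_rows(
        'people',
        rows,
        schema_overrides={'score': pxt.FloatType(nullable=True)},  # no inference for `score`
    )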
pixeltable/io/label_studio.py CHANGED

@@ -105,7 +105,7 @@ class LabelStudioProject(Project):
         return {ANNOTATIONS_COLUMN: pxt.JsonType(nullable=True)}

     def sync(self, t: Table, export_data: bool, import_data: bool) -> SyncStatus:
-        _logger.info(f'Syncing Label Studio project "{self.project_title}" with table `{t.name}`'
+        _logger.info(f'Syncing Label Studio project "{self.project_title}" with table `{t._name}`'
                      f' (export: {export_data}, import: {import_data}).')
         # Collect all existing tasks into a dict with entries `rowid: task`
         tasks = {tuple(task['meta']['rowid']): task for task in self.__fetch_all_tasks()}
@@ -396,15 +396,15 @@ class LabelStudioProject(Project):
         updates = [{'_rowid': rowid, local_annotations_col.name: ann} for rowid, ann in annotations.items()]
         if len(updates) > 0:
             _logger.info(
-                f'Updating table `{t.name}`, column `{local_annotations_col.name}` with {len(updates)} total annotations.'
+                f'Updating table `{t._name}`, column `{local_annotations_col.name}` with {len(updates)} total annotations.'
             )
             # batch_update currently doesn't propagate from views to base tables. As a workaround, we call
             # batch_update on the actual ancestor table that holds the annotations column.
             # TODO(aaron-siegel): Simplify this once propagation is properly implemented in batch_update
             ancestor = t
             while local_annotations_col not in ancestor._tbl_version.cols:
-                assert ancestor.base is not None
-                ancestor = ancestor.base
+                assert ancestor._base is not None
+                ancestor = ancestor._base
             update_status = ancestor.batch_update(updates)
             print(f'Updated annotation(s) from {len(updates)} task(s) in {self}.')
             return SyncStatus(pxt_rows_updated=update_status.num_rows, num_excs=update_status.num_excs)
@@ -565,7 +565,7 @@ class LabelStudioProject(Project):

         if title is None:
             # `title` defaults to table name
-            title = t.name
+            title = t._name

         # Create a column to hold the annotations, if one does not yet exist
         if col_mapping is None or ANNOTATIONS_COLUMN in col_mapping.values():
@@ -573,7 +573,7 @@ class LabelStudioProject(Project):
             local_annotations_column = ANNOTATIONS_COLUMN
         else:
             local_annotations_column = next(k for k, v in col_mapping.items() if v == ANNOTATIONS_COLUMN)
-        if local_annotations_column not in t.column_names():
+        if local_annotations_column not in t._schema.keys():
             t[local_annotations_column] = pxt.JsonType(nullable=True)

         resolved_col_mapping = cls.validate_columns(
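The `_base`/`_name` renames aside, the interesting logic above is the workaround for `batch_update` not propagating from views to base tables: walk the view chain until reaching the table whose version object physically stores the annotations column, then update there. The same shape, extracted into a helper for readability (a sketch only, relying on the same private `_tbl_version` and `_base` attributes the diff uses):

    def owning_ancestor(t, col):
        # Follow t, t._base, t._base._base, ... until we find the table whose
        # TableVersion actually holds `col`; batch_update must run on that table
        # because updates do not yet propagate from views to their bases.
        ancestor = t
        while col not in ancestor._tbl_version.cols:
            assert ancestor._base is not None  # `col` must live somewhere up the chain
            ancestor = ancestor._base
        return ancestor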
pixeltable/io/pandas.py CHANGED

@@ -15,11 +15,12 @@ def import_pandas(
     primary_key: Optional[Union[str, list[str]]] = None,
     num_retained_versions: int = 10,
     comment: str = ''
-) -> pxt.InsertableTable:
-    """Creates a new `Table` from a Pandas `DataFrame`, with the specified name.
-    The schema of the table will be inferred from the `DataFrame`.
+) -> pxt.Table:
+    """Creates a new base table from a Pandas
+    [`DataFrame`](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html), with the
+    specified name. The schema of the table will be inferred from the DataFrame.

-    The column names of the new `Table` will be identical to those in the `DataFrame`, as long as they are valid
+    The column names of the new table will be identical to those in the DataFrame, as long as they are valid
     Pixeltable identifiers. If a column name is not a valid Pixeltable identifier, it will be normalized according to
     the following procedure:
     - first replace any non-alphanumeric characters with underscores;
@@ -33,6 +34,9 @@ def import_pandas(
         name `name` will be given type `type`, instead of being inferred from the `DataFrame`. The keys in
         `schema_overrides` should be the column names of the `DataFrame` (whether or not they are valid
         Pixeltable identifiers).
+
+    Returns:
+        A handle to the newly created [`Table`][pixeltable.Table].
     """
     if schema_overrides is None:
         schema_overrides = {}
@@ -54,11 +58,15 @@ def import_csv(
     num_retained_versions: int = 10,
     comment: str = '',
     **kwargs
-) -> pxt.InsertableTable:
+) -> pxt.Table:
     """
-    Creates a new `Table` from a csv file. This is a convenience method and is equivalent
+    Creates a new base table from a csv file. This is a convenience method and is equivalent
     to calling `import_pandas(table_path, pd.read_csv(filepath_or_buffer, **kwargs), schema=schema)`.
-    See the Pandas documentation for `read_csv` for more details.
+    See the Pandas documentation for [`read_csv`](https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html)
+    for more details.
+
+    Returns:
+        A handle to the newly created [`Table`][pixeltable.Table].
     """
     df = pd.read_csv(filepath_or_buffer, **kwargs)
     return import_pandas(tbl_name, df, schema_overrides=schema_overrides, primary_key=primary_key, num_retained_versions=num_retained_versions, comment=comment)
@@ -70,11 +78,15 @@ def import_excel(
     num_retained_versions: int = 10,
     comment: str = '',
     **kwargs
-) -> pxt.InsertableTable:
+) -> pxt.Table:
     """
-    Creates a new `Table` from an Excel (.xlsx) file. This is a convenience method and is equivalent
-    to calling `import_pandas(table_path, pd.read_excel(io, *args, **kwargs), schema=schema)`.
-    See the Pandas documentation for `read_excel` for more details.
+    Creates a new base table from an Excel (.xlsx) file. This is a convenience method and is
+    equivalent to calling `import_pandas(table_path, pd.read_excel(io, *args, **kwargs), schema=schema)`.
+    See the Pandas documentation for [`read_excel`](https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html)
+    for more details.
+
+    Returns:
+        A handle to the newly created [`Table`][pixeltable.Table].
     """
     df = pd.read_excel(io, *args, **kwargs)
     return import_pandas(tbl_name, df, schema_overrides=schema_overrides, primary_key=primary_key, num_retained_versions=num_retained_versions, comment=comment)
@@ -177,7 +189,10 @@ def __np_dtype_to_pxt_type(np_dtype: np.dtype, data_col: pd.Series, nullable: bool
         return pxt.FloatType(nullable=nullable)

     inferred_type = pxt.ColumnType.infer_common_literal_type(data_col)
-    if inferred_type is not None:
+    if inferred_type is None:
+        # Fallback on StringType if everything else fails
+        return pxt.StringType(nullable=nullable)
+    else:
         return inferred_type.copy(nullable=nullable)

     raise excs.Error(f'Could not infer Pixeltable type of column: {data_col.name} (dtype: {np_dtype})')
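`import_csv` and `import_excel` simply delegate to the corresponding Pandas readers and then `import_pandas`; the other change above means a column whose values defeat type inference now falls back to a `StringType` column instead of raising. A usage sketch (the file name and the override are illustrative):

    import pixeltable as pxt

    films = pxt.io.import_csv(
        'films',
        'films.csv',
        schema_overrides={'year': pxt.IntType(nullable=True)},  # pin the type, skip inference
    )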
pixeltable/io/parquet.py CHANGED

@@ -19,12 +19,14 @@ from pixeltable.utils.transactional_directory import transactional_directory
 if typing.TYPE_CHECKING:
     import pixeltable as pxt
     import pyarrow as pa
+    from pyarrow import parquet

 _logger = logging.getLogger(__name__)


 def _write_batch(value_batch: Dict[str, deque], schema: pa.Schema, output_path: Path) -> None:
     import pyarrow as pa
+    from pyarrow import parquet

     pydict = {}
     for field in schema:
@@ -35,7 +37,7 @@ def _write_batch(value_batch: Dict[str, deque], schema: pa.Schema, output_path: Path) -> None:
             pydict[field.name] = value_batch[field.name]

     tab = pa.Table.from_pydict(pydict, schema=schema)
-    pa.parquet.write_table(tab, output_path)
+    parquet.write_table(tab, output_path)


 def save_parquet(df: pxt.DataFrame, dest_path: Path, partition_size_bytes: int = 100_000_000) -> None:
@@ -55,23 +57,21 @@ def save_parquet(df: pxt.DataFrame, dest_path: Path, partition_size_bytes: int = 100_000_000) -> None:
     """
     from pixeltable.utils.arrow import to_arrow_schema

-    column_names = df.get_column_names()
-    column_types = df.get_column_types()
-    type_dict = {k: v.as_dict() for k, v in zip(column_names, column_types)}
-    arrow_schema = to_arrow_schema(dict(zip(column_names, column_types)))
+    type_dict = {k: v.as_dict() for k, v in df.schema.items()}
+    arrow_schema = to_arrow_schema(df.schema)

     # store the changes atomically
     with transactional_directory(dest_path) as temp_path:
         # dump metadata json file so we can inspect what was the source of the parquet file later on.
-        json.dump(df.as_dict(), (temp_path / '.pixeltable.json').open('w'))
+        json.dump(df.as_dict(), (temp_path / '.pixeltable.json').open('w'))
         json.dump(type_dict, (temp_path / '.pixeltable.column_types.json').open('w'))  # keep type metadata

         batch_num = 0
-        current_value_batch: Dict[str, deque] = {k: deque() for k in column_names}
+        current_value_batch: Dict[str, deque] = {k: deque() for k in df.schema.keys()}
         current_byte_estimate = 0

-        for data_row in df._exec():
-            for col_name, col_type, e in zip(column_names, column_types, df._select_list_exprs):
+        for data_row in df._exec():
+            for (col_name, col_type), e in zip(df.schema.items(), df._select_list_exprs):
                 val = data_row[e.slot_idx]
                 if val is None:
                     current_value_batch[col_name].append(val)
@@ -122,7 +122,7 @@ def save_parquet(df: pxt.DataFrame, dest_path: Path, partition_size_bytes: int = 100_000_000) -> None:
             assert batch_num < 100_000, 'wrote too many parquet files, unclear ordering'
             _write_batch(current_value_batch, arrow_schema, temp_path / f'part-{batch_num:05d}.parquet')
             batch_num += 1
-            current_value_batch = {k: deque() for k in column_names}
+            current_value_batch = {k: deque() for k in df.schema.keys()}
             current_byte_estimate = 0

         _write_batch(current_value_batch, arrow_schema, temp_path / f'part-{batch_num:05d}.parquet')
@@ -130,11 +130,11 @@ def save_parquet(df: pxt.DataFrame, dest_path: Path, partition_size_bytes: int = 100_000_000) -> None:

 def parquet_schema_to_pixeltable_schema(parquet_path: str) -> Dict[str, Optional[ts.ColumnType]]:
     """Generate a default pixeltable schema for the given parquet file. Returns None for unknown types."""
-    import pyarrow.parquet
+    from pyarrow import parquet
     from pixeltable.utils.arrow import to_pixeltable_schema

     input_path = Path(parquet_path).expanduser()
-    parquet_dataset = pyarrow.parquet.ParquetDataset(input_path)
+    parquet_dataset = parquet.ParquetDataset(input_path)
     return to_pixeltable_schema(parquet_dataset.schema)


@@ -159,11 +159,11 @@ def import_parquet(
         The newly created table. The table will have loaded the data from the Parquet file(s).
     """
     import pixeltable as pxt
-    import pyarrow.parquet
+    from pyarrow import parquet
     from pixeltable.utils.arrow import iter_tuples

     input_path = Path(parquet_path).expanduser()
-    parquet_dataset = pyarrow.parquet.ParquetDataset(input_path)
+    parquet_dataset = parquet.ParquetDataset(input_path)

     schema = parquet_schema_to_pixeltable_schema(parquet_path)
     if schema_override is None:
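The recurring edit in this file replaces attribute-style access (`pyarrow.parquet....`) with an explicit `from pyarrow import parquet`; the submodule is not guaranteed to be reachable as an attribute after a bare `import pyarrow`. A self-contained round-trip exercising the same pyarrow calls the module now uses (file name is illustrative):

    import pyarrow as pa
    from pyarrow import parquet

    tab = pa.Table.from_pydict({'a': [1, 2, 3], 'b': ['x', 'y', 'z']})
    parquet.write_table(tab, 'part-00000.parquet')

    dataset = parquet.ParquetDataset('part-00000.parquet')
    print(dataset.schema)              # the schema import_parquet inspects
    print(dataset.read().to_pydict())  # {'a': [1, 2, 3], 'b': ['x', 'y', 'z']}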
pixeltable/iterators/document.py CHANGED

@@ -38,7 +38,7 @@ class DocumentSectionMetadata:
     sourceline: Optional[int] = None
     # the stack of headings up to the most recently observed one;
     # eg, if the most recent one was an h2, 'headings' would contain keys 1 and 2, but nothing below that
-    heading: Optional[Dict[int, str]] = None
+    heading: Optional[Dict[str, str]] = None

     # pdf-specific metadata
     page: Optional[int] = None
@@ -236,7 +236,7 @@ class DocumentSplitter(ComponentIterator):
         accumulated_text = []  # currently accumulated text
         # accumulate pieces then join before emit to avoid quadratic complexity of string concatenation

-        headings: Dict[int, str] = {}  # current state of observed headings (level -> text)
+        headings: Dict[str, str] = {}  # current state of observed headings (level -> text)
         sourceline = 0  # most recently seen sourceline

         def update_metadata(el: bs4.Tag) -> None:
@@ -244,12 +244,11 @@ class DocumentSplitter(ComponentIterator):
             nonlocal headings, sourceline
             sourceline = el.sourceline
             if el.name in _HTML_HEADINGS:
-                level = int(el.name[1])
                 # remove the previously seen lower levels
-                lower_levels = [l for l in headings if l > level]
+                lower_levels = [l for l in headings if l > el.name]
                 for l in lower_levels:
                     del headings[l]
-                headings[level] = el.get_text().strip()
+                headings[el.name] = el.get_text().strip()

         def emit() -> None:
             nonlocal accumulated_text, headings, sourceline
@@ -295,13 +294,14 @@ class DocumentSplitter(ComponentIterator):
         # current state
         accumulated_text = []  # currently accumulated text
         # accumulate pieces then join before emit to avoid quadratic complexity of string concatenation
-        headings: Dict[int, str] = {}  # current state of observed headings (level -> text)
+        headings: Dict[str, str] = {}  # current state of observed headings (level -> text)

         def update_headings(heading: Dict) -> None:
             # update current state
             nonlocal headings
             assert 'type' in heading and heading['type'] == 'heading'
-            level = heading['attrs']['level']
+            lint = heading['attrs']['level']
+            level = f'h{lint}'
             text = heading['children'][0]['raw'].strip()
             # remove the previously seen lower levels
             lower_levels = [l for l in headings.keys() if l > level]
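The heading-metadata change swaps integer levels for tag-name keys ('h1' through 'h6'). Since HTML heading levels are single digits, lexicographic string comparison behaves exactly like numeric comparison ('h2' < 'h3'), so the existing "drop the deeper levels" logic carries over unchanged. A small standalone sketch of the new state update (heading texts are invented):

    headings = {'h1': 'Intro', 'h2': 'Background', 'h3': 'Details'}

    def observe_heading(tag_name: str, text: str) -> None:
        # Seeing e.g. an <h2> invalidates any deeper headings ('h3'..'h6');
        # string comparison suffices because 'h1' < 'h2' < ... < 'h6'.
        for level in [l for l in headings if l > tag_name]:
            del headings[level]
        headings[tag_name] = text

    observe_heading('h2', 'Methods')
    print(headings)  # {'h1': 'Intro', 'h2': 'Methods'}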