PyPI - pixeltable - Versions diffs - 0.2.20__py3-none-any.whl → 0.2.22__py3-none-any.whl - Mend

pixeltable 0.2.20__py3-none-any.whl → 0.2.22__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of pixeltable might be problematic. Click here for more details.

Files changed (120) hide show

pixeltable/__init__.py +7 -19
pixeltable/__version__.py +2 -2
pixeltable/catalog/__init__.py +7 -7
pixeltable/catalog/column.py +37 -11
pixeltable/catalog/globals.py +21 -0
pixeltable/catalog/insertable_table.py +6 -4
pixeltable/catalog/table.py +227 -148
pixeltable/catalog/table_version.py +66 -28
pixeltable/catalog/table_version_path.py +0 -8
pixeltable/catalog/view.py +18 -19
pixeltable/dataframe.py +16 -32
pixeltable/env.py +6 -1
pixeltable/exec/__init__.py +1 -2
pixeltable/exec/aggregation_node.py +27 -17
pixeltable/exec/cache_prefetch_node.py +1 -1
pixeltable/exec/data_row_batch.py +9 -26
pixeltable/exec/exec_node.py +36 -7
pixeltable/exec/expr_eval_node.py +19 -11
pixeltable/exec/in_memory_data_node.py +14 -11
pixeltable/exec/sql_node.py +266 -138
pixeltable/exprs/__init__.py +1 -0
pixeltable/exprs/arithmetic_expr.py +3 -1
pixeltable/exprs/array_slice.py +7 -7
pixeltable/exprs/column_property_ref.py +37 -10
pixeltable/exprs/column_ref.py +93 -14
pixeltable/exprs/comparison.py +5 -5
pixeltable/exprs/compound_predicate.py +8 -7
pixeltable/exprs/data_row.py +56 -36
pixeltable/exprs/expr.py +65 -63
pixeltable/exprs/expr_dict.py +55 -0
pixeltable/exprs/expr_set.py +26 -15
pixeltable/exprs/function_call.py +53 -24
pixeltable/exprs/globals.py +4 -1
pixeltable/exprs/in_predicate.py +8 -7
pixeltable/exprs/inline_expr.py +4 -4
pixeltable/exprs/is_null.py +4 -4
pixeltable/exprs/json_mapper.py +11 -12
pixeltable/exprs/json_path.py +5 -10
pixeltable/exprs/literal.py +5 -5
pixeltable/exprs/method_ref.py +5 -4
pixeltable/exprs/object_ref.py +2 -1
pixeltable/exprs/row_builder.py +88 -36
pixeltable/exprs/rowid_ref.py +14 -13
pixeltable/exprs/similarity_expr.py +12 -7
pixeltable/exprs/sql_element_cache.py +12 -6
pixeltable/exprs/type_cast.py +8 -6
pixeltable/exprs/variable.py +5 -4
pixeltable/ext/functions/whisperx.py +7 -2
pixeltable/func/aggregate_function.py +1 -1
pixeltable/func/callable_function.py +2 -2
pixeltable/func/function.py +11 -10
pixeltable/func/function_registry.py +6 -7
pixeltable/func/query_template_function.py +11 -12
pixeltable/func/signature.py +17 -15
pixeltable/func/udf.py +0 -4
pixeltable/functions/__init__.py +2 -2
pixeltable/functions/audio.py +4 -6
pixeltable/functions/globals.py +84 -42
pixeltable/functions/huggingface.py +31 -34
pixeltable/functions/image.py +59 -45
pixeltable/functions/json.py +0 -1
pixeltable/functions/llama_cpp.py +106 -0
pixeltable/functions/mistralai.py +2 -2
pixeltable/functions/ollama.py +147 -0
pixeltable/functions/openai.py +22 -25
pixeltable/functions/replicate.py +72 -0
pixeltable/functions/string.py +59 -50
pixeltable/functions/timestamp.py +20 -20
pixeltable/functions/together.py +2 -2
pixeltable/functions/video.py +11 -20
pixeltable/functions/whisper.py +2 -20
pixeltable/globals.py +65 -74
pixeltable/index/base.py +2 -2
pixeltable/index/btree.py +20 -7
pixeltable/index/embedding_index.py +12 -14
pixeltable/io/__init__.py +1 -2
pixeltable/io/external_store.py +11 -5
pixeltable/io/fiftyone.py +178 -0
pixeltable/io/globals.py +98 -2
pixeltable/io/hf_datasets.py +1 -1
pixeltable/io/label_studio.py +6 -6
pixeltable/io/parquet.py +14 -13
pixeltable/iterators/base.py +3 -2
pixeltable/iterators/document.py +10 -8
pixeltable/iterators/video.py +126 -60
pixeltable/metadata/__init__.py +4 -3
pixeltable/metadata/converters/convert_14.py +4 -2
pixeltable/metadata/converters/convert_15.py +1 -1
pixeltable/metadata/converters/convert_19.py +1 -0
pixeltable/metadata/converters/convert_20.py +1 -1
pixeltable/metadata/converters/convert_21.py +34 -0
pixeltable/metadata/converters/util.py +54 -12
pixeltable/metadata/notes.py +1 -0
pixeltable/metadata/schema.py +40 -21
pixeltable/plan.py +149 -165
pixeltable/py.typed +0 -0
pixeltable/store.py +57 -37
pixeltable/tool/create_test_db_dump.py +6 -6
pixeltable/tool/create_test_video.py +1 -1
pixeltable/tool/doc_plugins/griffe.py +3 -34
pixeltable/tool/embed_udf.py +1 -1
pixeltable/tool/mypy_plugin.py +55 -0
pixeltable/type_system.py +260 -61
pixeltable/utils/arrow.py +10 -9
pixeltable/utils/coco.py +4 -4
pixeltable/utils/documents.py +16 -2
pixeltable/utils/filecache.py +9 -9
pixeltable/utils/formatter.py +10 -11
pixeltable/utils/http_server.py +2 -5
pixeltable/utils/media_store.py +6 -6
pixeltable/utils/pytorch.py +10 -11
pixeltable/utils/sql.py +2 -1
{pixeltable-0.2.20.dist-info → pixeltable-0.2.22.dist-info}/METADATA +50 -13
pixeltable-0.2.22.dist-info/RECORD +153 -0
pixeltable/exec/media_validation_node.py +0 -43
pixeltable/utils/help.py +0 -11
pixeltable-0.2.20.dist-info/RECORD +0 -147
{pixeltable-0.2.20.dist-info → pixeltable-0.2.22.dist-info}/LICENSE +0 -0
{pixeltable-0.2.20.dist-info → pixeltable-0.2.22.dist-info}/WHEEL +0 -0
{pixeltable-0.2.20.dist-info → pixeltable-0.2.22.dist-info}/entry_points.txt +0 -0

pixeltable/io/fiftyone.py ADDED Viewed

@@ -0,0 +1,178 @@
+import os
+from typing import Iterator, Optional, Union
+import fiftyone as fo  # type: ignore[import-untyped]
+import fiftyone.utils.data as foud  # type: ignore[import-untyped]
+import PIL.Image
+import puremagic
+import pixeltable as pxt
+import pixeltable.exceptions as excs
+from pixeltable import exprs
+from pixeltable.env import Env
+class PxtImageDatasetImporter(foud.LabeledImageDatasetImporter):
+    """
+    Implementation of a FiftyOne `DatasetImporter` that reads image data from a Pixeltable table.
+    """
+    __image_format: str  # format to use for any exported images that are not already stored on disk
+    __labels: dict[str, tuple[exprs.Expr, type[fo.Label]]]  # label_name -> (expr, label_cls)
+    __image_idx: int  # index of the image expr in the select list
+    __localpath_idx: Optional[int]  # index of the image localpath in the select list, if present
+    __row_iter: Iterator[list]  # iterator over the table rows, to be convered to FiftyOne samples
+    def __init__(
+        self,
+        tbl: pxt.Table,
+        image: exprs.Expr,
+        image_format: str,
+        classifications: Union[exprs.Expr, list[exprs.Expr], dict[str, exprs.Expr], None] = None,
+        detections: Union[exprs.Expr, list[exprs.Expr], dict[str, exprs.Expr], None] = None,
+        dataset_dir: Optional[os.PathLike] = None,
+        shuffle: bool = False,
+        seed: Union[int, float, str, bytes, bytearray, None] = None,
+        max_samples: Optional[int] = None,
+    ):
+        super().__init__(
+            dataset_dir=dataset_dir,
+            shuffle=shuffle,
+            seed=seed,
+            max_samples=max_samples
+        )
+        self.__image_format = image_format
+        label_categories = [
+            (classifications, fo.Classifications, 'classifications'),
+            (detections, fo.Detections, 'detections'),
+        ]
+        # Construct the labels. First add labels for all label types that have named dictionaries.
+        self.__labels = {}
+        for exprs_, label_cls, _ in label_categories:
+            if isinstance(exprs_, dict):
+                for label_name, expr in exprs_.items():
+                    if not label_name.isidentifier():
+                        raise excs.Error(f"Invalid label name: {label_name}")
+                    if label_name in self.__labels:
+                        raise excs.Error(f"Duplicate label name: {label_name}")
+                    self.__labels[label_name] = (expr, label_cls)
+        # Now add the remaining labels, assigning unused default names.
+        for exprs_, label_cls, default_name in label_categories:
+            if exprs_ is None or isinstance(exprs_, dict):
+                continue
+            if isinstance(exprs_, exprs.Expr):
+                exprs_ = [exprs_]
+            assert isinstance(exprs_, list)
+            for expr in exprs_:
+                if default_name not in self.__labels:
+                    name = default_name
+                else:
+                    i = 1
+                    while f'{default_name}_{i}' in self.__labels:
+                        i += 1
+                    name = f'{default_name}_{i}'
+                self.__labels[name] = (expr, label_cls)
+        # Build the select list:
+        # - Labels first, in the order they appear in self.__labels
+        # - Then the `image` expr
+        # - Then `image.localpath`, if `images` is a stored columnref
+        selection = [expr for expr, _ in self.__labels.values()]
+        self.__image_idx = len(selection)
+        selection.append(image)
+        if isinstance(image, exprs.ColumnRef) and image.col.is_stored:
+            # A stored image column; we can use the existing localpaths
+            self.__localpath_idx = len(selection)
+            selection.append(image.localpath)
+        else:
+            self.__localpath_idx = None
+        df = tbl.select(*selection)
+        self.__row_iter = df._output_row_iterator()
+    def __next__(self) -> tuple[str, Optional[fo.ImageMetadata], Optional[dict[str, fo.Label]]]:
+        row = next(self.__row_iter)
+        img = row[self.__image_idx]
+        assert isinstance(img, PIL.Image.Image)
+        if self.__localpath_idx is not None:
+            # Use the existing localpath of the stored image
+            file = row[self.__localpath_idx]
+            assert isinstance(file, str)
+        else:
+            # Write the dynamically created image to a temp file
+            file = str(Env.get().create_tmp_path(f'.{self.__image_format}'))
+            img.save(file, format=self.__image_format)
+        metadata = fo.ImageMetadata(
+            size_bytes=os.path.getsize(file),
+            mime_type=puremagic.from_file(file, mime=True),
+            width=img.width,
+            height=img.height,
+            filepath=file,
+            num_channels=len(img.getbands()),
+        )
+        labels: dict[str, fo.Label] = {}
+        for idx, (label_name, (_, label_cls)) in enumerate(self.__labels.items()):
+            label_data = row[idx]
+            if label_data is None:
+                continue
+            label: fo.Label
+            if label_cls is fo.Classifications:
+                label = fo.Classifications(classifications=self.__as_fo_classifications(label_data))
+            elif label_cls is fo.Detections:
+                label = fo.Detections(detections=self.__as_fo_detections(label_data))
+            else:
+                assert False
+            labels[label_name] = label
+        return file, metadata, labels
+    def __as_fo_classifications(self, data: list) -> list[fo.Classification]:
+        if not isinstance(data, list) or any('label' not in entry for entry in data):
+            raise excs.Error(
+                f'Invalid classifications data: {data}\n'
+                "(Expected a list of dicts, each containing a 'label' key)"
+            )
+        return [
+            fo.Classification(label=entry['label'], confidence=entry.get('confidence'))
+            for entry in data
+        ]
+    def __as_fo_detections(self, data: list) -> list[fo.Detections]:
+        if not isinstance(data, list) or any('label' not in entry or 'bounding_box' not in entry for entry in data):
+            raise excs.Error(
+                f'Invalid detections data: {data}\n'
+                "(Expected a list of dicts, each containing a 'label' and 'bounding_box' key)"
+            )
+        return [
+            fo.Detection(label=entry['label'], bounding_box=entry['bounding_box'], confidence=entry.get('confidence'))
+            for entry in data
+        ]
+    @property
+    def has_dataset_info(self) -> bool:
+        return False
+    @property
+    def has_image_metadata(self) -> bool:
+        return True
+    @property
+    def label_cls(self) -> dict[str, type]:
+        return {label_name: label_cls for label_name, (_, label_cls) in self.__labels.items()}
+    def setup(self) -> None:
+        pass
+    def get_dataset_info(self) -> dict:
+        pass
+    def close(self, *args) -> None:
+        pass

pixeltable/io/globals.py CHANGED Viewed

@@ -1,10 +1,14 @@
-from typing import Any, Literal, Optional, Union
+from typing import TYPE_CHECKING, Any, Literal, Optional, Union
 import pixeltable as pxt
 import pixeltable.exceptions as excs
-from pixeltable import Table
+from pixeltable import Table, exprs
+from pixeltable.env import Env
 from pixeltable.io.external_store import SyncStatus
+if TYPE_CHECKING:
+    import fiftyone as fo  # type: ignore[import-untyped]
 def create_label_studio_project(
         t: Table,
@@ -116,6 +120,8 @@ def create_label_studio_project(
                 s3_configuration={'bucket': 'my-bucket', 'region_name': 'us-east-2'}
             )
     """
+    Env.get().require_package('label_studio_sdk')
     from pixeltable.io.label_studio import LabelStudioProject
     ls_project = LabelStudioProject.create(
@@ -187,6 +193,8 @@ def import_rows(
                 # If `key` is not in `schema_overrides`, then we infer its type from the data.
                 # The column type will always be nullable by default.
                 col_type = pxt.ColumnType.infer_literal_type(value, nullable=True)
+                if col_type is None:
+                    raise excs.Error(f'Could not infer type for column `{col_name}`; the value in row {n} has an unsupported type: {type(value)}')
                 if col_name not in schema:
                     schema[col_name] = col_type
                 else:
@@ -265,3 +273,91 @@ def import_json(
         contents = urllib.request.urlopen(filepath_or_url).read()
     data = json.loads(contents, **kwargs)
     return import_rows(tbl_path, data, schema_overrides=schema_overrides, primary_key=primary_key, num_retained_versions=num_retained_versions, comment=comment)
+def export_images_as_fo_dataset(
+    tbl: pxt.Table,
+    images: exprs.Expr,
+    image_format: str = 'webp',
+    classifications: Union[exprs.Expr, list[exprs.Expr], dict[str, exprs.Expr], None] = None,
+    detections: Union[exprs.Expr, list[exprs.Expr], dict[str, exprs.Expr], None] = None,
+) -> 'fo.Dataset':
+    """
+    Export images from a Pixeltable table as a Voxel51 dataset. The data must consist of a single column
+    (or expression) containing image data, along with optional additional columns containing labels. Currently, only
+    classification and detection labels are supported.
+    The [Working with Voxel51 in Pixeltable](https://docs.pixeltable.com/docs/working-with-voxel51) tutorial contains a
+    fully worked example showing how to export data from a Pixeltable table and load it into Voxel51.
+    Images in the dataset that already exist on disk will be exported directly, in whatever format they
+    are stored in. Images that are not already on disk (such as frames extracted using a
+    [`FrameIterator`][pixeltable.iterators.FrameIterator]) will first be written to disk in the specified
+    `image_format`.
+    The label parameters accept one or more sets of labels of each type. If a single `Expr` is provided, then it will
+    be exported as a single set of labels with a default name such as `classifications`.
+    (The single set of labels may still containing multiple individual labels; see below.)
+    If a list of `Expr`s is provided, then each one will be exported as a separate set of labels with a default name
+    such as `classifications`, `classifications_1`, etc. If a dictionary of `Expr`s is provided, then each entry will
+    be exported as a set of labels with the specified name.
+    __Requirements:__
+    - `pip install fiftyone`
+    Args:
+        tbl: The table from which to export data.
+        images: A column or expression that contains the images to export.
+        image_format: The format to use when writing out images for export.
+        classifications: Optional image classification labels. If a single `Expr` is provided, it must be a table
+            column or an expression that evaluates to a list of dictionaries. Each dictionary in the list corresponds
+            to an image class and must have the following structure:
+            ```python
+            {'label': 'zebra', 'confidence': 0.325}
+            ```
+            If multiple `Expr`s are provided, each one must evaluate to a list of such dictionaries.
+        detections: Optional image detection labels. If a single `Expr` is provided, it must be a table column or an
+            expression that evaluates to a list of dictionaries. Each dictionary in the list corresponds to an image
+            detection, and must have the following structure:
+            ```python
+            {
+                'label': 'giraffe',
+                'confidence': 0.99,
+                'bounding_box': [0.081, 0.836, 0.202, 0.136]  # [x, y, w, h], fractional coordinates
+            }
+            ```
+            If multiple `Expr`s are provided, each one must evaluate to a list of such dictionaries.
+    Returns:
+        A Voxel51 dataset.
+    Example:
+        Export the images in the `image` column of the table `tbl` as a Voxel51 dataset, using classification
+        labels from `tbl.classifications`:
+        >>> export_as_fiftyone(
+        ...     tbl,
+        ...     tbl.image,
+        ...     classifications=tbl.classifications
+        ... )
+        See the [Working with Voxel51 in Pixeltable](https://docs.pixeltable.com/docs/working-with-voxel51) tutorial
+        for a fully worked example.
+    """
+    Env.get().require_package('fiftyone')
+    import fiftyone as fo
+    from pixeltable.io.fiftyone import PxtImageDatasetImporter
+    if not images.col_type.is_image_type():
+        raise excs.Error(f'`images` must be an expression of type Image (got {images.col_type._to_base_str()})')
+    return fo.Dataset.from_importer(PxtImageDatasetImporter(
+        tbl, images, image_format, classifications=classifications, detections=detections
+    ))

pixeltable/io/hf_datasets.py CHANGED Viewed

@@ -11,7 +11,7 @@ import pixeltable.type_system as ts
 from pixeltable import exceptions as excs
 if typing.TYPE_CHECKING:
-    import datasets
+    import datasets  # type: ignore[import-untyped]
 _logger = logging.getLogger(__name__)

pixeltable/io/label_studio.py CHANGED Viewed

@@ -4,17 +4,17 @@ import logging
 import os
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Any, Iterator, Optional, Literal
+from typing import Any, Iterator, Literal, Optional, cast
 from xml.etree import ElementTree
+import label_studio_sdk  # type: ignore[import-untyped]
 import PIL.Image
-import label_studio_sdk
 from requests.exceptions import HTTPError
 import pixeltable as pxt
 import pixeltable.env as env
 import pixeltable.exceptions as excs
-from pixeltable import Table, Column
+from pixeltable import Column, Table
 from pixeltable.exprs import ColumnRef, DataRow, Expr
 from pixeltable.io.external_store import Project, SyncStatus
 from pixeltable.utils import coco
@@ -211,7 +211,7 @@ class LabelStudioProject(Project):
                     assert isinstance(row[media_col_idx], PIL.Image.Image)
                     file = env.Env.get().create_tmp_path(extension='.png')
                     row[media_col_idx].save(file, format='png')
-                    task_id: int = self.project.import_tasks(file)[0]
+                    task_id = self.project.import_tasks(file)[0]
                     os.remove(file)
                 # Update the task with `rowid` metadata
@@ -256,7 +256,7 @@ class LabelStudioProject(Project):
                 assert self.media_import_method == 'file'
                 if not col.col_type.is_media_type():
                     # Not a media column; query the data directly
-                    expr_refs[col_name] = t[col_name]
+                    expr_refs[col_name] = cast(ColumnRef, t[col_name])
                 elif col in self.stored_proxies:
                     # Media column that has a stored proxy; use it. We have to give it a name,
                     # since it's an anonymous column
@@ -267,7 +267,7 @@ class LabelStudioProject(Project):
                     # and we can just use the localpath
                     expr_refs[col_name] = t[col_name].localpath
-        df = t.select(*[t[col] for col in t_rl_cols], **expr_refs)
+        df = t.select(*[t[col.name] for col in t_rl_cols], **expr_refs)
         # The following buffers will hold `DataRow` indices that correspond to each of the selected
         # columns. `rl_col_idxs` holds the indices for the columns that map to RectangleLabels
         # preannotations; `data_col_idxs` holds the indices for the columns that map to data fields.

pixeltable/io/parquet.py CHANGED Viewed

@@ -7,24 +7,23 @@ import random
 import typing
 from collections import deque
 from pathlib import Path
-from typing import Dict, Optional, Any
+from typing import Any, Optional
-import PIL.Image
 import numpy as np
+import PIL.Image
 import pixeltable.exceptions as exc
 import pixeltable.type_system as ts
 from pixeltable.utils.transactional_directory import transactional_directory
 if typing.TYPE_CHECKING:
-    import pixeltable as pxt
     import pyarrow as pa
-    from pyarrow import parquet
+    import pixeltable as pxt
 _logger = logging.getLogger(__name__)
-def _write_batch(value_batch: Dict[str, deque], schema: pa.Schema, output_path: Path) -> None:
+def _write_batch(value_batch: dict[str, deque], schema: pa.Schema, output_path: Path) -> None:
     import pyarrow as pa
     from pyarrow import parquet
@@ -37,7 +36,7 @@ def _write_batch(value_batch: Dict[str, deque], schema: pa.Schema, output_path:
             pydict[field.name] = value_batch[field.name]
     tab = pa.Table.from_pydict(pydict, schema=schema)
-    parquet.write_table(tab, output_path)
+    parquet.write_table(tab, str(output_path))
 def save_parquet(df: pxt.DataFrame, dest_path: Path, partition_size_bytes: int = 100_000_000) -> None:
@@ -67,7 +66,7 @@ def save_parquet(df: pxt.DataFrame, dest_path: Path, partition_size_bytes: int =
         json.dump(type_dict, (temp_path / '.pixeltable.column_types.json').open('w'))  # keep type metadata
         batch_num = 0
-        current_value_batch: Dict[str, deque] = {k: deque() for k in df.schema.keys()}
+        current_value_batch: dict[str, deque] = {k: deque() for k in df.schema.keys()}
         current_byte_estimate = 0
         for data_row in df._exec():
@@ -128,13 +127,14 @@ def save_parquet(df: pxt.DataFrame, dest_path: Path, partition_size_bytes: int =
         _write_batch(current_value_batch, arrow_schema, temp_path / f'part-{batch_num:05d}.parquet')
-def parquet_schema_to_pixeltable_schema(parquet_path: str) -> Dict[str, Optional[ts.ColumnType]]:
+def parquet_schema_to_pixeltable_schema(parquet_path: str) -> dict[str, Optional[ts.ColumnType]]:
     """Generate a default pixeltable schema for the given parquet file. Returns None for unknown types."""
     from pyarrow import parquet
     from pixeltable.utils.arrow import to_pixeltable_schema
     input_path = Path(parquet_path).expanduser()
-    parquet_dataset = parquet.ParquetDataset(input_path)
+    parquet_dataset = parquet.ParquetDataset(str(input_path))
     return to_pixeltable_schema(parquet_dataset.schema)
@@ -142,7 +142,7 @@ def import_parquet(
     table_path: str,
     *,
     parquet_path: str,
-    schema_overrides: Optional[Dict[str, ts.ColumnType]] = None,
+    schema_overrides: Optional[dict[str, ts.ColumnType]] = None,
     **kwargs: Any,
 ) -> pxt.Table:
     """Creates a new base table from a Parquet file or set of files. Requires pyarrow to be installed.
@@ -159,12 +159,13 @@ def import_parquet(
     Returns:
         A handle to the newly created [`Table`][pixeltable.Table].
     """
-    import pixeltable as pxt
     from pyarrow import parquet
+    import pixeltable as pxt
     from pixeltable.utils.arrow import iter_tuples
     input_path = Path(parquet_path).expanduser()
-    parquet_dataset = parquet.ParquetDataset(input_path)
+    parquet_dataset = parquet.ParquetDataset(str(input_path))
     schema = parquet_schema_to_pixeltable_schema(parquet_path)
     if schema_overrides is None:
@@ -181,7 +182,7 @@ def import_parquet(
     try:
         tmp_name = f'{table_path}_tmp_{random.randint(0, 100000000)}'
         tab = pxt.create_table(tmp_name, schema, **kwargs)
-        for fragment in parquet_dataset.fragments:
+        for fragment in parquet_dataset.fragments:  # type: ignore[attr-defined]
             for batch in fragment.to_batches():
                 dict_batch = list(iter_tuples(batch))
                 tab.insert(dict_batch)

pixeltable/iterators/base.py CHANGED Viewed

@@ -1,6 +1,7 @@
 from __future__ import annotations
-from typing import Dict, Any, Tuple, List
-from abc import abstractmethod, ABC
+from abc import ABC, abstractmethod
+from typing import Any
 from pixeltable.type_system import ColumnType

pixeltable/iterators/document.py CHANGED Viewed

@@ -1,7 +1,7 @@
 import dataclasses
 import enum
 import logging
-from typing import Any, Iterable, Iterator, Optional
+from typing import Any, Iterable, Iterator, Optional, Union
 import ftfy
@@ -152,7 +152,7 @@ class DocumentSplitter(ComponentIterator):
             assert self._doc_handle.pdf_doc is not None
             self._sections = self._pdf_sections()
         else:
-            assert False, f'unknown document format: {self._doc_handle.format}'
+            assert False, f'Unsupported document format: {self._doc_handle.format}'
         if Separator.SENTENCE in self._separators:
             self._sections = self._sentence_sections(self._sections)
@@ -176,7 +176,7 @@ class DocumentSplitter(ComponentIterator):
     @classmethod
     def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ColumnType], list[str]]:
-        schema = {'text': StringType()}
+        schema: dict[str, ColumnType] = {'text': StringType()}
         md_fields = _parse_metadata(kwargs['metadata']) if 'metadata' in kwargs else []
         for md_field in md_fields:
@@ -214,7 +214,7 @@ class DocumentSplitter(ComponentIterator):
             section = next(self._sections)
             if section.text is None:
                 continue
-            result = {'text': section.text}
+            result: dict[str, Any] = {'text': section.text}
             for md_field in self._metadata_fields:
                 if md_field == ChunkMetadata.TITLE:
                     result[md_field.name.lower()] = self._doc_title
@@ -234,7 +234,7 @@ class DocumentSplitter(ComponentIterator):
         emit_on_paragraph = Separator.PARAGRAPH in self._separators or Separator.SENTENCE in self._separators
         emit_on_heading = Separator.HEADING in self._separators or emit_on_paragraph
         # current state
-        accumulated_text = []  # currently accumulated text
+        accumulated_text: list[str] = []  # currently accumulated text
         # accumulate pieces then join before emit to avoid quadratic complexity of string concatenation
         headings: dict[str, str] = {}   # current state of observed headings (level -> text)
@@ -260,9 +260,10 @@ class DocumentSplitter(ComponentIterator):
                 yield DocumentSection(text=full_text, metadata=md)
                 accumulated_text = []
-        def process_element(el: bs4.PageElement) -> Iterator[DocumentSection]:
+        def process_element(el: Union[bs4.element.Tag, bs4.NavigableString]) -> Iterator[DocumentSection]:
             # process the element and emit sections as necessary
             nonlocal accumulated_text, headings, sourceline, emit_on_heading, emit_on_paragraph
             if el.name in self._skip_tags:
                 return
@@ -282,6 +283,7 @@ class DocumentSplitter(ComponentIterator):
                     yield from emit()
                 update_metadata(el)
             for child in el.children:
+                assert isinstance(child, (bs4.element.Tag, bs4.NavigableString)), type(el)
                 yield from process_element(child)
         yield from process_element(self._doc_handle.bs_doc)
@@ -293,7 +295,7 @@ class DocumentSplitter(ComponentIterator):
         emit_on_paragraph = Separator.PARAGRAPH in self._separators or Separator.SENTENCE in self._separators
         emit_on_heading = Separator.HEADING in self._separators or emit_on_paragraph
         # current state
-        accumulated_text = []  # currently accumulated text
+        accumulated_text: list[str] = []  # currently accumulated text
         # accumulate pieces then join before emit to avoid quadratic complexity of string concatenation
         headings: dict[str, str] = {}   # current state of observed headings (level -> text)
@@ -347,7 +349,7 @@ class DocumentSplitter(ComponentIterator):
     def _pdf_sections(self) -> Iterator[DocumentSection]:
         """Create DocumentSections reflecting the pdf-specific separators"""
-        import fitz
+        import fitz  # type: ignore[import-untyped]
         doc: fitz.Document = self._doc_handle.pdf_doc
         assert doc is not None

pixeltable 0.2.20__py3-none-any.whl → 0.2.22__py3-none-any.whl

Potentially problematic release.

pixeltable 0.2.20__py3-none-any.whl → 0.2.22__py3-none-any.whl