pixeltable 0.2.6__py3-none-any.whl → 0.2.7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (56)
  1. pixeltable/__init__.py +3 -1
  2. pixeltable/__version__.py +2 -2
  3. pixeltable/catalog/column.py +14 -2
  4. pixeltable/catalog/insertable_table.py +32 -17
  5. pixeltable/catalog/table.py +194 -12
  6. pixeltable/catalog/table_version.py +270 -110
  7. pixeltable/catalog/table_version_path.py +6 -1
  8. pixeltable/datatransfer/__init__.py +1 -0
  9. pixeltable/datatransfer/label_studio.py +526 -0
  10. pixeltable/datatransfer/remote.py +113 -0
  11. pixeltable/env.py +156 -73
  12. pixeltable/exprs/column_ref.py +2 -2
  13. pixeltable/exprs/comparison.py +39 -1
  14. pixeltable/exprs/data_row.py +7 -0
  15. pixeltable/exprs/expr.py +11 -12
  16. pixeltable/exprs/function_call.py +0 -3
  17. pixeltable/exprs/globals.py +14 -2
  18. pixeltable/exprs/similarity_expr.py +5 -3
  19. pixeltable/ext/functions/whisperx.py +30 -0
  20. pixeltable/ext/functions/yolox.py +16 -0
  21. pixeltable/func/aggregate_function.py +2 -2
  22. pixeltable/func/expr_template_function.py +3 -1
  23. pixeltable/func/udf.py +2 -2
  24. pixeltable/functions/fireworks.py +9 -4
  25. pixeltable/functions/huggingface.py +25 -1
  26. pixeltable/functions/openai.py +15 -10
  27. pixeltable/functions/together.py +11 -6
  28. pixeltable/functions/util.py +0 -43
  29. pixeltable/functions/video.py +46 -8
  30. pixeltable/globals.py +20 -2
  31. pixeltable/index/__init__.py +1 -0
  32. pixeltable/index/base.py +6 -1
  33. pixeltable/index/btree.py +54 -0
  34. pixeltable/index/embedding_index.py +4 -1
  35. pixeltable/io/__init__.py +1 -0
  36. pixeltable/io/globals.py +59 -0
  37. pixeltable/iterators/base.py +4 -4
  38. pixeltable/iterators/document.py +26 -15
  39. pixeltable/iterators/video.py +9 -1
  40. pixeltable/metadata/__init__.py +2 -2
  41. pixeltable/metadata/converters/convert_14.py +13 -0
  42. pixeltable/metadata/converters/convert_15.py +29 -0
  43. pixeltable/metadata/converters/util.py +63 -0
  44. pixeltable/metadata/schema.py +12 -6
  45. pixeltable/plan.py +9 -5
  46. pixeltable/store.py +14 -21
  47. pixeltable/tool/create_test_db_dump.py +16 -0
  48. pixeltable/type_system.py +14 -4
  49. pixeltable/utils/coco.py +94 -0
  50. pixeltable-0.2.7.dist-info/METADATA +137 -0
  51. {pixeltable-0.2.6.dist-info → pixeltable-0.2.7.dist-info}/RECORD +53 -46
  52. pixeltable/func/nos_function.py +0 -202
  53. pixeltable/utils/clip.py +0 -18
  54. pixeltable-0.2.6.dist-info/METADATA +0 -131
  55. {pixeltable-0.2.6.dist-info → pixeltable-0.2.7.dist-info}/LICENSE +0 -0
  56. {pixeltable-0.2.6.dist-info → pixeltable-0.2.7.dist-info}/WHEEL +0 -0
pixeltable/functions/fireworks.py CHANGED
@@ -6,8 +6,13 @@ import pixeltable as pxt
 from pixeltable import env


-def fireworks_client() -> fireworks.client.Fireworks:
-    return env.Env.get().get_client('fireworks', lambda api_key: fireworks.client.Fireworks(api_key=api_key))
+@env.register_client('fireworks')
+def _(api_key: str) -> fireworks.client.Fireworks:
+    return fireworks.client.Fireworks(api_key=api_key)
+
+
+def _fireworks_client() -> fireworks.client.Fireworks:
+    return env.Env.get().get_client('fireworks')


 @pxt.udf
@@ -26,8 +31,8 @@ def chat_completions(
         'top_p': top_p,
         'temperature': temperature
     }
-    kwargs_not_none = dict(filter(lambda x: x[1] is not None, kwargs.items()))
-    return fireworks_client().chat.completions.create(
+    kwargs_not_none = {k: v for k, v in kwargs.items() if v is not None}
+    return _fireworks_client().chat.completions.create(
         model=model,
         messages=messages,
         **kwargs_not_none
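
For orientation, here is a minimal sketch of how the refactored Fireworks integration is meant to be used from a table. The table and column names are hypothetical, and a Fireworks API key is assumed to be configured in the Pixeltable environment; the client itself is now constructed lazily through the `@env.register_client('fireworks')` hook shown above.

```python
import pixeltable as pxt
from pixeltable.functions.fireworks import chat_completions

# hypothetical table holding one prompt per row
t = pxt.create_table('prompts', {'prompt': pxt.StringType()})

# computed column that calls the Fireworks chat-completions UDF;
# the client is created on first use via the registered factory
t['response'] = chat_completions(
    messages=[{'role': 'user', 'content': t.prompt}],
    model='accounts/fireworks/models/mixtral-8x7b-instruct',  # example model id
)
```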
pixeltable/functions/huggingface.py CHANGED
@@ -1,4 +1,4 @@
-from typing import Callable, TypeVar, Optional
+from typing import Callable, TypeVar, Optional, Any

 import PIL.Image
 import numpy as np
@@ -14,6 +14,7 @@ from pixeltable.functions.util import resolve_torch_device
 def sentence_transformer(
     sentences: Batch[str], *, model_id: str, normalize_embeddings: bool = False
 ) -> Batch[np.ndarray]:
+    """Runs the specified sentence transformer model."""
     env.Env.get().require_package('sentence_transformers')
     from sentence_transformers import SentenceTransformer

@@ -46,6 +47,7 @@ def sentence_transformer_list(sentences: list, *, model_id: str, normalize_embed

 @pxt.udf(batch_size=32)
 def cross_encoder(sentences1: Batch[str], sentences2: Batch[str], *, model_id: str) -> Batch[float]:
+    """Runs the specified cross-encoder model."""
     env.Env.get().require_package('sentence_transformers')
     from sentence_transformers import CrossEncoder

@@ -68,6 +70,7 @@ def cross_encoder_list(sentence1: str, sentences2: list, *, model_id: str) -> li

 @pxt.udf(batch_size=32, return_type=ts.ArrayType((None,), dtype=ts.FloatType(), nullable=False))
 def clip_text(text: Batch[str], *, model_id: str) -> Batch[np.ndarray]:
+    """Runs the specified CLIP model on text."""
     env.Env.get().require_package('transformers')
     device = resolve_torch_device('auto')
     import torch
@@ -85,6 +88,7 @@ def clip_text(text: Batch[str], *, model_id: str) -> Batch[np.ndarray]:

 @pxt.udf(batch_size=32, return_type=ts.ArrayType((None,), dtype=ts.FloatType(), nullable=False))
 def clip_image(image: Batch[PIL.Image.Image], *, model_id: str) -> Batch[np.ndarray]:
+    """Runs the specified CLIP model on images."""
     env.Env.get().require_package('transformers')
     device = resolve_torch_device('auto')
     import torch
@@ -113,6 +117,7 @@ def _(model_id: str) -> ts.ArrayType:

 @pxt.udf(batch_size=4)
 def detr_for_object_detection(image: Batch[PIL.Image.Image], *, model_id: str, threshold: float = 0.5) -> Batch[dict]:
+    """Runs the specified DETR model."""
     env.Env.get().require_package('transformers')
     device = resolve_torch_device('auto')
     import torch
@@ -140,6 +145,25 @@ def detr_for_object_detection(image: Batch[PIL.Image.Image], *, model_id: str, t
     ]


+@pxt.udf
+def detr_to_coco(image: PIL.Image.Image, detr_info: dict[str, Any]) -> dict[str, Any]:
+    bboxes, labels = detr_info['boxes'], detr_info['labels']
+    annotations = [
+        {
+            'bbox': [bbox[0], bbox[1], bbox[2] - bbox[0], bbox[3] - bbox[1]],
+            'category': label
+        }
+        for bbox, label in zip(bboxes, labels)
+    ]
+    return {
+        'image': {
+            'width': image.width,
+            'height': image.height
+        },
+        'annotations': annotations
+    }
+
+
 T = TypeVar('T')


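
The new `detr_to_coco` UDF is designed to post-process the output of `detr_for_object_detection` into COCO-style annotations. A hedged sketch of the intended chaining (table, column names, and model id are illustrative):

```python
import pixeltable as pxt
from pixeltable.functions.huggingface import detr_for_object_detection, detr_to_coco

# hypothetical table with an image column
t = pxt.create_table('images', {'img': pxt.ImageType()})

# run DETR, then reshape its boxes/labels into COCO-format annotations
t['detections'] = detr_for_object_detection(t.img, model_id='facebook/detr-resnet-50')
t['detections_coco'] = detr_to_coco(t.img, t.detections)
```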
pixeltable/functions/openai.py CHANGED
@@ -16,8 +16,13 @@ from pixeltable import env
 from pixeltable.func import Batch


-def openai_client() -> openai.OpenAI:
-    return env.Env.get().get_client('openai', lambda api_key: openai.OpenAI(api_key=api_key))
+@env.register_client('openai')
+def _(api_key: str) -> openai.OpenAI:
+    return openai.OpenAI(api_key=api_key)
+
+
+def _openai_client() -> openai.OpenAI:
+    return env.Env.get().get_client('openai')


 # Exponential backoff decorator using tenacity.
@@ -44,7 +49,7 @@ def speech(
     response_format: Optional[str] = None,
     speed: Optional[float] = None
 ) -> str:
-    content = openai_client().audio.speech.create(
+    content = _openai_client().audio.speech.create(
         input=input,
         model=model,
         voice=voice,
@@ -71,7 +76,7 @@ def transcriptions(
     temperature: Optional[float] = None
 ) -> dict:
     file = pathlib.Path(audio)
-    transcription = openai_client().audio.transcriptions.create(
+    transcription = _openai_client().audio.transcriptions.create(
         file=file,
         model=model,
         language=_opt(language),
@@ -93,7 +98,7 @@ def translations(
     temperature: Optional[float] = None
 ) -> dict:
     file = pathlib.Path(audio)
-    translation = openai_client().audio.translations.create(
+    translation = _openai_client().audio.translations.create(
         file=file,
         model=model,
         prompt=_opt(prompt),
@@ -127,7 +132,7 @@ def chat_completions(
     tool_choice: Optional[dict] = None,
     user: Optional[str] = None
 ) -> dict:
-    result = openai_client().chat.completions.create(
+    result = _openai_client().chat.completions.create(
         messages=messages,
         model=model,
         frequency_penalty=_opt(frequency_penalty),
@@ -171,7 +176,7 @@ def vision(
             }}
         ]}
     ]
-    result = openai_client().chat.completions.create(
+    result = _openai_client().chat.completions.create(
         messages=messages,
         model=model
     )
@@ -197,7 +202,7 @@ def embeddings(
     dimensions: Optional[int] = None,
     user: Optional[str] = None
 ) -> Batch[np.ndarray]:
-    result = openai_client().embeddings.create(
+    result = _openai_client().embeddings.create(
         input=input,
         model=model,
         dimensions=_opt(dimensions),
@@ -235,7 +240,7 @@ def image_generations(
     user: Optional[str] = None
 ) -> PIL.Image.Image:
     # TODO(aaron-siegel): Decompose CPU/GPU ops into separate functions
-    result = openai_client().images.generate(
+    result = _openai_client().images.generate(
         prompt=prompt,
         model=_opt(model),
         quality=_opt(quality),
@@ -275,7 +280,7 @@ def moderations(
     *,
     model: Optional[str] = None
 ) -> dict:
-    result = openai_client().moderations.create(
+    result = _openai_client().moderations.create(
         input=input,
         model=_opt(model)
     )
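
The OpenAI UDFs keep their signatures; only the client plumbing changes. As an illustration, the batched `embeddings` UDF can back a computed column (names are hypothetical, and an OpenAI API key is assumed to be configured in the environment):

```python
import pixeltable as pxt
from pixeltable.functions.openai import embeddings

t = pxt.create_table('passages', {'text': pxt.StringType()})

# one embedding per row, computed in batches by the UDF
t['text_embedding'] = embeddings(t.text, model='text-embedding-3-small')
```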
pixeltable/functions/together.py CHANGED
@@ -11,8 +11,13 @@ from pixeltable import env
 from pixeltable.func import Batch


-def together_client() -> together.Together:
-    return env.Env.get().get_client('together', lambda api_key: together.Together(api_key=api_key))
+@env.register_client('together')
+def _(api_key: str) -> together.Together:
+    return together.Together(api_key=api_key)
+
+
+def _together_client() -> together.Together:
+    return env.Env.get().get_client('together')


 @pxt.udf
@@ -31,7 +36,7 @@ def completions(
     n: Optional[int] = None,
     safety_model: Optional[str] = None
 ) -> dict:
-    return together_client().completions.create(
+    return _together_client().completions.create(
         prompt=prompt,
         model=model,
         max_tokens=max_tokens,
@@ -66,7 +71,7 @@ def chat_completions(
     tools: Optional[dict] = None,
     tool_choice: Optional[dict] = None
 ) -> dict:
-    return together_client().chat.completions.create(
+    return _together_client().chat.completions.create(
         messages=messages,
         model=model,
         max_tokens=max_tokens,
@@ -99,7 +104,7 @@ _embedding_dimensions_cache = {

 @pxt.udf(batch_size=32, return_type=pxt.ArrayType((None,), dtype=pxt.FloatType()))
 def embeddings(input: Batch[str], *, model: str) -> Batch[np.ndarray]:
-    result = together_client().embeddings.create(input=input, model=model)
+    result = _together_client().embeddings.create(input=input, model=model)
     return [
         np.array(data.embedding, dtype=np.float64)
         for data in result.data
@@ -127,7 +132,7 @@ def image_generations(
     negative_prompt: Optional[str] = None,
 ) -> PIL.Image.Image:
     # TODO(aaron-siegel): Decompose CPU/GPU ops into separate functions
-    result = together_client().images.generate(
+    result = _together_client().images.generate(
         prompt=prompt,
         model=model,
         steps=steps,
pixeltable/functions/util.py CHANGED
@@ -1,46 +1,3 @@
-from typing import Tuple, List, Optional
-import types
-import sys
-
-import pixeltable.func as func
-import pixeltable.type_system as ts
-import pixeltable.env as env
-
-
-def create_nos_modules() -> List[types.ModuleType]:
-    """Create module pixeltable.functions.nos with one submodule per task and return the submodules"""
-    models = env.Env.get().nos_client.ListModels()
-    model_info = [env.Env.get().nos_client.GetModelInfo(model) for model in models]
-    model_info.sort(key=lambda info: info.task.value)
-
-    module_name = 'pixeltable.functions.nos'
-    nos_module = types.ModuleType(module_name)
-    nos_module.__package__ = 'pixeltable.functions'
-    sys.modules[module_name] = nos_module
-
-    prev_task = ''
-    new_modules: List[types.ModuleType] = []
-    sub_module: Optional[types.ModuleType] = None
-    for info in model_info:
-        if info.task.value != prev_task:
-            # we construct one submodule per task
-            namespace = info.task.name.lower()
-            submodule_name = f'{module_name}.{namespace}'
-            sub_module = types.ModuleType(submodule_name)
-            sub_module.__package__ = module_name
-            setattr(nos_module, namespace, sub_module)
-            new_modules.append(sub_module)
-            sys.modules[submodule_name] = sub_module
-            prev_task = info.task.value
-
-        # add a Function for this model to the module
-        model_id = info.name.replace("/", "_").replace("-", "_")
-        pt_func = func.NOSFunction(info, f'{submodule_name}.{model_id}')
-        setattr(sub_module, model_id, pt_func)
-
-    return new_modules
-
-
 def resolve_torch_device(device: str) -> str:
     import torch
     if device == 'auto':
pixeltable/functions/video.py CHANGED
@@ -1,14 +1,13 @@
-from typing import Optional
 import uuid
+from typing import Optional
+
 import av
-import sys

 import pixeltable.env as env
 import pixeltable.func as func
 import pixeltable.type_system as ts

-
-_format_defaults = { # format -> (codec, ext)
+_format_defaults = {  # format -> (codec, ext)
     'wav': ('pcm_s16le', 'wav'),
     'mp3': ('libmp3lame', 'mp3'),
     'flac': ('flac', 'flac'),
@@ -35,11 +34,13 @@ _extract_audio_param_types = [
     ts.VideoType(nullable=False),
     ts.IntType(nullable=False),
     ts.StringType(nullable=False),
-    ts.StringType(nullable=False)
+    ts.StringType(nullable=True),
 ]
+
+
 @func.udf(return_type=ts.AudioType(nullable=True), param_types=_extract_audio_param_types)
 def extract_audio(
-        video_path: str, stream_idx: int = 0, format: str = 'wav', codec: Optional[str] = None
+    video_path: str, stream_idx: int = 0, format: str = 'wav', codec: Optional[str] = None
 ) -> Optional[str]:
     """Extract an audio stream from a video file, save it as a media file and return its path"""
     if format not in _format_defaults:
@@ -51,12 +52,49 @@ def extract_audio(
         return None
     audio_stream = container.streams.audio[stream_idx]
     # create this in our tmp directory, so it'll get cleaned up if it's being generated as part of a query
-    output_filename = str(env.Env.get().tmp_dir / f"{uuid.uuid4()}.{ext}")
+    output_filename = str(env.Env.get().tmp_dir / f'{uuid.uuid4()}.{ext}')

-    with av.open(output_filename, "w", format=format) as output_container:
+    with av.open(output_filename, 'w', format=format) as output_container:
         output_stream = output_container.add_stream(codec or default_codec)
         for packet in container.demux(audio_stream):
             for frame in packet.decode():
                 output_container.mux(output_stream.encode(frame))

     return output_filename
+
+
+@func.udf(return_type=ts.JsonType(nullable=False), param_types=[ts.VideoType(nullable=False)])
+def get_metadata(video: str) -> dict:
+    """Gets various metadata associated with a video file.
+
+    Args:
+        video (str): Path to the video file.
+
+    Returns:
+        A dictionary containing the associated metadata.
+    """
+    with av.open(video) as container:
+        assert isinstance(container, av.container.InputContainer)
+        video_streams_info = [
+            {
+                'duration': stream.duration,
+                'frames': stream.frames,
+                'language': stream.language,
+                'average_rate': float(stream.average_rate) if stream.average_rate is not None else None,
+                'base_rate': float(stream.base_rate) if stream.base_rate is not None else None,
+                'guessed_rate': float(stream.guessed_rate) if stream.guessed_rate is not None else None,
+                'pix_fmt': getattr(stream.codec_context, 'pix_fmt', None),
+                'width': stream.width,
+                'height': stream.height,
+            }
+            for stream in container.streams
+            if isinstance(stream, av.video.stream.VideoStream)
+        ]
+        result = {
+            'bit_exact': container.bit_exact,
+            'bit_rate': container.bit_rate,
+            'size': container.size,
+            'metadata': container.metadata,
+            'streams': video_streams_info,  # TODO: Audio streams?
+        }
+        return result
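
A rough sketch of how the new `get_metadata` UDF and the updated `extract_audio` can be applied to a video column (table and column names are hypothetical):

```python
import pixeltable as pxt
from pixeltable.functions.video import extract_audio, get_metadata

t = pxt.create_table('videos', {'video': pxt.VideoType()})

# per-video stream metadata (duration, frame count, rates, ...) stored as JSON
t['metadata'] = get_metadata(t.video)

# extract the audio track into a separate media file; the codec defaults per format
t['audio'] = extract_audio(t.video, format='mp3')
```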
pixeltable/globals.py CHANGED
@@ -96,8 +96,8 @@ def create_view(
         schema: dictionary mapping column names to column types, value expressions, or to column specifications.
         filter: Predicate to filter rows of the base table.
         is_snapshot: Whether the view is a snapshot.
-        iterator_class: Class of the iterator to use for the view.
-        iterator_args: Arguments to pass to the iterator class.
+        iterator: The iterator to use for this view. If specified, then this view will be a one-to-many view of
+            the base table.
         num_retained_versions: Number of versions of the view to retain.
         ignore_errors: if True, fail silently if the path already exists or is invalid.

@@ -423,3 +423,21 @@ def get_path(schema_obj: catalog.SchemaObject) -> str:
         dir_id = dir._dir_id
     path_elements.append(schema_obj._name)
     return '.'.join(path_elements)
+
+
+def configure_logging(
+    *,
+    to_stdout: Optional[bool] = None,
+    level: Optional[int] = None,
+    add: Optional[str] = None,
+    remove: Optional[str] = None,
+) -> None:
+    """Configure logging.
+
+    Args:
+        to_stdout: if True, also log to stdout
+        level: default log level
+        add: comma-separated list of 'module name:log level' pairs; ex.: add='video:10'
+        remove: comma-separated list of module names
+    """
+    return Env.get().configure_logging(to_stdout=to_stdout, level=level, add=add, remove=remove)
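
The new `configure_logging` function is a thin wrapper over `Env.configure_logging`. Assuming it is re-exported at the package level (pixeltable/__init__.py also changes in this release), usage looks roughly like this, reusing the `'video:10'` example from the docstring:

```python
import logging
import pixeltable as pxt

# also log to stdout, default level INFO, DEBUG (10) for the 'video' module
pxt.configure_logging(to_stdout=True, level=logging.INFO, add='video:10')
```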
pixeltable/index/__init__.py CHANGED
@@ -1,2 +1,3 @@
 from .base import IndexBase
 from .embedding_index import EmbeddingIndex
+from .btree import BtreeIndex
pixeltable/index/base.py CHANGED
@@ -27,7 +27,12 @@ class IndexBase(abc.ABC):
         pass

     @abc.abstractmethod
-    def index_sa_type(self) -> sql.sqltypes.TypeEngine:
+    def records_value_errors(self) -> bool:
+        """True if index_value_expr() can raise errors"""
+        pass
+
+    @abc.abstractmethod
+    def index_sa_type(self) -> sql.types.TypeEngine:
         """Return the sqlalchemy type of the index value column"""
         pass

pixeltable/index/btree.py ADDED
@@ -0,0 +1,54 @@
+from typing import Optional
+
+import sqlalchemy as sql
+
+# TODO: why does this import result in a circular import, but the one im embedding_index.py doesn't?
+#import pixeltable.catalog as catalog
+import pixeltable.exceptions as excs
+import pixeltable.func as func
+from .base import IndexBase
+
+
+class BtreeIndex(IndexBase):
+    """
+    Interface to B-tree indices in Postgres.
+    """
+    MAX_STRING_LEN = 256
+
+    @func.udf
+    def str_filter(s: Optional[str]) -> Optional[str]:
+        if s is None:
+            return None
+        return s[:BtreeIndex.MAX_STRING_LEN]
+
+    def __init__(self, c: 'catalog.Column'):
+        if not c.col_type.is_scalar_type() and not c.col_type.is_media_type():
+            raise excs.Error(f'Index on column {c.name}: B-tree index requires scalar or media type, got {c.col_type}')
+        from pixeltable.exprs import ColumnRef
+        self.value_expr = self.str_filter(ColumnRef(c)) if c.col_type.is_string_type() else ColumnRef(c)
+
+    def index_value_expr(self) -> 'pixeltable.exprs.Expr':
+        return self.value_expr
+
+    def records_value_errors(self) -> bool:
+        return False
+
+    def index_sa_type(self) -> sql.types.TypeEngine:
+        """Return the sqlalchemy type of the index value column"""
+        return self.value_expr.col_type.to_sa_type()
+
+    def create_index(self, index_name: str, index_value_col: 'catalog.Column', conn: sql.engine.Connection) -> None:
+        """Create the index on the index value column"""
+        idx = sql.Index(index_name, index_value_col.sa_col, postgresql_using='btree')
+        idx.create(bind=conn)
+
+    @classmethod
+    def display_name(cls) -> str:
+        return 'btree'
+
+    def as_dict(self) -> dict:
+        return {}
+
+    @classmethod
+    def from_dict(cls, c: 'catalog.Column', d: dict) -> 'BtreeIndex':
+        return cls(c)
pixeltable/index/embedding_index.py CHANGED
@@ -70,7 +70,10 @@ class EmbeddingIndex(IndexBase):
         """Return expression that computes the value that goes into the index"""
         return self.value_expr

-    def index_sa_type(self) -> sql.sqltypes.TypeEngine:
+    def records_value_errors(self) -> bool:
+        return True
+
+    def index_sa_type(self) -> sql.types.TypeEngine:
         """Return the sqlalchemy type of the index value column"""
         return self.index_col_type

pixeltable/io/__init__.py CHANGED
@@ -1,3 +1,4 @@
+from .globals import create_label_studio_project
 from .hf_datasets import import_huggingface_dataset
 from .pandas import import_csv, import_excel, import_pandas
 from .parquet import import_parquet
pixeltable/io/globals.py ADDED
@@ -0,0 +1,59 @@
+from typing import Any, Optional, Literal
+
+import pixeltable as pxt
+from pixeltable import Table
+
+
+def create_label_studio_project(
+    t: Table,
+    label_config: str,
+    col_mapping: Optional[dict[str, str]] = None,
+    title: Optional[str] = None,
+    media_import_method: Literal['post', 'file'] = 'file',
+    sync_immediately: bool = True,
+    **kwargs: Any
+) -> None:
+    """
+    Creates a new Label Studio project and links it to the specified `Table`.
+
+    The required parameter `label_config` specifies the Label Studio project configuration,
+    in XML format, as described in the Label Studio documentation. The linked project will
+    have one column for each data field in the configuration; for example, if the
+    configuration has an entry
+    ```
+    <Image name="image_obj" value="$image"/>
+    ```
+    then the linked project will have a column named `image`. In addition, the linked project
+    will always have a JSON-typed column `annotations` representing the output.
+
+    By default, Pixeltable will link each of these columns to a column of the specified `Table`
+    with the same name. If any of the data fields are missing, an exception will be thrown. If
+    the `annotations` column is missing, it will be created. The default names can be overridden
+    by specifying an optional `col_mapping`, with Pixeltable column names as keys and Label
+    Studio field names as values.
+
+    Args:
+        t: The Table to link to.
+        label_config: The Label Studio project configuration, in XML format.
+        col_mapping: An optional mapping of local column names to remote column names.
+        title: An optional title for the Label Studio project. If not specified, the
+            name of the `Table` will be used as a default.
+        sync_immediately: If `True`, immediately perform an initial synchronization by
+            importing all rows of the `Table` as Label Studio tasks.
+    """
+    from pixeltable.datatransfer.label_studio import LabelStudioProject, ANNOTATIONS_COLUMN
+
+    ls_project = LabelStudioProject.create(title or t.get_name(), label_config, media_import_method, **kwargs)
+
+    # Create a column to hold the annotations, if one does not yet exist.
+    if col_mapping is not None and ANNOTATIONS_COLUMN in col_mapping.values():
+        local_annotations_column = next(k for k, v in col_mapping.items() if v == ANNOTATIONS_COLUMN)
+    else:
+        local_annotations_column = ANNOTATIONS_COLUMN
+    if local_annotations_column not in t.column_names():
+        t[local_annotations_column] = pxt.JsonType(nullable=True)
+
+    # Link the project to `t`, and sync if appropriate.
+    t._link(ls_project, col_mapping)
+    if sync_immediately:
+        t.sync()
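
Following the docstring above, a sketch of how `create_label_studio_project` is meant to be called. The label config and table are illustrative, and a Label Studio server URL and API key are assumed to be configured in the Pixeltable environment:

```python
import pixeltable as pxt
from pixeltable.io import create_label_studio_project

t = pxt.create_table('annotation_queue', {'image': pxt.ImageType()})

label_config = '''
<View>
  <Image name="image_obj" value="$image"/>
  <Choices name="class" toName="image_obj">
    <Choice value="cat"/>
    <Choice value="dog"/>
  </Choices>
</View>
'''

# creates the project, adds an 'annotations' column if one doesn't exist,
# links the project to the table, and runs an initial sync
create_label_studio_project(t, label_config)
```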
pixeltable/iterators/base.py CHANGED
@@ -6,11 +6,11 @@ from pixeltable.type_system import ColumnType


 class ComponentIterator(ABC):
-    """Base class for iterators."""
+    """Base class for Pixeltable iterators."""

     @classmethod
     @abstractmethod
-    def input_schema(cls) -> Dict[str, ColumnType]:
+    def input_schema(cls) -> dict[str, ColumnType]:
         """Provide the Pixeltable types of the init() parameters

         The keys need to match the names of the init() parameters. This is equivalent to the parameters_types
@@ -20,7 +20,7 @@ class ComponentIterator(ABC):

     @classmethod
     @abstractmethod
-    def output_schema(cls, *args: Any, **kwargs: Any) -> Tuple[Dict[str, ColumnType], List[str]]:
+    def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ColumnType], list[str]]:
         """Specify the dictionary returned by next() and a list of unstored column names

         Returns:
@@ -33,7 +33,7 @@ class ComponentIterator(ABC):
         return self

     @abstractmethod
-    def __next__(self) -> Dict[str, Any]:
+    def __next__(self) -> dict[str, Any]:
         """Return the next element of the iterator as a dictionary or raise StopIteration"""
         raise NotImplementedError

pixeltable/iterators/document.py CHANGED
@@ -13,6 +13,7 @@ from .base import ComponentIterator

 _logger = logging.getLogger('pixeltable')

+
 class ChunkMetadata(enum.Enum):
     TITLE = 1
     HEADING = 2
@@ -20,6 +21,7 @@ class ChunkMetadata(enum.Enum):
     PAGE = 4
     BOUNDING_BOX = 5

+
 class Separator(enum.Enum):
     HEADING = 1
     PARAGRAPH = 2
@@ -28,6 +30,7 @@ class Separator(enum.Enum):
     CHAR_LIMIT = 5
     PAGE = 6

+
 @dataclasses.dataclass
 class DocumentSectionMetadata:
     """Metadata for a subsection of a document (ie, a structural element like a heading or paragraph)"""
@@ -42,6 +45,7 @@ class DocumentSectionMetadata:
     # bounding box as an {x1, y1, x2, y2} dictionary
     bounding_box: Optional[Dict[str, float]] = None

+
 @dataclasses.dataclass
 class DocumentSection:
     """A single document chunk, according to some of the splitting criteria"""
@@ -79,20 +83,14 @@ def _parse_metadata(metadata: str) -> List[ChunkMetadata]:

 _HTML_HEADINGS = {'h1', 'h2', 'h3', 'h4', 'h5', 'h6'}

+
 class DocumentSplitter(ComponentIterator):
-    """Iterator over pieces of a document. The document is split into chunks based on the specified separators.
-    The iterator output tuples are of schema {'text': StringType()}, but can include additional metadata fields if specified
-    in the `metadata` argument as explained below.
-    All chunk text is passed through `ftfy.fix_text` to fix up common problems with unicode sequences.
-
-    Args:
-        `metadata`: which additional metadata fields to include in the output schema:
-            'title', 'heading' (HTML and Markdown), 'sourceline' (HTML), 'page' (PDF), 'bounding_box' (PDF).
-            The input can be a comma-separated string of these values eg. 'title,heading,sourceline'.
-        `separators`: which separators to use to split the document into rows. Options are:
-            'heading', 'paragraph', 'sentence', 'token_limit', 'char_limit', 'page'. As with metadata, this is can be a
-            comma-separated string eg. 'heading, token_limit'.
-        `limit`: the maximum number of tokens or characters in each chunk if 'token_limit' or 'char_limit' is specified.
+    """Iterator over chunks of a document. The document is chunked according to the specified `separators`.
+
+    The iterator yields a `text` field containing the text of the chunk, and it may also
+    include additional metadata fields if specified in the `metadata` parameter, as explained below.
+
+    Chunked text will be cleaned with `ftfy.fix_text` to fix up common problems with unicode sequences.
     """
     METADATA_COLUMN_TYPES = {
         ChunkMetadata.TITLE: StringType(nullable=True),
@@ -103,10 +101,23 @@ class DocumentSplitter(ComponentIterator):
     }

     def __init__(
-        self, document: str, *, separators: str, limit: Optional[int] = None, overlap: Optional[int] = None, metadata: str = '',
-        html_skip_tags: Optional[List[str]] = None, tiktoken_encoding: Optional[str] = 'cl100k_base',
+        self, document: str, *, separators: str, limit: Optional[int] = None, overlap: Optional[int] = None,
+        metadata: str = '',
+        html_skip_tags: Optional[list[str]] = None, tiktoken_encoding: Optional[str] = 'cl100k_base',
         tiktoken_target_model: Optional[str] = None
     ):
+        """Init method for `DocumentSplitter` class.
+
+        Args:
+            separators: separators to use to chunk the document. Options are:
+                `'heading'`, `'paragraph'`, `'sentence'`, `'token_limit'`, `'char_limit'`, `'page'`.
+                This may be a comma-separated string, e.g., `'heading,token_limit'`.
+            limit: the maximum number of tokens or characters in each chunk, if `'token_limit'`
+                or `'char_limit'` is specified.
+            metadata: additional metadata fields to include in the output. Options are:
+                `'title'`, `'heading'` (HTML and Markdown), `'sourceline'` (HTML), `'page'` (PDF), `'bounding_box'`
+                (PDF). The input may be a comma-separated string, e.g., `'title,heading,sourceline'`.
+        """
         if html_skip_tags is None:
             html_skip_tags = ['nav']
         self._doc_handle = get_document_handle(document)
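
To close, a sketch of `DocumentSplitter` used directly, based on the `__init__` signature and docstring above (the document path is hypothetical):

```python
from pixeltable.iterators.document import DocumentSplitter

# ~300-token chunks of a PDF, with page metadata attached to each chunk
splitter = DocumentSplitter(
    '/path/to/report.pdf', separators='token_limit', limit=300, metadata='page'
)
for chunk in splitter:
    print(chunk['page'], chunk['text'][:80])
```

In practice the splitter is usually attached to a view via `create_view`'s new `iterator` parameter (see the pixeltable/globals.py docstring change above), which produces one view row per chunk.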