pixeltable 0.2.16__py3-none-any.whl → 0.2.18__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of pixeltable might be problematic.

Files changed (79)
  1. pixeltable/__version__.py +2 -2
  2. pixeltable/catalog/catalog.py +8 -7
  3. pixeltable/catalog/column.py +11 -8
  4. pixeltable/catalog/insertable_table.py +1 -1
  5. pixeltable/catalog/path_dict.py +8 -6
  6. pixeltable/catalog/table.py +20 -13
  7. pixeltable/catalog/table_version.py +91 -54
  8. pixeltable/catalog/table_version_path.py +7 -9
  9. pixeltable/catalog/view.py +2 -1
  10. pixeltable/dataframe.py +1 -1
  11. pixeltable/env.py +173 -82
  12. pixeltable/exec/aggregation_node.py +2 -1
  13. pixeltable/exec/component_iteration_node.py +1 -1
  14. pixeltable/exec/sql_node.py +11 -8
  15. pixeltable/exprs/__init__.py +1 -0
  16. pixeltable/exprs/arithmetic_expr.py +4 -4
  17. pixeltable/exprs/array_slice.py +2 -1
  18. pixeltable/exprs/column_property_ref.py +9 -7
  19. pixeltable/exprs/column_ref.py +2 -1
  20. pixeltable/exprs/comparison.py +10 -7
  21. pixeltable/exprs/compound_predicate.py +3 -2
  22. pixeltable/exprs/data_row.py +19 -4
  23. pixeltable/exprs/expr.py +46 -35
  24. pixeltable/exprs/expr_set.py +32 -9
  25. pixeltable/exprs/function_call.py +56 -32
  26. pixeltable/exprs/in_predicate.py +3 -2
  27. pixeltable/exprs/inline_array.py +2 -1
  28. pixeltable/exprs/inline_dict.py +2 -1
  29. pixeltable/exprs/is_null.py +3 -2
  30. pixeltable/exprs/json_mapper.py +5 -4
  31. pixeltable/exprs/json_path.py +7 -1
  32. pixeltable/exprs/literal.py +34 -7
  33. pixeltable/exprs/method_ref.py +3 -3
  34. pixeltable/exprs/object_ref.py +6 -5
  35. pixeltable/exprs/row_builder.py +25 -17
  36. pixeltable/exprs/rowid_ref.py +2 -1
  37. pixeltable/exprs/similarity_expr.py +2 -1
  38. pixeltable/exprs/sql_element_cache.py +30 -0
  39. pixeltable/exprs/type_cast.py +3 -3
  40. pixeltable/exprs/variable.py +2 -1
  41. pixeltable/ext/functions/whisperx.py +4 -4
  42. pixeltable/ext/functions/yolox.py +6 -6
  43. pixeltable/func/aggregate_function.py +1 -0
  44. pixeltable/func/function.py +28 -4
  45. pixeltable/functions/__init__.py +4 -2
  46. pixeltable/functions/anthropic.py +107 -0
  47. pixeltable/functions/fireworks.py +2 -2
  48. pixeltable/functions/globals.py +6 -1
  49. pixeltable/functions/huggingface.py +2 -2
  50. pixeltable/functions/image.py +17 -2
  51. pixeltable/functions/json.py +5 -5
  52. pixeltable/functions/mistralai.py +188 -0
  53. pixeltable/functions/openai.py +6 -10
  54. pixeltable/functions/string.py +3 -2
  55. pixeltable/functions/timestamp.py +95 -7
  56. pixeltable/functions/together.py +5 -5
  57. pixeltable/functions/video.py +2 -2
  58. pixeltable/functions/vision.py +27 -17
  59. pixeltable/functions/whisper.py +1 -1
  60. pixeltable/io/hf_datasets.py +17 -15
  61. pixeltable/io/pandas.py +0 -2
  62. pixeltable/io/parquet.py +15 -14
  63. pixeltable/iterators/document.py +16 -15
  64. pixeltable/metadata/__init__.py +1 -1
  65. pixeltable/metadata/converters/convert_19.py +46 -0
  66. pixeltable/metadata/notes.py +1 -0
  67. pixeltable/metadata/schema.py +5 -4
  68. pixeltable/plan.py +100 -78
  69. pixeltable/store.py +5 -1
  70. pixeltable/tool/create_test_db_dump.py +4 -3
  71. pixeltable/type_system.py +12 -14
  72. pixeltable/utils/documents.py +45 -42
  73. pixeltable/utils/formatter.py +2 -2
  74. {pixeltable-0.2.16.dist-info → pixeltable-0.2.18.dist-info}/METADATA +79 -21
  75. pixeltable-0.2.18.dist-info/RECORD +147 -0
  76. pixeltable-0.2.16.dist-info/RECORD +0 -143
  77. {pixeltable-0.2.16.dist-info → pixeltable-0.2.18.dist-info}/LICENSE +0 -0
  78. {pixeltable-0.2.16.dist-info → pixeltable-0.2.18.dist-info}/WHEEL +0 -0
  79. {pixeltable-0.2.16.dist-info → pixeltable-0.2.18.dist-info}/entry_points.txt +0 -0
pixeltable/io/hf_datasets.py CHANGED
@@ -6,7 +6,7 @@ import random
  import typing
  from typing import Union, Optional, Any

- import pixeltable
+ import pixeltable as pxt
  import pixeltable.type_system as ts
  from pixeltable import exceptions as excs
@@ -81,24 +81,26 @@ def import_huggingface_dataset(
      dataset: Union[datasets.Dataset, datasets.DatasetDict],
      *,
      column_name_for_split: Optional[str] = None,
-     schema_override: Optional[dict[str, Any]] = None,
-     **kwargs,
- ) -> 'pixeltable.InsertableTable':
-     """Create a new `Table` from a Huggingface dataset, or dataset dict with multiple splits.
-     Requires datasets library to be installed.
+     schema_overrides: Optional[dict[str, Any]] = None,
+     **kwargs: Any,
+ ) -> pxt.Table:
+     """Create a new base table from a Huggingface dataset, or dataset dict with multiple splits.
+     Requires `datasets` library to be installed.

      Args:
-         path_str: Path to the table.
-         dataset: Huggingface datasets.Dataset or datasets.DatasetDict to insert into the table.
+         table_path: Path to the table.
+         dataset: Huggingface [`datasets.Dataset`](https://huggingface.co/docs/datasets/en/package_reference/main_classes#datasets.Dataset)
+             or [`datasets.DatasetDict`](https://huggingface.co/docs/datasets/en/package_reference/main_classes#datasets.DatasetDict)
+             to insert into the table.
          column_name_for_split: column name to use for split information. If None, no split information will be stored.
-         schema_override: Optional dictionary mapping column names to column type to override the corresponding defaults from
-             `pixeltable.utils.hf_datasets.huggingface_schema_to_pixeltable_schema`. The column type should be a pixeltable ColumnType.
-             For example, {'col_vid': VideoType()}, rather than {'col_vid': StringType()}.
-
+         schema_overrides: If specified, then for each (name, type) pair in `schema_overrides`, the column with
+             name `name` will be given type `type`, instead of being inferred from the `Dataset` or `DatasetDict`. The keys in
+             `schema_overrides` should be the column names of the `Dataset` or `DatasetDict` (whether or not they are valid
+             Pixeltable identifiers).
          kwargs: Additional arguments to pass to `create_table`.

      Returns:
-         The newly created table. The table will have loaded the data from the dataset.
+         A handle to the newly created [`Table`][pixeltable.Table].
      """
      import datasets
      import pixeltable as pxt
@@ -118,8 +120,8 @@ def import_huggingface_dataset(
          dataset_dict = dataset

      pixeltable_schema = huggingface_schema_to_pixeltable_schema(dataset)
-     if schema_override is not None:
-         pixeltable_schema.update(schema_override)
+     if schema_overrides is not None:
+         pixeltable_schema.update(schema_overrides)

      if column_name_for_split is not None:
          if column_name_for_split in pixeltable_schema:
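Taken together, these hunks rename `schema_override` to `schema_overrides` and change the return annotation to `pxt.Table`. A minimal sketch of a call against the new signature (the dataset, table name, and overridden column below are illustrative, not taken from the diff):

```python
import pixeltable as pxt
import pixeltable.type_system as ts
from datasets import load_dataset

ds = load_dataset('rotten_tomatoes', split='train')  # any small HF dataset

# Pin one column to an explicit type; every other column's type is still
# inferred from the Dataset's features.
tbl = pxt.io.import_huggingface_dataset(
    'reviews',
    ds,
    schema_overrides={'text': ts.StringType()},
)
```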
pixeltable/io/pandas.py CHANGED
@@ -1,9 +1,7 @@
- import datetime
  from typing import Any, Optional, Union

  import numpy as np
  import pandas as pd
- import PIL.Image

  import pixeltable as pxt
  import pixeltable.exceptions as excs
pixeltable/io/parquet.py CHANGED
@@ -7,7 +7,7 @@ import random
  import typing
  from collections import deque
  from pathlib import Path
- from typing import Dict, Optional
+ from typing import Dict, Optional, Any

  import PIL.Image
  import numpy as np
@@ -142,21 +142,22 @@ def import_parquet(
      table_path: str,
      *,
      parquet_path: str,
-     schema_override: Optional[Dict[str, ts.ColumnType]] = None,
-     **kwargs,
- ) -> pxt.catalog.InsertableTable:
-     """Create a new `Table` from a Parquet file or set of files. Requires pyarrow to be installed.
+     schema_overrides: Optional[Dict[str, ts.ColumnType]] = None,
+     **kwargs: Any,
+ ) -> pxt.Table:
+     """Creates a new base table from a Parquet file or set of files. Requires pyarrow to be installed.
+
      Args:
-         path_str: Path to the table within pixeltable.
+         table_path: Path to the table.
          parquet_path: Path to an individual Parquet file or directory of Parquet files.
-         schema_override: Optional dictionary mapping column names to column type to override the default
-             schema inferred from the Parquet file. The column type should be a pixeltable ColumnType.
-             For example, {'col_vid': VideoType()}, rather than {'col_vid': StringType()}.
-             Any fields not provided explicitly will map to types with `pixeltable.utils.parquet.parquet_schema_to_pixeltable_schema`
+         schema_overrides: If specified, then for each (name, type) pair in `schema_overrides`, the column with
+             name `name` will be given type `type`, instead of being inferred from the Parquet dataset. The keys in
+             `schema_overrides` should be the column names of the Parquet dataset (whether or not they are valid
+             Pixeltable identifiers).
          kwargs: Additional arguments to pass to `create_table`.

      Returns:
-         The newly created table. The table will have loaded the data from the Parquet file(s).
+         A handle to the newly created [`Table`][pixeltable.Table].
      """
      import pixeltable as pxt
      from pyarrow import parquet
@@ -166,10 +167,10 @@ def import_parquet(
      parquet_dataset = parquet.ParquetDataset(input_path)

      schema = parquet_schema_to_pixeltable_schema(parquet_path)
-     if schema_override is None:
-         schema_override = {}
+     if schema_overrides is None:
+         schema_overrides = {}

-     schema.update(schema_override)
+     schema.update(schema_overrides)
      for k, v in schema.items():
          if v is None:
              raise exc.Error(f'Could not infer pixeltable type for column {k} from parquet file')
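The Parquet changes mirror the Huggingface ones. A sketch of the renamed keyword in use (table name, file path, and column are placeholders):

```python
import pixeltable as pxt
import pixeltable.type_system as ts

# Columns listed in schema_overrides get the explicit type; everything
# else keeps the type inferred from the Parquet schema.
tbl = pxt.io.import_parquet(
    'events',
    parquet_path='/data/events.parquet',
    schema_overrides={'payload': ts.JsonType()},
)
```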
pixeltable/iterators/document.py CHANGED
@@ -1,14 +1,15 @@
  import dataclasses
  import enum
  import logging
- from typing import Dict, Any, List, Tuple, Optional, Iterable, Iterator
+ from typing import Any, Iterable, Iterator, Optional

  import ftfy

  from pixeltable.env import Env
  from pixeltable.exceptions import Error
- from pixeltable.type_system import ColumnType, DocumentType, StringType, IntType, JsonType
+ from pixeltable.type_system import ColumnType, DocumentType, IntType, JsonType, StringType
  from pixeltable.utils.documents import get_document_handle
+
  from .base import ComponentIterator

  _logger = logging.getLogger('pixeltable')
@@ -38,12 +39,12 @@ class DocumentSectionMetadata:
      sourceline: Optional[int] = None
      # the stack of headings up to the most recently observed one;
      # eg, if the most recent one was an h2, 'headings' would contain keys 1 and 2, but nothing below that
-     heading: Optional[Dict[str, str]] = None
+     heading: Optional[dict[str, str]] = None

      # pdf-specific metadata
      page: Optional[int] = None
      # bounding box as an {x1, y1, x2, y2} dictionary
-     bounding_box: Optional[Dict[str, float]] = None
+     bounding_box: Optional[dict[str, float]] = None


  @dataclasses.dataclass
@@ -53,7 +54,7 @@ class DocumentSection:
      metadata: Optional[DocumentSectionMetadata]


- def _parse_separators(separators: str) -> List[Separator]:
+ def _parse_separators(separators: str) -> list[Separator]:
      ret = []
      for s in separators.split(','):
          clean_s = s.strip().upper()
@@ -67,7 +68,7 @@ def _parse_separators(separators: str) -> List[Separator]:
      return ret


- def _parse_metadata(metadata: str) -> List[ChunkMetadata]:
+ def _parse_metadata(metadata: str) -> list[ChunkMetadata]:
      ret = []
      for m in metadata.split(','):
          clean_m = m.strip().upper()
@@ -161,7 +162,7 @@ class DocumentSplitter(ComponentIterator):
              self._sections = self._char_chunks(self._sections)

      @classmethod
-     def input_schema(cls) -> Dict[str, ColumnType]:
+     def input_schema(cls) -> dict[str, ColumnType]:
          return {
              'document': DocumentType(nullable=False),
              'separators': StringType(nullable=False),
@@ -174,7 +175,7 @@ class DocumentSplitter(ComponentIterator):
          }

      @classmethod
-     def output_schema(cls, *args: Any, **kwargs: Any) -> Tuple[Dict[str, ColumnType], List[str]]:
+     def output_schema(cls, *args: Any, **kwargs: Any) -> tuple[dict[str, ColumnType], list[str]]:
          schema = {'text': StringType()}
          md_fields = _parse_metadata(kwargs['metadata']) if 'metadata' in kwargs else []
@@ -208,7 +209,7 @@ class DocumentSplitter(ComponentIterator):

          return schema, []

-     def __next__(self) -> Dict[str, Any]:
+     def __next__(self) -> dict[str, Any]:
          while True:
              section = next(self._sections)
              if section.text is None:
@@ -236,7 +237,7 @@ class DocumentSplitter(ComponentIterator):
          accumulated_text = []  # currently accumulated text
          # accumulate pieces then join before emit to avoid quadratic complexity of string concatenation

-         headings: Dict[str, str] = {}  # current state of observed headings (level -> text)
+         headings: dict[str, str] = {}  # current state of observed headings (level -> text)
          sourceline = 0  # most recently seen sourceline

          def update_metadata(el: bs4.Tag) -> None:
@@ -250,7 +251,7 @@ class DocumentSplitter(ComponentIterator):
                    del headings[l]
              headings[el.name] = el.get_text().strip()

-         def emit() -> None:
+         def emit() -> Iterator[DocumentSection]:
              nonlocal accumulated_text, headings, sourceline
              if len(accumulated_text) > 0:
                  md = DocumentSectionMetadata(sourceline=sourceline, heading=headings.copy())
@@ -294,9 +295,9 @@ class DocumentSplitter(ComponentIterator):
          # current state
          accumulated_text = []  # currently accumulated text
          # accumulate pieces then join before emit to avoid quadratic complexity of string concatenation
-         headings: Dict[str, str] = {}  # current state of observed headings (level -> text)
+         headings: dict[str, str] = {}  # current state of observed headings (level -> text)

-         def update_headings(heading: Dict) -> None:
+         def update_headings(heading: dict) -> None:
              # update current state
              nonlocal headings
              assert 'type' in heading and heading['type'] == 'heading'
@@ -309,14 +310,14 @@ class DocumentSplitter(ComponentIterator):
                    del headings[l]
              headings[level] = text

-         def emit() -> None:
+         def emit() -> Iterator[DocumentSection]:
              nonlocal accumulated_text, headings
              if len(accumulated_text) > 0:
                  metadata = DocumentSectionMetadata(sourceline=0, heading=headings.copy())
                  yield DocumentSection(text=ftfy.fix_text(' '.join(accumulated_text)), metadata=metadata)
                  accumulated_text = []

-         def process_element(el: Dict) -> Iterator[DocumentSection]:
+         def process_element(el: dict) -> Iterator[DocumentSection]:
              # process the element and emit sections as necessary
              nonlocal accumulated_text, headings, emit_on_heading, emit_on_paragraph
              assert 'type' in el
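The two `emit` fixes correct annotations on what were always generator functions: a body containing `yield` returns an iterator when called, so `-> Iterator[DocumentSection]` is the accurate type and the old `-> None` was misleading. A self-contained illustration of the principle (not pixeltable code):

```python
from typing import Iterator


def emit_words(text: str) -> Iterator[str]:
    # The `yield` makes this a generator function: calling it returns an
    # iterator immediately instead of executing the body.
    for word in text.split():
        yield word


assert list(emit_words('a b c')) == ['a', 'b', 'c']
```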
pixeltable/metadata/__init__.py CHANGED
@@ -10,7 +10,7 @@ import sqlalchemy.orm as orm
  from .schema import SystemInfo, SystemInfoMd

  # current version of the metadata; this is incremented whenever the metadata schema changes
- VERSION = 19
+ VERSION = 20


  def create_system_info(engine: sql.engine.Engine) -> None:
pixeltable/metadata/converters/convert_19.py ADDED
@@ -0,0 +1,46 @@
+ import datetime
+ from typing import Any, Optional
+
+ import sqlalchemy as sql
+
+ import pixeltable as pxt
+ from pixeltable.metadata import register_converter, schema
+ from pixeltable.metadata.converters.util import convert_table_md
+
+
+ @register_converter(version=19)
+ def _(engine: sql.engine.Engine) -> None:
+     # Convert all timestamp literals to aware datetimes
+     convert_table_md(engine, substitution_fn=__update_timestamp_literals)
+
+     # Convert all timestamp columns to TIMESTAMPTZ. (This conversion will take place in the database
+     # default time zone, which is what we want, since in versions <= 19 they were naive timestamps.)
+     with engine.begin() as conn:
+         tables = conn.execute(sql.select(schema.Table.id, schema.Table.md))
+         for id, md in tables:
+             store_prefix = 'view' if md['view_md'] is not None else 'tbl'
+             store_name = f'{store_prefix}_{id.hex}'
+             column_md = md['column_md']
+             timestamp_cols = [
+                 col_id for col_id, col in column_md.items()
+                 if col['col_type']['_classname'] == 'TimestampType'
+             ]
+             for col_id in timestamp_cols:
+                 conn.execute(
+                     sql.text(f'ALTER TABLE {store_name} ALTER COLUMN col_{col_id} TYPE TIMESTAMPTZ')
+                 )
+
+
+ def __update_timestamp_literals(k: Any, v: Any) -> Optional[tuple[Any, Any]]:
+     if isinstance(v, dict) and 'val_t' in v:
+         # It's a literal with an explicit 'val_t' field. In version 19 this can only mean a
+         # timestamp literal, which (in version 19) is stored in the DB as a naive datetime.
+         # We convert it to an aware datetime, stored in UTC.
+         assert v['_classname'] == 'Literal'
+         assert v['val_t'] == pxt.ColumnType.Type.TIMESTAMP.name
+         assert isinstance(v['val'], str)
+         dt = datetime.datetime.fromisoformat(v['val'])
+         assert dt.tzinfo is None  # In version 19 all timestamps are naive
+         dt_utc = dt.astimezone(datetime.timezone.utc)
+         v['val'] = dt_utc.isoformat()
+         return k, v
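A note on the literal conversion above: `datetime.astimezone()` called on a naive datetime presumes the value is in the system's local time zone before converting, which is how naive v19 literals get localized on their way to UTC. A small demonstration of that stdlib behavior (the sample timestamp is made up):

```python
import datetime

naive = datetime.datetime.fromisoformat('2024-08-15T12:00:00')
assert naive.tzinfo is None

# A naive datetime is presumed to be in the system's local time zone;
# astimezone() converts it to the requested zone and returns an aware value.
aware_utc = naive.astimezone(datetime.timezone.utc)
assert aware_utc.tzinfo is not None
print(aware_utc.isoformat())  # e.g. '2024-08-15T19:00:00+00:00' on a UTC-7 machine
```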
pixeltable/metadata/notes.py CHANGED
@@ -2,6 +2,7 @@
  # rather than as a comment, so that the existence of a description can be enforced by
  # the unit tests when new versions are added.
  VERSION_NOTES = {
+     20: 'Store DB timestamps in UTC',
      19: 'UDF renames; ImageMemberAccess removal',
      18: 'Restructured index metadata',
      17: 'Renamed remotes to external_stores',
pixeltable/metadata/schema.py CHANGED
@@ -3,6 +3,7 @@ import uuid
  from typing import Optional, List, get_type_hints, Type, Any, TypeVar, Tuple, Union

  import sqlalchemy as sql
+ import sqlalchemy.orm as orm
  from sqlalchemy import ForeignKey
  from sqlalchemy import Integer, BigInteger, LargeBinary
  from sqlalchemy.dialects.postgresql import UUID, JSONB
@@ -64,8 +65,8 @@ class DirMd:
  class Dir(Base):
      __tablename__ = 'dirs'

-     id = sql.Column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4, nullable=False)
-     parent_id = sql.Column(UUID(as_uuid=True), ForeignKey('dirs.id'), nullable=True)
+     id: orm.Mapped[uuid.UUID] = orm.mapped_column(UUID(as_uuid=True), primary_key=True, default=uuid.uuid4, nullable=False)
+     parent_id: orm.Mapped[uuid.UUID] = orm.mapped_column(UUID(as_uuid=True), ForeignKey('dirs.id'), nullable=True)
      md = sql.Column(JSONB, nullable=False)
@@ -163,8 +164,8 @@ class Table(Base):

      MAX_VERSION = 9223372036854775807  # 2^63 - 1

-     id = sql.Column(UUID(as_uuid=True), primary_key=True, nullable=False)
-     dir_id = sql.Column(UUID(as_uuid=True), ForeignKey('dirs.id'), nullable=False)
+     id: orm.Mapped[uuid.UUID] = orm.mapped_column(UUID(as_uuid=True), primary_key=True, nullable=False)
+     dir_id: orm.Mapped[uuid.UUID] = orm.mapped_column(UUID(as_uuid=True), ForeignKey('dirs.id'), nullable=False)
      md = sql.Column(JSONB, nullable=False)  # TableMd
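The `Dir` and `Table` changes adopt SQLAlchemy 2.0's typed declarative mappings, where an `orm.Mapped[...]` annotation plus `orm.mapped_column(...)` replaces a bare `sql.Column(...)` attribute. A standalone sketch of the pattern (the model below is illustrative, not part of pixeltable):

```python
import uuid

import sqlalchemy.orm as orm
from sqlalchemy import String
from sqlalchemy.dialects.postgresql import UUID


class Base(orm.DeclarativeBase):
    pass


class Item(Base):
    __tablename__ = 'items'

    # Mapped[...] carries the Python-side type, so type checkers see
    # Item.id as uuid.UUID rather than an untyped Column.
    id: orm.Mapped[uuid.UUID] = orm.mapped_column(
        UUID(as_uuid=True), primary_key=True, default=uuid.uuid4, nullable=False
    )
    name: orm.Mapped[str] = orm.mapped_column(String, nullable=False)
```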