pixeltable 0.2.25__py3-none-any.whl → 0.2.26__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- pixeltable/__version__.py +2 -2
- pixeltable/catalog/table.py +118 -44
- pixeltable/catalog/view.py +2 -2
- pixeltable/dataframe.py +240 -92
- pixeltable/exec/__init__.py +1 -1
- pixeltable/exec/exec_node.py +6 -7
- pixeltable/exec/sql_node.py +91 -44
- pixeltable/exprs/__init__.py +1 -0
- pixeltable/exprs/arithmetic_expr.py +1 -1
- pixeltable/exprs/array_slice.py +1 -1
- pixeltable/exprs/column_property_ref.py +1 -1
- pixeltable/exprs/column_ref.py +29 -2
- pixeltable/exprs/comparison.py +1 -1
- pixeltable/exprs/compound_predicate.py +1 -1
- pixeltable/exprs/expr.py +11 -5
- pixeltable/exprs/expr_set.py +8 -0
- pixeltable/exprs/function_call.py +14 -11
- pixeltable/exprs/in_predicate.py +1 -1
- pixeltable/exprs/inline_expr.py +3 -3
- pixeltable/exprs/is_null.py +1 -1
- pixeltable/exprs/json_mapper.py +1 -1
- pixeltable/exprs/json_path.py +1 -1
- pixeltable/exprs/method_ref.py +1 -1
- pixeltable/exprs/rowid_ref.py +1 -1
- pixeltable/exprs/similarity_expr.py +1 -1
- pixeltable/exprs/sql_element_cache.py +4 -0
- pixeltable/exprs/type_cast.py +2 -2
- pixeltable/exprs/variable.py +3 -0
- pixeltable/func/expr_template_function.py +3 -0
- pixeltable/functions/ollama.py +4 -4
- pixeltable/globals.py +4 -1
- pixeltable/io/__init__.py +1 -1
- pixeltable/io/parquet.py +39 -19
- pixeltable/metadata/__init__.py +1 -1
- pixeltable/metadata/converters/convert_22.py +17 -0
- pixeltable/metadata/notes.py +1 -0
- pixeltable/plan.py +128 -50
- pixeltable/store.py +1 -1
- pixeltable/type_system.py +1 -1
- pixeltable/utils/arrow.py +8 -3
- pixeltable/utils/description_helper.py +89 -0
- {pixeltable-0.2.25.dist-info → pixeltable-0.2.26.dist-info}/METADATA +26 -10
- {pixeltable-0.2.25.dist-info → pixeltable-0.2.26.dist-info}/RECORD +46 -44
- {pixeltable-0.2.25.dist-info → pixeltable-0.2.26.dist-info}/WHEEL +1 -1
- {pixeltable-0.2.25.dist-info → pixeltable-0.2.26.dist-info}/LICENSE +0 -0
- {pixeltable-0.2.25.dist-info → pixeltable-0.2.26.dist-info}/entry_points.txt +0 -0
pixeltable/globals.py
CHANGED

```diff
@@ -46,6 +46,7 @@ def create_table(
         num_retained_versions: Number of versions of the table to retain.
         comment: An optional comment; its meaning is user-defined.
         media_validation: Media validation policy for the table.
+
         - `'on_read'`: validate media files at query time
         - `'on_write'`: validate media files during insert/update operations
 
@@ -149,7 +150,9 @@ def create_view(
         tbl_version_path = base._tbl_version_path
     elif isinstance(base, DataFrame):
         base._validate_mutable('create_view')
-
+        if len(base._from_clause.tbls) > 1:
+            raise excs.Error('Cannot create a view of a join')
+        tbl_version_path = base._from_clause.tbls[0]
        where = base.where_clause
     else:
         raise excs.Error('`base` must be an instance of `Table` or `DataFrame`')
```
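The new check in `create_view` means a view can still be built from a single-table `DataFrame`, but no longer from a join. A minimal sketch of the calling pattern, assuming an existing table named `films` with a `rating` column (names are illustrative, not from this diff):

```python
import pixeltable as pxt

films = pxt.get_table('films')             # illustrative base table
df = films.where(films.rating >= 8.0)      # single-table DataFrame: still a valid view base
top_films = pxt.create_view('top_films', df)

# A DataFrame produced by a join carries more than one table in _from_clause.tbls,
# so create_view() now raises: 'Cannot create a view of a join'
```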
pixeltable/io/__init__.py
CHANGED

```diff
@@ -2,7 +2,7 @@ from .external_store import ExternalStore, SyncStatus
 from .globals import create_label_studio_project, export_images_as_fo_dataset, import_json, import_rows
 from .hf_datasets import import_huggingface_dataset
 from .pandas import import_csv, import_excel, import_pandas
-from .parquet import import_parquet
+from .parquet import import_parquet, export_parquet
 
 __default_dir = set(symbol for symbol in dir() if not symbol.startswith('_'))
 __removed_symbols = {'globals', 'hf_datasets', 'pandas', 'parquet'}
```
pixeltable/io/parquet.py
CHANGED

```diff
@@ -7,11 +7,14 @@ import random
 import typing
 from collections import deque
 from pathlib import Path
-from typing import Any, Optional
+from typing import Any, Optional, Union
 
 import numpy as np
 import PIL.Image
+import datetime
 
+import pixeltable as pxt
+from pixeltable.env import Env
 import pixeltable.exceptions as exc
 import pixeltable.type_system as ts
 from pixeltable.utils.transactional_directory import transactional_directory
@@ -39,28 +42,44 @@ def _write_batch(value_batch: dict[str, deque], schema: pa.Schema, output_path:
     parquet.write_table(tab, str(output_path))
 
 
-def
+def export_parquet(
+    table_or_df: Union[pxt.Table, pxt.DataFrame],
+    parquet_path: Path,
+    partition_size_bytes: int = 100_000_000,
+    inline_images: bool = False
+) -> None:
     """
-
-    Does not materialize the dataset to memory.
+    Exports a dataframe's data to one or more Parquet files. Requires pyarrow to be installed.
 
-    It
+    It additionally writes the pixeltable metadata in a json file, which would otherwise
     not be available in the parquet format.
 
-    Images are stored inline in a compressed format in their parquet file.
-
     Args:
-
-
-        partition_size_bytes : maximum target size for each chunk. Default 100_000_000 bytes.
+        table_or_df : Table or Dataframe to export.
+        parquet_path : Path to directory to write the parquet files to.
+        partition_size_bytes : The maximum target size for each chunk. Default 100_000_000 bytes.
+        inline_images : If True, images are stored inline in the parquet file. This is useful
+            for small images, to be imported as pytorch dataset. But can be inefficient
+            for large images, and cannot be imported into pixeltable.
+            If False, will raise an error if the Dataframe has any image column.
+            Default False.
     """
     from pixeltable.utils.arrow import to_arrow_schema
 
+    df: pxt.DataFrame
+    if isinstance(table_or_df, pxt.catalog.Table):
+        df = table_or_df._df()
+    else:
+        df = table_or_df
+
     type_dict = {k: v.as_dict() for k, v in df.schema.items()}
     arrow_schema = to_arrow_schema(df.schema)
 
+    if not inline_images and any(col_type.is_image_type() for col_type in df.schema.values()):
+        raise exc.Error('Cannot export Dataframe with image columns when inline_images is False')
+
     # store the changes atomically
-    with transactional_directory(
+    with transactional_directory(parquet_path) as temp_path:
         # dump metadata json file so we can inspect what was the source of the parquet file later on.
         json.dump(df.as_dict(), (temp_path / '.pixeltable.json').open('w'))
         json.dump(type_dict, (temp_path / '.pixeltable.column_types.json').open('w'))  # keep type metadata
@@ -111,6 +130,7 @@ def save_parquet(df: pxt.DataFrame, dest_path: Path, partition_size_bytes: int =
         elif col_type.is_bool_type():
             length = 1
         elif col_type.is_timestamp_type():
+            val = val.astimezone(datetime.timezone.utc)
             length = 8
         else:
             assert False, f'unknown type {col_type} for {col_name}'
@@ -139,7 +159,7 @@ def parquet_schema_to_pixeltable_schema(parquet_path: str) -> dict[str, Optional
 
 
 def import_parquet(
-
+    table: str,
     *,
     parquet_path: str,
     schema_overrides: Optional[dict[str, ts.ColumnType]] = None,
@@ -148,7 +168,7 @@ def import_parquet(
     """Creates a new base table from a Parquet file or set of files. Requires pyarrow to be installed.
 
     Args:
-
+        table: Fully qualified name of the table to import the data into.
        parquet_path: Path to an individual Parquet file or directory of Parquet files.
        schema_overrides: If specified, then for each (name, type) pair in `schema_overrides`, the column with
            name `name` will be given type `type`, instead of being inferred from the Parquet dataset. The keys in
@@ -157,7 +177,7 @@ def import_parquet(
        kwargs: Additional arguments to pass to `create_table`.
 
     Returns:
-        A handle to the newly created
+        A handle to the newly created table.
     """
     from pyarrow import parquet
 
@@ -176,11 +196,11 @@ def import_parquet(
         if v is None:
             raise exc.Error(f'Could not infer pixeltable type for column {k} from parquet file')
 
-    if
-        raise exc.Error(f'Table {
+    if table in pxt.list_tables():
+        raise exc.Error(f'Table {table} already exists')
 
     try:
-        tmp_name = f'{
+        tmp_name = f'{table}_tmp_{random.randint(0, 100000000)}'
         tab = pxt.create_table(tmp_name, schema, **kwargs)
         for fragment in parquet_dataset.fragments:  # type: ignore[attr-defined]
             for batch in fragment.to_batches():
@@ -190,5 +210,5 @@ def import_parquet(
         _logger.error(f'Error while inserting Parquet file into table: {e}')
         raise e
 
-    pxt.move(tmp_name,
-    return pxt.get_table(
+    pxt.move(tmp_name, table)
+    return pxt.get_table(table)
```
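Taken together, the new `export_parquet` and the updated `import_parquet` signature support a Parquet round trip. A hedged usage sketch; the table name, column name, and target directory are illustrative:

```python
from pathlib import Path

import pixeltable as pxt
from pixeltable.io import export_parquet, import_parquet

tbl = pxt.get_table('my_table')                    # illustrative table name

# export a filtered DataFrame; with the default inline_images=False this raises
# if the DataFrame contains an image column
export_parquet(tbl.where(tbl.score > 0.5), Path('/tmp/my_table_export'),
               partition_size_bytes=50_000_000)

# re-import into a new table; `table` is the fully qualified name of the target table
copy = import_parquet('my_table_copy', parquet_path='/tmp/my_table_export')
```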
pixeltable/metadata/__init__.py
CHANGED

```diff
@@ -10,7 +10,7 @@ import sqlalchemy.orm as orm
 from .schema import SystemInfo, SystemInfoMd
 
 # current version of the metadata; this is incremented whenever the metadata schema changes
-VERSION = 22
+VERSION = 23
 
 
 def create_system_info(engine: sql.engine.Engine) -> None:
```

pixeltable/metadata/converters/convert_22.py
ADDED

```diff
@@ -0,0 +1,17 @@
+from typing import Any, Optional
+import sqlalchemy as sql
+
+from pixeltable.metadata import register_converter
+from pixeltable.metadata.converters.util import convert_table_md
+
+
+@register_converter(version=22)
+def _(engine: sql.engine.Engine) -> None:
+    convert_table_md(engine, substitution_fn=__substitute_md)
+
+
+def __substitute_md(k: Optional[str], v: Any) -> Optional[tuple[Optional[str], Any]]:
+    if isinstance(v, dict) and '_classname' in v and v['_classname'] == 'DataFrame':
+        v['from_clause'] = {'tbls': [v['tbl']], 'join_clauses': []}
+        return k, v
+    return None
```
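The converter rewrites stored `DataFrame` metadata so that the old single `tbl` entry becomes a `from_clause` with one table and no joins. A standalone sketch of the same transformation on a sample metadata dict (the `tbl` payload is illustrative):

```python
# serialized DataFrame metadata as it might look at schema version 22 (illustrative payload)
md = {'_classname': 'DataFrame', 'tbl': {'id': 'a1b2c3', 'version': 4}}

# mirrors __substitute_md: wrap the single table reference in a from_clause
if isinstance(md, dict) and md.get('_classname') == 'DataFrame':
    md['from_clause'] = {'tbls': [md['tbl']], 'join_clauses': []}

# md now carries {'tbls': [<old tbl entry>], 'join_clauses': []} under 'from_clause'
```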
pixeltable/metadata/notes.py
CHANGED

```diff
@@ -2,6 +2,7 @@
 # rather than as a comment, so that the existence of a description can be enforced by
 # the unit tests when new versions are added.
 VERSION_NOTES = {
+    23: 'DataFrame.from_clause',
     22: 'TableMd/ColumnMd.media_validation',
     21: 'Separate InlineArray and InlineList',
     20: 'Store DB timestamps in UTC',
```
pixeltable/plan.py
CHANGED

```diff
@@ -1,4 +1,8 @@
-from
+from __future__ import annotations
+
+import dataclasses
+import enum
+from typing import Any, Iterable, Optional, Sequence, Literal
 from uuid import UUID
 
 import sqlalchemy as sql
@@ -38,13 +42,46 @@ def _get_combined_ordering(
     return result
 
 
+class JoinType(enum.Enum):
+    INNER = 0
+    LEFT = 1
+    # TODO: implement
+    # RIGHT = 2
+    FULL_OUTER = 3
+    CROSS = 4
+
+    LiteralType = Literal['inner', 'left', 'full_outer', 'cross']
+
+    @classmethod
+    def validated(cls, name: str, error_prefix: str) -> JoinType:
+        try:
+            return cls[name.upper()]
+        except KeyError:
+            val_strs = ', '.join(f'{s.lower()!r}' for s in cls.__members__.keys())
+            raise excs.Error(f'{error_prefix} must be one of: [{val_strs}]')
+
+
+@dataclasses.dataclass
+class JoinClause:
+    """Corresponds to a single 'JOIN ... ON (...)' clause in a SELECT statement; excludes the joined table."""
+    join_type: JoinType
+    join_predicate: Optional[exprs.Expr]  # None for join_type == CROSS
+
+
+@dataclasses.dataclass
+class FromClause:
+    """Corresponds to the From-clause ('FROM <tbl> JOIN ... ON (...) JOIN ...') of a SELECT statement """
+    tbls: list[catalog.TableVersionPath]
+    join_clauses: list[JoinClause] = dataclasses.field(default_factory=list)
+
+
 class Analyzer:
     """
     Performs semantic analysis of a query and stores the analysis state.
     """
 
-
-    all_exprs: list[exprs.Expr]
+    from_clause: FromClause
+    all_exprs: list[exprs.Expr]  # union of all exprs, aside from sql_where_clause
     select_list: list[exprs.Expr]
     group_by_clause: Optional[list[exprs.Expr]]  # None for non-aggregate queries; [] for agg query w/o grouping
     grouping_exprs: list[exprs.Expr]  # [] for non-aggregate queries or agg query w/o grouping
@@ -63,12 +100,12 @@ class Analyzer:
     agg_order_by: list[exprs.Expr]
 
     def __init__(
-        self,
+        self, from_clause: FromClause, select_list: Sequence[exprs.Expr],
         where_clause: Optional[exprs.Expr] = None, group_by_clause: Optional[list[exprs.Expr]] = None,
         order_by_clause: Optional[list[tuple[exprs.Expr, bool]]] = None):
         if order_by_clause is None:
             order_by_clause = []
-        self.
+        self.from_clause = from_clause
         self.sql_elements = exprs.SqlElementCache()
 
         # remove references to unstored computed cols
@@ -88,6 +125,9 @@ class Analyzer:
 
         # all exprs that are evaluated in Python; not executable
         self.all_exprs = self.select_list.copy()
+        for join_clause in from_clause.join_clauses:
+            if join_clause.join_predicate is not None:
+                self.all_exprs.append(join_clause.join_predicate)
         if self.group_by_clause is not None:
             self.all_exprs.extend(self.group_by_clause)
         self.all_exprs.extend(e for e, _ in self.order_by_clause)
@@ -316,7 +356,8 @@ class Planner:
         recomputed_cols = {c for c in recomputed_cols if c.is_stored}
         recomputed_base_cols = {col for col in recomputed_cols if col.tbl == target}
         copied_cols = [
-            col for col in target.
+            col for col in target.cols_by_id.values()
+            if col.is_stored and not col in updated_cols and not col in recomputed_base_cols
         ]
         select_list: list[exprs.Expr] = [exprs.ColumnRef(col) for col in copied_cols]
         select_list.extend(update_targets.values())
@@ -329,7 +370,7 @@ class Planner:
         select_list.extend(recomputed_exprs)
 
         # we need to retrieve the PK columns of the existing rows
-        plan = cls.create_query_plan(tbl, select_list, where_clause=where_clause, ignore_errors=True)
+        plan = cls.create_query_plan(FromClause(tbls=[tbl]), select_list, where_clause=where_clause, ignore_errors=True)
         all_base_cols = copied_cols + updated_cols + list(recomputed_base_cols)  # same order as select_list
         # update row builder with column information
         for i, col in enumerate(all_base_cols):
@@ -373,7 +414,8 @@ class Planner:
         recomputed_cols = {c for c in recomputed_cols if c.is_stored}
         recomputed_base_cols = {col for col in recomputed_cols if col.tbl == target}
         copied_cols = [
-            col for col in target.
+            col for col in target.cols_by_id.values()
+            if col.is_stored and not col in updated_cols and not col in recomputed_base_cols
         ]
         select_list: list[exprs.Expr] = [exprs.ColumnRef(col) for col in copied_cols]
         select_list.extend(exprs.ColumnRef(col) for col in updated_cols)
@@ -387,13 +429,12 @@ class Planner:
         # - SqlLookupNode to retrieve the existing rows
         # - RowUpdateNode to update the retrieved rows
         # - ExprEvalNode to evaluate the remaining output exprs
-        analyzer = Analyzer(tbl, select_list)
+        analyzer = Analyzer(FromClause(tbls=[tbl]), select_list)
         sql_exprs = list(exprs.Expr.list_subexprs(
             analyzer.all_exprs, filter=analyzer.sql_elements.contains, traverse_matches=False))
         row_builder = exprs.RowBuilder(analyzer.all_exprs, [], sql_exprs)
         analyzer.finalize(row_builder)
         sql_lookup_node = exec.SqlLookupNode(tbl, row_builder, sql_exprs, sa_key_cols, key_vals)
-        delete_where_clause = sql_lookup_node.where_clause
         col_vals = [{col: row[col].val for col in updated_cols} for row in batch]
         row_update_node = exec.RowUpdateNode(tbl, key_vals, len(rowids) > 0, col_vals, row_builder, sql_lookup_node)
         plan: exec.ExecNode = row_update_node
@@ -412,7 +453,8 @@ class Planner:
         plan.set_ctx(ctx)
         recomputed_user_cols = [c for c in recomputed_cols if c.name is not None]
         return (
-            plan, row_update_node,
+            plan, row_update_node, sql_lookup_node.where_clause_element, list(updated_cols) + recomputed_user_cols,
+            recomputed_user_cols
         )
 
     @classmethod
@@ -439,7 +481,7 @@ class Planner:
         target = view.tbl_version  # the one we need to update
         # retrieve all stored cols and all target exprs
         recomputed_cols = set(recompute_targets.copy())
-        copied_cols = [col for col in target.
+        copied_cols = [col for col in target.cols_by_id.values() if col.is_stored and not col in recomputed_cols]
         select_list: list[exprs.Expr] = [exprs.ColumnRef(col) for col in copied_cols]
         # resolve recomputed exprs to stored columns in the base
         recomputed_exprs = \
@@ -448,7 +490,8 @@ class Planner:
 
         # we need to retrieve the PK columns of the existing rows
         plan = cls.create_query_plan(
-            view, select_list, where_clause=target.predicate, ignore_errors=True,
+            FromClause(tbls=[view]), select_list, where_clause=target.predicate, ignore_errors=True,
+            exact_version_only=view.get_bases())
         for i, col in enumerate(copied_cols + list(recomputed_cols)):  # same order as select_list
             plan.row_builder.add_table_column(col, select_list[i].slot_idx)
         # TODO: avoid duplication with view_load_plan() logic (where does this belong?)
@@ -459,7 +502,7 @@ class Planner:
 
     @classmethod
     def create_view_load_plan(
-
+        cls, view: catalog.TableVersionPath, propagates_insert: bool = False
     ) -> tuple[exec.ExecNode, int]:
         """Creates a query plan for populating a view.
 
@@ -479,7 +522,7 @@ class Planner:
         # - we can ignore stored non-computed columns because they have a default value that is supplied directly by
         #   the store
         target = view.tbl_version  # the one we need to populate
-        stored_cols = [c for c in target.
+        stored_cols = [c for c in target.cols_by_id.values() if c.is_stored]
         # 2. for component views: iterator args
         iterator_args = [target.iterator_args] if target.iterator_args is not None else []
 
@@ -489,16 +532,16 @@ class Planner:
         # 1. materialize exprs computed from the base that are needed for stored view columns
         # 2. if it's an iterator view, expand the base rows into component rows
         # 3. materialize stored view columns that haven't been produced by step 1
-        base_output_exprs = [e for e in row_builder.default_eval_ctx.exprs if e.is_bound_by(view.base)]
+        base_output_exprs = [e for e in row_builder.default_eval_ctx.exprs if e.is_bound_by([view.base])]
         view_output_exprs = [
             e for e in row_builder.default_eval_ctx.target_exprs
-            if e.is_bound_by(view) and not e.is_bound_by(view.base)
+            if e.is_bound_by([view]) and not e.is_bound_by([view.base])
         ]
         # if we're propagating an insert, we only want to see those base rows that were created for the current version
-        base_analyzer = Analyzer(view, base_output_exprs, where_clause=target.predicate)
+        base_analyzer = Analyzer(FromClause(tbls=[view.base]), base_output_exprs, where_clause=target.predicate)
         base_eval_ctx = row_builder.create_eval_ctx(base_analyzer.all_exprs)
         plan = cls._create_query_plan(
-
+            row_builder=row_builder, analyzer=base_analyzer, eval_ctx=base_eval_ctx, with_pk=True,
             exact_version_only=view.get_bases() if propagates_insert else [])
         exec_ctx = plan.ctx
         if target.is_component_view():
@@ -513,6 +556,13 @@ class Planner:
         plan.set_ctx(exec_ctx)
         return plan, len(row_builder.default_eval_ctx.target_exprs)
 
+    @classmethod
+    def _verify_join_clauses(cls, analyzer: Analyzer) -> None:
+        """Verify that join clauses are expressible in SQL"""
+        for join_clause in analyzer.from_clause.join_clauses:
+            if join_clause.join_predicate is not None and analyzer.sql_elements.get(join_clause.join_predicate) is None:
+                raise excs.Error(f'Join predicate {join_clause.join_predicate} not expressible in SQL')
+
     @classmethod
     def _verify_ordering(cls, analyzer: Analyzer, verify_agg: bool) -> None:
         """Verify that the various ordering requirements don't conflict"""
@@ -551,9 +601,7 @@ class Planner:
         return s1 <= s2
 
     @classmethod
-    def _insert_prefetch_node(
-        cls, tbl_id: UUID, output_exprs: list[exprs.Expr], row_builder: exprs.RowBuilder, input: exec.ExecNode
-    ) -> exec.ExecNode:
+    def _insert_prefetch_node(cls, tbl_id: UUID, row_builder: exprs.RowBuilder, input: exec.ExecNode) -> exec.ExecNode:
         """Returns a CachePrefetchNode into the plan if needed, otherwise returns input"""
         # we prefetch external files for all media ColumnRefs, even those that aren't part of the dependencies
         # of output_exprs: if unstored iterator columns are present, we might need to materialize ColumnRefs that
@@ -570,7 +618,7 @@ class Planner:
 
     @classmethod
     def create_query_plan(
-        cls,
+        cls, from_clause: FromClause, select_list: Optional[list[exprs.Expr]] = None,
         where_clause: Optional[exprs.Expr] = None, group_by_clause: Optional[list[exprs.Expr]] = None,
         order_by_clause: Optional[list[tuple[exprs.Expr, bool]]] = None, limit: Optional[int] = None,
         ignore_errors: bool = False, exact_version_only: Optional[list[catalog.TableVersion]] = None
@@ -586,7 +634,7 @@ class Planner:
         if exact_version_only is None:
             exact_version_only = []
         analyzer = Analyzer(
-
+            from_clause, select_list, where_clause=where_clause, group_by_clause=group_by_clause,
             order_by_clause=order_by_clause)
         row_builder = exprs.RowBuilder(analyzer.all_exprs, [], [])
 
@@ -595,7 +643,7 @@ class Planner:
         # with_pk: for now, we always retrieve the PK, because we need it for the file cache
         eval_ctx = row_builder.create_eval_ctx(analyzer.select_list)
         plan = cls._create_query_plan(
-
+            row_builder=row_builder, analyzer=analyzer, eval_ctx=eval_ctx, limit=limit, with_pk=True,
             exact_version_only=exact_version_only)
         plan.ctx.ignore_errors = ignore_errors
         select_list.clear()
@@ -604,10 +652,9 @@ class Planner:
 
     @classmethod
     def _create_query_plan(
-
-
-
-        exact_version_only: Optional[list[catalog.TableVersion]] = None
+        cls, row_builder: exprs.RowBuilder, analyzer: Analyzer, eval_ctx: exprs.RowBuilder.EvalCtx,
+        limit: Optional[int] = None, with_pk: bool = False,
+        exact_version_only: Optional[list[catalog.TableVersion]] = None
     ) -> exec.ExecNode:
         """
         Create plan to materialize eval_ctx.
@@ -619,7 +666,6 @@ class Planner:
         """
         if exact_version_only is None:
             exact_version_only = []
-        assert isinstance(tbl, catalog.TableVersionPath)
         sql_elements = analyzer.sql_elements
         is_python_agg = (
             not sql_elements.contains_all(analyzer.agg_fn_calls)
@@ -627,17 +673,19 @@ class Planner:
         )
         ctx = exec.ExecContext(row_builder)
         cls._verify_ordering(analyzer, verify_agg=is_python_agg)
+        cls._verify_join_clauses(analyzer)
 
-        # materialized with SQL
+        # materialized with SQL table scans (ie, single-table SELECT statements):
         # - select list subexprs that aren't aggregates
-        # -
+        # - join clause subexprs
+        # - subexprs of Where clause conjuncts that can't be run in SQL
         # - all grouping exprs, if any aggregate function call can't be run in SQL (in that case, they all have to be
         #   run in Python)
         candidates = list(exprs.Expr.list_subexprs(
             analyzer.select_list,
             filter=lambda e: (
-
-
+                sql_elements.contains(e)
+                and not e._contains(cls=exprs.FunctionCall, filter=lambda e: bool(e.is_agg_fn_call))
             ),
             traverse_matches=False))
         if analyzer.filter is not None:
@@ -647,15 +695,44 @@ class Planner:
         candidates.extend(exprs.Expr.list_subexprs(
             analyzer.group_by_clause, filter=lambda e: sql_elements.contains(e), traverse_matches=False))
         # not isinstance(...): we don't want to materialize Literals via a Select
-
+        sql_exprs = exprs.ExprSet(e for e in candidates if not isinstance(e, exprs.Literal))
+
+        # create table scans; each scan produces subexprs of (sql_exprs + join clauses)
+        join_exprs = exprs.ExprSet(
+            join_clause.join_predicate
+            for join_clause in analyzer.from_clause.join_clauses
+            if join_clause.join_predicate is not None)
+        scan_target_exprs = sql_exprs | join_exprs
+        tbl_scan_plans: list[exec.SqlScanNode] = []
+        plan: exec.ExecNode
+        for tbl in analyzer.from_clause.tbls:
+            # materialize all subexprs of scan_target_exprs that are bound by tbl
+            tbl_scan_exprs = exprs.ExprSet(
+                exprs.Expr.list_subexprs(
+                    scan_target_exprs,
+                    filter=lambda e: e.is_bound_by([tbl]) and not isinstance(e, exprs.Literal),
+                    traverse_matches=False))
+            plan = exec.SqlScanNode(
+                tbl, row_builder, select_list=tbl_scan_exprs,
+                set_pk=with_pk, exact_version_only=exact_version_only)
+            tbl_scan_plans.append(plan)
+
+        if len(analyzer.from_clause.join_clauses) > 0:
+            plan = exec.SqlJoinNode(
+                row_builder, inputs=tbl_scan_plans, join_clauses=analyzer.from_clause.join_clauses,
+                select_list=sql_exprs)
+        else:
+            plan = tbl_scan_plans[0]
 
-
-
-
+        if analyzer.sql_where_clause is not None:
+            plan.set_where(analyzer.sql_where_clause)
+        if analyzer.filter is not None:
+            plan.set_py_filter(analyzer.filter)
         if len(analyzer.window_fn_calls) > 0:
             # we need to order the input for window functions
-            plan.
+            plan.set_order_by(analyzer.get_window_fn_ob_clause())
-
+
+        plan = cls._insert_prefetch_node(tbl.tbl_version.id, row_builder, plan)
 
         if analyzer.group_by_clause is not None:
             # we're doing grouping aggregation; the input of the AggregateNode are the grouping exprs plus the
@@ -663,9 +740,9 @@ class Planner:
             agg_input = exprs.ExprSet(analyzer.grouping_exprs.copy())
             for fn_call in analyzer.agg_fn_calls:
                 agg_input.update(fn_call.components)
-            if not
+            if not sql_exprs.issuperset(agg_input):
                 # we need an ExprEvalNode
-                plan = exec.ExprEvalNode(row_builder, agg_input,
+                plan = exec.ExprEvalNode(row_builder, agg_input, sql_exprs, input=plan)
 
             # batch size for aggregation input: this could be the entire table, so we need to divide it into
             # smaller batches; at the same time, we need to make the batches large enough to amortize the
@@ -689,16 +766,17 @@ class Planner:
             # we need an ExprEvalNode to evaluate the remaining output exprs
             plan = exec.ExprEvalNode(row_builder, eval_ctx.target_exprs, agg_output, input=plan)
         else:
-            if not exprs.ExprSet(
+            if not exprs.ExprSet(sql_exprs).issuperset(exprs.ExprSet(eval_ctx.target_exprs)):
                 # we need an ExprEvalNode to evaluate the remaining output exprs
-                plan = exec.ExprEvalNode(row_builder, eval_ctx.target_exprs,
+                plan = exec.ExprEvalNode(row_builder, eval_ctx.target_exprs, sql_exprs, input=plan)
             # we're returning everything to the user, so we might as well do it in a single batch
             ctx.batch_size = 0
 
-        sql_node = plan.get_sql_node()
-        assert sql_node is not None
         if len(analyzer.order_by_clause) > 0:
-
+            # we have the last SqlNode we created produce the ordering
+            sql_node = plan.get_node(exec.SqlNode)
+            assert sql_node is not None
+            sql_node.set_order_by(analyzer.order_by_clause)
 
         if limit is not None:
             plan.set_limit(limit)
@@ -708,7 +786,7 @@ class Planner:
 
     @classmethod
     def analyze(cls, tbl: catalog.TableVersionPath, where_clause: exprs.Expr) -> Analyzer:
-        return Analyzer(tbl, [], where_clause=where_clause)
+        return Analyzer(FromClause(tbls=[tbl]), [], where_clause=where_clause)
 
     @classmethod
     def create_add_column_plan(
@@ -721,9 +799,9 @@ class Planner:
         """
         assert isinstance(tbl, catalog.TableVersionPath)
         row_builder = exprs.RowBuilder(output_exprs=[], columns=[col], input_exprs=[])
-        analyzer = Analyzer(tbl, row_builder.default_eval_ctx.target_exprs)
+        analyzer = Analyzer(FromClause(tbls=[tbl]), row_builder.default_eval_ctx.target_exprs)
         plan = cls._create_query_plan(
-
+            row_builder=row_builder, analyzer=analyzer, eval_ctx=row_builder.default_eval_ctx, with_pk=True)
         plan.ctx.batch_size = 16
         plan.ctx.show_pbar = True
         plan.ctx.ignore_errors = True
```
pixeltable/store.py
CHANGED

```diff
@@ -159,7 +159,7 @@ class StoreBase:
     def count(self, conn: Optional[sql.engine.Connection] = None) -> int:
         """Return the number of rows visible in self.tbl_version"""
         stmt = (
-            sql.select(sql.func.count('*'))
+            sql.select(sql.func.count('*'))
             .select_from(self.sa_tbl)
             .where(self.v_min_col <= self.tbl_version.version)
             .where(self.v_max_col > self.tbl_version.version)
```
pixeltable/type_system.py
CHANGED
pixeltable/utils/arrow.py
CHANGED

```diff
@@ -3,14 +3,17 @@ from typing import Any, Iterator, Optional, Union
 
 import numpy as np
 import pyarrow as pa
+import datetime
 
 import pixeltable.type_system as ts
+from pixeltable.env import Env
+
+_tz_def = Env().get().default_time_zone
 
 _logger = logging.getLogger(__name__)
 
 _pa_to_pt: dict[pa.DataType, ts.ColumnType] = {
     pa.string(): ts.StringType(nullable=True),
-    pa.timestamp('us'): ts.TimestampType(nullable=True),
     pa.bool_(): ts.BoolType(nullable=True),
     pa.uint8(): ts.IntType(nullable=True),
     pa.int8(): ts.IntType(nullable=True),
@@ -23,7 +26,7 @@ _pa_to_pt: dict[pa.DataType, ts.ColumnType] = {
 
 _pt_to_pa: dict[type[ts.ColumnType], pa.DataType] = {
     ts.StringType: pa.string(),
-    ts.TimestampType: pa.timestamp('us'),  # postgres timestamp is microseconds
+    ts.TimestampType: pa.timestamp('us', tz=datetime.timezone.utc),  # postgres timestamp is microseconds
     ts.BoolType: pa.bool_(),
     ts.IntType: pa.int64(),
     ts.FloatType: pa.float32(),
@@ -39,7 +42,9 @@ def to_pixeltable_type(arrow_type: pa.DataType) -> Optional[ts.ColumnType]:
     """Convert a pyarrow DataType to a pixeltable ColumnType if one is defined.
     Returns None if no conversion is currently implemented.
     """
-    if arrow_type in _pa_to_pt:
+    if isinstance(arrow_type, pa.TimestampType):
+        return ts.TimestampType(nullable=True)
+    elif arrow_type in _pa_to_pt:
         return _pa_to_pt[arrow_type]
     elif isinstance(arrow_type, pa.FixedShapeTensorType):
         dtype = to_pixeltable_type(arrow_type.value_type)
```