PyPI - pixeltable - Versions diffs - 0.2.25__py3-none-any.whl → 0.3.0__py3-none-any.whl - Mend

pixeltable 0.2.25 (py3-none-any.whl) → 0.3.0 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of pixeltable might be problematic. Click here for more details.

Files changed (97) hide show

pixeltable/__init__.py +2 -2
pixeltable/__version__.py +2 -2
pixeltable/catalog/__init__.py +1 -1
pixeltable/catalog/dir.py +6 -0
pixeltable/catalog/globals.py +25 -0
pixeltable/catalog/named_function.py +4 -0
pixeltable/catalog/path_dict.py +37 -11
pixeltable/catalog/schema_object.py +6 -0
pixeltable/catalog/table.py +421 -231
pixeltable/catalog/table_version.py +22 -8
pixeltable/catalog/view.py +5 -7
pixeltable/dataframe.py +439 -105
pixeltable/env.py +19 -5
pixeltable/exec/__init__.py +1 -1
pixeltable/exec/exec_node.py +6 -7
pixeltable/exec/expr_eval_node.py +1 -1
pixeltable/exec/sql_node.py +92 -45
pixeltable/exprs/__init__.py +1 -0
pixeltable/exprs/arithmetic_expr.py +1 -1
pixeltable/exprs/array_slice.py +1 -1
pixeltable/exprs/column_property_ref.py +1 -1
pixeltable/exprs/column_ref.py +29 -2
pixeltable/exprs/comparison.py +1 -1
pixeltable/exprs/compound_predicate.py +1 -1
pixeltable/exprs/expr.py +12 -5
pixeltable/exprs/expr_set.py +8 -0
pixeltable/exprs/function_call.py +147 -39
pixeltable/exprs/in_predicate.py +1 -1
pixeltable/exprs/inline_expr.py +25 -5
pixeltable/exprs/is_null.py +1 -1
pixeltable/exprs/json_mapper.py +1 -1
pixeltable/exprs/json_path.py +1 -1
pixeltable/exprs/method_ref.py +1 -1
pixeltable/exprs/row_builder.py +1 -1
pixeltable/exprs/rowid_ref.py +1 -1
pixeltable/exprs/similarity_expr.py +14 -7
pixeltable/exprs/sql_element_cache.py +4 -0
pixeltable/exprs/type_cast.py +2 -2
pixeltable/exprs/variable.py +3 -0
pixeltable/func/__init__.py +5 -4
pixeltable/func/aggregate_function.py +151 -68
pixeltable/func/callable_function.py +48 -16
pixeltable/func/expr_template_function.py +64 -23
pixeltable/func/function.py +195 -27
pixeltable/func/function_registry.py +2 -1
pixeltable/func/query_template_function.py +51 -9
pixeltable/func/signature.py +64 -7
pixeltable/func/tools.py +153 -0
pixeltable/func/udf.py +57 -35
pixeltable/functions/__init__.py +2 -2
pixeltable/functions/anthropic.py +51 -4
pixeltable/functions/gemini.py +85 -0
pixeltable/functions/globals.py +54 -34
pixeltable/functions/huggingface.py +10 -28
pixeltable/functions/json.py +3 -8
pixeltable/functions/math.py +67 -0
pixeltable/functions/ollama.py +8 -8
pixeltable/functions/openai.py +51 -4
pixeltable/functions/timestamp.py +1 -1
pixeltable/functions/video.py +3 -9
pixeltable/functions/vision.py +1 -1
pixeltable/globals.py +354 -80
pixeltable/index/embedding_index.py +106 -34
pixeltable/io/__init__.py +1 -1
pixeltable/io/label_studio.py +1 -1
pixeltable/io/parquet.py +39 -19
pixeltable/iterators/document.py +12 -0
pixeltable/metadata/__init__.py +1 -1
pixeltable/metadata/converters/convert_16.py +2 -1
pixeltable/metadata/converters/convert_17.py +2 -1
pixeltable/metadata/converters/convert_22.py +17 -0
pixeltable/metadata/converters/convert_23.py +35 -0
pixeltable/metadata/converters/convert_24.py +56 -0
pixeltable/metadata/converters/convert_25.py +19 -0
pixeltable/metadata/converters/util.py +4 -2
pixeltable/metadata/notes.py +4 -0
pixeltable/metadata/schema.py +1 -0
pixeltable/plan.py +128 -50
pixeltable/store.py +1 -1
pixeltable/type_system.py +196 -54
pixeltable/utils/arrow.py +8 -3
pixeltable/utils/description_helper.py +89 -0
pixeltable/utils/documents.py +14 -0
{pixeltable-0.2.25.dist-info → pixeltable-0.3.0.dist-info}/METADATA +30 -20
pixeltable-0.3.0.dist-info/RECORD +155 -0
{pixeltable-0.2.25.dist-info → pixeltable-0.3.0.dist-info}/WHEEL +1 -1
pixeltable-0.3.0.dist-info/entry_points.txt +3 -0
pixeltable/tool/create_test_db_dump.py +0 -311
pixeltable/tool/create_test_video.py +0 -81
pixeltable/tool/doc_plugins/griffe.py +0 -50
pixeltable/tool/doc_plugins/mkdocstrings.py +0 -6
pixeltable/tool/doc_plugins/templates/material/udf.html.jinja +0 -135
pixeltable/tool/embed_udf.py +0 -9
pixeltable/tool/mypy_plugin.py +0 -55
pixeltable-0.2.25.dist-info/RECORD +0 -154
pixeltable-0.2.25.dist-info/entry_points.txt +0 -3
{pixeltable-0.2.25.dist-info → pixeltable-0.3.0.dist-info}/LICENSE +0 -0

pixeltable/dataframe.py CHANGED Viewed

@@ -2,13 +2,13 @@ from __future__ import annotations
 import builtins
 import copy
+import dataclasses
 import hashlib
 import json
 import logging
-import mimetypes
 import traceback
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Callable, Hashable, Iterator, Optional, Sequence, Union
+from typing import TYPE_CHECKING, Any, Callable, Hashable, Iterator, Optional, Sequence, Union, Literal
 import pandas as pd
 import pandas.io.formats.style
@@ -17,14 +17,15 @@ import sqlalchemy as sql
 import pixeltable.catalog as catalog
 import pixeltable.exceptions as excs
 import pixeltable.exprs as exprs
+import pixeltable.type_system as ts
 from pixeltable import exec
+from pixeltable import plan
 from pixeltable.catalog import is_valid_identifier
 from pixeltable.catalog.globals import UpdateStatus
 from pixeltable.env import Env
-from pixeltable.plan import Planner
 from pixeltable.type_system import ColumnType
+from pixeltable.utils.description_helper import DescriptionHelper
 from pixeltable.utils.formatter import Formatter
-from pixeltable.utils.http_server import get_file_uri
 if TYPE_CHECKING:
     import torch
@@ -131,9 +132,19 @@ class DataFrameResultSet:
 class DataFrame:
+    _from_clause: plan.FromClause
+    _select_list_exprs: list[exprs.Expr]
+    _schema: dict[str, ts.ColumnType]
+    select_list: Optional[list[tuple[exprs.Expr, Optional[str]]]]
+    where_clause: Optional[exprs.Expr]
+    group_by_clause: Optional[list[exprs.Expr]]
+    grouping_tbl: Optional[catalog.TableVersion]
+    order_by_clause: Optional[list[tuple[exprs.Expr, bool]]]
+    limit_val: Optional[int]
     def __init__(
         self,
-        tbl: catalog.TableVersionPath,
+        from_clause: Optional[plan.FromClause] = None,
         select_list: Optional[list[tuple[exprs.Expr, Optional[str]]]] = None,
         where_clause: Optional[exprs.Expr] = None,
         group_by_clause: Optional[list[exprs.Expr]] = None,
@@ -141,14 +152,11 @@ class DataFrame:
         order_by_clause: Optional[list[tuple[exprs.Expr, bool]]] = None,  # list[(expr, asc)]
         limit: Optional[int] = None,
     ):
-        self.tbl = tbl
+        self._from_clause = from_clause
-        # select list logic
-        DataFrame._select_list_check_rep(select_list)  # check select list without expansion
         # exprs contain execution state and therefore cannot be shared
         select_list = copy.deepcopy(select_list)
-        select_list_exprs, column_names = DataFrame._normalize_select_list(tbl, select_list)
-        DataFrame._select_list_check_rep(list(zip(select_list_exprs, column_names)))
+        select_list_exprs, column_names = DataFrame._normalize_select_list(self._from_clause.tbls, select_list)
         # check select list after expansion to catch early
         # the following two lists are always non empty, even if select list is None.
         assert len(column_names) == len(select_list_exprs)
@@ -163,28 +171,10 @@ class DataFrame:
         self.order_by_clause = copy.deepcopy(order_by_clause)
         self.limit_val = limit
-    @classmethod
-    def _select_list_check_rep(
-        cls,
-        select_list: Optional[list[tuple[exprs.Expr, Optional[str]]]],
-    ) -> None:
-        """Validate basic select list types."""
-        if select_list is None:  # basic check for valid select list
-            return
-        assert len(select_list) > 0
-        for ent in select_list:
-            assert isinstance(ent, tuple)
-            assert len(ent) == 2
-            assert isinstance(ent[0], exprs.Expr)
-            assert ent[1] is None or isinstance(ent[1], str)
-            if isinstance(ent[1], str):
-                assert is_valid_identifier(ent[1])
     @classmethod
     def _normalize_select_list(
         cls,
-        tbl: catalog.TableVersionPath,
+        tbls: list[catalog.TableVersionPath],
         select_list: Optional[list[tuple[exprs.Expr, Optional[str]]]],
     ) -> tuple[list[exprs.Expr], list[str]]:
         """
@@ -193,7 +183,7 @@ class DataFrame:
             a pair composed of the list of expressions and the list of corresponding names
         """
         if select_list is None:
-            select_list = [(exprs.ColumnRef(col), None) for col in tbl.columns()]
+            select_list = [(exprs.ColumnRef(col), None) for tbl in tbls for col in tbl.columns()]
         out_exprs: list[exprs.Expr] = []
         out_names: list[str] = []  # keep track of order
@@ -222,6 +212,11 @@ class DataFrame:
         assert set(out_names) == seen_out_names
         return out_exprs, out_names
+    @property
+    def _first_tbl(self) -> catalog.TableVersionPath:
+        assert len(self._from_clause.tbls) == 1
+        return self._from_clause.tbls[0]
     def _vars(self) -> dict[str, exprs.Variable]:
         """
         Return a dict mapping variable name to Variable for all Variables contained in any component of the DataFrame
@@ -280,16 +275,16 @@ class DataFrame:
             assert self.group_by_clause is None
             num_rowid_cols = len(self.grouping_tbl.store_tbl.rowid_columns())
             # the grouping table must be a base of self.tbl
-            assert num_rowid_cols <= len(self.tbl.tbl_version.store_tbl.rowid_columns())
-            group_by_clause = [exprs.RowidRef(self.tbl.tbl_version, idx) for idx in range(num_rowid_cols)]
+            assert num_rowid_cols <= len(self._first_tbl.tbl_version.store_tbl.rowid_columns())
+            group_by_clause = [exprs.RowidRef(self._first_tbl.tbl_version, idx) for idx in range(num_rowid_cols)]
         elif self.group_by_clause is not None:
             group_by_clause = self.group_by_clause
         for item in self._select_list_exprs:
             item.bind_rel_paths(None)
-        return Planner.create_query_plan(
-            self.tbl,
+        return plan.Planner.create_query_plan(
+            self._from_clause,
             self._select_list_exprs,
             where_clause=self.where_clause,
             group_by_clause=group_by_clause,
@@ -297,23 +292,57 @@ class DataFrame:
             limit=self.limit_val
         )
+    def _has_joins(self) -> bool:
+        return len(self._from_clause.join_clauses) > 0
     def show(self, n: int = 20) -> DataFrameResultSet:
         assert n is not None
         return self.limit(n).collect()
     def head(self, n: int = 10) -> DataFrameResultSet:
+        """Return the first n rows of the DataFrame, in insertion order of the underlying Table.
+        head() is not supported for joins.
+        Args:
+            n: Number of rows to select. Default is 10.
+        Returns:
+            A DataFrameResultSet with the first n rows of the DataFrame.
+        Raises:
+            Error: If the DataFrame is the result of a join or
+                if the DataFrame has an order_by clause.
+        """
         if self.order_by_clause is not None:
             raise excs.Error(f'head() cannot be used with order_by()')
-        num_rowid_cols = len(self.tbl.tbl_version.store_tbl.rowid_columns())
-        order_by_clause = [exprs.RowidRef(self.tbl.tbl_version, idx) for idx in range(num_rowid_cols)]
+        if self._has_joins():
+            raise excs.Error(f'head() not supported for joins')
+        num_rowid_cols = len(self._first_tbl.tbl_version.store_tbl.rowid_columns())
+        order_by_clause = [exprs.RowidRef(self._first_tbl.tbl_version, idx) for idx in range(num_rowid_cols)]
         return self.order_by(*order_by_clause, asc=True).limit(n).collect()
     def tail(self, n: int = 10) -> DataFrameResultSet:
+        """Return the last n rows of the DataFrame, in insertion order of the underlying Table.
+        tail() is not supported for joins.
+        Args:
+            n: Number of rows to select. Default is 10.
+        Returns:
+            A DataFrameResultSet with the last n rows of the DataFrame.
+        Raises:
+            Error: If the DataFrame is the result of a join or
+                if the DataFrame has an order_by clause.
+        """
         if self.order_by_clause is not None:
             raise excs.Error(f'tail() cannot be used with order_by()')
-        num_rowid_cols = len(self.tbl.tbl_version.store_tbl.rowid_columns())
-        order_by_clause = [exprs.RowidRef(self.tbl.tbl_version, idx) for idx in range(num_rowid_cols)]
+        if self._has_joins():
+            raise excs.Error(f'tail() not supported for joins')
+        num_rowid_cols = len(self._first_tbl.tbl_version.store_tbl.rowid_columns())
+        order_by_clause = [exprs.RowidRef(self._first_tbl.tbl_version, idx) for idx in range(num_rowid_cols)]
         result = self.order_by(*order_by_clause, asc=False).limit(n).collect()
         result._reverse()
         return result
@@ -359,7 +388,7 @@ class DataFrame:
             ]
         return DataFrame(
-            self.tbl, select_list=select_list, where_clause=where_clause,
+            from_clause=self._from_clause, select_list=select_list, where_clause=where_clause,
             group_by_clause=group_by_clause, grouping_tbl=self.grouping_tbl,
             order_by_clause=order_by_clause, limit=self.limit_val)
@@ -393,30 +422,49 @@ class DataFrame:
         return DataFrameResultSet(list(self._output_row_iterator(conn)), self.schema)
     def count(self) -> int:
+        """Return the number of rows in the DataFrame.
+        Returns:
+            The number of rows in the DataFrame.
+        """
         from pixeltable.plan import Planner
-        stmt = Planner.create_count_stmt(self.tbl, self.where_clause)
+        stmt = Planner.create_count_stmt(self._first_tbl, self.where_clause)
         with Env.get().engine.connect() as conn:
             result: int = conn.execute(stmt).scalar_one()
             assert isinstance(result, int)
             return result
-    def _description(self) -> pd.DataFrame:
-        """see DataFrame.describe()"""
+    def _descriptors(self) -> DescriptionHelper:
+        helper = DescriptionHelper()
+        helper.append(self._col_descriptor())
+        qd = self._query_descriptor()
+        if not qd.empty:
+            helper.append(qd, show_index=True, show_header=False)
+        return helper
+    def _col_descriptor(self) -> pd.DataFrame:
+        return pd.DataFrame([
+            {
+                'Name': name,
+                'Type': expr.col_type._to_str(as_schema=True),
+                'Expression': expr.display_str(inline=False),
+            }
+            for name, expr in zip(self.schema.keys(), self._select_list_exprs)
+        ])
+    def _query_descriptor(self) -> pd.DataFrame:
         heading_vals: list[str] = []
         info_vals: list[str] = []
-        if self.select_list is not None:
-            assert len(self.select_list) > 0
-            heading_vals.append('Select')
-            heading_vals.extend([''] * (len(self.select_list) - 1))
-            info_vals.extend(self.schema.keys())
+        heading_vals.append('From')
+        info_vals.extend(tbl.tbl_name() for tbl in self._from_clause.tbls)
         if self.where_clause is not None:
             heading_vals.append('Where')
             info_vals.append(self.where_clause.display_str(inline=False))
         if self.group_by_clause is not None:
             heading_vals.append('Group By')
             heading_vals.extend([''] * (len(self.group_by_clause) - 1))
-            info_vals.extend([e.display_str(inline=False) for e in self.group_by_clause])
+            info_vals.extend(e.display_str(inline=False) for e in self.group_by_clause)
         if self.order_by_clause is not None:
             heading_vals.append('Order By')
             heading_vals.extend([''] * (len(self.order_by_clause) - 1))
@@ -426,22 +474,8 @@ class DataFrame:
         if self.limit_val is not None:
             heading_vals.append('Limit')
             info_vals.append(str(self.limit_val))
-        assert len(heading_vals) > 0
-        assert len(info_vals) > 0
         assert len(heading_vals) == len(info_vals)
-        return pd.DataFrame({'Heading': heading_vals, 'Info': info_vals})
-    def _description_html(self) -> pandas.io.formats.style.Styler:
-        """Return the description in an ipython-friendly manner."""
-        pd_df = self._description()
-        # white-space: pre-wrap: print \n as newline
-        # th: center-align headings
-        return (
-            pd_df.style.set_properties(None, **{'white-space': 'pre-wrap', 'text-align': 'left'})
-            .set_table_styles([dict(selector='th', props=[('text-align', 'center')])])
-            .hide(axis='index')
-            .hide(axis='columns')
-        )
+        return pd.DataFrame(info_vals, index=heading_vals)
     def describe(self) -> None:
         """
@@ -451,17 +485,47 @@ class DataFrame:
         """
         if getattr(builtins, '__IPYTHON__', False):
             from IPython.display import display
-            display(self._description_html())
+            display(self._repr_html_())
         else:
-            print(self.__repr__())
+            print(repr(self))
     def __repr__(self) -> str:
-        return self._description().to_string(header=False, index=False)
+        return self._descriptors().to_string()
     def _repr_html_(self) -> str:
-        return self._description_html()._repr_html_()  # type: ignore[attr-defined]
+        return self._descriptors().to_html()
     def select(self, *items: Any, **named_items: Any) -> DataFrame:
+        """ Select columns or expressions from the DataFrame.
+        Args:
+            items: expressions to be selected
+            named_items: named expressions to be selected
+        Returns:
+            A new DataFrame with the specified select list.
+        Raises:
+            Error: If the select list is already specified,
+                or if any of the specified expressions are invalid,
+                or refer to tables not in the DataFrame.
+        Examples:
+            Given the DataFrame person from a table t with all its columns and rows:
+            >>> person = t.select()
+            Select the columns 'name' and 'age' (referenced in table t) from the DataFrame person:
+            >>> df = person.select(t.name, t.age)
+            Select the columns 'name' (referenced in table t) from the DataFrame person,
+            and a named column 'is_adult' from the expression `age >= 18` where 'age' is
+            another column in table t:
+            >>> df = person.select(t.name, is_adult=(t.age >= 18))
+        """
         if self.select_list is not None:
             raise excs.Error(f'Select list already specified')
         for name, _ in named_items.items():
@@ -472,7 +536,7 @@ class DataFrame:
             return self
         # analyze select list; wrap literals with the corresponding expressions
-        select_list = []
+        select_list: list[tuple[exprs.Expr, Optional[str]]] = []
         for raw_expr, name in base_list:
             if isinstance(raw_expr, exprs.Expr):
                 select_list.append((raw_expr, name))
@@ -485,12 +549,14 @@ class DataFrame:
             expr = select_list[-1][0]
             if expr.col_type.is_invalid_type():
                 raise excs.Error(f'Invalid type: {raw_expr}')
-            # TODO: check that ColumnRefs in expr refer to self.tbl
+            if not expr.is_bound_by(self._from_clause.tbls):
+                raise excs.Error(
+                    f"Expression '{expr}' cannot be evaluated in the context of this query's tables "
+                    f"({','.join(tbl.tbl_name() for tbl in self._from_clause.tbls)})")
-        # check user provided names do not conflict among themselves
-        # or with auto-generated ones
+        # check user provided names do not conflict among themselves or with auto-generated ones
         seen: set[str] = set()
-        _, names = DataFrame._normalize_select_list(self.tbl, select_list)
+        _, names = DataFrame._normalize_select_list(self._from_clause.tbls, select_list)
         for name in names:
             if name in seen:
                 repeated_names = [j for j, x in enumerate(names) if x == name]
@@ -499,7 +565,7 @@ class DataFrame:
             seen.add(name)
         return DataFrame(
-            self.tbl,
+            from_clause=self._from_clause,
             select_list=select_list,
             where_clause=self.where_clause,
             group_by_clause=self.group_by_clause,
@@ -509,12 +575,35 @@ class DataFrame:
         )
     def where(self, pred: exprs.Expr) -> DataFrame:
+        """Filter rows based on a predicate.
+        Args:
+            pred: the predicate to filter rows
+        Returns:
+            A new DataFrame with the specified predicates replacing the where-clause.
+        Raises:
+            Error: If the predicate is not a Pixeltable expression,
+                or if it does not return a boolean value,
+                or refers to tables not in the DataFrame.
+        Examples:
+            Given the DataFrame person from a table t with all its columns and rows:
+            >>> person = t.select()
+            Filter the above DataFrame person to only include rows where the column 'age'
+            (referenced in table t) is greater than 30:
+            >>> df = person.where(t.age > 30)
+        """
         if not isinstance(pred, exprs.Expr):
             raise excs.Error(f'Where() requires a Pixeltable expression, but instead got {type(pred)}')
         if not pred.col_type.is_bool_type():
             raise excs.Error(f'Where(): expression needs to return bool, but instead returns {pred.col_type}')
         return DataFrame(
-            self.tbl,
+            from_clause=self._from_clause,
             select_list=self.select_list,
             where_clause=pred,
             group_by_clause=self.group_by_clause,
@@ -523,11 +612,181 @@ class DataFrame:
             limit=self.limit_val,
         )
+    def _create_join_predicate(
+            self, other: catalog.TableVersionPath, on: Union[exprs.Expr, Sequence[exprs.ColumnRef]]
+    ) -> exprs.Expr:
+        """Verifies user-specified 'on' argument and converts it into a join predicate."""
+        col_refs: list[exprs.ColumnRef] = []
+        joined_tbls = self._from_clause.tbls + [other]
+        if isinstance(on, exprs.ColumnRef):
+            on = [on]
+        elif isinstance(on, exprs.Expr):
+            if not on.is_bound_by(joined_tbls):
+                raise excs.Error(f"'on': expression cannot be evaluated in the context of the joined tables: {on}")
+            if not on.col_type.is_bool_type():
+                raise excs.Error(f"'on': boolean expression expected, but got {on.col_type}: {on}")
+            return on
+        else:
+            if not isinstance(on, Sequence) or len(on) == 0:
+                raise excs.Error(
+                    f"'on': must be a sequence of column references or a boolean expression")
+        assert isinstance(on, Sequence)
+        for col_ref in on:
+            if not isinstance(col_ref, exprs.ColumnRef):
+                raise excs.Error(
+                    f"'on': must be a sequence of column references or a boolean expression")
+            if not col_ref.is_bound_by(joined_tbls):
+                raise excs.Error(f"'on': expression cannot be evaluated in the context of the joined tables: {col_ref}")
+            col_refs.append(col_ref)
+        predicates: list[exprs.Expr] = []
+        # try to turn ColumnRefs into equality predicates
+        assert len(col_refs) > 0 and len(joined_tbls) >= 2
+        for col_ref in col_refs:
+            # identify the referenced column by name in 'other'
+            rhs_col = other.get_column(col_ref.col.name, include_bases=True)
+            if rhs_col is None:
+                raise excs.Error(f"'on': column {col_ref.col.name!r} not found in joined table")
+            rhs_col_ref = exprs.ColumnRef(rhs_col)
+            lhs_col_ref: Optional[exprs.ColumnRef] = None
+            if any(tbl.has_column(col_ref.col, include_bases=True) for tbl in self._from_clause.tbls):
+                # col_ref comes from the existing from_clause, we use that directly
+                lhs_col_ref = col_ref
+            else:
+                # col_ref comes from other, we need to look for a match in the existing from_clause by name
+                for tbl in self._from_clause.tbls:
+                    col = tbl.get_column(col_ref.col.name, include_bases=True)
+                    if col is None:
+                        continue
+                    if lhs_col_ref is not None:
+                        raise excs.Error(f"'on': ambiguous column reference: {col_ref.col.name!r}")
+                    lhs_col_ref = exprs.ColumnRef(col)
+                if lhs_col_ref is None:
+                    tbl_names = [tbl.tbl_name() for tbl in self._from_clause.tbls]
+                    raise excs.Error(
+                        f"'on': column {col_ref.col.name!r} not found in any of: {' '.join(tbl_names)}")
+            pred = exprs.Comparison(exprs.ComparisonOperator.EQ, lhs_col_ref, rhs_col_ref)
+            predicates.append(pred)
+        assert len(predicates) > 0
+        if len(predicates) == 1:
+            return predicates[0]
+        else:
+            return exprs.CompoundPredicate(operator=exprs.LogicalOperator.AND, operands=predicates)
+    def join(
+        self, other: catalog.Table,  on: Optional[Union[exprs.Expr, Sequence[exprs.ColumnRef]]] = None,
+        how: plan.JoinType.LiteralType = 'inner'
+    ) -> DataFrame:
+        """
+        Join this DataFrame with a table.
+        Args:
+            other: the table to join with
+            on: the join condition, which can be either a) references to one or more columns or b) a boolean
+                expression.
+                - column references: implies an equality predicate that matches columns in both this
+                    DataFrame and `other` by name.
+                    - column in `other`: A column with that same name must be present in this DataFrame, and **it must
+                        be unique** (otherwise the join is ambiguous).
+                    - column in this DataFrame: A column with that same name must be present in `other`.
+                - boolean expression: The expressions must be valid in the context of the joined tables.
+            how: the type of join to perform.
+                - `'inner'`: only keep rows that have a match in both
+                - `'left'`: keep all rows from this DataFrame and only matching rows from the other table
+                - `'right'`: keep all rows from the other table and only matching rows from this DataFrame
+                - `'full_outer'`: keep all rows from both this DataFrame and the other table
+                - `'cross'`: Cartesian product; no `on` condition allowed
+        Returns:
+            A new DataFrame.
+        Examples:
+            Perform an inner join between t1 and t2 on the column id:
+            >>> join1 = t1.join(t2, on=t2.id)
+            Perform a left outer join of join1 with t3, also on id (note that we can't specify `on=t3.id` here,
+            because that would be ambiguous, since both t1 and t2 have a column named id):
+            >>> join2 = join1.join(t3, on=t2.id, how='left')
+            Do the same, but now with an explicit join predicate:
+            >>> join2 = join1.join(t3, on=t2.id == t3.id, how='left')
+            Join t with d, which has a composite primary key (columns pk1 and pk2, with corresponding foreign
+            key columns d1 and d2 in t):
+            >>> df = t.join(d, on=(t.d1 == d.pk1) & (t.d2 == d.pk2), how='left')
+        """
+        join_pred: Optional[exprs.Expr]
+        if how == 'cross':
+            if on is not None:
+                raise excs.Error(f"'on' not allowed for cross join")
+            join_pred = None
+        else:
+            if on is None:
+                raise excs.Error(f"how={how!r} requires 'on'")
+            join_pred = self._create_join_predicate(other._tbl_version_path, on)
+        join_clause = plan.JoinClause(join_type=plan.JoinType.validated(how, "'how'"), join_predicate=join_pred)
+        from_clause = plan.FromClause(
+            tbls=[*self._from_clause.tbls, other._tbl_version_path],
+            join_clauses=[*self._from_clause.join_clauses, join_clause])
+        return DataFrame(
+            from_clause=from_clause,
+            select_list=self.select_list, where_clause=self.where_clause,
+            group_by_clause=self.group_by_clause, grouping_tbl=self.grouping_tbl,
+            order_by_clause=self.order_by_clause, limit=self.limit_val,
+        )
     def group_by(self, *grouping_items: Any) -> DataFrame:
-        """Add a group-by clause to this DataFrame.
+        """ Add a group-by clause to this DataFrame.
         Variants:
         - group_by(<base table>): group a component view by their respective base table rows
         - group_by(<expr>, ...): group by the given expressions
+        Note that grouping is applied to the rows and only takes effect when
+        used with an aggregation function such as sum(), count(), etc.
+        Args:
+            grouping_items: expressions to group by
+        Returns:
+            A new DataFrame with the specified group-by clause.
+        Raises:
+            Error: If the group-by clause is already specified,
+                or if the specified expression is invalid,
+                or refer to tables not in the DataFrame,
+                or if the DataFrame is a result of a join.
+        Examples:
+            Given the DataFrame book from a table t with all its columns and rows:
+            >>> book = t.select()
+            Group the above DataFrame book by the 'genre' column (referenced in table t):
+            >>> df = book.group_by(t.genre)
+            Use the above DataFrame df grouped by genre to count the number of
+            books for each 'genre':
+            >>> df = book.group_by(t.genre).select(t.genre, count=count(t.genre)).show()
+        Use the above DataFrame df grouped by genre to compute the total price of
+        books for each 'genre':
+            >>> df = book.group_by(t.genre).select(t.genre, total=sum(t.price)).show()
         """
         if self.group_by_clause is not None:
             raise excs.Error(f'Group-by already specified')
@@ -537,10 +796,12 @@ class DataFrame:
             if isinstance(item, catalog.Table):
                 if len(grouping_items) > 1:
                     raise excs.Error(f'group_by(): only one table can be specified')
+                if len(self._from_clause.tbls) > 1:
+                    raise excs.Error(f'group_by() with Table not supported for joins')
                 # we need to make sure that the grouping table is a base of self.tbl
-                base = self.tbl.find_tbl_version(item._tbl_version_path.tbl_id())
-                if base is None or base.id == self.tbl.tbl_id():
-                    raise excs.Error(f'group_by(): {item._name} is not a base table of {self.tbl.tbl_name()}')
+                base = self._first_tbl.find_tbl_version(item._tbl_version_path.tbl_id())
+                if base is None or base.id == self._first_tbl.tbl_id():
+                    raise excs.Error(f'group_by(): {item._name} is not a base table of {self._first_tbl.tbl_name()}')
                 grouping_tbl = item._tbl_version_path.tbl_version
                 break
             if not isinstance(item, exprs.Expr):
@@ -548,7 +809,7 @@ class DataFrame:
         if grouping_tbl is None:
             group_by_clause = list(grouping_items)
         return DataFrame(
-            self.tbl,
+            from_clause=self._from_clause,
             select_list=self.select_list,
             where_clause=self.where_clause,
             group_by_clause=group_by_clause,
@@ -558,13 +819,42 @@ class DataFrame:
         )
     def order_by(self, *expr_list: exprs.Expr, asc: bool = True) -> DataFrame:
+        """ Add an order-by clause to this DataFrame.
+        Args:
+            expr_list: expressions to order by
+            asc: whether to order in ascending order (True) or descending order (False).
+                Default is True.
+        Returns:
+            A new DataFrame with the specified order-by clause.
+        Raises:
+            Error: If the order-by clause is already specified,
+                or if the specified expression is invalid,
+                or refers to tables not in the DataFrame.
+        Examples:
+            Given the DataFrame book from a table t with all its columns and rows:
+            >>> book = t.select()
+            Order the above DataFrame book by two columns (price, pages) in descending order:
+            >>> df = book.order_by(t.price, t.pages, asc=False)
+            Order the above DataFrame book by price in descending order, but order the pages
+            in ascending order:
+            >>> df = book.order_by(t.price, asc=False).order_by(t.pages)
+        """
         for e in expr_list:
             if not isinstance(e, exprs.Expr):
                 raise excs.Error(f'Invalid expression in order_by(): {e}')
         order_by_clause = self.order_by_clause if self.order_by_clause is not None else []
         order_by_clause.extend([(e.copy(), asc) for e in expr_list])
         return DataFrame(
-            self.tbl,
+            from_clause=self._from_clause,
             select_list=self.select_list,
             where_clause=self.where_clause,
             group_by_clause=self.group_by_clause,
@@ -574,10 +864,18 @@ class DataFrame:
         )
     def limit(self, n: int) -> DataFrame:
+        """ Limit the number of rows in the DataFrame.
+        Args:
+            n: Number of rows to select.
+        Returns:
+            A new DataFrame with the specified row limit.
+        """
         # TODO: allow n to be a Variable that can be substituted in bind()
         assert n is not None and isinstance(n, int)
         return DataFrame(
-            self.tbl,
+            from_clause=self._from_clause,
             select_list=self.select_list,
             where_clause=self.where_clause,
             group_by_clause=self.group_by_clause,
@@ -587,17 +885,58 @@ class DataFrame:
         )
     def update(self, value_spec: dict[str, Any], cascade: bool = True) -> UpdateStatus:
+        """ Update rows in the underlying table of the DataFrame.
+        Update rows in the table with the specified value_spec.
+        Args:
+            value_spec: a dict of column names to update and the new value to update it to.
+            cascade: if True, also update all computed columns that transitively depend
+                    on the updated columns, including within views. Default is True.
+        Returns:
+            UpdateStatus: the status of the update operation.
+        Example:
+            Given the DataFrame person from a table t with all its columns and rows:
+            >>> person = t.select()
+            Via the above DataFrame person, update the column 'city' to 'Oakland' and 'state' to 'CA' in the table t:
+            >>> df = person.update({'city': 'Oakland', 'state': 'CA'})
+            Via the above DataFrame person, update the column 'age' to 30 for any rows where 'year' is 2014 in the table t:
+            >>> df = person.where(t.year == 2014).update({'age': 30})
+        """
         self._validate_mutable('update')
-        return self.tbl.tbl_version.update(value_spec, where=self.where_clause, cascade=cascade)
+        return self._first_tbl.tbl_version.update(value_spec, where=self.where_clause, cascade=cascade)
     def delete(self) -> UpdateStatus:
+        """ Delete rows from the underlying table of the DataFrame.
+        The delete operation is only allowed for DataFrames on base tables.
+        Returns:
+            UpdateStatus: the status of the delete operation.
+        Example:
+            Given the DataFrame person from a table t with all its columns and rows:
+            >>> person = t.select()
+            Via the above DataFrame person, delete all rows from the table t where the column 'age' is less than 18:
+            >>> df = person.where(t.age < 18).delete()
+        """
         self._validate_mutable('delete')
-        if not self.tbl.is_insertable():
+        if not self._first_tbl.is_insertable():
             raise excs.Error(f'Cannot delete from view')
-        return self.tbl.tbl_version.delete(where=self.where_clause)
+        return self._first_tbl.tbl_version.delete(where=self.where_clause)
     def _validate_mutable(self, op_name: str) -> None:
-        """Tests whether this `DataFrame` can be mutated (such as by an update operation)."""
+        """Tests whether this DataFrame can be mutated (such as by an update operation)."""
         if self.group_by_clause is not None or self.grouping_tbl is not None:
             raise excs.Error(f'Cannot use `{op_name}` after `group_by`')
         if self.order_by_clause is not None:
@@ -607,27 +946,17 @@ class DataFrame:
         if self.limit_val is not None:
             raise excs.Error(f'Cannot use `{op_name}` after `limit`')
-    def __getitem__(self, index: Union[exprs.Expr, Sequence[exprs.Expr]]) -> DataFrame:
-        """
-        Allowed:
-        - [list[Expr]]/[tuple[Expr]]: setting the select list
-        - [Expr]: setting a single-col select list
-        """
-        if isinstance(index, exprs.Expr):
-            return self.select(index)
-        if isinstance(index, Sequence):
-            return self.select(*index)
-        raise TypeError(f'Invalid index type: {type(index)}')
     def as_dict(self) -> dict[str, Any]:
         """
         Returns:
             Dictionary representing this dataframe.
         """
-        tbl_versions = self.tbl.get_tbl_versions()
         d = {
             '_classname': 'DataFrame',
-            'tbl': self.tbl.as_dict(),
+            'from_clause': {
+                'tbls': [tbl.as_dict() for tbl in self._from_clause.tbls],
+                'join_clauses': [dataclasses.asdict(clause) for clause in self._from_clause.join_clauses]
+            },
             'select_list':
                 [(e.as_dict(), name) for (e, name) in self.select_list] if self.select_list is not None else None,
             'where_clause': self.where_clause.as_dict() if self.where_clause is not None else None,
@@ -642,7 +971,9 @@ class DataFrame:
     @classmethod
     def from_dict(cls, d: dict[str, Any]) -> 'DataFrame':
-        tbl = catalog.TableVersionPath.from_dict(d['tbl'])
+        tbls = [catalog.TableVersionPath.from_dict(tbl_dict) for tbl_dict in d['from_clause']['tbls']]
+        join_clauses = [plan.JoinClause(**clause_dict) for clause_dict in d['from_clause']['join_clauses']]
+        from_clause = plan.FromClause(tbls=tbls, join_clauses=join_clauses)
         select_list = [(exprs.Expr.from_dict(e), name) for e, name in d['select_list']] \
             if d['select_list'] is not None else None
         where_clause = exprs.Expr.from_dict(d['where_clause']) \
@@ -655,15 +986,18 @@ class DataFrame:
             if d['order_by_clause'] is not None else None
         limit_val = d['limit_val']
         return DataFrame(
-            tbl, select_list=select_list, where_clause=where_clause, group_by_clause=group_by_clause,
-            grouping_tbl=grouping_tbl, order_by_clause=order_by_clause, limit=limit_val)
+            from_clause=from_clause, select_list=select_list, where_clause=where_clause,
+            group_by_clause=group_by_clause, grouping_tbl=grouping_tbl, order_by_clause=order_by_clause,
+            limit=limit_val)
     def _hash_result_set(self) -> str:
         """Return a hash that changes when the result set changes."""
         d = self.as_dict()
         # add list of referenced table versions (the actual versions, not the effective ones) in order to force cache
         # invalidation when any of the referenced tables changes
-        d['tbl_versions'] = [tbl_version.version for tbl_version in self.tbl.get_tbl_versions()]
+        d['tbl_versions'] = [
+            tbl_version.version for tbl in self._from_clause.tbls for tbl_version in tbl.get_tbl_versions()
+        ]
         summary_string = json.dumps(d)
         return hashlib.sha256(summary_string.encode()).hexdigest()
@@ -732,7 +1066,7 @@ class DataFrame:
         Env.get().require_package('torch')
         Env.get().require_package('torchvision')
-        from pixeltable.io.parquet import save_parquet
+        from pixeltable.io import export_parquet
         from pixeltable.utils.pytorch import PixeltablePytorchDataset
         cache_key = self._hash_result_set()
@@ -741,6 +1075,6 @@ class DataFrame:
         if dest_path.exists():  # fast path: use cache
             assert dest_path.is_dir()
         else:
-            save_parquet(self, dest_path)
+            export_parquet(self, dest_path, inline_images=True)
         return PixeltablePytorchDataset(path=dest_path, image_format=image_format)

pixeltable 0.2.25__py3-none-any.whl → 0.3.0__py3-none-any.whl

Potentially problematic release.

pixeltable 0.2.25py3-none-any.whl → 0.3.0py3-none-any.whl