pixeltable 0.1.0__py3-none-any.whl → 0.2.4__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of pixeltable might be problematic.
- pixeltable/__init__.py +34 -6
- pixeltable/catalog/__init__.py +13 -0
- pixeltable/catalog/catalog.py +159 -0
- pixeltable/catalog/column.py +200 -0
- pixeltable/catalog/dir.py +32 -0
- pixeltable/catalog/globals.py +33 -0
- pixeltable/catalog/insertable_table.py +191 -0
- pixeltable/catalog/named_function.py +36 -0
- pixeltable/catalog/path.py +58 -0
- pixeltable/catalog/path_dict.py +139 -0
- pixeltable/catalog/schema_object.py +39 -0
- pixeltable/catalog/table.py +581 -0
- pixeltable/catalog/table_version.py +749 -0
- pixeltable/catalog/table_version_path.py +133 -0
- pixeltable/catalog/view.py +203 -0
- pixeltable/client.py +590 -30
- pixeltable/dataframe.py +540 -349
- pixeltable/env.py +359 -45
- pixeltable/exceptions.py +12 -21
- pixeltable/exec/__init__.py +9 -0
- pixeltable/exec/aggregation_node.py +78 -0
- pixeltable/exec/cache_prefetch_node.py +116 -0
- pixeltable/exec/component_iteration_node.py +79 -0
- pixeltable/exec/data_row_batch.py +95 -0
- pixeltable/exec/exec_context.py +22 -0
- pixeltable/exec/exec_node.py +61 -0
- pixeltable/exec/expr_eval_node.py +217 -0
- pixeltable/exec/in_memory_data_node.py +69 -0
- pixeltable/exec/media_validation_node.py +43 -0
- pixeltable/exec/sql_scan_node.py +225 -0
- pixeltable/exprs/__init__.py +24 -0
- pixeltable/exprs/arithmetic_expr.py +102 -0
- pixeltable/exprs/array_slice.py +71 -0
- pixeltable/exprs/column_property_ref.py +77 -0
- pixeltable/exprs/column_ref.py +105 -0
- pixeltable/exprs/comparison.py +77 -0
- pixeltable/exprs/compound_predicate.py +98 -0
- pixeltable/exprs/data_row.py +195 -0
- pixeltable/exprs/expr.py +586 -0
- pixeltable/exprs/expr_set.py +39 -0
- pixeltable/exprs/function_call.py +380 -0
- pixeltable/exprs/globals.py +69 -0
- pixeltable/exprs/image_member_access.py +115 -0
- pixeltable/exprs/image_similarity_predicate.py +58 -0
- pixeltable/exprs/inline_array.py +107 -0
- pixeltable/exprs/inline_dict.py +101 -0
- pixeltable/exprs/is_null.py +38 -0
- pixeltable/exprs/json_mapper.py +121 -0
- pixeltable/exprs/json_path.py +159 -0
- pixeltable/exprs/literal.py +54 -0
- pixeltable/exprs/object_ref.py +41 -0
- pixeltable/exprs/predicate.py +44 -0
- pixeltable/exprs/row_builder.py +355 -0
- pixeltable/exprs/rowid_ref.py +94 -0
- pixeltable/exprs/type_cast.py +53 -0
- pixeltable/exprs/variable.py +45 -0
- pixeltable/func/__init__.py +9 -0
- pixeltable/func/aggregate_function.py +194 -0
- pixeltable/func/batched_function.py +53 -0
- pixeltable/func/callable_function.py +69 -0
- pixeltable/func/expr_template_function.py +82 -0
- pixeltable/func/function.py +110 -0
- pixeltable/func/function_registry.py +227 -0
- pixeltable/func/globals.py +36 -0
- pixeltable/func/nos_function.py +202 -0
- pixeltable/func/signature.py +166 -0
- pixeltable/func/udf.py +163 -0
- pixeltable/functions/__init__.py +52 -103
- pixeltable/functions/eval.py +216 -0
- pixeltable/functions/fireworks.py +34 -0
- pixeltable/functions/huggingface.py +120 -0
- pixeltable/functions/image.py +16 -0
- pixeltable/functions/openai.py +256 -0
- pixeltable/functions/pil/image.py +148 -7
- pixeltable/functions/string.py +13 -0
- pixeltable/functions/together.py +122 -0
- pixeltable/functions/util.py +41 -0
- pixeltable/functions/video.py +62 -0
- pixeltable/iterators/__init__.py +3 -0
- pixeltable/iterators/base.py +48 -0
- pixeltable/iterators/document.py +311 -0
- pixeltable/iterators/video.py +89 -0
- pixeltable/metadata/__init__.py +54 -0
- pixeltable/metadata/converters/convert_10.py +18 -0
- pixeltable/metadata/schema.py +211 -0
- pixeltable/plan.py +656 -0
- pixeltable/store.py +418 -182
- pixeltable/tests/conftest.py +146 -88
- pixeltable/tests/functions/test_fireworks.py +42 -0
- pixeltable/tests/functions/test_functions.py +60 -0
- pixeltable/tests/functions/test_huggingface.py +158 -0
- pixeltable/tests/functions/test_openai.py +152 -0
- pixeltable/tests/functions/test_together.py +111 -0
- pixeltable/tests/test_audio.py +65 -0
- pixeltable/tests/test_catalog.py +27 -0
- pixeltable/tests/test_client.py +14 -14
- pixeltable/tests/test_component_view.py +370 -0
- pixeltable/tests/test_dataframe.py +439 -0
- pixeltable/tests/test_dirs.py +78 -62
- pixeltable/tests/test_document.py +120 -0
- pixeltable/tests/test_exprs.py +592 -135
- pixeltable/tests/test_function.py +297 -67
- pixeltable/tests/test_migration.py +43 -0
- pixeltable/tests/test_nos.py +54 -0
- pixeltable/tests/test_snapshot.py +208 -0
- pixeltable/tests/test_table.py +1195 -263
- pixeltable/tests/test_transactional_directory.py +42 -0
- pixeltable/tests/test_types.py +5 -11
- pixeltable/tests/test_video.py +151 -34
- pixeltable/tests/test_view.py +530 -0
- pixeltable/tests/utils.py +320 -45
- pixeltable/tool/create_test_db_dump.py +149 -0
- pixeltable/tool/create_test_video.py +81 -0
- pixeltable/type_system.py +445 -124
- pixeltable/utils/__init__.py +17 -46
- pixeltable/utils/arrow.py +98 -0
- pixeltable/utils/clip.py +12 -15
- pixeltable/utils/coco.py +136 -0
- pixeltable/utils/documents.py +39 -0
- pixeltable/utils/filecache.py +195 -0
- pixeltable/utils/help.py +11 -0
- pixeltable/utils/hf_datasets.py +157 -0
- pixeltable/utils/media_store.py +76 -0
- pixeltable/utils/parquet.py +167 -0
- pixeltable/utils/pytorch.py +91 -0
- pixeltable/utils/s3.py +13 -0
- pixeltable/utils/sql.py +17 -0
- pixeltable/utils/transactional_directory.py +35 -0
- pixeltable-0.2.4.dist-info/LICENSE +18 -0
- pixeltable-0.2.4.dist-info/METADATA +127 -0
- pixeltable-0.2.4.dist-info/RECORD +132 -0
- {pixeltable-0.1.0.dist-info → pixeltable-0.2.4.dist-info}/WHEEL +1 -1
- pixeltable/catalog.py +0 -1421
- pixeltable/exprs.py +0 -1745
- pixeltable/function.py +0 -269
- pixeltable/functions/clip.py +0 -10
- pixeltable/functions/pil/__init__.py +0 -23
- pixeltable/functions/tf.py +0 -21
- pixeltable/index.py +0 -57
- pixeltable/tests/test_dict.py +0 -24
- pixeltable/tests/test_functions.py +0 -11
- pixeltable/tests/test_tf.py +0 -69
- pixeltable/tf.py +0 -33
- pixeltable/utils/tf.py +0 -33
- pixeltable/utils/video.py +0 -32
- pixeltable-0.1.0.dist-info/METADATA +0 -34
- pixeltable-0.1.0.dist-info/RECORD +0 -36
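The listing above captures the shape of the 0.2.x restructuring: the monolithic pixeltable/catalog.py (-1421 lines) and pixeltable/exprs.py (-1745 lines) are deleted and reappear as the pixeltable/catalog/ and pixeltable/exprs/ packages, query planning and execution move into the new pixeltable/plan.py and pixeltable/exec/ modules, and UDF machinery is consolidated under pixeltable/func/. As a sketch of what this means at the import level, the following lines are confirmed by the dataframe.py diff below (nothing beyond them is implied):

    import pixeltable.catalog as catalog                 # package replacing the old catalog.py module
    import pixeltable.exprs as exprs                     # package replacing the old exprs.py module
    import pixeltable.type_system as ts
    from pixeltable.catalog import is_valid_identifier
    from pixeltable.plan import Planner                  # new query planner (pixeltable/plan.py)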
pixeltable/dataframe.py
CHANGED
@@ -1,24 +1,35 @@
+from __future__ import annotations
+
 import base64
+import copy
+import hashlib
 import io
-import …
-
+import json
+import logging
+import mimetypes
+import traceback
 from pathlib import Path
-from …
+from typing import List, Optional, Any, Dict, Generator, Tuple, Set
+
 import pandas as pd
+import pandas.io.formats.style
 import sqlalchemy as sql
 from PIL import Image
-import copy
 
-
+import pixeltable.catalog as catalog
+import pixeltable.exceptions as excs
+import pixeltable.exprs as exprs
+import pixeltable.type_system as ts
+from pixeltable.catalog import is_valid_identifier
 from pixeltable.env import Env
+from pixeltable.plan import Planner
 from pixeltable.type_system import ColumnType
-from pixeltable import exprs
-from pixeltable import exceptions as exc
 
 __all__ = [
     'DataFrame'
 ]
 
+_logger = logging.getLogger('pixeltable')
 
 def _format_img(img: object) -> str:
     """
@@ -28,360 +39,479 @@ def _format_img(img: object) -> str:
     with io.BytesIO() as buffer:
         img.save(buffer, 'jpeg')
         img_base64 = base64.b64encode(buffer.getvalue()).decode()
-        return f'<img src="data:image/jpeg;base64,{img_base64}">'
-
-def …
-
-
-
-
-# …
-
-
-
-
-
-
+        return f'<div style="width:200px;"><img src="data:image/jpeg;base64,{img_base64}" width="200" /></div>'
+
+def _create_source_tag(file_path: str) -> str:
+    abs_path = Path(file_path)
+    assert abs_path.is_absolute()
+    src_url = f'{Env.get().http_address}/{abs_path}'
+    mime = mimetypes.guess_type(src_url)[0]
+    # if mime is None, the attribute string would not be valid html.
+    mime_attr = f'type="{mime}"' if mime is not None else ''
+    return f'<source src="{src_url}" {mime_attr} />'
+
+def _format_video(file_path: str) -> str:
+    return f'<video controls>{_create_source_tag(file_path)}</video>'
+
+def _format_audio(file_path: str) -> str:
+    return f'<audio controls>{_create_source_tag(file_path)}</audio>'
 
 class DataFrameResultSet:
-    def __init__(self, rows: List[List], col_names: List[str], col_types: List[ColumnType]):
-        self.…
-        self.…
-        self.…
+    def __init__(self, rows: List[List[Any]], col_names: List[str], col_types: List[ColumnType]):
+        self._rows = rows
+        self._col_names = col_names
+        self._col_types = col_types
+        self._formatters = {
+            ts.ImageType: _format_img,
+            ts.VideoType: _format_video,
+            ts.AudioType: _format_audio,
+        }
 
     def __len__(self) -> int:
-        return len(self.…
+        return len(self._rows)
+
+    def column_names(self) -> List[str]:
+        return self._col_names
+
+    def column_types(self) -> List[ColumnType]:
+        return self._col_types
+
+    def __repr__(self) -> str:
+        return self.to_pandas().__repr__()
 
     def _repr_html_(self) -> str:
-
-
-
-
-
+        formatters = {
+            col_name: self._formatters[col_type.__class__]
+            for col_name, col_type in zip(self._col_names, self._col_types)
+            if col_type.__class__ in self._formatters
+        }
+
         # TODO: why does mypy complain about formatters having an incorrect type?
         return self.to_pandas().to_html(formatters=formatters, escape=False, index=False)  # type: ignore[arg-type]
 
     def __str__(self) -> str:
         return self.to_pandas().to_string()
 
-    def …
-
+    def _reverse(self) -> None:
+        """Reverse order of rows"""
+        self._rows.reverse()
 
-    def …
-
-        if len(index) != 2 or not isinstance(index[0], int) or not isinstance(index[1], int):
-            raise exc.OperationalError(f'Bad index: {index}')
-        return self.rows[index[0]][index[1]]
+    def to_pandas(self) -> pd.DataFrame:
+        return pd.DataFrame.from_records(self._rows, columns=self._col_names)
 
+    def _row_to_dict(self, row_idx: int) -> Dict[str, Any]:
+        return {self._col_names[i]: self._rows[row_idx][i] for i in range(len(self._col_names))}
 
+    def __getitem__(self, index: Any) -> Any:
+        if isinstance(index, str):
+            if index not in self._col_names:
+                raise excs.Error(f'Invalid column name: {index}')
+            col_idx = self._col_names.index(index)
+            return [row[col_idx] for row in self._rows]
+        if isinstance(index, int):
+            return self._row_to_dict(index)
+        if isinstance(index, tuple) and len(index) == 2:
+            if not isinstance(index[0], int) or not (isinstance(index[1], str) or isinstance(index[1], int)):
+                raise excs.Error(f'Bad index, expected [<row idx>, <column name | column index>]: {index}')
+            if isinstance(index[1], str) and index[1] not in self._col_names:
+                raise excs.Error(f'Invalid column name: {index[1]}')
+            col_idx = self._col_names.index(index[1]) if isinstance(index[1], str) else index[1]
+            return self._rows[index[0]][col_idx]
+        raise excs.Error(f'Bad index: {index}')
+
+    def __iter__(self) -> DataFrameResultSetIterator:
+        return DataFrameResultSetIterator(self)
+
+    def __eq__(self, other):
+        if not isinstance(other, DataFrameResultSet):
+            return False
+        return self.to_pandas().equals(other.to_pandas())
+
+
+class DataFrameResultSetIterator:
+    def __init__(self, result_set: DataFrameResultSet):
+        self._result_set = result_set
+        self._idx = 0
+
+    def __next__(self) -> Dict[str, Any]:
+        if self._idx >= len(self._result_set):
+            raise StopIteration
+        row = self._result_set._row_to_dict(self._idx)
+        self._idx += 1
+        return row
+
+
+# TODO: remove this; it's only here as a reminder that we still need to call release() in the current implementation
 class AnalysisInfo:
-    def __init__(self):
+    def __init__(self, tbl: catalog.TableVersion):
+        self.tbl = tbl
         # output of the SQL scan stage
         self.sql_scan_output_exprs: List[exprs.Expr] = []
         # output of the agg stage
         self.agg_output_exprs: List[exprs.Expr] = []
-        # select list providing the input to the SQL scan stage
-        self.sql_select_list: List[sql.sql.expression.ClauseElement] = []
         # Where clause of the Select stmt of the SQL scan stage
-        self.sql_where_clause: Optional[sql.…
+        self.sql_where_clause: Optional[sql.ClauseElement] = None
         # filter predicate applied to input rows of the SQL scan stage
         self.filter: Optional[exprs.Predicate] = None
         self.similarity_clause: Optional[exprs.ImageSimilarityPredicate] = None
         self.agg_fn_calls: List[exprs.FunctionCall] = []  # derived from unique_exprs
+        self.has_frame_col: bool = False  # True if we're referencing the frame col
 
-        self.…
-        self.…
+        self.evaluator: Optional[exprs.Evaluator] = None
+        self.sql_scan_eval_ctx: List[exprs.Expr] = []  # needed to materialize output of SQL scan stage
+        self.agg_eval_ctx: List[exprs.Expr] = []  # needed to materialize output of agg stage
+        self.filter_eval_ctx: List[exprs.Expr] = []
+        self.group_by_eval_ctx: List[exprs.Expr] = []
 
-
-    def num_materialized(self) -> int:
-        return self.next_data_row_idx
-
-    def assign_idxs(self, expr_list: List[exprs.Expr]) -> None:
+    def finalize_exec(self) -> None:
         """
-        …
-        An expr with to_sql() != None is assumed to be materialized fully via SQL; its components
-        aren't materialized and don't receive idxs.
+        Call release() on all collected Exprs.
         """
-
-
-
-
-    def _assign_idxs_aux(self, expr: exprs.Expr) -> None:
-        if not self.unique_exprs.add(expr):
-            # nothing left to do
-            return
-
-        sql_expr = expr.sql_expr()
-        # if this can be materialized via SQL we don't need to look at its components;
-        # we special-case Literals because we don't want to have to materialize them via SQL
-        if sql_expr is not None and not isinstance(expr, exprs.Literal):
-            assert expr.data_row_idx < 0
-            expr.data_row_idx = self.next_data_row_idx
-            self.next_data_row_idx += 1
-            expr.sql_row_idx = len(self.sql_select_list)
-            self.sql_select_list.append(sql_expr)
-            return
+        exprs.Expr.release_list(self.sql_scan_output_exprs)
+        exprs.Expr.release_list(self.agg_output_exprs)
+        if self.filter is not None:
+            self.filter.release()
 
-        # expr value needs to be computed via Expr.eval()
-        for c in expr.components:
-            self._assign_idxs_aux(c)
-        assert expr.data_row_idx < 0
-        expr.data_row_idx = self.next_data_row_idx
-        self.next_data_row_idx += 1
 
 
 class DataFrame:
     def __init__(
-            self, tbl: catalog.…
-            select_list: Optional[List[exprs.Expr]] = None,
-            where_clause: Optional[exprs.Predicate] = None
+            self, tbl: catalog.TableVersionPath,
+            select_list: Optional[List[Tuple[exprs.Expr, Optional[str]]]] = None,
+            where_clause: Optional[exprs.Predicate] = None,
+            group_by_clause: Optional[List[exprs.Expr]] = None,
+            grouping_tbl: Optional[catalog.TableVersion] = None,
+            order_by_clause: Optional[List[Tuple[exprs.Expr, bool]]] = None,  # List[(expr, asc)]
+            limit: Optional[int] = None):
         self.tbl = tbl
-        # self.select_list and self.where_clause contain execution state and therefore cannot be shared
-        self.select_list: Optional[List[exprs.Expr]] = None  # None: implies all cols
-        if select_list is not None:
-            self.select_list = [e.copy() for e in select_list]
-        self.where_clause: Optional[exprs.Predicate] = None
-        if where_clause is not None:
-            self.where_clause = where_clause.copy()
-        self.group_by_clause: Optional[List[exprs.Expr]] = None
-        self.analysis_info: Optional[AnalysisInfo] = None
-
-    def analyze(self) -> None:
-        """
-        Populates self.analysis_info.
-        """
-        info = self.analysis_info = AnalysisInfo()
-        if self.where_clause is not None:
-            info.sql_where_clause, info.filter = self.where_clause.extract_sql_predicate()
-            if info.filter is not None:
-                similarity_clauses, info.filter = info.filter.split_conjuncts(
-                    lambda e: isinstance(e, exprs.ImageSimilarityPredicate))
-                if len(similarity_clauses) > 1:
-                    raise exc.OperationalError(f'More than one nearest() or matches() not supported')
-                if len(similarity_clauses) == 1:
-                    info.similarity_clause = similarity_clauses[0]
-                    img_col = info.similarity_clause.img_col_ref.col
-                    if not img_col.is_indexed:
-                        raise exc.OperationalError(
-                            f'nearest()/matches() not available for unindexed column {img_col.name}')
-
-        if info.filter is not None:
-            info.assign_idxs([info.filter])
-        if len(self.group_by_clause) > 0:
-            info.assign_idxs(self.group_by_clause)
-            for e in self.group_by_clause:
-                self._analyze_group_by(e, True)
-        info.assign_idxs(self.select_list)
-        grouping_expr_idxs = set([e.data_row_idx for e in self.group_by_clause])
-        item_is_agg = [self._analyze_select_list(e, grouping_expr_idxs)[0] for e in self.select_list]
-
-        if self.is_agg():
-            # this is an aggregation
-            if item_is_agg.count(False) > 0:
-                raise exc.Error(f'Invalid non-aggregate in select list: {self.select_list[item_is_agg.find(False)]}')
-            # the agg stage materializes select list items that haven't already been provided by SQL
-            info.agg_output_exprs = [e for e in self.select_list if e.sql_row_idx == -1]
-            # our sql scan stage needs to materialize: grouping exprs, arguments of agg fn calls
-            info.sql_scan_output_exprs = copy.copy(self.group_by_clause)
-            unique_args: Set[int] = set()
-            for fn_call in info.agg_fn_calls:
-                for c in fn_call.components:
-                    unique_args.add(c.data_row_idx)
-            all_exprs = {e.data_row_idx: e for e in info.unique_exprs}
-            info.sql_scan_output_exprs.extend([all_exprs[idx] for idx in unique_args])
-        else:
-            info.sql_scan_output_exprs = self.select_list
 
-
-
-
-
-
-
-
-
+        # select list logic
+        DataFrame._select_list_check_rep(select_list)  # check select list without expansion
+        # exprs contain execution state and therefore cannot be shared
+        select_list = copy.deepcopy(select_list)
+        select_list_exprs, column_names = DataFrame._normalize_select_list(tbl, select_list)
+        DataFrame._select_list_check_rep(list(zip(select_list_exprs, column_names)))
+        # check select list after expansion to catch early
+        # the following two lists are always non empty, even if select list is None.
+        self._select_list_exprs = select_list_exprs
+        self._column_names = column_names
+        self.select_list = select_list
+
+        self.where_clause = copy.deepcopy(where_clause)
+        assert group_by_clause is None or grouping_tbl is None
+        self.group_by_clause = copy.deepcopy(group_by_clause)
+        self.grouping_tbl = grouping_tbl
+        self.order_by_clause = copy.deepcopy(order_by_clause)
+        self.limit_val = limit
+
+    @classmethod
+    def _select_list_check_rep(cls,
+                               select_list: Optional[List[Tuple[exprs.Expr, Optional[str]]]],
+                               ) -> None:
+        """Validate basic select list types.
         """
-
-
-
-
-
-
-
-
-
-
+        if select_list is None:  # basic check for valid select list
+            return
+
+        assert len(select_list) > 0
+        for ent in select_list:
+            assert isinstance(ent, tuple)
+            assert len(ent) == 2
+            assert isinstance(ent[0], exprs.Expr)
+            assert ent[1] is None or isinstance(ent[1], str)
+            if isinstance(ent[1], str):
+                assert is_valid_identifier(ent[1])
+
+    @classmethod
+    def _normalize_select_list(cls,
+                               tbl: catalog.TableVersionPath,
+                               select_list: Optional[List[Tuple[exprs.Expr, Optional[str]]]],
+                               ) -> Tuple[List[exprs.Expr], List[str]]:
        """
-        …
-        …
+        Expand select list information with all columns and their names
+        Returns:
+            a pair composed of the list of expressions and the list of corresponding names
         """
-        if …
-
-        elif self._is_agg_fn_call(e):
-            for c in e.components:
-                _, is_scan_output = self._analyze_select_list(c, grouping_exprs)
-                if not is_scan_output:
-                    raise exc.Error(f'Invalid nested aggregates: {e}')
-            return True, False
-        elif isinstance(e, exprs.Literal):
-            return True, True
-        elif isinstance(e, exprs.ColumnRef):
-            # we already know that this isn't a grouping expr
-            return False, True
+        if select_list is None:
+            expanded_list = [(exprs.ColumnRef(col), None) for col in tbl.columns()]
         else:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            expanded_list = select_list
+
+        out_exprs : List[exprs.Expr] = []
+        out_names : List[str] = []  # keep track of order
+        seen_out_names : set[str] = set()  # use to check for duplicates in loop, avoid square complexity
+        for i, (expr, name) in enumerate(expanded_list):
+            if name is None:
+                # use default, add suffix if needed so default adds no duplicates
+                default_name = expr.default_column_name()
+                if default_name is not None:
+                    column_name = default_name
+                    if default_name in seen_out_names:
+                        # already used, then add suffix until unique name is found
+                        for j in range(1, len(out_names)+1):
+                            column_name = f'{default_name}_{j}'
+                            if column_name not in seen_out_names:
+                                break
+                else:  # no default name, eg some expressions
+                    column_name = f'col_{i}'
+            else:  # user provided name, no attempt to rename
+                column_name = name
+
+            out_exprs.append(expr)
+            out_names.append(column_name)
+            seen_out_names.add(column_name)
+        assert len(out_exprs) == len(out_names)
+        assert set(out_names) == seen_out_names
+        return out_exprs, out_names
+
+    def _exec(self) -> Generator[exprs.DataRow, None, None]:
+        """Run the query and return rows as a generator.
+        This function must not modify the state of the DataFrame, otherwise it breaks dataset caching.
         """
-        if …
-
-        if self.…
-            self.group_by_clause …
-
+        # construct a group-by clause if we're grouping by a table
+        group_by_clause: List[exprs.Expr] = []
+        if self.grouping_tbl is not None:
+            assert self.group_by_clause is None
+            num_rowid_cols = len(self.grouping_tbl.store_tbl.rowid_columns())
+            # the grouping table must be a base of self.tbl
+            assert num_rowid_cols <= len(self.tbl.tbl_version.store_tbl.rowid_columns())
+            group_by_clause = [exprs.RowidRef(self.tbl.tbl_version, idx) for idx in range(num_rowid_cols)]
+        elif self.group_by_clause is not None:
+            group_by_clause = self.group_by_clause
+
+        for item in self._select_list_exprs:
             item.bind_rel_paths(None)
-
-        self.…
-
-
-
-
-
-
-
-
-
-
-
-
-
-            order_by_exprs = window_fn_calls[0].get_window_sort_exprs()
-        elif self.is_agg():
-            # TODO: collect aggs with order-by and analyze for compatibility
-            order_by_exprs = self.group_by_clause + self.analysis_info.agg_fn_calls[0].get_agg_order_by()
-        order_by_clause = [e.sql_expr() for e in order_by_exprs]
-        for i in range(len(order_by_exprs)):
-            if order_by_clause[i] is None:
-                raise exc.Error(f'order_by element cannot be expressed in SQL: {order_by_exprs[i]}')
-
-        idx_rowids: List[int] = []  # rowids returned by index lookup
-        if self.analysis_info.similarity_clause is not None:
-            # do index lookup
-            assert self.analysis_info.similarity_clause.img_col_ref.col.idx is not None
-            embed = self.analysis_info.similarity_clause.embedding()
-            idx_rowids = self.analysis_info.similarity_clause.img_col_ref.col.idx.search(embed, n, self.tbl.valid_rowids)
-
-        with Env.get().engine.connect() as conn:
-            stmt = self._create_select_stmt(
-                self.analysis_info.sql_select_list, self.analysis_info.sql_where_clause, idx_rowids, select_pk,
-                order_by_clause)
-            num_rows = 0
-            sql_scan_evaluator = exprs.ExprEvaluator(
-                self.analysis_info.sql_scan_output_exprs, self.analysis_info.filter)
-            agg_evaluator = exprs.ExprEvaluator(self.analysis_info.agg_output_exprs, None)
-
-            current_group: Optional[List[Any]] = None  # for grouping agg, the values of the group-by exprs
-            for row in conn.execute(stmt):
-                sql_row = row._data
-                data_row: List[Any] = [None] * self.analysis_info.num_materialized
-                if not sql_scan_evaluator.eval(sql_row, data_row):
-                    continue
-
-                # copy select list results into contiguous array
-                result_row: Optional[List[Any]] = None
-                if self.is_agg():
-                    group = [data_row[e.data_row_idx] for e in self.group_by_clause]
-                    if current_group is None:
-                        current_group = group
-                    if group != current_group:
-                        # we're entering a new group, emit a row for the last one
-                        agg_evaluator.eval(last_sql_row, last_data_row)
-                        result_row = [last_data_row[e.data_row_idx] for e in self.select_list]
-                        current_group = group
-                        for fn_call in self.analysis_info.agg_fn_calls:
-                            fn_call.reset_agg()
-                    for fn_call in self.analysis_info.agg_fn_calls:
-                        fn_call.update(data_row)
-                else:
-                    result_row = [data_row[e.data_row_idx] for e in self.select_list]
-                    if select_pk:
-                        result_row.extend(sql_row[-2:])
-
-                last_data_row = data_row
-                last_sql_row = row._data
-                if result_row is not None:
-                    yield result_row
-                    num_rows += 1
-                    if n > 0 and num_rows == n:
-                        break
-
-            if self.is_agg():
-                # we need to emit the output row for the current group
-                agg_evaluator.eval(sql_row, data_row)
-                result_row = [data_row[e.data_row_idx] for e in self.select_list]
-                yield result_row
+        plan = Planner.create_query_plan(
+            self.tbl, self._select_list_exprs, where_clause=self.where_clause, group_by_clause=group_by_clause,
+            order_by_clause=self.order_by_clause if self.order_by_clause is not None else [],
+            limit=self.limit_val if self.limit_val is not None else 0)  # limit_val == 0: no limit_val
+
+        with Env.get().engine.begin() as conn:
+            plan.ctx.conn = conn
+            plan.open()
+            try:
+                for row_batch in plan:
+                    for data_row in row_batch:
+                        yield data_row
+            finally:
+                plan.close()
+        return
 
     def show(self, n: int = 20) -> DataFrameResultSet:
-
-
-
-
-
+        assert n is not None
+        return self.limit(n).collect()
+
+    def head(self, n: int = 10) -> DataFrameResultSet:
+        if self.order_by_clause is not None:
+            raise excs.Error(f'head() cannot be used with order_by()')
+        num_rowid_cols = len(self.tbl.tbl_version.store_tbl.rowid_columns())
+        order_by_clause = [exprs.RowidRef(self.tbl.tbl_version, idx) for idx in range(num_rowid_cols)]
+        return self.order_by(*order_by_clause, asc=True).limit(n).collect()
+
+    def tail(self, n: int = 10) -> DataFrameResultSet:
+        if self.order_by_clause is not None:
+            raise excs.Error(f'tail() cannot be used with order_by()')
+        num_rowid_cols = len(self.tbl.tbl_version.store_tbl.rowid_columns())
+        order_by_clause = [exprs.RowidRef(self.tbl.tbl_version, idx) for idx in range(num_rowid_cols)]
+        result = self.order_by(*order_by_clause, asc=False).limit(n).collect()
+        result._reverse()
+        return result
+
+    def get_column_names(self) -> List[str]:
+        return self._column_names
+
+    def get_column_types(self) -> List[ColumnType]:
+        return [expr.col_type for expr in self._select_list_exprs]
+
+    def collect(self) -> DataFrameResultSet:
+        try:
+            result_rows = []
+            for data_row in self._exec():
+                result_row = [data_row[e.slot_idx] for e in self._select_list_exprs]
+                result_rows.append(result_row)
+        except excs.ExprEvalError as e:
+            msg = (f'In row {e.row_num} the {e.expr_msg} encountered exception '
+                   f'{type(e.exc).__name__}:\n{str(e.exc)}')
+            if len(e.input_vals) > 0:
+                input_msgs = [
+                    f"'{d}' = {d.col_type.print_value(e.input_vals[i])}"
+                    for i, d in enumerate(e.expr.dependencies())
+                ]
+                msg += f'\nwith {", ".join(input_msgs)}'
+            assert e.exc_tb is not None
+            stack_trace = traceback.format_tb(e.exc_tb)
+            if len(stack_trace) > 2:
+                # append a stack trace if the exception happened in user code
+                # (frame 0 is ExprEvaluator and frame 1 is some expr's eval()
+                nl = '\n'
+                # [-1:0:-1]: leave out entry 0 and reverse order, so that the most recent frame is at the top
+                msg += f'\nStack:\n{nl.join(stack_trace[-1:1:-1])}'
+            raise excs.Error(msg)
+        except sql.exc.DBAPIError as e:
+            raise excs.Error(f'Error during SQL execution:\n{e}')
+
+        col_types = self.get_column_types()
+        return DataFrameResultSet(result_rows, self._column_names, col_types)
 
     def count(self) -> int:
-
-
-        """
-        stmt = sql.select(sql.func.count('*')).select_from(self.tbl.sa_tbl) \
-            .where(self.tbl.v_min_col <= self.tbl.version) \
-            .where(self.tbl.v_max_col > self.tbl.version)
-        if self.where_clause is not None:
-            sql_where_clause = self.where_clause.sql_expr()
-            assert sql_where_clause is not None
-            stmt = stmt.where(sql_where_clause)
+        from pixeltable.plan import Planner
+        stmt = Planner.create_count_stmt(self.tbl, self.where_clause)
         with Env.get().engine.connect() as conn:
             result: int = conn.execute(stmt).scalar_one()
             assert isinstance(result, int)
             return result
 
-    def …
+    def _description(self) -> pd.DataFrame:
+        """see DataFrame.describe()"""
+        heading_vals: List[str] = []
+        info_vals: List[str] = []
+        if self.select_list is not None:
+            assert len(self.select_list) > 0
+            heading_vals.append('Select')
+            heading_vals.extend([''] * (len(self.select_list) - 1))
+            info_vals.extend(self.get_column_names())
+        if self.where_clause is not None:
+            heading_vals.append('Where')
+            info_vals.append(self.where_clause.display_str(inline=False))
+        if self.group_by_clause is not None:
+            heading_vals.append('Group By')
+            heading_vals.extend([''] * (len(self.group_by_clause) - 1))
+            info_vals.extend([e.display_str(inline=False) for e in self.group_by_clause])
+        if self.order_by_clause is not None:
+            heading_vals.append('Order By')
+            heading_vals.extend([''] * (len(self.order_by_clause) - 1))
+            info_vals.extend([
+                f'{e[0].display_str(inline=False)} {"asc" if e[1] else "desc"}' for e in self.order_by_clause
+            ])
+        if self.limit_val is not None:
+            heading_vals.append('Limit')
+            info_vals.append(str(self.limit_val))
+        assert len(heading_vals) > 0
+        assert len(info_vals) > 0
+        assert len(heading_vals) == len(info_vals)
+        return pd.DataFrame({'Heading': heading_vals, 'Info': info_vals})
+
+    def _description_html(self) -> pandas.io.formats.style.Styler:
+        """Return the description in an ipython-friendly manner."""
+        pd_df = self._description()
+        # white-space: pre-wrap: print \n as newline
+        # th: center-align headings
+        return pd_df.style.set_properties(**{'white-space': 'pre-wrap', 'text-align': 'left'}) \
+            .set_table_styles([dict(selector='th', props=[('text-align', 'center')])]) \
+            .hide(axis='index').hide(axis='columns')
+
+    def describe(self) -> None:
         """
-        …
-        …
+        Prints a tabular description of this DataFrame.
+        The description has two columns, heading and info, which list the contents of each 'component'
+        (select list, where clause, ...) vertically.
         """
-
-
-
-
-
-
-        stmt = sql.select(sql.distinct(col.sa_col)) \
-            .where(self.tbl.v_min_col <= self.tbl.version) \
-            .where(self.tbl.v_max_col > self.tbl.version) \
-            .order_by(col.sa_col)
-        if self.where_clause is not None:
-            sql_where_clause = self.where_clause.sql_expr()
-            assert sql_where_clause is not None
-            stmt = stmt.where(sql_where_clause)
-        with Env.get().engine.connect() as conn:
-            result = {row._data[0]: i for i, row in enumerate(conn.execute(stmt))}
-            return result
+        try:
+            __IPYTHON__
+            from IPython.display import display
+            display(self._description_html())
+        except NameError:
+            print(self.__repr__())
 
-    def …
+    def __repr__(self) -> str:
+        return self._description().to_string(header=False, index=False)
+
+    def _repr_html_(self) -> str:
+        return self._description_html()._repr_html_()
+
+    def select(self, *items: Any, **named_items : Any) -> DataFrame:
+        if self.select_list is not None:
+            raise excs.Error(f'Select list already specified')
+        for (name, _) in named_items.items():
+            if not isinstance(name, str) or not is_valid_identifier(name):
+                raise excs.Error(f'Invalid name: {name}')
+        base_list = [(expr, None) for expr in items] + [(expr, k) for (k, expr) in named_items.items()]
+        if len(base_list) == 0:
+            raise excs.Error(f'Empty select list')
+
+        # analyze select list; wrap literals with the corresponding expressions
+        select_list = []
+        for raw_expr, name in base_list:
+            if isinstance(raw_expr, exprs.Expr):
+                select_list.append((raw_expr, name))
+            elif isinstance(raw_expr, dict):
+                select_list.append((exprs.InlineDict(raw_expr), name))
+            elif isinstance(raw_expr, list):
+                select_list.append((exprs.InlineArray(raw_expr), name))
+            else:
+                select_list.append((exprs.Literal(raw_expr), name))
+            expr = select_list[-1][0]
+            if expr.col_type.is_invalid_type():
+                raise excs.Error(f'Invalid type: {raw_expr}')
+            # TODO: check that ColumnRefs in expr refer to self.tbl
+
+        # check user provided names do not conflict among themselves
+        # or with auto-generated ones
+        seen: Set[str] = set()
+        _, names = DataFrame._normalize_select_list(self.tbl, select_list)
+        for name in names:
+            if name in seen:
+                repeated_names = [j for j, x in enumerate(names) if x == name]
+                pretty = ', '.join(map(str, repeated_names))
+                raise excs.Error(f'Repeated column name "{name}" in select() at positions: {pretty}')
+            seen.add(name)
+
+        return DataFrame(
+            self.tbl, select_list=select_list, where_clause=self.where_clause, group_by_clause=self.group_by_clause,
+            grouping_tbl=self.grouping_tbl, order_by_clause=self.order_by_clause, limit=self.limit_val)
+
+    def where(self, pred: exprs.Predicate) -> DataFrame:
+        return DataFrame(
+            self.tbl, select_list=self.select_list, where_clause=pred, group_by_clause=self.group_by_clause,
+            grouping_tbl=self.grouping_tbl, order_by_clause=self.order_by_clause, limit=self.limit_val)
+
+    def group_by(self, *grouping_items: Any) -> DataFrame:
+        """Add a group-by clause to this DataFrame.
+        Variants:
+        - group_by(<base table>): group a component view by their respective base table rows
+        - group_by(<expr>, ...): group by the given expressions
+        """
+        if self.group_by_clause is not None:
+            raise excs.Error(f'Group-by already specified')
+        grouping_tbl: Optional[catalog.TableVersion] = None
+        group_by_clause: Optional[List[exprs.Expr]] = None
+        for item in grouping_items:
+            if isinstance(item, catalog.Table):
+                if len(grouping_items) > 1:
+                    raise excs.Error(f'group_by(): only one table can be specified')
+                # we need to make sure that the grouping table is a base of self.tbl
+                base = self.tbl.find_tbl_version(item.tbl_version_path.tbl_id())
+                if base is None or base.id == self.tbl.tbl_id():
+                    raise excs.Error(f'group_by(): {item.name} is not a base table of {self.tbl.tbl_name()}')
+                grouping_tbl = item.tbl_version_path.tbl_version
+                break
+            if not isinstance(item, exprs.Expr):
+                raise excs.Error(f'Invalid expression in group_by(): {item}')
+        if grouping_tbl is None:
+            group_by_clause = list(grouping_items)
+        return DataFrame(
+            self.tbl, select_list=self.select_list, where_clause=self.where_clause, group_by_clause=group_by_clause,
+            grouping_tbl=grouping_tbl, order_by_clause=self.order_by_clause, limit=self.limit_val)
+
+    def order_by(self, *expr_list: exprs.Expr, asc: bool = True) -> DataFrame:
+        for e in expr_list:
+            if not isinstance(e, exprs.Expr):
+                raise excs.Error(f'Invalid expression in order_by(): {e}')
+        order_by_clause = self.order_by_clause if self.order_by_clause is not None else []
+        order_by_clause.extend([(e.copy(), asc) for e in expr_list])
+        return DataFrame(
+            self.tbl, select_list=self.select_list, where_clause=self.where_clause,
+            group_by_clause=self.group_by_clause, grouping_tbl=self.grouping_tbl, order_by_clause=order_by_clause,
+            limit=self.limit_val)
+
+    def limit(self, n: int) -> DataFrame:
+        assert n is not None and isinstance(n, int)
+        return DataFrame(
+            self.tbl, select_list=self.select_list, where_clause=self.where_clause,
+            group_by_clause=self.group_by_clause, grouping_tbl=self.grouping_tbl, order_by_clause=self.order_by_clause,
+            limit=n)
+
+    def __getitem__(self, index: object) -> DataFrame:
         """
         Allowed:
         - [<Predicate>]: filter operation
@@ -389,52 +519,113 @@ class DataFrame:
         - [Expr]: setting a single-col select list
         """
         if isinstance(index, exprs.Predicate):
-            return …
+            return self.where(index)
         if isinstance(index, tuple):
             index = list(index)
         if isinstance(index, exprs.Expr):
             index = [index]
         if isinstance(index, list):
-
-                raise exc.OperationalError(f'[] for column selection is only allowed once')
-            # analyze select list; wrap literals with the corresponding expressions and update it in place
-            for i in range(len(index)):
-                expr = index[i]
-                if isinstance(expr, dict):
-                    index[i] = expr = exprs.InlineDict(expr)
-                if isinstance(expr, list):
-                    index[i] = expr = exprs.InlineArray(tuple(expr))
-                if not isinstance(expr, exprs.Expr):
-                    raise exc.OperationalError(f'Invalid expression in []: {expr}')
-                if expr.col_type.is_invalid_type():
-                    raise exc.OperationalError(f'Invalid type: {expr}')
-                # TODO: check that ColumnRefs in expr refer to self.tbl
-            return DataFrame(self.tbl, select_list=index, where_clause=self.where_clause)
+            return self.select(*index)
         raise TypeError(f'Invalid index type: {type(index)}')
+
+    def _as_dict(self) -> Dict[str, Any]:
+        """
+        Returns:
+            Dictionary representing this dataframe.
+        """
+        tbl_versions = self.tbl.get_tbl_versions()
+        d = {
+            '_classname': 'DataFrame',
+            'tbl_ids': [str(t.id) for t in tbl_versions],
+            'tbl_versions': [t.version for t in tbl_versions],
+            'select_list':
+                [(e.as_dict(), name) for (e, name) in self.select_list] if self.select_list is not None else None,
+            'where_clause': self.where_clause.as_dict() if self.where_clause is not None else None,
+            'group_by_clause':
+                [e.as_dict() for e in self.group_by_clause] if self.group_by_clause is not None else None,
+            'order_by_clause':
+                [(e.as_dict(), asc) for (e,asc) in self.order_by_clause] if self.order_by_clause is not None else None,
+            'limit_val': self.limit_val,
+        }
+        return d
+
+    def to_coco_dataset(self) -> Path:
+        """Convert the dataframe to a COCO dataset.
+        This dataframe must return a single json-typed output column in the following format:
+        {
+            'image': PIL.Image.Image,
+            'annotations': [
+                {
+                    'bbox': [x: int, y: int, w: int, h: int],
+                    'category': str | int,
+                },
+                ...
+            ],
+        }
+
+        Returns:
+            Path to the COCO dataset file.
+        """
+        from pixeltable.utils.coco import write_coco_dataset
+
+        summary_string = json.dumps(self._as_dict())
+        cache_key = hashlib.sha256(summary_string.encode()).hexdigest()
+
+        dest_path = (Env.get().dataset_cache_dir / f'coco_{cache_key}')
+        if dest_path.exists():
+            assert dest_path.is_dir()
+            data_file_path = dest_path / 'data.json'
+            assert data_file_path.exists()
+            assert data_file_path.is_file()
+            return data_file_path
+        else:
+            return write_coco_dataset(self, dest_path)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    # TODO Factor this out into a separate module.
+    # The return type is unresolvable, but torch can't be imported since it's an optional dependency.
+    def to_pytorch_dataset(self, image_format: str = 'pt') -> 'torch.utils.data.IterableDataset':
+        """
+        Convert the dataframe to a pytorch IterableDataset suitable for parallel loading
+        with torch.utils.data.DataLoader.
+
+        This method requires pyarrow >= 13, torch and torchvision to work.
+
+        This method serializes data so it can be read from disk efficiently and repeatedly without
+        re-executing the query. This data is cached to disk for future re-use.
+
+        Args:
+            image_format: format of the images. Can be 'pt' (pytorch tensor) or 'np' (numpy array).
+                'np' means image columns return as an RGB uint8 array of shape HxWxC.
+                'pt' means image columns return as a CxHxW tensor with values in [0,1] and type torch.float32.
+                (the format output by torchvision.transforms.ToTensor())
+
+        Returns:
+            A pytorch IterableDataset: Columns become fields of the dataset, where rows are returned as a dictionary
+            compatible with torch.utils.data.DataLoader default collation.
+
+        Constraints:
+            The default collate_fn for torch.data.util.DataLoader cannot represent null values as part of a
+            pytorch tensor when forming batches. These values will raise an exception while running the dataloader.
+
+            If you have them, you can work around None values by providing your custom collate_fn to the DataLoader
+            (and have your model handle it). Or, if these are not meaningful values within a minibtach, you can
+            modify or remove any such values through selections and filters prior to calling to_pytorch_dataset().
+        """
+        # check dependencies
+        Env.get().require_package('pyarrow', [13])
+        Env.get().require_package('torch')
+        Env.get().require_package('torchvision')
+
+        from pixeltable.utils.parquet import save_parquet  # pylint: disable=import-outside-toplevel
+        from pixeltable.utils.pytorch import PixeltablePytorchDataset  # pylint: disable=import-outside-toplevel
+
+        summary_string = json.dumps(self._as_dict())
+        cache_key = hashlib.sha256(summary_string.encode()).hexdigest()
+
+        dest_path = (Env.get().dataset_cache_dir / f'df_{cache_key}').with_suffix('.parquet')  # pylint: disable = protected-access
+        if dest_path.exists():  # fast path: use cache
+            assert dest_path.is_dir()
+        else:
+            save_parquet(self, dest_path)
+
+        return PixeltablePytorchDataset(path=dest_path, image_format=image_format)
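Taken together, the dataframe.py rewrite replaces the old analyze()/show() pipeline with a fluent, immutable builder: select(), where(), group_by(), order_by(), and limit() each return a new DataFrame, and collect() executes it through Planner.create_query_plan(). A minimal usage sketch follows; the DataFrame and DataFrameResultSet methods are the ones shown in the diff above, but the client entry point, the table handle t, and its columns title and year are assumptions (hypothetical names, based only on pixeltable/client.py being the public entry point in this release):

    import pixeltable as pxt

    cl = pxt.Client()                          # assumption: client API from pixeltable/client.py
    t = cl.get_table('movies')                 # hypothetical table with columns 'title' and 'year';
                                               # assumes the handle exposes the DataFrame builder methods

    res = (
        t.where(t.year > 2000)                 # Predicate -> new DataFrame (where() in the diff)
         .select(t.title, t.year)              # builds the (expr, name) select list
         .order_by(t.year, asc=False)          # appends (expr, asc) tuples to the order-by clause
         .limit(10)
         .collect()                            # runs Planner.create_query_plan(), returns DataFrameResultSet
    )

    row = res[0]                               # int index -> dict of column name to value
    years = res['year']                        # str index -> list of values for that column
    for r in res:                              # iteration yields one dict per row
        print(r['title'])

The new dataset-export path works on the same object; per the docstring in the diff it requires pyarrow >= 13, torch, and torchvision, and caches the serialized query under dataset_cache_dir, keyed by a hash of the query's _as_dict() representation:

    import torch.utils.data

    ds = t.select(t.title).to_pytorch_dataset(image_format='pt')   # cached as parquet on first call
    loader = torch.utils.data.DataLoader(ds, batch_size=32)        # rows arrive as dicts under default collation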