corvic-engine 0.3.0rc62-cp38-abi3-win_amd64.whl → 0.3.0rc64-cp38-abi3-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (35)
  1. buf/validate/validate_pb2.py +415 -698
  2. buf/validate/validate_pb2.pyi +173 -362
  3. buf/validate/validate_pb2_grpc.py +1 -1
  4. buf/validate/validate_pb2_grpc.pyi +6 -10
  5. corvic/engine/_native.pyd +0 -0
  6. corvic/system/_embedder.py +31 -8
  7. corvic/system/_image_embedder.py +33 -12
  8. corvic/system/in_memory_executor.py +292 -331
  9. corvic/system_sqlite/staging.py +17 -9
  10. {corvic_engine-0.3.0rc62.dist-info → corvic_engine-0.3.0rc64.dist-info}/METADATA +1 -1
  11. {corvic_engine-0.3.0rc62.dist-info → corvic_engine-0.3.0rc64.dist-info}/RECORD +27 -35
  12. corvic_generated/feature/v1/experiment_pb2.py +2 -2
  13. corvic_generated/feature/v1/space_pb2.py +2 -2
  14. corvic_generated/feature/v2/feature_view_pb2.py +2 -2
  15. corvic_generated/feature/v2/space_pb2.py +5 -5
  16. corvic_generated/ingest/v2/pipeline_pb2.py +24 -22
  17. corvic_generated/ingest/v2/pipeline_pb2.pyi +4 -2
  18. corvic_generated/ingest/v2/resource_pb2.py +4 -4
  19. corvic_generated/ingest/v2/room_pb2.py +31 -31
  20. corvic_generated/ingest/v2/room_pb2.pyi +4 -2
  21. corvic_generated/ingest/v2/source_pb2.py +4 -4
  22. corvic_generated/ingest/v2/table_pb2.py +3 -3
  23. corvic_generated/orm/v1/agent_pb2.py +2 -2
  24. corvic_generated/orm/v1/agent_pb2.pyi +6 -0
  25. corvic_generated/orm/v1/table_pb2.py +2 -2
  26. buf/validate/expression_pb2.py +0 -37
  27. buf/validate/expression_pb2.pyi +0 -52
  28. buf/validate/expression_pb2_grpc.py +0 -4
  29. buf/validate/expression_pb2_grpc.pyi +0 -34
  30. buf/validate/priv/private_pb2.py +0 -37
  31. buf/validate/priv/private_pb2.pyi +0 -37
  32. buf/validate/priv/private_pb2_grpc.py +0 -4
  33. buf/validate/priv/private_pb2_grpc.pyi +0 -34
  34. {corvic_engine-0.3.0rc62.dist-info → corvic_engine-0.3.0rc64.dist-info}/WHEEL +0 -0
  35. {corvic_engine-0.3.0rc62.dist-info → corvic_engine-0.3.0rc64.dist-info}/licenses/LICENSE +0 -0
@@ -6,8 +6,9 @@ import dataclasses
 import datetime
 import functools
 import math
-from collections.abc import MutableMapping
-from contextlib import nullcontext
+from collections.abc import Callable, Mapping, MutableMapping
+from contextlib import AbstractContextManager, ExitStack, nullcontext
+from types import TracebackType
 from typing import Any, Final, cast
 
 import numpy as np
@@ -18,7 +19,7 @@ import pyarrow.parquet as pq
 import structlog
 from google.protobuf import json_format, struct_pb2
 from more_itertools import flatten
-from typing_extensions import deprecated
+from typing_extensions import Self, deprecated
 
 from corvic import embed, embedding_metric, op_graph, sql
 from corvic.result import (
@@ -170,12 +171,30 @@ def _as_df(
     )
 
 
+@dataclasses.dataclass(frozen=True)
+class _LazyFrameWithMetrics:
+    data: pl.LazyFrame
+    metrics: dict[str, Any]
+
+    def apply(
+        self, lf_op: Callable[[pl.LazyFrame], pl.LazyFrame]
+    ) -> _LazyFrameWithMetrics:
+        return _LazyFrameWithMetrics(lf_op(self.data), self.metrics)
+
+    def with_data(self, data: pl.LazyFrame):
+        return _LazyFrameWithMetrics(data, self.metrics)
+
+
 @dataclasses.dataclass(frozen=True)
 class _SchemaAndBatches:
     schema: pa.Schema
     batches: list[pa.RecordBatch]
     metrics: dict[str, Any]
 
+    @classmethod
+    def from_lazy_frame_with_metrics(cls, lfm: _LazyFrameWithMetrics):
+        return cls.from_dataframe(lfm.data.collect(), lfm.metrics)
+
     def to_batch_reader(self):
         return pa.RecordBatchReader.from_batches(
             schema=self.schema,
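Note: the new _LazyFrameWithMetrics wrapper threads a metrics dict alongside a pl.LazyFrame, so per-op transformations are appended to a lazy plan instead of materializing record batches at every step. A minimal standalone sketch of the same pattern (illustrative only, assuming nothing beyond polars; the names below are not the package's):

import dataclasses
from collections.abc import Callable
from typing import Any

import polars as pl


@dataclasses.dataclass(frozen=True)
class LazyFrameWithMetrics:
    data: pl.LazyFrame
    metrics: dict[str, Any]

    def apply(self, lf_op: Callable[[pl.LazyFrame], pl.LazyFrame]) -> "LazyFrameWithMetrics":
        # Append a transformation to the lazy plan; metrics ride along unchanged.
        return LazyFrameWithMetrics(lf_op(self.data), self.metrics)


lfm = LazyFrameWithMetrics(pl.DataFrame({"a": [1, 2, 3]}).lazy(), metrics={"source": "demo"})
lfm = lfm.apply(lambda lf: lf.filter(pl.col("a") > 1)).apply(lambda lf: lf.rename({"a": "b"}))
print(lfm.data.collect())  # nothing is materialized until this collect
print(lfm.metrics)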
@@ -209,16 +228,29 @@ class _SlicedTable:
 
 
 @dataclasses.dataclass
-class _InMemoryExecutionContext:
+class _InMemoryExecutionContext(AbstractContextManager["_InMemoryExecutionContext"]):
     exec_context: ExecutionContext
     current_output_context: TableComputeContext | None = None
 
     # Using _SchemaAndBatches rather than a RecordBatchReader since the latter's
     # contract only guarantees one iteration and these might be accessed more than
     # once
-    computed_batches_for_op_graph: dict[_SlicedTable, _SchemaAndBatches] = (
+    computed_batches_for_op_graph: dict[_SlicedTable, _LazyFrameWithMetrics] = (
         dataclasses.field(default_factory=dict)
     )
+    exit_stack: ExitStack = dataclasses.field(default_factory=ExitStack)
+
+    def __enter__(self) -> Self:
+        self.exit_stack = self.exit_stack.__enter__()
+        return self
+
+    def __exit__(
+        self,
+        __exc_type: type[BaseException] | None,
+        __exc_value: BaseException | None,
+        __traceback: TracebackType | None,
+    ) -> bool | None:
+        return self.exit_stack.__exit__(__exc_type, __exc_value, __traceback)
 
     @classmethod
     def count_source_op_uses(
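Note: turning the execution context into an AbstractContextManager that delegates to an ExitStack lets resources opened while building lazy plans (such as the blob streams handed to pl.scan_parquet below) stay open until the whole execution scope exits. A small sketch of that delegation pattern, using io.BytesIO as a stand-in resource (the class name is made up):

import io
from contextlib import AbstractContextManager, ExitStack
from types import TracebackType


class ResourceScope(AbstractContextManager["ResourceScope"]):
    def __init__(self) -> None:
        self.exit_stack = ExitStack()

    def __enter__(self) -> "ResourceScope":
        self.exit_stack.__enter__()
        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_value: BaseException | None,
        traceback: TracebackType | None,
    ) -> bool | None:
        # Everything registered on the stack is closed here, in reverse order.
        return self.exit_stack.__exit__(exc_type, exc_value, traceback)


with ResourceScope() as scope:
    stream = scope.exit_stack.enter_context(io.BytesIO(b"payload"))
    assert not stream.closed
assert stream.closed  # released when the scope exited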
@@ -313,13 +345,13 @@ class InMemoryExecutionResult(ExecutionResult):
     def make(
         cls,
         storage_manager: StorageManager,
-        in_memory_context: _InMemoryExecutionContext,
+        computed_tables: Mapping[_SlicedTable, _SchemaAndBatches],
         context: ExecutionContext,
     ) -> InMemoryExecutionResult:
         tables = [
             InMemoryTableComputeResult(
                 storage_manager,
-                in_memory_context.computed_batches_for_op_graph[
+                computed_tables[
                     _SlicedTable(
                         table_context.table_op_graph,
                         table_context.sql_output_slice_args,
@@ -362,80 +394,69 @@ class InMemoryExecutor(OpGraphExecutor):
 
     def _execute_read_from_parquet(
         self, op: op_graph.op.ReadFromParquet, context: _InMemoryExecutionContext
-    ) -> Ok[_SchemaAndBatches]:
-        batches: list[pa.RecordBatch] = []
-        for blob_name in op.blob_names:
-            with (
-                self._storage_manager.blob_from_url(blob_name).open("rb") as stream,
-            ):
-                batches.extend(
-                    # reading files with pyarrow, then converting them to polars
-                    # can cause "ShapeError" bugs. That's why we're not reading this
-                    # using pyarrow.
-                    pl.read_parquet(
-                        source=stream,
-                        columns=op.arrow_schema.names,
-                        use_pyarrow=False,
-                    )
-                    .to_arrow()
-                    .to_batches()
+    ) -> Ok[_LazyFrameWithMetrics]:
+        data = cast(pl.DataFrame, pl.from_arrow(op.arrow_schema.empty_table()))
+        data = pl.scan_parquet(
+            [
+                context.exit_stack.enter_context(
+                    self._storage_manager.blob_from_url(blob_name).open("rb")
                 )
-        return Ok(_SchemaAndBatches(op.arrow_schema, batches=batches, metrics={}))
+                for blob_name in op.blob_names
+            ],
+            schema=data.schema,
+        )
+        return Ok(_LazyFrameWithMetrics(data, metrics={}))
 
     def _execute_rollup_by_aggregation(
         self, op: op_graph.op.RollupByAggregation, context: _InMemoryExecutionContext
-    ) -> Ok[_SchemaAndBatches]:
+    ) -> Ok[_LazyFrameWithMetrics]:
         raise NotImplementedError(
             "rollup by aggregation outside of sql not implemented"
         )
 
+    def _compute_source_then_apply(
+        self,
+        source: op_graph.Op,
+        lf_op: Callable[[pl.LazyFrame], pl.LazyFrame],
+        context: _InMemoryExecutionContext,
+    ):
+        return self._execute(source, context).map(
+            lambda source_lfm: source_lfm.apply(lf_op)
+        )
+
     def _execute_rename_columns(
         self, op: op_graph.op.RenameColumns, context: _InMemoryExecutionContext
     ):
-        return self._execute(op.source, context).map(
-            lambda source_batches: _SchemaAndBatches.from_dataframe(
-                _as_df(source_batches).rename(dict(op.old_name_to_new)),
-                source_batches.metrics,
-            )
+        return self._compute_source_then_apply(
+            op.source, lambda lf: lf.rename(dict(op.old_name_to_new)), context
         )
 
     def _execute_select_columns(
         self, op: op_graph.op.SelectColumns, context: _InMemoryExecutionContext
     ):
-        return self._execute(op.source, context).map(
-            lambda source_batches: _SchemaAndBatches.from_dataframe(
-                _as_df(source_batches).select(op.columns), source_batches.metrics
-            )
+        return self._compute_source_then_apply(
+            op.source, lambda lf: lf.select(op.columns), context
        )
 
     def _execute_limit_rows(
         self, op: op_graph.op.LimitRows, context: _InMemoryExecutionContext
     ):
-        return self._execute(op.source, context).map(
-            lambda source_batches: _SchemaAndBatches.from_dataframe(
-                _as_df(source_batches).limit(op.num_rows),
-                source_batches.metrics,
-            )
+        return self._compute_source_then_apply(
+            op.source, lambda lf: lf.limit(op.num_rows), context
         )
 
     def _execute_offset_rows(
         self, op: op_graph.op.OffsetRows, context: _InMemoryExecutionContext
     ):
-        return self._execute(op.source, context).map(
-            lambda source_batches: _SchemaAndBatches.from_dataframe(
-                _as_df(source_batches).slice(op.num_rows),
-                source_batches.metrics,
-            )
+        return self._compute_source_then_apply(
+            op.source, lambda lf: lf.slice(op.num_rows), context
         )
 
     def _execute_order_by(
         self, op: op_graph.op.OrderBy, context: _InMemoryExecutionContext
     ):
-        return self._execute(op.source, context).map(
-            lambda source_batches: _SchemaAndBatches.from_dataframe(
-                _as_df(source_batches).sort(op.columns, descending=op.desc),
-                source_batches.metrics,
-            )
+        return self._compute_source_then_apply(
+            op.source, lambda lf: lf.sort(op.columns, descending=op.desc), context
        )
 
     def _row_filter_literal_comparison_to_condition(
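Note: _execute_read_from_parquet switches from eagerly decoding each blob with pl.read_parquet to one pl.scan_parquet over streams kept alive on the context's exit stack, and the simple column ops now route through _compute_source_then_apply, so selections and filters become part of a single lazy plan. A toy eager-versus-lazy comparison on a local file (paths and column names are invented for illustration):

import tempfile
from pathlib import Path

import polars as pl

path = Path(tempfile.mkdtemp()) / "demo.parquet"
pl.DataFrame({"id": [1, 2, 3], "payload": ["a", "b", "c"]}).write_parquet(path)

# Eager: the whole file is decoded before the column pruning happens.
eager = pl.read_parquet(path, columns=["id"])

# Lazy: the select is part of the plan, so the scan can prune to the "id" column.
lazy = pl.scan_parquet(path).select("id").collect()

assert eager.to_dicts() == lazy.to_dicts()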
@@ -503,34 +524,31 @@ class InMemoryExecutor(OpGraphExecutor):
     def _execute_filter_rows(
         self, op: op_graph.op.FilterRows, context: _InMemoryExecutionContext
     ):
-        match self._execute(op.source, context):
-            case Ok(source_batches):
-                return self._row_filter_to_condition(op.row_filter).map_or_else(
-                    lambda err: InternalError.from_(err),
-                    lambda row_filter: Ok(
-                        _SchemaAndBatches.from_dataframe(
-                            _as_df(source_batches).filter(row_filter),
-                            source_batches.metrics,
-                        )
-                    ),
-                )
-            case err:
-                return err
+        match self._row_filter_to_condition(op.row_filter):
+            case op_graph.OpParseError() as err:
+                return InternalError.from_(err)
+            case Ok(row_filter):
+                pass
+        return self._compute_source_then_apply(
+            op.source, lambda lf: lf.filter(row_filter), context
+        )
 
     def _execute_embedding_metrics(  # noqa: C901
         self, op: op_graph.op.EmbeddingMetrics, context: _InMemoryExecutionContext
     ):
         match self._execute(op.table, context):
-            case Ok(source_batches):
+            case Ok(source_lfm):
                 pass
             case err:
                 return err
-        embedding_df = _as_df(source_batches)
+        embedding_df = source_lfm.data.collect()
 
         if len(embedding_df) < _MIN_EMBEDDINGS_FOR_EMBEDDINGS_SUMMARY:
             # downstream consumers handle empty metadata by substituting their
             # own values
-            return Ok(source_batches)
+            return Ok(
+                _LazyFrameWithMetrics(embedding_df.lazy(), metrics=source_lfm.metrics)
+            )
 
         # before it was configurable, this op assumed that the column's name was
         # this hardcoded name
@@ -541,7 +559,7 @@ class InMemoryExecutor(OpGraphExecutor):
             case InvalidArgumentError() as err:
                 return InternalError.from_(err)
 
-        metrics = source_batches.metrics.copy()
+        metrics = source_lfm.metrics.copy()
         match embedding_metric.ne_sum(embedding, normalize=True):
             case Ok(metric):
                 metrics["ne_sum"] = metric
@@ -564,17 +582,17 @@ class InMemoryExecutor(OpGraphExecutor):
                 metrics["stable_rank"] = metric
             case InvalidArgumentError() as err:
                 _logger.warning("could not compute stable_rank", exc_info=str(err))
-        return Ok(_SchemaAndBatches.from_dataframe(embedding_df, metrics=metrics))
+        return Ok(_LazyFrameWithMetrics(embedding_df.lazy(), metrics=metrics))
 
     def _execute_embedding_coordinates(
         self, op: op_graph.op.EmbeddingCoordinates, context: _InMemoryExecutionContext
     ):
         match self._execute(op.table, context):
-            case Ok(source_batches):
+            case Ok(source_lfm):
                 pass
             case err:
                 return err
-        embedding_df = _as_df(source_batches)
+        embedding_df = source_lfm.data.collect()
 
         # before it was configurable, this op assumed that the column's name was
         # this hardcoded name
@@ -583,16 +601,14 @@ class InMemoryExecutor(OpGraphExecutor):
         # the neighbors of a point includes itself. That does mean, that an n_neighbors
         # value of less than 3 simply does not work
         if len(embedding_df) < _MIN_EMBEDDINGS_FOR_EMBEDDINGS_SUMMARY:
-            coordinates_df = embedding_df.with_columns(
+            coordinates_df = embedding_df.lazy().with_columns(
                 pl.Series(
                     name=embedding_column_name,
                     values=[[0.0] * op.n_components] * len(embedding_df),
                     dtype=pl.List(pl.Float32),
                 )
             )
-            return Ok(
-                _SchemaAndBatches.from_dataframe(coordinates_df, source_batches.metrics)
-            )
+            return Ok(_LazyFrameWithMetrics(coordinates_df, source_lfm.metrics))
 
         match get_polars_embedding(embedding_df, embedding_column_name):
             case Ok(embedding):
@@ -608,39 +624,37 @@ class InMemoryExecutor(OpGraphExecutor):
             case InvalidArgumentError() as err:
                 raise err
 
-        coordinates_df = embedding_df.with_columns(
+        coordinates_df = embedding_df.lazy().with_columns(
             pl.Series(
                 name=embedding_column_name,
                 values=coordinates,
                 dtype=pl.List(pl.Float32),
             )
         )
-        return Ok(
-            _SchemaAndBatches.from_dataframe(coordinates_df, source_batches.metrics)
-        )
+        return Ok(_LazyFrameWithMetrics(coordinates_df, source_lfm.metrics))
 
     def _execute_distinct_rows(
         self, op: op_graph.op.DistinctRows, context: _InMemoryExecutionContext
     ):
         return self._execute(op.source, context).map(
-            lambda source_batches: _SchemaAndBatches.from_dataframe(
-                _as_df(source_batches).unique(), source_batches.metrics
+            lambda source_lfm: _LazyFrameWithMetrics(
+                source_lfm.data.unique(), source_lfm.metrics
             )
         )
 
     def _execute_join(self, op: op_graph.op.Join, context: _InMemoryExecutionContext):
         match self._execute(op.left_source, context):
-            case Ok(left_batches):
+            case Ok(left_lfm):
                 pass
             case err:
                 return err
         match self._execute(op.right_source, context):
-            case Ok(right_batches):
+            case Ok(right_lfm):
                 pass
             case err:
                 return err
-        left_df = _as_df(left_batches)
-        right_df = _as_df(right_batches)
+        left_lf = left_lfm.data
+        right_lf = right_lfm.data
 
         match op.how:
             case table_pb2.JOIN_TYPE_INNER:
@@ -651,32 +665,20 @@ class InMemoryExecutor(OpGraphExecutor):
                 join_type = "inner"
 
         # in our join semantics we drop columns from the right source on conflict
-        right_df = right_df.select(
+        right_lf = right_lf.select(
             [
                 col
-                for col in right_df.columns
-                if col in op.right_join_columns or col not in left_df.columns
+                for col in right_lf.columns
+                if col in op.right_join_columns or col not in left_lf.columns
             ]
         )
-        metrics = right_batches.metrics.copy()
-        metrics.update(left_batches.metrics)
-
-        # polars doesn't behave so well when one side is empty, just
-        # compute the trivial empty join when the result is guaranteed
-        # to be empty instead.
-        if len(left_df) == 0 or len(right_df) == 0 and join_type == "inner":
-            return Ok(
-                _SchemaAndBatches(
-                    schema=op.schema.to_arrow(),
-                    batches=op.schema.to_arrow().empty_table().to_batches(),
-                    metrics=metrics,
-                )
-            )
+        metrics = right_lfm.metrics.copy()
+        metrics.update(left_lfm.metrics)
 
         return Ok(
-            _SchemaAndBatches.from_dataframe(
-                left_df.join(
-                    right_df,
+            _LazyFrameWithMetrics(
+                left_lf.join(
+                    right_lf,
                     left_on=op.left_join_columns,
                     right_on=op.right_join_columns,
                     how=join_type,
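Note: the join now stays lazy on both sides, and the special case for empty inputs is gone. The conflict rule described by the comment above (right-source columns are dropped when they collide with the left schema, unless they are join keys) can be sketched in isolation like this, with table contents invented for illustration:

import polars as pl

left = pl.DataFrame({"id": [1, 2], "name": ["a", "b"]}).lazy()
right = pl.DataFrame({"id": [1, 2], "name": ["x", "y"], "score": [0.1, 0.9]}).lazy()

join_columns = ["id"]
# Keep right columns only if they are join keys or do not clash with the left schema.
right_pruned = right.select(
    [col for col in right.columns if col in join_columns or col not in left.columns]
)

result = left.join(right_pruned, left_on=join_columns, right_on=join_columns, how="inner")
print(result.collect())  # "name" comes from the left table; "score" survives from the right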
@@ -686,62 +688,47 @@ class InMemoryExecutor(OpGraphExecutor):
         )
 
     def _execute_empty(self, op: op_graph.op.Empty, context: _InMemoryExecutionContext):
-        empty_table = pa.schema([]).empty_table()
-        return Ok(
-            _SchemaAndBatches(empty_table.schema, empty_table.to_batches(), metrics={})
-        )
+        empty_table = cast(pl.DataFrame, pl.from_arrow(pa.schema([]).empty_table()))
+        return Ok(_LazyFrameWithMetrics(empty_table.lazy(), metrics={}))
 
     def _execute_concat(
         self, op: op_graph.op.Concat, context: _InMemoryExecutionContext
     ):
-        source_batches = list[_SchemaAndBatches]()
+        source_lfms = list[_LazyFrameWithMetrics]()
         for table in op.tables:
             match self._execute(table, context):
                 case Ok(batches):
-                    source_batches.append(batches)
+                    source_lfms.append(batches)
                 case err:
                     return err
-        dataframes = [_as_df(batches) for batches in source_batches]
+        data = pl.concat([lfm.data for lfm in source_lfms], how=op.how)
         metrics = dict[str, Any]()
-        for batches in source_batches:
-            metrics.update(batches.metrics)
-        return Ok(
-            _SchemaAndBatches.from_dataframe(
-                pl.concat(dataframes, how=op.how), metrics=metrics
-            )
-        )
+        for lfm in source_lfms:
+            metrics.update(lfm.metrics)
+        return Ok(_LazyFrameWithMetrics(data, metrics=metrics))
 
     def _execute_unnest_struct(
         self, op: op_graph.op.UnnestStruct, context: _InMemoryExecutionContext
     ):
-        return self._execute(op.source, context).map(
-            lambda source_batches: _SchemaAndBatches.from_dataframe(
-                _as_df(source_batches).unnest(op.struct_column_name),
-                source_batches.metrics,
-            )
+        return self._compute_source_then_apply(
+            op.source, lambda lf: lf.unnest(op.struct_column_name), context
        )
 
     def _execute_nest_into_struct(
         self, op: op_graph.op.NestIntoStruct, context: _InMemoryExecutionContext
     ):
-        match self._execute(op.source, context):
-            case Ok(source_batches):
-                pass
-            case err:
-                return err
         non_struct_columns = [
-            name
-            for name in source_batches.schema.names
-            if name not in op.column_names_to_nest
+            field.name
+            for field in op.source.schema
+            if field.name not in op.column_names_to_nest
         ]
-        return Ok(
-            _SchemaAndBatches.from_dataframe(
-                _as_df(source_batches).select(
-                    *non_struct_columns,
-                    pl.struct(op.column_names_to_nest).alias(op.struct_column_name),
-                ),
-                source_batches.metrics,
-            )
+        return self._compute_source_then_apply(
+            op.source,
+            lambda lf: lf.select(
+                *non_struct_columns,
+                pl.struct(op.column_names_to_nest).alias(op.struct_column_name),
+            ),
+            context,
         )
 
     def _execute_add_literal_column(
@@ -758,57 +745,49 @@ class InMemoryExecutor(OpGraphExecutor):
         else:
             column = pl.Series(name, literals).cast(dtype)
 
-        def do_work(source_batches: _SchemaAndBatches):
-            return _SchemaAndBatches.from_dataframe(
-                _as_df(source_batches).with_columns(column),
-                source_batches.metrics,
-            )
-
-        return self._execute(op.source, context).map(do_work)
+        return self._compute_source_then_apply(
+            op.source,
+            lambda lf: lf.with_columns(column),
+            context,
+        )
 
     def _execute_combine_columns(
         self, op: op_graph.op.CombineColumns, context: _InMemoryExecutionContext
     ):
-        match self._execute(op.source, context):
-            case Ok(source_batches):
-                pass
-            case err:
-                return err
-        source_df = _as_df(source_batches)
         match op.reduction:
-            case op_graph.ConcatString():
+            case op_graph.ConcatString() as reduction:
                 # if we do not ignore nulls then all concatenated rows that
                 # have a single column that contain a null value will be output
                 # as null.
-                result_df = source_df.with_columns(
-                    pl.concat_str(
-                        [pl.col(col) for col in op.column_names],
-                        separator=op.reduction.separator,
-                        ignore_nulls=True,
-                    ).alias(op.combined_column_name)
-                )
+                concat_expr = pl.concat_str(
+                    [pl.col(col) for col in op.column_names],
+                    separator=reduction.separator,
+                    ignore_nulls=True,
+                ).alias(op.combined_column_name)
 
             case op_graph.ConcatList():
                 if op.column_names:
-                    result_df = source_df.with_columns(
-                        pl.concat_list(*op.column_names).alias(op.combined_column_name)
+                    concat_expr = pl.concat_list(*op.column_names).alias(
+                        op.combined_column_name
                     )
                 else:
-                    result_df = source_df.with_columns(
-                        pl.Series(op.combined_column_name, [])
-                    )
+                    concat_expr = pl.Series(op.combined_column_name, [])
 
-        return Ok(_SchemaAndBatches.from_dataframe(result_df, source_batches.metrics))
+        return self._compute_source_then_apply(
+            op.source,
+            lambda lf: lf.with_columns(concat_expr),
+            context,
+        )
 
     def _execute_embed_column(
         self, op: op_graph.op.EmbedColumn, context: _InMemoryExecutionContext
     ):
         match self._execute(op.source, context):
-            case Ok(source_batches):
+            case Ok(source_lfm):
                 pass
             case err:
                 return err
-        source_df = _as_df(source_batches)
+        source_df = source_lfm.data.collect()
         to_embed = source_df[op.column_name].cast(pl.String())
 
         embed_context = EmbedTextContext(
@@ -825,17 +804,14 @@ class InMemoryExecutor(OpGraphExecutor):
             case InvalidArgumentError() | InternalError() as err:
                 raise InternalError("Failed to embed column") from err
 
-        result_df = source_df.with_columns(
-            result.embeddings.alias(op.embedding_column_name)
-        ).drop_nulls(op.embedding_column_name)
-
-        return Ok(
-            _SchemaAndBatches.from_dataframe(
-                result_df,
-                source_batches.metrics,
-            )
+        result_df = (
+            source_df.lazy()
+            .with_columns(result.embeddings.alias(op.embedding_column_name))
+            .drop_nulls(op.embedding_column_name)
         )
 
+        return Ok(source_lfm.with_data(result_df))
+
     @staticmethod
     def get_cyclic_encoding(
         series: pl.Series,
@@ -952,12 +928,12 @@ class InMemoryExecutor(OpGraphExecutor):
         self, op: op_graph.op.EncodeColumns, context: _InMemoryExecutionContext
     ):
         match self._execute(op.source, context):
-            case Ok(source_batches):
+            case Ok(source_lfm):
                 pass
             case err:
                 return err
-        source_df = _as_df(source_batches)
-        metrics = source_batches.metrics.copy()
+        source_df = source_lfm.data.collect()
+        metrics = source_lfm.metrics.copy()
         metric = metrics.get("one_hot_encoder", {})
         for encoder_arg in op.encoded_columns:
             to_encode = source_df[encoder_arg.column_name]
@@ -1084,8 +1060,8 @@ class InMemoryExecutor(OpGraphExecutor):
             )
         metrics["one_hot_encoder"] = metric
         return Ok(
-            _SchemaAndBatches.from_dataframe(
-                source_df,
+            _LazyFrameWithMetrics(
+                source_df.lazy(),
                 metrics,
             )
         )
@@ -1122,43 +1098,40 @@ class InMemoryExecutor(OpGraphExecutor):
 
         metrics = dict[str, Any]()
 
-        edge_list_batches = list[_SchemaAndBatches]()
+        edge_list_lfms = list[_LazyFrameWithMetrics]()
         for edge_list in op.edge_list_tables:
             match self._execute(edge_list.table, context):
-                case Ok(source_batches):
-                    edge_list_batches.append(source_batches)
+                case Ok(source_lfm):
+                    edge_list_lfms.append(source_lfm)
                 case err:
                     return err
 
         def edge_generator():
-            for edge_list, batches in zip(
-                op.edge_list_tables, edge_list_batches, strict=True
-            ):
+            for edge_list, lfm in zip(op.edge_list_tables, edge_list_lfms, strict=True):
                 start_column_name = edge_list.start_column_name
                 end_column_name = edge_list.end_column_name
                 start_column_type_name = entities_dtypes[start_column_name]
                 end_column_type_name = entities_dtypes[end_column_name]
-                metrics.update(batches.metrics)
-                for batch in batches.batches:
-                    yield (
-                        _as_df(batch)
-                        .with_columns(
-                            pl.col(edge_list.start_column_name).alias(
-                                f"start_id_{start_column_type_name}"
-                            ),
-                            pl.lit(edge_list.start_entity_name).alias("start_source"),
-                            pl.col(edge_list.end_column_name).alias(
-                                f"end_id_{end_column_type_name}"
-                            ),
-                            pl.lit(edge_list.end_entity_name).alias("end_source"),
-                        )
-                        .select(
-                            f"start_id_{start_column_type_name}",
-                            "start_source",
-                            f"end_id_{end_column_type_name}",
-                            "end_source",
-                        )
+                metrics.update(lfm.metrics)
+                yield (
+                    lfm.data.with_columns(
+                        pl.col(edge_list.start_column_name).alias(
+                            f"start_id_{start_column_type_name}"
+                        ),
+                        pl.lit(edge_list.start_entity_name).alias("start_source"),
+                        pl.col(edge_list.end_column_name).alias(
+                            f"end_id_{end_column_type_name}"
+                        ),
+                        pl.lit(edge_list.end_entity_name).alias("end_source"),
+                    )
+                    .select(
+                        f"start_id_{start_column_type_name}",
+                        "start_source",
+                        f"end_id_{end_column_type_name}",
+                        "end_source",
                     )
+                    .collect()
+                )
 
         edges = pl.concat(
             [
@@ -1187,18 +1160,17 @@ class InMemoryExecutor(OpGraphExecutor):
             negative=op.negative,
         )
         n2v_runner.train(epochs=op.epochs)
-        return Ok(_SchemaAndBatches.from_dataframe(n2v_runner.wv.to_polars(), metrics))
+        return Ok(_LazyFrameWithMetrics(n2v_runner.wv.to_polars().lazy(), metrics))
 
     def _execute_aggregate_columns(
         self, op: op_graph.op.AggregateColumns, context: _InMemoryExecutionContext
     ):
         match self._execute(op.source, context):
-            case Ok(source_batches):
+            case Ok(source_lfm):
                 pass
             case err:
                 return err
-        source_df = _as_df(source_batches)
-        to_aggregate = source_df[op.column_names]
+        to_aggregate = source_lfm.data.select(op.column_names)
 
         match op.aggregation:
             case op_graph.aggregation.Min():
@@ -1216,106 +1188,92 @@ class InMemoryExecutor(OpGraphExecutor):
             case op_graph.aggregation.NullCount():
                 aggregate = to_aggregate.null_count()
 
-        return Ok(_SchemaAndBatches.from_dataframe(aggregate, metrics={}))
+        return Ok(source_lfm.with_data(aggregate))
 
     def _execute_correlate_columns(
         self, op: op_graph.op.CorrelateColumns, context: _InMemoryExecutionContext
     ):
         match self._execute(op.source, context):
-            case Ok(source_batches):
+            case Ok(source_lfm):
                 pass
             case err:
                 return err
-        source_df = _as_df(source_batches)
+        source_df = source_lfm.data.collect()
         with np.errstate(invalid="ignore"):
-            corr_df = source_df[op.column_names].corr(dtype="float32")
+            corr_df = source_df.select(op.column_names).corr(dtype="float32")
 
-        return Ok(
-            _SchemaAndBatches.from_dataframe(
-                corr_df,
-                metrics={},
-            )
-        )
+        return Ok(source_lfm.with_data(corr_df.lazy()))
 
     def _execute_histogram_column(
         self, op: op_graph.op.HistogramColumn, context: _InMemoryExecutionContext
     ):
-        return self._execute(op.source, context).map(
-            lambda source_batches: _SchemaAndBatches.from_dataframe(
-                _as_df(source_batches)[op.column_name]
-                .hist(include_category=False)
-                .rename(
-                    {
-                        "breakpoint": op.breakpoint_column_name,
-                        "count": op.count_column_name,
-                    }
-                ),
-                metrics={},
-            )
+        return self._compute_source_then_apply(
+            op.source,
+            lambda lf: lf.collect()[op.column_name]
+            .hist(include_category=False)
+            .lazy()
+            .rename(
+                {
+                    "breakpoint": op.breakpoint_column_name,
+                    "count": op.count_column_name,
+                }
+            ),
+            context,
         )
 
     def _execute_convert_column_to_string(
         self, op: op_graph.op.ConvertColumnToString, context: _InMemoryExecutionContext
     ):
-        match self._execute(op.source, context):
-            case Ok(source_batches):
-                pass
-            case err:
-                return err
-        source_df = _as_df(source_batches)
-        column = source_df[op.column_name]
-        if not column.dtype.is_nested():
-            source_df = source_df.with_columns(column.cast(pl.String(), strict=False))
-        elif isinstance(column.dtype, pl.Array | pl.List):
-            source_df = source_df.with_columns(
-                column.cast(pl.List(pl.String())).list.join(",")
-            )
+        dtype = op.source.schema.to_polars()[op.column_name]
+        if not dtype.is_nested():
+            cast_expr = pl.col(op.column_name).cast(pl.String(), strict=False)
+        elif isinstance(dtype, pl.Array | pl.List):
+            cast_expr = pl.col(op.column_name).cast(pl.List(pl.String())).list.join(",")
         else:
             raise NotImplementedError(
                 "converting struct columns to strings is not implemented"
             )
-        return Ok(
-            _SchemaAndBatches.from_dataframe(source_df, metrics=source_batches.metrics)
+        return self._compute_source_then_apply(
+            op.source, lambda lf: lf.collect().with_columns(cast_expr).lazy(), context
         )
 
     def _execute_add_row_index(
         self, op: op_graph.op.AddRowIndex, context: _InMemoryExecutionContext
     ):
-        return self._execute(op.source, context).map(
-            lambda source_batches: _SchemaAndBatches.from_dataframe(
-                _as_df(source_batches)
-                .with_row_index(name=op.row_index_column_name, offset=op.offset)
-                .with_columns(pl.col(op.row_index_column_name).cast(pl.UInt64())),
-                metrics=source_batches.metrics,
-            )
+        return self._compute_source_then_apply(
+            op.source,
+            lambda lf: lf.with_row_index(
+                name=op.row_index_column_name, offset=op.offset
+            ).with_columns(pl.col(op.row_index_column_name).cast(pl.UInt64())),
+            context,
        )
 
     def _execute_output_csv(
         self, op: op_graph.op.OutputCsv, context: _InMemoryExecutionContext
     ):
         match self._execute(op.source, context):
-            case Ok(source_batches):
+            case Ok(source_lfm):
                 pass
             case err:
                 return err
-        source_df = _as_df(source_batches)
+        source_df = source_lfm.data.collect()
         source_df.write_csv(
             op.csv_url,
             quote_style="never",
             include_header=op.include_header,
         )
-        return Ok(source_batches)
+        return Ok(source_lfm.with_data(source_df.lazy()))
 
     def _execute_truncate_list(
         self, op: op_graph.op.TruncateList, context: _InMemoryExecutionContext
     ):
         # TODO(Patrick): verify this approach works for arrays
         match self._execute(op.source, context):
-            case Ok(source_batches):
+            case Ok(source_lfm):
                 pass
             case err:
                 return err
-        source_df = _as_df(source_batches)
+        source_df = source_lfm.data.collect()
         if len(source_df):
             existing_length = get_polars_embedding_length(
                 source_df, op.column_name
@@ -1336,6 +1294,7 @@ class InMemoryExecutor(OpGraphExecutor):
         else:
             return InternalError("unexpected type", cause="expected list or array type")
 
+        source_df = source_df.lazy()
         if head_length < op.target_column_length:
             padding_length = op.target_column_length - head_length
             padding = [op.padding_value_as_py] * padding_length
@@ -1347,16 +1306,14 @@ class InMemoryExecutor(OpGraphExecutor):
             .list.to_array(width=op.target_column_length)
             .cast(pl.List(inner_type))
         )
-        return Ok(
-            _SchemaAndBatches.from_dataframe(source_df, metrics=source_batches.metrics)
-        )
+        return Ok(source_lfm.with_data(source_df))
 
     def _execute_union(self, op: op_graph.op.Union, context: _InMemoryExecutionContext):
-        sources = list[_SchemaAndBatches]()
+        sources = list[_LazyFrameWithMetrics]()
         for source in op.sources():
             match self._execute(source, context):
-                case Ok(source_df):
-                    sources.append(source_df)
+                case Ok(source_lfm):
+                    sources.append(source_lfm)
                 case err:
                     return err
 
@@ -1364,20 +1321,20 @@ class InMemoryExecutor(OpGraphExecutor):
         for src in sources:
             metrics.update(src.metrics)
 
-        result_df = pl.concat((_as_df(src) for src in sources), how="vertical_relaxed")
+        result_lf = pl.concat((src.data for src in sources), how="vertical_relaxed")
         if op.distinct:
-            result_df = result_df.unique()
-        return Ok(_SchemaAndBatches.from_dataframe(result_df, metrics=metrics))
+            result_lf = result_lf.unique()
+        return Ok(_LazyFrameWithMetrics(result_lf, metrics=metrics))
 
     def _execute_embed_image_column(
         self, op: op_graph.op.EmbedImageColumn, context: _InMemoryExecutionContext
     ):
         match self._execute(op.source, context):
-            case Ok(source_batches):
+            case Ok(source_lfm):
                 pass
             case err:
                 return err
-        source_df = _as_df(source_batches)
+        source_df = source_lfm.data.collect()
         to_embed = source_df[op.column_name].cast(pl.Binary())
 
         embed_context = EmbedImageContext(
@@ -1392,14 +1349,12 @@ class InMemoryExecutor(OpGraphExecutor):
             case InvalidArgumentError() | InternalError() as err:
                 raise InternalError("Failed to embed column") from err
 
-        result_df = source_df.with_columns(
-            result.embeddings.alias(op.embedding_column_name)
-        ).drop_nulls(op.embedding_column_name)
-
         return Ok(
-            _SchemaAndBatches.from_dataframe(
-                result_df,
-                source_batches.metrics,
+            _LazyFrameWithMetrics(
+                source_df.lazy()
+                .with_columns(result.embeddings.alias(op.embedding_column_name))
+                .drop_nulls(op.embedding_column_name),
+                source_lfm.metrics,
             )
         )
 
@@ -1407,13 +1362,15 @@ class InMemoryExecutor(OpGraphExecutor):
         self, op: op_graph.op.AddDecisionTreeSummary, context: _InMemoryExecutionContext
     ):
         match self._execute(op.source, context):
-            case Ok(source_batches):
+            case Ok(source_lfm):
                 pass
             case err:
                 return err
 
-        df_input = _as_df(source_batches)
-        dataframe = df_input[list({*op.feature_column_names, op.label_column_name})]
+        df_input = source_lfm.data.collect()
+        dataframe = df_input.select(
+            list({*op.feature_column_names, op.label_column_name})
+        )
         boolean_columns = [
             name
             for name, dtype in dataframe.schema.items()
@@ -1473,36 +1430,33 @@ class InMemoryExecutor(OpGraphExecutor):
             )
             tree_str = tree_str.replace(f"{boolean_column} > 0.50", boolean_column)
 
-        metrics = source_batches.metrics.copy()
+        metrics = source_lfm.metrics.copy()
         metrics[op.output_metric_key] = table_pb2.DecisionTreeSummary(
             text=tree_str, graphviz=tree_graphviz
         )
-        return Ok(_SchemaAndBatches.from_dataframe(df_input, metrics=metrics))
+        return Ok(_LazyFrameWithMetrics(df_input.lazy(), metrics=metrics))
 
     def _execute_unnest_list(
         self, op: op_graph.op.UnnestList, context: _InMemoryExecutionContext
     ):
-        return self._execute(op.source, context).map(
-            lambda source_batches: _SchemaAndBatches.from_dataframe(
-                _as_df(source_batches)
-                .with_columns(
-                    pl.col(op.list_column_name).list.get(i).alias(column_name)
-                    for i, column_name in enumerate(op.column_names)
-                )
-                .drop(op.list_column_name),
-                source_batches.metrics,
-            )
+        return self._compute_source_then_apply(
+            op.source,
+            lambda lf: lf.with_columns(
+                pl.col(op.list_column_name).list.get(i).alias(column_name)
+                for i, column_name in enumerate(op.column_names)
+            ).drop(op.list_column_name),
+            context,
        )
 
     def _execute_sample_rows(
         self, op: op_graph.op.SampleRows, context: _InMemoryExecutionContext
     ):
         match self._execute(op.source, context):
-            case Ok(source_batches):
+            case Ok(source_lfm):
                 pass
            case err:
                 return err
-        source_df = _as_df(source_batches)
+        source_df = source_lfm.data.collect()
         n = min(op.num_rows, source_df.shape[0])
         sample_strategy = op.sample_strategy
         match sample_strategy:
@@ -1513,9 +1467,9 @@ class InMemoryExecutor(OpGraphExecutor):
                 )
 
         return Ok(
-            _SchemaAndBatches.from_dataframe(
-                result_df,
-                source_batches.metrics,
+            _LazyFrameWithMetrics(
+                result_df.lazy(),
+                source_lfm.metrics,
             )
         )
 
@@ -1523,15 +1477,16 @@ class InMemoryExecutor(OpGraphExecutor):
         self, op: op_graph.op.DescribeColumns, context: _InMemoryExecutionContext
     ):
         match self._execute(op.source, context):
-            case Ok(source_batches):
+            case Ok(source_lfm):
                 pass
             case err:
                 return err
-        source_df = _as_df(source_batches)
+        source_df = source_lfm.data.collect()
         return Ok(
-            _SchemaAndBatches.from_dataframe(
-                source_df.describe().rename({"statistic": op.statistic_column_name}),
-                source_batches.metrics,
+            source_lfm.with_data(
+                source_df.describe()
+                .lazy()
+                .rename({"statistic": op.statistic_column_name})
             )
         )
 
@@ -1552,7 +1507,7 @@ class InMemoryExecutor(OpGraphExecutor):
         op: op_graph.Op,
         context: _InMemoryExecutionContext,
     ) -> (
-        Ok[_SchemaAndBatches]
+        Ok[_LazyFrameWithMetrics]
         | InternalError
         | ResourceExhaustedError
         | InvalidArgumentError
@@ -1572,13 +1527,12 @@ class InMemoryExecutor(OpGraphExecutor):
                 return InternalError.from_(err)
             case sql.NoRowsError() as err:
                 return Ok(
-                    _SchemaAndBatches.from_dataframe(
+                    _LazyFrameWithMetrics(
                         cast(
                             pl.DataFrame,
                             pl.from_arrow(expected_schema.empty_table()),
-                        ),
+                        ).lazy(),
                         metrics={},
-                        expected_schema=expected_schema,
                     )
                 )
             case Ok(query):
@@ -1586,10 +1540,9 @@ class InMemoryExecutor(OpGraphExecutor):
                 return self._staging_db.run_select_query(
                     query, expected_schema, context.current_slice_args
                 ).map(
-                    lambda rbr: _SchemaAndBatches.from_dataframe(
-                        _as_df(rbr, expected_schema),
+                    lambda rbr: _LazyFrameWithMetrics(
+                        _as_df(rbr, expected_schema).lazy(),
                         metrics={},
-                        expected_schema=expected_schema,
                     )
                 )
 
@@ -1681,7 +1634,7 @@ class InMemoryExecutor(OpGraphExecutor):
         op: op_graph.Op,
         context: _InMemoryExecutionContext,
     ) -> (
-        Ok[_SchemaAndBatches]
+        Ok[_LazyFrameWithMetrics]
         | InternalError
         | ResourceExhaustedError
         | InvalidArgumentError
@@ -1713,11 +1666,11 @@ class InMemoryExecutor(OpGraphExecutor):
 
         try:
             _logger.info("starting op execution")
-            maybe_batches = self._do_execute(op=op, context=context)
+            maybe_lfm = self._do_execute(op=op, context=context)
         finally:
             _logger.info("op execution complete")
-        match maybe_batches:
-            case Ok(batches):
+        match maybe_lfm:
+            case Ok(lfm):
                 pass
             case err:
                 if span:
@@ -1728,8 +1681,12 @@ class InMemoryExecutor(OpGraphExecutor):
             sliced_table in context.output_tables
             or sliced_table in context.reused_tables
         ):
-            context.computed_batches_for_op_graph[sliced_table] = batches
-        return Ok(batches)
+            # collect the lazy frame since it will be re-used to avoid
+            # re-computation
+            dataframe = lfm.data.collect()
+            lfm = _LazyFrameWithMetrics(dataframe.lazy(), lfm.metrics)
+            context.computed_batches_for_op_graph[sliced_table] = lfm
+        return Ok(lfm)
 
     def execute(
         self, context: ExecutionContext
@@ -1739,22 +1696,26 @@ class InMemoryExecutor(OpGraphExecutor):
         | InternalError
         | ResourceExhaustedError
     ):
-        in_memory_context = _InMemoryExecutionContext(context)
-
-        for table_context in context.tables_to_compute:
-            in_memory_context.current_output_context = table_context
-            sliced_table = _SlicedTable(
-                table_context.table_op_graph, table_context.sql_output_slice_args
-            )
-            if sliced_table not in in_memory_context.computed_batches_for_op_graph:
-                match self._execute(sliced_table.op_graph, in_memory_context):
-                    case Ok():
-                        pass
-                    case err:
-                        return err
+        with _InMemoryExecutionContext(context) as in_memory_context:
+            for table_context in context.tables_to_compute:
+                in_memory_context.current_output_context = table_context
+                sliced_table = _SlicedTable(
+                    table_context.table_op_graph, table_context.sql_output_slice_args
+                )
+                if sliced_table not in in_memory_context.computed_batches_for_op_graph:
+                    match self._execute(sliced_table.op_graph, in_memory_context):
+                        case Ok():
+                            pass
+                        case err:
+                            return err
+            args_lfm_iterator = in_memory_context.computed_batches_for_op_graph.items()
+            computed_tables = {
+                slice_args: _SchemaAndBatches.from_lazy_frame_with_metrics(lfm)
+                for slice_args, lfm in args_lfm_iterator
+            }
 
         return Ok(
             InMemoryExecutionResult.make(
-                self._storage_manager, in_memory_context, context
+                self._storage_manager, computed_tables, context
             )
         )
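Note: because results stored in computed_batches_for_op_graph may feed several downstream ops or output tables, _execute collects such a lazy frame once and caches a re-lazified copy, and execute only converts the cached frames to _SchemaAndBatches at the very end. The caching idea in isolation (dataframe contents are illustrative):

import polars as pl

expensive_plan = (
    pl.DataFrame({"x": list(range(1_000))})
    .lazy()
    .with_columns((pl.col("x") * 2).alias("y"))
)

# Consumers of the raw plan would each re-run the whole computation:
high = expensive_plan.filter(pl.col("y") > 100).collect()
low = expensive_plan.filter(pl.col("y") <= 100).collect()

# Collecting once and re-wrapping gives downstream consumers a cheap lazy view:
cached = expensive_plan.collect().lazy()
assert high.to_dicts() == cached.filter(pl.col("y") > 100).collect().to_dicts()
assert low.to_dicts() == cached.filter(pl.col("y") <= 100).collect().to_dicts()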