PyPI - palimpzest - Versions diffs - 0.6.4__py3-none-any.whl → 0.7.0__py3-none-any.whl - Mend

palimpzest 0.6.4-py3-none-any.whl → 0.7.0-py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (64)

palimpzest/__init__.py +5 -0
palimpzest/constants.py +110 -43
palimpzest/core/__init__.py +0 -78
palimpzest/core/data/dataclasses.py +382 -44
palimpzest/core/elements/filters.py +7 -3
palimpzest/core/elements/index.py +70 -0
palimpzest/core/elements/records.py +33 -11
palimpzest/core/lib/fields.py +1 -0
palimpzest/core/lib/schemas.py +4 -3
palimpzest/prompts/moa_proposer_convert_prompts.py +0 -4
palimpzest/prompts/prompt_factory.py +44 -7
palimpzest/prompts/split_merge_prompts.py +56 -0
palimpzest/prompts/split_proposer_prompts.py +55 -0
palimpzest/query/execution/execution_strategy.py +435 -53
palimpzest/query/execution/execution_strategy_type.py +20 -0
palimpzest/query/execution/mab_execution_strategy.py +532 -0
palimpzest/query/execution/parallel_execution_strategy.py +143 -172
palimpzest/query/execution/random_sampling_execution_strategy.py +240 -0
palimpzest/query/execution/single_threaded_execution_strategy.py +173 -203
palimpzest/query/generators/api_client_factory.py +31 -0
palimpzest/query/generators/generators.py +256 -76
palimpzest/query/operators/__init__.py +1 -2
palimpzest/query/operators/code_synthesis_convert.py +33 -18
palimpzest/query/operators/convert.py +30 -97
palimpzest/query/operators/critique_and_refine_convert.py +5 -6
palimpzest/query/operators/filter.py +7 -10
palimpzest/query/operators/logical.py +54 -10
palimpzest/query/operators/map.py +130 -0
palimpzest/query/operators/mixture_of_agents_convert.py +6 -6
palimpzest/query/operators/physical.py +3 -12
palimpzest/query/operators/rag_convert.py +66 -18
palimpzest/query/operators/retrieve.py +230 -34
palimpzest/query/operators/scan.py +5 -2
palimpzest/query/operators/split_convert.py +169 -0
palimpzest/query/operators/token_reduction_convert.py +8 -14
palimpzest/query/optimizer/__init__.py +4 -16
palimpzest/query/optimizer/cost_model.py +73 -266
palimpzest/query/optimizer/optimizer.py +87 -58
palimpzest/query/optimizer/optimizer_strategy.py +18 -97
palimpzest/query/optimizer/optimizer_strategy_type.py +37 -0
palimpzest/query/optimizer/plan.py +2 -3
palimpzest/query/optimizer/primitives.py +5 -3
palimpzest/query/optimizer/rules.py +336 -172
palimpzest/query/optimizer/tasks.py +30 -100
palimpzest/query/processor/config.py +38 -22
palimpzest/query/processor/nosentinel_processor.py +16 -520
palimpzest/query/processor/processing_strategy_type.py +28 -0
palimpzest/query/processor/query_processor.py +38 -206
palimpzest/query/processor/query_processor_factory.py +117 -130
palimpzest/query/processor/sentinel_processor.py +90 -0
palimpzest/query/processor/streaming_processor.py +25 -32
palimpzest/sets.py +88 -41
palimpzest/utils/model_helpers.py +8 -7
palimpzest/utils/progress.py +368 -152
palimpzest/utils/token_reduction_helpers.py +1 -3
{palimpzest-0.6.4.dist-info → palimpzest-0.7.0.dist-info}/METADATA +19 -9
palimpzest-0.7.0.dist-info/RECORD +96 -0
{palimpzest-0.6.4.dist-info → palimpzest-0.7.0.dist-info}/WHEEL +1 -1
palimpzest/query/processor/mab_sentinel_processor.py +0 -884
palimpzest/query/processor/random_sampling_sentinel_processor.py +0 -639
palimpzest/utils/index_helpers.py +0 -6
palimpzest-0.6.4.dist-info/RECORD +0 -87
{palimpzest-0.6.4.dist-info → palimpzest-0.7.0.dist-info/licenses}/LICENSE +0 -0
{palimpzest-0.6.4.dist-info → palimpzest-0.7.0.dist-info}/top_level.txt +0 -0

palimpzest/query/optimizer/cost_model.py (changed)

@@ -1,5 +1,6 @@
 from __future__ import annotations
+import logging
 import math
 # NOTE: the answer.mode() call(s) inside of _est_quality() throw a UserWarning when there are multiple
@@ -12,11 +13,9 @@ import warnings
 from typing import Any
 import pandas as pd
-import scipy.stats as stats
 from palimpzest.constants import MODEL_CARDS, NAIVE_BYTES_PER_RECORD, GPT_4o_MODEL_CARD, Model
-from palimpzest.core.data.dataclasses import OperatorCostEstimates, PlanCost, RecordOpStats
-from palimpzest.core.elements.records import DataRecordSet
+from palimpzest.core.data.dataclasses import OperatorCostEstimates, PlanCost, RecordOpStats, SentinelPlanStats
 from palimpzest.query.operators.aggregate import ApplyGroupByOp, AverageAggregateOp, CountAggregateOp
 from palimpzest.query.operators.code_synthesis_convert import CodeSynthesisConvert
 from palimpzest.query.operators.convert import LLMConvert
@@ -25,12 +24,13 @@ from palimpzest.query.operators.limit import LimitScanOp
 from palimpzest.query.operators.physical import PhysicalOperator
 from palimpzest.query.operators.rag_convert import RAGConvert
 from palimpzest.query.operators.scan import CacheScanDataOp, MarshalAndScanDataOp, ScanPhysicalOp
-from palimpzest.query.operators.token_reduction_convert import TokenReducedConvert
-from palimpzest.query.optimizer.plan import SentinelPlan
+from palimpzest.query.operators.token_reduction_convert import TokenReducedConvertBonded
 from palimpzest.utils.model_helpers import get_champion_model_name, get_models
 warnings.simplefilter(action='ignore', category=UserWarning)
+logger = logging.getLogger(__name__)
 class BaseCostModel:
     """
     This base class contains the interface/abstraction that every CostModel must implement
@@ -64,14 +64,13 @@ class SampleBasedCostModel:
     """
     def __init__(
         self,
-        sentinel_plan: SentinelPlan,
-        execution_data: dict[str, dict[str, list[DataRecordSet]]],
+        sentinel_plan_stats: SentinelPlanStats,
         verbose: bool = False,
         exp_name: str | None = None,
     ):
-        # store sentinel plan
-        self.sentinel_plan = sentinel_plan
+        """
+        execution_data is: {logical_op_id: {physical_op_id: [DataRecordSet]}}
+        """
         # store verbose argument
         self.verbose = verbose
@@ -79,7 +78,7 @@ class SampleBasedCostModel:
         self.exp_name = exp_name
         # construct cost, time, quality, and selectivity matrices for each operator set;
-        self.operator_to_stats = self.compute_operator_stats(execution_data)
+        self.operator_to_stats = self.compute_operator_stats(sentinel_plan_stats)
         # compute set of costed physical op ids from operator_to_stats
         self.costed_phys_op_ids = set([
@@ -88,30 +87,23 @@ class SampleBasedCostModel:
             for phys_op_id, _ in phys_op_id_to_stats.items()
         ])
+        logger.info(f"Initialized SampleBasedCostModel with verbose={self.verbose}")
+        logger.debug(f"Initialized SampleBasedCostModel with params: {self.__dict__}")
     def get_costed_phys_op_ids(self):
         return self.costed_phys_op_ids
-    def compute_operator_stats(
-            self,
-            execution_data: dict[str, dict[str, list[DataRecordSet]]],
-        ):
+    def compute_operator_stats(self, sentinel_plan_stats: SentinelPlanStats) -> dict:
+        logger.debug("Computing operator statistics")
         # flatten the nested dictionary of execution data and pull out fields relevant to cost estimation
         execution_record_op_stats = []
-        for idx, (logical_op_id, _, _) in enumerate(self.sentinel_plan):
-            # initialize variables
-            upstream_logical_op_id = self.sentinel_plan.logical_op_ids[idx - 1] if idx > 0 else None
-            # filter for the execution data from this operator set
-            op_set_execution_data = execution_data[logical_op_id]
+        for logical_op_id, phys_op_id_to_op_stats in sentinel_plan_stats.operator_stats.items():
+            logger.debug(f"Computing operator statistics for logical_op_id: {logical_op_id}")
             # flatten the execution data into a list of RecordOpStats
             op_set_execution_data = [
                 record_op_stats
-                for _, record_sets in op_set_execution_data.items()
-                for record_set in record_sets
-                for record_op_stats in record_set.record_op_stats
+                for _, op_stats in phys_op_id_to_op_stats.items()
+                for record_op_stats in op_stats.record_op_stats_lst
             ]
             # add entries from execution data into matrices
@@ -119,7 +111,6 @@ class SampleBasedCostModel:
                 record_op_stats_dict = {
                     "logical_op_id": logical_op_id,
                     "physical_op_id": record_op_stats.op_id,
-                    "upstream_logical_op_id": upstream_logical_op_id,
                     "record_id": record_op_stats.record_id,
                     "record_parent_id": record_op_stats.record_parent_id,
                     "cost_per_record": record_op_stats.cost_per_record,
@@ -138,21 +129,19 @@ class SampleBasedCostModel:
         # for each physical_op_id, compute its average cost_per_record, time_per_record, selectivity, and quality
         operator_to_stats = {}
         for logical_op_id, logical_op_df in operator_stats_df.groupby("logical_op_id"):
+            logger.debug(f"Computing operator statistics for logical_op_id: {logical_op_id}")
             operator_to_stats[logical_op_id] = {}
-            # get the logical_op_id of the upstream operator
-            upstream_logical_op_ids = logical_op_df.upstream_logical_op_id.unique()
-            assert len(upstream_logical_op_ids) == 1, "More than one upstream logical_op_id"
-            upstream_logical_op_id = upstream_logical_op_ids[0]
             for physical_op_id, physical_op_df in logical_op_df.groupby("physical_op_id"):
-                # find set of parent records for this operator
-                num_upstream_records = len(physical_op_df.record_parent_id.unique())
+                # compute the number of input records processed by this operator; use source_idx for scan operator(s)
+                num_source_records = (
+                    len(physical_op_df.record_parent_id.unique())
+                    if not physical_op_df.record_parent_id.isna().all()
+                    else len(physical_op_df.source_idx.unique())
+                )
                 # compute selectivity
-                selectivity = (
-                    1.0 if upstream_logical_op_id is None else physical_op_df.passed_operator.sum() / num_upstream_records
-                )
+                selectivity = physical_op_df.passed_operator.sum() / num_source_records
                 operator_to_stats[logical_op_id][physical_op_id] = {
                     "cost": physical_op_df.cost_per_record.mean(),
@@ -165,6 +154,7 @@ class SampleBasedCostModel:
         if self.exp_name is not None:
             operator_stats_df.to_csv(f"opt-profiling-data/{self.exp_name}-operator-stats.csv", index=False)
+        logger.debug(f"Done computing operator statistics for {len(operator_to_stats)} operators!")
         return operator_to_stats
@@ -176,7 +166,10 @@ class SampleBasedCostModel:
         # look up physical and logical op ids associated with this physical operator
         phys_op_id = operator.get_op_id()
         logical_op_id = operator.logical_op_id
-        assert self.operator_to_stats.get(logical_op_id).get(phys_op_id) is not None, f"No execution data for {str(operator)}"
+        physical_op_to_stats = self.operator_to_stats.get(logical_op_id)
+        assert physical_op_to_stats is not None, f"No execution data for logical operator: {str(operator)}"
+        assert physical_op_to_stats.get(phys_op_id) is not None, f"No execution data for physical operator: {str(operator)}"
+        logger.debug(f"Calling __call__ for {str(operator)}")
         # look up stats for this operation
         est_cost_per_record = self.operator_to_stats[logical_op_id][phys_op_id]["cost"]
@@ -210,7 +203,10 @@ class SampleBasedCostModel:
         op_quality = op_estimates.quality
         # construct and return op estimates
-        return PlanCost(cost=op_cost, time=op_time, quality=op_quality, op_estimates=op_estimates)
+        plan_cost = PlanCost(cost=op_cost, time=op_time, quality=op_quality, op_estimates=op_estimates)
+        logger.debug(f"Done calling __call__ for {str(operator)}")
+        logger.debug(f"Plan cost: {plan_cost}")
+        return plan_cost
 class CostModel(BaseCostModel):
@@ -223,7 +219,6 @@ class CostModel(BaseCostModel):
         self,
         sample_execution_data: list[RecordOpStats] | None = None,
         available_models: list[Model] | None = None,
-        confidence_level: float = 0.90,
     ) -> None:
         if sample_execution_data is None:
             sample_execution_data = []
@@ -242,107 +237,54 @@ class CostModel(BaseCostModel):
         # set available models
         self.available_models = available_models
-        # set confidence level for CI estimates
-        self.conf_level = confidence_level
         # compute per-operator estimates
         self.operator_estimates = self._compute_operator_estimates()
         # compute set of costed physical op ids from operator_to_stats
         self.costed_phys_op_ids = None if self.operator_estimates is None else set(self.operator_estimates.keys())
+        logger.info("Initialized CostModel.")
+        logger.debug(f"Initialized CostModel with params: {self.__dict__}")
     def get_costed_phys_op_ids(self):
         return self.costed_phys_op_ids
-    def _compute_ci(self, sample_mean: float, n_samples: int, std_dev: float) -> tuple[float, float]:
+    def _compute_mean(self, df: pd.DataFrame, col: str, model_name: str | None = None) -> float:
         """
-        Compute confidence interval (for non-proportion quantities) given the sample mean, number of samples,
-        and sample std. deviation at the CostModel's given confidence level. We use a t-distribution for
-        computing the interval as many sample estimates in PZ may have few samples.
-        """
-        ci = stats.t.interval(
-            confidence=self.conf_level,  # Confidence level
-            df=n_samples - 1,            # Degrees of freedom
-            loc=sample_mean,             # Sample mean
-            scale=std_dev,               # Standard deviation estimate
-        )
-        return ci
-    def _compute_proportion_ci(self, sample_prop: float, n_samples: int) -> tuple[float, float]:
-        """
-        Compute confidence interval for proportion quantities (i.e. selectivity) given the sample proportion
-        and the number of samples. We use the normal distribution for computing the interval here, for reasons
-        summarized by this post: https://stats.stackexchange.com/a/411727.
-        """
-        if sample_prop == 0.0 or sample_prop == 1.0:
-            return (sample_prop, sample_prop)
-        scaling_factor = math.sqrt((sample_prop * (1 - sample_prop)) / n_samples)
-        lower_bound, upper_bound = stats.norm.interval(
-            confidence=self.conf_level,  # Confidence level
-            loc=sample_prop,             # Sample proportion
-            scale=scaling_factor,        # Scaling factor
-        )
-        lower_bound = max(lower_bound, 0.0)
-        upper_bound = max(upper_bound, 1.0)
-        return (lower_bound, upper_bound)
-    def _compute_mean_and_ci(self, df: pd.DataFrame, col: str, model_name: str | None = None, non_negative_lb: bool = False) -> tuple[float, float, float]:
-        """
-        Compute the mean and CI for the given column and dataframe. If the model_name is provided, filter
+        Compute the mean for the given column and dataframe. If the model_name is provided, filter
         for the subset of rows belonging to the model.
         """
         # use model-specific estimate if possible
         if model_name is not None:
             model_df = df[df.model_name == model_name]
             if not model_df.empty:
-                col_mean = model_df[col].mean()
-                col_lb, col_ub = self._compute_ci(
-                    sample_mean=col_mean,
-                    n_samples=model_df[col].notna().sum(),
-                    std_dev=model_df[col].std(),
-                )
-                if non_negative_lb:
-                    col_lb = max(col_lb, 0.0)
+                return model_df[col].mean()
-                return col_mean, col_lb, col_ub
-        # compute aggregate
-        col_mean = df[col].mean()
-        col_lb, col_ub = self._compute_ci(
-            sample_mean=col_mean,
-            n_samples=df[col].notna().sum(),
-            std_dev=df[col].std(),
-        )
-        if non_negative_lb:
-            col_lb = max(col_lb, 0.0)
-        return col_mean, col_lb, col_ub
+        # compute aggregate mean across all models
+        return df[col].mean()
     def _est_time_per_record(self, op_df: pd.DataFrame, model_name: str | None = None) -> tuple[float, float, float]:
         """
         Given sample cost data observations for a specific operation, compute the mean and CI
         for the time per record.
         """
-        return self._compute_mean_and_ci(df=op_df, col="time_per_record", model_name=model_name, non_negative_lb=True)
+        return self._compute_mean(df=op_df, col="time_per_record", model_name=model_name)
     def _est_cost_per_record(self, op_df: pd.DataFrame, model_name: str | None = None) -> tuple[float, float, float]:
         """
         Given sample cost data observations for a specific operation, compute the mean and CI
         for the cost per record.
         """
-        return self._compute_mean_and_ci(df=op_df, col="cost_per_record", model_name=model_name, non_negative_lb=True)
+        return self._compute_mean(df=op_df, col="cost_per_record", model_name=model_name)
-    def _est_tokens_per_record(self, op_df: pd.DataFrame, model_name: str | None = None) -> tuple[tuple[float, float, float], tuple[float, float, float]]:
+    def _est_tokens_per_record(self, op_df: pd.DataFrame, model_name: str | None = None) -> tuple[float, float]:
         """
         Given sample cost data observations for a specific operation, compute the mean and CI
         for the total input tokens and total output tokens.
         """
-        total_input_tokens_tuple = self._compute_mean_and_ci(df=op_df, col="total_input_tokens", model_name=model_name, non_negative_lb=True)
-        total_output_tokens_tuple = self._compute_mean_and_ci(df=op_df, col="total_output_tokens", model_name=model_name, non_negative_lb=True)
+        total_input_tokens = self._compute_mean(df=op_df, col="total_input_tokens", model_name=model_name)
+        total_output_tokens = self._compute_mean(df=op_df, col="total_output_tokens", model_name=model_name)
-        return total_input_tokens_tuple, total_output_tokens_tuple
+        return total_input_tokens, total_output_tokens
     def _est_cardinality(self, op_df: pd.DataFrame, model_name: str | None = None) -> float:
         """
@@ -382,18 +324,8 @@ class CostModel(BaseCostModel):
                     plan_ids = model_op_df.plan_id.unique().tolist()
                     num_output_records = df[df.source_op_id.isin(op_ids) & df.plan_id.isin(plan_ids)].shape[0]
-                # estimate the selectivity / fan-out and compute bounds
-                est_selectivity = num_output_records / num_input_records
-                if is_filter_op:
-                    est_selectivity_lb, est_selectivity_ub = self._compute_proportion_ci(est_selectivity, n_samples=num_input_records)
-                # for now, if we are doing a convert operation w/fan-out then the assumptions of _compute_proportion_ci
-                # do not hold; until we have a better method for estimating bounds, just set them to the estimate
-                else:
-                    est_selectivity_lb = est_selectivity
-                    est_selectivity_ub = est_selectivity
-                return est_selectivity, est_selectivity_lb, est_selectivity_ub
+                # estimate the selectivity / fan-out
+                return num_output_records / num_input_records
         # otherwise average selectivity across all ops
         num_input_records = op_df.shape[0]
@@ -406,18 +338,8 @@ class CostModel(BaseCostModel):
             op_ids = op_df.op_id.unique().tolist()
             num_output_records = df[df.source_op_id.isin(op_ids)].shape[0]
-        # estimate the selectivity / fan-out and compute bounds
-        est_selectivity = num_output_records / num_input_records
-        if is_filter_op:
-            est_selectivity_lb, est_selectivity_ub = self._compute_proportion_ci(est_selectivity, n_samples=num_input_records)
-        # for now, if we are doing a convert operation w/fan-out then the assumptions of _compute_proportion_ci
-        # do not hold; until we have a better method for estimating bounds, just set them to the estimate
-        else:
-            est_selectivity_lb = est_selectivity
-            est_selectivity_ub = est_selectivity
-        return est_selectivity, est_selectivity_lb, est_selectivity_ub
+        # estimate the selectivity / fan-out
+        return num_output_records / num_input_records
     def _compute_quality(self, row):
         # compute accuracy for filter
@@ -491,10 +413,7 @@ class CostModel(BaseCostModel):
         total_answers = model_df.num_answers.sum() if not model_df.empty else op_df.num_answers.sum()
         est_quality = num_correct / total_answers
-        # compute CI on the proportion of correct answers
-        est_quality_lb, est_quality_ub = self._compute_proportion_ci(est_quality, n_samples=total_answers)
-        return est_quality, est_quality_lb, est_quality_ub
+        return est_quality
     def _compute_operator_estimates(self) -> dict[str, Any] | None:
         """
@@ -532,64 +451,36 @@ class CostModel(BaseCostModel):
                 # model_names = op_df.model_name.unique().tolist()
                 estimates = {model_name: None for model_name in model_names}
                 for model_name in model_names:
-                    time_per_record, time_per_record_lb, time_per_record_ub = self._est_time_per_record(op_df, model_name=model_name)
-                    cost_per_record, cost_per_record_lb, cost_per_record_ub = self._est_cost_per_record(op_df, model_name=model_name)
-                    input_tokens_tup, output_tokens_tup = self._est_tokens_per_record(op_df, model_name=model_name)
-                    selectivity, selectivity_lb, selectivity_ub = self._est_selectivity(self.sample_execution_data_df, op_df, model_name=model_name)
-                    quality, quality_lb, quality_ub = self._est_quality(op_df, model_name=model_name)
+                    time_per_record = self._est_time_per_record(op_df, model_name=model_name)
+                    cost_per_record = self._est_cost_per_record(op_df, model_name=model_name)
+                    input_tokens, output_tokens = self._est_tokens_per_record(op_df, model_name=model_name)
+                    selectivity = self._est_selectivity(self.sample_execution_data_df, op_df, model_name=model_name)
+                    quality = self._est_quality(op_df, model_name=model_name)
                     model_estimates = {
                         "time_per_record": time_per_record,
-                        "time_per_record_lower_bound": time_per_record_lb,
-                        "time_per_record_upper_bound": time_per_record_ub,
                         "cost_per_record": cost_per_record,
-                        "cost_per_record_lower_bound": cost_per_record_lb,
-                        "cost_per_record_upper_bound": cost_per_record_ub,
-                        "total_input_tokens": input_tokens_tup[0],
-                        "total_input_tokens_lower_bound": input_tokens_tup[1],
-                        "total_input_tokens_upper_bound": input_tokens_tup[2],
-                        "total_output_tokens": output_tokens_tup[0],
-                        "total_output_tokens_lower_bound": output_tokens_tup[1],
-                        "total_output_tokens_upper_bound": output_tokens_tup[2],
+                        "total_input_tokens": input_tokens,
+                        "total_output_tokens": output_tokens,
                         "selectivity": selectivity,
-                        "selectivity_lower_bound": selectivity_lb,
-                        "selectivity_upper_bound": selectivity_ub,
                         "quality": quality,
-                        "quality_lower_bound": quality_lb,
-                        "quality_upper_bound": quality_ub,
                     }
                     estimates[model_name] = model_estimates
             # TODO pre-compute lists of op_names in groups
             elif op_name in ["NonLLMFilter"]:
-                time_per_record, time_per_record_lb, time_per_record_ub = self._est_time_per_record(op_df)
-                selectivity, selectivity_lb, selectivity_ub = self._est_selectivity(self.sample_execution_data_df, op_df)
-                estimates = {
-                    "time_per_record": time_per_record,
-                    "time_per_record_lower_bound": time_per_record_lb,
-                    "time_per_record_upper_bound": time_per_record_ub,
-                    "selectivity": selectivity,
-                    "selectivity_lower_bound": selectivity_lb,
-                    "selectivity_upper_bound": selectivity_ub,
-                }
+                time_per_record = self._est_time_per_record(op_df)
+                selectivity = self._est_selectivity(self.sample_execution_data_df, op_df)
+                estimates = {"time_per_record": time_per_record, "selectivity": selectivity}
             elif op_name in ["MarshalAndScanDataOp", "CacheScanDataOp", "LimitScanOp", "CountAggregateOp", "AverageAggregateOp"]:
-                time_per_record, time_per_record_lb, time_per_record_ub = self._est_time_per_record(op_df)
-                estimates = {
-                    "time_per_record": time_per_record,
-                    "time_per_record_lower_bound": time_per_record_lb,
-                    "time_per_record_upper_bound": time_per_record_ub,
-                }
+                time_per_record = self._est_time_per_record(op_df)
+                estimates = {"time_per_record": time_per_record}
             elif op_name in ["ApplyGroupByOp"]:
-                time_per_record, time_per_record_lb, time_per_record_ub = self._est_time_per_record(op_df)
+                time_per_record = self._est_time_per_record(op_df)
                 cardinality = self._est_cardinality(op_df)
-                estimates = {
-                    "time_per_record": time_per_record,
-                    "time_per_record_lower_bound": time_per_record_lb,
-                    "time_per_record_upper_bound": time_per_record_ub,
-                    "cardinality": cardinality,
-                }
+                estimates = {"time_per_record": time_per_record, "cardinality": cardinality}
             operator_estimates[op_id] = estimates
@@ -598,6 +489,7 @@ class CostModel(BaseCostModel):
     def __call__(self, operator: PhysicalOperator, source_op_estimates: OperatorCostEstimates | None = None) -> PlanCost:
         # get identifier for operation which is unique within sentinel plan but consistent across sentinels
         op_id = operator.get_op_id()
+        logger.debug(f"Calling __call__ for {str(operator)} with op_id: {op_id}")
         # initialize estimates of operator metrics based on naive (but sometimes precise) logic
         if isinstance(operator, MarshalAndScanDataOp):
@@ -633,8 +525,6 @@ class CostModel(BaseCostModel):
         if sample_op_estimates is not None and op_id in sample_op_estimates:
             if isinstance(operator, (MarshalAndScanDataOp, CacheScanDataOp)):
                 op_estimates.time_per_record = sample_op_estimates[op_id]["time_per_record"]
-                op_estimates.time_per_record_lower_bound = sample_op_estimates[op_id]["time_per_record_lower_bound"]
-                op_estimates.time_per_record_upper_bound = sample_op_estimates[op_id]["time_per_record_upper_bound"]
             elif isinstance(operator, ApplyGroupByOp):
                 # NOTE: in theory we should also treat this cardinality est. as a random variable, but in practice we will
@@ -646,48 +536,24 @@ class CostModel(BaseCostModel):
                 #       actual cardinalities of operators we estimate their selectivities / fan-outs and multiply those by
                 #       the input cardinality (where the initial input cardinality from the datareader is known).
                 op_estimates.cardinality = sample_op_estimates[op_id]["cardinality"]
-                op_estimates.cardinality_lower_bound = op_estimates.cardinality
-                op_estimates.cardinality_upper_bound = op_estimates.cardinality
                 op_estimates.time_per_record = sample_op_estimates[op_id]["time_per_record"]
-                op_estimates.time_per_record_lower_bound = sample_op_estimates[op_id]["time_per_record_lower_bound"]
-                op_estimates.time_per_record_upper_bound = sample_op_estimates[op_id]["time_per_record_upper_bound"]
             elif isinstance(operator, (CountAggregateOp, AverageAggregateOp)):  # noqa: SIM114
                 op_estimates.time_per_record = sample_op_estimates[op_id]["time_per_record"]
-                op_estimates.time_per_record_lower_bound = sample_op_estimates[op_id]["time_per_record_lower_bound"]
-                op_estimates.time_per_record_upper_bound = sample_op_estimates[op_id]["time_per_record_upper_bound"]
             elif isinstance(operator, LimitScanOp):
                 op_estimates.time_per_record = sample_op_estimates[op_id]["time_per_record"]
-                op_estimates.time_per_record_lower_bound = sample_op_estimates[op_id]["time_per_record_lower_bound"]
-                op_estimates.time_per_record_upper_bound = sample_op_estimates[op_id]["time_per_record_upper_bound"]
             elif isinstance(operator, NonLLMFilter):
                 op_estimates.cardinality = source_op_estimates.cardinality * sample_op_estimates[op_id]["selectivity"]
-                op_estimates.cardinality_lower_bound = source_op_estimates.cardinality_lower_bound * sample_op_estimates[op_id]["selectivity_lower_bound"]
-                op_estimates.cardinality_upper_bound = source_op_estimates.cardinality_upper_bound * sample_op_estimates[op_id]["selectivity_upper_bound"]
                 op_estimates.time_per_record = sample_op_estimates[op_id]["time_per_record"]
-                op_estimates.time_per_record_lower_bound = sample_op_estimates[op_id]["time_per_record_lower_bound"]
-                op_estimates.time_per_record_upper_bound = sample_op_estimates[op_id]["time_per_record_upper_bound"]
             elif isinstance(operator, LLMFilter):
                 model_name = operator.model.value
                 op_estimates.cardinality = source_op_estimates.cardinality * sample_op_estimates[op_id][model_name]["selectivity"]
-                op_estimates.cardinality_lower_bound = source_op_estimates.cardinality_lower_bound * sample_op_estimates[op_id][model_name]["selectivity_lower_bound"]
-                op_estimates.cardinality_upper_bound = source_op_estimates.cardinality_upper_bound * sample_op_estimates[op_id][model_name]["selectivity_upper_bound"]
                 op_estimates.time_per_record = sample_op_estimates[op_id][model_name]["time_per_record"]
-                op_estimates.time_per_record_lower_bound = sample_op_estimates[op_id][model_name]["time_per_record_lower_bound"]
-                op_estimates.time_per_record_upper_bound = sample_op_estimates[op_id][model_name]["time_per_record_upper_bound"]
                 op_estimates.cost_per_record = sample_op_estimates[op_id][model_name]["cost_per_record"]
-                op_estimates.cost_per_record_lower_bound = sample_op_estimates[op_id][model_name]["cost_per_record_lower_bound"]
-                op_estimates.cost_per_record_upper_bound = sample_op_estimates[op_id][model_name]["cost_per_record_upper_bound"]
                 op_estimates.quality = sample_op_estimates[op_id][model_name]["quality"]
-                op_estimates.quality_lower_bound = sample_op_estimates[op_id][model_name]["quality_lower_bound"]
-                op_estimates.quality_upper_bound = sample_op_estimates[op_id][model_name]["quality_upper_bound"]
             elif isinstance(operator, LLMConvert):
                 # TODO: EVEN BETTER: do similarity match (e.g. largest param intersection, more exotic techniques);
@@ -696,60 +562,28 @@ class CostModel(BaseCostModel):
                 # NOTE: code synthesis does not have a model attribute
                 model_name = operator.model.value if hasattr(operator, "model") else None
                 op_estimates.cardinality = source_op_estimates.cardinality * sample_op_estimates[op_id][model_name]["selectivity"]
-                op_estimates.cardinality_lower_bound = source_op_estimates.cardinality_lower_bound * sample_op_estimates[op_id][model_name]["selectivity_lower_bound"]
-                op_estimates.cardinality_upper_bound = source_op_estimates.cardinality_upper_bound * sample_op_estimates[op_id][model_name]["selectivity_upper_bound"]
                 op_estimates.time_per_record = sample_op_estimates[op_id][model_name]["time_per_record"]
-                op_estimates.time_per_record_lower_bound = sample_op_estimates[op_id][model_name]["time_per_record_lower_bound"]
-                op_estimates.time_per_record_upper_bound = sample_op_estimates[op_id][model_name]["time_per_record_upper_bound"]
                 op_estimates.cost_per_record = sample_op_estimates[op_id][model_name]["cost_per_record"]
-                op_estimates.cost_per_record_lower_bound = sample_op_estimates[op_id][model_name]["cost_per_record_lower_bound"]
-                op_estimates.cost_per_record_upper_bound = sample_op_estimates[op_id][model_name]["cost_per_record_upper_bound"]
                 op_estimates.quality = sample_op_estimates[op_id][model_name]["quality"]
-                op_estimates.quality_lower_bound = sample_op_estimates[op_id][model_name]["quality_lower_bound"]
-                op_estimates.quality_upper_bound = sample_op_estimates[op_id][model_name]["quality_upper_bound"]
                 # NOTE: if code synth. fails, this will turn into ConventionalQuery calls to GPT-3.5,
                 #       which would wildly mess up estimate of time and cost per-record
                 # do code synthesis adjustment
                 if isinstance(operator, CodeSynthesisConvert):
                     op_estimates.time_per_record = 1e-5
-                    op_estimates.time_per_record_lower_bound = op_estimates.time_per_record
-                    op_estimates.time_per_record_upper_bound = op_estimates.time_per_record
                     op_estimates.cost_per_record = 1e-4
-                    op_estimates.cost_per_record_lower_bound = op_estimates.cost_per_record
-                    op_estimates.cost_per_record_upper_bound = op_estimates.cost_per_record
                     op_estimates.quality = op_estimates.quality * (GPT_4o_MODEL_CARD["code"] / 100.0)
-                    op_estimates.quality_lower_bound = op_estimates.quality_lower_bound * (GPT_4o_MODEL_CARD["code"] / 100.0)
-                    op_estimates.quality_upper_bound = op_estimates.quality_upper_bound * (GPT_4o_MODEL_CARD["code"] / 100.0)
                 # token reduction adjustment
-                if isinstance(operator, TokenReducedConvert):
+                if isinstance(operator, TokenReducedConvertBonded):
                     total_input_tokens = operator.token_budget * sample_op_estimates[op_id][model_name]["total_input_tokens"]
                     total_output_tokens = sample_op_estimates[op_id][model_name]["total_output_tokens"]
                     op_estimates.cost_per_record = (
                         MODEL_CARDS[model_name]["usd_per_input_token"] * total_input_tokens
                         + MODEL_CARDS[model_name]["usd_per_output_token"] * total_output_tokens
                     )
-                    total_input_tokens_lb = operator.token_budget * sample_op_estimates[op_id][model_name]["total_input_tokens_lower_bound"]
-                    total_output_tokens_lb = sample_op_estimates[op_id][model_name]["total_output_tokens_lower_bound"]
-                    op_estimates.cost_per_record_lower_bound = (
-                        MODEL_CARDS[model_name]["usd_per_input_token"] * total_input_tokens_lb
-                        + MODEL_CARDS[model_name]["usd_per_output_token"] * total_output_tokens_lb
-                    )
-                    total_input_tokens_ub = operator.token_budget * sample_op_estimates[op_id][model_name]["total_input_tokens_upper_bound"]
-                    total_output_tokens_ub = sample_op_estimates[op_id][model_name]["total_output_tokens_upper_bound"]
-                    op_estimates.cost_per_record_upper_bound = (
-                        MODEL_CARDS[model_name]["usd_per_input_token"] * total_input_tokens_ub
-                        + MODEL_CARDS[model_name]["usd_per_output_token"] * total_output_tokens_ub
-                    )
                     op_estimates.quality = op_estimates.quality * math.sqrt(math.sqrt(operator.token_budget))
-                    op_estimates.quality_lower_bound = op_estimates.quality_lower_bound * math.sqrt(math.sqrt(operator.token_budget))
-                    op_estimates.quality_upper_bound = op_estimates.quality_upper_bound * math.sqrt(math.sqrt(operator.token_budget))
                 # rag convert adjustment
                 if isinstance(operator, RAGConvert):
                     total_input_tokens = operator.num_chunks_per_field * operator.chunk_size
@@ -758,22 +592,7 @@ class CostModel(BaseCostModel):
                         MODEL_CARDS[model_name]["usd_per_input_token"] * total_input_tokens
                         + MODEL_CARDS[model_name]["usd_per_output_token"] * total_output_tokens
                     )
-                    total_input_tokens_lb = operator.num_chunks_per_field * operator.chunk_size
-                    total_output_tokens_lb = sample_op_estimates[op_id][model_name]["total_output_tokens_lower_bound"]
-                    op_estimates.cost_per_record_lower_bound = (
-                        MODEL_CARDS[model_name]["usd_per_input_token"] * total_input_tokens_lb
-                        + MODEL_CARDS[model_name]["usd_per_output_token"] * total_output_tokens_lb
-                    )
-                    total_input_tokens_ub = operator.num_chunks_per_field * operator.chunk_size
-                    total_output_tokens_ub = sample_op_estimates[op_id][model_name]["total_output_tokens_upper_bound"]
-                    op_estimates.cost_per_record_upper_bound = (
-                        MODEL_CARDS[model_name]["usd_per_input_token"] * total_input_tokens_ub
-                        + MODEL_CARDS[model_name]["usd_per_output_token"] * total_output_tokens_ub
-                    )
                     op_estimates.quality = op_estimates.quality * operator.naive_quality_adjustment
-                    op_estimates.quality_lower_bound = op_estimates.quality_lower_bound * operator.naive_quality_adjustment
-                    op_estimates.quality_upper_bound = op_estimates.quality_upper_bound * operator.naive_quality_adjustment
             else:
                 raise Exception("Unknown operator")
@@ -783,26 +602,14 @@ class CostModel(BaseCostModel):
         op_cost = op_estimates.cost_per_record * source_op_estimates.cardinality
         op_quality = op_estimates.quality
-        # compute bounds on total time and cost estimates for this operator
-        op_cost_lower_bound = op_estimates.cost_per_record_lower_bound * source_op_estimates.cardinality_lower_bound
-        op_cost_upper_bound = op_estimates.cost_per_record_upper_bound * source_op_estimates.cardinality_upper_bound
-        op_time_lower_bound = op_estimates.time_per_record_lower_bound * source_op_estimates.cardinality_lower_bound
-        op_time_upper_bound = op_estimates.time_per_record_upper_bound * source_op_estimates.cardinality_upper_bound
-        op_quality_lower_bound = op_estimates.quality_lower_bound
-        op_quality_upper_bound = op_estimates.quality_upper_bound
         # create and return PlanCost object for this op's statistics
         op_plan_cost = PlanCost(
             cost=op_cost,
             time=op_time,
             quality=op_quality,
             op_estimates=op_estimates,
-            cost_lower_bound=op_cost_lower_bound,
-            cost_upper_bound=op_cost_upper_bound,
-            time_lower_bound=op_time_lower_bound,
-            time_upper_bound=op_time_upper_bound,
-            quality_lower_bound=op_quality_lower_bound,
-            quality_upper_bound=op_quality_upper_bound,
         )
+        logger.debug(f"Done calling __call__ for {str(operator)} with op_id: {op_id}")
+        logger.debug(f"Plan cost: {op_plan_cost}")
         return op_plan_cost

palimpzest 0.6.4__py3-none-any.whl → 0.7.0__py3-none-any.whl

palimpzest 0.6.4-py3-none-any.whl → 0.7.0-py3-none-any.whl