PyPI - palimpzest - Versions diffs - 0.7.6__py3-none-any.whl → 0.7.8__py3-none-any.whl - Mend

palimpzest 0.7.6-py3-none-any.whl → 0.7.8-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (39)

palimpzest/constants.py +113 -75
palimpzest/core/data/dataclasses.py +55 -38
palimpzest/core/elements/index.py +5 -15
palimpzest/core/elements/records.py +1 -1
palimpzest/prompts/prompt_factory.py +1 -1
palimpzest/query/execution/all_sample_execution_strategy.py +216 -0
palimpzest/query/execution/execution_strategy.py +4 -4
palimpzest/query/execution/execution_strategy_type.py +7 -1
palimpzest/query/execution/mab_execution_strategy.py +184 -72
palimpzest/query/execution/parallel_execution_strategy.py +182 -15
palimpzest/query/execution/single_threaded_execution_strategy.py +21 -21
palimpzest/query/generators/api_client_factory.py +6 -7
palimpzest/query/generators/generators.py +5 -8
palimpzest/query/operators/aggregate.py +4 -3
palimpzest/query/operators/convert.py +1 -1
palimpzest/query/operators/filter.py +1 -1
palimpzest/query/operators/limit.py +1 -1
palimpzest/query/operators/map.py +1 -1
palimpzest/query/operators/physical.py +8 -4
palimpzest/query/operators/project.py +1 -1
palimpzest/query/operators/retrieve.py +7 -23
palimpzest/query/operators/scan.py +1 -1
palimpzest/query/optimizer/cost_model.py +54 -62
palimpzest/query/optimizer/optimizer.py +2 -6
palimpzest/query/optimizer/plan.py +4 -4
palimpzest/query/optimizer/primitives.py +1 -1
palimpzest/query/optimizer/rules.py +8 -26
palimpzest/query/optimizer/tasks.py +3 -3
palimpzest/query/processor/processing_strategy_type.py +2 -2
palimpzest/query/processor/sentinel_processor.py +0 -2
palimpzest/sets.py +2 -3
palimpzest/utils/generation_helpers.py +1 -1
palimpzest/utils/model_helpers.py +27 -9
palimpzest/utils/progress.py +81 -72
{palimpzest-0.7.6.dist-info → palimpzest-0.7.8.dist-info}/METADATA +4 -2
{palimpzest-0.7.6.dist-info → palimpzest-0.7.8.dist-info}/RECORD +39 -38
{palimpzest-0.7.6.dist-info → palimpzest-0.7.8.dist-info}/WHEEL +1 -1
{palimpzest-0.7.6.dist-info → palimpzest-0.7.8.dist-info}/licenses/LICENSE +0 -0
{palimpzest-0.7.6.dist-info → palimpzest-0.7.8.dist-info}/top_level.txt +0 -0

palimpzest/query/operators/scan.py CHANGED Viewed

@@ -87,7 +87,7 @@ class ScanPhysicalOp(PhysicalOperator, ABC):
             record_parent_id=dr.parent_id,
             record_source_idx=dr.source_idx,
             record_state=dr.to_dict(include_bytes=False),
-            op_id=self.get_op_id(),
+            full_op_id=self.get_full_op_id(),
             logical_op_id=self.logical_op_id,
             op_name=self.op_name(),
             time_per_record=(end_time - start_time),

palimpzest/query/optimizer/cost_model.py CHANGED Viewed

@@ -43,11 +43,11 @@ class BaseCostModel:
         """
         pass
-    def get_costed_phys_op_ids(self) -> set[str]:
+    def get_costed_full_op_ids(self) -> set[str]:
         """
-        Return the set of physical op ids which the cost model has cost estimates for.
+        Return the set of full op ids which the cost model has cost estimates for.
         """
-        raise NotImplementedError("Calling get_costed_phys_op_ids from abstract method")
+        raise NotImplementedError("Calling get_costed_full_op_ids from abstract method")
     def __call__(self, operator: PhysicalOperator) -> PlanCost:
         """
@@ -66,9 +66,6 @@ class SampleBasedCostModel:
         verbose: bool = False,
         exp_name: str | None = None,
     ):
-        """
-        execution_data is: {logical_op_id: {physical_op_id: [DataRecordSet]}}
-        """
         # store verbose argument
         self.verbose = verbose
@@ -77,30 +74,28 @@ class SampleBasedCostModel:
         # construct cost, time, quality, and selectivity matrices for each operator set;
         self.operator_to_stats = self.compute_operator_stats(sentinel_plan_stats)
-        # compute set of costed physical op ids from operator_to_stats
-        self.costed_phys_op_ids = set([
-            phys_op_id
-            for _, phys_op_id_to_stats in self.operator_to_stats.items()
-            for phys_op_id, _ in phys_op_id_to_stats.items()
+        self.costed_full_op_ids = set([
+            full_op_id
+            for _, full_op_id_to_stats in self.operator_to_stats.items()
+            for full_op_id in full_op_id_to_stats
         ])
         logger.info(f"Initialized SampleBasedCostModel with verbose={self.verbose}")
         logger.debug(f"Initialized SampleBasedCostModel with params: {self.__dict__}")
-    def get_costed_phys_op_ids(self):
-        return self.costed_phys_op_ids
+    def get_costed_full_op_ids(self):
+        return self.costed_full_op_ids
     def compute_operator_stats(self, sentinel_plan_stats: SentinelPlanStats) -> dict:
         logger.debug("Computing operator statistics")
         # flatten the nested dictionary of execution data and pull out fields relevant to cost estimation
         execution_record_op_stats = []
-        for logical_op_id, phys_op_id_to_op_stats in sentinel_plan_stats.operator_stats.items():
+        for logical_op_id, full_op_id_to_op_stats in sentinel_plan_stats.operator_stats.items():
             logger.debug(f"Computing operator statistics for logical_op_id: {logical_op_id}")
             # flatten the execution data into a list of RecordOpStats
             op_set_execution_data = [
                 record_op_stats
-                for _, op_stats in phys_op_id_to_op_stats.items()
+                for _, op_stats in full_op_id_to_op_stats.items()
                 for record_op_stats in op_stats.record_op_stats_lst
             ]
@@ -108,7 +103,7 @@ class SampleBasedCostModel:
             for record_op_stats in op_set_execution_data:
                 record_op_stats_dict = {
                     "logical_op_id": logical_op_id,
-                    "physical_op_id": record_op_stats.op_id,
+                    "full_op_id": record_op_stats.full_op_id,
                     "record_id": record_op_stats.record_id,
                     "record_parent_id": record_op_stats.record_parent_id,
                     "cost_per_record": record_op_stats.cost_per_record,
@@ -124,13 +119,13 @@ class SampleBasedCostModel:
         # convert flattened execution data into dataframe
         operator_stats_df = pd.DataFrame(execution_record_op_stats)
-        # for each physical_op_id, compute its average cost_per_record, time_per_record, selectivity, and quality
+        # for each full_op_id, compute its average cost_per_record, time_per_record, selectivity, and quality
         operator_to_stats = {}
         for logical_op_id, logical_op_df in operator_stats_df.groupby("logical_op_id"):
             logger.debug(f"Computing operator statistics for logical_op_id: {logical_op_id}")
             operator_to_stats[logical_op_id] = {}
-            for physical_op_id, physical_op_df in logical_op_df.groupby("physical_op_id"):
+            for full_op_id, physical_op_df in logical_op_df.groupby("full_op_id"):
                 # compute the number of input records processed by this operator; use source_idx for scan operator(s)
                 num_source_records = (
                     len(physical_op_df.record_parent_id.unique())
@@ -138,10 +133,10 @@ class SampleBasedCostModel:
                     else len(physical_op_df.source_idx.unique())
                 )
-                # compute selectivity
+                # compute selectivity
                 selectivity = physical_op_df.passed_operator.sum() / num_source_records
-                operator_to_stats[logical_op_id][physical_op_id] = {
+                operator_to_stats[logical_op_id][full_op_id] = {
                     "cost": physical_op_df.cost_per_record.mean(),
                     "time": physical_op_df.time_per_record.mean(),
                     "quality": physical_op_df.quality.mean(),
@@ -162,18 +157,18 @@ class SampleBasedCostModel:
         #       we will have execution data for each operator passed into __call__; nevertheless, we
         #       still perform a sanity check
         # look up physical and logical op ids associated with this physical operator
-        phys_op_id = operator.get_op_id()
+        full_op_id = operator.get_full_op_id()
         logical_op_id = operator.logical_op_id
         physical_op_to_stats = self.operator_to_stats.get(logical_op_id)
         assert physical_op_to_stats is not None, f"No execution data for logical operator: {str(operator)}"
-        assert physical_op_to_stats.get(phys_op_id) is not None, f"No execution data for physical operator: {str(operator)}"
+        assert physical_op_to_stats.get(full_op_id) is not None, f"No execution data for physical operator: {str(operator)}"
         logger.debug(f"Calling __call__ for {str(operator)}")
         # look up stats for this operation
-        est_cost_per_record = self.operator_to_stats[logical_op_id][phys_op_id]["cost"]
-        est_time_per_record = self.operator_to_stats[logical_op_id][phys_op_id]["time"]
-        est_quality = self.operator_to_stats[logical_op_id][phys_op_id]["quality"]
-        est_selectivity = self.operator_to_stats[logical_op_id][phys_op_id]["selectivity"]
+        est_cost_per_record = self.operator_to_stats[logical_op_id][full_op_id]["cost"]
+        est_time_per_record = self.operator_to_stats[logical_op_id][full_op_id]["time"]
+        est_quality = self.operator_to_stats[logical_op_id][full_op_id]["quality"]
+        est_selectivity = self.operator_to_stats[logical_op_id][full_op_id]["selectivity"]
         # create source_op_estimates for scan operators if they are not provided
         if isinstance(operator, ScanPhysicalOp):
@@ -238,13 +233,13 @@ class CostModel(BaseCostModel):
         # compute per-operator estimates
         self.operator_estimates = self._compute_operator_estimates()
-        # compute set of costed physical op ids from operator_to_stats
-        self.costed_phys_op_ids = None if self.operator_estimates is None else set(self.operator_estimates.keys())
+        # compute set of costed full op ids from operator_to_stats
+        self.costed_full_op_ids = None if self.operator_estimates is None else set(self.operator_estimates.keys())
         logger.info("Initialized CostModel.")
         logger.debug(f"Initialized CostModel with params: {self.__dict__}")
-    def get_costed_phys_op_ids(self):
-        return self.costed_phys_op_ids
+    def get_costed_full_op_ids(self):
+        return self.costed_full_op_ids
     def _compute_mean(self, df: pd.DataFrame, col: str, model_name: str | None = None) -> float:
         """
@@ -318,9 +313,9 @@ class CostModel(BaseCostModel):
                 if is_filter_op:
                     num_output_records = model_op_df.passed_operator.sum()
                 else:
-                    op_ids = model_op_df.op_id.unique().tolist()
+                    full_op_ids = model_op_df.full_op_id.unique().tolist()
                     plan_ids = model_op_df.plan_id.unique().tolist()
-                    num_output_records = df[df.source_op_id.isin(op_ids) & df.plan_id.isin(plan_ids)].shape[0]
+                    num_output_records = df[df.source_full_op_id.isin(full_op_ids) & df.plan_id.isin(plan_ids)].shape[0]
                 # estimate the selectivity / fan-out
                 return num_output_records / num_input_records
@@ -333,8 +328,8 @@ class CostModel(BaseCostModel):
         if is_filter_op:
             num_output_records = op_df.passed_operator.sum()
         else:
-            op_ids = op_df.op_id.unique().tolist()
-            num_output_records = df[df.source_op_id.isin(op_ids)].shape[0]
+            full_op_ids = op_df.full_op_id.unique().tolist()
+            num_output_records = df[df.source_full_op_id.isin(full_op_ids)].shape[0]
         # estimate the selectivity / fan-out
         return num_output_records / num_input_records
@@ -422,14 +417,14 @@ class CostModel(BaseCostModel):
             return None
         # get the set of operator ids for which we have sample data
-        op_ids = self.sample_execution_data_df.op_id.unique()
+        full_op_ids = self.sample_execution_data_df.full_op_id.unique()
         # compute estimates of runtime, cost, and quality (and intermediates like cardinality) for every operator
         operator_estimates = {}
-        for op_id in op_ids:
+        for full_op_id in full_op_ids:
             # filter for subset of sample execution data related to this operation
             op_df = self.sample_execution_data_df[
-                self.sample_execution_data_df.op_id == op_id
+                self.sample_execution_data_df.full_op_id == full_op_id
             ]
             # skip computing an estimate if we didn't capture any sampling data for this operator
@@ -480,14 +475,14 @@ class CostModel(BaseCostModel):
                 cardinality = self._est_cardinality(op_df)
                 estimates = {"time_per_record": time_per_record, "cardinality": cardinality}
-            operator_estimates[op_id] = estimates
+            operator_estimates[full_op_id] = estimates
         return operator_estimates
     def __call__(self, operator: PhysicalOperator, source_op_estimates: OperatorCostEstimates | None = None) -> PlanCost:
         # get identifier for operation which is unique within sentinel plan but consistent across sentinels
-        op_id = operator.get_op_id()
-        logger.debug(f"Calling __call__ for {str(operator)} with op_id: {op_id}")
+        full_op_id = operator.get_full_op_id()
+        logger.debug(f"Calling __call__ for {str(operator)} with full_op_id: {full_op_id}")
         # initialize estimates of operator metrics based on naive (but sometimes precise) logic
         if isinstance(operator, MarshalAndScanDataOp):
@@ -520,9 +515,9 @@ class CostModel(BaseCostModel):
         # if we have sample execution data, update naive estimates with more informed ones
         sample_op_estimates = self.operator_estimates
-        if sample_op_estimates is not None and op_id in sample_op_estimates:
+        if sample_op_estimates is not None and full_op_id in sample_op_estimates:
             if isinstance(operator, (MarshalAndScanDataOp, CacheScanDataOp)):
-                op_estimates.time_per_record = sample_op_estimates[op_id]["time_per_record"]
+                op_estimates.time_per_record = sample_op_estimates[full_op_id]["time_per_record"]
             elif isinstance(operator, ApplyGroupByOp):
                 # NOTE: in theory we should also treat this cardinality est. as a random variable, but in practice we will
@@ -533,36 +528,33 @@ class CostModel(BaseCostModel):
                 #       produced by the groupby in our sample and assume it may generalize to the full workload. To estimate
                 #       actual cardinalities of operators we estimate their selectivities / fan-outs and multiply those by
                 #       the input cardinality (where the initial input cardinality from the datareader is known).
-                op_estimates.cardinality = sample_op_estimates[op_id]["cardinality"]
-                op_estimates.time_per_record = sample_op_estimates[op_id]["time_per_record"]
+                op_estimates.cardinality = sample_op_estimates[full_op_id]["cardinality"]
+                op_estimates.time_per_record = sample_op_estimates[full_op_id]["time_per_record"]
             elif isinstance(operator, (CountAggregateOp, AverageAggregateOp)):  # noqa: SIM114
-                op_estimates.time_per_record = sample_op_estimates[op_id]["time_per_record"]
+                op_estimates.time_per_record = sample_op_estimates[full_op_id]["time_per_record"]
             elif isinstance(operator, LimitScanOp):
-                op_estimates.time_per_record = sample_op_estimates[op_id]["time_per_record"]
+                op_estimates.time_per_record = sample_op_estimates[full_op_id]["time_per_record"]
             elif isinstance(operator, NonLLMFilter):
-                op_estimates.cardinality = source_op_estimates.cardinality * sample_op_estimates[op_id]["selectivity"]
-                op_estimates.time_per_record = sample_op_estimates[op_id]["time_per_record"]
+                op_estimates.cardinality = source_op_estimates.cardinality * sample_op_estimates[full_op_id]["selectivity"]
+                op_estimates.time_per_record = sample_op_estimates[full_op_id]["time_per_record"]
             elif isinstance(operator, LLMFilter):
                 model_name = operator.model.value
-                op_estimates.cardinality = source_op_estimates.cardinality * sample_op_estimates[op_id][model_name]["selectivity"]
-                op_estimates.time_per_record = sample_op_estimates[op_id][model_name]["time_per_record"]
-                op_estimates.cost_per_record = sample_op_estimates[op_id][model_name]["cost_per_record"]
-                op_estimates.quality = sample_op_estimates[op_id][model_name]["quality"]
+                op_estimates.cardinality = source_op_estimates.cardinality * sample_op_estimates[full_op_id][model_name]["selectivity"]
+                op_estimates.time_per_record = sample_op_estimates[full_op_id][model_name]["time_per_record"]
+                op_estimates.cost_per_record = sample_op_estimates[full_op_id][model_name]["cost_per_record"]
+                op_estimates.quality = sample_op_estimates[full_op_id][model_name]["quality"]
             elif isinstance(operator, LLMConvert):
-                # TODO: EVEN BETTER: do similarity match (e.g. largest param intersection, more exotic techniques);
-                #       another heuristic: logical_op_id-->subclass_physical_op_id-->specific_physical_op_id-->most_param_match_physical_op_id
-                # TODO: instead of [op_id][model_name] --> [logical_op_id][physical_op_id]
                 # NOTE: code synthesis does not have a model attribute
                 model_name = operator.model.value if hasattr(operator, "model") else None
-                op_estimates.cardinality = source_op_estimates.cardinality * sample_op_estimates[op_id][model_name]["selectivity"]
-                op_estimates.time_per_record = sample_op_estimates[op_id][model_name]["time_per_record"]
-                op_estimates.cost_per_record = sample_op_estimates[op_id][model_name]["cost_per_record"]
-                op_estimates.quality = sample_op_estimates[op_id][model_name]["quality"]
+                op_estimates.cardinality = source_op_estimates.cardinality * sample_op_estimates[full_op_id][model_name]["selectivity"]
+                op_estimates.time_per_record = sample_op_estimates[full_op_id][model_name]["time_per_record"]
+                op_estimates.cost_per_record = sample_op_estimates[full_op_id][model_name]["cost_per_record"]
+                op_estimates.quality = sample_op_estimates[full_op_id][model_name]["quality"]
                 # NOTE: if code synth. fails, this will turn into ConventionalQuery calls to GPT-3.5,
                 #       which would wildly mess up estimate of time and cost per-record
@@ -575,7 +567,7 @@ class CostModel(BaseCostModel):
                 # rag convert adjustment
                 if isinstance(operator, RAGConvert):
                     total_input_tokens = operator.num_chunks_per_field * operator.chunk_size
-                    total_output_tokens = sample_op_estimates[op_id][model_name]["total_output_tokens"]
+                    total_output_tokens = sample_op_estimates[full_op_id][model_name]["total_output_tokens"]
                     op_estimates.cost_per_record = (
                         MODEL_CARDS[model_name]["usd_per_input_token"] * total_input_tokens
                         + MODEL_CARDS[model_name]["usd_per_output_token"] * total_output_tokens
@@ -597,7 +589,7 @@ class CostModel(BaseCostModel):
             quality=op_quality,
             op_estimates=op_estimates,
         )
-        logger.debug(f"Done calling __call__ for {str(operator)} with op_id: {op_id}")
+        logger.debug(f"Done calling __call__ for {str(operator)} with full_op_id: {full_op_id}")
         logger.debug(f"Plan cost: {op_plan_cost}")
         return op_plan_cost

palimpzest/query/optimizer/optimizer.py CHANGED Viewed

@@ -103,9 +103,6 @@ class Optimizer:
         # store the cost model
         self.cost_model = cost_model
-        # store the set of physical operators for which our cost model has cost estimates
-        self.costed_phys_op_ids = cost_model.get_costed_phys_op_ids()
         # mapping from each group id to its Group object
         self.groups = {}
@@ -189,7 +186,6 @@ class Optimizer:
     def update_cost_model(self, cost_model: CostModel):
         self.cost_model = cost_model
-        self.costed_phys_op_ids = cost_model.get_costed_phys_op_ids()
     def get_physical_op_params(self):
         return {
@@ -338,7 +334,7 @@ class Optimizer:
         # compute all properties including this operations'
         all_properties = deepcopy(input_group_properties)
         if isinstance(op, FilteredScan):
-            # NOTE: we could use op.get_op_id() here, but storing filter strings makes
+            # NOTE: we could use op.get_full_op_id() here, but storing filter strings makes
             #       debugging a bit easier as you can read which filters are in the Group
             op_filter_str = op.filter.get_filter_str()
             if "filters" in all_properties:
@@ -464,7 +460,7 @@ class Optimizer:
             elif isinstance(task, OptimizeLogicalExpression):
                 new_tasks = task.perform(self.transformation_rules, self.implementation_rules)
             elif isinstance(task, ApplyRule):
-                context = {"costed_phys_op_ids": self.costed_phys_op_ids}
+                context = {"costed_full_op_ids": self.cost_model.get_costed_full_op_ids()}
                 new_tasks = task.perform(
                     self.groups, self.expressions, context=context, **self.get_physical_op_params()
                 )

palimpzest/query/optimizer/plan.py CHANGED Viewed

@@ -53,7 +53,7 @@ class PhysicalPlan(Plan):
         Two different PhysicalPlan instances with the identical lists of operators will have equivalent plan_ids.
         """
-        hash_str = str(tuple(op.get_op_id() for op in self.operators))
+        hash_str = str(tuple(op.get_full_op_id() for op in self.operators))
         return hash_for_id(hash_str)
     def __eq__(self, other):
@@ -103,9 +103,9 @@ class SentinelPlan(Plan):
             assert isinstance(operator_sets[0][0], ScanPhysicalOp), "first operator set must be a scan"
             assert all(len(op_set) > 0 for op_set in operator_sets), "every operator set must have at least one operator"
-        # store operator_sets and logical_op_ids; sort operator_sets internally by op_id
+        # store operator_sets and logical_op_ids; sort operator_sets internally by full_op_id
         self.operator_sets = operator_sets
-        self.operator_sets = [sorted(op_set, key=lambda op: op.get_op_id()) for op_set in self.operator_sets]
+        self.operator_sets = [sorted(op_set, key=lambda op: op.get_full_op_id()) for op_set in self.operator_sets]
         self.logical_op_ids = [op_set[0].logical_op_id for op_set in self.operator_sets]
         self.plan_id = self.compute_plan_id()
@@ -117,7 +117,7 @@ class SentinelPlan(Plan):
         """
         hash_str = ""
         for logical_op_id, op_set in zip(self.logical_op_ids, self.operator_sets):
-            hash_str += f"{logical_op_id} {tuple(op.get_op_id() for op in op_set)} "
+            hash_str += f"{logical_op_id} {tuple(op.get_full_op_id() for op in op_set)} "
         return hash_for_id(hash_str)
     def __eq__(self, other):

palimpzest/query/optimizer/primitives.py CHANGED Viewed

@@ -43,7 +43,7 @@ class Expression:
         return self.operator == other.operator and self.input_group_ids == other.input_group_ids
     def __str__(self):
-        op_id = self.operator.get_logical_op_id() if isinstance(self.operator, LogicalOperator) else self.operator.get_op_id()
+        op_id = self.operator.get_logical_op_id() if isinstance(self.operator, LogicalOperator) else self.operator.get_full_op_id()
         return str(tuple(sorted(self.input_group_ids)) + (op_id, str(self.__class__.__name__)))
     def __hash__(self):

palimpzest/query/optimizer/rules.py CHANGED Viewed

@@ -2,7 +2,7 @@ import logging
 from copy import deepcopy
 from itertools import combinations
-from palimpzest.constants import AggFunc, Cardinality, Model, PromptStrategy
+from palimpzest.constants import AggFunc, Cardinality, PromptStrategy
 from palimpzest.query.operators.aggregate import ApplyGroupByOp, AverageAggregateOp, CountAggregateOp
 from palimpzest.query.operators.code_synthesis_convert import CodeSynthesisConvertSingle
 from palimpzest.query.operators.convert import LLMConvertBonded, NonLLMConvert
@@ -285,9 +285,6 @@ class LLMConvertBondedRule(ImplementationRule):
             }
         )
-        # NOTE: when comparing pz.Model(s), equality is determined by the string (i.e. pz.Model.value)
-        #       thus, Model.GPT_4o and Model.GPT_4o_V map to the same value; this allows us to use set logic
-        #
         # identify models which can be used strictly for text or strictly for images
         vision_models = set(get_vision_models())
         text_models = set(get_models())
@@ -322,10 +319,10 @@ class LLMConvertBondedRule(ImplementationRule):
             # skip this model if:
             # 1. this is a pure vision model and we're not doing an image conversion, or
             # 2. this is a pure text model and we're doing an image conversion, or
-            # 3. this is a vision model hosted by Together (i.e. LLAMA3_V) and there is more than one image field
+            # 3. this is a vision model hosted by Together (i.e. LLAMA3 vision) and there is more than one image field
             first_criteria = model in pure_vision_models and not is_image_conversion
             second_criteria = model in pure_text_models and is_image_conversion
-            third_criteria = model == Model.LLAMA3_V and (num_image_fields > 1 or list_image_field)
+            third_criteria = model.is_llama_model() and model.is_vision_model() and (num_image_fields > 1 or list_image_field)
             if first_criteria or second_criteria or third_criteria:
                 continue
@@ -465,9 +462,6 @@ class RAGConvertRule(ImplementationRule):
             }
         )
-        # NOTE: when comparing pz.Model(s), equality is determined by the string (i.e. pz.Model.value)
-        #       thus, Model.GPT_4o and Model.GPT_4o_V map to the same value; this allows us to use set logic
-        #
         # identify models which can be used strictly for text or strictly for images
         vision_models = set(get_vision_models())
         text_models = set(get_models())
@@ -536,9 +530,6 @@ class MixtureOfAgentsConvertRule(ImplementationRule):
             }
         )
-        # NOTE: when comparing pz.Model(s), equality is determined by the string (i.e. pz.Model.value)
-        #       thus, Model.GPT_4o and Model.GPT_4o_V map to the same value; this allows us to use set logic
-        #
         # identify models which can be used strictly for text or strictly for images
         vision_models = set(get_vision_models())
         text_models = set(get_models())
@@ -560,7 +551,7 @@ class MixtureOfAgentsConvertRule(ImplementationRule):
         )
         proposer_model_set, is_image_conversion = text_models, False
         if num_image_fields > 1 or list_image_field:
-            proposer_model_set = [model for model in vision_models if model != Model.LLAMA3_V]
+            proposer_model_set = [model for model in vision_models if not model.is_llama_model()]
             is_image_conversion = True
         elif num_image_fields == 1:
             proposer_model_set = vision_models
@@ -636,9 +627,6 @@ class CriticAndRefineConvertRule(ImplementationRule):
             }
         )
-        # NOTE: when comparing pz.Model(s), equality is determined by the string (i.e. pz.Model.value)
-        #       thus, Model.GPT_4o and Model.GPT_4o_V map to the same value; this allows us to use set logic
-        #
         # identify models which can be used strictly for text or strictly for images
         vision_models = set(get_vision_models())
         text_models = set(get_models())
@@ -674,10 +662,10 @@ class CriticAndRefineConvertRule(ImplementationRule):
             # skip this model if:
             # 1. this is a pure vision model and we're not doing an image conversion, or
             # 2. this is a pure text model and we're doing an image conversion, or
-            # 3. this is a vision model hosted by Together (i.e. LLAMA3_V) and there is more than one image field
+            # 3. this is a vision model hosted by Together (i.e. LLAMA3 vision) and there is more than one image field
             first_criteria = model in pure_vision_models and not is_image_conversion
             second_criteria = model in pure_text_models and is_image_conversion
-            third_criteria = model == Model.LLAMA3_V and (num_image_fields > 1 or list_image_field)
+            third_criteria = model.is_llama_model() and model.is_vision_model() and (num_image_fields > 1 or list_image_field)
             if first_criteria or second_criteria or third_criteria:
                 continue
@@ -750,9 +738,6 @@ class SplitConvertRule(ImplementationRule):
             }
         )
-        # NOTE: when comparing pz.Model(s), equality is determined by the string (i.e. pz.Model.value)
-        #       thus, Model.GPT_4o and Model.GPT_4o_V map to the same value; this allows us to use set logic
-        #
         # identify models which can be used strictly for text or strictly for images
         vision_models = set(get_vision_models())
         text_models = set(get_models())
@@ -911,9 +896,6 @@ class LLMFilterRule(ImplementationRule):
             }
         )
-        # NOTE: when comparing pz.Model(s), equality is determined by the string (i.e. pz.Model.value)
-        #       thus, Model.GPT_4o and Model.GPT_4o_V map to the same value; this allows us to use set logic
-        #
         # identify models which can be used strictly for text or strictly for images
         vision_models = set(get_vision_models())
         text_models = set(get_models())
@@ -948,10 +930,10 @@ class LLMFilterRule(ImplementationRule):
             # skip this model if:
             # 1. this is a pure vision model and we're not doing an image filter, or
             # 2. this is a pure text model and we're doing an image filter, or
-            # 3. this is a vision model hosted by Together (i.e. LLAMA3_V) and there is more than one image field
+            # 3. this is a vision model hosted by Together (i.e. LLAMA3 vision) and there is more than one image field
             first_criteria = model in pure_vision_models and not is_image_filter
             second_criteria = model in pure_text_models and is_image_filter
-            third_criteria = model == Model.LLAMA3_V and (num_image_fields > 1 or list_image_field)
+            third_criteria = model.is_llama_model() and model.is_vision_model() and (num_image_fields > 1 or list_image_field)
             if first_criteria or second_criteria or third_criteria:
                 continue

palimpzest/query/optimizer/tasks.py CHANGED Viewed

@@ -235,9 +235,9 @@ class ApplyRule(Task):
             # apply implementation rule
             new_expressions = self.rule.substitute(self.logical_expression, **physical_op_params)
             new_expressions = [expr for expr in new_expressions if expr.get_expr_id() not in expressions]
-            costed_phys_op_ids = context['costed_phys_op_ids']
-            if costed_phys_op_ids is not None:
-                new_expressions = [expr for expr in new_expressions if expr.operator.get_op_id() in costed_phys_op_ids]
+            costed_full_op_ids = context['costed_full_op_ids']
+            if costed_full_op_ids is not None:
+                new_expressions = [expr for expr in new_expressions if expr.operator.get_full_op_id() in costed_full_op_ids]
             expressions.update({expr.get_expr_id(): expr for expr in new_expressions})
             group.physical_expressions.update(new_expressions)

palimpzest/query/processor/processing_strategy_type.py CHANGED Viewed

@@ -17,9 +17,9 @@ class ProcessingStrategyType(Enum):
         Returns a list of valid execution strategies for the given processing strategy.
         """
         if self == ProcessingStrategyType.SENTINEL or self == ProcessingStrategyType.NO_SENTINEL:
-            return [ExecutionStrategyType.SEQUENTIAL, ExecutionStrategyType.PIPELINED, ExecutionStrategyType.PARALLEL]
+            return [ExecutionStrategyType.SEQUENTIAL, ExecutionStrategyType.PIPELINED, ExecutionStrategyType.PARALLEL, ExecutionStrategyType.SEQUENTIAL_PARALLEL]
         elif self == ProcessingStrategyType.STREAMING:
-            return [ExecutionStrategyType.PIPELINED, ExecutionStrategyType.PARALLEL]
+            return [ExecutionStrategyType.PIPELINED, ExecutionStrategyType.PARALLEL, ExecutionStrategyType.SEQUENTIAL_PARALLEL]
     def is_sentinel_strategy(self) -> bool:
         """

palimpzest/query/processor/sentinel_processor.py CHANGED Viewed

@@ -33,8 +33,6 @@ class SentinelQueryProcessor(QueryProcessor):
         """
         Generates and returns a SentinelPlan for the given dataset.
         """
-        # TODO: explicitly pull up filters; for SIGMOD we can explicitly write plans w/filters pulled up
         # create a new optimizer and update its strategy to SENTINEL
         optimizer = self.optimizer.deepcopy_clean()
         optimizer.update_strategy(OptimizationStrategyType.SENTINEL)

palimpzest/sets.py CHANGED Viewed

@@ -5,7 +5,6 @@ from typing import Callable
 import pandas as pd
 from chromadb.api.models.Collection import Collection
-from ragatouille.RAGPretrainedModel import RAGPretrainedModel
 from palimpzest.constants import AggFunc, Cardinality
 from palimpzest.core.data.datareaders import DataReader
@@ -35,7 +34,7 @@ class Set:
         agg_func: AggFunc | None = None,
         group_by: GroupBySig | None = None,
         project_cols: list[str] | None = None,
-        index: Collection | RAGPretrainedModel | None = None,
+        index: Collection | None = None,
         search_func: Callable | None = None,
         search_attr: str | None = None,
         output_attrs: list[dict] | None = None,
@@ -340,7 +339,7 @@ class Dataset(Set):
     def retrieve(
         self,
-        index: Collection | RAGPretrainedModel,
+        index: Collection,
         search_attr: str,
         output_attrs: list[dict] | type[Schema],
         search_func: Callable | None = None,

palimpzest/utils/generation_helpers.py CHANGED Viewed

@@ -12,7 +12,7 @@ def get_json_from_answer(answer: str, model: Model, cardinality: Cardinality) ->
     and optimistically searches for the substring containing the JSON object.
     """
     # model-specific trimming for LLAMA3 responses
-    if model in [Model.LLAMA3, Model.LLAMA3_V]:
+    if model.is_llama_model():
         answer = answer.split("---")[0]
         answer = answer.replace("True", "true")
         answer = answer.replace("False", "false")

palimpzest/utils/model_helpers.py CHANGED Viewed

@@ -9,10 +9,18 @@ def get_vision_models() -> list[Model]:
     """
     models = []
     if os.getenv("OPENAI_API_KEY") is not None:
-        models.extend([Model.GPT_4o_V, Model.GPT_4o_MINI_V])
+        openai_vision_models = [
+            model for model in Model
+            if model.is_openai_model() and model.is_vision_model()
+        ]
+        models.extend(openai_vision_models)
     if os.getenv("TOGETHER_API_KEY") is not None:
-        models.extend([Model.LLAMA3_V])
+        together_vision_models = [
+            model for model in Model
+            if model.is_together_model() and model.is_vision_model()
+        ]
+        models.extend(together_vision_models)
     return models
@@ -23,10 +31,16 @@ def get_models(include_vision: bool = False) -> list[Model]:
     """
     models = []
     if os.getenv("OPENAI_API_KEY") is not None:
-        models.extend([Model.GPT_4o, Model.GPT_4o_MINI])
+        openai_models = [model for model in Model if model.is_openai_model()]
+        models.extend(openai_models)
     if os.getenv("TOGETHER_API_KEY") is not None:
-        models.extend([Model.LLAMA3, Model.MIXTRAL, Model.DEEPSEEK])
+        together_models = [model for model in Model if model.is_together_model()]
+        if not include_vision:
+            together_models = [
+                model for model in together_models if not model.is_vision_model()
+            ]
+        models.extend(together_models)
     if include_vision:
         vision_models = get_vision_models()
@@ -36,17 +50,21 @@ def get_models(include_vision: bool = False) -> list[Model]:
 # The order is the priority of the model
 TEXT_MODEL_PRIORITY = [
+    # Model.o1,
     Model.GPT_4o,
     Model.GPT_4o_MINI,
-    Model.LLAMA3,
+    Model.LLAMA3_3_70B,
     Model.MIXTRAL,
-    Model.DEEPSEEK,
+    Model.DEEPSEEK_V3,
+    Model.LLAMA3_2_3B,
+    Model.LLAMA3_1_8B,
+    Model.DEEPSEEK_R1_DISTILL_QWEN_1_5B,
 ]
 VISION_MODEL_PRIORITY = [
-    Model.GPT_4o_V,
-    Model.GPT_4o_MINI_V,
-    Model.LLAMA3_V,
+    Model.GPT_4o,
+    Model.GPT_4o_MINI,
+    Model.LLAMA3_2_90B_V,
 ]
 def get_champion_model(available_models, vision=False):
     # Select appropriate priority list based on task

palimpzest 0.7.6__py3-none-any.whl → 0.7.8__py3-none-any.whl

palimpzest 0.7.6__py3-none-any.whl → 0.7.8__py3-none-any.whl