palimpzest 0.6.4__py3-none-any.whl → 0.7.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (64)
  1. palimpzest/__init__.py +5 -0
  2. palimpzest/constants.py +110 -43
  3. palimpzest/core/__init__.py +0 -78
  4. palimpzest/core/data/dataclasses.py +382 -44
  5. palimpzest/core/elements/filters.py +7 -3
  6. palimpzest/core/elements/index.py +70 -0
  7. palimpzest/core/elements/records.py +33 -11
  8. palimpzest/core/lib/fields.py +1 -0
  9. palimpzest/core/lib/schemas.py +4 -3
  10. palimpzest/prompts/moa_proposer_convert_prompts.py +0 -4
  11. palimpzest/prompts/prompt_factory.py +44 -7
  12. palimpzest/prompts/split_merge_prompts.py +56 -0
  13. palimpzest/prompts/split_proposer_prompts.py +55 -0
  14. palimpzest/query/execution/execution_strategy.py +435 -53
  15. palimpzest/query/execution/execution_strategy_type.py +20 -0
  16. palimpzest/query/execution/mab_execution_strategy.py +532 -0
  17. palimpzest/query/execution/parallel_execution_strategy.py +143 -172
  18. palimpzest/query/execution/random_sampling_execution_strategy.py +240 -0
  19. palimpzest/query/execution/single_threaded_execution_strategy.py +173 -203
  20. palimpzest/query/generators/api_client_factory.py +31 -0
  21. palimpzest/query/generators/generators.py +256 -76
  22. palimpzest/query/operators/__init__.py +1 -2
  23. palimpzest/query/operators/code_synthesis_convert.py +33 -18
  24. palimpzest/query/operators/convert.py +30 -97
  25. palimpzest/query/operators/critique_and_refine_convert.py +5 -6
  26. palimpzest/query/operators/filter.py +7 -10
  27. palimpzest/query/operators/logical.py +54 -10
  28. palimpzest/query/operators/map.py +130 -0
  29. palimpzest/query/operators/mixture_of_agents_convert.py +6 -6
  30. palimpzest/query/operators/physical.py +3 -12
  31. palimpzest/query/operators/rag_convert.py +66 -18
  32. palimpzest/query/operators/retrieve.py +230 -34
  33. palimpzest/query/operators/scan.py +5 -2
  34. palimpzest/query/operators/split_convert.py +169 -0
  35. palimpzest/query/operators/token_reduction_convert.py +8 -14
  36. palimpzest/query/optimizer/__init__.py +4 -16
  37. palimpzest/query/optimizer/cost_model.py +73 -266
  38. palimpzest/query/optimizer/optimizer.py +87 -58
  39. palimpzest/query/optimizer/optimizer_strategy.py +18 -97
  40. palimpzest/query/optimizer/optimizer_strategy_type.py +37 -0
  41. palimpzest/query/optimizer/plan.py +2 -3
  42. palimpzest/query/optimizer/primitives.py +5 -3
  43. palimpzest/query/optimizer/rules.py +336 -172
  44. palimpzest/query/optimizer/tasks.py +30 -100
  45. palimpzest/query/processor/config.py +38 -22
  46. palimpzest/query/processor/nosentinel_processor.py +16 -520
  47. palimpzest/query/processor/processing_strategy_type.py +28 -0
  48. palimpzest/query/processor/query_processor.py +38 -206
  49. palimpzest/query/processor/query_processor_factory.py +117 -130
  50. palimpzest/query/processor/sentinel_processor.py +90 -0
  51. palimpzest/query/processor/streaming_processor.py +25 -32
  52. palimpzest/sets.py +88 -41
  53. palimpzest/utils/model_helpers.py +8 -7
  54. palimpzest/utils/progress.py +368 -152
  55. palimpzest/utils/token_reduction_helpers.py +1 -3
  56. {palimpzest-0.6.4.dist-info → palimpzest-0.7.0.dist-info}/METADATA +19 -9
  57. palimpzest-0.7.0.dist-info/RECORD +96 -0
  58. {palimpzest-0.6.4.dist-info → palimpzest-0.7.0.dist-info}/WHEEL +1 -1
  59. palimpzest/query/processor/mab_sentinel_processor.py +0 -884
  60. palimpzest/query/processor/random_sampling_sentinel_processor.py +0 -639
  61. palimpzest/utils/index_helpers.py +0 -6
  62. palimpzest-0.6.4.dist-info/RECORD +0 -87
  63. {palimpzest-0.6.4.dist-info → palimpzest-0.7.0.dist-info/licenses}/LICENSE +0 -0
  64. {palimpzest-0.6.4.dist-info → palimpzest-0.7.0.dist-info}/top_level.txt +0 -0
palimpzest/core/data/dataclasses.py

@@ -1,5 +1,7 @@
  from __future__ import annotations
 
+ import time
+ from abc import abstractmethod
  from dataclasses import dataclass, field, fields
  from typing import Any
 
@@ -38,6 +40,12 @@ class GenerationStats:
  # (if applicable) the time (in seconds) spent executing a call to a function
  fn_call_duration_secs: float = 0.0
 
+ # (if applicable) the total number of LLM calls made by this operator
+ total_llm_calls: int = 0
+
+ # (if applicable) the total number of embedding LLM calls made by this operator
+ total_embedding_llm_calls: int = 0
+
  def __iadd__(self, other: GenerationStats) -> GenerationStats:
  # self.raw_answers.extend(other.raw_answers)
  for dataclass_field in [
@@ -48,6 +56,8 @@ class GenerationStats:
  "cost_per_record",
  "llm_call_duration_secs",
  "fn_call_duration_secs",
+ "total_llm_calls",
+ "total_embedding_llm_calls",
  ]:
  setattr(self, dataclass_field, getattr(self, dataclass_field) + getattr(other, dataclass_field))
  return self
@@ -63,6 +73,8 @@ class GenerationStats:
  "llm_call_duration_secs",
  "fn_call_duration_secs",
  "cost_per_record",
+ "total_llm_calls",
+ "total_embedding_llm_calls",
  ]
  }
  # dct['raw_answers'] = self.raw_answers + other.raw_answers
@@ -83,6 +95,8 @@ class GenerationStats:
  "cost_per_record",
  "llm_call_duration_secs",
  "fn_call_duration_secs",
+ "total_llm_calls",
+ "total_embedding_llm_calls",
  ]:
  setattr(self, dataclass_field, getattr(self, dataclass_field) / quotient)
  return self
@@ -101,6 +115,8 @@ class GenerationStats:
  "total_output_cost",
  "llm_call_duration_secs",
  "fn_call_duration_secs",
+ "total_llm_calls",
+ "total_embedding_llm_calls",
  "cost_per_record",
  ]
  }
@@ -108,6 +124,7 @@ class GenerationStats:
  return GenerationStats(**dct)
 
  def __radd__(self, other: int) -> GenerationStats:
+ assert not isinstance(other, GenerationStats), "This should not be called with a GenerationStats object"
  return self
 
 
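The two new counters ride along with the existing accumulation logic: they are added to the same field lists that __iadd__ and the other aggregation helpers above iterate over, so call counts are summed and averaged exactly like costs and latencies. A minimal sketch of the behavior (the import path follows the file list above; all values are made up):

    from palimpzest.core.data.dataclasses import GenerationStats

    stats = GenerationStats(cost_per_record=0.002, total_llm_calls=1)
    more = GenerationStats(cost_per_record=0.003, total_llm_calls=2, total_embedding_llm_calls=1)

    # __iadd__ walks its field list, so the new counters are summed like every other field
    stats += more
    assert stats.total_llm_calls == 3 and stats.total_embedding_llm_calls == 1

The new assert in __radd__ only guards the `0 + stats` path (e.g. from built-in sum()); stats-to-stats addition never reaches it.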
@@ -198,6 +215,12 @@ class RecordOpStats:
  # (if applicable) the time (in seconds) spent executing a UDF or calling an external api
  fn_call_duration_secs: float = 0.0
 
+ # (if applicable) the total number of LLM calls made by this operator
+ total_llm_calls: int = 0
+
+ # (if applicable) the total number of embedding LLM calls made by this operator
+ total_embedding_llm_calls: int = 0
+
  # (if applicable) a boolean indicating whether this is the statistics captured from a failed convert operation
  failed_convert: bool | None = None
 
@@ -232,32 +255,41 @@ class OperatorStats:
  # a list of RecordOpStats processed by the operation
  record_op_stats_lst: list[RecordOpStats] = field(default_factory=list)
 
+ # the ID of the physical operator which precedes this one
+ source_op_id: str | None = None
+
+ # the ID of the physical plan which this operator is part of
+ plan_id: str = ""
+
  # an OPTIONAL dictionary with more detailed information about this operation;
  op_details: dict[str, Any] = field(default_factory=dict)
 
- def add_record_op_stats(
- self,
- record_op_stats_lst: RecordOpStats | list[RecordOpStats],
- source_op_id: str | None,
- plan_id: str,
- ):
- # convert individual record into list
- if not isinstance(record_op_stats_lst, list):
- record_op_stats_lst = [record_op_stats_lst]
-
- # update op stats
- for record_op_stats in record_op_stats_lst:
- record_op_stats.source_op_id = source_op_id
- record_op_stats.plan_id = plan_id
- self.record_op_stats_lst.append(record_op_stats)
- self.total_op_time += record_op_stats.time_per_record
- self.total_op_cost += record_op_stats.cost_per_record
-
- def __iadd__(self, op_stats: OperatorStats):
- """NOTE: we assume the execution layer guarantees these op_stats belong to the same operator."""
- self.total_op_time += op_stats.total_op_time
- self.total_op_cost += op_stats.total_op_cost
- self.record_op_stats_lst.extend(op_stats.record_op_stats_lst)
+ def __iadd__(self, stats: OperatorStats | RecordOpStats) -> OperatorStats:
+ """
+ Sum the given stats to this operator's stats. The given stats can be either:
+
+ 1. an OperatorStats object
+ 2. a RecordOpStats object
+
+ NOTE: in case (1.) we assume the execution layer guarantees that `stats` is
+ generated by the same operator in the same plan. Thus, we assume the
+ op_ids, op_name, source_op_id, etc. do not need to be updated.
+ """
+ if isinstance(stats, OperatorStats):
+ self.total_op_time += stats.total_op_time
+ self.total_op_cost += stats.total_op_cost
+ self.record_op_stats_lst.extend(stats.record_op_stats_lst)
+
+ elif isinstance(stats, RecordOpStats):
+ stats.source_op_id = self.source_op_id
+ stats.plan_id = self.plan_id
+ self.record_op_stats_lst.append(stats)
+ self.total_op_time += stats.time_per_record
+ self.total_op_cost += stats.cost_per_record
+
+ else:
+ raise TypeError(f"Cannot add {type(stats)} to OperatorStats")
+
  return self
 
  def to_json(self):
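With source_op_id and plan_id now stored on the operator itself, the old add_record_op_stats(...) helper collapses into a single polymorphic __iadd__: adding another OperatorStats merges totals and record lists, while adding a RecordOpStats stamps it with this operator's source_op_id/plan_id and appends it. A minimal sketch, assuming the keyword construction used by from_plan further below and made-up operator ids:

    from palimpzest.core.data.dataclasses import OperatorStats

    a = OperatorStats(op_id="convert-1", op_name="LLMConvert", plan_id="plan-0")
    b = OperatorStats(op_id="convert-1", op_name="LLMConvert", plan_id="plan-0")
    a.total_op_time, a.total_op_cost = 1.5, 0.50
    b.total_op_time, b.total_op_cost = 0.5, 0.25

    a += b          # OperatorStats branch: sums time/cost, extends record_op_stats_lst
    assert a.total_op_cost == 0.75

    try:
        a += 42     # anything that is not OperatorStats / RecordOpStats is rejected
    except TypeError:
        pass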
@@ -270,11 +302,22 @@ class OperatorStats:
  "op_details": self.op_details,
  }
 
-
  @dataclass
- class PlanStats:
+ class BasePlanStats:
  """
  Dataclass for storing statistics captured for an entire plan.
+
+ This class is subclassed for tracking:
+ - PlanStats: the statistics for execution of a PhysicalPlan
+ - SentinelPlanStats: the statistics for execution of a SentinelPlan
+
+ The key difference between the two subclasses is that the `operator_stats`
+ field in the PlanStats maps from the physical operator ids to their corresponding
+ OperatorStats objects.
+
+ The `operator_stats` field in the SentinelPlanStats maps from a logical operator id
+ to another dictionary which maps from the physical operator ids to their corresponding
+ OperatorStats objects.
  """
 
  # id for identifying the physical plan
@@ -283,8 +326,10 @@ class PlanStats:
  # string representation of the physical plan
  plan_str: str | None = None
 
- # dictionary of OperatorStats objects (one for each operator)
- operator_stats: dict[str, OperatorStats] = field(default_factory=dict)
+ # dictionary whose values are OperatorStats objects;
+ # PlanStats maps {physical_op_id -> OperatorStats}
+ # SentinelPlanStats maps {logical_op_id -> {physical_op_id -> OperatorStats}}
+ operator_stats: dict = field(default_factory=dict)
 
  # total runtime for the plan measured from the start to the end of PhysicalPlan.execute()
  total_plan_time: float = 0.0
@@ -292,7 +337,108 @@ class PlanStats:
  # total cost for plan
  total_plan_cost: float = 0.0
 
- def __iadd__(self, plan_stats: PlanStats):
+ # start time for the plan execution; should be set by calling PlanStats.start()
+ start_time: float | None = None
+
+ def start(self) -> None:
+ """Start the timer for this plan execution."""
+ self.start_time = time.time()
+
+ def finish(self) -> None:
+ """Finish the timer for this plan execution."""
+ if self.start_time is None:
+ raise RuntimeError("PlanStats.start() must be called before PlanStats.finish()")
+ self.total_plan_time = time.time() - self.start_time
+ self.total_plan_cost = self.sum_op_costs()
+
+ @staticmethod
+ @abstractmethod
+ def from_plan(plan) -> BasePlanStats:
+ """
+ Initialize this PlanStats object from a PhysicalPlan or SentinelPlan object.
+ """
+ pass
+
+ @abstractmethod
+ def sum_op_costs(self) -> float:
+ """
+ Sum the costs of all operators in this plan.
+ """
+ pass
+
+ @abstractmethod
+ def add_record_op_stats(self, record_op_stats: RecordOpStats | list[RecordOpStats]) -> None:
+ """
+ Add the given RecordOpStats to this plan's operator stats.
+ """
+ pass
+
+ @abstractmethod
+ def __iadd__(self, plan_stats: BasePlanStats) -> None:
+ """
+ Add the given PlanStats to this plan's operator stats.
+ """
+ pass
+
+ @abstractmethod
+ def __str__(self) -> str:
+ """
+ Return a string representation of this plan's statistics.
+ """
+ pass
+
+ @abstractmethod
+ def to_json(self) -> dict:
+ """
+ Return a JSON representation of this plan's statistics.
+ """
+ pass
+
+ @dataclass
+ class PlanStats(BasePlanStats):
+ """
+ Subclass of BasePlanStats which captures statistics from the execution of a single PhysicalPlan.
+ """
+ @staticmethod
+ def from_plan(plan) -> PlanStats:
+ """
+ Initialize this PlanStats object from a PhysicalPlan object.
+ """
+ operator_stats = {}
+ for op_idx, op in enumerate(plan.operators):
+ op_id = op.get_op_id()
+ operator_stats[op_id] = OperatorStats(
+ op_id=op_id,
+ op_name=op.op_name(),
+ source_op_id=None if op_idx == 0 else plan.operators[op_idx - 1].get_op_id(),
+ plan_id=plan.plan_id,
+ op_details={k: str(v) for k, v in op.get_id_params().items()},
+ )
+
+ return PlanStats(plan_id=plan.plan_id, plan_str=str(plan), operator_stats=operator_stats)
+
+ def sum_op_costs(self) -> float:
+ """
+ Sum the costs of all operators in this plan.
+ """
+ return sum([op_stats.total_op_cost for _, op_stats in self.operator_stats.items()])
+
+ def add_record_op_stats(self, record_op_stats: RecordOpStats | list[RecordOpStats]) -> None:
+ """
+ Add the given RecordOpStats to this plan's operator stats.
+ """
+ # normalize input type to be list[RecordOpStats]
+ record_op_stats_lst = record_op_stats if isinstance(record_op_stats, list) else [record_op_stats]
+
+ # update operator stats
+ for record_op_stats in record_op_stats_lst:
+ op_id = record_op_stats.op_id
+ if op_id in self.operator_stats:
+ self.operator_stats[op_id] += record_op_stats
+ else:
+ raise ValueError(f"RecordOpStats with physical_op_id {op_id} not found in PlanStats")
+
+ def __iadd__(self, plan_stats: PlanStats) -> None:
  """
  NOTE: we assume the execution layer guarantees:
  1. these plan_stats belong to the same plan
@@ -302,24 +448,20 @@ class PlanStats:
  """
  self.total_plan_time += plan_stats.total_plan_time
  self.total_plan_cost += plan_stats.total_plan_cost
- for op, op_stats in plan_stats.operator_stats.items():
- if op in self.operator_stats:
- self.operator_stats[op] += op_stats
+ for op_id, op_stats in plan_stats.operator_stats.items():
+ if op_id in self.operator_stats:
+ self.operator_stats[op_id] += op_stats
  else:
- self.operator_stats[op] = op_stats
+ self.operator_stats[op_id] = op_stats
 
- def finalize(self, total_plan_time: float):
- self.total_plan_time = total_plan_time
- self.total_plan_cost = sum([op_stats.total_op_cost for _, op_stats in self.operator_stats.items()])
-
- def __str__(self):
- stats = f"Total_plan_time={self.total_plan_time} \n"
- stats += f"Total_plan_cost={self.total_plan_cost} \n"
+ def __str__(self) -> str:
+ stats = f"total_plan_time={self.total_plan_time} \n"
+ stats += f"total_plan_cost={self.total_plan_cost} \n"
  for idx, op_stats in enumerate(self.operator_stats.values()):
  stats += f"{idx}. {op_stats.op_name} time={op_stats.total_op_time} cost={op_stats.total_op_cost} \n"
  return stats
 
- def to_json(self):
+ def to_json(self) -> dict:
  return {
  "plan_id": self.plan_id,
  "plan_str": self.plan_str,
@@ -329,6 +471,100 @@ class PlanStats:
  }
 
 
+ @dataclass
+ class SentinelPlanStats(BasePlanStats):
+ """
+ Subclass of BasePlanStats which captures statistics from the execution of a single SentinelPlan.
+ """
+ @staticmethod
+ def from_plan(plan) -> SentinelPlanStats:
+ """
+ Initialize this PlanStats object from a Sentinel object.
+ """
+ operator_stats = {}
+ for op_set_idx, (logical_op_id, op_set) in enumerate(plan):
+ operator_stats[logical_op_id] = {}
+ for physical_op in op_set:
+ op_id = physical_op.get_op_id()
+ operator_stats[logical_op_id][op_id] = OperatorStats(
+ op_id=op_id,
+ op_name=physical_op.op_name(),
+ source_op_id=None if op_set_idx == 0 else plan.logical_op_ids[op_set_idx - 1],
+ plan_id=plan.plan_id,
+ op_details={k: str(v) for k, v in physical_op.get_id_params().items()},
+ )
+
+ return SentinelPlanStats(plan_id=plan.plan_id, plan_str=str(plan), operator_stats=operator_stats)
+
+ def sum_op_costs(self) -> float:
+ """
+ Sum the costs of all operators in this plan.
+ """
+ return sum(sum([op_stats.total_op_cost for _, op_stats in phys_op_stats.items()]) for _, phys_op_stats in self.operator_stats.items())
+
+ def add_record_op_stats(self, record_op_stats: RecordOpStats | list[RecordOpStats]) -> None:
+ """
+ Add the given RecordOpStats to this plan's operator stats.
+ """
+ # normalize input type to be list[RecordOpStats]
+ record_op_stats_lst = record_op_stats if isinstance(record_op_stats, list) else [record_op_stats]
+
+ # update operator stats
+ for record_op_stats in record_op_stats_lst:
+ logical_op_id = record_op_stats.logical_op_id
+ physical_op_id = record_op_stats.op_id
+ if logical_op_id in self.operator_stats:
+ if physical_op_id in self.operator_stats[logical_op_id]:
+ self.operator_stats[logical_op_id][physical_op_id] += record_op_stats
+ else:
+ raise ValueError(f"RecordOpStats with physical_op_id {physical_op_id} not found in SentinelPlanStats")
+ else:
+ raise ValueError(f"RecordOpStats with logical_op_id {logical_op_id} not found in SentinelPlanStats")
+
+ def __iadd__(self, plan_stats: SentinelPlanStats) -> None:
+ """
+ NOTE: we assume the execution layer guarantees:
+ 1. these plan_stats belong to the same plan
+ 2. these plan_stats come from sequential (non-overlapping) executions of the same plan
+
+ The latter criteria implies it is okay for this method to sum the plan (and operator) runtimes.
+ """
+ self.total_plan_time += plan_stats.total_plan_time
+ self.total_plan_cost += plan_stats.total_plan_cost
+ for logical_op_id, physical_op_stats in plan_stats.operator_stats.items():
+ for physical_op_id, op_stats in physical_op_stats.items():
+ if logical_op_id in self.operator_stats:
+ if physical_op_id in self.operator_stats[logical_op_id]:
+ self.operator_stats[logical_op_id][physical_op_id] += op_stats
+ else:
+ self.operator_stats[logical_op_id][physical_op_id] = op_stats
+ else:
+ self.operator_stats[logical_op_id] = physical_op_stats
+
+ def __str__(self) -> str:
+ stats = f"total_plan_time={self.total_plan_time} \n"
+ stats += f"total_plan_cost={self.total_plan_cost} \n"
+ for outer_idx, physical_op_stats in enumerate(self.operator_stats.values()):
+ total_time = sum([op_stats.total_op_time for op_stats in physical_op_stats.values()])
+ total_cost = sum([op_stats.total_op_cost for op_stats in physical_op_stats.values()])
+ stats += f"{outer_idx}. total_time={total_time} total_cost={total_cost} \n"
+ for inner_idx, op_stats in enumerate(physical_op_stats.values()):
+ stats += f" {outer_idx}.{inner_idx}. {op_stats.op_name} time={op_stats.total_op_time} cost={op_stats.total_op_cost} \n"
+ return stats
+
+ def to_json(self) -> dict:
+ return {
+ "plan_id": self.plan_id,
+ "plan_str": self.plan_str,
+ "operator_stats": {
+ logical_op_id: {physical_op_id: op_stats.to_json() for physical_op_id, op_stats in physical_op_stats.items()}
+ for logical_op_id, physical_op_stats in self.operator_stats.items()
+ },
+ "total_plan_time": self.total_plan_time,
+ "total_plan_cost": self.total_plan_cost,
+ }
+
+
 
  @dataclass
  class ExecutionStats:
@@ -338,28 +574,130 @@ class ExecutionStats:
  # string for identifying this workload execution
  execution_id: str | None = None
 
+ # dictionary of SentinelPlanStats objects (one for each sentinel plan run during execution)
+ sentinel_plan_stats: dict[str, SentinelPlanStats] = field(default_factory=dict)
+
  # dictionary of PlanStats objects (one for each plan run during execution)
  plan_stats: dict[str, PlanStats] = field(default_factory=dict)
 
  # total time spent optimizing
- total_optimization_time: float = 0.0
+ optimization_time: float = 0.0
 
- # total runtime for a plan's execution
+ # total cost of optimizing
+ optimization_cost: float = 0.0
+
+ # total time spent executing the optimized plan
+ plan_execution_time: float = 0.0
+
+ # total cost of executing the optimized plan
+ plan_execution_cost: float = 0.0
+
+ # total runtime for the entire execution
  total_execution_time: float = 0.0
 
- # total cost for a plan's execution
+ # total cost for the entire execution
  total_execution_cost: float = 0.0
 
+ # dictionary of sentinel plan strings; useful for printing executed sentinel plans in demos
+ sentinel_plan_strs: dict[str, str] = field(default_factory=dict)
+
  # dictionary of plan strings; useful for printing executed plans in demos
  plan_strs: dict[str, str] = field(default_factory=dict)
 
+ # start time for the execution; should be set by calling ExecutionStats.start()
+ start_time: float | None = None
+
+ # end time for the optimization;
+ optimization_end_time: float | None = None
+
+ def start(self) -> None:
+ """Start the timer for this execution."""
+ self.start_time = time.time()
+
+ def finish_optimization(self) -> None:
+ """Finish the timer for the optimization phase of this execution."""
+ if self.start_time is None:
+ raise RuntimeError("ExecutionStats.start() must be called before ExecutionStats.finish_optimization()")
+
+ # compute optimization time and cost
+ self.optimization_end_time = time.time()
+ self.optimization_time = self.optimization_end_time - self.start_time
+ self.optimization_cost = self.sum_sentinel_plan_costs()
+
+ # compute sentinel_plan_strs
+ self.sentinel_plan_strs = {plan_id: plan_stats.plan_str for plan_id, plan_stats in self.sentinel_plan_stats.items()}
+
+ def finish(self) -> None:
+ """Finish the timer for this execution."""
+ if self.start_time is None:
+ raise RuntimeError("ExecutionStats.start() must be called before ExecutionStats.finish()")
+
+ # compute time for plan and total execution
+ end_time = time.time()
+ self.plan_execution_time = (
+ end_time - self.optimization_end_time
+ if self.optimization_end_time is not None
+ else end_time - self.start_time
+ )
+ self.total_execution_time = end_time - self.start_time
+
+ # compute the cost for plan and total execution
+ self.plan_execution_cost = self.sum_plan_costs()
+ self.total_execution_cost = self.optimization_cost + self.plan_execution_cost
+
+ # compute plan_strs
+ self.plan_strs = {plan_id: plan_stats.plan_str for plan_id, plan_stats in self.plan_stats.items()}
+
+ def sum_sentinel_plan_costs(self) -> float:
+ """
+ Sum the costs of all SentinelPlans in this execution.
+ """
+ return sum([plan_stats.sum_op_costs() for _, plan_stats in self.sentinel_plan_stats.items()])
+
+ def sum_plan_costs(self) -> float:
+ """
+ Sum the costs of all PhysicalPlans in this execution.
+ """
+ return sum([plan_stats.sum_op_costs() for _, plan_stats in self.plan_stats.items()])
+
+ def add_plan_stats(self, plan_stats: PlanStats | SentinelPlanStats | list[PlanStats] | list[SentinelPlanStats]) -> None:
+ """
+ Add the given PlanStats (or SentinelPlanStats) to this execution's plan stats.
+
+ NOTE: we make the assumption that the same plan cannot be run more than once in parallel,
+ i.e. each plan stats object for an individual plan comes from two different (sequential)
+ periods in time. Thus, PlanStats objects can be summed.
+ """
+ # normalize input type to be list[PlanStats] or list[SentinelPlanStats]
+ if isinstance(plan_stats, (PlanStats, SentinelPlanStats)):
+ plan_stats = [plan_stats]
+
+ for plan_stats_obj in plan_stats:
+ if isinstance(plan_stats_obj, PlanStats) and plan_stats_obj.plan_id not in self.plan_stats:
+ self.plan_stats[plan_stats_obj.plan_id] = plan_stats_obj
+ elif isinstance(plan_stats_obj, PlanStats):
+ self.plan_stats[plan_stats_obj.plan_id] += plan_stats_obj
+ elif isinstance(plan_stats_obj, SentinelPlanStats) and plan_stats_obj.plan_id not in self.sentinel_plan_stats:
+ self.sentinel_plan_stats[plan_stats_obj.plan_id] = plan_stats_obj
+ elif isinstance(plan_stats_obj, SentinelPlanStats):
+ self.sentinel_plan_stats[plan_stats_obj.plan_id] += plan_stats_obj
+ else:
+ raise TypeError(f"Cannot add {type(plan_stats)} to ExecutionStats")
+
  def to_json(self):
  return {
  "execution_id": self.execution_id,
+ "sentinel_plan_stats": {
+ plan_id: plan_stats.to_json() for plan_id, plan_stats in self.sentinel_plan_stats.items()
+ },
  "plan_stats": {plan_id: plan_stats.to_json() for plan_id, plan_stats in self.plan_stats.items()},
- "total_optimization_time": self.total_optimization_time,
+ "optimization_time": self.optimization_time,
+ "optimization_cost": self.optimization_cost,
+ "plan_execution_time": self.plan_execution_time,
+ "plan_execution_cost": self.plan_execution_cost,
  "total_execution_time": self.total_execution_time,
  "total_execution_cost": self.total_execution_cost,
+ "sentinel_plan_strs": self.sentinel_plan_strs,
  "plan_strs": self.plan_strs,
  }
 
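ExecutionStats now splits an overall run into an optimization phase (sentinel plans) and a plan-execution phase, each with its own time and cost, using the same timestamp-based protocol as BasePlanStats. A minimal sketch of the intended call order (method names as added above; the plan-stats objects would come from actual plan executions):

    from palimpzest.core.data.dataclasses import ExecutionStats

    exec_stats = ExecutionStats(execution_id="exec-0")
    exec_stats.start()
    # ... run sentinel plans, then exec_stats.add_plan_stats(sentinel_plan_stats) ...
    exec_stats.finish_optimization()   # optimization_time/cost, sentinel_plan_strs
    # ... run the chosen physical plan, then exec_stats.add_plan_stats(plan_stats) ...
    exec_stats.finish()                # plan_execution_time/cost plus the overall totals

add_plan_stats dispatches on type, so PlanStats and SentinelPlanStats land in separate dictionaries, and repeated stats for the same plan_id are merged with +=.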
palimpzest/core/elements/filters.py

@@ -16,17 +16,20 @@ class Filter:
  self.filter_fn = filter_fn
 
  def serialize(self) -> dict[str, Any]:
- return {"filter_condition": self.filter_condition, "filter_fn": str(self.filter_fn)}
+ return {
+ "filter_condition": self.filter_condition,
+ "filter_fn": self.filter_fn.__name__ if self.filter_fn is not None else None,
+ }
 
  def get_filter_str(self) -> str:
- return self.filter_condition if self.filter_condition is not None else str(self.filter_fn)
+ return self.filter_condition if self.filter_condition is not None else self.filter_fn.__name__
 
  def __repr__(self) -> str:
  return "Filter(" + self.get_filter_str() + ")"
 
  def __hash__(self) -> int:
  # custom hash function
- return hash(self.filter_condition) if self.filter_condition is not None else hash(str(self.filter_fn))
+ return hash(self.filter_condition) if self.filter_condition is not None else hash(self.filter_fn.__name__)
 
  def __eq__(self, other) -> bool:
  # __eq__ should be defined for consistency with __hash__
@@ -35,5 +38,6 @@ class Filter:
  and self.filter_condition == other.filter_condition
  and self.filter_fn == other.filter_fn
  )
+
  def __str__(self) -> str:
  return self.get_filter_str()
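Filter now serializes, prints, and hashes UDF filters by function name rather than by str(filter_fn), which embeds a memory address and therefore differs between runs. A minimal sketch, assuming Filter can be constructed with just a filter_fn keyword (as the None-checks above suggest); the record attribute used by the UDF is hypothetical:

    from palimpzest.core.elements.filters import Filter

    def mentions_mit(record) -> bool:     # hypothetical UDF
        return "MIT" in record.contents   # `contents` is an assumed field name

    f = Filter(filter_fn=mentions_mit)
    print(f.serialize())   # {'filter_condition': None, 'filter_fn': 'mentions_mit'}
    print(hash(f) == hash(Filter(filter_fn=mentions_mit)))   # True: equal for any Filter wrapping this function name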
palimpzest/core/elements/index.py (new file)

@@ -0,0 +1,70 @@
+ from __future__ import annotations
+
+ from abc import ABC, abstractmethod
+
+ from chromadb.api.models.Collection import Collection
+ from ragatouille.RAGPretrainedModel import RAGPretrainedModel
+
+
+ def index_factory(index: Collection | RAGPretrainedModel) -> PZIndex:
+ """
+ Factory function to create a PZ index based on the type of the provided index.
+
+ Args:
+ index (Collection | RAGPretrainedModel): The index provided by the user.
+
+ Returns:
+ PZIndex: The PZ wrapped Index.
+ """
+ if isinstance(index, Collection):
+ return ChromaIndex(index)
+ elif isinstance(index, RAGPretrainedModel):
+ return RagatouilleIndex(index)
+ else:
+ raise TypeError(f"Unsupported index type: {type(index)}\nindex must be a `chromadb.api.models.Collection.Collection` or `ragatouille.RAGPretrainedModel.RAGPretrainedModel`")
+
+
+ class BaseIndex(ABC):
+
+ def __init__(self, index: Collection | RAGPretrainedModel):
+ self.index = index
+
+ def __str__(self):
+ """
+ Return a string representation of the index.
+ """
+ return f"{self.__class__.__name__}"
+
+ @abstractmethod
+ def search(self, query_embedding: list[float] | list[list[float]], results_per_query: int = 1) -> list | list[list]:
+ """
+ Query the index with a string or a list of strings.
+
+ Args:
+ query (str | list[str]): The query string or list of strings to search for.
+ results_per_query (int): The number of top results to retrieve for each query.
+
+ Returns:
+ list | list[list]: The top results for the query. If query is a list, the top
+ results for each query in the list are returned. Each list will contain the
+ raw elements yielded by the index. This way, users can program against the
+ results they expect to get from e.g. chromadb or ragatouille.
+ """
+ pass
+
+
+ class ChromaIndex(BaseIndex):
+ def __init__(self, index: Collection):
+ assert isinstance(index, Collection), "ChromaIndex input must be a `chromadb.api.models.Collection.Collection`"
+ super().__init__(index)
+
+
+
+ class RagatouilleIndex(BaseIndex):
+ def __init__(self, index: RAGPretrainedModel):
+ assert isinstance(index, RAGPretrainedModel), "RagatouilleIndex input must be a `ragatouille.RAGPretrainedModel.RAGPretrainedModel`"
+ super().__init__(index)
+
+
+ # define type for PZIndex
+ PZIndex = ChromaIndex | RagatouilleIndex
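The new index module gives palimpzest a common wrapper around user-provided indexes: index_factory wraps a chromadb Collection as a ChromaIndex and a ragatouille RAGPretrainedModel as a RagatouilleIndex, and rejects anything else with a TypeError. A minimal sketch of the dispatch contract (note that, in the file as shown, search is still abstract on BaseIndex, so only the type checking is exercised here):

    from palimpzest.core.elements.index import index_factory

    # a chromadb Collection -> ChromaIndex, a RAGPretrainedModel -> RagatouilleIndex;
    # any other object trips the factory's TypeError branch:
    try:
        index_factory("not an index")
    except TypeError as err:
        print(err)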