PyPI - palimpzest - Versions diffs - 0.7.20__py3-none-any.whl → 0.8.0__py3-none-any.whl - Mend

palimpzest 0.7.20-py3-none-any.whl → 0.8.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (87)

palimpzest/__init__.py +37 -6
palimpzest/agents/__init__.py +0 -0
palimpzest/agents/compute_agents.py +0 -0
palimpzest/agents/search_agents.py +637 -0
palimpzest/constants.py +259 -197
palimpzest/core/data/context.py +393 -0
palimpzest/core/data/context_manager.py +163 -0
palimpzest/core/data/dataset.py +634 -0
palimpzest/core/data/{datareaders.py → iter_dataset.py} +202 -126
palimpzest/core/elements/groupbysig.py +16 -13
palimpzest/core/elements/records.py +166 -75
palimpzest/core/lib/schemas.py +152 -390
palimpzest/core/{data/dataclasses.py → models.py} +306 -170
palimpzest/policy.py +2 -27
palimpzest/prompts/__init__.py +35 -5
palimpzest/prompts/agent_prompts.py +357 -0
palimpzest/prompts/context_search.py +9 -0
palimpzest/prompts/convert_prompts.py +61 -5
palimpzest/prompts/filter_prompts.py +50 -5
palimpzest/prompts/join_prompts.py +163 -0
palimpzest/prompts/moa_proposer_convert_prompts.py +5 -5
palimpzest/prompts/prompt_factory.py +358 -46
palimpzest/prompts/validator.py +239 -0
palimpzest/query/execution/all_sample_execution_strategy.py +134 -76
palimpzest/query/execution/execution_strategy.py +210 -317
palimpzest/query/execution/execution_strategy_type.py +5 -7
palimpzest/query/execution/mab_execution_strategy.py +249 -136
palimpzest/query/execution/parallel_execution_strategy.py +153 -244
palimpzest/query/execution/single_threaded_execution_strategy.py +107 -64
palimpzest/query/generators/generators.py +157 -330
palimpzest/query/operators/__init__.py +15 -5
palimpzest/query/operators/aggregate.py +50 -33
palimpzest/query/operators/compute.py +201 -0
palimpzest/query/operators/convert.py +27 -21
palimpzest/query/operators/critique_and_refine_convert.py +7 -5
palimpzest/query/operators/distinct.py +62 -0
palimpzest/query/operators/filter.py +22 -13
palimpzest/query/operators/join.py +402 -0
palimpzest/query/operators/limit.py +3 -3
palimpzest/query/operators/logical.py +198 -80
palimpzest/query/operators/mixture_of_agents_convert.py +10 -8
palimpzest/query/operators/physical.py +27 -21
palimpzest/query/operators/project.py +3 -3
palimpzest/query/operators/rag_convert.py +7 -7
palimpzest/query/operators/retrieve.py +9 -9
palimpzest/query/operators/scan.py +81 -42
palimpzest/query/operators/search.py +524 -0
palimpzest/query/operators/split_convert.py +10 -8
palimpzest/query/optimizer/__init__.py +7 -9
palimpzest/query/optimizer/cost_model.py +108 -441
palimpzest/query/optimizer/optimizer.py +123 -181
palimpzest/query/optimizer/optimizer_strategy.py +66 -61
palimpzest/query/optimizer/plan.py +352 -67
palimpzest/query/optimizer/primitives.py +43 -19
palimpzest/query/optimizer/rules.py +484 -646
palimpzest/query/optimizer/tasks.py +127 -58
palimpzest/query/processor/config.py +41 -76
palimpzest/query/processor/query_processor.py +73 -18
palimpzest/query/processor/query_processor_factory.py +46 -38
palimpzest/schemabuilder/schema_builder.py +15 -28
palimpzest/utils/model_helpers.py +27 -77
palimpzest/utils/progress.py +114 -102
palimpzest/validator/__init__.py +0 -0
palimpzest/validator/validator.py +306 -0
{palimpzest-0.7.20.dist-info → palimpzest-0.8.0.dist-info}/METADATA +6 -1
palimpzest-0.8.0.dist-info/RECORD +95 -0
palimpzest/core/lib/fields.py +0 -141
palimpzest/prompts/code_synthesis_prompts.py +0 -28
palimpzest/query/execution/random_sampling_execution_strategy.py +0 -240
palimpzest/query/generators/api_client_factory.py +0 -30
palimpzest/query/operators/code_synthesis_convert.py +0 -488
palimpzest/query/operators/map.py +0 -130
palimpzest/query/processor/nosentinel_processor.py +0 -33
palimpzest/query/processor/processing_strategy_type.py +0 -28
palimpzest/query/processor/sentinel_processor.py +0 -88
palimpzest/query/processor/streaming_processor.py +0 -149
palimpzest/sets.py +0 -405
palimpzest/utils/datareader_helpers.py +0 -61
palimpzest/utils/demo_helpers.py +0 -75
palimpzest/utils/field_helpers.py +0 -69
palimpzest/utils/generation_helpers.py +0 -69
palimpzest/utils/sandbox.py +0 -183
palimpzest-0.7.20.dist-info/RECORD +0 -95
palimpzest/core/{elements/index.py → data/index_dataset.py} +0 -0
{palimpzest-0.7.20.dist-info → palimpzest-0.8.0.dist-info}/WHEEL +0 -0
{palimpzest-0.7.20.dist-info → palimpzest-0.8.0.dist-info}/licenses/LICENSE +0 -0
{palimpzest-0.7.20.dist-info → palimpzest-0.8.0.dist-info}/top_level.txt +0 -0

palimpzest/core/{data/dataclasses.py → models.py} RENAMED

@@ -1,17 +1,16 @@
 from __future__ import annotations
+import json
 import time
 from abc import abstractmethod
-from dataclasses import dataclass, field, fields
 from typing import Any
-import numpy as np
+from pydantic import BaseModel, Field
-@dataclass
-class GenerationStats:
+class GenerationStats(BaseModel):
     """
-    Dataclass for storing statistics about the execution of an operator on a single record.
+    Model for storing statistics about the execution of an operator on a single record.
     """
     model_name: str | None = None
@@ -19,6 +18,15 @@ class GenerationStats:
     # The raw answer as output from the generator (a list of strings, possibly of len 1)
     # raw_answers: Optional[List[str]] = field(default_factory=list)
+    # the number of input audio tokens
+    input_audio_tokens: int = 0
+    # the number of input text tokens
+    input_text_tokens: int = 0
+    # the number of input image tokens
+    input_image_tokens: int = 0
     # the total number of input tokens processed by this operator; None if this operation did not use an LLM
     # typed as a float because GenerationStats may be amortized (i.e. divided) across a number of output records
     total_input_tokens: float = 0.0
@@ -33,7 +41,7 @@ class GenerationStats:
     # the total cost of processing the output tokens; None if this operation did not use an LLM
     total_output_cost: float = 0.0
-    # the total cost of processing the output tokens; None if this operation did not use an LLM
+    # the total cost of processing the input and output tokens; None if this operation did not use an LLM
     cost_per_record: float = 0.0
     # (if applicable) the time (in seconds) spent executing a call to an LLM
@@ -50,7 +58,7 @@ class GenerationStats:
     def __iadd__(self, other: GenerationStats) -> GenerationStats:
         # self.raw_answers.extend(other.raw_answers)
-        for dataclass_field in [
+        for model_field in [
             "total_input_tokens",
             "total_output_tokens",
             "total_input_cost",
@@ -61,7 +69,7 @@ class GenerationStats:
             "total_llm_calls",
             "total_embedding_llm_calls",
         ]:
-            setattr(self, dataclass_field, getattr(self, dataclass_field) + getattr(other, dataclass_field))
+            setattr(self, model_field, getattr(self, model_field) + getattr(other, model_field))
         return self
     def __add__(self, other: GenerationStats) -> GenerationStats:
@@ -89,7 +97,7 @@ class GenerationStats:
             raise ZeroDivisionError("Cannot divide by zero")
         if isinstance(quotient, int):
             quotient = float(quotient)
-        for dataclass_field in [
+        for model_field in [
             "total_input_tokens",
             "total_output_tokens",
             "total_input_cost",
@@ -100,7 +108,7 @@ class GenerationStats:
             "total_llm_calls",
             "total_embedding_llm_calls",
         ]:
-            setattr(self, dataclass_field, getattr(self, dataclass_field) / quotient)
+            setattr(self, model_field, getattr(self, model_field) / quotient)
         return self
     def __truediv__(self, quotient: float) -> GenerationStats:
@@ -129,22 +137,30 @@ class GenerationStats:
         assert not isinstance(other, GenerationStats), "This should not be called with a GenerationStats object"
         return self
+    # NOTE: this is added temporarily to help track cost of compute agent writing PZ code;
+    #       once we find a long-term solution for tracking that cost, we can remove this
+    def to_json(self, filepath: str | None = None) -> dict | None:
+        if filepath is None:
+            return self.model_dump(mode="json")
+        with open(filepath, "w") as f:
+            json.dump(self.model_dump(mode="json"), f)
-@dataclass
-class RecordOpStats:
+class RecordOpStats(BaseModel):
     """
-    Dataclass for storing statistics about the execution of an operator on a single record.
+    Model for storing statistics about the execution of an operator on a single record.
     """
     ##### REQUIRED FIELDS #####
     # record id; an identifier for this record
-    record_id: str
+    record_id: str | int
-    # identifier for the parent of this record
-    record_parent_id: str
+    # identifier for the parent(s) of this record
+    record_parent_ids: list[str | int] | None
-    # idenifier for the source idx of this record
-    record_source_idx: str
+    # identifier for the source indices of this record
+    record_source_indices: list[str | int]
     # a dictionary with the record state after being processed by the operator
     record_state: dict[str, Any]
@@ -165,8 +181,11 @@ class RecordOpStats:
     cost_per_record: float
     ##### NOT-OPTIONAL, BUT FILLED BY EXECUTION CLASS AFTER CONSTRUCTOR CALL #####
-    # the ID of the physical operation which produced the input record for this record at this operation
-    source_full_op_id: str | None = None
+    # the ID(s) of the physical operation(s) which produced the input record(s) for this record at this operation
+    source_unique_full_op_ids: list[str] | None = None
+    # the ID(s) of the logical operation(s) which produced the input record(s) for this record at this operation
+    source_unique_logical_op_ids: list[str] | None = None
     # the ID of the physical plan which produced this record at this operation
     plan_id: str = ""
@@ -207,8 +226,11 @@ class RecordOpStats:
     # (if applicable) the filter text (or a string representation of the filter function) applied to this record
     filter_str: str | None = None
+    # (if applicable) the join condition applied to this record
+    join_condition: str | None = None
     # the True/False result of whether this record was output by the operator or not
-    # (can only be False if the operator is as Filter)
+    # (can only be False if the operator is a Filter or Join)
     passed_operator: bool = True
     # (if applicable) the time (in seconds) spent executing a call to an LLM
@@ -230,16 +252,12 @@ class RecordOpStats:
     image_operation: bool | None = None
     # an OPTIONAL dictionary with more detailed information about this operation;
-    op_details: dict[str, Any] = field(default_factory=dict)
-    def to_json(self):
-        return {field.name: getattr(self, field.name) for field in fields(self)}
+    op_details: dict[str, Any] = Field(default_factory=dict)
-@dataclass
-class OperatorStats:
+class OperatorStats(BaseModel):
     """
-    Dataclass for storing statistics captured within a given operator.
+    Model for storing statistics captured within a given operator.
     """
     # the full ID of the physical operation in which these stats were collected
@@ -254,17 +272,26 @@ class OperatorStats:
     # the total cost of this operation
     total_op_cost: float = 0.0
+    # the total input tokens processed by this operation
+    total_input_tokens: int = 0
+    # the total output tokens processed by this operation
+    total_output_tokens: int = 0
     # a list of RecordOpStats processed by the operation
-    record_op_stats_lst: list[RecordOpStats] = field(default_factory=list)
+    record_op_stats_lst: list[RecordOpStats] = Field(default_factory=list)
-    # the full ID of the physical operator which precedes this one
-    source_full_op_id: str | None = None
+    # the unique full ID(s) of the physical operator(s) which precede this one (used by PlanStats)
+    source_unique_full_op_ids: list[str] | None = None
+    # the unique full ID(s) of the logical operator(s) which precede this one (used by SentinelPlanStats)
+    source_unique_logical_op_ids: list[str] | None = None
     # the ID of the physical plan which this operator is part of
     plan_id: str = ""
     # an OPTIONAL dictionary with more detailed information about this operation;
-    op_details: dict[str, Any] = field(default_factory=dict)
+    op_details: dict[str, Any] = Field(default_factory=dict)
     def __iadd__(self, stats: OperatorStats | RecordOpStats) -> OperatorStats:
         """
@@ -280,34 +307,28 @@ class OperatorStats:
         if isinstance(stats, OperatorStats):
             self.total_op_time += stats.total_op_time
             self.total_op_cost += stats.total_op_cost
+            self.total_input_tokens += stats.total_input_tokens
+            self.total_output_tokens += stats.total_output_tokens
             self.record_op_stats_lst.extend(stats.record_op_stats_lst)
         elif isinstance(stats, RecordOpStats):
-            stats.source_full_op_id = self.source_full_op_id
+            stats.source_unique_full_op_ids = self.source_unique_full_op_ids
             stats.plan_id = self.plan_id
             self.record_op_stats_lst.append(stats)
             self.total_op_time += stats.time_per_record
             self.total_op_cost += stats.cost_per_record
+            self.total_input_tokens += stats.total_input_tokens
+            self.total_output_tokens += stats.total_output_tokens
         else:
             raise TypeError(f"Cannot add {type(stats)} to OperatorStats")
         return self
-    def to_json(self):
-        return {
-            "full_op_id": self.full_op_id,
-            "op_name": self.op_name,
-            "total_op_time": self.total_op_time,
-            "total_op_cost": self.total_op_cost,
-            "record_op_stats_lst": [record_op_stats.to_json() for record_op_stats in self.record_op_stats_lst],
-            "op_details": self.op_details,
-        }
-@dataclass
-class BasePlanStats:
+class BasePlanStats(BaseModel):
     """
-    Dataclass for storing statistics captured for an entire plan.
+    Model for storing statistics captured for an entire plan.
     This class is subclassed for tracking:
     - PlanStats: the statistics for execution of a PhysicalPlan
@@ -331,7 +352,11 @@ class BasePlanStats:
     # dictionary whose values are OperatorStats objects;
     # PlanStats maps {full_op_id -> OperatorStats}
     # SentinelPlanStats maps {logical_op_id -> {full_op_id -> OperatorStats}}
-    operator_stats: dict = field(default_factory=dict)
+    operator_stats: dict = Field(default_factory=dict)
+    # dictionary whose values are GenerationStats objects for validation;
+    # only used by SentinelPlanStats
+    validation_gen_stats: dict[str, GenerationStats] = Field(default_factory=dict)
     # total runtime for the plan measured from the start to the end of PhysicalPlan.execute()
     total_plan_time: float = 0.0
@@ -339,6 +364,12 @@ class BasePlanStats:
     # total cost for plan
     total_plan_cost: float = 0.0
+    # total input tokens processed by this plan
+    total_input_tokens: int = 0
+    # total output tokens processed by this plan
+    total_output_tokens: int = 0
     # start time for the plan execution; should be set by calling PlanStats.start()
     start_time: float | None = None
@@ -351,7 +382,9 @@ class BasePlanStats:
         if self.start_time is None:
             raise RuntimeError("PlanStats.start() must be called before PlanStats.finish()")
         self.total_plan_time = time.time() - self.start_time
-        self.total_plan_cost = self.sum_op_costs()
+        self.total_plan_cost = self.sum_op_costs() + self.sum_validation_costs()
+        self.total_input_tokens = self.sum_input_tokens() + self.sum_validation_input_tokens()
+        self.total_output_tokens = self.sum_output_tokens() + self.sum_validation_output_tokens()
     @staticmethod
     @abstractmethod
@@ -369,9 +402,23 @@ class BasePlanStats:
         pass
     @abstractmethod
-    def add_record_op_stats(self, record_op_stats: RecordOpStats | list[RecordOpStats]) -> None:
+    def sum_input_tokens(self) -> int:
+        """
+        Sum the input tokens processed by all operators in this plan.
+        """
+        pass
+    @abstractmethod
+    def sum_output_tokens(self) -> int:
+        """
+        Sum the output tokens processed by all operators in this plan.
+        """
+        pass
+    @abstractmethod
+    def add_record_op_stats(self, unique_full_op_id: str, record_op_stats: RecordOpStats | list[RecordOpStats]) -> None:
         """
-        Add the given RecordOpStats to this plan's operator stats.
+        Add the given RecordOpStats to this plan's operator stats for the given operator id.
         """
         pass
@@ -389,14 +436,25 @@ class BasePlanStats:
         """
         pass
-    @abstractmethod
-    def to_json(self) -> dict:
+    def sum_validation_costs(self) -> float:
         """
-        Return a JSON representation of this plan's statistics.
+        Sum the costs of all validation generations in this plan.
         """
-        pass
+        return sum([gen_stats.cost_per_record for _, gen_stats in self.validation_gen_stats.items()])
+    def sum_validation_input_tokens(self) -> int:
+        """
+        Sum the input tokens processed by all validation generations in this plan.
+        """
+        return sum([gen_stats.total_input_tokens for _, gen_stats in self.validation_gen_stats.items()])
+    def sum_validation_output_tokens(self) -> int:
+        """
+        Sum the output tokens processed by all validation generations in this plan.
+        """
+        return sum([gen_stats.total_output_tokens for _, gen_stats in self.validation_gen_stats.items()])
-@dataclass
 class PlanStats(BasePlanStats):
     """
     Subclass of BasePlanStats which captures statistics from the execution of a single PhysicalPlan.
@@ -406,17 +464,18 @@ class PlanStats(BasePlanStats):
         """
         Initialize this PlanStats object from a PhysicalPlan object.
         """
+        # TODO?: have PhysicalPlan return PlanStats object
         operator_stats = {}
-        for op_idx, op in enumerate(plan.operators):
-            full_op_id = op.get_full_op_id()
-            operator_stats[full_op_id] = OperatorStats(
-                full_op_id=full_op_id,
+        for topo_idx, op in enumerate(plan):
+            unique_full_op_id = f"{topo_idx}-{op.get_full_op_id()}"
+            operator_stats[unique_full_op_id] = OperatorStats(
+                full_op_id=op.get_full_op_id(),
                 op_name=op.op_name(),
-                source_full_op_id=None if op_idx == 0 else plan.operators[op_idx - 1].get_full_op_id(),
+                source_unique_full_op_ids=plan.get_source_unique_full_op_ids(topo_idx, op),
                 plan_id=plan.plan_id,
                 op_details={k: str(v) for k, v in op.get_id_params().items()},
             )
         return PlanStats(plan_id=plan.plan_id, plan_str=str(plan), operator_stats=operator_stats)
     def sum_op_costs(self) -> float:
@@ -425,20 +484,31 @@ class PlanStats(BasePlanStats):
         """
         return sum([op_stats.total_op_cost for _, op_stats in self.operator_stats.items()])
-    def add_record_op_stats(self, record_op_stats: RecordOpStats | list[RecordOpStats]) -> None:
+    def sum_input_tokens(self) -> int:
+        """
+        Sum the input tokens processed by all operators in this plan.
+        """
+        return sum([op_stats.total_input_tokens for _, op_stats in self.operator_stats.items()])
+    def sum_output_tokens(self) -> int:
         """
-        Add the given RecordOpStats to this plan's operator stats.
+        Sum the output tokens processed by all operators in this plan.
+        """
+        return sum([op_stats.total_output_tokens for _, op_stats in self.operator_stats.items()])
+    def add_record_op_stats(self, unique_full_op_id: str, record_op_stats: RecordOpStats | list[RecordOpStats]) -> None:
+        """
+        Add the given RecordOpStats to this plan's operator stats for the given operator id.
         """
         # normalize input type to be list[RecordOpStats]
         record_op_stats_lst = record_op_stats if isinstance(record_op_stats, list) else [record_op_stats]
         # update operator stats
         for record_op_stats in record_op_stats_lst:
-            full_op_id = record_op_stats.full_op_id
-            if full_op_id in self.operator_stats:
-                self.operator_stats[full_op_id] += record_op_stats
+            if unique_full_op_id in self.operator_stats:
+                self.operator_stats[unique_full_op_id] += record_op_stats
             else:
-                raise ValueError(f"RecordOpStats with full_op_id {full_op_id} not found in PlanStats")
+                raise ValueError(f"RecordOpStats with unique_full_op_id {unique_full_op_id} not found in PlanStats")
     def __iadd__(self, plan_stats: PlanStats) -> None:
         """
@@ -450,30 +520,24 @@ class PlanStats(BasePlanStats):
         """
         self.total_plan_time += plan_stats.total_plan_time
         self.total_plan_cost += plan_stats.total_plan_cost
-        for full_op_id, op_stats in plan_stats.operator_stats.items():
-            if full_op_id in self.operator_stats:
-                self.operator_stats[full_op_id] += op_stats
+        self.total_input_tokens += plan_stats.total_input_tokens
+        self.total_output_tokens += plan_stats.total_output_tokens
+        for unique_full_op_id, op_stats in plan_stats.operator_stats.items():
+            if unique_full_op_id in self.operator_stats:
+                self.operator_stats[unique_full_op_id] += op_stats
             else:
-                self.operator_stats[full_op_id] = op_stats
+                self.operator_stats[unique_full_op_id] = op_stats
     def __str__(self) -> str:
         stats = f"total_plan_time={self.total_plan_time} \n"
         stats += f"total_plan_cost={self.total_plan_cost} \n"
+        stats += f"total_input_tokens={self.total_input_tokens} \n"
+        stats += f"total_output_tokens={self.total_output_tokens} \n"
         for idx, op_stats in enumerate(self.operator_stats.values()):
             stats += f"{idx}. {op_stats.op_name} time={op_stats.total_op_time} cost={op_stats.total_op_cost} \n"
         return stats
-    def to_json(self) -> dict:
-        return {
-            "plan_id": self.plan_id,
-            "plan_str": self.plan_str,
-            "operator_stats": {full_op_id: op_stats.to_json() for full_op_id, op_stats in self.operator_stats.items()},
-            "total_plan_time": self.total_plan_time,
-            "total_plan_cost": self.total_plan_cost,
-        }
-@dataclass
 class SentinelPlanStats(BasePlanStats):
     """
     Subclass of BasePlanStats which captures statistics from the execution of a single SentinelPlan.
@@ -484,18 +548,19 @@ class SentinelPlanStats(BasePlanStats):
         Initialize this PlanStats object from a Sentinel object.
         """
         operator_stats = {}
-        for op_set_idx, (logical_op_id, op_set) in enumerate(plan):
-            operator_stats[logical_op_id] = {}
+        for topo_idx, (logical_op_id, op_set) in enumerate(plan):
+            unique_logical_op_id = f"{topo_idx}-{logical_op_id}"
+            operator_stats[unique_logical_op_id] = {}
             for physical_op in op_set:
                 full_op_id = physical_op.get_full_op_id()
-                operator_stats[logical_op_id][full_op_id] = OperatorStats(
+                operator_stats[unique_logical_op_id][full_op_id] = OperatorStats(
                     full_op_id=full_op_id,
                     op_name=physical_op.op_name(),
-                    source_full_op_id=None if op_set_idx == 0 else plan.logical_op_ids[op_set_idx - 1],  # NOTE: this may be a reason to keep `source_op_id` instead of `source_full_op_id`
+                    source_unique_logical_op_ids=plan.get_source_unique_logical_op_ids(unique_logical_op_id),
                     plan_id=plan.plan_id,
                     op_details={k: str(v) for k, v in physical_op.get_id_params().items()},
                 )
         return SentinelPlanStats(plan_id=plan.plan_id, plan_str=str(plan), operator_stats=operator_stats)
     def sum_op_costs(self) -> float:
@@ -504,24 +569,45 @@ class SentinelPlanStats(BasePlanStats):
         """
         return sum(sum([op_stats.total_op_cost for _, op_stats in phys_op_stats.items()]) for _, phys_op_stats in self.operator_stats.items())
-    def add_record_op_stats(self, record_op_stats: RecordOpStats | list[RecordOpStats]) -> None:
+    def sum_input_tokens(self) -> int:
+        """
+        Sum the input tokens processed by all operators in this plan.
+        """
+        return sum(sum([op_stats.total_input_tokens for _, op_stats in phys_op_stats.items()]) for _, phys_op_stats in self.operator_stats.items())
+    def sum_output_tokens(self) -> int:
         """
-        Add the given RecordOpStats to this plan's operator stats.
+        Sum the output tokens processed by all operators in this plan.
+        """
+        return sum(sum([op_stats.total_output_tokens for _, op_stats in phys_op_stats.items()]) for _, phys_op_stats in self.operator_stats.items())
+    def add_record_op_stats(self, unique_logical_op_id: str, record_op_stats: RecordOpStats | list[RecordOpStats]) -> None:
+        """
+        Add the given RecordOpStats to this plan's operator stats for the given operator set id.
         """
         # normalize input type to be list[RecordOpStats]
         record_op_stats_lst = record_op_stats if isinstance(record_op_stats, list) else [record_op_stats]
         # update operator stats
         for record_op_stats in record_op_stats_lst:
-            logical_op_id = record_op_stats.logical_op_id
             full_op_id = record_op_stats.full_op_id
-            if logical_op_id in self.operator_stats:
-                if full_op_id in self.operator_stats[logical_op_id]:
-                    self.operator_stats[logical_op_id][full_op_id] += record_op_stats
+            if unique_logical_op_id in self.operator_stats:
+                if full_op_id in self.operator_stats[unique_logical_op_id]:
+                    self.operator_stats[unique_logical_op_id][full_op_id] += record_op_stats
                 else:
                     raise ValueError(f"RecordOpStats with full_op_id {full_op_id} not found in SentinelPlanStats")
             else:
-                raise ValueError(f"RecordOpStats with logical_op_id {logical_op_id} not found in SentinelPlanStats")
+                raise ValueError(f"RecordOpStats with unique_logical_op_id {unique_logical_op_id} not found in SentinelPlanStats")
+    def add_validation_gen_stats(self, unique_logical_op_id: str, gen_stats: GenerationStats) -> None:
+        """
+        Add the given GenerationStats to this plan's validation generation stats for the given logical operator id.
+        """
+        if unique_logical_op_id in self.validation_gen_stats:
+            self.validation_gen_stats[unique_logical_op_id] += gen_stats
+        else:
+            self.validation_gen_stats[unique_logical_op_id] = gen_stats
     def __iadd__(self, plan_stats: SentinelPlanStats) -> None:
         """
@@ -533,19 +619,29 @@ class SentinelPlanStats(BasePlanStats):
         """
         self.total_plan_time += plan_stats.total_plan_time
         self.total_plan_cost += plan_stats.total_plan_cost
-        for logical_op_id, physical_op_stats in plan_stats.operator_stats.items():
+        self.total_input_tokens += plan_stats.total_input_tokens
+        self.total_output_tokens += plan_stats.total_output_tokens
+        for unique_logical_op_id, physical_op_stats in plan_stats.operator_stats.items():
             for full_op_id, op_stats in physical_op_stats.items():
-                if logical_op_id in self.operator_stats:
-                    if full_op_id in self.operator_stats[logical_op_id]:
-                        self.operator_stats[logical_op_id][full_op_id] += op_stats
+                if unique_logical_op_id in self.operator_stats:
+                    if full_op_id in self.operator_stats[unique_logical_op_id]:
+                        self.operator_stats[unique_logical_op_id][full_op_id] += op_stats
                     else:
-                        self.operator_stats[logical_op_id][full_op_id] = op_stats
+                        self.operator_stats[unique_logical_op_id][full_op_id] = op_stats
                 else:
-                    self.operator_stats[logical_op_id] = physical_op_stats
+                    self.operator_stats[unique_logical_op_id] = physical_op_stats
+        for unique_logical_op_id, gen_stats in plan_stats.validation_gen_stats.items():
+            if unique_logical_op_id in self.validation_gen_stats:
+                self.validation_gen_stats[unique_logical_op_id] += gen_stats
+            else:
+                self.validation_gen_stats[unique_logical_op_id] = gen_stats
     def __str__(self) -> str:
         stats = f"total_plan_time={self.total_plan_time} \n"
         stats += f"total_plan_cost={self.total_plan_cost} \n"
+        stats += f"total_input_tokens={self.total_input_tokens} \n"
+        stats += f"total_output_tokens={self.total_output_tokens} \n"
         for outer_idx, physical_op_stats in enumerate(self.operator_stats.values()):
             total_time = sum([op_stats.total_op_time for op_stats in physical_op_stats.values()])
             total_cost = sum([op_stats.total_op_cost for op_stats in physical_op_stats.values()])
@@ -554,33 +650,20 @@ class SentinelPlanStats(BasePlanStats):
                 stats += f"    {outer_idx}.{inner_idx}. {op_stats.op_name} time={op_stats.total_op_time} cost={op_stats.total_op_cost} \n"
         return stats
-    def to_json(self) -> dict:
-        return {
-            "plan_id": self.plan_id,
-            "plan_str": self.plan_str,
-            "operator_stats": {
-                logical_op_id: {full_op_id: op_stats.to_json() for full_op_id, op_stats in physical_op_stats.items()}
-                for logical_op_id, physical_op_stats in self.operator_stats.items()
-            },
-            "total_plan_time": self.total_plan_time,
-            "total_plan_cost": self.total_plan_cost,
-        }
-@dataclass
-class ExecutionStats:
+class ExecutionStats(BaseModel):
     """
-    Dataclass for storing statistics captured for the entire execution of a workload.
+    Model for storing statistics captured for the entire execution of a workload.
     """
     # string for identifying this workload execution
     execution_id: str | None = None
     # dictionary of SentinelPlanStats objects (one for each sentinel plan run during execution)
-    sentinel_plan_stats: dict[str, SentinelPlanStats] = field(default_factory=dict)
+    sentinel_plan_stats: dict[str, SentinelPlanStats] = Field(default_factory=dict)
     # dictionary of PlanStats objects (one for each plan run during execution)
-    plan_stats: dict[str, PlanStats] = field(default_factory=dict)
+    plan_stats: dict[str, PlanStats] = Field(default_factory=dict)
     # total time spent optimizing
     optimization_time: float = 0.0
@@ -600,16 +683,25 @@ class ExecutionStats:
     # total cost for the entire execution
     total_execution_cost: float = 0.0
+    # total number of input tokens processed
+    total_input_tokens: int = 0
+    # total number of output tokens processed
+    total_output_tokens: int = 0
+    # total number of tokens processed
+    total_tokens: int = 0
     # dictionary of sentinel plan strings; useful for printing executed sentinel plans in demos
-    sentinel_plan_strs: dict[str, str] = field(default_factory=dict)
+    sentinel_plan_strs: dict[str, str] = Field(default_factory=dict)
     # dictionary of plan strings; useful for printing executed plans in demos
-    plan_strs: dict[str, str] = field(default_factory=dict)
+    plan_strs: dict[str, str] = Field(default_factory=dict)
     # start time for the execution; should be set by calling ExecutionStats.start()
     start_time: float | None = None
-    # end time for the optimization;
+    # end time for the optimization;
     optimization_end_time: float | None = None
     def start(self) -> None:
@@ -647,6 +739,11 @@ class ExecutionStats:
         self.plan_execution_cost = self.sum_plan_costs()
         self.total_execution_cost = self.optimization_cost + self.plan_execution_cost
+        # compute the tokens for total execution
+        self.total_input_tokens = self.sum_input_tokens()
+        self.total_output_tokens = self.sum_output_tokens()
+        self.total_tokens = self.total_input_tokens + self.total_output_tokens
         # compute plan_strs
         self.plan_strs = {plan_id: plan_stats.plan_str for plan_id, plan_stats in self.plan_stats.items()}
@@ -654,7 +751,7 @@ class ExecutionStats:
         """
         Sum the costs of all SentinelPlans in this execution.
         """
-        return sum([plan_stats.sum_op_costs() for _, plan_stats in self.sentinel_plan_stats.items()])
+        return sum([plan_stats.sum_op_costs() + plan_stats.sum_validation_costs() for _, plan_stats in self.sentinel_plan_stats.items()])
     def sum_plan_costs(self) -> float:
         """
@@ -662,6 +759,22 @@ class ExecutionStats:
         """
         return sum([plan_stats.sum_op_costs() for _, plan_stats in self.plan_stats.items()])
+    def sum_input_tokens(self) -> int:
+        """
+        Sum the input tokens processed in this execution
+        """
+        sentinel_plan_input_tokens = sum([plan_stats.sum_input_tokens() for _, plan_stats in self.sentinel_plan_stats.items()])
+        plan_input_tokens = sum([plan_stats.sum_input_tokens() for _, plan_stats in self.plan_stats.items()])
+        return plan_input_tokens + sentinel_plan_input_tokens
+    def sum_output_tokens(self) -> int:
+        """
+        Sum the output tokens processed in this execution
+        """
+        sentinel_plan_output_tokens = sum([plan_stats.sum_output_tokens() for _, plan_stats in self.sentinel_plan_stats.items()])
+        plan_output_tokens = sum([plan_stats.sum_output_tokens() for _, plan_stats in self.plan_stats.items()])
+        return plan_output_tokens + sentinel_plan_output_tokens
     def add_plan_stats(self, plan_stats: PlanStats | SentinelPlanStats | list[PlanStats] | list[SentinelPlanStats]) -> None:
         """
         Add the given PlanStats (or SentinelPlanStats) to this execution's plan stats.
@@ -686,43 +799,17 @@ class ExecutionStats:
             else:
                 raise TypeError(f"Cannot add {type(plan_stats)} to ExecutionStats")
-    def clean_json(self, stats: dict):
-        """
-        Convert np.int64 and np.float64 to int and float for all values in stats.
-        """
-        for key, value in stats.items():
-            if isinstance(value, dict):
-                stats[key] = self.clean_json(value)
-            elif isinstance(value, np.int64):
-                stats[key] = int(value)
-            elif isinstance(value, np.float64):
-                stats[key] = float(value)
-        return stats
+    def to_json(self, filepath: str | None = None) -> dict | None:
+        if filepath is None:
+            return self.model_dump(mode="json")
-    def to_json(self):
-        stats = {
-            "execution_id": self.execution_id,
-            "sentinel_plan_stats": {
-                plan_id: plan_stats.to_json() for plan_id, plan_stats in self.sentinel_plan_stats.items()
-            },
-            "plan_stats": {plan_id: plan_stats.to_json() for plan_id, plan_stats in self.plan_stats.items()},
-            "optimization_time": self.optimization_time,
-            "optimization_cost": self.optimization_cost,
-            "plan_execution_time": self.plan_execution_time,
-            "plan_execution_cost": self.plan_execution_cost,
-            "total_execution_time": self.total_execution_time,
-            "total_execution_cost": self.total_execution_cost,
-            "sentinel_plan_strs": self.sentinel_plan_strs,
-            "plan_strs": self.plan_strs,
-        }
-        stats = self.clean_json(stats)
-        return stats
+        with open(filepath, "w") as f:
+            json.dump(self.model_dump(mode="json"), f)
-@dataclass
-class OperatorCostEstimates:
+class OperatorCostEstimates(BaseModel):
     """
-    Dataclass for storing estimates of key metrics of interest for each operator.
+    Model for storing estimates of key metrics of interest for each operator.
     """
     # (estimated) number of records output by this operator
@@ -765,10 +852,10 @@ class OperatorCostEstimates:
         """
         Multiply all fields by a scalar.
         """
-        dct = {field.name: getattr(self, field.name) * multiplier for field in fields(self)}
+        dct = {field_name: getattr(self, field_name) * multiplier for field_name in self.model_fields}
         return OperatorCostEstimates(**dct)
-    def __post_init__(self):
+    def model_post_init(self, __context: Any) -> None:
         if self.cardinality_lower_bound is None and self.cardinality_upper_bound is None:
             self.cardinality_lower_bound = self.cardinality
             self.cardinality_upper_bound = self.cardinality
@@ -786,10 +873,9 @@ class OperatorCostEstimates:
             self.quality_upper_bound = self.quality
-@dataclass
-class PlanCost:
+class PlanCost(BaseModel):
     """
-    Dataclass for storing the (cost, time, quality) estimates of (sub)-plans and their upper and lower bounds.
+    Model for storing the (cost, time, quality) estimates of (sub)-plans and their upper and lower bounds.
     """
     # the expression cost
@@ -825,7 +911,16 @@ class PlanCost:
     def __hash__(self):
         return hash(f"{self.cost}-{self.time}-{self.quality}")
-    def __post_init__(self):
+    def __eq__(self, other: Any) -> bool:
+        if not isinstance(other, PlanCost):
+            return False
+        return (
+            self.cost == other.cost
+            and self.time == other.time
+            and self.quality == other.quality
+        )
+    def model_post_init(self, __context: Any) -> None:
         if self.time_lower_bound is None and self.time_upper_bound is None:
             self.time_lower_bound = self.time
             self.time_upper_bound = self.time
@@ -838,30 +933,71 @@ class PlanCost:
             self.quality_lower_bound = self.quality
             self.quality_upper_bound = self.quality
+    def join_add(self, left_plan_cost: PlanCost, right_plan_cost: PlanCost, execution_strategy: str = "parallel") -> PlanCost:
+        """
+        Add the PlanCost objects for two joined plans (left_plan_cost and right_plan_cost)
+        to the PlanCost object for the join operator. The execution strategy determines how
+        the input times are combined. If the execution strategy is "parallel", the input time
+        is the maximum of the two times. If the execution strategy is "sequential" (which is
+        currently anything else), the input time is the sum of the two times.
+        For quality, we compute the product of the operator quality with the average of the
+        two input qualities.
+        NOTE: we currently assume the updating of the op_estimates are handled by the caller
+        as there is not a universally correct meaning of addition of op_estimates.
+        """
+        dct = {}
+        for model_field in ["cost", "cost_lower_bound", "cost_upper_bound"]:
+            op_field_value = getattr(self, model_field)
+            left_plan_field_value = getattr(left_plan_cost, model_field)
+            right_plan_field_value = getattr(right_plan_cost, model_field)
+            if op_field_value is not None and left_plan_field_value is not None and right_plan_field_value is not None:
+                dct[model_field] = op_field_value + left_plan_field_value + right_plan_field_value
+        for model_field in ["time", "time_lower_bound", "time_upper_bound"]:
+            op_field_value = getattr(self, model_field)
+            left_plan_field_value = getattr(left_plan_cost, model_field)
+            right_plan_field_value = getattr(right_plan_cost, model_field)
+            if op_field_value is not None and left_plan_field_value is not None and right_plan_field_value is not None:
+                if execution_strategy == "parallel":
+                    dct[model_field] = op_field_value + max(left_plan_field_value, right_plan_field_value)
+                else:
+                    dct[model_field] = op_field_value + left_plan_field_value + right_plan_field_value
+        for model_field in ["quality", "quality_lower_bound", "quality_upper_bound"]:
+            op_field_value = getattr(self, model_field)
+            left_plan_field_value = getattr(left_plan_cost, model_field)
+            right_plan_field_value = getattr(right_plan_cost, model_field)
+            if op_field_value is not None and left_plan_field_value is not None and right_plan_field_value is not None:
+                dct[model_field] = op_field_value * ((left_plan_field_value + right_plan_field_value) / 2.0)
+        return PlanCost(**dct)
     def __iadd__(self, other: PlanCost) -> PlanCost:
         """
         NOTE: we currently assume the updating of the op_estimates are handled by the caller
-        as there is not a universally correct meaning of addition of op_estiamtes.
+        as there is not a universally correct meaning of addition of op_estimates.
         """
         self.cost += other.cost
         self.time += other.time
         self.quality *= other.quality
-        for dataclass_field in ["cost_lower_bound", "cost_upper_bound", "time_lower_bound", "time_upper_bound"]:
-            if getattr(self, dataclass_field) is not None and getattr(other, dataclass_field) is not None:
-                summation = getattr(self, dataclass_field) + getattr(other, dataclass_field)
-                setattr(self, dataclass_field, summation)
+        for model_field in ["cost_lower_bound", "cost_upper_bound", "time_lower_bound", "time_upper_bound"]:
+            if getattr(self, model_field) is not None and getattr(other, model_field) is not None:
+                summation = getattr(self, model_field) + getattr(other, model_field)
+                setattr(self, model_field, summation)
-        for dataclass_field in ["quality_lower_bound", "quality_upper_bound"]:
-            if getattr(self, dataclass_field) is not None and getattr(other, dataclass_field) is not None:
-                product = getattr(self, dataclass_field) * getattr(other, dataclass_field)
-                setattr(self, dataclass_field, product)
+        for model_field in ["quality_lower_bound", "quality_upper_bound"]:
+            if getattr(self, model_field) is not None and getattr(other, model_field) is not None:
+                product = getattr(self, model_field) * getattr(other, model_field)
+                setattr(self, model_field, product)
         return self
     def __add__(self, other: PlanCost) -> PlanCost:
         """
         NOTE: we currently assume the updating of the op_estimates are handled by the caller
-        as there is not a universally correct meaning of addition of op_estiamtes.
+        as there is not a universally correct meaning of addition of op_estimates.
         """
         dct = {
             field: getattr(self, field) + getattr(other, field)
@@ -874,7 +1010,7 @@ class PlanCost:
                 "time_upper_bound",
             ]
         }
-        for dataclass_field in ["quality", "quality_lower_bound", "quality_upper_bound"]:
-            dct[dataclass_field] = getattr(self, dataclass_field) * getattr(other, dataclass_field)
+        for model_field in ["quality", "quality_lower_bound", "quality_upper_bound"]:
+            dct[model_field] = getattr(self, model_field) * getattr(other, model_field)
         return PlanCost(**dct)

palimpzest 0.7.20__py3-none-any.whl → 0.8.0__py3-none-any.whl

palimpzest 0.7.20__py3-none-any.whl → 0.8.0__py3-none-any.whl