palimpzest 0.8.7__py3-none-any.whl → 1.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. palimpzest/constants.py +13 -4
  2. palimpzest/core/data/dataset.py +75 -5
  3. palimpzest/core/elements/groupbysig.py +5 -1
  4. palimpzest/core/elements/records.py +16 -7
  5. palimpzest/core/lib/schemas.py +26 -3
  6. palimpzest/core/models.py +4 -4
  7. palimpzest/prompts/aggregate_prompts.py +99 -0
  8. palimpzest/prompts/prompt_factory.py +162 -75
  9. palimpzest/prompts/utils.py +38 -1
  10. palimpzest/prompts/validator.py +24 -24
  11. palimpzest/query/execution/all_sample_execution_strategy.py +1 -1
  12. palimpzest/query/execution/execution_strategy.py +8 -8
  13. palimpzest/query/execution/mab_execution_strategy.py +30 -11
  14. palimpzest/query/execution/parallel_execution_strategy.py +31 -7
  15. palimpzest/query/execution/single_threaded_execution_strategy.py +23 -6
  16. palimpzest/query/generators/generators.py +9 -7
  17. palimpzest/query/operators/__init__.py +10 -6
  18. palimpzest/query/operators/aggregate.py +394 -10
  19. palimpzest/query/operators/convert.py +1 -1
  20. palimpzest/query/operators/join.py +279 -23
  21. palimpzest/query/operators/logical.py +36 -11
  22. palimpzest/query/operators/mixture_of_agents.py +3 -1
  23. palimpzest/query/operators/physical.py +5 -2
  24. palimpzest/query/operators/{retrieve.py → topk.py} +10 -10
  25. palimpzest/query/optimizer/__init__.py +11 -3
  26. palimpzest/query/optimizer/cost_model.py +5 -5
  27. palimpzest/query/optimizer/optimizer.py +3 -2
  28. palimpzest/query/optimizer/plan.py +2 -3
  29. palimpzest/query/optimizer/rules.py +73 -13
  30. palimpzest/query/optimizer/tasks.py +4 -4
  31. palimpzest/utils/progress.py +19 -17
  32. palimpzest/validator/validator.py +7 -7
  33. {palimpzest-0.8.7.dist-info → palimpzest-1.0.0.dist-info}/METADATA +26 -66
  34. {palimpzest-0.8.7.dist-info → palimpzest-1.0.0.dist-info}/RECORD +37 -36
  35. {palimpzest-0.8.7.dist-info → palimpzest-1.0.0.dist-info}/WHEEL +0 -0
  36. {palimpzest-0.8.7.dist-info → palimpzest-1.0.0.dist-info}/licenses/LICENSE +0 -0
  37. {palimpzest-0.8.7.dist-info → palimpzest-1.0.0.dist-info}/top_level.txt +0 -0
@@ -1,12 +1,23 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import contextlib
3
4
  import time
4
-
5
- from palimpzest.constants import NAIVE_EST_NUM_GROUPS, AggFunc
5
+ from typing import Any
6
+
7
+ from palimpzest.constants import (
8
+ MODEL_CARDS,
9
+ NAIVE_EST_NUM_GROUPS,
10
+ NAIVE_EST_NUM_INPUT_TOKENS,
11
+ NAIVE_EST_NUM_OUTPUT_TOKENS,
12
+ AggFunc,
13
+ Model,
14
+ PromptStrategy,
15
+ )
6
16
  from palimpzest.core.elements.groupbysig import GroupBySig
7
17
  from palimpzest.core.elements.records import DataRecord, DataRecordSet
8
- from palimpzest.core.lib.schemas import Average, Count
18
+ from palimpzest.core.lib.schemas import Average, Count, Max, Min, Sum
9
19
  from palimpzest.core.models import OperatorCostEstimates, RecordOpStats
20
+ from palimpzest.query.generators.generators import Generator
10
21
  from palimpzest.query.operators.physical import PhysicalOperator
11
22
 
12
23
 
@@ -58,6 +69,16 @@ class ApplyGroupByOp(AggregateOp):
58
69
  return 0
59
70
  elif func.lower() == "average":
60
71
  return (0, 0)
72
+ elif func.lower() == "sum":
73
+ return 0
74
+ elif func.lower() == "min":
75
+ return float("inf")
76
+ elif func.lower() == "max":
77
+ return float("-inf")
78
+ elif func.lower() == "list":
79
+ return []
80
+ elif func.lower() == "set":
81
+ return set()
61
82
  else:
62
83
  raise Exception("Unknown agg function " + func)
63
84
 
@@ -66,16 +87,34 @@ class ApplyGroupByOp(AggregateOp):
66
87
  if func.lower() == "count":
67
88
  return state + 1
68
89
  elif func.lower() == "average":
69
- sum, cnt = state
90
+ sum_, cnt = state
91
+ if val is None:
92
+ return (sum_, cnt)
93
+ return (sum_ + val, cnt + 1)
94
+ elif func.lower() == "sum":
70
95
  if val is None:
71
- return (sum, cnt)
72
- return (sum + val, cnt + 1)
96
+ return state
97
+ return state + sum(val) if isinstance(val, list) else state + val
98
+ elif func.lower() == "min":
99
+ if val is None:
100
+ return state
101
+ return min(state, min(val) if isinstance(val, list) else val)
102
+ elif func.lower() == "max":
103
+ if val is None:
104
+ return state
105
+ return max(state, max(val) if isinstance(val, list) else val)
106
+ elif func.lower() == "list":
107
+ state.append(val)
108
+ return state
109
+ elif func.lower() == "set":
110
+ state.add(val)
111
+ return state
73
112
  else:
74
113
  raise Exception("Unknown agg function " + func)
75
114
 
76
115
  @staticmethod
77
116
  def agg_final(func, state):
78
- if func.lower() == "count":
117
+ if func.lower() in ["count", "sum", "min", "max", "list", "set"]:
79
118
  return state
80
119
  elif func.lower() == "average":
81
120
  sum, cnt = state
@@ -156,12 +195,17 @@ class AverageAggregateOp(AggregateOp):
156
195
 
157
196
  def __init__(self, agg_func: AggFunc, *args, **kwargs):
158
197
  # enforce that output schema is correct
159
- assert kwargs["output_schema"] == Average, "AverageAggregateOp requires output_schema to be Average"
198
+ assert kwargs["output_schema"].model_fields.keys() == Average.model_fields.keys(), "AverageAggregateOp requires output_schema to be Average"
160
199
 
161
200
  # enforce that input schema is a single numeric field
162
201
  input_field_types = list(kwargs["input_schema"].model_fields.values())
163
202
  assert len(input_field_types) == 1, "AverageAggregateOp requires input_schema to have exactly one field"
164
- numeric_field_types = [bool, int, float, bool | None, int | None, float | None, int | float, int | float | None]
203
+ numeric_field_types = [
204
+ bool, int, float, int | float,
205
+ bool | None, int | None, float | None, int | float | None,
206
+ bool | Any, int | Any, float | Any, int | float | Any,
207
+ bool | None | Any, int | None | Any, float | None | Any, int | float | None | Any,
208
+ ]
165
209
  is_numeric = input_field_types[0].annotation in numeric_field_types
166
210
  assert is_numeric, f"AverageAggregateOp requires input_schema to have a numeric field type, i.e. one of: {numeric_field_types}\nGot: {input_field_types[0]}"
167
211
 
@@ -225,12 +269,88 @@ class AverageAggregateOp(AggregateOp):
225
269
  return DataRecordSet([dr], [record_op_stats])
226
270
 
227
271
 
272
class SumAggregateOp(AggregateOp):
    """Aggregate operator which sums the single numeric input field across all input records."""
    # NOTE: we don't actually need / use agg_func here (yet)

    def __init__(self, agg_func: AggFunc, *args, **kwargs):
        # enforce that output schema is correct
        assert kwargs["output_schema"].model_fields.keys() == Sum.model_fields.keys(), "SumAggregateOp requires output_schema to be Sum"

        # enforce that input schema is a single numeric field
        input_field_types = list(kwargs["input_schema"].model_fields.values())
        assert len(input_field_types) == 1, "SumAggregateOp requires input_schema to have exactly one field"
        numeric_field_types = [
            bool, int, float, int | float,
            bool | None, int | None, float | None, int | float | None,
            bool | Any, int | Any, float | Any, int | float | Any,
            bool | None | Any, int | None | Any, float | None | Any, int | float | None | Any,
        ]
        is_numeric = input_field_types[0].annotation in numeric_field_types
        assert is_numeric, f"SumAggregateOp requires input_schema to have a numeric field type, i.e. one of: {numeric_field_types}\nGot: {input_field_types[0]}"

        # call parent constructor
        super().__init__(*args, **kwargs)
        self.agg_func = agg_func

    def __str__(self):
        op = super().__str__()
        op += f" Function: {str(self.agg_func)}\n"
        return op

    def get_id_params(self):
        id_params = super().get_id_params()
        return {"agg_func": str(self.agg_func), **id_params}

    def get_op_params(self):
        op_params = super().get_op_params()
        return {"agg_func": self.agg_func, **op_params}

    def naive_cost_estimates(self, source_op_cost_estimates: OperatorCostEstimates) -> OperatorCostEstimates:
        # for now, assume applying the aggregation takes negligible additional time (and no cost in USD)
        return OperatorCostEstimates(
            cardinality=1,
            time_per_record=0,
            cost_per_record=0,
            quality=1.0,
        )

    def __call__(self, candidates: list[DataRecord]) -> DataRecordSet:
        """Sum the (single) field value of each candidate record into one Sum output record."""
        start_time = time.time()

        # NOTE: we currently do not guarantee that input values conform to their specified type;
        # as a result, we simply omit any values which do not parse to a float from the sum
        # NOTE: right now we perform a check in the constructor which enforces that the input_schema
        #       has a single field which is numeric in nature; in the future we may want to have a
        #       cleaner way of computing the value (rather than `float(list(candidate...))` below)
        summation = 0
        for candidate in candidates:
            with contextlib.suppress(Exception):
                summation += float(list(candidate.to_dict().values())[0])
        data_item = Sum(sum=summation)
        dr = DataRecord.from_agg_parents(data_item, parent_records=candidates)

        # create RecordOpStats object
        record_op_stats = RecordOpStats(
            record_id=dr._id,
            record_parent_ids=dr._parent_ids,
            record_source_indices=dr._source_indices,
            record_state=dr.to_dict(include_bytes=False),
            full_op_id=self.get_full_op_id(),
            logical_op_id=self.logical_op_id,
            op_name=self.op_name(),
            time_per_record=time.time() - start_time,
            cost_per_record=0.0,
        )

        return DataRecordSet([dr], [record_op_stats])
+ return DataRecordSet([dr], [record_op_stats])
346
+
347
+
228
348
  class CountAggregateOp(AggregateOp):
229
349
  # NOTE: we don't actually need / use agg_func here (yet)
230
350
 
231
351
  def __init__(self, agg_func: AggFunc, *args, **kwargs):
232
352
  # enforce that output schema is correct
233
- assert kwargs["output_schema"] == Count, "CountAggregateOp requires output_schema to be Count"
353
+ assert kwargs["output_schema"].model_fields.keys() == Count.model_fields.keys(), "CountAggregateOp requires output_schema to be Count"
234
354
 
235
355
  # call parent constructor
236
356
  super().__init__(*args, **kwargs)
@@ -280,3 +400,267 @@ class CountAggregateOp(AggregateOp):
280
400
  )
281
401
 
282
402
  return DataRecordSet([dr], [record_op_stats])
403
+
404
+
405
class MinAggregateOp(AggregateOp):
    """Aggregate operator which computes the minimum of the single numeric input field."""
    # NOTE: we don't actually need / use agg_func here (yet)

    def __init__(self, agg_func: AggFunc, *args, **kwargs):
        # enforce that output schema is correct
        assert kwargs["output_schema"].model_fields.keys() == Min.model_fields.keys(), "MinAggregateOp requires output_schema to be Min"

        # call parent constructor
        super().__init__(*args, **kwargs)
        self.agg_func = agg_func

    def __str__(self):
        op = super().__str__()
        op += f" Function: {str(self.agg_func)}\n"
        return op

    def get_id_params(self):
        id_params = super().get_id_params()
        return {"agg_func": str(self.agg_func), **id_params}

    def get_op_params(self):
        op_params = super().get_op_params()
        return {"agg_func": self.agg_func, **op_params}

    def naive_cost_estimates(self, source_op_cost_estimates: OperatorCostEstimates) -> OperatorCostEstimates:
        # for now, assume applying the aggregation takes negligible additional time (and no cost in USD)
        return OperatorCostEstimates(
            cardinality=1,
            time_per_record=0,
            cost_per_record=0,
            quality=1.0,
        )

    def __call__(self, candidates: list[DataRecord]) -> DataRecordSet:
        """Compute the minimum field value across candidates; emits None if no value parses to a float."""
        start_time = time.time()

        # BUGFIX: the accumulator previously shadowed the builtin `min`, so the
        # call `min(...)` inside the loop invoked a float and raised TypeError;
        # values which do not parse to a float are skipped
        minimum = float("inf")
        for candidate in candidates:
            with contextlib.suppress(Exception):
                minimum = min(float(list(candidate.to_dict().values())[0]), minimum)
        data_item = Min(min=minimum if minimum != float("inf") else None)
        dr = DataRecord.from_agg_parents(data_item, parent_records=candidates)

        # create RecordOpStats object
        # NOTE(review): switched dr.id / dr.parent_ids / dr.source_indices to the
        # underscore-prefixed attributes used by the sibling aggregate operators
        # (SumAggregateOp, SemanticAggregate) — confirm against DataRecord
        record_op_stats = RecordOpStats(
            record_id=dr._id,
            record_parent_ids=dr._parent_ids,
            record_source_indices=dr._source_indices,
            record_state=dr.to_dict(include_bytes=False),
            full_op_id=self.get_full_op_id(),
            logical_op_id=self.logical_op_id,
            op_name=self.op_name(),
            time_per_record=time.time() - start_time,
            cost_per_record=0.0,
            op_details={k: str(v) for k, v in self.get_id_params().items()},
        )

        return DataRecordSet([dr], [record_op_stats])
466
+
467
+
468
class MaxAggregateOp(AggregateOp):
    """Aggregate operator which computes the maximum of the single numeric input field."""
    # NOTE: we don't actually need / use agg_func here (yet)

    def __init__(self, agg_func: AggFunc, *args, **kwargs):
        # enforce that output schema is correct
        assert kwargs["output_schema"].model_fields.keys() == Max.model_fields.keys(), "MaxAggregateOp requires output_schema to be Max"

        # call parent constructor
        super().__init__(*args, **kwargs)
        self.agg_func = agg_func

    def __str__(self):
        op = super().__str__()
        op += f" Function: {str(self.agg_func)}\n"
        return op

    def get_id_params(self):
        id_params = super().get_id_params()
        return {"agg_func": str(self.agg_func), **id_params}

    def get_op_params(self):
        op_params = super().get_op_params()
        return {"agg_func": self.agg_func, **op_params}

    def naive_cost_estimates(self, source_op_cost_estimates: OperatorCostEstimates) -> OperatorCostEstimates:
        # for now, assume applying the aggregation takes negligible additional time (and no cost in USD)
        return OperatorCostEstimates(
            cardinality=1,
            time_per_record=0,
            cost_per_record=0,
            quality=1.0,
        )

    def __call__(self, candidates: list[DataRecord]) -> DataRecordSet:
        """Compute the maximum field value across candidates; emits None if no value parses to a float."""
        start_time = time.time()

        # BUGFIX: the accumulator previously shadowed the builtin `max`, so the
        # call `max(...)` inside the loop invoked a float and raised TypeError;
        # values which do not parse to a float are skipped
        maximum = float("-inf")
        for candidate in candidates:
            with contextlib.suppress(Exception):
                maximum = max(float(list(candidate.to_dict().values())[0]), maximum)
        data_item = Max(max=maximum if maximum != float("-inf") else None)
        dr = DataRecord.from_agg_parents(data_item, parent_records=candidates)

        # create RecordOpStats object
        # NOTE(review): switched dr.id / dr.parent_ids / dr.source_indices to the
        # underscore-prefixed attributes used by the sibling aggregate operators
        # (SumAggregateOp, SemanticAggregate) — confirm against DataRecord
        record_op_stats = RecordOpStats(
            record_id=dr._id,
            record_parent_ids=dr._parent_ids,
            record_source_indices=dr._source_indices,
            record_state=dr.to_dict(include_bytes=False),
            full_op_id=self.get_full_op_id(),
            logical_op_id=self.logical_op_id,
            op_name=self.op_name(),
            time_per_record=time.time() - start_time,
            cost_per_record=0.0,
            op_details={k: str(v) for k, v in self.get_id_params().items()},
        )

        return DataRecordSet([dr], [record_op_stats])
530
+
531
+
532
class SemanticAggregate(AggregateOp):
    """Aggregate operator which uses an LLM to compute an aggregation described by a
    natural-language instruction (`agg_str`) over all input records."""

    def __init__(self, agg_str: str, model: Model, prompt_strategy: PromptStrategy = PromptStrategy.AGG, reasoning_effort: str | None = None, *args, **kwargs):
        # call parent constructor
        super().__init__(*args, **kwargs)
        self.agg_str = agg_str
        self.model = model
        self.prompt_strategy = prompt_strategy
        self.reasoning_effort = reasoning_effort
        if model is not None:
            self.generator = Generator(model, prompt_strategy, reasoning_effort, self.api_base)

    def __str__(self):
        op = super().__str__()
        op += f" Prompt Strategy: {self.prompt_strategy}\n"
        op += f" Reasoning Effort: {self.reasoning_effort}\n"
        op += f" Agg: {str(self.agg_str)}\n"
        return op

    def get_id_params(self):
        id_params = super().get_id_params()
        id_params = {
            "agg_str": self.agg_str,
            "model": None if self.model is None else self.model.value,
            "prompt_strategy": None if self.prompt_strategy is None else self.prompt_strategy.value,
            "reasoning_effort": self.reasoning_effort,
            **id_params,
        }

        return id_params

    def get_op_params(self):
        op_params = super().get_op_params()
        op_params = {
            "agg_str": self.agg_str,
            "model": self.model,
            "prompt_strategy": self.prompt_strategy,
            "reasoning_effort": self.reasoning_effort,
            **op_params,
        }

        return op_params

    def get_model_name(self) -> str:
        return self.model.value

    def naive_cost_estimates(self, source_op_cost_estimates: OperatorCostEstimates) -> OperatorCostEstimates:
        """
        Compute naive cost estimates for the SemanticAggregate operation. Implicitly, these
        estimates assume a single LLM call which consumes all input records at once (hence
        the input-token estimate scales with the source cardinality while the output-token
        estimate does not).
        """
        # estimate number of input and output tokens from source
        est_num_input_tokens = NAIVE_EST_NUM_INPUT_TOKENS * source_op_cost_estimates.cardinality
        est_num_output_tokens = NAIVE_EST_NUM_OUTPUT_TOKENS

        # get est. of conversion time per record from model card;
        model_name = self.model.value
        model_conversion_time_per_record = MODEL_CARDS[model_name]["seconds_per_output_token"] * est_num_output_tokens

        # get est. of conversion cost (in USD) per record from model card;
        # audio prompts are billed at the audio input-token rate
        usd_per_input_token = MODEL_CARDS[model_name].get("usd_per_input_token")
        if getattr(self, "prompt_strategy", None) is not None and self.prompt_strategy.is_audio_prompt():
            usd_per_input_token = MODEL_CARDS[model_name]["usd_per_audio_input_token"]

        model_conversion_usd_per_record = (
            usd_per_input_token * est_num_input_tokens
            + MODEL_CARDS[model_name]["usd_per_output_token"] * est_num_output_tokens
        )

        # estimate quality of output based on the strength of the model being used
        quality = (MODEL_CARDS[model_name]["overall"] / 100.0)

        return OperatorCostEstimates(
            cardinality=1.0,
            time_per_record=model_conversion_time_per_record,
            cost_per_record=model_conversion_usd_per_record,
            quality=quality,
        )

    def __call__(self, candidates: list[DataRecord]) -> DataRecordSet:
        """Run the LLM aggregation over all candidates and return a single-record DataRecordSet."""
        start_time = time.time()

        # if candidates is an empty list, return an empty DataRecordSet
        if len(candidates) == 0:
            return DataRecordSet([], [])

        # get the set of input fields to use for the operation
        input_fields = self.get_input_fields()

        # get the set of output fields to use for the operation
        fields_to_generate = self.get_fields_to_generate(candidates[0])
        fields = {field: field_type for field, field_type in self.output_schema.model_fields.items() if field in fields_to_generate}

        # construct kwargs for generation
        gen_kwargs = {"project_cols": input_fields, "output_schema": self.output_schema, "agg_instruction": self.agg_str}

        # generate outputs for all fields in a single query
        field_answers, _, generation_stats, _ = self.generator(candidates, fields, **gen_kwargs)
        assert all([field in field_answers for field in fields]), "Not all fields were generated!"

        # construct data record for the output
        # NOTE(review): only the first generated field is materialized into the output
        # record — presumably the aggregation schema has a single field; confirm
        field, value = fields_to_generate[0], field_answers[fields_to_generate[0]][0]
        data_item = self.output_schema(**{field: value})
        dr = DataRecord.from_agg_parents(data_item, parent_records=candidates)

        # create RecordOpStats object
        record_op_stats = RecordOpStats(
            record_id=dr._id,
            record_parent_ids=dr._parent_ids,
            record_source_indices=dr._source_indices,
            record_state=dr.to_dict(include_bytes=False),
            full_op_id=self.get_full_op_id(),
            logical_op_id=self.logical_op_id,
            op_name=self.op_name(),
            time_per_record=time.time() - start_time,
            cost_per_record=generation_stats.cost_per_record,
            model_name=self.get_model_name(),
            answer={field: value},
            input_fields=input_fields,
            generated_fields=fields_to_generate,
            total_input_tokens=generation_stats.total_input_tokens,
            total_output_tokens=generation_stats.total_output_tokens,
            total_input_cost=generation_stats.total_input_cost,
            total_output_cost=generation_stats.total_output_cost,
            llm_call_duration_secs=generation_stats.llm_call_duration_secs,
            fn_call_duration_secs=generation_stats.fn_call_duration_secs,
            total_llm_calls=generation_stats.total_llm_calls,
            total_embedding_llm_calls=generation_stats.total_embedding_llm_calls,
            image_operation=self.is_image_op(),
            op_details={k: str(v) for k, v in self.get_id_params().items()},
        )

        return DataRecordSet([dr], [record_op_stats])
@@ -320,7 +320,7 @@ class LLMConvert(ConvertOp):
320
320
  est_num_output_tokens = NAIVE_EST_NUM_OUTPUT_TOKENS
321
321
 
322
322
  # get est. of conversion time per record from model card;
323
- model_name = self.model.value if getattr(self, "model", None) is not None else Model.GPT_4o_MINI.value
323
+ model_name = self.model.value
324
324
  model_conversion_time_per_record = MODEL_CARDS[model_name]["seconds_per_output_token"] * est_num_output_tokens
325
325
 
326
326
  # get est. of conversion cost (in USD) per record from model card