palimpzest 0.8.6__py3-none-any.whl → 0.9.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- palimpzest/constants.py +12 -4
- palimpzest/core/data/dataset.py +42 -0
- palimpzest/core/elements/records.py +5 -1
- palimpzest/core/lib/schemas.py +13 -0
- palimpzest/prompts/aggregate_prompts.py +99 -0
- palimpzest/prompts/prompt_factory.py +163 -75
- palimpzest/prompts/utils.py +38 -1
- palimpzest/prompts/validator.py +24 -24
- palimpzest/query/generators/generators.py +9 -7
- palimpzest/query/operators/__init__.py +4 -1
- palimpzest/query/operators/aggregate.py +285 -6
- palimpzest/query/operators/logical.py +17 -4
- palimpzest/query/optimizer/__init__.py +4 -0
- palimpzest/query/optimizer/rules.py +42 -2
- palimpzest/validator/validator.py +7 -7
- {palimpzest-0.8.6.dist-info → palimpzest-0.9.0.dist-info}/METADATA +1 -1
- {palimpzest-0.8.6.dist-info → palimpzest-0.9.0.dist-info}/RECORD +20 -19
- {palimpzest-0.8.6.dist-info → palimpzest-0.9.0.dist-info}/WHEEL +0 -0
- {palimpzest-0.8.6.dist-info → palimpzest-0.9.0.dist-info}/licenses/LICENSE +0 -0
- {palimpzest-0.8.6.dist-info → palimpzest-0.9.0.dist-info}/top_level.txt +0 -0
palimpzest/prompts/validator.py
CHANGED
|
@@ -22,17 +22,17 @@ OUTPUT FIELDS:
|
|
|
22
22
|
- birth_year: the year the scientist was born
|
|
23
23
|
|
|
24
24
|
CONTEXT:
|
|
25
|
-
{
|
|
25
|
+
{
|
|
26
26
|
"text": "Augusta Ada King, Countess of Lovelace, also known as Ada Lovelace, was an English mathematician and writer chiefly known for her work on Charles Babbage's proposed mechanical general-purpose computer, the Analytical Engine. She was the first to recognise that the machine had applications beyond pure calculation.",
|
|
27
27
|
"birthday": "December 10, 1815"
|
|
28
|
-
}
|
|
28
|
+
}
|
|
29
29
|
|
|
30
30
|
OUTPUT:
|
|
31
31
|
--------
|
|
32
|
-
{
|
|
32
|
+
{
|
|
33
33
|
"name": "Charles Babbage",
|
|
34
34
|
"birth_year": 1815
|
|
35
|
-
}
|
|
35
|
+
}
|
|
36
36
|
|
|
37
37
|
EVALUATION: {"name": 0.0, "birth_year": 1.0}
|
|
38
38
|
|
|
@@ -66,18 +66,18 @@ OUTPUT FIELDS:
|
|
|
66
66
|
- person_in_image: true if a person is in the image and false otherwise
|
|
67
67
|
|
|
68
68
|
CONTEXT:
|
|
69
|
-
{
|
|
69
|
+
{
|
|
70
70
|
"image": <bytes>,
|
|
71
71
|
"photographer": "CameraEnthusiast1"
|
|
72
|
-
}
|
|
72
|
+
}
|
|
73
73
|
<image content provided here; assume in this example the image shows a dog and a cat playing>
|
|
74
74
|
|
|
75
75
|
OUTPUT:
|
|
76
76
|
--------
|
|
77
|
-
{
|
|
77
|
+
{
|
|
78
78
|
"dog_in_image": true,
|
|
79
79
|
"person_in_image": true
|
|
80
|
-
}
|
|
80
|
+
}
|
|
81
81
|
|
|
82
82
|
EVALUATION: {"dog_in_image": 1.0, "person_in_image": 0.0}
|
|
83
83
|
|
|
@@ -113,22 +113,22 @@ OUTPUT FIELDS:
|
|
|
113
113
|
- birth_year: the year the scientist was born
|
|
114
114
|
|
|
115
115
|
CONTEXT:
|
|
116
|
-
{
|
|
116
|
+
{
|
|
117
117
|
"text": "Augusta Ada King, Countess of Lovelace, also known as Ada Lovelace, was an English mathematician and writer chiefly known for her work on Charles Babbage's proposed mechanical general-purpose computer, the Analytical Engine. She was the first to recognise that the machine had applications beyond pure calculation.",
|
|
118
118
|
"birthdays": "...Lovelace was born on December 10, 1815, almost exactly 24 years after Babbage's birth on 26 December 1791..."
|
|
119
|
-
}
|
|
119
|
+
}
|
|
120
120
|
|
|
121
121
|
OUTPUTS:
|
|
122
122
|
--------
|
|
123
123
|
[
|
|
124
|
-
{
|
|
124
|
+
{
|
|
125
125
|
"name": "Ada Lovelace",
|
|
126
126
|
"birth_year": 1815
|
|
127
|
-
}
|
|
128
|
-
{
|
|
127
|
+
},
|
|
128
|
+
{
|
|
129
129
|
"name": "Charles Babbage",
|
|
130
130
|
"birth_year": 1790
|
|
131
|
-
}
|
|
131
|
+
}
|
|
132
132
|
]
|
|
133
133
|
|
|
134
134
|
EVALUATION: [{"name": 1.0, "birth_year": 1.0}, {"name": 1.0, "birth_year": 0.0}]
|
|
@@ -163,23 +163,23 @@ OUTPUT FIELDS:
|
|
|
163
163
|
- animal_is_canine: true if the animal is a canine and false otherwise
|
|
164
164
|
|
|
165
165
|
CONTEXT:
|
|
166
|
-
{
|
|
166
|
+
{
|
|
167
167
|
"image": <bytes>,
|
|
168
168
|
"photographer": "CameraEnthusiast1"
|
|
169
|
-
}
|
|
169
|
+
}
|
|
170
170
|
<image content provided here; assume in this example the image shows a dog and a cat playing>
|
|
171
171
|
|
|
172
172
|
OUTPUT:
|
|
173
173
|
--------
|
|
174
174
|
[
|
|
175
|
-
{
|
|
175
|
+
{
|
|
176
176
|
"animal": "dog",
|
|
177
177
|
"animal_is_canine": true
|
|
178
|
-
}
|
|
179
|
-
{
|
|
178
|
+
},
|
|
179
|
+
{
|
|
180
180
|
"animal": "cat",
|
|
181
181
|
"animal_is_canine": true
|
|
182
|
-
}
|
|
182
|
+
}
|
|
183
183
|
]
|
|
184
184
|
|
|
185
185
|
EVALUATION: [{"animal": 1.0, "animal_is_canine": 1.0}, {"animal": 1.0, "animal_is_canine": 0.0}]
|
|
@@ -214,20 +214,20 @@ OUTPUT FIELDS:
|
|
|
214
214
|
- related_scientists: list of scientists who perform similar work as the scientist described in the text
|
|
215
215
|
|
|
216
216
|
CONTEXT:
|
|
217
|
-
{
|
|
217
|
+
{
|
|
218
218
|
"text": "Augusta Ada King, Countess of Lovelace, also known as Ada Lovelace, was an English mathematician and writer chiefly known for her work on Charles Babbage's proposed mechanical general-purpose computer, the Analytical Engine. She was the first to recognise that the machine had applications beyond pure calculation.",
|
|
219
|
-
}
|
|
219
|
+
}
|
|
220
220
|
|
|
221
221
|
OUTPUT:
|
|
222
222
|
--------
|
|
223
|
-
{
|
|
223
|
+
{
|
|
224
224
|
"related_scientists": [
|
|
225
225
|
"Charles Babbage",
|
|
226
226
|
"Alan Turing",
|
|
227
227
|
"Charles Darwin",
|
|
228
228
|
"John von Neumann"
|
|
229
229
|
]
|
|
230
|
-
}
|
|
230
|
+
}
|
|
231
231
|
|
|
232
232
|
EVALUATION: {"related_scientists": 0.75}
|
|
233
233
|
|
|
@@ -296,9 +296,9 @@ class Generator(Generic[ContextType, InputType]):
|
|
|
296
296
|
|
|
297
297
|
return field_answers
|
|
298
298
|
|
|
299
|
-
def __call__(self, candidate: DataRecord, fields: dict[str, FieldInfo] | None, right_candidate: DataRecord | None = None, json_output: bool=True, **kwargs) -> GenerationOutput:
|
|
300
|
-
"""Take the input record (`candidate`), generate the output `fields`, and return the generated output."""
|
|
301
|
-
logger.debug(f"Generating for candidate {candidate} with fields {fields}")
|
|
299
|
+
def __call__(self, candidate: DataRecord | list[DataRecord], fields: dict[str, FieldInfo] | None, right_candidate: DataRecord | None = None, json_output: bool=True, **kwargs) -> GenerationOutput:
|
|
300
|
+
"""Take the input record(s) (`candidate`), generate the output `fields`, and return the generated output."""
|
|
301
|
+
logger.debug(f"Generating for candidate(s) {candidate} with fields {fields}")
|
|
302
302
|
|
|
303
303
|
# fields can only be None if the user provides an answer parser
|
|
304
304
|
fields_check = fields is not None or "parse_answer" in kwargs
|
|
@@ -338,7 +338,7 @@ class Generator(Generic[ContextType, InputType]):
|
|
|
338
338
|
reasoning_effort = "minimal" if self.reasoning_effort is None else self.reasoning_effort
|
|
339
339
|
completion_kwargs = {"reasoning_effort": reasoning_effort, **completion_kwargs}
|
|
340
340
|
if self.model.is_vllm_model():
|
|
341
|
-
completion_kwargs = {"api_base": self.api_base, **completion_kwargs}
|
|
341
|
+
completion_kwargs = {"api_base": self.api_base, "api_key": os.environ.get("VLLM_API_KEY", "fake-api-key"), **completion_kwargs}
|
|
342
342
|
completion = litellm.completion(model=self.model_name, messages=messages, **completion_kwargs)
|
|
343
343
|
end_time = time.time()
|
|
344
344
|
logger.debug(f"Generated completion in {end_time - start_time:.2f} seconds")
|
|
@@ -405,15 +405,17 @@ class Generator(Generic[ContextType, InputType]):
|
|
|
405
405
|
|
|
406
406
|
# pretty print prompt + full completion output for debugging
|
|
407
407
|
completion_text = completion.choices[0].message.content
|
|
408
|
-
prompt = ""
|
|
408
|
+
prompt, system_prompt = "", ""
|
|
409
409
|
for message in messages:
|
|
410
|
+
if message["role"] == "system":
|
|
411
|
+
system_prompt += message["content"] + "\n"
|
|
410
412
|
if message["role"] == "user":
|
|
411
413
|
if message["type"] == "text":
|
|
412
414
|
prompt += message["content"] + "\n"
|
|
413
415
|
elif message["type"] == "image":
|
|
414
|
-
prompt += "<image>\n"
|
|
416
|
+
prompt += "<image>\n" * len(message["content"])
|
|
415
417
|
elif message["type"] == "input_audio":
|
|
416
|
-
prompt += "<audio>\n"
|
|
418
|
+
prompt += "<audio>\n" * len(message["content"])
|
|
417
419
|
logger.debug(f"PROMPT:\n{prompt}")
|
|
418
420
|
logger.debug(Fore.GREEN + f"{completion_text}\n" + Style.RESET_ALL)
|
|
419
421
|
|
|
@@ -2,6 +2,9 @@ from palimpzest.query.operators.aggregate import AggregateOp as _AggregateOp
|
|
|
2
2
|
from palimpzest.query.operators.aggregate import ApplyGroupByOp as _ApplyGroupByOp
|
|
3
3
|
from palimpzest.query.operators.aggregate import AverageAggregateOp as _AverageAggregateOp
|
|
4
4
|
from palimpzest.query.operators.aggregate import CountAggregateOp as _CountAggregateOp
|
|
5
|
+
from palimpzest.query.operators.aggregate import MaxAggregateOp as _MaxAggregateOp
|
|
6
|
+
from palimpzest.query.operators.aggregate import MinAggregateOp as _MinAggregateOp
|
|
7
|
+
from palimpzest.query.operators.aggregate import SemanticAggregate as _SemanticAggregate
|
|
5
8
|
from palimpzest.query.operators.convert import ConvertOp as _ConvertOp
|
|
6
9
|
from palimpzest.query.operators.convert import LLMConvert as _LLMConvert
|
|
7
10
|
from palimpzest.query.operators.convert import LLMConvertBonded as _LLMConvertBonded
|
|
@@ -77,7 +80,7 @@ LOGICAL_OPERATORS = [
|
|
|
77
80
|
|
|
78
81
|
PHYSICAL_OPERATORS = (
|
|
79
82
|
# aggregate
|
|
80
|
-
[_AggregateOp, _ApplyGroupByOp, _AverageAggregateOp, _CountAggregateOp]
|
|
83
|
+
[_AggregateOp, _ApplyGroupByOp, _AverageAggregateOp, _CountAggregateOp, _MaxAggregateOp, _MinAggregateOp, _SemanticAggregate]
|
|
81
84
|
# convert
|
|
82
85
|
+ [_ConvertOp, _NonLLMConvert, _LLMConvert, _LLMConvertBonded]
|
|
83
86
|
# critique and refine
|
|
@@ -1,12 +1,22 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
import time
|
|
4
|
-
|
|
5
|
-
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
from palimpzest.constants import (
|
|
7
|
+
MODEL_CARDS,
|
|
8
|
+
NAIVE_EST_NUM_GROUPS,
|
|
9
|
+
NAIVE_EST_NUM_INPUT_TOKENS,
|
|
10
|
+
NAIVE_EST_NUM_OUTPUT_TOKENS,
|
|
11
|
+
AggFunc,
|
|
12
|
+
Model,
|
|
13
|
+
PromptStrategy,
|
|
14
|
+
)
|
|
6
15
|
from palimpzest.core.elements.groupbysig import GroupBySig
|
|
7
16
|
from palimpzest.core.elements.records import DataRecord, DataRecordSet
|
|
8
|
-
from palimpzest.core.lib.schemas import Average, Count
|
|
17
|
+
from palimpzest.core.lib.schemas import Average, Count, Max, Min
|
|
9
18
|
from palimpzest.core.models import OperatorCostEstimates, RecordOpStats
|
|
19
|
+
from palimpzest.query.generators.generators import Generator
|
|
10
20
|
from palimpzest.query.operators.physical import PhysicalOperator
|
|
11
21
|
|
|
12
22
|
|
|
@@ -156,12 +166,17 @@ class AverageAggregateOp(AggregateOp):
|
|
|
156
166
|
|
|
157
167
|
def __init__(self, agg_func: AggFunc, *args, **kwargs):
|
|
158
168
|
# enforce that output schema is correct
|
|
159
|
-
assert kwargs["output_schema"] == Average, "AverageAggregateOp requires output_schema to be Average"
|
|
169
|
+
assert kwargs["output_schema"].model_fields.keys() == Average.model_fields.keys(), "AverageAggregateOp requires output_schema to be Average"
|
|
160
170
|
|
|
161
171
|
# enforce that input schema is a single numeric field
|
|
162
172
|
input_field_types = list(kwargs["input_schema"].model_fields.values())
|
|
163
173
|
assert len(input_field_types) == 1, "AverageAggregateOp requires input_schema to have exactly one field"
|
|
164
|
-
numeric_field_types = [
|
|
174
|
+
numeric_field_types = [
|
|
175
|
+
bool, int, float, int | float,
|
|
176
|
+
bool | None, int | None, float | None, int | float | None,
|
|
177
|
+
bool | Any, int | Any, float | Any, int | float | Any,
|
|
178
|
+
bool | None | Any, int | None | Any, float | None | Any, int | float | None | Any,
|
|
179
|
+
]
|
|
165
180
|
is_numeric = input_field_types[0].annotation in numeric_field_types
|
|
166
181
|
assert is_numeric, f"AverageAggregateOp requires input_schema to have a numeric field type, i.e. one of: {numeric_field_types}\nGot: {input_field_types[0]}"
|
|
167
182
|
|
|
@@ -230,7 +245,7 @@ class CountAggregateOp(AggregateOp):
|
|
|
230
245
|
|
|
231
246
|
def __init__(self, agg_func: AggFunc, *args, **kwargs):
|
|
232
247
|
# enforce that output schema is correct
|
|
233
|
-
assert kwargs["output_schema"] == Count, "CountAggregateOp requires output_schema to be Count"
|
|
248
|
+
assert kwargs["output_schema"].model_fields.keys() == Count.model_fields.keys(), "CountAggregateOp requires output_schema to be Count"
|
|
234
249
|
|
|
235
250
|
# call parent constructor
|
|
236
251
|
super().__init__(*args, **kwargs)
|
|
@@ -280,3 +295,267 @@ class CountAggregateOp(AggregateOp):
|
|
|
280
295
|
)
|
|
281
296
|
|
|
282
297
|
return DataRecordSet([dr], [record_op_stats])
|
|
298
|
+
|
|
299
|
+
|
|
300
|
+
class MinAggregateOp(AggregateOp):
|
|
301
|
+
# NOTE: we don't actually need / use agg_func here (yet)
|
|
302
|
+
|
|
303
|
+
def __init__(self, agg_func: AggFunc, *args, **kwargs):
|
|
304
|
+
# enforce that output schema is correct
|
|
305
|
+
assert kwargs["output_schema"].model_fields.keys() == Min.model_fields.keys(), "MinAggregateOp requires output_schema to be Min"
|
|
306
|
+
|
|
307
|
+
# call parent constructor
|
|
308
|
+
super().__init__(*args, **kwargs)
|
|
309
|
+
self.agg_func = agg_func
|
|
310
|
+
|
|
311
|
+
def __str__(self):
|
|
312
|
+
op = super().__str__()
|
|
313
|
+
op += f" Function: {str(self.agg_func)}\n"
|
|
314
|
+
return op
|
|
315
|
+
|
|
316
|
+
def get_id_params(self):
|
|
317
|
+
id_params = super().get_id_params()
|
|
318
|
+
return {"agg_func": str(self.agg_func), **id_params}
|
|
319
|
+
|
|
320
|
+
def get_op_params(self):
|
|
321
|
+
op_params = super().get_op_params()
|
|
322
|
+
return {"agg_func": self.agg_func, **op_params}
|
|
323
|
+
|
|
324
|
+
def naive_cost_estimates(self, source_op_cost_estimates: OperatorCostEstimates) -> OperatorCostEstimates:
|
|
325
|
+
# for now, assume applying the aggregation takes negligible additional time (and no cost in USD)
|
|
326
|
+
return OperatorCostEstimates(
|
|
327
|
+
cardinality=1,
|
|
328
|
+
time_per_record=0,
|
|
329
|
+
cost_per_record=0,
|
|
330
|
+
quality=1.0,
|
|
331
|
+
)
|
|
332
|
+
|
|
333
|
+
def __call__(self, candidates: list[DataRecord]) -> DataRecordSet:
|
|
334
|
+
start_time = time.time()
|
|
335
|
+
|
|
336
|
+
# create new DataRecord
|
|
337
|
+
min = float("inf")
|
|
338
|
+
for candidate in candidates:
|
|
339
|
+
try: # noqa: SIM105
|
|
340
|
+
min = v if (v := float(list(candidate.to_dict().values())[0])) < min else min
|
|
341
|
+
except Exception:
|
|
342
|
+
pass
|
|
343
|
+
data_item = Min(min=min if min != float("inf") else None)
|
|
344
|
+
dr = DataRecord.from_agg_parents(data_item, parent_records=candidates)
|
|
345
|
+
|
|
346
|
+
# create RecordOpStats object
|
|
347
|
+
record_op_stats = RecordOpStats(
|
|
348
|
+
record_id=dr.id,
|
|
349
|
+
record_parent_ids=dr.parent_ids,
|
|
350
|
+
record_source_indices=dr.source_indices,
|
|
351
|
+
record_state=dr.to_dict(include_bytes=False),
|
|
352
|
+
full_op_id=self.get_full_op_id(),
|
|
353
|
+
logical_op_id=self.logical_op_id,
|
|
354
|
+
op_name=self.op_name(),
|
|
355
|
+
time_per_record=time.time() - start_time,
|
|
356
|
+
cost_per_record=0.0,
|
|
357
|
+
op_details={k: str(v) for k, v in self.get_id_params().items()},
|
|
358
|
+
)
|
|
359
|
+
|
|
360
|
+
return DataRecordSet([dr], [record_op_stats])
|
|
361
|
+
|
|
362
|
+
|
|
363
|
+
class MaxAggregateOp(AggregateOp):
|
|
364
|
+
# NOTE: we don't actually need / use agg_func here (yet)
|
|
365
|
+
|
|
366
|
+
def __init__(self, agg_func: AggFunc, *args, **kwargs):
|
|
367
|
+
# enforce that output schema is correct
|
|
368
|
+
assert kwargs["output_schema"].model_fields.keys() == Max.model_fields.keys(), "MaxAggregateOp requires output_schema to be Max"
|
|
369
|
+
|
|
370
|
+
# call parent constructor
|
|
371
|
+
super().__init__(*args, **kwargs)
|
|
372
|
+
self.agg_func = agg_func
|
|
373
|
+
|
|
374
|
+
def __str__(self):
|
|
375
|
+
op = super().__str__()
|
|
376
|
+
op += f" Function: {str(self.agg_func)}\n"
|
|
377
|
+
return op
|
|
378
|
+
|
|
379
|
+
def get_id_params(self):
|
|
380
|
+
id_params = super().get_id_params()
|
|
381
|
+
return {"agg_func": str(self.agg_func), **id_params}
|
|
382
|
+
|
|
383
|
+
def get_op_params(self):
|
|
384
|
+
op_params = super().get_op_params()
|
|
385
|
+
return {"agg_func": self.agg_func, **op_params}
|
|
386
|
+
|
|
387
|
+
def naive_cost_estimates(self, source_op_cost_estimates: OperatorCostEstimates) -> OperatorCostEstimates:
|
|
388
|
+
# for now, assume applying the aggregation takes negligible additional time (and no cost in USD)
|
|
389
|
+
return OperatorCostEstimates(
|
|
390
|
+
cardinality=1,
|
|
391
|
+
time_per_record=0,
|
|
392
|
+
cost_per_record=0,
|
|
393
|
+
quality=1.0,
|
|
394
|
+
)
|
|
395
|
+
|
|
396
|
+
def __call__(self, candidates: list[DataRecord]) -> DataRecordSet:
|
|
397
|
+
start_time = time.time()
|
|
398
|
+
|
|
399
|
+
# create new DataRecord
|
|
400
|
+
|
|
401
|
+
max = float("-inf")
|
|
402
|
+
for candidate in candidates:
|
|
403
|
+
try: # noqa: SIM105
|
|
404
|
+
max = v if (v := float(list(candidate.to_dict().values())[0])) > max else max
|
|
405
|
+
except Exception:
|
|
406
|
+
pass
|
|
407
|
+
data_item = Max(max=max if max != float("-inf") else None)
|
|
408
|
+
dr = DataRecord.from_agg_parents(data_item, parent_records=candidates)
|
|
409
|
+
|
|
410
|
+
# create RecordOpStats object
|
|
411
|
+
record_op_stats = RecordOpStats(
|
|
412
|
+
record_id=dr.id,
|
|
413
|
+
record_parent_ids=dr.parent_ids,
|
|
414
|
+
record_source_indices=dr.source_indices,
|
|
415
|
+
record_state=dr.to_dict(include_bytes=False),
|
|
416
|
+
full_op_id=self.get_full_op_id(),
|
|
417
|
+
logical_op_id=self.logical_op_id,
|
|
418
|
+
op_name=self.op_name(),
|
|
419
|
+
time_per_record=time.time() - start_time,
|
|
420
|
+
cost_per_record=0.0,
|
|
421
|
+
op_details={k: str(v) for k, v in self.get_id_params().items()},
|
|
422
|
+
)
|
|
423
|
+
|
|
424
|
+
return DataRecordSet([dr], [record_op_stats])
|
|
425
|
+
|
|
426
|
+
|
|
427
|
+
class SemanticAggregate(AggregateOp):
|
|
428
|
+
|
|
429
|
+
def __init__(self, agg_str: str, model: Model, prompt_strategy: PromptStrategy = PromptStrategy.AGG, reasoning_effort: str | None = None, *args, **kwargs):
|
|
430
|
+
# call parent constructor
|
|
431
|
+
super().__init__(*args, **kwargs)
|
|
432
|
+
self.agg_str = agg_str
|
|
433
|
+
self.model = model
|
|
434
|
+
self.prompt_strategy = prompt_strategy
|
|
435
|
+
self.reasoning_effort = reasoning_effort
|
|
436
|
+
if model is not None:
|
|
437
|
+
self.generator = Generator(model, prompt_strategy, reasoning_effort, self.api_base)
|
|
438
|
+
|
|
439
|
+
def __str__(self):
|
|
440
|
+
op = super().__str__()
|
|
441
|
+
op += f" Prompt Strategy: {self.prompt_strategy}\n"
|
|
442
|
+
op += f" Reasoning Effort: {self.reasoning_effort}\n"
|
|
443
|
+
op += f" Agg: {str(self.agg_str)}\n"
|
|
444
|
+
return op
|
|
445
|
+
|
|
446
|
+
def get_id_params(self):
|
|
447
|
+
id_params = super().get_id_params()
|
|
448
|
+
id_params = {
|
|
449
|
+
"agg_str": self.agg_str,
|
|
450
|
+
"model": None if self.model is None else self.model.value,
|
|
451
|
+
"prompt_strategy": None if self.prompt_strategy is None else self.prompt_strategy.value,
|
|
452
|
+
"reasoning_effort": self.reasoning_effort,
|
|
453
|
+
**id_params,
|
|
454
|
+
}
|
|
455
|
+
|
|
456
|
+
return id_params
|
|
457
|
+
|
|
458
|
+
def get_op_params(self):
|
|
459
|
+
op_params = super().get_op_params()
|
|
460
|
+
op_params = {
|
|
461
|
+
"agg_str": self.agg_str,
|
|
462
|
+
"model": self.model,
|
|
463
|
+
"prompt_strategy": self.prompt_strategy,
|
|
464
|
+
"reasoning_effort": self.reasoning_effort,
|
|
465
|
+
**op_params,
|
|
466
|
+
}
|
|
467
|
+
|
|
468
|
+
return op_params
|
|
469
|
+
|
|
470
|
+
def get_model_name(self) -> str:
|
|
471
|
+
return self.model.value
|
|
472
|
+
|
|
473
|
+
def naive_cost_estimates(self, source_op_cost_estimates: OperatorCostEstimates) -> OperatorCostEstimates:
|
|
474
|
+
"""
|
|
475
|
+
Compute naive cost estimates for the LLMConvert operation. Implicitly, these estimates
|
|
476
|
+
assume the use of a single LLM call for each input record. Child classes of LLMConvert
|
|
477
|
+
may call this function through super() and adjust these estimates as needed (or they can
|
|
478
|
+
completely override this function).
|
|
479
|
+
"""
|
|
480
|
+
# estimate number of input and output tokens from source
|
|
481
|
+
est_num_input_tokens = NAIVE_EST_NUM_INPUT_TOKENS * source_op_cost_estimates.cardinality
|
|
482
|
+
est_num_output_tokens = NAIVE_EST_NUM_OUTPUT_TOKENS
|
|
483
|
+
|
|
484
|
+
# get est. of conversion time per record from model card;
|
|
485
|
+
model_name = self.model.value
|
|
486
|
+
model_conversion_time_per_record = MODEL_CARDS[model_name]["seconds_per_output_token"] * est_num_output_tokens
|
|
487
|
+
|
|
488
|
+
# get est. of conversion cost (in USD) per record from model card
|
|
489
|
+
usd_per_input_token = MODEL_CARDS[model_name].get("usd_per_input_token")
|
|
490
|
+
if getattr(self, "prompt_strategy", None) is not None and self.prompt_strategy.is_audio_prompt():
|
|
491
|
+
usd_per_input_token = MODEL_CARDS[model_name]["usd_per_audio_input_token"]
|
|
492
|
+
|
|
493
|
+
model_conversion_usd_per_record = (
|
|
494
|
+
usd_per_input_token * est_num_input_tokens
|
|
495
|
+
+ MODEL_CARDS[model_name]["usd_per_output_token"] * est_num_output_tokens
|
|
496
|
+
)
|
|
497
|
+
|
|
498
|
+
# estimate quality of output based on the strength of the model being used
|
|
499
|
+
quality = (MODEL_CARDS[model_name]["overall"] / 100.0)
|
|
500
|
+
|
|
501
|
+
return OperatorCostEstimates(
|
|
502
|
+
cardinality=1.0,
|
|
503
|
+
time_per_record=model_conversion_time_per_record,
|
|
504
|
+
cost_per_record=model_conversion_usd_per_record,
|
|
505
|
+
quality=quality,
|
|
506
|
+
)
|
|
507
|
+
|
|
508
|
+
def __call__(self, candidates: list[DataRecord]) -> DataRecordSet:
|
|
509
|
+
start_time = time.time()
|
|
510
|
+
|
|
511
|
+
# if candidates is an empty list, return an empty DataRecordSet
|
|
512
|
+
if len(candidates) == 0:
|
|
513
|
+
return DataRecordSet([], [])
|
|
514
|
+
|
|
515
|
+
# get the set of input fields to use for the operation
|
|
516
|
+
input_fields = self.get_input_fields()
|
|
517
|
+
|
|
518
|
+
# get the set of output fields to use for the operation
|
|
519
|
+
fields_to_generate = self.get_fields_to_generate(candidates[0])
|
|
520
|
+
fields = {field: field_type for field, field_type in self.output_schema.model_fields.items() if field in fields_to_generate}
|
|
521
|
+
|
|
522
|
+
# construct kwargs for generation
|
|
523
|
+
gen_kwargs = {"project_cols": input_fields, "output_schema": self.output_schema, "agg_instruction": self.agg_str}
|
|
524
|
+
|
|
525
|
+
# generate outputs for all fields in a single query
|
|
526
|
+
field_answers, _, generation_stats, _ = self.generator(candidates, fields, **gen_kwargs)
|
|
527
|
+
assert all([field in field_answers for field in fields]), "Not all fields were generated!"
|
|
528
|
+
|
|
529
|
+
# construct data record for the output
|
|
530
|
+
field, value = fields_to_generate[0], field_answers[fields_to_generate[0]][0]
|
|
531
|
+
data_item = self.output_schema(**{field: value})
|
|
532
|
+
dr = DataRecord.from_agg_parents(data_item, parent_records=candidates)
|
|
533
|
+
|
|
534
|
+
# create RecordOpStats object
|
|
535
|
+
record_op_stats = RecordOpStats(
|
|
536
|
+
record_id=dr._id,
|
|
537
|
+
record_parent_ids=dr._parent_ids,
|
|
538
|
+
record_source_indices=dr._source_indices,
|
|
539
|
+
record_state=dr.to_dict(include_bytes=False),
|
|
540
|
+
full_op_id=self.get_full_op_id(),
|
|
541
|
+
logical_op_id=self.logical_op_id,
|
|
542
|
+
op_name=self.op_name(),
|
|
543
|
+
time_per_record=time.time() - start_time,
|
|
544
|
+
cost_per_record=generation_stats.cost_per_record,
|
|
545
|
+
model_name=self.get_model_name(),
|
|
546
|
+
answer={field: value},
|
|
547
|
+
input_fields=input_fields,
|
|
548
|
+
generated_fields=fields_to_generate,
|
|
549
|
+
total_input_tokens=generation_stats.total_input_tokens,
|
|
550
|
+
total_output_tokens=generation_stats.total_output_tokens,
|
|
551
|
+
total_input_cost=generation_stats.total_input_cost,
|
|
552
|
+
total_output_cost=generation_stats.total_output_cost,
|
|
553
|
+
llm_call_duration_secs=generation_stats.llm_call_duration_secs,
|
|
554
|
+
fn_call_duration_secs=generation_stats.fn_call_duration_secs,
|
|
555
|
+
total_llm_calls=generation_stats.total_llm_calls,
|
|
556
|
+
total_embedding_llm_calls=generation_stats.total_embedding_llm_calls,
|
|
557
|
+
image_operation=self.is_image_op(),
|
|
558
|
+
op_details={k: str(v) for k, v in self.get_id_params().items()},
|
|
559
|
+
)
|
|
560
|
+
|
|
561
|
+
return DataRecordSet([dr], [record_op_stats])
|
|
@@ -9,7 +9,7 @@ from palimpzest.constants import AggFunc, Cardinality
|
|
|
9
9
|
from palimpzest.core.data import context, dataset
|
|
10
10
|
from palimpzest.core.elements.filters import Filter
|
|
11
11
|
from palimpzest.core.elements.groupbysig import GroupBySig
|
|
12
|
-
from palimpzest.core.lib.schemas import Average, Count
|
|
12
|
+
from palimpzest.core.lib.schemas import Average, Count, Max, Min
|
|
13
13
|
from palimpzest.utils.hash_helpers import hash_for_id
|
|
14
14
|
|
|
15
15
|
|
|
@@ -149,27 +149,39 @@ class Aggregate(LogicalOperator):
|
|
|
149
149
|
|
|
150
150
|
def __init__(
|
|
151
151
|
self,
|
|
152
|
-
agg_func: AggFunc,
|
|
152
|
+
agg_func: AggFunc | None = None,
|
|
153
|
+
agg_str: str | None = None,
|
|
153
154
|
*args,
|
|
154
155
|
**kwargs,
|
|
155
156
|
):
|
|
157
|
+
assert agg_func is not None or agg_str is not None, "Either agg_func or agg_str must be provided"
|
|
156
158
|
if kwargs.get("output_schema") is None:
|
|
157
159
|
if agg_func == AggFunc.COUNT:
|
|
158
160
|
kwargs["output_schema"] = Count
|
|
159
161
|
elif agg_func == AggFunc.AVERAGE:
|
|
160
162
|
kwargs["output_schema"] = Average
|
|
163
|
+
elif agg_func == AggFunc.MIN:
|
|
164
|
+
kwargs["output_schema"] = Min
|
|
165
|
+
elif agg_func == AggFunc.MAX:
|
|
166
|
+
kwargs["output_schema"] = Max
|
|
161
167
|
else:
|
|
162
168
|
raise ValueError(f"Unsupported aggregation function: {agg_func}")
|
|
163
169
|
|
|
164
170
|
super().__init__(*args, **kwargs)
|
|
165
171
|
self.agg_func = agg_func
|
|
172
|
+
self.agg_str = agg_str
|
|
166
173
|
|
|
167
174
|
def __str__(self):
|
|
168
|
-
|
|
175
|
+
desc = f"function: {str(self.agg_func.value)}" if self.agg_func else f"agg: {self.agg_str}"
|
|
176
|
+
return f"{self.__class__.__name__}({desc})"
|
|
169
177
|
|
|
170
178
|
def get_logical_id_params(self) -> dict:
|
|
171
179
|
logical_id_params = super().get_logical_id_params()
|
|
172
|
-
logical_id_params = {
|
|
180
|
+
logical_id_params = {
|
|
181
|
+
"agg_func": self.agg_func,
|
|
182
|
+
"agg_str": self.agg_str,
|
|
183
|
+
**logical_id_params,
|
|
184
|
+
}
|
|
173
185
|
|
|
174
186
|
return logical_id_params
|
|
175
187
|
|
|
@@ -177,6 +189,7 @@ class Aggregate(LogicalOperator):
|
|
|
177
189
|
logical_op_params = super().get_logical_op_params()
|
|
178
190
|
logical_op_params = {
|
|
179
191
|
"agg_func": self.agg_func,
|
|
192
|
+
"agg_str": self.agg_str,
|
|
180
193
|
**logical_op_params,
|
|
181
194
|
}
|
|
182
195
|
|
|
@@ -47,6 +47,9 @@ from palimpzest.query.optimizer.rules import (
|
|
|
47
47
|
from palimpzest.query.optimizer.rules import (
|
|
48
48
|
Rule as _Rule,
|
|
49
49
|
)
|
|
50
|
+
from palimpzest.query.optimizer.rules import (
|
|
51
|
+
SemanticAggregateRule as _SemanticAggregateRule,
|
|
52
|
+
)
|
|
50
53
|
from palimpzest.query.optimizer.rules import (
|
|
51
54
|
SplitRule as _SplitRule,
|
|
52
55
|
)
|
|
@@ -72,6 +75,7 @@ ALL_RULES = [
|
|
|
72
75
|
_ReorderConverts,
|
|
73
76
|
_RetrieveRule,
|
|
74
77
|
_Rule,
|
|
78
|
+
_SemanticAggregateRule,
|
|
75
79
|
_SplitRule,
|
|
76
80
|
_TransformationRule,
|
|
77
81
|
]
|
|
@@ -12,7 +12,14 @@ from palimpzest.core.lib.schemas import (
|
|
|
12
12
|
IMAGE_LIST_FIELD_TYPES,
|
|
13
13
|
)
|
|
14
14
|
from palimpzest.prompts import CONTEXT_SEARCH_PROMPT
|
|
15
|
-
from palimpzest.query.operators.aggregate import
|
|
15
|
+
from palimpzest.query.operators.aggregate import (
|
|
16
|
+
ApplyGroupByOp,
|
|
17
|
+
AverageAggregateOp,
|
|
18
|
+
CountAggregateOp,
|
|
19
|
+
MaxAggregateOp,
|
|
20
|
+
MinAggregateOp,
|
|
21
|
+
SemanticAggregate,
|
|
22
|
+
)
|
|
16
23
|
from palimpzest.query.operators.compute import SmolAgentsCompute
|
|
17
24
|
from palimpzest.query.operators.convert import LLMConvertBonded, NonLLMConvert
|
|
18
25
|
from palimpzest.query.operators.critique_and_refine import CritiqueAndRefineConvert, CritiqueAndRefineFilter
|
|
@@ -924,6 +931,35 @@ class EmbeddingJoinRule(ImplementationRule):
|
|
|
924
931
|
|
|
925
932
|
return cls._perform_substitution(logical_expression, EmbeddingJoin, runtime_kwargs, variable_op_kwargs)
|
|
926
933
|
|
|
934
|
+
class SemanticAggregateRule(ImplementationRule):
|
|
935
|
+
"""
|
|
936
|
+
Substitute a logical expression for a SemanticAggregate with an llm physical implementation.
|
|
937
|
+
"""
|
|
938
|
+
|
|
939
|
+
@classmethod
|
|
940
|
+
def matches_pattern(cls, logical_expression: LogicalExpression) -> bool:
|
|
941
|
+
is_match = isinstance(logical_expression.operator, Aggregate) and logical_expression.operator.agg_str is not None
|
|
942
|
+
logger.debug(f"SemanticAggregateRule matches_pattern: {is_match} for {logical_expression}")
|
|
943
|
+
return is_match
|
|
944
|
+
|
|
945
|
+
@classmethod
|
|
946
|
+
def substitute(cls, logical_expression: LogicalExpression, **runtime_kwargs) -> set[PhysicalExpression]:
|
|
947
|
+
logger.debug(f"Substituting SemanticAggregateRule for {logical_expression}")
|
|
948
|
+
|
|
949
|
+
# create variable physical operator kwargs for each model which can implement this logical_expression
|
|
950
|
+
models = [model for model in runtime_kwargs["available_models"] if cls._model_matches_input(model, logical_expression) and not model.is_llama_model()]
|
|
951
|
+
no_reasoning = runtime_kwargs["reasoning_effort"] in [None, "minimal", "low"]
|
|
952
|
+
variable_op_kwargs = [
|
|
953
|
+
{
|
|
954
|
+
"model": model,
|
|
955
|
+
"prompt_strategy": PromptStrategy.AGG_NO_REASONING if model.is_reasoning_model() and no_reasoning else PromptStrategy.AGG,
|
|
956
|
+
"reasoning_effort": runtime_kwargs["reasoning_effort"]
|
|
957
|
+
}
|
|
958
|
+
for model in models
|
|
959
|
+
]
|
|
960
|
+
|
|
961
|
+
return cls._perform_substitution(logical_expression, SemanticAggregate, runtime_kwargs, variable_op_kwargs)
|
|
962
|
+
|
|
927
963
|
|
|
928
964
|
class AggregateRule(ImplementationRule):
|
|
929
965
|
"""
|
|
@@ -932,7 +968,7 @@ class AggregateRule(ImplementationRule):
|
|
|
932
968
|
|
|
933
969
|
@classmethod
|
|
934
970
|
def matches_pattern(cls, logical_expression: LogicalExpression) -> bool:
|
|
935
|
-
is_match = isinstance(logical_expression.operator, Aggregate)
|
|
971
|
+
is_match = isinstance(logical_expression.operator, Aggregate) and logical_expression.operator.agg_func is not None
|
|
936
972
|
logger.debug(f"AggregateRule matches_pattern: {is_match} for {logical_expression}")
|
|
937
973
|
return is_match
|
|
938
974
|
|
|
@@ -946,6 +982,10 @@ class AggregateRule(ImplementationRule):
|
|
|
946
982
|
physical_op_class = CountAggregateOp
|
|
947
983
|
elif logical_expression.operator.agg_func == AggFunc.AVERAGE:
|
|
948
984
|
physical_op_class = AverageAggregateOp
|
|
985
|
+
elif logical_expression.operator.agg_func == AggFunc.MIN:
|
|
986
|
+
physical_op_class = MinAggregateOp
|
|
987
|
+
elif logical_expression.operator.agg_func == AggFunc.MAX:
|
|
988
|
+
physical_op_class = MaxAggregateOp
|
|
949
989
|
else:
|
|
950
990
|
raise Exception(f"Cannot support aggregate function: {logical_expression.operator.agg_func}")
|
|
951
991
|
|