PyPI - palimpzest - Versions diffs - 0.8.6__py3-none-any.whl → 0.9.0__py3-none-any.whl - Mend

palimpzest-0.8.6-py3-none-any.whl → palimpzest-0.9.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (20)

palimpzest/constants.py +12 -4
palimpzest/core/data/dataset.py +42 -0
palimpzest/core/elements/records.py +5 -1
palimpzest/core/lib/schemas.py +13 -0
palimpzest/prompts/aggregate_prompts.py +99 -0
palimpzest/prompts/prompt_factory.py +163 -75
palimpzest/prompts/utils.py +38 -1
palimpzest/prompts/validator.py +24 -24
palimpzest/query/generators/generators.py +9 -7
palimpzest/query/operators/__init__.py +4 -1
palimpzest/query/operators/aggregate.py +285 -6
palimpzest/query/operators/logical.py +17 -4
palimpzest/query/optimizer/__init__.py +4 -0
palimpzest/query/optimizer/rules.py +42 -2
palimpzest/validator/validator.py +7 -7
{palimpzest-0.8.6.dist-info → palimpzest-0.9.0.dist-info}/METADATA +1 -1
{palimpzest-0.8.6.dist-info → palimpzest-0.9.0.dist-info}/RECORD +20 -19
{palimpzest-0.8.6.dist-info → palimpzest-0.9.0.dist-info}/WHEEL +0 -0
{palimpzest-0.8.6.dist-info → palimpzest-0.9.0.dist-info}/licenses/LICENSE +0 -0
{palimpzest-0.8.6.dist-info → palimpzest-0.9.0.dist-info}/top_level.txt +0 -0

palimpzest/constants.py CHANGED Viewed

@@ -136,13 +136,17 @@ class PromptStrategy(str, Enum):
     performing some task with a specified Model.
     """
+    # aggregation prompt strategies
+    AGG = "aggregation"
+    AGG_NO_REASONING = "aggregation-no-reasoning"
     # filter prompt strategies
     FILTER = "filter"
     FILTER_NO_REASONING = "filter-no-reasoning"
     FILTER_CRITIC = "filter-critic"
     FILTER_REFINE = "filter-refine"
     FILTER_MOA_PROPOSER = "filter-mixture-of-agents-proposer"
-    FILTER_MOA_AGG = "filter-mixture-of-agents-aggregation"
+    FILTER_MOA_AGG = "filter-mixture-of-agents-aggregator"
     FILTER_SPLIT_PROPOSER = "filter-split-proposer"
     FILTER_SPLIT_MERGER = "filter-split-merger"
@@ -156,10 +160,13 @@ class PromptStrategy(str, Enum):
     MAP_CRITIC = "map-critic"
     MAP_REFINE = "map-refine"
     MAP_MOA_PROPOSER = "map-mixture-of-agents-proposer"
-    MAP_MOA_AGG = "map-mixture-of-agents-aggregation"
+    MAP_MOA_AGG = "map-mixture-of-agents-aggregator"
     MAP_SPLIT_PROPOSER = "map-split-proposer"
     MAP_SPLIT_MERGER = "map-split-merger"
+    def is_agg_prompt(self):
+        return "aggregation" in self.value
     def is_filter_prompt(self):
         return "filter" in self.value
@@ -179,7 +186,7 @@ class PromptStrategy(str, Enum):
         return "mixture-of-agents-proposer" in self.value
     def is_moa_aggregator_prompt(self):
-        return "mixture-of-agents-aggregation" in self.value
+        return "mixture-of-agents-aggregator" in self.value
     def is_split_proposer_prompt(self):
         return "split-proposer" in self.value
@@ -200,7 +207,8 @@ class Modality(str, Enum):
 class AggFunc(str, Enum):
     COUNT = "count"
     AVERAGE = "average"
+    MIN = "min"
+    MAX = "max"
 class Cardinality(str, Enum):
     ONE_TO_ONE = "one-to-one"

palimpzest/core/data/dataset.py CHANGED Viewed

@@ -534,11 +534,53 @@ class Dataset:
         operator = Aggregate(input_schema=self.schema, agg_func=AggFunc.AVERAGE)
         return Dataset(sources=[self], operator=operator, schema=operator.output_schema)
+    def min(self) -> Dataset:
+        """Apply an min operator to this set"""
+        operator = Aggregate(input_schema=self.schema, agg_func=AggFunc.MIN)
+        return Dataset(sources=[self], operator=operator, schema=operator.output_schema)
+    def max(self) -> Dataset:
+        """Apply an max operator to this set"""
+        operator = Aggregate(input_schema=self.schema, agg_func=AggFunc.MAX)
+        return Dataset(sources=[self], operator=operator, schema=operator.output_schema)
     def groupby(self, groupby: GroupBySig) -> Dataset:
         output_schema = groupby.output_schema()
         operator = GroupByAggregate(input_schema=self.schema, output_schema=output_schema, group_by_sig=groupby)
         return Dataset(sources=[self], operator=operator, schema=output_schema)
+    def sem_agg(self, col: dict | type[BaseModel], agg: str, depends_on: str | list[str] | None = None) -> Dataset:
+        """
+        Apply a semantic aggregation to this set. The `agg` string will be applied using an LLM
+        over the entire set of inputs' fields specified in `depends_on` to generate the output `col`.
+        Example:
+            sem_agg(
+                col={'name': 'overall_sentiment', 'desc': 'The overall sentiment of the reviews', 'type': str},
+                agg="Compute the overall sentiment of the reviews as POSITIVE or NEGATIVE.",
+                depends_on="review_text",
+            )
+        """
+        # construct new output schema
+        new_output_schema = None
+        if isinstance(col, dict):
+            col_schema = create_schema_from_fields([col])
+            new_output_schema = union_schemas([self.schema, col_schema])
+        elif issubclass(col, BaseModel):
+            assert len(col.model_fields) == 1, "For semantic aggregation, when passing a BaseModel to `col` it must have exactly one field."
+            new_output_schema = union_schemas([self.schema, col])
+        else:
+            raise ValueError("`col` must be a dictionary or a single-field BaseModel.")
+        # enforce type for depends_on
+        if isinstance(depends_on, str):
+            depends_on = [depends_on]
+        # construct logical operator
+        operator = Aggregate(input_schema=self.schema, output_schema=new_output_schema, agg_str=agg, depends_on=depends_on)
+        return Dataset(sources=[self], operator=operator, schema=operator.output_schema)
     def retrieve(
         self,
         index: Collection,

palimpzest/core/elements/records.py CHANGED Viewed

@@ -11,6 +11,8 @@ from pydantic.fields import FieldInfo
 from palimpzest.core.data import context
 from palimpzest.core.lib.schemas import (
+    AUDIO_FIELD_TYPES,
+    IMAGE_FIELD_TYPES,
     AudioBase64,
     AudioFilepath,
     ImageBase64,
@@ -303,9 +305,11 @@ class DataRecord:
             dct = {k: v for k, v in dct.items() if k in project_field_names}
         if not include_bytes:
+            bytes_field_types = [bytes, list[bytes], bytes | None, list[bytes] | None, bytes | Any, list[bytes] | Any]
+            bytes_field_types += AUDIO_FIELD_TYPES + IMAGE_FIELD_TYPES
             for k in dct:
                 field_type = self.get_field_type(k)
-                if field_type.annotation in [bytes, AudioBase64, ImageBase64, list[bytes], list[ImageBase64]]:
+                if field_type.annotation in bytes_field_types:
                     dct[k] = "<bytes>"
         if bytes_to_str:

palimpzest/core/lib/schemas.py CHANGED Viewed

@@ -33,20 +33,27 @@ IMAGE_LIST_FIELD_TYPES = [
     list[ImageBase64] | None,
     list[ImageFilepath] | None,
     list[ImageURL] | None,
+    list[ImageBase64] | Any,
+    list[ImageFilepath] | Any,
+    list[ImageURL] | Any,
 ]
 IMAGE_FIELD_TYPES = IMAGE_LIST_FIELD_TYPES + [
     ImageBase64, ImageFilepath, ImageURL,
     ImageBase64 | None, ImageFilepath | None, ImageURL | None,
+    ImageBase64 | Any, ImageFilepath | Any, ImageURL | Any,
 ]
 AUDIO_LIST_FIELD_TYPES = [
     list[AudioBase64],
     list[AudioFilepath],
     list[AudioBase64] | None,
     list[AudioFilepath] | None,
+    list[AudioBase64] | Any,
+    list[AudioFilepath] | Any,
 ]
 AUDIO_FIELD_TYPES = AUDIO_LIST_FIELD_TYPES + [
     AudioBase64, AudioFilepath,
     AudioBase64 | None, AudioFilepath | None,
+    AudioBase64 | Any, AudioFilepath | Any,
 ]
@@ -187,6 +194,12 @@ class Average(BaseModel):
 class Count(BaseModel):
     count: int = Field(description="The count of items in the dataset")
+class Min(BaseModel):
+    min: int | float = Field(description="The minimum value of some items in the dataset")
+class Max(BaseModel):
+    max: int | float = Field(description="The maximum value of some items in the dataset")
 class OperatorDerivedSchema(BaseModel):
     """Schema defined by an operator, e.g., a join or a group by"""

palimpzest/prompts/aggregate_prompts.py ADDED Viewed

@@ -0,0 +1,99 @@
+"""This file contains prompts for aggregation operations."""
+### BASE PROMPTS ###
+AGG_BASE_SYSTEM_PROMPT = """You are a helpful assistant whose job is to {job_instruction}.
+You will be presented with a context and an output field to generate. Your task is to generate a JSON object which aggregates the input and fills in the output field with the correct value.
+You will be provided with a description of each input field and each output field. The field in the output JSON object can be derived using information from the context.
+{output_format_instruction} Finish your response with a newline character followed by ---
+An example is shown below:
+---
+INPUT FIELDS:
+{example_input_fields}
+OUTPUT FIELDS:
+{example_output_fields}
+CONTEXT:
+{{{example_context}}}
+{{{second_example_context}}}
+{{{third_example_context}}}{image_disclaimer}{audio_disclaimer}
+AGGREGATION INSTRUCTION: {example_agg_instruction}
+Let's think step-by-step in order to answer the question.
+REASONING: {example_reasoning}
+ANSWER:
+{{{example_answer}}}
+---
+"""
+AGG_NO_REASONING_BASE_SYSTEM_PROMPT = """You are a helpful assistant whose job is to {job_instruction}.
+You will be presented with a context and an output field to generate. Your task is to generate a JSON object which aggregates the input and fills in the output field with the correct value.
+You will be provided with a description of each input field and each output field. The field in the output JSON object can be derived using information from the context.
+{output_format_instruction} Finish your response with a newline character followed by ---
+An example is shown below:
+---
+INPUT FIELDS:
+{example_input_fields}
+OUTPUT FIELDS:
+{example_output_fields}
+CONTEXT:
+{{{example_context}}}
+{{{second_example_context}}}
+{{{third_example_context}}}{image_disclaimer}{audio_disclaimer}
+AGGREGATION INSTRUCTION: {example_agg_instruction}
+ANSWER:
+{{{example_answer}}}
+---
+"""
+AGG_BASE_USER_PROMPT = """You are a helpful assistant whose job is to {job_instruction}.
+You will be presented with a context and an output field to generate. Your task is to generate a JSON object which aggregates the input and fills in the output field with the correct value.
+You will be provided with a description of each input field and each output field. The field in the output JSON object can be derived using information from the context.
+{desc_section}
+{output_format_instruction} Finish your response with a newline character followed by ---
+---
+INPUT FIELDS:
+{input_fields_desc}
+OUTPUT FIELDS:
+{output_fields_desc}
+CONTEXT:
+{context}<<image-audio-placeholder>>
+AGGREGATION INSTRUCTION: {agg_instruction}
+Let's think step-by-step in order to answer the question.
+REASONING: """
+AGG_NO_REASONING_BASE_USER_PROMPT = """You are a helpful assistant whose job is to {job_instruction}.
+You will be presented with a context and an output field to generate. Your task is to generate a JSON object which aggregates the input and fills in the output field with the correct value.
+You will be provided with a description of each input field and each output field. The field in the output JSON object can be derived using information from the context.
+{desc_section}
+{output_format_instruction} Finish your response with a newline character followed by ---
+---
+INPUT FIELDS:
+{input_fields_desc}
+OUTPUT FIELDS:
+{output_fields_desc}
+CONTEXT:
+{context}<<image-audio-placeholder>>
+AGGREGATION INSTRUCTION: {agg_instruction}
+ANSWER: """

palimpzest 0.8.6__py3-none-any.whl → 0.9.0__py3-none-any.whl

palimpzest-0.8.6-py3-none-any.whl → palimpzest-0.9.0-py3-none-any.whl